mtmd: refactor image preprocessing (#21031)

* mtmd: refactor image pre-processing * correct some places * correct lfm2 * fix deepseek-ocr on server * add comment to clarify about mtmd_image_preprocessor_dyn_size
2026-03-26 19:49:20 +01:00 · 2026-03-26 19:49:20 +01:00 · a73bbd5d92
parent ded446b34c
commit a73bbd5d92
8 changed files with 1602 additions and 1539 deletions
--- a/tools/mtmd/CMakeLists.txt
+++ b/tools/mtmd/CMakeLists.txt
@ -5,6 +5,7 @@ find_package(Threads REQUIRED)
 add_library(mtmd
            mtmd.cpp
            mtmd-audio.cpp
            mtmd-image.cpp
            mtmd.h
            mtmd-helper.cpp
            mtmd-helper.h
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@ -51,7 +51,6 @@
 #define KEY_MM_PATCH_MERGE_TYPE    "clip.vision.mm_patch_merge_type"
 #define KEY_IMAGE_GRID_PINPOINTS   "clip.vision.image_grid_pinpoints"
 #define KEY_IMAGE_CROP_RESOLUTION  "clip.vision.image_crop_resolution"
 #define KEY_WIN_ATTN_PATTERN       "clip.vision.n_wa_pattern"
 #define KEY_WIN_ATTN_LAYER_INDEXES "clip.vision.wa_layer_indexes"
 #define KEY_ATTN_WINDOW_SIZE       "clip.vision.window_size"
--- a/tools/mtmd/clip-model.h
+++ b/tools/mtmd/clip-model.h
@ -28,6 +28,13 @@ enum patch_merge_type {
    PATCH_MERGE_SPATIAL_UNPAD,
 };
 // algorithm used when resizing an image during preprocessing
 // referenced by clip_hparams::image_resize_algo, image_resize_algo_rf and image_resize_algo_ov
 // NOTE(review): the remarks on BILINEAR / BICUBIC below describe aspect-ratio handling rather than
 //               interpolation; confirm against the resize implementation that they sit on the right values
 enum resize_algo {
    RESIZE_ALGO_BILINEAR, // stretch to target resolution
    RESIZE_ALGO_BICUBIC, // center-crop when aspect ratio doesn't match
    RESIZE_ALGO_BICUBIC_PILLOW,
    // RESIZE_ALGO_LANCZOS, // TODO
 };
 struct clip_hparams {
    int32_t image_size = 0;
    int32_t patch_size = 0;
@ -37,13 +44,26 @@ struct clip_hparams {
    int32_t n_head = 0;
    int32_t n_layer = 0;
    // idefics3
    int32_t n_merge = 0; // number of patch merges **per-side**
    // for preprocessor
    int32_t image_longest_edge = 0;
    int32_t image_min_pixels = -1;
    int32_t image_max_pixels = -1;
-    int32_t n_merge = 0; // number of patch merges **per-side**
+    resize_algo image_resize_algo = RESIZE_ALGO_BICUBIC;
    bool image_resize_pad = true; // if false, center-crop will be applied when resizing
    std::array<uint8_t, 3> image_pad_color = {0, 0, 0};
    // (preprocessor) for llava-uhd style models
    std::vector<clip_image_size> image_res_candidates;
    int32_t preproc_min_tiles = 0;
    int32_t preproc_max_tiles = 0;
    resize_algo image_resize_algo_rf = RESIZE_ALGO_BICUBIC;
    resize_algo image_resize_algo_ov = RESIZE_ALGO_BILINEAR;
    bool image_pad_rf = true;  // if true, refined image will be padded (e.g. llava-1.6)
    bool image_pad_ov = false; // if true, overview image will be padded (e.g. llava-1.6)
    std::array<uint8_t, 3> image_pad_color_rf = {0, 0, 0}; // padding color for refined image
    std::array<uint8_t, 3> image_pad_color_ov = {0, 0, 0}; // padding color for overview image
    float image_mean[3];
    float image_std[3];
@ -60,8 +80,6 @@ struct clip_hparams {
    float eps = 1e-6;
    float rope_theta = 0.0;
    std::vector<clip_image_size> image_res_candidates; // for llava-uhd style models
    int32_t image_crop_resolution;
    std::unordered_set<int32_t> vision_feature_layer;
    int32_t attn_window_size = 0;
    int32_t n_wa_pattern = 0;
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@ -97,9 +97,6 @@ struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch
 */
 void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
 /** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
 bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
 struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
 bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
--- a/tools/mtmd/mtmd-image.cpp
+++ b/tools/mtmd/mtmd-image.cpp
--- a/tools/mtmd/mtmd-image.h
+++ b/tools/mtmd/mtmd-image.h
@ -0,0 +1,150 @@
 #pragma once
 #include "ggml.h"
 #include "clip-model.h"
 #include <vector>
 #include <string>
 #define MTMD_INTERNAL_HEADER
 // base class, models must inherit from this class
 // mtmd_context::init_vision() picks one concrete subclass per projector type and stores it
 // as std::unique_ptr<mtmd_image_preprocessor>
 struct mtmd_image_preprocessor {
    // vision hyperparameters, bound to the object returned by clip_get_hparams(ctx)
    // NOTE(review): held by reference, so presumably the clip_ctx must outlive this object - confirm
    const clip_hparams & hparams;
    mtmd_image_preprocessor(const clip_ctx * ctx): hparams(*clip_get_hparams(ctx)) {}
    // virtual destructor: instances are destroyed through a base-class pointer
    virtual ~mtmd_image_preprocessor() = default;
    // convert one input image into a batch of f32 images stored in `output`
    // returns false on failure (callers log an error and abort tokenization of the image)
    virtual bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) = 0;
    // convert an u8 image to f32 using the given per-channel mean / std
    void img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]);
    // same as above without explicit mean / std
    // NOTE(review): presumably falls back to hparams.image_mean / hparams.image_std - confirm in mtmd-image.cpp
    void img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst);
 };
 /**
 * implementation of LLaVA-UHD:
 *  - https://arxiv.org/pdf/2403.11703
 *  - https://github.com/thunlp/LLaVA-UHD
 *  - https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
 *
 * overview:
 *   - an image always has a single overview (downscaled image)
 *   - an image can have 0 or multiple slices, depending on the image size
 *   - each slice can then be considered as a separate image
 *
 * note: the terms "slice" and "tile" are used interchangeably
 *
 * for example:
 *
 * [overview] --> [slice 1] --> [slice 2]
 *           |                |
 *           +--> [slice 3] --> [slice 4]
 */
 struct mtmd_image_preprocessor_llava_uhd : mtmd_image_preprocessor {
    mtmd_image_preprocessor_llava_uhd(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
    // position and size of one slice
    // NOTE(review): x / y are presumably the top-left offset inside the refined image - confirm
    struct slice_coordinates {
        int x;
        int y;
        clip_image_size size;
    };
    // full description of how an image is split into an overview and slices
    struct slice_instructions {
        clip_image_size overview_size; // size of downscaled image
        clip_image_size refined_size;  // size of image right before slicing (must be multiple of slice size)
        clip_image_size grid_size;     // grid_size.width * grid_size.height = number of slices
        std::vector<slice_coordinates> slices;
    };
    // compute the slicing plan for an image of the given size
    // LFM2 overrides this function to implement its custom slicing logic
    virtual slice_instructions get_slice_instructions(const clip_image_size & original_size);
    // produce the overview and slice images described by `inst`
    // NOTE(review): overview_first presumably controls whether the overview is placed before the slices
    //               in the returned vector - confirm in mtmd-image.cpp
    std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 & img, const slice_instructions & inst, bool overview_first = true);
 private:
    // sizing helpers used by the slicing logic; see mtmd-image.cpp for the exact rules
    clip_image_size get_best_resize(const clip_image_size & original_size, int scale_resolution, int patch_size, bool allow_upscale = false);
    clip_image_size resize_maintain_aspect_ratio(const clip_image_size & orig, const clip_image_size & target_max);
    /**
     * Selects the best resolution from a list of possible resolutions based on the original size.
     *
     * For example, when given a list of resolutions:
     *  - 100x100
     *  - 200x100
     *  - 100x200
     *  - 200x200
     *
     * And an input image of size 111x200, then 100x200 is the best fit (least wasted resolution).
     *
     * @param original_size The original size of the image
     * @param possible_resolutions A list of possible resolutions
     * @return The best fit resolution
     */
    clip_image_size select_best_resolution(const clip_image_size & original_size, const std::vector<clip_image_size> & possible_resolutions);
    // NOTE(review): presumably rounds `length` to a multiple of `patch_size` - confirm
    int ensure_divide(int length, int patch_size);
    clip_image_size get_refine_size(const clip_image_size & original_size, const clip_image_size & grid, int scale_resolution, int patch_size, bool allow_upscale = false);
    clip_image_size get_best_grid(const int max_slice_nums, const int multiple, const float log_ratio);
 };
 // downscale or upscale the input image to fixed size
 // selected by mtmd_context::init_vision() for e.g. GEMMA3 / GEMMA3NV, NEMOTRON_V2_VL, and for
 // llava-style projectors (MLP, LDP, ...) when hparams.image_res_candidates is empty
 struct mtmd_image_preprocessor_fixed_size : mtmd_image_preprocessor {
    mtmd_image_preprocessor_fixed_size(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
 };
 // resize image to multiple of patch_size*n_merge, while preserving aspect ratio
 // if image_resize_pad is true, the resized image will be padded, otherwise it will be either stretched or center-cropped depending on image_resize_algo
 // NOTE(review): the original comment said "depending on image_resize_pad" here, which is circular;
 //               image_resize_algo is the likely intent (see the remarks on enum resize_algo) - confirm
 // this is used by models with native support for dynamic image size, for example: Qwen-VL, Pixtral, Kimi-VL, etc
 struct mtmd_image_preprocessor_dyn_size : mtmd_image_preprocessor {
    mtmd_image_preprocessor_dyn_size(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
 };
 // similar to mtmd_image_preprocessor_dyn_size, but resize the image to have longest edge equal to hparams.image_longest_edge, while preserving aspect ratio
 // selected by mtmd_context::init_vision() for PROJECTOR_TYPE_LIGHTONOCR
 struct mtmd_image_preprocessor_longest_edge : mtmd_image_preprocessor {
    mtmd_image_preprocessor_longest_edge(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
 };
 // custom llava-uhd slicing logic for LFM2
 // only get_slice_instructions() is overridden; preprocess() and slice_image() come from the llava-uhd base
 // ref: https://github.com/huggingface/transformers/blob/v5.1.0/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py
 struct mtmd_image_preprocessor_lfm2 : mtmd_image_preprocessor_llava_uhd {
    // hard-coded processor settings
    // ref: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B/blob/main/processor_config.json
    static constexpr int   min_tiles            = 2;
    static constexpr int   max_tiles            = 10;
    static constexpr float max_pixels_tolerance = 2.0f;
    static constexpr int   tile_size            = 512;
    // inherit the (const clip_ctx *) constructor from the llava-uhd base
    using mtmd_image_preprocessor_llava_uhd::mtmd_image_preprocessor_llava_uhd;
    slice_instructions get_slice_instructions(const clip_image_size & original_size) override;
 private:
    // note: takes (width, height), whereas get_grid_layout() below takes (height, width)
    clip_image_size find_closest_aspect_ratio(
            float aspect_ratio,
            const std::vector<clip_image_size> & target_ratios,
            int width, int height);
    // NOTE(review): presumably enumerates the candidate grid shapes allowed by min_tiles / max_tiles - confirm
    std::vector<clip_image_size> get_target_ratios();
    // note: argument order is (height, width)
    clip_image_size get_grid_layout(int height, int width);
 };
 // preprocessor selected by mtmd_context::init_vision() for PROJECTOR_TYPE_IDEFICS3
 // derives from the llava-uhd preprocessor (so its slicing helpers are available) and overrides preprocess()
 struct mtmd_image_preprocessor_idefics3 : mtmd_image_preprocessor_llava_uhd {
    mtmd_image_preprocessor_idefics3(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {}
    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
 };
 // preprocessor selected by mtmd_context::init_vision() for PROJECTOR_TYPE_INTERNVL
 // derives from the llava-uhd preprocessor (so its slicing helpers are available) and overrides preprocess()
 struct mtmd_image_preprocessor_internvl : mtmd_image_preprocessor_llava_uhd {
    mtmd_image_preprocessor_internvl(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {}
    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
 };
 // preprocessor selected by mtmd_context::init_vision() for PROJECTOR_TYPE_DEEPSEEKOCR
 // derives directly from the base class; model-specific logic lives in preprocess() (see mtmd-image.cpp)
 struct mtmd_image_preprocessor_deepseekocr : mtmd_image_preprocessor {
    mtmd_image_preprocessor_deepseekocr(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
 };
 // preprocessor selected by mtmd_context::init_vision() for PROJECTOR_TYPE_YOUTUVL
 // derives directly from the base class; model-specific logic lives in preprocess() (see mtmd-image.cpp)
 struct mtmd_image_preprocessor_youtuvl : mtmd_image_preprocessor {
    mtmd_image_preprocessor_youtuvl(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
 };
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@ -2,6 +2,7 @@
 #include "clip-impl.h"
 #include "mtmd.h"
 #include "mtmd-audio.h"
 #include "mtmd-image.h"
 #include "debug/mtmd-debug.h"
 #include "llama.h"
@ -138,7 +139,7 @@ struct mtmd_context {
    // for llava-uhd style models, we need special tokens in-between slices
    // minicpmv calls them "slices", llama 4 calls them "tiles"
-    mtmd_slice_tmpl slice_tmpl    = MTMD_SLICE_TMPL_NONE;
+    mtmd_slice_tmpl slice_tmpl = MTMD_SLICE_TMPL_NONE;
    std::vector<llama_token> tok_ov_img_start;  // overview image
    std::vector<llama_token> tok_ov_img_end;    // overview image
    std::vector<llama_token> tok_slices_start;  // start of all slices
@ -147,13 +148,14 @@ struct mtmd_context {
    std::vector<llama_token> tok_sli_img_end;   // single slice end
    std::vector<llama_token> tok_sli_img_mid;   // between 2 slices
    std::vector<llama_token> tok_row_end;       // end of row
-    bool        tok_row_end_trail = false;
+    bool tok_row_end_trail = false;
-    bool        ov_img_first      = false;
+    bool ov_img_first      = false;
    // string template for slice image delimiters with row/col (idefics3)
    std::string sli_img_start_tmpl;
    std::unique_ptr<mtmd_audio_preprocessor> audio_preproc;
    std::unique_ptr<mtmd_image_preprocessor> image_preproc;
    // TODO @ngxson : add timings
@ -221,123 +223,193 @@ struct mtmd_context {
    void init_vision() {
        GGML_ASSERT(ctx_v != nullptr);
        image_preproc.reset();
        projector_type proj = clip_get_projector_type(ctx_v);
        int minicpmv_version = clip_is_minicpmv(ctx_v);
        if (minicpmv_version == 2) {
            // minicpmv 2.5 format:
            // <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
            slice_tmpl        = MTMD_SLICE_TMPL_MINICPMV_2_5;
            tok_ov_img_start  = {lookup_token("<image>")};
            tok_ov_img_end    = {lookup_token("</image>")};
            tok_slices_start  = {lookup_token("<slice>")};
            tok_slices_end    = {lookup_token("</slice>")};
            tok_sli_img_start = tok_ov_img_start;
            tok_sli_img_end   = tok_ov_img_end;
            tok_row_end       = {lookup_token("\n")};
            tok_row_end_trail = false; // no trailing end-of-row token
            ov_img_first      = true;
-        } else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5 || minicpmv_version == 6 || minicpmv_version == 100045) {
+        switch (proj) {
-            // minicpmv 2.6 format:
+            case PROJECTOR_TYPE_MLP:
-            // <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
+            case PROJECTOR_TYPE_MLP_NORM:
-            slice_tmpl        = MTMD_SLICE_TMPL_MINICPMV_2_6;
+            case PROJECTOR_TYPE_LDP:
-            tok_ov_img_start  = {lookup_token("<image>")};
+            case PROJECTOR_TYPE_LDPV2:
-            tok_ov_img_end    = {lookup_token("</image>")};
+            case PROJECTOR_TYPE_COGVLM:
-            tok_sli_img_start = {lookup_token("<slice>")};
+            case PROJECTOR_TYPE_JANUS_PRO:
-            tok_sli_img_end   = {lookup_token("</slice>")};
+            case PROJECTOR_TYPE_GLM_EDGE:
-            tok_row_end       = {lookup_token("\n")};
+                {
-            tok_row_end_trail = false; // no trailing end-of-row token
+                    bool has_pinpoints = !clip_get_hparams(ctx_v)->image_res_candidates.empty();
-            ov_img_first      = true;
+                    if (has_pinpoints) {
                        image_preproc = std::make_unique<mtmd_image_preprocessor_llava_uhd>(ctx_v);
                    } else {
                        image_preproc = std::make_unique<mtmd_image_preprocessor_fixed_size>(ctx_v);
                    }
                } break;
            case PROJECTOR_TYPE_MINICPMV:
                {
                    int minicpmv_version = clip_is_minicpmv(ctx_v);
                    if (minicpmv_version == 2) {
                        // minicpmv 2.5 format:
                        // <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
                        slice_tmpl        = MTMD_SLICE_TMPL_MINICPMV_2_5;
                        tok_ov_img_start  = {lookup_token("<image>")};
                        tok_ov_img_end    = {lookup_token("</image>")};
                        tok_slices_start  = {lookup_token("<slice>")};
                        tok_slices_end    = {lookup_token("</slice>")};
                        tok_sli_img_start = tok_ov_img_start;
                        tok_sli_img_end   = tok_ov_img_end;
                        tok_row_end       = {lookup_token("\n")};
                        tok_row_end_trail = false; // no trailing end-of-row token
                        ov_img_first      = true;
-        } else if (minicpmv_version != 0) {
+                    } else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5 || minicpmv_version == 6 || minicpmv_version == 100045) {
-            GGML_ASSERT(false && "unsupported minicpmv version");
+                        // minicpmv 2.6 format:
-        } else if (proj == PROJECTOR_TYPE_LLAMA4) {
+                        // <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
-            // llama 4 format:
+                        slice_tmpl        = MTMD_SLICE_TMPL_MINICPMV_2_6;
-            // <|image_start|>
+                        tok_ov_img_start  = {lookup_token("<image>")};
-            //     (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|>
+                        tok_ov_img_end    = {lookup_token("</image>")};
-            //     (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|>
+                        tok_sli_img_start = {lookup_token("<slice>")};
-            //     ... <|tile_y_separator|>   <-- trailing end-of-row token
+                        tok_sli_img_end   = {lookup_token("</slice>")};
-            // <|image|> (overview)           <-- overview image is last
+                        tok_row_end       = {lookup_token("\n")};
-            // <|image_end|>
+                        tok_row_end_trail = false; // no trailing end-of-row token
-            slice_tmpl        = MTMD_SLICE_TMPL_LLAMA4;
+                        ov_img_first      = true;
-            tok_ov_img_start  = {lookup_token("<|image|>")};
+
-            tok_sli_img_mid   = {lookup_token("<|tile_x_separator|>")};
+                    } else if (minicpmv_version != 0) {
-            tok_row_end       = {lookup_token("<|tile_y_separator|>")};
+                        throw std::runtime_error(string_format("unsupported minicpmv version: %d\n", minicpmv_version));
-            tok_row_end_trail = true; // add trailing end-of-row token
+                    }
-            ov_img_first      = false; // overview image is last
+                    image_preproc = std::make_unique<mtmd_image_preprocessor_llava_uhd>(ctx_v);
                } break;
            case PROJECTOR_TYPE_QWEN2VL:
            case PROJECTOR_TYPE_QWEN25VL:
            case PROJECTOR_TYPE_QWEN3VL:
                {
                    // <|vision_start|> ... (image embeddings) ... <|vision_end|>
                    img_beg = "<|vision_start|>";
                    img_end = "<|vision_end|>";
                    image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
                } break;
            case PROJECTOR_TYPE_YOUTUVL:
                {
                    // <|vision_start|> ... (image embeddings) ... <|vision_end|>
                    img_beg = "<|vision_start|>";
                    img_end = "<|vision_end|>";
                    image_preproc = std::make_unique<mtmd_image_preprocessor_youtuvl>(ctx_v);
                } break;
            case PROJECTOR_TYPE_GEMMA3:
            case PROJECTOR_TYPE_GEMMA3NV:
                {
                    // <start_of_image> ... (image embeddings) ... <end_of_image>
                    img_beg = "<start_of_image>";
                    img_end = "<end_of_image>";
                    image_preproc = std::make_unique<mtmd_image_preprocessor_fixed_size>(ctx_v);
                } break;
            case PROJECTOR_TYPE_IDEFICS3:
                {
                    // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
                    slice_tmpl         = MTMD_SLICE_TMPL_IDEFICS3;
                    tok_ov_img_start   = {lookup_token("\n\n"), lookup_token("<fake_token_around_image>"), lookup_token("<global-img>")};
                    tok_ov_img_end     = {lookup_token("<fake_token_around_image>")};
                    tok_row_end        = {lookup_token("\n")};
                    sli_img_start_tmpl = "<fake_token_around_image><row_%d_col_%d>";
                    image_preproc = std::make_unique<mtmd_image_preprocessor_idefics3>(ctx_v);
                } break;
            case PROJECTOR_TYPE_PIXTRAL:
                {
                    // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
                    img_end = "[IMG_END]";
                    image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
                } break;
            case PROJECTOR_TYPE_PHI4:
                {
                    // Phi-4 uses media marker insertion only. Keep image boundary text empty.
                    image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
                } break;
            case PROJECTOR_TYPE_LLAMA4:
                {
                    // (more details in mtmd_context constructor)
                    img_beg = "<|image_start|>";
                    img_end = "<|image_end|>";
                    LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
                            "    https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
                    image_preproc = std::make_unique<mtmd_image_preprocessor_llava_uhd>(ctx_v);
                } break;
            case PROJECTOR_TYPE_INTERNVL:
                {
                    // <img> ... (image embeddings) ... </img>
                    img_beg = "<img>";
                    img_end = "</img>";
                    image_preproc = std::make_unique<mtmd_image_preprocessor_internvl>(ctx_v);
                } break;
            case PROJECTOR_TYPE_KIMIVL:
                {
                    // <|media_start|> ... (image embeddings) ... <|media_end|>
                    img_beg = "<|media_start|>";
                    img_end = "<|media_end|>";
                    image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
                } break;
            case PROJECTOR_TYPE_KIMIK25:
                {
                    // <|media_begin|> ... (image embeddings) ... <|media_end|>
                    img_beg = "<|media_begin|>";
                    img_end = "<|media_end|>";
                    image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
                } break;
            case PROJECTOR_TYPE_LIGHTONOCR:
                {
                    // <|im_start|> ... (image embeddings) ... <|im_end|>
                    img_beg = "<|im_start|>";
                    img_end = "<|im_end|>";
                    image_preproc = std::make_unique<mtmd_image_preprocessor_longest_edge>(ctx_v);
                } break;
            case PROJECTOR_TYPE_NEMOTRON_V2_VL:
                {
                    image_preproc = std::make_unique<mtmd_image_preprocessor_fixed_size>(ctx_v);
                } break;
            case PROJECTOR_TYPE_LFM2:
                {
                    // multi-tile:
                    //   <|image_start|>
                    //     <|img_row_1_col_1|> (tile) <|img_row_1_col_2|> (tile) ...
                    //     <|img_thumbnail|> (thumbnail)
                    //   <|image_end|>
                    // single-tile:
                    //   <|image_start|> (image) <|image_end|>
                    img_beg            = "<|image_start|>";
                    img_end            = "<|image_end|>";
                    slice_tmpl         = MTMD_SLICE_TMPL_LFM2;
                    sli_img_start_tmpl = "<|img_row_%d_col_%d|>";
                    tok_ov_img_start   = {lookup_token("<|img_thumbnail|>")};
                    ov_img_first       = false;
                    image_preproc = std::make_unique<mtmd_image_preprocessor_lfm2>(ctx_v);
                } break;
            case PROJECTOR_TYPE_GLM4V:
                {
                    // <|begin_of_image|> ... (image embeddings) ... <|end_of_image|>
                    img_beg = "<|begin_of_image|>";
                    img_end = "<|end_of_image|>";
                    image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
                } break;
            case PROJECTOR_TYPE_PADDLEOCR:
                {
                    // <|IMAGE_START|> ... (image embeddings) ... <|IMAGE_END|>
                    img_beg = "<|IMAGE_START|>";
                    img_end = "<|IMAGE_END|>";
                    image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
                } break;
            case PROJECTOR_TYPE_DEEPSEEKOCR:
                {
                    img_end = "\n"; // prevent empty batch on llama-server
                    image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
                } break;
            default:
                throw std::runtime_error(string_format("%s: unexpected vision projector type %d\n", __func__, proj));
        }
-        // set boi/eoi
+        GGML_ASSERT(image_preproc != nullptr);
        if (proj == PROJECTOR_TYPE_GEMMA3 || proj == PROJECTOR_TYPE_GEMMA3NV) {
            // <start_of_image> ... (image embeddings) ... <end_of_image>
            img_beg = "<start_of_image>";
            img_end = "<end_of_image>";
        } else if (proj == PROJECTOR_TYPE_IDEFICS3) {
            // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
            slice_tmpl         = MTMD_SLICE_TMPL_IDEFICS3;
            tok_ov_img_start   = {lookup_token("\n\n"), lookup_token("<fake_token_around_image>"), lookup_token("<global-img>")};
            tok_ov_img_end     = {lookup_token("<fake_token_around_image>")};
            tok_row_end        = {lookup_token("\n")};
            sli_img_start_tmpl = "<fake_token_around_image><row_%d_col_%d>";
        } else if (proj == PROJECTOR_TYPE_PIXTRAL) {
            // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
            img_end = "[IMG_END]";
        } else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL || proj == PROJECTOR_TYPE_YOUTUVL) {
            // <|vision_start|> ... (image embeddings) ... <|vision_end|>
            img_beg = "<|vision_start|>";
            img_end = "<|vision_end|>";
        } else if (proj == PROJECTOR_TYPE_PHI4) {
            // Phi-4 uses media marker insertion only. Keep image boundary text empty.
        } else if (proj == PROJECTOR_TYPE_LLAMA4) {
            // (more details in mtmd_context constructor)
            img_beg = "<|image_start|>";
            img_end = "<|image_end|>";
            LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
                    "    https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
        } else if (proj == PROJECTOR_TYPE_INTERNVL) {
            // <img> ... (image embeddings) ... </img>
            img_beg = "<img>";
            img_end = "</img>";
        } else if (proj == PROJECTOR_TYPE_LIGHTONOCR) {
            // <|im_start|> ... (image embeddings) ... <|im_end|>
            img_beg = "<|im_start|>";
            img_end = "<|im_end|>";
        } else if (proj == PROJECTOR_TYPE_LFM2) {
            // multi-tile:
            //   <|image_start|>
            //     <|img_row_1_col_1|> (tile) <|img_row_1_col_2|> (tile) ...
            //     <|img_thumbnail|> (thumbnail)
            //   <|image_end|>
            // single-tile:
            //   <|image_start|> (image) <|image_end|>
            img_beg            = "<|image_start|>";
            img_end            = "<|image_end|>";
            slice_tmpl         = MTMD_SLICE_TMPL_LFM2;
            sli_img_start_tmpl = "<|img_row_%d_col_%d|>";
            tok_ov_img_start   = {lookup_token("<|img_thumbnail|>")};
            ov_img_first       = false;
        } else if (proj == PROJECTOR_TYPE_GLM4V) {
            img_beg = "<|begin_of_image|>";
            img_end = "<|end_of_image|>";
        } else if (proj == PROJECTOR_TYPE_PADDLEOCR) {
            // <|IMAGE_START|> ... (image embeddings) ... <|IMAGE_END|>
            img_beg = "<|IMAGE_START|>";
            img_end = "<|IMAGE_END|>";
        }
    }
    void init_audio() {
        GGML_ASSERT(ctx_a != nullptr);
        audio_preproc.reset();
        projector_type proj = clip_get_projector_type(ctx_a);
        LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
@ -347,36 +419,40 @@ struct mtmd_context {
        switch (proj) {
            case PROJECTOR_TYPE_QWEN2A:
            case PROJECTOR_TYPE_QWEN25O:
-            case PROJECTOR_TYPE_ULTRAVOX:
+                {
                    // <|audio_bos|> ... (embeddings) ... <|audio_eos|>
                    aud_beg = "<|audio_bos|>";
                    aud_end = "<|audio_eos|>";
                    audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
                } break;
            case PROJECTOR_TYPE_VOXTRAL:
-            case PROJECTOR_TYPE_GLMA:
+                {
                    // [BEGIN_AUDIO] ... (embeddings) ...
                    aud_beg = "[BEGIN_AUDIO]";
                    audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
                } break;
            case PROJECTOR_TYPE_MUSIC_FLAMINGO:
-                audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
+                {
-                break;
+                    // <sound> ... (embeddings) ...
                    aud_beg = "<sound>";
                    audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
                } break;
            case PROJECTOR_TYPE_ULTRAVOX:
            case PROJECTOR_TYPE_GLMA:
                {
                    audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
                } break;
            case PROJECTOR_TYPE_LFM2A:
-                audio_preproc = std::make_unique<mtmd_audio_preprocessor_conformer>(ctx_a);
+                {
-                break;
+                    audio_preproc = std::make_unique<mtmd_audio_preprocessor_conformer>(ctx_a);
                } break;
            default:
-                GGML_ABORT("unsupported audio projector type");
+                throw std::runtime_error(string_format("%s: unexpected audio projector type %d\n", __func__, proj));
        }
        // initialize audio preprocessor
        GGML_ASSERT(audio_preproc != nullptr);
        audio_preproc->initialize();
        // set special tokens
        if (proj == PROJECTOR_TYPE_QWEN2A) {
            // <|audio_bos|> ... (embeddings) ... <|audio_eos|>
            aud_beg = "<|audio_bos|>";
            aud_end = "<|audio_eos|>";
        } else if (proj == PROJECTOR_TYPE_ULTRAVOX) {
            // [BEGIN_AUDIO] ... (embeddings) ...
            aud_beg = "[BEGIN_AUDIO]";
        } else if (proj == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
            // <sound> ... (embeddings) ...
            aud_beg = "<sound>";
        }
    }
    // get clip ctx based on chunk type
@ -573,8 +649,9 @@ struct mtmd_tokenizer {
            std::memcpy(img_u8->buf.data(), bitmap->data.data(), img_u8->nx * img_u8->ny * 3);
            // preprocess image
            GGML_ASSERT(ctx->image_preproc != nullptr);
            clip_image_f32_batch batch_f32;
-            bool ok = clip_image_preprocess(ctx->ctx_v, img_u8.get(), &batch_f32);
+            bool ok = ctx->image_preproc->preprocess(*img_u8, batch_f32);
            if (!ok) {
                LOG_ERR("Unable to preprocess image\n");
                return 2;
@ -1225,7 +1302,8 @@ void mtmd_debug_preprocess_image(mtmd_context * ctx, const std::vector<uint8_t>
    img_u8.ny = ny;
    img_u8.buf = rgb_values;
    clip_image_f32_batch batch_f32;
-    bool ok = clip_image_preprocess(ctx->ctx_v, &img_u8, &batch_f32);
+    GGML_ASSERT(ctx->image_preproc != nullptr);
    bool ok = ctx->image_preproc->preprocess(img_u8, batch_f32);
    if (!ok) {
        LOG_ERR("%s: failed to preprocess image\n", __func__);
        return;