From 09343c01986667b26d414733950c9d38e91c3065 Mon Sep 17 00:00:00 2001
From: forforever73 <63285796+forforever73@users.noreply.github.com>
Date: Wed, 8 Apr 2026 15:51:31 +0800
Subject: [PATCH] model : support step3-vl-10b (#21287)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat: support step3-vl-10b

* use fused QKV && mapping tensor in tensor_mapping.py

* guard hardcoded params and drop crop metadata

* get understand_projector_stride from global config

* move img_u8_resize_bilinear_to_f32 into step3vl class

* Apply suggestions from code review

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* fix the \r\n mess

* add width and heads to MmprojModel.set_gguf_parameters

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
---
 convert_hf_to_gguf.py          |  87 ++++++++++-
 gguf-py/gguf/constants.py      |   4 +
 gguf-py/gguf/tensor_mapping.py |  11 ++
 tools/mtmd/CMakeLists.txt      |   1 +
 tools/mtmd/clip-impl.h         |   2 +
 tools/mtmd/clip-model.h        |   1 -
 tools/mtmd/clip.cpp            |  47 ++++++
 tools/mtmd/models/models.h     |   5 +
 tools/mtmd/models/step3vl.cpp  |  81 +++++++++++
 tools/mtmd/mtmd-image.cpp      | 254 +++++++++++++++++++++++++++++++++
 tools/mtmd/mtmd-image.h        |  29 ++++
 tools/mtmd/mtmd.cpp            |  19 ++-
 12 files changed, 537 insertions(+), 4 deletions(-)
 create mode 100644 tools/mtmd/models/step3vl.cpp

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 09f6e7ae29..adce4f8390 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -2219,10 +2219,10 @@ class MmprojModel(ModelBase):
             self.image_size = self.find_vparam(["image_size"])
             self.gguf_writer.add_vision_image_size(self.image_size)
             self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"]))
-            self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size", "vt_hidden_size"]))
+            self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size", "width", "vt_hidden_size"]))
             self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size", "vt_intermediate_size"]))
             self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys))
-            self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads", "num_heads", "vt_num_attention_heads"]))
+            self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads", "num_heads", "heads", "vt_num_attention_heads"]))
 
             # preprocessor config
             image_mean = _MISTRAL_COMMON_DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"]
@@ -4949,6 +4949,73 @@ class Glm4VVisionModel(Qwen3VLVisionModel):
         yield from super().modify_tensors(data_torch, name, bid)
 
 
+# Step3-VL vision encoder + projector conversion (the MmprojModel half of StepVLForConditionalGeneration)
+@ModelBase.register("StepVLForConditionalGeneration")
+class Step3VLVisionModel(MmprojModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.hparams_vision is not None
+        # the vision config may omit intermediate_size: derive it as hidden size * mlp_ratio
+        if not self.hparams_vision.get("intermediate_size"):
+            hidden_size = self.hparams_vision.get("hidden_size") or self.hparams_vision.get("width") or 0
+            assert hidden_size > 0
+            mlp_ratio = float(self.hparams_vision.get("mlp_ratio", 8960 / 1536))
+            self.hparams_vision["intermediate_size"] = int(round(hidden_size * mlp_ratio))
+        # default image_mean / image_std to the _MISTRAL_COMMON_DATASET_* constants when absent
+        self.preprocessor_config.setdefault("image_mean", list(_MISTRAL_COMMON_DATASET_MEAN))
+        self.preprocessor_config.setdefault("image_std", list(_MISTRAL_COMMON_DATASET_STD))
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        assert self.hparams_vision is not None
+        # hard guard on (stride, image_size, width, layers); raised explicitly so `python -O` cannot strip it
+        projector_stride = int(self.global_config.get("understand_projector_stride", -1))
+        hidden_size = int(self.hparams_vision.get("hidden_size", self.hparams_vision.get("width", -1)))
+        num_layers = int(self.hparams_vision.get("num_hidden_layers", self.hparams_vision.get("layers", -1)))
+        if (projector_stride, int(self.hparams_vision.get("image_size", -1)), hidden_size, num_layers) != (2, 728, 1536, 47):
+            raise ValueError("current Step3-VL conversion path is only validated for Step3-VL-10B")
+
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.STEP3VL)
+        self.gguf_writer.add_vision_attention_layernorm_eps(float(self.hparams_vision.get("layer_norm_eps", 1e-5)))
+        self.gguf_writer.add_vision_projector_scale_factor(projector_stride ** 2)
+        # 3024 max resize comes from step3-vl-10b processing_step3.py.
+        self.gguf_writer.add_vision_preproc_image_size(3024)
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        # position embedding tensors are always stored as F32; everything else follows the base policy
+        if ".position_embd." in new_name:
+            return gguf.GGMLQuantizationType.F32
+        return super().tensor_force_quant(name, new_name, bid, n_dims)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # language-model tensors (model.* / lm_head.*) are skipped by this converter
+        if name.startswith(("model.", "lm_head.")):
+            return
+
+        if name.startswith("vision_model.vit_downsampler"):
+            # fullmatch: reject names with unexpected trailing components instead of silently mapping them
+            match = re.fullmatch(r"vision_model\.vit_downsampler(\d+)\.(weight|bias)", name)
+            if match is None:
+                raise ValueError(f"Unexpected Step3-VL projector tensor {name!r}")
+            proj_id = int(match.group(1)) - 1  # vit_downsamplerN maps to projector index N-1
+            suffix = f".{match.group(2)}"
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, proj_id, suffix=suffix), data_torch)
+            return
+
+        if name == "vit_large_projector.weight":
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ_FC), data_torch)
+            return
+
+        if name.startswith("vision_model."):
+            if name == "vision_model.positional_embedding":
+                name += ".weight"
+            elif name.endswith(".gamma") and ".ls_" in name:
+                name = name.removesuffix(".gamma") + ".weight"
+            name = name.replace("attn.in_proj_weight", "attn.in_proj.weight")  # give the fused QKV names a
+            name = name.replace("attn.in_proj_bias", "attn.in_proj.bias")  # ".weight" / ".bias" suffix
+            yield from super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("Qwen3VLForConditionalGeneration")
 class Qwen3VLTextModel(Qwen3Model):
     model_arch = gguf.MODEL_ARCH.QWEN3VL
@@ -4969,6 +5036,16 @@ class Qwen3VLTextModel(Qwen3Model):
         yield from super().modify_tensors(data_torch, name, bid)
 
 
+@ModelBase.register("StepVLForConditionalGeneration")
+class Step3VLTextModel(Qwen3Model):
+    model_arch = gguf.MODEL_ARCH.QWEN3  # the text side is written out with the QWEN3 architecture id
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # vision tower / projector tensors yield nothing here; every other tensor goes to the Qwen3 path
+        if not name.startswith(("vision_model.", "model.vision_model.", "vit_large_projector.")):
+            yield from super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("Qwen3VLMoeForConditionalGeneration")
 class Qwen3VLMoeTextModel(Qwen3MoeModel):
     model_arch = gguf.MODEL_ARCH.QWEN3VLMOE
@@ -12994,6 +13071,12 @@ def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> st
         # For non-hf Mamba and Mamba2 models
         arch = hparams["ssm_cfg"].get("layer", "Mamba") + "ForCausalLM"
 
+    # Step3-VL keeps text config under text_config but uses a custom top-level architecture.
+    # For text conversion we route to a dedicated text-only class.
+    # TODO: refactor this later to avoid adding exception here
+    if model_type == ModelType.TEXT and arch == "StepVLForConditionalGeneration":
+        return arch
+
     # if "architectures" is found in the sub-config, use that instead
     if model_type == ModelType.TEXT and text_config.get("architectures") is not None:
         arch = text_config["architectures"][0]
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 0cdd1c471d..6760296b22 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -506,6 +506,7 @@ class VISION_PROJECTOR_TYPE(IntEnum):
     GEMMA3N   = auto()
     GEMMA3    = auto()
     QWEN3VL   = auto()
+    STEP3VL   = auto()
     COGVLM    = auto()
 
 
@@ -987,6 +988,8 @@ VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
     VISION_PROJECTOR_TYPE.GLM_EDGE:  "adapter",
     VISION_PROJECTOR_TYPE.MERGER:    "qwen2vl_merger",
     VISION_PROJECTOR_TYPE.GEMMA3:    "gemma3",
+    VISION_PROJECTOR_TYPE.QWEN3VL:   "qwen3vl_merger",
+    VISION_PROJECTOR_TYPE.STEP3VL:   "step3vl",
 }
 
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -4105,6 +4108,7 @@ class VisionProjectorType:
     QWEN2VL = "qwen2vl_merger"
     QWEN25VL = "qwen2.5vl_merger"
     QWEN3VL = "qwen3vl_merger"
+    STEP3VL = "step3vl"
     ULTRAVOX = "ultravox"
     INTERNVL = "internvl"
     QWEN2A = "qwen2a" # audio
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 9c713456ed..a2aa139de1 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -1406,6 +1406,7 @@ class TensorNameMap:
             "siglip2.vision_model.embeddings.patch_embedding",
             "vision_model.radio_model.model.patch_generator.embedder", # Nemotron Nano v2 VL
             "model.vision_tower.patch_embedder.input_proj", # gemma4
+            "vision_model.conv1", # Step3-VL
         ),
 
         MODEL_TENSOR.V_ENC_EMBD_NORM: (
@@ -1425,6 +1426,7 @@ class TensorNameMap:
             "visual.embeddings.position_embedding", # glm4v
             "vision_model.radio_model.model.patch_generator.pos_embed", # Nemotron Nano v2 VL
             "model.vision_tower.patch_embedder.position_embedding_table", # gemma4
+            "vision_model.positional_embedding", # Step3-VL
         ),
 
         MODEL_TENSOR.V_ENC_EMBD_IMGNL: (
@@ -1443,6 +1445,7 @@ class TensorNameMap:
             "model.vision_model.transformer.layers.{bid}.self_attn.qkv_proj", # Deepseek-OCR CLIP
             "vision_tower.encoder.blocks.{bid}.wqkv", # Kimi-K2.5
             "vision_model.radio_model.model.blocks.{bid}.attn.qkv", # Nemotron Nano v2 VL
+            "vision_model.transformer.resblocks.{bid}.attn.in_proj", # Step3-VL
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_Q: (
@@ -1523,6 +1526,7 @@ class TensorNameMap:
             "model.vision_model.transformer.layers.{bid}.layer_norm1", # Deepseek-OCR CLIP
             "siglip2.vision_model.encoder.layers.{bid}.layer_norm1",
             "vision_model.radio_model.model.blocks.{bid}.norm1", # Nemotron Nano v2 VL
+            "vision_model.transformer.resblocks.{bid}.ln_1", # Step3-VL
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_O: (
@@ -1543,6 +1547,7 @@ class TensorNameMap:
             "siglip2.vision_model.encoder.layers.{bid}.self_attn.out_proj", # youtuvl
             "vision_model.radio_model.model.blocks.{bid}.attn.proj", # Nemotron Nano v2 VL
             "vision_model.model.layers.{bid}.self_attn.o_proj.linear", # gemma4
+            "vision_model.transformer.resblocks.{bid}.attn.out_proj", # Step3-VL
         ),
 
         MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (
@@ -1562,6 +1567,7 @@ class TensorNameMap:
             "siglip2.vision_model.encoder.layers.{bid}.layer_norm2",
             "vision_model.radio_model.model.blocks.{bid}.norm2", # Nemotron Nano v2 VL
             "vision_model.model.layers.{bid}.pre_feedforward_layernorm", # gemma4
+            "vision_model.transformer.resblocks.{bid}.ln_2", # Step3-VL
         ),
 
         MODEL_TENSOR.V_ENC_FFN_UP: (
@@ -1582,6 +1588,7 @@ class TensorNameMap:
             "siglip2.vision_model.encoder.layers.{bid}.mlp.fc1",
             "vision_model.radio_model.model.blocks.{bid}.mlp.fc1", # Nemotron Nano v2 VL
             "vision_model.model.layers.{bid}.mlp.up_proj", # gemma4
+            "vision_model.transformer.resblocks.{bid}.mlp.c_fc", # Step3-VL
         ),
 
         MODEL_TENSOR.V_ENC_FFN_GATE: (
@@ -1609,6 +1616,7 @@ class TensorNameMap:
             "siglip2.vision_model.encoder.layers.{bid}.mlp.fc2",
             "vision_model.radio_model.model.blocks.{bid}.mlp.fc2", # Nemotron Nano v2 VL
             "vision_model.model.layers.{bid}.mlp.down_proj", # gemma4
+            "vision_model.transformer.resblocks.{bid}.mlp.c_proj", # Step3-VL
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_POST_NORM: (
@@ -1622,11 +1630,13 @@ class TensorNameMap:
         MODEL_TENSOR.V_LAYER_SCALE_1: (
             "vision_tower.vision_model.encoder.layers.{bid}.ls1", # InternVL
             "model.vision_tower.encoder.layer.{bid}.lambda_1", # Intern-S1
+            "vision_model.transformer.resblocks.{bid}.ls_1", # Step3-VL
         ),
 
         MODEL_TENSOR.V_LAYER_SCALE_2: (
             "vision_tower.vision_model.encoder.layers.{bid}.ls2", # InternVL
             "model.vision_tower.encoder.layer.{bid}.lambda_2", # Intern-S1
+            "vision_model.transformer.resblocks.{bid}.ls_2", # Step3-VL
         ),
 
         MODEL_TENSOR.V_LAYER_OUT_SCALE: (
@@ -1639,6 +1649,7 @@ class TensorNameMap:
             "vision_encoder.ln_pre", # pixtral
             "vision_model.layernorm_pre", # llama4
             "model.vision_model.pre_layrnorm", # Deepseek-OCR CLIP
+            "vision_model.ln_pre", # Step3-VL
         ),
 
         MODEL_TENSOR.V_POST_NORM: (
diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt
index 6ffdb674de..151c15d704 100644
--- a/tools/mtmd/CMakeLists.txt
+++ b/tools/mtmd/CMakeLists.txt
@@ -31,6 +31,7 @@ add_library(mtmd
             models/pixtral.cpp
             models/qwen2vl.cpp
             models/qwen3vl.cpp
+            models/step3vl.cpp
             models/siglip.cpp
             models/whisper-enc.cpp
             models/deepseekocr.cpp
diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
index 81b92841ca..0c3e60e1a8 100644
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -242,6 +242,7 @@ enum projector_type {
     PROJECTOR_TYPE_GLM_EDGE,
     PROJECTOR_TYPE_QWEN2VL,
     PROJECTOR_TYPE_QWEN3VL,
+    PROJECTOR_TYPE_STEP3VL,
     PROJECTOR_TYPE_GEMMA3,
     PROJECTOR_TYPE_GEMMA3NV,
     PROJECTOR_TYPE_GEMMA3NA,
@@ -284,6 +285,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_QWEN2VL,   "qwen2vl_merger"},
     { PROJECTOR_TYPE_QWEN25VL,  "qwen2.5vl_merger"},
     { PROJECTOR_TYPE_QWEN3VL,   "qwen3vl_merger"},
+    { PROJECTOR_TYPE_STEP3VL,   "step3vl"},
     { PROJECTOR_TYPE_GEMMA3,    "gemma3"},
     { PROJECTOR_TYPE_GEMMA3NV,  "gemma3nv"},
     { PROJECTOR_TYPE_GEMMA3NA,  "gemma3na"},
diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h
index b85c4122ed..b2cd27dcbf 100644
--- a/tools/mtmd/clip-model.h
+++ b/tools/mtmd/clip-model.h
@@ -79,7 +79,6 @@ struct clip_hparams {
 
     float eps = 1e-6;
     float rope_theta = 0.0;
-
     std::unordered_set<int32_t> vision_feature_layer;
     int32_t attn_window_size = 0;
     int32_t n_wa_pattern = 0;
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 2faf595a9f..9c886bc890 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -862,6 +862,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             {
                 builder = std::make_unique<clip_graph_qwen3vl>(ctx, img);
             } break;
+        case PROJECTOR_TYPE_STEP3VL:
+            {
+                builder = std::make_unique<clip_graph_step3vl>(ctx, img);
+            } break;
         case PROJECTOR_TYPE_MINICPMV:
             {
                 builder = std::make_unique<clip_graph_minicpmv>(ctx, img);
@@ -1337,6 +1341,17 @@ struct clip_model_loader {
                             LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
                         }
                     } break;
+                case PROJECTOR_TYPE_STEP3VL:
+                    {
+                        hparams.n_merge = 4; // two stride-2 downsamplers after patching
+                        get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
+                        hparams.rope_theta = 10000.0f;
+                        get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.image_longest_edge, false);
+                        if (hparams.image_longest_edge == 0) {
+                            hparams.image_longest_edge = 3024;
+                        }
+                        hparams.warmup_image_size = hparams.image_size;
+                    } break;
                 case PROJECTOR_TYPE_YOUTUVL:
                     {
                         hparams.n_merge = 2;
@@ -1769,6 +1784,14 @@ struct clip_model_loader {
                     model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
                     model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
                 } break;
+            case PROJECTOR_TYPE_STEP3VL:
+                {
+                    model.mm_0_w     = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
+                    model.mm_0_b     = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"), false);
+                    model.mm_1_w     = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
+                    model.mm_1_b     = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
+                    model.mm_model_proj = get_tensor(string_format(TN_MM_PROJECTOR, "weight"));
+                } break;
             case PROJECTOR_TYPE_YOUTUVL:
                 {
                     model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);        // merger.ln_q (RMS norm)
@@ -2615,6 +2638,8 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
         case PROJECTOR_TYPE_HUNYUANOCR:
         case PROJECTOR_TYPE_YOUTUVL:
             return (img->nx / params.patch_size) / 2;
+        case PROJECTOR_TYPE_STEP3VL:
+            return img->nx / (params.patch_size * params.n_merge);
         default:
             break;
     }
@@ -2632,6 +2657,8 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
         case PROJECTOR_TYPE_PADDLEOCR:
         case PROJECTOR_TYPE_YOUTUVL:
             return (img->ny / params.patch_size) / 2;
+        case PROJECTOR_TYPE_STEP3VL:
+            return img->ny / (params.patch_size * params.n_merge);
         default:
             break;
     }
@@ -2702,6 +2729,12 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
                 int y_patch = img->ny / (params.patch_size * 2);
                 n_patches = x_patch * y_patch;
             } break;
+        case PROJECTOR_TYPE_STEP3VL:
+            {
+                int x_patch = img->nx / (params.patch_size * params.n_merge);
+                int y_patch = img->ny / (params.patch_size * params.n_merge);
+                n_patches = x_patch * y_patch;
+            } break;
         case PROJECTOR_TYPE_GEMMA3:
         case PROJECTOR_TYPE_GEMMA4V:
         case PROJECTOR_TYPE_IDEFICS3:
@@ -3004,6 +3037,18 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
 
                 set_input_i32("positions", positions);
             } break;
+        case PROJECTOR_TYPE_STEP3VL:
+            {
+                std::vector<int32_t> pos_data(n_pos);
+                for (int i = 0; i < n_pos; i++) {
+                    pos_data[i] = i / pos_w;
+                }
+                set_input_i32("pos_h", pos_data);
+                for (int i = 0; i < n_pos; i++) {
+                    pos_data[i] = i % pos_w;
+                }
+                set_input_i32("pos_w", pos_data);
+            } break;
         case PROJECTOR_TYPE_PADDLEOCR:
             {
                 const int merge_ratio = hparams.n_merge;
@@ -3358,6 +3403,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         case PROJECTOR_TYPE_QWEN3VL:
             // main path + deepstack paths
             return ctx->model.mm_1_b->ne[0] * (1 + ctx->model.n_deepstack_layers);
+        case PROJECTOR_TYPE_STEP3VL:
+            return ctx->model.mm_model_proj->ne[1];
         case PROJECTOR_TYPE_GEMMA3:
         case PROJECTOR_TYPE_GEMMA3NV:
             return ctx->model.mm_input_proj_w->ne[0];
diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h
index 6f9632b62a..47e2cde2b9 100644
--- a/tools/mtmd/models/models.h
+++ b/tools/mtmd/models/models.h
@@ -33,6 +33,11 @@ struct clip_graph_qwen3vl : clip_graph {
     ggml_cgraph * build() override;
 };
 
+struct clip_graph_step3vl : clip_graph { // graph builder selected for PROJECTOR_TYPE_STEP3VL
+    clip_graph_step3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+    ggml_cgraph * build() override; // implemented in models/step3vl.cpp
+};
+
 struct clip_graph_youtuvl : clip_graph {
     clip_graph_youtuvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
     ggml_cgraph * build() override;
diff --git a/tools/mtmd/models/step3vl.cpp b/tools/mtmd/models/step3vl.cpp
new file mode 100644
index 0000000000..5142b0bba3
--- /dev/null
+++ b/tools/mtmd/models/step3vl.cpp
@@ -0,0 +1,81 @@
+#include "models.h"
+
+ggml_cgraph * clip_graph_step3vl::build() {
+    GGML_ASSERT(model.class_embedding == nullptr); // the model must not carry a class embedding
+    GGML_ASSERT(model.patch_embeddings_0 != nullptr);
+    GGML_ASSERT(model.position_embeddings != nullptr);
+    // Graph outline: build_vit encoder -> two stride-2 conv downsamplers -> linear projector.
+    norm_type norm_t = NORM_TYPE_NORMAL;
+    // Per-patch index inputs; the encoder fills "pos_h" with i / pos_w and "pos_w" with i % pos_w.
+    ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
+    ggml_set_name(pos_h, "pos_h");
+    ggml_set_input(pos_h);
+
+    ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
+    ggml_set_name(pos_w, "pos_w");
+    ggml_set_input(pos_w);
+
+    ggml_tensor * inp = build_inp();
+    ggml_tensor * learned_pos_embd = resize_position_embeddings();
+    // Position callback handed to build_vit: applies build_rope_2d using the pos_w / pos_h inputs.
+    auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
+        return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
+    };
+    // Adds an optional bias to a [w, h, c] conv output (no-op when the bias tensor is absent).
+    auto add_spatial_bias = [&](ggml_tensor * cur, ggml_tensor * bias) {
+        if (bias == nullptr) {
+            return cur;
+        }
+
+        const int64_t width    = cur->ne[0];
+        const int64_t height   = cur->ne[1];
+        const int64_t channels = cur->ne[2];
+        // [w, h, c] -> [w*h, c] -> transposed to [c, w*h] for the add (assumes bias has c elements), then back.
+        cur = ggml_reshape_2d(ctx0, cur, width * height, channels);
+        cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
+        cur = ggml_add(ctx0, cur, bias);
+        cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
+        cur = ggml_reshape_3d(ctx0, cur, width, height, channels);
+
+        return cur;
+    };
+
+    ggml_tensor * cur = build_vit(
+        inp,
+        n_patches,
+        norm_t,
+        hparams.ffn_op,
+        learned_pos_embd,
+        add_pos);
+    cb(cur, "vit_out", -1);
+
+    // [n_embd, n_patches] -> [w, h, n_embd] for spatial downsampling convolutions.
+    cur = ggml_permute(ctx0, cur, 1, 0, 2, 3);
+    cur = ggml_cont_3d(ctx0, cur, n_patches_x, n_patches_y, n_embd);
+
+    // First downsampler: Conv2d(1536 -> 3072, k=3, s=2, p=1)
+    cur = ggml_conv_2d(ctx0, model.mm_0_w, cur, 2, 2, 1, 1, 1, 1); // stride 2x2, padding 1x1, dilation 1x1
+    cur = add_spatial_bias(cur, model.mm_0_b);
+    cb(cur, "downsample_0", -1);
+
+    // Second downsampler: Conv2d(3072 -> 6144, k=3, s=2, p=1)
+    cur = ggml_conv_2d(ctx0, model.mm_1_w, cur, 2, 2, 1, 1, 1, 1); // stride 2x2, padding 1x1, dilation 1x1
+    cur = add_spatial_bias(cur, model.mm_1_b);
+    cb(cur, "downsample_1", -1);
+
+    // [w, h, c] -> [c, w*h]
+    {
+        const int64_t w = cur->ne[0];
+        const int64_t h = cur->ne[1];
+        cur = ggml_reshape_3d(ctx0, cur, w * h, cur->ne[2], cur->ne[3]);
+        cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 0, 2, 3));
+    }
+    cb(cur, "downsample_flatten", -1);
+
+    // Final projector: Linear(6144 -> projection_dim)
+    cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
+    cb(cur, "projector_out", -1);
+
+    ggml_build_forward_expand(gf, cur);
+    return gf;
+}
diff --git a/tools/mtmd/mtmd-image.cpp b/tools/mtmd/mtmd-image.cpp
index a2166622b7..4f4eb5da69 100644
--- a/tools/mtmd/mtmd-image.cpp
+++ b/tools/mtmd/mtmd-image.cpp
@@ -1114,6 +1114,260 @@ bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img,
     return true;
 }
 
+//
+// mtmd_image_preprocessor_step3vl
+//
+
+void mtmd_image_preprocessor_step3vl::img_u8_resize_bilinear_to_f32(
+        const clip_image_u8 & src,
+        clip_image_f32 & dst,
+        int target_width,
+        int target_height,
+        const float mean[3],
+        const float std[3]) {
+    // Same-size input needs no resampling: plain u8 -> normalized f32 conversion.
+    if (src.nx == target_width && src.ny == target_height) {
+        img_u8_to_f32(src, dst, mean, std);
+        return;
+    }
+    dst.nx = target_width;
+    dst.ny = target_height;
+    dst.buf.resize(3 * target_width * target_height);
+    // Source pixels per destination pixel along each axis.
+    const float step_x = static_cast<float>(src.nx) / target_width;
+    const float step_y = static_cast<float>(src.ny) / target_height;
+    // Clamp a sample coordinate into [0, n - 1] so border samples reuse the edge pixel.
+    const auto clamp_index = [](int v, int n) { return std::max(0, std::min(v, n - 1)); };
+    const auto normalized = [&](size_t base, int ch) { // (value / 255 - mean) / std for one channel
+        return (static_cast<float>(src.buf[base + ch]) / 255.0f - mean[ch]) / std[ch];
+    };
+    for (int row = 0; row < target_height; ++row) {
+        const float fy = (static_cast<float>(row) + 0.5f) * step_y - 0.5f; // pixel-centre mapping
+        const int   iy = static_cast<int>(std::floor(fy));
+        const int   r0 = clamp_index(iy, src.ny);
+        const int   r1 = clamp_index(iy + 1, src.ny);
+        const float wy = fy - iy;
+        for (int col = 0; col < target_width; ++col) {
+            const float fx = (static_cast<float>(col) + 0.5f) * step_x - 0.5f;
+            const int   ix = static_cast<int>(std::floor(fx));
+            const int   c0 = clamp_index(ix, src.nx);
+            const int   c1 = clamp_index(ix + 1, src.nx);
+            const float wx = fx - ix;
+            const size_t top_left     = 3 * (r0 * src.nx + c0);
+            const size_t top_right    = 3 * (r0 * src.nx + c1);
+            const size_t bottom_left  = 3 * (r1 * src.nx + c0);
+            const size_t bottom_right = 3 * (r1 * src.nx + c1);
+            const size_t out          = 3 * (row * target_width + col);
+            for (int ch = 0; ch < 3; ++ch) {
+                const float v_tl = normalized(top_left, ch);
+                const float v_tr = normalized(top_right, ch);
+                const float v_bl = normalized(bottom_left, ch);
+                const float v_br = normalized(bottom_right, ch);
+                const float upper = v_tl + (v_tr - v_tl) * wx; // blend along x on each source row ...
+                const float lower = v_bl + (v_br - v_bl) * wx;
+                dst.buf[out + ch] = upper + (lower - upper) * wy; // ... then along y
+            }
+        }
+    }
+}
+
+int mtmd_image_preprocessor_step3vl::get_image_longest_edge(const clip_hparams & params) {
+    return params.image_longest_edge <= 0 ? default_image_longest_edge : params.image_longest_edge; // non-positive -> built-in default
+}
+
+int mtmd_image_preprocessor_step3vl::determine_window_size(const clip_hparams & params, int longer, int shorter) {
+    // Returns the square crop-window edge; build_slice_instructions treats a result <= 0 as "no crops".
+    const int   overview_edge = params.image_size;
+    const float ratio         = static_cast<float>(longer) / shorter;
+    if (longer > overview_edge) {
+        const int crop_edge = default_image_crop_size; // local copy so std::min can bind a reference
+        return ratio > wide_aspect_ratio_limit ? std::min(shorter, crop_edge) : crop_edge;
+    }
+    // Long edge fits within image_size: only elongated inputs get crops, windowed by the short edge.
+    return ratio > small_aspect_ratio_limit ? shorter : 0;
+}
+
+int mtmd_image_preprocessor_step3vl::calc_crop_extent(int length, int window_size) {
+    // Snaps `length` to a whole number of windows; a length shorter than one window is returned as is.
+    const float n_windows = static_cast<float>(length) / window_size;
+    if (n_windows < 1.0f) {
+        return length;
+    }
+
+    // Round up only when the fractional window count exceeds crop_rounding_threshold, else round down.
+    const float whole = std::floor(n_windows);
+    const int   extra = (n_windows - whole > crop_rounding_threshold) ? 1 : 0;
+    return window_size * (static_cast<int>(whole) + extra);
+}
+
+std::vector<int> mtmd_image_preprocessor_step3vl::calc_grid(int length, int window_size) {
+    // Start offsets of the windows laid out along one axis of extent `length`.
+    int count = 1;
+    if (length > window_size) {
+        count = static_cast<int>(std::ceil(static_cast<float>(length - window_size) / window_size + 1.0f));
+    }
+    std::vector<int> offsets(count);
+    for (int k = 0; k < count; ++k) {
+        offsets[k] = window_size * k;
+    }
+    // Pull the last window back so it ends exactly at `length` instead of running past it.
+    if (count > 1 && offsets.back() + window_size > length) {
+        offsets.back() = length - window_size;
+    }
+    return offsets;
+}
+
+clip_image_u8 mtmd_image_preprocessor_step3vl::prepare_image(const clip_image_u8 & img, const clip_hparams & params) {
+    // Step 1: inputs that are both tiny (short edge < 32) and very elongated are placed on a black square.
+    clip_image_u8 out = img;
+    const float ar = img.ny > 0 ? static_cast<float>(img.nx) / img.ny : 1.0f;
+    const bool is_tiny      = std::min(img.nx, img.ny) < 32;
+    const bool is_elongated = ar > wide_aspect_ratio_limit || ar < 1.0f / wide_aspect_ratio_limit;
+    if (is_tiny && is_elongated) {
+        const int side = std::max(img.nx, img.ny);
+        clip_image_u8 canvas;
+        canvas.nx = side;
+        canvas.ny = side;
+        canvas.buf.resize(3 * side * side);
+        img_tool::fill(canvas, {0, 0, 0});
+        img_tool::composite(canvas, img, 0, 0);
+        out = std::move(canvas);
+    }
+    // Step 2: cap the longest edge; both edges use the same factor and are floored, but never below 1.
+    const int edge_limit = get_image_longest_edge(params);
+    if (std::max(out.nx, out.ny) > edge_limit) {
+        const float factor = static_cast<float>(edge_limit) / std::max(out.nx, out.ny);
+        const clip_image_size target = {
+            std::max(1, static_cast<int>(std::floor(out.nx * factor))),
+            std::max(1, static_cast<int>(std::floor(out.ny * factor))),
+        };
+        clip_image_u8 shrunk;
+        img_tool::resize(out, shrunk, target, RESIZE_ALGO_BILINEAR, false);
+        out = std::move(shrunk);
+    }
+    return out;
+}
+
+clip_image_u8 mtmd_image_preprocessor_step3vl::crop_with_black_padding(const clip_image_u8 & image, int x, int y, int w, int h) {
+    // Copies the w x h window at (x, y) out of `image`; pixels outside the source stay zero (black).
+    clip_image_u8 out;
+    out.nx = w;
+    out.ny = h;
+    out.buf.resize(3 * w * h, 0);
+
+    // Intersection of the requested window with the source bounds.
+    const int left   = std::max(0, x);
+    const int top    = std::max(0, y);
+    const int right  = std::min(image.nx, x + w);
+    const int bottom = std::min(image.ny, y + h);
+    if (left >= right || top >= bottom) {
+        return out; // no overlap: the crop is padding only
+    }
+
+    // Position of the overlapping region inside the crop, and its width in pixels.
+    const int off_x = left - x;
+    const int off_y = top - y;
+    const int span  = right - left;
+
+    for (int row = top; row < bottom; ++row) {
+        const int src_idx = 3 * (row * image.nx + left);
+        const int dst_idx = 3 * ((off_y + row - top) * w + off_x);
+        // each overlapping row is one contiguous run of RGB bytes
+        std::copy_n(image.buf.begin() + src_idx, 3 * span, out.buf.begin() + dst_idx);
+    }
+
+    return out;
+}
+
+mtmd_image_preprocessor_step3vl::slice_instructions mtmd_image_preprocessor_step3vl::build_slice_instructions(
+        const clip_hparams & params,
+        const clip_image_size & prepared_size) {
+    // The overview keeps the prepared size; refined/grid stay {0, 0} unless crop windows are planned.
+    slice_instructions plan;
+    plan.overview_size = prepared_size;
+    plan.refined_size  = clip_image_size{0, 0};
+    plan.grid_size     = clip_image_size{0, 0};
+
+    const int long_edge  = std::max(prepared_size.width, prepared_size.height);
+    const int short_edge = std::min(prepared_size.width, prepared_size.height);
+    const int window     = determine_window_size(params, long_edge, short_edge);
+    if (window <= 0) {
+        return plan; // overview only: no crop windows for this image
+    }
+
+    // Snap each dimension to a whole number of windows (preprocess() resizes to this before cropping).
+    const int refined_w = calc_crop_extent(prepared_size.width,  window);
+    const int refined_h = calc_crop_extent(prepared_size.height, window);
+    plan.refined_size = clip_image_size{refined_w, refined_h};
+
+    const std::vector<int> col_starts = calc_grid(refined_w, window);
+    const std::vector<int> row_starts = calc_grid(refined_h, window);
+    plan.grid_size = clip_image_size{
+        static_cast<int>(col_starts.size()),
+        static_cast<int>(row_starts.size()),
+    };
+    // Row-major order: every column of the first row, then the next row, and so on.
+    for (const int top : row_starts) {
+        for (const int left : col_starts) {
+            plan.slices.push_back(slice_coordinates{
+                /* x    */ left,
+                /* y    */ top,
+                /* size */ clip_image_size{window, window},
+            });
+        }
+    }
+    return plan;
+}
+
+bool mtmd_image_preprocessor_step3vl::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
+    // Entry 0 of the batch is always the overview: the prepared image resized to image_size x image_size.
+    clip_image_u8 prepared = prepare_image(img, hparams);
+    const auto instructions = build_slice_instructions(hparams, {prepared.nx, prepared.ny});
+
+    clip_image_f32_ptr overview_f32(clip_image_f32_init());
+    img_u8_resize_bilinear_to_f32(
+        prepared,
+        *overview_f32,
+        hparams.image_size,
+        hparams.image_size,
+        hparams.image_mean,
+        hparams.image_std);
+    output.entries.push_back(std::move(overview_f32));
+
+    if (instructions.slices.empty()) {
+        output.grid_x = 0; // a 0 x 0 grid marks an overview-only batch
+        output.grid_y = 0;
+        return true;
+    }
+
+    // Crop straight from `prepared` unless a resize is needed; the pointer avoids deep-copying the image.
+    clip_image_u8 refined;
+    const clip_image_u8 * crop_src = &prepared;
+    if (instructions.refined_size.width != prepared.nx || instructions.refined_size.height != prepared.ny) {
+        img_tool::resize(prepared, refined, instructions.refined_size, RESIZE_ALGO_BILINEAR, false);
+        crop_src = &refined;
+    }
+
+    const int crop_size = default_image_crop_size;
+    for (const auto & slice : instructions.slices) {
+        // If the requested patch extends past the source image, pad the out-of-bounds area with black.
+        clip_image_u8 patch = crop_with_black_padding(*crop_src, slice.x, slice.y, slice.size.width, slice.size.height);
+        clip_image_f32_ptr patch_f32(clip_image_f32_init());
+        img_u8_resize_bilinear_to_f32(
+            patch,
+            *patch_f32,
+            crop_size,
+            crop_size,
+            hparams.image_mean,
+            hparams.image_std);
+        output.entries.push_back(std::move(patch_f32));
+    }
+
+    output.grid_x = instructions.grid_size.width;
+    output.grid_y = instructions.grid_size.height;
+    return true;
+}
+
 //
 // mtmd_image_preprocessor_youtuvl
 //
diff --git a/tools/mtmd/mtmd-image.h b/tools/mtmd/mtmd-image.h
index 065b937d61..08129a08ed 100644
--- a/tools/mtmd/mtmd-image.h
+++ b/tools/mtmd/mtmd-image.h
@@ -144,6 +144,35 @@ struct mtmd_image_preprocessor_deepseekocr : mtmd_image_preprocessor {
     bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
 };
 
+// custom image preprocessing for Step3VL: preprocess() emits one overview image plus fixed-size crops
+// ref: https://huggingface.co/stepfun-ai/Step3-VL-10B/blob/main/processing_step3.py
+struct mtmd_image_preprocessor_step3vl : mtmd_image_preprocessor_llava_uhd {
+    mtmd_image_preprocessor_step3vl(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {}
+    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
+    static slice_instructions build_slice_instructions(const clip_hparams & params, const clip_image_size & prepared_size); // preprocess() passes the size of the prepare_image() result
+
+private:
+    static constexpr int   default_image_longest_edge = 3024; // NOTE(review): presumably the fallback used by get_image_longest_edge() - confirm
+    static constexpr int   default_image_crop_size    = 504;  // side length every slice is resized to in preprocess()
+    static constexpr float small_aspect_ratio_limit   = 1.5f;
+    static constexpr float wide_aspect_ratio_limit    = 4.0f;
+    static constexpr float crop_rounding_threshold    = 0.2f;
+
+    void img_u8_resize_bilinear_to_f32( // NOTE(review): by name and call sites, bilinear-resizes src into dst and normalizes with mean/std - confirm
+            const clip_image_u8 & src,
+            clip_image_f32 & dst,
+            int target_width,
+            int target_height,
+            const float mean[3],
+            const float std[3]);
+    static int get_image_longest_edge(const clip_hparams & params);
+    static int determine_window_size(const clip_hparams & params, int longer, int shorter);
+    static int calc_crop_extent(int length, int window_size);
+    static std::vector<int> calc_grid(int length, int window_size);
+    static clip_image_u8 prepare_image(const clip_image_u8 & img, const clip_hparams & params); // first step of preprocess(); its result is the source of the overview
+    static clip_image_u8 crop_with_black_padding(const clip_image_u8 & image, int x, int y, int w, int h); // per its call site, parts of the rect outside the image are padded with black
+};
+
 struct mtmd_image_preprocessor_youtuvl : mtmd_image_preprocessor {
     mtmd_image_preprocessor_youtuvl(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
     bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 4b6dd44f09..4cbb3301ea 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -88,6 +88,7 @@ enum mtmd_slice_tmpl {
     MTMD_SLICE_TMPL_LLAMA4,
     MTMD_SLICE_TMPL_IDEFICS3,
     MTMD_SLICE_TMPL_LFM2,
+    MTMD_SLICE_TMPL_STEP3VL,
 };
 
 const char * mtmd_default_marker() {
@@ -259,7 +260,6 @@ struct mtmd_context {
                         tok_row_end       = {lookup_token("\n")};
                         tok_row_end_trail = false; // no trailing end-of-row token
                         ov_img_first      = true;
-
                     } else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5 || minicpmv_version == 6 || minicpmv_version == 100045) {
                         // minicpmv 2.6 format:
                         // <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
@@ -331,6 +331,22 @@ struct mtmd_context {
                             "    https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
                     image_preproc = std::make_unique<mtmd_image_preprocessor_llava_uhd>(ctx_v);
                 } break;
+            case PROJECTOR_TYPE_STEP3VL:
+                {
+                    // Step3 format:
+                    //   <patch_start> (patch) <patch_end> [<patch_newline>]
+                    //   ... (all patch rows)
+                    //   <im_start> (overview) <im_end>
+                    slice_tmpl        = MTMD_SLICE_TMPL_STEP3VL;
+                    tok_ov_img_start  = {lookup_token("<im_start>")};
+                    tok_ov_img_end    = {lookup_token("<im_end>")};
+                    tok_sli_img_start = {lookup_token("<patch_start>")};
+                    tok_sli_img_end   = {lookup_token("<patch_end>")};
+                    tok_row_end       = {lookup_token("<patch_newline>")};
+                    tok_row_end_trail = false; // no trailing end-of-row token
+                    ov_img_first      = false; // patches first, overview last
+                    image_preproc = std::make_unique<mtmd_image_preprocessor_step3vl>(ctx_v); // Step3-specific preprocessor
+                } break;
             case PROJECTOR_TYPE_INTERNVL:
                 {
                     // <img> ... (image embeddings) ... </img>
@@ -682,6 +698,7 @@ struct mtmd_tokenizer {
                 || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
                 || ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
                 || ctx->slice_tmpl == MTMD_SLICE_TMPL_IDEFICS3
+                || ctx->slice_tmpl == MTMD_SLICE_TMPL_STEP3VL
                 || (ctx->slice_tmpl == MTMD_SLICE_TMPL_LFM2 && has_tiling_grid)
             ) {
                 const int n_col = batch_f32.grid_x;