From 09343c01986667b26d414733950c9d38e91c3065 Mon Sep 17 00:00:00 2001 From: forforever73 <63285796+forforever73@users.noreply.github.com> Date: Wed, 8 Apr 2026 15:51:31 +0800 Subject: [PATCH] model : support step3-vl-10b (#21287) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: support step3-vl-10b * use fused QKV && mapping tensor in tensor_mapping.py * guard hardcoded params and drop crop metadata * get understand_projector_stride from global config * img_u8_resize_bilinear_to_f32 move in step3vl class * Apply suggestions from code review Co-authored-by: Sigbjørn Skjæret * fix the \r\n mess * add width and heads to MmprojModel.set_gguf_parameters --------- Co-authored-by: Sigbjørn Skjæret --- convert_hf_to_gguf.py | 87 ++++++++++- gguf-py/gguf/constants.py | 4 + gguf-py/gguf/tensor_mapping.py | 11 ++ tools/mtmd/CMakeLists.txt | 1 + tools/mtmd/clip-impl.h | 2 + tools/mtmd/clip-model.h | 1 - tools/mtmd/clip.cpp | 47 ++++++ tools/mtmd/models/models.h | 5 + tools/mtmd/models/step3vl.cpp | 81 +++++++++++ tools/mtmd/mtmd-image.cpp | 254 +++++++++++++++++++++++++++++++++ tools/mtmd/mtmd-image.h | 29 ++++ tools/mtmd/mtmd.cpp | 19 ++- 12 files changed, 537 insertions(+), 4 deletions(-) create mode 100644 tools/mtmd/models/step3vl.cpp diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 09f6e7ae29..adce4f8390 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2219,10 +2219,10 @@ class MmprojModel(ModelBase): self.image_size = self.find_vparam(["image_size"]) self.gguf_writer.add_vision_image_size(self.image_size) self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"])) - self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size", "vt_hidden_size"])) + self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size", "width", "vt_hidden_size"])) self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size", "vt_intermediate_size"])) self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys)) - self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads", "num_heads", "vt_num_attention_heads"])) + self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads", "num_heads", "heads", "vt_num_attention_heads"])) # preprocessor config image_mean = _MISTRAL_COMMON_DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"] @@ -4949,6 +4949,73 @@ class Glm4VVisionModel(Qwen3VLVisionModel): yield from super().modify_tensors(data_torch, name, bid) +@ModelBase.register("StepVLForConditionalGeneration") +class Step3VLVisionModel(MmprojModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + + if not self.hparams_vision.get("intermediate_size"): + hidden_size = self.hparams_vision.get("hidden_size") or self.hparams_vision.get("width") or 0 + assert hidden_size > 0 + mlp_ratio = float(self.hparams_vision.get("mlp_ratio", 8960 / 1536)) + self.hparams_vision["intermediate_size"] = int(round(hidden_size * mlp_ratio)) + + self.preprocessor_config.setdefault("image_mean", list(_MISTRAL_COMMON_DATASET_MEAN)) + self.preprocessor_config.setdefault("image_std", list(_MISTRAL_COMMON_DATASET_STD)) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + assert self.hparams_vision is not None + + projector_stride = int(self.global_config.get("understand_projector_stride", -1)) + hidden_size = int(self.hparams_vision.get("hidden_size", self.hparams_vision.get("width", -1))) + num_layers = int(self.hparams_vision.get("num_hidden_layers", self.hparams_vision.get("layers", -1))) + assert (projector_stride, int(self.hparams_vision.get("image_size", -1)), hidden_size, num_layers) == (2, 728, 1536, 47), ( + "current Step3-VL conversion path is only validated for Step3-VL-10B" + ) + + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.STEP3VL) + self.gguf_writer.add_vision_attention_layernorm_eps(float(self.hparams_vision.get("layer_norm_eps", 1e-5))) + self.gguf_writer.add_vision_projector_scale_factor(projector_stride ** 2) + # 3024 max resize comes from step3-vl-10b processing_step3.py. + self.gguf_writer.add_vision_preproc_image_size(3024) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + if ".position_embd." in new_name: + return gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.startswith("model.") or name.startswith("lm_head."): + return + + if name.startswith("vision_model.vit_downsampler"): + match = re.match(r"vision_model\.vit_downsampler(\d+)\.(weight|bias)", name) + if match is None: + raise ValueError(f"Unexpected Step3-VL projector tensor {name!r}") + + proj_id = int(match.group(1)) - 1 + suffix = f".{match.group(2)}" + yield (self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, proj_id, suffix=suffix), data_torch) + return + + if name == "vit_large_projector.weight": + yield (self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ_FC), data_torch) + return + + if name.startswith("vision_model."): + if name == "vision_model.positional_embedding": + name += ".weight" + elif name.endswith(".gamma") and ".ls_" in name: + name = name.removesuffix(".gamma") + ".weight" + + name = name.replace("attn.in_proj_weight", "attn.in_proj.weight") + name = name.replace("attn.in_proj_bias", "attn.in_proj.bias") + + yield from super().modify_tensors(data_torch, name, bid) + + @ModelBase.register("Qwen3VLForConditionalGeneration") class Qwen3VLTextModel(Qwen3Model): model_arch = gguf.MODEL_ARCH.QWEN3VL @@ -4969,6 +5036,16 @@ class Qwen3VLTextModel(Qwen3Model): yield from super().modify_tensors(data_torch, name, bid) +@ModelBase.register("StepVLForConditionalGeneration") +class Step3VLTextModel(Qwen3Model): + model_arch = gguf.MODEL_ARCH.QWEN3 + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.startswith("vision_model.") or name.startswith("model.vision_model.") or name.startswith("vit_large_projector."): + return + yield from super().modify_tensors(data_torch, name, bid) + + @ModelBase.register("Qwen3VLMoeForConditionalGeneration") class Qwen3VLMoeTextModel(Qwen3MoeModel): model_arch = gguf.MODEL_ARCH.QWEN3VLMOE @@ -12994,6 +13071,12 @@ def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> st # For non-hf Mamba and Mamba2 models arch = hparams["ssm_cfg"].get("layer", "Mamba") + "ForCausalLM" + # Step3-VL keeps text config under text_config but uses a custom top-level architecture. + # For text conversion we route to a dedicated text-only class. + # TODO: refactor this later to avoid adding exception here + if model_type == ModelType.TEXT and arch == "StepVLForConditionalGeneration": + return arch + # if "architectures" is found in the sub-config, use that instead if model_type == ModelType.TEXT and text_config.get("architectures") is not None: arch = text_config["architectures"][0] diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 0cdd1c471d..6760296b22 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -506,6 +506,7 @@ class VISION_PROJECTOR_TYPE(IntEnum): GEMMA3N = auto() GEMMA3 = auto() QWEN3VL = auto() + STEP3VL = auto() COGVLM = auto() @@ -987,6 +988,8 @@ VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = { VISION_PROJECTOR_TYPE.GLM_EDGE: "adapter", VISION_PROJECTOR_TYPE.MERGER: "qwen2vl_merger", VISION_PROJECTOR_TYPE.GEMMA3: "gemma3", + VISION_PROJECTOR_TYPE.QWEN3VL: "qwen3vl_merger", + VISION_PROJECTOR_TYPE.STEP3VL: "step3vl", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -4105,6 +4108,7 @@ class VisionProjectorType: QWEN2VL = "qwen2vl_merger" QWEN25VL = "qwen2.5vl_merger" QWEN3VL = "qwen3vl_merger" + STEP3VL = "step3vl" ULTRAVOX = "ultravox" INTERNVL = "internvl" QWEN2A = "qwen2a" # audio diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 9c713456ed..a2aa139de1 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1406,6 +1406,7 @@ class TensorNameMap: "siglip2.vision_model.embeddings.patch_embedding", "vision_model.radio_model.model.patch_generator.embedder", # Nemotron Nano v2 VL "model.vision_tower.patch_embedder.input_proj", # gemma4 + "vision_model.conv1", # Step3-VL ), MODEL_TENSOR.V_ENC_EMBD_NORM: ( @@ -1425,6 +1426,7 @@ class TensorNameMap: "visual.embeddings.position_embedding", # glm4v "vision_model.radio_model.model.patch_generator.pos_embed", # Nemotron Nano v2 VL "model.vision_tower.patch_embedder.position_embedding_table", # gemma4 + "vision_model.positional_embedding", # Step3-VL ), MODEL_TENSOR.V_ENC_EMBD_IMGNL: ( @@ -1443,6 +1445,7 @@ class TensorNameMap: "model.vision_model.transformer.layers.{bid}.self_attn.qkv_proj", # Deepseek-OCR CLIP "vision_tower.encoder.blocks.{bid}.wqkv", # Kimi-K2.5 "vision_model.radio_model.model.blocks.{bid}.attn.qkv", # Nemotron Nano v2 VL + "vision_model.transformer.resblocks.{bid}.attn.in_proj", # Step3-VL ), MODEL_TENSOR.V_ENC_ATTN_Q: ( @@ -1523,6 +1526,7 @@ class TensorNameMap: "model.vision_model.transformer.layers.{bid}.layer_norm1", # Deepseek-OCR CLIP "siglip2.vision_model.encoder.layers.{bid}.layer_norm1", "vision_model.radio_model.model.blocks.{bid}.norm1", # Nemotron Nano v2 VL + "vision_model.transformer.resblocks.{bid}.ln_1", # Step3-VL ), MODEL_TENSOR.V_ENC_ATTN_O: ( @@ -1543,6 +1547,7 @@ class TensorNameMap: "siglip2.vision_model.encoder.layers.{bid}.self_attn.out_proj", # youtuvl "vision_model.radio_model.model.blocks.{bid}.attn.proj", # Nemotron Nano v2 VL "vision_model.model.layers.{bid}.self_attn.o_proj.linear", # gemma4 + "vision_model.transformer.resblocks.{bid}.attn.out_proj", # Step3-VL ), MODEL_TENSOR.V_ENC_POST_ATTN_NORM: ( @@ -1562,6 +1567,7 @@ class TensorNameMap: "siglip2.vision_model.encoder.layers.{bid}.layer_norm2", "vision_model.radio_model.model.blocks.{bid}.norm2", # Nemotron Nano v2 VL "vision_model.model.layers.{bid}.pre_feedforward_layernorm", # gemma4 + "vision_model.transformer.resblocks.{bid}.ln_2", # Step3-VL ), MODEL_TENSOR.V_ENC_FFN_UP: ( @@ -1582,6 +1588,7 @@ class TensorNameMap: "siglip2.vision_model.encoder.layers.{bid}.mlp.fc1", "vision_model.radio_model.model.blocks.{bid}.mlp.fc1", # Nemotron Nano v2 VL "vision_model.model.layers.{bid}.mlp.up_proj", # gemma4 + "vision_model.transformer.resblocks.{bid}.mlp.c_fc", # Step3-VL ), MODEL_TENSOR.V_ENC_FFN_GATE: ( @@ -1609,6 +1616,7 @@ class TensorNameMap: "siglip2.vision_model.encoder.layers.{bid}.mlp.fc2", "vision_model.radio_model.model.blocks.{bid}.mlp.fc2", # Nemotron Nano v2 VL "vision_model.model.layers.{bid}.mlp.down_proj", # gemma4 + "vision_model.transformer.resblocks.{bid}.mlp.c_proj", # Step3-VL ), MODEL_TENSOR.V_ENC_ATTN_POST_NORM: ( @@ -1622,11 +1630,13 @@ class TensorNameMap: MODEL_TENSOR.V_LAYER_SCALE_1: ( "vision_tower.vision_model.encoder.layers.{bid}.ls1", # InternVL "model.vision_tower.encoder.layer.{bid}.lambda_1", # Intern-S1 + "vision_model.transformer.resblocks.{bid}.ls_1", # Step3-VL ), MODEL_TENSOR.V_LAYER_SCALE_2: ( "vision_tower.vision_model.encoder.layers.{bid}.ls2", # InternVL "model.vision_tower.encoder.layer.{bid}.lambda_2", # Intern-S1 + "vision_model.transformer.resblocks.{bid}.ls_2", # Step3-VL ), MODEL_TENSOR.V_LAYER_OUT_SCALE: ( @@ -1639,6 +1649,7 @@ class TensorNameMap: "vision_encoder.ln_pre", # pixtral "vision_model.layernorm_pre", # llama4 "model.vision_model.pre_layrnorm", # Deepseek-OCR CLIP + "vision_model.ln_pre", # Step3-VL ), MODEL_TENSOR.V_POST_NORM: ( diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index 6ffdb674de..151c15d704 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -31,6 +31,7 @@ add_library(mtmd models/pixtral.cpp models/qwen2vl.cpp models/qwen3vl.cpp + models/step3vl.cpp models/siglip.cpp models/whisper-enc.cpp models/deepseekocr.cpp diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 81b92841ca..0c3e60e1a8 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -242,6 +242,7 @@ enum projector_type { PROJECTOR_TYPE_GLM_EDGE, PROJECTOR_TYPE_QWEN2VL, PROJECTOR_TYPE_QWEN3VL, + PROJECTOR_TYPE_STEP3VL, PROJECTOR_TYPE_GEMMA3, PROJECTOR_TYPE_GEMMA3NV, PROJECTOR_TYPE_GEMMA3NA, @@ -284,6 +285,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_QWEN2VL, "qwen2vl_merger"}, { PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"}, { PROJECTOR_TYPE_QWEN3VL, "qwen3vl_merger"}, + { PROJECTOR_TYPE_STEP3VL, "step3vl"}, { PROJECTOR_TYPE_GEMMA3, "gemma3"}, { PROJECTOR_TYPE_GEMMA3NV, "gemma3nv"}, { PROJECTOR_TYPE_GEMMA3NA, "gemma3na"}, diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index b85c4122ed..b2cd27dcbf 100644 --- a/tools/mtmd/clip-model.h +++ b/tools/mtmd/clip-model.h @@ -79,7 +79,6 @@ struct clip_hparams { float eps = 1e-6; float rope_theta = 0.0; - std::unordered_set vision_feature_layer; int32_t attn_window_size = 0; int32_t n_wa_pattern = 0; diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 2faf595a9f..9c886bc890 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -862,6 +862,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 { builder = std::make_unique(ctx, img); } break; + case PROJECTOR_TYPE_STEP3VL: + { + builder = std::make_unique(ctx, img); + } break; case PROJECTOR_TYPE_MINICPMV: { builder = std::make_unique(ctx, img); @@ -1337,6 +1341,17 @@ struct clip_model_loader { LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__); } } break; + case PROJECTOR_TYPE_STEP3VL: + { + hparams.n_merge = 4; // two stride-2 downsamplers after patching + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); + hparams.rope_theta = 10000.0f; + get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.image_longest_edge, false); + if (hparams.image_longest_edge == 0) { + hparams.image_longest_edge = 3024; + } + hparams.warmup_image_size = hparams.image_size; + } break; case PROJECTOR_TYPE_YOUTUVL: { hparams.n_merge = 2; @@ -1769,6 +1784,14 @@ struct clip_model_loader { model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); } break; + case PROJECTOR_TYPE_STEP3VL: + { + model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); + model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"), false); + model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight")); + model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false); + model.mm_model_proj = get_tensor(string_format(TN_MM_PROJECTOR, "weight")); + } break; case PROJECTOR_TYPE_YOUTUVL: { model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM); // merger.ln_q (RMS norm) @@ -2615,6 +2638,8 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * case PROJECTOR_TYPE_HUNYUANOCR: case PROJECTOR_TYPE_YOUTUVL: return (img->nx / params.patch_size) / 2; + case PROJECTOR_TYPE_STEP3VL: + return img->nx / (params.patch_size * params.n_merge); default: break; } @@ -2632,6 +2657,8 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * case PROJECTOR_TYPE_PADDLEOCR: case PROJECTOR_TYPE_YOUTUVL: return (img->ny / params.patch_size) / 2; + case PROJECTOR_TYPE_STEP3VL: + return img->ny / (params.patch_size * params.n_merge); default: break; } @@ -2702,6 +2729,12 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im int y_patch = img->ny / (params.patch_size * 2); n_patches = x_patch * y_patch; } break; + case PROJECTOR_TYPE_STEP3VL: + { + int x_patch = img->nx / (params.patch_size * params.n_merge); + int y_patch = img->ny / (params.patch_size * params.n_merge); + n_patches = x_patch * y_patch; + } break; case PROJECTOR_TYPE_GEMMA3: case PROJECTOR_TYPE_GEMMA4V: case PROJECTOR_TYPE_IDEFICS3: @@ -3004,6 +3037,18 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima set_input_i32("positions", positions); } break; + case PROJECTOR_TYPE_STEP3VL: + { + std::vector pos_data(n_pos); + for (int i = 0; i < n_pos; i++) { + pos_data[i] = i / pos_w; + } + set_input_i32("pos_h", pos_data); + for (int i = 0; i < n_pos; i++) { + pos_data[i] = i % pos_w; + } + set_input_i32("pos_w", pos_data); + } break; case PROJECTOR_TYPE_PADDLEOCR: { const int merge_ratio = hparams.n_merge; @@ -3358,6 +3403,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { case PROJECTOR_TYPE_QWEN3VL: // main path + deepstack paths return ctx->model.mm_1_b->ne[0] * (1 + ctx->model.n_deepstack_layers); + case PROJECTOR_TYPE_STEP3VL: + return ctx->model.mm_model_proj->ne[1]; case PROJECTOR_TYPE_GEMMA3: case PROJECTOR_TYPE_GEMMA3NV: return ctx->model.mm_input_proj_w->ne[0]; diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h index 6f9632b62a..47e2cde2b9 100644 --- a/tools/mtmd/models/models.h +++ b/tools/mtmd/models/models.h @@ -33,6 +33,11 @@ struct clip_graph_qwen3vl : clip_graph { ggml_cgraph * build() override; }; +struct clip_graph_step3vl : clip_graph { + clip_graph_step3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + struct clip_graph_youtuvl : clip_graph { clip_graph_youtuvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} ggml_cgraph * build() override; diff --git a/tools/mtmd/models/step3vl.cpp b/tools/mtmd/models/step3vl.cpp new file mode 100644 index 0000000000..5142b0bba3 --- /dev/null +++ b/tools/mtmd/models/step3vl.cpp @@ -0,0 +1,81 @@ +#include "models.h" + +ggml_cgraph * clip_graph_step3vl::build() { + GGML_ASSERT(model.class_embedding == nullptr); + GGML_ASSERT(model.patch_embeddings_0 != nullptr); + GGML_ASSERT(model.position_embeddings != nullptr); + + norm_type norm_t = NORM_TYPE_NORMAL; + + ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(pos_h, "pos_h"); + ggml_set_input(pos_h); + + ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(pos_w, "pos_w"); + ggml_set_input(pos_w); + + ggml_tensor * inp = build_inp(); + ggml_tensor * learned_pos_embd = resize_position_embeddings(); + + auto add_pos = [&](ggml_tensor * cur, const clip_layer &) { + return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false); + }; + + auto add_spatial_bias = [&](ggml_tensor * cur, ggml_tensor * bias) { + if (bias == nullptr) { + return cur; + } + + const int64_t width = cur->ne[0]; + const int64_t height = cur->ne[1]; + const int64_t channels = cur->ne[2]; + + cur = ggml_reshape_2d(ctx0, cur, width * height, channels); + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + cur = ggml_add(ctx0, cur, bias); + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + cur = ggml_reshape_3d(ctx0, cur, width, height, channels); + + return cur; + }; + + ggml_tensor * cur = build_vit( + inp, + n_patches, + norm_t, + hparams.ffn_op, + learned_pos_embd, + add_pos); + cb(cur, "vit_out", -1); + + // [n_embd, n_patches] -> [w, h, n_embd] for spatial downsampling convolutions. + cur = ggml_permute(ctx0, cur, 1, 0, 2, 3); + cur = ggml_cont_3d(ctx0, cur, n_patches_x, n_patches_y, n_embd); + + // First downsampler: Conv2d(1536 -> 3072, k=3, s=2, p=1) + cur = ggml_conv_2d(ctx0, model.mm_0_w, cur, 2, 2, 1, 1, 1, 1); + cur = add_spatial_bias(cur, model.mm_0_b); + cb(cur, "downsample_0", -1); + + // Second downsampler: Conv2d(3072 -> 6144, k=3, s=2, p=1) + cur = ggml_conv_2d(ctx0, model.mm_1_w, cur, 2, 2, 1, 1, 1, 1); + cur = add_spatial_bias(cur, model.mm_1_b); + cb(cur, "downsample_1", -1); + + // [w, h, c] -> [c, w*h] + { + const int64_t w = cur->ne[0]; + const int64_t h = cur->ne[1]; + cur = ggml_reshape_3d(ctx0, cur, w * h, cur->ne[2], cur->ne[3]); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 0, 2, 3)); + } + cb(cur, "downsample_flatten", -1); + + // Final projector: Linear(6144 -> projection_dim) + cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur); + cb(cur, "projector_out", -1); + + ggml_build_forward_expand(gf, cur); + return gf; +} diff --git a/tools/mtmd/mtmd-image.cpp b/tools/mtmd/mtmd-image.cpp index a2166622b7..4f4eb5da69 100644 --- a/tools/mtmd/mtmd-image.cpp +++ b/tools/mtmd/mtmd-image.cpp @@ -1114,6 +1114,260 @@ bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img, return true; } +// +// mtmd_image_preprocessor_step3vl +// + +void mtmd_image_preprocessor_step3vl::img_u8_resize_bilinear_to_f32( + const clip_image_u8 & src, + clip_image_f32 & dst, + int target_width, + int target_height, + const float mean[3], + const float std[3]) { + if (src.nx == target_width && src.ny == target_height) { + img_u8_to_f32(src, dst, mean, std); + return; + } + + dst.nx = target_width; + dst.ny = target_height; + dst.buf.resize(3 * target_width * target_height); + + const float scale_x = static_cast(src.nx) / target_width; + const float scale_y = static_cast(src.ny) / target_height; + + for (int y = 0; y < target_height; ++y) { + const float src_y = (static_cast(y) + 0.5f) * scale_y - 0.5f; + const int y0_floor = static_cast(std::floor(src_y)); + const int y0 = std::max(0, std::min(y0_floor, src.ny - 1)); + const int y1 = std::max(0, std::min(y0_floor + 1, src.ny - 1)); + const float ly = src_y - y0_floor; + + for (int x = 0; x < target_width; ++x) { + const float src_x = (static_cast(x) + 0.5f) * scale_x - 0.5f; + const int x0_floor = static_cast(std::floor(src_x)); + const int x0 = std::max(0, std::min(x0_floor, src.nx - 1)); + const int x1 = std::max(0, std::min(x0_floor + 1, src.nx - 1)); + const float lx = src_x - x0_floor; + + const size_t idx00 = 3 * (y0 * src.nx + x0); + const size_t idx01 = 3 * (y0 * src.nx + x1); + const size_t idx10 = 3 * (y1 * src.nx + x0); + const size_t idx11 = 3 * (y1 * src.nx + x1); + const size_t idx_dst = 3 * (y * target_width + x); + + for (int c = 0; c < 3; ++c) { + const float v00 = (static_cast(src.buf[idx00 + c]) / 255.0f - mean[c]) / std[c]; + const float v01 = (static_cast(src.buf[idx01 + c]) / 255.0f - mean[c]) / std[c]; + const float v10 = (static_cast(src.buf[idx10 + c]) / 255.0f - mean[c]) / std[c]; + const float v11 = (static_cast(src.buf[idx11 + c]) / 255.0f - mean[c]) / std[c]; + + const float top = v00 + (v01 - v00) * lx; + const float bot = v10 + (v11 - v10) * lx; + dst.buf[idx_dst + c] = top + (bot - top) * ly; + } + } + } +} + +int mtmd_image_preprocessor_step3vl::get_image_longest_edge(const clip_hparams & params) { + return params.image_longest_edge > 0 ? params.image_longest_edge : default_image_longest_edge; +} + +int mtmd_image_preprocessor_step3vl::determine_window_size(const clip_hparams & params, int longer, int shorter) { + const int image_size = params.image_size; + const int crop_size = default_image_crop_size; + const float aspect_ratio = static_cast(longer) / shorter; + + if (longer <= image_size) { + return aspect_ratio > small_aspect_ratio_limit ? shorter : 0; + } + + return aspect_ratio > wide_aspect_ratio_limit ? std::min(shorter, crop_size) : crop_size; +} + +int mtmd_image_preprocessor_step3vl::calc_crop_extent(int length, int window_size) { + const float ratio = static_cast(length) / window_size; + if (ratio < 1.0f) { + return length; + } + + const float decimal = ratio - std::floor(ratio); + const int rounded = decimal > crop_rounding_threshold + ? static_cast(std::floor(ratio)) + 1 + : static_cast(std::floor(ratio)); + return window_size * rounded; +} + +std::vector mtmd_image_preprocessor_step3vl::calc_grid(int length, int window_size) { + const int n = length <= window_size + ? 1 + : static_cast(std::ceil(static_cast(length - window_size) / window_size + 1.0f)); + std::vector starts(n); + + for (int i = 0; i < n; ++i) { + starts[i] = window_size * i; + } + + if (n > 1 && starts.back() + window_size > length) { + starts.back() = length - window_size; + } + + return starts; +} + +clip_image_u8 mtmd_image_preprocessor_step3vl::prepare_image(const clip_image_u8 & img, const clip_hparams & params) { + clip_image_u8 resized = img; + const float aspect_ratio = img.ny > 0 ? static_cast(img.nx) / img.ny : 1.0f; + if (std::min(img.nx, img.ny) < 32 && + (aspect_ratio > wide_aspect_ratio_limit || + aspect_ratio < 1.0f / wide_aspect_ratio_limit)) { + const int square_size = std::max(img.nx, img.ny); + clip_image_u8 padded; + padded.nx = square_size; + padded.ny = square_size; + padded.buf.resize(3 * square_size * square_size); + img_tool::fill(padded, {0, 0, 0}); + img_tool::composite(padded, img, 0, 0); + resized = std::move(padded); + } + + const int max_image_size = get_image_longest_edge(params); + if (std::max(resized.nx, resized.ny) > max_image_size) { + const float scale = static_cast(max_image_size) / std::max(resized.nx, resized.ny); + const clip_image_size new_size = { + std::max(1, static_cast(std::floor(resized.nx * scale))), + std::max(1, static_cast(std::floor(resized.ny * scale))), + }; + clip_image_u8 scaled; + img_tool::resize(resized, scaled, new_size, RESIZE_ALGO_BILINEAR, false); + resized = std::move(scaled); + } + + return resized; +} + +clip_image_u8 mtmd_image_preprocessor_step3vl::crop_with_black_padding(const clip_image_u8 & image, int x, int y, int w, int h) { + clip_image_u8 dst; + dst.nx = w; + dst.ny = h; + dst.buf.resize(3 * w * h, 0); + + const int src_x0 = std::max(0, x); + const int src_y0 = std::max(0, y); + const int src_x1 = std::min(image.nx, x + w); + const int src_y1 = std::min(image.ny, y + h); + + if (src_x0 >= src_x1 || src_y0 >= src_y1) { + return dst; + } + + const int dst_x0 = src_x0 - x; + const int dst_y0 = src_y0 - y; + + for (int yy = 0; yy < src_y1 - src_y0; ++yy) { + for (int xx = 0; xx < src_x1 - src_x0; ++xx) { + const int src_idx = 3 * ((src_y0 + yy) * image.nx + (src_x0 + xx)); + const int dst_idx = 3 * ((dst_y0 + yy) * w + (dst_x0 + xx)); + dst.buf[dst_idx + 0] = image.buf[src_idx + 0]; + dst.buf[dst_idx + 1] = image.buf[src_idx + 1]; + dst.buf[dst_idx + 2] = image.buf[src_idx + 2]; + } + } + + return dst; +} + +mtmd_image_preprocessor_step3vl::slice_instructions mtmd_image_preprocessor_step3vl::build_slice_instructions( + const clip_hparams & params, + const clip_image_size & prepared_size) { + slice_instructions instructions; + instructions.overview_size = prepared_size; + + const int window_size = determine_window_size( + params, + std::max(prepared_size.width, prepared_size.height), + std::min(prepared_size.width, prepared_size.height)); + if (window_size <= 0) { + instructions.refined_size = clip_image_size{0, 0}; + instructions.grid_size = clip_image_size{0, 0}; + return instructions; + } + + const int crop_width = calc_crop_extent(prepared_size.width, window_size); + const int crop_height = calc_crop_extent(prepared_size.height, window_size); + instructions.refined_size = clip_image_size{crop_width, crop_height}; + + const auto xs = calc_grid(crop_width, window_size); + const auto ys = calc_grid(crop_height, window_size); + instructions.grid_size = clip_image_size{ + static_cast(xs.size()), + static_cast(ys.size()), + }; + + for (int y : ys) { + for (int x : xs) { + instructions.slices.push_back(slice_coordinates{ + /* x */ x, + /* y */ y, + /* size */ clip_image_size{window_size, window_size}, + }); + } + } + + return instructions; +} + +bool mtmd_image_preprocessor_step3vl::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { + clip_image_u8 prepared = prepare_image(img, hparams); + const auto instructions = build_slice_instructions(hparams, {prepared.nx, prepared.ny}); + + clip_image_f32_ptr overview_f32(clip_image_f32_init()); + img_u8_resize_bilinear_to_f32( + prepared, + *overview_f32, + hparams.image_size, + hparams.image_size, + hparams.image_mean, + hparams.image_std); + output.entries.push_back(std::move(overview_f32)); + + if (instructions.slices.empty()) { + output.grid_x = 0; + output.grid_y = 0; + return true; + } + + clip_image_u8 img_for_crop = prepared; + if (instructions.refined_size.width != prepared.nx || instructions.refined_size.height != prepared.ny) { + clip_image_u8 refined; + img_tool::resize(prepared, refined, instructions.refined_size, RESIZE_ALGO_BILINEAR, false); + img_for_crop = std::move(refined); + } + + const int crop_size = default_image_crop_size; + for (const auto & slice : instructions.slices) { + // If the requested patch extends past the source image, pad the out-of-bounds area with black. + clip_image_u8 patch = crop_with_black_padding(img_for_crop, slice.x, slice.y, slice.size.width, slice.size.height); + + clip_image_f32_ptr patch_f32(clip_image_f32_init()); + img_u8_resize_bilinear_to_f32( + patch, + *patch_f32, + crop_size, + crop_size, + hparams.image_mean, + hparams.image_std); + output.entries.push_back(std::move(patch_f32)); + } + + output.grid_x = instructions.grid_size.width; + output.grid_y = instructions.grid_size.height; + + return true; +} + // // mtmd_image_preprocessor_youtuvl // diff --git a/tools/mtmd/mtmd-image.h b/tools/mtmd/mtmd-image.h index 065b937d61..08129a08ed 100644 --- a/tools/mtmd/mtmd-image.h +++ b/tools/mtmd/mtmd-image.h @@ -144,6 +144,35 @@ struct mtmd_image_preprocessor_deepseekocr : mtmd_image_preprocessor { bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; }; +// custom image preprocessing for Step3VL +// ref: https://huggingface.co/stepfun-ai/Step3-VL-10B/blob/main/processing_step3.py +struct mtmd_image_preprocessor_step3vl : mtmd_image_preprocessor_llava_uhd { + mtmd_image_preprocessor_step3vl(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {} + bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; + static slice_instructions build_slice_instructions(const clip_hparams & params, const clip_image_size & prepared_size); + +private: + static constexpr int default_image_longest_edge = 3024; + static constexpr int default_image_crop_size = 504; + static constexpr float small_aspect_ratio_limit = 1.5f; + static constexpr float wide_aspect_ratio_limit = 4.0f; + static constexpr float crop_rounding_threshold = 0.2f; + + void img_u8_resize_bilinear_to_f32( + const clip_image_u8 & src, + clip_image_f32 & dst, + int target_width, + int target_height, + const float mean[3], + const float std[3]); + static int get_image_longest_edge(const clip_hparams & params); + static int determine_window_size(const clip_hparams & params, int longer, int shorter); + static int calc_crop_extent(int length, int window_size); + static std::vector calc_grid(int length, int window_size); + static clip_image_u8 prepare_image(const clip_image_u8 & img, const clip_hparams & params); + static clip_image_u8 crop_with_black_padding(const clip_image_u8 & image, int x, int y, int w, int h); +}; + struct mtmd_image_preprocessor_youtuvl : mtmd_image_preprocessor { mtmd_image_preprocessor_youtuvl(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {} bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 4b6dd44f09..4cbb3301ea 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -88,6 +88,7 @@ enum mtmd_slice_tmpl { MTMD_SLICE_TMPL_LLAMA4, MTMD_SLICE_TMPL_IDEFICS3, MTMD_SLICE_TMPL_LFM2, + MTMD_SLICE_TMPL_STEP3VL, }; const char * mtmd_default_marker() { @@ -259,7 +260,6 @@ struct mtmd_context { tok_row_end = {lookup_token("\n")}; tok_row_end_trail = false; // no trailing end-of-row token ov_img_first = true; - } else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5 || minicpmv_version == 6 || minicpmv_version == 100045) { // minicpmv 2.6 format: // (overview) (slice) (slice) \n ... @@ -331,6 +331,22 @@ struct mtmd_context { " https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__); image_preproc = std::make_unique(ctx_v); } break; + case PROJECTOR_TYPE_STEP3VL: + { + // Step3 format: + // (patch) [] + // ... (all patch rows) + // (overview) + slice_tmpl = MTMD_SLICE_TMPL_STEP3VL; + tok_ov_img_start = {lookup_token("")}; + tok_ov_img_end = {lookup_token("")}; + tok_sli_img_start = {lookup_token("")}; + tok_sli_img_end = {lookup_token("")}; + tok_row_end = {lookup_token("")}; + tok_row_end_trail = false; + ov_img_first = false; // patches first, overview last + image_preproc = std::make_unique(ctx_v); + } break; case PROJECTOR_TYPE_INTERNVL: { // ... (image embeddings) ... @@ -682,6 +698,7 @@ struct mtmd_tokenizer { || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6 || ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4 || ctx->slice_tmpl == MTMD_SLICE_TMPL_IDEFICS3 + || ctx->slice_tmpl == MTMD_SLICE_TMPL_STEP3VL || (ctx->slice_tmpl == MTMD_SLICE_TMPL_LFM2 && has_tiling_grid) ) { const int n_col = batch_f32.grid_x;