address #16574; fold CLI into mtmd-cli; use ggml_rope_ext + bicubic; switch to 'jinaclip2'; fix converter constants

liyang 2025-10-31 16:28:58 +08:00
parent 0c21677e43
commit 463f536695
14 changed files with 776 additions and 7 deletions

View File

@@ -2747,14 +2747,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) {
params.embd_normalize = value;
}
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_DEBUG}));
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_DEBUG}));
add_opt(common_arg(
{"--embd-output-format"}, "FORMAT",
"empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix, \"raw\" = plain whitespace-delimited output (one embedding per line)",
[](common_params & params, const std::string & value) {
params.embd_out = value;
}
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_MTMD}));
add_opt(common_arg(
{"--embd-separator"}, "STRING",
"separator of embeddings (default \\n) for example \"<#sep#>\"",

View File

@@ -5635,7 +5635,18 @@ class XLMRobertaModel(BertModel):
if lora_names := hparams.get("lora_adaptations"):
self._lora_names = lora_names
self.model_arch = gguf.MODEL_ARCH.JINA_BERT_V3
try:
text_cfg = hparams.get("text_config", {}) if isinstance(hparams.get("text_config", {}), dict) else {}
pe_type = (text_cfg.get("position_embedding_type") or hparams.get("position_embedding_type") or "").lower()
rope_base = text_cfg.get("rotary_emb_base", hparams.get("rotary_emb_base"))
name_path = (hparams.get("_name_or_path") or "").lower()
is_vx = ("jina" in name_path and ("v2" in name_path or "v3" in name_path))
is_v3 = (pe_type == "rotary" or rope_base is not None) and is_vx
if (is_v3) or self._lora_names:
self.model_arch = gguf.MODEL_ARCH.JINA_BERT_V3
except Exception:
pass
super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs)
self._xlmroberta_tokenizer_init()
@@ -7049,6 +7060,254 @@ class JinaBertV2Model(BertModel):
raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel')
@ModelBase.register("JinaCLIPVisionModel", "JinaCLIPModel")
class JinaCLIPVisionModel(MmprojModel):
"""JinaCLIP v2 Vision Encoder Model - handles vision component only"""
model_arch = gguf.MODEL_ARCH.MMPROJ
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# Load config for vision encoder
config_path = self.dir_model / "config.json"
if not config_path.exists():
raise FileNotFoundError(
f"JinaCLIPVisionModel: missing config.json in {self.dir_model}. "
"Please ensure the original model config is present; default hyperparameter fallbacks are not used."
)
with open(config_path, encoding="utf-8") as f:
self.vision_config = json.load(f)
def set_vocab(self):
# Vision encoder doesn't need vocabulary
pass
def set_gguf_parameters(self):
cfg = self.vision_config
try:
width = int(cfg["width"]) # channel dim
head_width = int(cfg["head_width"]) # per-head dim
layers = int(cfg["layers"]) # block count
image_size = int(cfg["image_size"]) # input image size
patch_size = int(cfg["patch_size"]) # patch size
except KeyError as e:
raise KeyError(f"JinaCLIPVisionModel: missing key in config.json: {e}")
if width % head_width != 0:
raise ValueError(
f"JinaCLIPVisionModel: width ({width}) not divisible by head_width ({head_width})"
)
n_head = width // head_width
if "mlp_ratio" in cfg:
n_ff = int(width * float(cfg["mlp_ratio"]))
elif bool(cfg.get("naive_swiglu", False)):
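# naive SwiGLU sizing: hidden dim is roughly width * 8 / 3 (a hypothetical width of 1024 gives 2730)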
n_ff = int((width * 8) // 3)
else:
raise ValueError("JinaCLIPVisionModel: unable to infer FFN size; please provide 'mlp_ratio' or set 'naive_swiglu' in config.json")
self.gguf_writer.add_clip_has_vision_encoder(True)
proj_dim = int(cfg.get("projection_dim", width))
self.gguf_writer.add_vision_projection_dim(proj_dim)
self.gguf_writer.add_vision_image_size(image_size)
self.gguf_writer.add_vision_patch_size(patch_size)
self.gguf_writer.add_vision_embedding_length(width)
self.gguf_writer.add_vision_block_count(layers)
self.gguf_writer.add_vision_head_count(n_head)
self.gguf_writer.add_vision_feed_forward_length(n_ff)
self.gguf_writer.add_vision_attention_layernorm_eps(float(cfg.get("layer_norm_eps", 1e-5)))
mean = self.preprocessor_config.get("image_mean", self.preprocessor_config.get("mean"))
std = self.preprocessor_config.get("image_std", self.preprocessor_config.get("std"))
if mean is None or std is None:
raise KeyError(
"JinaCLIPVisionModel: preprocessor_config missing image mean/std (expected keys: 'image_mean'/'image_std' or 'mean'/'std')"
)
self.gguf_writer.add_vision_image_mean(mean)
self.gguf_writer.add_vision_image_std(std)
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.JINACLIP2)
self.gguf_writer.add_vision_use_silu(True)
def _strip_vm_prefix(self, name: str) -> str:
return name[len('vision_model.'):] if name.startswith('vision_model.') else name
def _map_block_tensor(self, layer: int, rest: str, data_torch: Tensor, name: str) -> list[tuple[str, Tensor]] | None:
parts = rest.split('.')
# layer norms
if rest.startswith('norm1.'):
suffix = parts[-1]
return [(f'v.blk.{layer}.ln1.{suffix}', data_torch)]
if rest.startswith('norm2.'):
suffix = parts[-1]
return [(f'v.blk.{layer}.ln2.{suffix}', data_torch)]
if rest.startswith('attn.inner_attn_ln.'):
suffix = parts[-1]
return [(f'v.blk.{layer}.attn_ln.{suffix}', data_torch)]
# fused qkv
if rest == 'attn.qkv.weight':
w = data_torch
wdim = w.shape[0]
if wdim % 3 != 0:
logger.warning('mmproj(jinaclip): unexpected qkv weight shape %s for %s', tuple(w.shape), name)
d = wdim // 3
q, k, v = w[0:d, :], w[d:2 * d, :], w[2 * d:, :]
return [
(f'v.blk.{layer}.attn_q.weight', q),
(f'v.blk.{layer}.attn_k.weight', k),
(f'v.blk.{layer}.attn_v.weight', v),
]
if rest == 'attn.qkv.bias':
b = data_torch
bdim = b.shape[0]
if bdim % 3 != 0:
logger.warning('mmproj(jinaclip): unexpected qkv bias shape %s for %s', tuple(b.shape), name)
d = bdim // 3
qb, kb, vb = b[0:d], b[d:2 * d], b[2 * d:]
return [
(f'v.blk.{layer}.attn_q.bias', qb),
(f'v.blk.{layer}.attn_k.bias', kb),
(f'v.blk.{layer}.attn_v.bias', vb),
]
# separate q/v bias (some checkpoints)
if rest == 'attn.q_bias':
return [(f'v.blk.{layer}.attn_q.bias', data_torch)]
if rest == 'attn.v_bias':
return [(f'v.blk.{layer}.attn_v.bias', data_torch)]
# separate projections
if rest.startswith('attn.q_proj.'):
suffix = parts[-1]
return [(f'v.blk.{layer}.attn_q.{suffix}', data_torch)]
if rest.startswith('attn.k_proj.'):
suffix = parts[-1]
return [(f'v.blk.{layer}.attn_k.{suffix}', data_torch)]
if rest.startswith('attn.v_proj.'):
suffix = parts[-1]
return [(f'v.blk.{layer}.attn_v.{suffix}', data_torch)]
if rest.startswith('attn.proj.'):
suffix = parts[-1]
return [(f'v.blk.{layer}.attn_out.{suffix}', data_torch)]
# MLP
if rest.startswith('mlp.w1.'):
suffix = parts[-1]
return [(f'v.blk.{layer}.ffn_gate.{suffix}', data_torch)]
if rest.startswith('mlp.w2.'):
suffix = parts[-1]
return [(f'v.blk.{layer}.ffn_up.{suffix}', data_torch)]
if rest.startswith('mlp.w3.'):
suffix = parts[-1]
return [(f'v.blk.{layer}.ffn_down.{suffix}', data_torch)]
if rest.startswith('mlp.ffn_ln.'):
suffix = parts[-1]
return [(f'v.blk.{layer}.ffn_norm.{suffix}', data_torch)]
if rest.startswith('mlp.fc1.'):
suffix = parts[-1]
return [(f'v.blk.{layer}.ffn_up.{suffix}', data_torch)]
if rest.startswith('mlp.fc2.'):
suffix = parts[-1]
return [(f'v.blk.{layer}.ffn_down.{suffix}', data_torch)]
return None
def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
"""Prefer base table-driven mapping; keep Jina-specific targets if already mapped; fallback to legacy mapper."""
# Already a GGUF target name (e.g., "v.*" or "mm.*"): return as-is
if name.startswith('v.') or name.startswith('mm.'):
return name
# Try the base mapping first
try:
return super().map_tensor_name(name, try_suffixes=try_suffixes)
except Exception:
# Fallback to legacy Jina-specific mapper for any remaining edge keys
if hasattr(self, "_map_jinaclip_tensor_name"):
mapped = self._map_jinaclip_tensor_name(name) # type: ignore[attr-defined]
if mapped:
return mapped
return name
def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
yielded_any = False
try:
for name, tensor in super().get_tensors():
yielded_any = True
yield name, tensor
except Exception as e:
logger.warning("mmproj(jinaclip): base get_tensors failed, falling back: %s", e)
if yielded_any:
return
candidates = [
self.dir_model / "pytorch_model.bin",
self.dir_model / "vision_model_weights.bin",
]
model_path = next((p for p in candidates if p.exists()), None)
if model_path is None:
raise FileNotFoundError(f"mmproj(jinaclip): no model weights found in {self.dir_model}")
try:
state_dict = torch.load(model_path, map_location="cpu", weights_only=True)
except TypeError:
state_dict = torch.load(model_path, map_location="cpu")
for name, tensor in state_dict.items():
yield name, tensor
def _should_be_f32(self, gguf_name: str) -> bool:
patterns = (
".ln1.weight", ".ln1.bias",
".ln2.weight", ".ln2.bias",
".attn_ln.weight", ".attn_ln.bias",
".ffn_norm.weight", ".ffn_norm.bias",
"v.patch_embd.proj.bias",
)
return any(p in gguf_name for p in patterns)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
src = name
if src.startswith('v.') or src.startswith('mm.'):
return [(src, data_torch)]
# Drop 'vision_model.' prefix if present
src_no_vm = self._strip_vm_prefix(src)
# Top-level direct mappings — use gguf constants directly for canonical names
if src_no_vm == 'cls_token':
base = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_CLS]
return [(base, data_torch)]
if src_no_vm.startswith('patch_embed.proj.'):
suffix = src_no_vm.split('.')[-1]
base = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH]
return [(f'{base}.{suffix}', data_torch)]
if src_no_vm == 'pos_embed':
pos_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_POS] + '.weight'
return [(pos_name, data_torch)]
if src_no_vm.startswith('norm.'):
suffix = src_no_vm.split('.')[-1]
base = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_POST_NORM]
return [(f'{base}.{suffix}', data_torch)]
if src_no_vm.startswith('blocks.'):
parts = src_no_vm.split('.')
if len(parts) >= 3 and parts[1].isdigit():
layer = int(parts[1])
rest = '.'.join(parts[2:])
mapped = self._map_block_tensor(layer, rest, data_torch, name)
if mapped is not None:
return mapped
try:
return [(self.map_tensor_name(name), data_torch)]
except Exception:
logger.debug("mmproj(jinaclip): skip unmapped tensor %s", name)
return []
@ModelBase.register("OpenELMForCausalLM")
class OpenELMModel(TextModel):
model_arch = gguf.MODEL_ARCH.OPENELM

View File

@@ -3604,6 +3604,7 @@ class VisionProjectorType:
QWEN3VL = "qwen3vl_merger"
ULTRAVOX = "ultravox"
INTERNVL = "internvl"
JINACLIP2 = "jinaclip2"
QWEN2A = "qwen2a" # audio
GLMA = "glma" # audio
QWEN25O = "qwen2.5o" # omni

View File

@@ -19,6 +19,7 @@ add_library(mtmd
models/glm4v.cpp
models/internvl.cpp
models/kimivl.cpp
models/jinaclip2.cpp
models/llama4.cpp
models/llava.cpp
models/minicpmv.cpp

View File

@@ -44,6 +44,7 @@
#define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor"
#define KEY_SPATIAL_MERGE_SIZE "clip.vision.spatial_merge_size"
#define KEY_IS_DEEPSTACK_LAYERS "clip.vision.is_deepstack_layers"
#define KEY_VISION_ROPE_THETA "clip.vision.rope_theta"
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
@@ -75,14 +76,15 @@
#define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
#define TN_ATTN_V "%s.blk.%d.attn_v.%s"
#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s"
#define TN_ATTN_LN "%s.blk.%d.attn_ln.%s" // inner attention LayerNorm
#define TN_ATTN_K_NORM "%s.blk.%d.attn_k_norm.%s"
#define TN_ATTN_Q_NORM "%s.blk.%d.attn_q_norm.%s"
#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s"
#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s"
#define TN_FFN_UP "%s.blk.%d.ffn_up.%s"
#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s"
#define TN_LN_1 "%s.blk.%d.ln1.%s" // layer norm
#define TN_LN_2 "%s.blk.%d.ln2.%s" // layer norm
#define TN_FFN_NORM "%s.blk.%d.ffn_norm.%s"
#define TN_LN_1 "%s.blk.%d.ln1.%s"
#define TN_LN_2 "%s.blk.%d.ln2.%s"
#define TN_LS_1 "%s.blk.%d.ls1.%s" // layer scale
#define TN_LS_2 "%s.blk.%d.ls2.%s" // layer scale
#define TN_LN_PRE "%s.pre_ln.%s"
@@ -225,6 +227,7 @@ enum projector_type {
PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
PROJECTOR_TYPE_VOXTRAL,
PROJECTOR_TYPE_MUSIC_FLAMINGO,
PROJECTOR_TYPE_JINACLIP2, // JinaCLIP v2
PROJECTOR_TYPE_LFM2,
PROJECTOR_TYPE_KIMIVL,
PROJECTOR_TYPE_LIGHTONOCR,
@@ -261,6 +264,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_LFM2, "lfm2"},
{ PROJECTOR_TYPE_KIMIVL, "kimivl"},
{ PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
{ PROJECTOR_TYPE_JINACLIP2, "jinaclip2"},
{ PROJECTOR_TYPE_COGVLM, "cogvlm"},
{ PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
{ PROJECTOR_TYPE_LFM2A, "lfm2a"},

View File

@@ -813,6 +813,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
{
builder = std::make_unique<clip_graph_llama4>(ctx, img);
} break;
case PROJECTOR_TYPE_JINACLIP2:
{
builder = std::make_unique<clip_graph_jinaclip2>(ctx, img);
} break;
case PROJECTOR_TYPE_ULTRAVOX:
case PROJECTOR_TYPE_VOXTRAL:
case PROJECTOR_TYPE_QWEN2A:
@@ -1198,6 +1202,11 @@ struct clip_model_loader {
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
set_llava_uhd_res_candidates(model, 3);
} break;
case PROJECTOR_TYPE_JINACLIP2:
{
hparams.rope_theta = 10000.0f;
get_f32(KEY_VISION_ROPE_THETA, hparams.rope_theta, /*required=*/false);
} break;
case PROJECTOR_TYPE_ULTRAVOX:
case PROJECTOR_TYPE_QWEN2A:
case PROJECTOR_TYPE_GLMA:
@@ -1783,6 +1792,10 @@ struct clip_model_loader {
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
} break;
case PROJECTOR_TYPE_JINACLIP2:
{
// JinaCLIP2 is a pure vision encoder without additional projection layers.
} break;
case PROJECTOR_TYPE_LFM2A:
{
for (int i : {0, 2, 3, 5, 6}) {
@@ -3018,6 +3031,44 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
res_imgs->grid_y = inst.grid_size.height;
} break;
case PROJECTOR_TYPE_JINACLIP2:
{
clip_image_u8 processed_image;
const int sz = params.image_size;
// 1) Preserve aspect ratio: resize so that the shorter side == sz (bicubic).
const int in_w = img->nx;
const int in_h = img->ny;
if (in_w <= 0 || in_h <= 0) {
LOG_ERR("%s: invalid input image size %dx%d\n", __func__, in_w, in_h);
return false;
}
int out_w = 0, out_h = 0;
if (in_w < in_h) {
out_w = sz;
out_h = std::max(1, (int) std::round((double) in_h * sz / in_w));
} else {
out_h = sz;
out_w = std::max(1, (int) std::round((double) in_w * sz / in_h));
}
clip_image_u8 resized_keep_ratio;
img_tool::resize(*img, resized_keep_ratio, clip_image_size{out_w, out_h}, img_tool::RESIZE_ALGO_BICUBIC);
// 2) Center-crop to sz x sz.
const int x0 = std::max(0, (resized_keep_ratio.nx - sz) / 2);
const int y0 = std::max(0, (resized_keep_ratio.ny - sz) / 2);
const int crop_w = std::min(sz, resized_keep_ratio.nx);
const int crop_h = std::min(sz, resized_keep_ratio.ny);
img_tool::crop(resized_keep_ratio, processed_image, x0, y0, crop_w, crop_h);
// 3) Normalize.
clip_image_f32_ptr img_f32(clip_image_f32_init());
normalize_image_u8_to_f32(processed_image, *img_f32, params.image_mean, params.image_std);
res_imgs->entries.push_back(std::move(img_f32));
} break;
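Note: the shorter-side resize and center-crop in the case above reduce to the following standalone sketch; the 800x600 input and sz = 512 are hypothetical values chosen only to illustrate the arithmetic, not taken from any particular checkpoint.

#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
    const int sz   = 512;             // hypothetical params.image_size
    const int in_w = 800, in_h = 600; // hypothetical input image

    // 1) resize so the shorter side equals sz, preserving aspect ratio
    int out_w, out_h;
    if (in_w < in_h) {
        out_w = sz;
        out_h = std::max(1, (int) std::round((double) in_h * sz / in_w));
    } else {
        out_h = sz;
        out_w = std::max(1, (int) std::round((double) in_w * sz / in_h));
    }

    // 2) center-crop a sz x sz window
    const int x0 = std::max(0, (out_w - sz) / 2);
    const int y0 = std::max(0, (out_h - sz) / 2);

    // 800x600 -> 683x512 after resize, crop origin (85, 0)
    std::printf("resized %dx%d, crop origin (%d, %d)\n", out_w, out_h, x0, y0);
    return 0;
}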
case PROJECTOR_TYPE_LFM2:
case PROJECTOR_TYPE_KIMIVL:
{
@@ -3181,6 +3232,10 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
{
// do nothing
} break;
case PROJECTOR_TYPE_JINACLIP2:
{
n_patches = 1;
} break;
case PROJECTOR_TYPE_LDP:
case PROJECTOR_TYPE_LDPV2:
case PROJECTOR_TYPE_GLM_EDGE:
@@ -3608,6 +3663,57 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
}
set_input_i32("positions", positions);
} break;
case PROJECTOR_TYPE_JINACLIP2:
{
std::vector<int32_t> positions(n_pos);
for (int i = 0; i < n_pos; i++) {
positions[i] = i;
}
set_input_i32("positions", positions);
const int n_patches = model.class_embedding ? (n_pos - 1) : n_pos;
const int n_patches_per_col = image_size_width / patch_size;
std::vector<int32_t> pos_data(n_pos, 0);
for (int i = 0; i < n_patches; ++i) {
const int idx = model.class_embedding ? (i + 1) : i;
pos_data[idx] = i / n_patches_per_col;
}
set_input_i32("pos_h", pos_data);
std::fill(pos_data.begin(), pos_data.end(), 0);
for (int i = 0; i < n_patches; ++i) {
const int idx = model.class_embedding ? (i + 1) : i;
pos_data[idx] = i % n_patches_per_col;
}
set_input_i32("pos_w", pos_data);
int pt_seq_len = 16;
if (patch_size > 0) {
const int cand = (int) llroundf(224.0f / (float) patch_size);
if (cand > 0) {
pt_seq_len = cand;
}
}
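// s maps the current patch grid onto a 224-px reference grid of pt_seq_len patches per side (presumably the pretraining resolution)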
const float s = (float) pt_seq_len / (float) n_patches_per_col;
const int d_head_local = hparams.n_embd / hparams.n_head;
const int half_local = d_head_local / 2;
std::vector<float> rope_c_first(half_local);
std::vector<float> rope_c_second(half_local);
const float odd = std::pow(hparams.rope_theta, (float) -2.0f / (float) d_head_local);
for (int k = 0; k < half_local; ++k) {
rope_c_first[k] = 1.0f / s;
rope_c_second[k] = 1.0f / (s * odd);
}
ggml_tensor * t1 = ggml_graph_get_tensor(gf, "rope_c_first");
ggml_tensor * t2 = ggml_graph_get_tensor(gf, "rope_c_second");
GGML_ASSERT(t1 && (t1->flags & GGML_TENSOR_FLAG_INPUT));
GGML_ASSERT(t2 && (t2->flags & GGML_TENSOR_FLAG_INPUT));
ggml_backend_tensor_set(t1, rope_c_first.data(), 0, ggml_nbytes(t1));
ggml_backend_tensor_set(t2, rope_c_second.data(), 0, ggml_nbytes(t2));
} break;
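As a sanity check on the position layout above, here is a small self-contained sketch; the 4x4 patch grid, the CLS token, and pt_seq_len = 16 are hypothetical values used only for illustration.

#include <cstdio>
#include <vector>

int main() {
    const int  n_patches_per_col = 4;                   // hypothetical grid side
    const int  n_patches         = n_patches_per_col * n_patches_per_col;
    const bool has_cls           = true;
    const int  n_pos             = n_patches + (has_cls ? 1 : 0);

    // pos_h holds the row index, pos_w the column index; CLS stays at (0, 0)
    std::vector<int> pos_h(n_pos, 0), pos_w(n_pos, 0);
    for (int i = 0; i < n_patches; ++i) {
        const int idx = has_cls ? i + 1 : i;
        pos_h[idx] = i / n_patches_per_col;
        pos_w[idx] = i % n_patches_per_col;
    }

    // with pt_seq_len = 16, the frequency factors 1/s passed to ggml_rope_ext
    // effectively scale positions by s = pt_seq_len / n_patches_per_col = 4 here
    const float s = 16.0f / (float) n_patches_per_col;

    for (int i = 0; i < n_pos; ++i) {
        std::printf("tok %2d: h=%d w=%d\n", i, pos_h[i], pos_w[i]);
    }
    std::printf("s = %.1f\n", s);
    return 0;
}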
case PROJECTOR_TYPE_MLP:
case PROJECTOR_TYPE_MLP_NORM:
case PROJECTOR_TYPE_LDP:
@@ -3732,6 +3838,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
case PROJECTOR_TYPE_PIXTRAL:
case PROJECTOR_TYPE_LIGHTONOCR:
return ctx->model.mm_2_w->ne[1];
case PROJECTOR_TYPE_JINACLIP2:
return ctx->model.hparams.projection_dim;
case PROJECTOR_TYPE_MLP_NORM:
return ctx->model.mm_3_b->ne[0];
case PROJECTOR_TYPE_MINICPMV:

View File

@@ -111,6 +111,7 @@ bool clip_is_llava(const struct clip_ctx * ctx);
// note for contributor: this clip_is_(model) pattern is deprecated
// do NOT add new functions like this
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
// used by audio input

View File

@@ -0,0 +1,128 @@
#include "models.h"
#include <cmath>
ggml_cgraph * clip_graph_jinaclip2::build() {
const bool has_cls = model.class_embedding != nullptr;
GGML_ASSERT(has_cls && "JinaCLIP2 requires a CLS token");
const int n_pos = n_patches + (has_cls ? 1 : 0);
GGML_ASSERT(n_patches_x == n_patches_y && "only square images supported");
// input for learned position embeddings
ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
ggml_set_name(positions, "positions");
ggml_set_input(positions);
// inputs for 2D RoPE positions (includes CLS at index 0)
ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
ggml_set_name(pos_h, "pos_h");
ggml_set_input(pos_h);
ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
ggml_set_name(pos_w, "pos_w");
ggml_set_input(pos_w);
// frequency scaling factors for the 2D RoPE halves
GGML_ASSERT(d_head % 2 == 0);
ggml_tensor * rope_c_first = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, d_head / 2);
ggml_set_name(rope_c_first, "rope_c_first");
ggml_set_input(rope_c_first);
ggml_tensor * rope_c_second = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, d_head / 2);
ggml_set_name(rope_c_second, "rope_c_second");
ggml_set_input(rope_c_second);
ggml_tensor * inp = build_inp();
if (has_cls) {
inp = ggml_concat(ctx0, model.class_embedding, inp, 1);
}
inp = ggml_add(ctx0, inp, ggml_get_rows(ctx0, model.position_embeddings, positions));
auto apply_rope_2d = [&](ggml_tensor * cur) -> ggml_tensor * {
// cur is [d_head, n_head, n_pos]; convert to [d_head, n_pos, n_head] for convenient slicing
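// 2D RoPE: the first half of each head dim is rotated by the row position (pos_h), the second half by the column position (pos_w); the CLS token at index 0 is left unrotated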
ggml_tensor * cur_in = ggml_permute(ctx0, cur, 0, 2, 1, 3);
const int64_t n_dim = cur_in->ne[0];
const int64_t seq = cur_in->ne[1];
const int64_t nhead = cur_in->ne[2];
GGML_ASSERT(seq == n_pos);
GGML_ASSERT(n_dim % 2 == 0);
const int64_t half = n_dim / 2;
ggml_tensor * cls = nullptr;
ggml_tensor * patches = cur_in;
int64_t n_pos_patches = seq;
int64_t pos_offset = 0;
if (has_cls) {
cls = ggml_view_3d(ctx0, cur_in, n_dim, 1, nhead, cur_in->nb[1], cur_in->nb[2], 0);
patches = ggml_view_3d(ctx0, cur_in, n_dim, seq - 1, nhead, cur_in->nb[1], cur_in->nb[2], cur_in->nb[1]);
n_pos_patches = seq - 1;
pos_offset = 1;
}
// select positions for patch tokens
ggml_tensor * pos_a = ggml_view_1d(ctx0, pos_h, n_pos_patches, pos_offset * (int64_t) ggml_element_size(pos_h));
ggml_tensor * pos_b = ggml_view_1d(ctx0, pos_w, n_pos_patches, pos_offset * (int64_t) ggml_element_size(pos_w));
// first half (H)
ggml_tensor * first = ggml_view_3d(ctx0, patches,
half, nhead, n_pos_patches,
patches->nb[2], patches->nb[1], 0);
ggml_tensor * first_rot = ggml_rope_ext(
ctx0,
first,
pos_a,
rope_c_first,
half,
0, 0, hparams.rope_theta,
1.0f,
0.0f, 1.0f, 0.0f, 0.0f);
first = ggml_view_3d(ctx0, first_rot,
half, n_pos_patches, nhead,
first_rot->nb[2], first_rot->nb[1], 0);
// second half (W)
ggml_tensor * second = ggml_view_3d(ctx0, patches,
half, nhead, n_pos_patches,
patches->nb[2], patches->nb[1],
half * (int64_t) ggml_element_size(patches));
ggml_tensor * second_rot = ggml_rope_ext(
ctx0,
second,
pos_b,
rope_c_second,
half,
0, 0, hparams.rope_theta,
1.0f,
0.0f, 1.0f, 0.0f, 0.0f);
second = ggml_view_3d(ctx0, second_rot,
half, n_pos_patches, nhead,
second_rot->nb[2], second_rot->nb[1], 0);
ggml_tensor * patches_out = ggml_concat(ctx0, first, second, 0);
ggml_tensor * out_seq = has_cls ? ggml_concat(ctx0, cls, patches_out, 1) : patches_out;
return ggml_permute(ctx0, out_seq, 0, 2, 1, 3);
};
auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
return apply_rope_2d(cur);
};
ggml_tensor * cur = build_vit(
inp, n_pos,
NORM_TYPE_NORMAL,
hparams.ffn_op,
nullptr,
add_pos);
// Output: CLS embedding only (1 token).
ggml_tensor * cls = ggml_view_2d(ctx0, cur, cur->ne[0], /*rows=*/1, cur->nb[1], /*offset=*/0);
ggml_set_name(cls, "cls_view");
ggml_build_forward_expand(gf, cls);
return gf;
}

View File

@@ -52,6 +52,11 @@ struct clip_graph_kimivl : clip_graph {
ggml_cgraph * build() override;
};
struct clip_graph_jinaclip2 : clip_graph {
clip_graph_jinaclip2(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_cogvlm : clip_graph {
clip_graph_cogvlm(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;

View File

@@ -40,7 +40,12 @@ static void show_additional_info(int /*argc*/, char ** argv) {
LOG(
"Experimental CLI for multimodal\n\n"
"Usage: %s [options] -m <model> --mmproj <mmproj> --image <image> --audio <audio> -p <prompt>\n\n"
" -m and --mmproj are required\n"
" -m and --mmproj are required in chat/generation modes\n"
" Special case: when only --mmproj and --image are provided and projector is JinaCLIP,\n"
" run the projector-only embedding path (no -m needed)\n"
" Embedding output options (projector-only):\n"
" --embd-output-format {array|json|json+|''} print embedding to stdout; empty to disable\n"
" --embd-normalize {…} normalization uses common --embd-normalize\n"
" -hf user/repo can replace both -m and --mmproj in most cases\n"
" --image, --audio and -p are optional, if NOT provided, the CLI will run in chat mode\n"
" to disable using GPU for mmproj model, add --no-mmproj-offload\n",
@@ -174,6 +179,106 @@ struct mtmd_cli_context {
}
};
static int run_mmproj_only(common_params & params) {
if (params.mmproj.path.empty() || params.image.empty()) return -1;
mtmd_context_params ctx_params = mtmd_context_params_default();
ctx_params.use_gpu = params.mmproj_use_gpu;
ctx_params.verbosity = (params.verbosity > 0) ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO;
mtmd_mmproj_context * mctx = mtmd_mmproj_init(params.mmproj.path.c_str(), ctx_params);
if (!mctx) {
LOG_ERR("[ERROR] Failed to load vision mmproj: %s\n", params.mmproj.path.c_str());
return 1;
}
if (!mtmd_mmproj_is_supported(mctx)) {
mtmd_mmproj_free(mctx);
return -1;
}
const std::string fmt = params.embd_out; // "array" | "json" | "json+" | ""
const bool silent_json = !fmt.empty();
if (!silent_json) {
LOG("VISION(projector-only auto): image_size=%d patch=%d hidden=%d\n", mtmd_mmproj_get_image_size(mctx), mtmd_mmproj_get_patch_size(mctx), mtmd_mmproj_get_hidden_size(mctx));
}
bool printed_any = false;
if (fmt == "array" || fmt == "json" || fmt == "json+") {
if (fmt == "array") {
LOG("[");
} else {
LOG("{\n \"object\": \"list\",\n \"data\": [\n");
}
}
for (size_t i = 0; i < params.image.size(); ++i) {
const char * image_path = params.image[i].c_str();
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file_noctx(image_path));
if (!bmp.ptr) { LOG_ERR("[ERROR] Failed to load image %s\n", image_path); continue; }
float * emb = nullptr; size_t n_el = 0;
auto enc_start = std::chrono::high_resolution_clock::now();
int enc_rc = mtmd_mmproj_encode_bitmap(mctx, bmp.ptr.get(), params.cpuparams.n_threads, &emb, &n_el);
auto enc_end = std::chrono::high_resolution_clock::now();
auto enc_ms = std::chrono::duration_cast<std::chrono::microseconds>(enc_end - enc_start).count() / 1000.0;
if (enc_rc != 0) {
LOG_ERR("[ERROR] Image encoding failed: %s\n", image_path);
continue;
}
std::vector<float> image_embd(emb, emb + n_el);
std::free(emb);
if (!silent_json) {
LOG("IMAGE %zu/%zu: %s encode_ms=%.3f out_dim=%zu\n", i+1, params.image.size(), image_path, enc_ms, image_embd.size());
}
{
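// keep only the first 512 values (presumably a Matryoshka-style truncation of the JinaCLIP v2 embedding)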
const int truncate = 512;
if (truncate > 0 && image_embd.size() > (size_t) truncate) image_embd.resize(truncate);
}
{
const int embd_norm = params.embd_normalize;
if (embd_norm != -1) {
common_embd_normalize(image_embd.data(), image_embd.data(), (int) image_embd.size(), embd_norm);
}
}
if (fmt == "array") {
if (printed_any) LOG(",");
LOG("[");
for (size_t k = 0; k < image_embd.size(); ++k) {
if (k) LOG(",");
LOG("%.7f", image_embd[k]);
}
LOG("]");
printed_any = true;
} else if (fmt == "json" || fmt == "json+") {
if (printed_any) LOG(",\n");
LOG(" {\n \"object\": \"embedding\",\n \"index\": %zu,\n \"embedding\": ", i);
LOG("[");
for (size_t k = 0; k < image_embd.size(); ++k) {
if (k) LOG(",");
LOG("%.7f", image_embd[k]);
}
LOG("]\n }");
printed_any = true;
}
}
if (fmt == "array") {
LOG("]\n");
} else if (fmt == "json" || fmt == "json+") {
if (fmt == "json+" && params.image.size() > 1) {
LOG(",\n \"cosineSimilarity\": [\n");
for (size_t i = 0; i < params.image.size(); ++i) {
LOG(" [");
for (size_t j = 0; j < params.image.size(); ++j) {
LOG("%6.2f", 0.0f);
if (j + 1 < params.image.size()) LOG(", ");
}
LOG(" ]%s\n", (i + 1 < params.image.size() ? "," : ""));
}
LOG(" ]\n");
}
LOG("\n}\n");
}
mtmd_mmproj_free(mctx);
return 0;
}
static int generate_response(mtmd_cli_context & ctx, int n_predict) {
llama_tokens generated_tokens;
for (int i = 0; i < n_predict; i++) {
@@ -282,6 +387,12 @@ int main(int argc, char ** argv) {
return 1;
}
// try the projector-only path for JinaCLIP first (returns -1 if not applicable, in which case we fall through to the normal flow)
{
int rc = run_mmproj_only(params);
if (rc >= 0) return rc; // 0 success; 1 failure; -1 not JinaCLIP
}
common_init();
mtmd_helper_log_set(common_log_default_callback, nullptr);

View File

@@ -519,3 +519,16 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char *
return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size());
}
// image-only helper, no mtmd_context required
mtmd_bitmap * mtmd_helper_bitmap_init_from_file_noctx(const char * fname) {
int nx = 0, ny = 0, nc = 0;
unsigned char * data = stbi_load(fname, &nx, &ny, &nc, 3);
if (!data) {
LOG_ERR("%s: failed to decode image file %s\n", __func__, fname);
return nullptr;
}
mtmd_bitmap * result = mtmd_bitmap_init(nx, ny, data);
stbi_image_free(data);
return result;
}

View File

@@ -40,6 +40,10 @@ MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, con
// this function is thread-safe
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len);
// image-only helper: decode an image file without requiring an mtmd_context
// returns nullptr on failure
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file_noctx(const char * fname);
// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);

View File

@@ -1,6 +1,7 @@
#include "clip.h"
#include "clip-impl.h"
#include "mtmd.h"
#include "mtmd-helper.h"
#include "mtmd-audio.h"
#include "llama.h"
@@ -425,6 +426,112 @@ void mtmd_free(mtmd_context * ctx) {
delete ctx;
}
// ------------------------------
// Projector-only (mmproj-only) utilities
// ------------------------------
struct mtmd_mmproj_context {
clip_ctx * ctx_v = nullptr;
};
mtmd_mmproj_context * mtmd_mmproj_init(const char * mmproj_fname,
const struct mtmd_context_params ctx_params) {
clip_context_params clip_params;
clip_params.use_gpu = ctx_params.use_gpu;
clip_params.verbosity = ctx_params.verbosity;
auto res = clip_init(mmproj_fname, clip_params);
if (!res.ctx_v) {
return nullptr;
}
auto * ctx = new mtmd_mmproj_context();
ctx->ctx_v = res.ctx_v;
return ctx;
}
void mtmd_mmproj_free(struct mtmd_mmproj_context * ctx) {
if (!ctx) return;
clip_free(ctx->ctx_v);
delete ctx;
}
int mtmd_mmproj_get_image_size(struct mtmd_mmproj_context * ctx) {
return ctx && ctx->ctx_v ? clip_get_image_size(ctx->ctx_v) : -1;
}
int mtmd_mmproj_get_patch_size(struct mtmd_mmproj_context * ctx) {
return ctx && ctx->ctx_v ? clip_get_patch_size(ctx->ctx_v) : -1;
}
int mtmd_mmproj_get_hidden_size(struct mtmd_mmproj_context * ctx) {
return ctx && ctx->ctx_v ? clip_get_hidden_size(ctx->ctx_v) : -1;
}
bool mtmd_mmproj_is_jinaclip(struct mtmd_mmproj_context * ctx) {
return ctx && ctx->ctx_v ? clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_JINACLIP2 : false;
}
bool mtmd_mmproj_is_supported(struct mtmd_mmproj_context * ctx) {
if (!ctx || !ctx->ctx_v) return false;
projector_type proj = clip_get_projector_type(ctx->ctx_v);
// extendable: list of projectors supported by this mmproj-only path
switch (proj) {
case PROJECTOR_TYPE_JINACLIP2: return true;
default: return false;
}
}
int mtmd_mmproj_encode_bitmap(struct mtmd_mmproj_context * ctx,
const mtmd_bitmap * bmp,
int n_threads,
float ** out_data,
size_t * out_count) {
if (!ctx || !ctx->ctx_v || !bmp || !out_data || !out_count) {
LOG_ERR("%s: invalid args: ctx=%p ctx_v=%p bmp=%p out_data=%p out_count=%p\n",
__func__, (void*) ctx, ctx ? (void*) ctx->ctx_v : (void*) nullptr,
(void*) bmp, (void*) out_data, (void*) out_count);
return 1;
}
// convert mtmd_bitmap to clip_image_u8
clip_image_u8_ptr img_u8(clip_image_u8_init());
img_u8->nx = bmp->nx;
img_u8->ny = bmp->ny;
img_u8->buf.resize(bmp->data.size());
std::memcpy(img_u8->buf.data(), bmp->data.data(), img_u8->nx * img_u8->ny * 3);
clip_image_f32_batch batch_f32;
bool ok = clip_image_preprocess(ctx->ctx_v, img_u8.get(), &batch_f32);
if (!ok) {
LOG_ERR("%s: image preprocess failed (nx=%u ny=%u proj=%d)\n",
__func__, img_u8->nx, img_u8->ny, (int) clip_get_projector_type(ctx->ctx_v));
return 1;
}
clip_image_f32 * processed_img = clip_image_f32_get_img(&batch_f32, 0);
if (!processed_img) {
LOG_ERR("%s: preprocessed image is null\n", __func__);
return 1;
}
const int n_tok = clip_n_output_tokens(ctx->ctx_v, processed_img);
const int n_embd = clip_n_mmproj_embd(ctx->ctx_v);
const size_t n_el = (size_t) n_tok * (size_t) n_embd;
std::vector<float> buf(n_el);
if (!clip_image_encode(ctx->ctx_v, n_threads, processed_img, buf.data())) {
LOG_ERR("%s: image encode failed (threads=%d tokens=%d embd=%d)\n",
__func__, n_threads, n_tok, n_embd);
return 1;
}
float * out = (float *) std::malloc(n_el * sizeof(float));
if (!out) {
LOG_ERR("%s: malloc failed (elements=%zu bytes=%zu)\n", __func__, n_el, n_el * sizeof(float));
return 1;
}
std::memcpy(out, buf.data(), n_el * sizeof(float));
*out_data = out;
*out_count = n_el;
return 0;
}
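For context, a minimal caller sketch of the projector-only API introduced above; the file paths, the thread count, and the use of mtmd_bitmap_free are illustrative assumptions, and error handling is abbreviated.

#include "mtmd.h"
#include "mtmd-helper.h"
#include <cstdio>
#include <cstdlib>

int main() {
    mtmd_context_params cparams = mtmd_context_params_default();
    // hypothetical mmproj path
    mtmd_mmproj_context * mctx = mtmd_mmproj_init("mmproj-jinaclip2.gguf", cparams);
    if (!mctx || !mtmd_mmproj_is_supported(mctx)) {
        std::fprintf(stderr, "projector missing or unsupported by the mmproj-only path\n");
        if (mctx) mtmd_mmproj_free(mctx);
        return 1;
    }
    // hypothetical image path; decoded without an mtmd_context
    mtmd_bitmap * bmp = mtmd_helper_bitmap_init_from_file_noctx("image.jpg");
    if (!bmp) {
        mtmd_mmproj_free(mctx);
        return 1;
    }
    float * emb  = nullptr;
    size_t  n_el = 0;
    if (mtmd_mmproj_encode_bitmap(mctx, bmp, /*n_threads=*/4, &emb, &n_el) == 0) {
        std::printf("embedding with %zu values (image size %d)\n", n_el, mtmd_mmproj_get_image_size(mctx));
        std::free(emb); // mtmd_mmproj_encode_bitmap allocates the buffer with malloc
    }
    mtmd_bitmap_free(bmp);
    mtmd_mmproj_free(mctx);
    return 0;
}

This mirrors what run_mmproj_only in mtmd-cli does, minus the output formatting.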
struct mtmd_tokenizer {
mtmd_context * ctx;
std::vector<const mtmd_bitmap *> bitmaps;

View File

@@ -231,6 +231,32 @@ MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
// If this is not called, or NULL is supplied, everything is output on stderr.
MTMD_API void mtmd_log_set(ggml_log_callback log_callback, void * user_data);
struct mtmd_mmproj_context;
// initialize a minimal context that only loads the projector (vision) from a GGUF file
// returns nullptr on failure
MTMD_API struct mtmd_mmproj_context * mtmd_mmproj_init(const char * mmproj_fname,
const struct mtmd_context_params ctx_params);
// free projector-only context
MTMD_API void mtmd_mmproj_free(struct mtmd_mmproj_context * ctx);
// basic queries
MTMD_API int mtmd_mmproj_get_image_size (struct mtmd_mmproj_context * ctx);
MTMD_API int mtmd_mmproj_get_patch_size (struct mtmd_mmproj_context * ctx);
MTMD_API int mtmd_mmproj_get_hidden_size(struct mtmd_mmproj_context * ctx);
MTMD_API bool mtmd_mmproj_is_jinaclip (struct mtmd_mmproj_context * ctx);
// generic support check for projector-only encode path
MTMD_API bool mtmd_mmproj_is_supported (struct mtmd_mmproj_context * ctx);
// encode a bitmap (RGB) to projector embeddings
// returns 0 on success, 1 on failure
MTMD_API int mtmd_mmproj_encode_bitmap(struct mtmd_mmproj_context * ctx,
const mtmd_bitmap * bmp,
int n_threads,
float ** out_data,
size_t * out_count);
/////////////////////////////////////////
// test function, to be used in test-mtmd-c-api.c