concat image_newline and image_seperator tokens

Saba Fallah 2025-11-18 09:43:11 +01:00
parent 331cea8f8e
commit 63a042f21e
2 changed files with 33 additions and 36 deletions

tools/mtmd/clip-impl.h

@@ -91,7 +91,7 @@
 #define TN_MM_INP_NORM_B   "mm.input_norm.bias"
 #define TN_MM_INP_PROJ     "mm.input_projection.weight" // gemma3
 #define TN_MM_SOFT_EMB_N   "mm.soft_emb_norm.weight" // gemma3
-#define TN_MM_PROJECTOR    "mm.model.fc.weight" // idefics3
+#define TN_MM_PROJECTOR    "mm.model.fc.%s" // idefics3, deepseekocr
 #define TN_MM_PATCH_MERGER "mm.patch_merger.weight" // mistral small 3.1
 #define TN_TOK_IMG_BREAK   "v.token_embd.img_break" // pixtral
 #define TN_TOK_GLM_BOI     "adapter.boi" // glm-edge (these embeddings are not in text model)
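
Since TN_MM_PROJECTOR is now a printf-style pattern rather than a fixed name, the loader hunks below expand it once for "weight" and once for "bias". A self-contained sketch of that expansion; format_name here is a stand-in for the repo's string_format helper, which is assumed to be printf-like:

#include <cstdio>
#include <string>

#define TN_MM_PROJECTOR "mm.model.fc.%s" // idefics3, deepseekocr

// stand-in for the loader's string_format helper so this compiles on its own
static std::string format_name(const char * fmt, const char * suffix) {
    char buf[128];
    std::snprintf(buf, sizeof(buf), fmt, suffix);
    return buf;
}

int main() {
    // the loader now asks for both the weight and the bias:
    std::printf("%s\n", format_name(TN_MM_PROJECTOR, "weight").c_str()); // mm.model.fc.weight
    std::printf("%s\n", format_name(TN_MM_PROJECTOR, "bias").c_str());   // mm.model.fc.bias
    return 0;
}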

tools/mtmd/clip.cpp

@@ -316,7 +316,8 @@ struct clip_model {
     ggml_tensor * post_ln_w;
     ggml_tensor * post_ln_b;

-    ggml_tensor * projection; // TODO: rename it to fc (fully connected layer)
+    ggml_tensor * fc_w;
+    ggml_tensor * fc_b;

     ggml_tensor * mm_fc_w;
     ggml_tensor * mm_fc_b;
@@ -623,7 +624,7 @@ struct clip_graph {
             // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
             const int scale_factor = model.hparams.n_merge;
             cur = build_patch_merge_permute(cur, scale_factor);
-            cur = ggml_mul_mat(ctx0, model.projection, cur);
+            cur = ggml_mul_mat(ctx0, model.fc_w, cur);

         } else if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) {
             // pixel unshuffle block
@@ -844,15 +845,12 @@ struct clip_graph {
                 ggml_row_size(global_features_2->type, n_embd), 0);
         ggml_tensor * global_features = ggml_concat(ctx0, global_features_2, global_features_1, 1);

-        global_features = build_global_local_features(
-            ctx0,
-            global_features,
-            n_patches_y,
-            n_patches_x,
-            n_embd
-        );
+        global_features = ggml_reshape_2d(ctx0, global_features, 2 * n_embd, n_patches);
+        global_features = ggml_cont(ctx0, global_features);
+        global_features = ggml_mul_mat(ctx0, model.fc_w, global_features);
+        global_features = ggml_add(ctx0, global_features, model.fc_b);
+        global_features = build_global_local_features(ctx0, global_features);

         ggml_build_forward_expand(gf, global_features);
         return gf;
     }
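
For orientation, this is the shape flow of the rewritten block as a standalone sketch. The sizes (n_embd = 1280, a 64x64 patch grid) and fc_w mapping 2*n_embd to n_embd are assumptions inferred from the constants hardcoded in build_global_local_features below, not values stated by the commit:

#include "ggml.h"
#include <cstdio>

// Shape-only walkthrough of the new projection path. no_alloc is set, so no
// tensor data is allocated; ggml only computes the resulting shapes.
int main() {
    ggml_init_params params = { /*mem_size*/ 16*1024*1024, /*mem_buffer*/ nullptr, /*no_alloc*/ true };
    ggml_context * ctx = ggml_init(params);

    const int64_t n_embd = 1280, n_patches = 64 * 64; // assumed sizes

    ggml_tensor * gf1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_patches);
    ggml_tensor * gf2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_patches);

    // stack the two feature sets on the token axis: [n_embd, 2*n_patches]
    ggml_tensor * cat = ggml_concat(ctx, gf2, gf1, 1);

    // reinterpret the buffer so every two consecutive n_embd rows fuse into
    // one 2*n_embd feature: [2*n_embd, n_patches]
    ggml_tensor * x = ggml_reshape_2d(ctx, cat, 2 * n_embd, n_patches);

    // fc_w: ne = {n_in = 2*n_embd, n_out = n_embd}
    ggml_tensor * fc_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2 * n_embd, n_embd);
    ggml_tensor * y = ggml_mul_mat(ctx, fc_w, x); // [n_embd, n_patches]

    printf("projected: [%lld, %lld]\n", (long long) y->ne[0], (long long) y->ne[1]);
    ggml_free(ctx);
    return 0;
}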
@@ -861,41 +859,31 @@ struct clip_graph {
     // view_separator: [n_dim]
     ggml_tensor * build_global_local_features(ggml_context * ctx0,
-                                              ggml_tensor * global_features,
-                                              int h,
-                                              int w,
-                                              int n_dim) {
+                                              ggml_tensor * global_features) {
         GGML_ASSERT(model.image_newline != nullptr);
         GGML_ASSERT(model.view_seperator != nullptr);
-        GGML_ASSERT(global_features->ne[0] == static_cast<int64_t>(n_dim));
-        GGML_ASSERT(global_features->ne[1] == static_cast<int64_t>(2 * (h * w)));

         // 1) global_features: [n_dim, h*w] -> [n_dim, w, h] -> [h, w, n_dim]
-        ggml_tensor * t = ggml_reshape_3d(ctx0, global_features, n_dim, w, h); // (n_dim, w, h)
+        ggml_tensor * t = ggml_reshape_4d(ctx0, global_features, 1280, 64, 64, 1); // (n_dim, w, h)
         t = ggml_permute(ctx0, t, 2, 1, 0, 3); // (h, w, n_dim)

-        // 2) image_newline: [n_dim] -> [1, 1, n_dim] -> repeat to [h, 1, n_dim]
-        ggml_tensor * nl = ggml_reshape_3d(ctx0, model.image_newline, 1, 1, n_dim); // (1, 1, n_dim)
-        ggml_tensor * nl_target_shape = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, h, n_dim); // (1, h, n_dim)
-        nl = ggml_repeat(ctx0, nl, nl_target_shape); // (1, h, n_dim)
-        nl = ggml_permute(ctx0, nl, 1, 0, 2, 3); // (h, 1, n_dim)
+        ggml_tensor * nl = ggml_permute(ctx0, model.image_newline, 2, 1, 0, 3);
+        nl = ggml_repeat_4d(ctx0, nl, 64, 1, 1280, 1); // n_pos rows

-        // 3) concat along width dimension (dim=1): (h, w, n_dim) + (h, 1, n_dim) -> (h, w+1, n_dim)
         t = ggml_concat(ctx0, t, nl, 1); // (h, w+1, n_dim)

-        // 4) flatten back to token axis: (h, w+1, n_dim) -> (n_dim, h*(w+1))
-        t = ggml_permute(ctx0, t, 2, 1, 0, 3); // (n_dim, w+1, h)
-        t = ggml_cont_2d(ctx0, t, n_dim, (w + 1) * h); // (n_dim, h*(w+1))
+        t = ggml_reshape_2d(ctx0, t, 1280, 64 * (64 + 1)); // (n_dim, h*(w+1))

         // 5) append view_separator as an extra "token":
         // view_separator: [n_dim] -> [n_dim, 1]
-        ggml_tensor * vs = ggml_reshape_2d(ctx0, model.view_seperator, n_dim, 1); // (n_dim, 1)
+        ggml_tensor * vs = ggml_reshape_2d(ctx0, model.view_seperator, 1280, 1); // (n_dim, 1)

         // concat along token dimension (dim=1):
-        ggml_tensor * global_local_features = ggml_concat(ctx0, t, vs, 1); // (n_dim, h*(w+1) + 1)
-        return global_local_features;
+        t = ggml_concat(ctx0, t, vs, 1); // (n_dim, h*(w+1) + 1)
+        return t;
     }
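
The committed body pins n_dim = 1280 and a 64x64 grid. For comparison, a hypothetical parameterized variant in the spirit of the pre-change code; it is not part of the commit, takes the embedding tensors as arguments, and keeps the explicit permute-back before flattening where the committed code reshapes the concat result directly:

#include "ggml.h"

// Hypothetical generalization of build_global_local_features with the
// hardcoded 1280/64/64 replaced by values derived from the input tensor.
ggml_tensor * build_global_local_features_sketch(ggml_context * ctx0,
                                                 ggml_tensor * global_features,
                                                 ggml_tensor * image_newline,
                                                 ggml_tensor * view_seperator,
                                                 int h, int w) {
    const int64_t n_dim = global_features->ne[0];
    GGML_ASSERT(global_features->ne[1] == (int64_t) h * w);

    // [n_dim, h*w] -> (n_dim, w, h, 1) -> view as (h, w, n_dim)
    ggml_tensor * t = ggml_reshape_4d(ctx0, global_features, n_dim, w, h, 1);
    t = ggml_permute(ctx0, t, 2, 1, 0, 3);

    // image_newline: [n_dim] -> (1, 1, n_dim) -> one newline column per row
    ggml_tensor * nl = ggml_permute(ctx0, image_newline, 2, 1, 0, 3);
    nl = ggml_repeat_4d(ctx0, nl, h, 1, n_dim, 1); // (h, 1, n_dim)

    t = ggml_concat(ctx0, t, nl, 1); // (h, w+1, n_dim)

    // flatten back to the token axis: (n_dim, h*(w+1))
    t = ggml_permute(ctx0, t, 2, 1, 0, 3);
    t = ggml_cont_2d(ctx0, t, n_dim, (int64_t) (w + 1) * h);

    // append the view separator as one extra token
    ggml_tensor * vs = ggml_reshape_2d(ctx0, view_seperator, n_dim, 1);
    return ggml_concat(ctx0, t, vs, 1); // (n_dim, h*(w+1) + 1)
}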
@@ -3488,7 +3476,7 @@ struct clip_model_loader {
                 } break;
             case PROJECTOR_TYPE_IDEFICS3:
                 {
-                    model.projection = get_tensor(TN_MM_PROJECTOR);
+                    model.fc_w = get_tensor(string_format(TN_MM_PROJECTOR, "weight"));
                 } break;
             case PROJECTOR_TYPE_LFM2:
             case PROJECTOR_TYPE_KIMIVL:
@@ -3561,13 +3549,13 @@ struct clip_model_loader {
                 } break;
             case PROJECTOR_TYPE_LLAMA4:
                 {
-                    model.mm_model_proj = get_tensor(TN_MM_PROJECTOR);
+                    model.mm_model_proj = get_tensor(string_format(TN_MM_PROJECTOR, "weight"));
                     model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
                     model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight"));
                 } break;
             case PROJECTOR_TYPE_COGVLM:
                 {
-                    model.mm_model_proj = get_tensor(TN_MM_PROJECTOR);
+                    model.mm_model_proj = get_tensor(string_format(TN_MM_PROJECTOR, "weight"));
                     model.mm_post_fc_norm_w = get_tensor(string_format(TN_MM_POST_FC_NORM, "weight"));
                     model.mm_post_fc_norm_b = get_tensor(string_format(TN_MM_POST_FC_NORM, "bias"));
                     model.mm_h_to_4h_w = get_tensor(string_format(TN_MM_H_TO_4H, "weight"));
@@ -3617,6 +3605,9 @@ struct clip_model_loader {
                 }
                 model.image_newline = get_tensor(TN_IMAGE_NEWLINE, false);
                 model.view_seperator = get_tensor(TN_IMAGE_SEPERATOR, false);
+
+                model.fc_w = get_tensor(string_format(TN_MM_PROJECTOR, "weight"));
+                model.fc_b = get_tensor(string_format(TN_MM_PROJECTOR, "bias"));

                 break;
             default:
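
Note the asymmetry in this hunk: image_newline and view_seperator are fetched with required = false (missing tensors come back null, hence the GGML_ASSERT checks in the graph), while the new fc_w/fc_b lookups use the default required behavior. A hypothetical stand-in showing that lookup pattern; the real get_tensor lives on clip_model_loader and its exact signature is an assumption here:

#include <map>
#include <stdexcept>
#include <string>

struct ggml_tensor; // opaque for this sketch

// Required tensors throw on absence; optional ones (required == false) may
// come back as nullptr, which is why the graph null-checks image_newline
// and view_seperator before using them.
struct tensor_table {
    std::map<std::string, ggml_tensor *> tensors;

    ggml_tensor * get_tensor(const std::string & name, bool required = true) {
        auto it = tensors.find(name);
        if (it == tensors.end()) {
            if (required) {
                throw std::runtime_error("missing tensor: " + name);
            }
            return nullptr; // optional tensor absent
        }
        return it->second;
    }
};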
@@ -5086,6 +5077,10 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img)
             {
                 n_patches += 2; // for BOI and EOI token embeddings
             } break;
+        case PROJECTOR_TYPE_DEEPSEEKOCR:
+            {
+                n_patches += 2;
+            } break;
         default:
             GGML_ABORT("unsupported projector type");
     }
@@ -5512,7 +5507,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         case PROJECTOR_TYPE_GEMMA3:
             return ctx->model.mm_input_proj_w->ne[0];
         case PROJECTOR_TYPE_IDEFICS3:
-            return ctx->model.projection->ne[1];
+            return ctx->model.fc_w->ne[1];
         case PROJECTOR_TYPE_ULTRAVOX:
         case PROJECTOR_TYPE_VOXTRAL:
             return ctx->model.mm_2_w->ne[1];
@@ -5527,6 +5522,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
             return ctx->model.mm_2_w->ne[1];
         case PROJECTOR_TYPE_COGVLM:
             return ctx->model.mm_4h_to_h_w->ne[1];
+        case PROJECTOR_TYPE_DEEPSEEKOCR:
+            return ctx->model.fc_w->ne[1];
         default:
             GGML_ABORT("Unknown projector type");
     }
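
Reading the projected width off fc_w->ne[1] follows ggml's mul_mat shape convention: for a weight w with ne = {n_in, n_out}, ggml_mul_mat(ctx, w, x) yields a result with ne[0] == w->ne[1]. A throwaway check with arbitrary sizes:

#include "ggml.h"
#include <cassert>

// Standalone check of the convention clip_n_mmproj_embd relies on:
// the output width of a projection is the weight's ne[1].
int main() {
    ggml_init_params params = { /*mem_size*/ 16*1024*1024, /*mem_buffer*/ nullptr, /*no_alloc*/ true };
    ggml_context * ctx = ggml_init(params);

    ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2560, 1280); // {n_in, n_out}
    ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2560, 4096); // {n_in, n_tokens}
    ggml_tensor * y = ggml_mul_mat(ctx, w, x);

    assert(y->ne[0] == w->ne[1]); // 1280: what clip_n_mmproj_embd returns
    assert(y->ne[1] == x->ne[1]); // 4096 tokens, unchanged

    ggml_free(ctx);
    return 0;
}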