#include "models.h"

#include <cmath> // std::sqrt

// Builds the compute graph for the Gemma4V vision encoder.
//
// Stages, in order:
//   1. rescale the raw input (x * 2 - 1) and run the patch-embedding convolution
//   2. add positional embeddings looked up separately for x and y
//   3. run the ViT layers via build_vit(), applying rotary embeddings per half
//      of the head dimension (x positions on the first half, y on the second)
//   4. average-pool with kernel/stride hparams.n_merge and scale by sqrt(n_embd)
//   5. optionally standardize with model.std_bias / model.std_scale
//   6. project with build_mm() and apply a final RMS norm
//
// Returns the graph `gf` after expanding it with the final tensor.
ggml_cgraph * clip_graph_gemma4v::build() {
    ggml_tensor * inp_raw = build_inp_raw();

    // patches = 2 * (patches - 0.5)
    // equivalent to: patches * 2 - 1
    inp_raw = ggml_scale_bias(ctx0, inp_raw, 2.0f, -1.0f);
    ggml_set_name(inp_raw, "inp_raw_scaled");

    // note: no patch bias
    ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
    inp = ggml_reshape_2d(ctx0, inp, n_patches, n_embd);
    inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
    ggml_set_name(inp, "inp");

    // per-patch x / y position indices; both are graph inputs
    ggml_tensor * pos_x = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
    ggml_set_name(pos_x, "pos_x");
    ggml_set_input(pos_x);

    ggml_tensor * pos_y = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
    ggml_set_name(pos_y, "pos_y");
    ggml_set_input(pos_y);

    {
        const int64_t pos_size = model.position_embeddings->ne[1];
        const size_t  nb1      = ggml_row_size(model.position_embeddings->type, n_embd);

        // positional embeddings are stored as lookup tables (one for x, one for y);
        // the y table starts pos_size rows after the x table
        ggml_tensor * tbl_x = ggml_view_2d(ctx0, model.position_embeddings, n_embd, pos_size, nb1, 0);
        ggml_tensor * tbl_y = ggml_view_2d(ctx0, model.position_embeddings, n_embd, pos_size, nb1, pos_size * nb1);

        // ggml_get_rows: [n_embd, n_patches]
        ggml_tensor * emb_x = ggml_get_rows(ctx0, tbl_x, pos_x);
        ggml_tensor * emb_y = ggml_get_rows(ctx0, tbl_y, pos_y);

        inp = ggml_add(ctx0, inp, emb_x);
        inp = ggml_add(ctx0, inp, emb_y);
        cb(inp, "pos_embd", -1);
    }

    // similar to build_rope_2d, but use neox ordering
    auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
        const int64_t n_dim  = cur->ne[0];
        const int64_t n_head = cur->ne[1];
        const int64_t n_pos  = cur->ne[2];
        const int64_t n_half = n_dim / 2;

        // views n_half elements of dim 0 starting at byte `offset`
        // and applies rope to that view using the given positions
        auto rope_half = [&](size_t offset, ggml_tensor * pos) {
            ggml_tensor * half = ggml_view_3d(ctx0, cur,
                n_half, n_head, n_pos,
                cur->nb[1], cur->nb[2],
                offset);
            return ggml_rope_ext(
                ctx0,
                half,
                pos,      // positions
                nullptr,  // freq factors
                n_half,   // n_dims
                GGML_ROPE_TYPE_NEOX, 0, hparams.rope_theta,
                1.0f, 0.0f, 1.0f, 0.0f, 0.0f
            );
        };

        // first half uses x positions, second half uses y positions
        ggml_tensor * first  = rope_half(0, pos_x);
        ggml_tensor * second = rope_half(n_half * ggml_element_size(cur), pos_y);

        return ggml_concat(ctx0, first, second, 0);
    };

    // NOTE(review): presumably this disables the usual attention score scaling
    // inside build_vit — confirm against build_vit
    kq_scale = 1.0f;

    ggml_tensor * cur = build_vit(
        inp, n_patches,
        NORM_TYPE_RMS,
        hparams.ffn_op,
        nullptr, // pos embd is already handled above
        add_pos);

    // Gemma4VisionPooler
    {
        const int kernel_size = hparams.n_merge;
        GGML_ASSERT(kernel_size > 0);

        // [n_embd, n_patches] -> [n_patches_x, n_patches_y, n_embd, 1]
        cur = ggml_cont_4d(ctx0, ggml_transpose(ctx0, cur), n_patches_x, n_patches_y, n_embd, 1);
        cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG,
            kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);

        const int out_x = n_patches_x / kernel_size;
        const int out_y = n_patches_y / kernel_size;

        // [out_x, out_y, n_embd, 1] -> [n_embd, out_x * out_y]
        cur = ggml_reshape_3d(ctx0, cur, out_x * out_y, n_embd, 1);
        cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
        cur = ggml_scale(ctx0, cur, std::sqrt(static_cast<float>(n_embd)));
        cb(cur, "pooled", -1);
    }

    // hidden_states = (hidden_states - self.std_bias) * self.std_scale
    // (skipped unless both tensors are present)
    if (model.std_bias && model.std_scale) {
        cur = ggml_sub(ctx0, cur, model.std_bias);
        cur = ggml_mul(ctx0, cur, model.std_scale);
        cb(cur, "std_scaled", -1);
    }

    // Gemma4MultimodalEmbedder
    cur = build_mm(model.mm_input_proj_w, cur);
    cb(cur, "projected", -1);

    // embedding_post_projection_norm
    cur = ggml_rms_norm(ctx0, cur, hparams.eps);
    cb(cur, "projected_normed", -1);

    ggml_build_forward_expand(gf, cur);
    return gf;
}

// Matrix multiplication `w * x` with optional clamping (Gemma4ClippableLinear).
//
// Looks up `w->name` in model.clamp_info_map:
//   - no entry: returns a plain ggml_mul_mat(w, x)
//   - entry found: clamps `x` to [inp_min, inp_max], multiplies, then clamps
//     the result to [out_min, out_max]
ggml_tensor * clip_graph_gemma4v::build_mm(ggml_tensor * w, ggml_tensor * x) const {
    const auto it = model.clamp_info_map.find(w->name);
    if (it == model.clamp_info_map.end()) {
        return ggml_mul_mat(ctx0, w, x);
    }

    const auto & clamp_info = it->second;

    ggml_tensor * clamped = ggml_clamp(ctx0, x, clamp_info.inp_min, clamp_info.inp_max);
    ggml_tensor * out     = ggml_mul_mat(ctx0, w, clamped);
    return ggml_clamp(ctx0, out, clamp_info.out_min, clamp_info.out_max);
}