#include "models.h" #include ggml_cgraph * clip_graph_jinaclip2::build() { const bool has_cls = model.class_embedding != nullptr; GGML_ASSERT(has_cls && "JinaCLIP2 requires a CLS token"); const int n_pos = n_patches + (has_cls ? 1 : 0); GGML_ASSERT(n_patches_x == n_patches_y && "only square images supported"); ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); ggml_set_name(positions, "positions"); ggml_set_input(positions); ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); ggml_set_name(pos_h, "pos_h"); ggml_set_input(pos_h); ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); ggml_set_name(pos_w, "pos_w"); ggml_set_input(pos_w); GGML_ASSERT(d_head % 2 == 0); ggml_tensor * rope_c_first = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, d_head / 2); ggml_set_name(rope_c_first, "rope_c_first"); ggml_set_input(rope_c_first); ggml_tensor * rope_c_second = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, d_head / 2); ggml_set_name(rope_c_second, "rope_c_second"); ggml_set_input(rope_c_second); ggml_tensor * inp = build_inp(); if (has_cls) { inp = ggml_concat(ctx0, model.class_embedding, inp, 1); } inp = ggml_add(ctx0, inp, ggml_get_rows(ctx0, model.position_embeddings, positions)); auto apply_rope_2d = [&](ggml_tensor * cur) -> ggml_tensor * { ggml_tensor * cur_in = ggml_permute(ctx0, cur, 0, 2, 1, 3); const int64_t n_dim = cur_in->ne[0]; const int64_t seq = cur_in->ne[1]; const int64_t nhead = cur_in->ne[2]; GGML_ASSERT(seq == n_pos); GGML_ASSERT(n_dim % 2 == 0); const int64_t half = n_dim / 2; ggml_tensor * cls = nullptr; ggml_tensor * patches = cur_in; int64_t n_pos_patches = seq; int64_t pos_offset = 0; if (has_cls) { cls = ggml_view_3d(ctx0, cur_in, n_dim, 1, nhead, cur_in->nb[1], cur_in->nb[2], 0); patches = ggml_view_3d(ctx0, cur_in, n_dim, seq - 1, nhead, cur_in->nb[1], cur_in->nb[2], cur_in->nb[1]); n_pos_patches = seq - 1; pos_offset = 1; } // select positions ggml_tensor * pos_a = ggml_view_1d(ctx0, pos_h, n_pos_patches, pos_offset * (int64_t) ggml_element_size(pos_h)); ggml_tensor * pos_b = ggml_view_1d(ctx0, pos_w, n_pos_patches, pos_offset * (int64_t) ggml_element_size(pos_w)); ggml_tensor * first = ggml_view_3d(ctx0, patches, half, nhead, n_pos_patches, patches->nb[2], patches->nb[1], 0); ggml_tensor * first_rot = ggml_rope_ext( ctx0, first, pos_a, rope_c_first, half, 0, 0, hparams.rope_theta, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); first = ggml_view_3d(ctx0, first_rot, half, n_pos_patches, nhead, first_rot->nb[2], first_rot->nb[1], 0); ggml_tensor * second = ggml_view_3d(ctx0, patches, half, nhead, n_pos_patches, patches->nb[2], patches->nb[1], half * (int64_t) ggml_element_size(patches)); ggml_tensor * second_rot = ggml_rope_ext( ctx0, second, pos_b, rope_c_second, half, 0, 0, hparams.rope_theta, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); second = ggml_view_3d(ctx0, second_rot, half, n_pos_patches, nhead, second_rot->nb[2], second_rot->nb[1], 0); ggml_tensor * patches_out = ggml_concat(ctx0, first, second, 0); ggml_tensor * out_seq = has_cls ? ggml_concat(ctx0, cls, patches_out, 1) : patches_out; return ggml_permute(ctx0, out_seq, 0, 2, 1, 3); }; auto add_pos = [&](ggml_tensor * cur, const clip_layer &) { return apply_rope_2d(cur); }; ggml_tensor * cur = build_vit( inp, n_pos, NORM_TYPE_NORMAL, hparams.ffn_op, nullptr, add_pos); ggml_tensor * cls = ggml_view_2d(ctx0, cur, cur->ne[0], 1, cur->nb[1], 0); ggml_set_name(cls, "cls_view"); ggml_build_forward_expand(gf, cls); return gf; }