llama.cpp/tools/mtmd/models/jinaclip2.cpp

#include "models.h"
#include <cmath>
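
// graph builder for JinaCLIP2: a ViT with a CLS token whose attention uses
// 2D rotary position embeddings (RoPE) over the patch grid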
ggml_cgraph * clip_graph_jinaclip2::build() {
    const bool has_cls = model.class_embedding != nullptr;
    GGML_ASSERT(has_cls && "JinaCLIP2 requires a CLS token");
    const int n_pos = n_patches + (has_cls ? 1 : 0);
    GGML_ASSERT(n_patches_x == n_patches_y && "only square images supported");
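
    // indices used to gather the learned absolute position embeddings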
    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
    ggml_set_name(positions, "positions");
    ggml_set_input(positions);
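
    // per-token row/column coordinates of each patch, filled by the runtime;
    // pos_h drives the first RoPE half, pos_w the second (see apply_rope_2d below)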
    ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
    ggml_set_name(pos_h, "pos_h");
    ggml_set_input(pos_h);

    ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
    ggml_set_name(pos_w, "pos_w");
    ggml_set_input(pos_w);
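
    // per-dimension frequency factors for the two RoPE halves
    // (passed as the freq_factors argument of ggml_rope_ext)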
    GGML_ASSERT(d_head % 2 == 0);
    ggml_tensor * rope_c_first = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, d_head / 2);
    ggml_set_name(rope_c_first, "rope_c_first");
    ggml_set_input(rope_c_first);

    ggml_tensor * rope_c_second = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, d_head / 2);
    ggml_set_name(rope_c_second, "rope_c_second");
    ggml_set_input(rope_c_second);
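
    // patch embeddings; prepend the CLS token and add the learned position embeddings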
    ggml_tensor * inp = build_inp();
    if (has_cls) {
        inp = ggml_concat(ctx0, model.class_embedding, inp, 1);
    }
    inp = ggml_add(ctx0, inp, ggml_get_rows(ctx0, model.position_embeddings, positions));
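
    // 2D RoPE applied per attention head: the CLS row is left unrotated, the first
    // half of each head dimension is rotated using pos_h and the second half using pos_w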
    auto apply_rope_2d = [&](ggml_tensor * cur) -> ggml_tensor * {
        // bring positions to dim 1 and heads to dim 2: [n_dim, seq, nhead]
        ggml_tensor * cur_in = ggml_permute(ctx0, cur, 0, 2, 1, 3);
        const int64_t n_dim = cur_in->ne[0];
        const int64_t seq   = cur_in->ne[1];
        const int64_t nhead = cur_in->ne[2];
        GGML_ASSERT(seq == n_pos);
        GGML_ASSERT(n_dim % 2 == 0);
        const int64_t half = n_dim / 2;

        // split off the CLS row so that only the patch tokens get rotated
        ggml_tensor * cls     = nullptr;
        ggml_tensor * patches = cur_in;
        int64_t n_pos_patches = seq;
        int64_t pos_offset    = 0;
        if (has_cls) {
            cls     = ggml_view_3d(ctx0, cur_in, n_dim, 1,       nhead, cur_in->nb[1], cur_in->nb[2], 0);
            patches = ggml_view_3d(ctx0, cur_in, n_dim, seq - 1, nhead, cur_in->nb[1], cur_in->nb[2], cur_in->nb[1]);
            n_pos_patches = seq - 1;
            pos_offset = 1;
        }
        // select the patch positions, skipping the CLS slot when present
        ggml_tensor * pos_a = ggml_view_1d(ctx0, pos_h, n_pos_patches, pos_offset * (int64_t) ggml_element_size(pos_h));
        ggml_tensor * pos_b = ggml_view_1d(ctx0, pos_w, n_pos_patches, pos_offset * (int64_t) ggml_element_size(pos_w));
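
        // first half of each head dimension, rotated with pos_a / rope_c_first;
        // the view puts heads on dim 1 and positions on dim 2 as ggml_rope_ext expects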
        ggml_tensor * first = ggml_view_3d(ctx0, patches,
            half, nhead, n_pos_patches,
            patches->nb[2], patches->nb[1], 0);
        ggml_tensor * first_rot = ggml_rope_ext(
            ctx0,
            first,
            pos_a,
            rope_c_first,
            half,
            0, 0, hparams.rope_theta,
            1.0f,
            0.0f, 1.0f, 0.0f, 0.0f);
        first = ggml_view_3d(ctx0, first_rot,
            half, n_pos_patches, nhead,
            first_rot->nb[2], first_rot->nb[1], 0);
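
        // second half of each head dimension, rotated with pos_b / rope_c_second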
        ggml_tensor * second = ggml_view_3d(ctx0, patches,
            half, nhead, n_pos_patches,
            patches->nb[2], patches->nb[1],
            half * (int64_t) ggml_element_size(patches));
        ggml_tensor * second_rot = ggml_rope_ext(
            ctx0,
            second,
            pos_b,
            rope_c_second,
            half,
            0, 0, hparams.rope_theta,
            1.0f,
            0.0f, 1.0f, 0.0f, 0.0f);
        second = ggml_view_3d(ctx0, second_rot,
            half, n_pos_patches, nhead,
            second_rot->nb[2], second_rot->nb[1], 0);
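
        // reassemble the two halves, re-attach the CLS row and undo the initial permute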
        ggml_tensor * patches_out = ggml_concat(ctx0, first, second, 0);
        ggml_tensor * out_seq = has_cls ? ggml_concat(ctx0, cls, patches_out, 1) : patches_out;
        return ggml_permute(ctx0, out_seq, 0, 2, 1, 3);
    };
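
    // position callback passed to build_vit: apply the 2D RoPE to the given tensor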
    auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
        return apply_rope_2d(cur);
    };
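
    // run the ViT encoder, injecting the 2D RoPE through the add_pos callback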
    ggml_tensor * cur = build_vit(
        inp, n_pos,
        NORM_TYPE_NORMAL,
        hparams.ffn_op,
        nullptr,
        add_pos);
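
    // expose only the CLS token (first row of the final hidden states) as the graph output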
    ggml_tensor * cls = ggml_view_2d(ctx0, cur, cur->ne[0], 1, cur->nb[1], 0);
    ggml_set_name(cls, "cls_view");
    ggml_build_forward_expand(gf, cls);
    return gf;
}