// llama.cpp/tools/mtmd/models/step3vl.cpp
// (82 lines, 2.6 KiB, C++)

#include "models.h"
// Builds the compute graph for the Step3-VL vision tower:
//   patch input -> build_vit (resized position embeddings + 2D RoPE)
//   -> two stride-2 conv downsamplers (each with an optional bias)
//   -> flatten to [channels, tokens] -> linear projector.
// Returns the graph `gf` after expanding it with the projector output.
ggml_cgraph * clip_graph_step3vl::build() {
    // This graph expects no class token, but needs patch and position embeddings.
    GGML_ASSERT(model.class_embedding == nullptr);
    GGML_ASSERT(model.patch_embeddings_0 != nullptr);
    GGML_ASSERT(model.position_embeddings != nullptr);

    // Creates a named I32 graph input holding one entry per patch.
    auto make_patch_index_input = [&](const char * name) -> ggml_tensor * {
        ggml_tensor * t = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
        ggml_set_name(t, name);
        ggml_set_input(t);
        return t;
    };

    // Position inputs consumed by the 2D RoPE below.
    ggml_tensor * rope_pos_h = make_patch_index_input("pos_h");
    ggml_tensor * rope_pos_w = make_patch_index_input("pos_w");

    ggml_tensor * patch_inp     = build_inp();
    ggml_tensor * resized_pos_e = resize_position_embeddings();

    // Per-layer positional hook handed to build_vit: applies 2D RoPE.
    auto apply_rope_2d = [&](ggml_tensor * x, const clip_layer &) {
        return build_rope_2d(ctx0, x, rope_pos_w, rope_pos_h, hparams.rope_theta, false);
    };

    // Adds `bias` to a [w, h, c] tensor. The tensor is temporarily laid out as
    // [c, w*h] so that the bias is added along the channel axis, then restored.
    // A null bias leaves the input untouched.
    auto add_channel_bias = [&](ggml_tensor * x, ggml_tensor * bias) -> ggml_tensor * {
        if (!bias) {
            return x;
        }

        const int64_t w = x->ne[0];
        const int64_t h = x->ne[1];
        const int64_t c = x->ne[2];

        ggml_tensor * flat          = ggml_reshape_2d(ctx0, x, w * h, c);
        ggml_tensor * channel_major = ggml_cont(ctx0, ggml_transpose(ctx0, flat));
        ggml_tensor * biased        = ggml_add(ctx0, channel_major, bias);
        ggml_tensor * spatial_major = ggml_cont(ctx0, ggml_transpose(ctx0, biased));

        return ggml_reshape_3d(ctx0, spatial_major, w, h, c);
    };

    ggml_tensor * out = build_vit(
        patch_inp,
        n_patches,
        NORM_TYPE_NORMAL,
        hparams.ffn_op,
        resized_pos_e,
        apply_rope_2d);
    cb(out, "vit_out", -1);

    // [n_embd, n_patches] -> [w, h, n_embd] so the conv downsamplers see a spatial grid.
    out = ggml_cont_3d(ctx0, ggml_permute(ctx0, out, 1, 0, 2, 3), n_patches_x, n_patches_y, n_embd);

    // Downsampler #1: Conv2d(1536 -> 3072, k=3, s=2, p=1)
    out = add_channel_bias(ggml_conv_2d(ctx0, model.mm_0_w, out, 2, 2, 1, 1, 1, 1), model.mm_0_b);
    cb(out, "downsample_0", -1);

    // Downsampler #2: Conv2d(3072 -> 6144, k=3, s=2, p=1)
    out = add_channel_bias(ggml_conv_2d(ctx0, model.mm_1_w, out, 2, 2, 1, 1, 1, 1), model.mm_1_b);
    cb(out, "downsample_1", -1);

    // [w, h, c] -> [c, w*h]: merge the spatial dims, then make channels the fastest axis.
    const int64_t n_tokens_out = out->ne[0] * out->ne[1];
    ggml_tensor * merged = ggml_reshape_3d(ctx0, out, n_tokens_out, out->ne[2], out->ne[3]);
    out = ggml_cont(ctx0, ggml_permute(ctx0, merged, 1, 0, 2, 3));
    cb(out, "downsample_flatten", -1);

    // Final projector: Linear(6144 -> projection_dim)
    out = ggml_mul_mat(ctx0, model.mm_model_proj, out);
    cb(out, "projector_out", -1);

    ggml_build_forward_expand(gf, out);
    return gf;
}