82 lines
2.6 KiB
C++
82 lines
2.6 KiB
C++
#include "models.h"
|
|
|
|
// Builds the compute graph for this vision encoder:
//   1. ViT (via build_vit) over the patch input, with learned position embeddings
//      and a 2D RoPE applied through the add_pos callback;
//   2. two stride-2 3x3 convolutions that downsample the patch grid;
//   3. a final linear projection (model.mm_model_proj).
// Returns the graph `gf` after expanding it with the projector output.
ggml_cgraph * clip_graph_step3vl::build() {
    // Preconditions on the loaded weights: no class embedding, but patch and
    // position embeddings must both be present.
    GGML_ASSERT(model.class_embedding == nullptr);
    GGML_ASSERT(model.patch_embeddings_0 != nullptr);
    GGML_ASSERT(model.position_embeddings != nullptr);

    const norm_type vit_norm = NORM_TYPE_NORMAL;

    // I32 position tensors (one entry per patch) handed to the 2D RoPE below.
    // They are flagged as graph inputs; their contents are not set in this function.
    ggml_tensor * rope_pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
    ggml_set_name(rope_pos_h, "pos_h");
    ggml_set_input(rope_pos_h);

    ggml_tensor * rope_pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
    ggml_set_name(rope_pos_w, "pos_w");
    ggml_set_input(rope_pos_w);

    ggml_tensor * patches  = build_inp();
    ggml_tensor * pos_embd = resize_position_embeddings();

    // Positional callback for build_vit; the layer argument is unused.
    // Note the argument order: pos_w is passed before pos_h.
    auto apply_rope = [&](ggml_tensor * x, const clip_layer &) {
        return build_rope_2d(ctx0, x, rope_pos_w, rope_pos_h, hparams.rope_theta, false);
    };

    // Transpose the two leading dimensions and materialise the result contiguously.
    auto swap_leading_dims = [&](ggml_tensor * t) {
        return ggml_cont(ctx0, ggml_transpose(ctx0, t));
    };

    // Adds an optional bias to a [w, h, c] feature map; a null bias is a no-op.
    // The map is flattened to [w*h, c] and transposed to [c, w*h] before the add,
    // then restored - presumably so a per-channel bias lines up with the leading
    // dimension (TODO confirm the bias shape against the model loader).
    // NOTE(review): ne[3] is not carried through the reshape, so this looks like
    // it expects a single image per graph - confirm.
    auto add_spatial_bias = [&](ggml_tensor * feat, ggml_tensor * bias) {
        if (bias != nullptr) {
            const int64_t out_w = feat->ne[0];
            const int64_t out_h = feat->ne[1];
            const int64_t out_c = feat->ne[2];

            feat = ggml_reshape_2d(ctx0, feat, out_w * out_h, out_c);
            feat = swap_leading_dims(feat);
            feat = ggml_add(ctx0, feat, bias);
            feat = swap_leading_dims(feat);
            feat = ggml_reshape_3d(ctx0, feat, out_w, out_h, out_c);
        }
        return feat;
    };

    // One downsampling stage: conv2d called with (2, 2, 1, 1, 1, 1), then the
    // optional bias, then the debug callback under `tag`.
    auto downsample = [&](ggml_tensor * x, ggml_tensor * kernel, ggml_tensor * bias, const char * tag) {
        x = ggml_conv_2d(ctx0, kernel, x, 2, 2, 1, 1, 1, 1);
        x = add_spatial_bias(x, bias);
        cb(x, tag, -1);
        return x;
    };

    ggml_tensor * cur = build_vit(patches, n_patches, vit_norm, hparams.ffn_op, pos_embd, apply_rope);
    cb(cur, "vit_out", -1);

    // [n_embd, n_patches] -> [w, h, n_embd] for spatial downsampling convolutions.
    cur = ggml_permute(ctx0, cur, 1, 0, 2, 3);
    cur = ggml_cont_3d(ctx0, cur, n_patches_x, n_patches_y, n_embd);

    // First downsampler: Conv2d(1536 -> 3072, k=3, s=2, p=1)
    cur = downsample(cur, model.mm_0_w, model.mm_0_b, "downsample_0");

    // Second downsampler: Conv2d(3072 -> 6144, k=3, s=2, p=1)
    cur = downsample(cur, model.mm_1_w, model.mm_1_b, "downsample_1");

    // [w, h, c] -> [c, w*h]
    const int64_t grid_w = cur->ne[0];
    const int64_t grid_h = cur->ne[1];
    cur = ggml_reshape_3d(ctx0, cur, grid_w * grid_h, cur->ne[2], cur->ne[3]);
    cur = ggml_permute(ctx0, cur, 1, 0, 2, 3);
    cur = ggml_cont(ctx0, cur);
    cb(cur, "downsample_flatten", -1);

    // Final projector: Linear(6144 -> projection_dim)
    cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
    cb(cur, "projector_out", -1);

    ggml_build_forward_expand(gf, cur);
    return gf;
}
|