diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index a21ffc5da3..223d615a6d 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -9706,6 +9706,7 @@ class LFM2AudioModel(MmprojModel):
         return self.global_config.get("encoder")
 
     def set_gguf_parameters(self):
+        assert self.hparams_audio is not None
         self.hparams_audio["hidden_size"] = self.hparams_audio["d_model"]
         self.hparams_audio["intermediate_size"] = self.hparams_audio["d_model"]
         self.hparams_audio["num_attention_heads"] = self.hparams_audio["n_heads"]
@@ -9755,6 +9756,10 @@ class LFM2AudioModel(MmprojModel):
                 (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.bias"), b),
             ]
 
+        # reshape conv biases so they broadcast over the conv2d output
+        if name.startswith("conformer.pre_encode.conv.") and name.endswith(".bias"):
+            data_torch = data_torch[:, None, None]
+
         return [(self.map_tensor_name(name), data_torch)]
diff --git a/tools/mtmd/models/lfm2-audio-enc.cpp b/tools/mtmd/models/lfm2-audio-enc.cpp
index c3869c52e2..831099f8eb 100644
--- a/tools/mtmd/models/lfm2-audio-enc.cpp
+++ b/tools/mtmd/models/lfm2-audio-enc.cpp
@@ -1,8 +1,8 @@
 #include "models.h"
 
 ggml_cgraph * clip_graph_lfm2_audio_enc::build() {
-    const int n_frames = img.nx;
-    const int n_pos = n_frames / 2;
+    const int n_frames   = img.nx;
+    const int n_pos      = n_frames / 2;
     const int n_pos_embd = (((((n_frames + 1) / 2) + 1) / 2 + 1) / 2) * 2 - 1;
 
     GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);
@@ -20,7 +20,7 @@ ggml_cgraph * clip_graph_lfm2_audio_enc::build() {
     {
         // layer.0 - conv2d
         cur = ggml_conv_2d(ctx0, model.pre_encode_conv_X_w[0], cur, 2, 2, 1, 1, 1, 1);
-        cur = ggml_add(ctx0, cur, ggml_reshape_4d(ctx0, model.pre_encode_conv_X_b[0], 1, 1, cur->ne[2], 1));
+        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[0]);
         cb(cur, "conformer.pre_encode.conv.{}", 0);
 
         // layer.1 - relu
@@ -28,12 +28,12 @@ ggml_cgraph * clip_graph_lfm2_audio_enc::build() {
 
         // layer.2 conv2d dw
         cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_X_w[2], cur, 2, 2, 1, 1, 1, 1);
-        cur = ggml_add(ctx0, cur, ggml_reshape_4d(ctx0, model.pre_encode_conv_X_b[2], 1, 1, cur->ne[2], 1));
+        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[2]);
         cb(cur, "conformer.pre_encode.conv.{}", 2);
 
         // layer.3 conv2d
         cur = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_X_w[3], cur, 1, 1, 0, 0, 1, 1);
-        cur = ggml_add(ctx0, cur, ggml_reshape_4d(ctx0, model.pre_encode_conv_X_b[3], 1, 1, cur->ne[2], 1));
+        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[3]);
         cb(cur, "conformer.pre_encode.conv.{}", 3);
 
         // layer.4 - relu
@@ -41,12 +41,12 @@ ggml_cgraph * clip_graph_lfm2_audio_enc::build() {
 
         // layer.5 conv2d dw
         cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_X_w[5], cur, 2, 2, 1, 1, 1, 1);
-        cur = ggml_add(ctx0, cur, ggml_reshape_4d(ctx0, model.pre_encode_conv_X_b[5], 1, 1, cur->ne[2], 1));
+        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[5]);
         cb(cur, "conformer.pre_encode.conv.{}", 5);
 
         // layer.6 conv2d
         cur = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_X_w[6], cur, 1, 1, 0, 0, 1, 1);
-        cur = ggml_add(ctx0, cur, ggml_reshape_4d(ctx0, model.pre_encode_conv_X_b[6], 1, 1, cur->ne[2], 1));
+        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[6]);
         cb(cur, "conformer.pre_encode.conv.{}", 6);
 
         // layer.7 - relu
@@ -76,94 +76,74 @@ ggml_cgraph * clip_graph_lfm2_audio_enc::build() {
         cur = build_norm(cur, layer.ff_norm_w, layer.ff_norm_b, NORM_TYPE_NORMAL, 1e-5, il);
         cb(cur, "conformer.layers.{}.norm_feed_forward1", il);
 
-        cur = build_ffn(cur,
-            layer.ff_up_w, layer.ff_up_b,
-            nullptr, nullptr,
-            layer.ff_down_w, layer.ff_down_b,
-            FFN_SILU, il);
+        cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b, FFN_SILU,
+                        il);
         cb(cur, "conformer.layers.{}.feed_forward1.linear2", il);
 
         const auto fc_factor = 0.5f;
-        residual = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, fc_factor));
+        residual             = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, fc_factor));
 
         // self-attention
         {
             cur = build_norm(residual, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, 1e-5, il);
             cb(cur, "conformer.layers.{}.norm_self_att", il);
 
-            cb(cur, "conformer.layers.{}.self_attn.id", il);
-            ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
-            Qcur = ggml_add(ctx0, Qcur, layer.q_b);
-            cb(Qcur, "conformer.layers.{}.self_attn.linear_q", il);
+            ggml_tensor * Qcur     = ggml_mul_mat(ctx0, layer.q_w, cur);
+            Qcur                   = ggml_add(ctx0, Qcur, layer.q_b);
+            Qcur                   = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, Qcur->ne[1]);
+            ggml_tensor * Q_bias_u = ggml_add(ctx0, Qcur, layer.pos_bias_u);
+            Q_bias_u               = ggml_cont(ctx0, ggml_permute(ctx0, Q_bias_u, 0, 2, 1, 3));
+            ggml_tensor * Q_bias_v = ggml_add(ctx0, Qcur, layer.pos_bias_v);
+            Q_bias_v               = ggml_cont(ctx0, ggml_permute(ctx0, Q_bias_v, 0, 2, 1, 3));
 
             ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
-            Kcur = ggml_add(ctx0, Kcur, layer.k_b);
-            cb(Kcur, "conformer.layers.{}.self_attn.linear_k", il);
+            Kcur               = ggml_add(ctx0, Kcur, layer.k_b);
+            Kcur               = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, Kcur->ne[1]);
+            Kcur               = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
 
             ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
-            Vcur = ggml_add(ctx0, Vcur, layer.v_b);
-            cb(Vcur, "conformer.layers.{}.self_attn.linear_v", il);
+            Vcur               = ggml_add(ctx0, Vcur, layer.v_b);
+            Vcur               = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, Vcur->ne[1]);
+            Vcur               = ggml_cont(ctx0, ggml_permute(ctx0, Vcur, 1, 2, 0, 3));
 
-            Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, Qcur->ne[1]);
-            Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, Kcur->ne[1]);
-            Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, Vcur->ne[1]);
-
-            ggml_tensor * Q_bias_u = ggml_add(ctx0, Qcur, layer.pos_bias_u);
-            ggml_tensor * Q_bias_v = ggml_add(ctx0, Qcur, layer.pos_bias_v);
-
-            Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
-            Q_bias_u = ggml_cont(ctx0, ggml_permute(ctx0, Q_bias_u, 0, 2, 1, 3));
+            // build_attn won't fit due to matrix_ac and matrix_bd separation
             ggml_tensor * matrix_ac = ggml_mul_mat(ctx0, Q_bias_u, Kcur);
-            matrix_ac = ggml_cont(ctx0, ggml_permute(ctx0, matrix_ac, 1, 0, 2, 3));
+            matrix_ac               = ggml_cont(ctx0, ggml_permute(ctx0, matrix_ac, 1, 0, 2, 3));
             cb(matrix_ac, "conformer.layers.{}.self_attn.id3", il);
 
             auto * p = ggml_mul_mat(ctx0, layer.linear_pos_w, pos_emb);
             cb(p, "conformer.layers.{}.self_attn.linear_pos", il);
             p = ggml_reshape_3d(ctx0, p, d_head, n_head, p->ne[1]);
+            p = ggml_cont(ctx0, ggml_permute(ctx0, p, 0, 2, 1, 3));
 
-            Q_bias_v = ggml_cont(ctx0, ggml_permute(ctx0, Q_bias_v, 0, 2, 1, 3));
-            cb(Q_bias_v, "conformer.layers.{}.self_attn.id0", il);
-            p = ggml_cont(ctx0, ggml_permute(ctx0, p, 1, 2, 0, 3));
-            cb(p, "conformer.layers.{}.self_attn.id1", il);
-
-            p = ggml_cont(ctx0, ggml_permute(ctx0, p, 1, 0, 2, 3));
             auto * matrix_bd = ggml_mul_mat(ctx0, Q_bias_v, p);
-            matrix_bd = ggml_cont(ctx0, ggml_permute(ctx0, matrix_bd, 1, 0, 2, 3));
-
+            matrix_bd        = ggml_cont(ctx0, ggml_permute(ctx0, matrix_bd, 1, 0, 2, 3));
 
             // rel shift
             {
                 const auto pos_len = matrix_bd->ne[0];
-                const auto q_len = matrix_bd->ne[1];
+                const auto q_len   = matrix_bd->ne[1];
                 const auto h = matrix_bd->ne[2];
 
-                matrix_bd = ggml_pad(ctx0, matrix_bd, 1, 0, 0, 0);
-                matrix_bd = ggml_roll(ctx0, matrix_bd, 1, 0, 0, 0);
-                matrix_bd = ggml_reshape_3d(ctx0, matrix_bd, q_len, pos_len + 1, h);
-                matrix_bd = ggml_cont(ctx0, ggml_view_3d(ctx0, matrix_bd,
-                    q_len, pos_len, h,
-                    matrix_bd->nb[1], matrix_bd->nb[2], matrix_bd->nb[0] * q_len));
-                matrix_bd = ggml_reshape_3d(ctx0, matrix_bd, pos_len, q_len, h);
+                matrix_bd = ggml_pad(ctx0, matrix_bd, 1, 0, 0, 0);
+                matrix_bd = ggml_roll(ctx0, matrix_bd, 1, 0, 0, 0);
+                matrix_bd = ggml_reshape_3d(ctx0, matrix_bd, q_len, pos_len + 1, h);
+                matrix_bd = ggml_cont(ctx0, ggml_view_3d(ctx0, matrix_bd, q_len, pos_len, h, matrix_bd->nb[1],
+                                                         matrix_bd->nb[2], matrix_bd->nb[0] * q_len));
+                matrix_bd = ggml_reshape_3d(ctx0, matrix_bd, pos_len, q_len, h);
             }
 
-            matrix_bd = ggml_cont(ctx0, ggml_view_3d(ctx0, matrix_bd,
-                matrix_ac->ne[0], matrix_bd->ne[1], matrix_bd->ne[2],
-                matrix_bd->nb[1], matrix_bd->nb[2], 0));
+            matrix_bd = ggml_cont(ctx0, ggml_view_3d(ctx0, matrix_bd, matrix_ac->ne[0], matrix_bd->ne[1],
+                                                     matrix_bd->ne[2], matrix_bd->nb[1], matrix_bd->nb[2], 0));
 
             auto * scores = ggml_add(ctx0, matrix_ac, matrix_bd);
-            scores = ggml_scale(ctx0, scores, 1.0f / std::sqrt(d_head));
+            scores        = ggml_scale(ctx0, scores, 1.0f / std::sqrt(d_head));
             cb(scores, "conformer.layers.{}.self_attn.id0", il);
-
             ggml_tensor * attn = ggml_soft_max(ctx0, scores);
-            // TODO(tarek): combine permutes
-            Vcur = ggml_cont(ctx0, ggml_permute(ctx0, Vcur, 0, 2, 1, 3));
-            Vcur = ggml_cont(ctx0, ggml_permute(ctx0, Vcur, 1, 0, 2, 3));
-            ggml_tensor * x = ggml_mul_mat(ctx0, attn, Vcur);
-            // TODO(tarek): combine permutes
-            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 1, 0, 2, 3));
-            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 0, 2, 1, 3));
-            x = ggml_reshape_2d(ctx0, x, x->ne[0] * x->ne[1], x->ne[2]);
+            ggml_tensor * x    = ggml_mul_mat(ctx0, attn, Vcur);
+            x                  = ggml_cont(ctx0, ggml_permute(ctx0, x, 2, 0, 1, 3));
+            x                  = ggml_reshape_2d(ctx0, x, x->ne[0] * x->ne[1], x->ne[2]);
 
-            x = ggml_mul_mat(ctx0, layer.o_w, x);
+            x                 = ggml_mul_mat(ctx0, layer.o_w, x);
             ggml_tensor * out = ggml_add(ctx0, x, layer.o_b);
             cb(out, "conformer.layers.{}.self_attn.linear_out", il);
@@ -171,57 +151,44 @@ ggml_cgraph * clip_graph_lfm2_audio_enc::build() {
         }
 
         residual = ggml_add(ctx0, residual, cur);
-        cur = build_norm(residual, layer.norm_conv_w, layer.norm_conv_b, NORM_TYPE_NORMAL, 1e-5, il);
+        cur      = build_norm(residual, layer.norm_conv_w, layer.norm_conv_b, NORM_TYPE_NORMAL, 1e-5, il);
         cb(cur, "conformer.layers.{}.norm_conv", il);
 
         // conv
         {
             auto * x = cur;
 
-            auto * conv_pw1_w = ggml_reshape_2d(ctx0, layer.conv_pw1_w, layer.conv_pw1_w->ne[1], layer.conv_pw1_w->ne[2]);
+            auto * conv_pw1_w =
+                ggml_reshape_2d(ctx0, layer.conv_pw1_w, layer.conv_pw1_w->ne[1], layer.conv_pw1_w->ne[2]);
 
             x = ggml_mul_mat(ctx0, conv_pw1_w, x);
             x = ggml_add(ctx0, x, layer.conv_pw1_b);
-            x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
             cb(x, "conformer.layers.{}.conv.pointwise_conv1", il);
-            x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
-
-            // TODO: add support of torch.funtional.glu
+            // ggml_glu doesn't support sigmoid
            {
-                int64_t d = x->ne[0] / 2;
-                ggml_tensor *gate = ggml_sigmoid(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], d * x->nb[0]));
-                x = ggml_mul(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], 0), gate);
-                x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
+                int64_t d          = x->ne[0] / 2;
+                ggml_tensor * gate = ggml_sigmoid(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], d * x->nb[0]));
+                x                  = ggml_mul(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], 0), gate);
+                x                  = ggml_cont(ctx0, ggml_transpose(ctx0, x));
             }
 
             // use ggml_ssm_conv for f32 precision
-            x = ggml_pad(ctx0, x, 4, 0, 0, 0);
-            x = ggml_roll(ctx0, x, 4, 0, 0, 0);
-            x = ggml_pad(ctx0, x, 4, 0, 0, 0);
-            x = ggml_cont(ctx0, x);
+            x                = ggml_pad(ctx0, x, 4, 0, 0, 0);
+            x                = ggml_roll(ctx0, x, 4, 0, 0, 0);
+            x                = ggml_pad(ctx0, x, 4, 0, 0, 0);
+            x                = ggml_cont(ctx0, x);
             auto * conv_dw_w = ggml_reshape_2d(ctx0, layer.conv_dw_w, layer.conv_dw_w->ne[0], layer.conv_dw_w->ne[2]);
-            x = ggml_ssm_conv(ctx0, x, conv_dw_w);
-            x = ggml_add(ctx0, x, ggml_reshape_1d(ctx0, layer.conv_dw_b, layer.conv_dw_b->ne[0]));
-            x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
+            x                = ggml_ssm_conv(ctx0, x, conv_dw_w);
+            x                = ggml_add(ctx0, x, ggml_reshape_1d(ctx0, layer.conv_dw_b, layer.conv_dw_b->ne[0]));
 
-            cb(x, "conformer.layers.{}.conv.depthwise_conv", il);
-
-            {
-                x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
-                x = ggml_add(ctx0, ggml_mul(ctx0, x, layer.conv_norm_w), layer.conv_norm_b);
-                x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
-                cb(x, "conformer.layers.{}.conv.batch_norm", il);
-            }
+            x = ggml_add(ctx0, ggml_mul(ctx0, x, layer.conv_norm_w), layer.conv_norm_b);
 
             x = ggml_silu(ctx0, x);
 
             // pointwise_conv2
-            x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
-            auto * conv_pw2_w = ggml_reshape_2d(ctx0, layer.conv_pw2_w, layer.conv_pw2_w->ne[1], layer.conv_pw2_w->ne[2]);
+            auto * conv_pw2_w =
+                ggml_reshape_2d(ctx0, layer.conv_pw2_w, layer.conv_pw2_w->ne[1], layer.conv_pw2_w->ne[2]);
             x = ggml_mul_mat(ctx0, conv_pw2_w, x);
             x = ggml_add(ctx0, x, layer.conv_pw2_b);
-            x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
-
             cb(x, "conformer.layers.{}.conv.pointwise_conv2", il);
-            x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
 
             cur = x;
         }
@@ -230,11 +197,8 @@ ggml_cgraph * clip_graph_lfm2_audio_enc::build() {
         cur = build_norm(residual, layer.ff_norm_1_w, layer.ff_norm_1_b, NORM_TYPE_NORMAL, 1e-5, il);
         cb(cur, "conformer.layers.{}.norm_feed_forward2", il);
 
-        cur = build_ffn(cur,
-            layer.ff_up_1_w, layer.ff_up_1_b,
-            nullptr, nullptr,
-            layer.ff_down_1_w, layer.ff_down_1_b,
-            FFN_SILU, il); // TODO(tarek): read activation for ffn from hparams
+        cur = build_ffn(cur, layer.ff_up_1_w, layer.ff_up_1_b, nullptr, nullptr, layer.ff_down_1_w, layer.ff_down_1_b,
+                        FFN_SILU, il);  // TODO(tarek): read activation for ffn from hparams
         cb(cur, "conformer.layers.{}.feed_forward2.linear2", il);
 
         residual = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, fc_factor));
@@ -245,22 +209,12 @@ ggml_cgraph * clip_graph_lfm2_audio_enc::build() {
     }
 
     // audio adapter
-    {
-        cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
-        cb(cur, "audio_adapter.model.{}", 0);
-
-        cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-        cur = ggml_add(ctx0, cur, model.mm_1_b);
-        cb(cur, "audio_adapter.model.{}", 1);
-        cur = ggml_gelu_erf(ctx0, cur);
-        cb(cur, "audio_adapter.model.{}", 2);
-
-        cur = ggml_mul_mat(ctx0, model.mm_3_w, cur);
-        cur = ggml_add(ctx0, cur, model.mm_3_b);
-        cb(cur, "audio_adapter.model.{}", 3);
-    }
+    cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
+    cb(cur, "audio_adapter.model.{}", 0);
+
+    cur = build_ffn(cur, model.mm_1_w, model.mm_1_b, nullptr, nullptr, model.mm_3_w, model.mm_3_b, FFN_GELU_ERF, -1);
+
     cb(cur, "projected", -1);
-
     ggml_build_forward_expand(gf, cur);
 
     return gf;
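
Note on the `// ggml_glu doesn't support sigmoid` workaround in the conv block: the manual split-and-gate can be sanity-checked outside the clip graph. Below is a minimal standalone sketch, not part of this patch; it assumes a recent ggml with the CPU backend headers, and the file name, tensor sizes, and build command are illustrative only.

```cpp
// glu_sigmoid_check.cpp -- hypothetical test, not part of this PR.
// Build (assumption): cc glu_sigmoid_check.cpp -lggml -lggml-base -lggml-cpu
#include "ggml.h"
#include "ggml-cpu.h" // ggml_graph_compute_with_ctx

#include <stdio.h>

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // x: [n_embd = 8, n_tokens = 2], filled with ones (sizes made up)
    struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 2);
    float * xd = (float *) x->data;
    for (int64_t i = 0; i < ggml_nelements(x); ++i) {
        xd[i] = 1.0f;
    }

    // GLU with a sigmoid gate, as in the graph above:
    // out = x[:d] * sigmoid(x[d:]) with d = ne0 / 2; the second half is
    // selected by the byte offset d * nb[0]
    const int64_t d = x->ne[0] / 2;
    struct ggml_tensor * gate = ggml_sigmoid(ctx,
            ggml_view_2d(ctx, x, d, x->ne[1], x->nb[1], d * x->nb[0]));
    struct ggml_tensor * out = ggml_mul(ctx,
            ggml_view_2d(ctx, x, d, x->ne[1], x->nb[1], 0), gate);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, out);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/1);

    // every element should be 1.0f * sigmoid(1.0f) ~= 0.7311f
    printf("out[0] = %f\n", ((float *) out->data)[0]);

    ggml_free(ctx);
    return 0;
}
```

The view arithmetic is the same as in the in-graph code: both halves remain row-contiguous, so `ggml_sigmoid` and `ggml_mul` can consume the views directly without an extra `ggml_cont`.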