Address PR feedback

Tarek Dakhran 2025-12-15 21:24:32 +01:00
parent f5b132a68c
commit ba9e59739c
No known key found for this signature in database
GPG Key ID: AA0AF9C3E1721799
2 changed files with 68 additions and 109 deletions

View File

@@ -9706,6 +9706,7 @@ class LFM2AudioModel(MmprojModel):
         return self.global_config.get("encoder")
     def set_gguf_parameters(self):
+        assert self.hparams_audio is not None
         self.hparams_audio["hidden_size"] = self.hparams_audio["d_model"]
         self.hparams_audio["intermediate_size"] = self.hparams_audio["d_model"]
         self.hparams_audio["num_attention_heads"] = self.hparams_audio["n_heads"]
@@ -9755,6 +9756,10 @@ class LFM2AudioModel(MmprojModel):
                 (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.bias"), b),
             ]
+        # reshape conv weights
+        if name.startswith("conformer.pre_encode.conv.") and name.endswith(".bias"):
+            data_torch = data_torch[:, None, None]
         return [(self.map_tensor_name(name), data_torch)]
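A note on the bias reshape added above: exporting the pre-encode conv biases as (C, 1, 1) instead of (C,) gives them the singleton dimensions ggml_add needs to broadcast them over the conv2d output directly, which appears to be why the per-layer ggml_reshape_4d calls are dropped from the graph code further down. A minimal NumPy sketch of the same broadcasting idea (sizes and names are illustrative, not the model's):

    import numpy as np

    C, H, W = 4, 5, 3                   # channels, height, width (illustrative)
    conv_out = np.random.rand(C, H, W)  # conv2d output, channel-first layout
    bias = np.random.rand(C)            # 1-D bias as stored in the checkpoint

    # bias[:, None, None] has shape (C, 1, 1) -- the same reshape the converter
    # now applies via data_torch[:, None, None] -- so it broadcasts per channel.
    out = conv_out + bias[:, None, None]

    # reference: add the c-th bias to every spatial position of channel c
    ref = np.stack([conv_out[c] + bias[c] for c in range(C)])
    assert np.allclose(out, ref)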

View File

@@ -20,7 +20,7 @@ ggml_cgraph * clip_graph_lfm2_audio_enc::build() {
     {
         // layer.0 - conv2d
         cur = ggml_conv_2d(ctx0, model.pre_encode_conv_X_w[0], cur, 2, 2, 1, 1, 1, 1);
-        cur = ggml_add(ctx0, cur, ggml_reshape_4d(ctx0, model.pre_encode_conv_X_b[0], 1, 1, cur->ne[2], 1));
+        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[0]);
         cb(cur, "conformer.pre_encode.conv.{}", 0);
         // layer.1 - relu
@@ -28,12 +28,12 @@ ggml_cgraph * clip_graph_lfm2_audio_enc::build() {
         // layer.2 conv2d dw
         cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_X_w[2], cur, 2, 2, 1, 1, 1, 1);
-        cur = ggml_add(ctx0, cur, ggml_reshape_4d(ctx0, model.pre_encode_conv_X_b[2], 1, 1, cur->ne[2], 1));
+        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[2]);
         cb(cur, "conformer.pre_encode.conv.{}", 2);
         // layer.3 conv2d
         cur = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_X_w[3], cur, 1, 1, 0, 0, 1, 1);
-        cur = ggml_add(ctx0, cur, ggml_reshape_4d(ctx0, model.pre_encode_conv_X_b[3], 1, 1, cur->ne[2], 1));
+        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[3]);
         cb(cur, "conformer.pre_encode.conv.{}", 3);
         // layer.4 - relu
@@ -41,12 +41,12 @@ ggml_cgraph * clip_graph_lfm2_audio_enc::build() {
         // layer.5 conv2d dw
         cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_X_w[5], cur, 2, 2, 1, 1, 1, 1);
-        cur = ggml_add(ctx0, cur, ggml_reshape_4d(ctx0, model.pre_encode_conv_X_b[5], 1, 1, cur->ne[2], 1));
+        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[5]);
         cb(cur, "conformer.pre_encode.conv.{}", 5);
         // layer.6 conv2d
         cur = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_X_w[6], cur, 1, 1, 0, 0, 1, 1);
-        cur = ggml_add(ctx0, cur, ggml_reshape_4d(ctx0, model.pre_encode_conv_X_b[6], 1, 1, cur->ne[2], 1));
+        cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[6]);
         cb(cur, "conformer.pre_encode.conv.{}", 6);
         // layer.7 - relu
@@ -76,11 +76,8 @@ ggml_cgraph * clip_graph_lfm2_audio_enc::build() {
         cur = build_norm(cur, layer.ff_norm_w, layer.ff_norm_b, NORM_TYPE_NORMAL, 1e-5, il);
         cb(cur, "conformer.layers.{}.norm_feed_forward1", il);
-        cur = build_ffn(cur,
-                layer.ff_up_w, layer.ff_up_b,
-                nullptr, nullptr,
-                layer.ff_down_w, layer.ff_down_b,
-                FFN_SILU, il);
+        cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b, FFN_SILU,
+                        il);
         cb(cur, "conformer.layers.{}.feed_forward1.linear2", il);
         const auto fc_factor = 0.5f;
@@ -91,28 +88,25 @@ ggml_cgraph * clip_graph_lfm2_audio_enc::build() {
         cur = build_norm(residual, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, 1e-5, il);
         cb(cur, "conformer.layers.{}.norm_self_att", il);
-        cb(cur, "conformer.layers.{}.self_attn.id", il);
         ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
         Qcur = ggml_add(ctx0, Qcur, layer.q_b);
-        cb(Qcur, "conformer.layers.{}.self_attn.linear_q", il);
+        Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, Qcur->ne[1]);
+        ggml_tensor * Q_bias_u = ggml_add(ctx0, Qcur, layer.pos_bias_u);
+        Q_bias_u = ggml_cont(ctx0, ggml_permute(ctx0, Q_bias_u, 0, 2, 1, 3));
+        ggml_tensor * Q_bias_v = ggml_add(ctx0, Qcur, layer.pos_bias_v);
+        Q_bias_v = ggml_cont(ctx0, ggml_permute(ctx0, Q_bias_v, 0, 2, 1, 3));
         ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
         Kcur = ggml_add(ctx0, Kcur, layer.k_b);
-        cb(Kcur, "conformer.layers.{}.self_attn.linear_k", il);
+        Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, Kcur->ne[1]);
+        Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
         ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
         Vcur = ggml_add(ctx0, Vcur, layer.v_b);
-        cb(Vcur, "conformer.layers.{}.self_attn.linear_v", il);
-        Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, Qcur->ne[1]);
-        Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, Kcur->ne[1]);
         Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, Vcur->ne[1]);
-        ggml_tensor * Q_bias_u = ggml_add(ctx0, Qcur, layer.pos_bias_u);
-        ggml_tensor * Q_bias_v = ggml_add(ctx0, Qcur, layer.pos_bias_v);
-        Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
-        Q_bias_u = ggml_cont(ctx0, ggml_permute(ctx0, Q_bias_u, 0, 2, 1, 3));
+        Vcur = ggml_cont(ctx0, ggml_permute(ctx0, Vcur, 1, 2, 0, 3));
+        // build_attn won't fit due to matrix_ac and matrix_bd separation
         ggml_tensor * matrix_ac = ggml_mul_mat(ctx0, Q_bias_u, Kcur);
         matrix_ac = ggml_cont(ctx0, ggml_permute(ctx0, matrix_ac, 1, 0, 2, 3));
         cb(matrix_ac, "conformer.layers.{}.self_attn.id3", il);
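For readers following the attention rewrite: matrix_ac and matrix_bd are the two terms of the Transformer-XL / Conformer relative-position score, (Q + u) K^T for content and (Q + v) P^T for position, where u and v are pos_bias_u and pos_bias_v and P is the projected relative position embedding. That split is what the new comment means by build_attn not fitting here. A rough single-head NumPy sketch of the decomposition (shapes are illustrative; the rel-shift applied to matrix_bd is sketched after a later hunk):

    import numpy as np

    q_len, pos_len, d_head = 6, 11, 8      # illustrative sizes
    Q = np.random.rand(q_len, d_head)      # query projections for one head
    K = np.random.rand(q_len, d_head)      # key projections
    P = np.random.rand(pos_len, d_head)    # projected relative position embeddings
    u = np.random.rand(d_head)             # pos_bias_u (content bias)
    v = np.random.rand(d_head)             # pos_bias_v (position bias)

    matrix_ac = (Q + u) @ K.T              # content term,  (q_len, q_len)
    matrix_bd = (Q + v) @ P.T              # position term, (q_len, pos_len)
    # matrix_bd is then rel-shifted and cropped to (q_len, q_len) before
    # scores = (matrix_ac + matrix_bd) / sqrt(d_head) and the softmax.
    print(matrix_ac.shape, matrix_bd.shape)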
@@ -120,17 +114,11 @@ ggml_cgraph * clip_graph_lfm2_audio_enc::build() {
         auto * p = ggml_mul_mat(ctx0, layer.linear_pos_w, pos_emb);
         cb(p, "conformer.layers.{}.self_attn.linear_pos", il);
         p = ggml_reshape_3d(ctx0, p, d_head, n_head, p->ne[1]);
-        Q_bias_v = ggml_cont(ctx0, ggml_permute(ctx0, Q_bias_v, 0, 2, 1, 3));
-        cb(Q_bias_v, "conformer.layers.{}.self_attn.id0", il);
-        p = ggml_cont(ctx0, ggml_permute(ctx0, p, 1, 2, 0, 3));
-        cb(p, "conformer.layers.{}.self_attn.id1", il);
-        p = ggml_cont(ctx0, ggml_permute(ctx0, p, 1, 0, 2, 3));
+        p = ggml_cont(ctx0, ggml_permute(ctx0, p, 0, 2, 1, 3));
         auto * matrix_bd = ggml_mul_mat(ctx0, Q_bias_v, p);
         matrix_bd = ggml_cont(ctx0, ggml_permute(ctx0, matrix_bd, 1, 0, 2, 3));
         // rel shift
         {
             const auto pos_len = matrix_bd->ne[0];
@@ -139,28 +127,20 @@ ggml_cgraph * clip_graph_lfm2_audio_enc::build() {
             matrix_bd = ggml_pad(ctx0, matrix_bd, 1, 0, 0, 0);
             matrix_bd = ggml_roll(ctx0, matrix_bd, 1, 0, 0, 0);
             matrix_bd = ggml_reshape_3d(ctx0, matrix_bd, q_len, pos_len + 1, h);
-            matrix_bd = ggml_cont(ctx0, ggml_view_3d(ctx0, matrix_bd,
-                    q_len, pos_len, h,
-                    matrix_bd->nb[1], matrix_bd->nb[2], matrix_bd->nb[0] * q_len));
+            matrix_bd = ggml_cont(ctx0, ggml_view_3d(ctx0, matrix_bd, q_len, pos_len, h, matrix_bd->nb[1],
+                                                     matrix_bd->nb[2], matrix_bd->nb[0] * q_len));
             matrix_bd = ggml_reshape_3d(ctx0, matrix_bd, pos_len, q_len, h);
         }
-        matrix_bd = ggml_cont(ctx0, ggml_view_3d(ctx0, matrix_bd,
-                matrix_ac->ne[0], matrix_bd->ne[1], matrix_bd->ne[2],
-                matrix_bd->nb[1], matrix_bd->nb[2], 0));
+        matrix_bd = ggml_cont(ctx0, ggml_view_3d(ctx0, matrix_bd, matrix_ac->ne[0], matrix_bd->ne[1],
+                                                 matrix_bd->ne[2], matrix_bd->nb[1], matrix_bd->nb[2], 0));
         auto * scores = ggml_add(ctx0, matrix_ac, matrix_bd);
         scores = ggml_scale(ctx0, scores, 1.0f / std::sqrt(d_head));
         cb(scores, "conformer.layers.{}.self_attn.id0", il);
         ggml_tensor * attn = ggml_soft_max(ctx0, scores);
-        // TODO(tarek): combine permutes
-        Vcur = ggml_cont(ctx0, ggml_permute(ctx0, Vcur, 0, 2, 1, 3));
-        Vcur = ggml_cont(ctx0, ggml_permute(ctx0, Vcur, 1, 0, 2, 3));
         ggml_tensor * x = ggml_mul_mat(ctx0, attn, Vcur);
-        // TODO(tarek): combine permutes
-        x = ggml_cont(ctx0, ggml_permute(ctx0, x, 1, 0, 2, 3));
-        x = ggml_cont(ctx0, ggml_permute(ctx0, x, 0, 2, 1, 3));
+        x = ggml_cont(ctx0, ggml_permute(ctx0, x, 2, 0, 1, 3));
         x = ggml_reshape_2d(ctx0, x, x->ne[0] * x->ne[1], x->ne[2]);
         x = ggml_mul_mat(ctx0, layer.o_w, x);
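The rel-shift block above is the usual Transformer-XL relative-shift trick: pad one zero at the end of the position axis, roll it around to the front, reinterpret the buffer with the two leading extents swapped, skip the first row, and reshape back. A small single-head NumPy sketch of the same trick (the textbook zero-pad formulation, meant to mirror the ggml_pad/ggml_roll/ggml_view_3d sequence rather than reproduce it call for call):

    import numpy as np

    def rel_shift(x):
        # x: (q_len, pos_len) position scores for one head
        q_len, pos_len = x.shape
        pad = np.zeros((q_len, 1), dtype=x.dtype)
        x = np.concatenate([pad, x], axis=1)   # (q_len, pos_len + 1)
        x = x.reshape(pos_len + 1, q_len)      # reinterpret the flat buffer
        x = x[1:, :]                           # skip the first row
        return x.reshape(q_len, pos_len)       # back to (q_len, pos_len)

    x = np.arange(12, dtype=float).reshape(3, 4)   # toy scores, q_len=3, pos_len=4
    print(rel_shift(x))
    # row i ends up shifted left by (q_len - 1 - i) positions; the graph code then
    # crops the columns to matrix_ac->ne[0] before adding the two score terms.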
@@ -177,18 +157,16 @@ ggml_cgraph * clip_graph_lfm2_audio_enc::build() {
         // conv
         {
             auto * x = cur;
-            auto * conv_pw1_w = ggml_reshape_2d(ctx0, layer.conv_pw1_w, layer.conv_pw1_w->ne[1], layer.conv_pw1_w->ne[2]);
+            auto * conv_pw1_w =
+                ggml_reshape_2d(ctx0, layer.conv_pw1_w, layer.conv_pw1_w->ne[1], layer.conv_pw1_w->ne[2]);
             x = ggml_mul_mat(ctx0, conv_pw1_w, x);
             x = ggml_add(ctx0, x, layer.conv_pw1_b);
-            x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
             cb(x, "conformer.layers.{}.conv.pointwise_conv1", il);
-            x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
-            // TODO: add support of torch.funtional.glu
+            // ggml_glu doesn't support sigmoid
             {
                 int64_t d = x->ne[0] / 2;
-                ggml_tensor *gate = ggml_sigmoid(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], d * x->nb[0]));
+                ggml_tensor * gate = ggml_sigmoid(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], d * x->nb[0]));
                 x = ggml_mul(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], 0), gate);
                 x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
             }
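The inner block above hand-rolls a GLU because, as the new comment says, ggml_glu does not cover the sigmoid gate used here: the pointwise_conv1 output is split in half along the feature dimension and the first half is multiplied by the sigmoid of the second. A minimal NumPy sketch of that gating (sizes are illustrative):

    import numpy as np

    def sigmoid(z):
        return 1.0 / (1.0 + np.exp(-z))

    def glu_sigmoid(x):
        # x: (n_tokens, 2 * d) -- e.g. the pointwise_conv1 output
        d = x.shape[-1] // 2
        value, gate = x[..., :d], x[..., d:]
        return value * sigmoid(gate)        # (n_tokens, d)

    x = np.random.rand(7, 10)               # 7 tokens, 2*d = 10 features
    print(glu_sigmoid(x).shape)             # (7, 5)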
@@ -201,27 +179,16 @@ ggml_cgraph * clip_graph_lfm2_audio_enc::build() {
             auto * conv_dw_w = ggml_reshape_2d(ctx0, layer.conv_dw_w, layer.conv_dw_w->ne[0], layer.conv_dw_w->ne[2]);
             x = ggml_ssm_conv(ctx0, x, conv_dw_w);
             x = ggml_add(ctx0, x, ggml_reshape_1d(ctx0, layer.conv_dw_b, layer.conv_dw_b->ne[0]));
-            x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
-            cb(x, "conformer.layers.{}.conv.depthwise_conv", il);
-            {
-                x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
             x = ggml_add(ctx0, ggml_mul(ctx0, x, layer.conv_norm_w), layer.conv_norm_b);
-                x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
-                cb(x, "conformer.layers.{}.conv.batch_norm", il);
-            }
             x = ggml_silu(ctx0, x);
             // pointwise_conv2
-            x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
-            auto * conv_pw2_w = ggml_reshape_2d(ctx0, layer.conv_pw2_w, layer.conv_pw2_w->ne[1], layer.conv_pw2_w->ne[2]);
+            auto * conv_pw2_w =
+                ggml_reshape_2d(ctx0, layer.conv_pw2_w, layer.conv_pw2_w->ne[1], layer.conv_pw2_w->ne[2]);
             x = ggml_mul_mat(ctx0, conv_pw2_w, x);
             x = ggml_add(ctx0, x, layer.conv_pw2_b);
-            x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
-            cb(x, "conformer.layers.{}.conv.pointwise_conv2", il);
-            x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
             cur = x;
         }
@@ -230,10 +197,7 @@ ggml_cgraph * clip_graph_lfm2_audio_enc::build() {
         cur = build_norm(residual, layer.ff_norm_1_w, layer.ff_norm_1_b, NORM_TYPE_NORMAL, 1e-5, il);
         cb(cur, "conformer.layers.{}.norm_feed_forward2", il);
-        cur = build_ffn(cur,
-                layer.ff_up_1_w, layer.ff_up_1_b,
-                nullptr, nullptr,
-                layer.ff_down_1_w, layer.ff_down_1_b,
+        cur = build_ffn(cur, layer.ff_up_1_w, layer.ff_up_1_b, nullptr, nullptr, layer.ff_down_1_w, layer.ff_down_1_b,
                         FFN_SILU, il); // TODO(tarek): read activation for ffn from hparams
         cb(cur, "conformer.layers.{}.feed_forward2.linear2", il);
@@ -245,22 +209,12 @@ ggml_cgraph * clip_graph_lfm2_audio_enc::build() {
     }
     // audio adapter
-    {
     cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
     cb(cur, "audio_adapter.model.{}", 0);
-        cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-        cur = ggml_add(ctx0, cur, model.mm_1_b);
-        cb(cur, "audio_adapter.model.{}", 1);
-        cur = ggml_gelu_erf(ctx0, cur);
-        cb(cur, "audio_adapter.model.{}", 2);
-        cur = ggml_mul_mat(ctx0, model.mm_3_w, cur);
-        cur = ggml_add(ctx0, cur, model.mm_3_b);
-        cb(cur, "audio_adapter.model.{}", 3);
-    }
+    cur = build_ffn(cur, model.mm_1_w, model.mm_1_b, nullptr, nullptr, model.mm_3_w, model.mm_3_b, FFN_GELU_ERF, -1);
     cb(cur, "projected", -1);
     ggml_build_forward_expand(gf, cur);
     return gf;
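The audio adapter hunk collapses the hand-written matmul/add/gelu_erf/matmul/add chain into one build_ffn call with FFN_GELU_ERF, i.e. a layer norm (mm_0) followed by a two-layer MLP (mm_1, mm_3) with the exact erf-based GELU in between. A short NumPy sketch of that projector shape, with made-up dimensions (the real ones come from the model's hparams):

    import numpy as np
    from math import erf, sqrt

    def gelu_erf(x):
        # exact GELU: 0.5 * x * (1 + erf(x / sqrt(2)))
        return 0.5 * x * (1.0 + np.vectorize(erf)(x / sqrt(2.0)))

    def layer_norm(x, w, b, eps=1e-5):
        mu  = x.mean(-1, keepdims=True)
        var = x.var(-1, keepdims=True)
        return (x - mu) / np.sqrt(var + eps) * w + b

    d_in, d_mid, d_out, n = 6, 12, 4, 3          # illustrative dimensions
    x = np.random.rand(n, d_in)                  # n encoder outputs
    w0, b0 = np.ones(d_in), np.zeros(d_in)       # mm_0: norm scale/bias
    w1, b1 = np.random.rand(d_mid, d_in), np.random.rand(d_mid)   # mm_1
    w3, b3 = np.random.rand(d_out, d_mid), np.random.rand(d_out)  # mm_3

    h = layer_norm(x, w0, b0)
    h = gelu_erf(h @ w1.T + b1)                  # first linear + exact GELU
    out = h @ w3.T + b3                          # second linear, projected embeddings
    print(out.shape)                             # (3, 4)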