From 8004f3a8d1f9e66ed534bc46f98c3a16c27f8c4a Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Thu, 19 Feb 2026 09:54:48 +0100 Subject: [PATCH] model : add tokenizer from LFM2.5-Audio-1.5B (#19687) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * model : Add tokenizer from LFM2.5-Audio-1.5B [LFM2.5-Audio-1.5B](https://huggingface.co/LiquidAI/LFM2.5-Audio-1.5B) introduced lightweight audio tokenizer. Tokenizer based on LFM2 architecture and acts as "embedding" model with different input `n_embd` and output `n_embd_out`. To be used in https://github.com/ggml-org/llama.cpp/pull/18641. To convert use ```shell python3 convert_hf_to_gguf.py /path/to/LFM2.5-Audio-1.5B/audio_detokenizer ``` * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret * Formatting * Rework check for attention layers * Add LFM2 SWA model support * Address PR feedback * Set vocab to none * Move helper function definitions to cpp file --------- Co-authored-by: Sigbjørn Skjæret --- convert_hf_to_gguf.py | 24 +++- src/llama-graph.cpp | 6 +- src/llama-graph.h | 1 + src/llama-model.cpp | 17 ++- src/llama-model.h | 5 +- src/models/lfm2.cpp | 261 ++++++++++++++++++++++-------------------- src/models/models.h | 8 +- 7 files changed, 185 insertions(+), 137 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 2e41a2ecce..c40df1201c 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -10726,7 +10726,7 @@ class LFM2Model(TextModel): def set_gguf_parameters(self): # set num_key_value_heads only for attention layers self.hparams["num_key_value_heads"] = [ - self.hparams["num_key_value_heads"] if layer_type == "full_attention" else 0 + self.hparams["num_key_value_heads"] if layer_type != "conv" else 0 for layer_type in self.hparams["layer_types"] ] @@ -10912,6 +10912,28 @@ class LFM2AudioModel(ConformerAudioModel): yield from super().modify_tensors(data_torch, name, bid) +@ModelBase.register("Lfm25AudioTokenizer") +class LFM25AudioTokenizer(LFM2Model): + model_arch = gguf.MODEL_ARCH.LFM2 + + def set_vocab(self): + self._set_vocab_none() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) + self.gguf_writer.add_embedding_length_out(self.hparams["output_size"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name == "istft.window" or name.startswith("emb.emb"): + return + + if name.startswith("lin"): + name = name.replace("lin", "dense_2_out") + + yield from super().modify_tensors(data_torch, name, bid) + + @ModelBase.register("SmallThinkerForCausalLM") class SmallThinkerModel(TextModel): model_arch = gguf.MODEL_ARCH.SMALLTHINKER diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 6927249873..6afd1fdf17 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -2417,8 +2417,9 @@ llm_graph_input_mem_hybrid_iswa * llm_graph_context::build_inp_mem_hybrid_iswa() void llm_graph_context::build_dense_out( ggml_tensor * dense_2, + ggml_tensor * dense_2_b, ggml_tensor * dense_3) const { - if (!cparams.embeddings || !(dense_2 || dense_3)) { + if (!cparams.embeddings || !(dense_2 || dense_2_b || dense_3)) { return; } ggml_tensor * cur = res->t_embd_pooled != nullptr ? res->t_embd_pooled : res->t_embd; @@ -2427,6 +2428,9 @@ void llm_graph_context::build_dense_out( if (dense_2) { cur = ggml_mul_mat(ctx0, dense_2, cur); } + if (dense_2_b) { + cur = ggml_add(ctx0, cur, dense_2_b); + } if (dense_3) { cur = ggml_mul_mat(ctx0, dense_3, cur); } diff --git a/src/llama-graph.h b/src/llama-graph.h index 74a4685121..22d11a8385 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -1015,6 +1015,7 @@ struct llm_graph_context { void build_dense_out( ggml_tensor * dense_2, + ggml_tensor * dense_2_b, ggml_tensor * dense_3) const; }; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 2aebaddf27..ef87ddcfa0 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2348,6 +2348,12 @@ void llama_model::load_hparams(llama_model_loader & ml) { case 10752: type = LLM_TYPE_2_6B; break; default: type = LLM_TYPE_UNKNOWN; } + if (const auto is_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); is_swa && hparams.n_swa > 0) { + hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; + for (uint32_t il = 0; il < hparams.n_layer; ++il) { + hparams.swa_layers[il] = !hparams.recurrent_layer_arr[il]; + } + } } break; case LLM_ARCH_LFM2MOE: { @@ -6896,7 +6902,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } // for LFM2-ColBert-350M - dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.n_embd_out()}, TENSOR_NOT_REQUIRED); + dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.n_embd_out()}, TENSOR_NOT_REQUIRED); + dense_2_out_layers_b = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "bias"), {hparams.n_embd_out() }, TENSOR_NOT_REQUIRED); } break; case LLM_ARCH_SMALLTHINKER: { @@ -8672,7 +8679,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { case LLM_ARCH_LFM2: case LLM_ARCH_LFM2MOE: { - llm = std::make_unique(*this, params); + if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) { + llm = std::make_unique>(*this, params); + } else { + llm = std::make_unique>(*this, params); + } } break; case LLM_ARCH_SMALLTHINKER: { @@ -8744,7 +8755,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { // there will be two additional dense projection layers // dense linear projections are applied after pooling // TODO: move reranking logic here and generalize - llm->build_dense_out(dense_2_out_layers, dense_3_out_layers); + llm->build_dense_out(dense_2_out_layers, dense_2_out_layers_b, dense_3_out_layers); llm->res->set_outputs(); diff --git a/src/llama-model.h b/src/llama-model.h index 5ffba24fe9..422ed45699 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -492,8 +492,9 @@ struct llama_model { //Dense linear projections for SentenceTransformers models like embeddinggemma // For Sentence Transformers models structure see // https://sbert.net/docs/sentence_transformer/usage/custom_models.html#structure-of-sentence-transformer-models - struct ggml_tensor * dense_2_out_layers = nullptr; - struct ggml_tensor * dense_3_out_layers = nullptr; + struct ggml_tensor * dense_2_out_layers = nullptr; + struct ggml_tensor * dense_2_out_layers_b = nullptr; + struct ggml_tensor * dense_3_out_layers = nullptr; // gguf metadata std::unordered_map gguf_kv; diff --git a/src/models/lfm2.cpp b/src/models/lfm2.cpp index 7f805d7879..cf01ad6255 100644 --- a/src/models/lfm2.cpp +++ b/src/models/lfm2.cpp @@ -1,18 +1,149 @@ #include "models.h" +#include "../llama-memory-hybrid-iswa.h" #include "../llama-memory-hybrid.h" +template +llm_build_lfm2::llm_build_lfm2(const llama_model & model, const llm_graph_params & params) : + llm_graph_context(params) { + using inp_hybrid_type = std::conditional_t; + using inp_attn_type = std::conditional_t; + using mem_hybrid_ctx = std::conditional_t; -llm_build_lfm2::llm_build_lfm2(const llama_model & model, const llm_graph_params & params) : - llm_graph_context(params), - model(model) { + // lambda helpers for readability + auto build_dense_feed_forward = [&model, this](ggml_tensor * cur, int il) -> ggml_tensor * { + GGML_ASSERT(!model.layers[il].ffn_up_b); + GGML_ASSERT(!model.layers[il].ffn_gate_b); + GGML_ASSERT(!model.layers[il].ffn_down_b); + return build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); + }; + auto build_moe_feed_forward = [&model, this](ggml_tensor * cur, int il) -> ggml_tensor * { + return build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps, + model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, true, false, 0.0, + static_cast(hparams.expert_gating_func), il); + }; + auto build_attn_block = [&model, this](ggml_tensor * cur, + ggml_tensor * inp_pos, + inp_attn_type * inp_attn, + int il) -> ggml_tensor * { + GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il)); + const auto n_embd_head = hparams.n_embd_head_v; + const auto n_head_kv = hparams.n_head_kv(il); + + auto * q = build_lora_mm(model.layers[il].wq, cur); + cb(q, "model.layers.{}.self_attn.q_proj", il); + auto * k = build_lora_mm(model.layers[il].wk, cur); + cb(k, "model.layers.{}.self_attn.k_proj", il); + auto * v = build_lora_mm(model.layers[il].wv, cur); + cb(v, "model.layers.{}.self_attn.v_proj", il); + + q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head, n_tokens); + k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens); + v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens); + + // qk norm + q = build_norm(q, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); + cb(q, "model.layers.{}.self_attn.q_layernorm", il); + k = build_norm(k, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); + cb(k, "model.layers.{}.self_attn.k_layernorm", il); + + // RoPE + q = ggml_rope_ext(ctx0, q, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, + attn_factor, beta_fast, beta_slow); + k = ggml_rope_ext(ctx0, k, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, + attn_factor, beta_fast, beta_slow); + + cur = build_attn(inp_attn, + model.layers[il].wo, NULL, + q, k, v, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il); + + cb(cur, "model.layers.{}.self_attn.out_proj", il); + + return cur; + }; + auto build_shortconv_block = [&model, this](ggml_tensor * cur, + llm_graph_input_rs * inp_recr, + int il) -> ggml_tensor * { + const auto * mctx_cur = static_cast(mctx)->get_recr(); + const uint32_t kv_head = mctx_cur->get_head(); + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + const int64_t n_seqs = ubatch.n_seqs; + GGML_ASSERT(n_seqs != 0); + GGML_ASSERT(ubatch.equal_seqs()); + GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); + + GGML_ASSERT(hparams.n_shortconv_l_cache > 1); + const uint32_t d_conv = hparams.n_shortconv_l_cache - 1; + + // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs} + cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs); + + auto * bcx = build_lora_mm(model.layers[il].shortconv.in_proj, cur); + cb(bcx, "model.layers.{}.conv.in_proj", il); + + constexpr auto n_chunks = 3; + GGML_ASSERT(bcx->ne[0] % n_chunks == 0); + const auto chunk_size = bcx->ne[0] / n_chunks; + auto * b = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2], + 0 * chunk_size * ggml_element_size(bcx)); + auto * c = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2], + 1 * chunk_size * ggml_element_size(bcx)); + auto * x = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2], + 2 * chunk_size * ggml_element_size(bcx)); + + auto * bx = ggml_transpose(ctx0, ggml_mul(ctx0, b, x)); + + // read conv state + auto * conv_state = mctx_cur->get_r_l(il); + auto * conv_rs = build_rs(inp_recr, conv_state, hparams.n_embd_r(), n_seqs); + auto * conv = ggml_reshape_3d(ctx0, conv_rs, d_conv, hparams.n_embd, n_seqs); + + bx = ggml_concat(ctx0, conv, bx, 0); + GGML_ASSERT(bx->ne[0] > conv->ne[0]); + + // last d_conv columns is a new conv state + auto * new_conv = ggml_view_3d(ctx0, bx, conv->ne[0], bx->ne[1], bx->ne[2], bx->nb[1], bx->nb[2], + (bx->ne[0] - conv->ne[0]) * ggml_element_size(bx)); + GGML_ASSERT(ggml_are_same_shape(conv, new_conv)); + + // write new conv conv state + ggml_build_forward_expand(gf, ggml_cpy(ctx0, new_conv, + ggml_view_1d(ctx0, conv_state, ggml_nelements(new_conv), + kv_head * d_conv * n_embd * ggml_element_size(new_conv)))); + + auto * conv_kernel = model.layers[il].shortconv.conv; + auto * conv_out = ggml_ssm_conv(ctx0, bx, conv_kernel); + cb(conv_out, "model.layers.{}.conv.conv", il); + + auto * y = ggml_mul(ctx0, c, conv_out); + y = build_lora_mm(model.layers[il].shortconv.out_proj, y); + cb(y, "model.layers.{}.conv.out_proj", il); + // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens} + y = ggml_reshape_2d(ctx0, y, y->ne[0], n_seq_tokens * n_seqs); + + return y; + }; + + // actual graph construction starts here ggml_tensor * cur = build_inp_embd(model.tok_embd); cb(cur, "model.embed_tokens", -1); ggml_build_forward_expand(gf, cur); + inp_hybrid_type * inp_hybrid = nullptr; + if constexpr (iswa) { + inp_hybrid = build_inp_mem_hybrid_iswa(); + } else { + inp_hybrid = build_inp_mem_hybrid(); + } + ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_hybrid = build_inp_mem_hybrid(); ggml_tensor * inp_out_ids = build_inp_out_ids(); for (int il = 0; il < n_layer; ++il) { @@ -54,122 +185,6 @@ llm_build_lfm2::llm_build_lfm2(const llama_model & model, const llm_graph_params ggml_build_forward_expand(gf, cur); } -ggml_tensor * llm_build_lfm2::build_moe_feed_forward(ggml_tensor * cur, int il) const { - return build_moe_ffn(cur, - model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, - model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps, - model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, true, false, 0.0, - static_cast(hparams.expert_gating_func), il); -} - -ggml_tensor * llm_build_lfm2::build_dense_feed_forward(ggml_tensor * cur, int il) const { - GGML_ASSERT(!model.layers[il].ffn_up_b); - GGML_ASSERT(!model.layers[il].ffn_gate_b); - GGML_ASSERT(!model.layers[il].ffn_down_b); - return build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); -} - -ggml_tensor * llm_build_lfm2::build_attn_block(ggml_tensor * cur, - ggml_tensor * inp_pos, - llm_graph_input_attn_kv * inp_attn, - int il) const { - GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il)); - const auto n_embd_head = hparams.n_embd_head_v; - const auto n_head_kv = hparams.n_head_kv(il); - - auto * q = build_lora_mm(model.layers[il].wq, cur); - cb(q, "model.layers.{}.self_attn.q_proj", il); - auto * k = build_lora_mm(model.layers[il].wk, cur); - cb(k, "model.layers.{}.self_attn.k_proj", il); - auto * v = build_lora_mm(model.layers[il].wv, cur); - cb(v, "model.layers.{}.self_attn.v_proj", il); - - q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head, n_tokens); - k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens); - v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens); - - // qk norm - q = build_norm(q, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); - cb(q, "model.layers.{}.self_attn.q_layernorm", il); - k = build_norm(k, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); - cb(k, "model.layers.{}.self_attn.k_layernorm", il); - - // RoPE - q = ggml_rope_ext(ctx0, q, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, - attn_factor, beta_fast, beta_slow); - k = ggml_rope_ext(ctx0, k, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, - attn_factor, beta_fast, beta_slow); - - cur = build_attn(inp_attn, - model.layers[il].wo, NULL, - q, k, v, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il); - - cb(cur, "model.layers.{}.self_attn.out_proj", il); - - return cur; -} - -ggml_tensor * llm_build_lfm2::build_shortconv_block(ggml_tensor * cur, llm_graph_input_rs * inp_recr, int il) { - const auto * mctx_cur = static_cast(mctx)->get_recr(); - const uint32_t kv_head = mctx_cur->get_head(); - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; - GGML_ASSERT(n_seqs != 0); - GGML_ASSERT(ubatch.equal_seqs()); - GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); - - GGML_ASSERT(hparams.n_shortconv_l_cache > 1); - const uint32_t d_conv = hparams.n_shortconv_l_cache - 1; - - // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs} - cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs); - - auto * bcx = build_lora_mm(model.layers[il].shortconv.in_proj, cur); - cb(bcx, "model.layers.{}.conv.in_proj", il); - - constexpr auto n_chunks = 3; - GGML_ASSERT(bcx->ne[0] % n_chunks == 0); - const auto chunk_size = bcx->ne[0] / n_chunks; - auto * b = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2], - 0 * chunk_size * ggml_element_size(bcx)); - auto * c = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2], - 1 * chunk_size * ggml_element_size(bcx)); - auto * x = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2], - 2 * chunk_size * ggml_element_size(bcx)); - - auto * bx = ggml_transpose(ctx0, ggml_mul(ctx0, b, x)); - - // read conv state - auto * conv_state = mctx_cur->get_r_l(il); - auto * conv_rs = build_rs(inp_recr, conv_state, hparams.n_embd_r(), n_seqs); - auto * conv = ggml_reshape_3d(ctx0, conv_rs, d_conv, hparams.n_embd, n_seqs); - - bx = ggml_concat(ctx0, conv, bx, 0); - GGML_ASSERT(bx->ne[0] > conv->ne[0]); - - // last d_conv columns is a new conv state - auto * new_conv = ggml_view_3d(ctx0, bx, conv->ne[0], bx->ne[1], bx->ne[2], bx->nb[1], bx->nb[2], - (bx->ne[0] - conv->ne[0]) * ggml_element_size(bx)); - GGML_ASSERT(ggml_are_same_shape(conv, new_conv)); - - // write new conv conv state - ggml_build_forward_expand(gf, ggml_cpy(ctx0, new_conv, - ggml_view_1d(ctx0, conv_state, ggml_nelements(new_conv), - kv_head * d_conv * n_embd * ggml_element_size(new_conv)))); - - auto * conv_kernel = model.layers[il].shortconv.conv; - auto * conv_out = ggml_ssm_conv(ctx0, bx, conv_kernel); - cb(conv_out, "model.layers.{}.conv.conv", il); - - auto * y = ggml_mul(ctx0, c, conv_out); - y = build_lora_mm(model.layers[il].shortconv.out_proj, y); - cb(y, "model.layers.{}.conv.out_proj", il); - // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens} - y = ggml_reshape_2d(ctx0, y, y->ne[0], n_seq_tokens * n_seqs); - - return y; -} +// Explicit template instantiations +template struct llm_build_lfm2; +template struct llm_build_lfm2; diff --git a/src/models/models.h b/src/models/models.h index e23918f55f..7e5db59b00 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -347,15 +347,9 @@ struct llm_build_kimi_linear : public llm_build_delta_net_base { const llama_model & model; }; +template struct llm_build_lfm2 : public llm_graph_context { - const llama_model & model; - llm_build_lfm2(const llama_model & model, const llm_graph_params & params); - ggml_tensor * build_moe_feed_forward(ggml_tensor * cur, int il) const; - ggml_tensor * build_dense_feed_forward(ggml_tensor * cur, int il) const; - ggml_tensor * build_attn_block(ggml_tensor * cur, ggml_tensor * inp_pos, llm_graph_input_attn_kv * inp_attn, int il) const; - ggml_tensor * build_shortconv_block(ggml_tensor * cur, llm_graph_input_rs * inp_recr, int il); - }; struct llm_build_llada : public llm_graph_context {