From 833dfb54aeae9958f41637b1d35983e4dffb0d48 Mon Sep 17 00:00:00 2001
From: Gabe Goodhart
Date: Tue, 10 Jun 2025 16:30:49 -0600
Subject: [PATCH] fix: Use per-layer n_embd_k/v_s calls for mamba (1) layers

Branch: HybridRecurrentCache

Signed-off-by: Gabe Goodhart
---
 src/llama-model.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 6e9dd53223..34643226e5 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -9194,11 +9194,11 @@ struct llm_build_mamba : public llm_graph_context {
         // (ab)using the KV cache to store the states
         ggml_tensor * conv = build_recurrent_state(
                 gf, conv_states_all, state_copy,
-                hparams.n_embd_k_s(), n_seqs);
+                hparams.n_embd_k_s(il), n_seqs);
         conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs);
         ggml_tensor * ssm = build_recurrent_state(
                 gf, ssm_states_all, state_copy,
-                hparams.n_embd_v_s(), n_seqs);
+                hparams.n_embd_v_s(il), n_seqs);
         ssm = ggml_reshape_3d(ctx0, ssm, d_state, d_inner, n_seqs);
 
         // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}