From 382135aa3619294ab8bf87b0de4b1255ab7942f0 Mon Sep 17 00:00:00 2001
From: Aaron Lee
Date: Sun, 17 Aug 2025 21:54:45 -0400
Subject: [PATCH] server : fix MTP KV cache update sequencing after prompt
 processing

Move the MTP KV cache update so it runs after the sampled token has been
accepted, rather than before sampling. The update batch is non-empty exactly
once, right after prompt processing, and must not fire during token
generation.
---
 tools/server/server.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index b85fa4e769..e323f7b521 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -3543,18 +3543,19 @@ struct server_context {
 
             const int tok_idx = slot.i_batch - i;
 
-            // This should only trigger on a non-empty update batch once, after prompt processing but not during token generation
-            if (slot.has_mtp) {
-                mtp_update_kv_cache(ctx, slot.mtp_kv_update_batch);
-            }
-
             llama_token id = common_sampler_sample(slot.smpl, ctx, tok_idx);
             slot.last_tok_idx = tok_idx;
+            SRV_INF("main loop sampled token: '%s'\n", common_token_to_piece(ctx, id, true).c_str());
 
             slot.i_batch = -1;
 
             common_sampler_accept(slot.smpl, id, true);
 
+            // This should only trigger on a non-empty update batch once, after prompt processing but not during token generation
+            if (slot.has_mtp) {
+                mtp_update_kv_cache(ctx, slot.mtp_kv_update_batch);
+            }
+
             slot.n_decoded += 1;
 
             const int64_t t_current = ggml_time_us();
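
Note for reviewers (not part of the commit): below is an annotated sketch of
the per-token sequence this patch establishes. common_sampler_sample and
common_sampler_accept are upstream llama.cpp helpers; mtp_update_kv_cache,
slot.has_mtp, and slot.mtp_kv_update_batch exist only on this MTP branch, so
this is an excerpt of the patched update_slots path, not a standalone program.

    // 1. Sample the next token from the logits at tok_idx produced by the
    //    main model's decode.
    llama_token id = common_sampler_sample(slot.smpl, ctx, tok_idx);
    slot.last_tok_idx = tok_idx;

    // 2. Commit the sampled token to the sampler's internal state.
    common_sampler_accept(slot.smpl, id, true);

    // 3. Only after sampling and acceptance, flush any pending MTP KV cache
    //    update. The update batch is non-empty exactly once, immediately
    //    after prompt processing, so this is a no-op during normal token
    //    generation.
    if (slot.has_mtp) {
        mtp_update_kv_cache(ctx, slot.mtp_kv_update_batch);
    }

Moving the flush below the sampler calls appears intended to ensure the first
post-prompt token is sampled from the freshly decoded logits before the draft
model's cache is touched; previously the update ran before sampling.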