From 382135aa3619294ab8bf87b0de4b1255ab7942f0 Mon Sep 17 00:00:00 2001
From: Aaron Lee
Date: Sun, 17 Aug 2025 21:54:45 -0400
Subject: [PATCH] server : fix MTP KV cache update sequencing after prompt
 processing

Move the MTP KV cache update so it runs after the sampled token has been
accepted, rather than before sampling. The update batch is non-empty exactly
once, right after prompt processing, and must not fire during token
generation.
---
 tools/server/server.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index b85fa4e769..e323f7b521 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -3543,18 +3543,19 @@ struct server_context {
 
             const int tok_idx = slot.i_batch - i;
 
-            // This should only trigger on a non-empty update batch once, after prompt processing but not during token generation
-            if (slot.has_mtp) {
-                mtp_update_kv_cache(ctx, slot.mtp_kv_update_batch);
-            }
-
             llama_token id = common_sampler_sample(slot.smpl, ctx, tok_idx);
             slot.last_tok_idx = tok_idx;
+            SRV_INF("main loop sampled token: '%s'\n", common_token_to_piece(ctx, id, true).c_str());
 
             slot.i_batch = -1;
 
             common_sampler_accept(slot.smpl, id, true);
 
+            // This should only trigger on a non-empty update batch once, after prompt processing but not during token generation
+            if (slot.has_mtp) {
+                mtp_update_kv_cache(ctx, slot.mtp_kv_update_batch);
+            }
+
             slot.n_decoded += 1;
 
             const int64_t t_current = ggml_time_us();
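
Note for reviewers (not part of the commit): below is an annotated sketch of
the per-token sequence this patch establishes. common_sampler_sample and
common_sampler_accept are upstream llama.cpp helpers; mtp_update_kv_cache,
slot.has_mtp, and slot.mtp_kv_update_batch exist only on this MTP branch, so
this is an excerpt of the patched update_slots path, not a standalone program.

    // 1. Sample the next token from the logits at tok_idx produced by the
    //    main model's decode.
    llama_token id = common_sampler_sample(slot.smpl, ctx, tok_idx);
    slot.last_tok_idx = tok_idx;

    // 2. Commit the sampled token to the sampler's internal state.
    common_sampler_accept(slot.smpl, id, true);

    // 3. Only after sampling and acceptance, flush any pending MTP KV cache
    //    update. The update batch is non-empty exactly once, immediately
    //    after prompt processing, so this is a no-op during normal token
    //    generation.
    if (slot.has_mtp) {
        mtp_update_kv_cache(ctx, slot.mtp_kv_update_batch);
    }

Moving the flush below the sampler calls appears intended to ensure the first
post-prompt token is sampled from the freshly decoded logits before the draft
model's cache is touched; previously the update ran before sampling.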