fixed mtp kv cache update sequencing after prompt processing
parent 6870f9790c
commit 382135aa36
@@ -3543,18 +3543,19 @@ struct server_context {
             const int tok_idx = slot.i_batch - i;
 
-            // This should only trigger on a non-empty update batch once, after prompt processing but not during token generation
-            if (slot.has_mtp) {
-                mtp_update_kv_cache(ctx, slot.mtp_kv_update_batch);
-            }
 
             llama_token id = common_sampler_sample(slot.smpl, ctx, tok_idx);
             slot.last_tok_idx = tok_idx;
             SRV_INF("main loop sampled token: '%s'\n", common_token_to_piece(ctx, id, true).c_str());
 
             slot.i_batch = -1;
 
             common_sampler_accept(slot.smpl, id, true);
 
+            // This should only trigger on a non-empty update batch once, after prompt processing but not during token generation
+            if (slot.has_mtp) {
+                mtp_update_kv_cache(ctx, slot.mtp_kv_update_batch);
+            }
+
             slot.n_decoded += 1;
 
             const int64_t t_current = ggml_time_us();
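For context, here is a minimal, self-contained sketch of the gating the moved block relies on, under the assumption (suggested by the in-diff comment) that slot.mtp_kv_update_batch is populated during prompt processing and emptied by the first update, so the call placed after common_sampler_accept does real work exactly once per request and is a no-op for the rest of token generation. All types and helpers below are illustrative stand-ins, not the real llama.cpp server API; only the ordering mirrors the diff.

// Stand-alone illustration of "update once, after prompt processing" gating.
// None of these names are the real llama.cpp/server symbols; they only mimic
// the ordering shown in the diff: sample -> accept -> MTP KV cache update.
#include <cstdio>
#include <vector>

struct mtp_update_batch {                       // stand-in for slot.mtp_kv_update_batch
    std::vector<int> prompt_tokens;             // filled during prompt processing
    bool empty() const { return prompt_tokens.empty(); }
};

static int  sample_token(int step) { return 100 + step; }            // stand-in sampler
static void accept_token(int tok)  { std::printf("accept %d\n", tok); }

static void mtp_update_kv_cache(mtp_update_batch & batch) {          // stand-in update
    if (batch.empty()) {                        // non-empty only right after prompt processing
        return;                                 // later calls during generation are no-ops
    }
    std::printf("MTP KV cache update over %zu prompt tokens\n", batch.prompt_tokens.size());
    batch.prompt_tokens.clear();                // consume the batch so this runs exactly once
}

int main() {
    mtp_update_batch batch{{1, 2, 3, 4}};       // pretend prompt processing queued 4 tokens
    const bool has_mtp = true;

    for (int step = 0; step < 3; ++step) {      // token-generation loop
        const int id = sample_token(step);      // 1) sample from the main model's logits
        accept_token(id);                       // 2) accept into the sampler state
        if (has_mtp) {
            mtp_update_kv_cache(batch);         // 3) MTP KV update; only the first pass
        }                                       //    after prompt processing does real work
    }
    return 0;
}

Modelled this way, the update can stay unconditional inside the decode loop while its cost is paid only once per request, which is what the comment carried along in the diff describes.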