diff --git a/tools/server/server.cpp b/tools/server/server.cpp index b85fa4e769..e323f7b521 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -3543,18 +3543,19 @@ struct server_context { const int tok_idx = slot.i_batch - i; - // This should only trigger on a non-empty update batch once, after prompt processing but not during token generation - if (slot.has_mtp) { - mtp_update_kv_cache(ctx, slot.mtp_kv_update_batch); - } - llama_token id = common_sampler_sample(slot.smpl, ctx, tok_idx); slot.last_tok_idx = tok_idx; + SRV_INF("main loop sampled token: '%s'\n", common_token_to_piece(ctx, id, true).c_str()); slot.i_batch = -1; common_sampler_accept(slot.smpl, id, true); + // This should only trigger on a non-empty update batch once, after prompt processing but not during token generation + if (slot.has_mtp) { + mtp_update_kv_cache(ctx, slot.mtp_kv_update_batch); + } + slot.n_decoded += 1; const int64_t t_current = ggml_time_us();