server : fix timing of prompt/generation (#18713)

Georgi Gerganov 2026-01-09 12:59:50 +02:00 committed by GitHub
parent d3435efc8a
commit 53eb9435da
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed file with 5 additions and 4 deletions


@@ -2615,10 +2615,6 @@ private:
     // on successful decode, restore the original batch size
     n_batch = llama_n_batch(ctx);
-    // technically, measuring the time here excludes the sampling time for the last batch
-    // but on the other hand, we don't want to do too many system calls to measure the time, so it's ok
-    const int64_t t_current = ggml_time_us();
     for (auto & slot : slots) {
         // may need to copy state to other slots
         if (slot.state == SLOT_STATE_DONE_PROMPT && slot.is_parent()) {
@@ -2685,6 +2681,9 @@ private:
     common_sampler_accept(slot.smpl.get(), id, true);
+    // here we have synchronized the llama_context (due to the sampling above), so we can do time measurement
+    const int64_t t_current = ggml_time_us();
     slot.n_decoded += 1;
     if (slot.n_decoded == 1) {
@@ -2728,6 +2727,8 @@ private:
     slot.i_batch_dft.clear();
     slot.drafted.clear();
+    const int64_t t_current = ggml_time_us();
     slot.n_decoded += ids.size();
     slot.t_token_generation = std::max<int64_t>(1, t_current - slot.t_start_generation) / 1e3;
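
The change moves the ggml_time_us() call from right after the decode to right after the sampling call, where the llama_context is already synchronized, so the generation timing no longer excludes the sampling time of the last batch and no extra synchronization is needed just to read the clock. Below is a minimal, self-contained sketch (not the server code) of the resulting per-slot timing pattern: std::chrono stands in for ggml_time_us(), a sleep stands in for decode plus sampling, and the slot_timing struct is a hypothetical stand-in for the server's slot fields.

// Minimal sketch of the per-slot timing pattern after this change.
// All names here are illustrative, not the server's actual types.
#include <algorithm>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <thread>

static int64_t time_us() {
    using namespace std::chrono;
    return duration_cast<microseconds>(steady_clock::now().time_since_epoch()).count();
}

struct slot_timing {
    int64_t t_start_generation = 0; // timestamp of the first decoded token
    int     n_decoded          = 0; // number of generated tokens so far
    double  t_token_generation = 0; // total generation time in milliseconds
};

int main() {
    slot_timing slot;

    for (int i = 0; i < 4; ++i) {
        // ... decode a batch and sample the next token here; sampling is the
        //     point where the context is known to be synchronized, so the
        //     timestamp is taken right after it ...
        std::this_thread::sleep_for(std::chrono::milliseconds(10));

        const int64_t t_current = time_us();

        slot.n_decoded += 1;
        if (slot.n_decoded == 1) {
            slot.t_start_generation = t_current;
        }

        // same formula as in the patch: clamp to at least 1 us, convert to ms
        slot.t_token_generation = std::max<int64_t>(1, t_current - slot.t_start_generation) / 1e3;
    }

    printf("decoded %d tokens in %.2f ms\n", slot.n_decoded, slot.t_token_generation);
    return 0;
}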