diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index e1f65dfcce..324c3af30c 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -2615,10 +2615,6 @@ private:
         // on successful decode, restore the original batch size
         n_batch = llama_n_batch(ctx);
 
-        // technically, measuring the time here excludes the sampling time for the last batch
-        // but on the other hand, we don't want to do too many system calls to measure the time, so it's ok
-        const int64_t t_current = ggml_time_us();
-
         for (auto & slot : slots) {
             // may need to copy state to other slots
             if (slot.state == SLOT_STATE_DONE_PROMPT && slot.is_parent()) {
@@ -2685,6 +2681,9 @@ private:
 
                 common_sampler_accept(slot.smpl.get(), id, true);
 
+                // here we have synchronized the llama_context (due to the sampling above), so we can do time measurement
+                const int64_t t_current = ggml_time_us();
+
                 slot.n_decoded += 1;
 
                 if (slot.n_decoded == 1) {
@@ -2728,6 +2727,8 @@ private:
                 slot.i_batch_dft.clear();
                 slot.drafted.clear();
 
+                const int64_t t_current = ggml_time_us();
+
                 slot.n_decoded += ids.size();
                 slot.t_token_generation = std::max(1, t_current - slot.t_start_generation) / 1e3;
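
For context on why the timestamp moves: as the removed and added comments say, sampling reads the result of the last decode and thereby synchronizes the llama_context, so a timestamp taken before the per-slot sampling loop excludes that time from the measurement for the last batch. The following is a minimal, self-contained sketch of that effect, not part of the patch: it uses std::chrono and std::async purely as stand-ins for ggml_time_us() and for an asynchronous decode followed by a synchronizing sample; none of the names in it come from llama.cpp.

```cpp
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <future>
#include <thread>

// stand-in for ggml_time_us(): wall-clock time in microseconds
static int64_t time_us() {
    using namespace std::chrono;
    return duration_cast<microseconds>(steady_clock::now().time_since_epoch()).count();
}

int main() {
    const int64_t t_start = time_us();

    // stand-in for the decode: work keeps running asynchronously after the call returns
    auto logits = std::async(std::launch::async, [] {
        std::this_thread::sleep_for(std::chrono::milliseconds(50));
        return 42;
    });

    const int64_t t_before_sync = time_us(); // old placement: the async work may still be running

    (void) logits.get(); // stand-in for sampling: reading the result forces synchronization

    const int64_t t_after_sync = time_us();  // new placement: taken after the synchronization

    std::printf("elapsed before sync: %lld us, after sync: %lld us\n",
                (long long) (t_before_sync - t_start),
                (long long) (t_after_sync  - t_start));
    return 0;
}
```

Printing both deltas shows the gap the patch closes: the "before sync" value misses the time spent waiting for the asynchronous work, while the "after sync" value reflects the full wall-clock time for the step.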