diff --git a/gemma/gemma.cc b/gemma/gemma.cc index 916241f..7f32804 100644 --- a/gemma/gemma.cc +++ b/gemma/gemma.cc @@ -1171,6 +1171,9 @@ void GenerateImpl(GemmaImpl& gemma, if (!runtime_config.stream_token(token, activations.logits[token])) { token = EOS_ID; } + if (generate_pos == 0) { + timing_info.time_to_first_token = hwy::platform::Now() - gen_start; + } } else { // We would take this branch if we were not doing Prefill but would // process the tokens of the prompt one at a time. diff --git a/gemma/gemma.h b/gemma/gemma.h index c2eb929..2b04b26 100644 --- a/gemma/gemma.h +++ b/gemma/gemma.h @@ -99,6 +99,7 @@ struct Gemma { struct TimingInfo { double prefill_tok_sec = 0.0; double gen_tok_sec = 0.0; + double time_to_first_token = 0; }; KVCache CreateKVCache(Model type); // convenient workaround for now diff --git a/gemma/run.cc b/gemma/run.cc index 8377bca..1b35523 100644 --- a/gemma/run.cc +++ b/gemma/run.cc @@ -224,7 +224,9 @@ void ReplGemma(gcpp::Gemma& model, ModelTraining training, << "\n" << timing_info.prefill_tok_sec << " prefill tokens / sec" << "\n" - << timing_info.gen_tok_sec << " tokens / sec" << "\n"; + << timing_info.gen_tok_sec << " tokens / sec" << "\n" + << static_cast<int>(timing_info.time_to_first_token * 1000) + << " milliseconds time to first token" << "\n"; } std::cout << "\n\n"; }