From 8e641eb4cd84dfb6180fd2cb88652fd9b77c310e Mon Sep 17 00:00:00 2001
From: Apoorv Reddy
Date: Thu, 16 May 2024 07:15:57 -0700
Subject: [PATCH] Add TTFT to TimingInfo

PiperOrigin-RevId: 634378994
---
 gemma/gemma.cc | 3 +++
 gemma/gemma.h  | 1 +
 gemma/run.cc   | 4 +++-
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/gemma/gemma.cc b/gemma/gemma.cc
index 916241f..7f32804 100644
--- a/gemma/gemma.cc
+++ b/gemma/gemma.cc
@@ -1171,6 +1171,9 @@ void GenerateImpl(GemmaImpl& gemma,
       if (!runtime_config.stream_token(token, activations.logits[token])) {
         token = EOS_ID;
       }
+      if (generate_pos == 0) {
+        timing_info.time_to_first_token = hwy::platform::Now() - gen_start;
+      }
     } else {
       // We would take this branch if we were not doing Prefill but would
       // process the tokens of the prompt one at a time.
diff --git a/gemma/gemma.h b/gemma/gemma.h
index c2eb929..2b04b26 100644
--- a/gemma/gemma.h
+++ b/gemma/gemma.h
@@ -99,6 +99,7 @@ struct Gemma {
 struct TimingInfo {
   double prefill_tok_sec = 0.0;
   double gen_tok_sec = 0.0;
+  double time_to_first_token = 0;
 };
 
 KVCache CreateKVCache(Model type);  // convenient workaround for now
diff --git a/gemma/run.cc b/gemma/run.cc
index 8377bca..1b35523 100644
--- a/gemma/run.cc
+++ b/gemma/run.cc
@@ -224,7 +224,9 @@ void ReplGemma(gcpp::Gemma& model, ModelTraining training,
                 << "\n"
                 << timing_info.prefill_tok_sec << " prefill tokens / sec"
                 << "\n"
-                << timing_info.gen_tok_sec << " tokens / sec" << "\n";
+                << timing_info.gen_tok_sec << " tokens / sec" << "\n"
+                << static_cast<int>(timing_info.time_to_first_token * 1000)
+                << " milliseconds time to first token" << "\n";
     }
     std::cout << "\n\n";
   }