From 8e641eb4cd84dfb6180fd2cb88652fd9b77c310e Mon Sep 17 00:00:00 2001
From: Apoorv Reddy <apoorvreddy@google.com>
Date: Thu, 16 May 2024 07:15:57 -0700
Subject: [PATCH] Add time to first token (TTFT) to TimingInfo

PiperOrigin-RevId: 634378994
---
 gemma/gemma.cc | 3 +++
 gemma/gemma.h  | 1 +
 gemma/run.cc   | 4 +++-
 3 files changed, 7 insertions(+), 1 deletion(-)
diff --git a/gemma/gemma.cc b/gemma/gemma.cc
index 916241f..7f32804 100644
--- a/gemma/gemma.cc
+++ b/gemma/gemma.cc
@@ -1171,6 +1171,9 @@ void GenerateImpl(GemmaImpl<TConfig>& gemma,
       if (!runtime_config.stream_token(token, activations.logits[token])) {
         token = EOS_ID;
       }
+      if (generate_pos == 0) {
+        timing_info.time_to_first_token = hwy::platform::Now() - gen_start;
+      }
     } else {
       // We would take this branch if we were not doing Prefill but would
       // process the tokens of the prompt one at a time.
diff --git a/gemma/gemma.h b/gemma/gemma.h
index c2eb929..2b04b26 100644
--- a/gemma/gemma.h
+++ b/gemma/gemma.h
@@ -99,6 +99,7 @@ struct Gemma {
 struct TimingInfo {
   double prefill_tok_sec = 0.0;
   double gen_tok_sec = 0.0;
+  double time_to_first_token = 0;  // Seconds; run.cc prints it as milliseconds.
 };
 
 KVCache CreateKVCache(Model type);  // convenient workaround for now
diff --git a/gemma/run.cc b/gemma/run.cc
index 8377bca..1b35523 100644
--- a/gemma/run.cc
+++ b/gemma/run.cc
@@ -224,7 +224,9 @@ void ReplGemma(gcpp::Gemma& model, ModelTraining training,
                 << "\n"
                 << timing_info.prefill_tok_sec << " prefill tokens / sec"
                 << "\n"
-                << timing_info.gen_tok_sec << " tokens / sec" << "\n";
+                << timing_info.gen_tok_sec << " tokens / sec" << "\n"
+                << static_cast<int>(timing_info.time_to_first_token * 1000)
+                << " milliseconds time to first token" << "\n";
     }
     std::cout << "\n\n";
   }