diff --git a/BUILD.bazel b/BUILD.bazel
index f482e56..f5fad45 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -111,6 +111,7 @@ cc_library(
         ":basics",
         ":threading",
         ":topology",
+        ":zones",
         "@highway//:hwy",
         "@highway//:hwy_test_util",
         "@highway//:profiler",
@@ -118,6 +119,15 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "zones",
+    srcs = ["util/zones.cc"],
+    hdrs = ["util/zones.h"],
+    deps = [
+        "@highway//:profiler",
+    ],
+)
+
 cc_test(
     name = "flash_attention_test",
     srcs = ["gemma/flash_attention_test.cc"],
@@ -263,6 +273,7 @@ cc_library(
         ":model_store",
         ":tensor_info",
         ":threading_context",
+        ":zones",
         "//compression:compress",
         "//io:blob_store",
         "@highway//:hwy",
@@ -321,6 +332,7 @@ cc_library(
         ":matmul_env",
         ":threading",
         ":threading_context",
+        ":zones",
         "//compression:compress",
         "@highway//:bit_set",
         "@highway//:hwy",
@@ -352,6 +364,7 @@ cc_library(
         ":matmul",
         ":matmul_env",
         ":threading_context",
+        ":zones",
         "//compression:compress",
         "//compression:types",
         "@highway//:hwy",
@@ -376,6 +389,7 @@ cc_library(
         ":matmul_env",  # MMOptions
         ":matmul_static",
         ":threading_context",
+        ":zones",
         "//compression:compress",
         "@highway//:algo",
         "@highway//:bit_set",
@@ -431,6 +445,7 @@ cc_test(
         ":ops",
         ":test_util",
         ":threading_context",
+        ":zones",
         "@googletest//:gtest_main",  # buildcleaner: keep
         "//compression:test_util",
         "//compression:types",
@@ -556,6 +571,7 @@ cc_library(
         ":threading",
         ":threading_context",
         ":weights",
+        ":zones",
         "//compression:compress",
         "//compression:types",
         "//io",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5dc4e11..983d643 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -130,6 +130,8 @@ set(SOURCES
   util/threading.h
   util/topology.cc
   util/topology.h
+  util/zones.cc
+  util/zones.h
 )
 
 # Add C API sources only when building DLL
diff --git a/gemma/attention.cc b/gemma/attention.cc
index a77021a..8950bc2 100644
--- a/gemma/attention.cc
+++ b/gemma/attention.cc
@@ -19,6 +19,7 @@
 #include <vector>
 
 #include "compression/types.h"  // GEMMA_DISABLED_TARGETS
+#include "util/zones.h"
 #ifndef HWY_DISABLED_TARGETS
 #define HWY_DISABLED_TARGETS GEMMA_DISABLED_TARGETS
 #endif  // HWY_DISABLED_TARGETS
@@ -55,8 +56,7 @@ static HWY_INLINE void QDotK(const size_t start_pos, const size_t last_pos,
                              const float* HWY_RESTRICT q,
                              const MatPtrT<KV_t>& k, float* HWY_RESTRICT att,
                              hwy::Profiler& p, const size_t worker) {
-  static const auto zone = p.AddZone("Gen.Attention.QDotK");
-  PROFILER_ZONE3(p, worker, zone);
+  PROFILER_ZONE3(p, worker, GetProfilerZone(Zones::kGenAttentionQDotK));
   if (HWY_LIKELY(last_pos < static_cast<size_t>(div_seq_len.GetDivisor()))) {
     // Slightly faster: no wraparound.
     for (size_t pos = start_pos; pos <= last_pos; ++pos) {
@@ -175,7 +175,12 @@ void DotSoftmaxWeightedSum(const size_t num_tokens, const size_t layer_idx,
                            const LayerWeightsPtrs& layer,
                            AttentionActivations& activations, QBatch& qbatch,
                            ThreadingContext& ctx) {
-  static const auto zone = ctx.profiler.AddZone("Gen.Attention.DotSoftmax.par");
+  static const auto root_zone =
+      ctx.profiler.AddZone("Gen.Attention.DotSoftmaxWeightedSumInclusive",
+                           hwy::ProfilerFlags::kInclusive);
+  PROFILER_ZONE3(ctx.profiler, 0, root_zone);
+  const auto zone =
+      GetProfilerZone(Zones::kGenAttentionDotSoftmaxWeightedSumPar);
 
   const hwy::Divisor div_qbatch(qbatch.Size());
   const LayerConfig& layer_config = layer.layer_config;
diff --git a/gemma/flash_attention.cc b/gemma/flash_attention.cc
index 548c1aa..cfadf28 100644
--- a/gemma/flash_attention.cc
+++ b/gemma/flash_attention.cc
@@ -22,6 +22,7 @@
 
 #include "compression/types.h"  // GEMMA_DISABLED_TARGETS
 #include "util/threading_context.h"
+#include "util/zones.h"
 #ifndef HWY_DISABLED_TARGETS
 #define HWY_DISABLED_TARGETS GEMMA_DISABLED_TARGETS
 #endif  // HWY_DISABLED_TARGETS
@@ -60,7 +61,7 @@ static constexpr size_t kNFx8HTileSize = 8;
 // possible consecutive elements have the same KV.
 static void TransposeQ(const MatPtrT<float>& q, MatPtrT<float>& q_t,
                        const size_t qbatch_size, ThreadingContext& ctx) {
-  static const auto zone = ctx.profiler.AddZone("Gen.Attention.TransposeQ");
+  const auto zone = GetProfilerZone(Zones::kFlashAttentionTransposeQ);
   // Group floats by the number of floats in a cache line.
   const size_t kNF = ctx.cache_info.LineBytes() / sizeof(float);
   const size_t num_heads = q.Cols() / q_t.Rows();
@@ -95,8 +96,8 @@ void RMSNormAndPositionalEncoding(const size_t num_tokens, const QBatch& qbatch,
                                   const LayerWeightsPtrs& layer,
                                   const AttentionActivations& activations,
                                   ThreadingContext& ctx) {
-  static const auto zone =
-      ctx.profiler.AddZone("Gen.Attention.RMSNormAndPositionalEncoding");
+  const auto zone =
+      GetProfilerZone(Zones::kFlashAttentionRmsNormAndPositionalEncoding);
   const float query_scale = activations.query_scale;
   const hwy::Divisor div_qbatch(qbatch.Size());
   const auto func = [&](const size_t task, size_t worker) HWY_ATTR {
@@ -158,8 +159,8 @@ void SingleFlashAttention(const size_t start_pos, const size_t last_pos,
                           const AttentionActivations& activations,
                           float* HWY_RESTRICT att_out, hwy::Profiler& p,
                           const size_t worker) {
-  static const auto zone = p.AddZone("Gen.Attention.SingleFlashAttention");
-  PROFILER_ZONE3(p, worker, zone);
+  PROFILER_ZONE3(p, worker,
+                 GetProfilerZone(Zones::kFlashAttentionSingleFlashAttention));
   const size_t pos_mod = activations.div_seq_len.Remainder(start_pos);
   float m = Dot(q, k.Row(pos_mod), k.Cols());
   if (float cap = activations.config.att_cap; cap > 0.0f) {
@@ -276,8 +277,8 @@ void TileFlashAttention(
     const LayerWeightsPtrs& layer, const AttentionActivations& activations,
     MatPtrT<float>& att_out, const uint32_t* HWY_RESTRICT out_offsets,
     hwy::Profiler& p, const size_t worker) {
-  static const auto zone = p.AddZone("Gen.Attention.TileFlashAttention");
-  PROFILER_ZONE3(p, worker, zone);
+  PROFILER_ZONE3(p, worker,
+                 GetProfilerZone(Zones::kFlashAttentionTileFlashAttention));
   constexpr int kHTileSize = kNFx8HTileSize;
   using DF = hn::ScalableTag<float>;
   const DF df;
@@ -430,8 +431,8 @@ void TileFlashAttention4(
     const LayerWeightsPtrs& layer, const AttentionActivations& activations,
     MatPtrT<float>& att_out, const uint32_t* HWY_RESTRICT out_offsets,
     hwy::Profiler& p, const size_t worker) {
-  static const auto zone = p.AddZone("Gen.Attention.TileFlashAttention4");
-  PROFILER_ZONE3(p, worker, zone);
+  PROFILER_ZONE3(p, worker,
+                 GetProfilerZone(Zones::kFlashAttentionTileFlashAttention4));
   using DF = hn::ScalableTag<float>;
   const DF df;
   using VF = hn::Vec<DF>;
@@ -524,6 +525,21 @@ static size_t RoundToSuitablePowerOf2(size_t n) {
   return 32;
 }
 
+// The vertical tile size is determined by the ability to use tiling and the
+// target_parallelism. In practice the possible tile sizes in order of
+// preference for efficiency are kNF, 4, 1, where kNF is likely to be 4 8 or
+// 16. The final tile size is chosen to be the largest possible that allows
+// for target_parallelism parallel tasks.
+size_t GetVTileSize(size_t kNF, size_t num_head_groups, size_t num_tokens,
+                    size_t total_tasks, size_t target_parallelism) {
+  const size_t kMaxEqualK =
+      RoundToSuitablePowerOf2(num_head_groups * num_tokens);
+  const size_t kMinTileSize = (total_tasks / 4 >= target_parallelism) ? 4 : 1;
+  return (kNF <= kMaxEqualK && total_tasks / kNF >= target_parallelism)
+             ? kNF
+             : std::min(kMinTileSize, kMaxEqualK);
+}
+
 // The nominal aim of attention is to combine 3 inputs Q[L,D], K[L,D], V[L,D]
 // into a single output O[L,D].
 // Conventional attention first computes A[L,L] = Q . KT
@@ -582,7 +598,10 @@ void FlashAttention(const size_t num_tokens, const size_t target_parallelism,
                     const size_t layer_idx, const LayerWeightsPtrs& layer,
                     AttentionActivations& activations, QBatch& qbatch,
                     ThreadingContext& ctx) {
-  static const auto zone = ctx.profiler.AddZone("Gen.Attention.FlashAttention");
+  static const auto root_zone = ctx.profiler.AddZone(
+      "FlashAttention.Inclusive", hwy::ProfilerFlags::kInclusive);
+  PROFILER_ZONE3(ctx.profiler, 0, root_zone);
+  const auto zone = GetProfilerZone(Zones::kFlashAttentionFlashAttention);
   RMSNormAndPositionalEncoding(num_tokens, qbatch, activations.q, layer_idx,
                                layer, activations, ctx);
   const hwy::Divisor div_qbatch(qbatch.Size());
@@ -603,17 +622,8 @@ void FlashAttention(const size_t num_tokens, const size_t target_parallelism,
   const size_t kNF = hn::Lanes(df);
   constexpr size_t kMaxNF = hn::MaxLanes(df);
   HWY_DASSERT(kNF <= kMaxNF);
-  // The vertical tile size is determined by the ability to use tiling and the
-  // target_parallelism. In practice the possible tile sizes in order of
-  // preference for efficiency are kNF, 4, 1, where kNF is likely to be 4 8 or
-  // 16. The final tile size is chosen to be the largest possible that allows
-  // for target_parallelism parallel tasks.
-  const size_t kMaxEqualK = RoundToSuitablePowerOf2(kHeadGroups * num_tokens);
-  const size_t kMinTileSize = (total_tasks / 4 >= target_parallelism) ? 4 : 1;
-  const size_t kVTileSize =
-      (kNF <= kMaxEqualK && total_tasks / kNF >= target_parallelism)
-          ? kNF
-          : std::min(kMinTileSize, kMaxEqualK);
+  const size_t kVTileSize = GetVTileSize(kNF, kHeadGroups, num_tokens,
+                                         total_tasks, target_parallelism);
   // Only transpose Q if we are using tiling.
   if (kVTileSize == kNF) {
     size_t max_last = 0, min_start = std::numeric_limits<size_t>::max();
diff --git a/gemma/flash_attention.h b/gemma/flash_attention.h
index 75e087a..8aa787b 100644
--- a/gemma/flash_attention.h
+++ b/gemma/flash_attention.h
@@ -42,6 +42,9 @@ namespace gcpp {
                             float* HWY_RESTRICT att_out, hwy::Profiler& p,   \
                             size_t worker);                                  \
                                                                              \
+  size_t GetVTileSize(size_t kNF, size_t num_head_groups, size_t num_tokens, \
+                      size_t total_tasks, size_t target_parallelism);        \
+                                                                             \
   void FlashAttention(size_t num_tokens, size_t target_parallelism,          \
                       size_t layer_idx, const LayerWeightsPtrs& layer,       \
                       AttentionActivations& activations, QBatch& qbatch,     \
diff --git a/gemma/flash_attention_test.cc b/gemma/flash_attention_test.cc
index 7f8f31e..4147e38 100644
--- a/gemma/flash_attention_test.cc
+++ b/gemma/flash_attention_test.cc
@@ -101,7 +101,6 @@ void AssertClose(const MatPtrT<float>& a, const MatPtrT<float>& b) {
 void TestFlashAttention(size_t target_parallelism) {
   ThreadingArgs threading_args;
   ThreadingContext ctx(threading_args);
-  // hwy::ThreadPool& pool = ctx.pools.Pool();
   constexpr size_t kOuter = 1024;
   constexpr size_t kInner = 256;
   ModelConfig config(Model::GEMMA2_2B, Type::kF32, PromptWrapping::GEMMA_PT);
@@ -150,9 +149,19 @@ void TestFlashAttention(size_t target_parallelism) {
   // Copy the output to saved_att to allow for comparison.
   auto saved_att = MakeCopyOfMat(attention.att_out, ctx.allocator);
   SetMat(1, attention.q);
+  using DF = hn::ScalableTag<float>;
+  const DF df;
+  const size_t kNF = hn::Lanes(df);
+  const size_t total_tasks =
+      tokens.size() * div_qbatch.GetDivisor() * layer_config.heads;
+  const size_t kVTileSize = GetVTileSize(kNF, kHeadGroups, tokens.size(),
+                                         total_tasks, target_parallelism);
+  printf("FlashAttention: target_parallelism=%zu, kNF=%zu, kVTileSize=%zu\n",
+         target_parallelism, kNF, kVTileSize);
   FlashAttention(tokens.size(), target_parallelism, 0, layers, attention,
                  qbatch, ctx);
   AssertClose(attention.att_out, *saved_att);
+  ctx.profiler.PrintResults();
 }
 
 void TestAttention() {
diff --git a/gemma/gemma-inl.h b/gemma/gemma-inl.h
index bdf989a..ecfbe47 100644
--- a/gemma/gemma-inl.h
+++ b/gemma/gemma-inl.h
@@ -24,6 +24,7 @@
 #include "ops/matmul.h"
 #include "util/mat.h"
 #include "util/threading.h"
+#include "util/zones.h"
 #include "hwy/profiler.h"
 
 // Include guard (still compiled once per target)
@@ -48,8 +49,7 @@ template <typename T1, typename T2>
 void Activation(ActivationType activation, T1* HWY_RESTRICT c1,
                 const T2* HWY_RESTRICT c2, const size_t count, hwy::Profiler& p,
                 const size_t worker) {
-  static const auto zone = p.AddZone("Gen.Activation");
-  PROFILER_ZONE3(p, worker, zone);
+  PROFILER_ZONE3(p, worker, GetProfilerZone(Zones::kGenActivation));
   namespace hn = hwy::HWY_NAMESPACE;
   using DF = hn::ScalableTag<float>;
   using VF = hn::Vec<DF>;
@@ -88,8 +88,7 @@ static inline void Activation(ActivationType activation, const RowPtrsBF C1,
                               const IndexRange range_r,
                               const IndexRange range_c, const StridedViewBF C2,
                               hwy::Profiler& p, const size_t worker) {
-  static const auto zone = p.AddZone("Gen.ActivationFused");
-  PROFILER_ZONE3(p, worker, zone);
+  PROFILER_ZONE3(p, worker, GetProfilerZone(Zones::kGenActivationFused));
 
   const size_t cols = range_c.Num();
   HWY_DASSERT(C2.Cols() == cols);
diff --git a/gemma/gemma.cc b/gemma/gemma.cc
index c3e2bac..78c9cc4 100644
--- a/gemma/gemma.cc
+++ b/gemma/gemma.cc
@@ -19,6 +19,7 @@
 #include "gemma/gemma.h"
 
 #include "compression/types.h"  // GEMMA_DISABLED_TARGETS
+#include "util/zones.h"
 #ifndef HWY_DISABLED_TARGETS
 #define HWY_DISABLED_TARGETS GEMMA_DISABLED_TARGETS
 #endif  // HWY_DISABLED_TARGETS
@@ -466,14 +467,12 @@ ChooseSampleFunc(const RuntimeConfig& runtime_config,
   // If user provided a sample_func, use it.
   if (runtime_config.sample_func) return runtime_config.sample_func;
 
-  static const auto zone_top1 = ctx.profiler.AddZone("Gen.Sample Top1");
-  static const auto zone_topK = ctx.profiler.AddZone("Gen.Sample general");
-
   // Fast path for top-1 with no accept_token.
   if (runtime_config.top_k == 1 && !runtime_config.accept_token) {
     return [&](size_t /*qi*/, size_t /*pos*/, Logits logits, size_t worker)
                HWY_ATTR -> TokenAndProb {
-                 PROFILER_ZONE3(ctx.profiler, worker, zone_top1);
+                 PROFILER_ZONE3(ctx.profiler, worker,
+                                GetProfilerZone(Zones::kGenSampleTop1));
                  return Top1OfSoftmax(logits);
                };
   }
@@ -481,7 +480,8 @@ ChooseSampleFunc(const RuntimeConfig& runtime_config,
   // General case: Softmax with top-k sampling.
   return [&](size_t qi, size_t pos, Logits logits,
              size_t worker) HWY_ATTR -> TokenAndProb {
-    PROFILER_ZONE3(ctx.profiler, worker, zone_topK);
+    PROFILER_ZONE3(ctx.profiler, worker,
+                   GetProfilerZone(Zones::kGenSampleTopK));
     // We want a different sequence for each batch element and position.
     const uint64_t stream = (static_cast<uint64_t>(qi) << 32) | pos;
     RngStream gen(engine, stream);
diff --git a/gemma/weights.cc b/gemma/weights.cc
index fb59297..cd8875b 100644
--- a/gemma/weights.cc
+++ b/gemma/weights.cc
@@ -32,6 +32,7 @@
 #include "io/blob_store.h"
 #include "util/mat.h"
 #include "util/threading_context.h"
+#include "util/zones.h"
 #include "hwy/base.h"
 #include "hwy/contrib/thread_pool/thread_pool.h"
 #include "hwy/highway.h"
@@ -379,8 +380,7 @@ static void DecompressToBF16(MatPtr& mat,
 
 static void ReadAllToBF16(const std::vector<TensorToRead>& tensors,
                           const BlobReader& reader, ThreadingContext& ctx) {
-  static const auto zone =
-      ctx.profiler.AddZone("Startup.Weights.ReadAllToBF16");
+  const auto zone = GetProfilerZone(Zones::kStartupWeightsReadAllToBF16);
   // Especially TSAN is slow enough to warrant hierarchical parallelism.
   const ParallelismStrategy strategy = HWY_IS_DEBUG_BUILD
                                            ? ParallelismStrategy::kHierarchical
@@ -463,7 +463,7 @@ static std::vector<IOBatch> MakeBatches(
 static void ReadBatches(const BlobReader& reader,
                         const std::vector<IOBatch>& batches,
                         ThreadingContext& ctx) {
-  static const auto zone = ctx.profiler.AddZone("Startup.Weights.ReadBatches");
+  const auto zone = GetProfilerZone(Zones::kStartupWeightsReadBatches);
   // >5x speedup from parallel reads when cached.
   ParallelFor(ParallelismStrategy::kHierarchical,
               batches.size(), ctx, /*cluster_idx=*/0,
diff --git a/ops/matmul-inl.h b/ops/matmul-inl.h
index 8957f4c..d72ac38 100644
--- a/ops/matmul-inl.h
+++ b/ops/matmul-inl.h
@@ -20,11 +20,12 @@
 #include <vector>
 
 #include "compression/types.h"
-#include "ops/matmul.h"  // IWYU pragma: export
+#include "ops/matmul.h"      // IWYU pragma: export
 #include "util/allocator.h"  // CacheInfo
 #include "util/basics.h"
 #include "util/mat.h"
 #include "util/threading_context.h"
+#include "util/zones.h"
 #include "hwy/base.h"
 #include "hwy/profiler.h"
 #include "hwy/timer.h"
@@ -290,7 +291,7 @@ class MMDecompress {
     const hn::ScalableTag<BF16> dbf;
     const size_t NBF = hn::Lanes(dbf);
 
-    static const auto zone = env.ctx.profiler.AddZone("MM.DecompressA");
+    const auto zone = GetProfilerZone(Zones::kMMDecompressA);
 
     const auto do_range =
         [&](const IndexRange& range_M, const IndexRange& range_K, size_t worker)
@@ -878,9 +879,9 @@ class MMLoops {
   static HWY_NOINLINE void Dispatch(const StridedViewBF A, const MatPtrT<TB>& B,
                                     const MatPtrT<TB>* B2, RowPtrs<TC> C,
                                     const MMArgs& args) {
-    static const auto zone = args.env.ctx.profiler.AddZone("MM.Dispatch");
     PROFILER_ZONE3(args.env.ctx.profiler,
-                   args.env.ctx.Worker(args.options.cluster_idx), zone);
+                   args.env.ctx.Worker(args.options.cluster_idx),
+                   GetProfilerZone(Zones::kMMDispatch));
 
     DispatchParallelism(
         args.options.parallelism, [&](const auto& parallel) HWY_ATTR {
@@ -903,7 +904,7 @@ class MMLoops {
                               const StridedViewBF A, const MatPtrT<TB>& B,
                               const MatPtrT<TB>* B2, RowPtrs<TC> C,
                               const MMArgs& args) {
-    static const auto zone = args.env.ctx.profiler.AddZone("MM.NT");
+    const auto zone = GetProfilerZone(Zones::kMMNT);
     HWY_DASSERT(args.ranges_mc.NumTasks() == 1);
     HWY_DASSERT(args.ranges_kc.NumTasks() == 1);
     const IndexRange& range_mc = args.ranges_mc.Range(0);
@@ -939,7 +940,7 @@ class MMLoops {
                               const StridedViewBF A, const MatPtrT<TB>& B,
                               const MatPtrT<TB>* B2, RowPtrs<TC> C,
                               const MMArgs& args) {
-    static const auto zone = args.env.ctx.profiler.AddZone("MM.NT_K");
+    const auto zone = GetProfilerZone(Zones::kMMNT_K);
     HWY_DASSERT(args.ranges_mc.NumTasks() == 1);
     const IndexRange& range_mc = args.ranges_mc.Range(0);
 
@@ -975,7 +976,7 @@ class MMLoops {
                               const StridedViewBF A, const MatPtrT<TB>& B,
                               const MatPtrT<TB>* B2, RowPtrs<TC> C,
                               const MMArgs& args) {
-    static const auto zone = args.env.ctx.profiler.AddZone("MM.NT_MT");
+    const auto zone = GetProfilerZone(Zones::kMMNT_MT);
     HWY_DASSERT(args.ranges_kc.NumTasks() == 1);
     const IndexRange& range_kc = args.ranges_kc.Range(0);
 
@@ -1009,7 +1010,7 @@ class MMLoops {
                               const StridedViewBF A, const MatPtrT<TB>& B,
                               const MatPtrT<TB>* B2, RowPtrs<TC> C,
                               const MMArgs& args) {
-    static const auto zone = args.env.ctx.profiler.AddZone("MM.NT_MT_K");
+    const auto zone = GetProfilerZone(Zones::kMMNT_MT_K);
 
     parallel.ForRangesMC_NC(
         args.env.ctx, args.ranges_mc, args.ranges_nc, args.options.cluster_idx,
@@ -1060,10 +1061,10 @@ template <typename TA, typename TB, typename TC>
 HWY_NOINLINE MMPerKey* MatMul(const MatPtrT<TA>& A, const MatPtrT<TB>& B,
                               const float* HWY_RESTRICT add, MatMulEnv& env,
                               MatPtrT<TC>& C, MMOptions options = MMOptions()) {
-  static const auto zone = env.ctx.profiler.AddZone("MM.MatMul");
   const size_t cluster_idx = options.cluster_idx;
   HWY_DASSERT(cluster_idx < env.row_ptrs.size());
-  PROFILER_ZONE3(env.ctx.profiler, env.ctx.Worker(cluster_idx), zone);
+  PROFILER_ZONE3(env.ctx.profiler, env.ctx.Worker(cluster_idx),
+                 GetProfilerZone(Zones::kMMMatMul));
 
   RowPtrs<TC> C_rows = GetOrSetTempRowPtrs(C, env.row_ptrs[cluster_idx]);
 
@@ -1121,10 +1122,10 @@ template <typename TB>
 HWY_NOINLINE MMPerKey* TwoMatMul(const MatPtrT<BF16>& A, const MatPtrT<TB>& B1,
                                  const MatPtrT<TB>& B2, MatMulEnv& env,
                                  MatPtrT<BF16>& C, MMOptions options) {
-  static const auto zone = env.ctx.profiler.AddZone("MM.TwoMatMul");
   const size_t cluster_idx = options.cluster_idx;
   HWY_DASSERT(cluster_idx < env.row_ptrs.size());
-  PROFILER_ZONE3(env.ctx.profiler, env.ctx.Worker(cluster_idx), zone);
+  PROFILER_ZONE3(env.ctx.profiler, env.ctx.Worker(cluster_idx),
+                 GetProfilerZone(Zones::kMMTwoMatMul));
 
   HWY_DASSERT(options.func != nullptr);  // no other way to get access to C2.
 
diff --git a/ops/ops-inl.h b/ops/ops-inl.h
index a52c788..162b48a 100644
--- a/ops/ops-inl.h
+++ b/ops/ops-inl.h
@@ -32,6 +32,7 @@
 #include "util/basics.h"  // TokenAndProb, RngStream
 #include "util/mat.h"
 #include "util/threading_context.h"
+#include "util/zones.h"
 #include "hwy/base.h"
 #include "hwy/bit_set.h"
 #include "hwy/contrib/sort/order.h"
@@ -206,8 +207,7 @@ namespace detail {
 template <typename VT>
 float RMSNormMul(const VT* HWY_RESTRICT x, const size_t size, hwy::Profiler& p,
                  const size_t worker) {
-  static const auto zone = p.AddZone("Ops.RMSNormMul");
-  PROFILER_ZONE3(p, worker, zone);
+  PROFILER_ZONE3(p, worker, GetProfilerZone(Zones::kOpsRmsNormMul));
 
   const hn::ScalableTag<float> d;
   const float l2 = DecompressAndCall(d, MakeSpan(x, size), DotKernelDefault());
@@ -223,8 +223,7 @@ HWY_NOINLINE HWY_MAYBE_UNUSED void RMSNorm(const XT* HWY_RESTRICT x,
                                            OT* HWY_RESTRICT out,
                                            const size_t size, hwy::Profiler& p,
                                            const size_t worker) {
-  static const auto zone = p.AddZone("Ops.RMSNorm");
-  PROFILER_ZONE3(p, worker, zone);
+  PROFILER_ZONE3(p, worker, GetProfilerZone(Zones::kOpsRmsNorm));
 
   namespace hn = hwy::HWY_NAMESPACE;
   using DF = hn::ScalableTag<float>;
@@ -248,8 +247,7 @@ HWY_NOINLINE HWY_MAYBE_UNUSED void RMSNormInplace(const WT* HWY_RESTRICT weight,
                                                   const size_t size,
                                                   hwy::Profiler& p,
                                                   const size_t worker) {
-  static const auto zone = p.AddZone("Ops.RMSNormInplace");
-  PROFILER_ZONE3(p, worker, zone);
+  PROFILER_ZONE3(p, worker, GetProfilerZone(Zones::kOpsRmsNormInplace));
 
   namespace hn = hwy::HWY_NAMESPACE;
   using DF = hn::ScalableTag<float>;
@@ -365,8 +363,7 @@ static HWY_NOINLINE HWY_MAYBE_UNUSED void Rope(
     float* HWY_RESTRICT x, const size_t dim_qkv,
     const float* HWY_RESTRICT inv_timescale, const int pos, hwy::Profiler& p,
     const size_t worker) {
-  static const auto zone = p.AddZone("Ops.Rope");
-  PROFILER_ZONE3(p, worker, zone);
+  PROFILER_ZONE3(p, worker, GetProfilerZone(Zones::kOpsRope));
   HWY_DASSERT(dim_qkv % 2 == 0);
   const size_t half_dim_qkv = dim_qkv / 2;
 
@@ -425,8 +422,7 @@ static HWY_NOINLINE HWY_MAYBE_UNUSED void RopeAndMulBy(
     const float mul, float* HWY_RESTRICT x, const size_t dim_qkv,
     const float* HWY_RESTRICT inv_timescale, const int pos, hwy::Profiler& p,
     const size_t worker) {
-  static const auto zone = p.AddZone("Ops.RopeAndMulBy");
-  PROFILER_ZONE3(p, worker, zone);
+  PROFILER_ZONE3(p, worker, GetProfilerZone(Zones::kOpsRopeAndMulBy));
   HWY_DASSERT(dim_qkv % 2 == 0);
   const size_t half_dim_qkv = dim_qkv / 2;
 
@@ -488,8 +484,7 @@ static HWY_NOINLINE HWY_MAYBE_UNUSED void AddFrom(const XT* HWY_RESTRICT x,
                                                   const size_t size,
                                                   hwy::Profiler& p,
                                                   const size_t worker) {
-  static const auto zone = p.AddZone("Ops.AddFrom");
-  PROFILER_ZONE3(p, worker, zone);
+  PROFILER_ZONE3(p, worker, GetProfilerZone(Zones::kOpsAddFrom));
 
   namespace hn = hwy::HWY_NAMESPACE;
   using DF = hn::ScalableTag<float>;
@@ -568,8 +563,7 @@ HWY_NOINLINE HWY_MAYBE_UNUSED void MulByConst(const float c, XT* HWY_RESTRICT x,
                                               const size_t size,
                                               hwy::Profiler& p,
                                               const size_t worker) {
-  static const auto zone = p.AddZone("Ops.MulByConst");
-  PROFILER_ZONE3(p, worker, zone);
+  PROFILER_ZONE3(p, worker, GetProfilerZone(Zones::kOpsMulByConst));
   namespace hn = hwy::HWY_NAMESPACE;
   using DF = hn::ScalableTag<float>;
   using VF = hn::Vec<DF>;
@@ -587,8 +581,7 @@ template <typename XT, typename OT>
 HWY_NOINLINE HWY_MAYBE_UNUSED void MulByConstTo(
     const float c, const XT* HWY_RESTRICT x, OT* HWY_RESTRICT out,
     const size_t size, hwy::Profiler& p, const size_t worker) {
-  static const auto zone = p.AddZone("Ops.MulByConstTo");
-  PROFILER_ZONE3(p, worker, zone);
+  PROFILER_ZONE3(p, worker, GetProfilerZone(Zones::kOpsMulByConstTo));
   namespace hn = hwy::HWY_NAMESPACE;
   using DF = hn::ScalableTag<float>;
   using VF = hn::Vec<DF>;
@@ -606,8 +599,7 @@ template <typename XT, typename OT>
 HWY_NOINLINE HWY_MAYBE_UNUSED void MulByConstAndAdd(
     const float c, const XT* HWY_RESTRICT x, OT* HWY_RESTRICT out,
     const size_t size, hwy::Profiler& p, const size_t worker) {
-  static const auto zone = p.AddZone("Ops.MulByConstAndAdd");
-  PROFILER_ZONE3(p, worker, zone);
+  PROFILER_ZONE3(p, worker, GetProfilerZone(Zones::kOpsMulByConstAndAdd));
   namespace hn = hwy::HWY_NAMESPACE;
   using DF = hn::ScalableTag<float>;
   using VF = hn::Vec<DF>;
@@ -744,8 +736,7 @@ HWY_NOINLINE HWY_MAYBE_UNUSED void MulByConstAndAddTile(
     const size_t* HWY_RESTRICT pos, float* HWY_RESTRICT out,
     const uint32_t* HWY_RESTRICT out_offsets, const size_t size,
     hwy::Profiler& p, const size_t worker) {
-  static const auto zone = p.AddZone("Ops.MulByConstAndAdd");
-  PROFILER_ZONE3(p, worker, zone);
+  PROFILER_ZONE3(p, worker, GetProfilerZone(Zones::kOpsMulByConstAndAddTile));
   namespace hn = hwy::HWY_NAMESPACE;
   HWY_LANES_CONSTEXPR size_t NF = hn::Lanes(df);
 
@@ -1007,8 +998,7 @@ HWY_NOINLINE HWY_MAYBE_UNUSED void MulByConstAndAddTile4(
     const size_t* HWY_RESTRICT pos, float* HWY_RESTRICT out,
     const uint32_t* HWY_RESTRICT out_offsets, const size_t size,
     hwy::Profiler& p, const size_t worker) {
-  static const auto zone = p.AddZone("Ops.MulByConstAndAddTile4");
-  PROFILER_ZONE3(p, worker, zone);
+  PROFILER_ZONE3(p, worker, GetProfilerZone(Zones::kOpsMulByConstAndAddTile4));
   namespace hn = hwy::HWY_NAMESPACE;
   HWY_LANES_CONSTEXPR size_t NF = hn::Lanes(df);
 
@@ -1049,8 +1039,7 @@ HWY_NOINLINE HWY_MAYBE_UNUSED void MulByConstAndAddVector(
     const size_t pos, float* HWY_RESTRICT out,
     const uint32_t* HWY_RESTRICT out_offsets, const size_t size,
     hwy::Profiler& p, const size_t worker) {
-  static const auto zone = p.AddZone("Ops.MulByConstAndAdd");
-  PROFILER_ZONE3(p, worker, zone);
+  PROFILER_ZONE3(p, worker, GetProfilerZone(Zones::kOpsMulByConstAndAddVector));
   namespace hn = hwy::HWY_NAMESPACE;
   HWY_LANES_CONSTEXPR size_t NF = hn::Lanes(df);
 
@@ -1146,8 +1135,7 @@ HWY_NOINLINE HWY_MAYBE_UNUSED void MulByConstAndAddVector(
 static HWY_NOINLINE void Softmax(Logits logits, hwy::Profiler& p,
                                  const size_t worker,
                                  float temperature = 1.0f) {
-  static const auto zone = p.AddZone("Ops.Softmax");
-  PROFILER_ZONE3(p, worker, zone);
+  PROFILER_ZONE3(p, worker, GetProfilerZone(Zones::kOpsSoftmax));
   HWY_DASSERT(logits.size() != 0);
 
   namespace hn = hwy::HWY_NAMESPACE;
@@ -1280,8 +1268,7 @@ static HWY_MAYBE_UNUSED TokenAndProb Top1OfSoftmax(Logits logits) {
 
 static HWY_NOINLINE void LogitsSoftCap(const float cap, Logits logits,
                                        hwy::Profiler& p, const size_t worker) {
-  static const auto zone = p.AddZone("Ops.LogitsSoftCap");
-  PROFILER_ZONE3(p, worker, zone);
+  PROFILER_ZONE3(p, worker, GetProfilerZone(Zones::kOpsLogitsSoftCap));
 
   namespace hn = hwy::HWY_NAMESPACE;
   using DF = hn::ScalableTag<float>;
diff --git a/ops/ops_test.cc b/ops/ops_test.cc
index 213fdd0..40f1002 100644
--- a/ops/ops_test.cc
+++ b/ops/ops_test.cc
@@ -14,6 +14,7 @@
 // limitations under the License.
 
 #include "compression/types.h"
+#include "util/zones.h"
 #ifndef HWY_DISABLED_TARGETS
 #define HWY_DISABLED_TARGETS GEMMA_DISABLED_TARGETS
 #endif  // HWY_DISABLED_TARGETS
@@ -132,6 +133,7 @@ class TestAddFrom {
     }
 
     SimpleAddFrom(o, e, count);
+    InitProfilerZones(hwy::Profiler::Get());
     AddFrom(o, x, count, hwy::Profiler::Get(), /*worker=*/0);
 
     hwy::AssertArraySimilar(e, x, count, hwy::TargetName(HWY_TARGET), __FILE__,
@@ -180,6 +182,7 @@ class TestMulByConstAndAdd {
     T constant = Random<T>(rng);
 
     SimpleMulByConstAndAdd(constant, o, e, count);
+    InitProfilerZones(hwy::Profiler::Get());
     MulByConstAndAdd(constant, o, x, count, hwy::Profiler::Get(), /*worker=*/0);
 
     hwy::AssertArraySimilar(e, x, count, hwy::TargetName(HWY_TARGET), __FILE__,
@@ -228,6 +231,7 @@ class TestMulByConst {
     T constant = Random<T>(rng);
 
     SimpleMulByConst(constant, e, count);
+    InitProfilerZones(hwy::Profiler::Get());
     MulByConst(constant, x, count, hwy::Profiler::Get(), /*worker=*/0);
 
     hwy::AssertArraySimilar(e, x, count, hwy::TargetName(HWY_TARGET), __FILE__,
@@ -274,6 +278,7 @@ struct TestMulByConstTo {
                                      hwy::ConvertScalarTo<float>(constant));
     }
 
+    InitProfilerZones(hwy::Profiler::Get());
     MulByConstTo(constant, x, actual, count, hwy::Profiler::Get(),
                  /*worker=*/0);
 
@@ -310,6 +315,7 @@ class TestSoftmax {
     }
 
     SimpleSoftmax(e, count);
+    InitProfilerZones(hwy::Profiler::Get());
     Softmax(Logits(x, count), hwy::Profiler::Get(), /*worker=*/0);
 
     T sum = 0.0f;
@@ -437,6 +443,7 @@ void TestRopeAndMulBy() {
   ThreadingArgs threading_args;
   ThreadingContext ctx(threading_args);
   hwy::Profiler& p = ctx.profiler;
+  InitProfilerZones(p);
   const size_t worker = 0;
 
   const ModelConfig config(Model::GEMMA2_9B, Type::kSFP,
@@ -551,6 +558,7 @@ struct TestRMSNorm {
     }
 
     ScalarRMSNorm(vec, weight, expected, kSize);
+    InitProfilerZones(hwy::Profiler::Get());
     RMSNorm(vec, weight, actual, kSize, hwy::Profiler::Get(), /*worker=*/0);
 
     for (size_t i = 0; i < kSize; i++) {
@@ -585,6 +593,7 @@ struct TestRMSNormInplace {
     }
 
     ScalarRMSNorm(expected, weight, expected, kSize);
+    InitProfilerZones(hwy::Profiler::Get());
     RMSNormInplace(weight, actual, kSize, hwy::Profiler::Get(),
                    /*worker=*/0);
 
@@ -707,6 +716,7 @@ void TestAllLayerNorm() {
 
 void TestSampleTopK() {
   hwy::Profiler& p = hwy::Profiler::Get();
+  InitProfilerZones(p);
   const size_t worker = 0;
   const size_t kSize = 52;
   std::vector<float> logits_vec(kSize);
diff --git a/util/zones.cc b/util/zones.cc
new file mode 100644
index 0000000..abc9dc2
--- /dev/null
+++ b/util/zones.cc
@@ -0,0 +1,70 @@
+#include "util/zones.h"
+
+#include "hwy/profiler.h"
+
+namespace gcpp {
+
+#if PROFILER_ENABLED
+static constexpr size_t kNumZones = static_cast<size_t>(Zones::kNumZones);
+
+static const char* kProfilerZoneNames[kNumZones] = {
+    // Keep in sync with Zones enum.
+    "Ops.RMSNormMul",
+    "Ops.RMSNorm",
+    "Ops.RMSNormInplace",
+    "Ops.Rope",
+    "Ops.RopeAndMulBy",
+    "Ops.AddFrom",
+    "Ops.MulByConst",
+    "Ops.MulByConstTo",
+    "Ops.MulByConstAndAdd",
+    "Ops.MulByConstAndAddTile",
+    "Ops.MulByConstAndAddTile4",
+    "Ops.MulByConstAndAddVector",
+    "Ops.Softmax",
+    "Ops.LogitsSoftCap",
+    "FlashAttention.TransposeQ",
+    "FlashAttention.RMSNormAndPositionalEncoding",
+    "FlashAttention.SingleFlashAttention",
+    "FlashAttention.TileFlashAttention",
+    "FlashAttention.TileFlashAttention4",
+    "FlashAttention.FlashAttention",
+    "Gen.Activation",
+    "Gen.ActivationFused",
+    "Gen.SampleTop1",
+    "Gen.SampleTopK",
+    "Gen.Attention.QDotK",
+    "Gen.Attention.DotSoftmaxWeightedSum.par",
+    "Startup.Weights.ReadAllToBF16",
+    "Startup.Weights.ReadBatches",
+    "MM.Dispatch",
+    "MM.MatMul",
+    "MM.TwoMatMul",
+    "MM.DecompressA",
+    "MM.NT",
+    "MM.NT_K",
+    "MM.NT_MT",
+    "MM.NT_MT_K",
+};
+
+static hwy::profiler::ZoneHandle profiler_zone_handles[kNumZones];
+#endif
+
+void InitProfilerZones(hwy::Profiler& profiler) {
+#if PROFILER_ENABLED
+  // Initialize the zone handles. This is done once at startup.
+  for (size_t i = 0; i < kNumZones; ++i) {
+    profiler_zone_handles[i] = profiler.AddZone(kProfilerZoneNames[i]);
+  }
+#endif
+}
+
+hwy::profiler::ZoneHandle GetProfilerZone(Zones zone) {
+#if PROFILER_ENABLED
+  return profiler_zone_handles[static_cast<size_t>(zone)];
+#else
+  return hwy::profiler::ZoneHandle();
+#endif
+}
+
+}  // namespace gcpp
diff --git a/util/zones.h b/util/zones.h
new file mode 100644
index 0000000..e78340a
--- /dev/null
+++ b/util/zones.h
@@ -0,0 +1,58 @@
+#ifndef THIRD_PARTY_GEMMA_CPP_UTIL_ZONES_H_
+#define THIRD_PARTY_GEMMA_CPP_UTIL_ZONES_H_
+
+#include "hwy/profiler.h"
+
+namespace gcpp {
+
+// Zones for the profiler.
+enum class Zones {
+  kOpsRmsNormMul,
+  kOpsRmsNorm,
+  kOpsRmsNormInplace,
+  kOpsRope,
+  kOpsRopeAndMulBy,
+  kOpsAddFrom,
+  kOpsMulByConst,
+  kOpsMulByConstTo,
+  kOpsMulByConstAndAdd,
+  kOpsMulByConstAndAddTile,
+  kOpsMulByConstAndAddTile4,
+  kOpsMulByConstAndAddVector,
+  kOpsSoftmax,
+  kOpsLogitsSoftCap,
+  kFlashAttentionTransposeQ,
+  kFlashAttentionRmsNormAndPositionalEncoding,
+  kFlashAttentionSingleFlashAttention,
+  kFlashAttentionTileFlashAttention,
+  kFlashAttentionTileFlashAttention4,
+  kFlashAttentionFlashAttention,
+  kGenActivation,
+  kGenActivationFused,
+  kGenSampleTop1,
+  kGenSampleTopK,
+  kGenAttentionQDotK,
+  kGenAttentionDotSoftmaxWeightedSumPar,
+  kStartupWeightsReadAllToBF16,
+  kStartupWeightsReadBatches,
+  kMMDispatch,
+  kMMMatMul,
+  kMMTwoMatMul,
+  kMMDecompressA,
+  kMMNT,
+  kMMNT_K,
+  kMMNT_MT,
+  kMMNT_MT_K,
+  kNumZones
+};
+
+// Initializes the profiler zones. Must be called before any other profiler
+// functions.
+void InitProfilerZones(hwy::Profiler& profiler);
+
+// Returns the zone handle for the given zone enum value.
+hwy::profiler::ZoneHandle GetProfilerZone(Zones zone);
+
+}  // namespace gcpp
+
+#endif  // THIRD_PARTY_GEMMA_CPP_UTIL_ZONES_H_