Updated benchmarks.cc to recent changes to Gemma API.

PiperOrigin-RevId: 642285902
2024-06-11 08:54:58 -07:00 · 2024-06-11 08:54:58 -07:00 · bdf33c7008
parent b6565e3bf6
commit bdf33c7008
6 changed files with 355 additions and 9 deletions
--- a/BUILD.bazel
+++ b/BUILD.bazel
@ -165,7 +165,7 @@ cc_test(
        ":cross_entropy",
        ":gemma_lib",
        ":ops",
-        # "//base",
+        # Placeholder for internal dep, do not remove.,
        "@googletest//:gtest_main",
        "//compression:io",
        "@hwy//:hwy_test_util",
@ -181,7 +181,7 @@ cc_binary(
        ":args",
        ":common",
        ":gemma_lib",
-        # "//base",
+        # Placeholder for internal dep, do not remove.,
        "//compression:compress",
        "@hwy//:hwy",
        "@hwy//:nanobenchmark",
@ -198,7 +198,7 @@ cc_binary(
        ":common",
        ":gemma_lib",
        ":weights",
-        # "//base",
+        # Placeholder for internal dep, do not remove.,
        "//compression:compress",
        "@hwy//:hwy",
        "@hwy//:nanobenchmark",
@ -208,7 +208,7 @@ cc_binary(
 )
 cc_binary(
-    name = "benchmark",
+    name = "single_benchmark",
    srcs = ["gemma/benchmark.cc"],
    deps = [
        ":app",
@ -216,7 +216,7 @@ cc_binary(
        ":common",
        ":cross_entropy",
        ":gemma_lib",
-        # "//base",
+        # Placeholder for internal dep, do not remove.,
        "//compression:io",
        "@hwy//:hwy",
        "@hwy//:nanobenchmark",
@ -225,6 +225,16 @@ cc_binary(
    ],
 )
 cc_binary(
    name = "benchmarks",
    srcs = ["gemma/benchmarks.cc"],
    deps = [
        ":benchmark_helper",
        # Placeholder for internal dep, do not remove.,
        "@benchmark//:benchmark",
    ],
 )
 cc_binary(
    name = "debug_prompt",
    srcs = [
@ -234,7 +244,7 @@ cc_binary(
        ":app",
        ":args",
        ":gemma_lib",
-        # "//base",
+        # Placeholder for internal dep, do not remove.,
        "//compression:io",
        "@hwy//:hwy",
        "@hwy//:thread_pool",
@ -248,7 +258,7 @@ cc_binary(
    deps = [
        ":app",
        ":gemma_lib",
-        # "//base",
+        # Placeholder for internal dep, do not remove.,
        "@hwy//:hwy",
        "@hwy//:profiler",
        "@hwy//:thread_pool",
@ -308,6 +318,25 @@ cc_library(
    ],
 )
 cc_library(
    name = "benchmark_helper",
    srcs = [
        "gemma/benchmark_helper.cc",
    ],
    hdrs = [
        "gemma/benchmark_helper.h",
    ],
    deps = [
        ":app",
        ":common",
        ":gemma_lib",
        "@benchmark//:benchmark",
        "@hwy//:hwy",
        "@hwy//:nanobenchmark",
        "@hwy//:thread_pool",
    ],
 )
 cc_test(
    name = "backward_scalar_test",
    size = "large",
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -97,8 +97,11 @@ add_executable(gemma gemma/run.cc)
 target_link_libraries(gemma libgemma hwy hwy_contrib)
 install(TARGETS gemma DESTINATION bin)
-add_executable(benchmark gemma/benchmark.cc)
+add_executable(single_benchmark gemma/benchmark.cc)
-target_link_libraries(benchmark libgemma hwy hwy_contrib nlohmann_json::nlohmann_json)
+target_link_libraries(single_benchmark libgemma hwy hwy_contrib nlohmann_json::nlohmann_json)
 add_executable(benchmarks gemma/benchmarks.cc)
 target_link_libraries(benchmarks libgemma hwy hwy_contrib nlohmann_json::nlohmann_json benchmark)
 add_executable(debug_prompt debug_prompt.cc)
 target_link_libraries(debug_prompt libgemma hwy hwy_contrib nlohmann_json::nlohmann_json)
--- a/MODULE.bazel
+++ b/MODULE.bazel
@ -63,3 +63,10 @@ http_archive(
    strip_prefix = "abseil-cpp-9687a8ea750bfcddf790372093245a1d041b21a3",
    urls = ["https://github.com/abseil/abseil-cpp/archive//9687a8ea750bfcddf790372093245a1d041b21a3.tar.gz"],
 )
 # Benchmark
 http_archive(
    name = "benchmark",
    urls = ["https://github.com/google/benchmark/archive/refs/tags/v1.8.2.tar.gz"],
    integrity = "sha256-KqspgNA3YTf5adkoSPu2gharsHYzA0U0/IxlzE56DpM=",
    strip_prefix = "benchmark-1.8.2",
 )
--- a/gemma/benchmark_helper.cc
+++ b/gemma/benchmark_helper.cc
@ -0,0 +1,118 @@
 // Copyright 2024 Google LLC
 // SPDX-License-Identifier: Apache-2.0
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     https://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "gemma/benchmark_helper.h"
 #include <cstdlib>  // EXIT_FAILURE
 #include <iostream>
 #include <memory>
 #include <ostream>
 #include <random>
 #include <string>
 #include <utility>  // std::pair
 #include <vector>
 #include "gemma/common.h"
 #include "gemma/gemma.h"
 #include "util/app.h"
 #include "hwy/base.h"
 #include "hwy/contrib/thread_pool/thread_pool.h"
 #include "hwy/highway.h"
 #include "hwy/timer.h"
 namespace gcpp {
  GemmaEnv::GemmaEnv(int argc, char** argv)
      : loader_(argc, argv), inference_args_(argc, argv), app_(argc, argv),
        pool_(app_.num_threads) {
    if (const char* error = loader_.Validate()) {
      HWY_ABORT("\nInvalid loader args: %s", error);
    }
    if (const char* error = inference_args_.Validate()) {
      HWY_ABORT("\nInvalid inference args: %s", error);
    }
    // For many-core, pinning workers to cores helps.
    if (app_.num_threads > 10) {
      gcpp::PinWorkersToCores(pool_);
    }
    model_ = AllocateGemma(loader_, pool_);
    kv_cache_ = KVCache::Create(loader_.ModelType());
    gen_.seed(42);
  }
 std::pair<std::string, int> GemmaEnv::QueryModel(const std::string& input) {
  std::string prompt_string = input;
  if (loader_.ModelTrainingType() == ModelTraining::GEMMA_IT) {
    // For instruction-tuned models: add control tokens.
    prompt_string = "<start_of_turn>user\n" + input +
                    "<end_of_turn>\n<start_of_turn>model\n";
  }
  std::vector<int> prompt;
  HWY_ASSERT(model_->Tokenizer().Encode(input, &prompt));
  // For both pre-trained and instruction-tuned models: prepend "<bos>" token
  // if needed.
  prompt.insert(prompt.begin(), gcpp::BOS_ID);
  std::string res;
  size_t total_tokens = 0;
  auto accept_token = [](int) { return true; };
  std::mt19937 gen;
  gen.seed(42);
  const double time_start = hwy::platform::Now();
  auto stream_token = [&res, &total_tokens, &time_start, this](
                          int token, float) {
    ++total_tokens;
    std::string token_text;
    HWY_ASSERT(model_->Tokenizer().Decode(std::vector<int>{token},
                                          &token_text));
    res += token_text;
    if (app_.verbosity >= 1 && total_tokens % 100 == 0) {
      LogSpeedStats(time_start, total_tokens);
    }
    return true;
  };
  if (app_.verbosity >= 2) {
    std::cout << inference_args_.max_tokens << " "
              << inference_args_.max_generated_tokens << " "
              << inference_args_.temperature;
  }
  gcpp::TimingInfo timing_info;
  gcpp::RuntimeConfig runtime_config = {
      .max_tokens = inference_args_.max_tokens,
      .max_generated_tokens = inference_args_.max_generated_tokens,
      .temperature = inference_args_.temperature,
      .verbosity = app_.verbosity,
      .gen = &gen,
      .stream_token = stream_token,
      .accept_token = accept_token,
  };
  model_->Generate(runtime_config, prompt, /*start_pos=*/0, kv_cache_,
                  timing_info, /*layers_output=*/nullptr);
  if (app_.verbosity >= 1) {
    LogSpeedStats(time_start, total_tokens);
  }
  return {res, total_tokens};
 }
 void GemmaEnv::LogSpeedStats(double time_start, size_t total_tokens) const {
  const double time_end = hwy::platform::Now();
  const double time_elapsed = time_end - time_start;
  const double tok_sec = total_tokens / time_elapsed;
  std::cout << total_tokens << " tokens in " << time_elapsed << " seconds"
            << " [" << tok_sec << " tokens / sec" << "]\n";
 }
 }  // namespace gcpp
--- a/gemma/benchmark_helper.h
+++ b/gemma/benchmark_helper.h
@ -0,0 +1,69 @@
 // Copyright 2024 Google LLC
 // SPDX-License-Identifier: Apache-2.0
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     https://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_BENCHMARK_HELPER_H_
 #define THIRD_PARTY_GEMMA_CPP_GEMMA_BENCHMARK_HELPER_H_
 #include <memory>
 #include <random>
 #include <string>
 #include <utility>
 #include "gemma/gemma.h"
 #include "util/app.h"
 #include "hwy/base.h"
 #include "hwy/contrib/thread_pool/thread_pool.h"
 namespace gcpp {
 // Convenience class to load a model and run inference.
 class GemmaEnv {
 public:
  GemmaEnv(int argc, char** argv);
  // Sets the maximum number of output tokens to generate.
  void set_max_generated_tokens(int max_tokens) {
    inference_args_.max_generated_tokens = max_tokens;
  }
  // Runs inference on the given input and returns the top-1 result string and
  // the number of tokens that were generated.
  std::pair<std::string, int> QueryModel(const std::string& input);
 private:
  // Logs the inference speed in tokens/sec.
  void LogSpeedStats(double time_start, size_t total_tokens) const;
  // Arguments to the model loader: file locations, etc.
  LoaderArgs loader_;
  // Arguments to the inference function: max tokens, etc.
  InferenceArgs inference_args_;
  // Controls overall behavior of the app.
  AppArgs app_;
  // Thread pool for running inference.
  hwy::ThreadPool pool_;
  // Random number generator.
  std::mt19937 gen_;
  // The model to run inference on.
  std::unique_ptr<Gemma> model_;
  // The KV cache to use for inference.
  KVCache kv_cache_;
 };
 }  // namespace gcpp
 #endif  // THIRD_PARTY_GEMMA_CPP_GEMMA_BENCHMARK_HELPER_H_
--- a/gemma/benchmarks.cc
+++ b/gemma/benchmarks.cc
@ -0,0 +1,120 @@
 // Copyright 2024 Google LLC
 // SPDX-License-Identifier: Apache-2.0
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     https://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include <fstream>
 #include <iostream>
 #include <ostream>
 #include <random>
 #include <sstream>
 #include <string>
 // Placeholder for internal header, do not modify.
 #include "benchmark/benchmark.h"
 #include "gemma/benchmark_helper.h"
 void run_gemma_prompt(const std::string& prompt_string,
                      gcpp::GemmaEnv& env,
                      benchmark::State& state) {
  std::mt19937 gen;
  if (prompt_string.empty()) return;
  int token_counter = 0;
  for (auto s : state) {
    auto [response, n] = env.QueryModel(prompt_string);
    std::cout << "response: " << response << "\n";
    std::cout << "n: " << n << "\n";
    token_counter += n;
  }
  state.SetItemsProcessed(token_counter);
 }
 // Awkward global because benchmarks don't support additional state, so it is
 // either this or cast to int64_t.
 gcpp::GemmaEnv* global_env = nullptr;
 static void BM_short_prompt(benchmark::State& state) {
  run_gemma_prompt("What is the capital of Spain?", *global_env,
                   state);
 }
 static void BM_factuality_prompt(benchmark::State& state) {
  run_gemma_prompt("How does an inkjet printer work?",
                   *global_env, state);
 }
 static void BM_creative_prompt(benchmark::State& state) {
  run_gemma_prompt(
      "Tell me a story about a magical bunny and their TRS-80.",
      *global_env, state);
 }
 static void BM_coding_prompt(benchmark::State& state) {
  run_gemma_prompt(
      "Write a python program to generate a fibonacci sequence.",
      *global_env, state);
 }
 static void BM_long_coding_prompt(benchmark::State& state) {
  std::ifstream t("benchmarks.cc", std::ios_base::in);
  std::stringstream buffer;
  buffer << t.rdbuf();
  std::string prompt_string = buffer.str();
  t.close();
  run_gemma_prompt("Make improvements to the following code:\n " +
                   prompt_string, *global_env, state);
 }
 int main(int argc, char** argv) {
  {
    // Placeholder for internal init, do not modify.
  }
  gcpp::GemmaEnv env(argc, argv);
  env.set_max_generated_tokens(128);
  global_env = &env;
  BENCHMARK(BM_short_prompt)
      ->Iterations(3)
      ->Unit(benchmark::kMillisecond)
      ->UseRealTime();
  env.set_max_generated_tokens(256);
  BENCHMARK(BM_factuality_prompt)
      ->Iterations(3)
      ->Unit(benchmark::kMillisecond)
      ->UseRealTime();
  BENCHMARK(BM_creative_prompt)
      ->Iterations(3)
      ->Unit(benchmark::kMillisecond)
      ->UseRealTime();
  BENCHMARK(BM_coding_prompt)
      ->Iterations(3)
      ->Unit(benchmark::kMillisecond)
      ->UseRealTime();
  env.set_max_generated_tokens(1024);
  BENCHMARK(BM_long_coding_prompt)
      ->Iterations(3)
      ->Unit(benchmark::kMillisecond)
      ->UseRealTime();
  ::benchmark ::RunSpecifiedBenchmarks();
  ::benchmark ::Shutdown();
  return 0;
 }