mirror of https://github.com/google/gemma.cpp.git
remove --log (fixes https://github.com/google/gemma.cpp/issues/59), improve command-line args help, add copybara #include sort guards in more source files, add README sections on running faster and related projects
This commit is contained in:
parent 272f17ddb3
commit 0ea7b993de
README.md (30 lines changed)
@@ -92,7 +92,7 @@ weights enable faster inference. In general, we recommend starting with the
 | `7b-pt` | 7 billion parameter pre-trained model, bfloat16 |
 | `7b-pt-sfp` | 7 billion parameter pre-trained model, 8-bit switched floating point |

 > [!NOTE]
 > **Important**: We strongly recommend starting off with the `2b-it-sfp` model to
 > get up and running.

@@ -116,7 +116,7 @@ The build system uses [CMake](https://cmake.org/). To build the gemma inference
 runtime, create a build directory and generate the build files using `cmake`
 from the top-level project directory. Note if you previous ran `cmake` and are
 re-running with a different setting, be sure to clean out the `build/` directory
 with `rm -rf build/*` (warning this will delete any other files in the `build/`
 directory.

 For the 8-bit switched floating point weights (sfp), run cmake with no options:
@@ -242,6 +242,21 @@ We're working on a python script to convert a standard model format to `.sbs`,
 and hope have it available in the next week or so. Follow [this
 issue](https://github.com/google/gemma.cpp/issues/11) for updates.

+**What are some easy ways to make the model run faster?**
+
+1. Make sure you are using the 8-bit switched floating point `-sfp` models.
+2. If you're on a laptop, make sure power mode is set to maximize performance
+   and saving mode is **off**. For most laptops, the power saving modes get
+   activated automatically if the computer is not plugged in.
+3. Close other unused cpu-intensive applications.
+4. On macs, anecdotally we observe a "warm-up" ramp-up in speed as performance
+   cores get engaged.
+5. Experiment with the `--num_threads` argument value. Depending on the device,
+   larger numbers don't always mean better performance.
+
+We're also working on algorithmic and optimization approaches for faster
+inference, stay tuned.
+
 ## Usage

 `gemma` has different usage modes, controlled by the verbosity flag.
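Point 5 of this new FAQ connects to the `num_threads` change later in this commit: util/app.h derives `kDefaultNumThreads` from an estimate of supported concurrent threads (see `ChooseNumThreads`). A minimal sketch of that kind of estimate, assuming only `std::thread::hardware_concurrency`; the cap of 18 is an illustrative guess, not gemma.cpp's actual heuristic:

```cpp
#include <algorithm>
#include <cstdio>
#include <thread>

// Hypothetical sketch in the spirit of ChooseNumThreads (util/app.h).
// The cap below is illustrative only; gemma.cpp's real logic may differ.
static int GuessNumThreads() {
  const unsigned hw = std::thread::hardware_concurrency();
  if (hw == 0) return 1;  // the count is unknown on this platform
  // Very large counts often include SMT siblings or efficiency cores that
  // do not speed up matmul-bound inference, hence the conservative cap.
  return static_cast<int>(std::min(hw, 18u));
}

int main() { printf("suggested --num_threads: %d\n", GuessNumThreads()); }
```

This is also why point 5 recommends experimenting: the raw hardware count can overstate useful parallelism, especially on laptops that mix performance and efficiency cores.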
@@ -415,6 +430,17 @@ make -j [number of parallel threads to use] libgemma
 If this is successful, you should now have a `libgemma` library file in the
 `build/` directory. On Unix platforms, the filename is `libgemma.a`.

+## Independent Projects Using gemma.cpp
+
+Some independent projects using gemma.cpp:
+
+- [gemma-cpp-python - Python bindings](https://github.com/namtranase/gemma-cpp-python)
+- [lua-cgemma - Lua bindings](https://github.com/ufownl/lua-cgemma)
+- [Godot engine demo project](https://github.com/Rliop913/Gemma-godot-demo-project)
+
+If you would like to have your project included, feel free to get in touch or
+submit a PR with a `README.md` edit.
+
 ## Acknowledgements and Contacts

 gemma.cpp was started in fall 2023 by [Austin Huang](mailto:austinvhuang@google.com)
gemma.h (17 lines changed)
@@ -122,21 +122,22 @@ struct LoaderArgs : public ArgsBase<LoaderArgs> {
   template <class Visitor>
   void ForEach(const Visitor& visitor) {
     visitor(tokenizer, "tokenizer", Path(),
-            "Path name of tokenizer model file. (required)");
+            "Path name of tokenizer model file.\n    Required argument.");
     visitor(
         cache, "compressed_weights", Path(),
         "Path name of compressed weights file, regenerated from `--weights` "
         "file if "
-        "the compressed weights file does not exist. (required)");
+        "the compressed weights file does not exist.\n    Required argument.");
     visitor(model_type, "model", std::string(),
-            "Model type - can be 2b-it (2B parameters, instruction-tuned), "
-            "2b-pt (2B parameters, pretrained), 7b-it (7B parameters, "
-            "instruction-tuned), or 7b-pt (7B parameters, pretrained). "
-            "(required)");
+            "Model type\n    2b-it (2B parameters, instruction-tuned)\n    "
+            "2b-pt (2B parameters, pretrained)\n    7b-it (7B parameters "
+            "instruction-tuned)\n    7b-pt (7B parameters, pretrained)\n"
+            "    Required argument.");
     visitor(model, "weights", Path(),
             "Path name of model weights (.sbs) file. Only required if "
             "compressed_weights file is not present and needs to be "
-            "regenerated. Otherwise, not needed");
+            "regenerated. This parameter is only required for compressing"
+            "new model weight exports, otherwise it is not needed.");
   }
 };

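For context on the help strings edited above: each `visitor(field, flag, default, help)` call declares a flag exactly once, and `ArgsBase` replays `ForEach` with different visitors for setting defaults, parsing, and `Help()`. A stripped-down sketch of the pattern (hypothetical; the real `ArgsBase` in util/args.h also handles command-line parsing, `Path` values, and the optional per-flag verbosity argument):

```cpp
#include <cstdio>
#include <string>

// Hypothetical, simplified version of the visitor idea behind LoaderArgs.
struct DemoArgs {
  std::string model_type;
  int num_threads;

  // One declaration per flag supplies its name, default, and help text.
  template <class Visitor>
  void ForEach(const Visitor& visitor) {
    visitor(model_type, "model", std::string("2b-it"),
            "Model type.\n    Required argument.");
    visitor(num_threads, "num_threads", 4, "Number of threads to use.");
  }
};

// Visitor that assigns each field its default value.
struct SetDefaults {
  template <typename T>
  void operator()(T& field, const char* /*name*/, const T& init,
                  const char* /*help*/) const {
    field = init;
  }
};

// Visitor that prints help text, analogous to Help() in the real code.
struct PrintHelp {
  template <typename T>
  void operator()(T& /*field*/, const char* name, const T& /*init*/,
                  const char* help) const {
    fprintf(stderr, "--%s\n    %s\n\n", name, help);
  }
};

int main() {
  DemoArgs args;
  args.ForEach(SetDefaults());  // initialize all fields
  args.ForEach(PrintHelp());    // emit the help listing
}
```

Because the help text lives in the same declaration as the flag itself, wording fixes like the ones in this hunk automatically show up everywhere the flag is described.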
@@ -192,7 +193,7 @@ struct InferenceArgs : public ArgsBase<InferenceArgs> {
             "Make top-k sampling deterministic", 2);
     visitor(multiturn, "multiturn", false,
             "Multiturn mode (if 0, this clears the KV cache after every "
-            "interaction without quitting)\n    Default = 0 (conversation "
+            "interaction without quitting)\n    Default : 0 (conversation "
             "resets every turn)");
   }
 };
run.cc (69 lines changed)
@@ -24,12 +24,16 @@

 // copybara:import_next_line:gemma_cpp
 #include "compression/compress.h"
 // copybara:end
 // copybara:import_next_line:gemma_cpp
 #include "gemma.h"  // Gemma
 // copybara:end
+// copybara:import_next_line:gemma_cpp
 #include "util/app.h"
+// copybara:end
+// copybara:import_next_line:gemma_cpp
 #include "util/args.h"  // HasHelp
+// copybara:end
 #include "hwy/base.h"
 #include "hwy/contrib/thread_pool/thread_pool.h"
 #include "hwy/highway.h"
@@ -39,20 +43,13 @@

 namespace gcpp {

-void ShowHelp(gcpp::LoaderArgs& loader, gcpp::InferenceArgs& inference,
-              gcpp::AppArgs& app) {
-  fprintf(stderr,
-          "\ngemma.cpp\n---------\n\nTo run gemma.cpp, you need to "
-          "specify 3 required model loading arguments: --tokenizer, "
-          "--compressed_weights, "
-          "and --model.\n\nModel Loading Arguments\n\n");
-  loader.Help();
-  fprintf(stderr, "\nInference Arguments\n\n");
-  inference.Help();
-  fprintf(stderr, "\nApplication Arguments\n\n");
-  app.Help();
-  fprintf(stderr, "\n\n");
-}
+static constexpr std::string_view kAsciiArtBanner =
+    "  __ _  ___ _ __ ___  _ __ ___   __ _   ___ _ __  _ __\n"
+    " / _` |/ _ \\ '_ ` _ \\| '_ ` _ \\ / _` | / __| '_ \\| '_ \\\n"
+    "| (_| |  __/ | | | | | | | | | | (_| || (__| |_) | |_) |\n"
+    " \\__, |\\___|_| |_| |_|_| |_| |_|\\__,_(_)___| .__/| .__/\n"
+    "  __/ |                                    | |   | |\n"
+    " |___/                                     |_|   |_|";

 void ShowConfig(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) {
   loader.Print(app.verbosity);
@@ -69,7 +66,8 @@ void ShowConfig(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) {
             << std::thread::hardware_concurrency() << std::endl
             << "Instruction set      : "
             << hwy::TargetName(hwy::DispatchedTarget()) << " ("
-            << hwy::VectorBytes() * 8 << " bits)" << "\n"
+            << hwy::VectorBytes() * 8 << " bits)"
+            << "\n"
             << "Weight Type          : "
             << gcpp::TypeName(gcpp::WeightT()) << "\n"
             << "EmbedderInput Type   : "
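The ShowConfig lines touched here double as a recipe for checking which SIMD target Highway dispatched to. A standalone probe using the same calls (a sketch assuming it links against the same hwy library as gemma.cpp; the output format is illustrative):

```cpp
#include <cstdio>
#include <thread>

#include "hwy/base.h"
#include "hwy/highway.h"

// Prints the same hardware facts ShowConfig reports: logical core count,
// the dispatched SIMD instruction set, and its vector width in bits.
int main() {
  fprintf(stderr, "Hardware concurrency : %u\n",
          std::thread::hardware_concurrency());
  fprintf(stderr, "Instruction set      : %s (%zu bits)\n",
          hwy::TargetName(hwy::DispatchedTarget()),
          hwy::VectorBytes() * 8);
  return 0;
}
```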
@@ -77,11 +75,31 @@ void ShowConfig(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) {
   }
 }

+void ShowHelp(gcpp::LoaderArgs& loader, gcpp::InferenceArgs& inference,
+              gcpp::AppArgs& app) {
+  std::cerr
+      << kAsciiArtBanner
+      << "\n\ngemma.cpp : a lightweight, standalone C++ inference engine\n"
+         "==========================================================\n\n"
+         "To run gemma.cpp, you need to "
+         "specify 3 required model loading arguments:\n    --tokenizer\n    "
+         "--compressed_weights\n"
+         "    --model.\n";
+  std::cerr << "\n*Example Usage*\n\n./gemma --tokenizer tokenizer.spm "
+               "--compressed_weights 2b-it-sfp.sbs --model 2b-it\n";
+  std::cerr << "\n*Model Loading Arguments*\n\n";
+  loader.Help();
+  std::cerr << "\n*Inference Arguments*\n\n";
+  inference.Help();
+  std::cerr << "\n*Application Arguments*\n\n";
+  app.Help();
+  std::cerr << "\n";
+}
+
 void ReplGemma(gcpp::Gemma& model, hwy::ThreadPool& pool,
                hwy::ThreadPool& inner_pool, const InferenceArgs& args,
                int verbosity, const gcpp::AcceptFunc& accept_token,
-               std::string &eot_line
-               ) {
+               std::string& eot_line) {
   PROFILER_ZONE("Gen.misc");
   int abs_pos = 0;      // absolute token index over all turns
   int current_pos = 0;  // token index within the current turn
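The two counters initialized at the end of this hunk are what `--multiturn` acts on: `abs_pos` persists across the whole session while `current_pos` restarts every turn, and rewinding `abs_pos` is what resets the conversation. A hypothetical skeleton of that control flow (generation elided; everything except `abs_pos`, `current_pos`, and `multiturn` is illustrative):

```cpp
#include <iostream>
#include <string>

// Hypothetical skeleton of ReplGemma's turn loop; the real code tokenizes
// the prompt and streams generated tokens through callbacks.
int main() {
  const bool multiturn = false;  // mirrors --multiturn, default 0
  int abs_pos = 0;               // absolute token index over all turns
  std::string prompt;
  while (std::getline(std::cin, prompt)) {
    int current_pos = 0;  // token index within the current turn
    // ... tokenize `prompt` and generate; every emitted token advances
    // both counters (sketched here as a single step per turn) ...
    ++abs_pos;
    ++current_pos;
    std::cout << "turn done: current_pos=" << current_pos
              << ", abs_pos=" << abs_pos << "\n";
    if (!multiturn) {
      // Default behavior: rewind the KV-cache position so the next turn
      // starts a fresh conversation.
      abs_pos = 0;
    }
  }
}
```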
@@ -234,8 +252,12 @@ void Run(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) {

     const std::string instructions =
         "*Usage*\n"
-        "  Enter an instruction and press enter (%C reset conversation, "
-        "%Q quits).\n\n"
+        "  Enter an instruction and press enter (%C resets conversation, "
+        "%Q quits).\n" +
+        (inference.multiturn == 0
+             ? std::string("  Since multiturn is set to 0, conversation will "
+                           "automatically reset every turn.\n\n")
+             : "\n") +
         "*Examples*\n"
         "  - Write an email to grandma thanking her for the cookies.\n"
         "  - What are some historical attractions to visit around "
@@ -244,13 +266,14 @@ void Run(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) {
         "  - Write a standup comedy bit about GPU programming.\n";

     std::cout << "\033[2J\033[1;1H"  // clear screen
-              << banner_ascii_art << "\n\n";
+              << kAsciiArtBanner << "\n\n";
     ShowConfig(loader, inference, app);
     std::cout << "\n" << instructions << "\n";
   }

-  ReplGemma(model, pool, inner_pool, inference, app.verbosity,
-            /*accept_token=*/[](int) { return true; }, app.eot_line);
+  ReplGemma(
+      model, pool, inner_pool, inference, app.verbosity,
+      /*accept_token=*/[](int) { return true; }, app.eot_line);
 }

 }  // namespace gcpp
util/app.h (19 lines changed)
@@ -31,6 +31,7 @@

 // copybara:import_next_line:gemma_cpp
 #include "util/args.h"
 // copybara:end
+#include "hwy/base.h"  // HWY_ASSERT

 namespace gcpp {
@@ -77,7 +78,6 @@ class AppArgs : public ArgsBase<AppArgs> {

   template <class Visitor>
   void ForEach(const Visitor& visitor) {
-    visitor(log, "log", Path{"/tmp/log.txt"}, "Logging file", 2);
     visitor(verbosity, "verbosity", 1,
             "Show verbose developer information\n    0 = only print generation "
             "output\n    1 = standard user-facing terminal ui\n    2 = show "
@@ -85,15 +85,16 @@ class AppArgs : public ArgsBase<AppArgs> {
             2);
     visitor(num_threads, "num_threads",
             kDefaultNumThreads,  // see ChooseNumThreads
-            "Number of threads to use. Default value is set based on an "
-            "estimate of "
-            "how many concurrent threads are supported.",
-            2);
-    visitor(eot_line, "eot_line", std::string(""),
-            "End of turn line. "
-            "When you specify this, the prompt will be all lines "
-            "before the line where only the given string appears.",
-            2);
+            "Number of threads to use.\n    Default = Estimate of the "
+            "number of suupported concurrent threads.",
+            2);
+    visitor(
+        eot_line, "eot_line", std::string(""),
+        "End of turn line. "
+        "When you specify this, the prompt will be all lines "
+        "before the line where only the given string appears.\n    Default = "
+        "When a newline is encountered, that signals the end of the turn.",
+        2);
   }
 };

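The reworded `eot_line` help is specific enough to sketch: with the default empty string, a single newline ends the turn; otherwise the prompt is every line before the first line consisting solely of the marker. An illustrative reading loop under that reading (hypothetical helper, not the actual run.cc implementation; "%E" is an arbitrary marker):

```cpp
#include <iostream>
#include <string>

// Illustrative implementation of the --eot_line contract described above.
// Hypothetical helper; gemma.cpp's actual input handling lives in run.cc.
std::string ReadPrompt(const std::string& eot_line) {
  std::string prompt, line;
  if (eot_line.empty()) {
    std::getline(std::cin, prompt);  // default: a newline ends the turn
    return prompt;
  }
  // Accumulate lines until one matches the end-of-turn marker exactly.
  while (std::getline(std::cin, line) && line != eot_line) {
    prompt += line;
    prompt += '\n';
  }
  return prompt;
}

int main() {
  const std::string prompt = ReadPrompt("%E");  // arbitrary example marker
  std::cout << "prompt was:\n" << prompt;
}
```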