diff --git a/README.md b/README.md
index 8db6862..331d96f 100644
--- a/README.md
+++ b/README.md
@@ -92,7 +92,7 @@ weights enable faster inference. In general, we recommend starting with the
 | `7b-pt` | 7 billion parameter pre-trained model, bfloat16 |
 | `7b-pt-sfp` | 7 billion parameter pre-trained model, 8-bit switched floating point |
 
-> [!NOTE]
+> [!NOTE]
 > **Important**: We strongly recommend starting off with the `2b-it-sfp` model to
 > get up and running.
 
@@ -116,7 +116,7 @@ The build system uses [CMake](https://cmake.org/). To build the gemma inference
 runtime, create a build directory and generate the build files using `cmake`
 from the top-level project directory. Note: if you previously ran `cmake` and are
 re-running with a different setting, be sure to clean out the `build/` directory
-with `rm -rf build/*` (warning: this will delete any other files in the `build/`
+with `rm -rf build/*` (warning: this will delete any other files in the `build/`
 directory). For the 8-bit switched floating point weights (sfp), run cmake with
 no options:
 
@@ -242,6 +242,21 @@ We're working on a python script to convert a standard model format to `.sbs`,
 and hope to have it available in the next week or so. Follow
 [this issue](https://github.com/google/gemma.cpp/issues/11) for updates.
 
+**What are some easy ways to make the model run faster?**
+
+1. Make sure you are using the 8-bit switched floating point `-sfp` models.
+2. If you're on a laptop, make sure the power mode is set to maximize
+performance and that power saving is **off**. On most laptops, power saving
+modes activate automatically when the computer is not plugged in.
+3. Close other unused CPU-intensive applications.
+4. On Macs, we anecdotally observe a "warm-up" ramp in speed as performance
+cores get engaged.
+5. Experiment with the `--num_threads` argument value. Depending on the device,
+larger numbers don't always mean better performance.
+
+We're also working on algorithmic and optimization approaches for faster
+inference; stay tuned.
+
 ## Usage
 
 `gemma` has different usage modes, controlled by the verbosity flag.
@@ -415,6 +430,17 @@ make -j [number of parallel threads to use] libgemma
 
 If this is successful, you should now have a `libgemma` library file in the
 `build/` directory. On Unix platforms, the filename is `libgemma.a`.
 
+## Independent Projects Using gemma.cpp
+
+Some independent projects using gemma.cpp:
+
+- [gemma-cpp-python - Python bindings](https://github.com/namtranase/gemma-cpp-python)
+- [lua-cgemma - Lua bindings](https://github.com/ufownl/lua-cgemma)
+- [Godot engine demo project](https://github.com/Rliop913/Gemma-godot-demo-project)
+
+If you would like to have your project included, feel free to get in touch or
+submit a PR with a `README.md` edit.
+
 ## Acknowledgements and Contacts
 
 gemma.cpp was started in fall 2023 by [Austin Huang](mailto:austinvhuang@google.com)
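The `libgemma` target mentioned above is what the independent bindings projects link against. As a rough sketch of what an embedding program could look like, the snippet below uses only names that appear in this diff (`gcpp::Gemma`, `gcpp::LoaderArgs`, `hwy::ThreadPool`); the `LoaderArgs(argc, argv)` parsing constructor and the `Gemma` constructor arguments are assumptions, not the verified library interface.

```cpp
// Hypothetical sketch of embedding libgemma.a -- the constructor and argument
// names below are assumptions; consult gemma.h for the real interface.
#include "gemma.h"  // gcpp::Gemma, gcpp::LoaderArgs (per this diff)
#include "hwy/contrib/thread_pool/thread_pool.h"

int main(int argc, char** argv) {
  // Assumed: the ArgsBase-derived struct parses --tokenizer,
  // --compressed_weights, and --model, as gemma's own binary does.
  gcpp::LoaderArgs loader(argc, argv);
  hwy::ThreadPool pool(/*num_threads=*/8);
  // Assumed constructor: tokenizer path, compressed weights path, model type.
  gcpp::Gemma model(loader.tokenizer, loader.cache, loader.model_type, pool);
  // ...generation calls would go here; ReplGemma in run.cc below shows one
  // possible driver loop.
  return 0;
}
```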
(required)"); + "the compressed weights file does not exist.\n Required argument."); visitor(model_type, "model", std::string(), - "Model type - can be 2b-it (2B parameters, instruction-tuned), " - "2b-pt (2B parameters, pretrained), 7b-it (7B parameters, " - "instruction-tuned), or 7b-pt (7B parameters, pretrained). " - "(required)"); + "Model type\n 2b-it (2B parameters, instruction-tuned)\n " + "2b-pt (2B parameters, pretrained)\n 7b-it (7B parameters " + "instruction-tuned)\n 7b-pt (7B parameters, pretrained)\n" + " Required argument."); visitor(model, "weights", Path(), "Path name of model weights (.sbs) file. Only required if " "compressed_weights file is not present and needs to be " - "regenerated. Otherwise, not needed"); + "regenerated. This parameter is only required for compressing" + "new model weight exports, otherwise it is not needed."); } }; @@ -192,7 +193,7 @@ struct InferenceArgs : public ArgsBase { "Make top-k sampling deterministic", 2); visitor(multiturn, "multiturn", false, "Multiturn mode (if 0, this clears the KV cache after every " - "interaction without quitting)\n Default = 0 (conversation " + "interaction without quitting)\n Default : 0 (conversation " "resets every turn)"); } }; diff --git a/run.cc b/run.cc index 2d9a15e..507979d 100644 --- a/run.cc +++ b/run.cc @@ -24,12 +24,16 @@ // copybara:import_next_line:gemma_cpp #include "compression/compress.h" +// copybara:end // copybara:import_next_line:gemma_cpp -#include "gemma.h" // Gemma +#include "gemma.h" // Gemma +// copybara:end // copybara:import_next_line:gemma_cpp #include "util/app.h" +// copybara:end // copybara:import_next_line:gemma_cpp #include "util/args.h" // HasHelp +// copybara:end #include "hwy/base.h" #include "hwy/contrib/thread_pool/thread_pool.h" #include "hwy/highway.h" @@ -39,20 +43,13 @@ namespace gcpp { -void ShowHelp(gcpp::LoaderArgs& loader, gcpp::InferenceArgs& inference, - gcpp::AppArgs& app) { - fprintf(stderr, - "\ngemma.cpp\n---------\n\nTo run gemma.cpp, you need to " - "specify 3 required model loading arguments: --tokenizer, " - "--compressed_weights, " - "and --model.\n\nModel Loading Arguments\n\n"); - loader.Help(); - fprintf(stderr, "\nInference Arguments\n\n"); - inference.Help(); - fprintf(stderr, "\nApplication Arguments\n\n"); - app.Help(); - fprintf(stderr, "\n\n"); -} +static constexpr std::string_view kAsciiArtBanner = + " __ _ ___ _ __ ___ _ __ ___ __ _ ___ _ __ _ __\n" + " / _` |/ _ \\ '_ ` _ \\| '_ ` _ \\ / _` | / __| '_ \\| '_ \\\n" + "| (_| | __/ | | | | | | | | | | (_| || (__| |_) | |_) |\n" + " \\__, |\\___|_| |_| |_|_| |_| |_|\\__,_(_)___| .__/| .__/\n" + " __/ | | | | |\n" + " |___/ |_| |_|"; void ShowConfig(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) { loader.Print(app.verbosity); @@ -69,7 +66,8 @@ void ShowConfig(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) { << std::thread::hardware_concurrency() << std::endl << "Instruction set : " << hwy::TargetName(hwy::DispatchedTarget()) << " (" - << hwy::VectorBytes() * 8 << " bits)" << "\n" + << hwy::VectorBytes() * 8 << " bits)" + << "\n" << "Weight Type : " << gcpp::TypeName(gcpp::WeightT()) << "\n" << "EmbedderInput Type : " @@ -77,11 +75,31 @@ void ShowConfig(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) { } } +void ShowHelp(gcpp::LoaderArgs& loader, gcpp::InferenceArgs& inference, + gcpp::AppArgs& app) { + std::cerr + << kAsciiArtBanner + << "\n\ngemma.cpp : a lightweight, standalone C++ inference engine\n" + 
"==========================================================\n\n" + "To run gemma.cpp, you need to " + "specify 3 required model loading arguments:\n --tokenizer\n " + "--compressed_weights\n" + " --model.\n"; + std::cerr << "\n*Example Usage*\n\n./gemma --tokenizer tokenizer.spm " + "--compressed_weights 2b-it-sfp.sbs --model 2b-it\n"; + std::cerr << "\n*Model Loading Arguments*\n\n"; + loader.Help(); + std::cerr << "\n*Inference Arguments*\n\n"; + inference.Help(); + std::cerr << "\n*Application Arguments*\n\n"; + app.Help(); + std::cerr << "\n"; +} + void ReplGemma(gcpp::Gemma& model, hwy::ThreadPool& pool, hwy::ThreadPool& inner_pool, const InferenceArgs& args, int verbosity, const gcpp::AcceptFunc& accept_token, - std::string &eot_line -) { + std::string& eot_line) { PROFILER_ZONE("Gen.misc"); int abs_pos = 0; // absolute token index over all turns int current_pos = 0; // token index within the current turn @@ -234,8 +252,12 @@ void Run(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) { const std::string instructions = "*Usage*\n" - " Enter an instruction and press enter (%C reset conversation, " - "%Q quits).\n\n" + " Enter an instruction and press enter (%C resets conversation, " + "%Q quits).\n" + + (inference.multiturn == 0 + ? std::string(" Since multiturn is set to 0, conversation will " + "automatically reset every turn.\n\n") + : "\n") + "*Examples*\n" " - Write an email to grandma thanking her for the cookies.\n" " - What are some historical attractions to visit around " @@ -244,13 +266,14 @@ void Run(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) { " - Write a standup comedy bit about GPU programming.\n"; std::cout << "\033[2J\033[1;1H" // clear screen - << banner_ascii_art << "\n\n"; + << kAsciiArtBanner << "\n\n"; ShowConfig(loader, inference, app); std::cout << "\n" << instructions << "\n"; } - ReplGemma(model, pool, inner_pool, inference, app.verbosity, - /*accept_token=*/[](int) { return true; }, app.eot_line); + ReplGemma( + model, pool, inner_pool, inference, app.verbosity, + /*accept_token=*/[](int) { return true; }, app.eot_line); } } // namespace gcpp diff --git a/util/app.h b/util/app.h index f66a6cd..7f926a5 100644 --- a/util/app.h +++ b/util/app.h @@ -31,6 +31,7 @@ // copybara:import_next_line:gemma_cpp #include "util/args.h" +// copybara:end #include "hwy/base.h" // HWY_ASSERT namespace gcpp { @@ -77,7 +78,6 @@ class AppArgs : public ArgsBase { template void ForEach(const Visitor& visitor) { - visitor(log, "log", Path{"/tmp/log.txt"}, "Logging file", 2); visitor(verbosity, "verbosity", 1, "Show verbose developer information\n 0 = only print generation " "output\n 1 = standard user-facing terminal ui\n 2 = show " @@ -85,15 +85,16 @@ class AppArgs : public ArgsBase { 2); visitor(num_threads, "num_threads", kDefaultNumThreads, // see ChooseNumThreads - "Number of threads to use. Default value is set based on an " - "estimate of " - "how many concurrent threads are supported.", - 2); - visitor(eot_line, "eot_line", std::string(""), - "End of turn line. " - "When you specify this, the prompt will be all lines " - "before the line where only the given string appears.", + "Number of threads to use.\n Default = Estimate of the " + "number of suupported concurrent threads.", 2); + visitor( + eot_line, "eot_line", std::string(""), + "End of turn line. 
" + "When you specify this, the prompt will be all lines " + "before the line where only the given string appears.\n Default = " + "When a newline is encountered, that signals the end of the turn.", + 2); } };