Merge pull request #66 from google:dev-cleanup

PiperOrigin-RevId: 611207602
Commit c805fbe780 by Copybara-Service, 2024-02-28 13:22:46 -08:00
4 changed files with 93 additions and 42 deletions

README.md

@@ -92,7 +92,7 @@ weights enable faster inference. In general, we recommend starting with the
| `7b-pt` | 7 billion parameter pre-trained model, bfloat16 |
| `7b-pt-sfp` | 7 billion parameter pre-trained model, 8-bit switched floating point |
> [!NOTE]
> **Important**: We strongly recommend starting off with the `2b-it-sfp` model to
> get up and running.
@@ -116,7 +116,7 @@ The build system uses [CMake](https://cmake.org/). To build the gemma inference
runtime, create a build directory and generate the build files using `cmake`
from the top-level project directory. Note that if you previously ran `cmake` and
are re-running with a different setting, be sure to clean out the `build/`
directory with `rm -rf build/*` (warning: this will delete any other files in the
`build/` directory).

For the 8-bit switched floating point weights (sfp), run cmake with no options:
@@ -242,6 +242,21 @@ We're working on a python script to convert a standard model format to `.sbs`,
and hope to have it available in the next week or so. Follow [this
issue](https://github.com/google/gemma.cpp/issues/11) for updates.

+**What are some easy ways to make the model run faster?**
+
+1. Make sure you are using the 8-bit switched floating point `-sfp` models.
+2. If you're on a laptop, make sure the power mode is set to maximize
+   performance and power-saving mode is **off**. For most laptops, power-saving
+   modes activate automatically when the computer is not plugged in.
+3. Close other unused CPU-intensive applications.
+4. On Macs, we anecdotally observe a "warm-up" ramp-up in speed as the
+   performance cores get engaged.
+5. Experiment with the `--num_threads` argument value. Depending on the device,
+   larger numbers don't always mean better performance.
+
+We're also working on algorithmic and optimization approaches for faster
+inference; stay tuned.
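Putting tips 1 and 5 together: an illustrative invocation (the flag values here are examples only, and the best `--num_threads` setting depends on your device) is `./gemma --tokenizer tokenizer.spm --compressed_weights 2b-it-sfp.sbs --model 2b-it --num_threads 8`.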
## Usage

`gemma` has different usage modes, controlled by the verbosity flag.
@@ -415,6 +430,17 @@ make -j [number of parallel threads to use] libgemma
If this is successful, you should now have a `libgemma` library file in the
`build/` directory. On Unix platforms, the filename is `libgemma.a`.
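For instance, `make -j 8 libgemma` from the `build/` directory builds the library target with eight parallel jobs; the `8` is illustrative and should be tuned to your machine.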
+## Independent Projects Using gemma.cpp
+
+Some independent projects using gemma.cpp:
+
+- [gemma-cpp-python - Python bindings](https://github.com/namtranase/gemma-cpp-python)
+- [lua-cgemma - Lua bindings](https://github.com/ufownl/lua-cgemma)
+- [Godot engine demo project](https://github.com/Rliop913/Gemma-godot-demo-project)
+
+If you would like to have your project included, feel free to get in touch or
+submit a PR with a `README.md` edit.

## Acknowledgements and Contacts

gemma.cpp was started in fall 2023 by [Austin Huang](mailto:austinvhuang@google.com)

gemma.h

@@ -122,21 +122,22 @@ struct LoaderArgs : public ArgsBase<LoaderArgs> {
  template <class Visitor>
  void ForEach(const Visitor& visitor) {
    visitor(tokenizer, "tokenizer", Path(),
-           "Path name of tokenizer model file. (required)");
+           "Path name of tokenizer model file.\n Required argument.");
    visitor(
        cache, "compressed_weights", Path(),
        "Path name of compressed weights file, regenerated from `--weights` "
        "file if "
-       "the compressed weights file does not exist. (required)");
+       "the compressed weights file does not exist.\n Required argument.");
    visitor(model_type, "model", std::string(),
-           "Model type - can be 2b-it (2B parameters, instruction-tuned), "
-           "2b-pt (2B parameters, pretrained), 7b-it (7B parameters, "
-           "instruction-tuned), or 7b-pt (7B parameters, pretrained). "
-           "(required)");
+           "Model type\n 2b-it (2B parameters, instruction-tuned)\n "
+           "2b-pt (2B parameters, pretrained)\n 7b-it (7B parameters, "
+           "instruction-tuned)\n 7b-pt (7B parameters, pretrained)\n"
+           " Required argument.");
    visitor(model, "weights", Path(),
            "Path name of model weights (.sbs) file. Only required if "
            "compressed_weights file is not present and needs to be "
-           "regenerated. Otherwise, not needed");
+           "regenerated. This parameter is only required for compressing "
+           "new model weight exports; otherwise it is not needed.");
  }
};
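The `ForEach`/visitor pattern above is what drives both flag parsing and the `Help()` output: each args struct enumerates its flags as `(member, name, default, help[, verbosity])` tuples, and different visitors interpret those tuples differently. A minimal self-contained sketch of a visitor consuming such tuples (the `HelpVisitor` and `ToyArgs` names are hypothetical, not part of gemma.cpp):

```cpp
#include <cstdio>
#include <string>

// Hypothetical visitor: prints one "--flag" plus its help string per call.
// gemma.cpp's real Help()/Print() visitors follow the same calling shape.
struct HelpVisitor {
  template <typename T>
  void operator()(T& /*value*/, const char* name, const T& /*init*/,
                  const char* help, int /*verbosity*/ = 1) const {
    fprintf(stderr, "--%s\n    %s\n\n", name, help);
  }
};

// Toy args struct in the same style as LoaderArgs above.
struct ToyArgs {
  std::string model_type;
  template <class Visitor>
  void ForEach(const Visitor& visitor) {
    visitor(model_type, "model", std::string(),
            "Model type\n 2b-it (2B parameters, instruction-tuned)");
  }
};

int main() {
  ToyArgs args;
  args.ForEach(HelpVisitor{});  // emits: --model <help text>
  return 0;
}
```

Because the struct is visited the same way for parsing, printing, and help, a flag added in one place stays consistent everywhere.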
@@ -192,7 +193,7 @@ struct InferenceArgs : public ArgsBase<InferenceArgs> {
            "Make top-k sampling deterministic", 2);
    visitor(multiturn, "multiturn", false,
            "Multiturn mode (if 0, this clears the KV cache after every "
-           "interaction without quitting)\n Default = 0 (conversation "
+           "interaction without quitting)\n Default : 0 (conversation "
            "resets every turn)");
  }
};
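The `multiturn` help text above says that 0 clears the KV cache after every interaction. One way to picture that (a sketch under the assumption that the reset manifests as rewinding the absolute token position, which run.cc below tracks as `abs_pos`):

```cpp
// Sketch only (assumption, not gemma.cpp's actual control flow): with
// multiturn == 0 the position rewinds to 0 after each turn, so earlier
// turns drop out of context, equivalent to clearing the KV cache.
int PositionAfterTurn(int multiturn, int abs_pos) {
  return multiturn == 0 ? 0 : abs_pos;  // 0 = conversation resets every turn
}
```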

run.cc

@@ -24,12 +24,16 @@
// copybara:import_next_line:gemma_cpp
#include "compression/compress.h"
+// copybara:end
// copybara:import_next_line:gemma_cpp
#include "gemma.h"  // Gemma
+// copybara:end
// copybara:import_next_line:gemma_cpp
#include "util/app.h"
+// copybara:end
// copybara:import_next_line:gemma_cpp
#include "util/args.h"  // HasHelp
+// copybara:end
#include "hwy/base.h"
#include "hwy/contrib/thread_pool/thread_pool.h"
#include "hwy/highway.h"
@@ -39,20 +43,13 @@
namespace gcpp {

-void ShowHelp(gcpp::LoaderArgs& loader, gcpp::InferenceArgs& inference,
-              gcpp::AppArgs& app) {
-  fprintf(stderr,
-          "\ngemma.cpp\n---------\n\nTo run gemma.cpp, you need to "
-          "specify 3 required model loading arguments: --tokenizer, "
-          "--compressed_weights, "
-          "and --model.\n\nModel Loading Arguments\n\n");
-  loader.Help();
-  fprintf(stderr, "\nInference Arguments\n\n");
-  inference.Help();
-  fprintf(stderr, "\nApplication Arguments\n\n");
-  app.Help();
-  fprintf(stderr, "\n\n");
-}
+static constexpr std::string_view kAsciiArtBanner =
+    "  __ _  ___ _ __ ___  _ __ ___   __ _   ___ _ __  _ __\n"
+    " / _` |/ _ \\ '_ ` _ \\| '_ ` _ \\ / _` | / __| '_ \\| '_ \\\n"
+    "| (_| |  __/ | | | | | | | | | | (_| || (__| |_) | |_) |\n"
+    " \\__, |\\___|_| |_| |_|_| |_| |_|\\__,_(_)___| .__/| .__/\n"
+    "  __/ |                                    | |   | |\n"
+    " |___/                                     |_|   |_|";
void ShowConfig(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) {
  loader.Print(app.verbosity);
@@ -69,7 +66,8 @@ void ShowConfig(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) {
              << std::thread::hardware_concurrency() << std::endl
              << "Instruction set : "
              << hwy::TargetName(hwy::DispatchedTarget()) << " ("
-             << hwy::VectorBytes() * 8 << " bits)" << "\n"
+             << hwy::VectorBytes() * 8 << " bits)"
+             << "\n"
              << "Weight Type : "
              << gcpp::TypeName(gcpp::WeightT()) << "\n"
              << "EmbedderInput Type : "
@@ -77,11 +75,31 @@ void ShowConfig(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) {
  }
}

+void ShowHelp(gcpp::LoaderArgs& loader, gcpp::InferenceArgs& inference,
+              gcpp::AppArgs& app) {
+  std::cerr
+      << kAsciiArtBanner
+      << "\n\ngemma.cpp : a lightweight, standalone C++ inference engine\n"
+         "==========================================================\n\n"
+         "To run gemma.cpp, you need to "
+         "specify 3 required model loading arguments:\n --tokenizer\n "
+         "--compressed_weights\n"
+         " --model.\n";
+  std::cerr << "\n*Example Usage*\n\n./gemma --tokenizer tokenizer.spm "
+               "--compressed_weights 2b-it-sfp.sbs --model 2b-it\n";
+  std::cerr << "\n*Model Loading Arguments*\n\n";
+  loader.Help();
+  std::cerr << "\n*Inference Arguments*\n\n";
+  inference.Help();
+  std::cerr << "\n*Application Arguments*\n\n";
+  app.Help();
+  std::cerr << "\n";
+}
void ReplGemma(gcpp::Gemma& model, hwy::ThreadPool& pool,
               hwy::ThreadPool& inner_pool, const InferenceArgs& args,
               int verbosity, const gcpp::AcceptFunc& accept_token,
-              std::string &eot_line
-) {
+              std::string& eot_line) {
  PROFILER_ZONE("Gen.misc");
  int abs_pos = 0;      // absolute token index over all turns
  int current_pos = 0;  // token index within the current turn
@@ -234,8 +252,12 @@ void Run(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) {
    const std::string instructions =
        "*Usage*\n"
-       " Enter an instruction and press enter (%C reset conversation, "
-       "%Q quits).\n\n"
+       " Enter an instruction and press enter (%C resets conversation, "
+       "%Q quits).\n" +
+       (inference.multiturn == 0
+            ? std::string(" Since multiturn is set to 0, conversation will "
+                          "automatically reset every turn.\n\n")
+            : "\n") +
        "*Examples*\n"
        " - Write an email to grandma thanking her for the cookies.\n"
        " - What are some historical attractions to visit around "
@@ -244,13 +266,14 @@ void Run(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) {
        " - Write a standup comedy bit about GPU programming.\n";

    std::cout << "\033[2J\033[1;1H"  // clear screen
-             << banner_ascii_art << "\n\n";
+             << kAsciiArtBanner << "\n\n";

    ShowConfig(loader, inference, app);
    std::cout << "\n" << instructions << "\n";
  }

-  ReplGemma(model, pool, inner_pool, inference, app.verbosity,
-            /*accept_token=*/[](int) { return true; }, app.eot_line);
+  ReplGemma(
+      model, pool, inner_pool, inference, app.verbosity,
+      /*accept_token=*/[](int) { return true; }, app.eot_line);
}
}  // namespace gcpp
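`Run` wires ReplGemma up with an `accept_token` callback that accepts every candidate token. Assuming `gcpp::AcceptFunc` is a callable from candidate token id to bool (consistent with the `[](int) { return true; }` literal above, though the exact type is not shown here), a filtering variant might look like this sketch; `kBlockedToken` is an illustrative id, not a real vocabulary entry:

```cpp
#include <functional>

// Hypothetical accept_token filter in the shape the ReplGemma call above
// implies: reject one specific token id, accept everything else.
const std::function<bool(int)> accept_token = [](int token_id) {
  constexpr int kBlockedToken = 42;  // illustrative id only
  return token_id != kBlockedToken;
};
```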

util/app.h

@@ -31,6 +31,7 @@
// copybara:import_next_line:gemma_cpp
#include "util/args.h"
+// copybara:end
#include "hwy/base.h"  // HWY_ASSERT

namespace gcpp {
@@ -77,7 +78,6 @@ class AppArgs : public ArgsBase<AppArgs> {
  template <class Visitor>
  void ForEach(const Visitor& visitor) {
-   visitor(log, "log", Path{"/tmp/log.txt"}, "Logging file", 2);
    visitor(verbosity, "verbosity", 1,
            "Show verbose developer information\n 0 = only print generation "
            "output\n 1 = standard user-facing terminal ui\n 2 = show "
@@ -85,15 +85,16 @@ class AppArgs : public ArgsBase<AppArgs> {
            2);
    visitor(num_threads, "num_threads",
            kDefaultNumThreads,  // see ChooseNumThreads
-           "Number of threads to use. Default value is set based on an "
-           "estimate of "
-           "how many concurrent threads are supported.",
+           "Number of threads to use.\n Default = Estimate of the "
+           "number of supported concurrent threads.",
            2);
-   visitor(eot_line, "eot_line", std::string(""),
-           "End of turn line. "
-           "When you specify this, the prompt will be all lines "
-           "before the line where only the given string appears.",
-           2);
+   visitor(
+       eot_line, "eot_line", std::string(""),
+       "End of turn line. "
+       "When you specify this, the prompt will be all lines "
+       "before the line where only the given string appears.\n Default = "
+       "When a newline is encountered, that signals the end of the turn.",
+       2);
  }
};
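The `eot_line` help text implies a simple input loop: by default a newline ends the turn; when `--eot_line` is set, the prompt accumulates every line before the first line consisting solely of that string. A self-contained sketch of that behavior (the `ReadPrompt` helper is hypothetical, not the actual run.cc implementation):

```cpp
#include <iostream>
#include <string>

// Hypothetical helper mirroring the --eot_line help text above: with an
// empty eot_line, a single newline ends the turn; otherwise the prompt is
// every line before the first line that is exactly eot_line.
std::string ReadPrompt(std::istream& in, const std::string& eot_line) {
  std::string line;
  if (eot_line.empty()) {  // default: end of line signals end of turn
    std::getline(in, line);
    return line;
  }
  std::string prompt;
  while (std::getline(in, line) && line != eot_line) {
    prompt += line;
    prompt += '\n';
  }
  return prompt;
}

int main() {
  // Example: run with --eot_line "%EOT%"; typing two lines and then a line
  // containing only %EOT% yields a two-line prompt.
  std::string prompt = ReadPrompt(std::cin, "%EOT%");
  std::cout << prompt;
  return 0;
}
```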