mirror of https://github.com/google/gemma.cpp.git
remove --log (fixes https://github.com/google/gemma.cpp/issues/59), improve command-line args help, add copybara #include sort guards in more source files, add README sections on running faster and related projects
This commit is contained in:
parent 272f17ddb3
commit 0ea7b993de
README.md (30 lines changed)
@@ -92,7 +92,7 @@ weights enable faster inference. In general, we recommend starting with the
 | `7b-pt` | 7 billion parameter pre-trained model, bfloat16 |
 | `7b-pt-sfp` | 7 billion parameter pre-trained model, 8-bit switched floating point |

 > [!NOTE]
 > **Important**: We strongly recommend starting off with the `2b-it-sfp` model to
 > get up and running.

@@ -116,7 +116,7 @@ The build system uses [CMake](https://cmake.org/). To build the gemma inference
 runtime, create a build directory and generate the build files using `cmake`
 from the top-level project directory. Note if you previous ran `cmake` and are
 re-running with a different setting, be sure to clean out the `build/` directory
 with `rm -rf build/*` (warning this will delete any other files in the `build/`
 directory.

 For the 8-bit switched floating point weights (sfp), run cmake with no options:
@@ -242,6 +242,21 @@ We're working on a python script to convert a standard model format to `.sbs`,
 and hope have it available in the next week or so. Follow [this
 issue](https://github.com/google/gemma.cpp/issues/11) for updates.

+**What are some easy ways to make the model run faster?**
+
+1. Make sure you are using the 8-bit switched floating point `-sfp` models.
+2. If you're on a laptop, make sure power mode is set to maximize performance
+   and saving mode is **off**. For most laptops, the power saving modes get
+   activated automatically if the computer is not plugged in.
+3. Close other unused cpu-intensive applications.
+4. On macs, anecdotally we observe a "warm-up" ramp-up in speed as performance
+   cores get engaged.
+5. Experiment with the `--num_threads` argument value. Depending on the device,
+   larger numbers don't always mean better performance.
+
+We're also working on algorithmic and optimization approaches for faster
+inference, stay tuned.
+
 ## Usage

 `gemma` has different usage modes, controlled by the verbosity flag.
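Point 5 of this new FAQ connects to the `num_threads` change later in this commit: util/app.h derives `kDefaultNumThreads` from an estimate of supported concurrent threads (see `ChooseNumThreads`). A minimal sketch of that kind of estimate, assuming only `std::thread::hardware_concurrency`; the cap of 18 is an illustrative guess, not gemma.cpp's actual heuristic:

```cpp
#include <algorithm>
#include <cstdio>
#include <thread>

// Hypothetical sketch in the spirit of ChooseNumThreads (util/app.h).
// The cap below is illustrative only; gemma.cpp's real logic may differ.
static int GuessNumThreads() {
  const unsigned hw = std::thread::hardware_concurrency();
  if (hw == 0) return 1;  // the count is unknown on this platform
  // Very large counts often include SMT siblings or efficiency cores that
  // do not speed up matmul-bound inference, hence the conservative cap.
  return static_cast<int>(std::min(hw, 18u));
}

int main() { printf("suggested --num_threads: %d\n", GuessNumThreads()); }
```

This is also why point 5 recommends experimenting: the raw hardware count can overstate useful parallelism, especially on laptops that mix performance and efficiency cores.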
@@ -415,6 +430,17 @@ make -j [number of parallel threads to use] libgemma
 If this is successful, you should now have a `libgemma` library file in the
 `build/` directory. On Unix platforms, the filename is `libgemma.a`.

+## Independent Projects Using gemma.cpp
+
+Some independent projects using gemma.cpp:
+
+- [gemma-cpp-python - Python bindings](https://github.com/namtranase/gemma-cpp-python)
+- [lua-cgemma - Lua bindings](https://github.com/ufownl/lua-cgemma)
+- [Godot engine demo project](https://github.com/Rliop913/Gemma-godot-demo-project)
+
+If you would like to have your project included, feel free to get in touch or
+submit a PR with a `README.md` edit.
+
 ## Acknowledgements and Contacts

 gemma.cpp was started in fall 2023 by [Austin Huang](mailto:austinvhuang@google.com)
gemma.h (17 lines changed)
@@ -122,21 +122,22 @@ struct LoaderArgs : public ArgsBase<LoaderArgs> {
   template <class Visitor>
   void ForEach(const Visitor& visitor) {
     visitor(tokenizer, "tokenizer", Path(),
-            "Path name of tokenizer model file. (required)");
+            "Path name of tokenizer model file.\n    Required argument.");
     visitor(
         cache, "compressed_weights", Path(),
         "Path name of compressed weights file, regenerated from `--weights` "
         "file if "
-        "the compressed weights file does not exist. (required)");
+        "the compressed weights file does not exist.\n    Required argument.");
     visitor(model_type, "model", std::string(),
-            "Model type - can be 2b-it (2B parameters, instruction-tuned), "
-            "2b-pt (2B parameters, pretrained), 7b-it (7B parameters, "
-            "instruction-tuned), or 7b-pt (7B parameters, pretrained). "
-            "(required)");
+            "Model type\n    2b-it (2B parameters, instruction-tuned)\n    "
+            "2b-pt (2B parameters, pretrained)\n    7b-it (7B parameters "
+            "instruction-tuned)\n    7b-pt (7B parameters, pretrained)\n"
+            "    Required argument.");
     visitor(model, "weights", Path(),
             "Path name of model weights (.sbs) file. Only required if "
             "compressed_weights file is not present and needs to be "
-            "regenerated. Otherwise, not needed");
+            "regenerated. This parameter is only required for compressing"
+            "new model weight exports, otherwise it is not needed.");
   }
 };

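For context on the help strings edited above: each `visitor(field, flag, default, help)` call declares a flag exactly once, and `ArgsBase` replays `ForEach` with different visitors for setting defaults, parsing, and `Help()`. A stripped-down sketch of the pattern (hypothetical; the real `ArgsBase` in util/args.h also handles command-line parsing, `Path` values, and the optional per-flag verbosity argument):

```cpp
#include <cstdio>
#include <string>

// Hypothetical, simplified version of the visitor idea behind LoaderArgs.
struct DemoArgs {
  std::string model_type;
  int num_threads;

  // One declaration per flag supplies its name, default, and help text.
  template <class Visitor>
  void ForEach(const Visitor& visitor) {
    visitor(model_type, "model", std::string("2b-it"),
            "Model type.\n    Required argument.");
    visitor(num_threads, "num_threads", 4, "Number of threads to use.");
  }
};

// Visitor that assigns each field its default value.
struct SetDefaults {
  template <typename T>
  void operator()(T& field, const char* /*name*/, const T& init,
                  const char* /*help*/) const {
    field = init;
  }
};

// Visitor that prints help text, analogous to Help() in the real code.
struct PrintHelp {
  template <typename T>
  void operator()(T& /*field*/, const char* name, const T& /*init*/,
                  const char* help) const {
    fprintf(stderr, "--%s\n    %s\n\n", name, help);
  }
};

int main() {
  DemoArgs args;
  args.ForEach(SetDefaults());  // initialize all fields
  args.ForEach(PrintHelp());    // emit the help listing
}
```

Because the help text lives in the same declaration as the flag itself, wording fixes like the ones in this hunk automatically show up everywhere the flag is described.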
@@ -192,7 +193,7 @@ struct InferenceArgs : public ArgsBase<InferenceArgs> {
             "Make top-k sampling deterministic", 2);
     visitor(multiturn, "multiturn", false,
             "Multiturn mode (if 0, this clears the KV cache after every "
-            "interaction without quitting)\n    Default = 0 (conversation "
+            "interaction without quitting)\n    Default : 0 (conversation "
             "resets every turn)");
   }
 };
run.cc (69 lines changed)
@@ -24,12 +24,16 @@

 // copybara:import_next_line:gemma_cpp
 #include "compression/compress.h"
 // copybara:end
 // copybara:import_next_line:gemma_cpp
 #include "gemma.h"  // Gemma
 // copybara:end
+// copybara:import_next_line:gemma_cpp
 #include "util/app.h"
+// copybara:end
+// copybara:import_next_line:gemma_cpp
 #include "util/args.h"  // HasHelp
+// copybara:end
 #include "hwy/base.h"
 #include "hwy/contrib/thread_pool/thread_pool.h"
 #include "hwy/highway.h"
@@ -39,20 +43,13 @@

 namespace gcpp {

-void ShowHelp(gcpp::LoaderArgs& loader, gcpp::InferenceArgs& inference,
-              gcpp::AppArgs& app) {
-  fprintf(stderr,
-          "\ngemma.cpp\n---------\n\nTo run gemma.cpp, you need to "
-          "specify 3 required model loading arguments: --tokenizer, "
-          "--compressed_weights, "
-          "and --model.\n\nModel Loading Arguments\n\n");
-  loader.Help();
-  fprintf(stderr, "\nInference Arguments\n\n");
-  inference.Help();
-  fprintf(stderr, "\nApplication Arguments\n\n");
-  app.Help();
-  fprintf(stderr, "\n\n");
-}
+static constexpr std::string_view kAsciiArtBanner =
+    "  __ _  ___ _ __ ___  _ __ ___   __ _   ___ _ __  _ __\n"
+    " / _` |/ _ \\ '_ ` _ \\| '_ ` _ \\ / _` | / __| '_ \\| '_ \\\n"
+    "| (_| |  __/ | | | | | | | | | | (_| || (__| |_) | |_) |\n"
+    " \\__, |\\___|_| |_| |_|_| |_| |_|\\__,_(_)___| .__/| .__/\n"
+    "  __/ |                                    | |   | |\n"
+    " |___/                                     |_|   |_|";

 void ShowConfig(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) {
   loader.Print(app.verbosity);
@@ -69,7 +66,8 @@ void ShowConfig(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) {
             << std::thread::hardware_concurrency() << std::endl
             << "Instruction set      : "
             << hwy::TargetName(hwy::DispatchedTarget()) << " ("
-            << hwy::VectorBytes() * 8 << " bits)" << "\n"
+            << hwy::VectorBytes() * 8 << " bits)"
+            << "\n"
             << "Weight Type          : "
             << gcpp::TypeName(gcpp::WeightT()) << "\n"
             << "EmbedderInput Type   : "
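The ShowConfig lines touched here double as a recipe for checking which SIMD target Highway dispatched to. A standalone probe using the same calls (a sketch assuming it links against the same hwy library as gemma.cpp; the output format is illustrative):

```cpp
#include <cstdio>
#include <thread>

#include "hwy/base.h"
#include "hwy/highway.h"

// Prints the same hardware facts ShowConfig reports: logical core count,
// the dispatched SIMD instruction set, and its vector width in bits.
int main() {
  fprintf(stderr, "Hardware concurrency : %u\n",
          std::thread::hardware_concurrency());
  fprintf(stderr, "Instruction set      : %s (%zu bits)\n",
          hwy::TargetName(hwy::DispatchedTarget()),
          hwy::VectorBytes() * 8);
  return 0;
}
```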
@@ -77,11 +75,31 @@ void ShowConfig(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) {
   }
 }

+void ShowHelp(gcpp::LoaderArgs& loader, gcpp::InferenceArgs& inference,
+              gcpp::AppArgs& app) {
+  std::cerr
+      << kAsciiArtBanner
+      << "\n\ngemma.cpp : a lightweight, standalone C++ inference engine\n"
+         "==========================================================\n\n"
+         "To run gemma.cpp, you need to "
+         "specify 3 required model loading arguments:\n    --tokenizer\n    "
+         "--compressed_weights\n"
+         "    --model.\n";
+  std::cerr << "\n*Example Usage*\n\n./gemma --tokenizer tokenizer.spm "
+               "--compressed_weights 2b-it-sfp.sbs --model 2b-it\n";
+  std::cerr << "\n*Model Loading Arguments*\n\n";
+  loader.Help();
+  std::cerr << "\n*Inference Arguments*\n\n";
+  inference.Help();
+  std::cerr << "\n*Application Arguments*\n\n";
+  app.Help();
+  std::cerr << "\n";
+}
+
 void ReplGemma(gcpp::Gemma& model, hwy::ThreadPool& pool,
                hwy::ThreadPool& inner_pool, const InferenceArgs& args,
                int verbosity, const gcpp::AcceptFunc& accept_token,
-               std::string &eot_line
-               ) {
+               std::string& eot_line) {
   PROFILER_ZONE("Gen.misc");
   int abs_pos = 0;      // absolute token index over all turns
   int current_pos = 0;  // token index within the current turn
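The two counters initialized at the end of this hunk are what `--multiturn` acts on: `abs_pos` persists across the whole session while `current_pos` restarts every turn, and rewinding `abs_pos` is what resets the conversation. A hypothetical skeleton of that control flow (generation elided; everything except `abs_pos`, `current_pos`, and `multiturn` is illustrative):

```cpp
#include <iostream>
#include <string>

// Hypothetical skeleton of ReplGemma's turn loop; the real code tokenizes
// the prompt and streams generated tokens through callbacks.
int main() {
  const bool multiturn = false;  // mirrors --multiturn, default 0
  int abs_pos = 0;               // absolute token index over all turns
  std::string prompt;
  while (std::getline(std::cin, prompt)) {
    int current_pos = 0;  // token index within the current turn
    // ... tokenize `prompt` and generate; every emitted token advances
    // both counters (sketched here as a single step per turn) ...
    ++abs_pos;
    ++current_pos;
    std::cout << "turn done: current_pos=" << current_pos
              << ", abs_pos=" << abs_pos << "\n";
    if (!multiturn) {
      // Default behavior: rewind the KV-cache position so the next turn
      // starts a fresh conversation.
      abs_pos = 0;
    }
  }
}
```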
@@ -234,8 +252,12 @@ void Run(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) {

     const std::string instructions =
         "*Usage*\n"
-        "  Enter an instruction and press enter (%C reset conversation, "
-        "%Q quits).\n\n"
+        "  Enter an instruction and press enter (%C resets conversation, "
+        "%Q quits).\n" +
+        (inference.multiturn == 0
+             ? std::string("  Since multiturn is set to 0, conversation will "
+                           "automatically reset every turn.\n\n")
+             : "\n") +
         "*Examples*\n"
         "  - Write an email to grandma thanking her for the cookies.\n"
         "  - What are some historical attractions to visit around "
@@ -244,13 +266,14 @@ void Run(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) {
         "  - Write a standup comedy bit about GPU programming.\n";

     std::cout << "\033[2J\033[1;1H"  // clear screen
-              << banner_ascii_art << "\n\n";
+              << kAsciiArtBanner << "\n\n";
     ShowConfig(loader, inference, app);
     std::cout << "\n" << instructions << "\n";
   }

-  ReplGemma(model, pool, inner_pool, inference, app.verbosity,
-            /*accept_token=*/[](int) { return true; }, app.eot_line);
+  ReplGemma(
+      model, pool, inner_pool, inference, app.verbosity,
+      /*accept_token=*/[](int) { return true; }, app.eot_line);
 }

 }  // namespace gcpp
util/app.h (19 lines changed)
@@ -31,6 +31,7 @@

 // copybara:import_next_line:gemma_cpp
 #include "util/args.h"
 // copybara:end
+#include "hwy/base.h"  // HWY_ASSERT

 namespace gcpp {
@@ -77,7 +78,6 @@ class AppArgs : public ArgsBase<AppArgs> {

   template <class Visitor>
   void ForEach(const Visitor& visitor) {
-    visitor(log, "log", Path{"/tmp/log.txt"}, "Logging file", 2);
     visitor(verbosity, "verbosity", 1,
             "Show verbose developer information\n    0 = only print generation "
             "output\n    1 = standard user-facing terminal ui\n    2 = show "
@@ -85,15 +85,16 @@ class AppArgs : public ArgsBase<AppArgs> {
             2);
     visitor(num_threads, "num_threads",
             kDefaultNumThreads,  // see ChooseNumThreads
-            "Number of threads to use. Default value is set based on an "
-            "estimate of "
-            "how many concurrent threads are supported.",
-            2);
-    visitor(eot_line, "eot_line", std::string(""),
-            "End of turn line. "
-            "When you specify this, the prompt will be all lines "
-            "before the line where only the given string appears.",
-            2);
+            "Number of threads to use.\n    Default = Estimate of the "
+            "number of suupported concurrent threads.",
+            2);
+    visitor(
+        eot_line, "eot_line", std::string(""),
+        "End of turn line. "
+        "When you specify this, the prompt will be all lines "
+        "before the line where only the given string appears.\n    Default = "
+        "When a newline is encountered, that signals the end of the turn.",
+        2);
   }
 };

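The reworded `eot_line` help is specific enough to sketch: with the default empty string, a single newline ends the turn; otherwise the prompt is every line before the first line consisting solely of the marker. An illustrative reading loop under that reading (hypothetical helper, not the actual run.cc implementation; "%E" is an arbitrary marker):

```cpp
#include <iostream>
#include <string>

// Illustrative implementation of the --eot_line contract described above.
// Hypothetical helper; gemma.cpp's actual input handling lives in run.cc.
std::string ReadPrompt(const std::string& eot_line) {
  std::string prompt, line;
  if (eot_line.empty()) {
    std::getline(std::cin, prompt);  // default: a newline ends the turn
    return prompt;
  }
  // Accumulate lines until one matches the end-of-turn marker exactly.
  while (std::getline(std::cin, line) && line != eot_line) {
    prompt += line;
    prompt += '\n';
  }
  return prompt;
}

int main() {
  const std::string prompt = ReadPrompt("%E");  // arbitrary example marker
  std::cout << "prompt was:\n" << prompt;
}
```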