mirror of https://github.com/google/gemma.cpp.git
Merge pull request #66 from google:dev-cleanup
PiperOrigin-RevId: 611207602
commit c805fbe780

README.md (30 changed lines)
@@ -92,7 +92,7 @@ weights enable faster inference. In general, we recommend starting with the
 | `7b-pt` | 7 billion parameter pre-trained model, bfloat16 |
 | `7b-pt-sfp` | 7 billion parameter pre-trained model, 8-bit switched floating point |

 > [!NOTE]
 > **Important**: We strongly recommend starting off with the `2b-it-sfp` model to
 > get up and running.
@@ -116,7 +116,7 @@ The build system uses [CMake](https://cmake.org/). To build the gemma inference
 runtime, create a build directory and generate the build files using `cmake`
 from the top-level project directory. Note that if you previously ran `cmake` and are
 re-running with a different setting, be sure to clean out the `build/` directory
 with `rm -rf build/*` (warning: this will delete any other files in the `build/`
 directory).

 For the 8-bit switched floating point weights (sfp), run cmake with no options:
@@ -242,6 +242,21 @@ We're working on a python script to convert a standard model format to `.sbs`,
 and hope to have it available in the next week or so. Follow [this
 issue](https://github.com/google/gemma.cpp/issues/11) for updates.

+**What are some easy ways to make the model run faster?**
+
+1. Make sure you are using the 8-bit switched floating point `-sfp` models.
+2. If you're on a laptop, make sure power mode is set to maximize performance
+   and power saving mode is **off**. For most laptops, the power saving modes
+   get activated automatically if the computer is not plugged in.
+3. Close other unused CPU-intensive applications.
+4. On Macs, we anecdotally observe a "warm-up" ramp-up in speed as performance
+   cores get engaged.
+5. Experiment with the `--num_threads` argument value. Depending on the device,
+   larger numbers don't always mean better performance.
+
+We're also working on algorithmic and optimization approaches for faster
+inference, stay tuned.
+
 ## Usage

 `gemma` has different usage modes, controlled by the verbosity flag.
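A note on item 5 in the hunk above: the concurrency estimate the C++ standard library reports is a natural starting point for `--num_threads` experiments, and it is the same quantity `ShowConfig` prints later in this commit. A minimal sketch, illustrative only and not part of the commit:

```cpp
// Print a starting point for --num_threads experiments: the runtime's
// estimate of supported concurrent threads (0 means "unknown").
#include <iostream>
#include <thread>

int main() {
  const unsigned n = std::thread::hardware_concurrency();
  std::cout << "Try --num_threads near " << (n ? n : 1) << "\n";
}
```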
@@ -415,6 +430,17 @@ make -j [number of parallel threads to use] libgemma
 If this is successful, you should now have a `libgemma` library file in the
 `build/` directory. On Unix platforms, the filename is `libgemma.a`.

+## Independent Projects Using gemma.cpp
+
+Some independent projects using gemma.cpp:
+
+- [gemma-cpp-python - Python bindings](https://github.com/namtranase/gemma-cpp-python)
+- [lua-cgemma - Lua bindings](https://github.com/ufownl/lua-cgemma)
+- [Godot engine demo project](https://github.com/Rliop913/Gemma-godot-demo-project)
+
+If you would like to have your project included, feel free to get in touch or
+submit a PR with a `README.md` edit.
+
 ## Acknowledgements and Contacts

 gemma.cpp was started in fall 2023 by [Austin Huang](mailto:austinvhuang@google.com)
gemma.h (17 changed lines)
@@ -122,21 +122,22 @@ struct LoaderArgs : public ArgsBase<LoaderArgs> {
   template <class Visitor>
   void ForEach(const Visitor& visitor) {
     visitor(tokenizer, "tokenizer", Path(),
-            "Path name of tokenizer model file. (required)");
+            "Path name of tokenizer model file.\n    Required argument.");
     visitor(
         cache, "compressed_weights", Path(),
         "Path name of compressed weights file, regenerated from `--weights` "
         "file if "
-        "the compressed weights file does not exist. (required)");
+        "the compressed weights file does not exist.\n    Required argument.");
     visitor(model_type, "model", std::string(),
-            "Model type - can be 2b-it (2B parameters, instruction-tuned), "
-            "2b-pt (2B parameters, pretrained), 7b-it (7B parameters, "
-            "instruction-tuned), or 7b-pt (7B parameters, pretrained). "
-            "(required)");
+            "Model type\n    2b-it (2B parameters, instruction-tuned)\n    "
+            "2b-pt (2B parameters, pretrained)\n    7b-it (7B parameters, "
+            "instruction-tuned)\n    7b-pt (7B parameters, pretrained)\n"
+            "    Required argument.");
     visitor(model, "weights", Path(),
             "Path name of model weights (.sbs) file. Only required if "
             "compressed_weights file is not present and needs to be "
-            "regenerated. Otherwise, not needed");
+            "regenerated. This parameter is only required for compressing "
+            "new model weight exports, otherwise it is not needed.");
   }
 };
@@ -192,7 +193,7 @@ struct InferenceArgs : public ArgsBase<InferenceArgs> {
             "Make top-k sampling deterministic", 2);
     visitor(multiturn, "multiturn", false,
             "Multiturn mode (if 0, this clears the KV cache after every "
-            "interaction without quitting)\n    Default = 0 (conversation "
+            "interaction without quitting)\n    Default : 0 (conversation "
             "resets every turn)");
   }
 };
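The `ForEach`/visitor idiom in the `LoaderArgs` and `InferenceArgs` hunks above lets a single member list drive parsing, defaults, and the `Help()` output this commit reformats. A minimal standalone sketch of the idea, using hypothetical names (`MiniArgs` is not from gemma.cpp):

```cpp
#include <iostream>
#include <string>

// Hypothetical stand-in for an ArgsBase-style struct: one ForEach describes
// every argument (field, flag name, default, help text) exactly once.
struct MiniArgs {
  std::string tokenizer;
  std::string model_type;

  template <class Visitor>
  void ForEach(const Visitor& visitor) {
    visitor(tokenizer, "tokenizer", std::string(),
            "Path name of tokenizer model file.\n    Required argument.");
    visitor(model_type, "model", std::string(),
            "Model type\n    2b-it (2B parameters, instruction-tuned)");
  }
};

int main() {
  MiniArgs args;
  // One visitor assigns defaults; the same ForEach could serve a parser.
  args.ForEach([](auto& field, const char*, const auto& init, const char*) {
    field = init;
  });
  // Another visitor renders help text, much as loader.Help() does above.
  args.ForEach([](auto&, const char* name, const auto&, const char* help) {
    std::cerr << "--" << name << "\n    " << help << "\n";
  });
}
```

Adding an argument then means touching a single function, so parsing, defaults, and help text stay in sync.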
run.cc (69 changed lines)
@@ -24,12 +24,16 @@

 // copybara:import_next_line:gemma_cpp
 #include "compression/compress.h"
+// copybara:end
 // copybara:import_next_line:gemma_cpp
 #include "gemma.h"  // Gemma
+// copybara:end
 // copybara:import_next_line:gemma_cpp
 #include "util/app.h"
+// copybara:end
 // copybara:import_next_line:gemma_cpp
 #include "util/args.h"  // HasHelp
+// copybara:end
 #include "hwy/base.h"
 #include "hwy/contrib/thread_pool/thread_pool.h"
 #include "hwy/highway.h"
@@ -39,20 +43,13 @@

 namespace gcpp {

-void ShowHelp(gcpp::LoaderArgs& loader, gcpp::InferenceArgs& inference,
-              gcpp::AppArgs& app) {
-  fprintf(stderr,
-          "\ngemma.cpp\n---------\n\nTo run gemma.cpp, you need to "
-          "specify 3 required model loading arguments: --tokenizer, "
-          "--compressed_weights, "
-          "and --model.\n\nModel Loading Arguments\n\n");
-  loader.Help();
-  fprintf(stderr, "\nInference Arguments\n\n");
-  inference.Help();
-  fprintf(stderr, "\nApplication Arguments\n\n");
-  app.Help();
-  fprintf(stderr, "\n\n");
-}
+static constexpr std::string_view kAsciiArtBanner =
+    "  __ _  ___ _ __ ___  _ __ ___   __ _   ___ _ __  _ __\n"
+    " / _` |/ _ \\ '_ ` _ \\| '_ ` _ \\ / _` | / __| '_ \\| '_ \\\n"
+    "| (_| |  __/ | | | | | | | | | | (_| || (__| |_) | |_) |\n"
+    " \\__, |\\___|_| |_| |_|_| |_| |_|\\__,_(_)___| .__/| .__/\n"
+    "  __/ |                                    | |   | |\n"
+    " |___/                                     |_|   |_|";

 void ShowConfig(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) {
   loader.Print(app.verbosity);
@@ -69,7 +66,8 @@ void ShowConfig(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) {
             << std::thread::hardware_concurrency() << std::endl
             << "Instruction set : "
             << hwy::TargetName(hwy::DispatchedTarget()) << " ("
-            << hwy::VectorBytes() * 8 << " bits)" << "\n"
+            << hwy::VectorBytes() * 8 << " bits)"
+            << "\n"
             << "Weight Type : "
             << gcpp::TypeName(gcpp::WeightT()) << "\n"
             << "EmbedderInput Type : "
@@ -77,11 +75,31 @@ void ShowConfig(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) {
   }
 }

+void ShowHelp(gcpp::LoaderArgs& loader, gcpp::InferenceArgs& inference,
+              gcpp::AppArgs& app) {
+  std::cerr
+      << kAsciiArtBanner
+      << "\n\ngemma.cpp : a lightweight, standalone C++ inference engine\n"
+         "==========================================================\n\n"
+         "To run gemma.cpp, you need to "
+         "specify 3 required model loading arguments:\n    --tokenizer\n    "
+         "--compressed_weights\n"
+         "    --model.\n";
+  std::cerr << "\n*Example Usage*\n\n./gemma --tokenizer tokenizer.spm "
+               "--compressed_weights 2b-it-sfp.sbs --model 2b-it\n";
+  std::cerr << "\n*Model Loading Arguments*\n\n";
+  loader.Help();
+  std::cerr << "\n*Inference Arguments*\n\n";
+  inference.Help();
+  std::cerr << "\n*Application Arguments*\n\n";
+  app.Help();
+  std::cerr << "\n";
+}
+
 void ReplGemma(gcpp::Gemma& model, hwy::ThreadPool& pool,
                hwy::ThreadPool& inner_pool, const InferenceArgs& args,
                int verbosity, const gcpp::AcceptFunc& accept_token,
-               std::string &eot_line
-) {
+               std::string& eot_line) {
   PROFILER_ZONE("Gen.misc");
   int abs_pos = 0;     // absolute token index over all turns
   int current_pos = 0; // token index within the current turn
@@ -234,8 +252,12 @@ void Run(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) {

     const std::string instructions =
         "*Usage*\n"
-        "  Enter an instruction and press enter (%C reset conversation, "
-        "%Q quits).\n\n"
+        "  Enter an instruction and press enter (%C resets conversation, "
+        "%Q quits).\n" +
+        (inference.multiturn == 0
+             ? std::string("  Since multiturn is set to 0, conversation will "
+                           "automatically reset every turn.\n\n")
+             : "\n") +
         "*Examples*\n"
         "  - Write an email to grandma thanking her for the cookies.\n"
         "  - What are some historical attractions to visit around "
@@ -244,13 +266,14 @@ void Run(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) {
         "  - Write a standup comedy bit about GPU programming.\n";

     std::cout << "\033[2J\033[1;1H"  // clear screen
-              << banner_ascii_art << "\n\n";
+              << kAsciiArtBanner << "\n\n";
     ShowConfig(loader, inference, app);
     std::cout << "\n" << instructions << "\n";
   }

-  ReplGemma(model, pool, inner_pool, inference, app.verbosity,
-            /*accept_token=*/[](int) { return true; }, app.eot_line);
+  ReplGemma(
+      model, pool, inner_pool, inference, app.verbosity,
+      /*accept_token=*/[](int) { return true; }, app.eot_line);
 }

 }  // namespace gcpp
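The `Run` hunk above installs `/*accept_token=*/[](int) { return true; }`, an `AcceptFunc` that accepts every candidate token. Assuming `AcceptFunc` is a `bool(int)` callable over token ids (consistent with that call site; the typedef itself is not shown in this diff), a constrained variant might look like this sketch:

```cpp
#include <functional>
#include <set>
#include <utility>

// Assumed shape of the callback, mirroring the lambda passed to ReplGemma.
using AcceptFunc = std::function<bool(int)>;

// Default behavior in Run(): accept every token id.
const AcceptFunc kAcceptAll = [](int /*token*/) { return true; };

// Hypothetical constrained sampler: only tokens from a whitelist are
// accepted, e.g. to force decoding into a small vocabulary subset.
AcceptFunc MakeWhitelist(std::set<int> allowed) {
  return [allowed = std::move(allowed)](int token) {
    return allowed.count(token) > 0;
  };
}
```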
util/app.h (19 changed lines)
@@ -31,6 +31,7 @@

 // copybara:import_next_line:gemma_cpp
 #include "util/args.h"
+// copybara:end
 #include "hwy/base.h"  // HWY_ASSERT

 namespace gcpp {
@@ -77,7 +78,6 @@ class AppArgs : public ArgsBase<AppArgs> {

   template <class Visitor>
   void ForEach(const Visitor& visitor) {
-    visitor(log, "log", Path{"/tmp/log.txt"}, "Logging file", 2);
     visitor(verbosity, "verbosity", 1,
             "Show verbose developer information\n    0 = only print generation "
             "output\n    1 = standard user-facing terminal ui\n    2 = show "
@@ -85,15 +85,16 @@ class AppArgs : public ArgsBase<AppArgs> {
             2);
     visitor(num_threads, "num_threads",
             kDefaultNumThreads,  // see ChooseNumThreads
-            "Number of threads to use. Default value is set based on an "
-            "estimate of "
-            "how many concurrent threads are supported.",
-            2);
-    visitor(eot_line, "eot_line", std::string(""),
-            "End of turn line. "
-            "When you specify this, the prompt will be all lines "
-            "before the line where only the given string appears.",
+            "Number of threads to use.\n    Default = Estimate of the "
+            "number of supported concurrent threads.",
             2);
+    visitor(
+        eot_line, "eot_line", std::string(""),
+        "End of turn line. "
+        "When you specify this, the prompt will be all lines "
+        "before the line where only the given string appears.\n    Default = "
+        "When a newline is encountered, that signals the end of the turn.",
+        2);
   }
 };
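The new `eot_line` help text describes a multi-line prompt mode: the prompt becomes every line entered before a line consisting solely of the given string, while by default a bare newline ends the turn. A hypothetical helper sketching that behavior (not the code gemma.cpp itself uses):

```cpp
#include <iostream>
#include <string>

// Collect one turn's prompt from stdin according to the --eot_line rule
// described above: with an empty eot_line (the default), the first newline
// ends the turn; otherwise, read lines until one equals eot_line exactly.
std::string ReadPrompt(const std::string& eot_line) {
  std::string prompt, line;
  while (std::getline(std::cin, line)) {
    if (!eot_line.empty() && line == eot_line) break;  // end-of-turn marker
    prompt += line;
    prompt += '\n';
    if (eot_line.empty()) break;  // default: single-line turns
  }
  return prompt;
}
```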