mirror of https://github.com/google/gemma.cpp.git
With the new matmul, much larger batch sizes are advantageous; default prefill_tbatch_size to 256.
It can still be overridden via command-line argument.

PiperOrigin-RevId: 730502653
commit b3b4b9f92f
parent 9a2360d719
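The new default only changes the built-in value; the existing flags still override it at launch. A hypothetical invocation (the binary name and model arguments are placeholders, not from this commit; the flag names come from the second hunk below):

  ./gemma --weights model.sbs --tokenizer tokenizer.spm --prefill_tbatch 64 --decode_qbatch 16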
@@ -100,7 +100,7 @@ struct RuntimeConfig {
   // These defaults are overridden by InferenceArgs::CopyTo(*this):
   // Max tokens per batch during prefill.
-  size_t prefill_tbatch_size = 32;
+  size_t prefill_tbatch_size = 256;
   // Max queries per batch (one token from each) during decode.
   size_t decode_qbatch_size = 16;
@@ -273,7 +273,7 @@ struct InferenceArgs : public ArgsBase<InferenceArgs> {
     visitor(max_generated_tokens, "max_generated_tokens", size_t{2048},
             "Maximum number of tokens to generate.");
-    visitor(prefill_tbatch_size, "prefill_tbatch", size_t{64},
+    visitor(prefill_tbatch_size, "prefill_tbatch", size_t{256},
             "Prefill: max tokens per batch.");
     visitor(decode_qbatch_size, "decode_qbatch", size_t{16},
             "Decode: max queries per batch.");
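For context, the comment in the first hunk notes that the RuntimeConfig defaults are overridden by InferenceArgs::CopyTo(*this). A minimal sketch of that relationship, assuming CopyTo takes a RuntimeConfig& (the exact signature is not shown in this diff):

#include <cstddef>

struct RuntimeConfig {
  // Defaults used when no InferenceArgs are applied.
  size_t prefill_tbatch_size = 256;  // raised from 32 by this commit
  size_t decode_qbatch_size = 16;
};

struct InferenceArgs {
  size_t prefill_tbatch_size = 256;  // parsed from --prefill_tbatch
  size_t decode_qbatch_size = 16;    // parsed from --decode_qbatch

  // Assumed shape: copies the parsed flag values over the struct defaults.
  void CopyTo(RuntimeConfig& runtime_config) const {
    runtime_config.prefill_tbatch_size = prefill_tbatch_size;
    runtime_config.decode_qbatch_size = decode_qbatch_size;
  }
};

This keeps a single source of truth: the flag defaults in the visitor and the struct defaults agree at 256, and any user-supplied flag value wins after CopyTo runs.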