Merge 3827a23255 into 58062860af
This commit is contained in:
commit
d356bee76e
|
|
@ -73,17 +73,23 @@ int main(int argc, char ** argv, char ** envp) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
// validate batch size for embeddings
|
// validate batch size for embeddings and reranking
|
||||||
// embeddings require all tokens to be processed in a single ubatch
|
// non-causal attention (embeddings/reranking) requires n_batch == n_ubatch
|
||||||
// see https://github.com/ggml-org/llama.cpp/issues/12836
|
// see https://github.com/ggml-org/llama.cpp/issues/6263
|
||||||
if (params.embedding && params.n_batch > params.n_ubatch) {
|
if (params.embedding && params.n_batch != params.n_ubatch) {
|
||||||
LOG_WRN("%s: embeddings enabled with n_batch (%d) > n_ubatch (%d)\n", __func__, params.n_batch, params.n_ubatch);
|
LOG_WRN("%s: embeddings/reranking mode requires n_batch == n_ubatch\n", __func__);
|
||||||
LOG_WRN("%s: setting n_batch = n_ubatch = %d to avoid assertion failure\n", __func__, params.n_ubatch);
|
LOG_WRN("%s: setting both to min(%d, %d) = %d to avoid configuration issues\n",
|
||||||
params.n_batch = params.n_ubatch;
|
__func__, params.n_batch, params.n_ubatch,
|
||||||
|
std::min(params.n_batch, params.n_ubatch));
|
||||||
|
params.n_batch = params.n_ubatch = std::min(params.n_batch, params.n_ubatch);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.n_parallel < 0) {
|
// TODO: should we have a separate n_parallel parameter for the server?
|
||||||
LOG_INF("%s: n_parallel is set to auto, using n_parallel = 4 and kv_unified = true\n", __func__);
|
// https://github.com/ggml-org/llama.cpp/pull/16736#discussion_r2483763177
|
||||||
|
// TODO: this is a common configuration that is suitable for most local use cases
|
||||||
|
// however, overriding the parameters is a bit confusing - figure out something more intuitive
|
||||||
|
if (params.n_parallel == 1 && params.kv_unified == false && !params.has_speculative()) {
|
||||||
|
LOG_WRN("%s: setting n_parallel = 4 and kv_unified = true (add -kvu to disable this)\n", __func__);
|
||||||
|
|
||||||
params.n_parallel = 4;
|
params.n_parallel = 4;
|
||||||
params.kv_unified = true;
|
params.kv_unified = true;
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue