llama-model-loader: print warning when using overrides with mmap (#20978)
* llama-model-loader: use pinned memory for tensor overrides
* change to warning
This commit is contained in:
parent
e2eb39e81c
commit
278521c33a
|
|
@ -1158,6 +1158,12 @@ struct ggml_tensor * llama_model_loader::create_tensor(
|
|||
if (overrides->buft == ggml_backend_cpu_buffer_type()) {
|
||||
// when overriding to a CPU buffer, consider the extra buffer types
|
||||
buft = select_weight_buft(hparams, t_meta, op, buft_list_cpu);
|
||||
if (use_mmap) {
|
||||
static std::once_flag once;
|
||||
std::call_once(once, [] {
|
||||
LLAMA_LOG_WARN("llama_model_loader: tensor overrides to CPU are used with mmap enabled - consider using --no-mmap for better performance\n");
|
||||
});
|
||||
}
|
||||
} else {
|
||||
buft = overrides->buft;
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue