From 046d5fd44e3505ab9c6d065ab65541fc2fdfd4f2 Mon Sep 17 00:00:00 2001
From: Aaron Teo
Date: Fri, 9 Jan 2026 05:34:56 +0800
Subject: [PATCH] llama: use host memory if device reports 0 memory (#18587)

---
 ggml/src/ggml-backend-impl.h         |  2 +-
 ggml/src/ggml-opencl/ggml-opencl.cpp |  4 ++--
 src/llama-model.cpp                  | 16 ++++++++++++----
 src/llama.cpp                        | 14 +++++++++++++-
 4 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index 6792ba986e..59190b7c46 100644
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@@ -144,7 +144,7 @@ extern "C" {
         // device description: short informative description of the device, could be the model name
         const char * (*get_description)(ggml_backend_dev_t dev);
 
-        // device memory in bytes
+        // device memory in bytes: 0 bytes to indicate no memory to report
        void         (*get_memory)(ggml_backend_dev_t dev, size_t * free, size_t * total);
 
         // device type
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index 472e2df50a..e50ca8e0f2 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -4287,8 +4287,8 @@ static const char * ggml_backend_opencl_device_get_description(ggml_backend_dev_
 }
 
 static void ggml_backend_opencl_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
-    *free = 1;
-    *total = 1;
+    *free = 0;
+    *total = 0;
 
     GGML_UNUSED(dev);
 }
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 7ac59846bb..5de6493b9e 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -2452,6 +2452,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         pimpl->gpu_buft_list.emplace(dev, std::move(buft_list));
     }
 
+    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    if (cpu_dev == nullptr) {
+        throw std::runtime_error(format("%s: no CPU backend found", __func__));
+    }
+
     // calculate the split points
     bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
     std::vector<float> splits(n_devices());
@@ -2462,6 +2467,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             size_t total;
             size_t free;
             ggml_backend_dev_memory(dev, &free, &total);
+
+            // devices can return 0 bytes for free and total memory if they do not
+            // have any to report. in this case, we will use the host memory as a fallback
+            // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
+            if (free == 0 && total == 0) {
+                ggml_backend_dev_memory(cpu_dev, &free, &total);
+            }
             splits[i] = free;
         }
     } else {
@@ -2478,10 +2490,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         splits[i] /= split_sum;
     }
 
-    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-    if (cpu_dev == nullptr) {
-        throw std::runtime_error(format("%s: no CPU backend found", __func__));
-    }
     const int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0);
     const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1);
     auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
diff --git a/src/llama.cpp b/src/llama.cpp
index 33f51a2389..f1096d960e 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -111,8 +111,20 @@ static std::vector llama_get_device_memory_data(
         }
     }
     for (size_t i = 0; i < ret.size(); i++) {
-        size_t free, total;
+        size_t free;
+        size_t total;
         ggml_backend_dev_memory(model->devices[i], &free, &total);
+
+        // devices can return 0 bytes for free and total memory if they do not
+        // have any to report. in this case, we will use the host memory as a fallback
+        // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
+        if (free == 0 && total == 0) {
+            ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+            if (cpu_dev == nullptr) {
+                throw std::runtime_error(format("%s: no CPU backend found", __func__));
+            }
+            ggml_backend_dev_memory(cpu_dev, &free, &total);
+        }
         ret[i].free = free;
         ret[i].total = total;
     }
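
For reference, below is a minimal standalone sketch (not part of the patch) of the fallback behaviour it introduces: query each device via the public ggml backend API and, when a backend reports 0/0 because it has no memory figures to give (as the OpenCL backend now does), substitute the host (CPU) device's numbers. The main() driver and the printed output are illustrative assumptions; only the ggml_backend_dev_* calls come from ggml-backend.h.

// fallback_sketch.cpp - illustrative only, assumes ggml-backend.h is on the
// include path and the program links against ggml
#include <cstdio>
#include <stdexcept>

#include "ggml-backend.h"

int main() {
    // host device used as the fallback source of memory figures
    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    if (cpu_dev == nullptr) {
        throw std::runtime_error("no CPU backend found");
    }

    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);

        size_t free  = 0;
        size_t total = 0;
        ggml_backend_dev_memory(dev, &free, &total);

        // 0/0 means the device has no memory to report; use host memory instead
        // so downstream split/offload calculations still see non-zero values
        if (free == 0 && total == 0) {
            ggml_backend_dev_memory(cpu_dev, &free, &total);
        }

        printf("%s: free = %zu MiB, total = %zu MiB\n",
               ggml_backend_dev_name(dev), free / (1024 * 1024), total / (1024 * 1024));
    }

    return 0;
}

The patch applies this same pattern in both call sites (load_tensors and llama_get_device_memory_data) rather than in the backend itself, which keeps the "0 bytes means nothing to report" convention available to other callers.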