llama: use host memory if device reports 0 memory (#18587)
parent 480160d472
commit 046d5fd44e
@@ -144,7 +144,7 @@ extern "C" {
         // device description: short informative description of the device, could be the model name
         const char * (*get_description)(ggml_backend_dev_t dev);
 
-        // device memory in bytes
+        // device memory in bytes: 0 bytes to indicate no memory to report
         void (*get_memory)(ggml_backend_dev_t dev, size_t * free, size_t * total);
 
         // device type
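With the updated header contract, a backend that has no device-specific memory to report can simply write zeroes from its get_memory callback, which is exactly what the OpenCL hunk below switches to. A minimal sketch of such a callback for a hypothetical backend (the function name is illustrative and not part of this change):

static void ggml_backend_example_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
    // no dedicated device memory to report; callers are expected to fall back to host memory
    *free  = 0;
    *total = 0;

    GGML_UNUSED(dev);
}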
@@ -4287,8 +4287,8 @@ static const char * ggml_backend_opencl_device_get_description(ggml_backend_dev_
 }
 
 static void ggml_backend_opencl_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
-    *free = 1;
-    *total = 1;
+    *free = 0;
+    *total = 0;
 
     GGML_UNUSED(dev);
 }
@@ -2452,6 +2452,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         pimpl->gpu_buft_list.emplace(dev, std::move(buft_list));
     }
 
+    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    if (cpu_dev == nullptr) {
+        throw std::runtime_error(format("%s: no CPU backend found", __func__));
+    }
+
     // calculate the split points
     bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
     std::vector<float> splits(n_devices());
@@ -2462,6 +2467,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             size_t total;
             size_t free;
             ggml_backend_dev_memory(dev, &free, &total);
+
+            // devices can return 0 bytes for free and total memory if they do not
+            // have any to report. in this case, we will use the host memory as a fallback
+            // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
+            if (free == 0 && total == 0) {
+                ggml_backend_dev_memory(cpu_dev, &free, &total);
+            }
             splits[i] = free;
         }
     } else {
@@ -2478,10 +2490,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         splits[i] /= split_sum;
     }
 
-    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-    if (cpu_dev == nullptr) {
-        throw std::runtime_error(format("%s: no CPU backend found", __func__));
-    }
     const int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0);
     const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1);
     auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
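For context, the free-memory figures gathered above (now host memory when a device reports 0/0) are what the surrounding code turns into cumulative split points: each device's share of layers is proportional to its free memory. A simplified, standalone sketch of that normalization, assuming the prefix-sum-then-normalize scheme suggested by the `splits[i] /= split_sum` context above; the numbers are illustrative only:

#include <cstdio>
#include <vector>

int main() {
    // free memory per device in bytes (toy values)
    std::vector<float> splits = { 8e9f, 4e9f, 4e9f };

    // prefix sum, then normalize so the last split point is 1.0
    float split_sum = 0.0f;
    for (size_t i = 0; i < splits.size(); ++i) {
        split_sum += splits[i];
        splits[i]  = split_sum;
    }
    for (size_t i = 0; i < splits.size(); ++i) {
        splits[i] /= split_sum;
    }

    // prints 0.50 0.75 1.00: roughly, the first device would take about
    // half of the offloaded layers, the other two a quarter each
    for (float s : splits) {
        printf("%.2f ", s);
    }
    printf("\n");
}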
@@ -111,8 +111,20 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
         }
     }
     for (size_t i = 0; i < ret.size(); i++) {
-        size_t free, total;
+        size_t free;
+        size_t total;
         ggml_backend_dev_memory(model->devices[i], &free, &total);
+
+        // devices can return 0 bytes for free and total memory if they do not
+        // have any to report. in this case, we will use the host memory as a fallback
+        // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
+        if (free == 0 && total == 0) {
+            ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+            if (cpu_dev == nullptr) {
+                throw std::runtime_error(format("%s: no CPU backend found", __func__));
+            }
+            ggml_backend_dev_memory(cpu_dev, &free, &total);
+        }
         ret[i].free = free;
         ret[i].total = total;
     }
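The same zero-memory fallback now appears at two call sites. As a usage note, the pattern can be expressed as a small helper built only on the public ggml-backend API (ggml_backend_dev_memory and ggml_backend_dev_by_type); the helper name below is hypothetical and not part of this commit:

#include <cstddef>
#include <stdexcept>

#include "ggml-backend.h"

// query a device's memory, falling back to the host (CPU backend) when the
// device reports 0 bytes for both free and total memory
static void device_memory_with_host_fallback(ggml_backend_dev_t dev, size_t * free, size_t * total) {
    ggml_backend_dev_memory(dev, free, total);

    if (*free == 0 && *total == 0) {
        ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        if (cpu_dev == nullptr) {
            throw std::runtime_error("no CPU backend found");
        }
        ggml_backend_dev_memory(cpu_dev, free, total);
    }
}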