llama: fix leaked buffers for mmap + split files (#16765)
commit 945501f5ea
parent 75cbdd3fce
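When a split (multi-file) GGUF model is loaded with mmap, load_tensors() creates one host-pointer backend buffer per mapped file, but each context previously stored only a single raw ggml_backend_buffer_t, so every buffer except the last one created in the loop was never freed. The change makes each context own a std::vector<ggml_backend_buffer_ptr>, so all per-file buffers are released together with the model. The following standalone sketch uses mock buffer types (not the ggml API) to illustrate the ownership pattern before and after the fix:

// Standalone sketch (mock types, not the ggml API) of the ownership bug and fix:
// with a split model, the mmap path allocates one backend buffer per file, but the
// old code kept only a single raw handle per context, so earlier buffers leaked.
#include <cstdio>
#include <memory>
#include <vector>

// Hypothetical stand-ins for ggml_backend_buffer_t and ggml_backend_buffer_free().
struct backend_buffer { int id; };

static backend_buffer * buffer_alloc(int id) {
    std::printf("alloc buffer %d\n", id);
    return new backend_buffer{id};
}

static void buffer_free(backend_buffer * buf) {
    std::printf("free  buffer %d\n", buf->id);
    delete buf;
}

// RAII wrapper, analogous in spirit to ggml_backend_buffer_ptr.
struct buffer_deleter {
    void operator()(backend_buffer * buf) const { buffer_free(buf); }
};
using buffer_ptr = std::unique_ptr<backend_buffer, buffer_deleter>;

int main() {
    const int n_split_files = 3;

    // Old pattern: one raw handle; only the last allocation survives the loop,
    // so buffers 0 and 1 are never freed (the leak this commit fixes).
    backend_buffer * buf = nullptr;
    for (int idx = 0; idx < n_split_files; idx++) {
        buf = buffer_alloc(idx);
    }
    buffer_free(buf); // frees only buffer 2

    // Fixed pattern: every allocation is handed to an owning vector,
    // so all three buffers are freed when `bufs` goes out of scope.
    std::vector<buffer_ptr> bufs;
    for (int idx = 0; idx < n_split_files; idx++) {
        bufs.emplace_back(buffer_alloc(idx));
    }
    return 0;
}

In the patch itself the wrapper is ggml_backend_buffer_ptr from ggml-cpp.h, whose deleter frees the underlying backend buffer, so dropping the vector stored in ctxs_bufs releases every per-file buffer.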
@@ -15,7 +15,6 @@
 #include <algorithm>
 #include <cassert>
-#include <cmath>
 #include <cfloat>
 #include <cstring>
 #include <cmath>
@@ -438,7 +437,7 @@ struct llama_model::impl {
     llama_mlocks mlock_mmaps;
 
     // contexts where the model tensors metadata is stored as well ass the corresponding buffers:
-    std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> ctxs_bufs;
+    std::vector<std::pair<ggml_context_ptr, std::vector<ggml_backend_buffer_ptr>>> ctxs_bufs;
 
     buft_list_t cpu_buft_list;
     std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
@@ -6186,7 +6185,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
         bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
 
-        ggml_backend_buffer_t buf = nullptr;
+        std::vector<ggml_backend_buffer_ptr> bufs;
         if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
             for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                 // only the mmap region containing the tensors in the model is mapped to the backend buffer
@@ -6199,15 +6198,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     continue;
                 }
                 const size_t max_size = ggml_get_max_tensor_size(ctx);
-                buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
+                ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
                 if (buf == nullptr) {
                     throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
                 }
+                bufs.emplace_back(buf);
                 buf_map.emplace(idx, buf);
             }
         }
         else {
-            buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+            ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
             if (buf == nullptr) {
                 throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
             }
@@ -6217,11 +6217,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 mlock_buf->init   (ggml_backend_buffer_get_base(buf));
                 mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
             }
+            bufs.emplace_back(buf);
             for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                 buf_map.emplace(idx, buf);
             }
         }
-        pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), buf);
+        pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), std::move(bufs));
 
         for (auto & buf : buf_map) {
             // indicate that this buffer contains weights
@@ -6247,8 +6248,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
    }
 
    // print memory requirements per buffer type
-   for (auto & [_, buf] : pimpl->ctxs_bufs) {
-       LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+   for (auto & [_, bufs] : pimpl->ctxs_bufs) {
+       for (auto & buf: bufs) {
+           LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n",
+               __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+       }
    }
 
    // populate tensors_by_name
@@ -6300,9 +6304,11 @@ size_t llama_model::n_devices() const {
 
 std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
     std::map<ggml_backend_buffer_type_t, size_t> ret;
-    for (const auto & [_, buf] : pimpl->ctxs_bufs) {
-        ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
+    for (const auto & [_, bufs] : pimpl->ctxs_bufs) {
+        for (const auto & buf : bufs) {
+            ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
+        }
     }
     return ret;
 }
 
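For reference, after the change both the per-buffer logging and llama_model::memory_breakdown() walk a nested container: each context holds a vector of buffers, and sizes are summed per buffer type across both levels. A standalone sketch of that accumulation with mock buffer records (the names and sizes are made up, and std::string keys stand in for ggml_backend_buffer_type_t):

// Sketch (mock types, not the ggml API) of summing buffer sizes per buffer type
// over the nested ctxs_bufs layout introduced by this commit.
#include <cstdio>
#include <map>
#include <string>
#include <vector>

struct mock_buffer {
    std::string type; // stand-in for ggml_backend_buffer_get_type()
    size_t      size; // stand-in for ggml_backend_buffer_get_size()
};

int main() {
    // two contexts; the first is backed by two mmap'd split files
    std::vector<std::vector<mock_buffer>> ctxs_bufs = {
        {{"CPU_Mapped", 512}, {"CPU_Mapped", 256}},
        {{"GPU0", 1024}},
    };

    std::map<std::string, size_t> ret;
    for (const auto & bufs : ctxs_bufs) {
        for (const auto & buf : bufs) {
            ret[buf.type] += buf.size;
        }
    }

    for (const auto & [type, size] : ret) {
        std::printf("%-12s %zu bytes\n", type.c_str(), size);
    }
    return 0;
}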