use external memory for ggml_vk_host_malloc, revert model loader changes
parent 8240a9c3b9
commit ccffc464f2
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -229,6 +229,7 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
     /* .get_alloc_size = */ ggml_backend_vk_buffer_type_get_alloc_size,
     /* .is_host = */ NULL,
 };
+static vk_buffer ggml_vk_buffer_from_host_ptr(vk_device & device, void * ptr, size_t size);
 
 #ifdef GGML_VULKAN_MEMORY_DEBUG
 class vk_memory_logger;
@@ -772,7 +773,7 @@ struct vk_device_struct {
 
     std::vector<vk_pipeline_ref> all_pipelines;
 
-    std::vector<std::tuple<void*, size_t, vk_buffer>> pinned_memory;
+    std::vector<std::tuple<void*, size_t, vk_buffer, void*>> pinned_memory;
 
     vk::Fence fence;
     vk_buffer sync_staging;
@@ -2405,6 +2406,12 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std
         nullptr,
     };
 
+    vk::ExternalMemoryBufferCreateInfo external_memory_bci;
+    if (import_ptr) {
+        external_memory_bci.handleTypes = vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT;
+        buffer_create_info.setPNext(&external_memory_bci);
+    }
+
     buf->buffer = device->device.createBuffer(buffer_create_info);
 
     vk::MemoryRequirements mem_req = device->device.getBufferMemoryRequirements(buf->buffer);
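Note on the hunk above: with VK_EXT_external_memory_host, flagging the buffer as eHostAllocationEXT is only half of the import; the host pointer itself is attached when the backing memory is allocated. A minimal sketch of that allocation side, assuming a Vulkan-Hpp device with the extension enabled and a pre-validated ptr/size/memory_type_idx (these names are illustrative, not taken from the patch):

    // Import an aligned host pointer as device memory (VK_EXT_external_memory_host).
    vk::ImportMemoryHostPointerInfoEXT import_info;
    import_info.handleType   = vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT;
    import_info.pHostPointer = ptr;     // must satisfy minImportedHostPointerAlignment

    vk::MemoryAllocateInfo alloc_info;
    alloc_info.allocationSize  = size;  // must be a multiple of the same alignment
    alloc_info.memoryTypeIndex = memory_type_idx;
    alloc_info.setPNext(&import_info);  // chain the import into the allocation

    vk::DeviceMemory mem = device.allocateMemory(alloc_info);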
@@ -5837,9 +5844,26 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context
 
 static void * ggml_vk_host_malloc(vk_device& device, size_t size) {
     VK_LOG_MEMORY("ggml_vk_host_malloc(" << size << ")");
-    vk_buffer buf = ggml_vk_create_buffer(device, size,
-        {vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
-        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent});
+
+    void *malloc_ptr {};
+    vk_buffer buf {};
+    if (device->external_memory_host) {
+        // overallocate to be able to align base and size
+        malloc_ptr = malloc(size + 2 * device->min_imported_host_pointer_alignment);
+        if (!malloc_ptr) {
+            return nullptr;
+        }
+
+        uintptr_t uptr = reinterpret_cast<uintptr_t>(malloc_ptr);
+        uptr = ROUNDUP_POW2(uptr, device->min_imported_host_pointer_alignment);
+        void *ptr = reinterpret_cast<void *>(uptr);
+
+        buf = ggml_vk_buffer_from_host_ptr(device, ptr, ROUNDUP_POW2(size, device->min_imported_host_pointer_alignment));
+    } else {
+        buf = ggml_vk_create_buffer(device, size,
+            {vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
+            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent});
+    }
 
     if(!(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) {
         fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory\n",
@@ -5850,7 +5874,7 @@ static void * ggml_vk_host_malloc(vk_device& device, size_t size) {
     }
 
     std::lock_guard<std::recursive_mutex> guard(device->mutex);
-    device->pinned_memory.push_back(std::make_tuple(buf->ptr, size, buf));
+    device->pinned_memory.push_back(std::make_tuple(buf->ptr, size, buf, malloc_ptr));
 
     return buf->ptr;
 }
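The external-memory branch above relies on an overallocate-and-align trick: malloc() only guarantees the default allocation alignment, while both the imported pointer and the imported size must be multiples of min_imported_host_pointer_alignment. Overallocating by twice the alignment leaves room to round the base pointer up and to round the size up without running past the end of the allocation, and the raw malloc() pointer is kept (as the new fourth tuple element) because free() must later receive it, not the aligned pointer. A standalone sketch of the pattern, assuming a power-of-two align (the helper name is hypothetical):

    #include <cstdint>
    #include <cstdlib>
    #include <utility>

    // Returns {base pointer to pass to free(), aligned pointer to use}.
    static std::pair<void *, void *> aligned_host_alloc(size_t size, size_t align) {
        void * base = std::malloc(size + 2 * align);      // slack to align base and size
        if (!base) {
            return {nullptr, nullptr};
        }
        uintptr_t u = reinterpret_cast<uintptr_t>(base);
        u = (u + align - 1) & ~(uintptr_t)(align - 1);    // ROUNDUP_POW2-style rounding
        return {base, reinterpret_cast<void *>(u)};
    }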
@@ -5879,6 +5903,7 @@ static void ggml_vk_host_free(vk_device& device, void* ptr) {
     }
 
     ggml_vk_destroy_buffer(buf);
+    free(std::get<3>(device->pinned_memory[index]));
 
     device->pinned_memory.erase(device->pinned_memory.begin() + index);
 }
@@ -14622,22 +14647,17 @@ static void ggml_backend_vk_device_event_synchronize(ggml_backend_dev_t dev, ggm
     VK_CHECK(device->device.waitForFences({ vkev->fence }, true, UINT64_MAX), "event_synchronize");
 }
 
-static ggml_backend_buffer_t ggml_backend_vk_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
-    VK_LOG_DEBUG("ggml_backend_vk_device_buffer_from_host_ptr(backend=" << dev << ", ptr=" << ptr << ", size=" << size << ")");
-    GGML_UNUSED(max_tensor_size);
-
-    ggml_backend_buffer_t ret {};
-
-    ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
-    auto device = ggml_vk_get_device(ctx->device);
-
+static vk_buffer ggml_vk_buffer_from_host_ptr(vk_device & device, void * ptr, size_t size) {
     if (!device->external_memory_host) {
-        return ret;
+        return {};
     }
 
     uintptr_t uptr = reinterpret_cast<uintptr_t>(ptr);
     if (uptr & (device->min_imported_host_pointer_alignment - 1)) {
-        return ret;
+        return {};
     }
+    if (size & (device->min_imported_host_pointer_alignment - 1)) {
+        return {};
+    }
 
     vk::MemoryHostPointerPropertiesEXT host_pointer_props;
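The new size check mirrors the existing pointer check: for a power-of-two alignment (and Vulkan guarantees minImportedHostPointerAlignment is a power of two), x & (align - 1) equals x % align, so a nonzero result means x is not a multiple of the alignment. A two-line illustration:

    // Power-of-two divisibility test: align == 1 << k, so (align - 1) masks the
    // low k bits, and a nonzero (x & (align - 1)) means x % align != 0.
    static bool is_aligned(uintptr_t x, size_t align) {
        return (x & (align - 1)) == 0;
    }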
@@ -14645,7 +14665,7 @@ static ggml_backend_buffer_t ggml_backend_vk_device_buffer_from_host_ptr(ggml_ba
         host_pointer_props = device->device.getMemoryHostPointerPropertiesEXT(vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT, ptr);
     } catch (vk::SystemError& e) {
         GGML_LOG_WARN("ggml_vulkan: Failed getMemoryHostPointerPropertiesEXT (%s)\n", e.what());
-        return ret;
+        return {};
     }
     vk::PhysicalDeviceMemoryProperties mem_props = device->physical_device.getMemoryProperties();
 
@@ -14657,14 +14677,14 @@ static ggml_backend_buffer_t ggml_backend_vk_device_buffer_from_host_ptr(ggml_ba
         }
 
         vk::MemoryType memory_type = mem_props.memoryTypes[memory_type_idx];
-        // check for visible+coherent+cache. Other flags (e.g. devicelocal) are allowed
+        // check for visible+coherent+cached. Other flags (e.g. devicelocal) are allowed
         if ((memory_type.propertyFlags & property_flags) == property_flags) {
             property_flags = memory_type.propertyFlags;
             break;
         }
     }
     if (memory_type_idx == 32) {
-        return ret;
+        return {};
     }
 
     vk_buffer buf {};
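For readability, a hedged reconstruction of the scan this hunk sits inside (the diff shows only its interior): the loop walks up to VK_MAX_MEMORY_TYPES (32) entries looking for a type that can back the imported pointer and offers at least host-visible|coherent|cached, with memory_type_idx == 32 as the not-found sentinel. Variable names follow the visible context; the exact loop header is an assumption:

    vk::MemoryPropertyFlags property_flags =
        vk::MemoryPropertyFlagBits::eHostVisible |
        vk::MemoryPropertyFlagBits::eHostCoherent |
        vk::MemoryPropertyFlagBits::eHostCached;

    uint32_t memory_type_idx = 0;
    for (; memory_type_idx < 32; ++memory_type_idx) {
        if (!(host_pointer_props.memoryTypeBits & (1u << memory_type_idx))) {
            continue; // this type cannot back the imported host pointer
        }
        vk::MemoryType memory_type = mem_props.memoryTypes[memory_type_idx];
        // check for visible+coherent+cached. Other flags (e.g. devicelocal) are allowed
        if ((memory_type.propertyFlags & property_flags) == property_flags) {
            property_flags = memory_type.propertyFlags;
            break;
        }
    }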
@@ -14672,16 +14692,27 @@ static ggml_backend_buffer_t ggml_backend_vk_device_buffer_from_host_ptr(ggml_ba
         buf = ggml_vk_create_buffer(device, size, { property_flags }, ptr, memory_type_idx);
     } catch (vk::SystemError& e) {
         GGML_LOG_WARN("ggml_vulkan: Failed ggml_vk_create_buffer (%s)\n", e.what());
-        return ret;
+        return {};
     }
 
+    return buf;
+}
+
+static ggml_backend_buffer_t ggml_backend_vk_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
+    VK_LOG_DEBUG("ggml_backend_vk_device_buffer_from_host_ptr(backend=" << dev << ", ptr=" << ptr << ", size=" << size << ")");
+    GGML_UNUSED(max_tensor_size);
+
+    ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
+    auto device = ggml_vk_get_device(ctx->device);
+
+    vk_buffer buf = ggml_vk_buffer_from_host_ptr(device, ptr, size);
     if (!buf) {
-        return ret;
+        return {};
     }
 
     ggml_backend_vk_buffer_context * bufctx = new ggml_backend_vk_buffer_context(device, std::move(buf), device->name);
 
-    ret = ggml_backend_buffer_init(ggml_backend_vk_device_get_buffer_type(dev), ggml_backend_vk_buffer_interface, bufctx, size);
+    ggml_backend_buffer_t ret = ggml_backend_buffer_init(ggml_backend_vk_device_get_buffer_type(dev), ggml_backend_vk_buffer_interface, bufctx, size);
 
     return ret;
 }
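At the ggml-backend level, the helper refactored above is still reached through the generic device interface. A hedged usage sketch of that entry point (the device selection and fallback are illustrative, not from the patch):

    // Wrap an already-aligned host allocation as a backend buffer.
    ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
    ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, ptr, size, size);
    if (buf == NULL) {
        // pointer/size did not meet the import requirements, or the device lacks
        // VK_EXT_external_memory_host: fall back to a normal pinned allocation
    }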
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -971,7 +971,6 @@ bool llama_model_loader::load_all_data(
     std::vector<ggml_backend_buffer_t> host_buffers;
     std::vector<ggml_backend_event_t> events;
     std::vector<void *> host_ptrs;
-    std::vector<void *> host_base_ptrs;
     size_t buffer_idx = 0; // buffer to use for async loads
     ggml_backend_t upload_backend = [&](const char * func) -> ggml_backend_t {
         if (use_mmap || check_tensors) {
@@ -1016,14 +1015,7 @@ bool llama_model_loader::load_all_data(
 
         // If the backend is supported, create pinned memory buffers and events for synchronisation.
         for (size_t idx = 0; idx < n_buffers; ++idx) {
-            void *base_ptr = malloc(buffer_size + 0x1000);
-            if (!base_ptr) {
-                return nullptr;
-            }
-            uintptr_t uptr = reinterpret_cast<uintptr_t>(base_ptr);
-            uptr = (uptr + 0x1000 - 1) & ~uintptr_t{0x1000 - 1};
-            void *p = reinterpret_cast<void *>(uptr);
-            auto *buf = ggml_backend_dev_buffer_from_host_ptr(dev, p, buffer_size, buffer_size);
+            auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
 
             if (!buf) {
                 LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func,
@@ -1032,8 +1024,7 @@ bool llama_model_loader::load_all_data(
             }
 
             host_buffers.emplace_back(buf);
-            host_ptrs.emplace_back(p);
-            host_base_ptrs.emplace_back(base_ptr);
+            host_ptrs.emplace_back(ggml_backend_buffer_get_base(buf));
 
             auto * event = ggml_backend_event_new(dev);
             if (!event) {
@@ -1191,9 +1182,6 @@ bool llama_model_loader::load_all_data(
         for (auto * buf : host_buffers) {
             ggml_backend_buffer_free(buf);
         }
-        for (auto * ptr : host_base_ptrs) {
-            free(ptr);
-        }
         ggml_backend_free(upload_backend);
 
         // check validation results
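The loader-side revert restores the simple pinned-buffer path: instead of hand-aligning a malloc() block and importing it, the backend's host buffer type supplies correctly aligned pinned memory directly. The restored pattern in isolation (host_buft and buffer_size as in the surrounding code):

    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
    if (buf != NULL) {
        void * ptr = ggml_backend_buffer_get_base(buf); // pinned, backend-aligned
        // ... stage tensor reads through ptr, then ggml_backend_buffer_free(buf)
    }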