From ccffc464f27cda6b564e198982cdaf23c29093f5 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Tue, 30 Dec 2025 10:55:32 -0600 Subject: [PATCH] use external memory for ggml_vk_host_malloc, revert model loader changes --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 75 ++++++++++++++++++++-------- src/llama-model-loader.cpp | 16 +----- 2 files changed, 55 insertions(+), 36 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index e65ef33b67..571fcab56d 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -229,6 +229,7 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = { /* .get_alloc_size = */ ggml_backend_vk_buffer_type_get_alloc_size, /* .is_host = */ NULL, }; +static vk_buffer ggml_vk_buffer_from_host_ptr(vk_device & device, void * ptr, size_t size); #ifdef GGML_VULKAN_MEMORY_DEBUG class vk_memory_logger; @@ -772,7 +773,7 @@ struct vk_device_struct { std::vector all_pipelines; - std::vector> pinned_memory; + std::vector> pinned_memory; vk::Fence fence; vk_buffer sync_staging; @@ -2405,6 +2406,12 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std nullptr, }; + vk::ExternalMemoryBufferCreateInfo external_memory_bci; + if (import_ptr) { + external_memory_bci.handleTypes = vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT; + buffer_create_info.setPNext(&external_memory_bci); + } + buf->buffer = device->device.createBuffer(buffer_create_info); vk::MemoryRequirements mem_req = device->device.getBufferMemoryRequirements(buf->buffer); @@ -5837,9 +5844,26 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context static void * ggml_vk_host_malloc(vk_device& device, size_t size) { VK_LOG_MEMORY("ggml_vk_host_malloc(" << size << ")"); - vk_buffer buf = ggml_vk_create_buffer(device, size, - {vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached, - vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent}); + + void *malloc_ptr {}; + vk_buffer buf {}; + if (device->external_memory_host) { + // overallocate to be able to align base and size + malloc_ptr = malloc(size + 2 * device->min_imported_host_pointer_alignment); + if (!malloc_ptr) { + return nullptr; + } + + uintptr_t uptr = reinterpret_cast(malloc_ptr); + uptr = ROUNDUP_POW2(uptr, device->min_imported_host_pointer_alignment); + void *ptr = reinterpret_cast(uptr); + + buf = ggml_vk_buffer_from_host_ptr(device, ptr, ROUNDUP_POW2(size, device->min_imported_host_pointer_alignment)); + } else { + buf = ggml_vk_create_buffer(device, size, + {vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent}); + } if(!(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) { fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory\n", @@ -5850,7 +5874,7 @@ static void * ggml_vk_host_malloc(vk_device& device, size_t size) { } std::lock_guard guard(device->mutex); - device->pinned_memory.push_back(std::make_tuple(buf->ptr, size, buf)); + device->pinned_memory.push_back(std::make_tuple(buf->ptr, size, buf, malloc_ptr)); return buf->ptr; } @@ -5879,6 +5903,7 @@ static void ggml_vk_host_free(vk_device& device, void* ptr) { } ggml_vk_destroy_buffer(buf); + free(std::get<3>(device->pinned_memory[index])); device->pinned_memory.erase(device->pinned_memory.begin() + index); } @@ -14622,22 +14647,17 @@ static void ggml_backend_vk_device_event_synchronize(ggml_backend_dev_t dev, ggm VK_CHECK(device->device.waitForFences({ vkev->fence }, true, UINT64_MAX), "event_synchronize"); } -static ggml_backend_buffer_t ggml_backend_vk_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { - VK_LOG_DEBUG("ggml_backend_vk_device_buffer_from_host_ptr(backend=" << dev << ", ptr=" << ptr << ", size=" << size << ")"); - GGML_UNUSED(max_tensor_size); - - ggml_backend_buffer_t ret {}; - - ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; - auto device = ggml_vk_get_device(ctx->device); - +static vk_buffer ggml_vk_buffer_from_host_ptr(vk_device & device, void * ptr, size_t size) { if (!device->external_memory_host) { - return ret; + return {}; } uintptr_t uptr = reinterpret_cast(ptr); if (uptr & (device->min_imported_host_pointer_alignment - 1)) { - return ret; + return {}; + } + if (size & (device->min_imported_host_pointer_alignment - 1)) { + return {}; } vk::MemoryHostPointerPropertiesEXT host_pointer_props; @@ -14645,7 +14665,7 @@ static ggml_backend_buffer_t ggml_backend_vk_device_buffer_from_host_ptr(ggml_ba host_pointer_props = device->device.getMemoryHostPointerPropertiesEXT(vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT, ptr); } catch (vk::SystemError& e) { GGML_LOG_WARN("ggml_vulkan: Failed getMemoryHostPointerPropertiesEXT (%s)\n", e.what()); - return ret; + return {}; } vk::PhysicalDeviceMemoryProperties mem_props = device->physical_device.getMemoryProperties(); @@ -14657,14 +14677,14 @@ static ggml_backend_buffer_t ggml_backend_vk_device_buffer_from_host_ptr(ggml_ba } vk::MemoryType memory_type = mem_props.memoryTypes[memory_type_idx]; - // check for visible+coherent+cache. Other flags (e.g. devicelocal) are allowed + // check for visible+coherent+cached. Other flags (e.g. devicelocal) are allowed if ((memory_type.propertyFlags & property_flags) == property_flags) { property_flags = memory_type.propertyFlags; break; } } if (memory_type_idx == 32) { - return ret; + return {}; } vk_buffer buf {}; @@ -14672,16 +14692,27 @@ static ggml_backend_buffer_t ggml_backend_vk_device_buffer_from_host_ptr(ggml_ba buf = ggml_vk_create_buffer(device, size, { property_flags }, ptr, memory_type_idx); } catch (vk::SystemError& e) { GGML_LOG_WARN("ggml_vulkan: Failed ggml_vk_create_buffer (%s)\n", e.what()); - return ret; } + return buf; +} + +static ggml_backend_buffer_t ggml_backend_vk_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { + VK_LOG_DEBUG("ggml_backend_vk_device_buffer_from_host_ptr(backend=" << dev << ", ptr=" << ptr << ", size=" << size << ")"); + GGML_UNUSED(max_tensor_size); + + ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; + auto device = ggml_vk_get_device(ctx->device); + + vk_buffer buf = ggml_vk_buffer_from_host_ptr(device, ptr, size); + if (!buf) { - return ret; + return {}; } ggml_backend_vk_buffer_context * bufctx = new ggml_backend_vk_buffer_context(device, std::move(buf), device->name); - ret = ggml_backend_buffer_init(ggml_backend_vk_device_get_buffer_type(dev), ggml_backend_vk_buffer_interface, bufctx, size); + ggml_backend_buffer_t ret = ggml_backend_buffer_init(ggml_backend_vk_device_get_buffer_type(dev), ggml_backend_vk_buffer_interface, bufctx, size); return ret; } diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 959c406299..5003b4fbf5 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -971,7 +971,6 @@ bool llama_model_loader::load_all_data( std::vector host_buffers; std::vector events; std::vector host_ptrs; - std::vector host_base_ptrs; size_t buffer_idx = 0; // buffer to use for async loads ggml_backend_t upload_backend = [&](const char * func) -> ggml_backend_t { if (use_mmap || check_tensors) { @@ -1016,14 +1015,7 @@ bool llama_model_loader::load_all_data( // If the backend is supported, create pinned memory buffers and events for synchronisation. for (size_t idx = 0; idx < n_buffers; ++idx) { - void *base_ptr = malloc(buffer_size + 0x1000); - if (!base_ptr) { - return nullptr; - } - uintptr_t uptr = reinterpret_cast(base_ptr); - uptr = (uptr + 0x1000 - 1) & ~uintptr_t{0x1000 - 1}; - void *p = reinterpret_cast(uptr); - auto *buf = ggml_backend_dev_buffer_from_host_ptr(dev, p, buffer_size, buffer_size); + auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size); if (!buf) { LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func, @@ -1032,8 +1024,7 @@ bool llama_model_loader::load_all_data( } host_buffers.emplace_back(buf); - host_ptrs.emplace_back(p); - host_base_ptrs.emplace_back(base_ptr); + host_ptrs.emplace_back(ggml_backend_buffer_get_base(buf)); auto * event = ggml_backend_event_new(dev); if (!event) { @@ -1191,9 +1182,6 @@ bool llama_model_loader::load_all_data( for (auto * buf : host_buffers) { ggml_backend_buffer_free(buf); } - for (auto * ptr : host_base_ptrs) { - free(ptr); - } ggml_backend_free(upload_backend); // check validation results