From 43b7e1ed9ed9b42ff72feb396719be13608f2d59 Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Mon, 29 Dec 2025 13:49:38 -0600
Subject: [PATCH 1/6] vulkan: support buffer_from_host_ptr

---
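Note: VK_EXT_external_memory_host lets an allocation wrap an existing,
suitably aligned host pointer instead of backing it with new device memory.
A minimal sketch of the import step, assuming vulkan.hpp, a device with the
extension enabled, and a memory type index already chosen via
getMemoryHostPointerPropertiesEXT (import_host_ptr is illustrative, not a
function in this patch):

    #include <vulkan/vulkan.hpp>

    static vk::DeviceMemory import_host_ptr(vk::Device device, void * ptr,
                                            size_t size, uint32_t memory_type_idx) {
        // Both ptr and size must be multiples of minImportedHostPointerAlignment.
        vk::ImportMemoryHostPointerInfoEXT import_info {};
        import_info.handleType   = vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT;
        import_info.pHostPointer = ptr;
        // Chaining the import struct makes allocateMemory wrap the host
        // allocation instead of allocating fresh device memory.
        return device.allocateMemory({ size, memory_type_idx, &import_info });
    }

A buffer bound to such memory should also carry vk::ExternalMemoryBufferCreateInfo
in its create-info chain; patch 4 of this series adds that.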
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 156 ++++++++++++++++++++++-----
 1 file changed, 128 insertions(+), 28 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 493ee9c9a4..ebf4fb7022 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -523,6 +523,8 @@ struct vk_device_struct {
     uint64_t max_memory_allocation_size;
     uint64_t max_buffer_size;
     uint64_t suballocation_block_size;
+    uint64_t min_imported_host_pointer_alignment;
+    bool external_memory_host {};
     bool fp16;
     bool bf16;
     bool pipeline_robustness;
@@ -2373,7 +2375,8 @@ static std::vector<uint32_t> ggml_vk_find_memory_properties(const vk::PhysicalDe
     return indices;
 }
 
-static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std::initializer_list<vk::MemoryPropertyFlags> & req_flags_list) {
+static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std::initializer_list<vk::MemoryPropertyFlags> & req_flags_list,
+                                       void *import_ptr = nullptr, uint32_t import_memory_type = ~0u) {
     VK_LOG_DEBUG("ggml_vk_create_buffer(" << device->name << ", " << size << ", " << to_string(req_flags_list.begin()[0]) << ", " << to_string(req_flags_list.begin()[req_flags_list.size()-1]) << ")");
     if (size > device->max_buffer_size) {
         throw vk::OutOfDeviceMemoryError("Requested buffer size exceeds device buffer size limit");
@@ -2416,35 +2419,47 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std
         mem_flags_info.setPNext(&mem_priority_info);
     }
 
-    for (auto it = req_flags_list.begin(); it != req_flags_list.end(); it++) {
-        const auto & req_flags = *it;
-
-        const std::vector<uint32_t> memory_type_indices = ggml_vk_find_memory_properties(&mem_props, &mem_req, req_flags);
-
-        if (memory_type_indices.empty()) {
-            continue;
+    if (import_ptr) {
+        buf->memory_property_flags = mem_props.memoryTypes[import_memory_type].propertyFlags;
+        try {
+            vk::ImportMemoryHostPointerInfoEXT import_info;
+            import_info.handleType = vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT;
+            import_info.pHostPointer = import_ptr;
+            import_info.setPNext(&mem_flags_info);
+            buf->device_memory = device->device.allocateMemory({ size, import_memory_type, &import_info });
+        } catch (const vk::SystemError& e) {
         }
-        buf->memory_property_flags = req_flags;
+    } else {
+        for (auto it = req_flags_list.begin(); it != req_flags_list.end(); it++) {
+            const auto & req_flags = *it;
 
-        bool done = false;
+            const std::vector<uint32_t> memory_type_indices = ggml_vk_find_memory_properties(&mem_props, &mem_req, req_flags);
 
-        for (auto mtype_it = memory_type_indices.begin(); mtype_it != memory_type_indices.end(); mtype_it++) {
-            try {
-                buf->device_memory = device->device.allocateMemory({ mem_req.size, *mtype_it, &mem_flags_info });
-                done = true;
-                break;
-            } catch (const vk::SystemError& e) {
-                // loop and retry
-                // during last attempt throw the exception
-                if (it + 1 == req_flags_list.end() && mtype_it + 1 == memory_type_indices.end()) {
-                    device->device.destroyBuffer(buf->buffer);
-                    throw e;
+            if (memory_type_indices.empty()) {
+                continue;
+            }
+            buf->memory_property_flags = req_flags;
+
+            bool done = false;
+
+            for (auto mtype_it = memory_type_indices.begin(); mtype_it != memory_type_indices.end(); mtype_it++) {
+                try {
+                    buf->device_memory = device->device.allocateMemory({ mem_req.size, *mtype_it, &mem_flags_info });
+                    done = true;
+                    break;
+                } catch (const vk::SystemError& e) {
+                    // loop and retry
+                    // during last attempt throw the exception
+                    if (it + 1 == req_flags_list.end() && mtype_it + 1 == memory_type_indices.end()) {
+                        device->device.destroyBuffer(buf->buffer);
+                        throw e;
+                    }
                 }
             }
-        }
 
-        if (done) {
-            break;
+            if (done) {
+                break;
+            }
         }
     }
@@ -2455,8 +2470,12 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std
 
     buf->ptr = nullptr;
 
-    if (buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
-        buf->ptr = device->device.mapMemory(buf->device_memory, 0, VK_WHOLE_SIZE);
+    if (import_ptr) {
+        buf->ptr = import_ptr;
+    } else {
+        if (buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
+            buf->ptr = device->device.mapMemory(buf->device_memory, 0, VK_WHOLE_SIZE);
+        }
     }
 
     device->device.bindBufferMemory(buf->buffer, buf->device_memory, 0);
@@ -4397,6 +4416,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
             } else if (strcmp("VK_EXT_memory_priority", properties.extensionName) == 0 &&
                        getenv("GGML_VK_ENABLE_MEMORY_PRIORITY")) {
                 device->memory_priority = true;
+            } else if (strcmp("VK_EXT_external_memory_host", properties.extensionName) == 0) {
+                device->external_memory_host = true;
             }
         }
@@ -4411,6 +4432,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
     vk::PhysicalDeviceVulkan12Properties vk12_props;
     vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props;
     vk::PhysicalDeviceShaderIntegerDotProductPropertiesKHR shader_integer_dot_product_props;
+    vk::PhysicalDeviceExternalMemoryHostPropertiesEXT external_memory_host_props;
 
     props2.pNext = &props3;
     props3.pNext = &subgroup_props;
@@ -4450,6 +4472,11 @@ static vk_device ggml_vk_get_device(size_t idx) {
         last_struct = (VkBaseOutStructure *)&shader_integer_dot_product_props;
     }
 
+    if (device->external_memory_host) {
+        last_struct->pNext = (VkBaseOutStructure *)&external_memory_host_props;
+        last_struct = (VkBaseOutStructure *)&external_memory_host_props;
+    }
+
     device->physical_device.getProperties2(&props2);
     device->properties = props2.properties;
     device->vendor_id = device->properties.vendorID;
@@ -4536,6 +4563,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
     device->integer_dot_product = device->integer_dot_product && shader_integer_dot_product_props.integerDotProduct4x8BitPackedSignedAccelerated;
 
+    device->min_imported_host_pointer_alignment = external_memory_host_props.minImportedHostPointerAlignment;
+
     device->max_workgroup_size_log2 = uint32_t(log2f(float(device->properties.limits.maxComputeWorkGroupInvocations)));
 
     std::vector<vk::QueueFamilyProperties> queue_family_props = device->physical_device.getQueueFamilyProperties();
@@ -4667,6 +4696,10 @@ static vk_device ggml_vk_get_device(size_t idx) {
         device_extensions.push_back("VK_KHR_pipeline_executable_properties");
     }
 
+    if (device->external_memory_host) {
+        device_extensions.push_back("VK_EXT_external_memory_host");
+    }
+
     vkGetPhysicalDeviceFeatures2(device->physical_device, &device_features2);
 
     device->pipeline_executable_properties_support = pipeline_executable_properties_support;
@@ -14007,10 +14040,13 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
     props->type = ggml_backend_vk_device_get_type(dev);
     props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
     ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
+
+    auto device = ggml_vk_get_device(ctx->device);
+
     props->caps = {
         /* .async                 = */ true,
         /* .host_buffer           = */ true,
-        /* .buffer_from_host_ptr  = */ false,
+        /* .buffer_from_host_ptr  = */ device->external_memory_host,
        /* .events                = */ true,
     };
 }
@@ -14589,6 +14625,70 @@ static void ggml_backend_vk_device_event_synchronize(ggml_backend_dev_t dev, ggm
     VK_CHECK(device->device.waitForFences({ vkev->fence }, true, UINT64_MAX), "event_synchronize");
 }
 
+static ggml_backend_buffer_t ggml_backend_vk_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
+    VK_LOG_DEBUG("ggml_backend_vk_device_buffer_from_host_ptr(backend=" << dev << ", ptr=" << ptr << ", size=" << size << ")");
+    GGML_UNUSED(max_tensor_size);
+
+    ggml_backend_buffer_t ret {};
+
+    ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
+    auto device = ggml_vk_get_device(ctx->device);
+
+    if (!device->external_memory_host) {
+        return ret;
+    }
+
+    uintptr_t uptr = reinterpret_cast<uintptr_t>(ptr);
+    if (uptr & (device->min_imported_host_pointer_alignment - 1)) {
+        return ret;
+    }
+
+    vk::MemoryHostPointerPropertiesEXT host_pointer_props;
+    try {
+        host_pointer_props = device->device.getMemoryHostPointerPropertiesEXT(vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT, ptr);
+    } catch (vk::SystemError& e) {
+        GGML_LOG_WARN("ggml_vulkan: Failed getMemoryHostPointerPropertiesEXT (%s)\n", e.what());
+        return ret;
+    }
+    vk::PhysicalDeviceMemoryProperties mem_props = device->physical_device.getMemoryProperties();
+
+    uint32_t memory_type_idx;
+    vk::MemoryPropertyFlags property_flags = vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached;
+    for (memory_type_idx = 0; memory_type_idx < 32; ++memory_type_idx) {
+        if (!(host_pointer_props.memoryTypeBits & (1u << memory_type_idx))) {
+            continue;
+        }
+
+        vk::MemoryType memory_type = mem_props.memoryTypes[memory_type_idx];
+        // check for visible+coherent+cache. Other flags (e.g. devicelocal) are allowed
+        if ((memory_type.propertyFlags & property_flags) == property_flags) {
+            property_flags = memory_type.propertyFlags;
+            break;
+        }
+    }
+    if (memory_type_idx == 32) {
+        return ret;
+    }
+
+    vk_buffer buf {};
+    try {
+        buf = ggml_vk_create_buffer(device, size, { property_flags }, ptr, memory_type_idx);
+    } catch (vk::SystemError& e) {
+        GGML_LOG_WARN("ggml_vulkan: Failed ggml_vk_create_buffer (%s)\n", e.what());
+        return ret;
+    }
+
+    if (!buf) {
+        return ret;
+    }
+
+    ggml_backend_vk_buffer_context * bufctx = new ggml_backend_vk_buffer_context(device, std::move(buf), device->name);
+
+    ret = ggml_backend_buffer_init(ggml_backend_vk_device_get_buffer_type(dev), ggml_backend_vk_buffer_interface, bufctx, size);
+
+    return ret;
+}
+
 static const struct ggml_backend_device_i ggml_backend_vk_device_i = {
     /* .get_name             = */ ggml_backend_vk_device_get_name,
     /* .get_description      = */ ggml_backend_vk_device_get_description,
@@ -14598,7 +14698,7 @@ static const struct ggml_backend_device_i ggml_backend_vk_device_i = {
     /* .init_backend         = */ ggml_backend_vk_device_init,
     /* .get_buffer_type      = */ ggml_backend_vk_device_get_buffer_type,
     /* .get_host_buffer_type = */ ggml_backend_vk_device_get_host_buffer_type,
-    /* .buffer_from_host_ptr = */ NULL,
+    /* .buffer_from_host_ptr = */ ggml_backend_vk_device_buffer_from_host_ptr,
     /* .supports_op          = */ ggml_backend_vk_device_supports_op,
     /* .supports_buft        = */ ggml_backend_vk_device_supports_buft,
     /* .offload_op           = */ ggml_backend_vk_device_offload_op,
From a151dcebda4dceb3e4c17bd6d85662af3cd3913e Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Mon, 29 Dec 2025 13:50:40 -0600
Subject: [PATCH 2/6] hacky use of buffer_from_host_ptr for directio

---
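Note: the loader change below over-allocates with plain malloc and rounds the
base pointer up to a 4 KiB boundary before handing it to
ggml_backend_dev_buffer_from_host_ptr, keeping the original pointer around for
free(). A self-contained sketch of that rounding (align_up is illustrative;
align must be a power of two):

    #include <cstdint>
    #include <cstdlib>

    // Round base up to the next multiple of align.
    static void * align_up(void * base, uintptr_t align) {
        uintptr_t u = reinterpret_cast<uintptr_t>(base);
        // e.g. u = 0x789A, align = 0x1000: (0x789A + 0xFFF) & ~0xFFF = 0x8000
        return reinterpret_cast<void *>((u + align - 1) & ~(align - 1));
    }

    // Usage mirroring the hunk below:
    //   void * base_ptr = malloc(buffer_size + 0x1000);
    //   void * p        = align_up(base_ptr, 0x1000);

Hard-coding 0x1000 assumes minImportedHostPointerAlignment is at most 4 KiB;
patch 4 replaces this with the queried device limit.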
 src/llama-model-loader.cpp | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 5003b4fbf5..959c406299 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -971,6 +971,7 @@ bool llama_model_loader::load_all_data(
     std::vector<ggml_backend_buffer_t> host_buffers;
     std::vector<ggml_backend_event_t> events;
     std::vector<void *> host_ptrs;
+    std::vector<void *> host_base_ptrs;
     size_t buffer_idx = 0; // buffer to use for async loads
     ggml_backend_t upload_backend = [&](const char * func) -> ggml_backend_t {
         if (use_mmap || check_tensors) {
@@ -1015,7 +1016,14 @@ bool llama_model_loader::load_all_data(
         // If the backend is supported, create pinned memory buffers and events for synchronisation.
         for (size_t idx = 0; idx < n_buffers; ++idx) {
-            auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
+            void *base_ptr = malloc(buffer_size + 0x1000);
+            if (!base_ptr) {
+                return nullptr;
+            }
+            uintptr_t uptr = reinterpret_cast<uintptr_t>(base_ptr);
+            uptr = (uptr + 0x1000 - 1) & ~uintptr_t{0x1000 - 1};
+            void *p = reinterpret_cast<void *>(uptr);
+            auto *buf = ggml_backend_dev_buffer_from_host_ptr(dev, p, buffer_size, buffer_size);
             if (!buf) {
                 LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func,
@@ -1024,7 +1032,8 @@ bool llama_model_loader::load_all_data(
             }
 
             host_buffers.emplace_back(buf);
-            host_ptrs.emplace_back(ggml_backend_buffer_get_base(buf));
+            host_ptrs.emplace_back(p);
+            host_base_ptrs.emplace_back(base_ptr);
 
             auto * event = ggml_backend_event_new(dev);
             if (!event) {
@@ -1182,6 +1191,9 @@ bool llama_model_loader::load_all_data(
             for (auto * buf : host_buffers) {
                 ggml_backend_buffer_free(buf);
             }
+            for (auto * ptr : host_base_ptrs) {
+                free(ptr);
+            }
             ggml_backend_free(upload_backend);
 
             // check validation results

From 8240a9c3b9643e46dd4f30456ea9a23686d37bab Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Mon, 29 Dec 2025 22:07:54 -0600
Subject: [PATCH 3/6] disable buffer_from_host_ptr cap

---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index ebf4fb7022..e65ef33b67 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -14040,13 +14040,10 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
     props->type = ggml_backend_vk_device_get_type(dev);
     props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
     ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
-
-    auto device = ggml_vk_get_device(ctx->device);
-
     props->caps = {
         /* .async                 = */ true,
         /* .host_buffer           = */ true,
-        /* .buffer_from_host_ptr  = */ device->external_memory_host,
+        /* .buffer_from_host_ptr  = */ false,
         /* .events                = */ true,
     };
 }

From ccffc464f27cda6b564e198982cdaf23c29093f5 Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Tue, 30 Dec 2025 10:55:32 -0600
Subject: [PATCH 4/6] use external memory for ggml_vk_host_malloc, revert model loader changes

---
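Note: ggml_vk_host_malloc now over-allocates by twice the import alignment so
that both the base pointer and the size can be rounded up: rounding the base
consumes at most (alignment - 1) bytes and rounding the size adds at most
(alignment - 1) more, so the aligned window always fits inside the malloc'd
block. A sketch with illustrative names, assuming ROUNDUP_POW2(x, a) rounds x
up to the next multiple of the power-of-two a, which is how the code below
uses it:

    #include <cstdint>
    #include <cstdlib>

    static uintptr_t round_up_pow2(uintptr_t x, uintptr_t a) {
        return (x + a - 1) & ~(a - 1);
    }

    struct host_alloc {
        void * malloc_ptr;   // pass to free()
        void * aligned_ptr;  // pass to the Vulkan import
        size_t aligned_size; // multiple of the alignment
    };

    static host_alloc host_alloc_aligned(size_t size, uintptr_t a) {
        void * p = malloc(size + 2 * a); // room to align both base and size
        if (!p) {
            return { nullptr, nullptr, 0 };
        }
        uintptr_t u = round_up_pow2(reinterpret_cast<uintptr_t>(p), a);
        return { p, reinterpret_cast<void *>(u), size_t(round_up_pow2(size, a)) };
    }

The malloc pointer is stashed as a fourth tuple element in pinned_memory so
ggml_vk_host_free can release it.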
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 75 ++++++++++++++++++++--------
 src/llama-model-loader.cpp           | 16 ++-----------
 2 files changed, 55 insertions(+), 36 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index e65ef33b67..571fcab56d 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -229,6 +229,7 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
     /* .get_alloc_size = */ ggml_backend_vk_buffer_type_get_alloc_size,
     /* .is_host        = */ NULL,
 };
+static vk_buffer ggml_vk_buffer_from_host_ptr(vk_device & device, void * ptr, size_t size);
 
 #ifdef GGML_VULKAN_MEMORY_DEBUG
 class vk_memory_logger;
@@ -772,7 +773,7 @@ struct vk_device_struct {
 
     std::vector<vk_pipeline_ref> all_pipelines;
 
-    std::vector<std::tuple<void*, size_t, vk_buffer>> pinned_memory;
+    std::vector<std::tuple<void*, size_t, vk_buffer, void*>> pinned_memory;
 
     vk::Fence fence;
     vk_buffer sync_staging;
@@ -2405,6 +2406,12 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std
         nullptr,
     };
 
+    vk::ExternalMemoryBufferCreateInfo external_memory_bci;
+    if (import_ptr) {
+        external_memory_bci.handleTypes = vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT;
+        buffer_create_info.setPNext(&external_memory_bci);
+    }
+
     buf->buffer = device->device.createBuffer(buffer_create_info);
 
     vk::MemoryRequirements mem_req = device->device.getBufferMemoryRequirements(buf->buffer);
@@ -5837,9 +5844,26 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context
 
 static void * ggml_vk_host_malloc(vk_device& device, size_t size) {
     VK_LOG_MEMORY("ggml_vk_host_malloc(" << size << ")");
-    vk_buffer buf = ggml_vk_create_buffer(device, size,
-        {vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
-        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent});
+
+    void *malloc_ptr {};
+    vk_buffer buf {};
+    if (device->external_memory_host) {
+        // overallocate to be able to align base and size
+        malloc_ptr = malloc(size + 2 * device->min_imported_host_pointer_alignment);
+        if (!malloc_ptr) {
+            return nullptr;
+        }
+
+        uintptr_t uptr = reinterpret_cast<uintptr_t>(malloc_ptr);
+        uptr = ROUNDUP_POW2(uptr, device->min_imported_host_pointer_alignment);
+        void *ptr = reinterpret_cast<void *>(uptr);
+
+        buf = ggml_vk_buffer_from_host_ptr(device, ptr, ROUNDUP_POW2(size, device->min_imported_host_pointer_alignment));
+    } else {
+        buf = ggml_vk_create_buffer(device, size,
+            {vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
+            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent});
+    }
 
     if(!(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) {
         fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory\n",
@@ -5850,7 +5874,7 @@ static void * ggml_vk_host_malloc(vk_device& device, size_t size) {
     }
 
     std::lock_guard<std::mutex> guard(device->mutex);
-    device->pinned_memory.push_back(std::make_tuple(buf->ptr, size, buf));
+    device->pinned_memory.push_back(std::make_tuple(buf->ptr, size, buf, malloc_ptr));
 
     return buf->ptr;
 }
@@ -5879,6 +5903,7 @@ static void ggml_vk_host_free(vk_device& device, void* ptr) {
     }
 
     ggml_vk_destroy_buffer(buf);
+    free(std::get<3>(device->pinned_memory[index]));
 
     device->pinned_memory.erase(device->pinned_memory.begin() + index);
 }
@@ -14622,22 +14647,17 @@ static void ggml_backend_vk_device_event_synchronize(ggml_backend_dev_t dev, ggm
     VK_CHECK(device->device.waitForFences({ vkev->fence }, true, UINT64_MAX), "event_synchronize");
 }
 
-static ggml_backend_buffer_t ggml_backend_vk_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
-    VK_LOG_DEBUG("ggml_backend_vk_device_buffer_from_host_ptr(backend=" << dev << ", ptr=" << ptr << ", size=" << size << ")");
-    GGML_UNUSED(max_tensor_size);
-
-    ggml_backend_buffer_t ret {};
-
-    ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
-    auto device = ggml_vk_get_device(ctx->device);
-
+static vk_buffer ggml_vk_buffer_from_host_ptr(vk_device & device, void * ptr, size_t size) {
     if (!device->external_memory_host) {
-        return ret;
+        return {};
     }
 
     uintptr_t uptr = reinterpret_cast<uintptr_t>(ptr);
     if (uptr & (device->min_imported_host_pointer_alignment - 1)) {
-        return ret;
+        return {};
+    }
+    if (size & (device->min_imported_host_pointer_alignment - 1)) {
+        return {};
     }
 
     vk::MemoryHostPointerPropertiesEXT host_pointer_props;
     try {
         host_pointer_props = device->device.getMemoryHostPointerPropertiesEXT(vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT, ptr);
     } catch (vk::SystemError& e) {
         GGML_LOG_WARN("ggml_vulkan: Failed getMemoryHostPointerPropertiesEXT (%s)\n", e.what());
-        return ret;
+        return {};
     }
     vk::PhysicalDeviceMemoryProperties mem_props = device->physical_device.getMemoryProperties();
 
@@ -14657,14 +14677,14 @@ static ggml_backend_buffer_t ggml_backend_vk_device_buffer_from_host_ptr(ggml_ba
         }
 
         vk::MemoryType memory_type = mem_props.memoryTypes[memory_type_idx];
-        // check for visible+coherent+cache. Other flags (e.g. devicelocal) are allowed
+        // check for visible+coherent+cached. Other flags (e.g. devicelocal) are allowed
         if ((memory_type.propertyFlags & property_flags) == property_flags) {
             property_flags = memory_type.propertyFlags;
             break;
         }
     }
     if (memory_type_idx == 32) {
-        return ret;
+        return {};
     }
 
     vk_buffer buf {};
@@ -14672,16 +14692,27 @@ static ggml_backend_buffer_t ggml_backend_vk_device_buffer_from_host_ptr(ggml_ba
         buf = ggml_vk_create_buffer(device, size, { property_flags }, ptr, memory_type_idx);
     } catch (vk::SystemError& e) {
         GGML_LOG_WARN("ggml_vulkan: Failed ggml_vk_create_buffer (%s)\n", e.what());
-        return ret;
     }
 
+    return buf;
+}
+
+static ggml_backend_buffer_t ggml_backend_vk_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
+    VK_LOG_DEBUG("ggml_backend_vk_device_buffer_from_host_ptr(backend=" << dev << ", ptr=" << ptr << ", size=" << size << ")");
+    GGML_UNUSED(max_tensor_size);
+
+    ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
+    auto device = ggml_vk_get_device(ctx->device);
+
+    vk_buffer buf = ggml_vk_buffer_from_host_ptr(device, ptr, size);
+
     if (!buf) {
-        return ret;
+        return {};
     }
 
     ggml_backend_vk_buffer_context * bufctx = new ggml_backend_vk_buffer_context(device, std::move(buf), device->name);
 
-    ret = ggml_backend_buffer_init(ggml_backend_vk_device_get_buffer_type(dev), ggml_backend_vk_buffer_interface, bufctx, size);
+    ggml_backend_buffer_t ret = ggml_backend_buffer_init(ggml_backend_vk_device_get_buffer_type(dev), ggml_backend_vk_buffer_interface, bufctx, size);
 
     return ret;
 }
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 959c406299..5003b4fbf5 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -971,7 +971,6 @@ bool llama_model_loader::load_all_data(
     std::vector<ggml_backend_buffer_t> host_buffers;
     std::vector<ggml_backend_event_t> events;
     std::vector<void *> host_ptrs;
-    std::vector<void *> host_base_ptrs;
     size_t buffer_idx = 0; // buffer to use for async loads
     ggml_backend_t upload_backend = [&](const char * func) -> ggml_backend_t {
         if (use_mmap || check_tensors) {
@@ -1016,14 +1015,7 @@ bool llama_model_loader::load_all_data(
         // If the backend is supported, create pinned memory buffers and events for synchronisation.
         for (size_t idx = 0; idx < n_buffers; ++idx) {
-            void *base_ptr = malloc(buffer_size + 0x1000);
-            if (!base_ptr) {
-                return nullptr;
-            }
-            uintptr_t uptr = reinterpret_cast<uintptr_t>(base_ptr);
-            uptr = (uptr + 0x1000 - 1) & ~uintptr_t{0x1000 - 1};
-            void *p = reinterpret_cast<void *>(uptr);
-            auto *buf = ggml_backend_dev_buffer_from_host_ptr(dev, p, buffer_size, buffer_size);
+            auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
             if (!buf) {
                 LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func,
@@ -1032,8 +1024,7 @@ bool llama_model_loader::load_all_data(
             }
 
             host_buffers.emplace_back(buf);
-            host_ptrs.emplace_back(p);
-            host_base_ptrs.emplace_back(base_ptr);
+            host_ptrs.emplace_back(ggml_backend_buffer_get_base(buf));
 
             auto * event = ggml_backend_event_new(dev);
             if (!event) {
@@ -1191,9 +1182,6 @@ bool llama_model_loader::load_all_data(
             for (auto * buf : host_buffers) {
                 ggml_backend_buffer_free(buf);
             }
-            for (auto * ptr : host_base_ptrs) {
-                free(ptr);
-            }
             ggml_backend_free(upload_backend);
 
             // check validation results

From 57a53944a02aab41fdc1f91503942a08e9197b06 Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Tue, 30 Dec 2025 14:56:43 -0600
Subject: [PATCH 5/6] disable external_memory_host for MoltenVK

---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 571fcab56d..2eaab16bf3 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -4489,6 +4489,12 @@ static vk_device ggml_vk_get_device(size_t idx) {
     device->vendor_id = device->properties.vendorID;
     device->driver_id = driver_props.driverID;
 
+    if (device->driver_id == vk::DriverId::eMoltenvk) {
+        // Disable external_memory_host until https://github.com/KhronosGroup/MoltenVK/pull/2622
+        // is available in the Vulkan SDK.
+        device->external_memory_host = false;
+    }
+
     // Implementing the async backend interfaces seems broken on older Intel HW,
     // see https://github.com/ggml-org/llama.cpp/issues/17302.
     device->support_async = (device->vendor_id != VK_VENDOR_ID_INTEL ||

From 51682440b0c98f8cdd17adbc66739fe7024b493d Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Thu, 1 Jan 2026 15:19:48 -0600
Subject: [PATCH 6/6] take buffer memory types into account

---
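Note: with this change a usable memory type must appear in three masks at
once -- the types the driver reports for the host pointer, the types the
buffer itself accepts (the earlier patches ignored this one), and the
requested property flags. A sketch of that intersection with illustrative
names; the patch iterates a fixed 32 types (VK_MAX_MEMORY_TYPES), while this
sketch uses memoryTypeCount:

    #include <vulkan/vulkan.hpp>

    static int pick_import_memory_type(uint32_t host_ptr_type_bits, // VkMemoryHostPointerPropertiesEXT::memoryTypeBits
                                       uint32_t buffer_type_bits,   // VkMemoryRequirements::memoryTypeBits
                                       const vk::PhysicalDeviceMemoryProperties & mem_props,
                                       vk::MemoryPropertyFlags required) {
        for (uint32_t i = 0; i < mem_props.memoryTypeCount; ++i) {
            if (!(host_ptr_type_bits & (1u << i)) || !(buffer_type_bits & (1u << i))) {
                continue; // not importable here, or not bindable to this buffer
            }
            if ((mem_props.memoryTypes[i].propertyFlags & required) == required) {
                return int(i); // extra flags (e.g. device-local) are acceptable
            }
        }
        return -1; // no compatible type; the import has to fail
    }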
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 68 ++++++++++++++++------------
 1 file changed, 38 insertions(+), 30 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 2eaab16bf3..755498ec1d 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -2377,7 +2377,7 @@ static std::vector<uint32_t> ggml_vk_find_memory_properties(const vk::PhysicalDe
 }
 
 static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std::initializer_list<vk::MemoryPropertyFlags> & req_flags_list,
-                                       void *import_ptr = nullptr, uint32_t import_memory_type = ~0u) {
+                                       void *import_ptr = nullptr) {
     VK_LOG_DEBUG("ggml_vk_create_buffer(" << device->name << ", " << size << ", " << to_string(req_flags_list.begin()[0]) << ", " << to_string(req_flags_list.begin()[req_flags_list.size()-1]) << ")");
     if (size > device->max_buffer_size) {
         throw vk::OutOfDeviceMemoryError("Requested buffer size exceeds device buffer size limit");
@@ -2427,13 +2427,46 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std
     }
 
     if (import_ptr) {
-        buf->memory_property_flags = mem_props.memoryTypes[import_memory_type].propertyFlags;
+        vk::MemoryHostPointerPropertiesEXT host_pointer_props;
+        try {
+            host_pointer_props = device->device.getMemoryHostPointerPropertiesEXT(vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT, import_ptr);
+        } catch (vk::SystemError& e) {
+            GGML_LOG_WARN("ggml_vulkan: Failed getMemoryHostPointerPropertiesEXT (%s)\n", e.what());
+            device->device.destroyBuffer(buf->buffer);
+            return {};
+        }
+        vk::PhysicalDeviceMemoryProperties mem_props = device->physical_device.getMemoryProperties();
+
+        uint32_t memory_type_idx;
+        vk::MemoryPropertyFlags property_flags = *req_flags_list.begin();
+        for (memory_type_idx = 0; memory_type_idx < 32; ++memory_type_idx) {
+            if (!(host_pointer_props.memoryTypeBits & (1u << memory_type_idx))) {
+                continue;
+            }
+            if (!(mem_req.memoryTypeBits & (1u << memory_type_idx))) {
+                continue;
+            }
+
+            vk::MemoryType memory_type = mem_props.memoryTypes[memory_type_idx];
+            // check for visible+coherent+cached. Other flags (e.g. devicelocal) are allowed
+            if ((memory_type.propertyFlags & property_flags) == property_flags) {
+                property_flags = memory_type.propertyFlags;
+                break;
+            }
+        }
+        if (memory_type_idx == 32) {
+            GGML_LOG_WARN("ggml_vulkan: Memory type for host allocation not found\n");
+            device->device.destroyBuffer(buf->buffer);
+            return {};
+        }
+
+        buf->memory_property_flags = mem_props.memoryTypes[memory_type_idx].propertyFlags;
         try {
             vk::ImportMemoryHostPointerInfoEXT import_info;
             import_info.handleType = vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT;
             import_info.pHostPointer = import_ptr;
             import_info.setPNext(&mem_flags_info);
-            buf->device_memory = device->device.allocateMemory({ size, import_memory_type, &import_info });
+            buf->device_memory = device->device.allocateMemory({ size, memory_type_idx, &import_info });
         } catch (const vk::SystemError& e) {
         }
     } else {
@@ -14666,36 +14699,11 @@ static vk_buffer ggml_vk_buffer_from_host_ptr(vk_device & device, void * ptr, si
         return {};
     }
 
-    vk::MemoryHostPointerPropertiesEXT host_pointer_props;
-    try {
-        host_pointer_props = device->device.getMemoryHostPointerPropertiesEXT(vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT, ptr);
-    } catch (vk::SystemError& e) {
-        GGML_LOG_WARN("ggml_vulkan: Failed getMemoryHostPointerPropertiesEXT (%s)\n", e.what());
-        return {};
-    }
-    vk::PhysicalDeviceMemoryProperties mem_props = device->physical_device.getMemoryProperties();
-
-    uint32_t memory_type_idx;
-    vk::MemoryPropertyFlags property_flags = vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached;
-    for (memory_type_idx = 0; memory_type_idx < 32; ++memory_type_idx) {
-        if (!(host_pointer_props.memoryTypeBits & (1u << memory_type_idx))) {
-            continue;
-        }
-
-        vk::MemoryType memory_type = mem_props.memoryTypes[memory_type_idx];
-        // check for visible+coherent+cached. Other flags (e.g. devicelocal) are allowed
-        if ((memory_type.propertyFlags & property_flags) == property_flags) {
-            property_flags = memory_type.propertyFlags;
-            break;
-        }
-    }
-    if (memory_type_idx == 32) {
-        return {};
-    }
+    const vk::MemoryPropertyFlags property_flags = vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached;
 
     vk_buffer buf {};
     try {
-        buf = ggml_vk_create_buffer(device, size, { property_flags }, ptr, memory_type_idx);
+        buf = ggml_vk_create_buffer(device, size, { property_flags }, ptr);
     } catch (vk::SystemError& e) {
         GGML_LOG_WARN("ggml_vulkan: Failed ggml_vk_create_buffer (%s)\n", e.what());
     }