From ebcec195db5e1ba692298d8b56bbdc2b8f9ad747 Mon Sep 17 00:00:00 2001
From: Gong-Mi <550230171@qq.com>
Date: Fri, 2 Jan 2026 08:07:08 +0800
Subject: [PATCH 1/2] vulkan: optimize UMA memory allocation and fix
 ARM-specific tuning logic

---
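The UMA hunk in this patch passes an ordered list of memory property flag sets
to ggml_vk_create_buffer, which tries each set in turn. A minimal sketch of
that priority-fallback pattern, assuming a hypothetical find_memory_type()
helper that mirrors the standard Vulkan memory-type search (this is not the
actual ggml helper):

    // Sketch only: return the first memory type that satisfies any flag set
    // in the given priority order.
    #include <vulkan/vulkan.hpp>
    #include <initializer_list>
    #include <cstdint>

    static uint32_t find_memory_type(const vk::PhysicalDeviceMemoryProperties & props,
                                     uint32_t type_bits, vk::MemoryPropertyFlags flags) {
        for (uint32_t i = 0; i < props.memoryTypeCount; i++) {
            if ((type_bits & (1u << i)) &&
                (props.memoryTypes[i].propertyFlags & flags) == flags) {
                return i;
            }
        }
        return UINT32_MAX;
    }

    static uint32_t pick_memory_type(const vk::PhysicalDeviceMemoryProperties & props,
                                     uint32_t type_bits,
                                     std::initializer_list<vk::MemoryPropertyFlags> priority) {
        for (vk::MemoryPropertyFlags flags : priority) {
            uint32_t idx = find_memory_type(props, type_bits, flags);
            if (idx != UINT32_MAX) {
                return idx;    // first (most preferred) match wins
            }
        }
        return UINT32_MAX;     // caller must handle the allocation failure
    }

Under this scheme the UMA path asks for DeviceLocal|HostVisible|HostCoherent
first (ideal on unified memory), then plain DeviceLocal, then
HostVisible|HostCoherent as the last resort.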
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 541e4a50b7..22f2dd33bf 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -90,6 +90,7 @@ static bool is_pow2(uint32_t x) { return x > 1 && (x & (x-1)) == 0; }
 
 #define VK_VENDOR_ID_AMD 0x1002
 #define VK_VENDOR_ID_APPLE 0x106b
+#define VK_VENDOR_ID_ARM 0x13B5
 #define VK_VENDOR_ID_INTEL 0x8086
 #define VK_VENDOR_ID_NVIDIA 0x10de
 
@@ -2528,7 +2529,8 @@ static vk_buffer ggml_vk_create_buffer_device(vk_device& device, size_t size) {
             vk::MemoryPropertyFlagBits::eDeviceLocal});
     } else if (device->uma) {
         // Fall back to host memory type
-        buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal,
+        buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent,
+                                                   vk::MemoryPropertyFlagBits::eDeviceLocal,
                                                    vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent});
     } else if (device->disable_host_visible_vidmem) {
         if (device->allow_sysmem_fallback) {
@@ -2930,7 +2932,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
         s_warptile_mmqid_int_k = { mul_mat_subgroup_size_32, 32, 32, 32, 32, 32, 1, 2, 1, 1, mul_mat_subgroup_size_16 };
 
         // chip specific tuning
-        if ((device->architecture == AMD_GCN) && (device->driver_id != vk::DriverId::eAmdProprietary)) {
+        if (device->vendor_id == VK_VENDOR_ID_ARM) {
+            m_warptile_mmq = m_warptile_mmq_int = { 64, 64, 64, 16, 16, 32, 2, 2, 2, 1, 16 };
+            m_warptile = { 64, 64, 64, 16, 16, 32, 2, 2, 2, 1, 16 };
+            m_warptile_id = m_warptile_mmqid = { 64, 64, 64, 16, 16, 32, 2, 2, 2, 1, 16 };
+        } else if ((device->architecture == AMD_GCN) && (device->driver_id != vk::DriverId::eAmdProprietary)) {
             m_warptile_mmq = m_warptile_mmq_int = { 256, 64, 64, 32, 16, 16, 2, 2, 2, 1, 16 };
             m_warptile_mmqid = m_warptile_mmqid_int = { 256, 64, 64, 32, 16, 16, 2, 2, 2, 1, 16 };
         }
@@ -4487,6 +4493,14 @@ static vk_device ggml_vk_get_device(size_t idx) {
         device->vendor_id = device->properties.vendorID;
         device->driver_id = driver_props.driverID;
 
+        if (device->vendor_id == VK_VENDOR_ID_ARM) {
+            // Forcing FP32 path as it currently provides better performance on Mali G720.
+            // This is a simplified approach while deeper ARM-specific vec2 SIMD optimizations are investigated.
+            fp16_storage = false;
+            fp16_compute = false;
+            bfloat16_support = false;
+        }
+
         // Implementing the async backend interfaces seems broken on older Intel HW,
         // see https://github.com/ggml-org/llama.cpp/issues/17302.
         device->support_async = (device->vendor_id != VK_VENDOR_ID_INTEL ||
@@ -4521,6 +4535,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
 
         if (GGML_VK_SUBALLOCATION_BLOCK_SIZE != nullptr) {
             device->suballocation_block_size = std::stoull(GGML_VK_SUBALLOCATION_BLOCK_SIZE);
+        } else if (device->vendor_id == VK_VENDOR_ID_ARM) {
+            // Limit batching of allocations to 256MB on Mali GPUs to avoid fragmentation issues
+            device->suballocation_block_size = 256 * 1024 * 1024;
         } else {
             // Limit batching of allocations to 1GB by default to avoid fragmentation issues
             device->suballocation_block_size = 1024*1024*1024;

From 56b8f27fabe16c506eef7d3a2882b921ad108954 Mon Sep 17 00:00:00 2001
From: Gong-Mi <550230171@qq.com>
Date: Fri, 2 Jan 2026 12:04:32 +0800
Subject: [PATCH 2/2] vulkan: optimize ARM Mali tile sizes and re-enable FP16

- Tunes l_warptile to match m_warptile (64x64) for ARM GPUs, fixing low
  occupancy on medium-sized matrices.
- Re-enables FP16/BF16 support for ARM as the tiling fix resolves the
  performance regression.
- Adds comments clarifying the UMA memory allocation fallback strategy.
---
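Patch 1 masked the FP16 feature flags for ARM devices; this patch drops that
mask so the flags again follow what the device reports. As a hedged sketch of
how such flags are typically derived from the Vulkan 1.1/1.2 feature structs
(names here are illustrative, not the ggml internals):

    #include <vulkan/vulkan.hpp>

    struct fp_caps {
        bool fp16_storage;    // 16-bit values allowed in storage buffers
        bool fp16_compute;    // float16_t arithmetic allowed in shaders
    };

    static fp_caps query_fp_caps(vk::PhysicalDevice pdev) {
        vk::PhysicalDeviceVulkan11Features v11;
        vk::PhysicalDeviceVulkan12Features v12;
        vk::PhysicalDeviceFeatures2 feats2;
        feats2.pNext = &v11;    // chain the 1.1 feature struct
        v11.pNext    = &v12;    // chain the 1.2 feature struct
        pdev.getFeatures2(&feats2);

        return fp_caps{
            v11.storageBuffer16BitAccess == VK_TRUE,
            v12.shaderFloat16 == VK_TRUE,
        };
    }

With this series applied, ARM devices keep whatever these queries report
instead of being forced down the FP32 path.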
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 22f2dd33bf..81d4e2eafc 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -2529,6 +2529,9 @@ static vk_buffer ggml_vk_create_buffer_device(vk_device& device, size_t size) {
             vk::MemoryPropertyFlagBits::eDeviceLocal});
     } else if (device->uma) {
         // Fall back to host memory type
+        // 1. Prefer Device Local AND Host Visible (best for UMA)
+        // 2. Fall back to Device Local
+        // 3. Fall back to Host Visible
         buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent,
                                                    vk::MemoryPropertyFlagBits::eDeviceLocal,
                                                    vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent});
@@ -2542,6 +2545,9 @@ static vk_buffer ggml_vk_create_buffer_device(vk_device& device, size_t size) {
     } else {
         // use rebar if available, otherwise fallback to device only visible memory
        if (device->allow_sysmem_fallback) {
+            // 1. Prefer Device Local AND Host Visible (ReBAR)
+            // 2. Fall back to Device Local
+            // 3. Fall back to Host Visible
             buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent,
                                                        vk::MemoryPropertyFlagBits::eDeviceLocal,
                                                        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent});
@@ -2936,6 +2942,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
             m_warptile_mmq = m_warptile_mmq_int = { 64, 64, 64, 16, 16, 32, 2, 2, 2, 1, 16 };
             m_warptile = { 64, 64, 64, 16, 16, 32, 2, 2, 2, 1, 16 };
             m_warptile_id = m_warptile_mmqid = { 64, 64, 64, 16, 16, 32, 2, 2, 2, 1, 16 };
+
+            l_warptile_mmq = l_warptile_mmq_int = m_warptile_mmq;
+            l_warptile = m_warptile;
+            l_warptile_id = l_warptile_mmqid = m_warptile_id;
         } else if ((device->architecture == AMD_GCN) && (device->driver_id != vk::DriverId::eAmdProprietary)) {
             m_warptile_mmq = m_warptile_mmq_int = { 256, 64, 64, 32, 16, 16, 2, 2, 2, 1, 16 };
             m_warptile_mmqid = m_warptile_mmqid_int = { 256, 64, 64, 32, 16, 16, 2, 2, 2, 1, 16 };
@@ -4494,11 +4504,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
         device->driver_id = driver_props.driverID;
 
         if (device->vendor_id == VK_VENDOR_ID_ARM) {
-            // Forcing FP32 path as it currently provides better performance on Mali G720.
-            // This is a simplified approach while deeper ARM-specific vec2 SIMD optimizations are investigated.
-            fp16_storage = false;
-            fp16_compute = false;
-            bfloat16_support = false;
+            // Previously forced FP32 here due to poor FP16 performance on some ARM GPUs.
+            // With the adjusted l_warptile (see ggml_vk_load_shaders), FP16 is now performant and preferred.
         }
 
         // Implementing the async backend interfaces seems broken on older Intel HW,
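Taken together, the series leaves the suballocation policy as: an explicit
environment override wins, then Mali gets a smaller 256MB default, and
everyone else keeps 1GB. A condensed sketch of that decision, assuming the
GGML_VK_SUBALLOCATION_BLOCK_SIZE variable in the tree holds the result of a
getenv() lookup (the helper function itself is hypothetical):

    #include <cstdint>
    #include <cstdlib>
    #include <string>

    #define VK_VENDOR_ID_ARM 0x13B5

    static uint64_t pick_suballocation_block_size(uint32_t vendor_id) {
        // Explicit user override always wins.
        const char * env = std::getenv("GGML_VK_SUBALLOCATION_BLOCK_SIZE");
        if (env != nullptr) {
            return std::stoull(env);
        }
        if (vendor_id == VK_VENDOR_ID_ARM) {
            // Smaller blocks reduce fragmentation pressure on Mali.
            return 256ull * 1024 * 1024;
        }
        // 1GB default elsewhere.
        return 1024ull * 1024 * 1024;
    }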