From ebcec195db5e1ba692298d8b56bbdc2b8f9ad747 Mon Sep 17 00:00:00 2001
From: Gong-Mi <550230171@qq.com>
Date: Fri, 2 Jan 2026 08:07:08 +0800
Subject: [PATCH] vulkan: optimize UMA memory allocation and add ARM-specific
 tuning

On UMA devices, prefer memory types that are DeviceLocal and
HostVisible|HostCoherent at the same time, before falling back to
DeviceLocal only and then to plain host memory, so device buffers do
not need a separate host copy.

For ARM (Mali) GPUs:
- add VK_VENDOR_ID_ARM and select smaller matmul warptiles
- force the FP32 path, which currently performs better than FP16 on
  Mali G720, while deeper ARM-specific vec2 SIMD optimizations are
  investigated
- reduce the default suballocation block size from 1GB to 256MB to
  avoid fragmentation
---
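Reviewer note (below the ---, so not part of the commit message): the
UMA hunk adds a combined DeviceLocal|HostVisible|HostCoherent candidate
ahead of the existing ones. Assuming ggml_vk_create_buffer tries each
candidate property set in order until one matches (its exact signature
is not shown in this patch), the selection amounts to the sketch below;
find_memory_type is a hypothetical helper, not code from this change:

    static int32_t find_memory_type(const vk::PhysicalDeviceMemoryProperties & props,
                                    uint32_t memory_type_bits,
                                    std::initializer_list<vk::MemoryPropertyFlags> candidates) {
        // Take the first memory type satisfying the earliest candidate set.
        // On UMA parts a DeviceLocal|HostVisible|HostCoherent type usually
        // exists, which avoids shadowing buffers in separate host memory.
        for (vk::MemoryPropertyFlags wanted : candidates) {
            for (uint32_t i = 0; i < props.memoryTypeCount; i++) {
                if ((memory_type_bits & (1u << i)) &&
                    (props.memoryTypes[i].propertyFlags & wanted) == wanted) {
                    return (int32_t) i;
                }
            }
        }
        return -1; // no candidate matched
    }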
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 541e4a50b7..22f2dd33bf 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -90,6 +90,7 @@ static bool is_pow2(uint32_t x) { return x > 1 && (x & (x-1)) == 0; }
 
 #define VK_VENDOR_ID_AMD 0x1002
 #define VK_VENDOR_ID_APPLE 0x106b
+#define VK_VENDOR_ID_ARM 0x13b5
 #define VK_VENDOR_ID_INTEL 0x8086
 #define VK_VENDOR_ID_NVIDIA 0x10de
 
@@ -2528,7 +2529,8 @@ static vk_buffer ggml_vk_create_buffer_device(vk_device& device, size_t size) {
                                            vk::MemoryPropertyFlagBits::eDeviceLocal});
     } else if (device->uma) {
         // Fall back to host memory type
-        buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal,
+        buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent,
+                                                   vk::MemoryPropertyFlagBits::eDeviceLocal,
                                                    vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent});
     } else if (device->disable_host_visible_vidmem) {
         if (device->allow_sysmem_fallback) {
@@ -2930,7 +2932,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
         s_warptile_mmqid_int_k = { mul_mat_subgroup_size_32, 32, 32, 32, 32, 32, 1, 2, 1, 1, mul_mat_subgroup_size_16 };
 
         // chip specific tuning
-        if ((device->architecture == AMD_GCN) && (device->driver_id != vk::DriverId::eAmdProprietary)) {
+        if (device->vendor_id == VK_VENDOR_ID_ARM) {
+            m_warptile_mmq = m_warptile_mmq_int = { 64, 64, 64, 16, 16, 32, 2, 2, 2, 1, 16 };
+            m_warptile = { 64, 64, 64, 16, 16, 32, 2, 2, 2, 1, 16 };
+            m_warptile_id = m_warptile_mmqid = { 64, 64, 64, 16, 16, 32, 2, 2, 2, 1, 16 };
+        } else if ((device->architecture == AMD_GCN) && (device->driver_id != vk::DriverId::eAmdProprietary)) {
             m_warptile_mmq = m_warptile_mmq_int = { 256, 64, 64, 32, 16, 16, 2, 2, 2, 1, 16 };
             m_warptile_mmqid = m_warptile_mmqid_int = { 256, 64, 64, 32, 16, 16, 2, 2, 2, 1, 16 };
         }
@@ -4487,6 +4493,14 @@ static vk_device ggml_vk_get_device(size_t idx) {
         device->vendor_id = device->properties.vendorID;
         device->driver_id = driver_props.driverID;
 
+        if (device->vendor_id == VK_VENDOR_ID_ARM) {
+            // Force the FP32 path, which currently performs better on Mali G720.
+            // Simplified approach while deeper ARM-specific vec2 SIMD optimizations are investigated.
+            fp16_storage = false;
+            fp16_compute = false;
+            bfloat16_support = false;
+        }
+
         // Implementing the async backend interfaces seems broken on older Intel HW,
         // see https://github.com/ggml-org/llama.cpp/issues/17302.
         device->support_async = (device->vendor_id != VK_VENDOR_ID_INTEL ||
@@ -4521,6 +4535,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
         if (GGML_VK_SUBALLOCATION_BLOCK_SIZE != nullptr) {
             device->suballocation_block_size = std::stoull(GGML_VK_SUBALLOCATION_BLOCK_SIZE);
+        } else if (device->vendor_id == VK_VENDOR_ID_ARM) {
+            // Limit batching of allocations to 256MB on Mali GPUs to avoid fragmentation issues
+            device->suballocation_block_size = 256 * 1024 * 1024;
         } else {
             // Limit batching of allocations to 1GB by default to avoid fragmentation issues
             device->suballocation_block_size = 1024*1024*1024;
         }
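
Reviewer note on the last hunk: the resulting precedence for the
suballocation block size is an explicit GGML_VK_SUBALLOCATION_BLOCK_SIZE
override first, then the new 256MB ARM default, then the existing 1GB
default. As a standalone sketch (pick_suballocation_block_size is a
hypothetical name, not code from this patch):

    #include <cstdint>
    #include <cstdlib>
    #include <string>

    static uint64_t pick_suballocation_block_size(uint32_t vendor_id) {
        // An explicit environment override always wins.
        if (const char * env = std::getenv("GGML_VK_SUBALLOCATION_BLOCK_SIZE")) {
            return std::stoull(env);
        }
        // Mali GPUs get a smaller block to avoid fragmentation issues.
        if (vendor_id == 0x13b5 /* VK_VENDOR_ID_ARM */) {
            return 256ull * 1024 * 1024;   // 256MB
        }
        return 1024ull * 1024 * 1024;      // 1GB default
    }

So e.g. GGML_VK_SUBALLOCATION_BLOCK_SIZE=134217728 still drops the block
size to 128MB on Mali as well.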