vulkan: optimize ARM Mali tile sizes and re-enable FP16

- Tunes l_warptile to match m_warptile (64x64) for ARM GPUs, fixing low occupancy on medium-sized matrices.

- Re-enables FP16/BF16 support for ARM as the tiling fix resolves the performance regression.

- Adds comments clarifying the UMA memory allocation fallback strategy.
This commit is contained in:
Gong-Mi 2026-01-02 12:04:32 +08:00
parent ebcec195db
commit 56b8f27fab
1 changed file with 12 additions and 5 deletions

View File

@@ -2529,6 +2529,9 @@ static vk_buffer ggml_vk_create_buffer_device(vk_device& device, size_t size) {
vk::MemoryPropertyFlagBits::eDeviceLocal});
} else if (device->uma) {
// Fall back to host memory type
// 1. Prefer Device Local AND Host Visible (Best for UMA)
// 2. Fallback to Device Local
// 3. Fallback to Host Visible
buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent,
vk::MemoryPropertyFlagBits::eDeviceLocal,
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent});
@@ -2542,6 +2545,9 @@ static vk_buffer ggml_vk_create_buffer_device(vk_device& device, size_t size) {
} else {
// use rebar if available, otherwise fallback to device only visible memory
if (device->allow_sysmem_fallback) {
// 1. Prefer Device Local AND Host Visible (ReBAR)
// 2. Fallback to Device Local
// 3. Fallback to Host Visible
buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent,
vk::MemoryPropertyFlagBits::eDeviceLocal,
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent});
@@ -2936,6 +2942,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
m_warptile_mmq = m_warptile_mmq_int = { 64, 64, 64, 16, 16, 32, 2, 2, 2, 1, 16 };
m_warptile = { 64, 64, 64, 16, 16, 32, 2, 2, 2, 1, 16 };
m_warptile_id = m_warptile_mmqid = { 64, 64, 64, 16, 16, 32, 2, 2, 2, 1, 16 };
l_warptile_mmq = l_warptile_mmq_int = m_warptile_mmq;
l_warptile = m_warptile;
l_warptile_id = l_warptile_mmqid = m_warptile_id;
} else if ((device->architecture == AMD_GCN) && (device->driver_id != vk::DriverId::eAmdProprietary)) {
m_warptile_mmq = m_warptile_mmq_int = { 256, 64, 64, 32, 16, 16, 2, 2, 2, 1, 16 };
m_warptile_mmqid = m_warptile_mmqid_int = { 256, 64, 64, 32, 16, 16, 2, 2, 2, 1, 16 };
@@ -4494,11 +4504,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
device->driver_id = driver_props.driverID;
if (device->vendor_id == VK_VENDOR_ID_ARM) {
// Forcing FP32 path as it currently provides better performance on Mali G720.
// This is a simplified approach while deeper ARM-specific vec2 SIMD optimizations are investigated.
fp16_storage = false;
fp16_compute = false;
bfloat16_support = false;
// Previously forced FP32 here due to poor FP16 performance on some ARM GPUs.
// With adjusted l_warptile (below), FP16 is now performant and preferred.
}
// Implementing the async backend interfaces seems broken on older Intel HW,