From 3ed9183ac9974c5589e2e6c450e375cf010cb88f Mon Sep 17 00:00:00 2001
From: Ruben Ortlam
Date: Wed, 11 Feb 2026 00:41:14 +0100
Subject: [PATCH] use minimal subgroup size on Intel

---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 861e17512e..74abeb4b53 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -2783,6 +2783,26 @@ static constexpr uint32_t coopmat1_flash_attention_num_large_rows = 16;
 static constexpr uint32_t scalar_flash_attention_Bc = 64;
 static constexpr uint32_t scalar_flash_attention_workgroup_size = 128;
 
+static bool fa_disable_subgroups(const vk_device& device, FaCodePath path) {
+    return device->vendor_id == VK_VENDOR_ID_INTEL && path == FA_SCALAR;
+}
+
+static uint32_t fa_subgroup_size(const vk_device& device, FaCodePath path) {
+    if (fa_disable_subgroups(device, path)) {
+        return 0xFFFFFFFF;
+    }
+
+    if (path == FA_VECTOR) {
+        if (device->vendor_id == VK_VENDOR_ID_AMD && device->subgroup_min_size <= 32 && device->subgroup_max_size >= 32) {
+            return 32;
+        } else if (device->vendor_id == VK_VENDOR_ID_INTEL && device->subgroup_size_control) {
+            return device->subgroup_min_size;
+        }
+    }
+
+    return device->subgroup_size;
+}
+
 static std::array<uint32_t, 2> fa_rows_cols(FaCodePath path, uint32_t hsk, uint32_t hsv, uint32_t clamp, ggml_type type, FaRows rows, bool small_cache) {
     GGML_UNUSED(clamp);
 
@@ -3223,17 +3243,18 @@ static void ggml_vk_load_shaders(vk_device& device) {
                 break;
         }
 
+        const uint32_t subgroup_size = fa_subgroup_size(device, path);
+
         // D_split can't be larger than a subgroup because we use subgroupShuffle to reduce it.
         // D_split can't be larger than the LSB of D divided by 4 due to vectorization in the shader.
         const uint32_t D_lsb = D ^ (D & (D-1));
-        uint32_t D_split = std::min(std::min(device->subgroup_size, 8u), D_lsb / 4);
+        uint32_t D_split = std::min(std::min(subgroup_size, 8u), D_lsb / 4);
 
         // Nvidia prefers shared memory use to load large tiles of K/V.
         // Switch to loading from global memory when it would use too much shared memory.
         // AMD prefers loading K directly from global memory
         const uint32_t shmem_staging = device->vendor_id == VK_VENDOR_ID_NVIDIA && hsk < 256 && hsv < 256 ? 1 : 0;
 
-        const uint32_t subgroup_size = disable_subgroups ? 0xFFFFFFFF : device->subgroup_size;
         return {wg_size, rows_cols[0], rows_cols[1], hsk, hsv, clamp, D_split, subgroup_size, shmem_staging, flags};
     };
 
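
As a sanity check, the selection logic can be exercised on its own. The sketch below mirrors the fa_subgroup_size() added above, but is standalone: DeviceInfo and the two vendor-ID constants are stand-ins for ggml's vk_device handle and the VK_VENDOR_ID_* values (the PCI vendor IDs 0x1002 for AMD and 0x8086 for Intel), and the example device is illustrative (Intel GPUs typically report a subgroup size range of 8..32).

#include <cstdint>
#include <cstdio>

enum FaCodePath { FA_SCALAR, FA_VECTOR };

// Stand-in for the fields of ggml's vk_device that the patch reads.
struct DeviceInfo {
    uint32_t vendor_id;
    uint32_t subgroup_size;         // default subgroup size of the device
    uint32_t subgroup_min_size;     // min/max from VK_EXT_subgroup_size_control
    uint32_t subgroup_max_size;
    bool     subgroup_size_control; // whether the extension is usable
};

static constexpr uint32_t VENDOR_ID_AMD   = 0x1002; // PCI vendor IDs
static constexpr uint32_t VENDOR_ID_INTEL = 0x8086;

// Mirrors the patched fa_subgroup_size(): 0xFFFFFFFF is the sentinel for
// "run the scalar flash-attention shader without subgroup operations".
static uint32_t fa_subgroup_size(const DeviceInfo& device, FaCodePath path) {
    if (device.vendor_id == VENDOR_ID_INTEL && path == FA_SCALAR) {
        return 0xFFFFFFFF;
    }
    if (path == FA_VECTOR) {
        if (device.vendor_id == VENDOR_ID_AMD &&
            device.subgroup_min_size <= 32 && device.subgroup_max_size >= 32) {
            return 32; // pin wave32 on AMD when the device supports it
        }
        if (device.vendor_id == VENDOR_ID_INTEL && device.subgroup_size_control) {
            return device.subgroup_min_size; // minimal subgroup size on Intel
        }
    }
    return device.subgroup_size; // all other vendors keep the default
}

int main() {
    // Illustrative Intel device: default SIMD16, supports SIMD8..SIMD32.
    DeviceInfo intel = { VENDOR_ID_INTEL, 16, 8, 32, true };
    printf("Intel scalar: 0x%08X\n", (unsigned) fa_subgroup_size(intel, FA_SCALAR)); // 0xFFFFFFFF
    printf("Intel vector: %u\n",     (unsigned) fa_subgroup_size(intel, FA_VECTOR)); // 8
    return 0;
}

The value returned here ends up in the spec-constant array built at the end of the second hunk, so on the Intel scalar path the shader sees the 0xFFFFFFFF sentinel; per fa_disable_subgroups(), the intent is that this path avoids subgroup operations, while the vector path on Intel pins the minimal supported subgroup size via VK_EXT_subgroup_size_control.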