diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 977aff62d8..17fa488498 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -39,6 +39,7 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher();
 #include
 #include
 #include
+#include <optional>
 
 #if defined(_MSC_VER)
 # define NOMINMAX 1
@@ -266,7 +267,8 @@ enum vk_device_architecture {
     AMD_RDNA1,
     AMD_RDNA2,
     AMD_RDNA3,
-    INTEL_XE2,
+    INTEL_PRE_XE2,
+    INTEL_XE2_ONWARD,
     NVIDIA_PRE_TURING,
     NVIDIA_TURING,
 };
@@ -340,12 +342,15 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice&
         props2.pNext = &subgroup_size_control_props;
         device.getProperties2(&props2);
 
-        if (subgroup_size_control_props.minSubgroupSize == 16) {
-            // Xe2 architecture uses SIMD16 while previous Xe and Gen architecture uses SIMD8.
-            // Minimum subgroup size matches the SIMD width so we distinguish architecture by checking this value.
-            // https://www.intel.com/content/www/us/en/content-details/824434/2024-intel-tech-tour-xe2-and-lunar-lake-s-gpu.html
-            // https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2025-0/intel-xe-gpu-architecture.html
-            return vk_device_architecture::INTEL_XE2;
+        // The Xe2 architecture uses SIMD16, while the previous Xe and Gen architectures use SIMD8.
+        // The minimum subgroup size matches the SIMD width, so we distinguish the architectures by checking this value.
+        // https://www.intel.com/content/www/us/en/content-details/824434/2024-intel-tech-tour-xe2-and-lunar-lake-s-gpu.html
+        // https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2025-0/intel-xe-gpu-architecture.html
+        switch (subgroup_size_control_props.minSubgroupSize) {
+        case 8:
+            return vk_device_architecture::INTEL_PRE_XE2;
+        case 16:
+            return vk_device_architecture::INTEL_XE2_ONWARD;
         }
     } else if (props.vendorID == VK_VENDOR_ID_NVIDIA) {
         const std::vector<vk::ExtensionProperties> ext_props = device.enumerateDeviceExtensionProperties();
@@ -605,7 +610,6 @@ struct vk_device_struct {
     bool support_async;
     bool async_use_transfer_queue;
     uint32_t subgroup_size;
-    uint32_t subgroup_size_log2;
     uint32_t shader_core_count;
     bool uma;
     bool prefer_host_memory;
@@ -3103,14 +3111,30 @@ static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vec
     return supported;
 }
 
+// A specific pipeline's configuration.
+struct PipelineConfigParameter {
+    uint32_t subgroup_size;
+    // True if this pipeline requires a full subgroup,
+    // false if not. Empty means don't care (use the default).
+    std::optional<bool> require_full_subgroup;
+    // Calculates the specialization constants used for a specific pipeline.
+    // If empty, the defaults are used.
+    // Some kernels must calculate their specialization constants based on the
+    // subgroup size, so this provides an interface to override the defaults.
+    std::function<std::vector<uint32_t>(const PipelineConfigParameter &, const std::vector<uint32_t> &)>
+        calc_specialization_constants;
+};
+
+// Pipeline configuration for a target GPU.
+// This may contain a group of pipelines.
 struct GpuPipelineConfig {
     // GPU architecture identifier.
     // Example: vk_device_architecture::AMD_GCN
     vk_device_architecture arch;
 
-    // Mapping of pipeline names to their specific subgroup sizes.
-    // Example: {"soft_max_f32", 64}
-    std::unordered_map<std::string, uint32_t> pipelines;
+    // Mapping of pipeline names to their specific configuration parameters.
+    // Example: {"soft_max_f32", {64}}
+    std::unordered_map<std::string, PipelineConfigParameter> pipelines;
 
     // Default subgroup size for this GPU.
     // Defaults to 0 if not explicitly provided.
@@ -3118,19 +3142,41 @@ struct GpuPipelineConfig {
     uint32_t default_subgroup_size = 0;
 };
 
 // Pipeline configuration for RDNA1 GPUs.
-static const std::unordered_map<std::string, uint32_t> rdna1_pipelines = {
-    {"soft_max", 64}, {"im2col", 64},
-    {"argmax", 64}, {"mul_mat_vec", 64},
-    {"mul_mat_vec_f16", 32}, {"mul_mat_vec_f32_f16", 32}
+static const std::unordered_map<std::string, PipelineConfigParameter> rdna1_pipelines = {
+    {"soft_max", {64}},
+    {"im2col", {64}},
+    {"argmax", {64}},
+    {"mul_mat_vec", {64}},
+    {"mul_mat_vec_f16", {32}},
+    {"mul_mat_vec_f32_f16", {32}},
 };
 
 // Pipeline configuration for RDNA2 GPUs.
-static const std::unordered_map<std::string, uint32_t> rdna2_pipelines = {
-    {"soft_max", 64}, {"im2col", 64},
+static const std::unordered_map<std::string, PipelineConfigParameter> rdna2_pipelines = {
+    {"soft_max", {64}},
+    {"im2col", {64}},
 };
 
 static constexpr uint32_t RDNA_DEFAULT_SUBGROUP_SIZE = 32;
 
+static std::vector<uint32_t> calc_specialization_constant_intel_xe2_onward(const PipelineConfigParameter& config, const std::vector<uint32_t>& current) {
+    std::vector<uint32_t> output = current;
+    // Replace subgroup_size_8 (elements 4 and 10 of m_warptile_mmq) with the configured subgroup size.
+    output[4] = config.subgroup_size;
+    output[10] = config.subgroup_size;
+    return output;
+}
+
+static const std::unordered_map<std::string, PipelineConfigParameter> xe2_onward_pipelines = {
+    {"matmul_id_subgroup_q4_k_f32_f16acc_aligned_m", {16, {}, calc_specialization_constant_intel_xe2_onward}},
+    {"matmul_id_subgroup_q6_k_f32_f16acc_aligned_m", {16, {}, calc_specialization_constant_intel_xe2_onward}},
+};
+
+// Intel GPUs can use a subgroup size of 8, 16, or 32 depending on the architecture:
+// pre-Xe2 supports 8, 16, or 32, while Xe2 onward supports 16 or 32. The driver
+// default is 32 if nothing is specified, so we explicitly default to 16 here.
+static constexpr uint32_t INTEL_DEFAULT_SUBGROUP_SIZE = 16;
+
 // Define configurations for different GPUs.
 static std::vector<GpuPipelineConfig> gpu_pipeline_configs = {
     {
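To make the new override hook concrete, here is a hypothetical table entry; the pipeline names, values, and the `example_calc_constants` helper are invented for illustration and are not part of this patch. `PipelineConfigParameter` aggregate-initializes as `{subgroup_size, require_full_subgroup, calc_specialization_constants}`, so a bare `{64}` only pins the subgroup size:

```cpp
// Illustrative only: not part of the patch.
static std::vector<uint32_t> example_calc_constants(const PipelineConfigParameter & p,
                                                    const std::vector<uint32_t> & defaults) {
    std::vector<uint32_t> out = defaults;
    out[0] = p.subgroup_size; // assumes constant 0 carries the subgroup width for this kernel
    return out;
}

static const std::unordered_map<std::string, PipelineConfigParameter> example_pipelines = {
    // {name, {subgroup_size, require_full_subgroup, calc_specialization_constants}}
    {"soft_max_f32", {32, true, example_calc_constants}},  // override everything
    {"im2col",       {32, {},   nullptr}},                 // only pin the subgroup size
};
```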
@@ -3147,44 +3193,80 @@ static std::vector<GpuPipelineConfig> gpu_pipeline_configs = {
         },
         RDNA_DEFAULT_SUBGROUP_SIZE
     },
+    {
+        vk_device_architecture::INTEL_PRE_XE2,
+        {
+        },
+        INTEL_DEFAULT_SUBGROUP_SIZE
+    },
+    {
+        vk_device_architecture::INTEL_XE2_ONWARD,
+        {
+            xe2_onward_pipelines,
+        },
+        INTEL_DEFAULT_SUBGROUP_SIZE
+    }
 };
 
-static uint32_t get_subgroup_size(const std::string &pipeline_name, const vk_device_architecture &arch) {
-    for (const auto &config : gpu_pipeline_configs) {
+static bool get_gpu_pipeline_config(GpuPipelineConfig* output, const vk_device_architecture& arch) {
+    for (const auto & config : gpu_pipeline_configs) {
         if (config.arch == arch) {
-            auto pipIt = config.pipelines.find(pipeline_name);
-            if (pipIt != config.pipelines.end()) {
-                return pipIt->second;
-            }
-            std::vector<std::pair<std::string, uint32_t>> sorted_pipelines(config.pipelines.begin(), config.pipelines.end());
-            std::sort(sorted_pipelines.begin(), sorted_pipelines.end(),
-                      [](const auto &a, const auto &b) { return a.first.size() > b.first.size(); });
-            for (const auto &entry : sorted_pipelines) {
-                if (pipeline_name.find(entry.first) != std::string::npos) {
-                    return entry.second;
-                }
-            }
-            return config.default_subgroup_size;
+            *output = config;
+            return true;
         }
     }
-    return 0; // If no matching configuration is found
+    return false;
+}
+
+static bool get_pipeline_config_parameter(PipelineConfigParameter* output, const GpuPipelineConfig& config, const std::string &pipeline_name) {
+    auto pipIt = config.pipelines.find(pipeline_name);
+    if (pipIt != config.pipelines.end()) {
+        *output = pipIt->second;
+        return true;
+    }
+    std::vector<std::pair<std::string, PipelineConfigParameter>> sorted_pipelines(config.pipelines.begin(), config.pipelines.end());
+    std::sort(sorted_pipelines.begin(), sorted_pipelines.end(),
+              [](const auto &a, const auto &b) { return a.first.size() > b.first.size(); });
+    for (const auto &entry : sorted_pipelines) {
+        if (pipeline_name.find(entry.first) != std::string::npos) {
+            *output = entry.second;
+            return true;
+        }
+    }
+    return false;
+}
+
+// Get the default subgroup size for the given device.
+static uint32_t get_subgroup_size(const vk_device& device) {
+    // Use the GPU's configured default subgroup size if we have a matching configuration.
+    // If not, fall back to the device-reported default.
+    GpuPipelineConfig gpu_config = {};
+    auto have_config = get_gpu_pipeline_config(&gpu_config, device->architecture);
+    if (have_config) {
+        return gpu_config.default_subgroup_size;
+    }
+    return device->subgroup_size;
 }
 
 static void ggml_vk_load_shaders(vk_device& device) {
     VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");
 
     std::lock_guard<std::mutex> guard(device->mutex);
 
+    const uint32_t default_subgroup_size = get_subgroup_size(device);
+    const uint32_t subgroup_size_log2 = uint32_t(log2f(float(default_subgroup_size)));
+
     // some shaders have a minimum subgroup size
-    const uint32_t subgroup_size_8 = std::max(device->subgroup_size, 8u);
-    const uint32_t subgroup_size_16 = std::max(device->subgroup_size, 16u);
-    const uint32_t subgroup_size_32 = std::max(device->subgroup_size, 32u);
-    const uint32_t mul_mat_subgroup_size = (device->vendor_id == VK_VENDOR_ID_INTEL && device->subgroup_size_control) ? device->subgroup_min_size : device->subgroup_size;
+    const uint32_t subgroup_size_8 = std::max(default_subgroup_size, 8u);
+    const uint32_t subgroup_size_16 = std::max(default_subgroup_size, 16u);
+    const uint32_t subgroup_size_32 = std::max(default_subgroup_size, 32u);
+
+    const uint32_t mul_mat_subgroup_size = (device->vendor_id == VK_VENDOR_ID_INTEL && device->subgroup_size_control) ? device->subgroup_min_size : default_subgroup_size;
     const uint32_t mul_mat_subgroup_size_8 = std::max(mul_mat_subgroup_size, 8u);
     const uint32_t mul_mat_subgroup_size_16 = std::max(mul_mat_subgroup_size, 16u);
     const uint32_t mul_mat_subgroup_size_32 = std::max(mul_mat_subgroup_size, 32u);
 
-    const bool subgroup_min_size_16 = (!device->subgroup_size_control && device->subgroup_size >= 16) ||
+    const bool subgroup_min_size_16 = (!device->subgroup_size_control && default_subgroup_size >= 16) ||
         (device->subgroup_size_control && device->subgroup_max_size >= 16);
 
     // mulmat
@@ -3229,9 +3311,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
         s_mmq_wg_denoms_k = { 32, 64, 1 };
 
         // spec constants and tile sizes for quant matmul_id
-        l_warptile_mmqid = { 256, 128, 128, 32, 1, device->subgroup_size };
-        m_warptile_mmqid = { 256, 128, 64, 32, 0, device->subgroup_size };
-        s_warptile_mmqid = { 256, 128, 64, 32, 0, device->subgroup_size };
+        l_warptile_mmqid = { 256, 128, 128, 32, 1, default_subgroup_size };
+        m_warptile_mmqid = { 256, 128, 64, 32, 0, default_subgroup_size };
+        s_warptile_mmqid = { 256, 128, 64, 32, 0, default_subgroup_size };
         l_mmqid_wg_denoms = { 128, 128, 1 };
         m_mmqid_wg_denoms = { 128, 64, 1 };
         s_mmqid_wg_denoms = { 128, 64, 1 };
@@ -3251,7 +3333,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
     const uint32_t tk_m = device->coopmat_support ? device->coopmat_k : 1;
     const uint32_t tk_s = device->coopmat_support ? device->coopmat_k : 1;
 
-    const uint32_t s_warptile_wm = device->subgroup_size == 8 ? 8 : 32;
+    const uint32_t s_warptile_wm = default_subgroup_size < 32 ? default_subgroup_size : 32;
 
     l_warptile = { 128, 128, 128, 16, subgroup_size_8 * 2, 64, 2, tm_l, tn_l, tk_l, subgroup_size_8 };
     m_warptile = { 128, 64, 64, 16, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
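A brief usage sketch of the lookup pair above (the pipeline name here is illustrative): an exact key wins first; otherwise the longest key that occurs as a substring of the pipeline name wins, which is why the entries are sorted by descending key length.

```cpp
GpuPipelineConfig cfg = {};
if (get_gpu_pipeline_config(&cfg, vk_device_architecture::AMD_RDNA1)) {
    PipelineConfigParameter p = {};
    // No exact entry for this name, so substring matching kicks in:
    // "mul_mat_vec_f16" (15 chars) beats "mul_mat_vec" (11 chars).
    if (get_pipeline_config_parameter(&p, cfg, "mul_mat_vec_f16_f32_aligned")) {
        // p.subgroup_size == 32, taken from the "mul_mat_vec_f16" entry
    }
}
```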
@@ -3296,7 +3378,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
         l_warptile = { 256, 128, 128, 16, subgroup_size_8, 64, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
         l_warptile_mmq = l_warptile_mmq_int = { 256, 128, 128, 32, subgroup_size_8, 64, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
         l_warptile_mmq_int_k = { 256, 128, 128, 32, subgroup_size_16, 64, 1, 4, 2, 1, subgroup_size_16 };
-    } else if (device->vendor_id == VK_VENDOR_ID_INTEL && device->coopmat_support && device->architecture == INTEL_XE2) {
+    } else if (device->vendor_id == VK_VENDOR_ID_INTEL && device->coopmat_support && device->architecture == INTEL_XE2_ONWARD) {
         // Xe2/Xe3 with coopmat enabled - warptile performance tuning
         l_warptile = { 512, 128, 128, 16, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
         l_warptile_mmq = { 512, 128, 128, 32, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
@@ -3358,9 +3440,35 @@ static void ggml_vk_load_shaders(vk_device& device) {
     auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& base_pipeline, const char *name, size_t spv_size, const void* spv_data, const char *entrypoint,
                                               uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants,
                                               uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) {
 
-        if (!require_full_subgroups && required_subgroup_size == 0) {
-            required_subgroup_size = get_subgroup_size(name, device->architecture);
+        // Override the subgroup size and specialization constants based on the pipeline name.
+        GpuPipelineConfig gpu_config = {};
+        PipelineConfigParameter pipeline_param = {};
+        bool param_found = false;
+        auto gpu_config_found = get_gpu_pipeline_config(&gpu_config, device->architecture);
+        if (gpu_config_found) {
+            param_found = get_pipeline_config_parameter(&pipeline_param, gpu_config, std::string(name));
+        }
+
+        std::vector<uint32_t> target_specialization_constants = specialization_constants;
+        if (gpu_config_found && param_found) {
+            // We have a GPU configuration and specific parameters for this pipeline.
+            // Overwrite all valid parameters, assuming the config author knows what they are doing.
+            if (pipeline_param.subgroup_size) {
+                required_subgroup_size = pipeline_param.subgroup_size;
+            }
+            if (pipeline_param.require_full_subgroup.has_value()) {
+                require_full_subgroups = pipeline_param.require_full_subgroup.value();
+            }
+            if (pipeline_param.calc_specialization_constants) {
+                target_specialization_constants = pipeline_param.calc_specialization_constants(pipeline_param, specialization_constants);
+            }
+        } else if (gpu_config_found && !param_found) {
+            // Only a GPU config was given. Just apply its default subgroup size
+            // if the caller did not request a specific one.
+            if (required_subgroup_size == 0) {
+                required_subgroup_size = gpu_config.default_subgroup_size;
+            }
         }
 
         vk_pipeline *ptr = &base_pipeline;
@@ -3403,8 +3514,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
             compile_count++;
         }
 
         compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), spv_size, spv_data, entrypoint,
-                                      parameter_count, wg_denoms, specialization_constants, disable_robustness, require_full_subgroups, required_subgroup_size));
+                                      parameter_count, wg_denoms, target_specialization_constants, disable_robustness, require_full_subgroups, required_subgroup_size));
     }
 };
@@ -4004,7 +4116,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
         m_wg_denoms = { 64, 64, 1 };
         s_wg_denoms = { 32, 32, 1 };
 
-        if (device->vendor_id == VK_VENDOR_ID_INTEL && device->architecture == INTEL_XE2) {
+        if (device->vendor_id == VK_VENDOR_ID_INTEL && device->architecture == INTEL_XE2_ONWARD) {
             // Xe2/Xe3 - bf16 warptile performance tuning
             l_warptile = { 512, 128, 128, 16, subgroup_size_8, 32, 2, 4, 4, 1, subgroup_size_8 };
         }
@@ -4038,7 +4150,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
         // Ensure a subgroup size >= 16 is available
         const bool use_subgroups16 = use_subgroups && subgroup_min_size_16;
 
-        const uint32_t subgroup_size = (device->vendor_id == VK_VENDOR_ID_INTEL && device->subgroup_size_control && device->subgroup_min_size <= 16 && device->subgroup_max_size >= 16) ? 16 : device->subgroup_size;
+        const uint32_t subgroup_size = (device->vendor_id == VK_VENDOR_ID_INTEL && device->subgroup_size_control && device->subgroup_min_size <= 16 && device->subgroup_max_size >= 16) ? 16 : default_subgroup_size;
         const uint32_t subgroup_size16 = std::max(subgroup_size, 16u);
 
         const uint32_t force_subgroup_size = use_subgroups ? subgroup_size : 0;
@@ -4111,7 +4223,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
 #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
             if (device->integer_dot_product) {
-                const uint32_t subgroup_size_int = (device->vendor_id == VK_VENDOR_ID_INTEL && device->subgroup_size_control) ? device->subgroup_min_size : device->subgroup_size;
+                const uint32_t subgroup_size_int = (device->vendor_id == VK_VENDOR_ID_INTEL && device->subgroup_size_control) ? device->subgroup_min_size : default_subgroup_size;
                 const uint32_t wg_size_subgroup_int = (w == DMMV_WG_SIZE_SUBGROUP) ? subgroup_size_int : (subgroup_size_int * 4);
 
                 ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_q8_1_f32[w][GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_q8_1_f32", arr_dmmv_q4_0_q8_1_f32_len[reduc], arr_dmmv_q4_0_q8_1_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int, i+1}, 1, true, use_subgroups, subgroup_size_int);
@@ -4162,7 +4274,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
 #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
             if (device->integer_dot_product) {
-                const uint32_t subgroup_size_int = (device->vendor_id == VK_VENDOR_ID_INTEL && device->subgroup_size_control) ? device->subgroup_min_size : device->subgroup_size;
+                const uint32_t subgroup_size_int = (device->vendor_id == VK_VENDOR_ID_INTEL && device->subgroup_size_control) ? device->subgroup_min_size : default_subgroup_size;
                 const uint32_t wg_size_subgroup_int = (w == DMMV_WG_SIZE_SUBGROUP) ? subgroup_size_int : (subgroup_size_int * 4);
 
                 ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_q8_1_f32[w][GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_q8_1_f32", arr_dmmv_id_q4_0_q8_1_f32_len[reduc], arr_dmmv_id_q4_0_q8_1_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq_int, 1, 1}, {wg_size_subgroup_int, 1*rm_stdq_int}, 1, true, use_subgroups, subgroup_size_int);
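For reviewers unfamiliar with the plumbing: the overridden `required_subgroup_size` and `require_full_subgroups` values ultimately land in the standard VK_EXT_subgroup_size_control structures when the pipeline is built. A minimal sketch of that mapping, assuming Vulkan-Hpp and a compute stage (this is what `ggml_vk_create_pipeline_func` effectively does, not a verbatim excerpt):

```cpp
vk::PipelineShaderStageRequiredSubgroupSizeCreateInfoEXT subgroup_size_info;
subgroup_size_info.requiredSubgroupSize = required_subgroup_size; // e.g. 16 from a PipelineConfigParameter

vk::PipelineShaderStageCreateInfo stage_info;
stage_info.stage = vk::ShaderStageFlagBits::eCompute;
stage_info.pName = entrypoint;
if (required_subgroup_size != 0) {
    stage_info.pNext = &subgroup_size_info; // pin the subgroup size
}
if (require_full_subgroups) {
    stage_info.flags |= vk::PipelineShaderStageCreateFlagBits::eRequireFullSubgroupsEXT;
}
```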
@@ -4268,24 +4380,24 @@ static void ggml_vk_load_shaders(vk_device& device) {
         ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_MXFP4], "get_rows_mxfp4_f32", get_rows_mxfp4_f32_len, get_rows_mxfp4_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
 
     ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256 * 4, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_flash_attn_split_k_reduce, "fa_split_k_reduce", fa_split_k_reduce_len, fa_split_k_reduce_data, "main", 3, sizeof(vk_op_flash_attn_split_k_reduce_push_constants), {1, device->subgroup_size, 1}, {device->subgroup_size}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_flash_attn_split_k_reduce, "fa_split_k_reduce", fa_split_k_reduce_len, fa_split_k_reduce_data, "main", 3, sizeof(vk_op_flash_attn_split_k_reduce_push_constants), {1, default_subgroup_size, 1}, {default_subgroup_size}, 1, true);
 
     for (auto &it : device->pipeline_fa_mask_opt) {
         auto BrBc = it.first;
-        ggml_vk_create_pipeline(device, it.second, "fa_mask_opt", fa_mask_opt_len, fa_mask_opt_data, "main", 2, sizeof(vk_op_flash_attn_mask_opt_push_constants), {1, 1, 1}, {128, 128 / device->subgroup_size, BrBc.first, BrBc.second}, 1, true, true, device->subgroup_size);
+        ggml_vk_create_pipeline(device, it.second, "fa_mask_opt", fa_mask_opt_len, fa_mask_opt_data, "main", 2, sizeof(vk_op_flash_attn_mask_opt_push_constants), {1, 1, 1}, {128, 128 / default_subgroup_size, BrBc.first, BrBc.second}, 1, true, true, default_subgroup_size);
     }
 
     if (device->subgroup_clustered && device->subgroup_require_full_support) {
-        ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1_x4, "quantize_q8_1_x4", quantize_q8_1_x4_subgroup_len, quantize_q8_1_x4_subgroup_data, "main", 2, sizeof(vk_quantize_q8_1_push_constants), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1, true, true);
+        ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1_x4, "quantize_q8_1_x4", quantize_q8_1_x4_subgroup_len, quantize_q8_1_x4_subgroup_data, "main", 2, sizeof(vk_quantize_q8_1_push_constants), {32 * default_subgroup_size / 8, 1, 1}, { default_subgroup_size }, 1, true, true);
     } else {
-        ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1_x4, "quantize_q8_1_x4", quantize_q8_1_x4_len, quantize_q8_1_x4_data, "main", 2, sizeof(vk_quantize_q8_1_push_constants), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1_x4, "quantize_q8_1_x4", quantize_q8_1_x4_len, quantize_q8_1_x4_data, "main", 2, sizeof(vk_quantize_q8_1_push_constants), {32 * default_subgroup_size / 8, 1, 1}, { default_subgroup_size }, 1);
     }
 
     for (uint32_t i = 0; i < p021_max_gqa_ratio; ++i) {
         if (device->subgroup_arithmetic && device->subgroup_require_full_support) {
-            ggml_vk_create_pipeline2(device, device->pipeline_mul_mat_vec_p021_f16_f32[i], "mul_mat_vec_p021_f16_f32"+std::to_string(i+1), mul_mat_vec_p021_f16_f32_subgroup_add_len, mul_mat_vec_p021_f16_f32_subgroup_add_data, "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_p021_push_constants), {1, 1, 1}, {device->subgroup_size, i + 1}, 1, true, true);
+            ggml_vk_create_pipeline2(device, device->pipeline_mul_mat_vec_p021_f16_f32[i], "mul_mat_vec_p021_f16_f32"+std::to_string(i+1), mul_mat_vec_p021_f16_f32_subgroup_add_len, mul_mat_vec_p021_f16_f32_subgroup_add_data, "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_p021_push_constants), {1, 1, 1}, {default_subgroup_size, i + 1}, 1, true, true);
         } else {
-            ggml_vk_create_pipeline2(device, device->pipeline_mul_mat_vec_p021_f16_f32[i], "mul_mat_vec_p021_f16_f32"+std::to_string(i+1), mul_mat_vec_p021_f16_f32_len, mul_mat_vec_p021_f16_f32_data, "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_p021_push_constants), {1, 1, 1}, {device->subgroup_size, i + 1}, 1, true);
+            ggml_vk_create_pipeline2(device, device->pipeline_mul_mat_vec_p021_f16_f32[i], "mul_mat_vec_p021_f16_f32"+std::to_string(i+1), mul_mat_vec_p021_f16_f32_len, mul_mat_vec_p021_f16_f32_data, "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_p021_push_constants), {1, 1, 1}, {default_subgroup_size, i + 1}, 1, true);
         }
     }
     ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_nc_push_constants), {1, 1, 1}, {}, 1);
@@ -4519,11 +4631,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {1, 512, 1}, {}, 1, true);
 
-    ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { default_subgroup_size }, 1);
     ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_wg512, "soft_max_f32_wg512", soft_max_f32_len, soft_max_f32_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 512 }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { default_subgroup_size }, 1);
     ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_f16_wg512, "soft_max_f32_f16_wg512", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 512 }, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_soft_max_back_f32, "soft_max_back_f32", soft_max_back_f32_len, soft_max_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_soft_max_back_f32, "soft_max_back_f32", soft_max_back_f32_len, soft_max_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, { default_subgroup_size }, 1, true);
 
     ggml_vk_create_pipeline(device, device->pipeline_soft_max_large1_f32, "soft_max_large1_f32", soft_max_large1_f32_len, soft_max_large1_f32_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
     ggml_vk_create_pipeline(device, device->pipeline_soft_max_large2_f32, "soft_max_large2_f32", soft_max_large2_f32_len, soft_max_large2_f32_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
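Worth noting for the subgroup-sized dispatches above: on Intel, `default_subgroup_size` now comes from `INTEL_DEFAULT_SUBGROUP_SIZE` (16) rather than the device-reported value (typically 32 per the earlier comment), so the derived dimensions shrink accordingly. A quick check, assuming those two values:

```cpp
// quantize_q8_1_x4 launches 32 * subgroup_size / 8 invocations in x.
static_assert(32 * 32u / 8 == 128, "device-reported subgroup size of 32");
static_assert(32 * 16u / 8 == 64,  "INTEL_DEFAULT_SUBGROUP_SIZE of 16 halves the workgroup");
```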
@@ -4574,29 +4686,30 @@ static void ggml_vk_load_shaders(vk_device& device) {
         const uint32_t NCOLS_PADDED_LOG2 = i;
         if (i <= device->max_workgroup_size_log2) {
             uint32_t nary_shmem = 2 * sizeof(int) * BLOCK_SIZE +
-                                  sizeof(int) * device->subgroup_size +
+                                  sizeof(int) * default_subgroup_size +
                                   2 * sizeof(int) +
-                                  2 * (BLOCK_SIZE / device->subgroup_size) * sizeof(int);
+                                  2 * (BLOCK_SIZE / default_subgroup_size) * sizeof(int);
             if (device->subgroup_arithmetic && device->subgroup_require_full_support &&
                 device->subgroup_shuffle && device->subgroup_ballot &&
-                nary_shmem <= device->properties.limits.maxComputeSharedMemorySize) {
-                ggml_vk_create_pipeline2(device, device->pipeline_topk_f32[i], "topk_f32_"+std::to_string(i), topk_nary_search_f32_len, topk_nary_search_f32_data, "main", 2, sizeof(vk_op_topk_push_constants), {BLOCK_SIZE, 1, 1}, {BLOCK_SIZE, device->subgroup_size, device->subgroup_size_log2}, 1, true, true, device->subgroup_size);
+                nary_shmem <= device->properties.limits.maxComputeSharedMemorySize &&
+                BLOCK_SIZE >= default_subgroup_size) { // The n-ary top-k shader needs at least one full subgroup per workgroup.
+                ggml_vk_create_pipeline2(device, device->pipeline_topk_f32[i], "topk_f32_nary_search_"+std::to_string(i), topk_nary_search_f32_len, topk_nary_search_f32_data, "main", 2, sizeof(vk_op_topk_push_constants), {BLOCK_SIZE, 1, 1}, {BLOCK_SIZE, default_subgroup_size, subgroup_size_log2}, 1, true, true, default_subgroup_size);
             } else if (2 * sizeof(int) * BLOCK_SIZE <= device->properties.limits.maxComputeSharedMemorySize) {
-                ggml_vk_create_pipeline2(device, device->pipeline_topk_f32[i], "topk_f32_"+std::to_string(i), topk_argsort_f32_len, topk_argsort_f32_data, "main", 2, sizeof(vk_op_topk_push_constants), {BLOCK_SIZE, 1, 1}, {BLOCK_SIZE, NCOLS_PADDED_LOG2}, 1, true);
+                ggml_vk_create_pipeline2(device, device->pipeline_topk_f32[i], "topk_f32_argsort_"+std::to_string(i), topk_argsort_f32_len, topk_argsort_f32_data, "main", 2, sizeof(vk_op_topk_push_constants), {BLOCK_SIZE, 1, 1}, {BLOCK_SIZE, NCOLS_PADDED_LOG2}, 1, true);
             }
         }
     }
 
-    ggml_vk_create_pipeline(device, device->pipeline_argmax_f32, "argmax_f32", argmax_f32_len, argmax_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_argmax_f32, "argmax_f32", argmax_f32_len, argmax_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, { default_subgroup_size }, 1);
 
-    ggml_vk_create_pipeline(device, device->pipeline_sum_rows_f32, "sum_rows_f32", sum_rows_f32_len, sum_rows_f32_data, "main", 2, sizeof(vk_op_sum_rows_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_sum_rows_f32, "sum_rows_f32", sum_rows_f32_len, sum_rows_f32_data, "main", 2, sizeof(vk_op_sum_rows_push_constants), {1, 1, 1}, { default_subgroup_size }, 1);
 
     const uint32_t cumsum_elem_per_thread = (device->vendor_id == VK_VENDOR_ID_AMD || device->vendor_id == VK_VENDOR_ID_INTEL) ? 2 : 4;
-    ggml_vk_create_pipeline(device, device->pipeline_cumsum_f32, "cumsum_f32", cumsum_f32_len, cumsum_f32_data, "main", 2, sizeof(vk_op_sum_rows_push_constants), {1, 1, 1}, { 256, device->subgroup_size, cumsum_elem_per_thread }, 1, true, true, device->subgroup_size);
-    ggml_vk_create_pipeline(device, device->pipeline_cumsum_small_f32, "cumsum_f32", cumsum_f32_len, cumsum_f32_data, "main", 2, sizeof(vk_op_sum_rows_push_constants), {1, 1, 1}, { 128, device->subgroup_size, 1 }, 1, true, true, device->subgroup_size);
-    ggml_vk_create_pipeline(device, device->pipeline_cumsum_multipass1_f32, "cumsum_multipass1_f32", cumsum_multipass1_f32_len, cumsum_multipass1_f32_data, "main", 3, sizeof(vk_op_sum_rows_push_constants), {256, 1, 1}, { 256, device->subgroup_size }, 1, true, true, device->subgroup_size);
-    ggml_vk_create_pipeline(device, device->pipeline_cumsum_multipass2_f32, "cumsum_multipass2_f32", cumsum_multipass2_f32_len, cumsum_multipass2_f32_data, "main", 3, sizeof(vk_op_sum_rows_push_constants), {256, 1, 1}, { 256, device->subgroup_size }, 1, true, true, device->subgroup_size);
+    ggml_vk_create_pipeline(device, device->pipeline_cumsum_f32, "cumsum_f32", cumsum_f32_len, cumsum_f32_data, "main", 2, sizeof(vk_op_sum_rows_push_constants), {1, 1, 1}, { 256, default_subgroup_size, cumsum_elem_per_thread }, 1, true, true, default_subgroup_size);
+    ggml_vk_create_pipeline(device, device->pipeline_cumsum_small_f32, "cumsum_f32", cumsum_f32_len, cumsum_f32_data, "main", 2, sizeof(vk_op_sum_rows_push_constants), {1, 1, 1}, { 128, default_subgroup_size, 1 }, 1, true, true, default_subgroup_size);
+    ggml_vk_create_pipeline(device, device->pipeline_cumsum_multipass1_f32, "cumsum_multipass1_f32", cumsum_multipass1_f32_len, cumsum_multipass1_f32_data, "main", 3, sizeof(vk_op_sum_rows_push_constants), {256, 1, 1}, { 256, default_subgroup_size }, 1, true, true, default_subgroup_size);
+    ggml_vk_create_pipeline(device, device->pipeline_cumsum_multipass2_f32, "cumsum_multipass2_f32", cumsum_multipass2_f32_len, cumsum_multipass2_f32_data, "main", 3, sizeof(vk_op_sum_rows_push_constants), {256, 1, 1}, { 256, default_subgroup_size }, 1, true, true, default_subgroup_size);
 
-    ggml_vk_create_pipeline(device, device->pipeline_count_equal_i32, "count_equal_i32", count_equal_i32_len, count_equal_i32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_count_equal_i32, "count_equal_i32", count_equal_i32_len, count_equal_i32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, { default_subgroup_size }, 1);
 
     ggml_vk_create_pipeline(device, device->pipeline_count_experts, "count_experts", count_experts_len, count_experts_data, "main", 2, sizeof(vk_op_count_experts_push_constants), {1, 1, 1}, {}, 1, true);
@@ -4615,13 +4728,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
     }
 
 #define IM2COL(bda) \
-    ggml_vk_create_pipeline(device, device->pipeline_im2col_f32, "im2col_f32", im2col_f32 ## bda ## _len, im2col_f32 ## bda ## _data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true); \
+    ggml_vk_create_pipeline(device, device->pipeline_im2col_f32, "im2col_f32", im2col_f32 ## bda ## _len, im2col_f32 ## bda ## _data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { default_subgroup_size }, 1, true); \
     ggml_vk_create_pipeline(device, device->pipeline_im2col_3d_f32, "im2col_3d_f32", im2col_3d_f32 ## bda ## _len, im2col_3d_f32 ## bda ## _data, "main", 2, sizeof(vk_op_im2col_3d_push_constants), {512, 1, 1}, { 512 }, 1, true); \
     if (device->float_controls_rte_fp16) { \
-        ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_rte ## bda ## _len, im2col_f32_f16_rte ## bda ## _data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true); \
+        ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_rte ## bda ## _len, im2col_f32_f16_rte ## bda ## _data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { default_subgroup_size }, 1, true); \
         ggml_vk_create_pipeline(device, device->pipeline_im2col_3d_f32_f16, "im2col_3d_f32_f16", im2col_3d_f32_f16_rte ## bda ## _len, im2col_3d_f32_f16_rte ## bda ## _data, "main", 2, sizeof(vk_op_im2col_3d_push_constants), {512, 1, 1}, { 512 }, 1, true); \
     } else { \
-        ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16 ## bda ## _len, im2col_f32_f16 ## bda ## _data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true); \
+        ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16 ## bda ## _len, im2col_f32_f16 ## bda ## _data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { default_subgroup_size }, 1, true); \
         ggml_vk_create_pipeline(device, device->pipeline_im2col_3d_f32_f16, "im2col_3d_f32_f16", im2col_3d_f32_f16 ## bda ## _len, im2col_3d_f32_f16 ## bda ## _data, "main", 2, sizeof(vk_op_im2col_3d_push_constants), {512, 1, 1}, { 512 }, 1, true); \
     }
     if (device->shader_int64 && device->buffer_device_address) {
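A worked instance of the `nary_shmem` bound above, assuming a 4-byte `int`, a `BLOCK_SIZE` of 1024, and the new Intel default subgroup size of 16 (values chosen for illustration):

```cpp
constexpr uint32_t BLOCK_SIZE = 1024;
constexpr uint32_t SG = 16;
constexpr uint32_t nary_shmem = 2 * sizeof(int) * BLOCK_SIZE          // 8192: two int arrays over the block
                              + sizeof(int) * SG                      //   64: one int per lane
                              + 2 * sizeof(int)                       //    8
                              + 2 * (BLOCK_SIZE / SG) * sizeof(int);  //  512: two ints per subgroup
static_assert(nary_shmem == 8776, "comfortably below a typical 32 KiB shared memory limit");
```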
@@ -4636,9 +4749,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
 
     ggml_vk_create_pipeline(device, device->pipeline_pool2d_f32, "pool2d_f32", pool2d_f32_len, pool2d_f32_data, "main", 2, sizeof(vk_op_pool2d_push_constants), {512, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv6_f32, "rwkv_wkv6_f32", rwkv_wkv6_f32_len, rwkv_wkv6_f32_data, "main", 7, sizeof(vk_op_rwkv_wkv6_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv6_f32, "rwkv_wkv6_f32", rwkv_wkv6_f32_len, rwkv_wkv6_f32_data, "main", 7, sizeof(vk_op_rwkv_wkv6_push_constants), {1, 1, 1}, {default_subgroup_size}, 1);
 
-    ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv7_f32, "rwkv_wkv7_f32", rwkv_wkv7_f32_len, rwkv_wkv7_f32_data, "main", 8, sizeof(vk_op_rwkv_wkv7_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv7_f32, "rwkv_wkv7_f32", rwkv_wkv7_f32_len, rwkv_wkv7_f32_data, "main", 8, sizeof(vk_op_rwkv_wkv7_push_constants), {1, 1, 1}, {default_subgroup_size}, 1);
 
     {
         const uint32_t gdn_sizes[] = {32, 64, 128};
@@ -4659,10 +4772,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
                 // Use largest power-of-two that divides both S_V and subgroup_size so that
                 // (1) S_V % lanes_per_column == 0 and (2) S_V % (subgroup_size / lanes_per_column) == 0.
                 // This means we don't need extra bounds checking logic in the shader.
-                lanes_per_column = std::min(S_V, device->subgroup_size);
+                lanes_per_column = std::min(S_V, default_subgroup_size);
             }
 
-            const bool need_clustered_shader = lanes_per_column != 1 && (lanes_per_column < device->subgroup_size);
+            const bool need_clustered_shader = lanes_per_column != 1 && (lanes_per_column < default_subgroup_size);
             size_t gdn_len;
             const void * gdn_data;
             if (use_subgroup_reduce && need_clustered_shader) {
@@ -4676,23 +4789,23 @@ static void ggml_vk_load_shaders(vk_device& device) {
                 gdn_data = (const void *)gated_delta_net_f32_shmem_data;
             }
 
-            const uint32_t cols_per_wg = device->subgroup_size / lanes_per_column;
+            const uint32_t cols_per_wg = default_subgroup_size / lanes_per_column;
             const std::array<uint32_t, 3> wg_denoms = {1u, 1u, cols_per_wg};
             for (uint32_t kda = 0; kda < 2; kda++) {
                 ggml_vk_create_pipeline(device, device->pipeline_gated_delta_net[si][kda], gdn_names[si][kda], gdn_len, gdn_data, "main", 7, sizeof(vk_op_gated_delta_net_push_constants),
-                                        wg_denoms, {S_V, kda, device->subgroup_size, lanes_per_column}, 1, true, use_subgroup_reduce, device->subgroup_size);
+                                        wg_denoms, {S_V, kda, default_subgroup_size, lanes_per_column}, 1, true, use_subgroup_reduce, default_subgroup_size);
             }
         }
     }
 
     if (device->subgroup_arithmetic && device->subgroup_require_full_support) {
-        ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d128, "ssm_scan_128_f32", ssm_scan_subgroup_f32_len, ssm_scan_subgroup_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {128, device->subgroup_size}, 1, true, true);
-        ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d256, "ssm_scan_256_f32", ssm_scan_subgroup_f32_len, ssm_scan_subgroup_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {256, device->subgroup_size}, 1, true, true);
+        ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d128, "ssm_scan_128_f32", ssm_scan_subgroup_f32_len, ssm_scan_subgroup_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {128, default_subgroup_size}, 1, true, true, default_subgroup_size);
+        ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d256, "ssm_scan_256_f32", ssm_scan_subgroup_f32_len, ssm_scan_subgroup_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {256, default_subgroup_size}, 1, true, true, default_subgroup_size);
     } else {
-        ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d128, "ssm_scan_128_f32", ssm_scan_f32_len, ssm_scan_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {128, device->subgroup_size, 16}, 1, true, true);
-        ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d256, "ssm_scan_256_f32", ssm_scan_f32_len, ssm_scan_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {256, device->subgroup_size, 16}, 1, true, true);
+        ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d128, "ssm_scan_128_f32", ssm_scan_f32_len, ssm_scan_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {128, default_subgroup_size, 16}, 1, true, true);
+        ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d256, "ssm_scan_256_f32", ssm_scan_f32_len, ssm_scan_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {256, default_subgroup_size, 16}, 1, true, true);
     }
 
     ggml_vk_create_pipeline(device, device->pipeline_ssm_conv_f32, "ssm_conv_f32", ssm_conv_f32_len, ssm_conv_f32_data, "main", 3, sizeof(vk_op_ssm_conv_push_constants), {32, 16, 1}, {32, 16}, 1);
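A worked example of the `lanes_per_column` choice above. Since `S_V` (from `gdn_sizes`) and the subgroup size are both powers of two here, the largest power of two dividing both is simply the smaller of the two; the values below are illustrative:

```cpp
constexpr uint32_t S_V = 128;
constexpr uint32_t SG  = 16;                                // new Intel default
constexpr uint32_t lanes_per_column = S_V < SG ? S_V : SG;  // min of two powers of two
static_assert(S_V % lanes_per_column == 0,        "(1) holds");
static_assert(S_V % (SG / lanes_per_column) == 0, "(2) holds");
constexpr uint32_t cols_per_wg = SG / lanes_per_column;     // 1 column per workgroup here
static_assert(lanes_per_column == 16 && cols_per_wg == 1, "");
```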
@@ -4738,7 +4851,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
             allow_collectives_amd) {
             use_collectives = 1;
             conv2d_BS.CRS = std::min(
-                device->subgroup_size,
+                default_subgroup_size,
                 conv2d_BS.CRS);  // CRS block size should be capped at subgroup size for correctness when shuffle is used.
         }
@@ -4747,7 +4860,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
         if (device->properties.limits.maxComputeSharedMemorySize < conv2d_shmem_req) {
             conv2d_BS.CRS = 8;
             if (use_collectives) {
-                conv2d_BS.CRS = std::min(device->subgroup_size, conv2d_BS.CRS);
+                conv2d_BS.CRS = std::min(default_subgroup_size, conv2d_BS.CRS);
             }
         }
@@ -4797,7 +4910,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
 
     for (uint32_t use_push = 0; use_push < 2; ++use_push) {
         for (uint32_t i = 0; i < num_topk_moe_pipelines; ++i) {
-            ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][use_push], "topk_moe_f32_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 4, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, use_push}, 1, true, true, device->subgroup_size);
+            ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][use_push], "topk_moe_f32_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 4, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {default_subgroup_size, 1u<<i, use_push}, 1, true, true, default_subgroup_size);
@@ ... @@ static vk_device ggml_vk_get_device(size_t idx) {
         device->suballocation_block_size = std::min(device->suballocation_block_size, device->max_memory_allocation_size);
 
         device->subgroup_size = subgroup_props.subgroupSize;
-        device->subgroup_size_log2 = uint32_t(log2f(float(device->subgroup_size)));
         device->uma = device->properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu;
         if (sm_builtins) {
             device->shader_core_count = sm_props.shaderSMCount;
@@ -5480,7 +5592,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
                     device->mul_mat_id_s[i] = true;
                     break;
                 case VK_VENDOR_ID_INTEL:
-                    if (!device->coopmat_support || device->architecture != INTEL_XE2) {
+                    if (!device->coopmat_support || device->architecture != INTEL_XE2_ONWARD) {
                         device->mul_mat_l[i] = false;
                         device->mul_mat_id_l[i] = false;
                     } else {
@@ -5709,7 +5821,12 @@ static void ggml_vk_print_gpu_info(size_t idx) {
     bool bf16 = false;
 #endif
 
-    uint32_t default_subgroup_size = get_subgroup_size("", device_architecture);
+    uint32_t default_subgroup_size = 0;
+    GpuPipelineConfig gpu_config = {};
+    auto config_found = get_gpu_pipeline_config(&gpu_config, device_architecture);
+    if (config_found) {
+        default_subgroup_size = gpu_config.default_subgroup_size;
+    }
     const size_t subgroup_size = (default_subgroup_size != 0) ? default_subgroup_size : subgroup_props.subgroupSize;
 
     const bool uma = props2.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu;
@@ -8989,7 +9106,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
     uint32_t split_k = 1;
 
     // Intel Alchemist prefers more workgroups
-    const uint32_t shader_core_count_multiplier = (ctx->device->vendor_id == VK_VENDOR_ID_INTEL && ctx->device->architecture != INTEL_XE2) ? 2 : 1;
+    const uint32_t shader_core_count_multiplier = (ctx->device->vendor_id == VK_VENDOR_ID_INTEL && ctx->device->architecture != INTEL_XE2_ONWARD) ? 2 : 1;
     // Use a placeholder core count if one isn't available. split_k is a big help for perf.
     const uint32_t shader_core_count = ctx->device->shader_core_count ? ctx->device->shader_core_count * shader_core_count_multiplier : 16;
@@ -10565,7 +10682,7 @@ static void ggml_vk_ssm_scan(ggml_backend_vk_context * ctx, vk_context& subctx,
     std::array<uint32_t, 3> elements;
 
     const uint32_t d_state = src0->ne[0];
-    uint32_t num_subgroups = d_state / ctx->device->subgroup_size;
+    uint32_t num_subgroups = d_state / get_subgroup_size(ctx->device);
     const uint32_t num_workgroups_x = CEIL_DIV(n_head * head_dim, num_subgroups);
     const uint32_t num_workgroups_y = n_seq;
     elements = { num_workgroups_x, num_workgroups_y, 1 };
@@ -11446,8 +11563,6 @@ static void ggml_vk_topk(ggml_backend_vk_context * ctx, vk_context& subctx, cons
     uint32_t preferred_pipeline = std::max(num_topk_pipelines - 3, (uint32_t)log2f(float(k)) + 2);
     max_pipeline = std::min(preferred_pipeline, max_pipeline);
     uint32_t min_pipeline = (uint32_t)log2f(float(k)) + 1;
-    // require full subgroup
-    min_pipeline = std::max(min_pipeline, ctx->device->subgroup_size_log2);
 
     uint32_t pipeline_idx = (uint32_t)ceilf(log2f(float(num_elements)));
     pipeline_idx = std::min(pipeline_idx, max_pipeline);
@@ -16060,9 +16175,8 @@ static bool ggml_vk_device_is_supported(const vk::PhysicalDevice & vkdev) {
 static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch) {
     switch (props.vendorID) {
     case VK_VENDOR_ID_INTEL:
-        // Only allowing Xe2 GPU at the moment since Xe2 GPU can gain significant performance boost,
-        // while some older hardware (ex. Arc A770) has performance regressions
-        return arch == vk_device_architecture::INTEL_XE2;
+        // Only allow Xe2 and newer GPUs for now, since some older hardware (e.g. Arc A770) has performance regressions.
+        return arch == vk_device_architecture::INTEL_XE2_ONWARD;
     case VK_VENDOR_ID_AMD:
         if (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource) {
             // Workaround for AMD proprietary driver reporting support on all GPUs
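Finally, the dispatch arithmetic behind the `ggml_vk_ssm_scan` change above, with illustrative values: a `d_state` of 128 on an Intel Xe2 device where `get_subgroup_size()` now returns 16 (the head counts below are assumptions, not taken from the patch):

```cpp
constexpr uint32_t d_state = 128;
constexpr uint32_t sg = 16;                       // get_subgroup_size() on Xe2 onward
constexpr uint32_t num_subgroups = d_state / sg;  // 8 subgroups per workgroup
static_assert(num_subgroups == 8, "");
// num_workgroups_x = CEIL_DIV(n_head * head_dim, num_subgroups), e.g. with
// n_head = 32 and head_dim = 64: CEIL_DIV(2048, 8) = 256 workgroups in x.
static_assert((32 * 64 + num_subgroups - 1) / num_subgroups == 256, "CEIL_DIV expansion");
```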