From 1dc62a795ce1a165636e50f32a0c23d64dca9fe0 Mon Sep 17 00:00:00 2001
From: jiachengjason
Date: Wed, 12 Nov 2025 15:54:39 -0500
Subject: [PATCH] PR clean up, addressed comments

---
 ggml/src/ggml-cuda/common.cuh    | 6 +-----
 ggml/src/ggml-cuda/mmq.cu        | 4 ++--
 ggml/src/ggml-cuda/mmq.cuh      | 2 +-
 ggml/src/ggml-hip/CMakeLists.txt | 4 ----
 4 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index dae8fb2cd9..556d5f1a80 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -232,7 +232,7 @@ static const char * cu_get_error_str(CUresult err) {
 #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
 #define VOLTA_MMA_AVAILABLE
 #endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
 
-#if defined(GGML_USE_HIP) && defined(RDNA4) && !defined(GGML_HIP_NO_MMQ_WMMA)
+#if defined(GGML_USE_HIP) && defined(RDNA4)
 #define AMD_WMMA_AVAILABLE
 #endif // defined(GGML_USE_HIP) && defined(RDNA4) && !defined(GGML_HIP_NO_MMQ_WMMA)
@@ -299,11 +299,7 @@ static bool volta_mma_available(const int cc) {
 }
 
 static bool amd_wmma_available(const int cc) {
-#if !defined(GGML_HIP_NO_MMQ_WMMA)
     return GGML_CUDA_CC_IS_RDNA4(cc);
-#else
-    return false;
-#endif //!defined(GGML_HIP_NO_MMQ_WMMA)
 }
 
 // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index 0e3ffcdb73..e145e2b361 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -290,11 +290,11 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
         return !fp16_mma_hardware_available(cc) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
     }
 
-    if (amd_mfma_available(cc)||amd_wmma_available(cc)) {
+    if (amd_mfma_available(cc) || amd_wmma_available(cc)) {
         // As of ROCM 7.0 rocblas/tensile performs very poorly on CDNA3 and hipblaslt (via ROCBLAS_USE_HIPBLASLT)
         // performs better but is currently suffering from a crash on this architecture.
         // TODO: Revisit when hipblaslt is fixed on CDNA3
-        if (GGML_CUDA_CC_IS_CDNA3(cc)||GGML_CUDA_CC_IS_RDNA4(cc)) {
+        if (GGML_CUDA_CC_IS_CDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
             return true;
         }
         if (ne11 <= 128 || type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
index 3fa27ca3bf..7affff4ef8 100644
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -2262,7 +2262,7 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_dp4a(
 template <int mmq_x, int mmq_y>
 static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma(
     const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
-#if defined(AMD_MFMA_AVAILABLE) 
+#if defined(AMD_MFMA_AVAILABLE)
     typedef tile<16, 8, int> tile_A;
     typedef tile<16, 8, int> tile_B;
     typedef tile<16, 16, int> tile_C;
diff --git a/ggml/src/ggml-hip/CMakeLists.txt b/ggml/src/ggml-hip/CMakeLists.txt
index 192b468d71..c22379bce4 100644
--- a/ggml/src/ggml-hip/CMakeLists.txt
+++ b/ggml/src/ggml-hip/CMakeLists.txt
@@ -120,10 +120,6 @@ if (NOT GGML_HIP_MMQ_WMMA)
     add_compile_definitions(GGML_HIP_NO_MMQ_WMMA)
 endif()
 
-if (GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 OR ${hip_VERSION} VERSION_GREATER_EQUAL 7.0)
-    add_compile_definitions(GGML_HIP_ROCWMMA_FATTN_GFX12)
-endif()
-
 if (GGML_HIP_EXPORT_METRICS)
     set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Rpass-analysis=kernel-resource-usage --save-temps")
 endif()