PR clean up, addressed comments

This commit is contained in:
jiachengjason 2025-11-12 15:54:39 -05:00 committed by jiachengjason
parent 0bf9f09ae4
commit 1dc62a795c
4 changed files with 4 additions and 12 deletions

View File

@ -232,7 +232,7 @@ static const char * cu_get_error_str(CUresult err) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#define VOLTA_MMA_AVAILABLE
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#if defined(GGML_USE_HIP) && defined(RDNA4) && !defined(GGML_HIP_NO_MMQ_WMMA)
#if defined(GGML_USE_HIP) && defined(RDNA4)
#define AMD_WMMA_AVAILABLE
#endif // defined(GGML_USE_HIP) && defined(RDNA4) && !defined(GGML_HIP_NO_MMQ_WMMA)
@ -299,11 +299,7 @@ static bool volta_mma_available(const int cc) {
}
static bool amd_wmma_available(const int cc) {
#if !defined(GGML_HIP_NO_MMQ_WMMA)
return GGML_CUDA_CC_IS_RDNA4(cc);
#else
return false;
#endif //!defined(GGML_HIP_NO_MMQ_WMMA)
}
// Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.

View File

@ -290,11 +290,11 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
return !fp16_mma_hardware_available(cc) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
}
if (amd_mfma_available(cc)||amd_wmma_available(cc)) {
if (amd_mfma_available(cc) || amd_wmma_available(cc)) {
// As of ROCM 7.0 rocblas/tensile performs very poorly on CDNA3 and hipblaslt (via ROCBLAS_USE_HIPBLASLT)
// performs better but is currently suffering from a crash on this architecture.
// TODO: Revisit when hipblaslt is fixed on CDNA3
if (GGML_CUDA_CC_IS_CDNA3(cc)||GGML_CUDA_CC_IS_RDNA4(cc)) {
if (GGML_CUDA_CC_IS_CDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
return true;
}
if (ne11 <= 128 || type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {

View File

@ -2262,7 +2262,7 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_dp4a(
template <int mmq_x, int mmq_y>
static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma(
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
#if defined(AMD_MFMA_AVAILABLE)
#if defined(AMD_MFMA_AVAILABLE)
typedef tile<16, 8, int> tile_A;
typedef tile<16, 8, int> tile_B;
typedef tile<16, 16, int> tile_C;

View File

@ -120,10 +120,6 @@ if (NOT GGML_HIP_MMQ_WMMA)
add_compile_definitions(GGML_HIP_NO_MMQ_WMMA)
endif()
if (GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 OR ${hip_VERSION} VERSION_GREATER_EQUAL 7.0)
add_compile_definitions(GGML_HIP_ROCWMMA_FATTN_GFX12)
endif()
if (GGML_HIP_EXPORT_METRICS)
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Rpass-analysis=kernel-resource-usage --save-temps")
endif()