From 318cb5b80cad235a09f366468addc588cd33e578 Mon Sep 17 00:00:00 2001 From: zhang hui Date: Sat, 13 Dec 2025 13:42:29 +0800 Subject: [PATCH 01/10] mma.cuh for rdna4 --- ggml/src/ggml-cuda/mma.cuh | 122 ++++++++++++++++++++++++++++++++++--- ggml/src/ggml-cuda/mmf.cuh | 32 +++++++--- 2 files changed, 134 insertions(+), 20 deletions(-)
diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh index dcfa40f4d5..16f28f6ab9 100644 --- a/ggml/src/ggml-cuda/mma.cuh +++ b/ggml/src/ggml-cuda/mma.cuh
@@ -76,9 +76,11 @@ namespace ggml_cuda_mma { // For the A/C matrices this means I major == row major, J major == column major. // For the B matrix this means I major == column major, J major == row major. // MIRRORED == Each data value is held exactly once per thread subgroup. - DATA_LAYOUT_I_MAJOR = 0, // Always used for Turing, Ampere, Ada Lovelace, consumer Blackwell. - DATA_LAYOUT_I_MAJOR_MIRRORED = 10, - DATA_LAYOUT_J_MAJOR_MIRRORED = 20, + DATA_LAYOUT_I_MAJOR = 0, // Always used for Turing, Ampere, Ada Lovelace, consumer Blackwell; matrices A&B for RDNA4 and CDNA. + DATA_LAYOUT_J_MAJOR = 10, // Matrix C for CDNA and RDNA4; int and float matrix C for RDNA3. + DATA_LAYOUT_I_MAJOR_MIRRORED = 20, + DATA_LAYOUT_J_MAJOR_MIRRORED = 30, + DATA_LAYOUT_I_MAJOR_DUAL = 40, // Matrices A&B for RDNA3. }; // Implemented mma combinations are: // - (I_MAJOR, I_MAJOR) -> I_MAJOR
@@ -458,6 +460,46 @@ namespace ggml_cuda_mma { #endif // defined(AMD_WMMA_AVAILABLE) }; + template <int I_, int J_, typename T> + struct tile<I_, J_, T, DATA_LAYOUT_J_MAJOR> { + static constexpr int I = I_; + static constexpr int J = J_; + static constexpr data_layout dl = DATA_LAYOUT_J_MAJOR; + + static constexpr int ne = I * J / 32; + T x[ne] = {0}; + + static constexpr __device__ bool supported() { + if (I == 16 && J == 16) return true; + return false; + } + + static __device__ __forceinline__ int get_i(const int l) { + if constexpr (I == 16 && J == 16) { +#if defined(RDNA4) + return 8 * (threadIdx.x / 16) + l; +#elif defined(RDNA3) + return 2 * l + (threadIdx.x / 16); +#else + NO_DEVICE_CODE; + return -1; +#endif // defined(RDNA4) + } else { + NO_DEVICE_CODE; + return -1; + } + } + + static __device__ __forceinline__ int get_j(const int l) { + if constexpr (I == 16 && J == 16) { + return threadIdx.x % 16; + } else { + NO_DEVICE_CODE; + return -1; + } + } + }; + template struct tile { static constexpr int I = I_; static constexpr int J = J_;
@@ -524,6 +566,63 @@ namespace ggml_cuda_mma { } }; + template <int I_, int J_> + struct tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR_DUAL> { + static constexpr int I = I_; + static constexpr int J = J_; + static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR_DUAL; + + static constexpr int ne = I * J / 32 * 2; + + half2 x[ne] = {{0.0f, 0.0f}}; + + static constexpr __device__ bool supported() { + if (I == 16 && J == 8) return true; + return false; + } + + static __device__ __forceinline__ int get_i(const int l) { + if constexpr (I == 16 && J == 8) { + return threadIdx.x % 16; + } else { + NO_DEVICE_CODE; + return -1; + } + } + + static __device__ __forceinline__ int get_j(const int l) { + if constexpr (I == 16 && J == 8) { + return l; + } else { + NO_DEVICE_CODE; + return -1; + } + } + }; + + template <int I_, int J_> + struct tile<I_, J_, nv_bfloat162, DATA_LAYOUT_I_MAJOR_DUAL> { + static constexpr int I = I_; + static constexpr int J = J_; + static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR_DUAL; + + static constexpr int ne = I * J / 32 * 2; + + nv_bfloat162 x[ne] = {{0.0f, 0.0f}}; + + static constexpr __device__ bool supported() { + return tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR_DUAL>::supported(); + } + + static __device__ __forceinline__ int get_i(const int l) { + return tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR_DUAL>::get_i(l); + } + + static __device__ __forceinline__ int 
get_j(const int l) { + return tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR_DUAL>::get_j(l); + } + }; + #if defined(TURING_MMA_AVAILABLE) template <int I, int J> static __device__ __forceinline__ tile<I, J/2, half2> get_half2(const tile<I, J, float> & tile_float) {
@@ -660,9 +759,9 @@ namespace ggml_cuda_mma { #endif // TURING_MMA_AVAILABLE } - template <typename T> + template <typename T, data_layout dl> static __device__ __forceinline__ void load_ldmatrix( - tile<16, 8, T> & t, const T * __restrict__ xs0, const int stride) { + tile<16, 8, T, dl> & t, const T * __restrict__ xs0, const int stride) { #if defined(TURING_MMA_AVAILABLE) int * xi = (int * ) t.x; const int * xs = (const int *) xs0 + (threadIdx.x % t.I) * stride + (threadIdx.x / t.I) * (t.J / 2);
@@ -832,8 +931,9 @@ namespace ggml_cuda_mma { #endif // TURING_MMA_AVAILABLE } + template <data_layout DLayout, data_layout ABLayout> static __device__ __forceinline__ void mma( - tile<16, 8, float> & D, const tile<16, 8, float> & A, const tile<8, 8, float> & B) { + tile<16, 8, float, DLayout> & D, const tile<16, 8, float, ABLayout> & A, const tile<8, 8, float, ABLayout> & B) { #ifdef AMPERE_MMA_AVAILABLE const int * Axi = (const int *) A.x; const int * Bxi = (const int *) B.x;
@@ -886,9 +986,10 @@ namespace ggml_cuda_mma { NO_DEVICE_CODE; #endif // AMPERE_MMA_AVAILABLE } - + + template <data_layout DLayout, data_layout ABLayout> static __device__ __forceinline__ void mma( - tile<16, 16, float> & D, const tile<16, 8, half2> & A, const tile<16, 8, half2> & B) { + tile<16, 16, float, DLayout> & D, const tile<16, 8, half2, ABLayout> & A, const tile<16, 8, half2, ABLayout> & B) { #ifdef TURING_MMA_AVAILABLE const int * Axi = (const int *) A.x; const int * Bxi = (const int *) B.x;
@@ -939,9 +1040,10 @@ namespace ggml_cuda_mma { NO_DEVICE_CODE; #endif // TURING_MMA_AVAILABLE } - + + template <data_layout DLayout, data_layout ABLayout> static __device__ __forceinline__ void mma( - tile<16, 16, float> & D, const tile<16, 8, nv_bfloat162> & A, const tile<16, 8, nv_bfloat162> & B) { + tile<16, 16, float, DLayout> & D, const tile<16, 8, nv_bfloat162, ABLayout> & A, const tile<16, 8, nv_bfloat162, ABLayout> & B) { #if defined(AMD_WMMA_AVAILABLE) #if defined(RDNA4) using bf16x8_t = __attribute__((ext_vector_type(8))) __bf16;
diff --git a/ggml/src/ggml-cuda/mmf.cuh b/ggml/src/ggml-cuda/mmf.cuh index e1c695c5c0..e1b9c6a6b7 100644 --- a/ggml/src/ggml-cuda/mmf.cuh +++ b/ggml/src/ggml-cuda/mmf.cuh
@@ -32,11 +32,17 @@ static __global__ void mul_mat_f( #if (!defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)) || defined(AMD_WMMA_AVAILABLE) #if defined(AMD_WMMA_AVAILABLE) // Special case for tf32: just a dummy mma layout, as wmma doesn't support it. - constexpr int tile_B_I = std::is_same_v<T, float> ? 8 : 16; - constexpr int tile_C_J = std::is_same_v<T, float> ? 8 : 16; - typedef tile<16, 8, T> tile_A; - typedef tile<tile_B_I, 8, T> tile_B; - typedef tile<16, tile_C_J, float> tile_C; + constexpr bool is_tf32 = std::is_same_v<T, float>; + constexpr int tile_B_I = is_tf32 ? 8 : 16; + constexpr int tile_C_J = is_tf32 ? 8 : 16; +#if defined(RDNA3) + constexpr data_layout ab_layout = is_tf32 ? DATA_LAYOUT_I_MAJOR : DATA_LAYOUT_I_MAJOR_DUAL; +#else + constexpr data_layout ab_layout = DATA_LAYOUT_I_MAJOR; +#endif // #if defined(RDNA3) + typedef tile<16, 8, T, ab_layout> tile_A; + typedef tile<tile_B_I, 8, T, ab_layout> tile_B; + typedef tile<16, tile_C_J, float, DATA_LAYOUT_J_MAJOR> tile_C; #else #ifdef VOLTA_MMA_AVAILABLE if constexpr (!std::is_same_v<T, half2>) {NO_DEVICE_CODE;} else {
@@ -272,11 +278,17 @@ static __global__ void mul_mat_f_ids( #if (!defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)) || defined(AMD_WMMA_AVAILABLE) #if defined(AMD_WMMA_AVAILABLE) // Special case for tf32: just a dummy mma layout, as wmma doesn't support it. - constexpr int tile_B_I = std::is_same_v<T, float> ? 
8 : 16; - constexpr int tile_C_J = std::is_same_v<T, float> ? 8 : 16; - typedef tile<16, 8, T> tile_A; - typedef tile<tile_B_I, 8, T> tile_B; - typedef tile<16, tile_C_J, float> tile_C; + constexpr bool is_tf32 = std::is_same_v<T, float>; + constexpr int tile_B_I = is_tf32 ? 8 : 16; + constexpr int tile_C_J = is_tf32 ? 8 : 16; +#if defined(RDNA3) + constexpr data_layout ab_layout = is_tf32 ? DATA_LAYOUT_I_MAJOR : DATA_LAYOUT_I_MAJOR_DUAL; +#else + constexpr data_layout ab_layout = DATA_LAYOUT_I_MAJOR; +#endif // #if defined(RDNA3) + typedef tile<16, 8, T, ab_layout> tile_A; + typedef tile<tile_B_I, 8, T, ab_layout> tile_B; + typedef tile<16, tile_C_J, float, DATA_LAYOUT_J_MAJOR> tile_C; #else #ifdef VOLTA_MMA_AVAILABLE if constexpr (!std::is_same_v<T, half2>) {NO_DEVICE_CODE;} else {
From 074b93146e410aac4de51c197a75814e62671154 Mon Sep 17 00:00:00 2001 From: zhang hui Date: Sat, 13 Dec 2025 14:10:56 +0800 Subject: [PATCH 02/10] mma for rdna3 --- ggml/src/ggml-cuda/mma.cuh | 53 +++++++------------------------------- 1 file changed, 9 insertions(+), 44 deletions(-)
diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh index 16f28f6ab9..c4016a49eb 100644 --- a/ggml/src/ggml-cuda/mma.cuh +++ b/ggml/src/ggml-cuda/mma.cuh
@@ -295,12 +295,7 @@ namespace ggml_cuda_mma { } } #elif defined(AMD_WMMA_AVAILABLE) -#if defined(RDNA3) - // RDNA3 has duplicated data as input. - static constexpr int ne = I * J / 32 * 2; -#else static constexpr int ne = I * J / 32; -#endif // defined(RDNA3) half2 x[ne] = {{0.0f, 0.0f}}; static constexpr __device__ bool supported() {
@@ -319,14 +314,7 @@ namespace ggml_cuda_mma { static __device__ __forceinline__ int get_j(const int l) { if constexpr (I == 16 && J == 8) { -#if defined(RDNA4) return 4 * (threadIdx.x / 16) + l; -#elif defined(RDNA3) - return l; -#else - NO_DEVICE_CODE; - return -1; -#endif // defined(RDNA4) } else { NO_DEVICE_CODE; return -1;
@@ -384,42 +372,19 @@ namespace ggml_cuda_mma { static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR; #if defined(AMD_WMMA_AVAILABLE) -#if defined(RDNA3) - // RDNA3 has duplicated data as input. 
- static constexpr int ne = I * J / 32 * 2; -#else static constexpr int ne = I * J / 32; -#endif // defined(RDNA3) nv_bfloat162 x[ne] = {{0.0f, 0.0f}}; static constexpr __device__ bool supported() { - if (I == 16 && J == 8) return true; - return false; + return tile<I_, J_, half2>::supported(); } static __device__ __forceinline__ int get_i(const int l) { - if constexpr (I == 16 && J == 8) { - return threadIdx.x % 16; - } else { - NO_DEVICE_CODE; - return -1; - } + return tile<I_, J_, half2>::get_i(l); } static __device__ __forceinline__ int get_j(const int l) { - if constexpr (I == 16 && J == 8) { -#if defined(RDNA4) - return 4 * (threadIdx.x / 16) + l; -#elif defined(RDNA3) - return l; -#else - NO_DEVICE_CODE; - return -1; -#endif // defined(RDNA4) - } else { - NO_DEVICE_CODE; - return -1; - } + return tile<I_, J_, half2>::get_j(l); } #else static constexpr int ne = I * J / WARP_SIZE;
@@ -931,9 +896,9 @@ namespace ggml_cuda_mma { #endif // TURING_MMA_AVAILABLE } - template <data_layout DLayout, data_layout ABLayout> + template <data_layout dl_d, data_layout dl_ab> static __device__ __forceinline__ void mma( - tile<16, 8, float, DLayout> & D, const tile<16, 8, float, ABLayout> & A, const tile<8, 8, float, ABLayout> & B) { + tile<16, 8, float, dl_d> & D, const tile<16, 8, float, dl_ab> & A, const tile<8, 8, float, dl_ab> & B) { #ifdef AMPERE_MMA_AVAILABLE const int * Axi = (const int *) A.x; const int * Bxi = (const int *) B.x;
@@ -987,9 +952,9 @@ namespace ggml_cuda_mma { NO_DEVICE_CODE; #endif // AMPERE_MMA_AVAILABLE } - template <data_layout DLayout, data_layout ABLayout> + template <data_layout dl_d, data_layout dl_ab> static __device__ __forceinline__ void mma( - tile<16, 16, float, DLayout> & D, const tile<16, 8, half2, ABLayout> & A, const tile<16, 8, half2, ABLayout> & B) { + tile<16, 16, float, dl_d> & D, const tile<16, 8, half2, dl_ab> & A, const tile<16, 8, half2, dl_ab> & B) { #ifdef TURING_MMA_AVAILABLE const int * Axi = (const int *) A.x; const int * Bxi = (const int *) B.x;
@@ -1041,9 +1006,9 @@ namespace ggml_cuda_mma { NO_DEVICE_CODE; #endif // TURING_MMA_AVAILABLE } - template <data_layout DLayout, data_layout ABLayout> + template <data_layout dl_d, data_layout dl_ab> static __device__ __forceinline__ void mma( - tile<16, 16, float, DLayout> & D, const tile<16, 8, nv_bfloat162, ABLayout> & A, const tile<16, 8, nv_bfloat162, ABLayout> & B) { + tile<16, 16, float, dl_d> & D, const tile<16, 8, nv_bfloat162, dl_ab> & A, const tile<16, 8, nv_bfloat162, dl_ab> & B) { #if defined(AMD_WMMA_AVAILABLE) #if defined(RDNA4) using bf16x8_t = __attribute__((ext_vector_type(8))) __bf16;
From 98846cb9ee7dcae8d28adfa4769689953308c18c Mon Sep 17 00:00:00 2001 From: zhang hui Date: Sat, 13 Dec 2025 16:35:51 +0800 Subject: [PATCH 03/10] mmq for rdna4 --- ggml/src/ggml-cuda/mma.cuh | 158 ++++++++++++------------------------- ggml/src/ggml-cuda/mmq.cuh | 34 ++++---- 2 files changed, 69 insertions(+), 123 deletions(-)
diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh index c4016a49eb..8a53f19341 100644 --- a/ggml/src/ggml-cuda/mma.cuh +++ b/ggml/src/ggml-cuda/mma.cuh
@@ -87,6 +87,12 @@ namespace ggml_cuda_mma { // - (I_MAJOR, I_MAJOR_MIRRORED) -> I_MAJOR // - (I_MAJOR, J_MAJOR_MIRRORED) -> I_MAJOR + constexpr bool is_i_major(const data_layout dl) { + return dl == DATA_LAYOUT_I_MAJOR || + dl == DATA_LAYOUT_I_MAJOR_MIRRORED || + dl == DATA_LAYOUT_I_MAJOR_DUAL; + } + template <int I_, int J_, typename T, data_layout dl = DATA_LAYOUT_I_MAJOR> struct tile {};
@@ -173,28 +179,19 @@ namespace ggml_cuda_mma { } } #elif defined(AMD_WMMA_AVAILABLE) -#if defined(RDNA4) static constexpr int ne = I * J / 32; -#elif defined(RDNA3) - static constexpr int ne = (I == 16 && J == 16) ? 
I * J / 32 : I * J / 16; -#endif // defined(RDNA4) T x[ne] = {0}; static constexpr __device__ bool supported() { if (I == 16 && J == 16) return true; + if (I == 16 && J == 8) return true; + if (I == 16 && J == 4) return true; return false; } static __device__ __forceinline__ int get_i(const int l) { - if constexpr (I == 16 && J == 16) { -#if defined(RDNA4) - return 8 * (threadIdx.x / 16) + l; -#elif defined(RDNA3) - return 2 * l + (threadIdx.x / 16); -#else - NO_DEVICE_CODE; - return -1; -#endif // defined(RDNA4) + if constexpr (supported()) { + return threadIdx.x % 16; } else { NO_DEVICE_CODE; return -1;
@@ -203,7 +200,17 @@ namespace ggml_cuda_mma { static __device__ __forceinline__ int get_j(const int l) { if constexpr (I == 16 && J == 16) { + // matrix C +#if defined(RDNA3) return 2 * l + (threadIdx.x / 16); +#else + return ne * (threadIdx.x / 16) + l; +#endif // defined(RDNA3) + } else if constexpr (I == 16 && J == 8) { + // mmq input for RDNA4 + return ne * (threadIdx.x / 16) + l; + } else if constexpr (I == 16 && J == 4) { + return ne * (threadIdx.x / 16) + l; } else { NO_DEVICE_CODE; return -1;
@@ -440,28 +447,11 @@ namespace ggml_cuda_mma { } static __device__ __forceinline__ int get_i(const int l) { - if constexpr (I == 16 && J == 16) { -#if defined(RDNA4) - return 8 * (threadIdx.x / 16) + l; -#elif defined(RDNA3) - return 2 * l + (threadIdx.x / 16); -#else - NO_DEVICE_CODE; - return -1; -#endif // defined(RDNA4) - } else { - NO_DEVICE_CODE; - return -1; - } + return tile<I_, J_, T>::get_j(l); } static __device__ __forceinline__ int get_j(const int l) { - if constexpr (I == 16 && J == 16) { - return threadIdx.x % 16; - } else { - NO_DEVICE_CODE; - return -1; - } + return tile<I_, J_, T>::get_i(l); } };
@@ -531,23 +521,25 @@ namespace ggml_cuda_mma { } }; - template <int I_, int J_> - struct tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR_DUAL> { + template <int I_, int J_, typename T> + struct tile<I_, J_, T, DATA_LAYOUT_I_MAJOR_DUAL> { static constexpr int I = I_; static constexpr int J = J_; static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR_DUAL; static constexpr int ne = I * J / 32 * 2; - half2 x[ne] = {{0.0f, 0.0f}}; + T x[ne] = {{0.0f, 0.0f}}; static constexpr __device__ bool supported() { + if (I == 16 && J == 16) return true; if (I == 16 && J == 8) return true; + if (I == 16 && J == 4) return true; return false; } static __device__ __forceinline__ int get_i(const int l) { - if constexpr (I == 16 && J == 8) { + if constexpr (supported()) { return threadIdx.x % 16; } else { NO_DEVICE_CODE; return -1; } } static __device__ __forceinline__ int get_j(const int l) { - if constexpr (I == 16 && J == 8) { + if constexpr (supported()) { return l; } else { NO_DEVICE_CODE; return -1; } } }; - template <int I_, int J_> - struct tile<I_, J_, nv_bfloat162, DATA_LAYOUT_I_MAJOR_DUAL> { - static constexpr int I = I_; - static constexpr int J = J_; - static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR_DUAL; - static constexpr int ne = I * J / 32 * 2; - nv_bfloat162 x[ne] = {{0.0f, 0.0f}}; - static constexpr __device__ bool supported() { - return tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR_DUAL>::supported(); - } - static __device__ __forceinline__ int get_i(const int l) { - return tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR_DUAL>::get_i(l); - } - static __device__ __forceinline__ int get_j(const int l) { - return tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR_DUAL>::get_j(l); - } - }; #if defined(TURING_MMA_AVAILABLE) template <int I, int J> static __device__ __forceinline__ tile<I, J/2, half2> get_half2(const tile<I, J, float> & tile_float) {
@@ -638,50 +607,25 @@ namespace ggml_cuda_mma { xi[0] = xs[0]; } #elif defined(AMD_WMMA_AVAILABLE) - if constexpr (std::is_same_v<T, half2> || std::is_same_v<T, nv_bfloat162>) { -#if defined(RDNA4) - 
ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0)); -#elif defined(RDNA3) - ggml_cuda_memcpy_1<sizeof(t.x)/2>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0)); - ggml_cuda_memcpy_1<sizeof(t.x)/2>(t.x + t.ne/2, xs0 + t.get_i(0) * stride + t.get_j(t.ne/2)); -#else - NO_DEVICE_CODE; -#endif // defined(RDNA4) - } else if constexpr (std::is_same_v<T, int>) { - if constexpr (I == 16 && J == 4) { - int64_t * xi = (int64_t *) t.x; -#if defined(RDNA4) - const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I)); - xi[0] = xs[0]; -#elif defined(RDNA3) - static_assert(tile<I, J, T>::ne >= 4, "fragment too small"); - const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride); - xi[0] = xs[0]; - xi[1] = xs[1]; -#endif // defined(RDNA4) - } else if constexpr (I == 16 && J == 8) { - int64_t * xi = (int64_t *) t.x; -#if defined(RDNA4) - const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I)); - xi[0] = xs[0]; - const int64_t * xs1 = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I) + 2); - xi[1] = xs1[0]; -#elif defined(RDNA3) - static_assert(tile<I, J, T>::ne >= 8, "fragment too small"); - const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride); - // contiguous four 64-bit chunks per lane for the wider RDNA3 fragment - xi[0] = xs[0]; - xi[1] = xs[1]; - const int64_t * xs1 = xs + 2; - xi[2] = xs1[0]; - xi[3] = xs1[1]; -#endif // defined(RDNA4) + // All wmma layout has continues data when i-major. + if constexpr (is_i_major(dl)) { + // the data must be aligned to 16 bytes when bigger than ggml_cuda_get_max_cpy_bytes() + constexpr int aligned_copy_bytes = ggml_cuda_get_max_cpy_bytes(); + if constexpr (sizeof(t.x) > aligned_copy_bytes) { + static_assert(sizeof(t.x) % aligned_copy_bytes == 0, "bad type size"); + constexpr int aligned_copy_count = sizeof(t.x)/aligned_copy_bytes; +#pragma unroll + for (int i = 0; i < aligned_copy_count; ++i) { + ggml_cuda_memcpy_1<aligned_copy_bytes>(t.x + t.ne/aligned_copy_count*i, xs0 + t.get_i(0) * stride + t.get_j(t.ne/aligned_copy_count*i)); + } } else { + ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0)); + } } else { - NO_DEVICE_CODE; +#pragma unroll + for (int l = 0; l < t.ne; ++l) { + t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)]; + } } - } else { - NO_DEVICE_CODE; } #else #pragma unroll
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index 1298f99fff..e748f24c3a 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -797,9 +797,9 @@ template <int mmq_x, int mmq_y, bool need_check>
 static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma( const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { #if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) - typedef tile<16, 8, int> tile_A; - typedef tile<16, 8, int> tile_B; - typedef tile<16, 16, int> tile_C; + typedef tile<16, 8, int, DATA_LAYOUT_I_MAJOR> tile_A; + typedef tile<16, 8, int, DATA_LAYOUT_I_MAJOR> tile_B; + typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C; constexpr int granularity = mmq_get_granularity_device(mmq_x); constexpr int rows_per_warp = granularity;
@@ -966,9 +966,9 @@ template <int mmq_x, int mmq_y, bool need_check> static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma( const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { #if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) - typedef tile<16, 8, int> tile_A; - typedef tile<16, 8, int> tile_B; - typedef tile<16, 16, int> tile_C; + typedef tile<16, 8, int, DATA_LAYOUT_I_MAJOR> tile_A; + typedef tile<16, 8, int, DATA_LAYOUT_I_MAJOR> tile_B; + typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C; constexpr int granularity = mmq_get_granularity_device(mmq_x); constexpr int rows_per_warp = granularity;
@@ -1179,9 +1179,9 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma( } } #elif defined(AMD_WMMA_AVAILABLE) //wmma instructions can handle 16x4 tiles, does not require loading 64x2 tiles - typedef tile<16, 4, int> tile_A; - typedef tile<16, 4, int> tile_B; - typedef tile<16, 16, int> tile_C; + typedef tile<16, 4, int, DATA_LAYOUT_I_MAJOR> tile_A; + typedef tile<16, 4, int, DATA_LAYOUT_I_MAJOR> tile_B; + typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C; constexpr int granularity = mmq_get_granularity_device(mmq_x); constexpr int rows_per_warp = granularity;
@@ -1502,9 +1502,9 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma( } #elif defined(AMD_WMMA_AVAILABLE) //wmma instructions can handle 16x4 tiles, does not require loading 64x2 tiles - typedef tile<16, 4, int> tile_A; - typedef tile<16, 4, int> tile_B; - typedef tile<16, 16, int> tile_C; + typedef tile<16, 4, int, DATA_LAYOUT_I_MAJOR> tile_A; + typedef tile<16, 4, int, DATA_LAYOUT_I_MAJOR> tile_B; + typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C; constexpr int granularity = mmq_get_granularity_device(mmq_x); constexpr int rows_per_warp = granularity;
@@ -1570,7 +1570,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma( } #elif defined(TURING_MMA_AVAILABLE) - typedef tile<16, 4, int> tile_A; + 16, 4, int> tile_A; typedef tile<16, 8, int> tile_A_8; typedef tile< 8, 4, int> tile_B; typedef tile<16, 8, int> tile_C;
@@ -2316,9 +2316,9 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma( } } #elif defined(AMD_WMMA_AVAILABLE) //wmma instructions can handle 16x4 tiles, does not require loading 64x2 tiles - typedef tile<16, 4, int> tile_A; - typedef tile<16, 4, int> tile_B; - typedef tile<16, 16, int> tile_C; + typedef tile<16, 4, int, DATA_LAYOUT_I_MAJOR> tile_A; + typedef tile<16, 4, int, DATA_LAYOUT_I_MAJOR> tile_B; + typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C; constexpr int granularity = mmq_get_granularity_device(mmq_x); constexpr int rows_per_warp = granularity;
@@ -3015,7 +3015,7 @@ static __device__ __forceinline__ void mmq_write_back_mma( #if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) constexpr int tileC_IJ = mmq_get_granularity_device(0); - typedef tile<tileC_IJ, tileC_IJ, int> tile_C; + typedef tile<tileC_IJ, tileC_IJ, int, DATA_LAYOUT_J_MAJOR> tile_C; constexpr int rows_per_warp = 
granularity; #else typedef tile<16, 8, int> tile_C;
From 62e4954d3f35d9f8e86139595ac0d1f0c912e7e8 Mon Sep 17 00:00:00 2001 From: zhang hui Date: Sat, 13 Dec 2025 16:52:25 +0800 Subject: [PATCH 04/10] mmq for rdna3 --- ggml/src/ggml-cuda/mma.cuh | 2 +- ggml/src/ggml-cuda/mmq.cuh | 46 +++++++++++++++++++++++++++++--------- 2 files changed, 36 insertions(+), 12 deletions(-)
diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh index 8a53f19341..e56da4329b 100644 --- a/ggml/src/ggml-cuda/mma.cuh +++ b/ggml/src/ggml-cuda/mma.cuh
@@ -529,7 +529,7 @@ namespace ggml_cuda_mma { static constexpr int ne = I * J / 32 * 2; - T x[ne] = {{0.0f, 0.0f}}; + T x[ne] = {0}; static constexpr __device__ bool supported() { if (I == 16 && J == 16) return true;
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index e748f24c3a..d1c75a22e1 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -797,8 +797,13 @@ template <int mmq_x, int mmq_y, bool need_check> static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma( const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { #if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) - typedef tile<16, 8, int, DATA_LAYOUT_I_MAJOR> tile_A; - typedef tile<16, 8, int, DATA_LAYOUT_I_MAJOR> tile_B; +#if defined(RDNA3) + constexpr data_layout input_layout = DATA_LAYOUT_I_MAJOR_DUAL; +#else + constexpr data_layout input_layout = DATA_LAYOUT_I_MAJOR; +#endif // defined(RDNA3) + typedef tile<16, 8, int, input_layout> tile_A; + typedef tile<16, 8, int, input_layout> tile_B; typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C; constexpr int granularity = mmq_get_granularity_device(mmq_x); constexpr int rows_per_warp = granularity;
@@ -966,8 +971,13 @@ template <int mmq_x, int mmq_y, bool need_check> static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma( const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { #if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) - typedef tile<16, 8, int, DATA_LAYOUT_I_MAJOR> tile_A; - typedef tile<16, 8, int, DATA_LAYOUT_I_MAJOR> tile_B; +#if defined(RDNA3) + constexpr data_layout input_layout = DATA_LAYOUT_I_MAJOR_DUAL; +#else + constexpr data_layout input_layout = DATA_LAYOUT_I_MAJOR; +#endif // defined(RDNA3) + typedef tile<16, 8, int, input_layout> tile_A; + typedef tile<16, 8, int, input_layout> tile_B; typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C; constexpr int granularity = mmq_get_granularity_device(mmq_x); constexpr int rows_per_warp = granularity;
@@ -1179,8 +1189,13 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma( } } #elif defined(AMD_WMMA_AVAILABLE) //wmma instructions can handle 16x4 tiles, does not require loading 64x2 tiles - typedef tile<16, 4, int, DATA_LAYOUT_I_MAJOR> tile_A; - typedef tile<16, 4, int, DATA_LAYOUT_I_MAJOR> tile_B; +#if defined(RDNA3) + constexpr data_layout input_layout = DATA_LAYOUT_I_MAJOR_DUAL; +#else + constexpr data_layout input_layout = DATA_LAYOUT_I_MAJOR; +#endif // defined(RDNA3) + typedef tile<16, 4, int, input_layout> tile_A; + typedef tile<16, 4, int, input_layout> tile_B; typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C; constexpr int granularity = mmq_get_granularity_device(mmq_x); constexpr int rows_per_warp = granularity;
@@ -1501,9 +1516,13 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma( } } #elif defined(AMD_WMMA_AVAILABLE) //wmma instructions can handle 16x4 tiles, does not require loading 64x2 tiles - - typedef tile<16, 4, int, DATA_LAYOUT_I_MAJOR> tile_A; - typedef tile<16, 4, int, DATA_LAYOUT_I_MAJOR> tile_B; +#if defined(RDNA3) + constexpr data_layout input_layout = 
DATA_LAYOUT_I_MAJOR_DUAL; +#else + constexpr data_layout input_layout = DATA_LAYOUT_I_MAJOR; +#endif // defined(RDNA3) + typedef tile<16, 4, int, input_layout> tile_A; + typedef tile<16, 4, int, input_layout> tile_B; typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C; constexpr int granularity = mmq_get_granularity_device(mmq_x); constexpr int rows_per_warp = granularity;
@@ -2316,8 +2335,13 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma( } } #elif defined(AMD_WMMA_AVAILABLE) //wmma instructions can handle 16x4 tiles, does not require loading 64x2 tiles - typedef tile<16, 4, int, DATA_LAYOUT_I_MAJOR> tile_A; - typedef tile<16, 4, int, DATA_LAYOUT_I_MAJOR> tile_B; +#if defined(RDNA3) + constexpr data_layout input_layout = DATA_LAYOUT_I_MAJOR_DUAL; +#else + constexpr data_layout input_layout = DATA_LAYOUT_I_MAJOR; +#endif // defined(RDNA3) + typedef tile<16, 4, int, input_layout> tile_A; + typedef tile<16, 4, int, input_layout> tile_B; typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C; constexpr int granularity = mmq_get_granularity_device(mmq_x); constexpr int rows_per_warp = granularity;
From 8b26bc388aeac8125dc2009f84f6872e06a55176 Mon Sep 17 00:00:00 2001 From: zhang hui Date: Sat, 13 Dec 2025 17:05:22 +0800 Subject: [PATCH 05/10] align i-major and j-major --- ggml/src/ggml-cuda/mma.cuh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh index e56da4329b..48f81e4955 100644 --- a/ggml/src/ggml-cuda/mma.cuh +++ b/ggml/src/ggml-cuda/mma.cuh
@@ -442,8 +442,7 @@ namespace ggml_cuda_mma { T x[ne] = {0}; static constexpr __device__ bool supported() { - if (I == 16 && J == 16) return true; - return false; + return tile<I_, J_, T>::supported(); } static __device__ __forceinline__ int get_i(const int l) {
From afb0e3d5577a9d7b0b7b6ba1eed8e42b1fb1b2c5 Mon Sep 17 00:00:00 2001 From: zhang hui Date: Sat, 13 Dec 2025 17:24:39 +0800 Subject: [PATCH 06/10] cdna --- ggml/src/ggml-cuda/mma.cuh | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-)
diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh index 48f81e4955..70bc60d320 100644 --- a/ggml/src/ggml-cuda/mma.cuh +++ b/ggml/src/ggml-cuda/mma.cuh
@@ -123,9 +123,9 @@ namespace ggml_cuda_mma { } else if constexpr (I == 32 && J == 4) { return threadIdx.x % 32; } else if constexpr (I == 16 && J == 16) { - return 4 * (threadIdx.x / 16) + l; + return threadIdx.x % 16; } else if constexpr (I == 32 && J == 32) { - return 4 * (threadIdx.x / 32) + 8 * (l / 4) + (l % 4); + return threadIdx.x % 32; } else { NO_DEVICE_CODE; return -1;
@@ -140,9 +140,9 @@ namespace ggml_cuda_mma { } else if constexpr (I == 32 && J == 4) { return 2 * (threadIdx.x / 32) + l; } else if constexpr (I == 16 && J == 16) { - return threadIdx.x % 16; + return 4 * (threadIdx.x / 16) + l; } else if constexpr (I == 32 && J == 32) { - return threadIdx.x % 32; + return 4 * (threadIdx.x / 32) + 8 * (l / 4) + (l % 4); } else { NO_DEVICE_CODE; return -1;
@@ -601,9 +601,7 @@ namespace ggml_cuda_mma { t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)]; } } else { - int64_t * xi = (int64_t *) t.x; - const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I)); - xi[0] = xs[0]; + ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0)); } #elif defined(AMD_WMMA_AVAILABLE) // All wmma layout has continues data when i-major. 
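Aside: the lane-to-element mapping that patches 03-06 settle on for the 16x16 J-major C tile can be sanity-checked on the host. The following standalone C++ sketch is illustrative only and is not part of any patch in this series; the lane loop stands in for threadIdx.x on a 32-wide wave, and the index math mirrors get_i/get_j from the hunks above.

    // Host-side sketch: enumerate the lane -> (i, j) assignment of the
    // 16x16 J-major C tile. ne = 8 values per lane on a 32-wide wave.
    // i_rdna4/i_rdna3 mirror get_i on the two targets; j mirrors get_j.
    #include <cstdio>

    int main() {
        constexpr int ne = 16 * 16 / 32; // values per lane
        for (int lane = 0; lane < 32; ++lane) {
            for (int l = 0; l < ne; ++l) {
                const int i_rdna4 = ne * (lane / 16) + l;  // RDNA4: consecutive rows per half-wave
                const int i_rdna3 = 2 * l + (lane / 16);   // RDNA3: rows interleaved between half-waves
                const int j       = lane % 16;             // both: one column per lane
                printf("lane %2d l %d -> RDNA4 (%2d,%2d), RDNA3 (%2d,%2d)\n",
                       lane, l, i_rdna4, j, i_rdna3, j);
            }
        }
        return 0;
    }

On RDNA4 each lane holds eight consecutive rows of one column, on RDNA3 the two half-waves interleave rows; in both cases the J-major C tile is just the I-major mapping with i and j swapped, which is why patch 03 can implement its get_i/get_j by delegation.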
From 6b8ed41f2be77c67e893b506d2693aa8c68c62d2 Mon Sep 17 00:00:00 2001 From: zhang hui Date: Sat, 13 Dec 2025 17:52:50 +0800 Subject: [PATCH 07/10] fix cuda error --- ggml/src/ggml-cuda/mmq.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index d1c75a22e1..a91d95df15 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -1589,7 +1589,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma( } #elif defined(TURING_MMA_AVAILABLE) - 16, 4, int> tile_A; + typedef tile<16, 4, int> tile_A; typedef tile<16, 8, int> tile_A_8; typedef tile< 8, 4, int> tile_B; typedef tile<16, 8, int> tile_C;
From 6acad9c7599b237232da78f4097f78f762d3366e Mon Sep 17 00:00:00 2001 From: zhang hui Date: Sat, 13 Dec 2025 20:26:39 +0800 Subject: [PATCH 08/10] add missing tile of mfma --- ggml/src/ggml-cuda/mma.cuh | 8 ++++++ ggml/src/ggml-cuda/mmf.cuh | 12 ++------ ggml/src/ggml-cuda/mmq.cuh | 57 +++++++++++++------------------------- 3 files changed, 30 insertions(+), 47 deletions(-)
diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh index 70bc60d320..74e58c322a 100644 --- a/ggml/src/ggml-cuda/mma.cuh +++ b/ggml/src/ggml-cuda/mma.cuh
@@ -93,6 +93,14 @@ namespace ggml_cuda_mma { dl == DATA_LAYOUT_I_MAJOR_DUAL; } + constexpr data_layout get_input_data_layout() { +#if defined(RDNA3) + return DATA_LAYOUT_I_MAJOR_DUAL; +#else + return DATA_LAYOUT_I_MAJOR; +#endif // defined(RDNA3) + } + template struct tile {};
diff --git a/ggml/src/ggml-cuda/mmf.cuh b/ggml/src/ggml-cuda/mmf.cuh index e1b9c6a6b7..e36730948f 100644 --- a/ggml/src/ggml-cuda/mmf.cuh +++ b/ggml/src/ggml-cuda/mmf.cuh
@@ -35,11 +35,7 @@ static __global__ void mul_mat_f( constexpr bool is_tf32 = std::is_same_v<T, float>; constexpr int tile_B_I = is_tf32 ? 8 : 16; constexpr int tile_C_J = is_tf32 ? 8 : 16; -#if defined(RDNA3) - constexpr data_layout ab_layout = is_tf32 ? DATA_LAYOUT_I_MAJOR : DATA_LAYOUT_I_MAJOR_DUAL; -#else - constexpr data_layout ab_layout = DATA_LAYOUT_I_MAJOR; -#endif // #if defined(RDNA3) + constexpr data_layout ab_layout = is_tf32 ? DATA_LAYOUT_I_MAJOR : get_input_data_layout(); typedef tile<16, 8, T, ab_layout> tile_A; typedef tile<tile_B_I, 8, T, ab_layout> tile_B; typedef tile<16, tile_C_J, float, DATA_LAYOUT_J_MAJOR> tile_C;
@@ -281,11 +277,7 @@ static __global__ void mul_mat_f_ids( constexpr bool is_tf32 = std::is_same_v<T, float>; constexpr int tile_B_I = is_tf32 ? 8 : 16; constexpr int tile_C_J = is_tf32 ? 8 : 16; -#if defined(RDNA3) - constexpr data_layout ab_layout = is_tf32 ? DATA_LAYOUT_I_MAJOR : DATA_LAYOUT_I_MAJOR_DUAL; -#else - constexpr data_layout ab_layout = DATA_LAYOUT_I_MAJOR; -#endif // #if defined(RDNA3) + constexpr data_layout ab_layout = is_tf32 ? 
DATA_LAYOUT_I_MAJOR : get_input_data_layout(); typedef tile<16, 8, T, ab_layout> tile_A; typedef tile<tile_B_I, 8, T, ab_layout> tile_B; typedef tile<16, tile_C_J, float, DATA_LAYOUT_J_MAJOR> tile_C;
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index a91d95df15..fa8a72c9c1 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -797,11 +797,7 @@ template <int mmq_x, int mmq_y, bool need_check> static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma( const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { #if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) -#if defined(RDNA3) - constexpr data_layout input_layout = DATA_LAYOUT_I_MAJOR_DUAL; -#else - constexpr data_layout input_layout = DATA_LAYOUT_I_MAJOR; -#endif // defined(RDNA3) + constexpr data_layout input_layout = get_input_data_layout(); typedef tile<16, 8, int, input_layout> tile_A; typedef tile<16, 8, int, input_layout> tile_B; typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
@@ -966,11 +967,7 @@ template <int mmq_x, int mmq_y, bool need_check> static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma( const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { #if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) -#if defined(RDNA3) - constexpr data_layout input_layout = DATA_LAYOUT_I_MAJOR_DUAL; -#else - constexpr data_layout input_layout = DATA_LAYOUT_I_MAJOR; -#endif // defined(RDNA3) + constexpr data_layout input_layout = get_input_data_layout(); typedef tile<16, 8, int, input_layout> tile_A; typedef tile<16, 8, int, input_layout> tile_B; typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
@@ -1140,10 +1132,11 @@ template static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma( const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { #if defined(AMD_MFMA_AVAILABLE) - typedef tile<16, 8, int> tile_A; - typedef tile<16, 8, int> tile_B; - typedef tile<16, 16, int> tile_C; - typedef tile<64, 2, int> tile_load; + constexpr data_layout input_layout = get_input_data_layout(); + typedef tile<16, 8, int, input_layout> tile_A; + typedef tile<16, 8, int, input_layout> tile_B; + typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C; + typedef tile<64, 2, int, input_layout> tile_load; constexpr int granularity = mmq_get_granularity_device(mmq_x); constexpr int rows_per_warp = granularity;
@@ -1189,11 +1182,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma( } } #elif defined(AMD_WMMA_AVAILABLE) //wmma instructions can handle 16x4 tiles, does not require loading 64x2 tiles -#if defined(RDNA3) - constexpr data_layout input_layout = DATA_LAYOUT_I_MAJOR_DUAL; -#else - constexpr data_layout input_layout = DATA_LAYOUT_I_MAJOR; -#endif // defined(RDNA3) + constexpr data_layout input_layout = get_input_data_layout(); typedef tile<16, 4, int, input_layout> tile_A; typedef tile<16, 4, int, input_layout> tile_B; typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
@@ -1450,10 +1439,11 @@ template static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma( const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { #if defined(AMD_MFMA_AVAILABLE) + constexpr data_layout input_layout = get_input_data_layout(); - typedef tile<16, 8, int> tile_A; - typedef tile<16, 8, int> tile_B; - typedef tile<16, 16, int> tile_C; - typedef tile<64, 2, int> tile_load; + typedef tile<16, 8, int, input_layout> tile_A; + typedef tile<16, 8, int, input_layout> tile_B; + typedef tile<16, 16, int, 
DATA_LAYOUT_J_MAJOR> tile_C; + typedef tile<64, 2, int, input_layout> tile_load; constexpr int granularity = mmq_get_granularity_device(mmq_x); constexpr int rows_per_warp = granularity;
@@ -1516,11 +1506,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma( } } #elif defined(AMD_WMMA_AVAILABLE) //wmma instructions can handle 16x4 tiles, does not require loading 64x2 tiles -#if defined(RDNA3) - constexpr data_layout input_layout = DATA_LAYOUT_I_MAJOR_DUAL; -#else - constexpr data_layout input_layout = DATA_LAYOUT_I_MAJOR; -#endif // defined(RDNA3) + constexpr data_layout input_layout = get_input_data_layout(); typedef tile<16, 4, int, input_layout> tile_A; typedef tile<16, 4, int, input_layout> tile_B; typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
@@ -2284,10 +2270,11 @@ template static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma( const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { #if defined(AMD_MFMA_AVAILABLE) - typedef tile<16, 8, int> tile_A; - typedef tile<16, 8, int> tile_B; - typedef tile<16, 16, int> tile_C; - typedef tile<64, 2, int> tile_load; + constexpr data_layout input_layout = get_input_data_layout(); + typedef tile<16, 8, int, input_layout> tile_A; + typedef tile<16, 8, int, input_layout> tile_B; + typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C; + typedef tile<64, 2, int, input_layout> tile_load; constexpr int granularity = mmq_get_granularity_device(mmq_x); constexpr int rows_per_warp = granularity;
@@ -2335,11 +2322,7 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma( } } #elif defined(AMD_WMMA_AVAILABLE) //wmma instructions can handle 16x4 tiles, does not require loading 64x2 tiles -#if defined(RDNA3) - constexpr data_layout input_layout = DATA_LAYOUT_I_MAJOR_DUAL; -#else - constexpr data_layout input_layout = DATA_LAYOUT_I_MAJOR; -#endif // defined(RDNA3) + constexpr data_layout input_layout = get_input_data_layout(); typedef tile<16, 4, int, input_layout> tile_A; typedef tile<16, 4, int, input_layout> tile_B; typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
From cffa070b08b26893117e212f5a6b16c73ded1d5a Mon Sep 17 00:00:00 2001 From: zhang hui Date: Tue, 16 Dec 2025 10:01:30 +0800 Subject: [PATCH 09/10] fix j-major wrong ne on CDNA --- ggml/src/ggml-cuda/mma.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh index 74e58c322a..96d25c5801 100644 --- a/ggml/src/ggml-cuda/mma.cuh +++ b/ggml/src/ggml-cuda/mma.cuh
@@ -446,7 +446,7 @@ namespace ggml_cuda_mma { static constexpr int J = J_; static constexpr data_layout dl = DATA_LAYOUT_J_MAJOR; - static constexpr int ne = I * J / 32; + static constexpr int ne = tile<I_, J_, T>::ne; T x[ne] = {0}; static constexpr __device__ bool supported() {
From cad07fa4b57c43594173298252c659f8f8713590 Mon Sep 17 00:00:00 2001 From: zhang hui Date: Tue, 16 Dec 2025 21:27:14 +0800 Subject: [PATCH 10/10] fix grammar and empty spaces --- ggml/src/ggml-cuda/mma.cuh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh index 96d25c5801..794d90bdd1 100644 --- a/ggml/src/ggml-cuda/mma.cuh +++ b/ggml/src/ggml-cuda/mma.cuh
@@ -533,9 +533,9 @@ namespace ggml_cuda_mma { static constexpr int I = I_; static constexpr int J = J_; static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR_DUAL; - + static constexpr int ne = I * J / 32 * 2; - + T x[ne] = {0}; static constexpr __device__ bool supported() {
@@ -612,7 
+612,7 @@ namespace ggml_cuda_mma { ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0)); } #elif defined(AMD_WMMA_AVAILABLE) - // All wmma layout has continues data when i-major. + // All wmma layouts have contiguous data when i-major. if constexpr (is_i_major(dl)) { // the data must be aligned to 16 bytes when bigger than ggml_cuda_get_max_cpy_bytes() constexpr int aligned_copy_bytes = ggml_cuda_get_max_cpy_bytes();
@@ -900,7 +900,7 @@ namespace ggml_cuda_mma { NO_DEVICE_CODE; #endif // AMPERE_MMA_AVAILABLE } - + template <data_layout dl_d, data_layout dl_ab> static __device__ __forceinline__ void mma( tile<16, 16, float, dl_d> & D, const tile<16, 8, half2, dl_ab> & A, const tile<16, 8, half2, dl_ab> & B) {
@@ -954,7 +954,7 @@ namespace ggml_cuda_mma { NO_DEVICE_CODE; #endif // TURING_MMA_AVAILABLE } - + template <data_layout dl_d, data_layout dl_ab> static __device__ __forceinline__ void mma( tile<16, 16, float, dl_d> & D, const tile<16, 8, nv_bfloat162, dl_ab> & A, const tile<16, 8, nv_bfloat162, dl_ab> & B) {
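A note on the chunked i-major copy that load_generic gains in patch 03 and that patch 06 reuses for CDNA: it can be illustrated on the host. This is a minimal sketch, not part of the series, assuming a 16-byte maximum copy width (what ggml_cuda_get_max_cpy_bytes() is taken to return on these targets); memcpy stands in for ggml_cuda_memcpy_1, and the lane-dependent parts of get_i/get_j are folded into i0/j0.

    // Host-side sketch of the chunked i-major fragment copy: a fragment
    // larger than the widest copy is split into aligned chunks, each read
    // at the j offset of the chunk's first element.
    #include <cstdio>
    #include <cstring>

    template <typename T, int ne>
    void load_i_major(T (&x)[ne], const T * xs0, int stride, int i0, int j0) {
        constexpr int max_cpy_bytes = 16;                 // assumed max copy width
        if constexpr (sizeof(x) > max_cpy_bytes) {
            static_assert(sizeof(x) % max_cpy_bytes == 0, "bad type size");
            constexpr int chunks       = sizeof(x) / max_cpy_bytes;
            constexpr int ne_per_chunk = ne / chunks;
            for (int c = 0; c < chunks; ++c) {            // one wide copy per chunk
                memcpy(x + ne_per_chunk * c,
                       xs0 + i0 * stride + j0 + ne_per_chunk * c, max_cpy_bytes);
            }
        } else {
            memcpy(x, xs0 + i0 * stride + j0, sizeof(x)); // single copy fits
        }
    }

    int main() {
        int src[16 * 8];                                  // one 16x8 int tile, row-major
        for (int k = 0; k < 16 * 8; ++k) src[k] = k;
        int frag[8];                                      // 32 bytes -> 2 chunks of 16
        load_i_major(frag, src, 8, /*i0=*/3, /*j0=*/0);
        printf("%d %d\n", frag[0], frag[7]);              // prints 24 31: row 3, j = 0..7
        return 0;
    }

Splitting at aligned_copy_bytes keeps every copy within the widest naturally aligned vector access, so a 32- or 64-byte per-lane fragment compiles to a few wide loads instead of an element-wise loop.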