From 318cb5b80cad235a09f366468addc588cd33e578 Mon Sep 17 00:00:00 2001 From: zhang hui Date: Sat, 13 Dec 2025 13:42:29 +0800 Subject: [PATCH 01/10] mma.cuh for rdna4 --- ggml/src/ggml-cuda/mma.cuh | 122 ++++++++++++++++++++++++++++++++++--- ggml/src/ggml-cuda/mmf.cuh | 32 +++++++--- 2 files changed, 134 insertions(+), 20 deletions(-)
diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh index dcfa40f4d5..16f28f6ab9 100644 --- a/ggml/src/ggml-cuda/mma.cuh +++ b/ggml/src/ggml-cuda/mma.cuh
@@ -76,9 +76,11 @@ namespace ggml_cuda_mma { // For the A/C matrices this means I major == row major, J major == column major. // For the B matrix this means I major == column major, J major == row major. // MIRRORED == Each data value is held exactly once per thread subgroup. - DATA_LAYOUT_I_MAJOR = 0, // Always used for Turing, Ampere, Ada Lovelace, consumer Blackwell. - DATA_LAYOUT_I_MAJOR_MIRRORED = 10, - DATA_LAYOUT_J_MAJOR_MIRRORED = 20, + DATA_LAYOUT_I_MAJOR = 0, // Always used for Turing, Ampere, Ada Lovelace, consumer Blackwell; matrices A&B for RDNA4 and CDNA. + DATA_LAYOUT_J_MAJOR = 10, // Matrix C for CDNA and RDNA4; int and float matrix C for RDNA3. + DATA_LAYOUT_I_MAJOR_MIRRORED = 20, + DATA_LAYOUT_J_MAJOR_MIRRORED = 30, + DATA_LAYOUT_I_MAJOR_DUAL = 40, // Matrices A&B for RDNA3. }; // Implemented mma combinations are: // - (I_MAJOR, I_MAJOR) -> I_MAJOR
@@ -458,6 +460,46 @@ namespace ggml_cuda_mma { #endif // defined(AMD_WMMA_AVAILABLE) }; + template <int I_, int J_, typename T> + struct tile<I_, J_, T, DATA_LAYOUT_J_MAJOR> { + static constexpr int I = I_; + static constexpr int J = J_; + static constexpr data_layout dl = DATA_LAYOUT_J_MAJOR; + + static constexpr int ne = I * J / 32; + T x[ne] = {0}; + + static constexpr __device__ bool supported() { + if (I == 16 && J == 16) return true; + return false; + } + + static __device__ __forceinline__ int get_i(const int l) { + if constexpr (I == 16 && J == 16) { +#if defined(RDNA4) + return 8 * (threadIdx.x / 16) + l; +#elif defined(RDNA3) + return 2 * l + (threadIdx.x / 16); +#else + NO_DEVICE_CODE; + return -1; +#endif // defined(RDNA4) + } else { + NO_DEVICE_CODE; + return -1; + } + } + + static __device__ __forceinline__ int get_j(const int l) { + if constexpr (I == 16 && J == 16) { + return threadIdx.x % 16; + } else { + NO_DEVICE_CODE; + return -1; + } + } + }; + template struct tile { static constexpr int I = I_; static constexpr int J = J_;
@@ -524,6 +566,63 @@ namespace ggml_cuda_mma { } }; + template <int I_, int J_> + struct tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR_DUAL> { + static constexpr int I = I_; + static constexpr int J = J_; + static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR_DUAL; + + static constexpr int ne = I * J / 32 * 2; + + half2 x[ne] = {{0.0f, 0.0f}}; + + static constexpr __device__ bool supported() { + if (I == 16 && J == 8) return true; + return false; + } + + static __device__ __forceinline__ int get_i(const int l) { + if constexpr (I == 16 && J == 8) { + return threadIdx.x % 16; + } else { + NO_DEVICE_CODE; + return -1; + } + } + + static __device__ __forceinline__ int get_j(const int l) { + if constexpr (I == 16 && J == 8) { + return l; + } else { + NO_DEVICE_CODE; + return -1; + } + } + }; + + template <int I_, int J_> + struct tile<I_, J_, nv_bfloat162, DATA_LAYOUT_I_MAJOR_DUAL> { + static constexpr int I = I_; + static constexpr int J = J_; + static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR_DUAL; + + static constexpr int ne = I * J / 32 * 2; + + nv_bfloat162 x[ne] = {{0.0f, 0.0f}}; + + static constexpr __device__ bool supported() { + return tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR_DUAL>::supported(); + } + + static __device__ __forceinline__ int get_i(const int l) { + return tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR_DUAL>::get_i(l); + } + + static __device__ __forceinline__ int 
get_j(const int l) { + return tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR_DUAL>::get_j(l); + } + }; + #if defined(TURING_MMA_AVAILABLE) template <int I, int J> static __device__ __forceinline__ tile<I, J/2, half2> get_half2(const tile<I, J, float> & tile_float) {
@@ -660,9 +759,9 @@ namespace ggml_cuda_mma { #endif // TURING_MMA_AVAILABLE } - template <typename T> + template <typename T, data_layout dl> static __device__ __forceinline__ void load_ldmatrix( - tile<16, 8, T> & t, const T * __restrict__ xs0, const int stride) { + tile<16, 8, T, dl> & t, const T * __restrict__ xs0, const int stride) { #if defined(TURING_MMA_AVAILABLE) int * xi = (int * ) t.x; const int * xs = (const int *) xs0 + (threadIdx.x % t.I) * stride + (threadIdx.x / t.I) * (t.J / 2);
@@ -832,8 +931,9 @@ namespace ggml_cuda_mma { #endif // TURING_MMA_AVAILABLE } + template <data_layout DLayout, data_layout ABLayout> static __device__ __forceinline__ void mma( - tile<16, 8, float> & D, const tile<16, 8, float> & A, const tile<8, 8, float> & B) { + tile<16, 8, float, DLayout> & D, const tile<16, 8, float, ABLayout> & A, const tile<8, 8, float, ABLayout> & B) { #ifdef AMPERE_MMA_AVAILABLE const int * Axi = (const int *) A.x; const int * Bxi = (const int *) B.x;
@@ -886,9 +986,10 @@ namespace ggml_cuda_mma { NO_DEVICE_CODE; #endif // AMPERE_MMA_AVAILABLE } - + + template <data_layout DLayout, data_layout ABLayout> static __device__ __forceinline__ void mma( - tile<16, 16, float> & D, const tile<16, 8, half2> & A, const tile<16, 8, half2> & B) { + tile<16, 16, float, DLayout> & D, const tile<16, 8, half2, ABLayout> & A, const tile<16, 8, half2, ABLayout> & B) { #ifdef TURING_MMA_AVAILABLE const int * Axi = (const int *) A.x; const int * Bxi = (const int *) B.x;
@@ -939,9 +1040,10 @@ namespace ggml_cuda_mma { NO_DEVICE_CODE; #endif // TURING_MMA_AVAILABLE } - + + template <data_layout DLayout, data_layout ABLayout> static __device__ __forceinline__ void mma( - tile<16, 16, float> & D, const tile<16, 8, nv_bfloat162> & A, const tile<16, 8, nv_bfloat162> & B) { + tile<16, 16, float, DLayout> & D, const tile<16, 8, nv_bfloat162, ABLayout> & A, const tile<16, 8, nv_bfloat162, ABLayout> & B) { #if defined(AMD_WMMA_AVAILABLE) #if defined(RDNA4) using bf16x8_t = __attribute__((ext_vector_type(8))) __bf16;
diff --git a/ggml/src/ggml-cuda/mmf.cuh b/ggml/src/ggml-cuda/mmf.cuh index e1c695c5c0..e1b9c6a6b7 100644 --- a/ggml/src/ggml-cuda/mmf.cuh +++ b/ggml/src/ggml-cuda/mmf.cuh
@@ -32,11 +32,17 @@ static __global__ void mul_mat_f( #if (!defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)) || defined(AMD_WMMA_AVAILABLE) #if defined(AMD_WMMA_AVAILABLE) // Special case for tf32: just a dummy mma layout, as wmma doesn't support it. - constexpr int tile_B_I = std::is_same_v<T, float> ? 8 : 16; - constexpr int tile_C_J = std::is_same_v<T, float> ? 8 : 16; - typedef tile<16, 8, T> tile_A; - typedef tile<tile_B_I, 8, T> tile_B; - typedef tile<16, tile_C_J, float> tile_C; + constexpr bool is_tf32 = std::is_same_v<T, float>; + constexpr int tile_B_I = is_tf32 ? 8 : 16; + constexpr int tile_C_J = is_tf32 ? 8 : 16; +#if defined(RDNA3) + constexpr data_layout ab_layout = is_tf32 ? DATA_LAYOUT_I_MAJOR : DATA_LAYOUT_I_MAJOR_DUAL; +#else + constexpr data_layout ab_layout = DATA_LAYOUT_I_MAJOR; +#endif // #if defined(RDNA3) + typedef tile<16, 8, T, ab_layout> tile_A; + typedef tile<tile_B_I, 8, T, ab_layout> tile_B; + typedef tile<16, tile_C_J, float, DATA_LAYOUT_J_MAJOR> tile_C; #else #ifdef VOLTA_MMA_AVAILABLE if constexpr (!std::is_same_v<T, half2>) {NO_DEVICE_CODE;} else {
@@ -272,11 +278,17 @@ static __global__ void mul_mat_f_ids( #if (!defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)) || defined(AMD_WMMA_AVAILABLE) #if defined(AMD_WMMA_AVAILABLE) // Special case for tf32: just a dummy mma layout, as wmma doesn't support it. - constexpr int tile_B_I = std::is_same_v<T, float> ? 
8 : 16; - constexpr int tile_C_J = std::is_same_v<T, float> ? 8 : 16; - typedef tile<16, 8, T> tile_A; - typedef tile<tile_B_I, 8, T> tile_B; - typedef tile<16, tile_C_J, float> tile_C; + constexpr bool is_tf32 = std::is_same_v<T, float>; + constexpr int tile_B_I = is_tf32 ? 8 : 16; + constexpr int tile_C_J = is_tf32 ? 8 : 16; +#if defined(RDNA3) + constexpr data_layout ab_layout = is_tf32 ? DATA_LAYOUT_I_MAJOR : DATA_LAYOUT_I_MAJOR_DUAL; +#else + constexpr data_layout ab_layout = DATA_LAYOUT_I_MAJOR; +#endif // #if defined(RDNA3) + typedef tile<16, 8, T, ab_layout> tile_A; + typedef tile<tile_B_I, 8, T, ab_layout> tile_B; + typedef tile<16, tile_C_J, float, DATA_LAYOUT_J_MAJOR> tile_C; #else #ifdef VOLTA_MMA_AVAILABLE if constexpr (!std::is_same_v<T, half2>) {NO_DEVICE_CODE;} else {
From 074b93146e410aac4de51c197a75814e62671154 Mon Sep 17 00:00:00 2001 From: zhang hui Date: Sat, 13 Dec 2025 14:10:56 +0800 Subject: [PATCH 02/10] mma for rdna3 --- ggml/src/ggml-cuda/mma.cuh | 53 +++++++------------------------------- 1 file changed, 9 insertions(+), 44 deletions(-)
diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh index 16f28f6ab9..c4016a49eb 100644 --- a/ggml/src/ggml-cuda/mma.cuh +++ b/ggml/src/ggml-cuda/mma.cuh
@@ -295,12 +295,7 @@ namespace ggml_cuda_mma { } } #elif defined(AMD_WMMA_AVAILABLE) -#if defined(RDNA3) - // RDNA3 has duplicated data as input. - static constexpr int ne = I * J / 32 * 2; -#else static constexpr int ne = I * J / 32; -#endif // defined(RDNA3) half2 x[ne] = {{0.0f, 0.0f}}; static constexpr __device__ bool supported() {
@@ -319,14 +314,7 @@ namespace ggml_cuda_mma { static __device__ __forceinline__ int get_j(const int l) { if constexpr (I == 16 && J == 8) { -#if defined(RDNA4) return 4 * (threadIdx.x / 16) + l; -#elif defined(RDNA3) - return l; -#else - NO_DEVICE_CODE; - return -1; -#endif // defined(RDNA4) } else { NO_DEVICE_CODE; return -1;
@@ -384,42 +372,19 @@ namespace ggml_cuda_mma { static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR; #if defined(AMD_WMMA_AVAILABLE) -#if defined(RDNA3) - // RDNA3 has duplicated data as input. 
- static constexpr int ne = I * J / 32 * 2; -#else static constexpr int ne = I * J / 32; -#endif // defined(RDNA3) nv_bfloat162 x[ne] = {{0.0f, 0.0f}}; static constexpr __device__ bool supported() { - if (I == 16 && J == 8) return true; - return false; + return tile<I_, J_, half2>::supported(); } static __device__ __forceinline__ int get_i(const int l) { - if constexpr (I == 16 && J == 8) { - return threadIdx.x % 16; - } else { - NO_DEVICE_CODE; - return -1; - } + return tile<I_, J_, half2>::get_i(l); } static __device__ __forceinline__ int get_j(const int l) { - if constexpr (I == 16 && J == 8) { -#if defined(RDNA4) - return 4 * (threadIdx.x / 16) + l; -#elif defined(RDNA3) - return l; -#else - NO_DEVICE_CODE; - return -1; -#endif // defined(RDNA4) - } else { - NO_DEVICE_CODE; - return -1; - } + return tile<I_, J_, half2>::get_j(l); } #else static constexpr int ne = I * J / WARP_SIZE;
@@ -931,9 +896,9 @@ namespace ggml_cuda_mma { #endif // TURING_MMA_AVAILABLE } - template <data_layout DLayout, data_layout ABLayout> + template <data_layout dl_d, data_layout dl_ab> static __device__ __forceinline__ void mma( - tile<16, 8, float, DLayout> & D, const tile<16, 8, float, ABLayout> & A, const tile<8, 8, float, ABLayout> & B) { + tile<16, 8, float, dl_d> & D, const tile<16, 8, float, dl_ab> & A, const tile<8, 8, float, dl_ab> & B) { #ifdef AMPERE_MMA_AVAILABLE const int * Axi = (const int *) A.x; const int * Bxi = (const int *) B.x;
@@ -987,9 +952,9 @@ namespace ggml_cuda_mma { NO_DEVICE_CODE; #endif // AMPERE_MMA_AVAILABLE } - template <data_layout DLayout, data_layout ABLayout> + template <data_layout dl_d, data_layout dl_ab> static __device__ __forceinline__ void mma( - tile<16, 16, float, DLayout> & D, const tile<16, 8, half2, ABLayout> & A, const tile<16, 8, half2, ABLayout> & B) { + tile<16, 16, float, dl_d> & D, const tile<16, 8, half2, dl_ab> & A, const tile<16, 8, half2, dl_ab> & B) { #ifdef TURING_MMA_AVAILABLE const int * Axi = (const int *) A.x; const int * Bxi = (const int *) B.x;
@@ -1041,9 +1006,9 @@ namespace ggml_cuda_mma { NO_DEVICE_CODE; #endif // TURING_MMA_AVAILABLE } - template <data_layout DLayout, data_layout ABLayout> + template <data_layout dl_d, data_layout dl_ab> static __device__ __forceinline__ void mma( - tile<16, 16, float, DLayout> & D, const tile<16, 8, nv_bfloat162, ABLayout> & A, const tile<16, 8, nv_bfloat162, ABLayout> & B) { + tile<16, 16, float, dl_d> & D, const tile<16, 8, nv_bfloat162, dl_ab> & A, const tile<16, 8, nv_bfloat162, dl_ab> & B) { #if defined(AMD_WMMA_AVAILABLE) #if defined(RDNA4) using bf16x8_t = __attribute__((ext_vector_type(8))) __bf16;
From 98846cb9ee7dcae8d28adfa4769689953308c18c Mon Sep 17 00:00:00 2001 From: zhang hui Date: Sat, 13 Dec 2025 16:35:51 +0800 Subject: [PATCH 03/10] mmq for rdna4 --- ggml/src/ggml-cuda/mma.cuh | 158 ++++++++++++------------------------- ggml/src/ggml-cuda/mmq.cuh | 34 ++++---- 2 files changed, 69 insertions(+), 123 deletions(-)
diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh index c4016a49eb..8a53f19341 100644 --- a/ggml/src/ggml-cuda/mma.cuh +++ b/ggml/src/ggml-cuda/mma.cuh
@@ -87,6 +87,12 @@ namespace ggml_cuda_mma { // - (I_MAJOR, I_MAJOR_MIRRORED) -> I_MAJOR // - (I_MAJOR, J_MAJOR_MIRRORED) -> I_MAJOR + constexpr bool is_i_major(const data_layout dl) { + return dl == DATA_LAYOUT_I_MAJOR || + dl == DATA_LAYOUT_I_MAJOR_MIRRORED || + dl == DATA_LAYOUT_I_MAJOR_DUAL; + } + template <int I_, int J_, typename T, data_layout dl = DATA_LAYOUT_I_MAJOR> struct tile {};
@@ -173,28 +179,19 @@ namespace ggml_cuda_mma { } } #elif defined(AMD_WMMA_AVAILABLE) -#if defined(RDNA4) static constexpr int ne = I * J / 32; -#elif defined(RDNA3) - static constexpr int ne = (I == 16 && J == 16) ? 
I * J / 32 : I * J / 16; -#endif // defined(RDNA4) T x[ne] = {0}; static constexpr __device__ bool supported() { if (I == 16 && J == 16) return true; + if (I == 16 && J == 8) return true; + if (I == 16 && J == 4) return true; return false; } static __device__ __forceinline__ int get_i(const int l) { - if constexpr (I == 16 && J == 16) { -#if defined(RDNA4) - return 8 * (threadIdx.x / 16) + l; -#elif defined(RDNA3) - return 2 * l + (threadIdx.x / 16); -#else - NO_DEVICE_CODE; - return -1; -#endif // defined(RDNA4) + if constexpr (supported()) { + return threadIdx.x % 16; } else { NO_DEVICE_CODE; return -1;
@@ -203,7 +200,17 @@ namespace ggml_cuda_mma { static __device__ __forceinline__ int get_j(const int l) { if constexpr (I == 16 && J == 16) { + // matrix C +#if defined(RDNA3) return 2 * l + (threadIdx.x / 16); +#else + return ne * (threadIdx.x / 16) + l; +#endif // defined(RDNA3) + } else if constexpr (I == 16 && J == 8) { + // mmq input for RDNA4 + return ne * (threadIdx.x / 16) + l; + } else if constexpr (I == 16 && J == 4) { + return ne * (threadIdx.x / 16) + l; } else { NO_DEVICE_CODE; return -1;
@@ -440,28 +447,11 @@ namespace ggml_cuda_mma { } static __device__ __forceinline__ int get_i(const int l) { - if constexpr (I == 16 && J == 16) { -#if defined(RDNA4) - return 8 * (threadIdx.x / 16) + l; -#elif defined(RDNA3) - return 2 * l + (threadIdx.x / 16); -#else - NO_DEVICE_CODE; - return -1; -#endif // defined(RDNA4) - } else { - NO_DEVICE_CODE; - return -1; - } + return tile<I_, J_, T>::get_j(l); } static __device__ __forceinline__ int get_j(const int l) { - if constexpr (I == 16 && J == 16) { - return threadIdx.x % 16; - } else { - NO_DEVICE_CODE; - return -1; - } + return tile<I_, J_, T>::get_i(l); } };
@@ -531,23 +521,25 @@ namespace ggml_cuda_mma { } }; - template <int I_, int J_> - struct tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR_DUAL> { + template <int I_, int J_, typename T> + struct tile<I_, J_, T, DATA_LAYOUT_I_MAJOR_DUAL> { static constexpr int I = I_; static constexpr int J = J_; static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR_DUAL; static constexpr int ne = I * J / 32 * 2; - half2 x[ne] = {{0.0f, 0.0f}}; + T x[ne] = {{0.0f, 0.0f}}; static constexpr __device__ bool supported() { + if (I == 16 && J == 16) return true; if (I == 16 && J == 8) return true; + if (I == 16 && J == 4) return true; return false; } static __device__ __forceinline__ int get_i(const int l) { - if constexpr (I == 16 && J == 8) { + if constexpr (supported()) { return threadIdx.x % 16; } else { NO_DEVICE_CODE; return -1; } } static __device__ __forceinline__ int get_j(const int l) { - if constexpr (I == 16 && J == 8) { + if constexpr (supported()) { return l; } else { NO_DEVICE_CODE; return -1; } } }; - template <int I_, int J_> - struct tile<I_, J_, nv_bfloat162, DATA_LAYOUT_I_MAJOR_DUAL> { - static constexpr int I = I_; - static constexpr int J = J_; - static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR_DUAL; - static constexpr int ne = I * J / 32 * 2; - nv_bfloat162 x[ne] = {{0.0f, 0.0f}}; - static constexpr __device__ bool supported() { - return tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR_DUAL>::supported(); - } - static __device__ __forceinline__ int get_i(const int l) { - return tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR_DUAL>::get_i(l); - } - static __device__ __forceinline__ int get_j(const int l) { - return tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR_DUAL>::get_j(l); - } - }; #if defined(TURING_MMA_AVAILABLE) template <int I, int J> static __device__ __forceinline__ tile<I, J/2, half2> get_half2(const tile<I, J, float> & tile_float) {
@@ -638,50 +607,25 @@ namespace ggml_cuda_mma { xi[0] = xs[0]; } #elif defined(AMD_WMMA_AVAILABLE) - if constexpr (std::is_same_v<T, half2> || std::is_same_v<T, nv_bfloat162>) { -#if defined(RDNA4) - 
ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0)); -#elif defined(RDNA3) - ggml_cuda_memcpy_1<sizeof(t.x)/2>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0)); - ggml_cuda_memcpy_1<sizeof(t.x)/2>(t.x + t.ne/2, xs0 + t.get_i(0) * stride + t.get_j(t.ne/2)); -#else - NO_DEVICE_CODE; -#endif // defined(RDNA4) - } else if constexpr (std::is_same_v<T, int>) { - if constexpr (I == 16 && J == 4) { - int64_t * xi = (int64_t *) t.x; -#if defined(RDNA4) - const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I)); - xi[0] = xs[0]; -#elif defined(RDNA3) - static_assert(tile<I, J, T>::ne >= 4, "fragment too small"); - const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride); - xi[0] = xs[0]; - xi[1] = xs[1]; -#endif // defined(RDNA4) - } else if constexpr (I == 16 && J == 8) { - int64_t * xi = (int64_t *) t.x; -#if defined(RDNA4) - const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I)); - xi[0] = xs[0]; - const int64_t * xs1 = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I) + 2); - xi[1] = xs1[0]; -#elif defined(RDNA3) - static_assert(tile<I, J, T>::ne >= 8, "fragment too small"); - const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride); - // contiguous four 64-bit chunks per lane for the wider RDNA3 fragment - xi[0] = xs[0]; - xi[1] = xs[1]; - const int64_t * xs1 = xs + 2; - xi[2] = xs1[0]; - xi[3] = xs1[1]; -#endif // defined(RDNA4) + // All wmma layout has continues data when i-major. + if constexpr (is_i_major(dl)) { + // the data must be aligned to 16 bytes when bigger than ggml_cuda_get_max_cpy_bytes() + constexpr int aligned_copy_bytes = ggml_cuda_get_max_cpy_bytes(); + if constexpr (sizeof(t.x) > aligned_copy_bytes) { + static_assert(sizeof(t.x) % aligned_copy_bytes == 0, "bad type size"); + constexpr int aligned_copy_count = sizeof(t.x)/aligned_copy_bytes; +#pragma unroll + for (int i = 0; i < aligned_copy_count; ++i) { + ggml_cuda_memcpy_1<aligned_copy_bytes>(t.x + t.ne/aligned_copy_count*i, xs0 + t.get_i(0) * stride + t.get_j(t.ne/aligned_copy_count*i)); + } } else { + ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0)); + } } else { - NO_DEVICE_CODE; +#pragma unroll + for (int l = 0; l < t.ne; ++l) { + t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)]; + } } - } else { - NO_DEVICE_CODE; } #else #pragma unroll
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index 1298f99fff..e748f24c3a 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -797,9 +797,9 @@ template <int mmq_x, int mmq_y, bool need_check>
 static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma( const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { #if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) - typedef tile<16, 8, int> tile_A; - typedef tile<16, 8, int> tile_B; - typedef tile<16, 16, int> tile_C; + typedef tile<16, 8, int, DATA_LAYOUT_I_MAJOR> tile_A; + typedef tile<16, 8, int, DATA_LAYOUT_I_MAJOR> tile_B; + typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C; constexpr int granularity = mmq_get_granularity_device(mmq_x); constexpr int rows_per_warp = granularity;
@@ -966,9 +966,9 @@ template <int mmq_x, int mmq_y, bool need_check> static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma( const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { #if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) - typedef tile<16, 8, int> tile_A; - typedef tile<16, 8, int> tile_B; - typedef tile<16, 16, int> tile_C; + typedef tile<16, 8, int, DATA_LAYOUT_I_MAJOR> tile_A; + typedef tile<16, 8, int, DATA_LAYOUT_I_MAJOR> tile_B; + typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C; constexpr int granularity = mmq_get_granularity_device(mmq_x); constexpr int rows_per_warp = granularity;
@@ -1179,9 +1179,9 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma( } } #elif defined(AMD_WMMA_AVAILABLE) //wmma instructions can handle 16x4 tiles, does not require loading 64x2 tiles - typedef tile<16, 4, int> tile_A; - typedef tile<16, 4, int> tile_B; - typedef tile<16, 16, int> tile_C; + typedef tile<16, 4, int, DATA_LAYOUT_I_MAJOR> tile_A; + typedef tile<16, 4, int, DATA_LAYOUT_I_MAJOR> tile_B; + typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C; constexpr int granularity = mmq_get_granularity_device(mmq_x); constexpr int rows_per_warp = granularity;
@@ -1502,9 +1502,9 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma( } #elif defined(AMD_WMMA_AVAILABLE) //wmma instructions can handle 16x4 tiles, does not require loading 64x2 tiles - typedef tile<16, 4, int> tile_A; - typedef tile<16, 4, int> tile_B; - typedef tile<16, 16, int> tile_C; + typedef tile<16, 4, int, DATA_LAYOUT_I_MAJOR> tile_A; + typedef tile<16, 4, int, DATA_LAYOUT_I_MAJOR> tile_B; + typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C; constexpr int granularity = mmq_get_granularity_device(mmq_x); constexpr int rows_per_warp = granularity;
@@ -1570,7 +1570,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma( } #elif defined(TURING_MMA_AVAILABLE) - typedef tile<16, 4, int> tile_A; + 16, 4, int> tile_A; typedef tile<16, 8, int> tile_A_8; typedef tile< 8, 4, int> tile_B; typedef tile<16, 8, int> tile_C;
@@ -2316,9 +2316,9 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma( } } #elif defined(AMD_WMMA_AVAILABLE) //wmma instructions can handle 16x4 tiles, does not require loading 64x2 tiles - typedef tile<16, 4, int> tile_A; - typedef tile<16, 4, int> tile_B; - typedef tile<16, 16, int> tile_C; + typedef tile<16, 4, int, DATA_LAYOUT_I_MAJOR> tile_A; + typedef tile<16, 4, int, DATA_LAYOUT_I_MAJOR> tile_B; + typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C; constexpr int granularity = mmq_get_granularity_device(mmq_x); constexpr int rows_per_warp = granularity;
@@ -3015,7 +3015,7 @@ static __device__ __forceinline__ void mmq_write_back_mma( #if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) constexpr int tileC_IJ = mmq_get_granularity_device(0); - typedef tile<tileC_IJ, tileC_IJ, int> tile_C; + typedef tile<tileC_IJ, tileC_IJ, int, DATA_LAYOUT_J_MAJOR> tile_C; constexpr int rows_per_warp = 
granularity; #else typedef tile<16, 8, int> tile_C;
From 62e4954d3f35d9f8e86139595ac0d1f0c912e7e8 Mon Sep 17 00:00:00 2001 From: zhang hui Date: Sat, 13 Dec 2025 16:52:25 +0800 Subject: [PATCH 04/10] mmq for rdna3 --- ggml/src/ggml-cuda/mma.cuh | 2 +- ggml/src/ggml-cuda/mmq.cuh | 46 +++++++++++++++++++++++++++++--------- 2 files changed, 36 insertions(+), 12 deletions(-)
diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh index 8a53f19341..e56da4329b 100644 --- a/ggml/src/ggml-cuda/mma.cuh +++ b/ggml/src/ggml-cuda/mma.cuh
@@ -529,7 +529,7 @@ namespace ggml_cuda_mma { static constexpr int ne = I * J / 32 * 2; - T x[ne] = {{0.0f, 0.0f}}; + T x[ne] = {0}; static constexpr __device__ bool supported() { if (I == 16 && J == 16) return true;
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index e748f24c3a..d1c75a22e1 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -797,8 +797,13 @@ template <int mmq_x, int mmq_y, bool need_check> static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma( const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { #if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) - typedef tile<16, 8, int, DATA_LAYOUT_I_MAJOR> tile_A; - typedef tile<16, 8, int, DATA_LAYOUT_I_MAJOR> tile_B; +#if defined(RDNA3) + constexpr data_layout input_layout = DATA_LAYOUT_I_MAJOR_DUAL; +#else + constexpr data_layout input_layout = DATA_LAYOUT_I_MAJOR; +#endif // defined(RDNA3) + typedef tile<16, 8, int, input_layout> tile_A; + typedef tile<16, 8, int, input_layout> tile_B; typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C; constexpr int granularity = mmq_get_granularity_device(mmq_x); constexpr int rows_per_warp = granularity;
@@ -966,8 +971,13 @@ template <int mmq_x, int mmq_y, bool need_check> static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma( const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { #if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) - typedef tile<16, 8, int, DATA_LAYOUT_I_MAJOR> tile_A; - typedef tile<16, 8, int, DATA_LAYOUT_I_MAJOR> tile_B; +#if defined(RDNA3) + constexpr data_layout input_layout = DATA_LAYOUT_I_MAJOR_DUAL; +#else + constexpr data_layout input_layout = DATA_LAYOUT_I_MAJOR; +#endif // defined(RDNA3) + typedef tile<16, 8, int, input_layout> tile_A; + typedef tile<16, 8, int, input_layout> tile_B; typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C; constexpr int granularity = mmq_get_granularity_device(mmq_x); constexpr int rows_per_warp = granularity;
@@ -1179,8 +1189,13 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma( } } #elif defined(AMD_WMMA_AVAILABLE) //wmma instructions can handle 16x4 tiles, does not require loading 64x2 tiles - typedef tile<16, 4, int, DATA_LAYOUT_I_MAJOR> tile_A; - typedef tile<16, 4, int, DATA_LAYOUT_I_MAJOR> tile_B; +#if defined(RDNA3) + constexpr data_layout input_layout = DATA_LAYOUT_I_MAJOR_DUAL; +#else + constexpr data_layout input_layout = DATA_LAYOUT_I_MAJOR; +#endif // defined(RDNA3) + typedef tile<16, 4, int, input_layout> tile_A; + typedef tile<16, 4, int, input_layout> tile_B; typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C; constexpr int granularity = mmq_get_granularity_device(mmq_x); constexpr int rows_per_warp = granularity;
@@ -1501,9 +1516,13 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma( } } #elif defined(AMD_WMMA_AVAILABLE) //wmma instructions can handle 16x4 tiles, does not require loading 64x2 tiles - - typedef tile<16, 4, int, DATA_LAYOUT_I_MAJOR> tile_A; - typedef tile<16, 4, int, DATA_LAYOUT_I_MAJOR> tile_B; +#if defined(RDNA3) + constexpr data_layout input_layout = 
DATA_LAYOUT_I_MAJOR_DUAL; +#else + constexpr data_layout input_layout = DATA_LAYOUT_I_MAJOR; +#endif // defined(RDNA3) + typedef tile<16, 4, int, input_layout> tile_A; + typedef tile<16, 4, int, input_layout> tile_B; typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C; constexpr int granularity = mmq_get_granularity_device(mmq_x); constexpr int rows_per_warp = granularity;
@@ -2316,8 +2335,13 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma( } } #elif defined(AMD_WMMA_AVAILABLE) //wmma instructions can handle 16x4 tiles, does not require loading 64x2 tiles - typedef tile<16, 4, int, DATA_LAYOUT_I_MAJOR> tile_A; - typedef tile<16, 4, int, DATA_LAYOUT_I_MAJOR> tile_B; +#if defined(RDNA3) + constexpr data_layout input_layout = DATA_LAYOUT_I_MAJOR_DUAL; +#else + constexpr data_layout input_layout = DATA_LAYOUT_I_MAJOR; +#endif // defined(RDNA3) + typedef tile<16, 4, int, input_layout> tile_A; + typedef tile<16, 4, int, input_layout> tile_B; typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C; constexpr int granularity = mmq_get_granularity_device(mmq_x); constexpr int rows_per_warp = granularity;
From 8b26bc388aeac8125dc2009f84f6872e06a55176 Mon Sep 17 00:00:00 2001 From: zhang hui Date: Sat, 13 Dec 2025 17:05:22 +0800 Subject: [PATCH 05/10] align i-major and j-major --- ggml/src/ggml-cuda/mma.cuh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh index e56da4329b..48f81e4955 100644 --- a/ggml/src/ggml-cuda/mma.cuh +++ b/ggml/src/ggml-cuda/mma.cuh
@@ -442,8 +442,7 @@ namespace ggml_cuda_mma { T x[ne] = {0}; static constexpr __device__ bool supported() { - if (I == 16 && J == 16) return true; - return false; + return tile<I_, J_, T>::supported(); } static __device__ __forceinline__ int get_i(const int l) {
From afb0e3d5577a9d7b0b7b6ba1eed8e42b1fb1b2c5 Mon Sep 17 00:00:00 2001 From: zhang hui Date: Sat, 13 Dec 2025 17:24:39 +0800 Subject: [PATCH 06/10] cdna --- ggml/src/ggml-cuda/mma.cuh | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-)
diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh index 48f81e4955..70bc60d320 100644 --- a/ggml/src/ggml-cuda/mma.cuh +++ b/ggml/src/ggml-cuda/mma.cuh
@@ -123,9 +123,9 @@ namespace ggml_cuda_mma { } else if constexpr (I == 32 && J == 4) { return threadIdx.x % 32; } else if constexpr (I == 16 && J == 16) { - return 4 * (threadIdx.x / 16) + l; + return threadIdx.x % 16; } else if constexpr (I == 32 && J == 32) { - return 4 * (threadIdx.x / 32) + 8 * (l / 4) + (l % 4); + return threadIdx.x % 32; } else { NO_DEVICE_CODE; return -1;
@@ -140,9 +140,9 @@ namespace ggml_cuda_mma { } else if constexpr (I == 32 && J == 4) { return 2 * (threadIdx.x / 32) + l; } else if constexpr (I == 16 && J == 16) { - return threadIdx.x % 16; + return 4 * (threadIdx.x / 16) + l; } else if constexpr (I == 32 && J == 32) { - return threadIdx.x % 32; + return 4 * (threadIdx.x / 32) + 8 * (l / 4) + (l % 4); } else { NO_DEVICE_CODE; return -1;
@@ -601,9 +601,7 @@ namespace ggml_cuda_mma { t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)]; } } else { - int64_t * xi = (int64_t *) t.x; - const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I)); - xi[0] = xs[0]; + ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0)); } #elif defined(AMD_WMMA_AVAILABLE) // All wmma layout has continues data when i-major. 
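Aside: the lane-to-element mapping that patches 03-06 settle on for the 16x16 J-major C tile can be sanity-checked on the host. The following standalone C++ sketch is illustrative only and is not part of any patch in this series; the lane loop stands in for threadIdx.x on a 32-wide wave, and the index math mirrors get_i/get_j from the hunks above.

    // Host-side sketch: enumerate the lane -> (i, j) assignment of the
    // 16x16 J-major C tile. ne = 8 values per lane on a 32-wide wave.
    // i_rdna4/i_rdna3 mirror get_i on the two targets; j mirrors get_j.
    #include <cstdio>

    int main() {
        constexpr int ne = 16 * 16 / 32; // values per lane
        for (int lane = 0; lane < 32; ++lane) {
            for (int l = 0; l < ne; ++l) {
                const int i_rdna4 = ne * (lane / 16) + l;  // RDNA4: consecutive rows per half-wave
                const int i_rdna3 = 2 * l + (lane / 16);   // RDNA3: rows interleaved between half-waves
                const int j       = lane % 16;             // both: one column per lane
                printf("lane %2d l %d -> RDNA4 (%2d,%2d), RDNA3 (%2d,%2d)\n",
                       lane, l, i_rdna4, j, i_rdna3, j);
            }
        }
        return 0;
    }

On RDNA4 each lane holds eight consecutive rows of one column, on RDNA3 the two half-waves interleave rows; in both cases the J-major C tile is just the I-major mapping with i and j swapped, which is why patch 03 can implement its get_i/get_j by delegation.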
From 6b8ed41f2be77c67e893b506d2693aa8c68c62d2 Mon Sep 17 00:00:00 2001 From: zhang hui Date: Sat, 13 Dec 2025 17:52:50 +0800 Subject: [PATCH 07/10] fix cuda error --- ggml/src/ggml-cuda/mmq.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index d1c75a22e1..a91d95df15 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -1589,7 +1589,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma( } #elif defined(TURING_MMA_AVAILABLE) - 16, 4, int> tile_A; + typedef tile<16, 4, int> tile_A; typedef tile<16, 8, int> tile_A_8; typedef tile< 8, 4, int> tile_B; typedef tile<16, 8, int> tile_C;
From 6acad9c7599b237232da78f4097f78f762d3366e Mon Sep 17 00:00:00 2001 From: zhang hui Date: Sat, 13 Dec 2025 20:26:39 +0800 Subject: [PATCH 08/10] add missing tile of mfma --- ggml/src/ggml-cuda/mma.cuh | 8 ++++++ ggml/src/ggml-cuda/mmf.cuh | 12 ++------ ggml/src/ggml-cuda/mmq.cuh | 57 +++++++++++++------------------------- 3 files changed, 30 insertions(+), 47 deletions(-)
diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh index 70bc60d320..74e58c322a 100644 --- a/ggml/src/ggml-cuda/mma.cuh +++ b/ggml/src/ggml-cuda/mma.cuh
@@ -93,6 +93,14 @@ namespace ggml_cuda_mma { dl == DATA_LAYOUT_I_MAJOR_DUAL; } + constexpr data_layout get_input_data_layout() { +#if defined(RDNA3) + return DATA_LAYOUT_I_MAJOR_DUAL; +#else + return DATA_LAYOUT_I_MAJOR; +#endif // defined(RDNA3) + } + template struct tile {};
diff --git a/ggml/src/ggml-cuda/mmf.cuh b/ggml/src/ggml-cuda/mmf.cuh index e1b9c6a6b7..e36730948f 100644 --- a/ggml/src/ggml-cuda/mmf.cuh +++ b/ggml/src/ggml-cuda/mmf.cuh
@@ -35,11 +35,7 @@ static __global__ void mul_mat_f( constexpr bool is_tf32 = std::is_same_v<T, float>; constexpr int tile_B_I = is_tf32 ? 8 : 16; constexpr int tile_C_J = is_tf32 ? 8 : 16; -#if defined(RDNA3) - constexpr data_layout ab_layout = is_tf32 ? DATA_LAYOUT_I_MAJOR : DATA_LAYOUT_I_MAJOR_DUAL; -#else - constexpr data_layout ab_layout = DATA_LAYOUT_I_MAJOR; -#endif // #if defined(RDNA3) + constexpr data_layout ab_layout = is_tf32 ? DATA_LAYOUT_I_MAJOR : get_input_data_layout(); typedef tile<16, 8, T, ab_layout> tile_A; typedef tile<tile_B_I, 8, T, ab_layout> tile_B; typedef tile<16, tile_C_J, float, DATA_LAYOUT_J_MAJOR> tile_C;
@@ -281,11 +277,7 @@ static __global__ void mul_mat_f_ids( constexpr bool is_tf32 = std::is_same_v<T, float>; constexpr int tile_B_I = is_tf32 ? 8 : 16; constexpr int tile_C_J = is_tf32 ? 8 : 16; -#if defined(RDNA3) - constexpr data_layout ab_layout = is_tf32 ? DATA_LAYOUT_I_MAJOR : DATA_LAYOUT_I_MAJOR_DUAL; -#else - constexpr data_layout ab_layout = DATA_LAYOUT_I_MAJOR; -#endif // #if defined(RDNA3) + constexpr data_layout ab_layout = is_tf32 ? 
DATA_LAYOUT_I_MAJOR : get_input_data_layout(); typedef tile<16, 8, T, ab_layout> tile_A; typedef tile<tile_B_I, 8, T, ab_layout> tile_B; typedef tile<16, tile_C_J, float, DATA_LAYOUT_J_MAJOR> tile_C;
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index a91d95df15..fa8a72c9c1 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -797,11 +797,7 @@ template <int mmq_x, int mmq_y, bool need_check> static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma( const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { #if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) -#if defined(RDNA3) - constexpr data_layout input_layout = DATA_LAYOUT_I_MAJOR_DUAL; -#else - constexpr data_layout input_layout = DATA_LAYOUT_I_MAJOR; -#endif // defined(RDNA3) + constexpr data_layout input_layout = get_input_data_layout(); typedef tile<16, 8, int, input_layout> tile_A; typedef tile<16, 8, int, input_layout> tile_B; typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
@@ -966,11 +967,7 @@ template <int mmq_x, int mmq_y, bool need_check> static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma( const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { #if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) -#if defined(RDNA3) - constexpr data_layout input_layout = DATA_LAYOUT_I_MAJOR_DUAL; -#else - constexpr data_layout input_layout = DATA_LAYOUT_I_MAJOR; -#endif // defined(RDNA3) + constexpr data_layout input_layout = get_input_data_layout(); typedef tile<16, 8, int, input_layout> tile_A; typedef tile<16, 8, int, input_layout> tile_B; typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
@@ -1140,10 +1132,11 @@ template static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma( const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { #if defined(AMD_MFMA_AVAILABLE) - typedef tile<16, 8, int> tile_A; - typedef tile<16, 8, int> tile_B; - typedef tile<16, 16, int> tile_C; - typedef tile<64, 2, int> tile_load; + constexpr data_layout input_layout = get_input_data_layout(); + typedef tile<16, 8, int, input_layout> tile_A; + typedef tile<16, 8, int, input_layout> tile_B; + typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C; + typedef tile<64, 2, int, input_layout> tile_load; constexpr int granularity = mmq_get_granularity_device(mmq_x); constexpr int rows_per_warp = granularity;
@@ -1189,11 +1182,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma( } } #elif defined(AMD_WMMA_AVAILABLE) //wmma instructions can handle 16x4 tiles, does not require loading 64x2 tiles -#if defined(RDNA3) - constexpr data_layout input_layout = DATA_LAYOUT_I_MAJOR_DUAL; -#else - constexpr data_layout input_layout = DATA_LAYOUT_I_MAJOR; -#endif // defined(RDNA3) + constexpr data_layout input_layout = get_input_data_layout(); typedef tile<16, 4, int, input_layout> tile_A; typedef tile<16, 4, int, input_layout> tile_B; typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
@@ -1450,10 +1439,11 @@ template static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma( const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { #if defined(AMD_MFMA_AVAILABLE) + constexpr data_layout input_layout = get_input_data_layout(); - typedef tile<16, 8, int> tile_A; - typedef tile<16, 8, int> tile_B; - typedef tile<16, 16, int> tile_C; - typedef tile<64, 2, int> tile_load; + typedef tile<16, 8, int, input_layout> tile_A; + typedef tile<16, 8, int, input_layout> tile_B; + typedef tile<16, 16, int, 
DATA_LAYOUT_J_MAJOR> tile_C; + typedef tile<64, 2, int, input_layout> tile_load; constexpr int granularity = mmq_get_granularity_device(mmq_x); constexpr int rows_per_warp = granularity;
@@ -1516,11 +1506,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma( } } #elif defined(AMD_WMMA_AVAILABLE) //wmma instructions can handle 16x4 tiles, does not require loading 64x2 tiles -#if defined(RDNA3) - constexpr data_layout input_layout = DATA_LAYOUT_I_MAJOR_DUAL; -#else - constexpr data_layout input_layout = DATA_LAYOUT_I_MAJOR; -#endif // defined(RDNA3) + constexpr data_layout input_layout = get_input_data_layout(); typedef tile<16, 4, int, input_layout> tile_A; typedef tile<16, 4, int, input_layout> tile_B; typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
@@ -2284,10 +2270,11 @@ template static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma( const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { #if defined(AMD_MFMA_AVAILABLE) - typedef tile<16, 8, int> tile_A; - typedef tile<16, 8, int> tile_B; - typedef tile<16, 16, int> tile_C; - typedef tile<64, 2, int> tile_load; + constexpr data_layout input_layout = get_input_data_layout(); + typedef tile<16, 8, int, input_layout> tile_A; + typedef tile<16, 8, int, input_layout> tile_B; + typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C; + typedef tile<64, 2, int, input_layout> tile_load; constexpr int granularity = mmq_get_granularity_device(mmq_x); constexpr int rows_per_warp = granularity;
@@ -2335,11 +2322,7 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma( } } #elif defined(AMD_WMMA_AVAILABLE) //wmma instructions can handle 16x4 tiles, does not require loading 64x2 tiles -#if defined(RDNA3) - constexpr data_layout input_layout = DATA_LAYOUT_I_MAJOR_DUAL; -#else - constexpr data_layout input_layout = DATA_LAYOUT_I_MAJOR; -#endif // defined(RDNA3) + constexpr data_layout input_layout = get_input_data_layout(); typedef tile<16, 4, int, input_layout> tile_A; typedef tile<16, 4, int, input_layout> tile_B; typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
From cffa070b08b26893117e212f5a6b16c73ded1d5a Mon Sep 17 00:00:00 2001 From: zhang hui Date: Tue, 16 Dec 2025 10:01:30 +0800 Subject: [PATCH 09/10] fix j-major wrong ne on CDNA --- ggml/src/ggml-cuda/mma.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh index 74e58c322a..96d25c5801 100644 --- a/ggml/src/ggml-cuda/mma.cuh +++ b/ggml/src/ggml-cuda/mma.cuh
@@ -446,7 +446,7 @@ namespace ggml_cuda_mma { static constexpr int J = J_; static constexpr data_layout dl = DATA_LAYOUT_J_MAJOR; - static constexpr int ne = I * J / 32; + static constexpr int ne = tile<I_, J_, T>::ne; T x[ne] = {0}; static constexpr __device__ bool supported() {
From cad07fa4b57c43594173298252c659f8f8713590 Mon Sep 17 00:00:00 2001 From: zhang hui Date: Tue, 16 Dec 2025 21:27:14 +0800 Subject: [PATCH 10/10] fix grammar and empty spaces --- ggml/src/ggml-cuda/mma.cuh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh index 96d25c5801..794d90bdd1 100644 --- a/ggml/src/ggml-cuda/mma.cuh +++ b/ggml/src/ggml-cuda/mma.cuh
@@ -533,9 +533,9 @@ namespace ggml_cuda_mma { static constexpr int I = I_; static constexpr int J = J_; static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR_DUAL; - + static constexpr int ne = I * J / 32 * 2; - + T x[ne] = {0}; static constexpr __device__ bool supported() {
@@ -612,7 
+612,7 @@ namespace ggml_cuda_mma { ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0)); } #elif defined(AMD_WMMA_AVAILABLE) - // All wmma layout has continues data when i-major. + // All wmma layouts have contiguous data when i-major. if constexpr (is_i_major(dl)) { // the data must be aligned to 16 bytes when bigger than ggml_cuda_get_max_cpy_bytes() constexpr int aligned_copy_bytes = ggml_cuda_get_max_cpy_bytes();
@@ -900,7 +900,7 @@ namespace ggml_cuda_mma { NO_DEVICE_CODE; #endif // AMPERE_MMA_AVAILABLE } - + template <data_layout dl_d, data_layout dl_ab> static __device__ __forceinline__ void mma( tile<16, 16, float, dl_d> & D, const tile<16, 8, half2, dl_ab> & A, const tile<16, 8, half2, dl_ab> & B) {
@@ -954,7 +954,7 @@ namespace ggml_cuda_mma { NO_DEVICE_CODE; #endif // TURING_MMA_AVAILABLE } - + template <data_layout dl_d, data_layout dl_ab> static __device__ __forceinline__ void mma( tile<16, 16, float, dl_d> & D, const tile<16, 8, nv_bfloat162, dl_ab> & A, const tile<16, 8, nv_bfloat162, dl_ab> & B) {
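A note on the chunked i-major copy that load_generic gains in patch 03 and that patch 06 reuses for CDNA: it can be illustrated on the host. This is a minimal sketch, not part of the series, assuming a 16-byte maximum copy width (what ggml_cuda_get_max_cpy_bytes() is taken to return on these targets); memcpy stands in for ggml_cuda_memcpy_1, and the lane-dependent parts of get_i/get_j are folded into i0/j0.

    // Host-side sketch of the chunked i-major fragment copy: a fragment
    // larger than the widest copy is split into aligned chunks, each read
    // at the j offset of the chunk's first element.
    #include <cstdio>
    #include <cstring>

    template <typename T, int ne>
    void load_i_major(T (&x)[ne], const T * xs0, int stride, int i0, int j0) {
        constexpr int max_cpy_bytes = 16;                 // assumed max copy width
        if constexpr (sizeof(x) > max_cpy_bytes) {
            static_assert(sizeof(x) % max_cpy_bytes == 0, "bad type size");
            constexpr int chunks       = sizeof(x) / max_cpy_bytes;
            constexpr int ne_per_chunk = ne / chunks;
            for (int c = 0; c < chunks; ++c) {            // one wide copy per chunk
                memcpy(x + ne_per_chunk * c,
                       xs0 + i0 * stride + j0 + ne_per_chunk * c, max_cpy_bytes);
            }
        } else {
            memcpy(x, xs0 + i0 * stride + j0, sizeof(x)); // single copy fits
        }
    }

    int main() {
        int src[16 * 8];                                  // one 16x8 int tile, row-major
        for (int k = 0; k < 16 * 8; ++k) src[k] = k;
        int frag[8];                                      // 32 bytes -> 2 chunks of 16
        load_i_major(frag, src, 8, /*i0=*/3, /*j0=*/0);
        printf("%d %d\n", frag[0], frag[7]);              // prints 24 31: row 3, j = 0..7
        return 0;
    }

Splitting at aligned_copy_bytes keeps every copy within the widest naturally aligned vector access, so a 32- or 64-byte per-lane fragment compiles to a few wide loads instead of an element-wise loop.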