diff --git a/ggml/src/ggml-cuda/mmf.cu b/ggml/src/ggml-cuda/mmf.cu
index 5c51a22256..be2ad1c6b6 100644
--- a/ggml/src/ggml-cuda/mmf.cu
+++ b/ggml/src/ggml-cuda/mmf.cu
@@ -151,7 +151,7 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
             return false;
         }
     } else {
-        if (src1_ncols > 16 || GGML_CUDA_CC_IS_RDNA4(cc)) {
+        if (src1_ncols > 16) {
             return false;
         }
     }
diff --git a/ggml/src/ggml-cuda/mmf.cuh b/ggml/src/ggml-cuda/mmf.cuh
index c2a0a2e42f..18de0dc04a 100644
--- a/ggml/src/ggml-cuda/mmf.cuh
+++ b/ggml/src/ggml-cuda/mmf.cuh
@@ -8,6 +8,19 @@ using namespace ggml_cuda_mma;
 
 #define MMF_ROWS_PER_BLOCK 32
 
+// TODO: submit a bug to rocm compiler, remove this when the bug is fixed.
+// force rocm compiler to use more register and unroll code for mul_mat_f.
+#if defined(RDNA4)
+#define MMF_REGISTER_UNROLL_FOR_RDNA \
+do {                                 \
+    if (blockIdx.z == -1) {          \
+        NO_DEVICE_CODE;              \
+    }                                \
+} while(0)
+#else
+#define MMF_REGISTER_UNROLL_FOR_RDNA
+#endif // defined(RDNA4)
+
 struct mmf_ids_data {
     const int32_t * ids_src_compact = nullptr;
     const int32_t * ids_dst_compact = nullptr;
@@ -153,6 +166,7 @@ static __global__ void mul_mat_f(
 #pragma unroll
             for (int k0 = 0; k0 < warp_size; k0 += tile_A::J) {
                 load_ldmatrix(A[itA][k0/tile_A::J], tile_xy + k0, tile_k_padded);
+                MMF_REGISTER_UNROLL_FOR_RDNA;
             }
         }
 
@@ -191,6 +205,7 @@ static __global__ void mul_mat_f(
             for (int k0 = 0; k0 < warp_size; k0 += tile_B::J) {
                 tile_B B;
                 load_ldmatrix(B, tile_xy + k0, tile_k_padded);
+                MMF_REGISTER_UNROLL_FOR_RDNA;
 #pragma unroll
                 for (int itA = 0; itA < ntA; ++itA) {
                     mma(C[itA][itB], A[itA][k0/tile_B::J], B);