HIP: add CDNA4 (gfx950) architecture support for MI350X/MI355X (#21570)

Add AMD Instinct MI350X/MI355X (gfx950, CDNA4) support:

- vendors/hip.h: Add CDNA4 preprocessor define for __gfx950__
- common.cuh: Add GGML_CUDA_CC_CDNA4 and GGML_CUDA_CC_IS_CDNA4 macros
- mma.cuh: Route CDNA4 to compatible MFMA instructions:
  * f32 matmul: mfma_f32_16x16x4f32 (xf32 variant unavailable on gfx950)
  * bf16 matmul: mfma_f32_16x16x16bf16_1k (same as CDNA3)
  * int8 matmul: mfma_i32_16x16x32_i8/32x32x16 (same as CDNA3)
- mmq.cuh: Include CDNA4 in stream-k kernel dispatch

CDNA4 is largely compatible with CDNA3 except:
- No xf32 MFMA (mfma_f32_16x16x8_xf32) — routes to f32 path
- Different FP8 format (e4m3fn vs e4m3_fnuz) — not changed here

Tested on AMD Instinct MI355X (gfx950), ROCm 7.0.1:
- Build: compiles cleanly with -DAMDGPU_TARGETS=gfx950
- llama-bench (Qwen2.5-1.5B Q4_K_M, single GPU):
  * f16+FA: 40,013 tok/s prefill, 254 tok/s decode
  * q8_0+FA: functional
- Flash attention: works correctly
- MMQ: works correctly with stream-k dispatch

Co-authored-by: Andy Luo <andyluo7@users.noreply.github.com>
This commit is contained in:
andyluo7 2026-04-09 22:13:32 +03:00 committed by GitHub
parent d6f3030047
commit d132f22fc9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 19 additions and 12 deletions

View File

@ -67,6 +67,7 @@
#define GGML_CUDA_CC_CDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x908) // MI100, minimum for MFMA, acc registers
#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x90a) // MI210 (gfx90a), minimum acc register renaming
#define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x942) // MI300
#define GGML_CUDA_CC_CDNA4 (GGML_CUDA_CC_OFFSET_AMD + 0x950) // MI350X/MI355X
// RDNA removes MFMA, dp4a, xnack, acc registers, wave size is 32
#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
@ -87,7 +88,8 @@
#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1)
#define GGML_CUDA_CC_IS_CDNA1(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_CDNA2)
#define GGML_CUDA_CC_IS_CDNA2(cc) (cc >= GGML_CUDA_CC_CDNA2 && cc < GGML_CUDA_CC_CDNA3)
#define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1)
#define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_CDNA4)
#define GGML_CUDA_CC_IS_CDNA4(cc) (cc >= GGML_CUDA_CC_CDNA4 && cc < GGML_CUDA_CC_RDNA1)
// Moore Threads
#define MUSART_HMASK 40300 // MUSA rc4.3, min. ver. for half2 -> uint mask comparisons

View File

@ -1025,7 +1025,8 @@ namespace ggml_cuda_mma {
const floatx2_t& a_frag = reinterpret_cast<const floatx2_t&>(A.x[0]);
const floatx2_t& b_frag = reinterpret_cast<const floatx2_t&>(B.x[0]);
acc_frag = __builtin_amdgcn_mfma_f32_16x16x8_xf32(a_frag, b_frag, acc_frag, 0, 0, 0);
#elif defined(CDNA2) || defined(CDNA1)
#elif defined(CDNA4) || defined(CDNA2) || defined(CDNA1)
// CDNA4 (gfx950) does not support xf32 MFMA, use f32 path like CDNA2/CDNA1
#pragma unroll
for (int i = 0; i < 2; ++i) {
acc_frag = __builtin_amdgcn_mfma_f32_16x16x4f32(A.x[i], B.x[i], acc_frag, 0, 0, 0);
@ -1187,7 +1188,7 @@ namespace ggml_cuda_mma {
#elif defined(AMD_MFMA_AVAILABLE)
using floatx4_t = __attribute__((ext_vector_type(4))) float;
floatx4_t& acc_frag = reinterpret_cast<floatx4_t&>(D.x[0]);
#if defined(CDNA3) || defined(CDNA2)
#if defined(CDNA4) || defined(CDNA3) || defined(CDNA2)
using bf16x4_t = __attribute__((ext_vector_type(4))) __bf16;
const bf16x4_t& a_frag = reinterpret_cast<const bf16x4_t&>(A.x[0]);
const bf16x4_t& b_frag = reinterpret_cast<const bf16x4_t&>(B.x[0]);
@ -1216,12 +1217,12 @@ namespace ggml_cuda_mma {
#if defined(AMD_MFMA_AVAILABLE)
using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
int32x4_t * acc = (int32x4_t *) D.x;
#if defined(CDNA3)
#if defined(CDNA4) || defined(CDNA3)
acc[0] = __builtin_amdgcn_mfma_i32_16x16x32_i8(((int64_t *) A.x)[0],
((int64_t *) B.x)[0],
acc[0],
0, 0, 0);
#elif defined(CDNA2) || defined(CDNA)
#elif defined(CDNA2) || defined(CDNA1)
acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[0],
B.x[0],
acc[0],
@ -1230,7 +1231,7 @@ namespace ggml_cuda_mma {
B.x[1],
acc[0],
0, 0, 0);
#endif // defined(CDNA3)
#endif // defined(CDNA4) || defined(CDNA3)
#elif defined(AMD_WMMA_AVAILABLE)
@ -1295,12 +1296,12 @@ namespace ggml_cuda_mma {
#if defined(AMD_MFMA_AVAILABLE)
using int32x16_t = __attribute__((__vector_size__(16 * sizeof(int)))) int;
int32x16_t * acc = (int32x16_t *) D.x;
#if defined(CDNA3)
#if defined(CDNA4) || defined(CDNA3)
acc[0] = __builtin_amdgcn_mfma_i32_32x32x16_i8(((int64_t *) A.x)[0],
((int64_t *) B.x)[0],
acc[0],
0, 0, 0);
#elif defined(CDNA2) || defined(CDNA)
#elif defined(CDNA2) || defined(CDNA1)
acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[0],
B.x[0],
acc[0],
@ -1309,7 +1310,7 @@ namespace ggml_cuda_mma {
B.x[1],
acc[0],
0, 0, 0);
#endif // defined(CDNA3)
#endif // defined(CDNA4) || defined(CDNA3)
#else
GGML_UNUSED_VARS(D, A, B);

View File

@ -3645,7 +3645,7 @@ static __global__ void mul_mat_q(
tile_x_max_i, tile_y_max_j, 0, ncols_x/qk);
return;
}
#endif // (defined(GGML_USE_HIP) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
#endif // (defined(GGML_USE_HIP) && !defined(CDNA4) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
constexpr int ITER_K = get_iter_k(type);

View File

@ -189,6 +189,10 @@
#define GCN
#endif // defined(GCN5) || defined(GCN4)
#if defined(__gfx950__)
#define CDNA4
#endif // defined(__gfx950__)
#if defined(__gfx942__)
#define CDNA3
#endif // defined(__gfx942__)
@ -201,9 +205,9 @@
#define CDNA1
#endif // defined(__gfx908__)
#if defined(CDNA3) || defined(CDNA2) || defined(CDNA1)
#if defined(CDNA4) || defined(CDNA3) || defined(CDNA2) || defined(CDNA1)
#define CDNA // For the entire family
#endif // defined(CDNA3) || defined(CDNA2) || defined(CDNA1)
#endif // defined(CDNA4) || defined(CDNA3) || defined(CDNA2) || defined(CDNA1)
#if defined(__GFX12__)
#define RDNA4