git rebase on top of master: fix the correctness of the matrix multiplication operations and update the layout mappings for RDNA4
parent 65a4691512
commit 48afe04ca3
@ -73,34 +73,7 @@ namespace ggml_cuda_mma {
        static constexpr int I = I_;
        static constexpr int J = J_;

#if defined(GGML_USE_HIP)
#if defined(RDNA4)
        static constexpr int ne = I * J / 32;
        T x[ne] = {0};

        static constexpr __device__ bool supported() {
            if (I == 16 && J == 16) return true;
            return false;
        }

        static __device__ __forceinline__ int get_i(const int l) {
            if constexpr (I == 16 && J == 16) {
                return 8 * (threadIdx.x / 16) + l;
            } else {
                NO_DEVICE_CODE;
                return -1;
            }
        }

        static __device__ __forceinline__ int get_j(const int l) {
            if constexpr (I == 16 && J == 16) {
                return threadIdx.x % 16;
            } else {
                NO_DEVICE_CODE;
                return -1;
            }
        }
#else
#if defined(AMD_MFMA_AVAILABLE)
        static constexpr int ne = I * J / 64;
        T x[ne] = {0};

@ -146,7 +119,6 @@ namespace ggml_cuda_mma {
                return -1;
            }
        }
#endif // defined(RDNA4)
#elif __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
        static constexpr int ne = I * J / 32;
        T x[ne] = {0};

@ -177,6 +149,34 @@ namespace ggml_cuda_mma {
                return -1;
            }
        }
#elif defined(AMD_WMMA_AVAILABLE)
#if defined(RDNA4)
        static constexpr int ne = I * J / 32;
        T x[ne] = {0};

        static constexpr __device__ bool supported() {
            if (I == 16 && J == 16) return true;
            return false;
        }

        static __device__ __forceinline__ int get_i(const int l) {
            if constexpr (I == 16 && J == 16) {
                return 8 * (threadIdx.x / 16) + l;
            } else {
                NO_DEVICE_CODE;
                return -1;
            }
        }

        static __device__ __forceinline__ int get_j(const int l) {
            if constexpr (I == 16 && J == 16) {
                return threadIdx.x % 16;
            } else {
                NO_DEVICE_CODE;
                return -1;
            }
        }
#endif
#else
        static constexpr int ne = I * J / 32;
        T x[ne] = {0};

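Editor's note: the RDNA4 WMMA tile above spreads a 16x16 int tile across a 32-lane wave, each lane holding ne = 16*16/32 = 8 elements, and get_i()/get_j() give the (row, column) of element l for the current lane. A minimal host-side sketch (plain C++, the name "lane" here stands in for threadIdx.x) that enumerates the mapping implied by those two functions:

    // Sketch: enumerate the (i, j) coordinates covered by each lane of a
    // 32-wide wave for the 16x16 RDNA4 WMMA tile, using the same formulas
    // as get_i()/get_j() above.
    #include <cstdio>

    int main() {
        const int ne = 16 * 16 / 32; // 8 elements per lane
        for (int lane = 0; lane < 32; ++lane) {
            for (int l = 0; l < ne; ++l) {
                const int i = 8 * (lane / 16) + l; // rows 0..7 for lanes 0..15, rows 8..15 for lanes 16..31
                const int j = lane % 16;           // one column per lane within each half of the wave
                printf("lane %2d, l %d -> (%2d, %2d)\n", lane, l, i, j);
            }
        }
        return 0;
    }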
@ -425,7 +425,7 @@ namespace ggml_cuda_mma {

    template <int I, int J, typename T>
    static __device__ __forceinline__ void load_generic(tile<I, J, T> & t, const T * __restrict__ xs0, const int stride) {
#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
#if defined(AMD_MFMA_AVAILABLE)
        if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
            #pragma unroll
            for (int l = 0; l < t.ne; ++l) {
@ -784,21 +784,21 @@ namespace ggml_cuda_mma {
#if defined(RDNA4)

        acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
            false,
            true,
            a_vec[0],
            false,
            true,
            b_vec[0],
            acc[0],
            false
            true
        );

        acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
            false,
            true,
            a_vec[1],
            false,
            true,
            b_vec[1],
            acc[0],
            false
            true
        );
#endif // defined(RDNA4)

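Editor's note: the paired false/true lines in this hunk are the actual correctness fix from the commit title, flipping the boolean arguments passed to the iu8 WMMA builtin. As I read the gfx12 intrinsic, the leading booleans mark the packed 8-bit operands as signed, and treating signed int8 data as unsigned silently corrupts the accumulated products. A small plain-C++ sketch of why the choice matters (illustrative only, not the builtin itself):

    // Sketch: 0xFF is -1 when read as signed int8 but 255 when read as
    // unsigned, so the same bytes give very different dot products.
    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint8_t a_raw[4] = {0xFF, 0x02, 0xFE, 0x01}; // same bytes either way
        const int8_t  b[4]     = {1, 1, 1, 1};

        int32_t acc_signed = 0, acc_unsigned = 0;
        for (int k = 0; k < 4; ++k) {
            acc_signed   += (int32_t)(int8_t)a_raw[k] * b[k]; // -1 + 2 - 2 + 1 = 0
            acc_unsigned += (int32_t)a_raw[k]         * b[k]; // 255 + 2 + 254 + 1 = 512
        }
        printf("signed: %d, unsigned: %d\n", acc_signed, acc_unsigned);
        return 0;
    }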
@ -873,4 +873,31 @@ namespace ggml_cuda_mma {
        mma(D16[1], A16[1], B);
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
    }

    static __device__ __forceinline__ void mma(
            tile<16, 16, int> & D, const tile<16, 4, int> & A, const tile<16, 4, int> & B) {
#if defined(AMD_WMMA_AVAILABLE)
        using int32x2_t = __attribute__((__vector_size__(2 * sizeof(int)))) int;
        int32x2_t * a_vec = (int32x2_t *) A.x;
        int32x2_t * b_vec = (int32x2_t *) B.x;

        using int32x8_t = __attribute__((__vector_size__(8 * sizeof(int)))) int;
        int32x8_t * acc = (int32x8_t *) D.x;

        acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
            true,
            a_vec[0],
            true,
            b_vec[0],
            acc[0],
            false
        );
#else
        GGML_UNUSED(D);
        GGML_UNUSED(A);
        GGML_UNUSED(B);
        NO_DEVICE_CODE;
#endif // AMD_WMMA_AVAILABLE
    }
}

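Editor's note: in this new overload each row of the tile<16, 4, int> A and B holds 16 int8 values packed into 4 ints, so one builtin call covers a full 16x16x16 integer WMMA into the 16x16 int32 accumulator D. A host-side reference sketch of the arithmetic it is expected to compute (assuming the A-times-B-transposed convention the other mma overloads in this namespace appear to use; this is the math, not the hardware instruction):

    // Sketch: reference for D += A * B^T over signed int8 operands, with A
    // and B as 16x16 int8 matrices (row-major) and D as 16x16 int32.
    #include <cstdint>

    void wmma_i8_reference(int32_t D[16][16], const int8_t A[16][16], const int8_t B[16][16]) {
        for (int i = 0; i < 16; ++i) {
            for (int j = 0; j < 16; ++j) {
                int32_t acc = D[i][j];
                for (int k = 0; k < 16; ++k) {
                    acc += (int32_t) A[i][k] * (int32_t) B[j][k]; // B read row-wise, i.e. A * B^T
                }
                D[i][j] = acc;
            }
        }
    }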
@ -265,7 +265,7 @@ static int mmq_get_nwarps_host(const int /*cc*/, const int warp_size) {
#endif // (GGML_USE_HIP)

static constexpr __device__ int mmq_get_nwarps_device() {
#if defined(AMD_MFMA_AVAILABLE)
#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
    return 8;
#else
    return 256/ggml_cuda_get_physical_warp_size();
@ -1129,7 +1129,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_dp4a(
template <int mmq_x, int mmq_y>
static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma(
    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
#if defined(AMD_MFMA_AVAILABLE)
    typedef tile<16, 8, int> tile_A;
    typedef tile<16, 8, int> tile_B;
    typedef tile<16, 16, int> tile_C;
@ -1170,6 +1170,54 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma(
                tile_C C;
                mma(C, A[n], B[0]);

                #pragma unroll
                for (int l = 0; l < tile_C::ne; ++l) {
                    const int i = i0 + n*tile_C::I + tile_C::get_i(l);
                    sum[(j0/tile_C::J + n)*tile_C::ne + l] += C.x[l] * x_df[i*MMQ_MMA_TILE_X_K_Q3_K + k0/4] * dB;
                }
            }
        }
    }
#elif defined(AMD_WMMA_AVAILABLE)
    typedef tile<16, 4, int> tile_A;
    typedef tile<16, 4, int> tile_B;
    typedef tile<16, 16, int> tile_C;

    constexpr int granularity = mmq_get_granularity_device(mmq_x);
    constexpr int rows_per_warp = granularity;
    constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp.

    y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K);

    const int * x_qs = (const int *) x;
    const float * x_df = (const float *) x_qs + MMQ_TILE_NE_K*2;
    const int * y_qs = (const int *) y + 4;
    const float * y_df = (const float *) y;

    const int i0 = (threadIdx.y / ntx) * rows_per_warp;

    for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 4) {
        const int k0 = k00 + k01;

        tile_A A[ntx];
        #pragma unroll
        for (int n = 0; n < ntx; ++n) {
            load_generic(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q3_K + k0, MMQ_MMA_TILE_X_K_Q3_K);
        }

        #pragma unroll
        for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
            tile_B B;
            load_generic(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);

            const int j = j0 + tile_C::get_j(0);
            const float dB = y_df[j*MMQ_TILE_Y_K + k01/QI8_1];

            #pragma unroll
            for (int n = 0; n < ntx; ++n) {
                tile_C C;
                mma(C, A[n], B);

                #pragma unroll
                for (int l = 0; l < tile_C::ne; ++l) {
                    const int i = i0 + n*tile_C::I + tile_C::get_i(l);
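Editor's note: the pattern in these WMMA paths is the same throughout: the integer WMMA produces raw int32 dot products of quantized values, and each accumulator element is then rescaled by the block scales of both operands (the "C.x[l] * x_df[...] * dB" line above). A hedged scalar summary for q8-style blocks, not a literal transcription of the kernel:

    // Sketch: with x ≈ d_x * q_x and y ≈ d_y * q_y, the block contribution is
    //     sum += (Σ_k q_x[k] * q_y[k]) * d_x * d_y
    // where the parenthesized integer dot product is what the WMMA delivers.
    #include <cstdint>

    float dot_block(const int8_t * q_x, float d_x, const int8_t * q_y, float d_y, int n) {
        int32_t acc = 0;
        for (int k = 0; k < n; ++k) {
            acc += (int32_t) q_x[k] * (int32_t) q_y[k];
        }
        return (float) acc * d_x * d_y;
    }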
@ -1386,7 +1434,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_dp4a(
template <int mmq_x, int mmq_y>
static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma(
    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
#if defined(AMD_MFMA_AVAILABLE)
    typedef tile<16, 8, int> tile_A;
    typedef tile<16, 8, int> tile_B;
    typedef tile<16, 16, int> tile_C;
@ -1438,6 +1486,72 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma(
                tile_C Cd;
                mma(Cd, A[n], B[0]);

                #pragma unroll
                for (int l = 0; l < tile_C::ne; ++l) {
                    const int i = i0 + n*tile_C::I + tile_C::get_i(l);
                    const float2 dm = __half22float2(x_dm[i*MMQ_MMA_TILE_X_K_Q2_K + k0/4]);
                    float tmp = Cd.x[l]*dm.x;
                    if (k01 >= MMQ_TILE_NE_K * 3/4) {
                        tmp -= Cm.x[l]*dm.y;
                    }
                    sum[(j0/tile_C::J + n)*tile_C::ne + l] += tmp*dB;
                    sum[(j0/tile_C::J + n)*tile_C::ne + l] -= dm.y*sB;
                }
            }
        }
    }
#elif defined(AMD_WMMA_AVAILABLE)

    typedef tile<16, 4, int> tile_A;
    typedef tile<16, 4, int> tile_B;
    typedef tile<16, 16, int> tile_C;

    constexpr int granularity = mmq_get_granularity_device(mmq_x);
    constexpr int rows_per_warp = granularity;
    constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp.

    y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K);

    const int * x_qs = (const int *) x;
    const half2 * x_dm = (const half2 *) x_qs + MMQ_TILE_NE_K*2;
    const int * y_qs = (const int *) y + 4;
    const half2 * y_ds = (const half2 *) y;

    const int i0 = (threadIdx.y / ntx) * rows_per_warp;

    for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 4) {
        const int k0 = k00 + k01;

        tile_A A[ntx];
        #pragma unroll
        for (int n = 0; n < ntx; ++n) {
            load_generic(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q2_K + k0, MMQ_MMA_TILE_X_K_Q2_K);
        }

        #pragma unroll
        for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
            tile_B B;
            load_generic(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);

            const int j = j0 + tile_C::get_j(0);
            const float dB = (k01 < MMQ_TILE_NE_K/2) ? __half22float2(y_ds[j*MMQ_TILE_Y_K]).x : __half22float2(y_ds[j*MMQ_TILE_Y_K]).y;
            const float sB = (k01 >= MMQ_TILE_NE_K * 3/4) ? 0
                           : (((k01/4)%2) ? __half22float2(y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]).y
                                          : __half22float2(y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]).x);

            tile_C Cm;
            if (k01 >= MMQ_TILE_NE_K * 3/4) {
                tile_A A1;
                A1.x[0] = 0x01010101;
                A1.x[1] = 0x01010101;
                mma(Cm, A1, B);
            }

            #pragma unroll
            for (int n = 0; n < ntx; ++n) {
                tile_C Cd;
                mma(Cd, A[n], B);

                #pragma unroll
                for (int l = 0; l < tile_C::ne; ++l) {
                    const int i = i0 + n*tile_C::I + tile_C::get_i(l);
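Editor's note: one detail worth calling out in this hunk is the A1 tile filled with the byte pattern 0x01010101, i.e. every packed int8 element equal to 1. Multiplying by a tile of ones turns the integer matmul into per-j sums of the y quants, which is the term the q2_K minimum (dm.y) has to be weighted by in "tmp -= Cm.x[l]*dm.y". A scalar sketch of the identity being exploited:

    // Sketch: with A1[i][k] == 1 for all i, k, the WMMA result is
    //     Cm[i][j] = Σ_k A1[i][k] * B[j][k] = Σ_k B[j][k]
    // so one extra matrix multiply yields the quant sums needed for the
    // q2_K minimum correction.
    #include <cstdint>

    int32_t column_sum_via_ones(const int8_t * b_row, int n) {
        int32_t acc = 0;
        for (int k = 0; k < n; ++k) {
            acc += 1 * (int32_t) b_row[k]; // the A1 element is always 1
        }
        return acc;
    }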
@ -1662,7 +1776,7 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
}

#if !(defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)) || defined(AMD_WMMA_AVAILABLE)
#if !(defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE))
    #pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps*warp_size) {
        int i = (i0 + threadIdx.y*warp_size + threadIdx.x) % mmq_y;
@ -1921,7 +2035,7 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
    constexpr int rows_per_warp = warp_size / 2;
    #pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps*rows_per_warp) {
#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
#if defined(AMD_MFMA_AVAILABLE)
        // Need if on AMD instead of % because warp_size == 64
        // This causes double work and throughput loss (MI300X)
        // H100 loses about 100 t/s with 'if' condition over '%'
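Editor's note: the comments above weigh two ways of keeping the computed row index inside the tile: wrapping with % (as in the earlier load loop) versus guarding with an if, which the 64-wide AMD wavefronts need at the cost of some wasted work. A minimal illustrative sketch of the two index patterns (simplified values, not the kernel's exact expressions):

    // Sketch: '%' keeps every lane on a valid (possibly repeated) row, while
    // an 'if' guard lets out-of-range lanes fall through.
    #include <cstdio>

    int main() {
        const int mmq_y = 8;   // rows in the tile (illustrative value)
        const int lanes = 12;  // lanes loading rows in one step (illustrative value)
        for (int lane = 0; lane < lanes; ++lane) {
            const int i_wrapped = lane % mmq_y; // '%' variant
            if (lane < mmq_y) {                 // 'if' variant
                printf("lane %2d: if-guard loads row %d, %% loads row %d\n", lane, lane, i_wrapped);
            } else {
                printf("lane %2d: if-guard idles,        %% reloads row %d\n", lane, i_wrapped);
            }
        }
        return 0;
    }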
@ -2148,7 +2262,7 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_dp4a(
template <int mmq_x, int mmq_y>
static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma(
    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
#if defined(AMD_MFMA_AVAILABLE)
    typedef tile<16, 8, int> tile_A;
    typedef tile<16, 8, int> tile_B;
    typedef tile<16, 16, int> tile_C;
@ -2190,6 +2304,56 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma(
                tile_C C;
                mma(C, A[n], B[0]);

                #pragma unroll
                for (int l = 0; l < tile_C::ne; ++l) {
                    const int i = i0 + n*tile_C::I + tile_C::get_i(l);
                    const int8_t * sc = (const int8_t *) (x_sc + i*MMQ_MMA_TILE_X_K_Q6_K + k00/16);
                    sum[(j0/tile_C::J + n)*tile_C::ne + l] += C.x[l] * sc[k01/4] * x_df[i*MMQ_MMA_TILE_X_K_Q6_K] * dB;
                }
            }
        }
    }
#elif defined(AMD_WMMA_AVAILABLE)
    typedef tile<16, 4, int> tile_A;
    typedef tile<16, 4, int> tile_B;
    typedef tile<16, 16, int> tile_C;

    constexpr int granularity = mmq_get_granularity_device(mmq_x);
    constexpr int rows_per_warp = granularity;
    constexpr int ntx = rows_per_warp/tile_C::I; // Number of x minitiles per warp.

    y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K);

    const int * x_qs = (const int *) x;
    const float * x_df = (const float *) x_qs + MMQ_TILE_NE_K*2;
    const int * x_sc = (const int *) x_df + MMQ_TILE_NE_K/QI6_K;
    const int * y_qs = (const int *) y + 4;
    const float * y_df = (const float *) y;

    const int i0 = (threadIdx.y / ntx) * rows_per_warp;

    for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 4) {
        const int k0 = k00 + k01;

        tile_A A[ntx];
        #pragma unroll
        for (int n = 0; n < ntx; ++n) {
            load_generic(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q6_K + k0, MMQ_MMA_TILE_X_K_Q6_K);
        }

        #pragma unroll
        for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
            tile_B B;
            load_generic(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);

            const int j = j0 + tile_C::get_j(0);
            const float dB = y_df[j*MMQ_TILE_Y_K + k01/QI8_1];

            #pragma unroll
            for (int n = 0; n < ntx; ++n) {
                tile_C C;
                mma(C, A[n], B);

                #pragma unroll
                for (int l = 0; l < tile_C::ne; ++l) {
                    const int i = i0 + n*tile_C::I + tile_C::get_i(l);