diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt
index 67af1d8ccc..c562960619 100644
--- a/ggml/src/ggml-cuda/CMakeLists.txt
+++ b/ggml/src/ggml-cuda/CMakeLists.txt
@@ -15,6 +15,7 @@ if (CUDAToolkit_FOUND)
     # 80 == Ampere, asynchronous data loading, faster tensor core instructions
     # 86 == RTX 3000, needs CUDA v11.1
     # 89 == RTX 4000, needs CUDA v11.8
+    # 100 == Blackwell, needs CUDA v12.8, native FP4 tensor cores (NOTE(review): confirm that the block-scaled mxf4 MMA used by this patch assembles for a plain "100" target and does not need an arch-specific "a" target)
     #
     # XX-virtual == compile CUDA code as PTX, do JIT compilation to binary code on first run
     # XX-real == compile CUDA code as device code for this specific architecture
@@ -34,6 +35,10 @@ if (CUDAToolkit_FOUND)
             if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.8")
                 list(APPEND CMAKE_CUDA_ARCHITECTURES 89-real)
             endif()
+
+            if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
+                list(APPEND CMAKE_CUDA_ARCHITECTURES 100-real)
+            endif()
         endif()
     endif()
     message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index c4529f5d94..85c587928e 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -50,6 +50,7 @@
 #define GGML_CUDA_CC_TURING 750
 #define GGML_CUDA_CC_AMPERE 800
 #define GGML_CUDA_CC_ADA_LOVELACE 890
+#define GGML_CUDA_CC_BLACKWELL 1000
 #define GGML_CUDA_CC_OFFSET_AMD 0x1000000
 #define GGML_CUDA_CC_OFFSET_MTHREADS 0x0100000
 #define GGML_CUDA_CC_IS_NVIDIA(cc) (cc < GGML_CUDA_CC_OFFSET_MTHREADS)
@@ -243,6 +244,10 @@ static const char * cu_get_error_str(CUresult err) {
 #define AMPERE_MMA_AVAILABLE
 #endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 
+#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_BLACKWELL
+# define BLACKWELL_MMA_AVAILABLE
+#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_BLACKWELL
+
 #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 #define CP_ASYNC_AVAILABLE
 #endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
@@ -313,6 +318,10 @@ static bool cp_async_available(const int cc) {
return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_AMPERE;
 }
 
+static bool blackwell_mma_available(const int cc) {
+    return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_BLACKWELL;
+}
+
 static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
 #if defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
     return 64;
@@ -698,6 +707,42 @@ static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) {
 #endif // CUDART_VERSION >= 12050
 }
 
+// Converts x * e to a 4-bit FP4 E2M1 code (returned in the low nibble).
+// Bits 0..2 index the magnitude table {0, 0.5, 1, 1.5, 2, 3, 4, 6}; bit 3 is the sign.
+//   x: value to quantize
+//   e: factor applied to |x| before rounding
+// Scaled magnitudes above 6 saturate to 6 and exact ties round to the smaller magnitude.
+// x == 0 always yields code 0; a negative x that rounds to magnitude 0 yields code 8.
+static __device__ __forceinline__ uint8_t ggml_cuda_float_to_fp4_e2m1(float x, float e) {
+    // Handle exact zero early
+    if (x == 0.0f) {
+        return 0;
+    }
+
+    // Representable magnitudes, indexed by the low 3 bits of the code.
+    static constexpr float pos_lut[8] = { 0.0f, 0.5f, 1.0f, 1.5f, 2.0f, 3.0f, 4.0f, 6.0f };
+
+    // Saturate to max representable magnitude
+    float ax = fabsf(x) * e;
+    if (ax > pos_lut[7]) {
+        ax = pos_lut[7];
+    }
+
+    // Linear scan for the nearest table entry; the strict '<' keeps the lower index on ties.
+    int   best_i   = 0;
+    float best_err = fabsf(ax - pos_lut[0]);
+    for (int i = 1; i < 8; ++i) {
+        const float err = fabsf(ax - pos_lut[i]);
+        if (err < best_err) {
+            best_err = err;
+            best_i   = i;
+        }
+    }
+
+    // Positive codes: 0..7, negative: 8..15 (sign bit = MSB)
+    return static_cast<uint8_t>(x < 0.0f ? (best_i | 0x8) : best_i);
+}
+
 // See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1.
// Precompute mp (m' in the paper) and L such that division
 // can be computed using a multiply (high 32b of 64b result)
diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh
index 0b13293da9..20edc82109 100644
--- a/ggml/src/ggml-cuda/mma.cuh
+++ b/ggml/src/ggml-cuda/mma.cuh
@@ -812,6 +812,25 @@ namespace ggml_cuda_mma {
 #endif // AMPERE_MMA_AVAILABLE
     }
 
+    static __device__ __forceinline__ void mma_block_scaled(tile<16, 8, float> & D,
+                                                            const tile<16, 8, int> & A,
+                                                            const tile<8, 8, int> & B,
+                                                            uint32_t a_scale,
+                                                            uint32_t b_scale) {
+#ifdef BLACKWELL_MMA_AVAILABLE // without this macro the function compiles to an empty body and D is left untouched
+        const int * Axi = (const int *) A.x;
+        const int * Bxi = (const int *) B.x;
+        float * Dxi = (float *) D.x;
+
+        asm volatile( // D = A*B + D: registers %0..%3 are passed both as the result and as the accumulator input ("+f")
+            "mma.sync.aligned.kind::mxf4.block_scale.scale_vec::2X.m16n8k64.row.col.f32.e2m1.e2m1.f32.ue8m0 "
+            "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3}, "
+            "%10, {0, 0}, %11, {0, 0};" // NOTE(review): the {0, 0} pairs presumably are the scale byte-id/thread-id selectors of the PTX block-scaling syntax - confirm against the PTX ISA
+            : "+f"(Dxi[0]), "+f"(Dxi[1]), "+f"(Dxi[2]), "+f"(Dxi[3])
+            : "r"(Axi[0]), "r"(Axi[1]), "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[0]), "r"(Bxi[1]), "r"(a_scale), "r"(b_scale));
+#endif // BLACKWELL_MMA_AVAILABLE
+    }
+
     static __device__ __forceinline__ void mma(
             tile<16, 8, float> & D, const tile<16, 8, half2> & A, const tile<8, 8, half2> & B) {
 #ifdef TURING_MMA_AVAILABLE
diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index f7a2cbca90..0844e475c2 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -123,12 +123,23 @@ void ggml_cuda_mul_mat_q(
             const int64_t s11 = src1->nb[1] / ts_src1;
             const int64_t s12 = src1->nb[2] / ts_src1;
             const int64_t s13 = src1->nb[3] / ts_src1;
-            quantize_mmq_q8_1_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type,
-                ne10, s11, s12, s13, ne10_padded, ne11, ne12, ne13, stream);
+            if (blackwell_mma_available(cc) && src0->type == GGML_TYPE_MXFP4) { // must stay in sync with the condition used for s12 below
+                quantize_mmq_mxfp4_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded,
+                                        ne11, ne12, ne13, stream);
+
+            } else {
+                quantize_mmq_q8_1_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded,
+                                       ne11, ne12, ne13, stream);
+            }
             CUDA_CHECK(cudaGetLastError());
         }
 
-        const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int));
+        // Size in ints of one quantized src1 channel (ne11 columns of ne10_padded values); depends on the quantization format
+        const int64_t s12 = (blackwell_mma_available(cc) && src0->type == GGML_TYPE_MXFP4) ?
+                                ne11 * ne10_padded * sizeof(block_fp4_mmq) /
+                                    (4 * QK_MXFP4 * sizeof(int)) // block_fp4_mmq holds 128 values
+                                :
+                                ne11 * ne10_padded * sizeof(block_q8_1) / (QK8_1 * sizeof(int));
         const int64_t s13 = ne12*s12;
 
         const mmq_args args = {
@@ -175,12 +186,20 @@ void ggml_cuda_mul_mat_q(
     const int64_t s11 = src1->nb[1] / ts_src1;
     const int64_t s12 = src1->nb[2] / ts_src1;
     const int64_t s13 = src1->nb[2] / ts_src1;
-    quantize_mmq_q8_1_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type,
-        ne10, s11, s12, s13, ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
+
+    if (blackwell_mma_available(cc) && src0->type == GGML_TYPE_MXFP4) { // must stay in sync with the condition used for s12 below
+        quantize_mmq_mxfp4_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13,
+                                ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
+    } else {
+        quantize_mmq_q8_1_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13,
+                               ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
+    }
     CUDA_CHECK(cudaGetLastError());
     }
 
-    const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int));
+    const int64_t s12 = (blackwell_mma_available(cc) && src0->type == GGML_TYPE_MXFP4) ? // same format-dependent size as in the non-ids path
+                            ne11 * ne10_padded * sizeof(block_fp4_mmq) / (4 * QK_MXFP4 * sizeof(int)) :
+                            ne11 * ne10_padded * sizeof(block_q8_1) / (QK8_1 * sizeof(int));
     const int64_t s13 = ne12*s12;
 
     // Note that ne02 is used instead of ne12 because the number of y channels determines the z dimension of the CUDA grid.
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
index 1298f99fff..0ce95c7006 100644
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -44,6 +44,12 @@ struct block_q8_1_mmq {
     };
     int8_t qs[4*QK8_1]; // 128 values quantized to 8 bit each
 };
+
+struct block_fp4_mmq {
+    uint32_t d4[2];      // d4[p] packs the two 8 bit (e8m0) scales of 32-value blocks 2p (bits 0..7) and 2p+1 (bits 8..15)
+    int8_t   qs[2 * 32]; // 128 values quantized to 4 bit each (4 blocks of 32), two codes per byte, low nibble first
+};
+
 static_assert(sizeof(block_q8_1_mmq) == 4*QK8_1 + 4*sizeof(half2), "Unexpected block_q8_1_mmq size");
 static_assert(sizeof(block_q8_1_mmq) == 4*sizeof(block_q8_1), "Unexpected block_q8_1_mmq size");
 
@@ -191,6 +197,7 @@ static constexpr __host__ __device__ tile_x_sizes mmq_get_dp4a_tile_x_sizes(ggml
 }
 
 #define MMQ_MMA_TILE_X_K_Q8_0 (2*MMQ_TILE_NE_K + 2*MMQ_TILE_NE_K/QI8_0 + 4)
+#define MMQ_MMA_TILE_X_K_FP4 (MMQ_TILE_NE_K + MMQ_TILE_NE_K / QI8_0)
 #define MMQ_MMA_TILE_X_K_Q8_1 (2*MMQ_TILE_NE_K + 2*MMQ_TILE_NE_K/QI8_0 + 4)
 #define MMQ_MMA_TILE_X_K_Q2_K (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K + 4)
 #define MMQ_MMA_TILE_X_K_Q3_K (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K/2 + 4)
@@ -201,6 +208,7 @@ static_assert(MMQ_MMA_TILE_X_K_Q8_1 % 8 == 4, "Wrong padding.");
 static_assert(MMQ_MMA_TILE_X_K_Q2_K % 8 == 4, "Wrong padding.");
 static_assert(MMQ_MMA_TILE_X_K_Q3_K % 8 == 4, "Wrong padding.");
 static_assert(MMQ_MMA_TILE_X_K_Q6_K % 8 == 4, "Wrong padding.");
+static_assert(MMQ_MMA_TILE_X_K_FP4 % 8 == 4, "Wrong padding.");
 
 static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) {
     switch (type) {
@@ -209,7 +217,14 @@ static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) {
         case GGML_TYPE_Q5_0: return MMQ_MMA_TILE_X_K_Q8_0;
         case GGML_TYPE_Q5_1: return MMQ_MMA_TILE_X_K_Q8_1;
         case GGML_TYPE_Q8_0: return MMQ_MMA_TILE_X_K_Q8_0;
+        // BLACKWELL_MMA_AVAILABLE depends on __CUDA_ARCH__, so host-side callers always take the #else branch
+        // and see the larger q8_1 tile width for MXFP4.
+#ifdef BLACKWELL_MMA_AVAILABLE
+        case GGML_TYPE_MXFP4:
+            return MMQ_MMA_TILE_X_K_FP4;
+#else
         case GGML_TYPE_MXFP4: return MMQ_MMA_TILE_X_K_Q8_1;
+#endif // BLACKWELL_MMA_AVAILABLE
         case GGML_TYPE_Q2_K: return MMQ_MMA_TILE_X_K_Q2_K;
         case GGML_TYPE_Q3_K: return MMQ_MMA_TILE_X_K_Q3_K;
         case GGML_TYPE_Q4_K: return MMQ_MMA_TILE_X_K_Q8_1;
@@ -229,6 +244,9 @@ static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) {
 // block_q8_1_mmq has (128 8-bit ints == 32 32-bit ints + 4 32-bit scales)
 
 #define MMQ_TILE_Y_K (MMQ_TILE_NE_K + MMQ_TILE_NE_K/QI8_1)
+// block_fp4_mmq has (128 4-bit values == 16 32-bit ints + 2 32-bit scale words)
+#define MMQ_TILE_Y_FP4_K (MMQ_TILE_Y_K / 2)
+static_assert(MMQ_TILE_Y_FP4_K * sizeof(int) == sizeof(block_fp4_mmq), "Unexpected block_fp4_mmq size");
 
 static int mmq_get_granularity_host(const int mmq_x, const int cc) {
     if (amd_mfma_available(cc) || amd_wmma_available(cc)) {
@@ -761,6 +779,72 @@ template static __device__ __forceinline__ void loa
     }
 }
 
+// Loads MXFP4 weights into the shared-memory x tile without dequantizing them (native FP4 MMA path).
+// Each tile row is MMQ_MMA_TILE_X_K_FP4 ints wide: the packed 4-bit codes start at offset 0 and the E8M0 scales
+// at offset MMQ_TILE_NE_K, two scales (bits 0..7 and 8..15) per 32-bit word.
+// Compiles to an empty body unless BLACKWELL_MMA_AVAILABLE is defined.
+template <int mmq_y, bool need_check>
+static __device__ __forceinline__ void load_tiles_mxfp4_fp4(const char * __restrict__ x,
+                                                            int * __restrict__ x_tile,
+                                                            const int kbx0,
+                                                            const int i_max,
+                                                            const int stride) {
+    constexpr int nwarps = mmq_get_nwarps_device();
+
+#if defined(BLACKWELL_MMA_AVAILABLE)
+    int * x_qs = (int *) x_tile;
+    uint32_t * x_sc = (uint32_t *) (x_qs + MMQ_TILE_NE_K); // scales start right after the MMQ_TILE_NE_K ints of packed FP4 data
+
+    constexpr int nrows = 1;
+    const int txi = threadIdx.x;
+    const int kbx = txi; // one block_mxfp4 per participating thread
+
+    // TODO: only 8 threads of a warp at the moment for simplicity, use more threads
+    if (txi >= 8) {
+        return;
+    }
+# pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nrows * nwarps) {
+        int i = i0 + threadIdx.y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_mxfp4 * bxi = (const block_mxfp4 *) x + kbx0 + i * stride + kbx;
+
+        // Load packed FP4 data directly (no LUT dequantization)
+        const int aux_q4_0 = get_int_b1(bxi->qs, 0);
+        const int aux_q4_1 = get_int_b1(bxi->qs, 1);
+        const int aux_q4_2 = get_int_b1(bxi->qs, 2);
+        const int aux_q4_3 = get_int_b1(bxi->qs, 3);
+
+        // Gathers the low nibble of each of the 4 bytes of v into one 16 bit value (byte 0 -> bits 0..3).
+        const auto compress = [](const int v) -> int {
+            uint16_t a = (v >> 24) & 0xF;
+            uint16_t b = (v >> 16) & 0xF;
+            uint16_t c = (v >> 8) & 0xF;
+            uint16_t d = v & 0xF;
+
+            return (a << 12) | (b << 8) | (c << 4) | d;
+        };
+
+        const int k0 = kbx * 4; // each block takes 4 ints (16 bytes) of the tile row
+
+        x_qs[i * MMQ_MMA_TILE_X_K_FP4 + k0 + 0] = compress(aux_q4_1) << 16 | compress(aux_q4_0);
+        x_qs[i * MMQ_MMA_TILE_X_K_FP4 + k0 + 1] = compress(aux_q4_3) << 16 | compress(aux_q4_2);
+        x_qs[i * MMQ_MMA_TILE_X_K_FP4 + k0 + 2] = compress(aux_q4_1 >> 4) << 16 | compress(aux_q4_0 >> 4);
+        x_qs[i * MMQ_MMA_TILE_X_K_FP4 + k0 + 3] = compress(aux_q4_3 >> 4) << 16 | compress(aux_q4_2 >> 4);
+
+        if (txi % 2 == 0) { // even threads also store the scales of their own and of the following block
+            uint32_t e = bxi->e;
+            bxi++;
+            e |= (bxi->e << 8);
+            x_sc[i * MMQ_MMA_TILE_X_K_FP4 + txi / 2] = e;
+        }
+    }
+#endif // defined(BLACKWELL_MMA_AVAILABLE)
+}
+
 template static __device__ __forceinline__ void vec_dot_q8_0_q8_1_dp4a(
     const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
@@ -930,6 +1014,80 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma(
 #endif // defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
 }
 
+// Dot product for the native FP4 path: x holds the tile written by load_tiles_mxfp4_fp4, y holds block_fp4_mmq columns.
+// Both operands stay in packed FP4 form; their E8M0 scales are handed to the block-scaled MMA,
+// whose float result is accumulated into sum.
+template <int mmq_x, int mmq_y>
+static __device__ __forceinline__ void vec_dot_mxfp4_mxfp4_mma(const int * __restrict__ x,
+                                                               const int * __restrict__ y,
+                                                               float * __restrict__ sum,
+                                                               const int k00) {
+    typedef tile<16, 8, int> tile_A;
+    typedef tile<8, 8, int> tile_B;
+    typedef tile<16, 8, float> tile_C; // Output is float for native scaled MMA
+
+    constexpr int granularity = mmq_get_granularity_device(mmq_x);
+    constexpr int rows_per_warp = 2 * granularity;
+    constexpr int ntx = rows_per_warp / tile_C::I; // Number of x minitiles per warp.
+
+    y += (threadIdx.y % ntx) * (tile_C::J * MMQ_TILE_Y_FP4_K);
+
+    // Match layout from load_tiles_mxfp4_fp4
+    const int * x_qs = (const int *) x;
+    const uint32_t * x_sc = (const uint32_t *) (x_qs + MMQ_TILE_NE_K); // E8M0 scales at same offset as load
+    const int * y_qs = (const int *) y + 2; // skip the 2 scale words (d4) at the start of each block_fp4_mmq
+    const uint32_t * y_sc = (const uint32_t *) y; // E8M0 scales for Y
+
+    tile_A A[ntx][MMQ_TILE_NE_K / (2 * QI8_0)]; // 2 x 4 A tiles. Per warp there will be 1 scale per tile
+    uint32_t scaleA[ntx][MMQ_TILE_NE_K / (2 * QI8_0)]; // per tile you would only have 1 scale per thread
+
+    // Block scale
+    // Each thread has to point to a 4 byte scale value
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-block-scaling
+
+    const int i0 = (threadIdx.y / ntx) * rows_per_warp;
+
+#pragma unroll
+    for (int n = 0; n < ntx; ++n) {
+#pragma unroll
+        for (int k01 = 0; k01 < MMQ_TILE_NE_K / 2; k01 += QI8_0) {
+            const int k0 = k00 / 2 + k01;
+
+            load_ldmatrix(A[n][k01 / QI8_0], x_qs + (i0 + n * tile_A::I) * MMQ_MMA_TILE_X_K_FP4 + k0,
+                          MMQ_MMA_TILE_X_K_FP4);
+
+            // based on block-scaling document, 2 threads in each quad need to supply to the scale value
+            const int tidx = threadIdx.x / 4 + (threadIdx.x % 2) * 8;
+            scaleA[n][k01 / QI8_0] = *(x_sc + (i0 + n * tile_A::I + tidx) * MMQ_MMA_TILE_X_K_FP4 + k0 / QI8_0);
+        }
+    }
+
+#pragma unroll
+    for (int j0 = 0; j0 < mmq_x; j0 += ntx * tile_C::J) {
+#pragma unroll
+        for (int k01 = 0; k01 < MMQ_TILE_NE_K / 2; k01 += QI8_0) {
+            tile_B B;
+            uint32_t scaleB; // 2xN scales
+
+            load_generic(B, y_qs + j0 * MMQ_TILE_Y_FP4_K + k01, MMQ_TILE_Y_FP4_K);
+
+            scaleB = y_sc[(j0 + threadIdx.x / 4) * MMQ_TILE_Y_FP4_K + k01 / QI8_0];
+
+#pragma unroll
+            for (int n = 0; n < ntx; ++n) {
+                // NOTE(review): mma_block_scaled accumulates into C, so this relies on tile_C starting out zeroed - confirm tile's default initialization
+                tile_C C;
+
+                mma_block_scaled(C, A[n][k01 / QI8_0], B, scaleA[n][k01 / QI8_0], scaleB);
+#pragma unroll
+                for (int l = 0; l < tile_C::ne; ++l) {
+                    sum[(j0 / tile_C::J + n) * tile_C::ne + l] += C.x[l];
+                }
+            }
+        }
+    }
+}
+
 template static __device__ __forceinline__ void vec_dot_q8_1_q8_1_dp4a(
     const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
@@ -3102,8 +3260,13 @@ struct mmq_type_traits {
 template
 struct mmq_type_traits {
     static constexpr int vdr = VDR_MXFP4_Q8_1_MMQ;
+#ifdef BLACKWELL_MMA_AVAILABLE
+    static constexpr load_tiles_mmq_t load_tiles = load_tiles_mxfp4_fp4<mmq_y, need_check>;
+    static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_mxfp4_mxfp4_mma<mmq_x, mmq_y>;
+#else
     static constexpr load_tiles_mmq_t load_tiles = load_tiles_mxfp4;
     static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma;
+#endif // BLACKWELL_MMA_AVAILABLE
     static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a;
 };
 
@@ -3240,13 +3403,33 @@ static __device__ __forceinline__ void mul_mat_q_process_tile(
 
     float sum[mmq_x*mmq_y / (nwarps*warp_size)] = {0.0f};
 
+    // The block_fp4_mmq layout of y is only used by the native FP4 path (BLACKWELL_MMA_AVAILABLE).
+    // All other device builds still run MXFP4 through the q8_1 tile loader / dot products and therefore
+    // must keep addressing y as block_q8_1_mmq.
+#ifdef BLACKWELL_MMA_AVAILABLE
+    constexpr bool y_is_fp4 = type == GGML_TYPE_MXFP4;
+#else
+    constexpr bool y_is_fp4 = false;
+#endif // BLACKWELL_MMA_AVAILABLE
+
+    constexpr size_t sz = y_is_fp4 ? sizeof(block_fp4_mmq) : sizeof(block_q8_1_mmq);
+    constexpr size_t y_stride = y_is_fp4 ? MMQ_TILE_Y_FP4_K : MMQ_TILE_Y_K;
+
+    constexpr int y_block_stride =
+        y_is_fp4 ? (sz / sizeof(int)) // 18 ints per block_fp4_mmq (covers 128 values = 4 qk-blocks)
+                   :
+                   (qk * sz / (4 * QK8_1 * sizeof(int))); // original formula for Q8_1
+
     for (int kb0 = kb0_start; kb0 < kb0_stop; kb0 += blocks_per_iter) {
         load_tiles(x, tile_x, offset_x + kb0, tile_x_max_i, stride_row_x);
-
         {
-            const int * by0 = y + ncols_y*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + 0*sizeof(block_q8_1_mmq)/sizeof(int));
+            const int * by0 =
+                y_is_fp4 ?
+                    y + ncols_y * ((kb0 / 4) * y_block_stride) // kb0/4 for MXFP4 since 4 qk-blocks per block_fp4_mmq
+                    :
+                    y + ncols_y * (kb0 * y_block_stride); // original for Q8_1
 #pragma unroll
-            for (int l0 = 0; l0 < mmq_x*MMQ_TILE_Y_K; l0 += nwarps*warp_size) {
+            for (int l0 = 0; l0 < mmq_x * y_stride; l0 += nwarps * warp_size) {
                 int l = l0 + threadIdx.y*warp_size + threadIdx.x;
 
                 tile_y[l] = by0[l];
@@ -3260,9 +3443,14 @@ static __device__ __forceinline__ void mul_mat_q_process_tile(
         __syncthreads();
 
         {
-            const int * by0 = y + ncols_y*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + 1*sizeof(block_q8_1_mmq)/sizeof(int));
+            const int * by0 =
+                y_is_fp4 ?
+                    y + ncols_y * ((kb0 / 4) * y_block_stride + y_block_stride) // advance by one block_fp4_mmq
+                    :
+                    y + ncols_y * (kb0 * y_block_stride +
+                                   (int) (sz / sizeof(int))); // original for Q8_1 (advance by one block)
 #pragma unroll
-            for (int l0 = 0; l0 < mmq_x*MMQ_TILE_Y_K; l0 += nwarps*warp_size) {
+            for (int l0 = 0; l0 < mmq_x * y_stride; l0 += nwarps * warp_size) {
                 int l = l0 + threadIdx.y*warp_size + threadIdx.x;
 
                 tile_y[l] = by0[l];
@@ -3456,7 +3644,12 @@ static __global__ void mul_mat_q(
         __syncthreads();
     }
 
-    offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int));
+#ifdef BLACKWELL_MMA_AVAILABLE
+    constexpr size_t sz = type == GGML_TYPE_MXFP4 ? sizeof(block_fp4_mmq) : sizeof(block_q8_1_mmq);
+#else
+    constexpr size_t sz = sizeof(block_q8_1_mmq); // without native FP4 MMA, y is q8_1 also for MXFP4
+#endif // BLACKWELL_MMA_AVAILABLE
+    offset_y += (col_low + jt * mmq_x) * (sz / sizeof(int));
     offset_dst += it*mmq_y;
 
     const int tile_x_max_i = nrows_x - it*mmq_y - 1;
@@ -3523,7 +3716,12 @@ static __global__ void mul_mat_q(
         __syncthreads();
     }
 
-    offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int));
+#ifdef BLACKWELL_MMA_AVAILABLE
+    constexpr size_t sz = type == GGML_TYPE_MXFP4 ? sizeof(block_fp4_mmq) : sizeof(block_q8_1_mmq);
+#else
+    constexpr size_t sz = sizeof(block_q8_1_mmq); // without native FP4 MMA, y is q8_1 also for MXFP4
+#endif // BLACKWELL_MMA_AVAILABLE
+    offset_y += (col_low + jt * mmq_x) * (sz / sizeof(int));
     offset_dst += it*mmq_y;
 
     const int tile_x_max_i = nrows_x - it*mmq_y - 1;
@@ -3704,7 +3902,7 @@ static size_t mmq_get_nbytes_shared(const int mmq_x, const int mmq_y, const int
     const int mmq_tile_x_k = mmq_get_mma_tile_x_k(type);
     const size_t nbs_ids = mmq_x*sizeof(int);
     const size_t nbs_x = (turing_mma_available(cc) || amd_mfma_available(cc) || amd_wmma_available(cc)) ?
mmq_y*mmq_tile_x_k*sizeof(int) : txs.qs*sizeof(int) + txs.dm*sizeof(half2) + txs.sc*sizeof(int);
-    const size_t nbs_y = mmq_x*sizeof(block_q8_1_mmq);
+    const size_t nbs_y = mmq_x * (sizeof(block_q8_1_mmq));
     return nbs_ids + nbs_x + GGML_PAD(nbs_y, nwarps*warp_size*sizeof(int));
 }
 
diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu
index 5117f9ffc0..0fe034fa5e 100644
--- a/ggml/src/ggml-cuda/quantize.cu
+++ b/ggml/src/ggml-cuda/quantize.cu
@@ -47,6 +47,111 @@ static __global__ void quantize_q8_1(
     y[ib].ds = make_half2(d, sum);
 }
 
+// Quantizes one block of 32 floats, one value per lane of a full warp, to FP4 (E2M1) codes with a shared E8M0 scale.
+// The 16 packed bytes go to qs_out[0..7] (lane 4*g writes qs_out[g]); the E8M0 code is returned on every lane.
+// Must be called convergently by all 32 lanes because it uses full-mask warp shuffles.
+static __device__ __forceinline__ uint8_t quantize_mxfp4_block32(const float xi, char2 * __restrict__ qs_out) {
+    float amax = fabsf(xi);
+
+    // Reduce max across all 32 threads in the warp
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        amax = fmaxf(amax, __shfl_xor_sync(0xFFFFFFFF, amax, mask, WARP_SIZE));
+    }
+
+    // e = floor(log2(amax / 6)) + 127, clamped to 0..254 so that a tiny or non-finite amax can never cause an
+    // out-of-range float -> uint8_t conversion. Only an all-zero block (every code is 0, the scale is
+    // irrelevant) falls back to e = 127; non-zero blocks keep their computed exponent even when it is 0.
+    uint8_t e     = 127;
+    float   inv_s = 0.0f;
+    if (amax > 0.0f) {
+        e     = (uint8_t) fminf(fmaxf(floorf(log2f(amax / 6.0f)) + 127.0f, 0.0f), 254.0f);
+        inv_s = 1.0f / ggml_cuda_e8m0_to_fp32(e);
+    }
+
+    // Quantize: each thread processes 1 value
+    const uint8_t q_val = ggml_cuda_float_to_fp4_e2m1(xi, inv_s);
+
+    // Use shuffle to gather values from 4 consecutive threads
+    const int     group_id = threadIdx.x / 4;
+    const uint8_t q0       = __shfl_sync(0xFFFFFFFF, q_val, (group_id * 4) + 0, WARP_SIZE);
+    const uint8_t q1       = __shfl_sync(0xFFFFFFFF, q_val, (group_id * 4) + 1, WARP_SIZE);
+    const uint8_t q2       = __shfl_sync(0xFFFFFFFF, q_val, (group_id * 4) + 2, WARP_SIZE);
+    const uint8_t q3       = __shfl_sync(0xFFFFFFFF, q_val, (group_id * 4) + 3, WARP_SIZE);
+
+    // Pack 4 values into char2: threads 0,1,2,3 -> first char2, etc.; the first lane of each group stores it
+    if (threadIdx.x % 4 == 0) {
+        char2 q;
+        q.x = (q1 << 4) | q0;
+        q.y = (q3 << 4) | q2;
+        qs_out[group_id] = q;
+    }
+
+    return e;
+}
+
+// Quantizes float activations to block_fp4_mmq for the native FP4 MMQ path.
+// Launch layout: grid = (ne1, ne0 / 64, ne2 * ne3), block = one warp of 32 threads; each warp converts
+// 64 consecutive values (two blocks of 32) of one row, i.e. one half of a block_fp4_mmq.
+// ids (optional) maps the output row index to the source row index; positions >= ne00 are quantized as 0.
+static __global__ void quantize_mmq_mxfp4(const float * __restrict__ x,
+                                          const int32_t * __restrict__ ids,
+                                          void * __restrict__ vy,
+                                          const int64_t ne00,
+                                          const int64_t s01,
+                                          const int64_t s02,
+                                          const int64_t s03,
+                                          const int64_t ne0,
+                                          const int ne1,
+                                          const int ne2) {
+    constexpr int vals_per_scale = 32;
+    constexpr int vals_per_warp  = 2 * vals_per_scale; // Each warp processes 2 blocks of 32
+
+    // Each warp processes 2 adjacent blocks of 32 values (64 values total)
+    const int64_t warp_start_offset = blockIdx.y * vals_per_warp;
+    const int64_t i0_block0 = warp_start_offset + threadIdx.x;                  // First block: 0-31
+    const int64_t i0_block1 = warp_start_offset + vals_per_scale + threadIdx.x; // Second block: 32-63
+
+    // The host launcher asserts ne0 % 64 == 0, so this exit is taken by all lanes of a warp or by none
+    // and the full-mask shuffles below never miss a lane.
+    if (i0_block0 >= ne0) {
+        return;
+    }
+
+    const int64_t i1 = blockIdx.x;
+    const int64_t i2 = blockIdx.z % ne2;
+    const int64_t i3 = blockIdx.z / ne2;
+
+    const int64_t i01 = ids ? ids[i1] : i1;
+    const int64_t i02 = i2;
+    const int64_t i03 = i3;
+
+    block_fp4_mmq * y = (block_fp4_mmq *) vy;
+
+    constexpr int64_t block_fp4_mmq_size = 4 * QK_MXFP4; // 128 values
+
+    const int64_t ib0 =
+        blockIdx.z * ((int64_t) gridDim.x * gridDim.y * vals_per_warp / block_fp4_mmq_size); // first block of channel
+    const int64_t ib = ib0 + (warp_start_offset / block_fp4_mmq_size) * ne1 + blockIdx.x;    // block index in channel
+    const int64_t pair_idx_in_block =
+        (warp_start_offset % block_fp4_mmq_size) / vals_per_warp; // 0-1: which pair of blocks within block_fp4_mmq
+
+    const int64_t row_offset = i03 * s03 + i02 * s02 + i01 * s01;
+    const float   xi0        = i0_block0 < ne00 ? x[row_offset + i0_block0] : 0.0f;
+    const float   xi1        = i0_block1 < ne00 ? x[row_offset + i0_block1] : 0.0f;
+
+    // Each pair has 2 blocks of 32 = 64 values = 16 char2 elements; the second block uses elements 8-15
+    char2 *       yqs2 = (char2 *) y[ib].qs + pair_idx_in_block * 16;
+    const uint8_t e0   = quantize_mxfp4_block32(xi0, yqs2);
+    const uint8_t e1   = quantize_mxfp4_block32(xi1, yqs2 + 8);
+
+    // Write packed exponents: d4[0-1] each stores 2 scales (for 2 blocks of 32)
+    // pair_idx_in_block tells us which d4 entry to use (0-1)
+    if (threadIdx.x == 0) {
+        y[ib].d4[pair_idx_in_block] = (e1 << 8) | e0;
+    }
+}
+
 template
 static __global__ void quantize_mmq_q8_1(
     const float * __restrict__ x, const int32_t * __restrict__ ids, void * __restrict__ vy,
@@ -190,3 +295,31 @@ void quantize_mmq_q8_1_cuda(
             break;
     }
 }
+
+// Host launcher for quantize_mmq_mxfp4: one warp per 64 values of each row, enqueued on stream.
+// Launch errors are not checked here; the callers in mmq.cu check cudaGetLastError() after the call.
+void quantize_mmq_mxfp4_cuda(const float * x,
+                             const int32_t * ids,
+                             void * vy,
+                             [[maybe_unused]] const ggml_type type_src0,
+                             const int64_t ne00,
+                             const int64_t s01,
+                             const int64_t s02,
+                             const int64_t s03,
+                             const int64_t ne0,
+                             const int64_t ne1,
+                             const int64_t ne2,
+                             const int64_t ne3,
+                             cudaStream_t stream) {
+    GGML_ASSERT(ne0 % (2 * QK_MXFP4) == 0); // Each warp processes 64 values
+    // NOTE(review): the kernel's block indexing works in units of whole block_fp4_mmq (128 values);
+    // presumably callers always pass an ne0 that is a multiple of 128 - confirm, or tighten the assert.
+
+    // ne1 tends to assume the highest values, therefore use it as the "x" dimension of the CUDA grid:
+    constexpr int vals_per_warp = 2 * QK_MXFP4; // 64
+    const int64_t block_num_y = (ne0 + vals_per_warp - 1) / vals_per_warp;
+    const dim3 num_blocks(ne1, block_num_y, ne2 * ne3);
+    const dim3 block_size(32, 1, 1); // Warp size
+    quantize_mmq_mxfp4<<<num_blocks, block_size, 0, stream>>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
+}
diff --git a/ggml/src/ggml-cuda/quantize.cuh b/ggml/src/ggml-cuda/quantize.cuh
index 725ab52443..6a91df6357 100644
--- a/ggml/src/ggml-cuda/quantize.cuh
+++ b/ggml/src/ggml-cuda/quantize.cuh
@@ -25,3 +25,17 @@ void quantize_mmq_q8_1_cuda(
         const float * x, const int32_t * ids, void * vy,
         ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03,
         int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream);
+
+void quantize_mmq_mxfp4_cuda(const float * x,
+                             const int32_t * ids,
+                             void * vy,
+                             ggml_type type_src0,
+                             int64_t ne00,
+                             int64_t s01,
+                             int64_t s02,
+                             int64_t s03,
+                             int64_t ne0,
+                             int64_t ne1,
+                             int64_t ne2,
+                             int64_t ne3,
+                             cudaStream_t stream);