diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh index 96d25c5801..794d90bdd1 100644 --- a/ggml/src/ggml-cuda/mma.cuh +++ b/ggml/src/ggml-cuda/mma.cuh @@ -533,9 +533,9 @@ namespace ggml_cuda_mma { static constexpr int I = I_; static constexpr int J = J_; static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR_DUAL; - + static constexpr int ne = I * J / 32 * 2; - + T x[ne] = {0}; static constexpr __device__ bool supported() { @@ -612,7 +612,7 @@ namespace ggml_cuda_mma { ggml_cuda_memcpy_1(t.x, xs0 + t.get_i(0) * stride + t.get_j(0)); } #elif defined(AMD_WMMA_AVAILABLE) - // All wmma layout has continues data when i-major. + // All wmma layouts have contiguous data when i-major. if constexpr (is_i_major(dl)) { // the data must be aligned to 16 bytes when bigger than ggml_cuda_get_max_cpy_bytes() constexpr int aligned_copy_bytes = ggml_cuda_get_max_cpy_bytes(); @@ -900,7 +900,7 @@ namespace ggml_cuda_mma { NO_DEVICE_CODE; #endif // AMPERE_MMA_AVAILABLE } - + template static __device__ __forceinline__ void mma( tile<16, 16, float, dl_d> & D, const tile<16, 8, half2, dl_ab> & A, const tile<16, 8, half2, dl_ab> & B) { @@ -954,7 +954,7 @@ namespace ggml_cuda_mma { NO_DEVICE_CODE; #endif // TURING_MMA_AVAILABLE } - + template static __device__ __forceinline__ void mma( tile<16, 16, float, dl_d> & D, const tile<16, 8, nv_bfloat162, dl_ab> & A, const tile<16, 8, nv_bfloat162, dl_ab> & B) {