diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh index e25a19b656..37558f7478 100644 --- a/ggml/src/ggml-cuda/mma.cuh +++ b/ggml/src/ggml-cuda/mma.cuh @@ -785,11 +785,11 @@ namespace ggml_cuda_mma { acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12( true, - a_vec[0], - true, - b_vec[0], - acc[0], - true + a_vec[0], + true, + b_vec[0], + acc[0], + true ); acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12( @@ -897,7 +897,7 @@ static __device__ __forceinline__ void mma( GGML_UNUSED(A); GGML_UNUSED(B); NO_DEVICE_CODE; -#endif +#endif } } diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index 7affff4ef8..99760d56c7 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -1888,7 +1888,7 @@ template static __device__ __forceinline__ void loa #else int i = (i0 + threadIdx.y*rows_per_warp + threadIdx.x/2) % mmq_y; { -#endif // defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) +#endif // defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) if (need_check) { i = min(i, i_max); }