From afb0e3d5577a9d7b0b7b6ba1eed8e42b1fb1b2c5 Mon Sep 17 00:00:00 2001 From: zhang hui Date: Sat, 13 Dec 2025 17:24:39 +0800 Subject: [PATCH] cdna --- ggml/src/ggml-cuda/mma.cuh | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh index 48f81e4955..70bc60d320 100644 --- a/ggml/src/ggml-cuda/mma.cuh +++ b/ggml/src/ggml-cuda/mma.cuh @@ -123,9 +123,9 @@ namespace ggml_cuda_mma { } else if constexpr (I == 32 && J == 4) { return threadIdx.x % 32; } else if constexpr (I == 16 && J == 16) { - return 4 * (threadIdx.x / 16) + l; + return threadIdx.x % 16; } else if constexpr (I == 32 && J == 32) { - return 4 * (threadIdx.x / 32) + 8 * (l / 4) + (l % 4); + return threadIdx.x % 32; } else { NO_DEVICE_CODE; return -1; @@ -140,9 +140,9 @@ namespace ggml_cuda_mma { } else if constexpr (I == 32 && J == 4) { return 2 * (threadIdx.x / 32) + l; } else if constexpr (I == 16 && J == 16) { - return threadIdx.x % 16; + return 4 * (threadIdx.x / 16) + l; } else if constexpr (I == 32 && J == 32) { - return threadIdx.x % 32; + return 4 * (threadIdx.x / 32) + 8 * (l / 4) + (l % 4); } else { NO_DEVICE_CODE; return -1; @@ -601,9 +601,7 @@ namespace ggml_cuda_mma { t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)]; } } else { - int64_t * xi = (int64_t *) t.x; - const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I)); - xi[0] = xs[0]; + ggml_cuda_memcpy_1(t.x, xs0 + t.get_i(0) * stride + t.get_j(0)); } #elif defined(AMD_WMMA_AVAILABLE) // All wmma layout has continues data when i-major.