clean up after rebase: remove duplicated change, revert cmake files
This commit is contained in:
parent
fa56838893
commit
9075f546ee
|
|
@ -210,7 +210,6 @@ option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental,
|
|||
option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
|
||||
option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
|
||||
option(GGML_HIP_MMQ_MFMA "ggml: enable MFMA MMA for CDNA in MMQ" ON)
|
||||
option(GGML_HIP_MMQ_WMMA "ggml: enable WMMA MMA for RDNA4 in MMQ" ON)
|
||||
option(GGML_HIP_EXPORT_METRICS "ggml: enable kernel perf metrics output" OFF)
|
||||
option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF)
|
||||
option(GGML_MUSA_MUDNN_COPY "ggml: enable muDNN for accelerated copy" OFF)
|
||||
|
|
|
|||
|
|
@ -232,9 +232,6 @@ static const char * cu_get_error_str(CUresult err) {
|
|||
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
#define VOLTA_MMA_AVAILABLE
|
||||
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||
#if defined(GGML_USE_HIP) && defined(RDNA4)
|
||||
#define AMD_WMMA_AVAILABLE
|
||||
#endif // defined(GGML_USE_HIP) && defined(RDNA4) && !defined(GGML_HIP_NO_MMQ_WMMA)
|
||||
|
||||
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
|
||||
#define TURING_MMA_AVAILABLE
|
||||
|
|
@ -298,10 +295,6 @@ static bool volta_mma_available(const int cc) {
|
|||
return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) == GGML_CUDA_CC_VOLTA;
|
||||
}
|
||||
|
||||
static bool amd_wmma_available(const int cc) {
|
||||
return GGML_CUDA_CC_IS_RDNA4(cc);
|
||||
}
|
||||
|
||||
static bool turing_mma_available(const int cc) {
|
||||
return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -437,7 +437,20 @@ namespace ggml_cuda_mma {
|
|||
xi[0] = xs[0];
|
||||
}
|
||||
#elif defined(AMD_WMMA_AVAILABLE)
|
||||
ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
|
||||
if constexpr (I == 16 && J == 4) {
|
||||
int64_t * xi = (int64_t *) t.x;
|
||||
const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I));
|
||||
xi[0] = xs[0];
|
||||
}else if constexpr (I == 16 && J == 8) {
|
||||
int64_t * xi = (int64_t *) t.x;
|
||||
const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I));
|
||||
xi[0] = xs[0];
|
||||
|
||||
const int64_t * xs1 = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I) + 2);
|
||||
xi[1] = xs1[0];
|
||||
}else{
|
||||
NO_DEVICE_CODE;
|
||||
}
|
||||
#else
|
||||
#pragma unroll
|
||||
for (int l = 0; l < t.ne; ++l) {
|
||||
|
|
|
|||
Loading…
Reference in New Issue