tune mmq/rocblas switching for RDNA4

jiachengjason 2026-01-13 11:09:50 -05:00
parent e4832e3ae4
commit b2d628dc51
1 changed file with 16 additions and 2 deletions


@@ -356,8 +356,22 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
        }
    }
    // For RDNA4, MMQ is consistently faster than dequantization + hipBLAS:
    // https://github.com/ggml-org/llama.cpp/pull/18537#issuecomment-3706422301
    if (GGML_CUDA_CC_IS_RDNA4(cc)) {
        if (n_experts > 64 || ne11 <= 128) {
            return true;
        }
        if (type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || type == GGML_TYPE_Q5_0 || type == GGML_TYPE_Q5_1) {
            return true;
        }
        if (ne11 <= 256 && (type == GGML_TYPE_Q4_K || type == GGML_TYPE_Q5_K)) {
            return true;
        }
        return false;
    }
    return true;
}
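
For illustration only (not part of the commit), the RDNA4 branch above reduces to a simple decision rule: use MMQ for MoE weights with more than 64 experts or small batches, always for the legacy 4/5-bit types, and for Q4_K/Q5_K up to a batch size of 256; otherwise fall back to dequantization + hipBLAS. The sketch below mirrors that rule with a simplified stand-in enum and a hypothetical rdna4_prefers_mmq() helper instead of the real ggml headers, and prints which path a few representative shapes would take.

#include <cstdint>
#include <cstdio>

// Simplified stand-ins for the quantization types named in the diff.
enum class QuantType { Q4_0, Q4_1, Q5_0, Q5_1, Q4_K, Q5_K, Other };

// Hypothetical helper mirroring the RDNA4 branch of ggml_cuda_should_use_mmq():
// MMQ for MoE weights (> 64 experts) or small batches (ne11 <= 128), always for
// the legacy 4/5-bit types, and for Q4_K/Q5_K up to ne11 == 256; everything
// else falls back to dequantization + hipBLAS.
static bool rdna4_prefers_mmq(QuantType type, int64_t ne11, int64_t n_experts) {
    if (n_experts > 64 || ne11 <= 128) {
        return true;
    }
    if (type == QuantType::Q4_0 || type == QuantType::Q4_1 ||
        type == QuantType::Q5_0 || type == QuantType::Q5_1) {
        return true;
    }
    if (ne11 <= 256 && (type == QuantType::Q4_K || type == QuantType::Q5_K)) {
        return true;
    }
    return false;
}

int main() {
    // A Q4_K matmul with a 256-token batch stays on MMQ, while a batch of the
    // same size with a type outside the lists above goes to hipBLAS.
    std::printf("Q4_K,  ne11=256: %s\n", rdna4_prefers_mmq(QuantType::Q4_K,  256, 0) ? "MMQ" : "hipBLAS");
    std::printf("Other, ne11=256: %s\n", rdna4_prefers_mmq(QuantType::Other, 256, 0) ? "MMQ" : "hipBLAS");
    std::printf("Other, ne11=64:  %s\n", rdna4_prefers_mmq(QuantType::Other,  64, 0) ? "MMQ" : "hipBLAS");
    return 0;
}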