mmq.cu: tune mmq/rocblas switching for RDNA (#18537)
* Patch performance regression for MMQ kernels in ROCm; recovers the regression reported in https://github.com/ggml-org/llama.cpp/issues/17917
* Add an n_experts branch like the CDNA path
* mmq.cu: tune mmq/wmma switching for RDNA
* mmq.cu: move AMD WMMA mmq/wmma switching behind IS_RDNA3
* Update ggml/src/ggml-cuda/mmq.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

---------

Co-authored-by: Jiacheng (Jason) Chen <76919340+jiachengjason@users.noreply.github.com>
Co-authored-by: jiachengjason <jasonchen.jiacheng@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
parent 3d26a09dc7
commit 968929528c
@@ -333,6 +333,28 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
     }
 
+    if (amd_wmma_available(cc)) {
+        // RDNA 4 is consistently worse on rocblas
+        // https://github.com/ggml-org/llama.cpp/pull/18537#issuecomment-3706422301
+        if (GGML_CUDA_CC_IS_RDNA3(cc)) {
+            // High expert counts almost always better on MMQ
+            // due to a large amount of graph splits
+            // https://github.com/ggml-org/llama.cpp/pull/18202
+            if (n_experts >= 64) {
+                return true;
+            }
+
+            switch (type) {
+                // These quants are really bad on MMQ
+                case GGML_TYPE_Q2_K:
+                case GGML_TYPE_Q6_K:
+                // These quants are usually worse but not always
+                case GGML_TYPE_IQ2_XS:
+                case GGML_TYPE_IQ2_S:
+                    return ne11 <= 128;
+                default:
+                    return true;
+            }
+        }
+        return true;
+    }
+
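For readers skimming the hunk, here is a minimal, self-contained C++ sketch of the dispatch heuristic it adds. The helper stubs, the cutoff values inside them, and the function name should_use_mmq_rdna are placeholders chosen only so the sketch compiles and runs on its own; the real amd_wmma_available(), GGML_CUDA_CC_IS_RDNA3(), and GGML_TYPE_* definitions live elsewhere in ggml-cuda.

// Hypothetical, standalone sketch of the RDNA mmq/rocblas decision
// added by this commit. Stubs stand in for the real ggml-cuda helpers.
#include <cstdint>
#include <cstdio>

enum ggml_type { GGML_TYPE_Q2_K, GGML_TYPE_Q6_K, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, GGML_TYPE_Q4_0 };

// Stubs: assumptions, not the real amd_wmma_available()/GGML_CUDA_CC_IS_RDNA3().
static bool amd_wmma_available_stub(int cc) { return cc >= 1100; }
static bool is_rdna3_stub(int cc)           { return cc >= 1100 && cc < 1200; }

static bool should_use_mmq_rdna(ggml_type type, int cc, int64_t ne11, int64_t n_experts) {
    if (!amd_wmma_available_stub(cc)) {
        return true; // pre-WMMA paths are out of scope for this sketch
    }
    if (!is_rdna3_stub(cc)) {
        return true; // RDNA 4: rocblas is consistently worse, always take MMQ
    }
    // High expert counts cause many graph splits, so MMQ almost always wins.
    if (n_experts >= 64) {
        return true;
    }
    switch (type) {
        case GGML_TYPE_Q2_K:   // really bad on MMQ
        case GGML_TYPE_Q6_K:
        case GGML_TYPE_IQ2_XS: // usually worse on MMQ, but not always
        case GGML_TYPE_IQ2_S:
            return ne11 <= 128; // only small batches still favor MMQ
        default:
            return true;
    }
}

int main() {
    // Q2_K with a large batch on RDNA 3 falls back to rocblas (prints 0);
    // raising n_experts to 64 flips the decision back to MMQ (prints 1).
    std::printf("%d\n", should_use_mmq_rdna(GGML_TYPE_Q2_K, 1100, 256, 8));
    std::printf("%d\n", should_use_mmq_rdna(GGML_TYPE_Q2_K, 1100, 256, 64));
}

Under these stubs the table reads off directly: Q2_K/Q6_K and IQ2_XS/IQ2_S stay on MMQ only for ne11 <= 128, every other quant type stays on MMQ unconditionally, and RDNA 4 never consults the table at all.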