CUDA: fix kernel selection logic for tile FA (#19686)
* CUDA: fix kernel selection logic for tile FA
* add comment
This commit is contained in:
parent
c5897995a7
commit
c78e682245
|
|
@@ -1186,8 +1186,10 @@ static void launch_fattn_tile_switch_ncols2(ggml_backend_cuda_context & ctx, ggm
|
||||||
GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
|
GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
|
||||||
const int gqa_ratio = Q->ne[2] / K->ne[2];
|
const int gqa_ratio = Q->ne[2] / K->ne[2];
|
||||||
|
|
||||||
|
// On NVIDIA (Pascal and older) the GQA optimizations seem to be detrimental in some cases.
|
||||||
|
// However, for DKQ == 576, DV == 512 only the kernel variant with GQA optimizations is implemented.
|
||||||
const bool nvidia = GGML_CUDA_CC_IS_NVIDIA(ggml_cuda_info().devices[ggml_cuda_get_device()].cc);
|
const bool nvidia = GGML_CUDA_CC_IS_NVIDIA(ggml_cuda_info().devices[ggml_cuda_get_device()].cc);
|
||||||
const int gqa_limit = nvidia && gqa_ratio <= 4 ? 16 : INT_MAX;
|
const int gqa_limit = nvidia && gqa_ratio <= 4 && DV <= 256 ? 16 : INT_MAX;
|
||||||
const bool use_gqa_opt = mask && max_bias == 0.0f && Q->ne[1] <= gqa_limit && K->ne[1] % FATTN_KQ_STRIDE == 0;
|
const bool use_gqa_opt = mask && max_bias == 0.0f && Q->ne[1] <= gqa_limit && K->ne[1] % FATTN_KQ_STRIDE == 0;
|
||||||
|
|
||||||
if constexpr (DV == 512) {
|
if constexpr (DV == 512) {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue