Fix top-k compilation & behavior for non-CUB path

The changes made in 5ea3be265b were incomplete: in the non-CUB case, the
bitonic sort and its ncols <= 1024 limitation have to apply, the same as
in argsort.cu.
Oliver Simons 2025-11-27 16:40:41 +01:00
parent e9d070980b
commit f9889cf1c7
2 changed files with 9 additions and 8 deletions
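The ncols <= 1024 limit comes from the bitonic argsort path: each row is sorted within a single CUDA thread block, and a block cannot exceed 1024 threads, so a build without CUB has to reject wider rows up front in supports_op rather than fall back at runtime. Below is a minimal sketch of that compile-time split, not the actual ggml sources; the helper top_k_row_supported() is hypothetical and only illustrates the check this commit restores.

#include <cstdint>

// Sketch only: the compile-time split behind the supports_op change below.
// top_k_row_supported() is a hypothetical helper, not part of the ggml sources.
#ifdef GGML_CUDA_USE_CUB
static bool top_k_row_supported(int64_t ncols) {
    (void) ncols;
    return true;                 // CUB-based sorting handles rows of any width
}
#else
static bool top_k_row_supported(int64_t ncols) {
    return ncols <= 1024;        // bitonic argsort: one row per block, at most 1024 threads
}
#endif

In the diff below, GGML_OP_TOP_K stops returning true unconditionally and instead falls through to the GGML_OP_ARGSORT check, so the existing ne[0] <= 1024 check under #ifndef GGML_CUDA_USE_CUB now covers top-k as well, while the CUB-enabled fallback in the second file can still route rows wider than 1024 to argsort_f32_i32_cuda_cub.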


@@ -4238,7 +4238,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_SUM:
             return ggml_is_contiguous_rows(op->src[0]);
         case GGML_OP_TOP_K:
-            return true;
         case GGML_OP_ARGSORT:
 #ifndef GGML_CUDA_USE_CUB
             return op->src[0]->ne[0] <= 1024;


@@ -50,7 +50,7 @@ static void top_k_cub(ggml_cuda_pool & pool,
     DeviceTopK::MaxPairs(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys, temp_indices, dst, ncols, k, env);
 }
-#else
+#elif defined(GGML_CUDA_USE_CUB) // CUB_TOP_K_AVAILABLE
 static int next_power_of_2(int x) {
     int n = 1;
@@ -84,7 +84,7 @@ void ggml_cuda_op_top_k(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     for (int i = 0; i < nrows; i++) {
         top_k_cub(pool, src0_d + i * ncols, dst_d + i * k, ncols, k, stream);
     }
-#else
+#elif defined(GGML_CUDA_USE_CUB) // CUB_TOP_K_AVAILABLE
     // Fall back to argsort + copy
     const int ncols_pad = next_power_of_2(ncols);
     const size_t shared_mem = ncols_pad * sizeof(int);
@@ -94,15 +94,17 @@ void ggml_cuda_op_top_k(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     int * tmp_dst = temp_dst_alloc.get();
     if (shared_mem > max_shared_mem || ncols > 1024) {
-#ifdef GGML_CUDA_USE_CUB
         argsort_f32_i32_cuda_cub(pool, src0_d, tmp_dst, ncols, nrows, GGML_SORT_ORDER_DESC, stream);
-#else
-        argsort_f32_i32_cuda_bitonic(src0_d, tmp_dst, ncols, nrows, GGML_SORT_ORDER_DESC, stream);
-#endif
     } else {
         argsort_f32_i32_cuda_bitonic(src0_d, tmp_dst, ncols, nrows, GGML_SORT_ORDER_DESC, stream);
     }
     CUDA_CHECK(cudaMemcpy2DAsync(dst_d, k * sizeof(int), tmp_dst, ncols * sizeof(int), k * sizeof(int), nrows,
                                  cudaMemcpyDeviceToDevice, stream));
-#endif // CUB_TOP_K_AVAILABLE
+#else // GGML_CUDA_USE_CUB
+    ggml_cuda_pool_alloc<int> temp_dst_alloc(pool, ncols * nrows);
+    int * tmp_dst = temp_dst_alloc.get();
+    argsort_f32_i32_cuda_bitonic(src0_d, tmp_dst, ncols, nrows, GGML_SORT_ORDER_DESC, stream);
+    CUDA_CHECK(cudaMemcpy2DAsync(dst_d, k * sizeof(int), tmp_dst, ncols * sizeof(int), k * sizeof(int), nrows,
+                                 cudaMemcpyDeviceToDevice, stream));
+#endif
 }