(Bugfix, ggml-cuda) Pool alloc count fix + small size computation type adjustment (#18559)

* CUDA: Fixed object byte size being passed to pool alloc instead of object count (fattn-common, dst_tmp_meta)

* CUDA: Explicitly cast some of the int alloc counts to size_t before multiplication in argsort

---------

Co-authored-by: pl752 <maximpl752@gmail.com>
This commit is contained in:
pl752 2026-01-03 15:13:40 +05:00 committed by GitHub
parent bcfc8c3cec
commit 9dba9f5352
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 3 additions and 3 deletions

View File

@ -29,8 +29,8 @@ static void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
const int nrows,
ggml_sort_order order,
cudaStream_t stream) {
ggml_cuda_pool_alloc<int> temp_indices_alloc(pool, ncols * nrows);
ggml_cuda_pool_alloc<float> temp_keys_alloc(pool, ncols * nrows);
ggml_cuda_pool_alloc<int> temp_indices_alloc(pool, ((size_t) ncols) * nrows);
ggml_cuda_pool_alloc<float> temp_keys_alloc(pool, ((size_t) ncols) * nrows);
ggml_cuda_pool_alloc<int> offsets_alloc(pool, nrows + 1);
int * temp_indices = temp_indices_alloc.get();

View File

@ -918,7 +918,7 @@ void launch_fattn(
blocks_num.y = 1;
blocks_num.z = 1;
dst_tmp_meta.alloc(blocks_num.x*ncols * (2*2 + DV) * sizeof(float));
dst_tmp_meta.alloc(((size_t) blocks_num.x) * ncols * (2 + DV/2));
} else {
const int ntiles_KQ = (K->ne[1] + nbatch_fa - 1) / nbatch_fa; // Max. number of parallel blocks limited by tensor size.