From 83a0313a146926e54da330446c4feeab6b3d9ec1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Stanis=C5=82aw=20Szymczyk?=
Date: Wed, 25 Mar 2026 11:35:57 +0100
Subject: [PATCH] model : GGML_OP_SCATTER and GGML_OP_FILL now work with f16
 data, so we can get rid of ggml_cast() calls in sparse attention
 implementation

---
 src/llama-graph.cpp | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 29d804638c..21a4158c79 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -2171,17 +2171,14 @@ ggml_tensor * llm_graph_context::build_attn(
 
     const auto & kq_mask = inp->get_kq_mask();
 
-    ggml_tensor * kq_mask_f32 = ggml_cast(ctx0, kq_mask, GGML_TYPE_F32);
-
     // prepare new kq mask - starts filled with -INFINITY
-    ggml_tensor * kq_mask_all = ggml_fill(ctx0, kq_mask_f32, -INFINITY);
+    ggml_tensor * kq_mask_all = ggml_fill(ctx0, kq_mask, -INFINITY);
 
     // modify it by unmasking tokens that are in top_k indices
     ggml_tensor * kq_mask_top_k = ggml_scatter(ctx0, kq_mask_all, top_k, 0);
 
     // combine with the original kq mask
-    kq_mask_top_k = ggml_add(ctx0, kq_mask_top_k, kq_mask_f32);
-    kq_mask_top_k = ggml_cast(ctx0, kq_mask_top_k, kq_mask->type);
+    kq_mask_top_k = ggml_add(ctx0, kq_mask_top_k, kq_mask);
 
     ggml_tensor * q = q_cur;
     ggml_tensor * k = mctx_cur->get_k(ctx0, il);