From 9c660ddafe560113083b351c50c23b81661b2f70 Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Fri, 13 Feb 2026 14:59:59 +0530
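Subject: [PATCH] ggml-cpu: move the V32 memset out of the KV tile loop

In ggml_compute_forward_flash_attn_ext_tiled, zero V32 once before the
loop over KV tiles, next to the existing K_f32 memset, instead of
zeroing it again on every tile iteration.

Also include simd-mappings.h from simd-gemm.h.
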
---
 ggml/src/ggml-cpu/ops.cpp     | 2 +-
 ggml/src/ggml-cpu/simd-gemm.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index c44f3e5989..b7a70e06f1 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -8485,6 +8485,7 @@ static void ggml_compute_forward_flash_attn_ext_tiled(
         }
 
         memset(K_f32, 0, DK * KV_TILE_SZ * sizeof(float));
+        memset(V32,   0, KV_TILE_SZ * DV * sizeof(float));
 
         for (int64_t ic = 0; ic < nek1; ic += KV_TILE_SZ) {
             const int kv_tile = (int)std::min((int64_t)KV_TILE_SZ, nek1 - ic);
@@ -8578,7 +8579,6 @@ static void ggml_compute_forward_flash_attn_ext_tiled(
 
             // V accumulation: VKQ32 += softmax(KQ) * V
             // Pack V tile to contiguous F32, zero-padded
-            memset(V32, 0, KV_TILE_SZ * DV * sizeof(float));
             for (int tk = 0; tk < kv_tile; tk++) {
                 const char * v_data = (const char *)v->data + (ic + tk)*nbv1 + iv2*nbv2 + iv3*nbv3;
                 if (kv_type == GGML_TYPE_F16) {
diff --git a/ggml/src/ggml-cpu/simd-gemm.h b/ggml/src/ggml-cpu/simd-gemm.h
index edbcc781f3..29defceb4e 100644
--- a/ggml/src/ggml-cpu/simd-gemm.h
+++ b/ggml/src/ggml-cpu/simd-gemm.h
@@ -5,6 +5,7 @@
 #include "ggml-cpu-impl.h"
 #include "vec.h"
 #include "common.h"
+#include "simd-mappings.h"
 
 
 // TODO: add support for sizeless vector types