move another memset out of the loop

This commit is contained in:
Aman Gupta 2026-02-13 14:59:59 +05:30
parent 8debab3104
commit 9c660ddafe
2 changed files with 2 additions and 1 deletion

View File

@@ -8485,6 +8485,7 @@ static void ggml_compute_forward_flash_attn_ext_tiled(
}
memset(K_f32, 0, DK * KV_TILE_SZ * sizeof(float));
memset(V32, 0, KV_TILE_SZ * DV * sizeof(float));
for (int64_t ic = 0; ic < nek1; ic += KV_TILE_SZ) {
const int kv_tile = (int)std::min((int64_t)KV_TILE_SZ, nek1 - ic);
@@ -8578,7 +8579,6 @@ static void ggml_compute_forward_flash_attn_ext_tiled(
// V accumulation: VKQ32 += softmax(KQ) * V
// Pack V tile to contiguous F32, zero-padded
memset(V32, 0, KV_TILE_SZ * DV * sizeof(float));
for (int tk = 0; tk < kv_tile; tk++) {
const char * v_data = (const char *)v->data + (ic + tk)*nbv1 + iv2*nbv2 + iv3*nbv3;
if (kv_type == GGML_TYPE_F16) {

View File

@@ -5,6 +5,7 @@
#include "ggml-cpu-impl.h"
#include "vec.h"
#include "common.h"
#include "simd-mappings.h"
// TODO: add support for sizeless vector types