From 7d5ac45bda5f24086622653e99ebf095d66fc3b7 Mon Sep 17 00:00:00 2001 From: Alberto Cabrera Date: Thu, 5 Feb 2026 11:19:25 +0000 Subject: [PATCH] remaining comments from dev removed --- ggml/src/ggml-cpu/arch/arm/repack.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/arm/repack.cpp b/ggml/src/ggml-cpu/arch/arm/repack.cpp index 4801abb13c..3dc136df17 100644 --- a/ggml/src/ggml-cpu/arch/arm/repack.cpp +++ b/ggml/src/ggml-cpu/arch/arm/repack.cpp @@ -844,7 +844,6 @@ void ggml_gemv_q5_K_8x4_q8_K(int n, int16_t bsums_arr[8]; vst1q_s16(bsums_arr, bsums); - // Preload to maximize qh reuse uint8x16_t qh[col_groups][8]; for (int c = 0; c < col_groups; c++) { for (int i = 0; i < 8; i++) { @@ -879,8 +878,7 @@ void ggml_gemv_q5_K_8x4_q8_K(int n, uint8x16_t hbit_hi[8]; int8x16_t q5_lo[8]; int8x16_t q5_hi[8]; - // Already tried unrolling this loop, no perf difference - // Compiler seems to be able to unroll and schedule well enough + for (int i = 0; i < 8; i++) { q5_cols[i] = vld1q_u8(q5_ptr[b].qs + sb * QK_K + i * 32 + 16 * c); hbit_lo[i] = vandq_u8(qh[c][i], mone);