From 16132d38144f11e39fd08e7a0c607dc840bce707 Mon Sep 17 00:00:00 2001
From: David Friehs
Date: Mon, 16 Feb 2026 00:10:12 +0100
Subject: [PATCH 1/4] avx2: calculate ksigns instead of using keven_signs_q2xs

---
 ggml/src/ggml-cpu/arch/x86/quants.c | 42 +++++++++++++++++++++++------
 1 file changed, 34 insertions(+), 8 deletions(-)

diff --git a/ggml/src/ggml-cpu/arch/x86/quants.c b/ggml/src/ggml-cpu/arch/x86/quants.c
index 74d699f633..cf3f2133ae 100644
--- a/ggml/src/ggml-cpu/arch/x86/quants.c
+++ b/ggml/src/ggml-cpu/arch/x86/quants.c
@@ -2369,6 +2369,35 @@ static const int8_t keven_signs_q2xs[1024] = {
 };
 #endif
 
+#if defined(__AVX2__)
+static inline __m256i unpack_ksigns(const uint32_t packed) {
+    __m128i x = _mm_set1_epi32(packed);
+
+    // shift each 32-bit value to its offset. bits 0-6 are ok, 7-31 are garbage
+    const __m128i shifts = _mm_setr_epi32(0, 7, 14, 21);
+    x = _mm_srlv_epi32(x, shifts);
+
+    // plut has 0x80 at locations that have odd bitcount, 0x00 at even bitcount
+    const __m128i mask = _mm_set1_epi32(0x0F);
+    const __m128i plut = _mm_setr_epi32(0x00808000, 0x80000080, 0x80000080, 0x00808000);
+
+    const __m128i p_l = _mm_shuffle_epi8(plut, _mm_and_si128(x, mask));
+    const __m128i p_h = _mm_shuffle_epi8(plut, _mm_and_si128(_mm_srli_epi32(x, 4), mask));
+
+    // correct bit 7 via xor. bits 0-7 now ok, 8-31 still garbage
+    x = _mm_xor_si128(x, p_l);
+    x = _mm_xor_si128(x, p_h);
+
+    // expand to __m256i, broadcast bytes 0, 4, 8, 12
+    const __m256i shf = _mm256_setr_epi64x(0x0000000000000000LL, 0x0404040404040404LL,
+                                           0x0808080808080808LL, 0x0C0C0C0C0C0C0C0CLL);
+    const __m256i y = _mm256_shuffle_epi8(_mm256_broadcastsi128_si256(x), shf);
+
+    const __m256i sel = _mm256_set1_epi64x(0x8040201008040201LL);
+    return _mm256_cmpeq_epi8(_mm256_and_si256(y, sel), sel);
+}
+#endif
+
 void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
     assert(nrc == 1);
@@ -2384,8 +2413,6 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
 
 #if defined(__AVX2__)
 
-    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
-
     uint32_t aux32[4];
     const uint8_t * aux8 = (const uint8_t *)aux32;
 
@@ -2402,12 +2429,11 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
             memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8;
             const __m256i q2_1 = _mm256_set_epi64x(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]);
             const __m256i q2_2 = _mm256_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]);
-            const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127],
-                                                   signs64[(aux32[1] >>  7) & 127], signs64[(aux32[1] >>  0) & 127]);
-            const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127],
-                                                   signs64[(aux32[3] >>  7) & 127], signs64[(aux32[3] >>  0) & 127]);
-            const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1);
-            const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2);
+            const __m256i s2_1 = unpack_ksigns(aux32[1]);
+            const __m256i s2_2 = unpack_ksigns(aux32[3]);
+            const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(q8_1, s2_1), s2_1);
+            const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(q8_2, s2_2), s2_2);
+
            const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
             const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
             const uint16_t ls1 = aux32[1] >> 28;
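The keven_signs_q2xs table maps each 7-bit index to eight sign bytes with an even total number of negative signs, i.e. bit 7 of each sign group is the parity of bits 0-6. unpack_ksigns recomputes that parity instead of loading 8 bytes from the 1 KiB table, and it returns 0xFF/0x00 byte masks rather than +-1 factors, which is why _mm256_sign_epi8 becomes the xor+sub pair (q ^ s) - s: that negates q8 exactly where the mask byte is 0xFF. A scalar sketch of the same convention (the helper name and __builtin_popcount are illustrative, not part of the patch):

    // scalar model of unpack_ksigns: bits 0-6 come from the quant data,
    // bit 7 completes the 8 signs to an even number of set bits
    static inline void unpack_ksigns_scalar(uint32_t bits7, int8_t signs[8]) {
        uint32_t x = bits7 & 0x7F;
        x |= (uint32_t)(__builtin_popcount(x) & 1) << 7; // parity of bits 0-6
        for (int j = 0; j < 8; ++j) {
            signs[j] = (x >> j) & 1 ? -1 : 1;            // set bit -> negate weight j
        }
    }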
From bd7b45e165bcd843a02dd1b183de78dbf5c4a760 Mon Sep 17 00:00:00 2001
From: David Friehs
Date: Mon, 16 Feb 2026 01:44:54 +0100
Subject: [PATCH 2/4] avx2: drop 2 instructions, finally approach lut speed

---
 ggml/src/ggml-cpu/arch/x86/quants.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/ggml/src/ggml-cpu/arch/x86/quants.c b/ggml/src/ggml-cpu/arch/x86/quants.c
index cf3f2133ae..eb6b23d1e9 100644
--- a/ggml/src/ggml-cpu/arch/x86/quants.c
+++ b/ggml/src/ggml-cpu/arch/x86/quants.c
@@ -2377,16 +2377,18 @@ static inline __m256i unpack_ksigns(const uint32_t packed) {
     const __m128i shifts = _mm_setr_epi32(0, 7, 14, 21);
     x = _mm_srlv_epi32(x, shifts);
 
-    // plut has 0x80 at locations that have odd bitcount, 0x00 at even bitcount
-    const __m128i mask = _mm_set1_epi32(0x0F);
-    const __m128i plut = _mm_setr_epi32(0x00808000, 0x80000080, 0x80000080, 0x00808000);
+    // popc_odd has 0x80 at locations that have odd bitcount, 0x00 at even bitcount
+    const __m128i mask_nib = _mm_set1_epi32(0x0F);
+    const __m128i popc_odd = _mm_setr_epi32(0x00808000, 0x80000080, 0x80000080, 0x00808000);
 
-    const __m128i p_l = _mm_shuffle_epi8(plut, _mm_and_si128(x, mask));
-    const __m128i p_h = _mm_shuffle_epi8(plut, _mm_and_si128(_mm_srli_epi32(x, 4), mask));
+    // xor bits 4-7 into bits 0-3. this does not change whether the set bit count is odd
+    __m128i p = _mm_srli_epi32(x, 4);
+    p = _mm_xor_si128(p, x);
+    p = _mm_and_si128(p, mask_nib);
+    p = _mm_shuffle_epi8(popc_odd, p);
 
     // correct bit 7 via xor. bits 0-7 now ok, 8-31 still garbage
-    x = _mm_xor_si128(x, p_l);
-    x = _mm_xor_si128(x, p_h);
+    x = _mm_xor_si128(x, p);
 
     // expand to __m256i, broadcast bytes 0, 4, 8, 12
     const __m256i shf = _mm256_setr_epi64x(0x0000000000000000LL, 0x0404040404040404LL,
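Folding the high nibble onto the low one with xor preserves the parity of the whole value, so a single pshufb lookup replaces two. Bit 7 of x is still garbage at this point, but since it is folded into the parity and then xor-ed back onto itself, it cancels out. The same trick in scalar form (a sketch; parity7 is an illustrative name):

    // parity of a 7-bit value via nibble folding and a 16-entry table
    static inline int parity7(uint32_t v) {
        v ^= v >> 4;               // fold bits 4-6 onto bits 0-2, parity unchanged
        v &= 0x0F;                 // one nibble left
        return (0x6996 >> v) & 1;  // bit i of 0x6996 = popcount(i) & 1
    }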
From 7fe317f66230b3dcfe783ead3a81b239c8947779 Mon Sep 17 00:00:00 2001
From: David Friehs
Date: Mon, 16 Feb 2026 06:46:45 +0100
Subject: [PATCH 3/4] avx2: inline unpack_ksigns, fix up 8 sign bytes at once

---
 ggml/src/ggml-cpu/arch/x86/quants.c | 68 +++++++++++++++--------------
 1 file changed, 36 insertions(+), 32 deletions(-)

diff --git a/ggml/src/ggml-cpu/arch/x86/quants.c b/ggml/src/ggml-cpu/arch/x86/quants.c
index eb6b23d1e9..b1d7d0511c 100644
--- a/ggml/src/ggml-cpu/arch/x86/quants.c
+++ b/ggml/src/ggml-cpu/arch/x86/quants.c
@@ -2370,34 +2370,18 @@ static const int8_t keven_signs_q2xs[1024] = {
 #endif
 
 #if defined(__AVX2__)
-static inline __m256i unpack_ksigns(const uint32_t packed) {
-    __m128i x = _mm_set1_epi32(packed);
-
-    // shift each 32-bit value to its offset. bits 0-6 are ok, 7-31 are garbage
-    const __m128i shifts = _mm_setr_epi32(0, 7, 14, 21);
-    x = _mm_srlv_epi32(x, shifts);
-
-    // popc_odd has 0x80 at locations that have odd bitcount, 0x00 at even bitcount
-    const __m128i mask_nib = _mm_set1_epi32(0x0F);
-    const __m128i popc_odd = _mm_setr_epi32(0x00808000, 0x80000080, 0x80000080, 0x00808000);
-
-    // xor bits 4-7 into bits 0-3. this does not change whether the set bit count is odd
-    __m128i p = _mm_srli_epi32(x, 4);
-    p = _mm_xor_si128(p, x);
-    p = _mm_and_si128(p, mask_nib);
-    p = _mm_shuffle_epi8(popc_odd, p);
-
-    // correct bit 7 via xor. bits 0-7 now ok, 8-31 still garbage
-    x = _mm_xor_si128(x, p);
-
-    // expand to __m256i, broadcast bytes 0, 4, 8, 12
-    const __m256i shf = _mm256_setr_epi64x(0x0000000000000000LL, 0x0404040404040404LL,
-                                           0x0808080808080808LL, 0x0C0C0C0C0C0C0C0CLL);
-    const __m256i y = _mm256_shuffle_epi8(_mm256_broadcastsi128_si256(x), shf);
-
-    const __m256i sel = _mm256_set1_epi64x(0x8040201008040201LL);
-    return _mm256_cmpeq_epi8(_mm256_and_si256(y, sel), sel);
-}
+// shifts to the 7-bit sign groups in xxs quantizations
+static const uint32_t ksigns_shift_xxs[8] = {0, 7, 14, 21, 0, 7, 14, 21};
+// for _mm256_shuffle_epi8, has 0x80 at indices that are encoded with odd bit counts
+static const uint32_t ksigns_popc_odd[8] = {
+    0x00808000, 0x80000080, 0x80000080, 0x00808000,
+    0x00808000, 0x80000080, 0x80000080, 0x00808000,
+};
+// for _mm256_shuffle_epi8, broadcasts bytes 0, 4, 8, 12
+static const uint64_t ksigns_bcast_xxs[4] = {
+    0x0000000000000000ULL, 0x0404040404040404ULL,
+    0x0808080808080808ULL, 0x0C0C0C0C0C0C0C0CULL,
+};
 #endif
 
 void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -2418,6 +2402,12 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
     uint32_t aux32[4];
     const uint8_t * aux8 = (const uint8_t *)aux32;
 
+    const __m256i ks_shift = _mm256_loadu_si256((const __m256i *)ksigns_shift_xxs);
+    const __m256i ks_bcast = _mm256_loadu_si256((const __m256i *)ksigns_bcast_xxs);
+    const __m256i popc_odd = _mm256_loadu_si256((const __m256i *)ksigns_popc_odd);
+    const __m256i mask_nib = _mm256_set1_epi32(0x0F);
+    const __m256i ks_bsel = _mm256_set1_epi64x(0x8040201008040201LL);
+
     __m256 accumf = _mm256_setzero_ps();
     for (int i = 0; i < nb; ++i) {
         const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
@@ -2431,10 +2421,24 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
             memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8;
             const __m256i q2_1 = _mm256_set_epi64x(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]);
             const __m256i q2_2 = _mm256_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]);
-            const __m256i s2_1 = unpack_ksigns(aux32[1]);
-            const __m256i s2_2 = unpack_ksigns(aux32[3]);
-            const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(q8_1, s2_1), s2_1);
-            const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(q8_2, s2_2), s2_2);
+
+            const __m256i s_raw = MM256_SET_M128I(_mm_set1_epi32(aux32[3]), _mm_set1_epi32(aux32[1]));
+            // shift each value to its offset. bits 0-6 now ok, 7-31 are garbage
+            const __m256i s_7 = _mm256_srlv_epi32(s_raw, ks_shift);
+            // compute the bit parity via xor+lut, correct bit 7
+            const __m256i nib = _mm256_xor_si256(_mm256_srli_epi32(s_7, 4), s_7);
+            const __m256i popc = _mm256_shuffle_epi8(popc_odd, _mm256_and_si256(nib, mask_nib));
+            const __m256i s_8 = _mm256_xor_si256(s_7, popc);
+            // extract into two __m256i, broadcast bytes 0, 4, 8, 12
+            const __m256i s1_e = _mm256_permute2x128_si256(s_8, s_8, 0x00);
+            const __m256i s2_e = _mm256_permute2x128_si256(s_8, s_8, 0x11);
+            const __m256i s1_b = _mm256_shuffle_epi8(s1_e, ks_bcast);
+            const __m256i s2_b = _mm256_shuffle_epi8(s2_e, ks_bcast);
+            // set 0xFF in bytes whose sign bit is set, then negate via xor+sub
+            const __m256i s1 = _mm256_cmpeq_epi8(_mm256_and_si256(s1_b, ks_bsel), ks_bsel);
+            const __m256i s2 = _mm256_cmpeq_epi8(_mm256_and_si256(s2_b, ks_bsel), ks_bsel);
+            const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(q8_1, s1), s1);
+            const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(q8_2, s2), s2);
 
             const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
             const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
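With both 32-bit sign words in one 256-bit register, a single srlv and a single pshufb now handle all eight groups; the last step expands each fixed-up sign byte into eight 0xFF/0x00 byte masks. The selector 0x8040201008040201 works because x86 is little-endian: its byte j equals 1 << j, so and+cmpeq reports whether bit j of the broadcast sign byte is set. A scalar model of that expansion (the helper name is illustrative):

    // mirrors _mm256_cmpeq_epi8(_mm256_and_si256(y, sel), sel) for one group
    static void bits_to_byte_masks(uint8_t signs, uint8_t masks[8]) {
        for (int j = 0; j < 8; ++j) {
            masks[j] = (signs >> j) & 1 ? 0xFF : 0x00; // 0xFF selects negation
        }
    }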
From 4515987c95ff956452179982c8a4a2441aa5543a Mon Sep 17 00:00:00 2001
From: David Friehs
Date: Mon, 16 Feb 2026 22:21:19 +0100
Subject: [PATCH 4/4] avx2: shift and mask in __m128i then pack to save on permutes

---
 ggml/src/ggml-cpu/arch/x86/quants.c | 54 ++++++++++++++++-------------
 1 file changed, 30 insertions(+), 24 deletions(-)

diff --git a/ggml/src/ggml-cpu/arch/x86/quants.c b/ggml/src/ggml-cpu/arch/x86/quants.c
index b1d7d0511c..b54e2ae353 100644
--- a/ggml/src/ggml-cpu/arch/x86/quants.c
+++ b/ggml/src/ggml-cpu/arch/x86/quants.c
@@ -2371,16 +2371,17 @@ static const int8_t keven_signs_q2xs[1024] = {
 
 #if defined(__AVX2__)
 // shifts to the 7-bit sign groups in xxs quantizations
-static const uint32_t ksigns_shift_xxs[8] = {0, 7, 14, 21, 0, 7, 14, 21};
+static const uint32_t ksigns_shift_xxs[4] = {0, 7, 14, 21};
 // for _mm256_shuffle_epi8, has 0x80 at indices that are encoded with odd bit counts
-static const uint32_t ksigns_popc_odd[8] = {
-    0x00808000, 0x80000080, 0x80000080, 0x00808000,
-    0x00808000, 0x80000080, 0x80000080, 0x00808000,
+static const uint32_t ksigns_popc_odd[4] = {0x00808000, 0x80000080, 0x80000080, 0x00808000,};
+// for _mm256_shuffle_epi8, broadcasts bytes 0, 2, 4, 6 / 8, 10, 12, 14
+static const uint64_t ksigns_bcast_1[4] = {
+    0x0000000000000000ULL, 0x0202020202020202ULL,
+    0x0404040404040404ULL, 0x0606060606060606ULL,
 };
-// for _mm256_shuffle_epi8, broadcasts bytes 0, 4, 8, 12
-static const uint64_t ksigns_bcast_xxs[4] = {
-    0x0000000000000000ULL, 0x0404040404040404ULL,
-    0x0808080808080808ULL, 0x0C0C0C0C0C0C0C0CULL,
+static const uint64_t ksigns_bcast_2[4] = {
+    0x0808080808080808ULL, 0x0A0A0A0A0A0A0A0AULL,
+    0x0C0C0C0C0C0C0C0CULL, 0x0E0E0E0E0E0E0E0EULL,
 };
 #endif
 
@@ -2402,10 +2403,11 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
     uint32_t aux32[4];
     const uint8_t * aux8 = (const uint8_t *)aux32;
 
-    const __m256i ks_shift = _mm256_loadu_si256((const __m256i *)ksigns_shift_xxs);
-    const __m256i ks_bcast = _mm256_loadu_si256((const __m256i *)ksigns_bcast_xxs);
-    const __m256i popc_odd = _mm256_loadu_si256((const __m256i *)ksigns_popc_odd);
-    const __m256i mask_nib = _mm256_set1_epi32(0x0F);
+    const __m128i ks_shift = _mm_loadu_si128((const __m128i *)ksigns_shift_xxs);
+    const __m128i ks_mask = _mm_set1_epi32(0x7F);
+    const __m128i popc_odd = _mm_loadu_si128((const __m128i *)ksigns_popc_odd);
+    const __m256i ks_bc_1 = _mm256_loadu_si256((const __m256i *)ksigns_bcast_1);
+    const __m256i ks_bc_2 = _mm256_loadu_si256((const __m256i *)ksigns_bcast_2);
     const __m256i ks_bsel = _mm256_set1_epi64x(0x8040201008040201LL);
 
     __m256 accumf = _mm256_setzero_ps();
@@ -2422,18 +2424,22 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
             const __m256i q2_1 = _mm256_set_epi64x(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]);
             const __m256i q2_2 = _mm256_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]);
 
-            const __m256i s_raw = MM256_SET_M128I(_mm_set1_epi32(aux32[3]), _mm_set1_epi32(aux32[1]));
-            // shift each value to its offset. bits 0-6 now ok, 7-31 are garbage
-            const __m256i s_7 = _mm256_srlv_epi32(s_raw, ks_shift);
-            // compute the bit parity via xor+lut, correct bit 7
-            const __m256i nib = _mm256_xor_si256(_mm256_srli_epi32(s_7, 4), s_7);
-            const __m256i popc = _mm256_shuffle_epi8(popc_odd, _mm256_and_si256(nib, mask_nib));
-            const __m256i s_8 = _mm256_xor_si256(s_7, popc);
-            // extract into two __m256i, broadcast bytes 0, 4, 8, 12
-            const __m256i s1_e = _mm256_permute2x128_si256(s_8, s_8, 0x00);
-            const __m256i s2_e = _mm256_permute2x128_si256(s_8, s_8, 0x11);
-            const __m256i s1_b = _mm256_shuffle_epi8(s1_e, ks_bcast);
-            const __m256i s2_b = _mm256_shuffle_epi8(s2_e, ks_bcast);
+            __m128i s_l = _mm_set1_epi32(aux32[1]);
+            __m128i s_h = _mm_set1_epi32(aux32[3]);
+            // shift each value to its offset, then mask off the garbage bits
+            s_l = _mm_srlv_epi32(s_l, ks_shift);
+            s_h = _mm_srlv_epi32(s_h, ks_shift);
+            s_l = _mm_and_si128(s_l, ks_mask);
+            s_h = _mm_and_si128(s_h, ks_mask);
+            // pack, compute the bit parity via xor+lut, correct bit 7
+            __m128i signs_128 = _mm_packus_epi32(s_l, s_h);
+            const __m128i cnt4 = _mm_xor_si128(_mm_srli_epi16(signs_128, 4), signs_128);
+            const __m128i popc = _mm_shuffle_epi8(popc_odd, cnt4);
+            signs_128 = _mm_or_si128(signs_128, popc);
+            // expand to 256 bits, then broadcast each sign byte to 8 bytes
+            __m256i signs_256 = _mm256_broadcastsi128_si256(signs_128);
+            const __m256i s1_b = _mm256_shuffle_epi8(signs_256, ks_bc_1);
+            const __m256i s2_b = _mm256_shuffle_epi8(signs_256, ks_bc_2);
             // set 0xFF in bytes whose sign bit is set, then negate via xor+sub
             const __m256i s1 = _mm256_cmpeq_epi8(_mm256_and_si256(s1_b, ks_bsel), ks_bsel);
             const __m256i s2 = _mm256_cmpeq_epi8(_mm256_and_si256(s2_b, ks_bsel), ks_bsel);
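Masking with 0x7F before _mm_packus_epi32 guarantees bit 7 of every packed 16-bit lane is zero, which is why the parity byte can now be merged with _mm_or_si128 instead of xor, and why cnt4 needs no nibble mask: pshufb ignores bits 4-6 of its index bytes and zeroes the result where bit 7 is set, which here can only happen in the known-zero high bytes. An end-to-end scalar model of this final sign path for one aux32 word (a sketch for cross-checking against keven_signs_q2xs; the helper name is ours):

    // scalar reference: effective +-1 factors for the 32 q8 bytes covered by
    // one aux32 word; the vector code builds 0xFF/0x00 masks instead and
    // applies them via (q ^ s) - s
    static void ksigns_scalar_ref(uint32_t aux, int8_t out[32]) {
        for (int g = 0; g < 4; ++g) {
            uint32_t v = (aux >> (7 * g)) & 0x7F;        // srlv + and ks_mask
            uint32_t p = ((v >> 4) ^ v) & 0x0F;          // nibble fold, as in cnt4
            v |= (uint32_t)((0x6996 >> p) & 1) << 7;     // popc_odd lookup, or-merged
            for (int j = 0; j < 8; ++j) {
                out[8 * g + j] = (v >> j) & 1 ? -1 : 1;  // bit j -> sign of byte j
            }
        }
    }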