diff --git a/ggml/src/ggml-cpu/arch/x86/repack.cpp b/ggml/src/ggml-cpu/arch/x86/repack.cpp
index 7e6a375e89..0d93534524 100644
--- a/ggml/src/ggml-cpu/arch/x86/repack.cpp
+++ b/ggml/src/ggml-cpu/arch/x86/repack.cpp
@@ -1971,8 +1971,11 @@ void ggml_gemv_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     // Permute mask used for easier vector processing at later stages
     __m256i finalpermutemask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
 
+    // Mask to extract 2 bits from packed bytes
     const __m256i m3b = _mm256_set1_epi8(3);
+    // Mask to extract nibbles from packed bytes
    const __m256i m4b = _mm256_set1_epi8(0xF);
+    // Vector with each byte valued 32 - used as a subtraction offset for 6 bit quantized values
     const __m256i m32s = _mm256_set1_epi8(32);
 
     //Mask to get appropriate scales
@@ -1998,7 +2001,6 @@ void ggml_gemv_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
 
     // Master FP accumulators
     __m256 acc_row = _mm256_setzero_ps();
-    __m256 acc_min_rows = _mm256_setzero_ps();
 
     for (int64_t b = 0; b < nb; b++) {
 
@@ -2015,135 +2017,137 @@ void ggml_gemv_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
 
         for(int sb = 0; sb < QK_K / 128; sb++) {
 
             // Load the high bits(bit 5, 6) of eight block_q6_K for eight sub blocks quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
-            const __m256i rhs_raw_vec_qh_0123_0 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + sb * 256));
-            const __m256i rhs_raw_vec_qh_4567_0 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 32 + sb * 256));
-            const __m256i rhs_raw_vec_qh_0123_1 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 64 + sb * 256));
-            const __m256i rhs_raw_vec_qh_4567_1 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 96 + sb * 256));
-            const __m256i rhs_raw_vec_qh_0123_2 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 128 + sb * 256));
-            const __m256i rhs_raw_vec_qh_4567_2 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 160 + sb * 256));
-            const __m256i rhs_raw_vec_qh_0123_3 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 192 + sb * 256));
-            const __m256i rhs_raw_vec_qh_4567_3 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 224 + sb * 256));
+            const __m256i rhs_raw_hbit_0123_0 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + sb * 256));
+            const __m256i rhs_raw_hbit_4567_0 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 32 + sb * 256));
+            const __m256i rhs_raw_hbit_0123_1 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 64 + sb * 256));
+            const __m256i rhs_raw_hbit_4567_1 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 96 + sb * 256));
+            const __m256i rhs_raw_hbit_0123_2 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 128 + sb * 256));
+            const __m256i rhs_raw_hbit_4567_2 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 160 + sb * 256));
+            const __m256i rhs_raw_hbit_0123_3 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 192 + sb * 256));
+            const __m256i rhs_raw_hbit_4567_3 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 224 + sb * 256));
 
             // 2-bit -> 8-bit
-            // Values of the 0th,2nd,4th,6th sub blocks of eight block_q6_K structures for the sb loop
-            const __m256i rhs_vec_qh_0123_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_vec_qh_0123_0, m3b), 4); //B00(0-7) B01(0-7) B02(0-7) B03(0-7)
-            const __m256i rhs_vec_qh_0123_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_0, 2), m3b), 4); //B20(0-7) B21(0-7) B22(0-7) B23(0-7)
-            const __m256i rhs_vec_qh_0123_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_0, 4), m3b), 4); //B40(0-7) B41(0-7) B42(0-7) B43(0-7)
-            const __m256i rhs_vec_qh_0123_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_0, 6), m3b), 4); //B60(0-7) B61(0-7) B62(0-7) B63(0-7)
+            // hbit Values of the 0th,2nd,4th,6th sub blocks of eight block_q6_K structures for the sb loop
+            const __m256i rhs_hbit_0123_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0123_0, m3b), 4); //B00(0-7) B01(0-7) B02(0-7) B03(0-7)
+            const __m256i rhs_hbit_0123_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0123_0, 2), m3b), 4); //B20(0-7) B21(0-7) B22(0-7) B23(0-7)
+            const __m256i rhs_hbit_0123_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0123_0, 4), m3b), 4); //B40(0-7) B41(0-7) B42(0-7) B43(0-7)
+            const __m256i rhs_hbit_0123_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0123_0, 6), m3b), 4); //B60(0-7) B61(0-7) B62(0-7) B63(0-7)
 
-            const __m256i rhs_vec_qh_4567_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_vec_qh_4567_0, m3b), 4); //B04(0-7) B05(0-7) B06(0-7) B07(0-7)
-            const __m256i rhs_vec_qh_4567_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_0, 2), m3b), 4); //B24(0-7) B25(0-7) B26(0-7) B27(0-7)
-            const __m256i rhs_vec_qh_4567_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_0, 4), m3b), 4); //B44(0-7) B45(0-7) B46(0-7) B47(0-7)
-            const __m256i rhs_vec_qh_4567_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_0, 6), m3b), 4); //B64(0-7) B65(0-7) B66(0-7) B67(0-7)
+            const __m256i rhs_hbit_4567_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_4567_0, m3b), 4); //B04(0-7) B05(0-7) B06(0-7) B07(0-7)
+            const __m256i rhs_hbit_4567_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_4567_0, 2), m3b), 4); //B24(0-7) B25(0-7) B26(0-7) B27(0-7)
+            const __m256i rhs_hbit_4567_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_4567_0, 4), m3b), 4); //B44(0-7) B45(0-7) B46(0-7) B47(0-7)
+            const __m256i rhs_hbit_4567_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_4567_0, 6), m3b), 4); //B64(0-7) B65(0-7) B66(0-7) B67(0-7)
 
-            const __m256i rhs_vec_qh_0123_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_vec_qh_0123_1, m3b), 4); //B00(8-15) B01(8-15) B02(8-15) B03(8-15)
-            const __m256i rhs_vec_qh_0123_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_1, 2), m3b), 4); //B20(8-15) B21(8-15) B22(8-15) B23(8-15)
-            const __m256i rhs_vec_qh_0123_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_1, 4), m3b), 4); //B40(8-15) B41(8-15) B42(8-15) B43(8-15)
-            const __m256i rhs_vec_qh_0123_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_1, 6), m3b), 4); //B60(8-15) B61(8-15) B62(8-15) B63(8-15)
+            const __m256i rhs_hbit_0123_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0123_1, m3b), 4); //B00(8-15) B01(8-15) B02(8-15) B03(8-15)
+            const __m256i rhs_hbit_0123_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0123_1, 2), m3b), 4); //B20(8-15) B21(8-15) B22(8-15) B23(8-15)
+            const __m256i rhs_hbit_0123_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0123_1, 4), m3b), 4); //B40(8-15) B41(8-15) B42(8-15) B43(8-15)
+            const __m256i rhs_hbit_0123_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0123_1, 6), m3b), 4); //B60(8-15) B61(8-15) B62(8-15) B63(8-15)
 
-            const __m256i rhs_vec_qh_4567_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_vec_qh_4567_1, m3b), 4); //B04(8-15) B05(8-15) B06(8-15) B07(8-15)
-            const __m256i rhs_vec_qh_4567_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_1, 2), m3b), 4); //B24(8-15) B25(8-15) B26(8-15) B27(8-15)
-            const __m256i rhs_vec_qh_4567_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_1, 4), m3b), 4); //B44(8-15) B45(8-15) B46(8-15) B47(8-15)
-            const __m256i rhs_vec_qh_4567_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_1, 6), m3b), 4); //B64(8-15) B65(8-15) B66(8-15) B67(8-15)
+            const __m256i rhs_hbit_4567_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_4567_1, m3b), 4); //B04(8-15) B05(8-15) B06(8-15) B07(8-15)
+            const __m256i rhs_hbit_4567_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_4567_1, 2), m3b), 4); //B24(8-15) B25(8-15) B26(8-15) B27(8-15)
+            const __m256i rhs_hbit_4567_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_4567_1, 4), m3b), 4); //B44(8-15) B45(8-15) B46(8-15) B47(8-15)
+            const __m256i rhs_hbit_4567_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_4567_1, 6), m3b), 4); //B64(8-15) B65(8-15) B66(8-15) B67(8-15)
 
-            // Values of the 1st,3rd,5th,7th sub blocks of eight block_q6_K structures for the sb loop
-            const __m256i rhs_vec_qh_0123_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_vec_qh_0123_2, m3b), 4); //B10(0-7) B11(0-7) B12(0-7) B13(0-7)
-            const __m256i rhs_vec_qh_0123_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_2, 2), m3b), 4); //B30(0-7) B31(0-7) B32(0-7) B33(0-7)
-            const __m256i rhs_vec_qh_0123_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_2, 4), m3b), 4); //B50(0-7) B51(0-7) B52(0-7) B53(0-7)
-            const __m256i rhs_vec_qh_0123_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_2, 6), m3b), 4); //B70(0-7) B71(0-7) B72(0-7) B73(0-7)
+            // hbit Values of the 1st,3rd,5th,7th sub blocks of eight block_q6_K structures for the sb loop
+            const __m256i rhs_hbit_0123_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0123_2, m3b), 4); //B10(0-7) B11(0-7) B12(0-7) B13(0-7)
+            const __m256i rhs_hbit_0123_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0123_2, 2), m3b), 4); //B30(0-7) B31(0-7) B32(0-7) B33(0-7)
+            const __m256i rhs_hbit_0123_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0123_2, 4), m3b), 4); //B50(0-7) B51(0-7) B52(0-7) B53(0-7)
+            const __m256i rhs_hbit_0123_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0123_2, 6), m3b), 4); //B70(0-7) B71(0-7) B72(0-7) B73(0-7)
 
-            const __m256i rhs_vec_qh_4567_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_vec_qh_4567_2, m3b), 4); //B14(0-7) B15(0-7) B16(0-7) B17(0-7)
-            const __m256i rhs_vec_qh_4567_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_2, 2), m3b), 4); //B34(0-7) B35(0-7) B36(0-7) B37(0-7)
-            const __m256i rhs_vec_qh_4567_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_2, 4), m3b), 4); //B54(0-7) B55(0-7) B56(0-7) B57(0-7)
-            const __m256i rhs_vec_qh_4567_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_2, 6), m3b), 4); //B74(0-7) B75(0-7) B76(0-7) B77(0-7)
+            const __m256i rhs_hbit_4567_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_4567_2, m3b), 4); //B14(0-7) B15(0-7) B16(0-7) B17(0-7)
+            const __m256i rhs_hbit_4567_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_4567_2, 2), m3b), 4); //B34(0-7) B35(0-7) B36(0-7) B37(0-7)
+            const __m256i rhs_hbit_4567_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_4567_2, 4), m3b), 4); //B54(0-7) B55(0-7) B56(0-7) B57(0-7)
+            const __m256i rhs_hbit_4567_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_4567_2, 6), m3b), 4); //B74(0-7) B75(0-7) B76(0-7) B77(0-7)
 
-            const __m256i rhs_vec_qh_0123_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_vec_qh_0123_3, m3b), 4); //B10(8-15) B11(8-15) B12(8-15) B13(8-15)
-            const __m256i rhs_vec_qh_0123_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_3, 2), m3b), 4); //B30(8-15) B31(8-15) B32(8-15) B33(8-15)
-            const __m256i rhs_vec_qh_0123_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_3, 4), m3b), 4); //B50(8-15) B51(8-15) B52(8-15) B53(8-15)
-            const __m256i rhs_vec_qh_0123_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_3, 6), m3b), 4); //B70(8-15) B71(8-15) B72(8-15) B73(8-15)
+            const __m256i rhs_hbit_0123_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0123_3, m3b), 4); //B10(8-15) B11(8-15) B12(8-15) B13(8-15)
+            const __m256i rhs_hbit_0123_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0123_3, 2), m3b), 4); //B30(8-15) B31(8-15) B32(8-15) B33(8-15)
+            const __m256i rhs_hbit_0123_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0123_3, 4), m3b), 4); //B50(8-15) B51(8-15) B52(8-15) B53(8-15)
+            const __m256i rhs_hbit_0123_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0123_3, 6), m3b), 4); //B70(8-15) B71(8-15) B72(8-15) B73(8-15)
 
-            const __m256i rhs_vec_qh_4567_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_vec_qh_4567_3, m3b), 4); //B14(8-15) B15(8-15) B16(8-15) B17(8-15)
-            const __m256i rhs_vec_qh_4567_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_3, 2), m3b), 4); //B34(8-15) B35(8-15) B36(8-15) B37(8-15)
-            const __m256i rhs_vec_qh_4567_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_3, 4), m3b), 4); //B54(8-15) B55(8-15) B56(8-15) B57(8-15)
-            const __m256i rhs_vec_qh_4567_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_3, 6), m3b), 4); //B74(8-15) B75(8-15) B76(8-15) B77(8-15)
+            const __m256i rhs_hbit_4567_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_4567_3, m3b), 4); //B14(8-15) B15(8-15) B16(8-15) B17(8-15)
+            const __m256i rhs_hbit_4567_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_4567_3, 2), m3b), 4); //B34(8-15) B35(8-15) B36(8-15) B37(8-15)
+            const __m256i rhs_hbit_4567_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_4567_3, 4), m3b), 4); //B54(8-15) B55(8-15) B56(8-15) B57(8-15)
+            const __m256i rhs_hbit_4567_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_4567_3, 6), m3b), 4); //B74(8-15) B75(8-15) B76(8-15) B77(8-15)
 
-            // Load the lower bits(bits 0 - 3) of eight block_q6_K for eight sub blocks quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
-            const __m256i rhs_raw_vec_ql_0123_0 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + sb * 512)); // 0 - 8, +64
-            const __m256i rhs_raw_vec_ql_4567_0 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 32 + sb * 512)); // 0 - 8
-            const __m256i rhs_raw_vec_ql_0123_1 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 64 + sb * 512)); // 8 - 15
-            const __m256i rhs_raw_vec_ql_4567_1 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 96 + sb * 512)); // 8 - 15
-            const __m256i rhs_raw_vec_ql_0123_2 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 128 + sb * 512)); // 16 - 23
-            const __m256i rhs_raw_vec_ql_4567_2 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 160 + sb * 512)); // 16 - 23
-            const __m256i rhs_raw_vec_ql_0123_3 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 192 + sb * 512)); // 24 - 31
-            const __m256i rhs_raw_vec_ql_4567_3 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 224 + sb * 512)); // 24 - 31
-            const __m256i rhs_raw_vec_ql_0123_4 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 256 + sb * 512));
-            const __m256i rhs_raw_vec_ql_4567_4 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 288 + sb * 512));
-            const __m256i rhs_raw_vec_ql_0123_5 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 320 + sb * 512));
-            const __m256i rhs_raw_vec_ql_4567_5 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 352 + sb * 512));
-            const __m256i rhs_raw_vec_ql_0123_6 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 384 + sb * 512));
-            const __m256i rhs_raw_vec_ql_4567_6 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 416 + sb * 512));
-            const __m256i rhs_raw_vec_ql_0123_7 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 448 + sb * 512));
-            const __m256i rhs_raw_vec_ql_4567_7 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 480 + sb * 512));
+            // Load the eight block_q6_K for eight sub blocks quantized values interleaved with each other in chunks of eight bytes - B0,B1 ....B6,B7
+            // The low and high packed bits are loaded and unpacked, and individual bytes each representing a 6 bit value are formed from them
+            // They are blended/permuted for further mul mat operations within the pipeline
+            const __m256i rhs_raw_lbit_0123_0 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + sb * 512));
+            const __m256i rhs_raw_lbit_4567_0 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 32 + sb * 512));
+            const __m256i rhs_raw_lbit_0123_1 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 64 + sb * 512));
+            const __m256i rhs_raw_lbit_4567_1 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 96 + sb * 512));
+            const __m256i rhs_raw_lbit_0123_2 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 128 + sb * 512));
+            const __m256i rhs_raw_lbit_4567_2 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 160 + sb * 512));
+            const __m256i rhs_raw_lbit_0123_3 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 192 + sb * 512));
+            const __m256i rhs_raw_lbit_4567_3 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 224 + sb * 512));
+            const __m256i rhs_raw_lbit_0123_4 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 256 + sb * 512));
+            const __m256i rhs_raw_lbit_4567_4 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 288 + sb * 512));
+            const __m256i rhs_raw_lbit_0123_5 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 320 + sb * 512));
+            const __m256i rhs_raw_lbit_4567_5 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 352 + sb * 512));
+            const __m256i rhs_raw_lbit_0123_6 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 384 + sb * 512));
+            const __m256i rhs_raw_lbit_4567_6 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 416 + sb * 512));
+            const __m256i rhs_raw_lbit_0123_7 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 448 + sb * 512));
+            const __m256i rhs_raw_lbit_4567_7 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 480 + sb * 512));
 
-            // 0 -7, 64 - 71
-            const __m256i rhs_vec_0123_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_0123_0, m4b), rhs_vec_qh_0123_00);
-            const __m256i rhs_vec_0123_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_0123_0, 4), m4b), rhs_vec_qh_0123_40);
+            // Comments indicate the indices of elements from individual super block in non interleaved fashion
+            // Index : 0 -7, 64 - 71
+            const __m256i rhs_vec_0123_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0123_0, m4b), rhs_hbit_0123_00);
+            const __m256i rhs_vec_0123_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0123_0, 4), m4b), rhs_hbit_0123_40);
 
-            const __m256i rhs_vec_4567_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_4567_0, m4b), rhs_vec_qh_4567_00);
-            const __m256i rhs_vec_4567_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_4567_0, 4), m4b), rhs_vec_qh_4567_40);
+            const __m256i rhs_vec_4567_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_4567_0, m4b), rhs_hbit_4567_00);
+            const __m256i rhs_vec_4567_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_4567_0, 4), m4b), rhs_hbit_4567_40);
 
-            // 8 - 15, 72 - 79
-            const __m256i rhs_vec_0123_01 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_0123_1, m4b), rhs_vec_qh_0123_01);
-            const __m256i rhs_vec_0123_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_0123_1, 4), m4b), rhs_vec_qh_0123_41);
+            // Index : 8 - 15, 72 - 79
+            const __m256i rhs_vec_0123_01 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0123_1, m4b), rhs_hbit_0123_01);
+            const __m256i rhs_vec_0123_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0123_1, 4), m4b), rhs_hbit_0123_41);
 
-            const __m256i rhs_vec_4567_01 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_4567_1, m4b), rhs_vec_qh_4567_01);
-            const __m256i rhs_vec_4567_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_4567_1, 4), m4b), rhs_vec_qh_4567_41);
+            const __m256i rhs_vec_4567_01 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_4567_1, m4b), rhs_hbit_4567_01);
+            const __m256i rhs_vec_4567_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_4567_1, 4), m4b), rhs_hbit_4567_41);
 
-            // 16 - 23, 80 - 87
-            const __m256i rhs_vec_0123_10 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_0123_2, m4b), rhs_vec_qh_0123_10);
-            const __m256i rhs_vec_0123_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_0123_2, 4), m4b), rhs_vec_qh_0123_50);
+            // Index : 16 - 23, 80 - 87
+            const __m256i rhs_vec_0123_10 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0123_2, m4b), rhs_hbit_0123_10);
+            const __m256i rhs_vec_0123_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0123_2, 4), m4b), rhs_hbit_0123_50);
 
-            const __m256i rhs_vec_4567_10 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_4567_2, m4b), rhs_vec_qh_4567_10);
-            const __m256i rhs_vec_4567_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_4567_2, 4), m4b), rhs_vec_qh_4567_50);
+            const __m256i rhs_vec_4567_10 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_4567_2, m4b), rhs_hbit_4567_10);
+            const __m256i rhs_vec_4567_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_4567_2, 4), m4b), rhs_hbit_4567_50);
 
-            // 24 - 31, 88 - 95
-            const __m256i rhs_vec_0123_11 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_0123_3, m4b), rhs_vec_qh_0123_11);
-            const __m256i rhs_vec_0123_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_0123_3, 4), m4b), rhs_vec_qh_0123_51);
+            // Index : 24 - 31, 88 - 95
+            const __m256i rhs_vec_0123_11 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0123_3, m4b), rhs_hbit_0123_11);
+            const __m256i rhs_vec_0123_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0123_3, 4), m4b), rhs_hbit_0123_51);
 
-            const __m256i rhs_vec_4567_11 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_4567_3, m4b), rhs_vec_qh_4567_11);
-            const __m256i rhs_vec_4567_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_4567_3, 4), m4b), rhs_vec_qh_4567_51);
+            const __m256i rhs_vec_4567_11 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_4567_3, m4b), rhs_hbit_4567_11);
+            const __m256i rhs_vec_4567_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_4567_3, 4), m4b), rhs_hbit_4567_51);
 
-            // 32 - 39, 96 - 103
-            const __m256i rhs_vec_0123_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_0123_4, m4b), rhs_vec_qh_0123_20);
-            const __m256i rhs_vec_0123_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_0123_4, 4), m4b), rhs_vec_qh_0123_60);
+            // Index : 32 - 39, 96 - 103
+            const __m256i rhs_vec_0123_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0123_4, m4b), rhs_hbit_0123_20);
+            const __m256i rhs_vec_0123_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0123_4, 4), m4b), rhs_hbit_0123_60);
 
-            const __m256i rhs_vec_4567_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_4567_4, m4b), rhs_vec_qh_4567_20);
-            const __m256i rhs_vec_4567_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_4567_4, 4), m4b), rhs_vec_qh_4567_60);
+            const __m256i rhs_vec_4567_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_4567_4, m4b), rhs_hbit_4567_20);
+            const __m256i rhs_vec_4567_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_4567_4, 4), m4b), rhs_hbit_4567_60);
 
-            // 40 - 47, 104 - 111
-            const __m256i rhs_vec_0123_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_0123_5, m4b), rhs_vec_qh_0123_21);
-            const __m256i rhs_vec_0123_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_0123_5, 4), m4b), rhs_vec_qh_0123_61);
+            // Index : 40 - 47, 104 - 111
+            const __m256i rhs_vec_0123_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0123_5, m4b), rhs_hbit_0123_21);
+            const __m256i rhs_vec_0123_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0123_5, 4), m4b), rhs_hbit_0123_61);
 
-            const __m256i rhs_vec_4567_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_4567_5, m4b), rhs_vec_qh_4567_21);
-            const __m256i rhs_vec_4567_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_4567_5, 4), m4b), rhs_vec_qh_4567_61);
+            const __m256i rhs_vec_4567_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_4567_5, m4b), rhs_hbit_4567_21);
+            const __m256i rhs_vec_4567_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_4567_5, 4), m4b), rhs_hbit_4567_61);
 
-            // 48 - 55, 112 - 119
-            const __m256i rhs_vec_0123_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_0123_6, m4b), rhs_vec_qh_0123_30);
-            const __m256i rhs_vec_0123_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_0123_6, 4), m4b), rhs_vec_qh_0123_70);
+            // Index : 48 - 55, 112 - 119
+            const __m256i rhs_vec_0123_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0123_6, m4b), rhs_hbit_0123_30);
+            const __m256i rhs_vec_0123_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0123_6, 4), m4b), rhs_hbit_0123_70);
 
-            const __m256i rhs_vec_4567_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_4567_6, m4b), rhs_vec_qh_4567_30);
-            const __m256i rhs_vec_4567_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_4567_6, 4), m4b), rhs_vec_qh_4567_70);
+            const __m256i rhs_vec_4567_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_4567_6, m4b), rhs_hbit_4567_30);
+            const __m256i rhs_vec_4567_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_4567_6, 4), m4b), rhs_hbit_4567_70);
 
-            // 56 - 63, 120 - 127
-            const __m256i rhs_vec_0123_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_0123_7, m4b), rhs_vec_qh_0123_31);
-            const __m256i rhs_vec_0123_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_0123_7, 4), m4b), rhs_vec_qh_0123_71);
+            // Index : 56 - 63, 120 - 127
+            const __m256i rhs_vec_0123_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0123_7, m4b), rhs_hbit_0123_31);
+            const __m256i rhs_vec_0123_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0123_7, 4), m4b), rhs_hbit_0123_71);
 
-            const __m256i rhs_vec_4567_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_4567_7, m4b), rhs_vec_qh_4567_31);
-            const __m256i rhs_vec_4567_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_4567_7, 4), m4b), rhs_vec_qh_4567_71);
+            const __m256i rhs_vec_4567_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_4567_7, m4b), rhs_hbit_4567_31);
+            const __m256i rhs_vec_4567_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_4567_7, 4), m4b), rhs_hbit_4567_71);
 
             //Scales of corresponding sub blocks from different Q6_K structures are stored together
-            //s00 s01 s10 s11 s20 s21 s30 s31 s40 s41 s50 s51 s60 s61 s70 s71 //s02 s03 //s04 s05 //s06 s07
-
+            //s00 s01 s10 s11 s20 s21 s30 s31 s40 s41 s50 s51 s60 s61 s70 s71
             const __m128i scales_01 = _mm_loadu_si128((const __m128i *)(b_ptr[b].scales + sb * 64));
             const __m128i scales_23 = _mm_loadu_si128((const __m128i *)(b_ptr[b].scales + 16 + sb * 64));
             const __m128i scales_45 = _mm_loadu_si128((const __m128i *)(b_ptr[b].scales + 32 + sb * 64));
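For reference, the per-byte bit manipulation in the gemv hunk above (low nibble via m4b, two high bits via m3b shifted into bits 4 and 5, then the offset of 32 via m32s) reduces to the following scalar sketch. The helper is hypothetical and ignores the repacked, interleaved layout; it only illustrates the per-value math.

#include <cstdint>

// Hypothetical scalar equivalent of the vector unpack above: reconstruct one
// 6 bit Q6_K value from its packed low nibble and 2 high bits.
static inline int32_t unpack_q6(uint8_t ql_byte, uint8_t qh_byte, int qh_shift, bool hi_nibble) {
    const uint8_t lo = hi_nibble ? (ql_byte >> 4) : (ql_byte & 0xF); // 4 low bits (m4b path)
    const uint8_t hi = (qh_byte >> qh_shift) & 3;                    // 2 high bits (m3b path)
    return (int32_t)(lo | (hi << 4)) - 32;                           // 6 bit code minus the 32 offset (m32s)
}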
@@ -2201,6 +2205,7 @@ void ggml_gemv_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
             lhs_vec_6 = _mm256_permute2f128_si256(lhs_vec_6, lhs_vec_6, 0);
             lhs_vec_7 = _mm256_permute2f128_si256(lhs_vec_7, lhs_vec_7, 0);
 
+            // Multiply Q8 quants with bytes valued 32 - subtracted later as an adjustment for the 6 bit quantization offset
             __m256i lhs_vec_s_0 = _mm256_maddubs_epi16(m32s, lhs_vec_0);
             __m256i lhs_vec_s_1 = _mm256_maddubs_epi16(m32s, lhs_vec_1);
             __m256i lhs_vec_s_2 = _mm256_maddubs_epi16(m32s, lhs_vec_2);
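The m32s multiply works because a Q6_K code stores the value plus 32, so the offset can be folded into the integer dot product instead of being tracked in a separate float accumulator (which appears to be why acc_min_rows is dropped above): for each sub block, sum((q - 32) * y) equals sum(q * y) - 32 * sum(y), and _mm256_maddubs_epi16(m32s, lhs_vec) produces the pairwise 32 * y terms as int16. A scalar sketch of the identity, with a hypothetical helper that is not part of the patch:

#include <cstdint>

// Hypothetical scalar sketch of the offset folding used by the kernel:
// q[i] is the raw 6 bit code (value + 32), y[i] is the Q8 quant, and
//   sum((q[i] - 32) * y[i]) == sum(q[i] * y[i]) - 32 * sum(y[i]).
static int32_t dot_q6_q8(const uint8_t * q, const int8_t * y, int n) {
    int32_t dot = 0, sum_y = 0;
    for (int i = 0; i < n; i++) {
        dot   += (int32_t) q[i] * y[i]; // matches the code/quant dot products
        sum_y += y[i];                  // matches maddubs(m32s, lhs_vec) up to the factor 32
    }
    return dot - 32 * sum_y;            // the adjustment subtracted later in the kernel
}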
@@ -6922,7 +6927,6 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
             const __m512i rhs_hbit_2367ABEF_60 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 6), m2_expanded), 4); //Index : 96 - 103
 
             // Comments indicate the indices of elements from individual super block in non interleaved fashion
-            // Index : 8 - 15, 72 - 79
             const __m512i rhs_hbit_014589CD_01 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_1, m2_expanded), 4); //Index : 8 - 15
             const __m512i rhs_hbit_014589CD_21 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 2), m2_expanded), 4); //Index : 40 - 47
             const __m512i rhs_hbit_014589CD_41 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 4), m2_expanded), 4); //Index : 72 - 79
@@ -8719,7 +8723,6 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
             const __m256i rhs_raw_hbit_0123_3 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qh + 192 + sb * 256));
             const __m256i rhs_raw_hbit_4567_3 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qh + 224 + sb * 256));
 
-            // Indices 0 through 7 (first block):
             const __m256i rhs_raw_lbit_0145_0 = _mm256_blend_epi32(rhs_raw_lbit_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_0, requiredOrder), 240);
             const __m256i rhs_raw_lbit_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_0, requiredOrder), rhs_raw_lbit_4567_0, 240);
             const __m256i rhs_raw_lbit_0145_1 = _mm256_blend_epi32(rhs_raw_lbit_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_1, requiredOrder), 240);
@@ -8729,7 +8732,6 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
             const __m256i rhs_raw_lbit_0145_3 = _mm256_blend_epi32(rhs_raw_lbit_0123_3, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_3, requiredOrder), 240);
             const __m256i rhs_raw_lbit_2367_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_3, requiredOrder), rhs_raw_lbit_4567_3, 240);
 
-            // Indices 4 through 7 (second block):
             const __m256i rhs_raw_lbit_0145_4 = _mm256_blend_epi32(rhs_raw_lbit_0123_4, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_4, requiredOrder), 240);
             const __m256i rhs_raw_lbit_2367_4 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_4, requiredOrder), rhs_raw_lbit_4567_4, 240);
             const __m256i rhs_raw_lbit_0145_5 = _mm256_blend_epi32(rhs_raw_lbit_0123_5, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_5, requiredOrder), 240);
@@ -8793,8 +8795,6 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
 
             // 4 bit values are unpacked/denibbled and bitwise or-ed with the hbit values to form the 6 bit quantized values
 
-            // Comments indicate the indices of elements from individual super block in non interleaved fashion
-            // Index : 0 -7, 64 - 71
             // Comments indicate the indices of elements from individual super block in non interleaved fashion
             // Index : 0 -7, 64 - 71
             const __m256i rhs_mat_0145_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_0, m4), rhs_hbit_0145_00);
@@ -9533,7 +9533,6 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
             const __m256i rhs_raw_hbit_0123_3 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qh + 192 + sb * 256));
             const __m256i rhs_raw_hbit_4567_3 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qh + 224 + sb * 256));
 
-            // Indices 0 through 7 (first block):
             const __m256i rhs_raw_lbit_0145_0 = _mm256_blend_epi32(rhs_raw_lbit_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_0, requiredOrder), 240);
             const __m256i rhs_raw_lbit_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_0, requiredOrder), rhs_raw_lbit_4567_0, 240);
             const __m256i rhs_raw_lbit_0145_1 = _mm256_blend_epi32(rhs_raw_lbit_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_1, requiredOrder), 240);
@@ -9543,7 +9542,6 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
             const __m256i rhs_raw_lbit_0145_3 = _mm256_blend_epi32(rhs_raw_lbit_0123_3, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_3, requiredOrder), 240);
             const __m256i rhs_raw_lbit_2367_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_3, requiredOrder), rhs_raw_lbit_4567_3, 240);
 
-            // Indices 4 through 7 (second block):
             const __m256i rhs_raw_lbit_0145_4 = _mm256_blend_epi32(rhs_raw_lbit_0123_4, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_4, requiredOrder), 240);
             const __m256i rhs_raw_lbit_2367_4 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_4, requiredOrder), rhs_raw_lbit_4567_4, 240);
             const __m256i rhs_raw_lbit_0145_5 = _mm256_blend_epi32(rhs_raw_lbit_0123_5, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_5, requiredOrder), 240);
@@ -9605,8 +9603,6 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
             const __m256i rhs_hbit_2367_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 4), m2), 4); //Index : 88 - 95
             const __m256i rhs_hbit_2367_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 6), m2), 4); //Index : 120 - 127
 
-            // Comments indicate the indices of elements from individual super block in non interleaved fashion
-            // Index : 0 -7, 64 - 71
             // Comments indicate the indices of elements from individual super block in non interleaved fashion
             // Index : 0 -7, 64 - 71
             const __m256i rhs_mat_0145_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_0, m4), rhs_hbit_0145_00);
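The repeated blend/permute pattern in the gemm hunks pairs the low (or high) 128 bit lanes of the 0123 and 4567 row groups into the 0145 and 2367 vectors. A standalone sketch of the 0145 case follows, assuming requiredOrder is the usual lane-swapping index vector _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4) used elsewhere in this file (its definition is not shown in this diff):

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
    // Assumed definition: swaps the two 128 bit lanes of a vector.
    const __m256i requiredOrder = _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4);
    alignas(32) int32_t a[8] = {0, 1, 2, 3, 4, 5, 6, 7};         // stands in for a rhs_raw_lbit_0123_x vector
    alignas(32) int32_t b[8] = {10, 11, 12, 13, 14, 15, 16, 17}; // stands in for a rhs_raw_lbit_4567_x vector
    const __m256i va = _mm256_load_si256((const __m256i *)a);
    const __m256i vb = _mm256_load_si256((const __m256i *)b);
    // Immediate 240 = 0b11110000: keep dwords 0-3 from the first operand,
    // take dwords 4-7 from the second (whose lanes were just swapped), so the
    // result holds the low lane of a next to the low lane of b - rows 0,1
    // paired with rows 4,5, hence the 0145 name.
    const __m256i v0145 = _mm256_blend_epi32(va, _mm256_permutevar8x32_epi32(vb, requiredOrder), 240);
    alignas(32) int32_t out[8];
    _mm256_store_si256((__m256i *)out, v0145);
    for (int i = 0; i < 8; i++) printf("%d ", out[i]); // prints: 0 1 2 3 10 11 12 13
    printf("\n");
    return 0;
}

The 2367 variant mirrors this by swapping the lanes of the first operand instead, so the high lanes of both vectors end up together.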