Cleanup GEMV Code
This commit is contained in:
parent
4806d6a8fe
commit
5c851ca7bd
|
|
@ -1971,8 +1971,11 @@ void ggml_gemv_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
// Permute mask used for easier vector processing at later stages
|
||||
__m256i finalpermutemask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
|
||||
|
||||
// Mask to extract 2 bits from packed bytes
|
||||
const __m256i m3b = _mm256_set1_epi8(3);
|
||||
// Mask to extract nibbles from packed bytes
|
||||
const __m256i m4b = _mm256_set1_epi8(0xF);
|
||||
// Vector with each byte value 32 - Used as an subtract offset for 6 bit quantized values
|
||||
const __m256i m32s = _mm256_set1_epi8(32);
|
||||
|
||||
//Mask to get appropriate scales
|
||||
|
|
@ -1998,7 +2001,6 @@ void ggml_gemv_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
|
||||
// Master FP accumulators
|
||||
__m256 acc_row = _mm256_setzero_ps();
|
||||
__m256 acc_min_rows = _mm256_setzero_ps();
|
||||
|
||||
for (int64_t b = 0; b < nb; b++) {
|
||||
|
||||
|
|
@ -2015,135 +2017,137 @@ void ggml_gemv_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
for(int sb = 0; sb < QK_K / 128; sb++) {
|
||||
|
||||
// Load the high bits(bit 5, 6) of eight block_q6_K for eight sub blocks quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
|
||||
const __m256i rhs_raw_vec_qh_0123_0 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + sb * 256));
|
||||
const __m256i rhs_raw_vec_qh_4567_0 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 32 + sb * 256));
|
||||
const __m256i rhs_raw_vec_qh_0123_1 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 64 + sb * 256));
|
||||
const __m256i rhs_raw_vec_qh_4567_1 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 96 + sb * 256));
|
||||
const __m256i rhs_raw_vec_qh_0123_2 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 128 + sb * 256));
|
||||
const __m256i rhs_raw_vec_qh_4567_2 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 160 + sb * 256));
|
||||
const __m256i rhs_raw_vec_qh_0123_3 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 192 + sb * 256));
|
||||
const __m256i rhs_raw_vec_qh_4567_3 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 224 + sb * 256));
|
||||
const __m256i rhs_raw_hbit_0123_0 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + sb * 256));
|
||||
const __m256i rhs_raw_hbit_4567_0 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 32 + sb * 256));
|
||||
const __m256i rhs_raw_hbit_0123_1 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 64 + sb * 256));
|
||||
const __m256i rhs_raw_hbit_4567_1 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 96 + sb * 256));
|
||||
const __m256i rhs_raw_hbit_0123_2 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 128 + sb * 256));
|
||||
const __m256i rhs_raw_hbit_4567_2 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 160 + sb * 256));
|
||||
const __m256i rhs_raw_hbit_0123_3 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 192 + sb * 256));
|
||||
const __m256i rhs_raw_hbit_4567_3 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qh + 224 + sb * 256));
|
||||
|
||||
// 2-bit -> 8-bit
|
||||
// Values of the 0th,2nd,4th,6th sub blocks of eight block_q6_K structures for the sb loop
|
||||
const __m256i rhs_vec_qh_0123_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_vec_qh_0123_0, m3b), 4); //B00(0-7) B01(0-7) B02(0-7) B03(0-7)
|
||||
const __m256i rhs_vec_qh_0123_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_0, 2), m3b), 4); //B20(0-7) B21(0-7) B22(0-7) B23(0-7)
|
||||
const __m256i rhs_vec_qh_0123_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_0, 4), m3b), 4); //B40(0-7) B41(0-7) B42(0-7) B43(0-7)
|
||||
const __m256i rhs_vec_qh_0123_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_0, 6), m3b), 4); //B60(0-7) B61(0-7) B62(0-7) B63(0-7)
|
||||
// hbit Values of the 0th,2nd,4th,6th sub blocks of eight block_q6_K structures for the sb loop
|
||||
const __m256i rhs_hbit_0123_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0123_0, m3b), 4); //B00(0-7) B01(0-7) B02(0-7) B03(0-7)
|
||||
const __m256i rhs_hbit_0123_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0123_0, 2), m3b), 4); //B20(0-7) B21(0-7) B22(0-7) B23(0-7)
|
||||
const __m256i rhs_hbit_0123_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0123_0, 4), m3b), 4); //B40(0-7) B41(0-7) B42(0-7) B43(0-7)
|
||||
const __m256i rhs_hbit_0123_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0123_0, 6), m3b), 4); //B60(0-7) B61(0-7) B62(0-7) B63(0-7)
|
||||
|
||||
const __m256i rhs_vec_qh_4567_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_vec_qh_4567_0, m3b), 4); //B04(0-7) B05(0-7) B06(0-7) B07(0-7)
|
||||
const __m256i rhs_vec_qh_4567_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_0, 2), m3b), 4); //B24(0-7) B25(0-7) B26(0-7) B27(0-7)
|
||||
const __m256i rhs_vec_qh_4567_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_0, 4), m3b), 4); //B44(0-7) B45(0-7) B46(0-7) B47(0-7)
|
||||
const __m256i rhs_vec_qh_4567_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_0, 6), m3b), 4); //B64(0-7) B65(0-7) B66(0-7) B67(0-7)
|
||||
const __m256i rhs_hbit_4567_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_4567_0, m3b), 4); //B04(0-7) B05(0-7) B06(0-7) B07(0-7)
|
||||
const __m256i rhs_hbit_4567_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_4567_0, 2), m3b), 4); //B24(0-7) B25(0-7) B26(0-7) B27(0-7)
|
||||
const __m256i rhs_hbit_4567_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_4567_0, 4), m3b), 4); //B44(0-7) B45(0-7) B46(0-7) B47(0-7)
|
||||
const __m256i rhs_hbit_4567_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_4567_0, 6), m3b), 4); //B64(0-7) B65(0-7) B66(0-7) B67(0-7)
|
||||
|
||||
const __m256i rhs_vec_qh_0123_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_vec_qh_0123_1, m3b), 4); //B00(8-15) B01(8-15) B02(8-15) B03(8-15)
|
||||
const __m256i rhs_vec_qh_0123_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_1, 2), m3b), 4); //B20(8-15) B21(8-15) B22(8-15) B23(8-15)
|
||||
const __m256i rhs_vec_qh_0123_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_1, 4), m3b), 4); //B40(8-15) B41(8-15) B42(8-15) B43(8-15)
|
||||
const __m256i rhs_vec_qh_0123_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_1, 6), m3b), 4); //B60(8-15) B61(8-15) B62(8-15) B63(8-15)
|
||||
const __m256i rhs_hbit_0123_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0123_1, m3b), 4); //B00(8-15) B01(8-15) B02(8-15) B03(8-15)
|
||||
const __m256i rhs_hbit_0123_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0123_1, 2), m3b), 4); //B20(8-15) B21(8-15) B22(8-15) B23(8-15)
|
||||
const __m256i rhs_hbit_0123_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0123_1, 4), m3b), 4); //B40(8-15) B41(8-15) B42(8-15) B43(8-15)
|
||||
const __m256i rhs_hbit_0123_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0123_1, 6), m3b), 4); //B60(8-15) B61(8-15) B62(8-15) B63(8-15)
|
||||
|
||||
const __m256i rhs_vec_qh_4567_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_vec_qh_4567_1, m3b), 4); //B04(8-15) B05(8-15) B06(8-15) B07(8-15)
|
||||
const __m256i rhs_vec_qh_4567_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_1, 2), m3b), 4); //B24(8-15) B25(8-15) B26(8-15) B27(8-15)
|
||||
const __m256i rhs_vec_qh_4567_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_1, 4), m3b), 4); //B44(8-15) B45(8-15) B46(8-15) B47(8-15)
|
||||
const __m256i rhs_vec_qh_4567_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_1, 6), m3b), 4); //B64(8-15) B65(8-15) B66(8-15) B67(8-15)
|
||||
const __m256i rhs_hbit_4567_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_4567_1, m3b), 4); //B04(8-15) B05(8-15) B06(8-15) B07(8-15)
|
||||
const __m256i rhs_hbit_4567_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_4567_1, 2), m3b), 4); //B24(8-15) B25(8-15) B26(8-15) B27(8-15)
|
||||
const __m256i rhs_hbit_4567_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_4567_1, 4), m3b), 4); //B44(8-15) B45(8-15) B46(8-15) B47(8-15)
|
||||
const __m256i rhs_hbit_4567_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_4567_1, 6), m3b), 4); //B64(8-15) B65(8-15) B66(8-15) B67(8-15)
|
||||
|
||||
// Values of the 1st,3rd,5th,7th sub blocks of eight block_q6_K structures for the sb loop
|
||||
const __m256i rhs_vec_qh_0123_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_vec_qh_0123_2, m3b), 4); //B10(0-7) B11(0-7) B12(0-7) B13(0-7)
|
||||
const __m256i rhs_vec_qh_0123_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_2, 2), m3b), 4); //B30(0-7) B31(0-7) B32(0-7) B33(0-7)
|
||||
const __m256i rhs_vec_qh_0123_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_2, 4), m3b), 4); //B50(0-7) B51(0-7) B52(0-7) B53(0-7)
|
||||
const __m256i rhs_vec_qh_0123_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_2, 6), m3b), 4); //B70(0-7) B71(0-7) B72(0-7) B73(0-7)
|
||||
// hbit Values of the 1st,3rd,5th,7th sub blocks of eight block_q6_K structures for the sb loop
|
||||
const __m256i rhs_hbit_0123_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0123_2, m3b), 4); //B10(0-7) B11(0-7) B12(0-7) B13(0-7)
|
||||
const __m256i rhs_hbit_0123_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0123_2, 2), m3b), 4); //B30(0-7) B31(0-7) B32(0-7) B33(0-7)
|
||||
const __m256i rhs_hbit_0123_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0123_2, 4), m3b), 4); //B50(0-7) B51(0-7) B52(0-7) B53(0-7)
|
||||
const __m256i rhs_hbit_0123_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0123_2, 6), m3b), 4); //B70(0-7) B71(0-7) B72(0-7) B73(0-7)
|
||||
|
||||
const __m256i rhs_vec_qh_4567_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_vec_qh_4567_2, m3b), 4); //B14(0-7) B15(0-7) B16(0-7) B17(0-7)
|
||||
const __m256i rhs_vec_qh_4567_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_2, 2), m3b), 4); //B34(0-7) B35(0-7) B36(0-7) B37(0-7)
|
||||
const __m256i rhs_vec_qh_4567_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_2, 4), m3b), 4); //B54(0-7) B55(0-7) B56(0-7) B57(0-7)
|
||||
const __m256i rhs_vec_qh_4567_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_2, 6), m3b), 4); //B74(0-7) B75(0-7) B76(0-7) B77(0-7)
|
||||
const __m256i rhs_hbit_4567_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_4567_2, m3b), 4); //B14(0-7) B15(0-7) B16(0-7) B17(0-7)
|
||||
const __m256i rhs_hbit_4567_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_4567_2, 2), m3b), 4); //B34(0-7) B35(0-7) B36(0-7) B37(0-7)
|
||||
const __m256i rhs_hbit_4567_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_4567_2, 4), m3b), 4); //B54(0-7) B55(0-7) B56(0-7) B57(0-7)
|
||||
const __m256i rhs_hbit_4567_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_4567_2, 6), m3b), 4); //B74(0-7) B75(0-7) B76(0-7) B77(0-7)
|
||||
|
||||
const __m256i rhs_vec_qh_0123_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_vec_qh_0123_3, m3b), 4); //B10(8-15) B11(8-15) B12(8-15) B13(8-15)
|
||||
const __m256i rhs_vec_qh_0123_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_3, 2), m3b), 4); //B30(8-15) B31(8-15) B32(8-15) B33(8-15)
|
||||
const __m256i rhs_vec_qh_0123_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_3, 4), m3b), 4); //B50(8-15) B51(8-15) B52(8-15) B53(8-15)
|
||||
const __m256i rhs_vec_qh_0123_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_0123_3, 6), m3b), 4); //B70(8-15) B71(8-15) B72(8-15) B73(8-15)
|
||||
const __m256i rhs_hbit_0123_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0123_3, m3b), 4); //B10(8-15) B11(8-15) B12(8-15) B13(8-15)
|
||||
const __m256i rhs_hbit_0123_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0123_3, 2), m3b), 4); //B30(8-15) B31(8-15) B32(8-15) B33(8-15)
|
||||
const __m256i rhs_hbit_0123_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0123_3, 4), m3b), 4); //B50(8-15) B51(8-15) B52(8-15) B53(8-15)
|
||||
const __m256i rhs_hbit_0123_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0123_3, 6), m3b), 4); //B70(8-15) B71(8-15) B72(8-15) B73(8-15)
|
||||
|
||||
const __m256i rhs_vec_qh_4567_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_vec_qh_4567_3, m3b), 4); //B14(8-15) B15(8-15) B16(8-15) B17(8-15)
|
||||
const __m256i rhs_vec_qh_4567_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_3, 2), m3b), 4); //B34(8-15) B35(8-15) B36(8-15) B37(8-15)
|
||||
const __m256i rhs_vec_qh_4567_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_3, 4), m3b), 4); //B54(8-15) B55(8-15) B56(8-15) B57(8-15)
|
||||
const __m256i rhs_vec_qh_4567_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_qh_4567_3, 6), m3b), 4); //B74(8-15) B75(8-15) B76(8-15) B77(8-15)
|
||||
const __m256i rhs_hbit_4567_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_4567_3, m3b), 4); //B14(8-15) B15(8-15) B16(8-15) B17(8-15)
|
||||
const __m256i rhs_hbit_4567_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_4567_3, 2), m3b), 4); //B34(8-15) B35(8-15) B36(8-15) B37(8-15)
|
||||
const __m256i rhs_hbit_4567_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_4567_3, 4), m3b), 4); //B54(8-15) B55(8-15) B56(8-15) B57(8-15)
|
||||
const __m256i rhs_hbit_4567_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_4567_3, 6), m3b), 4); //B74(8-15) B75(8-15) B76(8-15) B77(8-15)
|
||||
|
||||
// Load the lower bits(bits 0 - 3) of eight block_q6_K for eight sub blocks quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
|
||||
const __m256i rhs_raw_vec_ql_0123_0 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + sb * 512)); // 0 - 8, +64
|
||||
const __m256i rhs_raw_vec_ql_4567_0 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 32 + sb * 512)); // 0 - 8
|
||||
const __m256i rhs_raw_vec_ql_0123_1 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 64 + sb * 512)); // 8 - 15
|
||||
const __m256i rhs_raw_vec_ql_4567_1 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 96 + sb * 512)); // 8 - 15
|
||||
const __m256i rhs_raw_vec_ql_0123_2 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 128 + sb * 512)); // 16 - 23
|
||||
const __m256i rhs_raw_vec_ql_4567_2 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 160 + sb * 512)); // 16 - 23
|
||||
const __m256i rhs_raw_vec_ql_0123_3 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 192 + sb * 512)); // 24 - 31
|
||||
const __m256i rhs_raw_vec_ql_4567_3 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 224 + sb * 512)); // 24 - 31
|
||||
const __m256i rhs_raw_vec_ql_0123_4 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 256 + sb * 512));
|
||||
const __m256i rhs_raw_vec_ql_4567_4 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 288 + sb * 512));
|
||||
const __m256i rhs_raw_vec_ql_0123_5 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 320 + sb * 512));
|
||||
const __m256i rhs_raw_vec_ql_4567_5 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 352 + sb * 512));
|
||||
const __m256i rhs_raw_vec_ql_0123_6 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 384 + sb * 512));
|
||||
const __m256i rhs_raw_vec_ql_4567_6 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 416 + sb * 512));
|
||||
const __m256i rhs_raw_vec_ql_0123_7 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 448 + sb * 512));
|
||||
const __m256i rhs_raw_vec_ql_4567_7 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 480 + sb * 512));
|
||||
// Load the eight block_q6_k for eight sub blocks quantized values interleaved with each other in chunks of eight bytes - B0,B1 ....B6,B7
|
||||
// The lower and higher packed bits are loaded, unpacked and individual bytes representing 6 bits each are formed from the same
|
||||
// They are blended/permuted for further mul mat operations within the pipeline
|
||||
const __m256i rhs_raw_lbit_0123_0 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + sb * 512));
|
||||
const __m256i rhs_raw_lbit_4567_0 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 32 + sb * 512));
|
||||
const __m256i rhs_raw_lbit_0123_1 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 64 + sb * 512));
|
||||
const __m256i rhs_raw_lbit_4567_1 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 96 + sb * 512));
|
||||
const __m256i rhs_raw_lbit_0123_2 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 128 + sb * 512));
|
||||
const __m256i rhs_raw_lbit_4567_2 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 160 + sb * 512));
|
||||
const __m256i rhs_raw_lbit_0123_3 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 192 + sb * 512));
|
||||
const __m256i rhs_raw_lbit_4567_3 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 224 + sb * 512));
|
||||
const __m256i rhs_raw_lbit_0123_4 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 256 + sb * 512));
|
||||
const __m256i rhs_raw_lbit_4567_4 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 288 + sb * 512));
|
||||
const __m256i rhs_raw_lbit_0123_5 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 320 + sb * 512));
|
||||
const __m256i rhs_raw_lbit_4567_5 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 352 + sb * 512));
|
||||
const __m256i rhs_raw_lbit_0123_6 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 384 + sb * 512));
|
||||
const __m256i rhs_raw_lbit_4567_6 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 416 + sb * 512));
|
||||
const __m256i rhs_raw_lbit_0123_7 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 448 + sb * 512));
|
||||
const __m256i rhs_raw_lbit_4567_7 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].ql + 480 + sb * 512));
|
||||
|
||||
// 0 -7, 64 - 71
|
||||
const __m256i rhs_vec_0123_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_0123_0, m4b), rhs_vec_qh_0123_00);
|
||||
const __m256i rhs_vec_0123_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_0123_0, 4), m4b), rhs_vec_qh_0123_40);
|
||||
// Comments indicate the indices of elements from individual super block in non interleaved fashion
|
||||
// Index : 0 -7, 64 - 71
|
||||
const __m256i rhs_vec_0123_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0123_0, m4b), rhs_hbit_0123_00);
|
||||
const __m256i rhs_vec_0123_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0123_0, 4), m4b), rhs_hbit_0123_40);
|
||||
|
||||
const __m256i rhs_vec_4567_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_4567_0, m4b), rhs_vec_qh_4567_00);
|
||||
const __m256i rhs_vec_4567_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_4567_0, 4), m4b), rhs_vec_qh_4567_40);
|
||||
const __m256i rhs_vec_4567_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_4567_0, m4b), rhs_hbit_4567_00);
|
||||
const __m256i rhs_vec_4567_40 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_4567_0, 4), m4b), rhs_hbit_4567_40);
|
||||
|
||||
// 8 - 15, 72 - 79
|
||||
const __m256i rhs_vec_0123_01 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_0123_1, m4b), rhs_vec_qh_0123_01);
|
||||
const __m256i rhs_vec_0123_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_0123_1, 4), m4b), rhs_vec_qh_0123_41);
|
||||
// Index : 8 - 15, 72 - 79
|
||||
const __m256i rhs_vec_0123_01 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0123_1, m4b), rhs_hbit_0123_01);
|
||||
const __m256i rhs_vec_0123_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0123_1, 4), m4b), rhs_hbit_0123_41);
|
||||
|
||||
const __m256i rhs_vec_4567_01 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_4567_1, m4b), rhs_vec_qh_4567_01);
|
||||
const __m256i rhs_vec_4567_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_4567_1, 4), m4b), rhs_vec_qh_4567_41);
|
||||
const __m256i rhs_vec_4567_01 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_4567_1, m4b), rhs_hbit_4567_01);
|
||||
const __m256i rhs_vec_4567_41 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_4567_1, 4), m4b), rhs_hbit_4567_41);
|
||||
|
||||
// 16 - 23, 80 - 87
|
||||
const __m256i rhs_vec_0123_10 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_0123_2, m4b), rhs_vec_qh_0123_10);
|
||||
const __m256i rhs_vec_0123_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_0123_2, 4), m4b), rhs_vec_qh_0123_50);
|
||||
// Index : 16 - 23, 80 - 87
|
||||
const __m256i rhs_vec_0123_10 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0123_2, m4b), rhs_hbit_0123_10);
|
||||
const __m256i rhs_vec_0123_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0123_2, 4), m4b), rhs_hbit_0123_50);
|
||||
|
||||
const __m256i rhs_vec_4567_10 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_4567_2, m4b), rhs_vec_qh_4567_10);
|
||||
const __m256i rhs_vec_4567_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_4567_2, 4), m4b), rhs_vec_qh_4567_50);
|
||||
const __m256i rhs_vec_4567_10 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_4567_2, m4b), rhs_hbit_4567_10);
|
||||
const __m256i rhs_vec_4567_50 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_4567_2, 4), m4b), rhs_hbit_4567_50);
|
||||
|
||||
// 24 - 31, 88 - 95
|
||||
const __m256i rhs_vec_0123_11 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_0123_3, m4b), rhs_vec_qh_0123_11);
|
||||
const __m256i rhs_vec_0123_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_0123_3, 4), m4b), rhs_vec_qh_0123_51);
|
||||
// Index : 24 - 31, 88 - 95
|
||||
const __m256i rhs_vec_0123_11 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0123_3, m4b), rhs_hbit_0123_11);
|
||||
const __m256i rhs_vec_0123_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0123_3, 4), m4b), rhs_hbit_0123_51);
|
||||
|
||||
const __m256i rhs_vec_4567_11 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_4567_3, m4b), rhs_vec_qh_4567_11);
|
||||
const __m256i rhs_vec_4567_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_4567_3, 4), m4b), rhs_vec_qh_4567_51);
|
||||
const __m256i rhs_vec_4567_11 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_4567_3, m4b), rhs_hbit_4567_11);
|
||||
const __m256i rhs_vec_4567_51 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_4567_3, 4), m4b), rhs_hbit_4567_51);
|
||||
|
||||
// 32 - 39, 96 - 103
|
||||
const __m256i rhs_vec_0123_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_0123_4, m4b), rhs_vec_qh_0123_20);
|
||||
const __m256i rhs_vec_0123_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_0123_4, 4), m4b), rhs_vec_qh_0123_60);
|
||||
// Index : 32 - 39, 96 - 103
|
||||
const __m256i rhs_vec_0123_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0123_4, m4b), rhs_hbit_0123_20);
|
||||
const __m256i rhs_vec_0123_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0123_4, 4), m4b), rhs_hbit_0123_60);
|
||||
|
||||
const __m256i rhs_vec_4567_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_4567_4, m4b), rhs_vec_qh_4567_20);
|
||||
const __m256i rhs_vec_4567_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_4567_4, 4), m4b), rhs_vec_qh_4567_60);
|
||||
const __m256i rhs_vec_4567_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_4567_4, m4b), rhs_hbit_4567_20);
|
||||
const __m256i rhs_vec_4567_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_4567_4, 4), m4b), rhs_hbit_4567_60);
|
||||
|
||||
// 40 - 47, 104 - 111
|
||||
const __m256i rhs_vec_0123_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_0123_5, m4b), rhs_vec_qh_0123_21);
|
||||
const __m256i rhs_vec_0123_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_0123_5, 4), m4b), rhs_vec_qh_0123_61);
|
||||
// Index : 40 - 47, 104 - 111
|
||||
const __m256i rhs_vec_0123_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0123_5, m4b), rhs_hbit_0123_21);
|
||||
const __m256i rhs_vec_0123_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0123_5, 4), m4b), rhs_hbit_0123_61);
|
||||
|
||||
const __m256i rhs_vec_4567_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_4567_5, m4b), rhs_vec_qh_4567_21);
|
||||
const __m256i rhs_vec_4567_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_4567_5, 4), m4b), rhs_vec_qh_4567_61);
|
||||
const __m256i rhs_vec_4567_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_4567_5, m4b), rhs_hbit_4567_21);
|
||||
const __m256i rhs_vec_4567_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_4567_5, 4), m4b), rhs_hbit_4567_61);
|
||||
|
||||
// 48 - 55, 112 - 119
|
||||
const __m256i rhs_vec_0123_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_0123_6, m4b), rhs_vec_qh_0123_30);
|
||||
const __m256i rhs_vec_0123_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_0123_6, 4), m4b), rhs_vec_qh_0123_70);
|
||||
// Index : 48 - 55, 112 - 119
|
||||
const __m256i rhs_vec_0123_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0123_6, m4b), rhs_hbit_0123_30);
|
||||
const __m256i rhs_vec_0123_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0123_6, 4), m4b), rhs_hbit_0123_70);
|
||||
|
||||
const __m256i rhs_vec_4567_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_4567_6, m4b), rhs_vec_qh_4567_30);
|
||||
const __m256i rhs_vec_4567_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_4567_6, 4), m4b), rhs_vec_qh_4567_70);
|
||||
const __m256i rhs_vec_4567_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_4567_6, m4b), rhs_hbit_4567_30);
|
||||
const __m256i rhs_vec_4567_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_4567_6, 4), m4b), rhs_hbit_4567_70);
|
||||
|
||||
// 56 - 63, 120 - 127
|
||||
const __m256i rhs_vec_0123_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_0123_7, m4b), rhs_vec_qh_0123_31);
|
||||
const __m256i rhs_vec_0123_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_0123_7, 4), m4b), rhs_vec_qh_0123_71);
|
||||
// Index : 56 - 63, 120 - 127
|
||||
const __m256i rhs_vec_0123_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0123_7, m4b), rhs_hbit_0123_31);
|
||||
const __m256i rhs_vec_0123_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0123_7, 4), m4b), rhs_hbit_0123_71);
|
||||
|
||||
const __m256i rhs_vec_4567_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_vec_ql_4567_7, m4b), rhs_vec_qh_4567_31);
|
||||
const __m256i rhs_vec_4567_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_ql_4567_7, 4), m4b), rhs_vec_qh_4567_71);
|
||||
const __m256i rhs_vec_4567_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_4567_7, m4b), rhs_hbit_4567_31);
|
||||
const __m256i rhs_vec_4567_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_4567_7, 4), m4b), rhs_hbit_4567_71);
|
||||
|
||||
//Scales of corresponding sub blocks from different Q6_K structures are stored together
|
||||
//s00 s01 s10 s11 s20 s21 s30 s31 s40 s41 s50 s51 s60 s61 s70 s71 //s02 s03 //s04 s05 //s06 s07
|
||||
|
||||
//s00 s01 s10 s11 s20 s21 s30 s31 s40 s41 s50 s51 s60 s61 s70 s71
|
||||
const __m128i scales_01 = _mm_loadu_si128((const __m128i *)(b_ptr[b].scales + sb * 64));
|
||||
const __m128i scales_23 = _mm_loadu_si128((const __m128i *)(b_ptr[b].scales + 16 + sb * 64));
|
||||
const __m128i scales_45 = _mm_loadu_si128((const __m128i *)(b_ptr[b].scales + 32 + sb * 64));
|
||||
|
|
@ -2201,6 +2205,7 @@ void ggml_gemv_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
lhs_vec_6 = _mm256_permute2f128_si256(lhs_vec_6, lhs_vec_6, 0);
|
||||
lhs_vec_7 = _mm256_permute2f128_si256(lhs_vec_7, lhs_vec_7, 0);
|
||||
|
||||
// Multiply Q8 quants with bytes valued 32 - Subtracted later as an adjustment for 6 bit quantization
|
||||
__m256i lhs_vec_s_0 = _mm256_maddubs_epi16(m32s, lhs_vec_0);
|
||||
__m256i lhs_vec_s_1 = _mm256_maddubs_epi16(m32s, lhs_vec_1);
|
||||
__m256i lhs_vec_s_2 = _mm256_maddubs_epi16(m32s, lhs_vec_2);
|
||||
|
|
@ -6922,7 +6927,6 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
const __m512i rhs_hbit_2367ABEF_60 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 6), m2_expanded), 4); //Index : 96 - 103
|
||||
|
||||
// Comments indicate the indices of elements from individual super block in non interleaved fashion
|
||||
// Index : 8 - 15, 72 - 79
|
||||
const __m512i rhs_hbit_014589CD_01 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_1, m2_expanded), 4); //Index : 8 - 15
|
||||
const __m512i rhs_hbit_014589CD_21 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 2), m2_expanded), 4); //Index : 40 - 47
|
||||
const __m512i rhs_hbit_014589CD_41 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 4), m2_expanded), 4); //Index : 72 - 79
|
||||
|
|
@ -8719,7 +8723,6 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
const __m256i rhs_raw_hbit_0123_3 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qh + 192 + sb * 256));
|
||||
const __m256i rhs_raw_hbit_4567_3 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qh + 224 + sb * 256));
|
||||
|
||||
// Indices 0 through 7 (first block):
|
||||
const __m256i rhs_raw_lbit_0145_0 = _mm256_blend_epi32(rhs_raw_lbit_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_0, requiredOrder), 240);
|
||||
const __m256i rhs_raw_lbit_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_0, requiredOrder), rhs_raw_lbit_4567_0, 240);
|
||||
const __m256i rhs_raw_lbit_0145_1 = _mm256_blend_epi32(rhs_raw_lbit_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_1, requiredOrder), 240);
|
||||
|
|
@ -8729,7 +8732,6 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
const __m256i rhs_raw_lbit_0145_3 = _mm256_blend_epi32(rhs_raw_lbit_0123_3, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_3, requiredOrder), 240);
|
||||
const __m256i rhs_raw_lbit_2367_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_3, requiredOrder), rhs_raw_lbit_4567_3, 240);
|
||||
|
||||
// Indices 4 through 7 (second block):
|
||||
const __m256i rhs_raw_lbit_0145_4 = _mm256_blend_epi32(rhs_raw_lbit_0123_4, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_4, requiredOrder), 240);
|
||||
const __m256i rhs_raw_lbit_2367_4 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_4, requiredOrder), rhs_raw_lbit_4567_4, 240);
|
||||
const __m256i rhs_raw_lbit_0145_5 = _mm256_blend_epi32(rhs_raw_lbit_0123_5, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_5, requiredOrder), 240);
|
||||
|
|
@ -8793,8 +8795,6 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
|
||||
// 4 bit values are unpacked/denibbled and bitwise or-ed with the hbit values to form the 6 bit quantized values
|
||||
|
||||
// Comments indicate the indices of elements from individual super block in non interleaved fashion
|
||||
// Index : 0 -7, 64 - 71
|
||||
// Comments indicate the indices of elements from individual super block in non interleaved fashion
|
||||
// Index : 0 -7, 64 - 71
|
||||
const __m256i rhs_mat_0145_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_0, m4), rhs_hbit_0145_00);
|
||||
|
|
@ -9533,7 +9533,6 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
const __m256i rhs_raw_hbit_0123_3 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qh + 192 + sb * 256));
|
||||
const __m256i rhs_raw_hbit_4567_3 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qh + 224 + sb * 256));
|
||||
|
||||
// Indices 0 through 7 (first block):
|
||||
const __m256i rhs_raw_lbit_0145_0 = _mm256_blend_epi32(rhs_raw_lbit_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_0, requiredOrder), 240);
|
||||
const __m256i rhs_raw_lbit_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_0, requiredOrder), rhs_raw_lbit_4567_0, 240);
|
||||
const __m256i rhs_raw_lbit_0145_1 = _mm256_blend_epi32(rhs_raw_lbit_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_1, requiredOrder), 240);
|
||||
|
|
@ -9543,7 +9542,6 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
const __m256i rhs_raw_lbit_0145_3 = _mm256_blend_epi32(rhs_raw_lbit_0123_3, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_3, requiredOrder), 240);
|
||||
const __m256i rhs_raw_lbit_2367_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_3, requiredOrder), rhs_raw_lbit_4567_3, 240);
|
||||
|
||||
// Indices 4 through 7 (second block):
|
||||
const __m256i rhs_raw_lbit_0145_4 = _mm256_blend_epi32(rhs_raw_lbit_0123_4, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_4, requiredOrder), 240);
|
||||
const __m256i rhs_raw_lbit_2367_4 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_4, requiredOrder), rhs_raw_lbit_4567_4, 240);
|
||||
const __m256i rhs_raw_lbit_0145_5 = _mm256_blend_epi32(rhs_raw_lbit_0123_5, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_5, requiredOrder), 240);
|
||||
|
|
@ -9605,8 +9603,6 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
const __m256i rhs_hbit_2367_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 4), m2), 4); //Index : 88 - 95
|
||||
const __m256i rhs_hbit_2367_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 6), m2), 4); //Index : 120 - 127
|
||||
|
||||
// Comments indicate the indices of elements from individual super block in non interleaved fashion
|
||||
// Index : 0 -7, 64 - 71
|
||||
// Comments indicate the indices of elements from individual super block in non interleaved fashion
|
||||
// Index : 0 -7, 64 - 71
|
||||
const __m256i rhs_mat_0145_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0145_0, m4), rhs_hbit_0145_00);
|
||||
|
|
|
|||
Loading…
Reference in New Issue