diff --git a/ggml/src/ggml-cpu/arch/x86/repack.cpp b/ggml/src/ggml-cpu/arch/x86/repack.cpp
index 68fecfdbb5..0f3b6e40b3 100644
--- a/ggml/src/ggml-cpu/arch/x86/repack.cpp
+++ b/ggml/src/ggml-cpu/arch/x86/repack.cpp
@@ -6708,33 +6708,39 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     int64_t b_nb = n / QK_K;
     int64_t y = 0;
-    // Mask to mask out nibbles from packed bytes
     // Permute mask used for easier vector processing at later stages
     __m256i requiredOrder = _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4);
     int64_t xstart = 0;
     int anr = nr - nr % 16;; // Used to align nr with boundary of 16
-    // Mask to mask out nibbles from packed bytes
+    // Mask to extract nibbles from packed bytes
     const __m256i m4 = _mm256_set1_epi8(0xF);
+    // Mask to extract 2 bit values from packed bytes
     const __m256i m2 = _mm256_set1_epi8(3);
+    // Vector with each byte value 32 - used as a subtraction offset for 6 bit quantized values
     const __m256i m32s = _mm256_set1_epi8(32);
     //Mask to get appropriate scales
     __m128i scalesmask1_sse = _mm_set_epi8(14,14,12,12,10,10,8,8,6,6,4,4,2,2,0,0);
     __m128i scalesmask2_sse = _mm_set_epi8(15,15,13,13,11,11,9,9,7,7,5,5,3,3,1,1);
+#ifdef __AVX512F__
+    int anc = nc - nc % 16; // Used to align nc with boundary of 16
+    // Expanded mask to get appropriate scales
     __m256i scalesmask1 = _mm256_castsi128_si256(scalesmask1_sse);
     scalesmask1 = _mm256_permute2f128_si256(scalesmask1, scalesmask1, 0);
     __m256i scalesmask2 = _mm256_castsi128_si256(scalesmask2_sse);
     scalesmask2 = _mm256_permute2f128_si256(scalesmask2, scalesmask2, 0);
-#ifdef __AVX512F__
-    int anc = nc - nc % 16; // Used to align nc with boundary of 16
+    // Mask to extract nibbles from packed bytes
     const __m512i m4_expanded = _mm512_set1_epi8(0xF);
+    // Mask to extract 2 bit values from packed bytes
     const __m512i m2_expanded = _mm512_set1_epi8(3);
+    // Vector with each byte set to 32 - used as a subtraction offset for 6 bit quantized values
     const __m512i m32s_expanded = _mm512_set1_epi8(32);
+    // Take group of four block_q8_Kx4 structures at each pass of the loop and perform dot product operation
     for (; y < anr / 4; y += 4){
         const block_q8_Kx4 * a_ptrs[4];
         a_ptrs[0] = a + (y * nb);
@@ -6743,7 +6749,7 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
         for (int i = 0; i < 3; ++i) {
             a_ptrs[i + 1] = a_ptrs[i] + nb;
         }
-        // Take group of eight block_q6_kx8 structures at each pass of the loop and perform dot product operation
+        // Take group of two block_q6_Kx8 structures at each pass of the loop and perform dot product operation
         for (int64_t x = 0; x < anc / 8; x += 2) {
             const block_q6_Kx8 * b_ptr_0 = b_ptr_start + ((x) * b_nb);
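Editor's note (an illustrative aside, not part of the patch): the masks set up above implement the usual Q6_K value reconstruction - the low 4 bits of each weight come from a nibble of ql (m4), the high 2 bits from a 2 bit field of qh (m2, shifted left by 4), and the packed value is re-centered by subtracting 32 (m32s). A minimal scalar sketch of the same operation, with illustrative names:

    #include <stdint.h>

    // Rebuild one 6 bit Q6_K weight from its packed low-bit and high-bit bytes.
    static inline int8_t q6_K_value(uint8_t ql_byte, uint8_t qh_byte, int hbit_shift, int use_high_nibble) {
        const int lo = use_high_nibble ? (ql_byte >> 4) : (ql_byte & 0xF); // low 4 bits (m4)
        const int hi = (qh_byte >> hbit_shift) & 3;                        // high 2 bits (m2)
        return (int8_t)(((hi << 4) | lo) - 32);                           // 6 bit value in [-32, 31] (m32s)
    }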
@@ -6757,45 +6763,49 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
             // For super block
             for (int64_t b = 0; b < nb; b++) {
-                // Delta values - Load the sixteen scale values from two block_q2_kx8 structures
+                // Delta values - Load the sixteen scale values from two block_q6_Kx8 structures
                 const __m512 col_scale_f32 = GGML_F32Cx8x2_LOAD(b_ptr_0[b].d, b_ptr_1[b].d);
                 for (int sb = 0; sb < QK_K / 128; sb++) {
-                    const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + sb * 512));
-                    const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 32 + sb * 512));
-                    const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 64 + sb * 512));
-                    const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 96 + sb * 512));
-                    const __m256i rhs_raw_mat_0123_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 128 + sb * 512));
-                    const __m256i rhs_raw_mat_4567_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 160 + sb * 512));
-                    const __m256i rhs_raw_mat_0123_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 192 + sb * 512));
-                    const __m256i rhs_raw_mat_4567_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 224 + sb * 512));
-                    const __m256i rhs_raw_mat_0123_4 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 256 + sb * 512));
-                    const __m256i rhs_raw_mat_4567_4 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 288 + sb * 512));
-                    const __m256i rhs_raw_mat_0123_5 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 320 + sb * 512));
-                    const __m256i rhs_raw_mat_4567_5 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 352 + sb * 512));
-                    const __m256i rhs_raw_mat_0123_6 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 384 + sb * 512));
-                    const __m256i rhs_raw_mat_4567_6 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 416 + sb * 512));
-                    const __m256i rhs_raw_mat_0123_7 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 448 + sb * 512));
-                    const __m256i rhs_raw_mat_4567_7 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 480 + sb * 512));
+                    // Load the quantized values of eight sub blocks from two block_q6_Kx8 structures (sixteen block_q6_K), interleaved with each other in chunks of eight bytes - B0,B1 ... B6,B7
+                    // The lower and higher packed bits are loaded and unpacked, and individual bytes each representing a 6 bit value are formed from them
+                    // They are blended/permuted for further mul mat operations within the pipeline
+                    const __m256i rhs_raw_lbit_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + sb * 512));
+                    const __m256i rhs_raw_lbit_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 32 + sb * 512));
+                    const __m256i rhs_raw_lbit_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 64 + sb * 512));
+                    const __m256i rhs_raw_lbit_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 96 + sb * 512));
+                    const __m256i rhs_raw_lbit_0123_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 128 + sb * 512));
+                    const __m256i rhs_raw_lbit_4567_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 160 + sb * 512));
+                    const __m256i rhs_raw_lbit_0123_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 192 + sb * 512));
+                    const __m256i rhs_raw_lbit_4567_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 224 + sb * 512));
-                    const __m256i rhs_raw_mat_89AB_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + sb * 512));
-                    const __m256i rhs_raw_mat_CDEF_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 32 + sb * 512));
-                    const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 64 + sb * 512));
-                    const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 96 + sb * 512));
-                    const __m256i rhs_raw_mat_89AB_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 128 + sb * 512));
-                    const __m256i rhs_raw_mat_CDEF_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 160 + sb * 512));
-                    const __m256i rhs_raw_mat_89AB_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 192 + sb * 512));
-                    const __m256i rhs_raw_mat_CDEF_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 224 + sb * 512));
+                    const __m256i rhs_raw_lbit_0123_4 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 256 + sb * 512));
+                    const __m256i rhs_raw_lbit_4567_4 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql +
288 + sb * 512)); + const __m256i rhs_raw_lbit_0123_5 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 320 + sb * 512)); + const __m256i rhs_raw_lbit_4567_5 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 352 + sb * 512)); + const __m256i rhs_raw_lbit_0123_6 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 384 + sb * 512)); + const __m256i rhs_raw_lbit_4567_6 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 416 + sb * 512)); + const __m256i rhs_raw_lbit_0123_7 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 448 + sb * 512)); + const __m256i rhs_raw_lbit_4567_7 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].ql + 480 + sb * 512)); - const __m256i rhs_raw_mat_89AB_4 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 256 + sb * 512)); - const __m256i rhs_raw_mat_CDEF_4 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 288 + sb * 512)); - const __m256i rhs_raw_mat_89AB_5 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 320 + sb * 512)); - const __m256i rhs_raw_mat_CDEF_5 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 352 + sb * 512)); - const __m256i rhs_raw_mat_89AB_6 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 384 + sb * 512)); - const __m256i rhs_raw_mat_CDEF_6 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 416 + sb * 512)); - const __m256i rhs_raw_mat_89AB_7 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 448 + sb * 512)); - const __m256i rhs_raw_mat_CDEF_7 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 480 + sb * 512)); + const __m256i rhs_raw_lbit_89AB_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + sb * 512)); + const __m256i rhs_raw_lbit_CDEF_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 32 + sb * 512)); + const __m256i rhs_raw_lbit_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 64 + sb * 512)); + const __m256i rhs_raw_lbit_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 96 + sb * 512)); + const __m256i rhs_raw_lbit_89AB_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 128 + sb * 512)); + const __m256i rhs_raw_lbit_CDEF_2 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 160 + sb * 512)); + const __m256i rhs_raw_lbit_89AB_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 192 + sb * 512)); + const __m256i rhs_raw_lbit_CDEF_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 224 + sb * 512)); + + const __m256i rhs_raw_lbit_89AB_4 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 256 + sb * 512)); + const __m256i rhs_raw_lbit_CDEF_4 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 288 + sb * 512)); + const __m256i rhs_raw_lbit_89AB_5 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 320 + sb * 512)); + const __m256i rhs_raw_lbit_CDEF_5 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 352 + sb * 512)); + const __m256i rhs_raw_lbit_89AB_6 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 384 + sb * 512)); + const __m256i rhs_raw_lbit_CDEF_6 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 416 + sb * 512)); + const __m256i rhs_raw_lbit_89AB_7 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 448 + sb * 512)); + const __m256i rhs_raw_lbit_CDEF_7 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].ql + 480 + sb * 512)); const __m256i rhs_raw_hbit_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qh + sb * 256)); const __m256i rhs_raw_hbit_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qh + 32 + sb * 256)); @@ -6815,45 +6825,41 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * 
GGML_RESTRICT s, size_t bs, const vo const __m256i rhs_raw_hbit_89AB_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qh + 192 + sb * 256)); const __m256i rhs_raw_hbit_CDEF_3 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qh + 224 + sb * 256)); - // Indices 0 through 7 (first block): - const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240); - const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240); - const __m256i rhs_raw_mat_0145_2 = _mm256_blend_epi32(rhs_raw_mat_0123_2, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_2, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_2 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_2, requiredOrder), rhs_raw_mat_4567_2, 240); - const __m256i rhs_raw_mat_0145_3 = _mm256_blend_epi32(rhs_raw_mat_0123_3, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_3, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_3, requiredOrder), rhs_raw_mat_4567_3, 240); + const __m256i rhs_raw_lbit_0145_0 = _mm256_blend_epi32(rhs_raw_lbit_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_0, requiredOrder), 240); + const __m256i rhs_raw_lbit_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_0, requiredOrder), rhs_raw_lbit_4567_0, 240); + const __m256i rhs_raw_lbit_0145_1 = _mm256_blend_epi32(rhs_raw_lbit_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_1, requiredOrder), 240); + const __m256i rhs_raw_lbit_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_1, requiredOrder), rhs_raw_lbit_4567_1, 240); + const __m256i rhs_raw_lbit_0145_2 = _mm256_blend_epi32(rhs_raw_lbit_0123_2, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_2, requiredOrder), 240); + const __m256i rhs_raw_lbit_2367_2 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_2, requiredOrder), rhs_raw_lbit_4567_2, 240); + const __m256i rhs_raw_lbit_0145_3 = _mm256_blend_epi32(rhs_raw_lbit_0123_3, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_3, requiredOrder), 240); + const __m256i rhs_raw_lbit_2367_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_3, requiredOrder), rhs_raw_lbit_4567_3, 240); - // Indices 4 through 7 (second block): - const __m256i rhs_raw_mat_0145_4 = _mm256_blend_epi32(rhs_raw_mat_0123_4, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_4, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_4 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_4, requiredOrder), rhs_raw_mat_4567_4, 240); - const __m256i rhs_raw_mat_0145_5 = _mm256_blend_epi32(rhs_raw_mat_0123_5, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_5, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_5 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_5, requiredOrder), rhs_raw_mat_4567_5, 240); - const __m256i rhs_raw_mat_0145_6 = _mm256_blend_epi32(rhs_raw_mat_0123_6, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_6, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_6 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_6, requiredOrder), 
rhs_raw_mat_4567_6, 240); - const __m256i rhs_raw_mat_0145_7 = _mm256_blend_epi32(rhs_raw_mat_0123_7, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_7, requiredOrder), 240); - const __m256i rhs_raw_mat_2367_7 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_7, requiredOrder), rhs_raw_mat_4567_7, 240); + const __m256i rhs_raw_lbit_0145_4 = _mm256_blend_epi32(rhs_raw_lbit_0123_4, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_4, requiredOrder), 240); + const __m256i rhs_raw_lbit_2367_4 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_4, requiredOrder), rhs_raw_lbit_4567_4, 240); + const __m256i rhs_raw_lbit_0145_5 = _mm256_blend_epi32(rhs_raw_lbit_0123_5, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_5, requiredOrder), 240); + const __m256i rhs_raw_lbit_2367_5 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_5, requiredOrder), rhs_raw_lbit_4567_5, 240); + const __m256i rhs_raw_lbit_0145_6 = _mm256_blend_epi32(rhs_raw_lbit_0123_6, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_6, requiredOrder), 240); + const __m256i rhs_raw_lbit_2367_6 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_6, requiredOrder), rhs_raw_lbit_4567_6, 240); + const __m256i rhs_raw_lbit_0145_7 = _mm256_blend_epi32(rhs_raw_lbit_0123_7, _mm256_permutevar8x32_epi32(rhs_raw_lbit_4567_7, requiredOrder), 240); + const __m256i rhs_raw_lbit_2367_7 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_0123_7, requiredOrder), rhs_raw_lbit_4567_7, 240); - // Indices 8 through F (first block): - const __m256i rhs_raw_mat_89CD_0 = _mm256_blend_epi32(rhs_raw_mat_89AB_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_0, requiredOrder), 240); - const __m256i rhs_raw_mat_ABEF_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_0, requiredOrder), rhs_raw_mat_CDEF_0, 240); - const __m256i rhs_raw_mat_89CD_1 = _mm256_blend_epi32(rhs_raw_mat_89AB_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_1, requiredOrder), 240); - const __m256i rhs_raw_mat_ABEF_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_1, requiredOrder), rhs_raw_mat_CDEF_1, 240); - const __m256i rhs_raw_mat_89CD_2 = _mm256_blend_epi32(rhs_raw_mat_89AB_2, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_2, requiredOrder), 240); - const __m256i rhs_raw_mat_ABEF_2 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_2, requiredOrder), rhs_raw_mat_CDEF_2, 240); - const __m256i rhs_raw_mat_89CD_3 = _mm256_blend_epi32(rhs_raw_mat_89AB_3, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_3, requiredOrder), 240); - const __m256i rhs_raw_mat_ABEF_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_3, requiredOrder), rhs_raw_mat_CDEF_3, 240); + const __m256i rhs_raw_lbit_89CD_0 = _mm256_blend_epi32(rhs_raw_lbit_89AB_0, _mm256_permutevar8x32_epi32(rhs_raw_lbit_CDEF_0, requiredOrder), 240); + const __m256i rhs_raw_lbit_ABEF_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_89AB_0, requiredOrder), rhs_raw_lbit_CDEF_0, 240); + const __m256i rhs_raw_lbit_89CD_1 = _mm256_blend_epi32(rhs_raw_lbit_89AB_1, _mm256_permutevar8x32_epi32(rhs_raw_lbit_CDEF_1, requiredOrder), 240); + const __m256i rhs_raw_lbit_ABEF_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_89AB_1, requiredOrder), rhs_raw_lbit_CDEF_1, 240); + const __m256i rhs_raw_lbit_89CD_2 = _mm256_blend_epi32(rhs_raw_lbit_89AB_2, _mm256_permutevar8x32_epi32(rhs_raw_lbit_CDEF_2, requiredOrder), 240); + const __m256i rhs_raw_lbit_ABEF_2 = 
_mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_89AB_2, requiredOrder), rhs_raw_lbit_CDEF_2, 240); + const __m256i rhs_raw_lbit_89CD_3 = _mm256_blend_epi32(rhs_raw_lbit_89AB_3, _mm256_permutevar8x32_epi32(rhs_raw_lbit_CDEF_3, requiredOrder), 240); + const __m256i rhs_raw_lbit_ABEF_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_89AB_3, requiredOrder), rhs_raw_lbit_CDEF_3, 240); - // Indices 8 through F (second block): - const __m256i rhs_raw_mat_89CD_4 = _mm256_blend_epi32(rhs_raw_mat_89AB_4, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_4, requiredOrder), 240); - const __m256i rhs_raw_mat_ABEF_4 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_4, requiredOrder), rhs_raw_mat_CDEF_4, 240); - const __m256i rhs_raw_mat_89CD_5 = _mm256_blend_epi32(rhs_raw_mat_89AB_5, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_5, requiredOrder), 240); - const __m256i rhs_raw_mat_ABEF_5 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_5, requiredOrder), rhs_raw_mat_CDEF_5, 240); - const __m256i rhs_raw_mat_89CD_6 = _mm256_blend_epi32(rhs_raw_mat_89AB_6, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_6, requiredOrder), 240); - const __m256i rhs_raw_mat_ABEF_6 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_6, requiredOrder), rhs_raw_mat_CDEF_6, 240); - const __m256i rhs_raw_mat_89CD_7 = _mm256_blend_epi32(rhs_raw_mat_89AB_7, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_7, requiredOrder), 240); - const __m256i rhs_raw_mat_ABEF_7 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_7, requiredOrder), rhs_raw_mat_CDEF_7, 240); + const __m256i rhs_raw_lbit_89CD_4 = _mm256_blend_epi32(rhs_raw_lbit_89AB_4, _mm256_permutevar8x32_epi32(rhs_raw_lbit_CDEF_4, requiredOrder), 240); + const __m256i rhs_raw_lbit_ABEF_4 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_89AB_4, requiredOrder), rhs_raw_lbit_CDEF_4, 240); + const __m256i rhs_raw_lbit_89CD_5 = _mm256_blend_epi32(rhs_raw_lbit_89AB_5, _mm256_permutevar8x32_epi32(rhs_raw_lbit_CDEF_5, requiredOrder), 240); + const __m256i rhs_raw_lbit_ABEF_5 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_89AB_5, requiredOrder), rhs_raw_lbit_CDEF_5, 240); + const __m256i rhs_raw_lbit_89CD_6 = _mm256_blend_epi32(rhs_raw_lbit_89AB_6, _mm256_permutevar8x32_epi32(rhs_raw_lbit_CDEF_6, requiredOrder), 240); + const __m256i rhs_raw_lbit_ABEF_6 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_89AB_6, requiredOrder), rhs_raw_lbit_CDEF_6, 240); + const __m256i rhs_raw_lbit_89CD_7 = _mm256_blend_epi32(rhs_raw_lbit_89AB_7, _mm256_permutevar8x32_epi32(rhs_raw_lbit_CDEF_7, requiredOrder), 240); + const __m256i rhs_raw_lbit_ABEF_7 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_lbit_89AB_7, requiredOrder), rhs_raw_lbit_CDEF_7, 240); const __m256i rhs_raw_hbit_0145_0 = _mm256_blend_epi32(rhs_raw_hbit_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_hbit_4567_0, requiredOrder), 240); const __m256i rhs_raw_hbit_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_hbit_0123_0, requiredOrder), rhs_raw_hbit_4567_0, 240); @@ -6873,25 +6879,25 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const __m256i rhs_raw_hbit_89CD_3 = _mm256_blend_epi32(rhs_raw_hbit_89AB_3, _mm256_permutevar8x32_epi32(rhs_raw_hbit_CDEF_3, requiredOrder), 240); const __m256i rhs_raw_hbit_ABEF_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_hbit_89AB_3, requiredOrder), rhs_raw_hbit_CDEF_3, 240); - const __m512i rhs_raw_mat_014589CD_0 = 
_mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_0), rhs_raw_mat_89CD_0, 1); - const __m512i rhs_raw_mat_2367ABEF_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_0), rhs_raw_mat_ABEF_0, 1); - const __m512i rhs_raw_mat_014589CD_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_1), rhs_raw_mat_89CD_1, 1); - const __m512i rhs_raw_mat_2367ABEF_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_1), rhs_raw_mat_ABEF_1, 1); + const __m512i rhs_raw_lbit_014589CD_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_0145_0), rhs_raw_lbit_89CD_0, 1); + const __m512i rhs_raw_lbit_2367ABEF_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_2367_0), rhs_raw_lbit_ABEF_0, 1); + const __m512i rhs_raw_lbit_014589CD_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_0145_1), rhs_raw_lbit_89CD_1, 1); + const __m512i rhs_raw_lbit_2367ABEF_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_2367_1), rhs_raw_lbit_ABEF_1, 1); - const __m512i rhs_raw_mat_014589CD_2 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_2), rhs_raw_mat_89CD_2, 1); - const __m512i rhs_raw_mat_2367ABEF_2 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_2), rhs_raw_mat_ABEF_2, 1); - const __m512i rhs_raw_mat_014589CD_3 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_3), rhs_raw_mat_89CD_3, 1); - const __m512i rhs_raw_mat_2367ABEF_3 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_3), rhs_raw_mat_ABEF_3, 1); + const __m512i rhs_raw_lbit_014589CD_2 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_0145_2), rhs_raw_lbit_89CD_2, 1); + const __m512i rhs_raw_lbit_2367ABEF_2 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_2367_2), rhs_raw_lbit_ABEF_2, 1); + const __m512i rhs_raw_lbit_014589CD_3 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_0145_3), rhs_raw_lbit_89CD_3, 1); + const __m512i rhs_raw_lbit_2367ABEF_3 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_2367_3), rhs_raw_lbit_ABEF_3, 1); - const __m512i rhs_raw_mat_014589CD_4 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_4), rhs_raw_mat_89CD_4, 1); - const __m512i rhs_raw_mat_2367ABEF_4 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_4), rhs_raw_mat_ABEF_4, 1); - const __m512i rhs_raw_mat_014589CD_5 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_5), rhs_raw_mat_89CD_5, 1); - const __m512i rhs_raw_mat_2367ABEF_5 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_5), rhs_raw_mat_ABEF_5, 1); + const __m512i rhs_raw_lbit_014589CD_4 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_0145_4), rhs_raw_lbit_89CD_4, 1); + const __m512i rhs_raw_lbit_2367ABEF_4 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_2367_4), rhs_raw_lbit_ABEF_4, 1); + const __m512i rhs_raw_lbit_014589CD_5 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_0145_5), rhs_raw_lbit_89CD_5, 1); + const __m512i rhs_raw_lbit_2367ABEF_5 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_2367_5), rhs_raw_lbit_ABEF_5, 1); - const __m512i rhs_raw_mat_014589CD_6 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_6), rhs_raw_mat_89CD_6, 1); - const __m512i rhs_raw_mat_2367ABEF_6 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_6), rhs_raw_mat_ABEF_6, 1); - const __m512i rhs_raw_mat_014589CD_7 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_7), rhs_raw_mat_89CD_7, 1); - const __m512i rhs_raw_mat_2367ABEF_7 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_7), 
rhs_raw_mat_ABEF_7, 1); + const __m512i rhs_raw_lbit_014589CD_6 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_0145_6), rhs_raw_lbit_89CD_6, 1); + const __m512i rhs_raw_lbit_2367ABEF_6 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_2367_6), rhs_raw_lbit_ABEF_6, 1); + const __m512i rhs_raw_lbit_014589CD_7 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_0145_7), rhs_raw_lbit_89CD_7, 1); + const __m512i rhs_raw_lbit_2367ABEF_7 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_lbit_2367_7), rhs_raw_lbit_ABEF_7, 1); const __m512i rhs_raw_hbit_014589CD_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_hbit_0145_0), rhs_raw_hbit_89CD_0, 1); const __m512i rhs_raw_hbit_2367ABEF_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_hbit_2367_0), rhs_raw_hbit_ABEF_0, 1); @@ -6904,206 +6910,210 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const __m512i rhs_raw_hbit_2367ABEF_3 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_hbit_2367_3), rhs_raw_hbit_ABEF_3, 1); // 2-bit -> 8-bit - // Values of the 0th,2nd,4th,6th sub blocks of eight block_q6_K structures for the sb loop - const __m512i rhs_hbit_014589CD_00 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_0, m2_expanded), 4); //B00(0-7) B01(0-7) B04(0-7) B05(0-7) - const __m512i rhs_hbit_014589CD_20 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 2), m2_expanded), 4); //B20(0-7) B21(0-7) B24(0-7) B25(0-7) - const __m512i rhs_hbit_014589CD_40 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 4), m2_expanded), 4); //B40(0-7) B41(0-7) B44(0-7) B45(0-7) - const __m512i rhs_hbit_014589CD_60 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 6), m2_expanded), 4); //B60(0-7) B61(0-7) B64(0-7) B65(0-7) + // hbit Values of the 0th,2nd,4th,6th sub blocks of eight block_q6_K structures for the sb loop + const __m512i rhs_hbit_014589CD_00 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_0, m2_expanded), 4); //Index : 0 - 7 + const __m512i rhs_hbit_014589CD_20 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 2), m2_expanded), 4); //Index : 32 - 39 + const __m512i rhs_hbit_014589CD_40 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 4), m2_expanded), 4); //Index : 64 - 71 + const __m512i rhs_hbit_014589CD_60 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 6), m2_expanded), 4); //Index : 96 - 103 - const __m512i rhs_hbit_2367ABEF_00 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_0, m2_expanded), 4); //B02(0-7) B03(0-7) B06(0-7) B07(0-7) - const __m512i rhs_hbit_2367ABEF_20 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 2), m2_expanded), 4); //B22(0-7) B23(0-7) B26(0-7) B27(0-7) - const __m512i rhs_hbit_2367ABEF_40 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 4), m2_expanded), 4); //B42(0-7) B43(0-7) B46(0-7) B47(0-7) - const __m512i rhs_hbit_2367ABEF_60 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 6), m2_expanded), 4); //B62(0-7) B63(0-7) B66(0-7) B67(0-7) + const __m512i rhs_hbit_2367ABEF_00 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_0, m2_expanded), 4); //Index : 0 - 7 + const __m512i rhs_hbit_2367ABEF_20 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 2), m2_expanded), 4); //Index : 32 - 39 + const __m512i rhs_hbit_2367ABEF_40 
= _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 4), m2_expanded), 4); //Index : 64 - 71
+                    const __m512i rhs_hbit_2367ABEF_60 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 6), m2_expanded), 4); //Index : 96 - 103
-                    const __m512i rhs_hbit_014589CD_01 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_1, m2_expanded), 4); //B00(8-15) B01(8-15) B04(8-15) B05(8-15)
-                    const __m512i rhs_hbit_014589CD_21 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 2), m2_expanded), 4); //B20(8-15) B21(8-15) B24(8-15) B25(8-15)
-                    const __m512i rhs_hbit_014589CD_41 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 4), m2_expanded), 4); //B40(8-15) B41(8-15) B44(8-15) B45(8-15)
-                    const __m512i rhs_hbit_014589CD_61 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 6), m2_expanded), 4); //B60(8-15) B61(8-15) B64(8-15) B65(8-15)
+                    // Comments indicate the indices of elements from individual super block in non interleaved fashion
+                    const __m512i rhs_hbit_014589CD_01 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_1, m2_expanded), 4); //Index : 8 - 15
+                    const __m512i rhs_hbit_014589CD_21 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 2), m2_expanded), 4); //Index : 40 - 47
+                    const __m512i rhs_hbit_014589CD_41 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 4), m2_expanded), 4); //Index : 72 - 79
+                    const __m512i rhs_hbit_014589CD_61 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 6), m2_expanded), 4); //Index : 104 - 111
-                    const __m512i rhs_hbit_2367ABEF_01 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_1, m2_expanded), 4); //B02(8-15) B03(8-15) B06(8-15) B07(8-15)
-                    const __m512i rhs_hbit_2367ABEF_21 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 2), m2_expanded), 4); //B22(8-15) B23(8-15) B26(8-15) B27(8-15)
-                    const __m512i rhs_hbit_2367ABEF_41 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 4), m2_expanded), 4); //B42(8-15) B43(8-15) B46(8-15) B47(8-15)
-                    const __m512i rhs_hbit_2367ABEF_61 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 6), m2_expanded), 4); //B62(8-15) B63(8-15) B66(8-15) B67(8-15)
+                    const __m512i rhs_hbit_2367ABEF_01 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_1, m2_expanded), 4); //Index : 8 - 15
+                    const __m512i rhs_hbit_2367ABEF_21 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 2), m2_expanded), 4); //Index : 40 - 47
+                    const __m512i rhs_hbit_2367ABEF_41 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 4), m2_expanded), 4); //Index : 72 - 79
+                    const __m512i rhs_hbit_2367ABEF_61 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 6), m2_expanded), 4); //Index : 104 - 111
-                    // Values of the 1st,3rd,5th,7th sub blocks of eight block_q6_K structures for the sb loop
-                    const __m512i rhs_hbit_014589CD_10 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_2, m2_expanded), 4); //B10(0-7) B11(0-7) B14(0-7) B15(0-7)
-                    const __m512i rhs_hbit_014589CD_30 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 2), m2_expanded), 4); //B30(0-7) B31(0-7) B34(0-7) B35(0-7)
-                    const __m512i rhs_hbit_014589CD_50 =
_mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 4), m2_expanded), 4); //B50(0-7) B51(0-7) B54(0-7) B55(0-7) - const __m512i rhs_hbit_014589CD_70 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 6), m2_expanded), 4); //B70(0-7) B71(0-7) B74(0-7) B75(0-7) + // hbit values of the 1st,3rd,5th,7th sub blocks of eight block_q6_K structures for the sb loop + const __m512i rhs_hbit_014589CD_10 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_2, m2_expanded), 4); //Index : 16 - 23 + const __m512i rhs_hbit_014589CD_30 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 2), m2_expanded), 4); //Index : 48 - 55 + const __m512i rhs_hbit_014589CD_50 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 4), m2_expanded), 4); //Index : 80 - 87 + const __m512i rhs_hbit_014589CD_70 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 6), m2_expanded), 4); //Index : 112 - 119 - const __m512i rhs_hbit_2367ABEF_10 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_2, m2_expanded), 4); //B12(0-7) B13(0-7) B16(0-7) B17(0-7) - const __m512i rhs_hbit_2367ABEF_30 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 2), m2_expanded), 4); //B32(0-7) B33(0-7) B36(0-7) B37(0-7) - const __m512i rhs_hbit_2367ABEF_50 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 4), m2_expanded), 4); //B52(0-7) B53(0-7) B56(0-7) B57(0-7) - const __m512i rhs_hbit_2367ABEF_70 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 6), m2_expanded), 4); //B72(0-7) B73(0-7) B76(0-7) B77(0-7) + const __m512i rhs_hbit_2367ABEF_10 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_2, m2_expanded), 4); //Index : 16 - 23 + const __m512i rhs_hbit_2367ABEF_30 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 2), m2_expanded), 4); //Index : 48 - 55 + const __m512i rhs_hbit_2367ABEF_50 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 4), m2_expanded), 4); //Index : 80 - 87 + const __m512i rhs_hbit_2367ABEF_70 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 6), m2_expanded), 4); //Index : 112 - 119 - const __m512i rhs_hbit_014589CD_11 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_3, m2_expanded), 4); //B10(8-15) B11(8-15) B14(8-15) B15(8-15) - const __m512i rhs_hbit_014589CD_31 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 2), m2_expanded), 4); //B30(8-15) B31(8-15) B34(8-15) B35(8-15) - const __m512i rhs_hbit_014589CD_51 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 4), m2_expanded), 4); //B50(8-15) B51(8-15) B54(8-15) B55(8-15) - const __m512i rhs_hbit_014589CD_71 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 6), m2_expanded), 4); //B70(8-15) B71(8-15) B74(8-15) B75(8-15) + const __m512i rhs_hbit_014589CD_11 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_3, m2_expanded), 4); //Index : 24 - 31 + const __m512i rhs_hbit_014589CD_31 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 2), m2_expanded), 4); //Index : 56 - 63 + const __m512i rhs_hbit_014589CD_51 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 4), m2_expanded), 4); //Index : 88 - 95 + const __m512i rhs_hbit_014589CD_71 = 
_mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 6), m2_expanded), 4); //Index : 120 - 127 - const __m512i rhs_hbit_2367ABEF_11 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_3, m2_expanded), 4); //B12(8-15) B13(8-15) B16(8-15) B17(8-15) - const __m512i rhs_hbit_2367ABEF_31 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 2), m2_expanded), 4); //B32(8-15) B33(8-15) B36(8-15) B37(8-15) - const __m512i rhs_hbit_2367ABEF_51 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 4), m2_expanded), 4); //B52(8-15) B53(8-15) B56(8-15) B57(8-15) - const __m512i rhs_hbit_2367ABEF_71 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 6), m2_expanded), 4); //B72(8-15) B73(8-15) B76(8-15) B77(8-15) + const __m512i rhs_hbit_2367ABEF_11 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_3, m2_expanded), 4); //Index : 24 - 31 + const __m512i rhs_hbit_2367ABEF_31 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 2), m2_expanded), 4); //Index : 56 - 63 + const __m512i rhs_hbit_2367ABEF_51 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 4), m2_expanded), 4); //Index : 88 - 95 + const __m512i rhs_hbit_2367ABEF_71 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 6), m2_expanded), 4); //Index : 120 - 127 - // 0 -7, 64 - 71 - const __m512i rhs_mat_014589CD_00 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_0, m4_expanded), rhs_hbit_014589CD_00); - const __m512i rhs_mat_014589CD_40 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_0, 4), m4_expanded), rhs_hbit_014589CD_40); + // 4 bit values are unpacked/denibbled and bitwise or-ed with the hbit values to form the 6 bit quantized values - const __m512i rhs_mat_2367ABEF_00 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_0, m4_expanded), rhs_hbit_2367ABEF_00); - const __m512i rhs_mat_2367ABEF_40 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_0, 4), m4_expanded), rhs_hbit_2367ABEF_40); + // Comments indicate the indices of elements from individual super block in non interleaved fashion + // Index : 0 -7, 64 - 71 + const __m512i rhs_mat_014589CD_00 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_0, m4_expanded), rhs_hbit_014589CD_00); + const __m512i rhs_mat_014589CD_40 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_0, 4), m4_expanded), rhs_hbit_014589CD_40); - // 8 - 15, 72 - 79 - const __m512i rhs_mat_014589CD_01 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_1, m4_expanded), rhs_hbit_014589CD_01); - const __m512i rhs_mat_014589CD_41 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_1, 4), m4_expanded), rhs_hbit_014589CD_41); + const __m512i rhs_mat_2367ABEF_00 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_0, m4_expanded), rhs_hbit_2367ABEF_00); + const __m512i rhs_mat_2367ABEF_40 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_0, 4), m4_expanded), rhs_hbit_2367ABEF_40); - const __m512i rhs_mat_2367ABEF_01 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_1, m4_expanded), rhs_hbit_2367ABEF_01); - const __m512i rhs_mat_2367ABEF_41 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_1, 4), m4_expanded), rhs_hbit_2367ABEF_41); + // Index : 8 - 15, 72 - 79 + const __m512i rhs_mat_014589CD_01 = 
_mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_1, m4_expanded), rhs_hbit_014589CD_01); + const __m512i rhs_mat_014589CD_41 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_1, 4), m4_expanded), rhs_hbit_014589CD_41); - // 16 - 23, 80 - 87 - const __m512i rhs_mat_014589CD_10 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_2, m4_expanded), rhs_hbit_014589CD_10); - const __m512i rhs_mat_014589CD_50 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_2, 4), m4_expanded), rhs_hbit_014589CD_50); + const __m512i rhs_mat_2367ABEF_01 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_1, m4_expanded), rhs_hbit_2367ABEF_01); + const __m512i rhs_mat_2367ABEF_41 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_1, 4), m4_expanded), rhs_hbit_2367ABEF_41); - const __m512i rhs_mat_2367ABEF_10 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_2, m4_expanded), rhs_hbit_2367ABEF_10); - const __m512i rhs_mat_2367ABEF_50 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_2, 4), m4_expanded), rhs_hbit_2367ABEF_50); + // Index : 16 - 23, 80 - 87 + const __m512i rhs_mat_014589CD_10 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_2, m4_expanded), rhs_hbit_014589CD_10); + const __m512i rhs_mat_014589CD_50 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_2, 4), m4_expanded), rhs_hbit_014589CD_50); - // 24 - 31, 88 - 95 - const __m512i rhs_mat_014589CD_11 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_3, m4_expanded), rhs_hbit_014589CD_11); - const __m512i rhs_mat_014589CD_51 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_3, 4), m4_expanded), rhs_hbit_014589CD_51); + const __m512i rhs_mat_2367ABEF_10 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_2, m4_expanded), rhs_hbit_2367ABEF_10); + const __m512i rhs_mat_2367ABEF_50 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_2, 4), m4_expanded), rhs_hbit_2367ABEF_50); - const __m512i rhs_mat_2367ABEF_11 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_3, m4_expanded), rhs_hbit_2367ABEF_11); - const __m512i rhs_mat_2367ABEF_51 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_3, 4), m4_expanded), rhs_hbit_2367ABEF_51); + // Index : 24 - 31, 88 - 95 + const __m512i rhs_mat_014589CD_11 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_3, m4_expanded), rhs_hbit_014589CD_11); + const __m512i rhs_mat_014589CD_51 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_3, 4), m4_expanded), rhs_hbit_014589CD_51); - // 32 - 39, 96 - 103 - const __m512i rhs_mat_014589CD_20 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_4, m4_expanded), rhs_hbit_014589CD_20); - const __m512i rhs_mat_014589CD_60 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_4, 4), m4_expanded), rhs_hbit_014589CD_60); + const __m512i rhs_mat_2367ABEF_11 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_3, m4_expanded), rhs_hbit_2367ABEF_11); + const __m512i rhs_mat_2367ABEF_51 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_3, 4), m4_expanded), rhs_hbit_2367ABEF_51); - const __m512i rhs_mat_2367ABEF_20 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_4, m4_expanded), rhs_hbit_2367ABEF_20); - const __m512i rhs_mat_2367ABEF_60 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_4, 4), m4_expanded), rhs_hbit_2367ABEF_60); + // Index : 32 - 39, 
96 - 103 + const __m512i rhs_mat_014589CD_20 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_4, m4_expanded), rhs_hbit_014589CD_20); + const __m512i rhs_mat_014589CD_60 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_4, 4), m4_expanded), rhs_hbit_014589CD_60); - // 40 - 47, 104 - 111 - const __m512i rhs_mat_014589CD_21 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_5, m4_expanded), rhs_hbit_014589CD_21); - const __m512i rhs_mat_014589CD_61 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_5, 4), m4_expanded), rhs_hbit_014589CD_61); + const __m512i rhs_mat_2367ABEF_20 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_4, m4_expanded), rhs_hbit_2367ABEF_20); + const __m512i rhs_mat_2367ABEF_60 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_4, 4), m4_expanded), rhs_hbit_2367ABEF_60); - const __m512i rhs_mat_2367ABEF_21 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_5, m4_expanded), rhs_hbit_2367ABEF_21); - const __m512i rhs_mat_2367ABEF_61 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_5, 4), m4_expanded), rhs_hbit_2367ABEF_61); + // Index : 40 - 47, 104 - 111 + const __m512i rhs_mat_014589CD_21 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_5, m4_expanded), rhs_hbit_014589CD_21); + const __m512i rhs_mat_014589CD_61 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_5, 4), m4_expanded), rhs_hbit_014589CD_61); - // 48 - 55, 112 - 119 - const __m512i rhs_mat_014589CD_30 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_6, m4_expanded), rhs_hbit_014589CD_30); - const __m512i rhs_mat_014589CD_70 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_6, 4), m4_expanded), rhs_hbit_014589CD_70); + const __m512i rhs_mat_2367ABEF_21 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_5, m4_expanded), rhs_hbit_2367ABEF_21); + const __m512i rhs_mat_2367ABEF_61 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_5, 4), m4_expanded), rhs_hbit_2367ABEF_61); - const __m512i rhs_mat_2367ABEF_30 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_6, m4_expanded), rhs_hbit_2367ABEF_30); - const __m512i rhs_mat_2367ABEF_70 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_6, 4), m4_expanded), rhs_hbit_2367ABEF_70); + // Index : 48 - 55, 112 - 119 + const __m512i rhs_mat_014589CD_30 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_6, m4_expanded), rhs_hbit_014589CD_30); + const __m512i rhs_mat_014589CD_70 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_6, 4), m4_expanded), rhs_hbit_014589CD_70); - // 56 - 63, 120 - 127 - const __m512i rhs_mat_014589CD_31 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_7, m4_expanded), rhs_hbit_014589CD_31); - const __m512i rhs_mat_014589CD_71 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_7, 4), m4_expanded), rhs_hbit_014589CD_71); + const __m512i rhs_mat_2367ABEF_30 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_6, m4_expanded), rhs_hbit_2367ABEF_30); + const __m512i rhs_mat_2367ABEF_70 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_6, 4), m4_expanded), rhs_hbit_2367ABEF_70); - const __m512i rhs_mat_2367ABEF_31 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_2367ABEF_7, m4_expanded), rhs_hbit_2367ABEF_31); - const __m512i rhs_mat_2367ABEF_71 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_7, 4), 
m4_expanded), rhs_hbit_2367ABEF_71); + // Index : 56 - 63, 120 - 127 + const __m512i rhs_mat_014589CD_31 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_014589CD_7, m4_expanded), rhs_hbit_014589CD_31); + const __m512i rhs_mat_014589CD_71 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_014589CD_7, 4), m4_expanded), rhs_hbit_014589CD_71); + + const __m512i rhs_mat_2367ABEF_31 = _mm512_or_si512(_mm512_and_si512(rhs_raw_lbit_2367ABEF_7, m4_expanded), rhs_hbit_2367ABEF_31); + const __m512i rhs_mat_2367ABEF_71 = _mm512_or_si512(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_lbit_2367ABEF_7, 4), m4_expanded), rhs_hbit_2367ABEF_71); // Shuffle pattern one - right side input - const __m512i rhs_mat_014589CD_00_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_00, (_MM_PERM_ENUM)136); //B00(0-3) B01(0-3) B00(0-3) B01(0-3) B04(0-3) B05(0-3) B04(0-3) B05(0-3) - const __m512i rhs_mat_2367ABEF_00_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_00, (_MM_PERM_ENUM)136); //B02(0-3) B03(0-3) B02(0-3) B03(0-3) B06(0-3) B07(0-3) B06(0-3) B07(0-3) + const __m512i rhs_mat_014589CD_00_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_00, (_MM_PERM_ENUM)136); //B00(0-3) B01(0-3) B00(0-3) B01(0-3) B04(0-3) B05(0-3) B04(0-3) B05(0-3) B08(0-3) B09(0-3) B08(0-3) B09(0-3) B0C(0-3) B0D(0-3) B0C(0-3) B0D(0-3) + const __m512i rhs_mat_2367ABEF_00_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_00, (_MM_PERM_ENUM)136); //B02(0-3) B03(0-3) B02(0-3) B03(0-3) B06(0-3) B07(0-3) B06(0-3) B07(0-3) B0A(0-3) B0B(0-3) B0A(0-3) B0B(0-3) B0E(0-3) B0F(0-3) B0E(0-3) B0F(0-3) - const __m512i rhs_mat_014589CD_01_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_01, (_MM_PERM_ENUM)136); //B00(8-11) B01(8-11) B00(8-11) B01(8-11) B04(8-11) B05(8-11) B04(8-11) B05(8-11) - const __m512i rhs_mat_2367ABEF_01_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_01, (_MM_PERM_ENUM)136); //B02(8-11) B03(8-11) B02(8-11) B03(8-11) B06(8-11) B07(8-11) B06(8-11) B07(8-11) + const __m512i rhs_mat_014589CD_01_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_01, (_MM_PERM_ENUM)136); //B00(8-11) B01(8-11) B00(8-11) B01(8-11) B04(8-11) B05(8-11) B04(8-11) B05(8-11) B08(8-11) B09(8-11) B08(8-11) B09(8-11) B0C(8-11) B0D(8-11) B0C(8-11) B0D(8-11) + const __m512i rhs_mat_2367ABEF_01_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_01, (_MM_PERM_ENUM)136); //B02(8-11) B03(8-11) B02(8-11) B03(8-11) B06(8-11) B07(8-11) B06(8-11) B07(8-11) B0A(8-11) B0B(8-11) B0A(8-11) B0B(8-11) B0E(8-11) B0F(8-11) B0E(8-11) B0F(8-11) - const __m512i rhs_mat_014589CD_10_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_10, (_MM_PERM_ENUM)136); //B10(0-3) B11(0-3) B10(0-3) B11(0-3) B14(0-3) B15(0-3) B14(0-3) B15(0-3) - const __m512i rhs_mat_2367ABEF_10_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_10, (_MM_PERM_ENUM)136); //B12(0-3) B13(0-3) B12(0-3) B13(0-3) B16(0-3) B17(0-3) B16(0-3) B17(0-3) + const __m512i rhs_mat_014589CD_10_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_10, (_MM_PERM_ENUM)136); //B10(0-3) B11(0-3) B10(0-3) B11(0-3) B14(0-3) B15(0-3) B14(0-3) B15(0-3) B18(0-3) B19(0-3) B18(0-3) B19(0-3) B1C(0-3) B1D(0-3) B1C(0-3) B1D(0-3) + const __m512i rhs_mat_2367ABEF_10_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_10, (_MM_PERM_ENUM)136); //B12(0-3) B13(0-3) B12(0-3) B13(0-3) B16(0-3) B17(0-3) B16(0-3) B17(0-3) B1A(0-3) B1B(0-3) B1A(0-3) B1B(0-3) B1E(0-3) B1F(0-3) B1E(0-3) B1F(0-3) - const __m512i rhs_mat_014589CD_11_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_11, (_MM_PERM_ENUM)136); //B10(8-11) B11(8-11) B10(8-11) B11(8-11) B14(8-11) B15(8-11) B14(8-11) B15(8-11) - const __m512i rhs_mat_2367ABEF_11_sp1 = 
_mm512_shuffle_epi32(rhs_mat_2367ABEF_11, (_MM_PERM_ENUM)136); //B12(8-11) B13(8-11) B12(8-11) B13(8-11) B16(8-11) B17(8-11) B16(8-11) B17(8-11) + const __m512i rhs_mat_014589CD_11_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_11, (_MM_PERM_ENUM)136); //B10(8-11) B11(8-11) B10(8-11) B11(8-11) B14(8-11) B15(8-11) B14(8-11) B15(8-11) B18(8-11) B19(8-11) B18(8-11) B19(8-11) B1C(8-11) B1D(8-11) B1C(8-11) B1D(8-11) + const __m512i rhs_mat_2367ABEF_11_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_11, (_MM_PERM_ENUM)136); //B12(8-11) B13(8-11) B12(8-11) B13(8-11) B16(8-11) B17(8-11) B16(8-11) B17(8-11) B1A(8-11) B1B(8-11) B1A(8-11) B1B(8-11) B1E(8-11) B1F(8-11) B1E(8-11) B1F(8-11) - const __m512i rhs_mat_014589CD_20_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_20, (_MM_PERM_ENUM)136); //B20(0-3) B21(0-3) B20(0-3) B21(0-3) B24(0-3) B25(0-3) B24(0-3) B25(0-3) - const __m512i rhs_mat_2367ABEF_20_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_20, (_MM_PERM_ENUM)136); //B22(0-3) B23(0-3) B22(0-3) B23(0-3) B26(0-3) B27(0-3) B26(0-3) B27(0-3) + const __m512i rhs_mat_014589CD_20_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_20, (_MM_PERM_ENUM)136); //B20(0-3) B21(0-3) B20(0-3) B21(0-3) B24(0-3) B25(0-3) B24(0-3) B25(0-3) B28(0-3) B29(0-3) B28(0-3) B29(0-3) B2C(0-3) B2D(0-3) B2C(0-3) B2D(0-3) + const __m512i rhs_mat_2367ABEF_20_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_20, (_MM_PERM_ENUM)136); //B22(0-3) B23(0-3) B22(0-3) B23(0-3) B26(0-3) B27(0-3) B26(0-3) B27(0-3) B2A(0-3) B2B(0-3) B2A(0-3) B2B(0-3) B2E(0-3) B2F(0-3) B2E(0-3) B2F(0-3) - const __m512i rhs_mat_014589CD_21_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_21, (_MM_PERM_ENUM)136); //B20(8-11) B21(8-11) B20(8-11) B21(8-11) B24(8-11) B25(8-11) B24(8-11) B25(8-11) - const __m512i rhs_mat_2367ABEF_21_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_21, (_MM_PERM_ENUM)136); //B22(8-11) B23(8-11) B22(8-11) B23(8-11) B26(8-11) B27(8-11) B26(8-11) B27(8-11) + const __m512i rhs_mat_014589CD_21_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_21, (_MM_PERM_ENUM)136); //B20(8-11) B21(8-11) B20(8-11) B21(8-11) B24(8-11) B25(8-11) B24(8-11) B25(8-11) B28(8-11) B29(8-11) B28(8-11) B29(8-11) B2C(8-11) B2D(8-11) B2C(8-11) B2D(8-11) + const __m512i rhs_mat_2367ABEF_21_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_21, (_MM_PERM_ENUM)136); //B22(8-11) B23(8-11) B22(8-11) B23(8-11) B26(8-11) B27(8-11) B26(8-11) B27(8-11) B2A(8-11) B2B(8-11) B2A(8-11) B2B(8-11) B2E(8-11) B2F(8-11) B2E(8-11) B2F(8-11) - const __m512i rhs_mat_014589CD_30_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_30, (_MM_PERM_ENUM)136); //B30(0-3) B31(0-3) B30(0-3) B31(0-3) B34(0-3) B35(0-3) B34(0-3) B35(0-3) - const __m512i rhs_mat_2367ABEF_30_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_30, (_MM_PERM_ENUM)136); //B32(0-3) B33(0-3) B32(0-3) B33(0-3) B36(0-3) B37(0-3) B36(0-3) B37(0-3) + const __m512i rhs_mat_014589CD_30_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_30, (_MM_PERM_ENUM)136); //B30(0-3) B31(0-3) B30(0-3) B31(0-3) B34(0-3) B35(0-3) B34(0-3) B35(0-3) B38(0-3) B39(0-3) B38(0-3) B39(0-3) B3C(0-3) B3D(0-3) B3C(0-3) B3D(0-3) + const __m512i rhs_mat_2367ABEF_30_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_30, (_MM_PERM_ENUM)136); //B32(0-3) B33(0-3) B32(0-3) B33(0-3) B36(0-3) B37(0-3) B36(0-3) B37(0-3) B3A(0-3) B3B(0-3) B3A(0-3) B3B(0-3) B3E(0-3) B3F(0-3) B3E(0-3) B3F(0-3) - const __m512i rhs_mat_014589CD_31_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_31, (_MM_PERM_ENUM)136); //B30(8-11) B31(8-11) B30(8-11) B31(8-11) B34(8-11) B35(8-11) B34(8-11) B35(8-11 - const __m512i rhs_mat_2367ABEF_31_sp1 = 
_mm512_shuffle_epi32(rhs_mat_2367ABEF_31, (_MM_PERM_ENUM)136); //B32(8-11) B33(8-11) B32(8-11) B33(8-11) B36(8-11) B37(8-11) B36(8-11) B37(8-11) + const __m512i rhs_mat_014589CD_31_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_31, (_MM_PERM_ENUM)136); //B30(8-11) B31(8-11) B30(8-11) B31(8-11) B34(8-11) B35(8-11) B34(8-11) B35(8-11) B38(8-11) B39(8-11) B38(8-11) B39(8-11) B3C(8-11) B3D(8-11) B3C(8-11) B3D(8-11) + const __m512i rhs_mat_2367ABEF_31_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_31, (_MM_PERM_ENUM)136); //B32(8-11) B33(8-11) B32(8-11) B33(8-11) B36(8-11) B37(8-11) B36(8-11) B37(8-11) B3A(8-11) B3B(8-11) B3A(8-11) B3B(8-11) B3E(8-11) B3F(8-11) B3E(8-11) B3F(8-11) - const __m512i rhs_mat_014589CD_40_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_40, (_MM_PERM_ENUM)136); //B40(0-3) B41(0-3) B40(0-3) B41(0-3) B44(0-3) B45(0-3) B44(0-3) B45(0-3) - const __m512i rhs_mat_2367ABEF_40_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_40, (_MM_PERM_ENUM)136); //B42(0-3) B43(0-3) B42(0-3) B43(0-3) B46(0-3) B47(0-3) B46(0-3) B47(0-3) + const __m512i rhs_mat_014589CD_40_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_40, (_MM_PERM_ENUM)136); //B40(0-3) B41(0-3) B40(0-3) B41(0-3) B44(0-3) B45(0-3) B44(0-3) B45(0-3) B48(0-3) B49(0-3) B48(0-3) B49(0-3) B4C(0-3) B4D(0-3) B4C(0-3) B4D(0-3) + const __m512i rhs_mat_2367ABEF_40_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_40, (_MM_PERM_ENUM)136); //B42(0-3) B43(0-3) B42(0-3) B43(0-3) B46(0-3) B47(0-3) B46(0-3) B47(0-3) B4A(0-3) B4B(0-3) B4A(0-3) B4B(0-3) B4E(0-3) B4F(0-3) B4E(0-3) B4F(0-3) - const __m512i rhs_mat_014589CD_41_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_41, (_MM_PERM_ENUM)136); //B40(8-11) B41(8-11) B40(8-11) B41(8-11) B44(8-11) B45(8-11) B44(8-11) B45(8-11) - const __m512i rhs_mat_2367ABEF_41_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_41, (_MM_PERM_ENUM)136); //B42(8-11) B43(8-11) B42(8-11) B43(8-11) B46(8-11) B47(8-11) B46(8-11) B47(8-11) + const __m512i rhs_mat_014589CD_41_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_41, (_MM_PERM_ENUM)136); //B40(8-11) B41(8-11) B40(8-11) B41(8-11) B44(8-11) B45(8-11) B44(8-11) B45(8-11) B48(8-11) B49(8-11) B48(8-11) B49(8-11) B4C(8-11) B4D(8-11) B4C(8-11) B4D(8-11) + const __m512i rhs_mat_2367ABEF_41_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_41, (_MM_PERM_ENUM)136); //B42(8-11) B43(8-11) B42(8-11) B43(8-11) B46(8-11) B47(8-11) B46(8-11) B47(8-11) B4A(8-11) B4B(8-11) B4A(8-11) B4B(8-11) B4E(8-11) B4F(8-11) B4E(8-11) B4F(8-11) - const __m512i rhs_mat_014589CD_50_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_50, (_MM_PERM_ENUM)136); //B50(0-3) B51(0-3) B50(0-3) B51(0-3) B54(0-3) B55(0-3) B54(0-3) B55(0-3) - const __m512i rhs_mat_2367ABEF_50_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_50, (_MM_PERM_ENUM)136); //B52(0-3) B53(0-3) B52(0-3) B53(0-3) B56(0-3) B57(0-3) B56(0-3) B57(0-3) + const __m512i rhs_mat_014589CD_50_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_50, (_MM_PERM_ENUM)136); //B50(0-3) B51(0-3) B50(0-3) B51(0-3) B54(0-3) B55(0-3) B54(0-3) B55(0-3) B58(0-3) B59(0-3) B58(0-3) B59(0-3) B5C(0-3) B5D(0-3) B5C(0-3) B5D(0-3) + const __m512i rhs_mat_2367ABEF_50_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_50, (_MM_PERM_ENUM)136); //B52(0-3) B53(0-3) B52(0-3) B53(0-3) B56(0-3) B57(0-3) B56(0-3) B57(0-3) B5A(0-3) B5B(0-3) B5A(0-3) B5B(0-3) B5E(0-3) B5F(0-3) B5E(0-3) B5F(0-3) - const __m512i rhs_mat_014589CD_51_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_51, (_MM_PERM_ENUM)136); //B50(8-11) B51(8-11) B50(8-11) B51(8-11) B54(8-11) B55(8-11) B54(8-11) B55(8-11) - const __m512i rhs_mat_2367ABEF_51_sp1 = 
_mm512_shuffle_epi32(rhs_mat_2367ABEF_51, (_MM_PERM_ENUM)136); //B52(8-11) B53(8-11) B52(8-11) B53(8-11) B56(8-11) B57(8-11) B56(8-11) B57(8-11) + const __m512i rhs_mat_014589CD_51_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_51, (_MM_PERM_ENUM)136); //B50(8-11) B51(8-11) B50(8-11) B51(8-11) B54(8-11) B55(8-11) B54(8-11) B55(8-11) B58(8-11) B59(8-11) B58(8-11) B59(8-11) B5C(8-11) B5D(8-11) B5C(8-11) B5D(8-11) + const __m512i rhs_mat_2367ABEF_51_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_51, (_MM_PERM_ENUM)136); //B52(8-11) B53(8-11) B52(8-11) B53(8-11) B56(8-11) B57(8-11) B56(8-11) B57(8-11) B5A(8-11) B5B(8-11) B5A(8-11) B5B(8-11) B5E(8-11) B5F(8-11) B5E(8-11) B5F(8-11) - const __m512i rhs_mat_014589CD_60_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_60, (_MM_PERM_ENUM)136); //B60(0-3) B61(0-3) B60(0-3) B61(0-3) B64(0-3) B65(0-3) B64(0-3) B65(0-3) - const __m512i rhs_mat_2367ABEF_60_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_60, (_MM_PERM_ENUM)136); //B62(0-3) B63(0-3) B62(0-3) B63(0-3) B66(0-3) B67(0-3) B66(0-3) B67(0-3) + const __m512i rhs_mat_014589CD_60_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_60, (_MM_PERM_ENUM)136); //B60(0-3) B61(0-3) B60(0-3) B61(0-3) B64(0-3) B65(0-3) B64(0-3) B65(0-3) B68(0-3) B69(0-3) B68(0-3) B69(0-3) B6C(0-3) B6D(0-3) B6C(0-3) B6D(0-3) + const __m512i rhs_mat_2367ABEF_60_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_60, (_MM_PERM_ENUM)136); //B62(0-3) B63(0-3) B62(0-3) B63(0-3) B66(0-3) B67(0-3) B66(0-3) B67(0-3) B6A(0-3) B6B(0-3) B6A(0-3) B6B(0-3) B6E(0-3) B6F(0-3) B6E(0-3) B6F(0-3) - const __m512i rhs_mat_014589CD_61_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_61, (_MM_PERM_ENUM)136); //B60(8-11) B61(8-11) B60(8-11) B61(8-11) B64(8-11) B65(8-11) B64(8-11) B65(8-11) - const __m512i rhs_mat_2367ABEF_61_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_61, (_MM_PERM_ENUM)136); //B62(8-11) B63(8-11) B62(8-11) B63(8-11) B66(8-11) B67(8-11) B66(8-11) B67(8-11) + const __m512i rhs_mat_014589CD_61_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_61, (_MM_PERM_ENUM)136); //B60(8-11) B61(8-11) B60(8-11) B61(8-11) B64(8-11) B65(8-11) B64(8-11) B65(8-11) B68(8-11) B69(8-11) B68(8-11) B69(8-11) B6C(8-11) B6D(8-11) B6C(8-11) B6D(8-11) + const __m512i rhs_mat_2367ABEF_61_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_61, (_MM_PERM_ENUM)136); //B62(8-11) B63(8-11) B62(8-11) B63(8-11) B66(8-11) B67(8-11) B66(8-11) B67(8-11) B6A(8-11) B6B(8-11) B6A(8-11) B6B(8-11) B6E(8-11) B6F(8-11) B6E(8-11) B6F(8-11) - const __m512i rhs_mat_014589CD_70_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_70, (_MM_PERM_ENUM)136); //B70(0-3) B71(0-3) B70(0-3) B71(0-3) B74(0-3) B75(0-3) B74(0-3) B75(0-3) - const __m512i rhs_mat_2367ABEF_70_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_70, (_MM_PERM_ENUM)136); //B72(0-3) B73(0-3) B72(0-3) B73(0-3) B76(0-3) B77(0-3) B76(0-3) B77(0-3) + const __m512i rhs_mat_014589CD_70_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_70, (_MM_PERM_ENUM)136); //B70(0-3) B71(0-3) B70(0-3) B71(0-3) B74(0-3) B75(0-3) B74(0-3) B75(0-3) B78(0-3) B79(0-3) B78(0-3) B79(0-3) B7C(0-3) B7D(0-3) B7C(0-3) B7D(0-3) + const __m512i rhs_mat_2367ABEF_70_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_70, (_MM_PERM_ENUM)136); //B72(0-3) B73(0-3) B72(0-3) B73(0-3) B76(0-3) B77(0-3) B76(0-3) B77(0-3) B7A(0-3) B7B(0-3) B7A(0-3) B7B(0-3) B7E(0-3) B7F(0-3) B7E(0-3) B7F(0-3) - const __m512i rhs_mat_014589CD_71_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_71, (_MM_PERM_ENUM)136); //B70(8-11) B71(8-11) B70(8-11) B71(8-11) B74(8-11) B75(8-11) B74(8-11) B75(8-11) - const __m512i rhs_mat_2367ABEF_71_sp1 = 
_mm512_shuffle_epi32(rhs_mat_2367ABEF_71, (_MM_PERM_ENUM)136); //B72(8-11) B73(8-11) B72(8-11) B73(8-11) B76(8-11) B77(8-11) B76(8-11) B77(8-11) + const __m512i rhs_mat_014589CD_71_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_71, (_MM_PERM_ENUM)136); //B70(8-11) B71(8-11) B70(8-11) B71(8-11) B74(8-11) B75(8-11) B74(8-11) B75(8-11) B78(8-11) B79(8-11) B78(8-11) B79(8-11) B7C(8-11) B7D(8-11) B7C(8-11) B7D(8-11) + const __m512i rhs_mat_2367ABEF_71_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_71, (_MM_PERM_ENUM)136); //B72(8-11) B73(8-11) B72(8-11) B73(8-11) B76(8-11) B77(8-11) B76(8-11) B77(8-11) B7A(8-11) B7B(8-11) B7A(8-11) B7B(8-11) B7E(8-11) B7F(8-11) B7E(8-11) B7F(8-11) // Shuffle pattern two - right side input - const __m512i rhs_mat_014589CD_00_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_00, (_MM_PERM_ENUM)221); //B00(4-7) B01(4-7) B00(4-7) B01(4-7) B04(4-7) B05(4-7) B04(4-7) B05(4-7) - const __m512i rhs_mat_2367ABEF_00_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_00, (_MM_PERM_ENUM)221); //B02(4-7) B03(4-7) B02(4-7) B03(4-7) B06(4-7) B07(4-7) B06(4-7) B07(4-7) + const __m512i rhs_mat_014589CD_00_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_00, (_MM_PERM_ENUM)221); //B00(4-7) B01(4-7) B00(4-7) B01(4-7) B04(4-7) B05(4-7) B04(4-7) B05(4-7) B08(4-7) B09(4-7) B08(4-7) B09(4-7) B0C(4-7) B0D(4-7) B0C(4-7) B0D(4-7) + const __m512i rhs_mat_2367ABEF_00_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_00, (_MM_PERM_ENUM)221); //B02(4-7) B03(4-7) B02(4-7) B03(4-7) B06(4-7) B07(4-7) B06(4-7) B07(4-7) B0A(4-7) B0B(4-7) B0A(4-7) B0B(4-7) B0E(4-7) B0F(4-7) B0E(4-7) B0F(4-7) - const __m512i rhs_mat_014589CD_01_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_01, (_MM_PERM_ENUM)221); //B00(12-15) B01(12-15) B00(12-15) B01(12-15) B04(12-15) B05(12-15) B04(12-15) B05(12-15) - const __m512i rhs_mat_2367ABEF_01_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_01, (_MM_PERM_ENUM)221); //B02(12-15) B03(12-15) B02(12-15) B03(12-15) B06(12-15) B07(12-15) B06(12-15) B07(12-15) + const __m512i rhs_mat_014589CD_01_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_01, (_MM_PERM_ENUM)221); //B00(12-15) B01(12-15) B00(12-15) B01(12-15) B04(12-15) B05(12-15) B04(12-15) B05(12-15) B08(12-15) B09(12-15) B08(12-15) B09(12-15) B0C(12-15) B0D(12-15) B0C(12-15) B0D(12-15) + const __m512i rhs_mat_2367ABEF_01_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_01, (_MM_PERM_ENUM)221); //B02(12-15) B03(12-15) B02(12-15) B03(12-15) B06(12-15) B07(12-15) B06(12-15) B07(12-15) B0A(12-15) B0B(12-15) B0A(12-15) B0B(12-15) B0E(12-15) B0F(12-15) B0E(12-15) B0F(12-15) - const __m512i rhs_mat_014589CD_10_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_10, (_MM_PERM_ENUM)221); //B10(4-7) B11(4-7) B10(4-7) B11(4-7) B14(4-7) B15(4-7) B14(4-7) B15(4-7) - const __m512i rhs_mat_2367ABEF_10_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_10, (_MM_PERM_ENUM)221); //B12(4-7) B13(4-7) B12(4-7) B13(4-7) B16(4-7) B17(4-7) B16(4-7) B17(4-7) + const __m512i rhs_mat_014589CD_10_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_10, (_MM_PERM_ENUM)221); //B10(4-7) B11(4-7) B10(4-7) B11(4-7) B14(4-7) B15(4-7) B14(4-7) B15(4-7) B18(4-7) B19(4-7) B18(4-7) B19(4-7) B1C(4-7) B1D(4-7) B1C(4-7) B1D(4-7) + const __m512i rhs_mat_2367ABEF_10_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_10, (_MM_PERM_ENUM)221); //B12(4-7) B13(4-7) B12(4-7) B13(4-7) B16(4-7) B17(4-7) B16(4-7) B17(4-7) B1A(4-7) B1B(4-7) B1A(4-7) B1B(4-7) B1E(4-7) B1F(4-7) B1E(4-7) B1F(4-7) - const __m512i rhs_mat_014589CD_11_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_11, (_MM_PERM_ENUM)221); //B10(12-15) B11(12-15) B10(12-15) B11(12-15) 
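+ // Note: _mm512_shuffle_epi32 permutes the four dwords of every 128 bit lane by the 2 bit fields of the immediate.
+ // 136 (0b10001000) used above selects dwords {0,2,0,2}, duplicating the bytes (0-3) / (8-11) group of each block pair,
+ // while 221 (0b11011101) used here selects {1,3,1,3}, the bytes (4-7) / (12-15) group, so the two patterns together
+ // cover all sixteen bytes of a sub block row.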
B14(12-15) B15(12-15) B14(12-15) B15(12-15) - const __m512i rhs_mat_2367ABEF_11_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_11, (_MM_PERM_ENUM)221); //B12(12-15) B13(12-15) B12(12-15) B13(12-15) B16(12-15) B17(12-15) B16(12-15) B17(12-15) + const __m512i rhs_mat_014589CD_11_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_11, (_MM_PERM_ENUM)221); //B10(12-15) B11(12-15) B10(12-15) B11(12-15) B14(12-15) B15(12-15) B14(12-15) B15(12-15) B18(12-15) B19(12-15) B18(12-15) B19(12-15) B1C(12-15) B1D(12-15) B1C(12-15) B1D(12-15) + const __m512i rhs_mat_2367ABEF_11_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_11, (_MM_PERM_ENUM)221); //B12(12-15) B13(12-15) B12(12-15) B13(12-15) B16(12-15) B17(12-15) B16(12-15) B17(12-15) B1A(12-15) B1B(12-15) B1A(12-15) B1B(12-15) B1E(12-15) B1F(12-15) B1E(12-15) B1F(12-15) - const __m512i rhs_mat_014589CD_20_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_20, (_MM_PERM_ENUM)221); //B20(4-7) B21(4-7) B20(4-7) B21(4-7) B24(4-7) B25(4-7) B24(4-7) B25(4-7) - const __m512i rhs_mat_2367ABEF_20_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_20, (_MM_PERM_ENUM)221); //B22(4-7) B23(4-7) B22(4-7) B23(4-7) B26(4-7) B27(4-7) B26(4-7) B27(4-7) + const __m512i rhs_mat_014589CD_20_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_20, (_MM_PERM_ENUM)221); //B20(4-7) B21(4-7) B20(4-7) B21(4-7) B24(4-7) B25(4-7) B24(4-7) B25(4-7) B28(4-7) B29(4-7) B28(4-7) B29(4-7) B2C(4-7) B2D(4-7) B2C(4-7) B2D(4-7) + const __m512i rhs_mat_2367ABEF_20_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_20, (_MM_PERM_ENUM)221); //B22(4-7) B23(4-7) B22(4-7) B23(4-7) B26(4-7) B27(4-7) B26(4-7) B27(4-7) B2A(4-7) B2B(4-7) B2A(4-7) B2B(4-7) B2E(4-7) B2F(4-7) B2E(4-7) B2F(4-7) - const __m512i rhs_mat_014589CD_21_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_21, (_MM_PERM_ENUM)221); //B20(12-15) B21(12-15) B20(12-15) B21(12-15) B24(12-15) B25(12-15) B24(12-15) B25(12-15) - const __m512i rhs_mat_2367ABEF_21_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_21, (_MM_PERM_ENUM)221); //B22(12-15) B23(12-15) B22(12-15) B23(12-15) B26(12-15) B27(12-15) B26(12-15) B27(12-15) + const __m512i rhs_mat_014589CD_21_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_21, (_MM_PERM_ENUM)221); //B20(12-15) B21(12-15) B20(12-15) B21(12-15) B24(12-15) B25(12-15) B24(12-15) B25(12-15) B28(12-15) B29(12-15) B28(12-15) B29(12-15) B2C(12-15) B2D(12-15) B2C(12-15) B2D(12-15) + const __m512i rhs_mat_2367ABEF_21_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_21, (_MM_PERM_ENUM)221); //B22(12-15) B23(12-15) B22(12-15) B23(12-15) B26(12-15) B27(12-15) B26(12-15) B27(12-15) B2A(12-15) B2B(12-15) B2A(12-15) B2B(12-15) B2E(12-15) B2F(12-15) B2E(12-15) B2F(12-15) - const __m512i rhs_mat_014589CD_30_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_30, (_MM_PERM_ENUM)221); //B30(4-7) B31(4-7) B30(4-7) B31(4-7) B34(4-7) B35(4-7) B34(4-7) B35(4-7) - const __m512i rhs_mat_2367ABEF_30_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_30, (_MM_PERM_ENUM)221); //B32(4-7) B33(4-7) B32(4-7) B33(4-7) B36(4-7) B37(4-7) B36(4-7) B37(4-7) + const __m512i rhs_mat_014589CD_30_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_30, (_MM_PERM_ENUM)221); //B30(4-7) B31(4-7) B30(4-7) B31(4-7) B34(4-7) B35(4-7) B34(4-7) B35(4-7) B38(4-7) B39(4-7) B38(4-7) B39(4-7) B3C(4-7) B3D(4-7) B3C(4-7) B3D(4-7) + const __m512i rhs_mat_2367ABEF_30_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_30, (_MM_PERM_ENUM)221); //B32(4-7) B33(4-7) B32(4-7) B33(4-7) B36(4-7) B37(4-7) B36(4-7) B37(4-7) B3A(4-7) B3B(4-7) B3A(4-7) B3B(4-7) B3E(4-7) B3F(4-7) B3E(4-7) B3F(4-7) - const __m512i rhs_mat_014589CD_31_sp2 = 
_mm512_shuffle_epi32(rhs_mat_014589CD_31, (_MM_PERM_ENUM)221); //B30(12-15) B31(12-15) B30(12-15) B31(12-15) B34(12-15) B35(12-15) B34(12-15) B35(12-15) - const __m512i rhs_mat_2367ABEF_31_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_31, (_MM_PERM_ENUM)221); //B32(12-15) B33(12-15) B32(12-15) B33(12-15) B36(12-15) B37(12-15) B36(12-15) B37(12-15) + const __m512i rhs_mat_014589CD_31_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_31, (_MM_PERM_ENUM)221); //B30(12-15) B31(12-15) B30(12-15) B31(12-15) B34(12-15) B35(12-15) B34(12-15) B35(12-15) B38(12-15) B39(12-15) B38(12-15) B39(12-15) B3C(12-15) B3D(12-15) B3C(12-15) B3D(12-15) + const __m512i rhs_mat_2367ABEF_31_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_31, (_MM_PERM_ENUM)221); //B32(12-15) B33(12-15) B32(12-15) B33(12-15) B36(12-15) B37(12-15) B36(12-15) B37(12-15) B3A(12-15) B3B(12-15) B3A(12-15) B3B(12-15) B3E(12-15) B3F(12-15) B3E(12-15) B3F(12-15) - const __m512i rhs_mat_014589CD_40_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_40, (_MM_PERM_ENUM)221); //B40(4-7) B41(4-7) B40(4-7) B41(4-7) B44(4-7) B45(4-7) B44(4-7) B45(4-7) - const __m512i rhs_mat_2367ABEF_40_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_40, (_MM_PERM_ENUM)221); //B42(4-7) B43(4-7) B42(4-7) B43(4-7) B46(4-7) B47(4-7) B46(4-7) B47(4-7) + const __m512i rhs_mat_014589CD_40_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_40, (_MM_PERM_ENUM)221); //B40(4-7) B41(4-7) B40(4-7) B41(4-7) B44(4-7) B45(4-7) B44(4-7) B45(4-7) B48(4-7) B49(4-7) B48(4-7) B49(4-7) B4C(4-7) B4D(4-7) B4C(4-7) B4D(4-7) + const __m512i rhs_mat_2367ABEF_40_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_40, (_MM_PERM_ENUM)221); //B42(4-7) B43(4-7) B42(4-7) B43(4-7) B46(4-7) B47(4-7) B46(4-7) B47(4-7) B4A(4-7) B4B(4-7) B4A(4-7) B4B(4-7) B4E(4-7) B4F(4-7) B4E(4-7) B4F(4-7) - const __m512i rhs_mat_014589CD_41_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_41, (_MM_PERM_ENUM)221); //B40(12-15) B41(12-15) B40(12-15) B41(12-15) B44(12-15) B45(12-15) B44(12-15) B45(12-15) - const __m512i rhs_mat_2367ABEF_41_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_41, (_MM_PERM_ENUM)221); //B42(12-15) B43(12-15) B42(12-15) B43(12-15) B46(12-15) B47(12-15) B46(12-15) B47(12-15) + const __m512i rhs_mat_014589CD_41_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_41, (_MM_PERM_ENUM)221); //B40(12-15) B41(12-15) B40(12-15) B41(12-15) B44(12-15) B45(12-15) B44(12-15) B45(12-15) B48(12-15) B49(12-15) B48(12-15) B49(12-15) B4C(12-15) B4D(12-15) B4C(12-15) B4D(12-15) + const __m512i rhs_mat_2367ABEF_41_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_41, (_MM_PERM_ENUM)221); //B42(12-15) B43(12-15) B42(12-15) B43(12-15) B46(12-15) B47(12-15) B46(12-15) B47(12-15) B4A(12-15) B4B(12-15) B4A(12-15) B4B(12-15) B4E(12-15) B4F(12-15) B4E(12-15) B4F(12-15) - const __m512i rhs_mat_014589CD_50_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_50, (_MM_PERM_ENUM)221); //B50(4-7) B51(4-7) B50(4-7) B51(4-7) B54(4-7) B55(4-7) B54(4-7) B55(4-7) - const __m512i rhs_mat_2367ABEF_50_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_50, (_MM_PERM_ENUM)221); //B52(4-7) B53(4-7) B52(4-7) B53(4-7) B56(4-7) B57(4-7) B56(4-7) B57(4-7) + const __m512i rhs_mat_014589CD_50_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_50, (_MM_PERM_ENUM)221); //B50(4-7) B51(4-7) B50(4-7) B51(4-7) B54(4-7) B55(4-7) B54(4-7) B55(4-7) B58(4-7) B59(4-7) B58(4-7) B59(4-7) B5C(4-7) B5D(4-7) B5C(4-7) B5D(4-7) + const __m512i rhs_mat_2367ABEF_50_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_50, (_MM_PERM_ENUM)221); //B52(4-7) B53(4-7) B52(4-7) B53(4-7) B56(4-7) B57(4-7) B56(4-7) B57(4-7) B5A(4-7) B5B(4-7) B5A(4-7) B5B(4-7) 
B5E(4-7) B5F(4-7) B5E(4-7) B5F(4-7) - const __m512i rhs_mat_014589CD_51_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_51, (_MM_PERM_ENUM)221); //B50(12-15) B51(12-15) B50(12-15) B51(12-15) B54(12-15) B55(12-15) B54(12-15) B55(12-15) - const __m512i rhs_mat_2367ABEF_51_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_51, (_MM_PERM_ENUM)221); //B52(12-15) B53(12-15) B52(12-15) B53(12-15) B56(12-15) B57(12-15) B56(12-15) B57(12-15) + const __m512i rhs_mat_014589CD_51_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_51, (_MM_PERM_ENUM)221); //B50(12-15) B51(12-15) B50(12-15) B51(12-15) B54(12-15) B55(12-15) B54(12-15) B55(12-15) B58(12-15) B59(12-15) B58(12-15) B59(12-15) B5C(12-15) B5D(12-15) B5C(12-15) B5D(12-15) + const __m512i rhs_mat_2367ABEF_51_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_51, (_MM_PERM_ENUM)221); //B52(12-15) B53(12-15) B52(12-15) B53(12-15) B56(12-15) B57(12-15) B56(12-15) B57(12-15) B5A(12-15) B5B(12-15) B5A(12-15) B5B(12-15) B5E(12-15) B5F(12-15) B5E(12-15) B5F(12-15) - const __m512i rhs_mat_014589CD_60_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_60, (_MM_PERM_ENUM)221); //B60(4-7) B61(4-7) B60(4-7) B61(4-7) B64(4-7) B65(4-7) B64(4-7) B65(4-7) - const __m512i rhs_mat_2367ABEF_60_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_60, (_MM_PERM_ENUM)221); //B62(4-7) B63(4-7) B62(4-7) B63(4-7) B66(4-7) B67(4-7) B66(4-7) B67(4-7) + const __m512i rhs_mat_014589CD_60_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_60, (_MM_PERM_ENUM)221); //B60(4-7) B61(4-7) B60(4-7) B61(4-7) B64(4-7) B65(4-7) B64(4-7) B65(4-7) B68(4-7) B69(4-7) B68(4-7) B69(4-7) B6C(4-7) B6D(4-7) B6C(4-7) B6D(4-7) + const __m512i rhs_mat_2367ABEF_60_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_60, (_MM_PERM_ENUM)221); //B62(4-7) B63(4-7) B62(4-7) B63(4-7) B66(4-7) B67(4-7) B66(4-7) B67(4-7) B6A(4-7) B6B(4-7) B6A(4-7) B6B(4-7) B6E(4-7) B6F(4-7) B6E(4-7) B6F(4-7) - const __m512i rhs_mat_014589CD_61_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_61, (_MM_PERM_ENUM)221); //B60(12-15) B61(12-15) B60(12-15) B61(12-15) B64(12-15) B65(12-15) B64(12-15) B65(12-15) - const __m512i rhs_mat_2367ABEF_61_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_61, (_MM_PERM_ENUM)221); //B62(12-15) B63(12-15) B62(12-15) B63(12-15) B66(12-15) B67(12-15) B66(12-15) B67(12-15) + const __m512i rhs_mat_014589CD_61_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_61, (_MM_PERM_ENUM)221); //B60(12-15) B61(12-15) B60(12-15) B61(12-15) B64(12-15) B65(12-15) B64(12-15) B65(12-15) B68(12-15) B69(12-15) B68(12-15) B69(12-15) B6C(12-15) B6D(12-15) B6C(12-15) B6D(12-15) + const __m512i rhs_mat_2367ABEF_61_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_61, (_MM_PERM_ENUM)221); //B62(12-15) B63(12-15) B62(12-15) B63(12-15) B66(12-15) B67(12-15) B66(12-15) B67(12-15) B6A(12-15) B6B(12-15) B6A(12-15) B6B(12-15) B6E(12-15) B6F(12-15) B6E(12-15) B6F(12-15) - const __m512i rhs_mat_014589CD_70_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_70, (_MM_PERM_ENUM)221); //B70(4-7) B71(4-7) B70(4-7) B71(4-7) B74(4-7) B75(4-7) B74(4-7) B75(4-7) - const __m512i rhs_mat_2367ABEF_70_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_70, (_MM_PERM_ENUM)221); //B72(4-7) B73(4-7) B72(4-7) B73(4-7) B76(4-7) B77(4-7) B76(4-7) B77(4-7) + const __m512i rhs_mat_014589CD_70_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_70, (_MM_PERM_ENUM)221); //B70(4-7) B71(4-7) B70(4-7) B71(4-7) B74(4-7) B75(4-7) B74(4-7) B75(4-7) B78(4-7) B79(4-7) B78(4-7) B79(4-7) B7C(4-7) B7D(4-7) B7C(4-7) B7D(4-7) + const __m512i rhs_mat_2367ABEF_70_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_70, (_MM_PERM_ENUM)221); //B72(4-7) B73(4-7) B72(4-7) 
B73(4-7) B76(4-7) B77(4-7) B76(4-7) B77(4-7) B7A(4-7) B7B(4-7) B7A(4-7) B7B(4-7) B7E(4-7) B7F(4-7) B7E(4-7) B7F(4-7) - const __m512i rhs_mat_014589CD_71_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_71, (_MM_PERM_ENUM)221); //B70(12-15) B71(12-15) B70(12-15) B71(12-15) B74(12-15) B75(12-15) B74(12-15) B75(12-15) - const __m512i rhs_mat_2367ABEF_71_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_71, (_MM_PERM_ENUM)221); //B72(12-15) B73(12-15) B72(12-15) B73(12-15) B76(12-15) B77(12-15) B76(12-15) B77(12-15) + const __m512i rhs_mat_014589CD_71_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_71, (_MM_PERM_ENUM)221); //B70(12-15) B71(12-15) B70(12-15) B71(12-15) B74(12-15) B75(12-15) B74(12-15) B75(12-15) B78(12-15) B79(12-15) B78(12-15) B79(12-15) B7C(12-15) B7D(12-15) B7C(12-15) B7D(12-15) + const __m512i rhs_mat_2367ABEF_71_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_71, (_MM_PERM_ENUM)221); //B72(12-15) B73(12-15) B72(12-15) B73(12-15) B76(12-15) B77(12-15) B76(12-15) B77(12-15) B7A(12-15) B7B(12-15) B7A(12-15) B7B(12-15) B7E(12-15) B7F(12-15) B7E(12-15) B7F(12-15) //Scales of corresponding sub blocks from different Q6_K structures are stored together //s00 s01 s10 s11 s20 s21 ...... s70 s71 - // Combine mins and scales for sub-blocks: 0-1, 2-3, 4-5, 6-7 in the sb loop const __m128i scales_01_0 = _mm_loadu_si128((const __m128i *)(b_ptr_0[b].scales + sb * 64)); const __m128i scales_23_0 = _mm_loadu_si128((const __m128i *)(b_ptr_0[b].scales + 16 + sb * 64)); const __m128i scales_45_0 = _mm_loadu_si128((const __m128i *)(b_ptr_0[b].scales + 32 + sb * 64)); @@ -7114,7 +7124,6 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const __m128i scales_45_1 = _mm_loadu_si128((const __m128i *)(b_ptr_1[b].scales + 32 + sb * 64)); const __m128i scales_67_1 = _mm_loadu_si128((const __m128i *)(b_ptr_1[b].scales + 48 + sb * 64)); - // Combine mins and scales for sub-blocks: 0-1, 2-3, 4-5, 6-7 in the sb loop const __m256i scales_01 = _mm256_insertf128_si256(_mm256_castsi128_si256(scales_01_0), scales_01_1, 1); const __m256i scales_23 = _mm256_insertf128_si256(_mm256_castsi128_si256(scales_23_0), scales_23_1, 1); const __m256i scales_45 = _mm256_insertf128_si256(_mm256_castsi128_si256(scales_45_0), scales_45_1, 1); @@ -7207,7 +7216,6 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo __m256i lhs_mat_ymm_01_71 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_71, lhs_mat_ymm_0123_71, 0); __m256i lhs_mat_ymm_23_71 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_71, lhs_mat_ymm_0123_71, 17); - __m512i lhs_mat_01_00 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_00), lhs_mat_ymm_01_00, 1); __m512i lhs_mat_23_00 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_00), lhs_mat_ymm_23_00, 1); __m512i lhs_mat_01_01 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_01), lhs_mat_ymm_01_01, 1); @@ -7281,201 +7289,205 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo __m512i lhs_mat_s_01_71 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_01_71); __m512i lhs_mat_s_23_71 = _mm512_maddubs_epi16(m32s_expanded, lhs_mat_23_71); - // Shuffle pattern one - left side input - const __m512i lhs_mat_01_00_sp1 = _mm512_shuffle_epi32(lhs_mat_01_00, (_MM_PERM_ENUM)160); //A00(0-3) A00(0-3) A01(0-3) A01(0-3) A00(0-3) A00(0-3) A01(0-3) A01(0-3) - const __m512i lhs_mat_23_00_sp1 = _mm512_shuffle_epi32(lhs_mat_23_00, (_MM_PERM_ENUM)160); //A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) + // Shuffle pattern 
one – left-side input - const __m512i lhs_mat_01_01_sp1 = _mm512_shuffle_epi32(lhs_mat_01_01, (_MM_PERM_ENUM)160); //A00(8-11) A00(8-11) A01(8-11) A01(8-11) A00(8-11) A00(8-11) A01(8-11) A01(8-11) - const __m512i lhs_mat_23_01_sp1 = _mm512_shuffle_epi32(lhs_mat_23_01, (_MM_PERM_ENUM)160); //A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) + const __m512i lhs_mat_01_00_sp1 = _mm512_shuffle_epi32(lhs_mat_01_00, (_MM_PERM_ENUM)160); //A00(0-3) A00(0-3) A01(0-3) A01(0-3) + const __m512i lhs_mat_23_00_sp1 = _mm512_shuffle_epi32(lhs_mat_23_00, (_MM_PERM_ENUM)160); //A02(0-3) A03(0-3) A02(0-3) A03(0-3) - const __m512i lhs_mat_01_10_sp1 = _mm512_shuffle_epi32(lhs_mat_01_10, (_MM_PERM_ENUM)160); //A10(0-3) A10(0-3) A11(0-3) A11(0-3) A10(0-3) A10(0-3) A11(0-3) A11(0-3) - const __m512i lhs_mat_23_10_sp1 = _mm512_shuffle_epi32(lhs_mat_23_10, (_MM_PERM_ENUM)160); //A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) + const __m512i lhs_mat_01_01_sp1 = _mm512_shuffle_epi32(lhs_mat_01_01, (_MM_PERM_ENUM)160); //A00(8-11) A00(8-11) A01(8-11) A01(8-11) + const __m512i lhs_mat_23_01_sp1 = _mm512_shuffle_epi32(lhs_mat_23_01, (_MM_PERM_ENUM)160); //A02(8-11) A03(8-11) A02(8-11) A03(8-11) - const __m512i lhs_mat_01_11_sp1 = _mm512_shuffle_epi32(lhs_mat_01_11, (_MM_PERM_ENUM)160); //A10(8-11) A10(8-11) A11(8-11) A11(8-11) A10(8-11) A10(8-11) A11(8-11) A11(8-11) - const __m512i lhs_mat_23_11_sp1 = _mm512_shuffle_epi32(lhs_mat_23_11, (_MM_PERM_ENUM)160); //A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) + const __m512i lhs_mat_01_10_sp1 = _mm512_shuffle_epi32(lhs_mat_01_10, (_MM_PERM_ENUM)160); //A10(0-3) A10(0-3) A11(0-3) A11(0-3) + const __m512i lhs_mat_23_10_sp1 = _mm512_shuffle_epi32(lhs_mat_23_10, (_MM_PERM_ENUM)160); //A12(0-3) A13(0-3) A12(0-3) A13(0-3) - const __m512i lhs_mat_01_20_sp1 = _mm512_shuffle_epi32(lhs_mat_01_20, (_MM_PERM_ENUM)160); //A20(0-3) A20(0-3) A21(0-3) A21(0-3) A20(0-3) A20(0-3) A21(0-3) A21(0-3) - const __m512i lhs_mat_23_20_sp1 = _mm512_shuffle_epi32(lhs_mat_23_20, (_MM_PERM_ENUM)160); //A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) A23(0-3) + const __m512i lhs_mat_01_11_sp1 = _mm512_shuffle_epi32(lhs_mat_01_11, (_MM_PERM_ENUM)160); //A10(8-11) A10(8-11) A11(8-11) A11(8-11) + const __m512i lhs_mat_23_11_sp1 = _mm512_shuffle_epi32(lhs_mat_23_11, (_MM_PERM_ENUM)160); //A12(8-11) A13(8-11) A12(8-11) A13(8-11) - const __m512i lhs_mat_01_21_sp1 = _mm512_shuffle_epi32(lhs_mat_01_21, (_MM_PERM_ENUM)160); //A20(8-11) A20(8-11) A21(8-11) A21(8-11) A20(8-11) A20(8-11) A21(8-11) A21(8-11) - const __m512i lhs_mat_23_21_sp1 = _mm512_shuffle_epi32(lhs_mat_23_21, (_MM_PERM_ENUM)160); //A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11) + const __m512i lhs_mat_01_20_sp1 = _mm512_shuffle_epi32(lhs_mat_01_20, (_MM_PERM_ENUM)160); //A20(0-3) A20(0-3) A21(0-3) A21(0-3) + const __m512i lhs_mat_23_20_sp1 = _mm512_shuffle_epi32(lhs_mat_23_20, (_MM_PERM_ENUM)160); //A22(0-3) A23(0-3) A22(0-3) A23(0-3) - const __m512i lhs_mat_01_30_sp1 = _mm512_shuffle_epi32(lhs_mat_01_30, (_MM_PERM_ENUM)160); //A30(0-3) A30(0-3) A31(0-3) A31(0-3) A30(0-3) A30(0-3) A31(0-3) A31(0-3) - const __m512i lhs_mat_23_30_sp1 = _mm512_shuffle_epi32(lhs_mat_23_30, (_MM_PERM_ENUM)160); //A32(0-3) A33(0-3) A32(0-3) A33(0-3) A32(0-3) A33(0-3) A32(0-3) A33(0-3) + const __m512i lhs_mat_01_21_sp1 = _mm512_shuffle_epi32(lhs_mat_01_21, (_MM_PERM_ENUM)160); //A20(8-11) A20(8-11) A21(8-11) A21(8-11) + const __m512i 
lhs_mat_23_21_sp1 = _mm512_shuffle_epi32(lhs_mat_23_21, (_MM_PERM_ENUM)160); //A22(8-11) A23(8-11) A22(8-11) A23(8-11) - const __m512i lhs_mat_01_31_sp1 = _mm512_shuffle_epi32(lhs_mat_01_31, (_MM_PERM_ENUM)160); //A30(8-11) A30(8-11) A31(8-11) A31(8-11) A30(8-11) A30(8-11) A31(8-11) A31(8-11) - const __m512i lhs_mat_23_31_sp1 = _mm512_shuffle_epi32(lhs_mat_23_31, (_MM_PERM_ENUM)160); //A32(8-11) A33(8-11) A32(8-11) A33(8-11) A32(8-11) A33(8-11) A32(8-11) A33(8-11) + const __m512i lhs_mat_01_30_sp1 = _mm512_shuffle_epi32(lhs_mat_01_30, (_MM_PERM_ENUM)160); //A30(0-3) A30(0-3) A31(0-3) A31(0-3) + const __m512i lhs_mat_23_30_sp1 = _mm512_shuffle_epi32(lhs_mat_23_30, (_MM_PERM_ENUM)160); //A32(0-3) A33(0-3) A32(0-3) A33(0-3) - const __m512i lhs_mat_01_40_sp1 = _mm512_shuffle_epi32(lhs_mat_01_40, (_MM_PERM_ENUM)160); //A40(0-3) A40(0-3) A41(0-3) A41(0-3) A40(0-3) A40(0-3) A41(0-3) A41(0-3) - const __m512i lhs_mat_23_40_sp1 = _mm512_shuffle_epi32(lhs_mat_23_40, (_MM_PERM_ENUM)160); //A42(0-3) A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3) + const __m512i lhs_mat_01_31_sp1 = _mm512_shuffle_epi32(lhs_mat_01_31, (_MM_PERM_ENUM)160); //A30(8-11) A30(8-11) A31(8-11) A31(8-11) + const __m512i lhs_mat_23_31_sp1 = _mm512_shuffle_epi32(lhs_mat_23_31, (_MM_PERM_ENUM)160); //A32(8-11) A33(8-11) A32(8-11) A33(8-11) - const __m512i lhs_mat_01_41_sp1 = _mm512_shuffle_epi32(lhs_mat_01_41, (_MM_PERM_ENUM)160); //A40(8-11) A40(8-11) A41(8-11) A41(8-11) A40(8-11) A40(8-11) A41(8-11) A41(8-11) - const __m512i lhs_mat_23_41_sp1 = _mm512_shuffle_epi32(lhs_mat_23_41, (_MM_PERM_ENUM)160); //A42(8-11) A43(8-11) A42(8-11) A43(8-11) A42(8-11) A43(8-11) A42(8-11) A43(8-11) + const __m512i lhs_mat_01_40_sp1 = _mm512_shuffle_epi32(lhs_mat_01_40, (_MM_PERM_ENUM)160); //A40(0-3) A40(0-3) A41(0-3) A41(0-3) + const __m512i lhs_mat_23_40_sp1 = _mm512_shuffle_epi32(lhs_mat_23_40, (_MM_PERM_ENUM)160); //A42(0-3) A43(0-3) A42(0-3) A43(0-3) - const __m512i lhs_mat_01_50_sp1 = _mm512_shuffle_epi32(lhs_mat_01_50, (_MM_PERM_ENUM)160); //A50(0-3) A50(0-3) A51(0-3) A51(0-3) A50(0-3) A50(0-3) A51(0-3) A51(0-3) - const __m512i lhs_mat_23_50_sp1 = _mm512_shuffle_epi32(lhs_mat_23_50, (_MM_PERM_ENUM)160); //A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3) + const __m512i lhs_mat_01_41_sp1 = _mm512_shuffle_epi32(lhs_mat_01_41, (_MM_PERM_ENUM)160); //A40(8-11) A40(8-11) A41(8-11) A41(8-11) + const __m512i lhs_mat_23_41_sp1 = _mm512_shuffle_epi32(lhs_mat_23_41, (_MM_PERM_ENUM)160); //A42(8-11) A43(8-11) A42(8-11) A43(8-11) - const __m512i lhs_mat_01_51_sp1 = _mm512_shuffle_epi32(lhs_mat_01_51, (_MM_PERM_ENUM)160); //A50(8-11) A50(8-11) A51(8-11) A51(8-11) A50(8-11) A50(8-11) A51(8-11) A51(8-11) - const __m512i lhs_mat_23_51_sp1 = _mm512_shuffle_epi32(lhs_mat_23_51, (_MM_PERM_ENUM)160); //A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11) + const __m512i lhs_mat_01_50_sp1 = _mm512_shuffle_epi32(lhs_mat_01_50, (_MM_PERM_ENUM)160); //A50(0-3) A50(0-3) A51(0-3) A51(0-3) + const __m512i lhs_mat_23_50_sp1 = _mm512_shuffle_epi32(lhs_mat_23_50, (_MM_PERM_ENUM)160); //A52(0-3) A53(0-3) A52(0-3) A53(0-3) - const __m512i lhs_mat_01_60_sp1 = _mm512_shuffle_epi32(lhs_mat_01_60, (_MM_PERM_ENUM)160); //A60(0-3) A60(0-3) A61(0-3) A61(0-3) A60(0-3) A60(0-3) A61(0-3) A61(0-3) - const __m512i lhs_mat_23_60_sp1 = _mm512_shuffle_epi32(lhs_mat_23_60, (_MM_PERM_ENUM)160); //A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3) + const __m512i lhs_mat_01_51_sp1 = 
_mm512_shuffle_epi32(lhs_mat_01_51, (_MM_PERM_ENUM)160); //A50(8-11) A50(8-11) A51(8-11) A51(8-11) + const __m512i lhs_mat_23_51_sp1 = _mm512_shuffle_epi32(lhs_mat_23_51, (_MM_PERM_ENUM)160); //A52(8-11) A53(8-11) A52(8-11) A53(8-11) - const __m512i lhs_mat_01_61_sp1 = _mm512_shuffle_epi32(lhs_mat_01_61, (_MM_PERM_ENUM)160); //A60(8-11) A60(8-11) A61(8-11) A61(8-11) A60(8-11) A60(8-11) A61(8-11) A61(8-11) - const __m512i lhs_mat_23_61_sp1 = _mm512_shuffle_epi32(lhs_mat_23_61, (_MM_PERM_ENUM)160); //A62(8-11) A63(8-11) A62(8-11) A63(8-11) A62(8-11) A63(8-11) A62(8-11) A63(8-11) + const __m512i lhs_mat_01_60_sp1 = _mm512_shuffle_epi32(lhs_mat_01_60, (_MM_PERM_ENUM)160); //A60(0-3) A60(0-3) A61(0-3) A61(0-3) + const __m512i lhs_mat_23_60_sp1 = _mm512_shuffle_epi32(lhs_mat_23_60, (_MM_PERM_ENUM)160); //A62(0-3) A63(0-3) A62(0-3) A63(0-3) - const __m512i lhs_mat_01_70_sp1 = _mm512_shuffle_epi32(lhs_mat_01_70, (_MM_PERM_ENUM)160); //A70(0-3) A70(0-3) A71(0-3) A71(0-3) A70(0-3) A70(0-3) A71(0-3) A71(0-3) - const __m512i lhs_mat_23_70_sp1 = _mm512_shuffle_epi32(lhs_mat_23_70, (_MM_PERM_ENUM)160); //A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3) + const __m512i lhs_mat_01_61_sp1 = _mm512_shuffle_epi32(lhs_mat_01_61, (_MM_PERM_ENUM)160); //A60(8-11) A60(8-11) A61(8-11) A61(8-11) + const __m512i lhs_mat_23_61_sp1 = _mm512_shuffle_epi32(lhs_mat_23_61, (_MM_PERM_ENUM)160); //A62(8-11) A63(8-11) A62(8-11) A63(8-11) - const __m512i lhs_mat_01_71_sp1 = _mm512_shuffle_epi32(lhs_mat_01_71, (_MM_PERM_ENUM)160); //A70(8-11) A70(8-11) A71(8-11) A71(8-11) A70(8-11) A70(8-11) A71(8-11) A71(8-11) - const __m512i lhs_mat_23_71_sp1 = _mm512_shuffle_epi32(lhs_mat_23_71, (_MM_PERM_ENUM)160); //A72(8-11) A73(8-11) A72(8-11) A73(8-11) A72(8-11) A73(8-11) A72(8-11) A73(8-11) + const __m512i lhs_mat_01_70_sp1 = _mm512_shuffle_epi32(lhs_mat_01_70, (_MM_PERM_ENUM)160); //A70(0-3) A70(0-3) A71(0-3) A71(0-3) + const __m512i lhs_mat_23_70_sp1 = _mm512_shuffle_epi32(lhs_mat_23_70, (_MM_PERM_ENUM)160); //A72(0-3) A73(0-3) A72(0-3) A73(0-3) - // Shuffle pattern two- left side input - const __m512i lhs_mat_01_00_sp2 = _mm512_shuffle_epi32(lhs_mat_01_00, (_MM_PERM_ENUM)245); //A00(4-7) A00(4-7) A01(4-7) A01(4-7) A00(4-7) A00(4-7) A01(4-7) A01(4-7) - const __m512i lhs_mat_23_00_sp2 = _mm512_shuffle_epi32(lhs_mat_23_00, (_MM_PERM_ENUM)245); //A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) + const __m512i lhs_mat_01_71_sp1 = _mm512_shuffle_epi32(lhs_mat_01_71, (_MM_PERM_ENUM)160); //A70(8-11) A70(8-11) A71(8-11) A71(8-11) + const __m512i lhs_mat_23_71_sp1 = _mm512_shuffle_epi32(lhs_mat_23_71, (_MM_PERM_ENUM)160); //A72(8-11) A73(8-11) A72(8-11) A73(8-11) - const __m512i lhs_mat_01_01_sp2 = _mm512_shuffle_epi32(lhs_mat_01_01, (_MM_PERM_ENUM)245); //A00(12-15) A00(12-15) A01(12-15) A01(12-15) A00(12-15) A00(12-15) A01(12-15) A01(12-15) - const __m512i lhs_mat_23_01_sp2 = _mm512_shuffle_epi32(lhs_mat_23_01, (_MM_PERM_ENUM)245); //A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) + // Shuffle pattern two – left-side input - const __m512i lhs_mat_01_10_sp2 = _mm512_shuffle_epi32(lhs_mat_01_10, (_MM_PERM_ENUM)245); //A10(4-7) A10(4-7) A11(4-7) A11(4-7) A10(4-7) A10(4-7) A11(4-7) A11(4-7) - const __m512i lhs_mat_23_10_sp2 = _mm512_shuffle_epi32(lhs_mat_23_10, (_MM_PERM_ENUM)245); //A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) + const __m512i lhs_mat_01_00_sp2 = _mm512_shuffle_epi32(lhs_mat_01_00, (_MM_PERM_ENUM)245); //A00(4-7) 
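+ // Note: the left side immediates are 160 (0b10100000) = dwords {0,0,2,2} for pattern one and 245 (0b11110101) = {1,1,3,3}
+ // for pattern two. Broadcasting the A groups pairwise against the rhs {0,2,0,2} / {1,3,1,3} patterns leaves each 128 bit
+ // lane of the upcoming maddubs holding the row/column products of the pair, e.g. for the 01 registers:
+ // A00*B00, A00*B01, A01*B00, A01*B01.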
A00(4-7) A01(4-7) A01(4-7) + const __m512i lhs_mat_23_00_sp2 = _mm512_shuffle_epi32(lhs_mat_23_00, (_MM_PERM_ENUM)245); //A02(4-7) A03(4-7) A02(4-7) A03(4-7) - const __m512i lhs_mat_01_11_sp2 = _mm512_shuffle_epi32(lhs_mat_01_11, (_MM_PERM_ENUM)245); //A10(12-15) A10(12-15) A11(12-15) A11(12-15) A10(12-15) A10(12-15) A11(12-15) A11(12-15) - const __m512i lhs_mat_23_11_sp2 = _mm512_shuffle_epi32(lhs_mat_23_11, (_MM_PERM_ENUM)245); //A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) + const __m512i lhs_mat_01_01_sp2 = _mm512_shuffle_epi32(lhs_mat_01_01, (_MM_PERM_ENUM)245); //A00(12-15) A00(12-15) A01(12-15) A01(12-15) + const __m512i lhs_mat_23_01_sp2 = _mm512_shuffle_epi32(lhs_mat_23_01, (_MM_PERM_ENUM)245); //A02(12-15) A03(12-15) A02(12-15) A03(12-15) - const __m512i lhs_mat_01_20_sp2 = _mm512_shuffle_epi32(lhs_mat_01_20, (_MM_PERM_ENUM)245); //A20(4-7) A20(4-7) A21(4-7) A21(4-7) A20(4-7) A20(4-7) A21(4-7) A21(4-7) - const __m512i lhs_mat_23_20_sp2 = _mm512_shuffle_epi32(lhs_mat_23_20, (_MM_PERM_ENUM)245); //A22(4-7) A23(4-7) A22(4-7) A23(4-7) A22(4-7) A23(4-7) A22(4-7) A23(4-7) + const __m512i lhs_mat_01_10_sp2 = _mm512_shuffle_epi32(lhs_mat_01_10, (_MM_PERM_ENUM)245); //A10(4-7) A10(4-7) A11(4-7) A11(4-7) + const __m512i lhs_mat_23_10_sp2 = _mm512_shuffle_epi32(lhs_mat_23_10, (_MM_PERM_ENUM)245); //A12(4-7) A13(4-7) A12(4-7) A13(4-7) - const __m512i lhs_mat_01_21_sp2 = _mm512_shuffle_epi32(lhs_mat_01_21, (_MM_PERM_ENUM)245); //A20(12-15) A20(12-15) A21(12-15) A21(12-15) A20(12-15) A20(12-15) A21(12-15) A21(12-15) - const __m512i lhs_mat_23_21_sp2 = _mm512_shuffle_epi32(lhs_mat_23_21, (_MM_PERM_ENUM)245); //A22(12-15) A23(12-15) A22(12-15) A23(12-15) A22(12-15) A23(12-15) A22(12-15) A23(12-15) + const __m512i lhs_mat_01_11_sp2 = _mm512_shuffle_epi32(lhs_mat_01_11, (_MM_PERM_ENUM)245); //A10(12-15) A10(12-15) A11(12-15) A11(12-15) + const __m512i lhs_mat_23_11_sp2 = _mm512_shuffle_epi32(lhs_mat_23_11, (_MM_PERM_ENUM)245); //A12(12-15) A13(12-15) A12(12-15) A13(12-15) - const __m512i lhs_mat_01_30_sp2 = _mm512_shuffle_epi32(lhs_mat_01_30, (_MM_PERM_ENUM)245); //A30(4-7) A30(4-7) A31(4-7) A31(4-7) A30(4-7) A30(4-7) A31(4-7) A31(4-7) - const __m512i lhs_mat_23_30_sp2 = _mm512_shuffle_epi32(lhs_mat_23_30, (_MM_PERM_ENUM)245); //A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7) + const __m512i lhs_mat_01_20_sp2 = _mm512_shuffle_epi32(lhs_mat_01_20, (_MM_PERM_ENUM)245); //A20(4-7) A20(4-7) A21(4-7) A21(4-7) + const __m512i lhs_mat_23_20_sp2 = _mm512_shuffle_epi32(lhs_mat_23_20, (_MM_PERM_ENUM)245); //A22(4-7) A23(4-7) A22(4-7) A23(4-7) - const __m512i lhs_mat_01_31_sp2 = _mm512_shuffle_epi32(lhs_mat_01_31, (_MM_PERM_ENUM)245); //A30(12-15) A30(12-15) A31(12-15) A31(12-15) A30(12-15) A30(12-15) A31(12-15) A31(12-15) - const __m512i lhs_mat_23_31_sp2 = _mm512_shuffle_epi32(lhs_mat_23_31, (_MM_PERM_ENUM)245); //A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15) + const __m512i lhs_mat_01_21_sp2 = _mm512_shuffle_epi32(lhs_mat_01_21, (_MM_PERM_ENUM)245); //A20(12-15) A20(12-15) A21(12-15) A21(12-15) + const __m512i lhs_mat_23_21_sp2 = _mm512_shuffle_epi32(lhs_mat_23_21, (_MM_PERM_ENUM)245); //A22(12-15) A23(12-15) A22(12-15) A23(12-15) - const __m512i lhs_mat_01_40_sp2 = _mm512_shuffle_epi32(lhs_mat_01_40, (_MM_PERM_ENUM)245); //A40(4-7) A40(4-7) A41(4-7) A41(4-7) A40(4-7) A40(4-7) A41(4-7) A41(4-7) - const __m512i lhs_mat_23_40_sp2 = _mm512_shuffle_epi32(lhs_mat_23_40, (_MM_PERM_ENUM)245); //A42(4-7) 
A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7) + const __m512i lhs_mat_01_30_sp2 = _mm512_shuffle_epi32(lhs_mat_01_30, (_MM_PERM_ENUM)245); //A30(4-7) A30(4-7) A31(4-7) A31(4-7) + const __m512i lhs_mat_23_30_sp2 = _mm512_shuffle_epi32(lhs_mat_23_30, (_MM_PERM_ENUM)245); //A32(4-7) A33(4-7) A32(4-7) A33(4-7) - const __m512i lhs_mat_01_41_sp2 = _mm512_shuffle_epi32(lhs_mat_01_41, (_MM_PERM_ENUM)245); //A40(12-15) A40(12-15) A41(12-15) A41(12-15) A40(12-15) A40(12-15) A41(12-15) A41(12-15) - const __m512i lhs_mat_23_41_sp2 = _mm512_shuffle_epi32(lhs_mat_23_41, (_MM_PERM_ENUM)245); //A42(12-15) A43(12-15) A42(12-15) A43(12-15) A42(12-15) A43(12-15) A42(12-15) A43(12-15) + const __m512i lhs_mat_01_31_sp2 = _mm512_shuffle_epi32(lhs_mat_01_31, (_MM_PERM_ENUM)245); //A30(12-15) A30(12-15) A31(12-15) A31(12-15) + const __m512i lhs_mat_23_31_sp2 = _mm512_shuffle_epi32(lhs_mat_23_31, (_MM_PERM_ENUM)245); //A32(12-15) A33(12-15) A32(12-15) A33(12-15) - const __m512i lhs_mat_01_50_sp2 = _mm512_shuffle_epi32(lhs_mat_01_50, (_MM_PERM_ENUM)245); //A50(4-7) A50(4-7) A51(4-7) A51(4-7) A50(4-7) A50(4-7) A51(4-7) A51(4-7) - const __m512i lhs_mat_23_50_sp2 = _mm512_shuffle_epi32(lhs_mat_23_50, (_MM_PERM_ENUM)245); //A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7) + const __m512i lhs_mat_01_40_sp2 = _mm512_shuffle_epi32(lhs_mat_01_40, (_MM_PERM_ENUM)245); //A40(4-7) A40(4-7) A41(4-7) A41(4-7) + const __m512i lhs_mat_23_40_sp2 = _mm512_shuffle_epi32(lhs_mat_23_40, (_MM_PERM_ENUM)245); //A42(4-7) A43(4-7) A42(4-7) A43(4-7) - const __m512i lhs_mat_01_51_sp2 = _mm512_shuffle_epi32(lhs_mat_01_51, (_MM_PERM_ENUM)245); //A50(12-15) A50(12-15) A51(12-15) A51(12-15) A50(12-15) A50(12-15) A51(12-15) A51(12-15) - const __m512i lhs_mat_23_51_sp2 = _mm512_shuffle_epi32(lhs_mat_23_51, (_MM_PERM_ENUM)245); //A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) A53(12-15) + const __m512i lhs_mat_01_41_sp2 = _mm512_shuffle_epi32(lhs_mat_01_41, (_MM_PERM_ENUM)245); //A40(12-15) A40(12-15) A41(12-15) A41(12-15) + const __m512i lhs_mat_23_41_sp2 = _mm512_shuffle_epi32(lhs_mat_23_41, (_MM_PERM_ENUM)245); //A42(12-15) A43(12-15) A42(12-15) A43(12-15) - const __m512i lhs_mat_01_60_sp2 = _mm512_shuffle_epi32(lhs_mat_01_60, (_MM_PERM_ENUM)245); //A60(4-7) A60(4-7) A61(4-7) A61(4-7) A60(4-7) A60(4-7) A61(4-7) A61(4-7) - const __m512i lhs_mat_23_60_sp2 = _mm512_shuffle_epi32(lhs_mat_23_60, (_MM_PERM_ENUM)245); //A62(4-7) A63(4-7) A62(4-7) A63(4-7) A62(4-7) A63(4-7) A62(4-7) A63(4-7) + const __m512i lhs_mat_01_50_sp2 = _mm512_shuffle_epi32(lhs_mat_01_50, (_MM_PERM_ENUM)245); //A50(4-7) A50(4-7) A51(4-7) A51(4-7) + const __m512i lhs_mat_23_50_sp2 = _mm512_shuffle_epi32(lhs_mat_23_50, (_MM_PERM_ENUM)245); //A52(4-7) A53(4-7) A52(4-7) A53(4-7) - const __m512i lhs_mat_01_61_sp2 = _mm512_shuffle_epi32(lhs_mat_01_61, (_MM_PERM_ENUM)245); //A60(12-15) A60(12-15) A61(12-15) A61(12-15) A60(12-15) A60(12-15) A61(12-15) A61(12-15) - const __m512i lhs_mat_23_61_sp2 = _mm512_shuffle_epi32(lhs_mat_23_61, (_MM_PERM_ENUM)245); //A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15) + const __m512i lhs_mat_01_51_sp2 = _mm512_shuffle_epi32(lhs_mat_01_51, (_MM_PERM_ENUM)245); //A50(12-15) A50(12-15) A51(12-15) A51(12-15) + const __m512i lhs_mat_23_51_sp2 = _mm512_shuffle_epi32(lhs_mat_23_51, (_MM_PERM_ENUM)245); //A52(12-15) A53(12-15) A52(12-15) A53(12-15) - const __m512i lhs_mat_01_70_sp2 = _mm512_shuffle_epi32(lhs_mat_01_70, (_MM_PERM_ENUM)245); //A70(4-7) 
A70(4-7) A71(4-7) A71(4-7) A70(4-7) A70(4-7) A71(4-7) A71(4-7) - const __m512i lhs_mat_23_70_sp2 = _mm512_shuffle_epi32(lhs_mat_23_70, (_MM_PERM_ENUM)245); //A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7) + const __m512i lhs_mat_01_60_sp2 = _mm512_shuffle_epi32(lhs_mat_01_60, (_MM_PERM_ENUM)245); //A60(4-7) A60(4-7) A61(4-7) A61(4-7) + const __m512i lhs_mat_23_60_sp2 = _mm512_shuffle_epi32(lhs_mat_23_60, (_MM_PERM_ENUM)245); //A62(4-7) A63(4-7) A62(4-7) A63(4-7) - const __m512i lhs_mat_01_71_sp2 = _mm512_shuffle_epi32(lhs_mat_01_71, (_MM_PERM_ENUM)245); //A70(12-15) A70(12-15) A71(12-15) A71(12-15) A70(12-15) A70(12-15) A71(12-15) A71(12-15) - const __m512i lhs_mat_23_71_sp2 = _mm512_shuffle_epi32(lhs_mat_23_71, (_MM_PERM_ENUM)245); //A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) + const __m512i lhs_mat_01_61_sp2 = _mm512_shuffle_epi32(lhs_mat_01_61, (_MM_PERM_ENUM)245); //A60(12-15) A60(12-15) A61(12-15) A61(12-15) + const __m512i lhs_mat_23_61_sp2 = _mm512_shuffle_epi32(lhs_mat_23_61, (_MM_PERM_ENUM)245); //A62(12-15) A63(12-15) A62(12-15) A63(12-15) - // Shuffle pattern one - left side input - const __m512i lhs_mat_s_01_00_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_00, (_MM_PERM_ENUM)160); //A00(0-3) A00(0-3) A01(0-3) A01(0-3) A00(0-3) A00(0-3) A01(0-3) A01(0-3) - const __m512i lhs_mat_s_23_00_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_00, (_MM_PERM_ENUM)160); //A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) + const __m512i lhs_mat_01_70_sp2 = _mm512_shuffle_epi32(lhs_mat_01_70, (_MM_PERM_ENUM)245); //A70(4-7) A70(4-7) A71(4-7) A71(4-7) + const __m512i lhs_mat_23_70_sp2 = _mm512_shuffle_epi32(lhs_mat_23_70, (_MM_PERM_ENUM)245); //A72(4-7) A73(4-7) A72(4-7) A73(4-7) - const __m512i lhs_mat_s_01_01_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_01, (_MM_PERM_ENUM)160); //A00(8-11) A00(8-11) A01(8-11) A01(8-11) A00(8-11) A00(8-11) A01(8-11) A01(8-11) - const __m512i lhs_mat_s_23_01_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_01, (_MM_PERM_ENUM)160); //A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) + const __m512i lhs_mat_01_71_sp2 = _mm512_shuffle_epi32(lhs_mat_01_71, (_MM_PERM_ENUM)245); //A70(12-15) A70(12-15) A71(12-15) A71(12-15) + const __m512i lhs_mat_23_71_sp2 = _mm512_shuffle_epi32(lhs_mat_23_71, (_MM_PERM_ENUM)245); //A72(12-15) A73(12-15) A72(12-15) A73(12-15) - const __m512i lhs_mat_s_01_10_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_10, (_MM_PERM_ENUM)160); //A10(0-3) A10(0-3) A11(0-3) A11(0-3) A10(0-3) A10(0-3) A11(0-3) A11(0-3) - const __m512i lhs_mat_s_23_10_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_10, (_MM_PERM_ENUM)160); //A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) + // Shuffle pattern one – left-side input - const __m512i lhs_mat_s_01_11_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_11, (_MM_PERM_ENUM)160); //A10(8-11) A10(8-11) A11(8-11) A11(8-11) A10(8-11) A10(8-11) A11(8-11) A11(8-11) - const __m512i lhs_mat_s_23_11_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_11, (_MM_PERM_ENUM)160); //A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) + const __m512i lhs_mat_s_01_00_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_00, (_MM_PERM_ENUM)160); //A00(0-3) A00(0-3) A01(0-3) A01(0-3) + const __m512i lhs_mat_s_23_00_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_00, (_MM_PERM_ENUM)160); //A02(0-3) A03(0-3) A02(0-3) A03(0-3) - const __m512i lhs_mat_s_01_20_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_20, (_MM_PERM_ENUM)160); //A20(0-3) A20(0-3) 
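+ // Note: the lhs_mat_s_* registers hold _mm512_maddubs_epi16(m32s_expanded, lhs_mat_*), i.e. 32 * (q8 pair sums).
+ // They are pushed through the same two shuffle patterns as lhs_mat_* so that the later subtraction cancels the +32
+ // bias of the unsigned 6 bit weights element for element.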
A21(0-3) A21(0-3) A20(0-3) A20(0-3) A21(0-3) A21(0-3) - const __m512i lhs_mat_s_23_20_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_20, (_MM_PERM_ENUM)160); //A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) A23(0-3) + const __m512i lhs_mat_s_01_01_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_01, (_MM_PERM_ENUM)160); //A00(8-11) A00(8-11) A01(8-11) A01(8-11) + const __m512i lhs_mat_s_23_01_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_01, (_MM_PERM_ENUM)160); //A02(8-11) A03(8-11) A02(8-11) A03(8-11) - const __m512i lhs_mat_s_01_21_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_21, (_MM_PERM_ENUM)160); //A20(8-11) A20(8-11) A21(8-11) A21(8-11) A20(8-11) A20(8-11) A21(8-11) A21(8-11) - const __m512i lhs_mat_s_23_21_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_21, (_MM_PERM_ENUM)160); //A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11) + const __m512i lhs_mat_s_01_10_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_10, (_MM_PERM_ENUM)160); //A10(0-3) A10(0-3) A11(0-3) A11(0-3) + const __m512i lhs_mat_s_23_10_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_10, (_MM_PERM_ENUM)160); //A12(0-3) A13(0-3) A12(0-3) A13(0-3) - const __m512i lhs_mat_s_01_30_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_30, (_MM_PERM_ENUM)160); //A30(0-3) A30(0-3) A31(0-3) A31(0-3) A30(0-3) A30(0-3) A31(0-3) A31(0-3) - const __m512i lhs_mat_s_23_30_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_30, (_MM_PERM_ENUM)160); //A32(0-3) A33(0-3) A32(0-3) A33(0-3) A32(0-3) A33(0-3) A32(0-3) A33(0-3) + const __m512i lhs_mat_s_01_11_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_11, (_MM_PERM_ENUM)160); //A10(8-11) A10(8-11) A11(8-11) A11(8-11) + const __m512i lhs_mat_s_23_11_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_11, (_MM_PERM_ENUM)160); //A12(8-11) A13(8-11) A12(8-11) A13(8-11) - const __m512i lhs_mat_s_01_31_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_31, (_MM_PERM_ENUM)160); //A30(8-11) A30(8-11) A31(8-11) A31(8-11) A30(8-11) A30(8-11) A31(8-11) A31(8-11) - const __m512i lhs_mat_s_23_31_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_31, (_MM_PERM_ENUM)160); //A32(8-11) A33(8-11) A32(8-11) A33(8-11) A32(8-11) A33(8-11) A32(8-11) A33(8-11) + const __m512i lhs_mat_s_01_20_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_20, (_MM_PERM_ENUM)160); //A20(0-3) A20(0-3) A21(0-3) A21(0-3) + const __m512i lhs_mat_s_23_20_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_20, (_MM_PERM_ENUM)160); //A22(0-3) A23(0-3) A22(0-3) A23(0-3) - const __m512i lhs_mat_s_01_40_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_40, (_MM_PERM_ENUM)160); //A40(0-3) A40(0-3) A41(0-3) A41(0-3) A40(0-3) A40(0-3) A41(0-3) A41(0-3) - const __m512i lhs_mat_s_23_40_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_40, (_MM_PERM_ENUM)160); //A42(0-3) A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3) + const __m512i lhs_mat_s_01_21_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_21, (_MM_PERM_ENUM)160); //A20(8-11) A20(8-11) A21(8-11) A21(8-11) + const __m512i lhs_mat_s_23_21_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_21, (_MM_PERM_ENUM)160); //A22(8-11) A23(8-11) A22(8-11) A23(8-11) - const __m512i lhs_mat_s_01_41_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_41, (_MM_PERM_ENUM)160); //A40(8-11) A40(8-11) A41(8-11) A41(8-11) A40(8-11) A40(8-11) A41(8-11) A41(8-11) - const __m512i lhs_mat_s_23_41_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_41, (_MM_PERM_ENUM)160); //A42(8-11) A43(8-11) A42(8-11) A43(8-11) A42(8-11) A43(8-11) A42(8-11) A43(8-11) + const __m512i lhs_mat_s_01_30_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_30, (_MM_PERM_ENUM)160); //A30(0-3) A30(0-3) A31(0-3) A31(0-3) + const __m512i lhs_mat_s_23_30_sp1 = 
_mm512_shuffle_epi32(lhs_mat_s_23_30, (_MM_PERM_ENUM)160); //A32(0-3) A33(0-3) A32(0-3) A33(0-3) - const __m512i lhs_mat_s_01_50_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_50, (_MM_PERM_ENUM)160); //A50(0-3) A50(0-3) A51(0-3) A51(0-3) A50(0-3) A50(0-3) A51(0-3) A51(0-3) - const __m512i lhs_mat_s_23_50_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_50, (_MM_PERM_ENUM)160); //A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3) + const __m512i lhs_mat_s_01_31_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_31, (_MM_PERM_ENUM)160); //A30(8-11) A30(8-11) A31(8-11) A31(8-11) + const __m512i lhs_mat_s_23_31_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_31, (_MM_PERM_ENUM)160); //A32(8-11) A33(8-11) A32(8-11) A33(8-11) - const __m512i lhs_mat_s_01_51_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_51, (_MM_PERM_ENUM)160); //A50(8-11) A50(8-11) A51(8-11) A51(8-11) A50(8-11) A50(8-11) A51(8-11) A51(8-11) - const __m512i lhs_mat_s_23_51_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_51, (_MM_PERM_ENUM)160); //A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11) + const __m512i lhs_mat_s_01_40_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_40, (_MM_PERM_ENUM)160); //A40(0-3) A40(0-3) A41(0-3) A41(0-3) + const __m512i lhs_mat_s_23_40_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_40, (_MM_PERM_ENUM)160); //A42(0-3) A43(0-3) A42(0-3) A43(0-3) - const __m512i lhs_mat_s_01_60_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_60, (_MM_PERM_ENUM)160); //A60(0-3) A60(0-3) A61(0-3) A61(0-3) A60(0-3) A60(0-3) A61(0-3) A61(0-3) - const __m512i lhs_mat_s_23_60_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_60, (_MM_PERM_ENUM)160); //A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3) + const __m512i lhs_mat_s_01_41_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_41, (_MM_PERM_ENUM)160); //A40(8-11) A40(8-11) A41(8-11) A41(8-11) + const __m512i lhs_mat_s_23_41_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_41, (_MM_PERM_ENUM)160); //A42(8-11) A43(8-11) A42(8-11) A43(8-11) - const __m512i lhs_mat_s_01_61_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_61, (_MM_PERM_ENUM)160); //A60(8-11) A60(8-11) A61(8-11) A61(8-11) A60(8-11) A60(8-11) A61(8-11) A61(8-11) - const __m512i lhs_mat_s_23_61_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_61, (_MM_PERM_ENUM)160); //A62(8-11) A63(8-11) A62(8-11) A63(8-11) A62(8-11) A63(8-11) A62(8-11) A63(8-11) + const __m512i lhs_mat_s_01_50_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_50, (_MM_PERM_ENUM)160); //A50(0-3) A50(0-3) A51(0-3) A51(0-3) + const __m512i lhs_mat_s_23_50_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_50, (_MM_PERM_ENUM)160); //A52(0-3) A53(0-3) A52(0-3) A53(0-3) - const __m512i lhs_mat_s_01_70_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_70, (_MM_PERM_ENUM)160); //A70(0-3) A70(0-3) A71(0-3) A71(0-3) A70(0-3) A70(0-3) A71(0-3) A71(0-3) - const __m512i lhs_mat_s_23_70_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_70, (_MM_PERM_ENUM)160); //A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3) + const __m512i lhs_mat_s_01_51_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_51, (_MM_PERM_ENUM)160); //A50(8-11) A50(8-11) A51(8-11) A51(8-11) + const __m512i lhs_mat_s_23_51_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_51, (_MM_PERM_ENUM)160); //A52(8-11) A53(8-11) A52(8-11) A53(8-11) - const __m512i lhs_mat_s_01_71_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_71, (_MM_PERM_ENUM)160); //A70(8-11) A70(8-11) A71(8-11) A71(8-11) A70(8-11) A70(8-11) A71(8-11) A71(8-11) - const __m512i lhs_mat_s_23_71_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_71, (_MM_PERM_ENUM)160); //A72(8-11) A73(8-11) A72(8-11) A73(8-11) A72(8-11) A73(8-11) 
A72(8-11) A73(8-11) + const __m512i lhs_mat_s_01_60_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_60, (_MM_PERM_ENUM)160); //A60(0-3) A60(0-3) A61(0-3) A61(0-3) + const __m512i lhs_mat_s_23_60_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_60, (_MM_PERM_ENUM)160); //A62(0-3) A63(0-3) A62(0-3) A63(0-3) - // Shuffle pattern two- left side input - const __m512i lhs_mat_s_01_00_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_00, (_MM_PERM_ENUM)245); //A00(4-7) A00(4-7) A01(4-7) A01(4-7) A00(4-7) A00(4-7) A01(4-7) A01(4-7) - const __m512i lhs_mat_s_23_00_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_00, (_MM_PERM_ENUM)245); //A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) + const __m512i lhs_mat_s_01_61_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_61, (_MM_PERM_ENUM)160); //A60(8-11) A60(8-11) A61(8-11) A61(8-11) + const __m512i lhs_mat_s_23_61_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_61, (_MM_PERM_ENUM)160); //A62(8-11) A63(8-11) A62(8-11) A63(8-11) - const __m512i lhs_mat_s_01_01_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_01, (_MM_PERM_ENUM)245); //A00(12-15) A00(12-15) A01(12-15) A01(12-15) A00(12-15) A00(12-15) A01(12-15) A01(12-15) - const __m512i lhs_mat_s_23_01_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_01, (_MM_PERM_ENUM)245); //A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) + const __m512i lhs_mat_s_01_70_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_70, (_MM_PERM_ENUM)160); //A70(0-3) A70(0-3) A71(0-3) A71(0-3) + const __m512i lhs_mat_s_23_70_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_70, (_MM_PERM_ENUM)160); //A72(0-3) A73(0-3) A72(0-3) A73(0-3) - const __m512i lhs_mat_s_01_10_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_10, (_MM_PERM_ENUM)245); //A10(4-7) A10(4-7) A11(4-7) A11(4-7) A10(4-7) A10(4-7) A11(4-7) A11(4-7) - const __m512i lhs_mat_s_23_10_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_10, (_MM_PERM_ENUM)245); //A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) + const __m512i lhs_mat_s_01_71_sp1 = _mm512_shuffle_epi32(lhs_mat_s_01_71, (_MM_PERM_ENUM)160); //A70(8-11) A70(8-11) A71(8-11) A71(8-11) + const __m512i lhs_mat_s_23_71_sp1 = _mm512_shuffle_epi32(lhs_mat_s_23_71, (_MM_PERM_ENUM)160); //A72(8-11) A73(8-11) A72(8-11) A73(8-11) - const __m512i lhs_mat_s_01_11_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_11, (_MM_PERM_ENUM)245); //A10(12-15) A10(12-15) A11(12-15) A11(12-15) A10(12-15) A10(12-15) A11(12-15) A11(12-15) - const __m512i lhs_mat_s_23_11_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_11, (_MM_PERM_ENUM)245); //A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) + // Shuffle pattern two – left-side input - const __m512i lhs_mat_s_01_20_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_20, (_MM_PERM_ENUM)245); //A20(4-7) A20(4-7) A21(4-7) A21(4-7) A20(4-7) A20(4-7) A21(4-7) A21(4-7) - const __m512i lhs_mat_s_23_20_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_20, (_MM_PERM_ENUM)245); //A22(4-7) A23(4-7) A22(4-7) A23(4-7) A22(4-7) A23(4-7) A22(4-7) A23(4-7) + const __m512i lhs_mat_s_01_00_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_00, (_MM_PERM_ENUM)245); //A00(4-7) A00(4-7) A01(4-7) A01(4-7) + const __m512i lhs_mat_s_23_00_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_00, (_MM_PERM_ENUM)245); //A02(4-7) A03(4-7) A02(4-7) A03(4-7) - const __m512i lhs_mat_s_01_21_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_21, (_MM_PERM_ENUM)245); //A20(12-15) A20(12-15) A21(12-15) A21(12-15) A20(12-15) A20(12-15) A21(12-15) A21(12-15) - const __m512i lhs_mat_s_23_21_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_21, (_MM_PERM_ENUM)245); //A22(12-15) A23(12-15) 
A22(12-15) A23(12-15) A22(12-15) A23(12-15) A22(12-15) A23(12-15) + const __m512i lhs_mat_s_01_01_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_01, (_MM_PERM_ENUM)245); //A00(12-15) A00(12-15) A01(12-15) A01(12-15) + const __m512i lhs_mat_s_23_01_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_01, (_MM_PERM_ENUM)245); //A02(12-15) A03(12-15) A02(12-15) A03(12-15) - const __m512i lhs_mat_s_01_30_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_30, (_MM_PERM_ENUM)245); //A30(4-7) A30(4-7) A31(4-7) A31(4-7) A30(4-7) A30(4-7) A31(4-7) A31(4-7) - const __m512i lhs_mat_s_23_30_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_30, (_MM_PERM_ENUM)245); //A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7) + const __m512i lhs_mat_s_01_10_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_10, (_MM_PERM_ENUM)245); //A10(4-7) A10(4-7) A11(4-7) A11(4-7) + const __m512i lhs_mat_s_23_10_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_10, (_MM_PERM_ENUM)245); //A12(4-7) A13(4-7) A12(4-7) A13(4-7) - const __m512i lhs_mat_s_01_31_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_31, (_MM_PERM_ENUM)245); //A30(12-15) A30(12-15) A31(12-15) A31(12-15) A30(12-15) A30(12-15) A31(12-15) A31(12-15) - const __m512i lhs_mat_s_23_31_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_31, (_MM_PERM_ENUM)245); //A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15) + const __m512i lhs_mat_s_01_11_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_11, (_MM_PERM_ENUM)245); //A10(12-15) A10(12-15) A11(12-15) A11(12-15) + const __m512i lhs_mat_s_23_11_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_11, (_MM_PERM_ENUM)245); //A12(12-15) A13(12-15) A12(12-15) A13(12-15) - const __m512i lhs_mat_s_01_40_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_40, (_MM_PERM_ENUM)245); //A40(4-7) A40(4-7) A41(4-7) A41(4-7) A40(4-7) A40(4-7) A41(4-7) A41(4-7) - const __m512i lhs_mat_s_23_40_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_40, (_MM_PERM_ENUM)245); //A42(4-7) A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7) + const __m512i lhs_mat_s_01_20_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_20, (_MM_PERM_ENUM)245); //A20(4-7) A20(4-7) A21(4-7) A21(4-7) + const __m512i lhs_mat_s_23_20_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_20, (_MM_PERM_ENUM)245); //A22(4-7) A23(4-7) A22(4-7) A23(4-7) - const __m512i lhs_mat_s_01_41_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_41, (_MM_PERM_ENUM)245); //A40(12-15) A40(12-15) A41(12-15) A41(12-15) A40(12-15) A40(12-15) A41(12-15) A41(12-15) - const __m512i lhs_mat_s_23_41_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_41, (_MM_PERM_ENUM)245); //A42(12-15) A43(12-15) A42(12-15) A43(12-15) A42(12-15) A43(12-15) A42(12-15) A43(12-15) + const __m512i lhs_mat_s_01_21_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_21, (_MM_PERM_ENUM)245); //A20(12-15) A20(12-15) A21(12-15) A21(12-15) + const __m512i lhs_mat_s_23_21_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_21, (_MM_PERM_ENUM)245); //A22(12-15) A23(12-15) A22(12-15) A23(12-15) - const __m512i lhs_mat_s_01_50_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_50, (_MM_PERM_ENUM)245); //A50(4-7) A50(4-7) A51(4-7) A51(4-7) A50(4-7) A50(4-7) A51(4-7) A51(4-7) - const __m512i lhs_mat_s_23_50_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_50, (_MM_PERM_ENUM)245); //A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7) + const __m512i lhs_mat_s_01_30_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_30, (_MM_PERM_ENUM)245); //A30(4-7) A30(4-7) A31(4-7) A31(4-7) + const __m512i lhs_mat_s_23_30_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_30, (_MM_PERM_ENUM)245); //A32(4-7) A33(4-7) A32(4-7) A33(4-7) - const __m512i lhs_mat_s_01_51_sp2 = 
_mm512_shuffle_epi32(lhs_mat_s_01_51, (_MM_PERM_ENUM)245); //A50(12-15) A50(12-15) A51(12-15) A51(12-15) A50(12-15) A50(12-15) A51(12-15) A51(12-15) - const __m512i lhs_mat_s_23_51_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_51, (_MM_PERM_ENUM)245); //A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) A53(12-15) + const __m512i lhs_mat_s_01_31_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_31, (_MM_PERM_ENUM)245); //A30(12-15) A30(12-15) A31(12-15) A31(12-15) + const __m512i lhs_mat_s_23_31_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_31, (_MM_PERM_ENUM)245); //A32(12-15) A33(12-15) A32(12-15) A33(12-15) - const __m512i lhs_mat_s_01_60_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_60, (_MM_PERM_ENUM)245); //A60(4-7) A60(4-7) A61(4-7) A61(4-7) A60(4-7) A60(4-7) A61(4-7) A61(4-7) - const __m512i lhs_mat_s_23_60_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_60, (_MM_PERM_ENUM)245); //A62(4-7) A63(4-7) A62(4-7) A63(4-7) A62(4-7) A63(4-7) A62(4-7) A63(4-7) + const __m512i lhs_mat_s_01_40_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_40, (_MM_PERM_ENUM)245); //A40(4-7) A40(4-7) A41(4-7) A41(4-7) + const __m512i lhs_mat_s_23_40_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_40, (_MM_PERM_ENUM)245); //A42(4-7) A43(4-7) A42(4-7) A43(4-7) - const __m512i lhs_mat_s_01_61_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_61, (_MM_PERM_ENUM)245); //A60(12-15) A60(12-15) A61(12-15) A61(12-15) A60(12-15) A60(12-15) A61(12-15) A61(12-15) - const __m512i lhs_mat_s_23_61_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_61, (_MM_PERM_ENUM)245); //A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15) + const __m512i lhs_mat_s_01_41_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_41, (_MM_PERM_ENUM)245); //A40(12-15) A40(12-15) A41(12-15) A41(12-15) + const __m512i lhs_mat_s_23_41_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_41, (_MM_PERM_ENUM)245); //A42(12-15) A43(12-15) A42(12-15) A43(12-15) - const __m512i lhs_mat_s_01_70_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_70, (_MM_PERM_ENUM)245); //A70(4-7) A70(4-7) A71(4-7) A71(4-7) A70(4-7) A70(4-7) A71(4-7) A71(4-7) - const __m512i lhs_mat_s_23_70_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_70, (_MM_PERM_ENUM)245); //A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7) + const __m512i lhs_mat_s_01_50_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_50, (_MM_PERM_ENUM)245); //A50(4-7) A50(4-7) A51(4-7) A51(4-7) + const __m512i lhs_mat_s_23_50_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_50, (_MM_PERM_ENUM)245); //A52(4-7) A53(4-7) A52(4-7) A53(4-7) - const __m512i lhs_mat_s_01_71_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_71, (_MM_PERM_ENUM)245); //A70(12-15) A70(12-15) A71(12-15) A71(12-15) A70(12-15) A70(12-15) A71(12-15) A71(12-15) - const __m512i lhs_mat_s_23_71_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_71, (_MM_PERM_ENUM)245); //A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) + const __m512i lhs_mat_s_01_51_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_51, (_MM_PERM_ENUM)245); //A50(12-15) A50(12-15) A51(12-15) A51(12-15) + const __m512i lhs_mat_s_23_51_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_51, (_MM_PERM_ENUM)245); //A52(12-15) A53(12-15) A52(12-15) A53(12-15) + + const __m512i lhs_mat_s_01_60_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_60, (_MM_PERM_ENUM)245); //A60(4-7) A60(4-7) A61(4-7) A61(4-7) + const __m512i lhs_mat_s_23_60_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_60, (_MM_PERM_ENUM)245); //A62(4-7) A63(4-7) A62(4-7) A63(4-7) + + const __m512i lhs_mat_s_01_61_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_61, (_MM_PERM_ENUM)245); //A60(12-15) 
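+ // The iacc step below uses the identity sum((q6 - 32) * q8) == sum(q6 * q8) - 32 * sum(q8):
+ // _mm512_maddubs_epi16(rhs, lhs) yields the unsigned q6 * q8 pair sums (at most 2 * 63 * 127, so no saturation)
+ // and the shuffled lhs_mat_s_* terms supply the 32 * sum(q8) part; after the subtract every lane holds a true
+ // (q6 - 32) * q8 pair sum, small enough that the following epi16 additions cannot overflow.
+ // A scalar sketch of the same computation (illustrative only, variable names hypothetical):
+ //   int32_t acc = 0;
+ //   for (int i = 0; i < 2; i++) acc += (q6[i] - 32) * q8[i]; // == sum(q6[i]*q8[i]) - 32 * (q8[0] + q8[1])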
A60(12-15) A61(12-15) A61(12-15) + const __m512i lhs_mat_s_23_61_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_61, (_MM_PERM_ENUM)245); //A62(12-15) A63(12-15) A62(12-15) A63(12-15) + + const __m512i lhs_mat_s_01_70_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_70, (_MM_PERM_ENUM)245); //A70(4-7) A70(4-7) A71(4-7) A71(4-7) + const __m512i lhs_mat_s_23_70_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_70, (_MM_PERM_ENUM)245); //A72(4-7) A73(4-7) A72(4-7) A73(4-7) + + const __m512i lhs_mat_s_01_71_sp2 = _mm512_shuffle_epi32(lhs_mat_s_01_71, (_MM_PERM_ENUM)245); //A70(12-15) A70(12-15) A71(12-15) A71(12-15) + const __m512i lhs_mat_s_23_71_sp2 = _mm512_shuffle_epi32(lhs_mat_s_23_71, (_MM_PERM_ENUM)245); //A72(12-15) A73(12-15) A72(12-15) A73(12-15) // The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e corresponding bytes and multiplied and added into 32 bit integers within 32 bit lane __m512i iacc_mat_00_0_sp1 = _mm512_add_epi16(_mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_00_sp1, lhs_mat_01_00_sp1), lhs_mat_s_01_00_sp1), _mm512_sub_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_01_sp1, lhs_mat_01_01_sp1), lhs_mat_s_01_01_sp1)); @@ -7856,46 +7868,46 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo // 2-bit -> 8-bit // Values of the 0th,2nd,4th,6th sub blocks of eight block_q6_K structures for the sb loop - const __m512i rhs_hbit_014589CD_00 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_0, m2_expanded), 4); //B00(0-7) B01(0-7) B04(0-7) B05(0-7) - const __m512i rhs_hbit_014589CD_20 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 2), m2_expanded), 4); //B20(0-7) B21(0-7) B24(0-7) B25(0-7) - const __m512i rhs_hbit_014589CD_40 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 4), m2_expanded), 4); //B40(0-7) B41(0-7) B44(0-7) B45(0-7) - const __m512i rhs_hbit_014589CD_60 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 6), m2_expanded), 4); //B60(0-7) B61(0-7) B64(0-7) B65(0-7) + const __m512i rhs_hbit_014589CD_00 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_0, m2_expanded), 4); //Index : 0 - 7 + const __m512i rhs_hbit_014589CD_20 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 2), m2_expanded), 4); //Index : 32 - 39 + const __m512i rhs_hbit_014589CD_40 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 4), m2_expanded), 4); //Index : 64 - 71 + const __m512i rhs_hbit_014589CD_60 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_0, 6), m2_expanded), 4); //Index : 96 - 103 - const __m512i rhs_hbit_2367ABEF_00 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_0, m2_expanded), 4); //B02(0-7) B03(0-7) B06(0-7) B07(0-7) - const __m512i rhs_hbit_2367ABEF_20 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 2), m2_expanded), 4); //B22(0-7) B23(0-7) B26(0-7) B27(0-7) - const __m512i rhs_hbit_2367ABEF_40 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 4), m2_expanded), 4); //B42(0-7) B43(0-7) B46(0-7) B47(0-7) - const __m512i rhs_hbit_2367ABEF_60 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 6), m2_expanded), 4); //B62(0-7) B63(0-7) B66(0-7) B67(0-7) + const __m512i rhs_hbit_2367ABEF_00 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_0, m2_expanded), 4); //Index : 0 - 7 + const __m512i rhs_hbit_2367ABEF_20 = 
_mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 2), m2_expanded), 4); //Index : 32 - 39 + const __m512i rhs_hbit_2367ABEF_40 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 4), m2_expanded), 4); //Index : 64 - 71 + const __m512i rhs_hbit_2367ABEF_60 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_0, 6), m2_expanded), 4); //Index : 96 - 103 - const __m512i rhs_hbit_014589CD_01 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_1, m2_expanded), 4); //B00(8-15) B01(8-15) B04(8-15) B05(8-15) - const __m512i rhs_hbit_014589CD_21 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 2), m2_expanded), 4); //B20(8-15) B21(8-15) B24(8-15) B25(8-15) - const __m512i rhs_hbit_014589CD_41 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 4), m2_expanded), 4); //B40(8-15) B41(8-15) B44(8-15) B45(8-15) - const __m512i rhs_hbit_014589CD_61 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 6), m2_expanded), 4); //B60(8-15) B61(8-15) B64(8-15) B65(8-15) + const __m512i rhs_hbit_014589CD_01 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_1, m2_expanded), 4); //Index : 8 - 15 + const __m512i rhs_hbit_014589CD_21 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 2), m2_expanded), 4); //Index : 40 - 47 + const __m512i rhs_hbit_014589CD_41 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 4), m2_expanded), 4); //Index : 72 - 79 + const __m512i rhs_hbit_014589CD_61 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_1, 6), m2_expanded), 4); //Index : 104 - 111 - const __m512i rhs_hbit_2367ABEF_01 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_1, m2_expanded), 4); //B02(8-15) B03(8-15) B06(8-15) B07(8-15) - const __m512i rhs_hbit_2367ABEF_21 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 2), m2_expanded), 4); //B22(8-15) B23(8-15) B26(8-15) B27(8-15) - const __m512i rhs_hbit_2367ABEF_41 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 4), m2_expanded), 4); //B42(8-15) B43(8-15) B46(8-15) B47(8-15) - const __m512i rhs_hbit_2367ABEF_61 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 6), m2_expanded), 4); //B62(8-15) B63(8-15) B66(8-15) B67(8-15) + const __m512i rhs_hbit_2367ABEF_01 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_1, m2_expanded), 4); //Index : 8 - 15 + const __m512i rhs_hbit_2367ABEF_21 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 2), m2_expanded), 4); //Index : 40 - 47 + const __m512i rhs_hbit_2367ABEF_41 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 4), m2_expanded), 4); //Index : 72 - 79 + const __m512i rhs_hbit_2367ABEF_61 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_1, 6), m2_expanded), 4); //Index : 104 - 111 // Values of the 1st,3rd,5th,7th sub blocks of eight block_q6_K structures for the sb loop - const __m512i rhs_hbit_014589CD_10 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_2, m2_expanded), 4); //B10(0-7) B11(0-7) B14(0-7) B15(0-7) - const __m512i rhs_hbit_014589CD_30 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 2), m2_expanded), 4); //B30(0-7) B31(0-7) B34(0-7) B35(0-7) - const __m512i rhs_hbit_014589CD_50 = 
_mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 4), m2_expanded), 4); //B50(0-7) B51(0-7) B54(0-7) B55(0-7) - const __m512i rhs_hbit_014589CD_70 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 6), m2_expanded), 4); //B70(0-7) B71(0-7) B74(0-7) B75(0-7) + const __m512i rhs_hbit_014589CD_10 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_2, m2_expanded), 4); //Index : 16 - 23 + const __m512i rhs_hbit_014589CD_30 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 2), m2_expanded), 4); //Index : 48 - 55 + const __m512i rhs_hbit_014589CD_50 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 4), m2_expanded), 4); //Index : 80 - 87 + const __m512i rhs_hbit_014589CD_70 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_2, 6), m2_expanded), 4); //Index : 112 - 119 - const __m512i rhs_hbit_2367ABEF_10 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_2, m2_expanded), 4); //B12(0-7) B13(0-7) B16(0-7) B17(0-7) - const __m512i rhs_hbit_2367ABEF_30 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 2), m2_expanded), 4); //B32(0-7) B33(0-7) B36(0-7) B37(0-7) - const __m512i rhs_hbit_2367ABEF_50 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 4), m2_expanded), 4); //B52(0-7) B53(0-7) B56(0-7) B57(0-7) - const __m512i rhs_hbit_2367ABEF_70 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 6), m2_expanded), 4); //B72(0-7) B73(0-7) B76(0-7) B77(0-7) + const __m512i rhs_hbit_2367ABEF_10 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_2, m2_expanded), 4); //Index : 16 - 23 + const __m512i rhs_hbit_2367ABEF_30 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 2), m2_expanded), 4); //Index : 48 - 55 + const __m512i rhs_hbit_2367ABEF_50 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 4), m2_expanded), 4); //Index : 80 - 87 + const __m512i rhs_hbit_2367ABEF_70 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_2, 6), m2_expanded), 4); //Index : 112 - 119 - const __m512i rhs_hbit_014589CD_11 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_3, m2_expanded), 4); //B10(8-15) B11(8-15) B14(8-15) B15(8-15) - const __m512i rhs_hbit_014589CD_31 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 2), m2_expanded), 4); //B30(8-15) B31(8-15) B34(8-15) B35(8-15) - const __m512i rhs_hbit_014589CD_51 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 4), m2_expanded), 4); //B50(8-15) B51(8-15) B54(8-15) B55(8-15) - const __m512i rhs_hbit_014589CD_71 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 6), m2_expanded), 4); //B70(8-15) B71(8-15) B74(8-15) B75(8-15) + const __m512i rhs_hbit_014589CD_11 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_014589CD_3, m2_expanded), 4); //Index : 24 - 31 + const __m512i rhs_hbit_014589CD_31 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 2), m2_expanded), 4); //Index : 56 - 63 + const __m512i rhs_hbit_014589CD_51 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 4), m2_expanded), 4); //Index : 88 - 95 + const __m512i rhs_hbit_014589CD_71 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_014589CD_3, 6), m2_expanded), 4); //Index : 120 - 127 - const __m512i 
rhs_hbit_2367ABEF_11 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_3, m2_expanded), 4); //B12(8-15) B13(8-15) B16(8-15) B17(8-15) - const __m512i rhs_hbit_2367ABEF_31 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 2), m2_expanded), 4); //B32(8-15) B33(8-15) B36(8-15) B37(8-15) - const __m512i rhs_hbit_2367ABEF_51 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 4), m2_expanded), 4); //B52(8-15) B53(8-15) B56(8-15) B57(8-15) - const __m512i rhs_hbit_2367ABEF_71 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 6), m2_expanded), 4); //B72(8-15) B73(8-15) B76(8-15) B77(8-15) + const __m512i rhs_hbit_2367ABEF_11 = _mm512_slli_epi16(_mm512_and_si512(rhs_raw_hbit_2367ABEF_3, m2_expanded), 4); //Index : 24 - 31 + const __m512i rhs_hbit_2367ABEF_31 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 2), m2_expanded), 4); //Index : 56 - 63 + const __m512i rhs_hbit_2367ABEF_51 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 4), m2_expanded), 4); //Index : 88 - 95 + const __m512i rhs_hbit_2367ABEF_71 = _mm512_slli_epi16(_mm512_and_si512(_mm512_srli_epi16(rhs_raw_hbit_2367ABEF_3, 6), m2_expanded), 4); //Index : 120 - 127 // 0 -7, 64 - 71 const __m512i rhs_mat_014589CD_00 = _mm512_or_si512(_mm512_and_si512(rhs_raw_mat_014589CD_0, m4_expanded), rhs_hbit_014589CD_00); @@ -8728,46 +8740,46 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo // 2-bit -> 8-bit // Values of the 0th,2nd,4th,6th sub blocks of eight block_q6_K structures for the sb loop - const __m256i rhs_hbit_0145_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_0, m2), 4); //B00(0-7) B01(0-7) B04(0-7) B05(0-7) - const __m256i rhs_hbit_0145_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 2), m2), 4); //B20(0-7) B21(0-7) B24(0-7) B25(0-7) - const __m256i rhs_hbit_0145_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 4), m2), 4); //B40(0-7) B41(0-7) B44(0-7) B45(0-7) - const __m256i rhs_hbit_0145_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 6), m2), 4); //B60(0-7) B61(0-7) B64(0-7) B65(0-7) + const __m256i rhs_hbit_0145_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_0, m2), 4); //Index : 0 - 7 + const __m256i rhs_hbit_0145_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 2), m2), 4); //Index : 32 - 39 + const __m256i rhs_hbit_0145_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 4), m2), 4); //Index : 64 - 71 + const __m256i rhs_hbit_0145_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 6), m2), 4); //Index : 96 - 103 - const __m256i rhs_hbit_2367_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_0, m2), 4); //B02(0-7) B03(0-7) B06(0-7) B07(0-7) - const __m256i rhs_hbit_2367_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 2), m2), 4); //B22(0-7) B23(0-7) B26(0-7) B27(0-7) - const __m256i rhs_hbit_2367_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 4), m2), 4); //B42(0-7) B43(0-7) B46(0-7) B47(0-7) - const __m256i rhs_hbit_2367_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 6), m2), 4); //B62(0-7) B63(0-7) B66(0-7) B67(0-7) + const __m256i rhs_hbit_2367_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_0, m2), 4); //Index : 0 - 7 + const 
__m256i rhs_hbit_2367_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 2), m2), 4); //Index : 32 - 39 + const __m256i rhs_hbit_2367_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 4), m2), 4); //Index : 64 - 71 + const __m256i rhs_hbit_2367_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 6), m2), 4); //Index : 96 - 103 - const __m256i rhs_hbit_0145_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_1, m2), 4); //B00(8-15) B01(8-15) B04(8-15) B05(8-15) - const __m256i rhs_hbit_0145_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 2), m2), 4); //B20(8-15) B21(8-15) B24(8-15) B25(8-15) - const __m256i rhs_hbit_0145_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 4), m2), 4); //B40(8-15) B41(8-15) B44(8-15) B45(8-15) - const __m256i rhs_hbit_0145_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 6), m2), 4); //B60(8-15) B61(8-15) B64(8-15) B65(8-15) + const __m256i rhs_hbit_0145_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_1, m2), 4); //Index : 8 - 15 + const __m256i rhs_hbit_0145_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 2), m2), 4); //Index : 40 - 47 + const __m256i rhs_hbit_0145_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 4), m2), 4); //Index : 72 - 79 + const __m256i rhs_hbit_0145_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 6), m2), 4); //Index : 104 - 111 - const __m256i rhs_hbit_2367_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_1, m2), 4); //B02(8-15) B03(8-15) B06(8-15) B07(8-15) - const __m256i rhs_hbit_2367_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 2), m2), 4); //B22(8-15) B23(8-15) B26(8-15) B27(8-15) - const __m256i rhs_hbit_2367_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 4), m2), 4); //B42(8-15) B43(8-15) B46(8-15) B47(8-15) - const __m256i rhs_hbit_2367_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 6), m2), 4); //B62(8-15) B63(8-15) B66(8-15) B67(8-15) + const __m256i rhs_hbit_2367_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_1, m2), 4); //Index : 8 - 15 + const __m256i rhs_hbit_2367_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 2), m2), 4); //Index : 40 - 47 + const __m256i rhs_hbit_2367_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 4), m2), 4); //Index : 72 - 79 + const __m256i rhs_hbit_2367_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 6), m2), 4); //Index : 104 - 111 // Values of the 1st,3rd,5th,7th sub blocks of eight block_q6_K structures for the sb loop - const __m256i rhs_hbit_0145_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_2, m2), 4); //B10(0-7) B11(0-7) B14(0-7) B15(0-7) - const __m256i rhs_hbit_0145_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 2), m2), 4); //B30(0-7) B31(0-7) B34(0-7) B35(0-7) - const __m256i rhs_hbit_0145_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 4), m2), 4); //B50(0-7) B51(0-7) B54(0-7) B55(0-7) - const __m256i rhs_hbit_0145_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 6), m2), 4); //B70(0-7) B71(0-7) B74(0-7) B75(0-7) + const __m256i rhs_hbit_0145_10 = 
_mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_2, m2), 4); //Index : 16 - 23 + const __m256i rhs_hbit_0145_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 2), m2), 4); //Index : 48 - 55 + const __m256i rhs_hbit_0145_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 4), m2), 4); //Index : 80 - 87 + const __m256i rhs_hbit_0145_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 6), m2), 4); //Index : 112 - 119 - const __m256i rhs_hbit_2367_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_2, m2), 4); //B12(0-7) B13(0-7) B16(0-7) B17(0-7) - const __m256i rhs_hbit_2367_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 2), m2), 4); //B32(0-7) B33(0-7) B36(0-7) B37(0-7) - const __m256i rhs_hbit_2367_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 4), m2), 4); //B52(0-7) B53(0-7) B56(0-7) B57(0-7) - const __m256i rhs_hbit_2367_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 6), m2), 4); //B72(0-7) B73(0-7) B76(0-7) B77(0-7) + const __m256i rhs_hbit_2367_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_2, m2), 4); //Index : 16 - 23 + const __m256i rhs_hbit_2367_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 2), m2), 4); //Index : 48 - 55 + const __m256i rhs_hbit_2367_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 4), m2), 4); //Index : 80 - 87 + const __m256i rhs_hbit_2367_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 6), m2), 4); //Index : 112 - 119 - const __m256i rhs_hbit_0145_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_3, m2), 4); //B10(8-15) B11(8-15) B14(8-15) B15(8-15) - const __m256i rhs_hbit_0145_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 2), m2), 4); //B30(8-15) B31(8-15) B34(8-15) B35(8-15) - const __m256i rhs_hbit_0145_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 4), m2), 4); //B50(8-15) B51(8-15) B54(8-15) B55(8-15) - const __m256i rhs_hbit_0145_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 6), m2), 4); //B70(8-15) B71(8-15) B74(8-15) B75(8-15) + const __m256i rhs_hbit_0145_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_3, m2), 4); //Index : 24 - 31 + const __m256i rhs_hbit_0145_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 2), m2), 4); //Index : 56 - 63 + const __m256i rhs_hbit_0145_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 4), m2), 4); //Index : 88 - 95 + const __m256i rhs_hbit_0145_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 6), m2), 4); //Index : 120 - 127 - const __m256i rhs_hbit_2367_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_3, m2), 4); //B12(8-15) B13(8-15) B16(8-15) B17(8-15) - const __m256i rhs_hbit_2367_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 2), m2), 4); //B32(8-15) B33(8-15) B36(8-15) B37(8-15) - const __m256i rhs_hbit_2367_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 4), m2), 4); //B52(8-15) B53(8-15) B56(8-15) B57(8-15) - const __m256i rhs_hbit_2367_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 6), m2), 4); //B72(8-15) B73(8-15) B76(8-15) B77(8-15) + const __m256i rhs_hbit_2367_11 = 
_mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_3, m2), 4); //Index : 24 - 31 + const __m256i rhs_hbit_2367_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 2), m2), 4); //Index : 56 - 63 + const __m256i rhs_hbit_2367_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 4), m2), 4); //Index : 88 - 95 + const __m256i rhs_hbit_2367_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 6), m2), 4); //Index : 120 - 127 // 0 -7, 64 - 71 const __m256i rhs_mat_0145_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_0, m4), rhs_hbit_0145_00); @@ -9535,46 +9547,46 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo // 2-bit -> 8-bit // Values of the 0th,2nd,4th,6th sub blocks of eight block_q6_K structures for the sb loop - const __m256i rhs_hbit_0145_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_0, m2), 4); //B00(0-7) B01(0-7) B04(0-7) B05(0-7) - const __m256i rhs_hbit_0145_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 2), m2), 4); //B20(0-7) B21(0-7) B24(0-7) B25(0-7) - const __m256i rhs_hbit_0145_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 4), m2), 4); //B40(0-7) B41(0-7) B44(0-7) B45(0-7) - const __m256i rhs_hbit_0145_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 6), m2), 4); //B60(0-7) B61(0-7) B64(0-7) B65(0-7) + const __m256i rhs_hbit_0145_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_0, m2), 4); //Index : 0 - 7 + const __m256i rhs_hbit_0145_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 2), m2), 4); //Index : 32 - 39 + const __m256i rhs_hbit_0145_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 4), m2), 4); //Index : 64 - 71 + const __m256i rhs_hbit_0145_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_0, 6), m2), 4); //Index : 96 - 103 - const __m256i rhs_hbit_2367_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_0, m2), 4); //B02(0-7) B03(0-7) B06(0-7) B07(0-7) - const __m256i rhs_hbit_2367_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 2), m2), 4); //B22(0-7) B23(0-7) B26(0-7) B27(0-7) - const __m256i rhs_hbit_2367_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 4), m2), 4); //B42(0-7) B43(0-7) B46(0-7) B47(0-7) - const __m256i rhs_hbit_2367_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 6), m2), 4); //B62(0-7) B63(0-7) B66(0-7) B67(0-7) + const __m256i rhs_hbit_2367_00 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_0, m2), 4); //Index : 0 - 7 + const __m256i rhs_hbit_2367_20 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 2), m2), 4); //Index : 32 - 39 + const __m256i rhs_hbit_2367_40 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 4), m2), 4); //Index : 64 - 71 + const __m256i rhs_hbit_2367_60 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_0, 6), m2), 4); //Index : 96 - 103 - const __m256i rhs_hbit_0145_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_1, m2), 4); //B00(8-15) B01(8-15) B04(8-15) B05(8-15) - const __m256i rhs_hbit_0145_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 2), m2), 4); //B20(8-15) B21(8-15) B24(8-15) B25(8-15) - const __m256i rhs_hbit_0145_41 = 
_mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 4), m2), 4); //B40(8-15) B41(8-15) B44(8-15) B45(8-15) - const __m256i rhs_hbit_0145_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 6), m2), 4); //B60(8-15) B61(8-15) B64(8-15) B65(8-15) + const __m256i rhs_hbit_0145_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_1, m2), 4); //Index : 8 - 15 + const __m256i rhs_hbit_0145_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 2), m2), 4); //Index : 40 - 47 + const __m256i rhs_hbit_0145_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 4), m2), 4); //Index : 72 - 79 + const __m256i rhs_hbit_0145_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_1, 6), m2), 4); //Index : 104 - 111 - const __m256i rhs_hbit_2367_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_1, m2), 4); //B02(8-15) B03(8-15) B06(8-15) B07(8-15) - const __m256i rhs_hbit_2367_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 2), m2), 4); //B22(8-15) B23(8-15) B26(8-15) B27(8-15) - const __m256i rhs_hbit_2367_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 4), m2), 4); //B42(8-15) B43(8-15) B46(8-15) B47(8-15) - const __m256i rhs_hbit_2367_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 6), m2), 4); //B62(8-15) B63(8-15) B66(8-15) B67(8-15) + const __m256i rhs_hbit_2367_01 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_1, m2), 4); //Index : 8 - 15 + const __m256i rhs_hbit_2367_21 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 2), m2), 4); //Index : 40 - 47 + const __m256i rhs_hbit_2367_41 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 4), m2), 4); //Index : 72 - 79 + const __m256i rhs_hbit_2367_61 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_1, 6), m2), 4); //Index : 104 - 111 // Values of the 1st,3rd,5th,7th sub blocks of eight block_q6_K structures for the sb loop - const __m256i rhs_hbit_0145_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_2, m2), 4); //B10(0-7) B11(0-7) B14(0-7) B15(0-7) - const __m256i rhs_hbit_0145_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 2), m2), 4); //B30(0-7) B31(0-7) B34(0-7) B35(0-7) - const __m256i rhs_hbit_0145_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 4), m2), 4); //B50(0-7) B51(0-7) B54(0-7) B55(0-7) - const __m256i rhs_hbit_0145_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 6), m2), 4); //B70(0-7) B71(0-7) B74(0-7) B75(0-7) + const __m256i rhs_hbit_0145_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_2, m2), 4); //Index : 16 - 23 + const __m256i rhs_hbit_0145_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 2), m2), 4); //Index : 48 - 55 + const __m256i rhs_hbit_0145_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 4), m2), 4); //Index : 80 - 87 + const __m256i rhs_hbit_0145_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_2, 6), m2), 4); //Index : 112 - 119 - const __m256i rhs_hbit_2367_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_2, m2), 4); //B12(0-7) B13(0-7) B16(0-7) B17(0-7) - const __m256i rhs_hbit_2367_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 2), m2), 4); //B32(0-7) B33(0-7) B36(0-7) B37(0-7) - 
const __m256i rhs_hbit_2367_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 4), m2), 4); //B52(0-7) B53(0-7) B56(0-7) B57(0-7) - const __m256i rhs_hbit_2367_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 6), m2), 4); //B72(0-7) B73(0-7) B76(0-7) B77(0-7) + const __m256i rhs_hbit_2367_10 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_2, m2), 4); //Index : 16 - 23 + const __m256i rhs_hbit_2367_30 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 2), m2), 4); //Index : 48 - 55 + const __m256i rhs_hbit_2367_50 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 4), m2), 4); //Index : 80 - 87 + const __m256i rhs_hbit_2367_70 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_2, 6), m2), 4); //Index : 112 - 119 - const __m256i rhs_hbit_0145_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_3, m2), 4); //B10(8-15) B11(8-15) B14(8-15) B15(8-15) - const __m256i rhs_hbit_0145_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 2), m2), 4); //B30(8-15) B31(8-15) B34(8-15) B35(8-15) - const __m256i rhs_hbit_0145_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 4), m2), 4); //B50(8-15) B51(8-15) B54(8-15) B55(8-15) - const __m256i rhs_hbit_0145_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 6), m2), 4); //B70(8-15) B71(8-15) B74(8-15) B75(8-15) + const __m256i rhs_hbit_0145_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_0145_3, m2), 4); //Index : 24 - 31 + const __m256i rhs_hbit_0145_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 2), m2), 4); //Index : 56 - 63 + const __m256i rhs_hbit_0145_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 4), m2), 4); //Index : 88 - 95 + const __m256i rhs_hbit_0145_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_0145_3, 6), m2), 4); //Index : 120 - 127 - const __m256i rhs_hbit_2367_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_3, m2), 4); //B12(8-15) B13(8-15) B16(8-15) B17(8-15) - const __m256i rhs_hbit_2367_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 2), m2), 4); //B32(8-15) B33(8-15) B36(8-15) B37(8-15) - const __m256i rhs_hbit_2367_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 4), m2), 4); //B52(8-15) B53(8-15) B56(8-15) B57(8-15) - const __m256i rhs_hbit_2367_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 6), m2), 4); //B72(8-15) B73(8-15) B76(8-15) B77(8-15) + const __m256i rhs_hbit_2367_11 = _mm256_slli_epi16(_mm256_and_si256(rhs_raw_hbit_2367_3, m2), 4); //Index : 24 - 31 + const __m256i rhs_hbit_2367_31 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 2), m2), 4); //Index : 56 - 63 + const __m256i rhs_hbit_2367_51 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 4), m2), 4); //Index : 88 - 95 + const __m256i rhs_hbit_2367_71 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_hbit_2367_3, 6), m2), 4); //Index : 120 - 127 // 0 -7, 64 - 71 const __m256i rhs_mat_0145_00 = _mm256_or_si256(_mm256_and_si256(rhs_raw_mat_0145_0, m4), rhs_hbit_0145_00);
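For reference, the bit manipulation in the hunks above is the standard q6_K unpacking: the low 4 bits of each weight come from ql (the m4 mask), the high 2 bits come from qh (the m2 mask, shifted left by 4 into bit positions 4-5), and the stored bias of 32 is subtracted later (the m32s vector). A minimal scalar sketch of the same reconstruction, assuming the plain block_q6_K layout rather than the interleaved block_q6_Kx8 ordering this patch operates on; the function name and signature are illustrative only:

#include <stdint.h>

// Scalar sketch: unpack one 128-value half of a q6_K super block.
// ql holds 64 bytes of packed low nibbles and qh holds 32 bytes of packed
// 2-bit high pairs for these 128 values; out receives signed 6-bit values.
static void q6k_unpack_half(const uint8_t * ql, const uint8_t * qh, int8_t * out) {
    for (int l = 0; l < 32; ++l) {
        // (ql & 0xF) mirrors the m4 mask; ((qh >> s) & 3) << 4 mirrors the
        // m2 mask plus the left shift by 4; the trailing -32 is the m32s offset.
        out[l +  0] = (int8_t)((ql[l]      & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
        out[l + 32] = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
        out[l + 64] = (int8_t)((ql[l]      >>  4) | (((qh[l] >> 4) & 3) << 4)) - 32;
        out[l + 96] = (int8_t)((ql[l + 32] >>  4) | (((qh[l] >> 6) & 3) << 4)) - 32;
    }
}

The shift amounts 0/2/4/6 select which 32-value group a qh byte contributes to, which is exactly what the //Index comments introduced by this patch annotate on the vector side (0 - 7 and 32/64/96 offsets for the even sub blocks, 16 - 23 and onward for the odd ones).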