Merge 7d0ad88bfa into 3688c4f504

This commit is contained in:
commit 84591a8fa2
@@ -42,6 +42,7 @@
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
+#define ggml_gemv_q5_K_8x4_q8_K_generic ggml_gemv_q5_K_8x4_q8_K
 #define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
 #define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0

@@ -54,6 +55,7 @@
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
+#define ggml_gemm_q5_K_8x4_q8_K_generic ggml_gemm_q5_K_8x4_q8_K
 #define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
 #define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0

@@ -75,6 +77,7 @@
 #define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
 #define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
+#define ggml_gemv_q5_K_8x4_q8_K_generic ggml_gemv_q5_K_8x4_q8_K
 #define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
 #define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0

@@ -83,6 +86,7 @@
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
+#define ggml_gemm_q5_K_8x4_q8_K_generic ggml_gemm_q5_K_8x4_q8_K
 #define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
 #define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0

@@ -106,6 +110,7 @@
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
+#define ggml_gemv_q5_K_8x4_q8_K_generic ggml_gemv_q5_K_8x4_q8_K
 #define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
 #define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0

@@ -118,6 +123,7 @@
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
+#define ggml_gemm_q5_K_8x4_q8_K_generic ggml_gemm_q5_K_8x4_q8_K
 #define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
 #define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0

@@ -142,6 +148,7 @@
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
+#define ggml_gemv_q5_K_8x4_q8_K_generic ggml_gemv_q5_K_8x4_q8_K
 #define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
 #define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0

@@ -154,6 +161,7 @@
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
+#define ggml_gemm_q5_K_8x4_q8_K_generic ggml_gemm_q5_K_8x4_q8_K
 #define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
 #define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0

@@ -185,6 +193,7 @@
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
+#define ggml_gemv_q5_K_8x4_q8_K_generic ggml_gemv_q5_K_8x4_q8_K
 #define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
 #define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0

@@ -196,6 +205,7 @@
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
+#define ggml_gemm_q5_K_8x4_q8_K_generic ggml_gemm_q5_K_8x4_q8_K
 #define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
 #define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0

@@ -226,6 +236,7 @@
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
+#define ggml_gemv_q5_K_8x4_q8_K_generic ggml_gemv_q5_K_8x4_q8_K
 #define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
 #define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0

@@ -238,6 +249,7 @@
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
+#define ggml_gemm_q5_K_8x4_q8_K_generic ggml_gemm_q5_K_8x4_q8_K
 #define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
 #define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0

@@ -270,6 +282,7 @@
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
 #define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
+#define ggml_gemv_q5_K_8x4_q8_K_generic ggml_gemv_q5_K_8x4_q8_K
 #define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
 #define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0

@@ -282,6 +295,7 @@
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
 #define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
+#define ggml_gemm_q5_K_8x4_q8_K_generic ggml_gemm_q5_K_8x4_q8_K
 #define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
 #define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
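The hunks above touch the per-architecture fallback headers: each one aliases the new ggml_gemv_q5_K_8x4_q8_K / ggml_gemm_q5_K_8x4_q8_K pair to their _generic counterparts on targets with no hand-written kernel. A minimal sketch of what such an alias achieves, using trimmed hypothetical names rather than the real ggml symbols (an assumption about the fallback mechanism, not part of the diff):

#include <cstdio>

// Hypothetical trimmed names; the real defines live in the per-arch fallback headers.
// The "_generic" implementation ends up compiled directly under the public symbol,
// so callers need no runtime dispatch on architectures without a specialised kernel.
#define gemv_q5_K_8x4_generic gemv_q5_K_8x4

static void gemv_q5_K_8x4_generic(int n) {  // after preprocessing this *is* gemv_q5_K_8x4
    std::printf("portable q5_K 8x4 gemv, n = %d\n", n);
}

int main() {
    gemv_q5_K_8x4(256);  // resolves to the generic body above
    return 0;
}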
@@ -785,6 +785,165 @@ void ggml_gemv_q4_K_8x8_q8_K(int n,
     ggml_gemv_q4_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
 }
 
+void ggml_gemv_q5_K_8x4_q8_K(int n,
+                             float * GGML_RESTRICT s,
+                             size_t bs,
+                             const void * GGML_RESTRICT vx,
+                             const void * GGML_RESTRICT vy,
+                             int nr,
+                             int nc) {
+    constexpr int qk = QK_K;
+    const int nb = n / qk;
+
+    constexpr int ncols_interleaved = 8;
+    constexpr int blocklen = 4;
+
+    assert(n % qk == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    constexpr int col_groups = ncols_interleaved / 4;  // 0123 and 4567
+    const uint8x16_t m4b = vdupq_n_u8(0x0f);
+    const uint8x16_t mone = vdupq_n_u8(1);
+    const uint8x16_t mtwo = vdupq_n_u8(2);
+
+    // 1x8 tile = 2 x 4
+    float32x4_t acc_f32[col_groups];
+
+    const block_q8_K * GGML_RESTRICT q8_ptr = (const block_q8_K *) vy;
+
+    for (int x = 0; x < nc / ncols_interleaved; x++) {
+        const block_q5_Kx8 * GGML_RESTRICT q5_ptr = (const block_q5_Kx8 *) vx + (x * nb);
+
+        for (int i = 0; i < col_groups; i++) {
+            acc_f32[i] = vdupq_n_f32(0);
+        }
+
+        for (int b = 0; b < nb; b++) {
+            float32x4_t q5_d_0 = vcvt_f32_f16(vld1_f16((const __fp16 *) q5_ptr[b].d));      // d0 d1 d2 d3
+            float32x4_t q5_d_1 = vcvt_f32_f16(vld1_f16((const __fp16 *) q5_ptr[b].d + 4));  // d4 d5 d6 d7
+            float32x4_t q8_d = vdupq_n_f32(q8_ptr[b].d);
+            float32x4_t sb_scale_0123 = vmulq_f32(q5_d_0, q8_d);
+            float32x4_t sb_scale_4567 = vmulq_f32(q5_d_1, q8_d);
+            float32x4_t q5_dmin_0 = vcvt_f32_f16(vld1_f16((const __fp16 *) q5_ptr[b].dmin));      // dmin 0..3
+            float32x4_t q5_dmin_1 = vcvt_f32_f16(vld1_f16((const __fp16 *) q5_ptr[b].dmin + 4));  // dmin 4..7
+            float32x4_t sb_min_0123 = vmulq_f32(q5_dmin_0, q8_d);
+            float32x4_t sb_min_4567 = vmulq_f32(q5_dmin_1, q8_d);
+
+            // interleaved bias_acc: [0]->r0 0123, [1]->r0 4567
+            int32x4_t bias_acc[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+            int32x4_t acc_lo[col_groups];
+            int32x4_t acc_hi[col_groups];
+
+            // Each bsum is 16 elements, pairwise add leaves us with the 8 bsums of the entire block
+            const int16x8_t bsums = vpaddq_s16(vld1q_s16(q8_ptr[b].bsums), vld1q_s16(q8_ptr[b].bsums + 8));
+            int16_t bsums_arr[8];
+            vst1q_s16(bsums_arr, bsums);
+
+            uint8x16_t qh[col_groups][8];
+            for (int c = 0; c < col_groups; c++) {
+                for (int i = 0; i < 8; i++) {
+                    qh[c][i] = vld1q_u8(q5_ptr[b].qh + i * 32 + 16 * c);
+                }
+            }
+
+            for (int sb = 0; sb < QK_K / 64; sb++) {
+                for (int i = 0; i < col_groups; i++) {
+                    acc_lo[i] = vdupq_n_s32(0);
+                    acc_hi[i] = vdupq_n_s32(0);
+                }
+                // Need scales for the low and high nibbles
+                // 2 * 12 = 24 bytes per subblock, 4 sbs -> 4 * 24 = 96 bytes total
+                int16x8_t q5sb_mins[2];
+                int16x8_t q5sb_scales[2];
+                for (int i = 0; i < 2; i++) {
+                    int8_t aux_q5sb[8];
+                    const int offset = sb * 24 + i * 12;
+                    decode_q_Kx8_6bit_scales(&q5_ptr[b].scales[offset], &q5sb_mins[i], aux_q5sb);
+                    q5sb_scales[i] = vmovl_s8(vld1_s8(aux_q5sb));
+                }
+
+                int8x16_t q8_qs[4];
+                for (int i = 0; i < 4; i++) {
+                    q8_qs[i] = vld1q_s8(q8_ptr[b].qs + sb * 64 + i * 16);
+                }
+
+                for (int c = 0; c < col_groups; c++) {
+                    uint8x16_t q5_cols[8];
+                    uint8x16_t hbit_lo[8];
+                    uint8x16_t hbit_hi[8];
+                    int8x16_t q5_lo[8];
+                    int8x16_t q5_hi[8];
+
+                    for (int i = 0; i < 8; i++) {
+                        q5_cols[i] = vld1q_u8(q5_ptr[b].qs + sb * QK_K + i * 32 + 16 * c);
+                        hbit_lo[i] = vandq_u8(qh[c][i], mone);
+                        hbit_hi[i] = vshlq_n_u8(vandq_u8(qh[c][i], mtwo), 3);
+                        qh[c][i] = vshrq_n_u8(qh[c][i], 2);
+                        q5_lo[i] = vreinterpretq_s8_u8(vsliq_n_u8(vandq_u8(q5_cols[i], m4b), hbit_lo[i], 4));
+                        q5_hi[i] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5_cols[i], 4), hbit_hi[i]));
+                    }
+
+                    acc_lo[c] = vdotq_laneq_s32(acc_lo[c], q5_lo[0], q8_qs[0], 0);
+                    acc_lo[c] = vdotq_laneq_s32(acc_lo[c], q5_lo[1], q8_qs[0], 1);
+                    acc_lo[c] = vdotq_laneq_s32(acc_lo[c], q5_lo[2], q8_qs[0], 2);
+                    acc_lo[c] = vdotq_laneq_s32(acc_lo[c], q5_lo[3], q8_qs[0], 3);
+                    acc_lo[c] = vdotq_laneq_s32(acc_lo[c], q5_lo[4], q8_qs[1], 0);
+                    acc_lo[c] = vdotq_laneq_s32(acc_lo[c], q5_lo[5], q8_qs[1], 1);
+                    acc_lo[c] = vdotq_laneq_s32(acc_lo[c], q5_lo[6], q8_qs[1], 2);
+                    acc_lo[c] = vdotq_laneq_s32(acc_lo[c], q5_lo[7], q8_qs[1], 3);
+
+                    acc_hi[c] = vdotq_laneq_s32(acc_hi[c], q5_hi[0], q8_qs[2], 0);
+                    acc_hi[c] = vdotq_laneq_s32(acc_hi[c], q5_hi[1], q8_qs[2], 1);
+                    acc_hi[c] = vdotq_laneq_s32(acc_hi[c], q5_hi[2], q8_qs[2], 2);
+                    acc_hi[c] = vdotq_laneq_s32(acc_hi[c], q5_hi[3], q8_qs[2], 3);
+                    acc_hi[c] = vdotq_laneq_s32(acc_hi[c], q5_hi[4], q8_qs[3], 0);
+                    acc_hi[c] = vdotq_laneq_s32(acc_hi[c], q5_hi[5], q8_qs[3], 1);
+                    acc_hi[c] = vdotq_laneq_s32(acc_hi[c], q5_hi[6], q8_qs[3], 2);
+                    acc_hi[c] = vdotq_laneq_s32(acc_hi[c], q5_hi[7], q8_qs[3], 3);
+                }
+
+                // Scales
+                // row c0123 blk0 and blk1
+                const int16x4_t sc_0123_lo = vget_low_s16(q5sb_scales[0]);
+                const int16x4_t sc_0123_hi = vget_low_s16(q5sb_scales[1]);
+                const float32x4_t sumf_0123 = vcvtq_f32_s32(vaddq_s32(vmulq_s32(vmovl_s16(sc_0123_lo), acc_lo[0]),
+                                                                      vmulq_s32(vmovl_s16(sc_0123_hi), acc_hi[0])));
+                acc_f32[0] = vfmaq_f32(acc_f32[0], sb_scale_0123, sumf_0123);
+                // row c4567 blk0 and blk1
+                const int16x4_t sc_4567_lo = vget_high_s16(q5sb_scales[0]);
+                const int16x4_t sc_4567_hi = vget_high_s16(q5sb_scales[1]);
+                const float32x4_t sumf_4567 = vcvtq_f32_s32(vaddq_s32(vmulq_s32(vmovl_s16(sc_4567_lo), acc_lo[1]),
+                                                                      vmulq_s32(vmovl_s16(sc_4567_hi), acc_hi[1])));
+                acc_f32[1] = vfmaq_f32(acc_f32[1], sb_scale_4567, sumf_4567);
+
+                // Bias Correction
+                const int16x4_t bsums_vec_lo = vdup_n_s16(bsums_arr[2 * sb + 0]);
+                const int16x4_t bsums_vec_hi = vdup_n_s16(bsums_arr[2 * sb + 1]);
+
+                bias_acc[0] = vmlal_s16(bias_acc[0], bsums_vec_lo, vget_low_s16(q5sb_mins[0]));
+                bias_acc[0] = vmlal_s16(bias_acc[0], bsums_vec_hi, vget_low_s16(q5sb_mins[1]));
+                bias_acc[1] = vmlal_s16(bias_acc[1], bsums_vec_lo, vget_high_s16(q5sb_mins[0]));
+                bias_acc[1] = vmlal_s16(bias_acc[1], bsums_vec_hi, vget_high_s16(q5sb_mins[1]));
+            }  // for sb
+
+            acc_f32[0] = vmlsq_f32(acc_f32[0], vcvtq_f32_s32(bias_acc[0]), sb_min_0123);
+            acc_f32[1] = vmlsq_f32(acc_f32[1], vcvtq_f32_s32(bias_acc[1]), sb_min_4567);
+        }  // for b
+
+        int base = x * ncols_interleaved;
+        vst1q_f32(s + base, acc_f32[0]);
+        vst1q_f32(s + base + 4, acc_f32[1]);
+    }  // for x
+    return;
+#endif  // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    ggml_gemv_q5_K_8x4_q8_K_generic(n, s, bs, vx, vy, nr, nc);
+}
+
 void ggml_gemv_q5_K_8x8_q8_K(int n,
                              float * GGML_RESTRICT s,
                              size_t bs,
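In the NEON kernel above, vsliq_n_u8 and vorrq_u8 splice one extra bit from qh on top of each packed 4-bit value, sixteen lanes at a time. A scalar sketch of the same reconstruction, mirroring the v0/v1 arithmetic of the generic implementation later in this diff (plain byte inputs are assumed here, not the interleaved block_q5_Kx8 layout):

#include <cstdint>
#include <cstdio>

// Scalar sketch of the 5-bit reconstruction: qs packs two 4-bit values per byte
// (low nibble -> element i, high nibble -> element i + 32) and qh supplies one
// extra high bit per element. Inputs are plain bytes, not the repacked layout.
static void unpack_q5_pair(uint8_t qs_byte, uint8_t qh_byte, int qh_shift,
                           int8_t * lo, int8_t * hi) {
    const uint8_t h0 = (qh_byte >> qh_shift) & 1;        // high bit of the "low" element
    const uint8_t h1 = (qh_byte >> (qh_shift + 1)) & 1;  // high bit of the "high" element
    *lo = (int8_t) ((qs_byte & 0x0F) | (h0 << 4));       // value in 0..31
    *hi = (int8_t) ((qs_byte >> 4) | (h1 << 4));         // value in 0..31
}

int main() {
    int8_t lo = 0, hi = 0;
    unpack_q5_pair(0xA7, 0x03, 0, &lo, &hi);  // low nibble 7 + bit -> 23, high nibble 10 + bit -> 26
    std::printf("lo = %d, hi = %d\n", lo, hi);
    return 0;
}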
@@ -3017,6 +3176,235 @@ void ggml_gemm_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     ggml_gemm_q4_K_8x4_q8_K_generic(n, s, bs, vx, vy, nr, nc);
 }
 
+void ggml_gemm_q5_K_8x4_q8_K(int n,
+                             float * GGML_RESTRICT s,
+                             size_t bs,
+                             const void * GGML_RESTRICT vx,
+                             const void * GGML_RESTRICT vy,
+                             int nr,
+                             int nc) {
+    constexpr int qk = QK_K;
+    const int nb = n / qk;
+
+    constexpr int ncols_interleaved = 8;
+    constexpr int blocklen = 4;
+
+    assert(n % qk == 0);
+    assert(nr % 4 == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    constexpr int q8_k_blocklen = 4;
+    constexpr int acc_size = 2 * 4;  // 2 row pairs, 4 col pairs
+    constexpr int col_groups = ncols_interleaved / 4;
+    const uint8x16_t m4b = vdupq_n_u8(0x0f);
+    const uint8x16_t mone = vdupq_n_u8(1);
+    const uint8x16_t mtwo = vdupq_n_u8(2);
+
+    // 8 accumulators: 2 row pairs, 4 col pairs
+    float32x4_t acc_f32[acc_size];
+
+    for (int y = 0; y < nr / q8_k_blocklen; y++) {
+        const block_q8_Kx4 * GGML_RESTRICT q8_ptr = (const block_q8_Kx4 *) vy + (y * nb);
+
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_q5_Kx8 * GGML_RESTRICT q5_ptr = (const block_q5_Kx8 *) vx + (x * nb);
+
+            for (int i = 0; i < acc_size; i++) {
+                acc_f32[i] = vdupq_n_f32(0);
+            }
+
+            for (int b = 0; b < nb; b++) {
+                // d5 0 1 2 3, 4 5 6 7
+                float32x4_t q5_d_0123 = vcvt_f32_f16(vld1_f16((const __fp16 *) q5_ptr[b].d));
+                float32x4_t q5_d_4567 = vcvt_f32_f16(vld1_f16((const __fp16 *) q5_ptr[b].d + 4));
+                // d8 0 1 2 3
+                float32x4_t q8_d_0123 = vld1q_f32(q8_ptr[b].d);
+                // mins
+                float32x4_t q5_dmin_0123 = vcvt_f32_f16(vld1_f16((const __fp16 *) q5_ptr[b].dmin));
+                float32x4_t q5_dmin_4567 = vcvt_f32_f16(vld1_f16((const __fp16 *) q5_ptr[b].dmin + 4));
+
+                // Precomputation of scales and mins
+                float32x4_t sbd_scale_0123[q8_k_blocklen];
+                float32x4_t sbd_scale_4567[q8_k_blocklen];
+                float32x4_t sbd_min_0123[q8_k_blocklen];
+                float32x4_t sbd_min_4567[q8_k_blocklen];
+
+                sbd_scale_0123[0] = vmulq_laneq_f32(q5_d_0123, q8_d_0123, 0);
+                sbd_scale_4567[0] = vmulq_laneq_f32(q5_d_4567, q8_d_0123, 0);
+                sbd_min_0123[0] = vmulq_laneq_f32(q5_dmin_0123, q8_d_0123, 0);
+                sbd_min_4567[0] = vmulq_laneq_f32(q5_dmin_4567, q8_d_0123, 0);
+
+                sbd_scale_0123[1] = vmulq_laneq_f32(q5_d_0123, q8_d_0123, 1);
+                sbd_scale_4567[1] = vmulq_laneq_f32(q5_d_4567, q8_d_0123, 1);
+                sbd_min_0123[1] = vmulq_laneq_f32(q5_dmin_0123, q8_d_0123, 1);
+                sbd_min_4567[1] = vmulq_laneq_f32(q5_dmin_4567, q8_d_0123, 1);
+
+                sbd_scale_0123[2] = vmulq_laneq_f32(q5_d_0123, q8_d_0123, 2);
+                sbd_scale_4567[2] = vmulq_laneq_f32(q5_d_4567, q8_d_0123, 2);
+                sbd_min_0123[2] = vmulq_laneq_f32(q5_dmin_0123, q8_d_0123, 2);
+                sbd_min_4567[2] = vmulq_laneq_f32(q5_dmin_4567, q8_d_0123, 2);
+
+                sbd_scale_0123[3] = vmulq_laneq_f32(q5_d_0123, q8_d_0123, 3);
+                sbd_scale_4567[3] = vmulq_laneq_f32(q5_d_4567, q8_d_0123, 3);
+                sbd_min_0123[3] = vmulq_laneq_f32(q5_dmin_0123, q8_d_0123, 3);
+                sbd_min_4567[3] = vmulq_laneq_f32(q5_dmin_4567, q8_d_0123, 3);
+
+                // Precomputation of bsums, each vpaddq calcs all the bsums for each row
+                const int16x8_t bsums[q8_k_blocklen] = {
+                    vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 0), vld1q_s16(q8_ptr[b].bsums + 16 * 0 + 8)),
+                    vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 1), vld1q_s16(q8_ptr[b].bsums + 16 * 1 + 8)),
+                    vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 2), vld1q_s16(q8_ptr[b].bsums + 16 * 2 + 8)),
+                    vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 3), vld1q_s16(q8_ptr[b].bsums + 16 * 3 + 8)),
+                };
+                int16_t bsums_arr[QK_K / 64][8];
+                for (int q8_row = 0; q8_row < 4; q8_row++) {
+                    vst1q_s16(bsums_arr[q8_row], bsums[q8_row]);
+                }
+
+                // interleaved bias_acc: [0]->r0 c0123, [1]->r0 c4567, [2]->r1 c0123, [3]->r1 c4567, ..
+                int32x4_t bias_acc[acc_size];
+                for (int i = 0; i < acc_size; i++) {
+                    bias_acc[i] = vdupq_n_s32(0);
+                }
+
+                uint8x16_t qh[col_groups][8];
+                for (int c = 0; c < col_groups; c++) {
+                    for (int i = 0; i < 8; i++) {
+                        qh[c][i] = vld1q_u8(q5_ptr[b].qh + i * 32 + 16 * c);
+                    }
+                }
+
+                for (int sb = 0; sb < QK_K / 64; sb++) {
+                    // Int accumulators for qs vecdot (4 row * 2 col quartets)
+                    int32x4_t acc_lo[acc_size];
+                    int32x4_t acc_hi[acc_size];
+                    for (int i = 0; i < acc_size; i++) {
+                        acc_lo[i] = vdupq_n_s32(0);
+                        acc_hi[i] = vdupq_n_s32(0);
+                    }
+                    // Need scales for the low and high nibbles
+                    // 2 * 12 = 24 bytes per subblock, 4 sbs -> 4 * 24 = 96 bytes total
+                    int16x8_t q5sb_scales[2];
+                    int16x8_t q5sb_mins[2];
+                    for (int i = 0; i < 2; i++) {
+                        int8_t aux_q5sb[8];
+                        const int offset = sb * 24 + i * 12;
+                        decode_q_Kx8_6bit_scales(&q5_ptr[b].scales[offset], &q5sb_mins[i], aux_q5sb);
+                        q5sb_scales[i] = vmovl_s8(vld1_s8(aux_q5sb));
+                    }
+
+                    constexpr int reads_per_sb = 8;  // 8 * 16 bytes each => 32 qs * 4 rows
+                    for (int k = 0; k < reads_per_sb; k++) {
+                        const int8x16_t q8_blk0 = vld1q_s8(q8_ptr[b].qs + sb * 256 + 16 * k);
+                        const int8x16_t q8_blk1 = vld1q_s8(q8_ptr[b].qs + sb * 256 + 16 * k + 128);
+
+                        // 0..3 & 32..35
+                        const uint8x16_t q5_0123 = vld1q_u8(q5_ptr[b].qs + sb * QK_K + 32 * k);
+                        const uint8x16_t q5_4567 = vld1q_u8(q5_ptr[b].qs + sb * QK_K + 32 * k + 16);
+
+                        // NOTE: This is the only difference with q4_K
+                        const uint8x16_t hbit_lo_0123 = vandq_u8(qh[0][k], mone);
+                        const uint8x16_t hbit_hi_0123 = vshlq_n_u8(vandq_u8(qh[0][k], mtwo), 3);
+                        qh[0][k] = vshrq_n_u8(qh[0][k], 2);
+                        const uint8x16_t hbit_lo_4567 = vandq_u8(qh[1][k], mone);
+                        const uint8x16_t hbit_hi_4567 = vshlq_n_u8(vandq_u8(qh[1][k], mtwo), 3);
+                        qh[1][k] = vshrq_n_u8(qh[1][k], 2);
+                        // From here, same as q4_K
+
+                        const int8x16_t q5_0123_lo =
+                            vreinterpretq_s8_u8(vsliq_n_u8(vandq_u8(q5_0123, m4b), hbit_lo_0123, 4));
+                        const int8x16_t q5_0123_hi =
+                            vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5_0123, 4), hbit_hi_0123));
+
+                        acc_lo[0] = vdotq_laneq_s32(acc_lo[0], q5_0123_lo, q8_blk0, 0);  // 0..3 r0 c0123
+                        acc_lo[1] = vdotq_laneq_s32(acc_lo[1], q5_0123_lo, q8_blk0, 1);  // 0..3 r1 c0123
+                        acc_lo[2] = vdotq_laneq_s32(acc_lo[2], q5_0123_lo, q8_blk0, 2);  // 0..3 r2 c0123
+                        acc_lo[3] = vdotq_laneq_s32(acc_lo[3], q5_0123_lo, q8_blk0, 3);  // 0..3 r3 c0123
+
+                        acc_hi[0] = vdotq_laneq_s32(acc_hi[0], q5_0123_hi, q8_blk1, 0);  // 32..35 r0 c0123
+                        acc_hi[1] = vdotq_laneq_s32(acc_hi[1], q5_0123_hi, q8_blk1, 1);  // 32..35 r1 c0123
+                        acc_hi[2] = vdotq_laneq_s32(acc_hi[2], q5_0123_hi, q8_blk1, 2);  // 32..35 r2 c0123
+                        acc_hi[3] = vdotq_laneq_s32(acc_hi[3], q5_0123_hi, q8_blk1, 3);  // 32..35 r3 c0123
+
+                        const int8x16_t q5_4567_lo =
+                            vreinterpretq_s8_u8(vsliq_n_u8(vandq_u8(q5_4567, m4b), hbit_lo_4567, 4));
+                        const int8x16_t q5_4567_hi =
+                            vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5_4567, 4), hbit_hi_4567));
+
+                        acc_lo[4] = vdotq_laneq_s32(acc_lo[4], q5_4567_lo, q8_blk0, 0);  // 0..3 r0 c4567
+                        acc_lo[5] = vdotq_laneq_s32(acc_lo[5], q5_4567_lo, q8_blk0, 1);  // 0..3 r1 c4567
+                        acc_lo[6] = vdotq_laneq_s32(acc_lo[6], q5_4567_lo, q8_blk0, 2);  // 0..3 r2 c4567
+                        acc_lo[7] = vdotq_laneq_s32(acc_lo[7], q5_4567_lo, q8_blk0, 3);  // 0..3 r3 c4567
+
+                        acc_hi[4] = vdotq_laneq_s32(acc_hi[4], q5_4567_hi, q8_blk1, 0);  // 32..35 r0 c4567
+                        acc_hi[5] = vdotq_laneq_s32(acc_hi[5], q5_4567_hi, q8_blk1, 1);  // 32..35 r1 c4567
+                        acc_hi[6] = vdotq_laneq_s32(acc_hi[6], q5_4567_hi, q8_blk1, 2);  // 32..35 r2 c4567
+                        acc_hi[7] = vdotq_laneq_s32(acc_hi[7], q5_4567_hi, q8_blk1, 3);  // 32..35 r3 c4567
+                    }
+
+                    // Scale and bias application
+                    // acc is stored interleaved to match output layout
+                    const int16x4_t sc_0123_lo = vget_low_s16(q5sb_scales[0]);
+                    const int16x4_t sc_4567_lo = vget_high_s16(q5sb_scales[0]);
+                    const int16x4_t sc_0123_hi = vget_low_s16(q5sb_scales[1]);
+                    const int16x4_t sc_4567_hi = vget_high_s16(q5sb_scales[1]);
+                    for (int row = 0; row < q8_k_blocklen; row++) {
+                        // Bias correction
+                        // row c0123 blk0 and blk1
+                        const float32x4_t sumf_0123 =
+                            vcvtq_f32_s32(vaddq_s32(vmulq_s32(vmovl_s16(sc_0123_lo), acc_lo[row]),
+                                                    vmulq_s32(vmovl_s16(sc_0123_hi), acc_hi[row])));
+                        acc_f32[2 * row] = vfmaq_f32(acc_f32[2 * row], sbd_scale_0123[row], sumf_0123);
+
+                        // row c4567 blk0 and blk1
+                        const float32x4_t sumf_4567 =
+                            vcvtq_f32_s32(vaddq_s32(vmulq_s32(vmovl_s16(sc_4567_lo), acc_lo[row + 4]),
+                                                    vmulq_s32(vmovl_s16(sc_4567_hi), acc_hi[row + 4])));
+                        acc_f32[2 * row + 1] = vfmaq_f32(acc_f32[2 * row + 1], sbd_scale_4567[row], sumf_4567);
+
+                        // Bias
+                        const int16x4_t bsums_vec_lo = vdup_n_s16(bsums_arr[sb][row * 2]);
+                        const int16x4_t bsums_vec_hi = vdup_n_s16(bsums_arr[sb][row * 2 + 1]);
+
+                        // row c0123 blk0 and blk1
+                        bias_acc[2 * row] = vmlal_s16(bias_acc[2 * row], bsums_vec_lo, vget_low_s16(q5sb_mins[0]));
+                        bias_acc[2 * row] = vmlal_s16(bias_acc[2 * row], bsums_vec_hi, vget_low_s16(q5sb_mins[1]));
+
+                        // row c4567 blk0 and blk1
+                        bias_acc[2 * row + 1] =
+                            vmlal_s16(bias_acc[2 * row + 1], bsums_vec_lo, vget_high_s16(q5sb_mins[0]));
+                        bias_acc[2 * row + 1] =
+                            vmlal_s16(bias_acc[2 * row + 1], bsums_vec_hi, vget_high_s16(q5sb_mins[1]));
+                    }
+                }  // for sb
+
+                for (int row = 0; row < q8_k_blocklen; row++) {
+                    acc_f32[2 * row] = vmlsq_f32(acc_f32[2 * row], vcvtq_f32_s32(bias_acc[2 * row]), sbd_min_0123[row]);
+                    acc_f32[2 * row + 1] =
+                        vmlsq_f32(acc_f32[2 * row + 1], vcvtq_f32_s32(bias_acc[2 * row + 1]), sbd_min_4567[row]);
+                }
+            }  // for b
+
+            for (int i = 0; i < q8_k_blocklen; i++) {
+                int row = y * q8_k_blocklen + i;
+                for (int j = 0; j < 2; j++) {
+                    int col = x * ncols_interleaved + j * 4;
+                    int offset = row * bs + col;
+                    vst1q_f32(s + offset, acc_f32[2 * i + j]);
+                }
+            }
+        }  // for x
+    }  // for y
+    return;
+#endif  // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    ggml_gemm_q5_K_8x4_q8_K_generic(n, s, bs, vx, vy, nr, nc);
+}
+
 void ggml_gemm_q4_K_8x8_q8_K(int n,
                              float * GGML_RESTRICT s,
                              size_t bs,
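Both NEON kernels fold the Q5_K super-block scales the same way: per 32-element subblock the contribution is a scaled integer dot product minus a min/bias term built from the q8 block sums (the bsums/bias_acc and vmlsq_f32 steps above). A scalar reference of that accumulation, a sketch over flat arrays rather than the interleaved block_q5_Kx8 / block_q8_Kx4 layout:

#include <cstdint>

// Scalar reference for one 32-element Q5_K subblock against one q8_K row:
//   acc += (d5 * d8) * scale_sb * sum_i(q5[i] * q8[i])
//   acc -= (dmin5 * d8) * min_sb * sum_i(q8[i])     // the bsums / bias_acc term
// q5 holds the reconstructed 0..31 integers; arrays are flat for illustration only.
static float q5_K_subblock_dot(const int8_t * q5, const int8_t * q8, int n,
                               float d5, float dmin5, float d8,
                               int scale_sb, int min_sb) {
    int dot = 0;
    int bsum = 0;
    for (int i = 0; i < n; ++i) {
        dot += q5[i] * q8[i];
        bsum += q8[i];
    }
    return d5 * d8 * (float) (scale_sb * dot) - dmin5 * d8 * (float) (min_sb * bsum);
}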
@@ -256,6 +256,207 @@ template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_K>(const float * GGML_RESTR
     ggml_quantize_mat_q8_K_4x8(x, vy, n_per_row);
 }
 
+// Template implementation for Q5_K gemv with parametrized blocklen
+template <int N, int M>
+static void ggml_gemv_q5_K_NxM_q8_K_generic_impl(int n,
+                                                 float * GGML_RESTRICT s,
+                                                 size_t bs,
+                                                 const void * GGML_RESTRICT vx,
+                                                 const void * GGML_RESTRICT vy,
+                                                 int nr,
+                                                 int nc) {
+    constexpr int ncols_interleaved = N;
+    constexpr int blocklen = M;
+    const int qk = QK_K;
+    const int nb = n / qk;
+    static const uint32_t kmask1 = 0x3f3f3f3f;
+    static const uint32_t kmask2 = 0x0f0f0f0f;
+    static const uint32_t kmask3 = 0x03030303;
+
+    assert(n % qk == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    UNUSED(bs);
+    UNUSED(nr);
+
+    float sumf[8];
+    float sum_minf[8];
+    uint32_t utmp[32];
+    int sumi1;
+    int sumi2;
+    int sumi;
+
+    const block_q8_K * a_ptr = (const block_q8_K *) vy;
+    for (int x = 0; x < nc / ncols_interleaved; x++) {
+        const block_q5_Kx8 * b_ptr = (const block_q5_Kx8 *) vx + (x * nb);
+
+        for (int j = 0; j < ncols_interleaved; j++) {
+            sumf[j] = 0.0;
+            sum_minf[j] = 0.0;
+        }
+        for (int l = 0; l < nb; l++) {
+            for (int sb = 0; sb < 8; sb++) {
+                memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
+                utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
+                const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
+                utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
+                utmp[sb * 4 + 2] = uaux_0;
+                utmp[sb * 4 + 0] &= kmask1;
+            }
+            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                uint8_t * scales_0 = (uint8_t *) utmp + (k / (32 / blocklen)) * 32;
+                uint8_t * scales_1 = (uint8_t *) utmp + (k / (32 / blocklen)) * 32 + 16;
+
+                const int qh_shift = (k / (32 / blocklen)) * 2;
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumi1 = 0;
+                    sumi2 = 0;
+                    sumi = 0;
+                    for (int i = 0; i < blocklen; ++i) {
+                        const int b_qs_offset = k * ncols_interleaved * blocklen + j * blocklen + i;
+
+                        const int qh_idx = (k * blocklen + i) % 32;
+                        const int qh_chunk = qh_idx / blocklen;
+                        const int qh_pos = qh_idx % blocklen;
+                        const int b_qh_offset = qh_chunk * (blocklen * ncols_interleaved) + j * blocklen + qh_pos;
+
+                        const uint8_t qh_val = b_ptr[l].qh[b_qh_offset];
+                        const uint8_t h0 = (qh_val >> qh_shift) & 1;
+                        const uint8_t h1 = (qh_val >> (qh_shift + 1)) & 1;
+
+                        const int v0 = (int8_t) ((b_ptr[l].qs[b_qs_offset] & 0xF) | (h0 << 4));
+                        const int v1 = (int8_t) ((b_ptr[l].qs[b_qs_offset] >> 4) | (h1 << 4));
+
+                        const int q8_offset = (k / (32 / blocklen)) * 64 + (k % (32 / blocklen)) * blocklen + i;
+
+                        sumi1 = (v0 * a_ptr[l].qs[q8_offset]);
+                        sumi2 = (v1 * a_ptr[l].qs[q8_offset + 32]);
+                        sumi1 = sumi1 * scales_0[j];
+                        sumi2 = sumi2 * scales_1[j];
+                        sumi += sumi1 + sumi2;
+                    }
+                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
+                }
+            }
+            for (int sb = 0; sb < 8; sb++) {
+                uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) *
+                                   GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
+                }
+            }
+        }
+        for (int j = 0; j < ncols_interleaved; j++) {
+            s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
+        }
+    }
+}
+
+template <int N, int M>
+static void ggml_gemm_q5_K_NxM_q8_K_generic_impl(int n,
+                                                 float * GGML_RESTRICT s,
+                                                 size_t bs,
+                                                 const void * GGML_RESTRICT vx,
+                                                 const void * GGML_RESTRICT vy,
+                                                 int nr,
+                                                 int nc) {
+    constexpr int ncols_interleaved = N;
+    constexpr int blocklen = M;
+    const int qk = QK_K;
+    const int nb = n / qk;
+    static const uint32_t kmask1 = 0x3f3f3f3f;
+    static const uint32_t kmask2 = 0x0f0f0f0f;
+    static const uint32_t kmask3 = 0x03030303;
+
+    assert(n % qk == 0);
+    assert(nr % 4 == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    float sumf[4][8];
+    float sum_minf[4][8];
+    uint32_t utmp[32];
+    int sumi1;
+    int sumi2;
+    int sumi;
+
+    for (int y = 0; y < nr / 4; y++) {
+        const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_q5_Kx8 * b_ptr = (const block_q5_Kx8 *) vx + (x * nb);
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumf[m][j] = 0.0;
+                    sum_minf[m][j] = 0.0;
+                }
+            }
+            for (int l = 0; l < nb; l++) {
+                for (int sb = 0; sb < 8; sb++) {
+                    memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
+                    utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
+                    const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
+                    utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
+                    utmp[sb * 4 + 2] = uaux_0;
+                    utmp[sb * 4 + 0] &= kmask1;
+                }
+                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                    uint8_t * scales_0 = (uint8_t *) utmp + (k / (32 / blocklen)) * 32;
+                    uint8_t * scales_1 = (uint8_t *) utmp + (k / (32 / blocklen)) * 32 + 16;
+
+                    const int qh_shift = (k / (32 / blocklen)) * 2;
+                    for (int m = 0; m < 4; m++) {
+                        for (int j = 0; j < ncols_interleaved; j++) {
+                            sumi1 = 0;
+                            sumi2 = 0;
+                            sumi = 0;
+                            for (int i = 0; i < blocklen; ++i) {
+                                const int b_qs_offset = k * ncols_interleaved * blocklen + j * blocklen + i;
+
+                                const int qh_idx = (k * blocklen + i) % 32;
+                                const int qh_chunk = qh_idx / blocklen;
+                                const int qh_pos = qh_idx % blocklen;
+                                const int b_qh_offset =
+                                    qh_chunk * (blocklen * ncols_interleaved) + j * blocklen + qh_pos;
+
+                                const uint8_t qh_val = b_ptr[l].qh[b_qh_offset];
+                                const uint8_t h0 = (qh_val >> qh_shift) & 1;
+                                const uint8_t h1 = (qh_val >> (qh_shift + 1)) & 1;
+
+                                const int v0 = (int8_t) ((b_ptr[l].qs[b_qs_offset] & 0xF) | (h0 << 4));
+                                const int v1 = (int8_t) ((b_ptr[l].qs[b_qs_offset] >> 4) | (h1 << 4));
+
+                                const int q8_offset = (k / (32 / blocklen)) * 256 +
+                                                      (k % (32 / blocklen)) * 4 * blocklen + m * blocklen + i;
+
+                                sumi1 = (v0 * a_ptr[l].qs[q8_offset]);
+                                sumi2 = (v1 * a_ptr[l].qs[q8_offset + 128]);
+                                sumi1 = sumi1 * scales_0[j];
+                                sumi2 = sumi2 * scales_1[j];
+                                sumi += sumi1 + sumi2;
+                            }
+                            sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
+                        }
+                    }
+                }
+                for (int sb = 0; sb < 8; sb++) {
+                    uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
+                    for (int m = 0; m < 4; m++) {
+                        const int16_t * bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
+                        for (int j = 0; j < ncols_interleaved; j++) {
+                            sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) *
+                                              GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
+                        }
+                    }
+                }
+            }
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
+                }
+            }
+        }
+    }
+}
+
 extern "C" {
 
 void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
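Further down in this diff the old hand-written generic bodies collapse into thin wrappers around these two templates; the pattern is one explicit instantiation per interleave width. A sketch of the gemv wrappers, matching the declarations added below (the gemm wrappers are analogous):

// N = 8 interleaved columns, M = 4 or 8 byte interleave block length.
void ggml_gemv_q5_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
                                     const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy,
                                     int nr, int nc) {
    ggml_gemv_q5_K_NxM_q8_K_generic_impl<8, 4>(n, s, bs, vx, vy, nr, nc);
}

void ggml_gemv_q5_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
                                     const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy,
                                     int nr, int nc) {
    ggml_gemv_q5_K_NxM_q8_K_generic_impl<8, 8>(n, s, bs, vx, vy, nr, nc);
}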
@@ -609,98 +810,12 @@ void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
     }
 }
 
-void ggml_gemv_q5_K_8x8_q8_K_generic(int n,
-                                     float * GGML_RESTRICT s,
-                                     size_t bs,
-                                     const void * GGML_RESTRICT vx,
-                                     const void * GGML_RESTRICT vy,
-                                     int nr,
-                                     int nc) {
-    const int qk = QK_K;
-    const int nb = n / qk;
-    const int ncols_interleaved = 8;
-    const int blocklen = 8;
-    static const uint32_t kmask1 = 0x3f3f3f3f;
-    static const uint32_t kmask2 = 0x0f0f0f0f;
-    static const uint32_t kmask3 = 0x03030303;
-
-    assert(n % qk == 0);
-    assert(nc % ncols_interleaved == 0);
-
-    UNUSED(bs);
-    UNUSED(nr);
-
-    float sumf[8];
-    float sum_minf[8];
-    uint32_t utmp[32];
-    int sumi1;
-    int sumi2;
-    int sumi;
-
-    const block_q8_K * a_ptr = (const block_q8_K *) vy;
-    for (int x = 0; x < nc / ncols_interleaved; x++) {
-        const block_q5_Kx8 * b_ptr = (const block_q5_Kx8 *) vx + (x * nb);
-
-        for (int j = 0; j < ncols_interleaved; j++) {
-            sumf[j] = 0.0;
-            sum_minf[j] = 0.0;
-        }
-        for (int l = 0; l < nb; l++) {
-            for (int sb = 0; sb < 8; sb++) {
-                memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
-                utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
-                const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
-                utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
-                utmp[sb * 4 + 2] = uaux_0;
-                utmp[sb * 4 + 0] &= kmask1;
-            }
-            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                uint8_t * scales_0 = (uint8_t *) utmp + (k / 4) * 32;
-                uint8_t * scales_1 = (uint8_t *) utmp + (k / 4) * 32 + 16;
-
-                const int qh_shift = (k / 4) * 2;
-                for (int j = 0; j < ncols_interleaved; j++) {
-                    sumi1 = 0;
-                    sumi2 = 0;
-                    sumi = 0;
-                    for (int i = 0; i < blocklen; ++i) {
-                        const int b_qs_offset = k * ncols_interleaved * blocklen + j * blocklen + i;
-
-                        const int qh_idx = (k * 8 + i) % 32;
-                        const int qh_chunk = qh_idx / 8;
-                        const int qh_pos = qh_idx % 8;
-                        const int b_qh_offset = qh_chunk * 64 + j * 8 + qh_pos;
-
-                        const uint8_t qh_val = b_ptr[l].qh[b_qh_offset];
-                        const uint8_t h0 = (qh_val >> qh_shift) & 1;
-                        const uint8_t h1 = (qh_val >> (qh_shift + 1)) & 1;
-
-                        const int v0 = (int8_t) ((b_ptr[l].qs[b_qs_offset] & 0xF) | (h0 << 4));
-                        const int v1 = (int8_t) ((b_ptr[l].qs[b_qs_offset] >> 4) | (h1 << 4));
-
-                        const int q8_offset = (k >> 2) * 64 + (k % 4) * blocklen + i;
-
-                        sumi1 = (v0 * a_ptr[l].qs[q8_offset]);
-                        sumi2 = (v1 * a_ptr[l].qs[q8_offset + 32]);
-                        sumi1 = sumi1 * scales_0[j];
-                        sumi2 = sumi2 * scales_1[j];
-                        sumi += sumi1 + sumi2;
-                    }
-                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
-                }
-            }
-            for (int sb = 0; sb < 8; sb++) {
-                uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
-                for (int j = 0; j < ncols_interleaved; j++) {
-                    sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) *
-                                   GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
-                }
-            }
-        }
-        for (int j = 0; j < ncols_interleaved; j++) {
-            s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
-        }
-    }
+void ggml_gemv_q5_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    ggml_gemv_q5_K_NxM_q8_K_generic_impl<8, 4>(n, s, bs, vx, vy, nr, nc);
+}
+
+void ggml_gemv_q5_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    ggml_gemv_q5_K_NxM_q8_K_generic_impl<8, 8>(n, s, bs, vx, vy, nr, nc);
 }
@@ -1382,107 +1497,12 @@ void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
     }
 }
 
-void ggml_gemm_q5_K_8x8_q8_K_generic(int n,
-                                     float * GGML_RESTRICT s,
-                                     size_t bs,
-                                     const void * GGML_RESTRICT vx,
-                                     const void * GGML_RESTRICT vy,
-                                     int nr,
-                                     int nc) {
-    const int qk = QK_K;
-    const int nb = n / qk;
-    const int ncols_interleaved = 8;
-    const int blocklen = 8;
-
-    constexpr uint32_t kmask1 = 0x3f3f3f3f;
-    constexpr uint32_t kmask2 = 0x0f0f0f0f;
-    constexpr uint32_t kmask3 = 0x03030303;
-
-    assert(n % qk == 0);
-    assert(nr % 4 == 0);
-    assert(nc % ncols_interleaved == 0);
-
-    float sumf[4][8];
-    float sum_minf[4][8];
-    uint32_t utmp[32];
-    int sumi1;
-    int sumi2;
-    int sumi;
-
-    for (int y = 0; y < nr / 4; y++) {
-        const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
-        for (int x = 0; x < nc / ncols_interleaved; x++) {
-            const block_q5_Kx8 * b_ptr = (const block_q5_Kx8 *) vx + (x * nb);
-            for (int m = 0; m < 4; m++) {
-                for (int j = 0; j < ncols_interleaved; j++) {
-                    sumf[m][j] = 0.0;
-                    sum_minf[m][j] = 0.0;
-                }
-            }
-            for (int l = 0; l < nb; l++) {
-                for (int sb = 0; sb < 8; sb++) {
-                    memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
-                    utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
-                    const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
-                    utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
-                    utmp[sb * 4 + 2] = uaux_0;
-                    utmp[sb * 4 + 0] &= kmask1;
-                }
-                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                    uint8_t * scales_0 = (uint8_t *) utmp + (k / 4) * 32;
-                    uint8_t * scales_1 = (uint8_t *) utmp + (k / 4) * 32 + 16;
-
-                    const int qh_shift = (k / 4) * 2;
-                    for (int m = 0; m < 4; m++) {
-                        for (int j = 0; j < ncols_interleaved; j++) {
-                            sumi1 = 0;
-                            sumi2 = 0;
-                            sumi = 0;
-                            for (int i = 0; i < blocklen; ++i) {
-                                const int b_qs_offset = k * ncols_interleaved * blocklen + j * blocklen + i;
-
-                                const int qh_idx = (k * 8 + i) % 32;
-                                const int qh_chunk = qh_idx / 8;
-                                const int qh_pos = qh_idx % 8;
-                                const int b_qh_offset = qh_chunk * 64 + j * 8 + qh_pos;
-
-                                const uint8_t qh_val = b_ptr[l].qh[b_qh_offset];
-                                const uint8_t h0 = (qh_val >> qh_shift) & 1;
-                                const uint8_t h1 = (qh_val >> (qh_shift + 1)) & 1;
-
-                                const int v0 = (int8_t) ((b_ptr[l].qs[b_qs_offset] & 0xF) | (h0 << 4));
-                                const int v1 = (int8_t) ((b_ptr[l].qs[b_qs_offset] >> 4) | (h1 << 4));
-
-                                const int q8_offset = (k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i;
-
-                                sumi1 = (v0 * a_ptr[l].qs[q8_offset]);
-                                sumi2 = (v1 * a_ptr[l].qs[q8_offset + 128]);
-                                sumi1 = sumi1 * scales_0[j];
-                                sumi2 = sumi2 * scales_1[j];
-                                sumi += sumi1 + sumi2;
-                            }
-                            sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
-                        }
-                    }
-                }
-                for (int sb = 0; sb < 8; sb++) {
-                    uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
-                    for (int m = 0; m < 4; m++) {
-                        const int16_t * bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
-                        for (int j = 0; j < ncols_interleaved; j++) {
-                            sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) *
-                                              GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
-                        }
-                    }
-                }
-            }
-            for (int m = 0; m < 4; m++) {
-                for (int j = 0; j < ncols_interleaved; j++) {
-                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
-                }
-            }
-        }
-    }
+void ggml_gemm_q5_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    ggml_gemm_q5_K_NxM_q8_K_generic_impl<8, 4>(n, s, bs, vx, vy, nr, nc);
+}
+
+void ggml_gemm_q5_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    ggml_gemm_q5_K_NxM_q8_K_generic_impl<8, 8>(n, s, bs, vx, vy, nr, nc);
 }
 
 void ggml_gemm_q6_K_8x8_q8_K_generic(int n,
@@ -2013,18 +2033,16 @@ static block_q5_Kx8 make_block_q5_Kx8(block_q5_K * in, unsigned int blck_size_in
     const int end = QK_K * 4 / blck_size_interleave;
 
-    // Interleave Q5_K quants by taking 8 bytes at a time
+    // Interleave Q5_K quants by taking blck_size_interleave bytes at a time
     for (int i = 0; i < end; ++i) {
         int src_id = i % 8;
         int src_offset = (i / 8) * blck_size_interleave;
         int dst_offset = i * blck_size_interleave;
 
-        uint64_t elems;
-        memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
-        memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
+        memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], blck_size_interleave);
     }
 
-    // Repeat for low bits 8 bytes at a time as well, since
+    // Repeat for high bits with the same chunk size, since
     // the high bits are interleaved in Q5_K and the index is
     // qh_idx = (qs_idx % 32);
     // qh_val = qh[qh_idx] >> (qs_idx / 32);
@@ -2033,9 +2051,7 @@ static block_q5_Kx8 make_block_q5_Kx8(block_q5_K * in, unsigned int blck_size_in
         int src_offset = (i / 8) * blck_size_interleave;
         int dst_offset = i * blck_size_interleave;
 
-        uint64_t elems;
-        memcpy(&elems, &in[src_id].qh[src_offset], sizeof(uint64_t));
-        memcpy(&out.qh[dst_offset], &elems, sizeof(uint64_t));
+        memcpy(&out.qh[dst_offset], &in[src_id].qh[src_offset], blck_size_interleave);
     }
 
     // The below logic is copied over from Q4_K
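With the copy length parametrized, the same loop serves both the 4-byte and 8-byte interleave. A small worked example of the index arithmetic for blck_size_interleave = 4 (plain integers only; the real code copies between block_q5_K rows and the interleaved block_q5_Kx8):

#include <cstdio>

int main() {
    const int blck_size_interleave = 4;
    const int samples[3] = { 0, 1, 8 };
    for (int k = 0; k < 3; ++k) {
        int i = samples[k];
        int src_id = i % 8;                               // which of the 8 source rows
        int src_offset = (i / 8) * blck_size_interleave;  // byte offset inside that row
        int dst_offset = i * blck_size_interleave;        // byte offset in the interleaved block
        std::printf("i=%d -> row %d, src byte %d, dst byte %d\n", i, src_id, src_offset, dst_offset);
    }
    // Prints: i=0 -> row 0, src byte 0, dst byte 0
    //         i=1 -> row 1, src byte 0, dst byte 4
    //         i=8 -> row 0, src byte 4, dst byte 32
    return 0;
}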
@@ -2233,7 +2249,7 @@ static int repack_q5_K_to_q5_K_8_bl(struct ggml_tensor * t,
                                     const void * GGML_RESTRICT data,
                                     size_t data_size) {
     GGML_ASSERT(t->type == GGML_TYPE_Q5_K);
-    GGML_ASSERT(interleave_block == 8);
+    GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
     constexpr int nrows_interleaved = 8;
 
     block_q5_Kx8 * dst = (block_q5_Kx8 *) t->data;
@@ -2507,6 +2523,10 @@ template <> int repack<block_q2_K, 8, 8>(struct ggml_tensor * t, const void * da
     return repack_q2_K_to_q2_K_8_bl(t, 8, data, data_size);
 }
 
+template <> int repack<block_q5_K, 4, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
+    return repack_q5_K_to_q5_K_8_bl(t, 4, data, data_size);
+}
+
 template <> int repack<block_q5_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
     return repack_q5_K_to_q5_K_8_bl(t, 8, data, data_size);
 }
@@ -2571,6 +2591,10 @@ template <> void gemv<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t
     ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
 }
 
+template <> void gemv<block_q5_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemv_q5_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
+}
+
 template <> void gemv<block_q5_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
     ggml_gemv_q5_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
 }
@@ -2630,6 +2654,10 @@ template <> void gemm<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t
     ggml_gemm_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
 }

+template <> void gemm<block_q5_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemm_q5_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
+}
+
 template <> void gemm<block_q5_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
     ggml_gemm_q5_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
 }
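The three groups of specializations above (repack, gemv, gemm) all follow the same forwarding pattern: a function template parameterized on the block type and tile shape, with each explicit specialization calling the matching flat C kernel, here the new 8-row, 4-byte-interleave Q5_K variants. A minimal self-contained sketch of that dispatch idiom, with illustrative names (tile_gemm, kernel_8x4, kernel_8x8) that are not from the source:

#include <cstdio>

struct block_q5_K_demo {};  // stand-in for the real quant block type

// Flat kernels standing in for ggml_gemm_q5_K_8x4_q8_K / ggml_gemm_q5_K_8x8_q8_K.
static void kernel_8x4(int n) { std::printf("8x4 kernel, n=%d\n", n); }
static void kernel_8x8(int n) { std::printf("8x8 kernel, n=%d\n", n); }

// Primary template is declared but never defined, so an unsupported tile shape
// fails to link instead of silently doing the wrong thing.
template <typename BLOCK, int INTERLEAVE, int ROWS> void tile_gemm(int n);

// Specializations simply forward to the matching kernel, mirroring the diff.
template <> void tile_gemm<block_q5_K_demo, 4, 8>(int n) { kernel_8x4(n); }
template <> void tile_gemm<block_q5_K_demo, 8, 8>(int n) { kernel_8x8(n); }

int main() {
    tile_gemm<block_q5_K_demo, 4, 8>(256);  // picks the 8x4 path
    tile_gemm<block_q5_K_demo, 8, 8>(256);  // picks the 8x8 path
}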
@@ -3040,6 +3068,7 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
     static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;

     // instance for Q5_K
+    static const ggml::cpu::repack::tensor_traits<block_q5_K, 4, 8, GGML_TYPE_Q8_K> q5_K_8x4_q8_K;
     static const ggml::cpu::repack::tensor_traits<block_q5_K, 8, 8, GGML_TYPE_Q8_K> q5_K_8x8_q8_K;

     // instance for Q6_K
@@ -3101,6 +3130,11 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
                 return &q5_K_8x8_q8_K;
             }
         }
+        if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
+            if (cur->ne[1] % 8 == 0) {
+                return &q5_K_8x4_q8_K;
+            }
+        }
     } else if (cur->type == GGML_TYPE_Q6_K) {
         if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
             if (cur->ne[1] % 8 == 0) {
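The selection logic above extends ggml_repack_get_optimal_repack_type: after the existing Q5_K 8x8 branch, a new branch returns the 8x4 traits when the CPU reports NEON with dotprod and the tensor's second dimension is a multiple of 8 (eight rows are interleaved per tile). A compact sketch of that priority order, assuming the existing 8x8 branch is gated on NEON plus int8 matmul as the neighboring Q6_K block is; the enum and function names are illustrative, and the real code returns tensor_traits pointers rather than an enum:

// Sketch only: which Q5_K repack layout a dispatcher of this shape would pick.
enum class Q5KRepack { None, Tile8x8, Tile8x4 };

static Q5KRepack pick_q5_K_repack(bool has_neon, bool has_i8mm, bool has_dotprod, long long ne1) {
    if (ne1 % 8 != 0) {
        return Q5KRepack::None;      // both layouts interleave 8 rows
    }
    if (has_neon && has_i8mm) {
        return Q5KRepack::Tile8x8;   // existing path, checked first (assumed gating)
    }
    if (has_neon && has_dotprod) {
        return Q5KRepack::Tile8x4;   // new path from this diff: dotprod-only CPUs
    }
    return Q5KRepack::None;          // fall back to the non-repacked kernels
}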
@@ -111,6 +111,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
 void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_q5_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q5_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
@@ -121,6 +122,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
 void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q5_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q5_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
@@ -141,6 +143,7 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
 void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_q5_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q5_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
@@ -151,6 +154,7 @@ void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
 void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q5_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q5_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);