diff --git a/common/arg.cpp b/common/arg.cpp
index e38caf428d..d1167d02a7 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -389,8 +389,6 @@ const std::vector<ggml_type> kv_cache_types = {
     GGML_TYPE_Q5_1,
     GGML_TYPE_TBQ3_0,
     GGML_TYPE_TBQ4_0,
-    GGML_TYPE_TBQP3_0,
-    GGML_TYPE_TBQP4_0,
 };
 
 static ggml_type kv_cache_type_from_str(const std::string & s) {
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 4780b546f2..ba3e8cc5ac 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -430,9 +430,7 @@ extern "C" {
         GGML_TYPE_NVFP4   = 40, // NVFP4 (4 blocks, E4M3 scale)
         GGML_TYPE_TBQ3_0  = 41, // TurboQuant 3-bit
         GGML_TYPE_TBQ4_0  = 42, // TurboQuant 4-bit
-        GGML_TYPE_TBQP3_0 = 43, // TurboQuant Q_prod 3-bit
-        GGML_TYPE_TBQP4_0 = 44, // TurboQuant Q_prod 4-bit
-        GGML_TYPE_COUNT   = 45,
+        GGML_TYPE_COUNT   = 43,
     };
 
     // precision
@@ -471,8 +469,6 @@ extern "C" {
         GGML_FTYPE_MOSTLY_NVFP4   = 26, // except 1d tensors
         GGML_FTYPE_MOSTLY_TBQ3_0  = 27, // except 1d tensors
         GGML_FTYPE_MOSTLY_TBQ4_0  = 28, // except 1d tensors
-        GGML_FTYPE_MOSTLY_TBQP3_0 = 29, // except 1d tensors
-        GGML_FTYPE_MOSTLY_TBQP4_0 = 30, // except 1d tensors
     };
 
     // available tensor operations:
diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h
index 5c3d5991e6..f03a1c3a62 100644
--- a/ggml/src/ggml-common.h
+++ b/ggml/src/ggml-common.h
@@ -282,24 +282,6 @@ typedef struct {
 } block_tbq4_0;
 static_assert(sizeof(block_tbq4_0) == sizeof(ggml_half) + QK_K / 2, "wrong tbq4_0 block size/padding");
 
-// 3.125 bpw
-typedef struct {
-    uint8_t qs[QK_K / 4];
-    uint8_t signs[QK_K / 8];
-    ggml_half d;
-    ggml_half gamma;
-} block_tbqp3_0;
-static_assert(sizeof(block_tbqp3_0) == 2*sizeof(ggml_half) + QK_K / 4 + QK_K / 8, "wrong tbqp3_0 block size/padding");
-
-// 4.125 bpw
-typedef struct {
-    uint8_t qs[QK_K * 3 / 8];
-    uint8_t signs[QK_K / 8];
-    ggml_half d;
-    ggml_half gamma;
-} block_tbqp4_0;
-static_assert(sizeof(block_tbqp4_0) == 2*sizeof(ggml_half) + QK_K * 3 / 8 + QK_K / 8, "wrong tbqp4_0 block size/padding");
-
 //
 // Super-block quantization structures
 //
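The 3.125 and 4.125 bpw figures in the removed comments follow directly from these layouts: with QK_K = 256, a block_tbqp3_0 packs 64 bytes of 2-bit codes, 32 bytes of sign bits, and two fp16 scalars into 100 bytes per 256 weights. A standalone sanity check (my own sketch, not project code; it assumes QK_K == 256 and a 2-byte ggml_half, matching the static_asserts above):

```cpp
#include <cstdio>

// Sketch only: recompute the bits-per-weight of the removed TBQP blocks from
// their field sizes, assuming QK_K == 256 and sizeof(ggml_half) == 2.
int main() {
    const int QK_K = 256;
    const int tbqp3_bytes = QK_K / 4     + QK_K / 8 + 2 + 2; // qs + signs + d + gamma = 100
    const int tbqp4_bytes = QK_K * 3 / 8 + QK_K / 8 + 2 + 2; // qs + signs + d + gamma = 132
    std::printf("tbqp3_0: %.3f bpw\n", 8.0 * tbqp3_bytes / QK_K); // 3.125
    std::printf("tbqp4_0: %.3f bpw\n", 8.0 * tbqp4_bytes / QK_K); // 4.125
    return 0;
}
```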
diff --git a/ggml/src/ggml-cpu/arch-fallback.h b/ggml/src/ggml-cpu/arch-fallback.h
index 263584c925..724ac84f1d 100644
--- a/ggml/src/ggml-cpu/arch-fallback.h
+++ b/ggml/src/ggml-cpu/arch-fallback.h
@@ -20,8 +20,6 @@
 #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
 #define ggml_vec_dot_tbq3_0_q8_K_generic ggml_vec_dot_tbq3_0_q8_K
 #define ggml_vec_dot_tbq4_0_q8_K_generic ggml_vec_dot_tbq4_0_q8_K
-#define ggml_vec_dot_tbqp3_0_q8_K_generic ggml_vec_dot_tbqp3_0_q8_K
-#define ggml_vec_dot_tbqp4_0_q8_K_generic ggml_vec_dot_tbqp4_0_q8_K
 #define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
 #define ggml_vec_dot_q3_K_q8_K_generic ggml_vec_dot_q3_K_q8_K
 #define ggml_vec_dot_q4_K_q8_K_generic ggml_vec_dot_q4_K_q8_K
@@ -88,8 +86,6 @@
 #define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
 #define ggml_vec_dot_tbq3_0_q8_K_generic ggml_vec_dot_tbq3_0_q8_K
 #define ggml_vec_dot_tbq4_0_q8_K_generic ggml_vec_dot_tbq4_0_q8_K
-#define ggml_vec_dot_tbqp3_0_q8_K_generic ggml_vec_dot_tbqp3_0_q8_K
-#define ggml_vec_dot_tbqp4_0_q8_K_generic ggml_vec_dot_tbqp4_0_q8_K
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
 #define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
@@ -124,8 +120,6 @@
 #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
 #define ggml_vec_dot_tbq3_0_q8_K_generic ggml_vec_dot_tbq3_0_q8_K
 #define ggml_vec_dot_tbq4_0_q8_K_generic ggml_vec_dot_tbq4_0_q8_K
-#define ggml_vec_dot_tbqp3_0_q8_K_generic ggml_vec_dot_tbqp3_0_q8_K
-#define ggml_vec_dot_tbqp4_0_q8_K_generic ggml_vec_dot_tbqp4_0_q8_K
 #define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
@@ -171,8 +165,6 @@
 #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
 #define ggml_vec_dot_tbq3_0_q8_K_generic ggml_vec_dot_tbq3_0_q8_K
 #define ggml_vec_dot_tbq4_0_q8_K_generic ggml_vec_dot_tbq4_0_q8_K
-#define ggml_vec_dot_tbqp3_0_q8_K_generic ggml_vec_dot_tbqp3_0_q8_K
-#define ggml_vec_dot_tbqp4_0_q8_K_generic ggml_vec_dot_tbqp4_0_q8_K
 #define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
 #define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
 #define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
@@ -260,8 +252,6 @@
 #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
 #define ggml_vec_dot_tbq3_0_q8_K_generic ggml_vec_dot_tbq3_0_q8_K
 #define ggml_vec_dot_tbq4_0_q8_K_generic ggml_vec_dot_tbq4_0_q8_K
-#define ggml_vec_dot_tbqp3_0_q8_K_generic ggml_vec_dot_tbqp3_0_q8_K
-#define ggml_vec_dot_tbqp4_0_q8_K_generic ggml_vec_dot_tbqp4_0_q8_K
 #define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
 #define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
 #define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
@@ -314,8 +304,6 @@
 #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
 #define ggml_vec_dot_tbq3_0_q8_K_generic ggml_vec_dot_tbq3_0_q8_K
 #define ggml_vec_dot_tbq4_0_q8_K_generic ggml_vec_dot_tbq4_0_q8_K
-#define ggml_vec_dot_tbqp3_0_q8_K_generic ggml_vec_dot_tbqp3_0_q8_K
-#define ggml_vec_dot_tbqp4_0_q8_K_generic ggml_vec_dot_tbqp4_0_q8_K
 #define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
 #define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
 #define ggml_vec_dot_iq2_s_q8_K_generic ggml_vec_dot_iq2_s_q8_K
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index bc5b413e44..f1289d463c 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -402,18 +402,6 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_K,
         .nrows = 1,
     },
-    [GGML_TYPE_TBQP3_0] = {
-        .from_float = quantize_row_tbqp3_0,
-        .vec_dot = ggml_vec_dot_tbqp3_0_q8_K,
-        .vec_dot_type = GGML_TYPE_Q8_K,
-        .nrows = 1,
-    },
-    [GGML_TYPE_TBQP4_0] = {
-        .from_float = quantize_row_tbqp4_0,
-        .vec_dot = ggml_vec_dot_tbqp4_0_q8_K,
-        .vec_dot_type = GGML_TYPE_Q8_K,
-        .nrows = 1,
-    },
     [GGML_TYPE_I32] = {
         .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_i32,
     },
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index bb376a2f88..6efdcb36b3 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -722,8 +722,6 @@ void ggml_compute_forward_add(
         case GGML_TYPE_TQ2_0:
         case GGML_TYPE_TBQ3_0:
         case GGML_TYPE_TBQ4_0:
-        case GGML_TYPE_TBQP3_0:
-        case GGML_TYPE_TBQP4_0:
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
@@ -1176,8 +1174,6 @@ void ggml_compute_forward_add1(
         case GGML_TYPE_TQ2_0:
         case GGML_TYPE_TBQ3_0:
        case GGML_TYPE_TBQ4_0:
-        case GGML_TYPE_TBQP3_0:
-        case GGML_TYPE_TBQP4_0:
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
@@ -1309,8 +1305,6 @@ void ggml_compute_forward_acc(
         case GGML_TYPE_TQ2_0:
         case GGML_TYPE_TBQ3_0:
         case GGML_TYPE_TBQ4_0:
-        case GGML_TYPE_TBQP3_0:
-        case GGML_TYPE_TBQP4_0:
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
@@ -4401,8 +4395,6 @@ void ggml_compute_forward_out_prod(
         case GGML_TYPE_TQ2_0:
         case GGML_TYPE_TBQ3_0:
         case GGML_TYPE_TBQ4_0:
-        case GGML_TYPE_TBQP3_0:
-        case GGML_TYPE_TBQP4_0:
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
@@ -4681,8 +4673,6 @@ void ggml_compute_forward_set(
         case GGML_TYPE_TQ2_0:
         case GGML_TYPE_TBQ3_0:
         case GGML_TYPE_TBQ4_0:
-        case GGML_TYPE_TBQP3_0:
-        case GGML_TYPE_TBQP4_0:
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
@@ -4908,8 +4898,6 @@ void ggml_compute_forward_get_rows(
         case GGML_TYPE_TQ2_0:
         case GGML_TYPE_TBQ3_0:
         case GGML_TYPE_TBQ4_0:
-        case GGML_TYPE_TBQP3_0:
-        case GGML_TYPE_TBQP4_0:
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
@@ -5637,8 +5625,6 @@ void ggml_compute_forward_clamp(
         case GGML_TYPE_TQ2_0:
         case GGML_TYPE_TBQ3_0:
         case GGML_TYPE_TBQ4_0:
-        case GGML_TYPE_TBQP3_0:
-        case GGML_TYPE_TBQP4_0:
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
diff --git a/ggml/src/ggml-cpu/quants.c b/ggml/src/ggml-cpu/quants.c
index cc5c6cce3a..f5b0687122 100644
--- a/ggml/src/ggml-cpu/quants.c
+++ b/ggml/src/ggml-cpu/quants.c
@@ -120,18 +120,6 @@ void quantize_row_tbq4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy,
     quantize_row_tbq4_0_ref(x, y, k);
 }
 
-void quantize_row_tbqp3_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
-    assert(k % QK_K == 0);
-    block_tbqp3_0 * GGML_RESTRICT y = vy;
-    quantize_row_tbqp3_0_ref(x, y, k);
-}
-
-void quantize_row_tbqp4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
-    assert(k % QK_K == 0);
-    block_tbqp4_0 * GGML_RESTRICT y = vy;
-    quantize_row_tbqp4_0_ref(x, y, k);
-}
-
 //===================================== Q8_K ==============================================
 
 void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
@@ -556,57 +544,6 @@ void ggml_vec_dot_tbq4_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
     *s = sumf;
 }
 
-void ggml_vec_dot_tbqp3_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    float * tmp = tbq_vd_get_scratch(n);
-    dequantize_row_tbqp3_0((const block_tbqp3_0 *)vx, tmp, n);
-
-    const block_q8_K * GGML_RESTRICT y = vy;
-    const int nb = n / QK_K;
-
-    float sumf = 0.0f;
-    int64_t idx = 0;
-    for (int i = 0; i < nb; i++) {
-        const float d = y[i].d;
-        for (int j = 0; j < QK_K; j++) {
-            sumf += tmp[idx] * (d * y[i].qs[j]);
-            idx++;
-        }
-    }
-
-    *s = sumf;
-}
-
-void ggml_vec_dot_tbqp4_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
-    assert(nrc == 1);
-    UNUSED(nrc);
-    UNUSED(bx);
-    UNUSED(by);
-    UNUSED(bs);
-
-    float * tmp = tbq_vd_get_scratch(n);
-    dequantize_row_tbqp4_0((const block_tbqp4_0 *)vx, tmp, n);
-
-    const block_q8_K * GGML_RESTRICT y = vy;
-    const int nb = n / QK_K;
-
-    float sumf = 0.0f;
-    int64_t idx = 0;
-    for (int i = 0; i < nb; i++) {
-        const float d = y[i].d;
-        for (int j = 0; j < QK_K; j++) {
-            sumf += tmp[idx] * (d * y[i].qs[j]);
-            idx++;
-        }
-    }
-
-    *s = sumf;
-}
 
 void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(nrc == 1);
diff --git a/ggml/src/ggml-cpu/quants.h b/ggml/src/ggml-cpu/quants.h
index 2c18a09127..c447fb4e4f 100644
--- a/ggml/src/ggml-cpu/quants.h
+++ b/ggml/src/ggml-cpu/quants.h
@@ -35,9 +35,6 @@ void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, i
 void quantize_row_tbq3_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_tbq4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 
-void quantize_row_tbqp3_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_tbqp4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-
 void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 
@@ -63,9 +60,6 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
 void ggml_vec_dot_tbq3_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_tbq4_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
-void ggml_vec_dot_tbqp3_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_tbqp4_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-
 void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_iq2_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
@@ -92,9 +86,6 @@ void ggml_vec_dot_nvfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
 void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
-void ggml_vec_dot_tbqp3_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_tbqp4_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-
 void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 6eb06ed2cb..ddc6aae2e9 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -5407,22 +5407,6 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
         {
             VALIDATE_ROW_DATA_D_F16_IMPL(block_tbq4_0, data, nb);
         } break;
-        case GGML_TYPE_TBQP3_0:
-            {
-                const block_tbqp3_0 * q = (const block_tbqp3_0 *) data;
-                for (size_t i = 0; i < nb; ++i) {
-                    if (!validate_fp16(q[i].d, i)) return false;
-                    if (!validate_fp16(q[i].gamma, i)) return false;
-                }
-            } break;
-        case GGML_TYPE_TBQP4_0:
-            {
-                const block_tbqp4_0 * q = (const block_tbqp4_0 *) data;
-                for (size_t i = 0; i < nb; ++i) {
-                    if (!validate_fp16(q[i].d, i)) return false;
-                    if (!validate_fp16(q[i].gamma, i)) return false;
-                }
-            } break;
 
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:
diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h
index 59591d64d7..6719168c52 100644
--- a/ggml/src/ggml-quants.h
+++ b/ggml/src/ggml-quants.h
@@ -37,9 +37,6 @@ GGML_API void quantize_row_tq2_0_ref(const float * GGML_RESTRICT x, block_tq2_0
 GGML_API void quantize_row_tbq3_0_ref(const float * GGML_RESTRICT x, block_tbq3_0 * GGML_RESTRICT y, int64_t k);
 GGML_API void quantize_row_tbq4_0_ref(const float * GGML_RESTRICT x, block_tbq4_0 * GGML_RESTRICT y, int64_t k);
 
-GGML_API void quantize_row_tbqp3_0_ref(const float * GGML_RESTRICT x, block_tbqp3_0 * GGML_RESTRICT y, int64_t k);
-GGML_API void quantize_row_tbqp4_0_ref(const float * GGML_RESTRICT x, block_tbqp4_0 * GGML_RESTRICT y, int64_t k);
-
 GGML_API void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
 GGML_API void quantize_row_iq4_nl_ref (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
 GGML_API void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
@@ -70,9 +67,6 @@ GGML_API void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float *
 GGML_API void dequantize_row_tbq3_0(const block_tbq3_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 GGML_API void dequantize_row_tbq4_0(const block_tbq4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 
-GGML_API void dequantize_row_tbqp3_0(const block_tbqp3_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-GGML_API void dequantize_row_tbqp4_0(const block_tbqp4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-
 GGML_API void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 GGML_API void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 GGML_API void dequantize_row_iq2_s  (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
@@ -100,9 +94,6 @@ GGML_API size_t quantize_tq2_0(const float * GGML_RESTRICT src, void * GGML_REST
 GGML_API size_t quantize_tbq3_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 GGML_API size_t quantize_tbq4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 
-GGML_API size_t quantize_tbqp3_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-GGML_API size_t quantize_tbqp4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-
 GGML_API size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 GGML_API size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 GGML_API size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
diff --git a/ggml/src/ggml-turboq.c b/ggml/src/ggml-turboq.c
index becc7b2a44..58d260a214 100644
--- a/ggml/src/ggml-turboq.c
+++ b/ggml/src/ggml-turboq.c
@@ -680,233 +680,3 @@ size_t quantize_tbq4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst
     }
     return nrows * row_size;
 }
-
-// ---------------------------------------------------------------------------
-// TBQP3_0: TurboQuant Q_prod 3-bit (2-bit MSE + 1-bit QJL)
-//
-// Paper Algorithm 2 (TurboQuant_prod):
-//   1. Quantize unit vector with (b-1)=2-bit MSE codebook
-//   2. Dequantize MSE, inverse-rotate to get x̃_mse
-//   3. Compute residual r = unit_vec - x̃_mse
-//   4. Apply QJL: signs = sign(S · r) where S is d×d raw Gaussian
-//   5. Store residual norm γ = ||r||₂
-//
-// Dequantization:
-//   x̃ = norm · (x̃_mse + √(π/2)/d · γ · S^T · signs)
-// ---------------------------------------------------------------------------
-
-#ifndef M_PI
-#define M_PI 3.14159265358979323846
-#endif
-
-void quantize_row_tbqp3_0_ref(const float * GGML_RESTRICT x, block_tbqp3_0 * GGML_RESTRICT y, int64_t k) {
-    assert(k % QK_K == 0);
-    const int64_t nb = k / QK_K;
-    float * unit = turboq_get_scratch(QK_K);
-    float * mse_rot = turboq_get_scratch2(QK_K);
-    float * tmp = turboq_get_scratch3(QK_K);
-    const uint64_t seed = turboq_seed_from_row(0);
-    const float scale_up = turboq_block_scale_up();
-    const float scale_down = turboq_block_scale_down();
-    uint8_t indices[QK_K];
-
-    for (int64_t b = 0; b < nb; b++) {
-        const float * xb = x + b * QK_K;
-
-        float norm_sq = 0.0f;
-        for (int64_t i = 0; i < QK_K; ++i) {
-            norm_sq += xb[i] * xb[i];
-        }
-
-        float norm = sqrtf(norm_sq);
-        if (norm < 1e-10f) {
-            norm = 1e-10f;
-        }
-
-        for (int64_t i = 0; i < QK_K; ++i) {
-            unit[i] = xb[i] / norm;
-        }
-
-        turboq_rotate_qk_forward(mse_rot, unit, seed);
-
-        for (int64_t i = 0; i < QK_K; ++i) {
-            indices[i] = quantize_scalar_2bit(mse_rot[i] * scale_up);
-            mse_rot[i] = turboq_codebook_2bit[indices[i]] * scale_down;
-        }
-
-        turboq_rotate_qk_inverse(tmp, mse_rot, seed);
-
-        float gamma_sq = 0.0f;
-        for (int64_t i = 0; i < QK_K; ++i) {
-            unit[i] -= tmp[i];
-            gamma_sq += unit[i] * unit[i];
-        }
-
-        const float gamma = sqrtf(gamma_sq);
-
-        turboq_project_qk(tmp, unit, seed);
-
-        memset(y[b].qs, 0, sizeof(y[b].qs));
-        memset(y[b].signs, 0, sizeof(y[b].signs));
-        for (int64_t j = 0; j < QK_K; j++) {
-            y[b].qs[j / 4] |= (indices[j] << ((j % 4) * 2));
-            if (tmp[j] >= 0.0f) {
-                y[b].signs[j / 8] |= (1 << (j % 8));
-            }
-        }
-        y[b].d = GGML_FP32_TO_FP16(norm);
-        y[b].gamma = GGML_FP32_TO_FP16(gamma);
-    }
-}
-
-void dequantize_row_tbqp3_0(const block_tbqp3_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
-    assert(k % QK_K == 0);
-    const int64_t nb = k / QK_K;
-    const uint64_t seed = turboq_seed_from_row(0);
-    const float scale_dn = turboq_block_scale_down();
-    const float qjl_scale = sqrtf((float) M_PI / 2.0f) / (float) QK_K;
-    float * mse_rot = turboq_get_scratch(QK_K);
-    float * signs_f = turboq_get_scratch2(QK_K);
-    float * mse_unit = turboq_get_scratch3(QK_K);
-
-    for (int64_t b = 0; b < nb; ++b) {
-        const float norm = GGML_FP16_TO_FP32(x[b].d);
-        const float gamma = GGML_FP16_TO_FP32(x[b].gamma);
-
-        for (int64_t j = 0; j < QK_K; ++j) {
-            const uint8_t idx = (x[b].qs[j / 4] >> ((j % 4) * 2)) & 0x3;
-            mse_rot[j] = turboq_codebook_2bit[idx] * scale_dn;
-            signs_f[j] = ((x[b].signs[j / 8] >> (j % 8)) & 1) ? 1.0f : -1.0f;
-        }
-
-        turboq_rotate_qk_inverse(mse_unit, mse_rot, seed);
-        turboq_project_qk_inverse(mse_rot, signs_f, seed);
-
-        const float qjl_f = qjl_scale * gamma;
-        for (int64_t j = 0; j < QK_K; ++j) {
-            y[b * QK_K + j] = norm * (mse_unit[j] + qjl_f * mse_rot[j]);
-        }
-    }
-}
-
-size_t quantize_tbqp3_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
-    (void)imatrix;
-    assert(n_per_row % QK_K == 0);
-    const int64_t nb_per_row = n_per_row / QK_K;
-    const size_t row_size = nb_per_row * sizeof(block_tbqp3_0);
-
-    for (int64_t row = 0; row < nrows; row++) {
-        const float * row_src = src + row * n_per_row;
-        block_tbqp3_0 * row_dst = (block_tbqp3_0 *)((char *)dst + row * row_size);
-        quantize_row_tbqp3_0_ref(row_src, row_dst, n_per_row);
-    }
-    return nrows * row_size;
-}
-
-// ---------------------------------------------------------------------------
-// TBQP4_0: TurboQuant Q_prod 4-bit (3-bit MSE + 1-bit QJL)
-// ---------------------------------------------------------------------------
-
-void quantize_row_tbqp4_0_ref(const float * GGML_RESTRICT x, block_tbqp4_0 * GGML_RESTRICT y, int64_t k) {
-    assert(k % QK_K == 0);
-    const int64_t nb = k / QK_K;
-    float * unit = turboq_get_scratch(QK_K);
-    float * mse_rot = turboq_get_scratch2(QK_K);
-    float * tmp = turboq_get_scratch3(QK_K);
-    const uint64_t seed = turboq_seed_from_row(0);
-    const float scale_up = turboq_block_scale_up();
-    const float scale_down = turboq_block_scale_down();
-    uint8_t indices[QK_K];
-
-    for (int64_t b = 0; b < nb; ++b) {
-        const float * xb = x + b * QK_K;
-
-        float norm_sq = 0.0f;
-        for (int64_t i = 0; i < QK_K; ++i) {
-            norm_sq += xb[i] * xb[i];
-        }
-
-        float norm = sqrtf(norm_sq);
-        if (norm < 1e-10f) {
-            norm = 1e-10f;
-        }
-
-        for (int64_t i = 0; i < QK_K; ++i) {
-            unit[i] = xb[i] / norm;
-        }
-
-        turboq_rotate_qk_forward(mse_rot, unit, seed);
-
-        for (int64_t i = 0; i < QK_K; ++i) {
-            indices[i] = quantize_scalar_3bit(mse_rot[i] * scale_up);
-            mse_rot[i] = turboq_codebook_3bit[indices[i]] * scale_down;
-        }
-
-        turboq_rotate_qk_inverse(tmp, mse_rot, seed);
-
-        float gamma_sq = 0.0f;
-        for (int64_t i = 0; i < QK_K; ++i) {
-            unit[i] -= tmp[i];
-            gamma_sq += unit[i] * unit[i];
-        }
-
-        const float gamma = sqrtf(gamma_sq);
-
-        turboq_project_qk(tmp, unit, seed);
-
-        memset(y[b].signs, 0, sizeof(y[b].signs));
-        for (int64_t j = 0; j < QK_K; j++) {
-            if (tmp[j] >= 0.0f) {
-                y[b].signs[j / 8] |= (1 << (j % 8));
-            }
-        }
-        pack_3bit(y[b].qs, indices, QK_K);
-        y[b].d = GGML_FP32_TO_FP16(norm);
-        y[b].gamma = GGML_FP32_TO_FP16(gamma);
-    }
-}
-
-void dequantize_row_tbqp4_0(const block_tbqp4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
-    assert(k % QK_K == 0);
-    const int64_t nb = k / QK_K;
-    const uint64_t seed = turboq_seed_from_row(0);
-    const float scale_dn = turboq_block_scale_down();
-    const float qjl_scale = sqrtf((float) M_PI / 2.0f) / (float) QK_K;
-    float * mse_rot = turboq_get_scratch(QK_K);
-    float * signs_f = turboq_get_scratch2(QK_K);
-    float * mse_unit = turboq_get_scratch3(QK_K);
-
-    uint8_t indices[QK_K];
-    for (int64_t b = 0; b < nb; b++) {
-        const float norm = GGML_FP16_TO_FP32(x[b].d);
-        const float gamma = GGML_FP16_TO_FP32(x[b].gamma);
-
-        unpack_3bit(indices, x[b].qs, QK_K);
-        for (int64_t j = 0; j < QK_K; j++) {
-            mse_rot[j] = turboq_codebook_3bit[indices[j]] * scale_dn;
-            signs_f[j] = ((x[b].signs[j / 8] >> (j % 8)) & 1) ? 1.0f : -1.0f;
-        }
-
-        turboq_rotate_qk_inverse(mse_unit, mse_rot, seed);
-        turboq_project_qk_inverse(mse_rot, signs_f, seed);
-
-        const float qjl_f = qjl_scale * gamma;
-        for (int64_t j = 0; j < QK_K; ++j) {
-            y[b * QK_K + j] = norm * (mse_unit[j] + qjl_f * mse_rot[j]);
-        }
-    }
-}
-
-size_t quantize_tbqp4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
-    (void)imatrix;
-    assert(n_per_row % QK_K == 0);
-    const int64_t nb_per_row = n_per_row / QK_K;
-    const size_t row_size = nb_per_row * sizeof(block_tbqp4_0);
-
-    for (int64_t row = 0; row < nrows; row++) {
-        const float * row_src = src + row * n_per_row;
-        block_tbqp4_0 * row_dst = (block_tbqp4_0 *)((char *)dst + row * row_size);
-        quantize_row_tbqp4_0_ref(row_src, row_dst, n_per_row);
-    }
-    return nrows * row_size;
-}
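The decode constant in the removed comment block is the interesting part: for a standard Gaussian row g of S and any unit vector u, E[sign(⟨g, u⟩) · g] = √(2/π) · u, so √(π/2)/d · γ · Sᵀ · signs is an approximately unbiased estimate of the residual r. The toy below (my own sketch, not the removed code) reproduces the scheme end to end: the random rotation is skipped, a uniform 2-bit grid stands in for turboq_codebook_2bit, and S is an explicit d×d Gaussian matrix. The payoff shows up in inner products, which is what Q_prod targets, rather than in per-element MSE:

```cpp
#include <cmath>
#include <cstdio>
#include <random>
#include <vector>

// Toy round-trip of the Q_prod idea. Assumptions: no rotation, a plain
// uniform grid instead of the TurboQuant codebook, explicit Gaussian S.
int main() {
    const int d = 256;                       // plays the role of QK_K
    const float pi = 3.14159265358979323846f;
    std::mt19937 rng(42);
    std::normal_distribution<float> gauss;

    std::vector<float> x(d), unit(d), deq(d), r(d);
    for (auto & v : x) v = gauss(rng);

    float norm = 0.0f;
    for (auto v : x) norm += v * v;
    norm = std::sqrt(norm);
    for (int i = 0; i < d; ++i) unit[i] = x[i] / norm;

    // steps 1-3: coarse 2-bit MSE quantization of the unit vector + residual
    float gamma = 0.0f;
    for (int i = 0; i < d; ++i) {
        float q = std::round(unit[i] / 0.08f);
        q = q < -2.0f ? -2.0f : (q > 1.0f ? 1.0f : q); // 4 levels = 2 bits
        deq[i] = q * 0.08f;
        r[i]   = unit[i] - deq[i];
        gamma += r[i] * r[i];
    }
    gamma = std::sqrt(gamma);

    // step 4: QJL -- keep only one sign bit per row of S * r
    std::vector<float> S(d * d), sgn(d);
    for (auto & v : S) v = gauss(rng);
    for (int i = 0; i < d; ++i) {
        float dot = 0.0f;
        for (int j = 0; j < d; ++j) dot += S[i * d + j] * r[j];
        sgn[i] = dot >= 0.0f ? 1.0f : -1.0f;
    }

    // decode x~ = norm * (deq + sqrt(pi/2)/d * gamma * S^T * sgn) and compare
    // estimates of <x, x>: the QJL term corrects the MSE-only bias
    const float c = std::sqrt(pi / 2.0f) / d * gamma;
    float dot_true = 0.0f, dot_mse = 0.0f, dot_qjl = 0.0f;
    for (int j = 0; j < d; ++j) {
        float rec = 0.0f;
        for (int i = 0; i < d; ++i) rec += S[i * d + j] * sgn[i];
        const float xj = norm * (deq[j] + c * rec);
        dot_true += x[j] * x[j];
        dot_mse  += norm * deq[j] * x[j]; // decode without the residual term
        dot_qjl  += xj * x[j];
    }
    std::printf("<x,x>: true %.1f  mse-only %.1f  mse+qjl %.1f\n",
                dot_true, dot_mse, dot_qjl);
    return 0;
}
```

With these settings the MSE-only estimate of ⟨x, x⟩ comes out biased low by roughly γ²·‖x‖², while the QJL-corrected estimate lands within a few percent of the true value; that is the same property the removed test_tbqp_residual_usage checks from the other direction, by zeroing signs and gamma and asserting the decode changes.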
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index e743983ba9..6d895068c5 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -920,22 +920,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float = (ggml_to_float_t) dequantize_row_tbq4_0,
         .from_float_ref = (ggml_from_float_t) quantize_row_tbq4_0_ref,
     },
-    [GGML_TYPE_TBQP3_0] = {
-        .type_name = "tbqp3_0",
-        .blck_size = QK_K,
-        .type_size = sizeof(block_tbqp3_0),
-        .is_quantized = true,
-        .to_float = (ggml_to_float_t) dequantize_row_tbqp3_0,
-        .from_float_ref = (ggml_from_float_t) quantize_row_tbqp3_0_ref,
-    },
-    [GGML_TYPE_TBQP4_0] = {
-        .type_name = "tbqp4_0",
-        .blck_size = QK_K,
-        .type_size = sizeof(block_tbqp4_0),
-        .is_quantized = true,
-        .to_float = (ggml_to_float_t) dequantize_row_tbqp4_0,
-        .from_float_ref = (ggml_from_float_t) quantize_row_tbqp4_0_ref,
-    },
 };
 
 const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) {
@@ -1421,10 +1405,8 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
         case GGML_FTYPE_MOSTLY_Q8_0:    wtype = GGML_TYPE_Q8_0;    break;
         case GGML_FTYPE_MOSTLY_MXFP4:   wtype = GGML_TYPE_MXFP4;   break;
         case GGML_FTYPE_MOSTLY_NVFP4:   wtype = GGML_TYPE_NVFP4;   break;
-        case GGML_FTYPE_MOSTLY_TBQ3_0:  wtype = GGML_TYPE_TBQ3_0;  break;
-        case GGML_FTYPE_MOSTLY_TBQ4_0:  wtype = GGML_TYPE_TBQ4_0;  break;
-        case GGML_FTYPE_MOSTLY_TBQP3_0: wtype = GGML_TYPE_TBQP3_0; break;
-        case GGML_FTYPE_MOSTLY_TBQP4_0: wtype = GGML_TYPE_TBQP4_0; break;
+        case GGML_FTYPE_MOSTLY_TBQ3_0: wtype = GGML_TYPE_TBQ3_0; break;
+        case GGML_FTYPE_MOSTLY_TBQ4_0: wtype = GGML_TYPE_TBQ4_0; break;
         case GGML_FTYPE_MOSTLY_Q2_K:    wtype = GGML_TYPE_Q2_K;    break;
         case GGML_FTYPE_MOSTLY_Q3_K:    wtype = GGML_TYPE_Q3_K;    break;
         case GGML_FTYPE_MOSTLY_Q4_K:    wtype = GGML_TYPE_Q4_K;    break;
@@ -7704,8 +7686,6 @@ size_t ggml_quantize_chunk(
        case GGML_TYPE_TQ2_0:   result = quantize_tq2_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_TBQ3_0:  result = quantize_tbq3_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_TBQ4_0:  result = quantize_tbq4_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-       case GGML_TYPE_TBQP3_0: result = quantize_tbqp3_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-       case GGML_TYPE_TBQP4_0: result = quantize_tbqp4_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_IQ2_XS:  result = quantize_iq2_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_IQ3_XXS: result = quantize_iq3_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
diff --git a/include/llama.h b/include/llama.h
index 661c2fbf89..1aec0cfecb 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -156,8 +156,6 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_NVFP4    = 39, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_TBQ3_0   = 40, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_TBQ4_0   = 41, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_TBQP3_0  = 42, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_TBQP4_0  = 43, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 1d54294ef1..4fd99f247f 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -1790,10 +1790,8 @@ ggml_tensor * llm_graph_context::build_attn_mha(
                   float   kq_scale,
                   int     il) const {
     const bool v_trans = v->nb[1] > v->nb[2];
-    const bool k_is_tbq = k->type == GGML_TYPE_TBQ3_0 || k->type == GGML_TYPE_TBQ4_0 ||
-                          k->type == GGML_TYPE_TBQP3_0 || k->type == GGML_TYPE_TBQP4_0;
-    const bool v_is_tbq = v->type == GGML_TYPE_TBQ3_0 || v->type == GGML_TYPE_TBQ4_0 ||
-                          v->type == GGML_TYPE_TBQP3_0 || v->type == GGML_TYPE_TBQP4_0;
+    const bool k_is_tbq = k->type == GGML_TYPE_TBQ3_0 || k->type == GGML_TYPE_TBQ4_0;
+    const bool v_is_tbq = v->type == GGML_TYPE_TBQ3_0 || v->type == GGML_TYPE_TBQ4_0;
 
     const bool use_flash_attn = cparams.flash_attn && kq_b == nullptr;
     const enum ggml_type tbq_attn_type = use_flash_attn ? GGML_TYPE_F16 : GGML_TYPE_F32;
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index ce8207b1b0..fe6517d505 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -1032,8 +1032,7 @@ ggml_tensor * llama_kv_cache::get_k(ggml_context * ctx, int32_t il, uint32_t n_k
 
     const uint32_t ns = sinfo.s1 - sinfo.s0 + 1;
 
-    if (k->type == GGML_TYPE_TBQ3_0 || k->type == GGML_TYPE_TBQ4_0 ||
-        k->type == GGML_TYPE_TBQP3_0 || k->type == GGML_TYPE_TBQP4_0) {
+    if (k->type == GGML_TYPE_TBQ3_0 || k->type == GGML_TYPE_TBQ4_0) {
         return ggml_view_3d(ctx, k,
                 n_embd_k_gqa, n_kv, ns,
                 ggml_row_size(k->type, n_embd_k_gqa),
@@ -1062,8 +1061,7 @@ ggml_tensor * llama_kv_cache::get_v(ggml_context * ctx, int32_t il, uint32_t n_k
 
     const uint32_t ns = sinfo.s1 - sinfo.s0 + 1;
 
-    if (v->type == GGML_TYPE_TBQ3_0 || v->type == GGML_TYPE_TBQ4_0 ||
-        v->type == GGML_TYPE_TBQP3_0 || v->type == GGML_TYPE_TBQP4_0) {
+    if (v->type == GGML_TYPE_TBQ3_0 || v->type == GGML_TYPE_TBQ4_0) {
         return ggml_view_3d(ctx, v,
                 n_embd_v_gqa, n_kv, ns,
                 ggml_row_size(v->type, n_embd_v_gqa),
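The reason get_k/get_v can view the TBQ cache directly is that a quantized row is just a sequence of fixed-size blocks, so for quantized types ggml_row_size(type, n) is n / block_size * type_size. A back-of-the-envelope footprint per cached K row (my sketch; n_embd_k_gqa = 1024 is an arbitrary example, and the 130-byte tbq4_0 block follows from the static_assert in ggml-common.h):

```cpp
#include <cstdio>

// Sketch only: per-row KV-cache footprint implied by the block layout,
// mirroring ggml_row_size(type, n) = n / block_size * type_size.
int main() {
    const int    n_embd_k_gqa = 1024;                 // example head dimension
    const size_t row_f16      = n_embd_k_gqa * 2;     // 2048 bytes at 16 bpw
    const size_t row_tbq4_0   = n_embd_k_gqa / 256 * 130; // 520 bytes = 4.06 bpw
    std::printf("f16: %zu B, tbq4_0: %zu B (%.2fx smaller)\n",
                row_f16, row_tbq4_0, (double) row_f16 / row_tbq4_0);
    return 0;
}
```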
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index a4f994e64c..f0fba770ec 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -386,9 +386,7 @@ static ggml_type tensor_type_fallback(quantize_state_impl & qs, const ggml_tenso
         case GGML_TYPE_TQ1_0:
         case GGML_TYPE_TQ2_0:
         case GGML_TYPE_TBQ3_0:
-        case GGML_TYPE_TBQ4_0:
-        case GGML_TYPE_TBQP3_0:
-        case GGML_TYPE_TBQP4_0: return_type = GGML_TYPE_Q4_0; break;
+        case GGML_TYPE_TBQ4_0: return_type = GGML_TYPE_Q4_0; break;
         case GGML_TYPE_Q4_K:   return_type = GGML_TYPE_Q5_0; break;
         case GGML_TYPE_Q5_K:   return_type = GGML_TYPE_Q5_1; break;
         case GGML_TYPE_Q6_K:   return_type = GGML_TYPE_Q8_0; break;
@@ -491,9 +489,6 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type
             else if (ftype == LLAMA_FTYPE_MOSTLY_TBQ3_0 || ftype == LLAMA_FTYPE_MOSTLY_TBQ4_0) {
                 new_type = GGML_TYPE_Q4_K;
             }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_TBQP3_0 || ftype == LLAMA_FTYPE_MOSTLY_TBQP4_0) {
-                new_type = GGML_TYPE_Q4_K;
-            }
         }
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
@@ -829,8 +824,6 @@ static ggml_type llama_ftype_get_default_type(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_TQ2_0:   return GGML_TYPE_TQ2_0;
         case LLAMA_FTYPE_MOSTLY_TBQ3_0:  return GGML_TYPE_TBQ3_0;
         case LLAMA_FTYPE_MOSTLY_TBQ4_0:  return GGML_TYPE_TBQ4_0;
-        case LLAMA_FTYPE_MOSTLY_TBQP3_0: return GGML_TYPE_TBQP3_0;
-        case LLAMA_FTYPE_MOSTLY_TBQP4_0: return GGML_TYPE_TBQP4_0;
         case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return GGML_TYPE_IQ2_XXS;
         case LLAMA_FTYPE_MOSTLY_IQ2_XS:  return GGML_TYPE_IQ2_XS;
         case LLAMA_FTYPE_MOSTLY_IQ2_S:   return GGML_TYPE_IQ2_XS;
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index ce3a62448e..f8ce1cf314 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -7321,8 +7321,6 @@ static const ggml_type other_types[] = {
 static const ggml_type turboq_types[] = {
     GGML_TYPE_TBQ3_0,
     GGML_TYPE_TBQ4_0,
-    GGML_TYPE_TBQP3_0,
-    GGML_TYPE_TBQP4_0,
 };
 
 #ifdef _MSC_VER
diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp
index c94a6de4ad..cc50457bc1 100644
--- a/tests/test-quantize-fns.cpp
+++ b/tests/test-quantize-fns.cpp
@@ -25,8 +25,6 @@ constexpr float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f;
 constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f;
 constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS = 0.0050f;
 constexpr float MAX_QUANTIZATION_TOTAL_ERROR_TBQ4 = 0.0025f;
-constexpr float MAX_QUANTIZATION_TOTAL_ERROR_TBQP4 = 0.0060f;
-constexpr float MAX_QUANTIZATION_TOTAL_ERROR_TBQP3 = 0.0100f;
 constexpr float MAX_QUANTIZATION_TOTAL_ERROR_FP4 = 0.0030f;
 constexpr float MAX_DOT_PRODUCT_ERROR = 0.02f;
 constexpr float MAX_DOT_PRODUCT_ERROR_LOWBIT = 0.04f;
@@ -109,7 +107,7 @@ static float dot_product_error(const ggml_type_traits * qfns, const ggml_type_tr
 }
 
 static bool test_turboq_vec_dot_dispatch() {
-    for (ggml_type type : { GGML_TYPE_TBQ3_0, GGML_TYPE_TBQ4_0, GGML_TYPE_TBQP3_0, GGML_TYPE_TBQP4_0 }) {
+    for (ggml_type type : { GGML_TYPE_TBQ3_0, GGML_TYPE_TBQ4_0 }) {
         const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
         if (qfns_cpu->vec_dot == nullptr || qfns_cpu->vec_dot_type != GGML_TYPE_Q8_K) {
             return false;
@@ -143,43 +141,6 @@ static bool test_tbq3_norm_scaling() {
     return fabsf(ggml_fp16_to_fp32(block.d) - 16.0f) < 1e-3f;
 }
 
-template <typename block_t>
-static bool test_tbqp_residual_usage_impl(
-        void (*quantize_row_ref)(const float * GGML_RESTRICT, block_t * GGML_RESTRICT, int64_t),
-        void (*dequantize_row)(const block_t * GGML_RESTRICT, float * GGML_RESTRICT, int64_t)) {
-    std::vector<float> x(QK_K);
-    std::vector<float> y0(QK_K);
-    std::vector<float> y1(QK_K);
-
-    for (int i = 0; i < QK_K; ++i) {
-        x[i] = 0.1f + 2.0f*cosf((float) i);
-    }
-
-    block_t block = {};
-    quantize_row_ref(x.data(), &block, QK_K);
-    dequantize_row(&block, y0.data(), QK_K);
-
-    block_t modified = block;
-    memset(modified.signs, 0, sizeof(modified.signs));
-    modified.gamma = ggml_fp32_to_fp16(0.0f);
-    dequantize_row(&modified, y1.data(), QK_K);
-
-    float diff = 0.0f;
-    for (int i = 0; i < QK_K; ++i) {
-        diff += fabsf(y0[i] - y1[i]);
-    }
-
-    return diff > 1e-3f;
-}
-
-static bool test_tbqp3_residual_usage() {
-    return test_tbqp_residual_usage_impl(quantize_row_tbqp3_0_ref, dequantize_row_tbqp3_0);
-}
-
-static bool test_tbqp4_residual_usage() {
-    return test_tbqp_residual_usage_impl(quantize_row_tbqp4_0_ref, dequantize_row_tbqp4_0);
-}
-
 int main(int argc, char * argv[]) {
     bool verbose = false;
     const size_t test_size = 32 * 128;
@@ -225,18 +186,6 @@ int main(int argc, char * argv[]) {
         printf("%5s norm scaling: %s\n", "tbq3", RESULT_STR[failed]);
     }
 
-    failed = !test_tbqp3_residual_usage();
-    num_failed += failed;
-    if (failed || verbose) {
-        printf("%5s residual usage: %s\n", "tbqp3", RESULT_STR[failed]);
-    }
-
-    failed = !test_tbqp4_residual_usage();
-    num_failed += failed;
-    if (failed || verbose) {
-        printf("%5s residual usage: %s\n", "tbqp4", RESULT_STR[failed]);
-    }
-
     for (int i = 0; i < GGML_TYPE_COUNT; i++) {
         ggml_type type = (ggml_type) i;
         const auto * qfns = ggml_get_type_traits(type);
@@ -264,8 +213,6 @@ int main(int argc, char * argv[]) {
                 type == GGML_TYPE_IQ3_XXS ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS :
                 type == GGML_TYPE_TBQ3_0 ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS :
                 type == GGML_TYPE_TBQ4_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TBQ4 :
-                type == GGML_TYPE_TBQP3_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TBQP3 :
-                type == GGML_TYPE_TBQP4_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TBQP4 :
                 type == GGML_TYPE_NVFP4 ? MAX_QUANTIZATION_TOTAL_ERROR_FP4 : MAX_QUANTIZATION_TOTAL_ERROR;
             failed = !(total_error < max_quantization_error);
             num_failed += failed;
diff --git a/tools/cli/README.md b/tools/cli/README.md
index fcaf6e3921..e336a909fc 100644
--- a/tools/cli/README.md
+++ b/tools/cli/README.md
@@ -52,8 +52,8 @@
 | `-kvo, --kv-offload, -nkvo, --no-kv-offload` | whether to enable KV cache offloading (default: enabled)<br/>(env: LLAMA_ARG_KV_OFFLOAD) |
 | `--repack, -nr, --no-repack` | whether to enable weight repacking (default: enabled)<br/>(env: LLAMA_ARG_REPACK) |
 | `--no-host` | bypass host buffer allowing extra buffers to be used<br/>(env: LLAMA_ARG_NO_HOST) |
-| `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1, tbq3_0, tbq4_0, tbqp3_0, tbqp4_0<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
-| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1, tbq3_0, tbq4_0, tbqp3_0, tbqp4_0<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
+| `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1, tbq3_0, tbq4_0<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
+| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1, tbq3_0, tbq4_0<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
 | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
 | `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
 | `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
@@ -97,8 +97,8 @@
 | `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:<br/>- 0: generic output<br/>- 1: error<br/>- 2: warning<br/>- 3: info<br/>- 4: debug<br/>(default: 3)<br/><br/>(env: LLAMA_LOG_VERBOSITY) |
 | `--log-prefix` | Enable prefix in log messages<br/>(env: LLAMA_LOG_PREFIX) |
 | `--log-timestamps` | Enable timestamps in log messages<br/>(env: LLAMA_LOG_TIMESTAMPS) |
-| `-ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1, tbq3_0, tbq4_0, tbqp3_0, tbqp4_0<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K_DRAFT) |
-| `-ctvd, --cache-type-v-draft TYPE` | KV cache data type for V for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1, tbq3_0, tbq4_0, tbqp3_0, tbqp4_0<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V_DRAFT) |
+| `-ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1, tbq3_0, tbq4_0<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K_DRAFT) |
+| `-ctvd, --cache-type-v-draft TYPE` | KV cache data type for V for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1, tbq3_0, tbq4_0<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V_DRAFT) |
 
 ### Sampling params
 
diff --git a/tools/completion/README.md b/tools/completion/README.md
index 621f569170..9539fb4878 100644
--- a/tools/completion/README.md
+++ b/tools/completion/README.md
@@ -135,8 +135,8 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
 | `-kvo, --kv-offload, -nkvo, --no-kv-offload` | whether to enable KV cache offloading (default: enabled)<br/>(env: LLAMA_ARG_KV_OFFLOAD) |
 | `--repack, -nr, --no-repack` | whether to enable weight repacking (default: enabled)<br/>(env: LLAMA_ARG_REPACK) |
 | `--no-host` | bypass host buffer allowing extra buffers to be used<br/>(env: LLAMA_ARG_NO_HOST) |
-| `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1, tbq3_0, tbq4_0, tbqp3_0, tbqp4_0<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
-| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1, tbq3_0, tbq4_0, tbqp3_0, tbqp4_0<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
+| `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1, tbq3_0, tbq4_0<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
+| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1, tbq3_0, tbq4_0<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
 | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
 | `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
 | `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
@@ -180,8 +180,8 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
 | `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:<br/>- 0: generic output<br/>- 1: error<br/>- 2: warning<br/>- 3: info<br/>- 4: debug<br/>(default: 3)<br/><br/>(env: LLAMA_LOG_VERBOSITY) |
 | `--log-prefix` | Enable prefix in log messages<br/>(env: LLAMA_LOG_PREFIX) |
 | `--log-timestamps` | Enable timestamps in log messages<br/>(env: LLAMA_LOG_TIMESTAMPS) |
-| `-ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1, tbq3_0, tbq4_0, tbqp3_0, tbqp4_0<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K_DRAFT) |
-| `-ctvd, --cache-type-v-draft TYPE` | KV cache data type for V for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1, tbq3_0, tbq4_0, tbqp3_0, tbqp4_0<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V_DRAFT) |
+| `-ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1, tbq3_0, tbq4_0<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K_DRAFT) |
+| `-ctvd, --cache-type-v-draft TYPE` | KV cache data type for V for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1, tbq3_0, tbq4_0<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V_DRAFT) |
 
 ### Sampling params
 
diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp
index 3601662224..560d7061a9 100644
--- a/tools/llama-bench/llama-bench.cpp
+++ b/tools/llama-bench/llama-bench.cpp
@@ -489,12 +489,6 @@ static ggml_type ggml_type_from_name(const std::string & s) {
     if (s == "tbq4_0") {
         return GGML_TYPE_TBQ4_0;
     }
-    if (s == "tbqp3_0") {
-        return GGML_TYPE_TBQP3_0;
-    }
-    if (s == "tbqp4_0") {
-        return GGML_TYPE_TBQP4_0;
-    }
 
     return GGML_TYPE_COUNT;
 }
diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp
index 9c1c10fecb..b559af2996 100644
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -47,8 +47,6 @@ static const std::vector<quant_option> QUANT_OPTIONS = {
     { "TQ2_0",   LLAMA_FTYPE_MOSTLY_TQ2_0,   " 2.06 bpw ternarization", },
     { "TBQ3_0",  LLAMA_FTYPE_MOSTLY_TBQ3_0,  " 3.06 bpw TurboQuant", },
     { "TBQ4_0",  LLAMA_FTYPE_MOSTLY_TBQ4_0,  " 4.06 bpw TurboQuant", },
-    { "TBQP3_0", LLAMA_FTYPE_MOSTLY_TBQP3_0, " 3.13 bpw TurboQuant prod", },
-    { "TBQP4_0", LLAMA_FTYPE_MOSTLY_TBQP4_0, " 4.13 bpw TurboQuant prod", },
     { "Q2_K",    LLAMA_FTYPE_MOSTLY_Q2_K,    " 2.96G, +3.5199 ppl @ Llama-3-8B", },
     { "Q2_K_S",  LLAMA_FTYPE_MOSTLY_Q2_K_S,  " 2.96G, +3.1836 ppl @ Llama-3-8B", },
     { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", },
diff --git a/tools/server/README.md b/tools/server/README.md
index c4b34103f0..f25df0f8cc 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -69,8 +69,8 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `-kvo, --kv-offload, -nkvo, --no-kv-offload` | whether to enable KV cache offloading (default: enabled)<br/>(env: LLAMA_ARG_KV_OFFLOAD) |
 | `--repack, -nr, --no-repack` | whether to enable weight repacking (default: enabled)<br/>(env: LLAMA_ARG_REPACK) |
 | `--no-host` | bypass host buffer allowing extra buffers to be used<br/>(env: LLAMA_ARG_NO_HOST) |
-| `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1, tbq3_0, tbq4_0, tbqp3_0, tbqp4_0<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
-| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1, tbq3_0, tbq4_0, tbqp3_0, tbqp4_0<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
+| `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1, tbq3_0, tbq4_0<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
+| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1, tbq3_0, tbq4_0<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
 | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
 | `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
 | `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
@@ -113,8 +113,8 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:<br/>- 0: generic output<br/>- 1: error<br/>- 2: warning<br/>- 3: info<br/>- 4: debug<br/>(default: 3)<br/><br/>(env: LLAMA_LOG_VERBOSITY) |
 | `--log-prefix` | Enable prefix in log messages<br/>(env: LLAMA_LOG_PREFIX) |
 | `--log-timestamps` | Enable timestamps in log messages<br/>(env: LLAMA_LOG_TIMESTAMPS) |
-| `-ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1, tbq3_0, tbq4_0, tbqp3_0, tbqp4_0<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K_DRAFT) |
-| `-ctvd, --cache-type-v-draft TYPE` | KV cache data type for V for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1, tbq3_0, tbq4_0, tbqp3_0, tbqp4_0<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V_DRAFT) |
+| `-ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1, tbq3_0, tbq4_0<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K_DRAFT) |
+| `-ctvd, --cache-type-v-draft TYPE` | KV cache data type for V for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1, tbq3_0, tbq4_0<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V_DRAFT) |
 
 ### Sampling params