diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index f759e2d588..3ab6335255 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -427,7 +427,8 @@ extern "C" { // GGML_TYPE_IQ4_NL_4_8 = 37, // GGML_TYPE_IQ4_NL_8_8 = 38, GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block) - GGML_TYPE_COUNT = 40, + GGML_TYPE_Q2_0C = 40, + GGML_TYPE_COUNT = 41, }; // precision diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index 93ab7ea446..deebacc951 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -255,6 +255,13 @@ typedef struct { } block_tq2_0; static_assert(sizeof(block_tq2_0) == sizeof(ggml_half) + QK_K / 4, "wrong tq2_0 block size/padding"); +#define QKQ2_0C 512 +typedef struct { + ggml_half d; + uint8_t qs[QKQ2_0C / 4]; +} block_q2_0c; +static_assert(sizeof(block_q2_0c) == sizeof(ggml_half) + (QKQ2_0C / 4), "wrong q2_0c block size/padding"); + // // Super-block quantization structures // diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 7622d0bf49..f2350e725f 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -561,9 +561,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name) # Fetch KleidiAI sources: include(FetchContent) - set(KLEIDIAI_COMMIT_TAG "v1.16.0") + set(KLEIDIAI_COMMIT_TAG "v1.21.0") set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz") - set(KLEIDIAI_ARCHIVE_MD5 "0a9e9008adb6031f9e8cf70dff4a3321") + set(KLEIDIAI_ARCHIVE_MD5 "8b8525adc0eb9e2a16f765743a706ac8") if (POLICY CMP0135) cmake_policy(SET CMP0135 NEW) @@ -606,6 +606,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name) ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/ ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/ ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/ + ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsu2cxp/ ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/) set(ARCH_FLAGS_TEMP "${ARCH_FLAGS}") @@ -626,7 +627,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name) ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f32.c - ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi8cxp_qsi8cx_neon.c) + ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi8cxp_qsi8cx_neon.c + ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsu2cxp4vlx4_qsu2cx_neon.c) if (NOT DOTPROD_ENABLED MATCHES -1) list(APPEND GGML_KLEIDIAI_SOURCES @@ -656,6 +658,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name) ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa_asm.S ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c + ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsu2cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsu2cxp4vlx4_1x4vl_sme2_dot.c + ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsu2cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsu2cxp4vlx4_1x4vl_sme2_dot_asm.S + ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsu2cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsu2cxp4vlx4_1vlx4vl_sme2_mopa.c + 
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsu2cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsu2cxp4vlx4_1vlx4vl_sme2_mopa_asm.S ${KLEIDIAI_SRC}/kai/kai_common_sme_asm.S) set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2") endif() diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index b003fe13fd..2c5d395798 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -384,6 +384,12 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, }, + [GGML_TYPE_Q2_0C] = { + .from_float = quantize_row_q2_0c, + .vec_dot = NULL, // TODO: We should have the fallback kernel when KleidiAI is not used + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, [GGML_TYPE_I32] = { .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_i32, }, diff --git a/ggml/src/ggml-cpu/kleidiai/kernels.cpp b/ggml/src/ggml-cpu/kleidiai/kernels.cpp index d114f2d49b..659e205776 100644 --- a/ggml/src/ggml-cpu/kleidiai/kernels.cpp +++ b/ggml/src/ggml-cpu/kleidiai/kernels.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2025-2026 Arm Limited and/or its affiliates // SPDX-License-Identifier: MIT // @@ -32,6 +32,12 @@ #include "kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.h" #include "kai_rhs_pack_nxk_qsi8cxp_qsi8cx_neon.h" +#include "kai_lhs_quant_pack_qai8dxp_f32.h" +#include "kai_matmul_clamp_f32_qai8dxp1x4_qsu2cxp4vlx4_1x4vl_sme2_dot.h" +#include "kai_matmul_clamp_f32_qai8dxp1vlx4_qsu2cxp4vlx4_1vlx4vl_sme2_mopa.h" +#include "kai_matmul_clamp_f32_qai8dxp_qsu2cxp_interface.h" +#include "kai_rhs_pack_nxk_qsu2cxp4vlx4_qsu2cx_neon.h" + #include "kai_common.h" #include "simd-mappings.h" @@ -77,6 +83,15 @@ static inline void kernel_run_float_fn10(size_t m, size_t n, size_t k, size_t /* Fn(m, n, k, lhs, rhs, static_cast(dst), dst_stride_row, dst_stride_col, clamp_min, clamp_max); } +template +static inline void kernel_run_float_fn11_int2(size_t m, size_t n, size_t k, size_t /*bl*/, + const void* lhs, const void* rhs, void* dst, + size_t dst_stride_row, size_t dst_stride_col, + float clamp_min, float clamp_max, const int32_t* lut) { + + Fn(m, n, k, lhs, rhs, static_cast(dst), dst_stride_row, dst_stride_col, clamp_min, clamp_max, lut); +} + template static inline size_t lhs_ps_fn6(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr) { return Fn(m, k, bl, mr, kr, sr); @@ -164,6 +179,18 @@ static inline void rhs_pack_scale_fn12(size_t num_groups, size_t n, size_t k, si static_cast(params)); } +template +static inline void rhs_pack_scale_fn12_int2(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t /*bl*/, + size_t /*rhs_stride*/, const void* rhs, const void* bias, const void* scale, + void* rhs_packed, size_t extra_bytes, const void* params, const int32_t* lut) { + Fn(num_groups, n, k, nr, kr, sr, + static_cast(rhs), + static_cast(bias), + static_cast(scale), + rhs_packed, extra_bytes, + static_cast(params), lut); +} + template static inline void rhs_pack_fn13(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t /*bl*/, size_t rhs_stride, const void* rhs, const void* bias, const void* scale, @@ -320,6 +347,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = { /* .get_lhs_offset_ex = */ &kernel_offs_fn3, /* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3, /* .run_kernel_ex = */ &kernel_run_fn11, + /* .run_kernel_lut_ex = */ nullptr, }, /* 
.gemm_lhs_info = */ {
@@ -341,6 +369,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
         /* .get_lhs_offset_ex = */ &kernel_offs_fn3,
         /* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3,
         /* .run_kernel_ex = */ &kernel_run_fn11,
+        /* .run_kernel_lut_ex = */ nullptr,
     },
     /* .gemv_lhs_info = */ {
         /* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32_neon,
@@ -354,6 +383,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
         /* .packed_size_ex = */ &rhs_ps_fn5,
         /* .packed_stride_ex = */ &rhs_stride_fn4,
         /* .pack_func_ex = */ &rhs_pack_fn12,
+        /* .pack_func_lut_ex = */ nullptr,
     },
     /* .required_cpu = */ CPU_FEATURE_SME,
     /* .lhs_type = */ GGML_TYPE_F32,
@@ -374,6 +404,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
         /* .get_lhs_offset_ex = */ &kernel_offs_fn2,
         /* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2,
         /* .run_kernel_ex = */ &kernel_run_fn10,
+        /* .run_kernel_lut_ex = */ nullptr,
     },
     /* .gemm_lhs_info = */ {
         /* .get_offset = */ kai_get_lhs_offset_lhs_pack_bf16p2vlx2_f32_sme,
@@ -394,6 +425,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
         /* .get_lhs_offset_ex = */ nullptr,
         /* .get_rhs_packed_offset_ex = */ nullptr,
         /* .run_kernel_ex = */ nullptr,
+        /* .run_kernel_lut_ex = */ nullptr,
     },
     /* .gemv_lhs_info = */ {
         /* .get_offset = */ kai_get_lhs_offset_lhs_pack_bf16p2vlx2_f32_sme,
@@ -407,6 +439,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
         /* .packed_size_ex = */ &rhs_ps_fn2,
         /* .packed_stride_ex = */ &rhs_stride_fn1,
         /* .pack_func_ex = */ &rhs_pack_fn13,
+        /* .pack_func_lut_ex = */ nullptr,
     },
     /* .required_cpu = */ CPU_FEATURE_SME,
     /* .lhs_type = */ GGML_TYPE_F32,
@@ -430,6 +463,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
         /* .get_lhs_offset_ex = */ &kernel_offs_fn3,
         /* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3,
         /* .run_kernel_ex = */ &kernel_run_fn11,
+        /* .run_kernel_lut_ex = */ nullptr,
     },
     /* .gemm_lhs_info = */ {
         /* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
@@ -450,6 +484,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
         /* .get_lhs_offset_ex = */ &kernel_offs_fn3,
         /* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3,
         /* .run_kernel_ex = */ &kernel_run_fn11,
+        /* .run_kernel_lut_ex = */ nullptr,
     },
     /* .gemv_lhs_info = */ {
         /* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
@@ -463,6 +498,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
         /* .packed_size_ex = */ &rhs_ps_fn5,
         /* .packed_stride_ex = */ &rhs_stride_fn4,
         /* .pack_func_ex = */ &rhs_pack_fn12,
+        /* .pack_func_lut_ex = */ nullptr,
     },
     /* .required_cpu = */ CPU_FEATURE_DOTPROD,
     /* .lhs_type = */ GGML_TYPE_F32,
@@ -485,6 +521,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
         /* .get_lhs_offset_ex = */ &kernel_offs_fn3,
         /* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3,
         /* .run_kernel_ex = */ &kernel_run_fn11,
+        /* .run_kernel_lut_ex = */ nullptr,
     },
     /* .gemm_lhs_info = */ {
         /* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p4x8sb_f32_neon,
@@ -505,6 +542,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
         /* .get_lhs_offset_ex = */ &kernel_offs_fn3,
         /* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3,
         /* .run_kernel_ex = */ &kernel_run_fn11,
+        /* .run_kernel_lut_ex = */ nullptr,
     },
     /* .gemv_lhs_info = */ {
         /* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
@@ -518,6 +556,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
         /* .packed_size_ex = */ &rhs_ps_fn5,
         /* .packed_stride_ex = */ &rhs_stride_fn4,
         /* .pack_func_ex = */ &rhs_pack_fn12,
+        /* .pack_func_lut_ex = 
*/ nullptr, }, /* .required_cpu = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM, /* .lhs_type = */ GGML_TYPE_F32, @@ -695,6 +734,68 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = { { /* Sentinel */ } }; +static ggml_kleidiai_kernels gemm_gemv_kernels_q2_0c[] { +#if defined(__ARM_FEATURE_SME) + { + /* SME GEMM */ + { + /* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qai8dxp1vlx4_qsu2cxp4vlx4_1vlx4vl_sme2_mopa, + /* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qai8dxp1vlx4_qsu2cxp4vlx4_1vlx4vl_sme2_mopa, + /* .get_mr = */ kai_get_mr_matmul_clamp_f32_qai8dxp1vlx4_qsu2cxp4vlx4_1vlx4vl_sme2_mopa, + /* .get_nr = */ kai_get_nr_matmul_clamp_f32_qai8dxp1vlx4_qsu2cxp4vlx4_1vlx4vl_sme2_mopa, + /* .get_kr = */ kai_get_kr_matmul_clamp_f32_qai8dxp1vlx4_qsu2cxp4vlx4_1vlx4vl_sme2_mopa, + /* .get_sr = */ kai_get_sr_matmul_clamp_f32_qai8dxp1vlx4_qsu2cxp4vlx4_1vlx4vl_sme2_mopa, + /* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qai8dxp1vlx4_qsu2cxp4vlx4_1vlx4vl_sme2_mopa, + /* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qai8dxp1vlx4_qsu2cxp4vlx4_1vlx4vl_sme2_mopa, + /* .get_lhs_offset_ex = */ &kernel_offs_fn2, + /* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2, + /* .run_kernel_ex = */ nullptr, + /* .run_kernel_lut_ex = */ &kernel_run_float_fn11_int2, + }, + /* .gemm_lhs_info = */ { + /* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32, + /* .get_packed_offset_ex = */ &lhs_offs_fn5, + /* .packed_size_ex = */ &lhs_ps_fn5, + /* .pack_func_ex = */ &lhs_pack_float_fn9_no_bl, + }, + /* SME GEMV */ + { + /* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qai8dxp1x4_qsu2cxp4vlx4_1x4vl_sme2_dot, + /* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qai8dxp1x4_qsu2cxp4vlx4_1x4vl_sme2_dot, + /* .get_mr = */ kai_get_mr_matmul_clamp_f32_qai8dxp1x4_qsu2cxp4vlx4_1x4vl_sme2_dot, + /* .get_nr = */ kai_get_nr_matmul_clamp_f32_qai8dxp1x4_qsu2cxp4vlx4_1x4vl_sme2_dot, + /* .get_kr = */ kai_get_kr_matmul_clamp_f32_qai8dxp1x4_qsu2cxp4vlx4_1x4vl_sme2_dot, + /* .get_sr = */ kai_get_sr_matmul_clamp_f32_qai8dxp1x4_qsu2cxp4vlx4_1x4vl_sme2_dot, + /* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qai8dxp1x4_qsu2cxp4vlx4_1x4vl_sme2_dot, + /* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qai8dxp1x4_qsu2cxp4vlx4_1x4vl_sme2_dot, + /* .get_lhs_offset_ex = */ &kernel_offs_fn2, + /* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2, + /* .run_kernel_ex = */ nullptr, + /* .run_kernel_lut_ex = */ &kernel_run_float_fn11_int2, + }, + /* .gemv_lhs_info = */ { + /* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32, + /* .get_packed_offset_ex = */ &lhs_offs_fn5, + /* .packed_size_ex = */ &lhs_ps_fn5, + /* .pack_func_ex = */ &lhs_pack_float_fn9_no_bl, + }, + /* .rhs_info = */ { + /* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsu2cxp4vlx4_qsu2cx_neon, + /* .to_float = */ nullptr, + /* .packed_size_ex = */ &rhs_ps_fn5, + /* .packed_stride_ex = */ nullptr, + /* .pack_func_ex = */ nullptr, + /* .pack_func_lut_ex = */ &rhs_pack_scale_fn12_int2, + }, + /* .required_cpu = */ CPU_FEATURE_SME, + /* .lhs_type = */ GGML_TYPE_F32, + /* .rhs_type = */ GGML_TYPE_Q2_0C, + /* .op_type = */ GGML_TYPE_F32, + }, +#endif + { /* Sentinel */ } +}; + static ggml_kleidiai_kernels gemm_gemv_kernels_q8[] = { #if defined(__ARM_FEATURE_SME) { @@ -711,6 +812,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels_q8[] = { /* .get_lhs_offset_ex = */ &kernel_offs_fn2, /* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2, /* .run_kernel_ex = */ &kernel_run_float_fn10, + /* .run_kernel_lut_ex = */ 
nullptr, }, /* .gemm_lhs_info = */ { /* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32, @@ -731,6 +833,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels_q8[] = { /* .get_lhs_offset_ex = */ &kernel_offs_fn2, /* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2, /* .run_kernel_ex = */ &kernel_run_float_fn10, + /* .run_kernel_lut_ex = */ nullptr, }, /* .gemv_lhs_info = */ { /* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32, @@ -744,6 +847,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels_q8[] = { /* .packed_size_ex = */ &rhs_ps_fn5, /* .packed_stride_ex = */ &rhs_stride_fn4, /* .pack_func_ex = */ &rhs_pack_scale_fn12, + /* .pack_func_lut_ex = */ nullptr, }, /* .required_cpu = */ CPU_FEATURE_SME, /* .lhs_type = */ GGML_TYPE_F32, @@ -766,6 +870,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels_q8[] = { /* .get_lhs_offset_ex = */ &kernel_offs_fn2, /* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2, /* .run_kernel_ex = */ &kernel_run_float_fn10, + /* .run_kernel_lut_ex = */ nullptr, }, /* .gemm_lhs_info = */ { /* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32, @@ -786,6 +891,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels_q8[] = { /* .get_lhs_offset_ex = */ &kernel_offs_fn2, /* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2, /* .run_kernel_ex = */ &kernel_run_float_fn10, + /* .run_kernel_lut_ex = */ nullptr, }, /* .gemv_lhs_info = */ { /* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32, @@ -799,6 +905,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels_q8[] = { /* .packed_size_ex = */ &rhs_ps_fn5, /* .packed_stride_ex = */ &rhs_stride_fn4, /* .pack_func_ex = */ &rhs_pack_scale_fn12, + /* .pack_func_lut_ex = */ nullptr, }, /* .required_cpu = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM, /* .lhs_type = */ GGML_TYPE_F32, @@ -821,6 +928,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels_q8[] = { /* .get_lhs_offset_ex = */ &kernel_offs_fn2, /* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2, /* .run_kernel_ex = */ &kernel_run_float_fn10, + /* .run_kernel_lut_ex = */ nullptr, }, /* .gemm_lhs_info = */ { /* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32, @@ -841,6 +949,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels_q8[] = { /* .get_lhs_offset_ex = */ &kernel_offs_fn2, /* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2, /* .run_kernel_ex = */ &kernel_run_float_fn10, + /* .run_kernel_lut_ex = */ nullptr, }, /* .gemv_lhs_info = */ { /* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32, @@ -854,6 +963,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels_q8[] = { /* .packed_size_ex = */ &rhs_ps_fn5, /* .packed_stride_ex = */ &rhs_stride_fn4, /* .pack_func_ex = */ &rhs_pack_scale_fn12, + /* .pack_func_lut_ex = */ nullptr, }, /* .required_cpu = */ CPU_FEATURE_DOTPROD, /* .lhs_type = */ GGML_TYPE_F32, @@ -890,9 +1000,21 @@ ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, c } else { try_table(gemm_gemv_kernels); } + if (!kernel) { + for (size_t i = 0; i < NELEMS(gemm_gemv_kernels_q2_0c) - 1; ++i) { + if ((cpu_features & gemm_gemv_kernels_q2_0c[i].required_cpu) == gemm_gemv_kernels_q2_0c[i].required_cpu && + gemm_gemv_kernels_q2_0c[i].lhs_type == tensor->src[1]->type && + gemm_gemv_kernels_q2_0c[i].rhs_type == tensor->src[0]->type && + gemm_gemv_kernels_q2_0c[i].op_type == tensor->type) { + kernel = &gemm_gemv_kernels_q2_0c[i]; + break; + } + } + } #else GGML_UNUSED(gemm_gemv_kernels); GGML_UNUSED(gemm_gemv_kernels_q8); + GGML_UNUSED(gemm_gemv_kernels_q2_0c); 
GGML_UNUSED(cpu_features); #endif } @@ -936,3 +1058,20 @@ ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q8_0(cpu_feature features) return kernels; } + +ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q2_0c(cpu_feature features) { + ggml_kleidiai_kernels * kernels = nullptr; + +#if defined(__ARM_FEATURE_SME) || defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8) + for (size_t i = 0; i < NELEMS(gemm_gemv_kernels_q2_0c) - 1; ++i) { + if ((features & gemm_gemv_kernels_q2_0c[i].required_cpu) == gemm_gemv_kernels_q2_0c[i].required_cpu) { + kernels = &gemm_gemv_kernels_q2_0c[i]; + break; + } + } +#else + GGML_UNUSED(features); +#endif + + return kernels; +} diff --git a/ggml/src/ggml-cpu/kleidiai/kernels.h b/ggml/src/ggml-cpu/kleidiai/kernels.h index 129245400b..2073c0eba9 100644 --- a/ggml/src/ggml-cpu/kleidiai/kernels.h +++ b/ggml/src/ggml-cpu/kleidiai/kernels.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2025-2026 Arm Limited and/or its affiliates // SPDX-License-Identifier: MIT // @@ -42,6 +42,12 @@ struct kernel_info { const void* lhs_packed, const void* rhs_packed, void* dst, size_t dst_stride_row, size_t dst_stride_col, float clamp_min, float clamp_max); + + void (*run_kernel_lut_ex)( + size_t m, size_t n, size_t k, size_t bl, + const void* lhs_packed, const void* rhs_packed, + void* dst, size_t dst_stride_row, size_t dst_stride_col, + float clamp_min, float clamp_max, const int32_t* lut); }; struct lhs_packing_info { @@ -68,6 +74,9 @@ struct rhs_packing_info { void (*pack_func_ex)(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t bl, size_t rhs_stride, const void * rhs, const void * bias, const void * scale, void * rhs_packed, size_t extra_bytes, const void * params); + + void (*pack_func_lut_ex)(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t bl, + size_t rhs_stride, const void * rhs, const void * bias, const void * scale, void * rhs_packed, size_t extra_bytes, const void * params, const int32_t* lut); }; struct ggml_kleidiai_kernels { @@ -88,3 +97,4 @@ struct ggml_kleidiai_kernels { ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, const ggml_tensor * tensor); ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q4_0(cpu_feature features); ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q8_0(cpu_feature features); +ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q2_0c(cpu_feature features); diff --git a/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp b/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp index ad23e73184..4927208cd4 100644 --- a/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +++ b/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2025-2026 Arm Limited and/or its affiliates // SPDX-License-Identifier: MIT // #include @@ -43,7 +43,8 @@ struct ggml_kleidiai_context { cpu_feature features; ggml_kleidiai_kernels * kernels_q4; ggml_kleidiai_kernels * kernels_q8; -} static ctx = { CPU_FEATURE_NONE, NULL, NULL }; + ggml_kleidiai_kernels * kernels_q2c; +} static ctx = { CPU_FEATURE_NONE, NULL, NULL, NULL }; static const char* cpu_feature_to_string(cpu_feature f) { if (f == CPU_FEATURE_NONE) { @@ -84,8 +85,9 @@ static void init_kleidiai_context(void) { if (sme_enabled != 0) { ctx.features |= ggml_cpu_has_sme() ? 
CPU_FEATURE_SME : CPU_FEATURE_NONE; } - ctx.kernels_q4 = ggml_kleidiai_select_kernels_q4_0(ctx.features); - ctx.kernels_q8 = ggml_kleidiai_select_kernels_q8_0(ctx.features); + ctx.kernels_q4 = ggml_kleidiai_select_kernels_q4_0(ctx.features); + ctx.kernels_q8 = ggml_kleidiai_select_kernels_q8_0(ctx.features); + ctx.kernels_q2c = ggml_kleidiai_select_kernels_q2_0c(ctx.features); #ifndef NDEBUG if (ctx.kernels_q4) { GGML_LOG_DEBUG("kleidiai: using q4 kernel with CPU feature %s\n", cpu_feature_to_string(ctx.kernels_q4->required_cpu)); @@ -93,6 +95,9 @@ static void init_kleidiai_context(void) { if (ctx.kernels_q8) { GGML_LOG_DEBUG("kleidiai: using q8 kernel with CPU feature %s\n", cpu_feature_to_string(ctx.kernels_q8->required_cpu)); } + if (ctx.kernels_q2c) { + GGML_LOG_DEBUG("kleidiai: using q2c kernel with CPU feature %s\n", cpu_feature_to_string(ctx.kernels_q2c->required_cpu)); + } #endif } ggml_critical_section_end(); @@ -148,6 +153,9 @@ class tensor_traits : public ggml::cpu::tensor_traits { } else if (kernels->rhs_type == GGML_TYPE_Q8_0) { if (!lhs_info->packed_size_ex) return false; size = lhs_info->packed_size_ex(m, k, QK8_0, mr, kr, sr); + } else if (kernels->rhs_type == GGML_TYPE_Q2_0C) { + if (!lhs_info->packed_size_ex) return false; + size = lhs_info->packed_size_ex(m, k, QKQ2_0C, mr, kr, sr); } else if (kernels->rhs_type == GGML_TYPE_F16) { if (!lhs_info->packed_size_ex || !kernels->rhs_info.packed_size_ex) return false; const int64_t lhs_batch_size0 = op->src[1]->ne[2]; @@ -171,6 +179,8 @@ class tensor_traits : public ggml::cpu::tensor_traits { return compute_forward_q8_0(params, dst); } else if (dst->src[0]->type == GGML_TYPE_F16) { return compute_forward_fp16(params, dst); + } else if (dst->src[0]->type == GGML_TYPE_Q2_0C && ctx.kernels_q2c != nullptr) { + return compute_forward_q2_0c(params, dst); } } else if (dst->op == GGML_OP_GET_ROWS) { if (dst->src[0]->type == GGML_TYPE_Q4_0 || dst->src[0]->type == GGML_TYPE_Q8_0) { @@ -504,6 +514,103 @@ class tensor_traits : public ggml::cpu::tensor_traits { return true; } + bool compute_forward_q2_0c(struct ggml_compute_params * params, struct ggml_tensor * dst) { + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_Q2_0C); + GGML_ASSERT(dst->src[1]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + + GGML_TENSOR_BINARY_OP_LOCALS + + ggml_kleidiai_kernels *kernels = ggml_kleidiai_select_kernels(ctx.features, dst); + + // Look-up table used to unpack the int2 values + static const int32_t lut_i8_i2[4] = {-3, -1, 1, 3}; + + if (!kernels) { + return false; + } + + bool is_gemv = src1->ne[1] == 1; + kernel_info * kernel = is_gemv ? &kernels->gemv : &kernels->gemm; + lhs_packing_info * lhs_info = is_gemv ? &kernels->gemv_lhs_info : &kernels->gemm_lhs_info; + + GGML_ASSERT(kernel); + if (!lhs_info->get_packed_offset_ex || !lhs_info->pack_func_ex || + !kernel->get_rhs_packed_offset_ex || !kernel->run_kernel_lut_ex || !kernel->get_dst_offset) { + return false; + } + + const int ith = params->ith; + const int nth_raw = params->nth; + const int nth = nth_raw > 0 ? 
nth_raw : 1;
+
+        const size_t k = ne00;
+        const size_t m = ne11;
+        const size_t n = ne01;
+
+        size_t mr = kernel->get_mr();
+        size_t kr = kernel->get_kr();
+        size_t sr = kernel->get_sr();
+
+        const uint8_t * lhs        = static_cast<const uint8_t *>(src1->data);
+        uint8_t       * lhs_packed = (uint8_t *) params->wdata;
+        const uint8_t * rhs_packed = static_cast<const uint8_t *>(src0->data);
+
+        const size_t n_step = kernel->get_n_step();
+        const size_t num_n_per_thread = kai_roundup(kai_roundup(n, nth) / nth, n_step);
+        const size_t n_start = ith * num_n_per_thread;
+
+        size_t n_to_process = 0;
+        if (n_start < n) {
+            n_to_process = num_n_per_thread;
+            if ((n_start + n_to_process) > n) {
+                n_to_process = n - n_start;
+            }
+        }
+
+        // Calculate the number of rows to be processed per thread
+        const size_t num_m_per_thread = kai_roundup(m, mr * nth) / nth;
+        const size_t m_start = ith * num_m_per_thread;
+        size_t m_to_process = num_m_per_thread;
+        if ((m_start + m_to_process) > m) {
+            m_to_process = m - m_start;
+        }
+
+        if (m_start < m) {
+            // Transform LHS
+            const size_t src_stride = src1->nb[1];
+            const float * src_ptr = reinterpret_cast<const float *>(lhs + lhs_info->get_offset(m_start, src_stride));
+            const size_t lhs_packed_offset = lhs_info->get_packed_offset_ex(m_start, k, QKQ2_0C, mr, kr, sr);
+            void * lhs_packed_ptr = static_cast<void *>(lhs_packed + lhs_packed_offset);
+
+            // Pack this thread's chunk with m_idx_start = 0 and a per-thread output pointer
+            lhs_info->pack_func_ex(m_to_process, k, QKQ2_0C, mr, kr, sr, 0, src_ptr, src_stride, lhs_packed_ptr);
+        }
+
+        ggml_barrier(params->threadpool);
+
+        if (n_to_process > 0) {
+            const size_t dst_stride = dst->nb[1];
+            const size_t lhs_packed_offset = lhs_info->get_packed_offset_ex(0, k, 0, mr, kr, sr);
+            const size_t rhs_packed_offset = kernel->get_rhs_packed_offset_ex(n_start, k, 0);
+            const size_t dst_offset = kernel->get_dst_offset(0, n_start, dst_stride);
+            const void * rhs_ptr = static_cast<const void *>(rhs_packed + rhs_packed_offset);
+            const void * lhs_ptr = static_cast<const void *>(lhs_packed + lhs_packed_offset);
+            float * dst_ptr = reinterpret_cast<float *>(static_cast<uint8_t *>(dst->data) + dst_offset);
+
+            kernel->run_kernel_lut_ex(m, n_to_process, k, 0, lhs_ptr, rhs_ptr, dst_ptr, dst_stride,
+                                      sizeof(float), -FLT_MAX, FLT_MAX, &lut_i8_i2[0]);
+        }
+
+        return true;
+    }
+
 bool compute_forward_get_rows(struct ggml_compute_params * params, struct ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
     const ggml_tensor * src1 = dst->src[1];
@@ -565,6 +672,34 @@ class tensor_traits : public ggml::cpu::tensor_traits {
         return true;
     }
 
+    // Split n x k block_q2_0c data into the packed 2-bit values and one
+    // float scale per row (per output channel)
+    void split_values_scales_per_channel(
+        const block_q2_0c *data,
+        size_t n,
+        size_t k,
+        uint8_t *values_out,
+        float *scales_out)
+    {
+        const size_t blocks_per_row  = k / QKQ2_0C;
+        const size_t bytes_per_block = QKQ2_0C / 4;
+
+        for (size_t row = 0; row < n; ++row) {
+            for (size_t b = 0; b < blocks_per_row; ++b) {
+                size_t block_idx = row * blocks_per_row + b;
+
+                const block_q2_0c *src_block = &data[block_idx];
+
+                // 1. Copy the packed values (QKQ2_0C / 4 bytes per block)
+                memcpy(&values_out[block_idx * bytes_per_block], src_block->qs, bytes_per_block);
+
+                // 2. Copy the scale once per row: all blocks of a row share it
+                if (b == 0) {
+                    scales_out[row] = GGML_FP16_TO_FP32(src_block->d);
+                }
+            }
+        }
+    }
+
 public:
     int repack(struct ggml_tensor * tensor, const void * data, size_t data_size) {
         const size_t n = tensor->ne[1];
@@ -648,6 +783,68 @@ public:
                 tensor->data, 0, &params);
             GGML_UNUSED(data_size);
             return 0;
+        } else if (tensor->type == GGML_TYPE_Q2_0C) {
+            if (!ctx.kernels_q2c) {
+                return -1;
+            }
+
+            // data is n (rows) x k (columns) of block_q2_0c; split it into
+            // the packed 2-bit values and the per-row scales
+
+            // Look-up table used to unpack the int2 values
+            static const int32_t lut_i8_i2[4] = {-3, -1, 1, 3};
+
+            const size_t bytes_per_block = QKQ2_0C / 4;
+            const size_t blocks_per_row  = k / QKQ2_0C;
+            const size_t total_blocks    = n * blocks_per_row;
+
+            const block_q2_0c *src = (const block_q2_0c *) data;
+
+            // Temporary buffers: QKQ2_0C / 4 value bytes per block, and one
+            // float scale per row (not per block!)
+            uint8_t *values_buf = (uint8_t *) malloc(total_blocks * bytes_per_block);
+            float   *scales_buf = (float *)   malloc(n * sizeof(float));
+            if (!values_buf || !scales_buf) {
+                free(values_buf);
+                free(scales_buf);
+                return -1;
+            }
+
+            split_values_scales_per_channel(src, n, k, values_buf, scales_buf);
+
+            size_t nr = ctx.kernels_q2c->gemm.get_nr();
+            size_t kr = ctx.kernels_q2c->gemm.get_kr();
+            size_t sr = ctx.kernels_q2c->gemm.get_sr();
+
+            struct kai_rhs_pack_qs4cxs1s0_param params;
+            params.lhs_zero_point = 1;
+            params.rhs_zero_point = 2;
+
+            ctx.kernels_q2c->rhs_info.pack_func_lut_ex(
+                1, n, k,
+                nr, kr, sr,
+                0, 0,
+                values_buf,
+                nullptr,
+                scales_buf,
+                tensor->data,
+                0, &params,
+                &lut_i8_i2[0]);
+
+            free(values_buf);
+            free(scales_buf);
+
+            GGML_UNUSED(data_size);
+            return 0;
         }
 
         GGML_UNUSED(data_size);
@@ -724,6 +921,18 @@ static size_t ggml_backend_cpu_kleidiai_buffer_type_get_alloc_size(ggml_backend_
         GGML_ASSERT(ctx.kernels_q8);
         kernels   = ctx.kernels_q8;
         block_len = QK8_0;
+    } else if (tensor->type == GGML_TYPE_Q2_0C) {
+        GGML_ASSERT(ctx.kernels_q2c);
+        kernels   = ctx.kernels_q2c;
+        block_len = QKQ2_0C;
+        const size_t nr = kernels->gemm.get_nr();
+        const size_t kr = kernels->gemm.get_kr();
+        const size_t sr = kernels->gemm.get_sr();
+        const size_t packed = kernels->rhs_info.packed_size_ex(n, k, nr, kr, sr);
+        const size_t raw    = ggml_nbytes(tensor);
+
+        return packed > raw ? 
packed : raw; + } else { return 0; } @@ -739,6 +948,23 @@ static size_t ggml_backend_cpu_kleidiai_buffer_type_get_alloc_size(ggml_backend_ namespace ggml::cpu::kleidiai { class extra_buffer_type : ggml::cpu::extra_buffer_type { bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override { + if ((op->op == GGML_OP_MUL_MAT ) && + (op->src[0]->type == GGML_TYPE_Q2_0C) && + op->src[0]->buffer && + (ggml_n_dims(op->src[0]) == 2) && + op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type()) { + if (ctx.kernels_q2c == nullptr) { + return false; + } + if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) { + return false; + } + if ((op->src[1]->type == GGML_TYPE_F32) && + ggml_ne(op->src[1], 2) == 1 && ggml_ne(op->src[1], 3) == 1) { + return true; + } + } + if ((op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_GET_ROWS) && (op->src[0]->type == GGML_TYPE_Q4_0 || op->src[0]->type == GGML_TYPE_Q8_0) && op->src[0]->buffer && diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index ce15b18ce0..6d28e64afb 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -676,6 +676,7 @@ void ggml_compute_forward_add( case GGML_TYPE_Q6_K: case GGML_TYPE_TQ1_0: case GGML_TYPE_TQ2_0: + case GGML_TYPE_Q2_0C: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: @@ -1125,6 +1126,7 @@ void ggml_compute_forward_add1( case GGML_TYPE_Q6_K: case GGML_TYPE_TQ1_0: case GGML_TYPE_TQ2_0: + case GGML_TYPE_Q2_0C: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: @@ -1253,6 +1255,7 @@ void ggml_compute_forward_acc( case GGML_TYPE_Q6_K: case GGML_TYPE_TQ1_0: case GGML_TYPE_TQ2_0: + case GGML_TYPE_Q2_0C: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: @@ -4276,6 +4279,7 @@ void ggml_compute_forward_out_prod( case GGML_TYPE_Q6_K: case GGML_TYPE_TQ1_0: case GGML_TYPE_TQ2_0: + case GGML_TYPE_Q2_0C: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: @@ -4551,6 +4555,7 @@ void ggml_compute_forward_set( case GGML_TYPE_Q6_K: case GGML_TYPE_TQ1_0: case GGML_TYPE_TQ2_0: + case GGML_TYPE_Q2_0C: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: @@ -4773,6 +4778,7 @@ void ggml_compute_forward_get_rows( case GGML_TYPE_Q6_K: case GGML_TYPE_TQ1_0: case GGML_TYPE_TQ2_0: + case GGML_TYPE_Q2_0C: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: @@ -5497,6 +5503,7 @@ void ggml_compute_forward_clamp( case GGML_TYPE_Q6_K: case GGML_TYPE_TQ1_0: case GGML_TYPE_TQ2_0: + case GGML_TYPE_Q2_0C: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: diff --git a/ggml/src/ggml-cpu/quants.c b/ggml/src/ggml-cpu/quants.c index 365cb36d2d..091d1f698d 100644 --- a/ggml/src/ggml-cpu/quants.c +++ b/ggml/src/ggml-cpu/quants.c @@ -104,6 +104,12 @@ void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, quantize_row_tq2_0_ref(x, y, k); } +void quantize_row_q2_0c(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(k % QKQ2_0C == 0); + block_q2_0c * GGML_RESTRICT y = vy; + quantize_row_q2_0c_ref(x, y, k); +} + //===================================== Q8_K ============================================== void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { diff --git a/ggml/src/ggml-cpu/quants.h b/ggml/src/ggml-cpu/quants.h index d83eb1b144..5bc022f1d4 100644 --- a/ggml/src/ggml-cpu/quants.h +++ b/ggml/src/ggml-cpu/quants.h @@ -31,6 +31,8 @@ void 
quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
 void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q2_0c(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+
 void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index de5cbd75e8..0533471935 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -2198,6 +2198,121 @@ void quantize_row_tq2_0_ref(const float * GGML_RESTRICT x, block_tq2_0 * GGML_RE
     }
 }
 
+// map a signed level in {-3, -1, +1, +3} to its 2-bit index
+static inline uint8_t map_int8_to_uint2_idx(int32_t v0) {
+    switch (v0) {
+        case -3:
+            return 0;
+        case -1:
+            return 1;
+        case 1:
+            return 2;
+        case 3:
+            return 3;
+        default:
+            GGML_ASSERT(false);
+    }
+}
+
+// map a 2-bit index back to its signed level in {-3, -1, +1, +3}
+static inline int32_t map_uint2_idx_to_int8(uint8_t v0) {
+    switch (v0) {
+        case 0:
+            return -3;
+        case 1:
+            return -1;
+        case 2:
+            return 1;
+        case 3:
+            return 3;
+        default:
+            GGML_ASSERT(false);
+    }
+}
+
+void quantize_row_q2_0c_ref(const float * GGML_RESTRICT x, block_q2_0c * GGML_RESTRICT y, int64_t k) {
+    const int QK = QKQ2_0C; // block size
+
+    assert(k % QK == 0);
+
+    // ---- Find the per-channel (per-row) min/max ----
+    float xmin = x[0];
+    float xmax = x[0];
+
+    for (int j = 1; j < k; ++j) {
+        const float v = x[j];
+
+        if (v < xmin) xmin = v;
+        if (v > xmax) xmax = v;
+    }
+
+    float d = 0.0f; // scale
+
+    // The four uint2 values [0, 1, 2, 3] map to the int8 levels
+    // [-3, -1, +1, +3], yielding an evenly spaced, zero-centered distribution.
+    const float qmin = -3.0f;
+    const float qmax = 3.0f;
+
+    if (xmax != xmin) {
+        d = (xmax - xmin) / (qmax - qmin);
+    }
+
+    // Number of blocks
+    const int64_t nb = k / QK;
+
+    // All blocks of a row share the same scale.
+    // This enables an optimized matmul implementation.
+    for (int64_t i = 0; i < nb; ++i) {
+        const float *xb = x + i*QK;
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        // ---- Quantize to uint2 ----
+        if (d == 0.0f) {
+            // all values are identical; any index dequantizes to 0 since d == 0
+            memset(y[i].qs, 0, sizeof(y[i].qs));
+        } else {
+            const float inv_d = 1.0f / d;
+
+            for (int j = 0; j < QK; j += 4) {
+                const float v0 = xb[j + 0];
+                const float v1 = xb[j + 1];
+                const float v2 = xb[j + 2];
+                const float v3 = xb[j + 3];
+
+                // round v/d to the nearest odd level in {-3, -1, +1, +3}:
+                // idx = round((v/d + 3) / 2), level = 2*idx - 3
+                int qi0 = 2*(int) lrintf(0.5f*(v0 * inv_d + 3.0f)) - 3;
+                int qi1 = 2*(int) lrintf(0.5f*(v1 * inv_d + 3.0f)) - 3;
+                int qi2 = 2*(int) lrintf(0.5f*(v2 * inv_d + 3.0f)) - 3;
+                int qi3 = 2*(int) lrintf(0.5f*(v3 * inv_d + 3.0f)) - 3;
+
+                // clamp to the representable range [-3, +3]
+                if (qi0 < -3) qi0 = -3;
+                if (qi0 >  3) qi0 =  3;
+                if (qi1 < -3) qi1 = -3;
+                if (qi1 >  3) qi1 =  3;
+                if (qi2 < -3) qi2 = -3;
+                if (qi2 >  3) qi2 =  3;
+                if (qi3 < -3) qi3 = -3;
+                if (qi3 >  3) qi3 =  3;
+
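+                // after the rounding above, qi0..qi3 are always one of the
+                // four odd levels, so the index mapping below can never reach
+                // the GGML_ASSERT(false) default branch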
+ const uint8_t v0_u8 = map_int8_to_uint2_idx(qi0); + const uint8_t v1_u8 = map_int8_to_uint2_idx(qi1); + const uint8_t v2_u8 = map_int8_to_uint2_idx(qi2); + const uint8_t v3_u8 = map_int8_to_uint2_idx(qi3); + + uint8_t rhs_v0 = (v0_u8 & 0x3) | ((v1_u8 << 2) & 0x0C) | ((v2_u8 << 4 & 0x30)) | ((v3_u8 << 6 & 0xC0)); + y[i].qs[j / 4] = rhs_v0; + } + } + } +} + size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { (void)quant_weights; // not used const size_t row_size = ggml_row_size(GGML_TYPE_TQ1_0, n_per_row); @@ -2212,6 +2327,18 @@ size_t quantize_tq2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, return nrow * row_size; } +size_t quantize_q2_0c(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { + (void)quant_weights; // not used + // Number of bytes per row + const size_t row_size = ggml_row_size(GGML_TYPE_Q2_0C, n_per_row); + for(int64_t i = 0; i < nrow; ++i) { + uint8_t * row_dst_bytes = (uint8_t *) dst + (size_t) i * row_size; + block_q2_0c * row_dst = (block_q2_0c *) row_dst_bytes; + quantize_row_q2_0c_ref(src + i * n_per_row, row_dst, n_per_row); + } + return nrow * row_size; +} + void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2270,6 +2397,32 @@ void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_REST } } +void dequantize_row_q2_0c(const block_q2_0c * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { + assert(k % QKQ2_0C == 0); + const int64_t nb = k / QKQ2_0C; + + for (int64_t i = 0; i < nb; ++i) { + + const float d = GGML_FP16_TO_FP32(x[i].d); + + for (size_t j = 0; j < QKQ2_0C; j += 4) { + const uint8_t rhs_byte = x[i].qs[j/4]; + const uint8_t u2_idx0 = ((uint8_t)(rhs_byte & 0x03)); + const uint8_t u2_idx1 = (((uint8_t)((rhs_byte >> 2) & 0x03))); + const uint8_t u2_idx2 = (((uint8_t)((rhs_byte >> 4) & 0x03))); + const uint8_t u2_idx3 = (((uint8_t)((rhs_byte >> 6) & 0x03))); + int32_t q0 = map_uint2_idx_to_int8(u2_idx0); + int32_t q1 = map_uint2_idx_to_int8(u2_idx1); + int32_t q2 = map_uint2_idx_to_int8(u2_idx2); + int32_t q3 = map_uint2_idx_to_int8(u2_idx3); + *y++ = (float) (q0) * d; + *y++ = (float) (q1) * d; + *y++ = (float) (q2) * d; + *y++ = (float) (q3) * d; + } + } +} + // ====================== "True" 2-bit (de)-quantization void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { @@ -5262,6 +5415,10 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte { VALIDATE_ROW_DATA_D_F16_IMPL(block_tq2_0, data, nb); } break; + case GGML_TYPE_Q2_0C: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_q2_0c, data, nb); + } break; case GGML_TYPE_IQ1_S: { VALIDATE_ROW_DATA_D_F16_IMPL(block_iq1_s, data, nb); diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h index 3b688f31c2..7de186a940 100644 --- a/ggml/src/ggml-quants.h +++ b/ggml/src/ggml-quants.h @@ -33,6 +33,8 @@ GGML_API void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_API void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0 * GGML_RESTRICT y, int64_t k); GGML_API void quantize_row_tq2_0_ref(const float * GGML_RESTRICT x, block_tq2_0 * GGML_RESTRICT y, int64_t k); +GGML_API void quantize_row_q2_0c_ref(const float * GGML_RESTRICT x, block_q2_0c * GGML_RESTRICT y, int64_t k); + GGML_API void 
quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k); GGML_API void quantize_row_iq4_nl_ref (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k); GGML_API void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k); @@ -59,6 +61,8 @@ GGML_API void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GG GGML_API void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); GGML_API void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_q2_0c(const block_q2_0c * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); + GGML_API void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); GGML_API void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); GGML_API void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); @@ -83,6 +87,8 @@ GGML_API size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RE GGML_API size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); GGML_API size_t quantize_tq2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API size_t quantize_q2_0c(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); + GGML_API size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); GGML_API size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); GGML_API size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 500cb6b72f..561bc2132b 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -896,6 +896,14 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .type_size = 0, .is_quantized = false, }, + [GGML_TYPE_Q2_0C] = { + .type_name = "q2_0c", + .blck_size = QKQ2_0C, + .type_size = sizeof(block_q2_0c), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q2_0c, + .from_float_ref = (ggml_from_float_t) quantize_row_q2_0c_ref, + }, }; const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) { @@ -7572,6 +7580,7 @@ size_t ggml_quantize_chunk( case GGML_TYPE_Q6_K: result = quantize_q6_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_TQ1_0: result = quantize_tq1_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_TQ2_0: result = quantize_tq2_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q2_0C: result = quantize_q2_0c(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ2_XS: result = quantize_iq2_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ3_XXS: result = 
quantize_iq3_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
diff --git a/include/llama.h b/include/llama.h
index bf4e28a8be..94bcd98f6b 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -152,6 +152,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_TQ1_0         = 36, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_TQ2_0         = 37, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_MXFP4_MOE     = 38, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q2_0C         = 39, // except 1d tensors
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 776222cb6f..f825103614 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -509,6 +509,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         case LLAMA_FTYPE_MOSTLY_Q6_K:    default_type = GGML_TYPE_Q6_K;    break;
         case LLAMA_FTYPE_MOSTLY_TQ1_0:   default_type = GGML_TYPE_TQ1_0;   break;
         case LLAMA_FTYPE_MOSTLY_TQ2_0:   default_type = GGML_TYPE_TQ2_0;   break;
+        case LLAMA_FTYPE_MOSTLY_Q2_0C:   default_type = GGML_TYPE_Q2_0C;   break;
         case LLAMA_FTYPE_MOSTLY_IQ2_XXS: default_type = GGML_TYPE_IQ2_XXS; break;
         case LLAMA_FTYPE_MOSTLY_IQ2_XS:  default_type = GGML_TYPE_IQ2_XS;  break;
         case LLAMA_FTYPE_MOSTLY_IQ2_S:   default_type = GGML_TYPE_IQ2_XS;  break;
diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp
index 0709e0bda0..1d4731322c 100644
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -34,6 +34,7 @@ static const std::vector QUANT_OPTIONS = {
     { "IQ1_M",   LLAMA_FTYPE_MOSTLY_IQ1_M,   " 1.75 bpw quantization", },
     { "TQ1_0",   LLAMA_FTYPE_MOSTLY_TQ1_0,   " 1.69 bpw ternarization", },
     { "TQ2_0",   LLAMA_FTYPE_MOSTLY_TQ2_0,   " 2.06 bpw ternarization", },
+    { "Q2_0C",   LLAMA_FTYPE_MOSTLY_Q2_0C,   " 2.03 bpw quantization", },
     { "Q2_K",    LLAMA_FTYPE_MOSTLY_Q2_K,    " 2.96G, +3.5199 ppl @ Llama-3-8B", },
     { "Q2_K_S",  LLAMA_FTYPE_MOSTLY_Q2_K_S,  " 2.96G, +3.1836 ppl @ Llama-3-8B", },
     { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", },
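
Editor's note on the Q2_0C layout (not part of the patch): a block_q2_0c stores one ggml_half scale plus QKQ2_0C = 512 weights at 2 bits each, i.e. 2 + 512/4 = 130 bytes per 512 weights, which is where the 2.03 bpw figure in quantize.cpp comes from. Element j of a block sits in bits 2*(j%4)..2*(j%4)+1 of byte qs[j/4], and each 2-bit index decodes through the LUT {-3, -1, +1, +3}. A stand-alone sketch of the same pack/unpack scheme used by quantize_row_q2_0c_ref and dequantize_row_q2_0c:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

// 2-bit index -> signed level, same table as lut_i8_i2 in the patch
static const int8_t lut_i8_i2[4] = {-3, -1, 1, 3};

// pack four 2-bit indices into one byte, element 0 in the low bits
static uint8_t pack4(const uint8_t idx[4]) {
    return (uint8_t)((idx[0] & 0x3)        | ((idx[1] & 0x3) << 2) |
                     ((idx[2] & 0x3) << 4) | ((idx[3] & 0x3) << 6));
}

int main(void) {
    const uint8_t idx[4] = {0, 1, 2, 3}; // levels -3, -1, +1, +3
    const uint8_t byte   = pack4(idx);

    for (int j = 0; j < 4; ++j) {
        // same unpacking as dequantize_row_q2_0c
        const uint8_t u2 = (byte >> (2*j)) & 0x3;
        assert(u2 == idx[j]);
        printf("element %d -> index %u -> level %d\n", j, u2, lut_i8_i2[u2]);
    }
    return 0;
}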
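
A round-trip through the reference (de)quantizer is a cheap sanity check when validating the KleidiAI path against the scalar one. A minimal sketch, assuming it is compiled inside the ggml source tree so that ggml-quants.h and block_q2_0c are visible:

#include <math.h>
#include <stdio.h>

#include "ggml-quants.h" // quantize_row_q2_0c_ref, dequantize_row_q2_0c

int main(void) {
    enum { K = 2*QKQ2_0C }; // two blocks, one row (one output channel)
    static float src[K], dst[K];
    static block_q2_0c blocks[K / QKQ2_0C];

    for (int j = 0; j < K; ++j) {
        src[j] = sinf(0.01f*j); // smooth test data in [-1, 1]
    }

    quantize_row_q2_0c_ref(src, blocks, K); // one scale shared by both blocks
    dequantize_row_q2_0c(blocks, dst, K);

    float max_err = 0.0f;
    for (int j = 0; j < K; ++j) {
        const float err = fabsf(src[j] - dst[j]);
        if (err > max_err) max_err = err;
    }
    // adjacent levels are 2*d apart, so the worst in-range error is
    // about d = (xmax - xmin)/6
    printf("max abs round-trip error: %f\n", max_err);
    return 0;
}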
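
On the TODO left in type_traits_cpu (.vec_dot = NULL): vec_dot_type is GGML_TYPE_Q8_K and QKQ2_0C is an exact multiple of QK_K, so a scalar fallback could pair each q2_0c block with QKQ2_0C/QK_K consecutive q8_K blocks. The sketch below is hypothetical - the function name and its registration are assumptions - and it only outlines the shape of such a kernel under the current ggml_vec_dot_t signature:

// hypothetical scalar fallback: dot product of one Q2_0C row with one Q8_K row
static void ggml_vec_dot_q2_0c_q8_K(int n, float * GGML_RESTRICT s, size_t bs,
                                    const void * GGML_RESTRICT vx, size_t bx,
                                    const void * GGML_RESTRICT vy, size_t by, int nrc) {
    GGML_ASSERT(n % QKQ2_0C == 0);
    GGML_ASSERT(nrc == 1);
    GGML_UNUSED(bs); GGML_UNUSED(bx); GGML_UNUSED(by);

    static const int8_t lut_i8_i2[4] = {-3, -1, 1, 3};

    const block_q2_0c * GGML_RESTRICT x = vx;
    const block_q8_K  * GGML_RESTRICT y = vy;

    const int nb = n / QKQ2_0C;
    float sumf = 0.0f;

    for (int i = 0; i < nb; ++i) {
        const float dx = GGML_FP16_TO_FP32(x[i].d);
        // one q2_0c block (512 weights) spans QKQ2_0C/QK_K q8_K blocks
        for (int sub = 0; sub < QKQ2_0C/QK_K; ++sub) {
            const block_q8_K * GGML_RESTRICT yb = &y[i*(QKQ2_0C/QK_K) + sub];
            int32_t sumi = 0;
            for (int j = 0; j < QK_K; ++j) {
                const int e = sub*QK_K + j; // element index within the q2_0c block
                const int8_t w = lut_i8_i2[(x[i].qs[e/4] >> (2*(e%4))) & 0x3];
                sumi += (int32_t) w * yb->qs[j];
            }
            sumf += dx * yb->d * (float) sumi;
        }
    }

    *s = sumf;
}

If adopted, something of this shape would slot into type_traits_cpu[GGML_TYPE_Q2_0C].vec_dot in ggml-cpu.c in place of the NULL.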