Merge e514593221 into 3688c4f504
This commit is contained in:
commit
58fa2cb67f
|
|
@ -427,7 +427,8 @@ extern "C" {
|
||||||
// GGML_TYPE_IQ4_NL_4_8 = 37,
|
// GGML_TYPE_IQ4_NL_4_8 = 37,
|
||||||
// GGML_TYPE_IQ4_NL_8_8 = 38,
|
// GGML_TYPE_IQ4_NL_8_8 = 38,
|
||||||
GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block)
|
GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block)
|
||||||
GGML_TYPE_COUNT = 40,
|
GGML_TYPE_Q2_0C = 40,
|
||||||
|
GGML_TYPE_COUNT = 41,
|
||||||
};
|
};
|
||||||
|
|
||||||
// precision
|
// precision
|
||||||
|
|
|
||||||
|
|
@ -255,6 +255,13 @@ typedef struct {
|
||||||
} block_tq2_0;
|
} block_tq2_0;
|
||||||
static_assert(sizeof(block_tq2_0) == sizeof(ggml_half) + QK_K / 4, "wrong tq2_0 block size/padding");
|
static_assert(sizeof(block_tq2_0) == sizeof(ggml_half) + QK_K / 4, "wrong tq2_0 block size/padding");
|
||||||
|
|
||||||
|
// Block-size constant for the Q2_0C type; it sizes the qs payload below (QKQ2_0C / 4 bytes).
// NOTE(review): presumably the number of quantized values per block, i.e. 4 values
// (2 bits each) per byte — confirm against the quantize/dequantize code.
#define QKQ2_0C 512
|
||||||
|
typedef struct {
|
||||||
|
ggml_half d; // NOTE(review): presumably the per-block scale, as in the other block_* structs — confirm
|
||||||
|
uint8_t qs[QKQ2_0C / 4]; // packed payload: QKQ2_0C / 4 bytes
|
||||||
|
} block_q2_0c;
|
||||||
|
// Guard the layout at compile time: the struct must be exactly one ggml_half plus the qs bytes, with no padding.
static_assert(sizeof(block_q2_0c) == sizeof(ggml_half) + (QKQ2_0C / 4), "wrong q2_0c block size/padding");
|
||||||
|
|
||||||
//
|
//
|
||||||
// Super-block quantization structures
|
// Super-block quantization structures
|
||||||
//
|
//
|
||||||
|
|
|
||||||
|
|
@ -561,9 +561,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||||
|
|
||||||
# Fetch KleidiAI sources:
|
# Fetch KleidiAI sources:
|
||||||
include(FetchContent)
|
include(FetchContent)
|
||||||
set(KLEIDIAI_COMMIT_TAG "v1.16.0")
|
set(KLEIDIAI_COMMIT_TAG "v1.21.0")
|
||||||
set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
|
set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
|
||||||
set(KLEIDIAI_ARCHIVE_MD5 "0a9e9008adb6031f9e8cf70dff4a3321")
|
set(KLEIDIAI_ARCHIVE_MD5 "8b8525adc0eb9e2a16f765743a706ac8")
|
||||||
|
|
||||||
if (POLICY CMP0135)
|
if (POLICY CMP0135)
|
||||||
cmake_policy(SET CMP0135 NEW)
|
cmake_policy(SET CMP0135 NEW)
|
||||||
|
|
@ -606,6 +606,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/
|
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/
|
||||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/
|
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/
|
||||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/
|
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/
|
||||||
|
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsu2cxp/
|
||||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/)
|
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/)
|
||||||
|
|
||||||
set(ARCH_FLAGS_TEMP "${ARCH_FLAGS}")
|
set(ARCH_FLAGS_TEMP "${ARCH_FLAGS}")
|
||||||
|
|
@ -626,7 +627,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c
|
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c
|
||||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c
|
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c
|
||||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f32.c
|
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f32.c
|
||||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi8cxp_qsi8cx_neon.c)
|
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi8cxp_qsi8cx_neon.c
|
||||||
|
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsu2cxp4vlx4_qsu2cx_neon.c)
|
||||||
|
|
||||||
if (NOT DOTPROD_ENABLED MATCHES -1)
|
if (NOT DOTPROD_ENABLED MATCHES -1)
|
||||||
list(APPEND GGML_KLEIDIAI_SOURCES
|
list(APPEND GGML_KLEIDIAI_SOURCES
|
||||||
|
|
@ -656,6 +658,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa_asm.S
|
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa_asm.S
|
||||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c
|
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c
|
||||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c
|
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c
|
||||||
|
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsu2cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsu2cxp4vlx4_1x4vl_sme2_dot.c
|
||||||
|
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsu2cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsu2cxp4vlx4_1x4vl_sme2_dot_asm.S
|
||||||
|
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsu2cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsu2cxp4vlx4_1vlx4vl_sme2_mopa.c
|
||||||
|
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsu2cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsu2cxp4vlx4_1vlx4vl_sme2_mopa_asm.S
|
||||||
${KLEIDIAI_SRC}/kai/kai_common_sme_asm.S)
|
${KLEIDIAI_SRC}/kai/kai_common_sme_asm.S)
|
||||||
set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2")
|
set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2")
|
||||||
endif()
|
endif()
|
||||||
|
|
|
||||||
|
|
@ -384,6 +384,12 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
|
||||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
.nrows = 1,
|
.nrows = 1,
|
||||||
},
|
},
|
||||||
|
[GGML_TYPE_Q2_0C] = {
|
||||||
|
.from_float = quantize_row_q2_0c,
|
||||||
|
.vec_dot = ggml_vec_dot_q2_0c_q8_K,
|
||||||
|
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||||
|
.nrows = 1,
|
||||||
|
},
|
||||||
[GGML_TYPE_I32] = {
|
[GGML_TYPE_I32] = {
|
||||||
.from_float = (ggml_from_float_t) ggml_cpu_fp32_to_i32,
|
.from_float = (ggml_from_float_t) ggml_cpu_fp32_to_i32,
|
||||||
},
|
},
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
// SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
|
// SPDX-FileCopyrightText: Copyright 2025-2026 Arm Limited and/or its affiliates <open-source-office@arm.com>
|
||||||
// SPDX-License-Identifier: MIT
|
// SPDX-License-Identifier: MIT
|
||||||
//
|
//
|
||||||
|
|
||||||
|
|
@ -32,6 +32,12 @@
|
||||||
#include "kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.h"
|
#include "kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.h"
|
||||||
#include "kai_rhs_pack_nxk_qsi8cxp_qsi8cx_neon.h"
|
#include "kai_rhs_pack_nxk_qsi8cxp_qsi8cx_neon.h"
|
||||||
|
|
||||||
|
#include "kai_lhs_quant_pack_qai8dxp_f32.h"
|
||||||
|
#include "kai_matmul_clamp_f32_qai8dxp1x4_qsu2cxp4vlx4_1x4vl_sme2_dot.h"
|
||||||
|
#include "kai_matmul_clamp_f32_qai8dxp1vlx4_qsu2cxp4vlx4_1vlx4vl_sme2_mopa.h"
|
||||||
|
#include "kai_matmul_clamp_f32_qai8dxp_qsu2cxp_interface.h"
|
||||||
|
#include "kai_rhs_pack_nxk_qsu2cxp4vlx4_qsu2cx_neon.h"
|
||||||
|
|
||||||
#include "kai_common.h"
|
#include "kai_common.h"
|
||||||
|
|
||||||
#include "simd-mappings.h"
|
#include "simd-mappings.h"
|
||||||
|
|
@ -77,6 +83,15 @@ static inline void kernel_run_float_fn10(size_t m, size_t n, size_t k, size_t /*
|
||||||
Fn(m, n, k, lhs, rhs, static_cast<float*>(dst), dst_stride_row, dst_stride_col, clamp_min, clamp_max);
|
Fn(m, n, k, lhs, rhs, static_cast<float*>(dst), dst_stride_row, dst_stride_col, clamp_min, clamp_max);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Adapter that exposes a matmul micro-kernel `Fn` (float destination, trailing int32 lookup-table
// pointer) through a signature using an untyped `void* dst` and an extra block-length slot.
// The fourth parameter (bl) is accepted but ignored; `dst` is cast to float* and every other
// argument, including `lut`, is forwarded to `Fn` unchanged.
template<void(*Fn)(size_t,size_t,size_t,const void*,const void*,float*,size_t,size_t,float,float, const int32_t*)>
|
||||||
|
static inline void kernel_run_float_fn11_int2(size_t m, size_t n, size_t k, size_t /*bl*/,
|
||||||
|
const void* lhs, const void* rhs, void* dst,
|
||||||
|
size_t dst_stride_row, size_t dst_stride_col,
|
||||||
|
float clamp_min, float clamp_max, const int32_t* lut) {
|
||||||
|
|
||||||
|
Fn(m, n, k, lhs, rhs, static_cast<float*>(dst), dst_stride_row, dst_stride_col, clamp_min, clamp_max, lut);
|
||||||
|
}
|
||||||
|
|
||||||
template<size_t(*Fn)(size_t,size_t,size_t,size_t,size_t,size_t)>
|
template<size_t(*Fn)(size_t,size_t,size_t,size_t,size_t,size_t)>
|
||||||
static inline size_t lhs_ps_fn6(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr) {
|
static inline size_t lhs_ps_fn6(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr) {
|
||||||
return Fn(m, k, bl, mr, kr, sr);
|
return Fn(m, k, bl, mr, kr, sr);
|
||||||
|
|
@ -164,6 +179,18 @@ static inline void rhs_pack_scale_fn12(size_t num_groups, size_t n, size_t k, si
|
||||||
static_cast<const kai_rhs_pack_qsi8cx_params*>(params));
|
static_cast<const kai_rhs_pack_qsi8cx_params*>(params));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Adapter that exposes an RHS packing routine `Fn` (typed uint8 data, float bias/scale, typed
// params struct, trailing int32 lookup-table pointer) through a signature taking untyped pointers.
// The bl and rhs_stride parameters are accepted but ignored. `rhs` is cast to const uint8_t*,
// `bias` and `scale` to const float*, and `params` to the
// kai_rhs_pack_nxk_qsu2cxp4vlx4_qsu2cx_neon_params pointer type; `lut` is forwarded unchanged.
template<void(*Fn)(size_t,size_t,size_t,size_t,size_t,size_t,const uint8_t*,const float*,const float*,void*,size_t,const struct kai_rhs_pack_nxk_qsu2cxp4vlx4_qsu2cx_neon_params*,const int32_t*)>
|
||||||
|
static inline void rhs_pack_scale_fn12_int2(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t /*bl*/,
|
||||||
|
size_t /*rhs_stride*/, const void* rhs, const void* bias, const void* scale,
|
||||||
|
void* rhs_packed, size_t extra_bytes, const void* params, const int32_t* lut) {
|
||||||
|
Fn(num_groups, n, k, nr, kr, sr,
|
||||||
|
static_cast<const uint8_t*>(rhs),
|
||||||
|
static_cast<const float*>(bias),
|
||||||
|
static_cast<const float*>(scale),
|
||||||
|
rhs_packed, extra_bytes,
|
||||||
|
static_cast<const struct kai_rhs_pack_nxk_qsu2cxp4vlx4_qsu2cx_neon_params*>(params), lut);
|
||||||
|
}
|
||||||
|
|
||||||
template<void(*Fn)(size_t,size_t,size_t,size_t,size_t,size_t,size_t,const void*,const void*,const void*,void*,size_t,const void*)>
|
template<void(*Fn)(size_t,size_t,size_t,size_t,size_t,size_t,size_t,const void*,const void*,const void*,void*,size_t,const void*)>
|
||||||
static inline void rhs_pack_fn13(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t /*bl*/,
|
static inline void rhs_pack_fn13(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t /*bl*/,
|
||||||
size_t rhs_stride, const void* rhs, const void* bias, const void* scale,
|
size_t rhs_stride, const void* rhs, const void* bias, const void* scale,
|
||||||
|
|
@ -320,6 +347,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa>,
|
/* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa>,
|
||||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa>,
|
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa>,
|
||||||
/* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa>,
|
/* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa>,
|
||||||
|
/* .run_kernel_lut_ex = */ nullptr,
|
||||||
},
|
},
|
||||||
|
|
||||||
/* .gemm_lhs_info = */ {
|
/* .gemm_lhs_info = */ {
|
||||||
|
|
@ -341,6 +369,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot>,
|
/* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot>,
|
||||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot>,
|
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot>,
|
||||||
/* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot>,
|
/* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot>,
|
||||||
|
/* .run_kernel_lut_ex = */ nullptr,
|
||||||
},
|
},
|
||||||
/* .gemv_lhs_info = */ {
|
/* .gemv_lhs_info = */ {
|
||||||
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32_neon,
|
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32_neon,
|
||||||
|
|
@ -354,6 +383,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||||
/* .packed_size_ex = */ &rhs_ps_fn5<kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon>,
|
/* .packed_size_ex = */ &rhs_ps_fn5<kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon>,
|
||||||
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon>,
|
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon>,
|
||||||
/* .pack_func_ex = */ &rhs_pack_fn12<kai_run_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon>,
|
/* .pack_func_ex = */ &rhs_pack_fn12<kai_run_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon>,
|
||||||
|
/* .pack_func_lut_ex = */ nullptr,
|
||||||
},
|
},
|
||||||
/* .required_cpu = */ CPU_FEATURE_SME,
|
/* .required_cpu = */ CPU_FEATURE_SME,
|
||||||
/* .lhs_type = */ GGML_TYPE_F32,
|
/* .lhs_type = */ GGML_TYPE_F32,
|
||||||
|
|
@ -374,6 +404,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn2<kai_get_lhs_packed_offset_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa>,
|
/* .get_lhs_offset_ex = */ &kernel_offs_fn2<kai_get_lhs_packed_offset_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa>,
|
||||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2<kai_get_rhs_packed_offset_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa>,
|
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2<kai_get_rhs_packed_offset_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa>,
|
||||||
/* .run_kernel_ex = */ &kernel_run_fn10<kai_run_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa>,
|
/* .run_kernel_ex = */ &kernel_run_fn10<kai_run_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa>,
|
||||||
|
/* .run_kernel_lut_ex = */ nullptr,
|
||||||
},
|
},
|
||||||
/* .gemm_lhs_info = */ {
|
/* .gemm_lhs_info = */ {
|
||||||
/* .get_offset = */ kai_get_lhs_offset_lhs_pack_bf16p2vlx2_f32_sme,
|
/* .get_offset = */ kai_get_lhs_offset_lhs_pack_bf16p2vlx2_f32_sme,
|
||||||
|
|
@ -394,6 +425,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||||
/* .get_lhs_offset_ex = */ nullptr,
|
/* .get_lhs_offset_ex = */ nullptr,
|
||||||
/* .get_rhs_packed_offset_ex = */ nullptr,
|
/* .get_rhs_packed_offset_ex = */ nullptr,
|
||||||
/* .run_kernel_ex = */ nullptr,
|
/* .run_kernel_ex = */ nullptr,
|
||||||
|
/* .run_kernel_lut_ex = */ nullptr,
|
||||||
},
|
},
|
||||||
/* .gemv_lhs_info = */ {
|
/* .gemv_lhs_info = */ {
|
||||||
/* .get_offset = */ kai_get_lhs_offset_lhs_pack_bf16p2vlx2_f32_sme,
|
/* .get_offset = */ kai_get_lhs_offset_lhs_pack_bf16p2vlx2_f32_sme,
|
||||||
|
|
@ -407,6 +439,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||||
/* .packed_size_ex = */ &rhs_ps_fn2<kai_get_rhs_packed_size_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme>,
|
/* .packed_size_ex = */ &rhs_ps_fn2<kai_get_rhs_packed_size_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme>,
|
||||||
/* .packed_stride_ex = */ &rhs_stride_fn1<kai_get_rhs_packed_stride_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme>,
|
/* .packed_stride_ex = */ &rhs_stride_fn1<kai_get_rhs_packed_stride_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme>,
|
||||||
/* .pack_func_ex = */ &rhs_pack_fn13<kai_run_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme>,
|
/* .pack_func_ex = */ &rhs_pack_fn13<kai_run_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme>,
|
||||||
|
/* .pack_func_lut_ex = */ nullptr,
|
||||||
},
|
},
|
||||||
/* .required_cpu = */ CPU_FEATURE_SME,
|
/* .required_cpu = */ CPU_FEATURE_SME,
|
||||||
/* .lhs_type = */ GGML_TYPE_F32,
|
/* .lhs_type = */ GGML_TYPE_F32,
|
||||||
|
|
@ -430,6 +463,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod>,
|
/* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod>,
|
||||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod>,
|
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod>,
|
||||||
/* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod>,
|
/* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod>,
|
||||||
|
/* .run_kernel_lut_ex = */ nullptr,
|
||||||
},
|
},
|
||||||
/* .gemm_lhs_info = */ {
|
/* .gemm_lhs_info = */ {
|
||||||
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
|
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
|
||||||
|
|
@ -450,6 +484,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod>,
|
/* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod>,
|
||||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod>,
|
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod>,
|
||||||
/* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod>,
|
/* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod>,
|
||||||
|
/* .run_kernel_lut_ex = */ nullptr,
|
||||||
},
|
},
|
||||||
/* .gemv_lhs_info = */ {
|
/* .gemv_lhs_info = */ {
|
||||||
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
|
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
|
||||||
|
|
@ -463,6 +498,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||||
/* .packed_size_ex = */ &rhs_ps_fn5<kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
/* .packed_size_ex = */ &rhs_ps_fn5<kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
||||||
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
||||||
/* .pack_func_ex = */ &rhs_pack_fn12<kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
/* .pack_func_ex = */ &rhs_pack_fn12<kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
||||||
|
/* .pack_func_lut_ex = */ nullptr,
|
||||||
},
|
},
|
||||||
/* .required_cpu = */ CPU_FEATURE_DOTPROD,
|
/* .required_cpu = */ CPU_FEATURE_DOTPROD,
|
||||||
/* .lhs_type = */ GGML_TYPE_F32,
|
/* .lhs_type = */ GGML_TYPE_F32,
|
||||||
|
|
@ -485,6 +521,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm>,
|
/* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm>,
|
||||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm>,
|
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm>,
|
||||||
/* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm>,
|
/* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm>,
|
||||||
|
/* .run_kernel_lut_ex = */ nullptr,
|
||||||
},
|
},
|
||||||
/* .gemm_lhs_info = */ {
|
/* .gemm_lhs_info = */ {
|
||||||
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p4x8sb_f32_neon,
|
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p4x8sb_f32_neon,
|
||||||
|
|
@ -505,6 +542,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod>,
|
/* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod>,
|
||||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod>,
|
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod>,
|
||||||
/* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod>,
|
/* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod>,
|
||||||
|
/* .run_kernel_lut_ex = */ nullptr,
|
||||||
},
|
},
|
||||||
/* .gemv_lhs_info = */ {
|
/* .gemv_lhs_info = */ {
|
||||||
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
|
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
|
||||||
|
|
@ -518,6 +556,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||||
/* .packed_size_ex = */ &rhs_ps_fn5<kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
/* .packed_size_ex = */ &rhs_ps_fn5<kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
||||||
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
||||||
/* .pack_func_ex = */ &rhs_pack_fn12<kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
/* .pack_func_ex = */ &rhs_pack_fn12<kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
||||||
|
/* .pack_func_lut_ex = */ nullptr,
|
||||||
},
|
},
|
||||||
/* .required_cpu = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM,
|
/* .required_cpu = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM,
|
||||||
/* .lhs_type = */ GGML_TYPE_F32,
|
/* .lhs_type = */ GGML_TYPE_F32,
|
||||||
|
|
@ -541,6 +580,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm>,
|
/* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm>,
|
||||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm>,
|
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm>,
|
||||||
/* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm>,
|
/* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm>,
|
||||||
|
/* .run_kernel_lut_ex = */ nullptr,
|
||||||
},
|
},
|
||||||
/* .gemm_lhs_info = */ {
|
/* .gemm_lhs_info = */ {
|
||||||
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p4x8sb_f32_neon,
|
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p4x8sb_f32_neon,
|
||||||
|
|
@ -561,6 +601,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod>,
|
/* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod>,
|
||||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod>,
|
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod>,
|
||||||
/* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod>,
|
/* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod>,
|
||||||
|
/* .run_kernel_lut_ex = */ nullptr,
|
||||||
},
|
},
|
||||||
/* .gemv_lhs_info = */ {
|
/* .gemv_lhs_info = */ {
|
||||||
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
|
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
|
||||||
|
|
@ -574,6 +615,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||||
/* .packed_size_ex = */ &rhs_ps_fn5<kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
/* .packed_size_ex = */ &rhs_ps_fn5<kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
||||||
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
||||||
/* .pack_func_ex = */ &rhs_pack_fn12<kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
/* .pack_func_ex = */ &rhs_pack_fn12<kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
||||||
|
/* .pack_func_lut_ex = */ nullptr,
|
||||||
},
|
},
|
||||||
/* .required_cpu = */ CPU_FEATURE_SVE | CPU_FEATURE_I8MM | CPU_FEATURE_DOTPROD,
|
/* .required_cpu = */ CPU_FEATURE_SVE | CPU_FEATURE_I8MM | CPU_FEATURE_DOTPROD,
|
||||||
/* .lhs_type = */ GGML_TYPE_F32,
|
/* .lhs_type = */ GGML_TYPE_F32,
|
||||||
|
|
@ -596,6 +638,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm>,
|
/* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm>,
|
||||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm>,
|
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm>,
|
||||||
/* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm>,
|
/* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm>,
|
||||||
|
/* .run_kernel_lut_ex = */ nullptr,
|
||||||
},
|
},
|
||||||
/* .gemm_lhs_info = */ {
|
/* .gemm_lhs_info = */ {
|
||||||
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p4x8sb_f32_neon,
|
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p4x8sb_f32_neon,
|
||||||
|
|
@ -616,6 +659,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod>,
|
/* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod>,
|
||||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod>,
|
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod>,
|
||||||
/* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod>,
|
/* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod>,
|
||||||
|
/* .run_kernel_lut_ex = */ nullptr,
|
||||||
},
|
},
|
||||||
/* .gemv_lhs_info = */ {
|
/* .gemv_lhs_info = */ {
|
||||||
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
|
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
|
||||||
|
|
@ -629,6 +673,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||||
/* .packed_size_ex = */ &rhs_ps_fn5<kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
/* .packed_size_ex = */ &rhs_ps_fn5<kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
||||||
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
||||||
/* .pack_func_ex = */ &rhs_pack_fn12<kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
/* .pack_func_ex = */ &rhs_pack_fn12<kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
||||||
|
/* .pack_func_lut_ex = */ nullptr,
|
||||||
},
|
},
|
||||||
/* .required_cpu = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM,
|
/* .required_cpu = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM,
|
||||||
/* .lhs_type = */ GGML_TYPE_F32,
|
/* .lhs_type = */ GGML_TYPE_F32,
|
||||||
|
|
@ -651,6 +696,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod>,
|
/* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod>,
|
||||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod>,
|
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod>,
|
||||||
/* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod>,
|
/* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod>,
|
||||||
|
/* .run_kernel_lut_ex = */ nullptr,
|
||||||
},
|
},
|
||||||
/* .gemm_lhs_info = */ {
|
/* .gemm_lhs_info = */ {
|
||||||
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
|
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
|
||||||
|
|
@ -671,6 +717,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod>,
|
/* .get_lhs_offset_ex = */ &kernel_offs_fn3<kai_get_lhs_packed_offset_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod>,
|
||||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod>,
|
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3<kai_get_rhs_packed_offset_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod>,
|
||||||
/* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod>,
|
/* .run_kernel_ex = */ &kernel_run_fn11<kai_run_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod>,
|
||||||
|
/* .run_kernel_lut_ex = */ nullptr,
|
||||||
},
|
},
|
||||||
/* .gemv_lhs_info = */ {
|
/* .gemv_lhs_info = */ {
|
||||||
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
|
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
|
||||||
|
|
@ -684,6 +731,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||||
/* .packed_size_ex = */ &rhs_ps_fn5<kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
/* .packed_size_ex = */ &rhs_ps_fn5<kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
||||||
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
||||||
/* .pack_func_ex = */ &rhs_pack_fn12<kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
/* .pack_func_ex = */ &rhs_pack_fn12<kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
|
||||||
|
/* .pack_func_lut_ex = */ nullptr,
|
||||||
},
|
},
|
||||||
/* .required_cpu = */ CPU_FEATURE_DOTPROD,
|
/* .required_cpu = */ CPU_FEATURE_DOTPROD,
|
||||||
/* .lhs_type = */ GGML_TYPE_F32,
|
/* .lhs_type = */ GGML_TYPE_F32,
|
||||||
|
|
@ -695,6 +743,68 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||||
{ /* Sentinel */ }
|
{ /* Sentinel */ }
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Kernel table for GGML_TYPE_Q2_0C right-hand-side tensors.
//
// Entries in this table leave run_kernel_ex / pack_func_ex null and populate the
// LUT variants (run_kernel_lut_ex / pack_func_lut_ex) instead, so callers must go
// through the *_lut_ex entry points. The last element is an empty sentinel; lookups
// iterate over NELEMS(table) - 1 entries.
static ggml_kleidiai_kernels gemm_gemv_kernels_q2_0c[] = {
#if defined(__ARM_FEATURE_SME)
    {
        /* SME GEMM */
        {
            /* .get_m_step               = */ kai_get_m_step_matmul_clamp_f32_qai8dxp1vlx4_qsu2cxp4vlx4_1vlx4vl_sme2_mopa,
            /* .get_n_step               = */ kai_get_n_step_matmul_clamp_f32_qai8dxp1vlx4_qsu2cxp4vlx4_1vlx4vl_sme2_mopa,
            /* .get_mr                   = */ kai_get_mr_matmul_clamp_f32_qai8dxp1vlx4_qsu2cxp4vlx4_1vlx4vl_sme2_mopa,
            /* .get_nr                   = */ kai_get_nr_matmul_clamp_f32_qai8dxp1vlx4_qsu2cxp4vlx4_1vlx4vl_sme2_mopa,
            /* .get_kr                   = */ kai_get_kr_matmul_clamp_f32_qai8dxp1vlx4_qsu2cxp4vlx4_1vlx4vl_sme2_mopa,
            /* .get_sr                   = */ kai_get_sr_matmul_clamp_f32_qai8dxp1vlx4_qsu2cxp4vlx4_1vlx4vl_sme2_mopa,
            /* .get_dst_offset           = */ kai_get_dst_offset_matmul_clamp_f32_qai8dxp1vlx4_qsu2cxp4vlx4_1vlx4vl_sme2_mopa,
            /* .get_dst_size             = */ kai_get_dst_size_matmul_clamp_f32_qai8dxp1vlx4_qsu2cxp4vlx4_1vlx4vl_sme2_mopa,
            /* .get_lhs_offset_ex        = */ &kernel_offs_fn2<kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsu2cxp4vlx4_1vlx4vl_sme2_mopa>,
            /* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2<kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsu2cxp4vlx4_1vlx4vl_sme2_mopa>,
            /* .run_kernel_ex            = */ nullptr,
            /* .run_kernel_lut_ex        = */ &kernel_run_float_fn11_int2<kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsu2cxp4vlx4_1vlx4vl_sme2_mopa>,
        },
        /* .gemm_lhs_info = */ {
            /* .get_offset           = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32,
            /* .get_packed_offset_ex = */ &lhs_offs_fn5<kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32>,
            /* .packed_size_ex       = */ &lhs_ps_fn5<kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32>,
            /* .pack_func_ex         = */ &lhs_pack_float_fn9_no_bl<kai_run_lhs_quant_pack_qai8dxp_f32>,
        },
        /* SME GEMV */
        {
            /* .get_m_step               = */ kai_get_m_step_matmul_clamp_f32_qai8dxp1x4_qsu2cxp4vlx4_1x4vl_sme2_dot,
            /* .get_n_step               = */ kai_get_n_step_matmul_clamp_f32_qai8dxp1x4_qsu2cxp4vlx4_1x4vl_sme2_dot,
            /* .get_mr                   = */ kai_get_mr_matmul_clamp_f32_qai8dxp1x4_qsu2cxp4vlx4_1x4vl_sme2_dot,
            /* .get_nr                   = */ kai_get_nr_matmul_clamp_f32_qai8dxp1x4_qsu2cxp4vlx4_1x4vl_sme2_dot,
            /* .get_kr                   = */ kai_get_kr_matmul_clamp_f32_qai8dxp1x4_qsu2cxp4vlx4_1x4vl_sme2_dot,
            /* .get_sr                   = */ kai_get_sr_matmul_clamp_f32_qai8dxp1x4_qsu2cxp4vlx4_1x4vl_sme2_dot,
            /* .get_dst_offset           = */ kai_get_dst_offset_matmul_clamp_f32_qai8dxp1x4_qsu2cxp4vlx4_1x4vl_sme2_dot,
            /* .get_dst_size             = */ kai_get_dst_size_matmul_clamp_f32_qai8dxp1x4_qsu2cxp4vlx4_1x4vl_sme2_dot,
            /* .get_lhs_offset_ex        = */ &kernel_offs_fn2<kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1x4_qsu2cxp4vlx4_1x4vl_sme2_dot>,
            /* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2<kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1x4_qsu2cxp4vlx4_1x4vl_sme2_dot>,
            /* .run_kernel_ex            = */ nullptr,
            /* .run_kernel_lut_ex        = */ &kernel_run_float_fn11_int2<kai_run_matmul_clamp_f32_qai8dxp1x4_qsu2cxp4vlx4_1x4vl_sme2_dot>,
        },
        /* .gemv_lhs_info = */ {
            /* .get_offset           = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32,
            /* .get_packed_offset_ex = */ &lhs_offs_fn5<kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32>,
            /* .packed_size_ex       = */ &lhs_ps_fn5<kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32>,
            /* .pack_func_ex         = */ &lhs_pack_float_fn9_no_bl<kai_run_lhs_quant_pack_qai8dxp_f32>,
        },
        /* .rhs_info = */ {
            /* .packed_stride    = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsu2cxp4vlx4_qsu2cx_neon,
            /* .to_float         = */ nullptr,
            /* .packed_size_ex   = */ &rhs_ps_fn5<kai_get_rhs_packed_size_rhs_pack_nxk_qsu2cxp4vlx4_qsu2cx_neon>,
            /* .packed_stride_ex = */ nullptr,
            /* .pack_func_ex     = */ nullptr,
            /* .pack_func_lut_ex = */ &rhs_pack_scale_fn12_int2<kai_run_rhs_pack_nxk_qsu2cxp4vlx4_qsu2cx_neon>,
        },
        /* .required_cpu = */ CPU_FEATURE_SME,
        /* .lhs_type     = */ GGML_TYPE_F32,
        /* .rhs_type     = */ GGML_TYPE_Q2_0C,
        /* .op_type      = */ GGML_TYPE_F32,
    },
#endif
    { /* Sentinel */ }
};
|
||||||
|
|
||||||
static ggml_kleidiai_kernels gemm_gemv_kernels_q8[] = {
|
static ggml_kleidiai_kernels gemm_gemv_kernels_q8[] = {
|
||||||
#if defined(__ARM_FEATURE_SME)
|
#if defined(__ARM_FEATURE_SME)
|
||||||
{
|
{
|
||||||
|
|
@ -711,6 +821,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels_q8[] = {
|
||||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn2<kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa>,
|
/* .get_lhs_offset_ex = */ &kernel_offs_fn2<kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa>,
|
||||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2<kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa>,
|
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2<kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa>,
|
||||||
/* .run_kernel_ex = */ &kernel_run_float_fn10<kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa>,
|
/* .run_kernel_ex = */ &kernel_run_float_fn10<kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa>,
|
||||||
|
/* .run_kernel_lut_ex = */ nullptr,
|
||||||
},
|
},
|
||||||
/* .gemm_lhs_info = */ {
|
/* .gemm_lhs_info = */ {
|
||||||
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32,
|
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32,
|
||||||
|
|
@ -731,6 +842,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels_q8[] = {
|
||||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn2<kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot>,
|
/* .get_lhs_offset_ex = */ &kernel_offs_fn2<kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot>,
|
||||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2<kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot>,
|
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2<kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot>,
|
||||||
/* .run_kernel_ex = */ &kernel_run_float_fn10<kai_run_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot>,
|
/* .run_kernel_ex = */ &kernel_run_float_fn10<kai_run_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot>,
|
||||||
|
/* .run_kernel_lut_ex = */ nullptr,
|
||||||
},
|
},
|
||||||
/* .gemv_lhs_info = */ {
|
/* .gemv_lhs_info = */ {
|
||||||
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32,
|
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32,
|
||||||
|
|
@ -744,6 +856,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels_q8[] = {
|
||||||
/* .packed_size_ex = */ &rhs_ps_fn5<kai_get_rhs_packed_size_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
|
/* .packed_size_ex = */ &rhs_ps_fn5<kai_get_rhs_packed_size_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
|
||||||
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
|
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
|
||||||
/* .pack_func_ex = */ &rhs_pack_scale_fn12<kai_run_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
|
/* .pack_func_ex = */ &rhs_pack_scale_fn12<kai_run_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
|
||||||
|
/* .pack_func_lut_ex = */ nullptr,
|
||||||
},
|
},
|
||||||
/* .required_cpu = */ CPU_FEATURE_SME,
|
/* .required_cpu = */ CPU_FEATURE_SME,
|
||||||
/* .lhs_type = */ GGML_TYPE_F32,
|
/* .lhs_type = */ GGML_TYPE_F32,
|
||||||
|
|
@ -766,6 +879,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels_q8[] = {
|
||||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn2<kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm>,
|
/* .get_lhs_offset_ex = */ &kernel_offs_fn2<kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm>,
|
||||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2<kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm>,
|
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2<kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm>,
|
||||||
/* .run_kernel_ex = */ &kernel_run_float_fn10<kai_run_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm>,
|
/* .run_kernel_ex = */ &kernel_run_float_fn10<kai_run_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm>,
|
||||||
|
/* .run_kernel_lut_ex = */ nullptr,
|
||||||
},
|
},
|
||||||
/* .gemm_lhs_info = */ {
|
/* .gemm_lhs_info = */ {
|
||||||
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32,
|
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32,
|
||||||
|
|
@ -786,6 +900,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels_q8[] = {
|
||||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn2<kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod>,
|
/* .get_lhs_offset_ex = */ &kernel_offs_fn2<kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod>,
|
||||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2<kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod>,
|
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2<kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod>,
|
||||||
/* .run_kernel_ex = */ &kernel_run_float_fn10<kai_run_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod>,
|
/* .run_kernel_ex = */ &kernel_run_float_fn10<kai_run_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod>,
|
||||||
|
/* .run_kernel_lut_ex = */ nullptr,
|
||||||
},
|
},
|
||||||
/* .gemv_lhs_info = */ {
|
/* .gemv_lhs_info = */ {
|
||||||
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32,
|
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32,
|
||||||
|
|
@ -799,6 +914,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels_q8[] = {
|
||||||
/* .packed_size_ex = */ &rhs_ps_fn5<kai_get_rhs_packed_size_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
|
/* .packed_size_ex = */ &rhs_ps_fn5<kai_get_rhs_packed_size_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
|
||||||
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
|
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
|
||||||
/* .pack_func_ex = */ &rhs_pack_scale_fn12<kai_run_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
|
/* .pack_func_ex = */ &rhs_pack_scale_fn12<kai_run_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
|
||||||
|
/* .pack_func_lut_ex = */ nullptr,
|
||||||
},
|
},
|
||||||
/* .required_cpu = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM,
|
/* .required_cpu = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM,
|
||||||
/* .lhs_type = */ GGML_TYPE_F32,
|
/* .lhs_type = */ GGML_TYPE_F32,
|
||||||
|
|
@ -821,6 +937,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels_q8[] = {
|
||||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn2<kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod>,
|
/* .get_lhs_offset_ex = */ &kernel_offs_fn2<kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod>,
|
||||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2<kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod>,
|
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2<kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod>,
|
||||||
/* .run_kernel_ex = */ &kernel_run_float_fn10<kai_run_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod>,
|
/* .run_kernel_ex = */ &kernel_run_float_fn10<kai_run_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod>,
|
||||||
|
/* .run_kernel_lut_ex = */ nullptr,
|
||||||
},
|
},
|
||||||
/* .gemm_lhs_info = */ {
|
/* .gemm_lhs_info = */ {
|
||||||
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32,
|
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32,
|
||||||
|
|
@ -841,6 +958,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels_q8[] = {
|
||||||
/* .get_lhs_offset_ex = */ &kernel_offs_fn2<kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod>,
|
/* .get_lhs_offset_ex = */ &kernel_offs_fn2<kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod>,
|
||||||
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2<kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod>,
|
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2<kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod>,
|
||||||
/* .run_kernel_ex = */ &kernel_run_float_fn10<kai_run_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod>,
|
/* .run_kernel_ex = */ &kernel_run_float_fn10<kai_run_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod>,
|
||||||
|
/* .run_kernel_lut_ex = */ nullptr,
|
||||||
},
|
},
|
||||||
/* .gemv_lhs_info = */ {
|
/* .gemv_lhs_info = */ {
|
||||||
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32,
|
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32,
|
||||||
|
|
@ -854,6 +972,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels_q8[] = {
|
||||||
/* .packed_size_ex = */ &rhs_ps_fn5<kai_get_rhs_packed_size_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
|
/* .packed_size_ex = */ &rhs_ps_fn5<kai_get_rhs_packed_size_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
|
||||||
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
|
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
|
||||||
/* .pack_func_ex = */ &rhs_pack_scale_fn12<kai_run_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
|
/* .pack_func_ex = */ &rhs_pack_scale_fn12<kai_run_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
|
||||||
|
/* .pack_func_lut_ex = */ nullptr,
|
||||||
},
|
},
|
||||||
/* .required_cpu = */ CPU_FEATURE_DOTPROD,
|
/* .required_cpu = */ CPU_FEATURE_DOTPROD,
|
||||||
/* .lhs_type = */ GGML_TYPE_F32,
|
/* .lhs_type = */ GGML_TYPE_F32,
|
||||||
|
|
@ -890,9 +1009,21 @@ ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, c
|
||||||
} else {
|
} else {
|
||||||
try_table(gemm_gemv_kernels);
|
try_table(gemm_gemv_kernels);
|
||||||
}
|
}
|
||||||
|
if (!kernel) {
|
||||||
|
for (size_t i = 0; i < NELEMS(gemm_gemv_kernels_q2_0c) - 1; ++i) {
|
||||||
|
if ((cpu_features & gemm_gemv_kernels_q2_0c[i].required_cpu) == gemm_gemv_kernels_q2_0c[i].required_cpu &&
|
||||||
|
gemm_gemv_kernels_q2_0c[i].lhs_type == tensor->src[1]->type &&
|
||||||
|
gemm_gemv_kernels_q2_0c[i].rhs_type == tensor->src[0]->type &&
|
||||||
|
gemm_gemv_kernels_q2_0c[i].op_type == tensor->type) {
|
||||||
|
kernel = &gemm_gemv_kernels_q2_0c[i];
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
#else
|
#else
|
||||||
GGML_UNUSED(gemm_gemv_kernels);
|
GGML_UNUSED(gemm_gemv_kernels);
|
||||||
GGML_UNUSED(gemm_gemv_kernels_q8);
|
GGML_UNUSED(gemm_gemv_kernels_q8);
|
||||||
|
GGML_UNUSED(gemm_gemv_kernels_q2_0c);
|
||||||
GGML_UNUSED(cpu_features);
|
GGML_UNUSED(cpu_features);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
@ -936,3 +1067,20 @@ ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q8_0(cpu_feature features)
|
||||||
|
|
||||||
return kernels;
|
return kernels;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns the first entry of gemm_gemv_kernels_q2_0c whose required_cpu bits are all
// present in `features`, or nullptr when no entry qualifies (or when none of the
// ARM feature macros below is defined at build time).
ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q2_0c(cpu_feature features) {
#if defined(__ARM_FEATURE_SME) || defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8)
    // The final table element is the sentinel, so it is excluded from the scan.
    const size_t n_candidates = NELEMS(gemm_gemv_kernels_q2_0c) - 1;

    for (size_t idx = 0; idx < n_candidates; ++idx) {
        ggml_kleidiai_kernels & candidate = gemm_gemv_kernels_q2_0c[idx];

        // Skip entries that need a CPU feature we do not have.
        if ((features & candidate.required_cpu) != candidate.required_cpu) {
            continue;
        }

        return &candidate;
    }
#else
    GGML_UNUSED(features);
#endif

    return nullptr;
}
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
// SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
|
// SPDX-FileCopyrightText: Copyright 2025-2026 Arm Limited and/or its affiliates <open-source-office@arm.com>
|
||||||
// SPDX-License-Identifier: MIT
|
// SPDX-License-Identifier: MIT
|
||||||
//
|
//
|
||||||
|
|
||||||
|
|
@ -42,6 +42,12 @@ struct kernel_info {
|
||||||
const void* lhs_packed, const void* rhs_packed,
|
const void* lhs_packed, const void* rhs_packed,
|
||||||
void* dst, size_t dst_stride_row, size_t dst_stride_col,
|
void* dst, size_t dst_stride_row, size_t dst_stride_col,
|
||||||
float clamp_min, float clamp_max);
|
float clamp_min, float clamp_max);
|
||||||
|
|
||||||
|
void (*run_kernel_lut_ex)(
|
||||||
|
size_t m, size_t n, size_t k, size_t bl,
|
||||||
|
const void* lhs_packed, const void* rhs_packed,
|
||||||
|
void* dst, size_t dst_stride_row, size_t dst_stride_col,
|
||||||
|
float clamp_min, float clamp_max, const int32_t* lut);
|
||||||
};
|
};
|
||||||
|
|
||||||
struct lhs_packing_info {
|
struct lhs_packing_info {
|
||||||
|
|
@ -68,6 +74,9 @@ struct rhs_packing_info {
|
||||||
|
|
||||||
void (*pack_func_ex)(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t bl,
|
void (*pack_func_ex)(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t bl,
|
||||||
size_t rhs_stride, const void * rhs, const void * bias, const void * scale, void * rhs_packed, size_t extra_bytes, const void * params);
|
size_t rhs_stride, const void * rhs, const void * bias, const void * scale, void * rhs_packed, size_t extra_bytes, const void * params);
|
||||||
|
|
||||||
|
void (*pack_func_lut_ex)(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t bl,
|
||||||
|
size_t rhs_stride, const void * rhs, const void * bias, const void * scale, void * rhs_packed, size_t extra_bytes, const void * params, const int32_t* lut);
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ggml_kleidiai_kernels {
|
struct ggml_kleidiai_kernels {
|
||||||
|
|
@ -88,3 +97,4 @@ struct ggml_kleidiai_kernels {
|
||||||
ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, const ggml_tensor * tensor);
|
ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, const ggml_tensor * tensor);
|
||||||
ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q4_0(cpu_feature features);
|
ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q4_0(cpu_feature features);
|
||||||
ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q8_0(cpu_feature features);
|
ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q8_0(cpu_feature features);
|
||||||
|
ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q2_0c(cpu_feature features);
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
// SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
|
// SPDX-FileCopyrightText: Copyright 2025-2026 Arm Limited and/or its affiliates <open-source-office@arm.com>
|
||||||
// SPDX-License-Identifier: MIT
|
// SPDX-License-Identifier: MIT
|
||||||
//
|
//
|
||||||
#include <arm_neon.h>
|
#include <arm_neon.h>
|
||||||
|
|
@ -43,7 +43,8 @@ struct ggml_kleidiai_context {
|
||||||
cpu_feature features;
|
cpu_feature features;
|
||||||
ggml_kleidiai_kernels * kernels_q4;
|
ggml_kleidiai_kernels * kernels_q4;
|
||||||
ggml_kleidiai_kernels * kernels_q8;
|
ggml_kleidiai_kernels * kernels_q8;
|
||||||
} static ctx = { CPU_FEATURE_NONE, NULL, NULL };
|
ggml_kleidiai_kernels * kernels_q2c;
|
||||||
|
} static ctx = { CPU_FEATURE_NONE, NULL, NULL, NULL };
|
||||||
|
|
||||||
static const char* cpu_feature_to_string(cpu_feature f) {
|
static const char* cpu_feature_to_string(cpu_feature f) {
|
||||||
if (f == CPU_FEATURE_NONE) {
|
if (f == CPU_FEATURE_NONE) {
|
||||||
|
|
@ -84,8 +85,9 @@ static void init_kleidiai_context(void) {
|
||||||
if (sme_enabled != 0) {
|
if (sme_enabled != 0) {
|
||||||
ctx.features |= ggml_cpu_has_sme() ? CPU_FEATURE_SME : CPU_FEATURE_NONE;
|
ctx.features |= ggml_cpu_has_sme() ? CPU_FEATURE_SME : CPU_FEATURE_NONE;
|
||||||
}
|
}
|
||||||
ctx.kernels_q4 = ggml_kleidiai_select_kernels_q4_0(ctx.features);
|
ctx.kernels_q4 = ggml_kleidiai_select_kernels_q4_0(ctx.features);
|
||||||
ctx.kernels_q8 = ggml_kleidiai_select_kernels_q8_0(ctx.features);
|
ctx.kernels_q8 = ggml_kleidiai_select_kernels_q8_0(ctx.features);
|
||||||
|
ctx.kernels_q2c = ggml_kleidiai_select_kernels_q2_0c(ctx.features);
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
if (ctx.kernels_q4) {
|
if (ctx.kernels_q4) {
|
||||||
GGML_LOG_DEBUG("kleidiai: using q4 kernel with CPU feature %s\n", cpu_feature_to_string(ctx.kernels_q4->required_cpu));
|
GGML_LOG_DEBUG("kleidiai: using q4 kernel with CPU feature %s\n", cpu_feature_to_string(ctx.kernels_q4->required_cpu));
|
||||||
|
|
@ -93,6 +95,9 @@ static void init_kleidiai_context(void) {
|
||||||
if (ctx.kernels_q8) {
|
if (ctx.kernels_q8) {
|
||||||
GGML_LOG_DEBUG("kleidiai: using q8 kernel with CPU feature %s\n", cpu_feature_to_string(ctx.kernels_q8->required_cpu));
|
GGML_LOG_DEBUG("kleidiai: using q8 kernel with CPU feature %s\n", cpu_feature_to_string(ctx.kernels_q8->required_cpu));
|
||||||
}
|
}
|
||||||
|
if (ctx.kernels_q2c) {
|
||||||
|
GGML_LOG_DEBUG("kleidiai: using q2c kernel with CPU feature %s\n", cpu_feature_to_string(ctx.kernels_q2c->required_cpu));
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
ggml_critical_section_end();
|
ggml_critical_section_end();
|
||||||
|
|
@ -148,6 +153,9 @@ class tensor_traits : public ggml::cpu::tensor_traits {
|
||||||
} else if (kernels->rhs_type == GGML_TYPE_Q8_0) {
|
} else if (kernels->rhs_type == GGML_TYPE_Q8_0) {
|
||||||
if (!lhs_info->packed_size_ex) return false;
|
if (!lhs_info->packed_size_ex) return false;
|
||||||
size = lhs_info->packed_size_ex(m, k, QK8_0, mr, kr, sr);
|
size = lhs_info->packed_size_ex(m, k, QK8_0, mr, kr, sr);
|
||||||
|
} else if (kernels->rhs_type == GGML_TYPE_Q2_0C) {
|
||||||
|
if (!lhs_info->packed_size_ex) return false;
|
||||||
|
size = lhs_info->packed_size_ex(m, k, QKQ2_0C, mr, kr, sr);
|
||||||
} else if (kernels->rhs_type == GGML_TYPE_F16) {
|
} else if (kernels->rhs_type == GGML_TYPE_F16) {
|
||||||
if (!lhs_info->packed_size_ex || !kernels->rhs_info.packed_size_ex) return false;
|
if (!lhs_info->packed_size_ex || !kernels->rhs_info.packed_size_ex) return false;
|
||||||
const int64_t lhs_batch_size0 = op->src[1]->ne[2];
|
const int64_t lhs_batch_size0 = op->src[1]->ne[2];
|
||||||
|
|
@ -171,6 +179,8 @@ class tensor_traits : public ggml::cpu::tensor_traits {
|
||||||
return compute_forward_q8_0(params, dst);
|
return compute_forward_q8_0(params, dst);
|
||||||
} else if (dst->src[0]->type == GGML_TYPE_F16) {
|
} else if (dst->src[0]->type == GGML_TYPE_F16) {
|
||||||
return compute_forward_fp16(params, dst);
|
return compute_forward_fp16(params, dst);
|
||||||
|
} else if (dst->src[0]->type == GGML_TYPE_Q2_0C && ctx.kernels_q2c != nullptr) {
|
||||||
|
return compute_forward_q2_0c(params, dst);
|
||||||
}
|
}
|
||||||
} else if (dst->op == GGML_OP_GET_ROWS) {
|
} else if (dst->op == GGML_OP_GET_ROWS) {
|
||||||
if (dst->src[0]->type == GGML_TYPE_Q4_0 || dst->src[0]->type == GGML_TYPE_Q8_0) {
|
if (dst->src[0]->type == GGML_TYPE_Q4_0 || dst->src[0]->type == GGML_TYPE_Q8_0) {
|
||||||
|
|
@ -504,6 +514,103 @@ class tensor_traits : public ggml::cpu::tensor_traits {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool compute_forward_q2_0c(struct ggml_compute_params * params, struct ggml_tensor * dst) {
|
||||||
|
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_Q2_0C);
|
||||||
|
GGML_ASSERT(dst->src[1]->type == GGML_TYPE_F32);
|
||||||
|
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||||
|
|
||||||
|
const ggml_tensor * src0 = dst->src[0];
|
||||||
|
const ggml_tensor * src1 = dst->src[1];
|
||||||
|
|
||||||
|
GGML_TENSOR_BINARY_OP_LOCALS
|
||||||
|
|
||||||
|
ggml_kleidiai_kernels *kernels = ggml_kleidiai_select_kernels(ctx.features, dst);
|
||||||
|
|
||||||
|
// Look-up table used to unpack the int2 values
|
||||||
|
static const int32_t lut_i8_i2[4] = {-3, -1, 1, 3};
|
||||||
|
|
||||||
|
if (!kernels) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool is_gemv = src1->ne[1] == 1;
|
||||||
|
kernel_info * kernel = is_gemv ? &kernels->gemv : &kernels->gemm;
|
||||||
|
lhs_packing_info * lhs_info = is_gemv ? &kernels->gemv_lhs_info : &kernels->gemm_lhs_info;
|
||||||
|
|
||||||
|
GGML_ASSERT(kernel);
|
||||||
|
if (!lhs_info->get_packed_offset_ex || !lhs_info->pack_func_ex ||
|
||||||
|
!kernel->get_rhs_packed_offset_ex || !kernel->run_kernel_lut_ex || !kernel->get_dst_offset) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
const int ith = params->ith;
|
||||||
|
const int nth_raw = params->nth;
|
||||||
|
const int nth = nth_raw > 0 ? nth_raw : 1;
|
||||||
|
|
||||||
|
const size_t k = ne00;
|
||||||
|
const size_t m = ne11;
|
||||||
|
const size_t n = ne01;
|
||||||
|
|
||||||
|
size_t mr = kernel->get_mr();
|
||||||
|
size_t kr = kernel->get_kr();
|
||||||
|
size_t sr = kernel->get_sr();
|
||||||
|
|
||||||
|
const uint8_t * lhs = static_cast<const uint8_t *>(src1->data);
|
||||||
|
uint8_t * lhs_packed = (uint8_t*)params->wdata;
|
||||||
|
const uint8_t * rhs_packed = static_cast<const uint8_t *>(src0->data);
|
||||||
|
|
||||||
|
const size_t n_step = kernel->get_n_step();
|
||||||
|
const size_t num_n_per_thread = kai_roundup(kai_roundup(n, nth) / nth, n_step);
|
||||||
|
const size_t n_start = ith * num_n_per_thread;
|
||||||
|
|
||||||
|
size_t n_to_process = 0;
|
||||||
|
if (n_start < n) {
|
||||||
|
n_to_process = num_n_per_thread;
|
||||||
|
if ((n_start + n_to_process) > n) {
|
||||||
|
n_to_process = n - n_start;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Calculate number of columns to be processed per thread
|
||||||
|
const size_t num_m_per_thread = kai_roundup(m, mr * nth) / nth;
|
||||||
|
const size_t m_start = ith * num_m_per_thread;
|
||||||
|
size_t m_to_process = num_m_per_thread;
|
||||||
|
if ((m_start + m_to_process) > m) {
|
||||||
|
m_to_process = m - m_start;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (m_start < m) {
|
||||||
|
// Transform LHS
|
||||||
|
|
||||||
|
const size_t src_stride = src1->nb[1];
|
||||||
|
const float * src_ptr = reinterpret_cast<const float *>(lhs + lhs_info->get_offset(m_start, dst->src[1]->nb[1]));
|
||||||
|
const size_t lhs_packed_offset = lhs_info->get_packed_offset_ex(m_start, k, QKQ2_0C, mr, kr, sr);
|
||||||
|
void * lhs_packed_ptr = static_cast<void *>(lhs_packed + lhs_packed_offset);
|
||||||
|
|
||||||
|
// Pack this thread's chunk with m_idx_start = 0 and per-thread output pointer
|
||||||
|
lhs_info->pack_func_ex(m_to_process, k, QKQ2_0C, mr, kr, sr, 0, src_ptr, src_stride, lhs_packed_ptr);
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_barrier(params->threadpool);
|
||||||
|
|
||||||
|
if (n_to_process > 0) {
|
||||||
|
const size_t dst_stride = dst->nb[1];
|
||||||
|
const size_t lhs_packed_offset = lhs_info->get_packed_offset_ex(0, k, 0, mr, kr, sr);
|
||||||
|
const size_t rhs_packed_offset = kernel->get_rhs_packed_offset_ex(n_start, k, 0);
|
||||||
|
const size_t dst_offset = kernel->get_dst_offset(0, n_start, dst_stride);
|
||||||
|
const void * rhs_ptr = static_cast<const void *>(rhs_packed + rhs_packed_offset);
|
||||||
|
const void * lhs_ptr = static_cast<const void *>(lhs_packed + lhs_packed_offset);
|
||||||
|
float * dst_ptr = reinterpret_cast<float *>(static_cast<uint8_t *>(dst->data) + dst_offset);
|
||||||
|
|
||||||
|
if (n_to_process > 0) {
|
||||||
|
kernel->run_kernel_lut_ex(m, n_to_process, k, 0, lhs_ptr, rhs_ptr, dst_ptr, dst_stride,
|
||||||
|
sizeof(float), -FLT_MAX, FLT_MAX, &lut_i8_i2[0]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
bool compute_forward_get_rows(struct ggml_compute_params * params, struct ggml_tensor * dst) {
|
bool compute_forward_get_rows(struct ggml_compute_params * params, struct ggml_tensor * dst) {
|
||||||
const ggml_tensor * src0 = dst->src[0];
|
const ggml_tensor * src0 = dst->src[0];
|
||||||
const ggml_tensor * src1 = dst->src[1];
|
const ggml_tensor * src1 = dst->src[1];
|
||||||
|
|
@ -565,6 +672,34 @@ class tensor_traits : public ggml::cpu::tensor_traits {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void split_values_scales_offsets_per_channel(
|
||||||
|
const block_q2_0c *data,
|
||||||
|
size_t n,
|
||||||
|
size_t k,
|
||||||
|
uint8_t *values_out,
|
||||||
|
float *scales_out)
|
||||||
|
{
|
||||||
|
const size_t blocks_per_row = k / QKQ2_0C;
|
||||||
|
const size_t bytes_per_block = QKQ2_0C / 4;
|
||||||
|
|
||||||
|
for (size_t row = 0; row < n; ++row) {
|
||||||
|
for (size_t b = 0; b < blocks_per_row; ++b) {
|
||||||
|
size_t block_idx = row * blocks_per_row + b;
|
||||||
|
|
||||||
|
const block_q2_0c *src_block = &data[block_idx];
|
||||||
|
|
||||||
|
// 1. Copy packed values (8 bytes per block)
|
||||||
|
memcpy(&values_out[block_idx * bytes_per_block], src_block->qs, bytes_per_block);
|
||||||
|
|
||||||
|
// 2. Copy scale
|
||||||
|
// We copy only the first value because it is per-channel
|
||||||
|
if(b == 0) {
|
||||||
|
scales_out[row] = GGML_FP16_TO_FP32(src_block->d);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public:
|
public:
|
||||||
int repack(struct ggml_tensor * tensor, const void * data, size_t data_size) {
|
int repack(struct ggml_tensor * tensor, const void * data, size_t data_size) {
|
||||||
const size_t n = tensor->ne[1];
|
const size_t n = tensor->ne[1];
|
||||||
|
|
@ -648,6 +783,68 @@ public:
|
||||||
tensor->data, 0, ¶ms);
|
tensor->data, 0, ¶ms);
|
||||||
GGML_UNUSED(data_size);
|
GGML_UNUSED(data_size);
|
||||||
return 0;
|
return 0;
|
||||||
|
} else if (tensor->type == GGML_TYPE_Q2_0C) {
|
||||||
|
|
||||||
|
if (!ctx.kernels_q2c) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract values and scales
|
||||||
|
// data is n (rows) x k (columns). and it is block_q2_0c
|
||||||
|
|
||||||
|
// Look-up table used to unpack the int2 values
|
||||||
|
static const int32_t lut_i8_i2[4] = {-3, -1, 1, 3};
|
||||||
|
|
||||||
|
// split_values_scales_offsets(data, values, scales, offsets);
|
||||||
|
const size_t bytes_per_block = QKQ2_0C / 4;
|
||||||
|
const size_t blocks_per_row = k / QKQ2_0C;
|
||||||
|
const size_t total_blocks = n * blocks_per_row;
|
||||||
|
|
||||||
|
const block_q2_0c *src = (const block_q2_0c *) data;
|
||||||
|
|
||||||
|
// Allocate / reuse buffers as appropriate for your context:
|
||||||
|
// - values: QKQ2_0C / 4 = 128 bytes per block
|
||||||
|
// - scales: one float per row (per-channel), not one per block
|
||||||
|
uint8_t *values_buf = (uint8_t *) malloc( total_blocks * bytes_per_block );
|
||||||
|
|
||||||
|
// Be careful!! For each n, we have a scale. Not for each block!
|
||||||
|
float *scales_buf = (float *) malloc( n * sizeof(float) );
|
||||||
|
float *offsets_buf = (float *) malloc( n * sizeof(float) );
|
||||||
|
|
||||||
|
split_values_scales_offsets_per_channel(
|
||||||
|
src,
|
||||||
|
n,
|
||||||
|
k,
|
||||||
|
values_buf,
|
||||||
|
scales_buf
|
||||||
|
);
|
||||||
|
|
||||||
|
size_t nr = ctx.kernels_q2c->gemm.get_nr();
|
||||||
|
size_t kr = ctx.kernels_q2c->gemm.get_kr();
|
||||||
|
size_t sr = ctx.kernels_q2c->gemm.get_sr();
|
||||||
|
|
||||||
|
struct kai_rhs_pack_qs4cxs1s0_param params;
|
||||||
|
params.lhs_zero_point = 1;
|
||||||
|
params.rhs_zero_point = 2;
|
||||||
|
|
||||||
|
ctx.kernels_q2c->rhs_info.pack_func_lut_ex(
|
||||||
|
1, n, k,
|
||||||
|
nr, kr, sr,
|
||||||
|
0, 0,
|
||||||
|
values_buf,
|
||||||
|
nullptr,
|
||||||
|
scales_buf,
|
||||||
|
tensor->data,
|
||||||
|
0, &params,
|
||||||
|
&lut_i8_i2[0]);
|
||||||
|
|
||||||
|
|
||||||
|
free(values_buf);
|
||||||
|
free(scales_buf);
|
||||||
|
free(offsets_buf);
|
||||||
|
|
||||||
|
GGML_UNUSED(data_size);
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_UNUSED(data_size);
|
GGML_UNUSED(data_size);
|
||||||
|
|
@ -724,6 +921,18 @@ static size_t ggml_backend_cpu_kleidiai_buffer_type_get_alloc_size(ggml_backend_
|
||||||
GGML_ASSERT(ctx.kernels_q8);
|
GGML_ASSERT(ctx.kernels_q8);
|
||||||
kernels = ctx.kernels_q8;
|
kernels = ctx.kernels_q8;
|
||||||
block_len = QK8_0;
|
block_len = QK8_0;
|
||||||
|
} else if (tensor->type == GGML_TYPE_Q2_0C) {
|
||||||
|
GGML_ASSERT(ctx.kernels_q2c);
|
||||||
|
kernels = ctx.kernels_q2c;
|
||||||
|
block_len = QKQ2_0C;
|
||||||
|
const size_t nr = kernels->gemm.get_nr();
|
||||||
|
const size_t kr = kernels->gemm.get_kr();
|
||||||
|
const size_t sr = kernels->gemm.get_sr();
|
||||||
|
const size_t packed = kernels->rhs_info.packed_size_ex(n, k, nr, kr, sr);
|
||||||
|
const size_t raw = ggml_nbytes(tensor);
|
||||||
|
|
||||||
|
return packed > raw ? packed : raw;
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
@ -739,6 +948,23 @@ static size_t ggml_backend_cpu_kleidiai_buffer_type_get_alloc_size(ggml_backend_
|
||||||
namespace ggml::cpu::kleidiai {
|
namespace ggml::cpu::kleidiai {
|
||||||
class extra_buffer_type : ggml::cpu::extra_buffer_type {
|
class extra_buffer_type : ggml::cpu::extra_buffer_type {
|
||||||
bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
|
bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
|
||||||
|
if ((op->op == GGML_OP_MUL_MAT ) &&
|
||||||
|
(op->src[0]->type == GGML_TYPE_Q2_0C) &&
|
||||||
|
op->src[0]->buffer &&
|
||||||
|
(ggml_n_dims(op->src[0]) == 2) &&
|
||||||
|
op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type()) {
|
||||||
|
if (ctx.kernels_q2c == nullptr) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if ((op->src[1]->type == GGML_TYPE_F32) &&
|
||||||
|
ggml_ne(op->src[1], 2) == 1 && ggml_ne(op->src[1], 3) == 1) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if ((op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_GET_ROWS) &&
|
if ((op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_GET_ROWS) &&
|
||||||
(op->src[0]->type == GGML_TYPE_Q4_0 || op->src[0]->type == GGML_TYPE_Q8_0) &&
|
(op->src[0]->type == GGML_TYPE_Q4_0 || op->src[0]->type == GGML_TYPE_Q8_0) &&
|
||||||
op->src[0]->buffer &&
|
op->src[0]->buffer &&
|
||||||
|
|
|
||||||
|
|
@ -676,6 +676,7 @@ void ggml_compute_forward_add(
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
case GGML_TYPE_TQ1_0:
|
case GGML_TYPE_TQ1_0:
|
||||||
case GGML_TYPE_TQ2_0:
|
case GGML_TYPE_TQ2_0:
|
||||||
|
case GGML_TYPE_Q2_0C:
|
||||||
case GGML_TYPE_IQ2_XXS:
|
case GGML_TYPE_IQ2_XXS:
|
||||||
case GGML_TYPE_IQ2_XS:
|
case GGML_TYPE_IQ2_XS:
|
||||||
case GGML_TYPE_IQ3_XXS:
|
case GGML_TYPE_IQ3_XXS:
|
||||||
|
|
@ -1125,6 +1126,7 @@ void ggml_compute_forward_add1(
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
case GGML_TYPE_TQ1_0:
|
case GGML_TYPE_TQ1_0:
|
||||||
case GGML_TYPE_TQ2_0:
|
case GGML_TYPE_TQ2_0:
|
||||||
|
case GGML_TYPE_Q2_0C:
|
||||||
case GGML_TYPE_IQ2_XXS:
|
case GGML_TYPE_IQ2_XXS:
|
||||||
case GGML_TYPE_IQ2_XS:
|
case GGML_TYPE_IQ2_XS:
|
||||||
case GGML_TYPE_IQ3_XXS:
|
case GGML_TYPE_IQ3_XXS:
|
||||||
|
|
@ -1253,6 +1255,7 @@ void ggml_compute_forward_acc(
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
case GGML_TYPE_TQ1_0:
|
case GGML_TYPE_TQ1_0:
|
||||||
case GGML_TYPE_TQ2_0:
|
case GGML_TYPE_TQ2_0:
|
||||||
|
case GGML_TYPE_Q2_0C:
|
||||||
case GGML_TYPE_IQ2_XXS:
|
case GGML_TYPE_IQ2_XXS:
|
||||||
case GGML_TYPE_IQ2_XS:
|
case GGML_TYPE_IQ2_XS:
|
||||||
case GGML_TYPE_IQ3_XXS:
|
case GGML_TYPE_IQ3_XXS:
|
||||||
|
|
@ -4276,6 +4279,7 @@ void ggml_compute_forward_out_prod(
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
case GGML_TYPE_TQ1_0:
|
case GGML_TYPE_TQ1_0:
|
||||||
case GGML_TYPE_TQ2_0:
|
case GGML_TYPE_TQ2_0:
|
||||||
|
case GGML_TYPE_Q2_0C:
|
||||||
case GGML_TYPE_IQ2_XXS:
|
case GGML_TYPE_IQ2_XXS:
|
||||||
case GGML_TYPE_IQ2_XS:
|
case GGML_TYPE_IQ2_XS:
|
||||||
case GGML_TYPE_IQ3_XXS:
|
case GGML_TYPE_IQ3_XXS:
|
||||||
|
|
@ -4551,6 +4555,7 @@ void ggml_compute_forward_set(
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
case GGML_TYPE_TQ1_0:
|
case GGML_TYPE_TQ1_0:
|
||||||
case GGML_TYPE_TQ2_0:
|
case GGML_TYPE_TQ2_0:
|
||||||
|
case GGML_TYPE_Q2_0C:
|
||||||
case GGML_TYPE_IQ2_XXS:
|
case GGML_TYPE_IQ2_XXS:
|
||||||
case GGML_TYPE_IQ2_XS:
|
case GGML_TYPE_IQ2_XS:
|
||||||
case GGML_TYPE_IQ3_XXS:
|
case GGML_TYPE_IQ3_XXS:
|
||||||
|
|
@ -4773,6 +4778,7 @@ void ggml_compute_forward_get_rows(
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
case GGML_TYPE_TQ1_0:
|
case GGML_TYPE_TQ1_0:
|
||||||
case GGML_TYPE_TQ2_0:
|
case GGML_TYPE_TQ2_0:
|
||||||
|
case GGML_TYPE_Q2_0C:
|
||||||
case GGML_TYPE_IQ2_XXS:
|
case GGML_TYPE_IQ2_XXS:
|
||||||
case GGML_TYPE_IQ2_XS:
|
case GGML_TYPE_IQ2_XS:
|
||||||
case GGML_TYPE_IQ3_XXS:
|
case GGML_TYPE_IQ3_XXS:
|
||||||
|
|
@ -5497,6 +5503,7 @@ void ggml_compute_forward_clamp(
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
case GGML_TYPE_TQ1_0:
|
case GGML_TYPE_TQ1_0:
|
||||||
case GGML_TYPE_TQ2_0:
|
case GGML_TYPE_TQ2_0:
|
||||||
|
case GGML_TYPE_Q2_0C:
|
||||||
case GGML_TYPE_IQ2_XXS:
|
case GGML_TYPE_IQ2_XXS:
|
||||||
case GGML_TYPE_IQ2_XS:
|
case GGML_TYPE_IQ2_XS:
|
||||||
case GGML_TYPE_IQ3_XXS:
|
case GGML_TYPE_IQ3_XXS:
|
||||||
|
|
|
||||||
|
|
@ -104,6 +104,12 @@ void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy,
|
||||||
quantize_row_tq2_0_ref(x, y, k);
|
quantize_row_tq2_0_ref(x, y, k);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Quantize one row of k floats (x) into Q2_0C blocks written at vy.
// k must be a multiple of QKQ2_0C. This entry point simply forwards to the
// reference implementation.
void quantize_row_q2_0c(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
    assert(k % QKQ2_0C == 0);
    quantize_row_q2_0c_ref(x, (block_q2_0c *) vy, k);
}
|
||||||
|
|
||||||
//===================================== Q8_K ==============================================
|
//===================================== Q8_K ==============================================
|
||||||
|
|
||||||
void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
|
void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
|
||||||
|
|
@ -416,6 +422,68 @@ void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||||
*s = sumf;
|
*s = sumf;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Dot product of a Q2_0C row (vx) with a Q8_K row (vy); the result is stored in *s.
//
// n is the element count and must be a multiple of QKQ2_0C. Every Q2_0C block is
// paired with two consecutive Q8_K blocks: the first QK_K / 4 packed bytes are
// multiplied against the first Q8_K block, the remaining bytes against the second.
// Each packed byte holds four 2-bit codes (lowest bits first) that select one of
// the levels -3, -1, +1, +3.
//
// bs, bx, by and nrc are unused; only nrc == 1 is accepted.
void ggml_vec_dot_q2_0c_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q2_0c * GGML_RESTRICT x = vx;
    const block_q8_K  * GGML_RESTRICT y = vy;

    GGML_ASSERT(n % QKQ2_0C == 0);
    const int nb = n / QKQ2_0C;

    // 2-bit code -> signed level
    static const int8_t levels[4] = { -3, -1, 1, 3 };

    const int qs_bytes   = QKQ2_0C / 4; // packed bytes in one Q2_0C block
    const int half_bytes = QK_K / 4;    // packed bytes covered by one Q8_K block

    float acc = 0.0f;

    for (int ib = 0; ib < nb; ++ib) {
        const block_q2_0c * blk = &x[ib];
        const block_q8_K  * lo  = &y[2 * ib];
        const block_q8_K  * hi  = &y[2 * ib + 1];

        int32_t isum_lo = 0;
        int32_t isum_hi = 0;

        // first part of the block against the first Q8_K block
        for (int jb = 0; jb < half_bytes; ++jb) {
            const uint8_t bits = blk->qs[jb];
            const int     off  = jb * 4;
            for (int c = 0; c < 4; ++c) {
                isum_lo += levels[(bits >> (2 * c)) & 0x03] * lo->qs[off + c];
            }
        }

        // remaining part against the second Q8_K block
        for (int jb = half_bytes; jb < qs_bytes; ++jb) {
            const uint8_t bits = blk->qs[jb];
            const int     off  = (jb - half_bytes) * 4;
            for (int c = 0; c < 4; ++c) {
                isum_hi += levels[(bits >> (2 * c)) & 0x03] * hi->qs[off + c];
            }
        }

        const float scale = GGML_CPU_FP16_TO_FP32(blk->d);
        acc += scale * ((float) isum_lo * lo->d + (float) isum_hi * hi->d);
    }

    *s = acc;
}
|
||||||
|
|
||||||
void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||||
assert(nrc == 1);
|
assert(nrc == 1);
|
||||||
UNUSED(nrc);
|
UNUSED(nrc);
|
||||||
|
|
|
||||||
|
|
@ -31,6 +31,8 @@ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
|
||||||
void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
|
void quantize_row_q2_0c(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
|
|
@ -51,6 +53,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||||
|
|
||||||
void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
void ggml_vec_dot_q2_0c_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
|
||||||
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
|
|
||||||
|
|
@ -2198,6 +2198,126 @@ void quantize_row_tq2_0_ref(const float * GGML_RESTRICT x, block_tq2_0 * GGML_RE
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline uint8_t map_int8_to_uint2_idx(int32_t v0) {
|
||||||
|
|
||||||
|
switch(v0) {
|
||||||
|
case -3:
|
||||||
|
return 0;
|
||||||
|
case -2:
|
||||||
|
return 1;
|
||||||
|
case -1:
|
||||||
|
return 1;
|
||||||
|
case 0:
|
||||||
|
return 1;
|
||||||
|
case 1:
|
||||||
|
return 2;
|
||||||
|
case 2:
|
||||||
|
return 2;
|
||||||
|
case 3:
|
||||||
|
return 3;
|
||||||
|
default:
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline int32_t map_uint2_idx_to_int8(uint8_t v0) {
|
||||||
|
|
||||||
|
switch(v0) {
|
||||||
|
case 0:
|
||||||
|
return -3;
|
||||||
|
case 1:
|
||||||
|
return -1;
|
||||||
|
case 2:
|
||||||
|
return 1;
|
||||||
|
case 3:
|
||||||
|
return 3;
|
||||||
|
default:
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reference quantizer: converts one row of k floats into k / QKQ2_0C Q2_0C blocks.
//
// The scale d is derived once from the min/max of the whole row and written to
// every block, so all blocks of a row carry the same scale. Each value is rounded
// to an integer in [-3, 3], mapped to one of the four 2-bit codes (levels
// -3, -1, +1, +3) and packed four codes per byte, lowest bits first.
//
//   x : k input floats
//   y : output, k / QKQ2_0C blocks
//   k : row length, must be a multiple of QKQ2_0C
void quantize_row_q2_0c_ref(const float * GGML_RESTRICT x, block_q2_0c * GGML_RESTRICT y, int64_t k) {
    const int QK = QKQ2_0C; // block size

    assert(k % QK == 0);

    // Empty row: nothing to write, and x[0] must not be read.
    if (k <= 0) {
        return;
    }

    // ---- Find per-channel min/max ----
    float xmin = x[0];
    float xmax = x[0];

    // 64-bit index: k is int64_t and may not fit in an int
    for (int64_t j = 1; j < k; ++j) {
        const float v = x[j];

        if (v < xmin) xmin = v;
        if (v > xmax) xmax = v;
    }

    // The four uint2 values [0, 1, 2, 3] map to the Int8 range:
    // [-3, -1, +1, +3], yielding an evenly spaced, zero-centered distribution.
    const int qmin = -3;
    const int qmax =  3;

    // scale; 0 when the row is constant
    const float d = (xmax != xmin) ? (xmax - xmin) / (float) (qmax - qmin) : 0.0f;

    // Number of blocks
    const int64_t nb = k / QK;

    // All blocks share the same scale.
    // This enables an optimized matmul implementation.
    for (int64_t i = 0; i < nb; ++i) {
        const float * xb = x + i*QK;

        y[i].d = GGML_FP32_TO_FP16(d);

        // ---- Quantize to uint2 ----
        if (d == 0.0f) {
            // qs holds QK / 4 packed bytes (4 values per byte); clearing QK bytes
            // here would write past the end of the block.
            for (int j = 0; j < QK / 4; ++j) {
                y[i].qs[j] = 0;
            }
        } else {
            const float inv_d = 1.0f / d;

            for (int j = 0; j < QK; j += 4) {
                uint8_t packed = 0;

                for (int c = 0; c < 4; ++c) {
                    // q = round(v / d), clamped to [qmin, qmax]
                    int q = (int) lrintf(xb[j + c] * inv_d);
                    if (q < qmin) q = qmin;
                    if (q > qmax) q = qmax;

                    // NOTE(review): rounding to an integer first and then mapping
                    // -2/0/2 onto an odd level is a two-step rounding; e.g. -2.4*d
                    // ends up at level -1 although -3 is closer. Kept as-is so the
                    // produced data does not change.
                    packed |= (uint8_t) (map_int8_to_uint2_idx(q) << (2 * c));
                }

                y[i].qs[j / 4] = packed;
            }
        }
    }
}
|
||||||
|
|
||||||
size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||||
(void)quant_weights; // not used
|
(void)quant_weights; // not used
|
||||||
const size_t row_size = ggml_row_size(GGML_TYPE_TQ1_0, n_per_row);
|
const size_t row_size = ggml_row_size(GGML_TYPE_TQ1_0, n_per_row);
|
||||||
|
|
@ -2212,6 +2332,18 @@ size_t quantize_tq2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
|
||||||
return nrow * row_size;
|
return nrow * row_size;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Quantize nrow rows of n_per_row floats from src into Q2_0C data at dst.
// Rows are quantized independently with the reference quantizer.
// quant_weights is accepted for signature compatibility and ignored.
// Returns the number of bytes written (nrow * row size).
size_t quantize_q2_0c(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
    (void)quant_weights; // not used

    // size in bytes of one quantized row
    const size_t row_bytes = ggml_row_size(GGML_TYPE_Q2_0C, n_per_row);

    const float * in  = src;
    uint8_t     * out = (uint8_t *) dst;

    for (int64_t r = 0; r < nrow; ++r) {
        quantize_row_q2_0c_ref(in, (block_q2_0c *) out, n_per_row);
        in  += n_per_row;
        out += row_bytes;
    }

    return nrow * row_bytes;
}
|
||||||
|
|
||||||
void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
||||||
assert(k % QK_K == 0);
|
assert(k % QK_K == 0);
|
||||||
const int64_t nb = k / QK_K;
|
const int64_t nb = k / QK_K;
|
||||||
|
|
@ -2270,6 +2402,32 @@ void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_REST
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Expand k / QKQ2_0C Q2_0C blocks from x into k floats at y.
// Every packed byte yields four outputs (lowest bits first); each 2-bit code is
// turned into its signed level and multiplied by the block scale.
// k must be a multiple of QKQ2_0C.
void dequantize_row_q2_0c(const block_q2_0c * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
    assert(k % QKQ2_0C == 0);
    const int64_t nb = k / QKQ2_0C;

    for (int64_t ib = 0; ib < nb; ++ib) {
        const float scale = GGML_FP16_TO_FP32(x[ib].d);

        for (size_t jb = 0; jb < QKQ2_0C / 4; ++jb) {
            const uint8_t bits = x[ib].qs[jb];

            // walk the four 2-bit fields of this byte
            for (int shift = 0; shift < 8; shift += 2) {
                const uint8_t code  = (uint8_t) ((bits >> shift) & 0x03);
                const int32_t level = map_uint2_idx_to_int8(code);
                *y++ = (float) (level) * scale;
            }
        }
    }
}
|
||||||
|
|
||||||
// ====================== "True" 2-bit (de)-quantization
|
// ====================== "True" 2-bit (de)-quantization
|
||||||
|
|
||||||
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
||||||
|
|
@ -5262,6 +5420,10 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
|
||||||
{
|
{
|
||||||
VALIDATE_ROW_DATA_D_F16_IMPL(block_tq2_0, data, nb);
|
VALIDATE_ROW_DATA_D_F16_IMPL(block_tq2_0, data, nb);
|
||||||
} break;
|
} break;
|
||||||
|
case GGML_TYPE_Q2_0C:
|
||||||
|
{
|
||||||
|
VALIDATE_ROW_DATA_D_F16_IMPL(block_q2_0c, data, nb);
|
||||||
|
} break;
|
||||||
case GGML_TYPE_IQ1_S:
|
case GGML_TYPE_IQ1_S:
|
||||||
{
|
{
|
||||||
VALIDATE_ROW_DATA_D_F16_IMPL(block_iq1_s, data, nb);
|
VALIDATE_ROW_DATA_D_F16_IMPL(block_iq1_s, data, nb);
|
||||||
|
|
|
||||||
|
|
@ -33,6 +33,8 @@ GGML_API void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K *
|
||||||
GGML_API void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0 * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0 * GGML_RESTRICT y, int64_t k);
|
||||||
GGML_API void quantize_row_tq2_0_ref(const float * GGML_RESTRICT x, block_tq2_0 * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_tq2_0_ref(const float * GGML_RESTRICT x, block_tq2_0 * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
|
GGML_API void quantize_row_q2_0c_ref(const float * GGML_RESTRICT x, block_q2_0c * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
GGML_API void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
|
||||||
GGML_API void quantize_row_iq4_nl_ref (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_iq4_nl_ref (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
|
||||||
GGML_API void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
@ -59,6 +61,8 @@ GGML_API void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GG
|
||||||
GGML_API void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
GGML_API void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
|
GGML_API void dequantize_row_q2_0c(const block_q2_0c * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
GGML_API void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
GGML_API void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
GGML_API void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
@ -83,6 +87,8 @@ GGML_API size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RE
|
||||||
GGML_API size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
GGML_API size_t quantize_tq2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_tq2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
|
|
||||||
|
GGML_API size_t quantize_q2_0c(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
|
|
||||||
GGML_API size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
GGML_API size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
GGML_API size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
|
|
|
||||||
|
|
@ -896,6 +896,14 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
|
||||||
.type_size = 0,
|
.type_size = 0,
|
||||||
.is_quantized = false,
|
.is_quantized = false,
|
||||||
},
|
},
|
||||||
|
[GGML_TYPE_Q2_0C] = {
|
||||||
|
.type_name = "q2_0c",
|
||||||
|
.blck_size = QKQ2_0C,
|
||||||
|
.type_size = sizeof(block_q2_0c),
|
||||||
|
.is_quantized = true,
|
||||||
|
.to_float = (ggml_to_float_t) dequantize_row_q2_0c,
|
||||||
|
.from_float_ref = (ggml_from_float_t) quantize_row_q2_0c_ref,
|
||||||
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) {
|
const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) {
|
||||||
|
|
@ -7572,6 +7580,7 @@ size_t ggml_quantize_chunk(
|
||||||
case GGML_TYPE_Q6_K: result = quantize_q6_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
case GGML_TYPE_Q6_K: result = quantize_q6_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||||
case GGML_TYPE_TQ1_0: result = quantize_tq1_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
case GGML_TYPE_TQ1_0: result = quantize_tq1_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||||
case GGML_TYPE_TQ2_0: result = quantize_tq2_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
case GGML_TYPE_TQ2_0: result = quantize_tq2_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||||
|
case GGML_TYPE_Q2_0C: result = quantize_q2_0c(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||||
case GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
case GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||||
case GGML_TYPE_IQ2_XS: result = quantize_iq2_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
case GGML_TYPE_IQ2_XS: result = quantize_iq2_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||||
case GGML_TYPE_IQ3_XXS: result = quantize_iq3_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
case GGML_TYPE_IQ3_XXS: result = quantize_iq3_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||||
|
|
|
||||||
|
|
@ -152,6 +152,7 @@ extern "C" {
|
||||||
LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
|
LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
|
||||||
LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
|
LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
|
||||||
LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors
|
LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors
|
||||||
|
LLAMA_FTYPE_MOSTLY_Q2_0C = 39, // except 1d tensors
|
||||||
|
|
||||||
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
||||||
};
|
};
|
||||||
|
|
|
||||||
|
|
@ -509,6 +509,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||||
case LLAMA_FTYPE_MOSTLY_Q6_K: default_type = GGML_TYPE_Q6_K; break;
|
case LLAMA_FTYPE_MOSTLY_Q6_K: default_type = GGML_TYPE_Q6_K; break;
|
||||||
case LLAMA_FTYPE_MOSTLY_TQ1_0: default_type = GGML_TYPE_TQ1_0; break;
|
case LLAMA_FTYPE_MOSTLY_TQ1_0: default_type = GGML_TYPE_TQ1_0; break;
|
||||||
case LLAMA_FTYPE_MOSTLY_TQ2_0: default_type = GGML_TYPE_TQ2_0; break;
|
case LLAMA_FTYPE_MOSTLY_TQ2_0: default_type = GGML_TYPE_TQ2_0; break;
|
||||||
|
case LLAMA_FTYPE_MOSTLY_Q2_0C: default_type = GGML_TYPE_Q2_0C; break;
|
||||||
case LLAMA_FTYPE_MOSTLY_IQ2_XXS: default_type = GGML_TYPE_IQ2_XXS; break;
|
case LLAMA_FTYPE_MOSTLY_IQ2_XXS: default_type = GGML_TYPE_IQ2_XXS; break;
|
||||||
case LLAMA_FTYPE_MOSTLY_IQ2_XS: default_type = GGML_TYPE_IQ2_XS; break;
|
case LLAMA_FTYPE_MOSTLY_IQ2_XS: default_type = GGML_TYPE_IQ2_XS; break;
|
||||||
case LLAMA_FTYPE_MOSTLY_IQ2_S: default_type = GGML_TYPE_IQ2_XS; break;
|
case LLAMA_FTYPE_MOSTLY_IQ2_S: default_type = GGML_TYPE_IQ2_XS; break;
|
||||||
|
|
|
||||||
|
|
@ -146,6 +146,7 @@ int main(int argc, char * argv[]) {
|
||||||
type == GGML_TYPE_TQ1_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :
|
type == GGML_TYPE_TQ1_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :
|
||||||
type == GGML_TYPE_TQ2_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :
|
type == GGML_TYPE_TQ2_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :
|
||||||
type == GGML_TYPE_Q2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
|
type == GGML_TYPE_Q2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
|
||||||
|
type == GGML_TYPE_Q2_0C ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
|
||||||
type == GGML_TYPE_IQ2_S ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
|
type == GGML_TYPE_IQ2_S ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
|
||||||
type == GGML_TYPE_Q3_K ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS :
|
type == GGML_TYPE_Q3_K ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS :
|
||||||
type == GGML_TYPE_IQ3_S ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS :
|
type == GGML_TYPE_IQ3_S ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS :
|
||||||
|
|
@ -167,7 +168,7 @@ int main(int argc, char * argv[]) {
|
||||||
const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS ||
|
const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS ||
|
||||||
type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S || type == GGML_TYPE_IQ2_S
|
type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S || type == GGML_TYPE_IQ2_S
|
||||||
? MAX_DOT_PRODUCT_ERROR_LOWBIT
|
? MAX_DOT_PRODUCT_ERROR_LOWBIT
|
||||||
: type == GGML_TYPE_TQ1_0 || type == GGML_TYPE_TQ2_0
|
: type == GGML_TYPE_TQ1_0 || type == GGML_TYPE_TQ2_0 || type == GGML_TYPE_Q2_0C
|
||||||
? MAX_DOT_PRODUCT_ERROR_TERNARY
|
? MAX_DOT_PRODUCT_ERROR_TERNARY
|
||||||
: MAX_DOT_PRODUCT_ERROR;
|
: MAX_DOT_PRODUCT_ERROR;
|
||||||
failed = !(vec_dot_error < max_allowed_error);
|
failed = !(vec_dot_error < max_allowed_error);
|
||||||
|
|
|
||||||
|
|
@ -34,6 +34,7 @@ static const std::vector<quant_option> QUANT_OPTIONS = {
|
||||||
{ "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
|
{ "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
|
||||||
{ "TQ1_0", LLAMA_FTYPE_MOSTLY_TQ1_0, " 1.69 bpw ternarization", },
|
{ "TQ1_0", LLAMA_FTYPE_MOSTLY_TQ1_0, " 1.69 bpw ternarization", },
|
||||||
{ "TQ2_0", LLAMA_FTYPE_MOSTLY_TQ2_0, " 2.06 bpw ternarization", },
|
{ "TQ2_0", LLAMA_FTYPE_MOSTLY_TQ2_0, " 2.06 bpw ternarization", },
|
||||||
|
{ "Q2_0C", LLAMA_FTYPE_MOSTLY_Q2_0C, " 2.06 bpw ternarization", },
|
||||||
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", },
|
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", },
|
||||||
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", },
|
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", },
|
||||||
{ "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", },
|
{ "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", },
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue