diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h
index e3e067c916..9c3a6da5dc 100644
--- a/ggml/include/ggml-cpu.h
+++ b/ggml/include/ggml-cpu.h
@@ -111,13 +111,14 @@ extern "C" {
     // Internal types and functions exposed for tests and benchmarks

     typedef void (*ggml_vec_dot_t)  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
-                                     const void * GGML_RESTRICT y, size_t by, int nrc);
+                                     const void * GGML_RESTRICT y, size_t by, int nrc, const void * levels);

     struct ggml_type_traits_cpu {
         ggml_from_float_t        from_float;
         ggml_vec_dot_t           vec_dot;
         enum ggml_type           vec_dot_type;
-        int64_t                  nrows; // number of rows to process simultaneously
+        int64_t                  nrows;             // number of rows to process simultaneously
+        size_t                   levels_row_stride; // bytes to add per row to get next row's quant_levels (0 = per-tensor)
     };

     GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 11d3e8a816..a397a9ec43 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -429,7 +429,15 @@ extern "C" {
         GGML_TYPE_MXFP4   = 39, // MXFP4 (1 block)
         GGML_TYPE_NVFP4   = 40, // NVFP4 (4 blocks, E4M3 scale)
         GGML_TYPE_Q1_0    = 41,
-        GGML_TYPE_COUNT   = 42,
+        GGML_TYPE_Q3_PT   = 42, // 3.875 bpw per-tensor Lloyd-Max, 16-elem affine sub-blocks
+        GGML_TYPE_Q3_KPT  = 43, // Q3_K with learned per-tensor levels (3.4375 bpw)
+        GGML_TYPE_Q4_DPT  = 44, // IQ4_NL with learned per-tensor int8 levels (4.125 bpw)
+        GGML_TYPE_Q2_DPT  = 45, // 2-bit with learned per-tensor int8 levels (2.5 bpw)
+        GGML_TYPE_Q2_KPT  = 46, // Q2_K with learned per-tensor float levels (2.625 bpw)
+        GGML_TYPE_IQ2_TQ  = 47, // 2-bit with per-tensor trained grid table (2.5625 bpw)
+        GGML_TYPE_IQ3_TQ  = 48, // 3-bit with per-tensor trained grid table (3.5625 bpw)
+        GGML_TYPE_IQ1_BN  = 49, // 8D vector quantized with per-tensor trained codebook (1.5625 bpw)
+        GGML_TYPE_COUNT   = 50,
     };

     // precision
@@ -465,8 +473,12 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ1_M   = 23, // except 1d tensors
         GGML_FTYPE_MOSTLY_BF16    = 24, // except 1d tensors
         GGML_FTYPE_MOSTLY_MXFP4   = 25, // except 1d tensors
-        GGML_FTYPE_MOSTLY_NVFP4   = 26, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q1_0    = 27, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q3_PT   = 26, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q3_KPT  = 27, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_DPT  = 28, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q2_KPT  = 29, // except 1d tensors
+        GGML_FTYPE_MOSTLY_NVFP4   = 30, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q1_0    = 31, // except 1d tensors
     };

     // available tensor operations:
@@ -686,9 +697,8 @@ extern "C" {

         char name[GGML_MAX_NAME];

-        void * extra; // extra things e.g. for ggml-cuda.cu
-
-        char padding[8];
+        void * extra; // extra things e.g. 
for ggml-cuda.cu + void * quant_levels; // per-tensor quantization levels (replaces char padding[8]; same size on 64-bit) }; static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); @@ -2723,7 +2734,7 @@ extern "C" { # define GGML_RESTRICT restrict # endif #endif - typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); + typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); struct ggml_type_traits { @@ -2734,6 +2745,7 @@ extern "C" { bool is_quantized; ggml_to_float_t to_float; ggml_from_float_t from_float_ref; + size_t levels_row_stride; // bytes to advance quant_levels per row (0 = per-tensor) }; GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type); diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 48fbe208d9..1daceffbf6 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -208,6 +208,13 @@ add_library(ggml-base ggml-quants.h gguf.cpp) +# Enable native SIMD for ggml-quants.c (needed for K-means training in quantization) +include(CheckCCompilerFlag) +check_c_compiler_flag("-march=native" GGML_COMPILER_SUPPORTS_MARCH_NATIVE) +if (GGML_COMPILER_SUPPORTS_MARCH_NATIVE) + set_source_files_properties(ggml-quants.c PROPERTIES COMPILE_FLAGS "-march=native") +endif() + set_target_properties(ggml-base PROPERTIES VERSION ${GGML_VERSION} SOVERSION ${GGML_VERSION_MAJOR} diff --git a/ggml/src/ggml-backend-meta.cpp b/ggml/src/ggml-backend-meta.cpp index a2ab8872c4..ba3dc4cede 100644 --- a/ggml/src/ggml-backend-meta.cpp +++ b/ggml/src/ggml-backend-meta.cpp @@ -396,7 +396,7 @@ static ggml_backend_buffer_type_t ggml_backend_meta_device_get_host_buffer_type( // struct ggml_backend_meta_buffer_context { - static constexpr size_t nbtc = GGML_TENSOR_SIZE - sizeof(ggml_tensor::padding); + static constexpr size_t nbtc = GGML_TENSOR_SIZE - sizeof(ggml_tensor::quant_levels); std::map, std::pair> split_state_cache; std::map< const ggml_tensor *, std::vector> simple_tensors; diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp index 05245b6980..767ac91fd7 100644 --- a/ggml/src/ggml-blas/ggml-blas.cpp +++ b/ggml/src/ggml-blas/ggml-blas.cpp @@ -1,5 +1,15 @@ #include "ggml-impl.h" #include "ggml-blas.h" + +// Helper: compute quant_levels stride for a given row. +// For Q2_KPT (per-block levels), stride depends on tensor width. 
+static inline size_t ggml_quant_levels_stride(ggml_type type, size_t constant_stride, int64_t ne0) {
+    if (type == GGML_TYPE_Q2_KPT) {
+        // Q2_KPT carries 4 floats of levels per 256-element super-block on every row
+        return (size_t)(ne0 / 256) * 4 * sizeof(float);
+    }
+    return constant_stride;
+}
+
 #include "ggml-backend-impl.h"
 #include <future>
@@ -77,10 +87,11 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
     const int min_rows_per_thread = std::max((int)(min_cols_per_thread/ne00), 1);
     const int n_threads = std::max(std::min(ctx->n_threads, (int)(ne01/min_rows_per_thread)), 1);

+    const size_t lrs = ggml_quant_levels_stride(src0->type, ggml_get_type_traits(src0->type)->levels_row_stride, src0->ne[0]);
 #ifdef GGML_USE_OPENMP
 #pragma omp parallel for num_threads(n_threads)
     for (int64_t i01 = 0; i01 < ne01; i01++) {
-        to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
+        to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00, (const char*)src0->quant_levels + i01*lrs);
     }
 #else
     for (int i = 1; i < n_threads; i++) {
@@ -89,7 +100,7 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
         if (start < end) {
             ctx->tasks.push_back(std::async(std::launch::async, [=]() {
                 for (int64_t i01 = start; i01 < end; i01++) {
-                    to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
+                    to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00, (const char*)src0->quant_levels + i01*lrs);
                 }
             }));
         }
@@ -99,7 +110,7 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
         const int64_t start = 0;
         const int64_t end   = ne01/n_threads;
         for (int64_t i01 = start; i01 < end; i01++) {
-            to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
+            to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00, (const char*)src0->quant_levels + i01*lrs);
         }
     }
 #endif
diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h
index f05683b44c..c00fc2c12d 100644
--- a/ggml/src/ggml-common.h
+++ b/ggml/src/ggml-common.h
@@ -298,6 +298,7 @@ typedef struct {
 } block_q2_K;
 static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");

+
 // 3-bit quantization
 // weight is represented as x = a * q
 // 16 blocks of 16 elements each
@@ -327,6 +328,12 @@ typedef struct {
 } block_q4_K;
 static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");

+// Q3_KPT: Q3_K with learned per-tensor levels
+// Reuses block_q3_K structure but maps 3-bit indices through learned level table
+typedef block_q3_K block_q3_kpt;
+#define Q3KPT_N_LEVELS 8
+
+
 // 5-bit quantization
 // 8 blocks of 32 elements each
 // weight is represented as x = a * q + b
@@ -449,6 +456,115 @@ typedef struct {
 } block_iq4_xs;
 static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");

+// 3.875 bpw - per-tensor Lloyd-Max scalar quantization
+// 256 elements = 16 sub-blocks of 16, 8-entry level table trained per tensor
+// Layout: 2 (d) + 2 (dmin) + 24 (scales: 32x6-bit) + 96 (qs: 256x3-bit) = 124 bytes
+typedef struct {
+    ggml_half d;               // 2 bytes: global scale for 16-elem sub-block ranges
+    ggml_half dmin;            // 2 bytes: global scale for sub-block neg_mins
+    uint8_t scales[3*QK_K/32]; // 24 bytes: 32 x 6-bit (indices 0..15 = ranges, 16..31 = neg_mins)
+    uint8_t qs[3*QK_K/8];      // 96 bytes: 256 x 3-bit Lloyd-Max level index, sequential
+} block_q3_pt;
+static_assert(sizeof(block_q3_pt) == 124, "wrong q3_pt block size");
+
+#define Q3PT_N_LEVELS 8
+
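For reference, a minimal scalar dequantizer for block_q3_pt (illustration only, not part of this patch). It assumes LSB-first sequential bit packing for the 6-bit scale fields and the 3-bit indices, the usual K-quant reconstruction x = (d*range)*level - (dmin*neg_min), and an 8-entry float level table arriving from tensor->quant_levels; the helper names are hypothetical.

// Sketch: reference dequantization for block_q3_pt under the assumptions above.
static inline int q3pt_get_bits(const uint8_t * p, int idx, int nbits) {
    const int bit = idx * nbits;
    int v = p[bit >> 3] >> (bit & 7);
    if ((bit & 7) + nbits > 8) {
        v |= p[(bit >> 3) + 1] << (8 - (bit & 7)); // field straddles a byte boundary
    }
    return v & ((1 << nbits) - 1);
}

static void dequantize_row_q3_pt_ref(const block_q3_pt * x, float * y, int64_t k,
                                     const float levels[Q3PT_N_LEVELS]) {
    for (int64_t i = 0; i < k/QK_K; ++i) {
        const float d    = GGML_FP16_TO_FP32(x[i].d);
        const float dmin = GGML_FP16_TO_FP32(x[i].dmin);
        for (int sb = 0; sb < QK_K/16; ++sb) {                       // 16 sub-blocks of 16
            const float range   = d    * q3pt_get_bits(x[i].scales, sb,      6);
            const float neg_min = dmin * q3pt_get_bits(x[i].scales, sb + 16, 6);
            for (int j = 0; j < 16; ++j) {
                const int q = q3pt_get_bits(x[i].qs, sb*16 + j, 3);  // Lloyd-Max level index
                y[i*QK_K + sb*16 + j] = range * levels[q] - neg_min;
            }
        }
    }
}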
+// Q4_DPT: IQ4_NL with learned per-tensor int8 levels (4.125 bpw)
+// Block format: identical to block_iq4_nl (2 + 16 = 18 bytes per 32 elements)
+typedef block_iq4_nl block_q4_dpt;
+#define Q4DPT_N_LEVELS 16
+
+// Q2_DPT: 2-bit per-tensor Lloyd-Max scalar quantization (2.5 bpw)
+// Block format: 2 bytes (FP16 scale) + 8 bytes (2-bit indices for 32 elements) = 10 bytes per block
+// 4 learned int8 levels per tensor, optimized via Lloyd-Max k-means
+typedef struct {
+    ggml_half d;   // 2 bytes: FP16 scale (delta)
+    uint8_t qs[8]; // 8 bytes: 2-bit indices (4 values per byte, 32 elements total)
+} block_q2_dpt;
+static_assert(sizeof(block_q2_dpt) == sizeof(ggml_half) + 8, "wrong q2_dpt block size/padding");
+
+#define QK2_DPT 32
+#define Q2DPT_N_LEVELS 4
+
+// Q2_KPT: Q2_K with learned per-tensor float levels (2.625 bpw)
+// Reuses block_q2_K structure but maps 2-bit indices through learned level table
+typedef block_q2_K block_q2_kpt;
+#define Q2KPT_N_LEVELS 4
+
+// IQ2_TQ: 2-bit scalar quantization with per-tensor trained asymmetric grid table (2.5625 bpw)
+// 32 groups of 8 elements per 256-element super-block
+// Block layout (82 bytes per 256 elements):
+// - ggml_half d (2 bytes): super-block scale
+// - uint8_t scales[16] (16 bytes): 32 × 4-bit grid entry index per group
+// - uint8_t qs[64] (64 bytes): 256 × 2-bit element index within grid entry
+// recon[j] = d * IQ2TQ_GRID_SCALE * grid[group_idx][elem_idx]
+typedef struct {
+    ggml_half d;             // Super-block scale (2 bytes)
+    uint8_t scales[QK_K/16]; // 32 × 4-bit grid entry index per group (16 bytes)
+    uint8_t qs[QK_K/4];      // 256 × 2-bit element index (64 bytes)
+} block_iq2_tq;
+static_assert(sizeof(block_iq2_tq) == 82, "wrong iq2_tq block size");
+// 2 + 16 + 64 = 82 bytes per 256 weights = 2.5625 bpw
+
+#define IQ2TQ_GROUP_SIZE 8                         // Elements per group
+#define IQ2TQ_N_GROUPS   (QK_K / IQ2TQ_GROUP_SIZE) // 32 groups per super-block
+#define IQ2TQ_GRID_SCALE 0.125f                    // Grid value multiplier: recon = d * GRID_SCALE * grid_int8
+
+// IQ3_TQ: 3-bit scalar quantization with per-tensor trained asymmetric grid table (3.5625 bpw)
+// 32 groups of 8 elements per 256-element super-block
+// Each grid entry has 8 int8 levels (3 bits → 8 values per element)
+// Grid table: 16 entries × 8 int8 = 128 bytes per tensor
+// Block layout:
+// - ggml_half d (2 bytes): super-block scale
+// - uint8_t scales[16] (16 bytes): 32 × 4-bit grid entry index per group
+// - uint8_t qs[96] (96 bytes): 256 × 3-bit element index within grid entry
+// recon[j] = d * IQ3TQ_GRID_SCALE * grid[group_idx][elem_idx]
+typedef struct {
+    ggml_half d;             // Super-block scale (2 bytes)
+    uint8_t scales[QK_K/16]; // 32 × 4-bit grid entry index per group (16 bytes)
+    uint8_t qs[3*QK_K/8];    // 256 × 3-bit element index (96 bytes)
+} block_iq3_tq;
+static_assert(sizeof(block_iq3_tq) == 114, "wrong iq3_tq block size");
+// 2 + 16 + 96 = 114 bytes per 256 weights = 3.5625 bpw
+
+#define IQ3TQ_GROUP_SIZE 8                         // Elements per group
+#define IQ3TQ_N_GROUPS   (QK_K / IQ3TQ_GROUP_SIZE) // 32 groups per super-block
+#define IQ3TQ_N_LEVELS   8                         // 3-bit → 8 levels per grid entry
+#define IQ3TQ_GRID_SCALE 0.125f                    // Grid value multiplier
+#define IQ3TQ_GRID_SIZE  128                       // 16 entries × 8 int8 = 128 bytes per tensor
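To make the grid-table indexing concrete, here is an illustrative scalar reconstruction of one iq3_tq super-block (a sketch under the layout comments above, not code from this patch; the LSB-first packing of the 3-bit indices and the grid parameter are assumptions, with the trained 16×8 int8 grid expected to arrive via tensor->quant_levels).

// Sketch only: scalar dequantization of a single block_iq3_tq.
static void dequantize_block_iq3_tq_ref(const block_iq3_tq * b, float * y,
                                        const int8_t grid[16][IQ3TQ_GROUP_SIZE]) {
    const float d = GGML_FP16_TO_FP32(b->d) * IQ3TQ_GRID_SCALE;
    for (int g = 0; g < IQ3TQ_N_GROUPS; ++g) {
        const int entry = (b->scales[g/2] >> (4*(g & 1))) & 0xf; // 4-bit grid entry index
        for (int j = 0; j < IQ3TQ_GROUP_SIZE; ++j) {
            const int bit = 3*(g*IQ3TQ_GROUP_SIZE + j);
            int q = b->qs[bit >> 3] >> (bit & 7);
            if ((bit & 7) > 5) {
                q |= b->qs[(bit >> 3) + 1] << (8 - (bit & 7)); // index straddles a byte
            }
            q &= 7;                                            // 3-bit element index
            y[g*IQ3TQ_GROUP_SIZE + j] = d * grid[entry][q];
        }
    }
}

+
+// IQ1_BN: 8D vector quantized with per-tensor trained 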
4096-entry codebook (1.5625 bpw) +// 32 groups of 8 elements per 256-element super-block +// Each group selects one of 4096 trained 8D vectors via 12-bit codebook index +// Codebook: 4096 entries × 8 int8 = 32768 bytes per tensor +// Block layout: +// - ggml_half d (2 bytes): super-block scale +// - uint8_t qs[48] (48 bytes): 32 × 12-bit codebook indices packed in pairs +// 12-bit pair packing (groups 2k, 2k+1 → 3 bytes at qs[3k]): +// idx_even = qs[3k] | ((qs[3k+1] & 0x0F) << 8) +// idx_odd = (qs[3k+1] >> 4) | (qs[3k+2] << 4) +// recon[g*8+k] = d * IQ1BN_GRID_SCALE * codebook[ci][k] +typedef struct { + ggml_half d; // Super-block scale (2 bytes) + uint8_t qs[3*QK_K/16]; // 32 × 12-bit codebook indices packed in pairs (48 bytes) +} block_iq1_bn; +static_assert(sizeof(block_iq1_bn) == 50, "wrong iq1_bn block size"); +// 2 + 48 = 50 bytes per 256 weights = 1.5625 bpw + +#define IQ1BN_GROUP_SIZE 8 +#define IQ1BN_N_GROUPS (QK_K / IQ1BN_GROUP_SIZE) // 32 +#define IQ1BN_CODEBOOK_K 4096 // number of codebook entries +#define IQ1BN_CODEBOOK_DIM 8 // vector dimension (= group size) +#define IQ1BN_GRID_SCALE 0.125f // Grid value multiplier +#define IQ1BN_CODEBOOK_SIZE (IQ1BN_CODEBOOK_K * IQ1BN_CODEBOOK_DIM) // 32768 bytes +#define IQ1BN_AUX_SIZE IQ1BN_CODEBOOK_SIZE // 32768 bytes + #endif // GGML_COMMON_DECL #endif // GGML_COMMON_DECL diff --git a/ggml/src/ggml-cpu/arch-fallback.h b/ggml/src/ggml-cpu/arch-fallback.h index c589a213e9..f2cdc1ebfc 100644 --- a/ggml/src/ggml-cpu/arch-fallback.h +++ b/ggml/src/ggml-cpu/arch-fallback.h @@ -33,6 +33,8 @@ #define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K #define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0 #define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K +#define ggml_vec_dot_q3_pt_q8_K_generic ggml_vec_dot_q3_pt_q8_K +#define ggml_vec_dot_q4_dpt_q8_0_generic ggml_vec_dot_q4_dpt_q8_0 // repack.cpp #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4 #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8 @@ -203,6 +205,15 @@ #define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0 #elif defined(__riscv) // quants.c +#define quantize_row_q8_K_generic quantize_row_q8_K +#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K +#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K +#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K +#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0 +#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K +#define ggml_vec_dot_q3_pt_q8_K_generic ggml_vec_dot_q3_pt_q8_K +#define ggml_vec_dot_q4_dpt_q8_0_generic ggml_vec_dot_q4_dpt_q8_0 +#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0 #define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0 #define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0 // repack.cpp @@ -307,6 +318,8 @@ #define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K #define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0 #define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K +#define ggml_vec_dot_q3_pt_q8_K_generic ggml_vec_dot_q3_pt_q8_K +#define ggml_vec_dot_q4_dpt_q8_0_generic ggml_vec_dot_q4_dpt_q8_0 #define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0 #define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0 #define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0 diff --git a/ggml/src/ggml-cpu/arch/arm/quants.c b/ggml/src/ggml-cpu/arch/arm/quants.c index e09db59cf2..ef95794730 
100644 --- a/ggml/src/ggml-cpu/arch/arm/quants.c +++ b/ggml/src/ggml-cpu/arch/arm/quants.c @@ -137,7 +137,111 @@ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in //===================================== Dot products ================================= -void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + const int qk = QK1_0; // 128 + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + GGML_UNUSED(levels); + + const block_q1_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + float sumf = 0.0f; + +#if defined(__ARM_NEON) + float32x4_t sumv = vdupq_n_f32(0.0f); + + for (int i = 0; i < nb; i++) { + const float d0 = GGML_CPU_FP16_TO_FP32(x[i].d); + + // Process 4 Q8_0 blocks (each has 32 elements) + for (int k = 0; k < 4; k++) { + const block_q8_0 * GGML_RESTRICT yb = &y[i * 4 + k]; + const float d1 = GGML_CPU_FP16_TO_FP32(yb->d); + + // Get the 4 bytes of bits for this Q8_0 block (32 bits = 4 bytes) + // Bits are at offset k*4 bytes in x[i].qs + const uint8_t * bits = &x[i].qs[k * 4]; + + // Load 32 int8 values from y + const int8x16_t y0 = vld1q_s8(yb->qs); + const int8x16_t y1 = vld1q_s8(yb->qs + 16); + + // Byte 0-1: bits for y0[0..15] + const uint64_t expand0 = table_b2b_0[bits[0]]; + const uint64_t expand1 = table_b2b_0[bits[1]]; + // Byte 2-3: bits for y1[0..15] + const uint64_t expand2 = table_b2b_0[bits[2]]; + const uint64_t expand3 = table_b2b_0[bits[3]]; + + // Build the sign vectors by reinterpreting the table values + uint8x8_t e0 = vcreate_u8(expand0); + uint8x8_t e1 = vcreate_u8(expand1); + uint8x8_t e2 = vcreate_u8(expand2); + uint8x8_t e3 = vcreate_u8(expand3); + + // Shift right by 4 to get 0 or 1 + int8x8_t s0 = vreinterpret_s8_u8(vshr_n_u8(e0, 4)); + int8x8_t s1 = vreinterpret_s8_u8(vshr_n_u8(e1, 4)); + int8x8_t s2 = vreinterpret_s8_u8(vshr_n_u8(e2, 4)); + int8x8_t s3 = vreinterpret_s8_u8(vshr_n_u8(e3, 4)); + + // Convert 0/1 to -1/+1: sign = 2*val - 1 + int8x8_t one = vdup_n_s8(1); + s0 = vsub_s8(vadd_s8(s0, s0), one); // 2*s0 - 1 + s1 = vsub_s8(vadd_s8(s1, s1), one); + s2 = vsub_s8(vadd_s8(s2, s2), one); + s3 = vsub_s8(vadd_s8(s3, s3), one); + + // Combine into 16-element vectors + int8x16_t signs0 = vcombine_s8(s0, s1); + int8x16_t signs1 = vcombine_s8(s2, s3); + + // Multiply signs with y values and accumulate + // dot(signs, y) where signs are +1/-1 + int32x4_t p0 = ggml_vdotq_s32(vdupq_n_s32(0), signs0, y0); + int32x4_t p1 = ggml_vdotq_s32(p0, signs1, y1); + + // Scale by d1 and accumulate + sumv = vmlaq_n_f32(sumv, vcvtq_f32_s32(p1), d0 * d1); + } + } + + sumf = vaddvq_f32(sumv); +#else + // Scalar fallback + for (int i = 0; i < nb; i++) { + const float d0 = GGML_FP16_TO_FP32(x[i].d); + + // Process 4 Q8_0 blocks + for (int k = 0; k < 4; k++) { + const float d1 = GGML_FP16_TO_FP32(y[i*4 + k].d); + + int sumi = 0; + for (int j = 0; j < QK8_0; j++) { + const int bit_index = k * QK8_0 + j; + const int byte_index = bit_index / 8; + const int bit_offset = bit_index % 8; + + const int xi = ((x[i].qs[byte_index] >> bit_offset) & 1) ? 
1 : -1;
+                sumi += xi * y[i*4 + k].qs[j];
+            }
+            sumf += d0 * d1 * sumi;
+        }
+    }
+#endif
+
+    *s = sumf;
+}
+
@@ -240,7 +344,7 @@ void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
 }

-void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
     const int qk = QK8_0;
     const int nb = n / qk;

@@ -533,7 +637,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = sumf;
 }

-void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
     const int qk = QK8_1;
     const int nb = n / qk;

@@ -753,12 +857,13 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
     *s = sumf;
 }

-void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
     assert(nrc == 1);
     UNUSED(nrc);
     UNUSED(bx);
     UNUSED(by);
     UNUSED(bs);
+    GGML_UNUSED(levels);
     assert(n % QK_NVFP4 == 0);

     const block_nvfp4 * GGML_RESTRICT x = vx;
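For orientation, this is roughly how a caller is expected to thread the per-tensor table through the widened ggml_vec_dot_t signature (a sketch under assumptions, not code from this patch: the surrounding mat-mul loop, the dst_row/q8_row names, and the NULL handling are illustrative; levels_row_stride comes from the CPU type traits and is 0 for per-tensor tables).

// Sketch: forwarding tensor->quant_levels into the new trailing vec_dot argument.
const struct ggml_type_traits_cpu * tt = ggml_get_type_traits_cpu(src0->type);
const char * lv = (const char *) src0->quant_levels;   // NULL for types without levels

for (int64_t i01 = 0; i01 < ne01; ++i01) {
    tt->vec_dot(ne00, &dst_row[i01], 0,
                (const char *) src0->data + i01*nb01, 0,
                q8_row, 0, /*nrc=*/1,
                lv ? lv + i01*tt->levels_row_stride : NULL); // stride 0 -> same table every row
}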
@@ -837,7 +942,7 @@ void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
     *s = sumf;
 }

-void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
     const int qk = QK8_0;
     const int nb = n / qk;

@@ -949,7 +1054,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = sumf;
 }

-void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
     const int qk = QK8_1;
     const int nb = n / qk;

@@ -1067,7 +1172,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = sumf;
 }

-void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
     const int qk = QK8_0;
     const int nb = n / qk;

@@ -3953,6 +4058,10 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
 #endif
 }

+void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
+    ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
+}
+
 void ggml_vec_dot_iq1_s_q8_K(int n, float * 
GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); diff --git a/ggml/src/ggml-cpu/arch/loongarch/quants.c b/ggml/src/ggml-cpu/arch/loongarch/quants.c index 74e0c086c6..686deb57f3 100644 --- a/ggml/src/ggml-cpu/arch/loongarch/quants.c +++ b/ggml/src/ggml-cpu/arch/loongarch/quants.c @@ -644,7 +644,7 @@ static inline __m128i get_scale_shuffle(int i) { } #endif -void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_0; const int nb = n / qk; @@ -772,7 +772,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi *s = sumf; } -void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_1; const int nb = n / qk; @@ -827,11 +827,11 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi UNUSED(y); UNUSED(ib); UNUSED(sumf); - ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_0; const int nb = n / qk; @@ -880,11 +880,11 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi UNUSED(sumf); UNUSED(x); UNUSED(y); - ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_1; const int nb = n / qk; @@ -936,11 +936,11 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi UNUSED(sumf); UNUSED(x); UNUSED(y); - ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_0; const int nb = n / qk; @@ -983,7 +983,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi 
     UNUSED(sumf);
     UNUSED(x);
     UNUSED(y);
-    ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+    ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
 #endif
 }

@@ -1956,6 +1956,10 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
 #endif
 }

+void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
+    ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
+}
+
 #if defined(__loongarch_asx)
 static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
     const __m256i a = __lasx_xvmulwev_h_b(x, y);
diff --git a/ggml/src/ggml-cpu/arch/powerpc/quants.c b/ggml/src/ggml-cpu/arch/powerpc/quants.c
index 644c380c73..c8fc0a3766 100644
--- a/ggml/src/ggml-cpu/arch/powerpc/quants.c
+++ b/ggml/src/ggml-cpu/arch/powerpc/quants.c
@@ -141,7 +141,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i

 //===================================== Dot products =================================

-void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
     const int qk = QK8_0;
     const int nb = n / qk;

@@ -207,11 +207,11 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     UNUSED(y);
     UNUSED(ib);
     UNUSED(sumf);
-    ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+    ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
 #endif
 }

-void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
     const int qk = QK8_1;
     const int nb = n / qk;

@@ -274,7 +274,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
     UNUSED(y);
     UNUSED(ib);
     UNUSED(sumf);
-    ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
+    ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
 #endif
 }

@@ -340,11 +340,11 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
     UNUSED(y);
     UNUSED(ib);
     UNUSED(sumf);
-    ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+    ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
 #endif
 }

-void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
     const int qk = QK8_0;
     const int nb = n / qk;

@@ -412,11 +412,11 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     UNUSED(sumf);
     UNUSED(x);
     UNUSED(y);
-    ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+    ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
 #endif
 }
-void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
     const int qk = QK8_1;
     const int nb = n / qk;

@@ -488,11 +488,11 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
     UNUSED(sumf);
     UNUSED(x);
     UNUSED(y);
-    ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
+    ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
 #endif
 }

-void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
     const int qk = QK8_0;
     const int nb = n / qk;

@@ -557,7 +557,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     UNUSED(y);
     UNUSED(ib);
     UNUSED(sumf);
-    ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+    ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
 #endif
 }

@@ -2000,6 +2000,10 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
 #endif
 }

+void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
+    ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
+}
+
 void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
     assert(nrc == 1);
@@ -2190,7 +2194,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
     UNUSED(nb);
     UNUSED(ib);
     UNUSED(sumf);
-    ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+    ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
 #endif
 }
diff --git a/ggml/src/ggml-cpu/arch/riscv/quants.c b/ggml/src/ggml-cpu/arch/riscv/quants.c
index d7e9ba4634..471680dd6f 100644
--- a/ggml/src/ggml-cpu/arch/riscv/quants.c
+++ b/ggml/src/ggml-cpu/arch/riscv/quants.c
@@ -213,7 +213,7 @@ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in

 //===================================== Dot products =================================

-void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
 #if defined(__riscv_v)
     const int qk = QK8_0;
     const int nb = n / qk;

@@ -264,11 +264,11 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = sumf;

 #else
-    ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+    ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
 #endif
 }
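As a concrete picture of what a levels-based kernel computes, here is an illustrative scalar Q2_DPT × Q8_0 dot product in the shape of the new signature (a sketch, not code from this patch: the 2-bit LSB-first packing and the reduced parameter list are assumptions made for readability; the four trained int8 levels arrive through the trailing pointer).

// Sketch: scalar Q2_DPT x Q8_0 dot product; QK2_DPT == QK8_0 == 32.
static void vec_dot_q2_dpt_q8_0_ref(int n, float * s, const void * vx, const void * vy, const void * levels) {
    const block_q2_dpt * x = (const block_q2_dpt *) vx;
    const block_q8_0   * y = (const block_q8_0   *) vy;
    const int8_t * lv = (const int8_t *) levels;   // Q2DPT_N_LEVELS trained values
    float sumf = 0.0f;
    for (int i = 0; i < n/QK2_DPT; ++i) {
        int sumi = 0;
        for (int j = 0; j < QK2_DPT; ++j) {
            const int q = (x[i].qs[j/4] >> (2*(j & 3))) & 3; // 2-bit index (LSB-first assumed)
            sumi += lv[q] * y[i].qs[j];
        }
        sumf += GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) * sumi;
    }
    *s = sumf;
}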
-void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
 #if defined(__riscv_v)
     const int qk = QK8_1;
     const int nb = n / qk;

@@ -315,11 +315,11 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = sumf;

 #else
-    ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
+    ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
 #endif
 }

-void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
 #if defined(__riscv_v)
     const int qk = QK8_0;
     const int nb = n / qk;

@@ -369,11 +369,11 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = sumf;

 #else
-    ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+    ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
 #endif
 }

-void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
 #if defined(__riscv_v)
     const int qk = QK8_1;
     const int nb = n / qk;

@@ -422,11 +422,11 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = sumf;

 #else
-    ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
+    ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
 #endif
 }

-void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
     const int qk = QK8_0;
     const int nb = n / qk;

@@ -470,7 +470,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     UNUSED(ib);
     UNUSED(sumf);

-    ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+    ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
 #endif
 }

@@ -2954,6 +2954,10 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
 #endif
 }

+void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
+    ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
+}
+
 #if defined __riscv_v_intrinsic
 static void ggml_vec_dot_iq3_xxs_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
diff --git a/ggml/src/ggml-cpu/arch/s390/quants.c b/ggml/src/ggml-cpu/arch/s390/quants.c
index 500857579a..c118690192 100644
--- a/ggml/src/ggml-cpu/arch/s390/quants.c
+++ b/ggml/src/ggml-cpu/arch/s390/quants.c
@@ -146,7 +146,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
//===================================== Dot products ================================= -void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_0; const int nb = n / qk; @@ -201,11 +201,11 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi UNUSED(y); UNUSED(ib); UNUSED(sumf); - ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_1; const int nb = n / qk; @@ -258,7 +258,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi UNUSED(y); UNUSED(ib); UNUSED(sumf); - ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } @@ -353,11 +353,11 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo UNUSED(y); UNUSED(ib); UNUSED(sumf); - ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_0; const int nb = n / qk; @@ -495,11 +495,11 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi UNUSED(y); UNUSED(ib); UNUSED(sumf); - ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_1; const int nb = n / qk; @@ -648,11 +648,11 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi UNUSED(y); UNUSED(ib); UNUSED(sumf); - ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { const int qk = QK8_0; const int nb = n / qk; @@ -698,7 +698,7 @@ void 
ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     UNUSED(y);
     UNUSED(ib);
     UNUSED(sumf);
-    ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+    ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
 #endif
 }

@@ -1388,7 +1388,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
     UNUSED(nb);
     UNUSED(ib);
     UNUSED(sumf);
-    ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+    ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
 #endif
 }

@@ -1463,3 +1463,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
     ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
+
+void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
+    ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
+}
diff --git a/ggml/src/ggml-cpu/arch/wasm/quants.c b/ggml/src/ggml-cpu/arch/wasm/quants.c
index 648c6fcaba..62a6674347 100644
--- a/ggml/src/ggml-cpu/arch/wasm/quants.c
+++ b/ggml/src/ggml-cpu/arch/wasm/quants.c
@@ -229,7 +229,7 @@ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in

 //===================================== Dot products =================================

-void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
     const int qk = QK8_0;
     const int nb = n / qk;

@@ -355,7 +355,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = sumf;
 }

-void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
     const int qk = QK8_0;
     const int nb = n / qk;

@@ -442,11 +442,11 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     UNUSED(sumf);
     UNUSED(x);
     UNUSED(y);
-    ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+    ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
 #endif
 }

-void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
     const int qk = QK8_1;
     const int nb = n / qk;

@@ -537,11 +537,11 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
     UNUSED(sumf);
     UNUSED(x);
     UNUSED(y);
-    ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
+    ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
 #endif
 }
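Stepping back from the per-arch plumbing: registration is where vec_dot, its companion activation type, and the new levels_row_stride meet. A sketch of what an entry for a levels-carrying type might look like in the CPU type-traits table (field meanings follow the comments in ggml-cpu.h; quantize_row_q3_pt is a hypothetical name, and the exact initializer form is illustrative rather than lifted from this patch):

// Hypothetical registration of a per-tensor-levels type (sketch only).
[GGML_TYPE_Q3_PT] = {
    .from_float        = quantize_row_q3_pt,   // assumed encoder name
    .vec_dot           = ggml_vec_dot_q3_pt_q8_K,
    .vec_dot_type      = GGML_TYPE_Q8_K,       // activations quantized to q8_K
    .nrows             = 1,
    .levels_row_stride = 0,                    // 0 = one shared table for the whole tensor
},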
-void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
     const int qk = QK8_0;
     const int nb = n / qk;

@@ -605,7 +605,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     UNUSED(y);
     UNUSED(ib);
     UNUSED(sumf);
-    ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+    ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
 #endif
 }

@@ -1218,3 +1218,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
+
+void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
+    ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
+}
diff --git a/ggml/src/ggml-cpu/arch/x86/quants.c b/ggml/src/ggml-cpu/arch/x86/quants.c
index 74d699f633..f609b5d186 100644
--- a/ggml/src/ggml-cpu/arch/x86/quants.c
+++ b/ggml/src/ggml-cpu/arch/x86/quants.c
@@ -540,7 +540,8 @@ static inline __m128i get_scale_shuffle(int i) {
 }
 #endif

-void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
+    GGML_UNUSED(levels);
     const int qk = QK8_0;
     const int nb = n / qk;

@@ -698,7 +699,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = sumf;
 }

-void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
+    GGML_UNUSED(levels);
     const int qk = QK8_1;
     const int nb = n / qk;

@@ -753,11 +755,12 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
     UNUSED(x);
     UNUSED(y);
     UNUSED(ib);
-    ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
+    ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
 #endif
 }

-void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
+    GGML_UNUSED(levels);
     assert(nrc == 1);
     UNUSED(nrc);
     UNUSED(bx);
@@ -843,7 +846,8 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
     *s = sumf;
 }

-void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
+    GGML_UNUSED(levels);
     const int qk = QK8_0;
     const int nb = n / qk;

@@ -919,11 +923,12 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     UNUSED(ib);
     UNUSED(x);
     UNUSED(y);
-    ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+    ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, 
nrc, levels); #endif } -void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); const int qk = QK8_1; const int nb = n / qk; @@ -1005,11 +1010,12 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi UNUSED(ib); UNUSED(x); UNUSED(y); - ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); const int qk = QK8_0; const int nb = n / qk; @@ -1077,7 +1083,8 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi *s = sumf; } -void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -1205,11 +1212,12 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo UNUSED(x); UNUSED(y); UNUSED(nb); - ggml_vec_dot_tq1_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_tq1_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -1271,11 +1279,12 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo UNUSED(x); UNUSED(y); UNUSED(nb); - ggml_vec_dot_tq2_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_tq2_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -1463,11 +1472,12 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi UNUSED(x); UNUSED(y); UNUSED(nb); - ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t 
by, int nrc) { +void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -1735,11 +1745,12 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi UNUSED(x); UNUSED(y); UNUSED(nb); - ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -1913,11 +1924,12 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi UNUSED(kmask2); UNUSED(kmask3); UNUSED(utmp); - ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -2123,11 +2135,12 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi UNUSED(kmask2); UNUSED(kmask3); UNUSED(utmp); - ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -2328,7 +2341,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi UNUSED(x); UNUSED(y); UNUSED(nb); - ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } @@ -2369,7 +2382,8 @@ static const int8_t keven_signs_q2xs[1024] = { }; #endif -void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -2483,11 +2497,12 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const UNUSED(x); UNUSED(y); UNUSED(nb); - ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } 
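The IQ1_BN layout documented in ggml-common.h packs two 12-bit codebook indices into three bytes; since no kernel for it appears in this part of the diff, here is a standalone sketch of just that unpacking step (illustration only; the helper name is hypothetical and follows the packing comments verbatim).

// Unpack the 12-bit codebook indices for groups 2k and 2k+1 of a block_iq1_bn
// (sketch, not patch code; mirrors the packing documented in ggml-common.h).
static inline void iq1_bn_unpack_pair(const uint8_t * qs, int k, int * idx_even, int * idx_odd) {
    *idx_even =  qs[3*k + 0]       | ((qs[3*k + 1] & 0x0f) << 8); // low byte + low nibble
    *idx_odd  = (qs[3*k + 1] >> 4) |  (qs[3*k + 2]         << 4); // high nibble + high byte
}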
-void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -2780,11 +2795,12 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v UNUSED(x); UNUSED(y); UNUSED(nb); - ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -2965,11 +2981,12 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo UNUSED(x); UNUSED(y); UNUSED(nb); - ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -3089,11 +3106,12 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const UNUSED(x); UNUSED(y); UNUSED(nb); - ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -3299,11 +3317,17 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo UNUSED(x); UNUSED(y); UNUSED(nb); - ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); + ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels); +} + +void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + 
GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -3418,11 +3442,12 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo UNUSED(x); UNUSED(y); UNUSED(nb); - ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -3625,11 +3650,12 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo UNUSED(y); UNUSED(nb); UNUSED(scale); - ggml_vec_dot_iq1_m_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_iq1_m_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } -void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -3713,7 +3739,185 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v *s = sumf; } -void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q4_dpt_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + assert(n % QK4_NL == 0); + static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same"); + + const block_q4_dpt * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + const int nb = n / QK4_NL; + + const int8_t * values = (const int8_t *)levels; + GGML_ASSERT(values != NULL && "Q4_DPT levels not set for tensor"); + + int ib = 0; + float sumf = 0; + +#if defined __AVX2__ + + const __m128i values128 = _mm_loadu_si128((const __m128i*)values); + const __m128i m4b = _mm_set1_epi8(0x0f); + const __m256i mone = _mm256_set1_epi16(1); + + __m256 accum1 = _mm256_setzero_ps(); + __m256 accum2 = _mm256_setzero_ps(); + for (; ib + 1 < nb; ib += 2) { + const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[ib + 0].qs); + const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[ib + 1].qs); + const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[ib + 0].qs); + const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[ib + 1].qs); + const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)), + _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b))); + const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)), + _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b))); + const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1); + const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2); + const __m256i p_1 = 
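// mul_add_epi8 pair-sums the int8 level*q8 products into 16-bit lanes;
// the madd against mone then folds adjacent pairs into 32-bit sums, so
// each block's integer dot product is scaled exactly once by
// d = x.d * y.d in the fmadd below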
_mm256_madd_epi16(p16_1, mone); + const __m256i p_2 = _mm256_madd_epi16(p16_2, mone); + accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_CPU_FP16_TO_FP32(x[ib + 0].d)), + _mm256_cvtepi32_ps(p_1), accum1); + accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_CPU_FP16_TO_FP32(x[ib + 1].d)), + _mm256_cvtepi32_ps(p_2), accum2); + } + + sumf = hsum_float_8(_mm256_add_ps(accum1, accum2)); + +#elif defined __AVX__ + const __m128i values128 = _mm_loadu_si128((const __m128i*)values); + const __m128i m4b = _mm_set1_epi8(0x0f); + + __m256 accum = _mm256_setzero_ps(); + for (; ib + 1 < nb; ib += 2) { + const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs); + const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs); + const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs); + const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1); + const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs); + const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1); + + const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)); + const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)); + const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)); + const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)); + + const __m256 p = mul_sum_i8_quad_float(q4b_1_0, q4b_1_1, q4b_2_0, q4b_2_1, q8b_1_0, q8b_1_1, q8b_2_0, q8b_2_1); + const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d); + accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum); + } + + sumf = hsum_float_8(accum); + +#endif + for (; ib < nb; ++ib) { + const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d); + int sumi1 = 0, sumi2 = 0; + for (int j = 0; j < QK4_NL/2; ++j) { + sumi1 += y[ib].qs[j+ 0] * values[x[ib].qs[j] & 0xf]; + sumi2 += y[ib].qs[j+QK4_NL/2] * values[x[ib].qs[j] >> 4]; + } + sumf += d * (sumi1 + sumi2); + } + *s = sumf; +} + +void ggml_vec_dot_q2_dpt_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + assert(n % QK2_DPT == 0); + static_assert(QK2_DPT == QK8_0, "QK2_DPT and QK8_0 must be the same"); + + const block_q2_dpt * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + const int nb = n / QK2_DPT; + + const int8_t * values = (const int8_t *)levels; + GGML_ASSERT(values != NULL && "Q2_DPT levels not set for tensor"); + + int ib = 0; + float sumf = 0; + +#if defined __AVX2__ + + const __m128i values128 = _mm_loadu_si128((const __m128i*)values); + const __m128i m3 = _mm_set1_epi8(0x03); + + __m256 accum = _mm256_setzero_ps(); + for (; ib + 1 < nb; ib += 2) { + const __m128i q2bits_1 = _mm_loadu_si128((const __m128i*)x[ib + 0].qs); + const __m128i q2bits_2 = _mm_loadu_si128((const __m128i*)x[ib + 1].qs); + const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[ib + 0].qs); + const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[ib + 1].qs); + + // Extract 2-bit indices and lookup values - process 8 elements at a time + // For each byte of q2bits, we have 4 x 2-bit indices + const __m128i q2_01_l = _mm_shuffle_epi8(values128, _mm_and_si128(q2bits_1, m3)); + const 
__m128i q2_01_h = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q2bits_1, 2), m3)); + const __m128i q2_02_l = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q2bits_1, 4), m3)); + const __m128i q2_02_h = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q2bits_1, 6), m3)); + const __m128i q2_11_l = _mm_shuffle_epi8(values128, _mm_and_si128(q2bits_2, m3)); + const __m128i q2_11_h = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q2bits_2, 2), m3)); + const __m128i q2_12_l = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q2bits_2, 4), m3)); + const __m128i q2_12_h = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q2bits_2, 6), m3)); + + // Combine pairs into __m256i + const __m256i q4b_1a = MM256_SET_M128I(q2_01_h, q2_01_l); + const __m256i q4b_1b = MM256_SET_M128I(q2_02_h, q2_02_l); + const __m256i q4b_2a = MM256_SET_M128I(q2_11_h, q2_11_l); + const __m256i q4b_2b = MM256_SET_M128I(q2_12_h, q2_12_l); + + // Split q8 into pairs and compute dot products + const __m256i q8b_1a = _mm256_and_si256(q8b_1, _mm256_set1_epi16(0x00ff)); + const __m256i q8b_1b = _mm256_srli_epi16(q8b_1, 8); + const __m256i q8b_2a = _mm256_and_si256(q8b_2, _mm256_set1_epi16(0x00ff)); + const __m256i q8b_2b = _mm256_srli_epi16(q8b_2, 8); + + const __m256i p16_1a = mul_add_epi8(q4b_1a, q8b_1a); + const __m256i p16_1b = mul_add_epi8(q4b_1b, q8b_1b); + const __m256i p16_2a = mul_add_epi8(q4b_2a, q8b_2a); + const __m256i p16_2b = mul_add_epi8(q4b_2b, q8b_2b); + + const __m256i mone = _mm256_set1_epi16(1); + const __m256i p_1 = _mm256_add_epi32(_mm256_madd_epi16(p16_1a, mone), _mm256_madd_epi16(p16_1b, mone)); + const __m256i p_2 = _mm256_add_epi32(_mm256_madd_epi16(p16_2a, mone), _mm256_madd_epi16(p16_2b, mone)); + + accum = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_CPU_FP16_TO_FP32(x[ib + 0].d)), + _mm256_cvtepi32_ps(p_1), accum); + accum = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_CPU_FP16_TO_FP32(x[ib + 1].d)), + _mm256_cvtepi32_ps(p_2), accum); + } + + sumf = hsum_float_8(accum); + +#endif + for (; ib < nb; ++ib) { + const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d); + int sumi = 0; + for (int j = 0; j < QK2_DPT/4; ++j) { + uint8_t q = x[ib].qs[j]; + sumi += y[ib].qs[j*4 + 0] * values[(q >> 0) & 3]; + sumi += y[ib].qs[j*4 + 1] * values[(q >> 2) & 3]; + sumi += y[ib].qs[j*4 + 2] * values[(q >> 4) & 3]; + sumi += y[ib].qs[j*4 + 3] * values[(q >> 6) & 3]; + } + sumf += d * sumi; + } + *s = sumf; +} + +void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -3815,6 +4019,6 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v UNUSED(x); UNUSED(y); UNUSED(nb); - ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); + ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels); #endif } diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 2b3eb5b5ce..3d1abc1a6e 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -3,6 +3,7 @@ #include "ggml-backend-impl.h" #include "ggml-backend.h" +#include "ggml-quants.h" #include "traits.h" #include "ggml-cpu-impl.h" #include "ggml-impl.h" @@ -396,6 +397,52 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { 
.vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, }, + [GGML_TYPE_Q3_PT] = { + // from_float not set — requires codebook initialization via q3pt_set_codebook() + .vec_dot = ggml_vec_dot_q3_pt_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, + [GGML_TYPE_Q3_KPT] = { + // from_float not set — requires level initialization via q3kpt_set_levels() + .vec_dot = ggml_vec_dot_q3_kpt_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, + [GGML_TYPE_Q4_DPT] = { + // from_float not set — requires level initialization via q4dpt_set_levels() + .vec_dot = ggml_vec_dot_q4_dpt_q8_0, + .vec_dot_type = GGML_TYPE_Q8_0, + .nrows = 1, + }, + [GGML_TYPE_Q2_DPT] = { + // from_float not set — requires level initialization via q2dpt_set_levels() + .vec_dot = ggml_vec_dot_q2_dpt_q8_0, + .vec_dot_type = GGML_TYPE_Q8_0, + .nrows = 1, + }, + [GGML_TYPE_Q2_KPT] = { + // from_float not set — requires level initialization via q2kpt_set_levels() + .vec_dot = ggml_vec_dot_q2_kpt_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + .levels_row_stride = 0, // computed dynamically: (ne00/QK_K)*Q2KPT_N_LEVELS*sizeof(float) + }, + [GGML_TYPE_IQ2_TQ] = { + .vec_dot = ggml_vec_dot_iq2_tq_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, + [GGML_TYPE_IQ3_TQ] = { + .vec_dot = ggml_vec_dot_iq3_tq_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, + [GGML_TYPE_IQ1_BN] = { + .vec_dot = ggml_vec_dot_iq1_bn_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, [GGML_TYPE_I32] = { .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_i32, }, @@ -1165,8 +1212,15 @@ static void ggml_compute_forward_mul_mat_one_chunk( const bool src1_cont = ggml_is_contiguous(src1); - ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot; - enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; + ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot; + enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; + // For Q2_KPT, levels are per-block: stride = (ne00 / QK_K) * Q2KPT_N_LEVELS * sizeof(float) + // ne00 is the number of elements per row in src0 (input dimension), NOT ne0 (= ne01 = output rows). + // For non-square matrices (e.g. ffn_up: [hidden, intermediate]) ne00 != ne01, so ne00 is correct. + // For other types, use the static stride from type_traits_cpu + const size_t levels_row_stride = (type == GGML_TYPE_Q2_KPT) + ? (ne00 / QK_K) * Q2KPT_N_LEVELS * sizeof(float) + : type_traits_cpu[type].levels_row_stride; // broadcast factors const int64_t r2 = ne12 / ne02; @@ -1227,7 +1281,11 @@ static void ggml_compute_forward_mul_mat_one_chunk( //} for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) { - vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot); + // For Q2_KPT, levels are stored per-expert: [expert0_rows, expert1_rows, ...] + // So for 3D tensors we need to index by (i03 * ne01 + ir0) + const size_t levels_row_idx = (type == GGML_TYPE_Q2_KPT && ne03 > 1) ? (i03 * ne01 + ir0) : ir0; + const void * row_levels = (const char*)src0->quant_levels + levels_row_idx * levels_row_stride; + vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ?
src1_col_stride : 0), num_rows_per_vec_dot, row_levels); } for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) { @@ -1293,7 +1351,8 @@ void ggml_compute_forward_mul_mat( nb1/ggml_type_size(dst->type), src0->type, src1->type, - dst->type)) + dst->type, + src0->quant_levels)) goto UseGgmlGemm1; return; } @@ -1361,7 +1420,8 @@ UseGgmlGemm1:; nb1/ggml_type_size(dst->type), src0->type, vec_dot_type, - dst->type)) + dst->type, + src0->quant_levels)) goto UseGgmlGemm2; return; } @@ -1461,8 +1521,14 @@ static void ggml_compute_forward_mul_mat_id_one_chunk( const enum ggml_type type = src0->type; - ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot; - enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; + ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot; + enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; + // For Q2_KPT, levels are per-block: stride = (ne00 / QK_K) * Q2KPT_N_LEVELS * sizeof(float) + // ne00 is the input dimension (elements per row in src0), NOT ne0 (= ne01 = output rows). + // For other types, use the static stride from type_traits_cpu + const size_t levels_row_stride = (type == GGML_TYPE_Q2_KPT) + ? (ne00 / QK_K) * Q2KPT_N_LEVELS * sizeof(float) + : type_traits_cpu[type].levels_row_stride; const int64_t blck_0 = 16; const int64_t blck_1 = 16; @@ -1495,7 +1561,8 @@ static void ggml_compute_forward_mul_mat_id_one_chunk( float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2)); for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) { - vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1); + const void * row_levels = (const char*)src0->quant_levels + (cur_a * ne01 + ir0) * levels_row_stride; + vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1, row_levels); } memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir0_end) - iir0)*sizeof(float)); diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp index 34e320e2f5..e934515efb 100644 --- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp +++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp @@ -1356,16 +1356,20 @@ class tinyBLAS_Q0_AVX { const TA *A, int64_t lda, const TB *B, int64_t ldb, TC *C, int64_t ldc, - int ith, int nth) + int ith, int nth, + const int8_t * custom_table = nullptr) : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) { - const int8_t kvalues_iq4nl[16] = { - -127, -104, -83, -65, - -49, -35, -22, -10, - 1, 13, 25, 38, - 53, 69, 89, 113 - }; - - iq4nlt = _mm_loadu_si128((const __m128i *)kvalues_iq4nl); + if (custom_table) { + iq4nlt = _mm_loadu_si128((const __m128i *)custom_table); + } else { + const int8_t kvalues_iq4nl[16] = { + -127, -104, -83, -65, + -49, -35, -22, -10, + 1, 13, 25, 38, + 53, 69, 89, 113 + }; + iq4nlt = _mm_loadu_si128((const __m128i *)kvalues_iq4nl); + } } void matmul(int64_t m, int64_t n) { @@ -3684,7 +3688,7 @@ class tinyBLAS_PPC { */ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64_t n, int64_t k, const void *A, int64_t lda, const void *B, int64_t ldb, void *C, - int64_t ldc, int Atype, int Btype, int Ctype) { + int64_t ldc, int Atype, int Btype, int Ctype, const void * quant_levels) { assert(m >= 0); assert(n >= 0); @@ -4024,6 +4028,26 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64 #endif } + case GGML_TYPE_Q4_DPT: { + if (Btype != GGML_TYPE_Q8_0) + return false; +#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__) + // Q4_DPT has identical 
block layout to IQ4_NL (block_q4_dpt = block_iq4_nl) + // but uses a per-tensor lookup table instead of the fixed IQ4_NL values. + const int8_t * levels = (const int8_t *)quant_levels; + if (!levels) return false; + tinyBLAS_Q0_AVX tb{ + k, (const block_iq4_nl *)A, lda, + (const block_q8_0 *)B, ldb, + (float *)C, ldc, + params->ith, params->nth, levels}; + tb.matmul(m, n); + return true; +#else + return false; +#endif + } + default: return false; } diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.h b/ggml/src/ggml-cpu/llamafile/sgemm.h index 867b0c04ae..117a36560e 100644 --- a/ggml/src/ggml-cpu/llamafile/sgemm.h +++ b/ggml/src/ggml-cpu/llamafile/sgemm.h @@ -18,7 +18,7 @@ extern "C" { bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t, int64_t, int64_t, const void *, int64_t, const void *, int64_t, void *, int64_t, - int, int, int); + int, int, int, const void * quant_levels); #ifdef __cplusplus } diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index a9bc21da6f..3c09311eaf 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -8,6 +8,19 @@ #include "unary-ops.h" #include "vec.h" +// Helper: compute quant_levels stride for a given row. +// For most types this is the constant levels_row_stride from type_traits. +// For Q2_KPT (per-block levels), stride depends on tensor width (ne[0]). +static inline size_t ggml_quant_levels_stride(ggml_type type, size_t constant_stride, int64_t ne0) { + if (type == GGML_TYPE_Q2_KPT) { + // Q2_KPT has Q2KPT_N_LEVELS floats per 256-element block + // Stride = (ne0 / 256) * Q2KPT_N_LEVELS * sizeof(float) + return (size_t)(ne0 / 256) * 4 * sizeof(float); + } + return constant_stride; +} + + #include #include #include @@ -517,9 +530,11 @@ static void ggml_compute_forward_dup_from_q( const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10; const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13; + const size_t q_lrs0 = ggml_quant_levels_stride(src0->type, ggml_get_type_traits_cpu(src0->type)->levels_row_stride, src0->ne[0]); dequantize_row_q( (const void *) ((char *) src0->data + x_offset), - (float *) ((char *) dst->data + dst_offset), qk); + (float *) ((char *) dst->data + dst_offset), qk, + (const char*)src0->quant_levels + i01 * q_lrs0); } } @@ -639,7 +654,8 @@ static void ggml_compute_forward_add_q_f32( assert(ne00 % 32 == 0); // unquantize row from src0 to temp buffer - dequantize_row_q(src0_row, wdata, ne00); + const size_t q_lrs_add = ggml_quant_levels_stride(src0->type, ggml_get_type_traits_cpu(src0->type)->levels_row_stride, src0->ne[0]); + dequantize_row_q(src0_row, wdata, ne00, (const char*)src0->quant_levels + i1 * q_lrs_add); // add src1 ggml_vec_acc_f32(ne00, wdata, src1_row); // quantize row to dst @@ -688,6 +704,9 @@ void ggml_compute_forward_add( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ2_TQ: + case GGML_TYPE_IQ3_TQ: + case GGML_TYPE_IQ1_BN: { ggml_compute_forward_add_q_f32(params, dst); } break; @@ -974,7 +993,8 @@ static void ggml_compute_forward_add1_q_f32( assert(ne0 % 32 == 0); // unquantize row from src0 to temp buffer - dequantize_row_q(src0_row, wdata, ne0); + const size_t q_lrs_add = ggml_quant_levels_stride(src0->type, ggml_get_type_traits_cpu(src0->type)->levels_row_stride, src0->ne[0]); + dequantize_row_q(src0_row, wdata, ne00, (const char*)src0->quant_levels + i1 * q_lrs_add); // add src1 ggml_vec_acc1_f32(ne0, wdata, v); // quantize row to dst @@ -1139,6 +1159,9 @@ void ggml_compute_forward_add1( case 
GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ2_TQ: + case GGML_TYPE_IQ3_TQ: + case GGML_TYPE_IQ1_BN: { ggml_compute_forward_add1_q_f32(params, dst); } break; @@ -1269,6 +1292,9 @@ void ggml_compute_forward_acc( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ2_TQ: + case GGML_TYPE_IQ3_TQ: + case GGML_TYPE_IQ1_BN: default: { GGML_ABORT("fatal error"); @@ -4321,7 +4347,8 @@ static void ggml_compute_forward_out_prod_q_f32( float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); - dequantize_row_q(s0, wdata, ne0); + const size_t q_lrs_op = ggml_quant_levels_stride(src0->type, ggml_get_type_traits_cpu(src0->type)->levels_row_stride, src0->ne[0]); + dequantize_row_q(s0, wdata, ne0, (const char*)src0->quant_levels + i01 * q_lrs_op); ggml_vec_mad_f32(ne0, d, wdata, *s1); } } @@ -4358,6 +4385,9 @@ void ggml_compute_forward_out_prod( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ2_TQ: + case GGML_TYPE_IQ3_TQ: + case GGML_TYPE_IQ1_BN: { ggml_compute_forward_out_prod_q_f32(params, dst); } break; @@ -4635,6 +4665,9 @@ void ggml_compute_forward_set( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ2_TQ: + case GGML_TYPE_IQ3_TQ: + case GGML_TYPE_IQ1_BN: default: { GGML_ABORT("fatal error"); @@ -4698,9 +4731,21 @@ static void ggml_compute_forward_get_rows_q( GGML_ASSERT(i01 >= 0 && i01 < ne01); + const size_t q_lrs_gr = ggml_quant_levels_stride(src0->type, ggml_get_type_traits_cpu(src0->type)->levels_row_stride, src0->ne[0]); + // For Q2_KPT with 3D tensors, levels are indexed by [i12 * ne02 * ne01 + i11 * ne01 + i01] + // For 2D tensors, levels are indexed by [i11 * ne01 + i01] (or just [i01] if ne02 == 1) + size_t levels_row_idx; + if (type == GGML_TYPE_Q2_KPT && ne03 > 1) { + levels_row_idx = (i12 * ne02 + i11) * ne01 + i01; + } else if (type == GGML_TYPE_Q2_KPT) { + levels_row_idx = i11 * ne01 + i01; + } else { + levels_row_idx = i01; + } dequantize_row_q( (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), - (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); + (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc, + (const char*)src0->quant_levels + levels_row_idx * q_lrs_gr); } } @@ -4859,6 +4904,9 @@ void ggml_compute_forward_get_rows( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ2_TQ: + case GGML_TYPE_IQ3_TQ: + case GGML_TYPE_IQ1_BN: { ggml_compute_forward_get_rows_q(params, dst); } break; @@ -5436,7 +5484,7 @@ static void ggml_compute_forward_soft_max_ext_back_f32( // linear runtime, no additional memory float dot_y_dy = 0; - ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1); + ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1, nullptr); ggml_vec_cpy_f32 (nc, dx, dy); ggml_vec_acc1_f32 (nc, dx, -dot_y_dy); ggml_vec_mul_f32 (nc, dx, dx, y); @@ -5571,6 +5619,8 @@ void ggml_compute_forward_clamp( case GGML_TYPE_NVFP4: case GGML_TYPE_Q2_K: case GGML_TYPE_Q3_K: + case GGML_TYPE_Q3_KPT: + case GGML_TYPE_Q4_DPT: case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: @@ -5583,6 +5633,12 @@ void ggml_compute_forward_clamp( case GGML_TYPE_IQ1_M: case GGML_TYPE_IQ4_NL: case GGML_TYPE_IQ4_XS: + case GGML_TYPE_Q3_PT: + case GGML_TYPE_Q2_KPT: + case GGML_TYPE_Q2_DPT: + case GGML_TYPE_IQ2_TQ: + case GGML_TYPE_IQ3_TQ: + case GGML_TYPE_IQ1_BN: case 
GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: case GGML_TYPE_Q8_K: @@ -6007,7 +6063,7 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32( float v = 0; ggml_vec_dot_f16(ne02, &v, 0, (ggml_fp16_t *) wdata_src + i1n, 0, - (ggml_fp16_t *) wdata_kernel + i00*ne02, 0, 1); + (ggml_fp16_t *) wdata_kernel + i00*ne02, 0, 1, nullptr); dst_data[i10*s0 + i00] += v; } } @@ -6095,7 +6151,7 @@ static void ggml_compute_forward_conv_transpose_1d_f32( float v = 0; ggml_vec_dot_f32(ne02, &v, 0, wdata_src + i1n, 0, - wdata_kernel + i00*ne02, 0, 1); + wdata_kernel + i00*ne02, 0, 1, nullptr); dst_data[i10*s0 + i00] += v; } } @@ -7021,11 +7077,11 @@ static void ggml_compute_forward_conv_transpose_2d_impl( if constexpr (std::is_same_v) { ggml_vec_dot_f16(ne03, &v, 0, wdata_src + i1n, 0, - wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1); + wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1, nullptr); } else { ggml_vec_dot_f32(ne03, &v, 0, wdata_src + i1n, 0, - wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1); + wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1, nullptr); } dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v; } @@ -8298,7 +8354,7 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk( float s; // KQ value const char * k_data = (const char *) k->data + ( ic*nbk1 + ik2*nbk2 + ik3*nbk3); - kq_vec_dot(DK, &s, 0, k_data, 0, Q_q, 0, 1); + kq_vec_dot(DK, &s, 0, k_data, 0, Q_q, 0, 1, k->quant_levels); s = s*scale; // scale KQ value @@ -8345,7 +8401,7 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk( // V += v*expf(s - M) if (v_to_float) { - v_to_float(v_data, V32, DV); + v_to_float(v_data, V32, DV, v->quant_levels); ggml_vec_mad_f32(DV, VKQ32, V32, vs); } else { // V is F32 @@ -9058,7 +9114,7 @@ static void ggml_compute_forward_flash_attn_back_f32( ggml_vec_dot_f32(neq0, S + i1, 0, (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0, - (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1); + (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1, nullptr); } // scale @@ -9172,7 +9228,7 @@ static void ggml_compute_forward_flash_attn_back_f32( // S = SM * (S - dot(SM, S)) float dot_SM_gradSM = 0; - ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, 0, SM, 0, S, 0, 1); + ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, 0, SM, 0, S, 0, 1, nullptr); ggml_vec_acc1_f32(M, S, -dot_SM_gradSM); ggml_vec_mul_f32 (masked_begin, S, S, SM); @@ -10535,7 +10591,7 @@ static void ggml_compute_forward_gated_delta_net_one_chunk( // delta[j] = sum_i S[i][j] * k[i] = dot(row j of M, k) for (int64_t j = 0; j < S_v; ++j) { float sum = 0.0f; - ggml_vec_dot_f32(S_v, &sum, 0, &s_out[j * S_v], 0, k_d, 0, 1); + ggml_vec_dot_f32(S_v, &sum, 0, &s_out[j * S_v], 0, k_d, 0, 1, nullptr); delta[j] = (v_d[j] - sum) * beta_val; } @@ -10547,7 +10603,7 @@ static void ggml_compute_forward_gated_delta_net_one_chunk( // attn_out[j] = sum_i S[i][j] * q[i] = dot(row j of M, q) for (int64_t j = 0; j < S_v; ++j) { float sum = 0.0f; - ggml_vec_dot_f32(S_v, &sum, 0, &s_out[j * S_v], 0, q_d, 0, 1); + ggml_vec_dot_f32(S_v, &sum, 0, &s_out[j * S_v], 0, q_d, 0, 1, nullptr); attn_data[j] = sum * scale; } diff --git a/ggml/src/ggml-cpu/quants.c b/ggml/src/ggml-cpu/quants.c index f66127c229..f34a1e5e10 100644 --- a/ggml/src/ggml-cpu/quants.c +++ b/ggml/src/ggml-cpu/quants.c @@ -120,7 +120,8 @@ void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * GGML_RESTRI //===================================== Dot products ================================= -void 
ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); const int qk = QK1_0; const int nb = n / qk; @@ -165,7 +166,8 @@ void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c } -void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); const int qk = QK8_0; const int nb = n / qk; @@ -202,7 +204,8 @@ void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c } // TODO: add WASM SIMD -void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); const int qk = QK8_1; const int nb = n / qk; @@ -238,7 +241,8 @@ void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c *s = sumf; } -void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -270,7 +274,8 @@ void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, } // NVFP4: super-block of 64 elements = 4 sub-blocks of 16 = 2 q8_0 blocks -void ggml_vec_dot_nvfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_nvfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -305,7 +310,8 @@ void ggml_vec_dot_nvfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, *s = sumf; } -void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); const int qk = QK8_0; const int nb = n / qk; @@ -348,7 +354,8 @@ void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c *s = sumf; } -void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void 
ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); const int qk = QK8_1; const int nb = n / qk; @@ -391,7 +398,8 @@ void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c *s = sumf; } -void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); const int qk = QK8_0; const int nb = n / qk; @@ -421,7 +429,8 @@ void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c *s = sumf; } -void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -473,7 +482,8 @@ void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, *s = sumf; } -void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -505,7 +515,8 @@ void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, *s = sumf; } -void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -557,7 +568,8 @@ void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c *s = sumf; } -void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -636,7 +648,8 @@ void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c *s = sumf; } -void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 
0); assert(nrc == 1); UNUSED(nrc); @@ -710,8 +723,7 @@ void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c for (int l = 0; l < 8; ++l) sumf += sums[l]; *s = sumf; } - -void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -741,6 +753,7 @@ void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c float sumf = 0; for (int i = 0; i < nb; ++i) { + GGML_UNUSED(levels); const uint8_t * GGML_RESTRICT q4 = x[i].qs; const uint8_t * GGML_RESTRICT hm = x[i].qh; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -791,7 +804,8 @@ void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c *s = sumf; } -void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -846,7 +860,8 @@ void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c *s = sumf; } -void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -888,7 +903,8 @@ void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs *s = 0.125f * sumf; } -void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -938,7 +954,8 @@ void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, *s = 0.125f * sumf; } -void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -990,7 +1007,8 @@ void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, *s = 0.125f * sumf; } -void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float 
* GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -1034,7 +1052,8 @@ void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs *s = 0.25f * sumf; } -void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -1090,7 +1109,65 @@ void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, *s = sumf; } -void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_q3_pt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q3_pt * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + const float * lv = (const float *)levels; + GGML_ASSERT(lv != NULL && "Q3_PT levels not set for tensor"); + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float xd = GGML_CPU_FP16_TO_FP32(x[i].d); + const float xdmin = GGML_CPU_FP16_TO_FP32(x[i].dmin); + const float yd = y[i].d; + const uint8_t * sc = x[i].scales; + const uint8_t * qs = x[i].qs; + const int8_t * q8 = y[i].qs; + + float block_sum = 0.f; + for (int ib = 0; ib < QK_K/16; ++ib) { + // Inline 6-bit unpack for range scale (index ib) and neg_min scale (index ib + QK_K/16) + const int sbit0 = ib * 6, sbyte0 = sbit0 / 8, soff0 = sbit0 % 8; + const int sbit1 = (ib + QK_K/16) * 6, sbyte1 = sbit1 / 8, soff1 = sbit1 % 8; + uint8_t qrange = (sc[sbyte0] >> soff0) & 0x3F; + if (soff0 > 2) { qrange |= (uint8_t)((sc[sbyte0+1] << (8 - soff0)) & 0x3F); } + uint8_t qnmin = (sc[sbyte1] >> soff1) & 0x3F; + if (soff1 > 2) { qnmin |= (uint8_t)((sc[sbyte1+1] << (8 - soff1)) & 0x3F); } + const float range = xd * (float)qrange; + const float sub_min = -xdmin * (float)qnmin; + + float sum_lq = 0.f; + for (int j = 0; j < 16; ++j) { + // Inline 3-bit unpack + const int qk = ib * 16 + j; + const int qbit = qk * 3; + const int qbyte = qbit / 8; + const int qoff = qbit % 8; + int q = (qs[qbyte] >> qoff) & 0x7; + if (qoff > 5) { q |= (int)((qs[qbyte+1] << (8 - qoff)) & 0x7); } + sum_lq += lv[q] * (float)q8[qk]; + } + // min contribution uses precomputed 16-element sum from block_q8_K.bsums + block_sum += sum_lq * range + sub_min * (float)y[i].bsums[ib]; + } + sumf += block_sum * yd; + } + *s = sumf; +} + +void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -1133,7 +1210,375 @@ void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, *s = sumf; } -void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * 
GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +// Q3_KPT vec_dot - similar to Q3_K but with learned levels +void ggml_vec_dot_q3_kpt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q3_kpt * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + const float * lv = (const float *)levels; + GGML_ASSERT(lv != NULL && "Q3_KPT levels not set for tensor"); + + const uint32_t kmask1 = 0x03030303; + const uint32_t kmask2 = 0x0f0f0f0f; + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d_all = GGML_CPU_FP16_TO_FP32(x[i].d); + const float yd = y[i].d; + const uint8_t * q = x[i].qs; + const uint8_t * hm = x[i].hmask; + const int8_t * q8 = y[i].qs; + uint8_t m = 1; + + uint32_t aux32[4]; + memcpy(aux32, x[i].scales, 12); + uint32_t tmp = aux32[2]; + aux32[2] = ((aux32[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); + aux32[3] = ((aux32[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); + aux32[0] = (aux32[0] & kmask2) | (((tmp >> 0) & kmask1) << 4); + aux32[1] = (aux32[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); + const uint8_t * aux = (const uint8_t *)aux32; + + int is = 0; + float block_sum = 0.f; + for (int blk = 0; blk < QK_K; blk += 128) { + int shift = 0; + for (int j = 0; j < 4; ++j) { + int sc1 = (int)aux[is] - 32; + int sc2 = (int)aux[is+1] - 32; + is += 2; + float dl1 = d_all * sc1; + float dl2 = d_all * sc2; + + float sum1 = 0.f, sum2 = 0.f; + for (int l = 0; l < 16; ++l) { + int k_idx = ((q[l+0] >> shift) & 3) + ((hm[l+0] & m) ? 4 : 0); + sum1 += (lv[k_idx] * 7.0f - 4.0f) * (float)q8[l+0]; + } + for (int l = 0; l < 16; ++l) { + int k_idx = ((q[l+16] >> shift) & 3) + ((hm[l+16] & m) ? 
4 : 0); + sum2 += (lv[k_idx] * 7.0f - 4.0f) * (float)q8[l+16]; + } + block_sum += dl1 * sum1 + dl2 * sum2; + + shift += 2; + m <<= 1; + q8 += 32; + } + q += 32; + } + sumf += block_sum * yd; + } + *s = sumf; +} + +void ggml_vec_dot_q3_kpt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + ggml_vec_dot_q3_kpt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels); +} + +// Q2_KPT vec_dot - similar to Q2_K but with learned levels +void ggml_vec_dot_q2_kpt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q2_kpt * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + const float * lv = (const float *)levels; + GGML_ASSERT(lv != NULL && "Q2_KPT levels not set for tensor"); + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + // Per-block levels: block i uses lv[i*4 + 0..3] + const float * block_lv = lv + i * Q2KPT_N_LEVELS; + + // Precompute mapped levels for this block: ml[k] = levels[k] * 3.0 + float ml[Q2KPT_N_LEVELS]; + for (int k = 0; k < Q2KPT_N_LEVELS; ++k) { + ml[k] = block_lv[k] * 3.0f; + } + + const uint8_t * q2 = x[i].qs; + const int8_t * q8 = y[i].qs; + const uint8_t * sc = x[i].scales; + + // Min term: accumulate integer bsums * min_scale (same as Q2_K) + int summs = 0; + for (int j = 0; j < 16; ++j) { + summs += y[i].bsums[j] * (sc[j] >> 4); + } + + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); + + // Scale term: need floating-point because levels are non-uniform + int is = 0; + float fsum = 0; + for (int k = 0; k < QK_K/128; ++k) { + int shift = 0; + for (int j = 0; j < 4; ++j) { + int d_sc = sc[is++] & 0xF; + float suml = 0; + for (int l = 0; l < 16; ++l) { + int idx = (q2[l] >> shift) & 3; + suml += ml[idx] * (float)q8[l]; + } + fsum += d_sc * suml; + + d_sc = sc[is++] & 0xF; + suml = 0; + for (int l = 16; l < 32; ++l) { + int idx = (q2[l] >> shift) & 3; + suml += ml[idx] * (float)q8[l]; + } + fsum += d_sc * suml; + + shift += 2; + q8 += 32; + } + q2 += 32; + } + sumf += dall * fsum - dmin * summs; + } + *s = sumf; +} + +void ggml_vec_dot_q2_kpt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + ggml_vec_dot_q2_kpt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels); +} + +// IQ2_TQ: 2-bit with asymmetric 4-tuple grid per group +// Default grid table — only used when no per-tensor grid is available +static const int8_t iq2tq_grid_cpu[16][4] = { + {-20, -8, -2, 6}, {-14, -8, -2, 4}, {-16,-10, 0, 12}, {-14, -4, 2, 8}, + {-20, -4, 4, 12}, {-8, -4, 0, 4}, {-8, -4, 0, 8}, {-12, -6, 2, 12}, + {-4, -2, 2, 4}, {-10, -2, 4, 8}, {-16, -6, 4, 20}, {-12, -2, 6, 14}, + {-8, -2, 4, 14}, {-4, 0, 4, 8}, {-8, -2, 6, 22}, {-4, 2, 8, 14}, +}; + +void ggml_vec_dot_iq2_tq_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs); + + const int8_t (*grid)[4] = levels ? 
(const int8_t (*)[4])levels : (const int8_t (*)[4])iq2tq_grid_cpu; + const block_iq2_tq * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + const int nb = n / QK_K; + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * IQ2TQ_GRID_SCALE; + const float yd = y[i].d; + const int8_t * q8 = y[i].qs; + + int32_t fsum = 0; + + for (int g = 0; g < IQ2TQ_N_GROUPS; ++g) { + int si = (x[i].scales[g / 2] >> (4 * (g % 2))) & 0xF; + const int8_t * ge = grid[si]; + const int8_t * q8g = q8 + g * 8; + + for (int k = 0; k < 8; ++k) { + int j = g * 8 + k; + int qi = (x[i].qs[j / 4] >> ((j % 4) * 2)) & 3; + fsum += (int32_t)ge[qi] * (int32_t)q8g[k]; + } + } + + sumf += d * yd * (float)fsum; + } + + *s = sumf; +} + +// IQ3_TQ default grid (must match ggml-quants.c) +static const int8_t iq3tq_grid_cpu[16][8] = { + {-24,-18,-12, -6, 0, 6, 12, 18}, + {-20,-15,-10, -5, 0, 5, 10, 15}, + {-16,-12, -8, -4, 0, 4, 8, 12}, + {-12, -8, -4, -2, 0, 2, 4, 8}, + {-24,-16, -8, -2, 2, 6, 10, 14}, + {-14,-10, -6, -2, 2, 8, 16, 24}, + {-20,-14, -8, -4, 0, 4, 10, 18}, + {-18,-10, -4, 0, 4, 8, 14, 20}, + { -8, -6, -4, -2, 0, 2, 4, 6}, + {-10, -6, -4, -2, 2, 4, 6, 10}, + {-22,-14, -6, -2, 2, 6, 14, 22}, + {-16, -8, -4, -2, 0, 4, 8, 16}, + {-24,-20,-16,-12, -8, -4, 0, 4}, + { -4, 0, 4, 8, 12, 16, 20, 24}, + {-20,-16,-10, -4, 4, 10, 16, 20}, + {-12, -8, -6, -2, 2, 6, 8, 12}, +}; + +void ggml_vec_dot_iq3_tq_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs); + + const int8_t (*grid)[8] = levels ? (const int8_t (*)[8])levels : (const int8_t (*)[8])iq3tq_grid_cpu; + const block_iq3_tq * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + const int nb = n / QK_K; + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * IQ3TQ_GRID_SCALE; + const float yd = y[i].d; + const int8_t * q8 = y[i].qs; + + int32_t fsum = 0; + + for (int g = 0; g < IQ3TQ_N_GROUPS; ++g) { + int si = (x[i].scales[g / 2] >> (4 * (g % 2))) & 0xF; + const int8_t * ge = grid[si]; + const int8_t * q8g = q8 + g * 8; + + for (int k = 0; k < 8; ++k) { + int j = g * 8 + k; + // 3-bit unpack + int bit_pos = j * 3; + int byte_idx = bit_pos >> 3; + int bit_off = bit_pos & 7; + uint16_t val = x[i].qs[byte_idx]; + if (bit_off > 5) val |= ((uint16_t)x[i].qs[byte_idx + 1] << 8); + int qi = (val >> bit_off) & 7; + fsum += (int32_t)ge[qi] * (int32_t)q8g[k]; + } + } + + sumf += d * yd * (float)fsum; + } + + *s = sumf; +} + +// IQ1_BN: 8D vector quantized — codebook[256][8] + scale_table[16] +void ggml_vec_dot_iq1_bn_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs); + + GGML_ASSERT(levels && "IQ1_BN requires per-tensor codebook in quant_levels"); + const int8_t * codebook = (const int8_t *)levels; + const block_iq1_bn * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + const int nb = n / QK_K; + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * IQ1BN_GRID_SCALE; + const float yd = y[i].d; + const int8_t * q8 = y[i].qs; + + int32_t block_sum = 0; + + for (int g = 0; g < 
IQ1BN_N_GROUPS; ++g) { + int ci = (g & 1) + ? ((x[i].qs[3*(g/2)+1] >> 4) | ((int)x[i].qs[3*(g/2)+2] << 4)) + : (x[i].qs[3*(g/2)] | (((int)x[i].qs[3*(g/2)+1] & 0x0F) << 8)); + const int8_t * cb = codebook + ci * IQ1BN_CODEBOOK_DIM; + const int8_t * q8g = q8 + g * IQ1BN_GROUP_SIZE; + + for (int k = 0; k < IQ1BN_CODEBOOK_DIM; ++k) { + block_sum += (int32_t)cb[k] * (int32_t)q8g[k]; + } + } + + sumf += d * yd * (float)block_sum; + } + + *s = sumf; +} + +void ggml_vec_dot_q4_dpt_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + assert(n % QK4_NL == 0); + static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same"); + + const block_q4_dpt * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + const int nb = n / QK4_NL; + + const int8_t * values = (const int8_t *)levels; + GGML_ASSERT(values != NULL && "Q4_DPT levels not set for tensor"); + + float sumf = 0; + for (int ib = 0; ib < nb; ++ib) { + const float d = GGML_CPU_FP16_TO_FP32(y[ib].d) * GGML_CPU_FP16_TO_FP32(x[ib].d); + int32_t blk = 0; + for (int j = 0; j < QK4_NL/2; ++j) { + blk += (int32_t)y[ib].qs[j+ 0] * (int32_t)values[x[ib].qs[j] & 0xf]; + blk += (int32_t)y[ib].qs[j+QK4_NL/2] * (int32_t)values[x[ib].qs[j] >> 4]; + } + sumf += d * (float)blk; + } + *s = sumf; +} + +void ggml_vec_dot_q2_dpt_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + assert(n % QK2_DPT == 0); + static_assert(QK2_DPT == QK8_0, "QK2_DPT and QK8_0 must be the same"); + + const block_q2_dpt * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + const int nb = n / QK2_DPT; + + const int8_t * values = (const int8_t *)levels; + GGML_ASSERT(values != NULL && "Q2_DPT levels not set for tensor"); + + float sumf = 0; + for (int ib = 0; ib < nb; ++ib) { + const float d = GGML_CPU_FP16_TO_FP32(y[ib].d) * GGML_CPU_FP16_TO_FP32(x[ib].d); + int32_t blk = 0; + for (int j = 0; j < QK2_DPT/4; ++j) { + uint8_t q = x[ib].qs[j]; + blk += (int32_t)y[ib].qs[j*4 + 0] * (int32_t)values[(q >> 0) & 3]; + blk += (int32_t)y[ib].qs[j*4 + 1] * (int32_t)values[(q >> 2) & 3]; + blk += (int32_t)y[ib].qs[j*4 + 2] * (int32_t)values[(q >> 4) & 3]; + blk += (int32_t)y[ib].qs[j*4 + 3] * (int32_t)values[(q >> 6) & 3]; + } + sumf += d * (float)blk; + } + *s = sumf; +} + +void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -1194,7 +1639,8 @@ void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, *s = sumf; } -void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -1223,7 +1669,8 @@ void 
ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, *s = sumf; } -void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); diff --git a/ggml/src/ggml-cpu/quants.h b/ggml/src/ggml-cpu/quants.h index d4bc87a1c0..731e19d757 100644 --- a/ggml/src/ggml-cpu/quants.h +++ b/ggml/src/ggml-cpu/quants.h @@ -37,66 +37,79 @@ void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); // Dot product -void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); -void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); 
+void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); -void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); -void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); -void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq1_m_q8_K (int n, float * 
GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q3_pt_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q3_kpt_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q3_kpt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q4_dpt_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q4_dpt_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q2_dpt_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q2_dpt_q8_0_generic(int n, float * GGML_RESTRICT 
s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q2_kpt_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q2_kpt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_iq2_tq_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_iq3_tq_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_iq1_bn_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); // Generic implementation void quantize_row_q8_0_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); void quantize_row_q8_1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const 
void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); -void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_nvfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_nvfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); -void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); -void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t 
bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); +void ggml_vec_dot_q3_pt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels); #ifdef __cplusplus } diff --git a/ggml/src/ggml-cpu/vec.cpp b/ggml/src/ggml-cpu/vec.cpp index d0e4001338..ebf63280ad 100644 --- a/ggml/src/ggml-cpu/vec.cpp +++ 
b/ggml/src/ggml-cpu/vec.cpp @@ -8,7 +8,8 @@ ggml_fp16_t ggml_table_gelu_f16[1 << 16]; // precomputed quick gelu table for f16 (128 KB) ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16]; -void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc) { +void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(nrc == 1); GGML_UNUSED(nrc); GGML_UNUSED(bx); @@ -136,7 +137,8 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G *s = sumf; } -void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc) { +void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(nrc == 1); GGML_UNUSED(nrc); GGML_UNUSED(bx); @@ -261,7 +263,8 @@ void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * *s = sumf; } -void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc) { +void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc, const void * levels) { + GGML_UNUSED(levels); assert(nrc == 1); GGML_UNUSED(nrc); GGML_UNUSED(bx); diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h index a0375a28de..75a6bbe0d9 100644 --- a/ggml/src/ggml-cpu/vec.h +++ b/ggml/src/ggml-cpu/vec.h @@ -39,9 +39,9 @@ extern ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16]; // fundamental operations // -void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc); -void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc); -void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc); +void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc, const void * levels); +void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc, const void * levels); +void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc, const void * levels); void ggml_vec_silu_f32(const int n, float * y, const float * x); ggml_float ggml_vec_cvar_f32(const int n, float * y, const float * x, const float mean); //it will also center y ( y = y - mean ) @@ -873,7 +873,7 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float } } -inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); } +inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1, NULL); *s = sqrtf(*s); } inline 
static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
 inline static void ggml_vec_sqr_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 8a4246223b..96635cf3c6 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -1057,6 +1057,27 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ4_NL> {
     static constexpr int qi = QI4_NL;
 };
 
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q4_DPT> {
+    static constexpr int qk = QK4_NL;
+    static constexpr int qr = QR4_NL;
+    static constexpr int qi = QI4_NL;
+};
+
+// Per-tensor lookup table for Q4_DPT (device global memory).
+// Each TU gets its own copy; initialized via cudaGetSymbolAddress + cudaMemcpyAsync before use.
+__device__ int8_t q4dpt_levels_cuda[16];
+
+// Per-tensor lookup table for Q2_DPT (4 int8 levels).
+__device__ int8_t q2dpt_levels_cuda[4];
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_Q2_DPT> {
+    static constexpr int qk = QK2_DPT;
+    static constexpr int qr = 4; // 4 elements per "quantum" (2-bit)
+    static constexpr int qi = 1; // 1 uint32 per block
+};
+
 template<>
 struct ggml_cuda_type_traits<GGML_TYPE_IQ4_XS> {
     static constexpr int qk = QK_K;
@@ -1064,6 +1085,38 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ4_XS> {
     static constexpr int qi = QI4_XS;
 };
 
+// Per-tensor grid for IQ2_TQ (16 × 4 int8 = 64 bytes).
+__device__ int8_t iq2tq_grid_cuda[64];
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ2_TQ> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = 4;
+    static constexpr int qi = QK_K / (4*4); // 16
+};
+
+// Per-tensor grid for IQ3_TQ (16 × 8 int8 = 128 bytes).
+__device__ int8_t iq3tq_grid_cuda[128];
+
+
+// Per-tensor codebook for IQ1_BN (4096 × 8 int8 = 32768 bytes).
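+// Layout sketch (informative): each pair of 8-element groups packs two 12-bit
+// codebook indices ci0, ci1 into 3 bytes:
+//   byte0 =  ci0 & 0xFF
+//   byte1 = (ci0 >> 8) | ((ci1 & 0x0F) << 4)
+//   byte2 =  ci1 >> 4
+// 32 groups per 256-element block -> 48 bytes of qs plus a 2-byte d = 1.5625 bpw.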
+__device__ int8_t iq1bn_codebook_cuda[IQ1BN_CODEBOOK_SIZE];
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ3_TQ> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = 4;
+    static constexpr int qi = QK_K / (4*4); // 16
+};
+
+
+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_IQ1_BN> {
+    static constexpr int qk = QK_K;
+    static constexpr int qr = 4;
+    static constexpr int qi = QK_K / (4*4); // 16
+};
+
 template<>
 struct ggml_cuda_type_traits<GGML_TYPE_IQ3_S> {
     static constexpr int qk = QK_K;
diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu
index 79ccfe568a..1b5039b4c7 100644
--- a/ggml/src/ggml-cuda/convert.cu
+++ b/ggml/src/ggml-cuda/convert.cu
@@ -593,12 +593,187 @@ static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int64_t
     dequantize_block_iq1_s<<<nb, 32, 0, stream>>>(vx, y);
 }
 
+void ggml_cuda_set_q4dpt_levels(const int8_t * levels, cudaStream_t stream) {
+    int8_t * d_q4dpt_levels;
+    CUDA_CHECK(cudaGetSymbolAddress((void **)&d_q4dpt_levels, q4dpt_levels_cuda));
+    CUDA_CHECK(cudaMemcpyAsync(d_q4dpt_levels, levels, 16, cudaMemcpyHostToDevice, stream));
+}
+
+void ggml_cuda_set_q2dpt_levels(const int8_t * levels, cudaStream_t stream) {
+    int8_t * d_q2dpt_levels;
+    CUDA_CHECK(cudaGetSymbolAddress((void **)&d_q2dpt_levels, q2dpt_levels_cuda));
+    CUDA_CHECK(cudaMemcpyAsync(d_q2dpt_levels, levels, 4, cudaMemcpyHostToDevice, stream));
+}
+
+void ggml_cuda_set_iq2tq_grid(const void * grid, cudaStream_t stream) {
+    int8_t * d_grid;
+    CUDA_CHECK(cudaGetSymbolAddress((void **)&d_grid, iq2tq_grid_cuda));
+    CUDA_CHECK(cudaMemcpyAsync(d_grid, grid, 64, cudaMemcpyHostToDevice, stream));
+}
+
+void ggml_cuda_set_iq3tq_grid(const void * grid, cudaStream_t stream) {
+    int8_t * d_grid;
+    CUDA_CHECK(cudaGetSymbolAddress((void **)&d_grid, iq3tq_grid_cuda));
+    CUDA_CHECK(cudaMemcpyAsync(d_grid, grid, 128, cudaMemcpyHostToDevice, stream));
+}
+
+
+void ggml_cuda_set_iq1bn_aux(const void * aux, cudaStream_t stream) {
+    int8_t * d_cb;
+    CUDA_CHECK(cudaGetSymbolAddress((void **)&d_cb, iq1bn_codebook_cuda));
+    CUDA_CHECK(cudaMemcpyAsync(d_cb, aux, IQ1BN_CODEBOOK_SIZE, cudaMemcpyHostToDevice, stream));
+}
+
 template<typename dst_t>
 static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb = (k + QK_K - 1) / QK_K;
     dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
 }
 
+template<typename dst_t>
+static __global__ void dequantize_block_q4_dpt(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+    const int64_t i = blockIdx.x;
+    const block_q4_dpt * x = (const block_q4_dpt *) vx + i*(QK_K/QK4_NL);
+
+    const int64_t tid = threadIdx.x;
+    const int64_t il = tid/8; // 0...3
+    const int64_t ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
+    const uint8_t * q4 = x[ib].qs + 4*il;
+    const float d = (float)x[ib].d;
+    for (int j = 0; j < 4; ++j) {
+        y[j+ 0] = d * q4dpt_levels_cuda[q4[j] & 0xf];
+        y[j+16] = d * q4dpt_levels_cuda[q4[j] >>  4];
+    }
+}
+
+template<typename dst_t>
+static void dequantize_row_q4_dpt_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
+    const int nb = (k + QK_K - 1) / QK_K;
+    dequantize_block_q4_dpt<<<nb, 32, 0, stream>>>(vx, y);
+}
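+
+// Element layout note: within each 32-element block the low nibbles of qs[0..15]
+// hold elements 0..15 and the high nibbles hold elements 16..31, mirroring the
+// IQ4_NL convention that Q4_DPT reuses.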
+
+template<typename dst_t>
+static __global__ void dequantize_block_q2_dpt(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+    const int64_t i = blockIdx.x;
+    const block_q2_dpt * x = (const block_q2_dpt *) vx + i*(QK_K/QK2_DPT);
+
+    const int64_t tid = threadIdx.x;
+    const int64_t il = tid/8; // 0...3
+    const int64_t ib = tid%8; // 0...7
+    // each thread decodes 2 bytes = 8 elements, so all 32 elements of the block are written
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const uint8_t * q2 = x[ib].qs + 2*il;
+    const float d = (float)x[ib].d;
+    for (int j = 0; j < 2; ++j) {
+        const uint8_t q = q2[j];
+        y[4*j + 0] = d * q2dpt_levels_cuda[(q >> 0) & 3];
+        y[4*j + 1] = d * q2dpt_levels_cuda[(q >> 2) & 3];
+        y[4*j + 2] = d * q2dpt_levels_cuda[(q >> 4) & 3];
+        y[4*j + 3] = d * q2dpt_levels_cuda[(q >> 6) & 3];
+    }
+}
+
+template<typename dst_t>
+static void dequantize_row_q2_dpt_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
+    const int nb = (k + QK_K - 1) / QK_K;
+    dequantize_block_q2_dpt<<<nb, 32, 0, stream>>>(vx, y);
+}
+
+template<typename dst_t>
+static __global__ void dequantize_block_iq2_tq(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+    const int64_t i = blockIdx.x;
+    const block_iq2_tq * bq = (const block_iq2_tq *) vx + i;
+    const int g = threadIdx.x; // group index 0..31
+
+    const float dq = __half2float(bq->d) * IQ2TQ_GRID_SCALE;
+
+    const int si = (bq->scales[g / 2] >> (4 * (g & 1))) & 0xF;
+    const int8_t * ge = iq2tq_grid_cuda + si * 4;
+
+    dst_t * y = yy + i * QK_K + g * 8;
+    const uint8_t * qs = bq->qs + g * 2;
+
+    y[0] = dq * ge[(qs[0] >> 0) & 3];
+    y[1] = dq * ge[(qs[0] >> 2) & 3];
+    y[2] = dq * ge[(qs[0] >> 4) & 3];
+    y[3] = dq * ge[(qs[0] >> 6) & 3];
+    y[4] = dq * ge[(qs[1] >> 0) & 3];
+    y[5] = dq * ge[(qs[1] >> 2) & 3];
+    y[6] = dq * ge[(qs[1] >> 4) & 3];
+    y[7] = dq * ge[(qs[1] >> 6) & 3];
+}
+
+template<typename dst_t>
+static void dequantize_row_iq2_tq_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+    dequantize_block_iq2_tq<<<nb, 32, 0, stream>>>(vx, y);
+}
+
+template<typename dst_t>
+static __global__ void dequantize_block_iq3_tq(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+    const int64_t i = blockIdx.x;
+    const block_iq3_tq * bq = (const block_iq3_tq *) vx + i;
+    const int g = threadIdx.x; // group index 0..31
+
+    const float dq = __half2float(bq->d) * IQ3TQ_GRID_SCALE;
+
+    const int si = (bq->scales[g / 2] >> (4 * (g & 1))) & 0xF;
+    const int8_t * ge = iq3tq_grid_cuda + si * 8;
+
+    dst_t * y = yy + i * QK_K + g * 8;
+    const uint8_t * qs = bq->qs + g * 3;
+    const uint32_t bits = qs[0] | ((uint32_t)qs[1] << 8) | ((uint32_t)qs[2] << 16);
+
+    y[0] = dq * ge[(bits >>  0) & 7];
+    y[1] = dq * ge[(bits >>  3) & 7];
+    y[2] = dq * ge[(bits >>  6) & 7];
+    y[3] = dq * ge[(bits >>  9) & 7];
+    y[4] = dq * ge[(bits >> 12) & 7];
+    y[5] = dq * ge[(bits >> 15) & 7];
+    y[6] = dq * ge[(bits >> 18) & 7];
+    y[7] = dq * ge[(bits >> 21) & 7];
+}
+
+template<typename dst_t>
+static void dequantize_row_iq3_tq_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+    dequantize_block_iq3_tq<<<nb, 32, 0, stream>>>(vx, y);
+}
+
+
+template<typename dst_t>
+static __global__ void dequantize_block_iq1_bn(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+    const int64_t i = blockIdx.x;
+    const block_iq1_bn * bq = (const block_iq1_bn *) vx + i;
+    const int g = threadIdx.x; // group index 0..31
+
+    const float dq = __half2float(bq->d) * IQ1BN_GRID_SCALE;
+
+    // Extract 12-bit codebook index
+    const int pair = g / 2;
+    int ci;
+    if (g & 1) {
+        ci = (bq->qs[3*pair+1] >> 4) | ((int)bq->qs[3*pair+2] << 4);
+    } else {
+        ci = bq->qs[3*pair] | (((int)bq->qs[3*pair+1] & 0x0F) << 8);
+    }
+    const int8_t * cb = iq1bn_codebook_cuda + ci * IQ1BN_CODEBOOK_DIM;
+
+    dst_t * y = yy + i * QK_K + g * IQ1BN_GROUP_SIZE;
+    y[0] = dq * cb[0];
+    y[1] = dq * cb[1];
+    y[2] = dq * cb[2];
+    y[3] = dq * cb[3];
+    y[4] = dq * cb[4];
+    y[5] = dq * cb[5];
+    y[6] = dq * cb[6];
+    y[7] = dq * cb[7];
+}
+
+template<typename dst_t>
+static void dequantize_row_iq1_bn_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+    dequantize_block_iq1_bn<<<nb, 32, 0, stream>>>(vx, y);
+}
+
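+// All five per-tensor-table kernels above launch one 32-thread block per
+// 256-element superblock and read their tables from TU-local __device__ arrays,
+// so the matching ggml_cuda_set_* upload must be issued on the same stream
+// before the launch.
+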
 template<typename dst_t>
 static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb = k / QK_K;
@@ -748,6 +923,16 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
             return dequantize_row_iq1_m_cuda;
         case GGML_TYPE_IQ4_NL:
             return dequantize_row_iq4_nl_cuda;
+        case GGML_TYPE_Q4_DPT:
+            return dequantize_row_q4_dpt_cuda;
+        case GGML_TYPE_Q2_DPT:
+            return dequantize_row_q2_dpt_cuda;
+        case GGML_TYPE_IQ2_TQ:
+            return dequantize_row_iq2_tq_cuda;
+        case GGML_TYPE_IQ3_TQ:
+            return dequantize_row_iq3_tq_cuda;
+        case GGML_TYPE_IQ1_BN:
+            return dequantize_row_iq1_bn_cuda;
         case GGML_TYPE_IQ4_XS:
             return dequantize_row_iq4_xs_cuda;
         case GGML_TYPE_IQ3_S:
@@ -801,6 +986,16 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
             return dequantize_row_iq1_m_cuda;
         case GGML_TYPE_IQ4_NL:
            return dequantize_row_iq4_nl_cuda;
+        case GGML_TYPE_Q4_DPT:
+            return dequantize_row_q4_dpt_cuda;
+        case GGML_TYPE_Q2_DPT:
+            return dequantize_row_q2_dpt_cuda;
+        case GGML_TYPE_IQ2_TQ:
+            return dequantize_row_iq2_tq_cuda;
+        case GGML_TYPE_IQ3_TQ:
+            return dequantize_row_iq3_tq_cuda;
+        case GGML_TYPE_IQ1_BN:
+            return dequantize_row_iq1_bn_cuda;
         case GGML_TYPE_IQ4_XS:
             return dequantize_row_iq4_xs_cuda;
         case GGML_TYPE_IQ3_S:
diff --git a/ggml/src/ggml-cuda/convert.cuh b/ggml/src/ggml-cuda/convert.cuh
index f5d37c7b99..3b96662dd3 100644
--- a/ggml/src/ggml-cuda/convert.cuh
+++ b/ggml/src/ggml-cuda/convert.cuh
@@ -31,6 +31,22 @@ to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type);
 to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type);
 to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type);
 
+// Set the Q4_DPT lookup table in device global memory.
+void ggml_cuda_set_q4dpt_levels(const int8_t * levels, cudaStream_t stream);
+
+// Set the Q2_DPT lookup table in device global memory.
+void ggml_cuda_set_q2dpt_levels(const int8_t * levels, cudaStream_t stream);
+
+// Set the IQ2_TQ per-tensor grid (64 bytes: 16 entries × 4 int8 levels).
+void ggml_cuda_set_iq2tq_grid(const void * grid, cudaStream_t stream);
+
+// Set the IQ3_TQ per-tensor grid (128 bytes: 16 entries × 8 int8 levels).
+void ggml_cuda_set_iq3tq_grid(const void * grid, cudaStream_t stream);
+
+
+// Set the IQ1_BN per-tensor codebook (IQ1BN_CODEBOOK_SIZE = 32768 bytes).
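+// Usage sketch (informative): the upload must land on the same stream as any
+// kernel that reads iq1bn_codebook_cuda, e.g.
+//   ggml_cuda_set_iq1bn_aux(src0->quant_levels, ctx.stream());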
+void ggml_cuda_set_iq1bn_aux(const void * aux, cudaStream_t stream);
+
 template<typename dst_t, typename src_t>
 __host__ __device__ inline dst_t ggml_cuda_cast(src_t x) {
     if constexpr (std::is_same_v<dst_t, src_t>) {
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 3113de017f..6eca0ebb6d 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3,6 +3,7 @@
 
 #include "ggml-backend-impl.h"
 #include "ggml-cuda/common.cuh"
+#include "ggml-quants.h"
 #include "ggml-cuda/acc.cuh"
 #include "ggml-cuda/add-id.cuh"
 #include "ggml-cuda/arange.cuh"
@@ -1426,6 +1427,24 @@ static void ggml_cuda_op_mul_mat_cublas(
         row_diff == src0->ne[1] &&
         dst->op_params[0] == GGML_PREC_DEFAULT;
 
+    // Upload per-tensor grids/levels before any dequantize path (fp16, fp32, or bf16)
+    if (src0->type == GGML_TYPE_Q4_DPT) {
+        GGML_ASSERT(src0->quant_levels && "Q4_DPT MUL_MAT requires levels (set tensor->quant_levels)");
+        ggml_cuda_set_q4dpt_levels((const int8_t *)src0->quant_levels, stream);
+    }
+    if (src0->type == GGML_TYPE_IQ2_TQ) {
+        GGML_ASSERT(src0->quant_levels && "IQ2_TQ MUL_MAT requires grid (set tensor->quant_levels)");
+        ggml_cuda_set_iq2tq_grid(src0->quant_levels, stream);
+    }
+    if (src0->type == GGML_TYPE_IQ3_TQ) {
+        GGML_ASSERT(src0->quant_levels && "IQ3_TQ MUL_MAT requires grid (set tensor->quant_levels)");
+        ggml_cuda_set_iq3tq_grid(src0->quant_levels, stream);
+    }
+    if (src0->type == GGML_TYPE_IQ1_BN) {
+        GGML_ASSERT(src0->quant_levels && "IQ1_BN MUL_MAT requires codebook (set tensor->quant_levels)");
+        ggml_cuda_set_iq1bn_aux(src0->quant_levels, stream);
+    }
+
     if (supports_bf16 && src0->type == GGML_TYPE_BF16 &&
         ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
         ggml_cuda_pool_alloc<nv_bfloat16> src1_as_bf16(ctx.pool(id));
         if (src1->type != GGML_TYPE_BF16) {
@@ -4804,6 +4823,10 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_TYPE_IQ3_S:
         case GGML_TYPE_IQ3_XXS:
         case GGML_TYPE_IQ4_NL:
+        case GGML_TYPE_Q4_DPT:
+        case GGML_TYPE_IQ2_TQ:
+        case GGML_TYPE_IQ3_TQ:
+        case GGML_TYPE_IQ1_BN:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_BF16:
             return true;
@@ -4838,7 +4861,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
             {
                 return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16 ||
                         op->type == GGML_TYPE_Q4_0 || op->type == GGML_TYPE_Q4_1 || op->type == GGML_TYPE_Q5_0 ||
-                        op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_IQ4_NL) &&
+                        op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_IQ4_NL ||
+                        op->type == GGML_TYPE_Q4_DPT) &&
                        op->src[0]->type == GGML_TYPE_F32 &&
                        (op->src[1]->type == GGML_TYPE_I64 || op->src[1]->type == GGML_TYPE_I32);
             } break;
@@ -4891,6 +4915,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_IQ4_NL) {
             return true;
         }
+        if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_DPT) {
+            return true;
+        }
         if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_I32) {
             return true;
         }
diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index 27b4145ac9..69e46e3bd0 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -2,6 +2,8 @@
 #include "mmq.cuh"
 #include "quantize.cuh"
 #include "mmid.cuh"
+#include "convert.cuh"
+#include "ggml-quants.h"
 
 static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
     switch (args.type_x) {
@@ -65,6 +67,12 @@ static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, con
         case GGML_TYPE_IQ4_NL:
             mul_mat_q_case<GGML_TYPE_IQ4_NL>(ctx, args, stream);
             break;
+        case GGML_TYPE_Q4_DPT:
+            mul_mat_q_case<GGML_TYPE_Q4_DPT>(ctx, args, stream);
+            break;
+        case GGML_TYPE_Q2_DPT:
+            mul_mat_q_case<GGML_TYPE_Q2_DPT>(ctx, args, stream);
+            break;
         default:
             GGML_ABORT("fatal error");
             break;
@@ -82,6 +90,22 @@ void ggml_cuda_mul_mat_q(
     cudaStream_t stream = ctx.stream();
     const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
 
+    // Set Q4_DPT lookup table from tensor's quant_levels
+    if (src0->type == GGML_TYPE_Q4_DPT) {
+        GGML_ASSERT(src0->quant_levels && "Q4_DPT MUL_MAT requires levels (set tensor->quant_levels)");
+        int8_t * d_q4dpt_levels;
+        CUDA_CHECK(cudaGetSymbolAddress((void **)&d_q4dpt_levels, q4dpt_levels_cuda));
+        CUDA_CHECK(cudaMemcpyAsync(d_q4dpt_levels, src0->quant_levels, Q4DPT_N_LEVELS * sizeof(int8_t), cudaMemcpyHostToDevice, stream));
+    }
+
+    // Set Q2_DPT lookup table from tensor's quant_levels
+    if (src0->type == GGML_TYPE_Q2_DPT) {
+        GGML_ASSERT(src0->quant_levels && "Q2_DPT MUL_MAT requires levels (set tensor->quant_levels)");
+        int8_t * d_q2dpt_levels;
+        CUDA_CHECK(cudaGetSymbolAddress((void **)&d_q2dpt_levels, q2dpt_levels_cuda));
+        CUDA_CHECK(cudaMemcpyAsync(d_q2dpt_levels, src0->quant_levels, Q2DPT_N_LEVELS * sizeof(int8_t), cudaMemcpyHostToDevice, stream));
+    }
+
     const size_t ts_src0 = ggml_type_size(src0->type);
     const size_t ts_src1 = ggml_type_size(src1->type);
     const size_t ts_dst  = ggml_type_size(dst->type);
@@ -290,6 +314,8 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
         case GGML_TYPE_IQ1_S:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ4_NL:
+        case GGML_TYPE_Q4_DPT:
+        case GGML_TYPE_Q2_DPT:
             mmq_supported = true;
             break;
         default:
@@ -367,3 +393,9 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
 
     return (!GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
 }
+
+// Q4_DPT/Q2_DPT must be instantiated in this TU (not a separate template-instance
+// file) because they access the TU-local __device__ variables q4dpt_levels_cuda
+// and q2dpt_levels_cuda, which are initialized by the code above.
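+// (DECL_MMQ_CASE(type), as defined in mmq.cuh, expands to the explicit
+// instantiation of mul_mat_q_case<type>, which is what actually pins these
+// kernels to this TU's copies of the tables.)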
+DECL_MMQ_CASE(GGML_TYPE_Q4_DPT);
+DECL_MMQ_CASE(GGML_TYPE_Q2_DPT);
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
index 1891114147..6ac7dbd059 100644
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -1,6 +1,7 @@
 #pragma once
 
 #include "common.cuh"
+#include "ggml.h"
 #include "vecdotq.cuh"
 #include "mma.cuh"
 
@@ -88,6 +89,8 @@ static mmq_q8_1_ds_layout mmq_get_q8_1_ds_layout(const ggml_type type_x) {
         return MMQ_Q8_1_DS_LAYOUT_DS4;
     case GGML_TYPE_IQ4_XS:
     case GGML_TYPE_IQ4_NL:
+    case GGML_TYPE_Q4_DPT:
+    case GGML_TYPE_Q2_DPT:
         return MMQ_Q8_1_DS_LAYOUT_D4;
     default:
         GGML_ABORT("fatal error");
@@ -205,6 +208,8 @@ static constexpr __host__ __device__ tile_x_sizes mmq_get_dp4a_tile_x_sizes(ggml
     case GGML_TYPE_IQ1_S:  return MMQ_DP4A_TXS_Q8_0;
     case GGML_TYPE_IQ4_XS: return MMQ_DP4A_TXS_Q8_0;
     case GGML_TYPE_IQ4_NL: return MMQ_DP4A_TXS_Q8_0;
+    case GGML_TYPE_Q4_DPT: return MMQ_DP4A_TXS_Q8_0;
+    case GGML_TYPE_Q2_DPT: return MMQ_DP4A_TXS_Q8_0_16;
     default:               return tile_x_sizes{0, 0, 0};
     }
 }
@@ -250,6 +255,8 @@ static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) {
     case GGML_TYPE_IQ1_S:  return MMQ_MMA_TILE_X_K_Q8_0;
     case GGML_TYPE_IQ4_XS: return MMQ_MMA_TILE_X_K_Q8_0;
     case GGML_TYPE_IQ4_NL: return MMQ_MMA_TILE_X_K_Q8_0;
+    case GGML_TYPE_Q4_DPT: return MMQ_MMA_TILE_X_K_Q8_0;
+    case GGML_TYPE_Q2_DPT: return MMQ_MMA_TILE_X_K_Q8_0;
     default:               return 0;
     }
 }
@@ -2763,6 +2770,71 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
     }
 }
 
+template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_q4_dpt(
+    const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
+    constexpr int nwarps = mmq_get_nwarps_device();
+    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
+
+#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
+    int   * x_qs = (int   *)  x_tile;
+    float * x_df = (float *) (x_qs + MMQ_TILE_NE_K*2);
+#else
+    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_DPT, mmq_y);
+    int   * x_qs = (int   *)  x_tile;
+    float * x_df = (float *) (x_qs + txs.qs);
+#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
+
+    constexpr int threads_per_row = MMQ_ITER_K / (4 * QR4_NL);
+    constexpr int nrows = warp_size / threads_per_row;
+    const int txi = warp_size > threads_per_row ? threadIdx.x % threads_per_row : threadIdx.x;
+    const int kbx  = txi / QI4_NL;
+    const int kqsx = txi % QI4_NL;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nrows*nwarps) {
+        int i = i0 + (nrows == 1 ? threadIdx.y : threadIdx.y*nrows + threadIdx.x/threads_per_row);
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_q4_dpt * bxi = (const block_q4_dpt *) x + kbx0 + i*stride + kbx;
+
+        const int aux_q4 = get_int_b2(bxi->qs, kqsx);
+        const int2 v = get_int_from_table_16(aux_q4, q4dpt_levels_cuda);
+        const int k0 = kbx * (2 * QI4_NL) + kqsx;
+
+#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
+        x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + k0 + 0]      = v.x;
+        x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + k0 + QI4_NL] = v.y;
+#else
+        x_qs[i*(2*MMQ_TILE_NE_K + 1) + k0 + 0]      = v.x;
+        x_qs[i*(2*MMQ_TILE_NE_K + 1) + k0 + QI4_NL] = v.y;
+#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
+    }
+
+    constexpr int blocks_per_tile_x_row = MMQ_TILE_NE_K / QI4_NL;
+    constexpr int rows_per_warp = warp_size / blocks_per_tile_x_row;
+    const int kbxd = threadIdx.x % blocks_per_tile_x_row;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * rows_per_warp) {
+        int i = i0 + threadIdx.y * rows_per_warp + threadIdx.x / blocks_per_tile_x_row;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_q4_dpt * bxi = (const block_q4_dpt *) x + kbx0 + i*stride + kbxd;
+
+#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
+        x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kbxd] = __half2float(bxi->d);
+#else
+        x_df[i*(MMQ_TILE_NE_K/QI4_NL) + i/QI4_NL + kbxd] = __half2float(bxi->d);
+#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
+    }
+}
+
 template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_iq2_xxs(
     const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
     constexpr int nwarps = mmq_get_nwarps_device();
@@ -3447,6 +3519,22 @@ struct mmq_type_traits {
     static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
 };
 
+template <int mmq_x, int mmq_y, bool need_check>
+struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_Q4_DPT> {
+    static constexpr int              vdr          = VDR_Q4_DPT_Q8_1_MMQ;
+    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q4_dpt<mmq_y, need_check>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y>;
+    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
+};
+
+template <int mmq_x, int mmq_y, bool need_check>
+struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_Q2_DPT> {
+    static constexpr int              vdr          = VDR_Q2_DPT_Q8_1_MMQ;
+    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q4_dpt<mmq_y, need_check>; // Reuse Q4_DPT loader (same layout)
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y>;
+    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
+};
+
 template <int mmq_x, int mmq_y, bool need_check>
 struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_IQ4_XS> {
     static constexpr int vdr = VDR_IQ4_XS_Q8_1_MMQ;
diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu
index 07b10167bc..d312b1348e 100644
--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@@ -2,6 +2,8 @@
 #include "quantize.cuh"
 #include "unary.cuh"
 #include "vecdotq.cuh"
+#include "convert.cuh"
+#include "ggml-quants.h"
 
 #include <cstdint>
 
@@ -28,6 +30,11 @@ static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type)
     case GGML_TYPE_IQ1_S:  return vec_dot_iq1_s_q8_1;
     case GGML_TYPE_IQ1_M:  return vec_dot_iq1_m_q8_1;
     case GGML_TYPE_IQ4_NL: return vec_dot_iq4_nl_q8_1;
+    case GGML_TYPE_Q4_DPT: return vec_dot_q4_dpt_q8_1;
+    case GGML_TYPE_Q2_DPT: return vec_dot_q2_dpt_q8_1;
+    case GGML_TYPE_IQ2_TQ: return vec_dot_iq2_tq_q8_1;
+    case GGML_TYPE_IQ3_TQ: return vec_dot_iq3_tq_q8_1;
+    case GGML_TYPE_IQ1_BN: return vec_dot_iq1_bn_q8_1;
     case GGML_TYPE_IQ4_XS: return vec_dot_iq4_xs_q8_1;
     case GGML_TYPE_IQ3_S:  return vec_dot_iq3_s_q8_1;
     default:               return nullptr;
@@ -54,6 +61,11 @@ static constexpr __host__ __device__ int get_vdr_mmvq(ggml_type type) {
     case GGML_TYPE_IQ3_XXS: return VDR_IQ3_XXS_Q8_1_MMVQ;
     case GGML_TYPE_IQ3_S:   return VDR_IQ3_S_Q8_1_MMVQ;
     case GGML_TYPE_IQ4_NL:  return VDR_IQ4_NL_Q8_1_MMVQ;
+    case GGML_TYPE_Q4_DPT:  return VDR_Q4_DPT_Q8_1_MMVQ;
+    case GGML_TYPE_Q2_DPT:  return VDR_Q2_DPT_Q8_1_MMVQ;
+    case GGML_TYPE_IQ2_TQ:  return VDR_IQ2_TQ_Q8_1_MMVQ;
+    case GGML_TYPE_IQ3_TQ:  return VDR_IQ3_TQ_Q8_1_MMVQ;
+    case GGML_TYPE_IQ1_BN:  return VDR_IQ1_BN_Q8_1_MMVQ;
     case GGML_TYPE_IQ4_XS:  return VDR_IQ4_XS_Q8_1_MMVQ;
     default:                return 1;
     }
@@ -1000,6 +1012,30 @@ static void mul_mat_vec_q_switch_type(
                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                 nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
             break;
+        case GGML_TYPE_Q4_DPT:
+            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q4_DPT>
+                (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
+                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
+                 nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
+            break;
+        case GGML_TYPE_IQ2_TQ:
+            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ2_TQ>
+                (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
+                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
+                 nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
+            break;
+        case GGML_TYPE_IQ3_TQ:
+            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ3_TQ>
+                (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
+                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
+                 nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
+            break;
+        case GGML_TYPE_IQ1_BN:
+            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ1_BN>
+                (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
+                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
+                 nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
+            break;
         case GGML_TYPE_IQ4_XS:
             mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ4_XS>
                 (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
@@ -1029,6 +1065,45 @@ void ggml_cuda_mul_mat_vec_q(
 
     cudaStream_t stream = ctx.stream();
 
+    // Set Q4_DPT lookup table from tensor's quant_levels
+    if (src0->type == GGML_TYPE_Q4_DPT) {
+        GGML_ASSERT(src0->quant_levels && "Q4_DPT MUL_MAT requires levels (set tensor->quant_levels)");
+        int8_t * d_q4dpt_levels;
+        CUDA_CHECK(cudaGetSymbolAddress((void **)&d_q4dpt_levels, q4dpt_levels_cuda));
+        CUDA_CHECK(cudaMemcpyAsync(d_q4dpt_levels, src0->quant_levels, Q4DPT_N_LEVELS * sizeof(int8_t), cudaMemcpyHostToDevice, stream));
+    }
+
+    // Set Q2_DPT lookup table from tensor's quant_levels
+    if (src0->type == GGML_TYPE_Q2_DPT) {
+        GGML_ASSERT(src0->quant_levels && "Q2_DPT MUL_MAT requires levels (set tensor->quant_levels)");
+        int8_t * d_q2dpt_levels;
+        CUDA_CHECK(cudaGetSymbolAddress((void **)&d_q2dpt_levels, q2dpt_levels_cuda));
+        CUDA_CHECK(cudaMemcpyAsync(d_q2dpt_levels, src0->quant_levels, Q2DPT_N_LEVELS * sizeof(int8_t), cudaMemcpyHostToDevice, stream));
+    }
+
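+    // Note: quant_levels is treated as a host pointer by all of these uploads
+    // (cudaMemcpyHostToDevice), matching the copies in mmq.cu and the setters
+    // in convert.cu.
+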
+    // Set IQ2_TQ per-tensor grid
+    if (src0->type == GGML_TYPE_IQ2_TQ) {
+        GGML_ASSERT(src0->quant_levels && "IQ2_TQ MUL_MAT requires grid (set tensor->quant_levels)");
+        int8_t * d_grid;
+        CUDA_CHECK(cudaGetSymbolAddress((void **)&d_grid, iq2tq_grid_cuda));
+        CUDA_CHECK(cudaMemcpyAsync(d_grid, src0->quant_levels, 64, cudaMemcpyHostToDevice, stream));
+    }
+
+    // Set IQ3_TQ per-tensor grid
+    if (src0->type == GGML_TYPE_IQ3_TQ) {
+        GGML_ASSERT(src0->quant_levels && "IQ3_TQ MUL_MAT requires grid (set tensor->quant_levels)");
+        int8_t * d_grid;
+        CUDA_CHECK(cudaGetSymbolAddress((void **)&d_grid, iq3tq_grid_cuda));
+        CUDA_CHECK(cudaMemcpyAsync(d_grid, src0->quant_levels, 128, cudaMemcpyHostToDevice, stream));
+    }
+
+
+    // Set IQ1_BN per-tensor codebook
+    if (src0->type == GGML_TYPE_IQ1_BN) {
+        GGML_ASSERT(src0->quant_levels && "IQ1_BN MUL_MAT requires codebook (set tensor->quant_levels)");
+        ggml_cuda_set_iq1bn_aux(src0->quant_levels, stream);
+    }
+
     const size_t ts_src0 = ggml_type_size(src0->type);
     const size_t ts_src1 = ggml_type_size(src1->type);
     const size_t ts_dst  = ggml_type_size(dst->type);
diff --git a/ggml/src/ggml-cuda/vecdotq.cuh b/ggml/src/ggml-cuda/vecdotq.cuh
index 40b2b41e7e..429b3ca73e 100644
--- a/ggml/src/ggml-cuda/vecdotq.cuh
+++ b/ggml/src/ggml-cuda/vecdotq.cuh
@@ -1240,6 +1240,194 @@ static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
     return d * sumi;
 }
 
+#define VDR_Q4_DPT_Q8_1_MMVQ 2
+#define VDR_Q4_DPT_Q8_1_MMQ  4
+
+static __device__ __forceinline__ float vec_dot_q4_dpt_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
+
+    const block_q4_dpt * bq4 = (const block_q4_dpt *) vbq + kbx;
+
+    const int * q8 = (const int *) bq8_1->qs + iqs;
+
+    int sumi = 0;
+#pragma unroll
+    for (int l = 0; l < VDR_Q4_DPT_Q8_1_MMVQ; ++l) {
+        const int aux_q4 = get_int_b2(bq4->qs, iqs + l);
+        const int2 v = get_int_from_table_16(aux_q4, q4dpt_levels_cuda);
+
+        sumi = ggml_cuda_dp4a(v.x, q8[l + 0], sumi);
+        sumi = ggml_cuda_dp4a(v.y, q8[l + 4], sumi);
+    }
+
+    const float d = __half2float(bq4->d) * __low2float(bq8_1->ds);
+    return d * sumi;
+}
+
+// Q2_DPT: 2-bit quantization with 4 learned levels
+// Helper: lookup 4 int8 levels using 2-bit indices packed in a 32-bit int
+static __device__ __forceinline__ int4 get_int_from_table_4(const int & q2, const int8_t * table) {
+    int4 result;
+    result.x = table[(q2 >>  0) & 3];
+    result.y = table[(q2 >>  8) & 3];
+    result.z = table[(q2 >> 16) & 3];
+    result.w = table[(q2 >> 24) & 3];
+    return result;
+}
+
+#define VDR_Q2_DPT_Q8_1_MMVQ 4
+#define VDR_Q2_DPT_Q8_1_MMQ  8
+
+static __device__ __forceinline__ float vec_dot_q2_dpt_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
+
+    const block_q2_dpt * bq2 = (const block_q2_dpt *) vbq + kbx;
+
+    const int * q8 = (const int *) bq8_1->qs + iqs;
+
+    int sumi = 0;
+#pragma unroll
+    for (int l = 0; l < VDR_Q2_DPT_Q8_1_MMVQ; ++l) {
+        const int aux_q2 = get_int_b4(bq2->qs, l);
+        const int4 v = get_int_from_table_4(aux_q2, q2dpt_levels_cuda);
+
+        sumi = ggml_cuda_dp4a(v.x, q8[l +  0], sumi);
+        sumi = ggml_cuda_dp4a(v.y, q8[l +  4], sumi);
+        sumi = ggml_cuda_dp4a(v.z, q8[l +  8], sumi);
+        sumi = ggml_cuda_dp4a(v.w, q8[l + 12], sumi);
+    }
+
+    const float d = __half2float(bq2->d) * __low2float(bq8_1->ds);
+    return d * sumi;
+}
+
+// IQ2_TQ: 2-bit with per-tensor trained 16×4 grid table
+// Grid lookup helper: 4 × 2-bit indices packed in a byte → 4 grid values packed as int32
+static __device__
__forceinline__ int iq2tq_grid_lookup4(uint8_t qbyte, const int8_t * grid_entry) { + uint32_t r = (uint32_t)(uint8_t)grid_entry[(qbyte >> 0) & 3]; + r |= (uint32_t)(uint8_t)grid_entry[(qbyte >> 2) & 3] << 8; + r |= (uint32_t)(uint8_t)grid_entry[(qbyte >> 4) & 3] << 16; + r |= (uint32_t)(uint8_t)grid_entry[(qbyte >> 6) & 3] << 24; + return (int)r; +} + +#define VDR_IQ2_TQ_Q8_1_MMVQ 1 +#define VDR_IQ2_TQ_Q8_1_MMQ 1 + +static __device__ __forceinline__ float vec_dot_iq2_tq_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { + + const block_iq2_tq * bq = (const block_iq2_tq *) vbq + kbx; + + // iqs selects which 16-element portion (0..15): 2 groups of 8 elements + const int q8b = iqs / 2; // Q8_1 block index (0..7) + const int q8off = (iqs & 1) * 4; // int32 offset within Q8_1 block (0 or 4) + + // Grid indices for groups iqs*2 and iqs*2+1 + const uint8_t sc = bq->scales[iqs]; + const int8_t * ge0 = iq2tq_grid_cuda + (sc & 0xF) * 4; + const int8_t * ge1 = iq2tq_grid_cuda + (sc >> 4) * 4; + + const uint8_t * qs = bq->qs + iqs * 4; + const int * q8 = (const int *)bq8_1[q8b].qs + q8off; + + int sumi = 0; + + // Group 0: 8 elements = 2 bytes qs, 2 int32 Q8_1 + sumi = ggml_cuda_dp4a(iq2tq_grid_lookup4(qs[0], ge0), q8[0], sumi); + sumi = ggml_cuda_dp4a(iq2tq_grid_lookup4(qs[1], ge0), q8[1], sumi); + + // Group 1: next 8 elements + sumi = ggml_cuda_dp4a(iq2tq_grid_lookup4(qs[2], ge1), q8[2], sumi); + sumi = ggml_cuda_dp4a(iq2tq_grid_lookup4(qs[3], ge1), q8[3], sumi); + + return __half2float(bq->d) * IQ2TQ_GRID_SCALE * __low2float(bq8_1[q8b].ds) * sumi; +} + +// IQ3_TQ: 3-bit with per-tensor trained 16×8 grid table +#define VDR_IQ3_TQ_Q8_1_MMVQ 1 +#define VDR_IQ3_TQ_Q8_1_MMQ 1 + +static __device__ __forceinline__ float vec_dot_iq3_tq_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { + + const block_iq3_tq * bq = (const block_iq3_tq *) vbq + kbx; + + const int q8b = iqs / 2; + const int q8off = (iqs & 1) * 4; + + const uint8_t sc = bq->scales[iqs]; + const int8_t * ge0 = iq3tq_grid_cuda + (sc & 0xF) * 8; + const int8_t * ge1 = iq3tq_grid_cuda + (sc >> 4) * 8; + + const int * q8 = (const int *)bq8_1[q8b].qs + q8off; + + int sumi = 0; + + // Group 0: 8 elements, 3 bytes of qs + { + const uint8_t * qs = bq->qs + (iqs * 2) * 3; + const uint32_t bits = qs[0] | ((uint32_t)qs[1] << 8) | ((uint32_t)qs[2] << 16); + + int v0 = (uint8_t)ge0[(bits >> 0) & 7] | ((uint32_t)(uint8_t)ge0[(bits >> 3) & 7] << 8) + | ((uint32_t)(uint8_t)ge0[(bits >> 6) & 7] << 16) | ((uint32_t)(uint8_t)ge0[(bits >> 9) & 7] << 24); + sumi = ggml_cuda_dp4a(v0, q8[0], sumi); + + int v1 = (uint8_t)ge0[(bits >> 12) & 7] | ((uint32_t)(uint8_t)ge0[(bits >> 15) & 7] << 8) + | ((uint32_t)(uint8_t)ge0[(bits >> 18) & 7] << 16) | ((uint32_t)(uint8_t)ge0[(bits >> 21) & 7] << 24); + sumi = ggml_cuda_dp4a(v1, q8[1], sumi); + } + + // Group 1: next 8 elements, next 3 bytes of qs + { + const uint8_t * qs = bq->qs + (iqs * 2 + 1) * 3; + const uint32_t bits = qs[0] | ((uint32_t)qs[1] << 8) | ((uint32_t)qs[2] << 16); + + int v0 = (uint8_t)ge1[(bits >> 0) & 7] | ((uint32_t)(uint8_t)ge1[(bits >> 3) & 7] << 8) + | ((uint32_t)(uint8_t)ge1[(bits >> 6) & 7] << 16) | ((uint32_t)(uint8_t)ge1[(bits >> 9) & 7] << 24); + sumi = ggml_cuda_dp4a(v0, q8[2], sumi); + + int v1 = (uint8_t)ge1[(bits >> 12) & 7] | ((uint32_t)(uint8_t)ge1[(bits >> 15) & 7] << 8) + | ((uint32_t)(uint8_t)ge1[(bits >> 18) & 7] << 16) | ((uint32_t)(uint8_t)ge1[(bits >> 21) & 
7] << 24); + sumi = ggml_cuda_dp4a(v1, q8[3], sumi); + } + + return __half2float(bq->d) * IQ3TQ_GRID_SCALE * __low2float(bq8_1[q8b].ds) * sumi; +} + + +// IQ1_BN: 8D vector quantized with per-tensor trained 4096-entry codebook +#define VDR_IQ1_BN_Q8_1_MMVQ 1 +#define VDR_IQ1_BN_Q8_1_MMQ 1 + +static __device__ __forceinline__ float vec_dot_iq1_bn_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { + + const block_iq1_bn * bq = (const block_iq1_bn *) vbq + kbx; + + // iqs = 0..15, each thread handles 2 groups (16 elements) + const int q8b = iqs / 2; + const int q8off = (iqs & 1) * 4; + + // Extract two 12-bit codebook indices from qs[3*iqs .. 3*iqs+2] + const uint8_t * qs = bq->qs + 3 * iqs; + const int ci0 = qs[0] | (((int)qs[1] & 0x0F) << 8); + const int ci1 = (qs[1] >> 4) | ((int)qs[2] << 4); + + const int * cb0 = (const int *)(iq1bn_codebook_cuda + ci0 * IQ1BN_CODEBOOK_DIM); + const int * cb1 = (const int *)(iq1bn_codebook_cuda + ci1 * IQ1BN_CODEBOOK_DIM); + + const int * q8 = (const int *)bq8_1[q8b].qs + q8off; + + int sumi = 0; + sumi = ggml_cuda_dp4a(cb0[0], q8[0], sumi); + sumi = ggml_cuda_dp4a(cb0[1], q8[1], sumi); + sumi = ggml_cuda_dp4a(cb1[0], q8[2], sumi); + sumi = ggml_cuda_dp4a(cb1[1], q8[3], sumi); + + return __half2float(bq->d) * IQ1BN_GRID_SCALE * __low2float(bq8_1[q8b].ds) * (float)sumi; +} + #define VDR_IQ4_XS_Q8_1_MMVQ 4 #define VDR_IQ4_XS_Q8_1_MMQ 4 diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 15443aa554..e9edbe4938 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -374,7 +374,8 @@ void quantize_row_nvfp4_ref(const float * GGML_RESTRICT x, block_nvfp4 * GGML_RE } } -void dequantize_row_q1_0(const block_q1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_q1_0(const block_q1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { + GGML_UNUSED(levels); static const int qk = QK1_0; assert(k % qk == 0); @@ -394,7 +395,7 @@ void dequantize_row_q1_0(const block_q1_0 * GGML_RESTRICT x, float * GGML_RESTRI } } -void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { static const int qk = QK4_0; assert(k % qk == 0); @@ -414,7 +415,7 @@ void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRI } } -void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { static const int qk = QK4_1; assert(k % qk == 0); @@ -435,7 +436,7 @@ void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRI } } -void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { static const int qk = QK5_0; assert(k % qk == 0); @@ -461,7 +462,7 @@ void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRI } } -void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { static const int qk = QK5_1; assert(k % qk == 0); @@ -488,7 +489,7 @@ void 
dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRI } } -void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { static const int qk = QK8_0; assert(k % qk == 0); @@ -504,7 +505,7 @@ void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRI } } -void dequantize_row_mxfp4(const block_mxfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_mxfp4(const block_mxfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { static const int qk = QK_MXFP4; assert(k % qk == 0); @@ -524,7 +525,8 @@ void dequantize_row_mxfp4(const block_mxfp4 * GGML_RESTRICT x, float * GGML_REST } } -void dequantize_row_nvfp4(const block_nvfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_nvfp4(const block_nvfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { + GGML_UNUSED(levels); static const int qk = QK_NVFP4; static const int qk_sub = QK_NVFP4_SUB; static const int n_sub = QK_NVFP4 / QK_NVFP4_SUB; @@ -824,6 +826,15 @@ static inline void get_scale_min_k4(int j, const uint8_t * GGML_RESTRICT q, uint } } +// Extract only the scale (not min) from Q4_K-style packed scales +static inline void get_scale_k4_only(int j, const uint8_t * GGML_RESTRICT q, uint8_t * GGML_RESTRICT d) { + if (j < 4) { + *d = q[j] & 63; + } else { + *d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4); + } +} + //========================- 2-bit (de)-quantization void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k) { @@ -896,7 +907,7 @@ void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_REST } } -void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1240,7 +1251,7 @@ void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_REST } } -void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1464,7 +1475,7 @@ void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_REST } } -void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1666,7 +1677,7 @@ void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_REST } } -void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -1874,7 +1885,7 @@ void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_REST } } -void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, 
const void * levels) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2349,7 +2360,7 @@ size_t quantize_tq2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, return nrow * row_size; } -void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2388,7 +2399,7 @@ void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_REST } } -void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2409,7 +2420,7 @@ void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_REST // ====================== "True" 2-bit (de)-quantization -void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2437,7 +2448,7 @@ void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_ // ====================== 2.3125 bpw (de)-quantization -void dequantize_row_iq2_xs(const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_iq2_xs(const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2464,7 +2475,7 @@ void dequantize_row_iq2_xs(const block_iq2_xs * GGML_RESTRICT x, float * GGML_RE // ====================== 2.5625 bpw (de)-quantization -void dequantize_row_iq2_s(const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_iq2_s(const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2496,7 +2507,7 @@ void dequantize_row_iq2_s(const block_iq2_s * GGML_RESTRICT x, float * GGML_REST // ====================== 3.0625 bpw (de)-quantization -void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2528,7 +2539,7 @@ void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_ // ====================== 3.3125 bpw (de)-quantization -void dequantize_row_iq3_s(const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_iq3_s(const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2571,7 +2582,7 @@ void dequantize_row_iq3_s(const block_iq3_s * GGML_RESTRICT x, float * GGML_REST // ====================== 1.5625 bpw (de)-quantization -void dequantize_row_iq1_s(const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_iq1_s(const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2596,7 +2607,7 @@ void dequantize_row_iq1_s(const 
block_iq1_s * GGML_RESTRICT x, float * GGML_REST } } -void dequantize_row_iq1_m(const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_iq1_m(const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2646,7 +2657,7 @@ void dequantize_row_iq1_m(const block_iq1_m * GGML_RESTRICT x, float * GGML_REST } } -void dequantize_row_iq4_nl(const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_iq4_nl(const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { assert(k % QK4_NL == 0); const int64_t nb = k / QK4_NL; @@ -2664,7 +2675,7 @@ void dequantize_row_iq4_nl(const block_iq4_nl * GGML_RESTRICT x, float * GGML_RE } } -void dequantize_row_iq4_xs(const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_iq4_xs(const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2728,7 +2739,7 @@ void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_REST } } -void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { +void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -4205,6 +4216,4045 @@ void quantize_row_iq3_s_ref(const float * GGML_RESTRICT x, block_iq3_s * GGML_RE quantize_iq3_s(x, y, 1, k, NULL); } +// ====================== Q3_KPT: Q3_K with learned per-tensor levels ====================== +// +// Block format: Identical to block_q3_K (110 bytes per QK_K=256 elements) +// hmask[QK_K/8] : high bit for 3-bit indices +// qs[QK_K/4] : low 2 bits for 3-bit indices +// scales[12] : 6-bit quantized scales +// d : super-block scale +// +// The difference from Q3_K: instead of q ∈ {-4,-3,-2,-1,0,1,2,3}, +// we use learned levels L[0..7] and compute: x = d * sc * (L[k] - 4) +// where k is the 3-bit index. +// +// Per-tensor: 8 float32 "levels" in [0,1] from Lloyd-Max training. +// Stored in GGUF as "q3_kpt.levels" (float32 array). + +static float q3kpt_levels[Q3KPT_N_LEVELS]; +static bool q3kpt_levels_set = false; + +GGML_API void q3kpt_set_levels(const float * levels) { + memcpy(q3kpt_levels, levels, Q3KPT_N_LEVELS * sizeof(float)); + q3kpt_levels_set = true; +} + +GGML_API const float * q3kpt_get_levels(void) { + return q3kpt_levels_set ? 
q3kpt_levels : NULL;
+}
+
+GGML_API void q3kpt_free_levels(void) {
+    q3kpt_levels_set = false;
+}
+
+
+// Train levels in the symmetric quantization space
+GGML_API void q3kpt_train_levels(const float * data,
+                                 int64_t nrow,
+                                 int64_t n_per_row,
+                                 const float * imatrix,
+                                 float levels_out[Q3KPT_N_LEVELS]) {
+    // Binning parameters
+    const int N_BINS = 8192;
+    const float bin_width = 1.0f / N_BINS;
+    float * bin_sum_w  = (float *) calloc(N_BINS, sizeof(float));
+    float * bin_sum_wt = (float *) calloc(N_BINS, sizeof(float));
+    GGML_ASSERT(bin_sum_w && bin_sum_wt);
+
+    const int nb = (int) (n_per_row / QK_K);
+
+    // Single pass: use simple max_abs/4 scale estimation per sub-block, then bin
+    for (int64_t row = 0; row < nrow; ++row) {
+        const float * xrow = data + row * n_per_row;
+
+        for (int i = 0; i < nb; i++) {
+            const float * x = xrow + i * QK_K;
+
+            for (int j = 0; j < QK_K / 16; ++j) {
+                // Simple symmetric scale: max_abs / 4
+                float amax = 0;
+                for (int l = 0; l < 16; ++l) {
+                    float ax = fabsf(x[16 * j + l]);
+                    if (ax > amax) {
+                        amax = ax;
+                    }
+                }
+                if (amax < 1e-10f) {
+                    continue;
+                }
+
+                float d = amax / 4.0f;
+                float inv_d = 1.0f / d;
+
+                for (int l = 0; l < 16; ++l) {
+                    float val = x[16 * j + l] * inv_d;
+                    // Map from [-4, 3] symmetric space to [0, 1]
+                    float t = (val + 4.0f) / 7.0f;
+
+                    if (t < 0.0f) {
+                        t = 0.0f;
+                    }
+                    if (t > 1.0f) {
+                        t = 1.0f;
+                    }
+
+                    int bin_idx = (int) (t * N_BINS);
+                    if (bin_idx >= N_BINS) {
+                        bin_idx = N_BINS - 1;
+                    }
+
+                    int elem = i * QK_K + 16 * j + l;
+                    float w = imatrix ? imatrix[elem] : 1.0f;
+                    if (w < 1e-10f) {
+                        w = 1e-10f;
+                    }
+                    w *= d * d;
+
+                    bin_sum_w[bin_idx]  += w;
+                    bin_sum_wt[bin_idx] += w * t;
+                }
+            }
+        }
+    }
+
+    // Initialize 8 levels uniformly in [0, 1]
+    float levels[Q3KPT_N_LEVELS];
+    for (int k = 0; k < Q3KPT_N_LEVELS; ++k) {
+        levels[k] = (float) k / (Q3KPT_N_LEVELS - 1);
+    }
+
+    // Lloyd-Max iterations on bins
+    for (int iter = 0; iter < 100; ++iter) {
+        float sum_w[Q3KPT_N_LEVELS]  = { 0 };
+        float sum_wt[Q3KPT_N_LEVELS] = { 0 };
+
+        for (int b = 0; b < N_BINS; ++b) {
+            if (bin_sum_w[b] < 1e-12f) {
+                continue;
+            }
+            const float t = (b + 0.5f) * bin_width;
+            int best = 0;
+            float best_d2 = (t - levels[0]) * (t - levels[0]);
+            for (int k = 1; k < Q3KPT_N_LEVELS; ++k) {
+                float d2 = (t - levels[k]) * (t - levels[k]);
+                if (d2 < best_d2) {
+                    best_d2 = d2;
+                    best = k;
+                }
+            }
+            sum_w[best]  += bin_sum_w[b];
+            sum_wt[best] += bin_sum_wt[b];
+        }
+
+        float max_delta = 0.0f;
+        for (int k = 0; k < Q3KPT_N_LEVELS; ++k) {
+            if (sum_w[k] > 1e-12f) {
+                float new_level = sum_wt[k] / sum_w[k];
+                max_delta = fmaxf(max_delta, fabsf(new_level - levels[k]));
+                levels[k] = new_level;
+            }
+        }
+        if (max_delta < 1e-10f) {
+            break;
+        }
+
+        for (int k = 1; k < Q3KPT_N_LEVELS; ++k) {
+            float v = levels[k];
+            int m = k - 1;
+            while (m >= 0 && levels[m] > v) {
+                levels[m + 1] = levels[m];
+                m--;
+            }
+            levels[m + 1] = v;
+        }
+    }
+
+    memcpy(levels_out, levels, Q3KPT_N_LEVELS * sizeof(float));
+    q3kpt_set_levels(levels);
+    free(bin_sum_w);
+    free(bin_sum_wt);
+}
+
+void dequantize_row_q3_kpt(const block_q3_kpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) {
+    assert(k % QK_K == 0);
+    const int64_t nb = k / QK_K;
+    const float * lv = (const float *)levels;
+    GGML_ASSERT(lv != NULL && "Q3_KPT levels not set for tensor");
+
+    // levels are in [0,1]; dequantization maps them onto [-4, 3]:
+    //   y = d * sc * (lv[k] * 7 - 4)
+    // Uniform levels lv[k] = k/7 would reproduce plain Q3_K (integers -4..3).
+
+    const uint32_t kmask1 = 0x03030303;
+    const uint32_t kmask2 = 0x0f0f0f0f;
+
+    for (int i = 0; i < nb; i++) {
+        const float d_all = GGML_FP16_TO_FP32(x[i].d);
+        const uint8_t * q  = x[i].qs;
+        const uint8_t * hm = x[i].hmask;
+        uint8_t m = 1;
+
+        uint32_t aux32[4];
+        memcpy(aux32, x[i].scales, 12);
+        uint32_t tmp = aux32[2];
+        aux32[2] = ((aux32[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
+        aux32[3] = ((aux32[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
+        aux32[0] = (aux32[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
+        aux32[1] = (aux32[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
+        const uint8_t * aux = (const uint8_t *) aux32;
+
+        int is = 0;
+        for (int n = 0; n < QK_K; n += 128) {
+            int shift = 0;
+            for (int j = 0; j < 4; ++j) {
+                int sc1 = (int) aux[is]     - 32;
+                int sc2 = (int) aux[is + 1] - 32;
+                is += 2;
+                float dl1 = d_all * sc1;
+                float dl2 = d_all * sc2;
+
+                for (int l = 0; l < 16; ++l) {
+                    int k_idx = ((q[l +  0] >> shift) & 3) + ((hm[l +  0] & m) ? 4 : 0);
+                    y[l +  0] = dl1 * (lv[k_idx] * 7.0f - 4.0f);
+                }
+                for (int l = 0; l < 16; ++l) {
+                    int k_idx = ((q[l + 16] >> shift) & 3) + ((hm[l + 16] & m) ? 4 : 0);
+                    y[l + 16] = dl2 * (lv[k_idx] * 7.0f - 4.0f);
+                }
+                y += 32;
+                shift += 2;
+                m <<= 1;
+            }
+            q += 32;
+        }
+    }
+}
+
+// Helper: find optimal symmetric scale for non-uniform mapped levels.
+// Closely mirrors make_qx_quants but uses nearest-mapped-level assignment
+// instead of rounding to nearest integer.
+// mapped_levels[k] = levels[k]*7 - 4, k=0..7.
+// Returns the per-sub-block scale d such that x[i] ≈ d * ml[L[i]].
+// L[i] gets the best level index [0..7].
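+//
+// Illustrative example (hypothetical levels, not from any trained tensor):
+// with uniform levels {0, 1/7, 2/7, ..., 1}, ml[] becomes {-4,-3,...,3} and the
+// nearest-level search below degenerates to plain Q3_K rounding; trained levels
+// merely move the 7 decision boundaries bounds[k] = (ml[k]+ml[k+1])/2, so the
+// index is still found branchlessly as the count of boundaries below x[i]*iscale.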
+static float make_q3kpt_quants(int n, + const float * GGML_RESTRICT x, + int8_t * GGML_RESTRICT L, + const float * GGML_RESTRICT weight, + const float * mapped_levels) { + // Find the most negative and most positive mapped levels + float ml_neg = mapped_levels[0], ml_pos = mapped_levels[Q3KPT_N_LEVELS - 1]; + + // Precompute boundaries for branchless nearest-level search + float bounds[Q3KPT_N_LEVELS - 1]; + for (int k = 0; k < Q3KPT_N_LEVELS - 1; ++k) { + bounds[k] = 0.5f * (mapped_levels[k] + mapped_levels[k + 1]); + } + + // Find max absolute value in data (and its sign) + float max = 0, amax = 0; + for (int i = 0; i < n; ++i) { + float ax = fabsf(x[i]); + if (ax > amax) { + amax = ax; + max = x[i]; + } + } + if (amax < GROUP_MAX_EPS) { + // Find level closest to 0 + int zero_k = 0; + float zero_d = fabsf(mapped_levels[0]); + for (int k = 1; k < Q3KPT_N_LEVELS; ++k) { + if (fabsf(mapped_levels[k]) < zero_d) { + zero_d = fabsf(mapped_levels[k]); + zero_k = k; + } + } + for (int i = 0; i < n; ++i) { + L[i] = zero_k; + } + return 0.f; + } + + float best_scale = 0; + float best_obj = 0; + bool first = true; + + for (int is = -15; is <= 15; ++is) { + float iscales[2] = { + -(fabsf(ml_neg) + 0.1f * is) / max, // map max to ml_neg (Q3_K style) + (fabsf(ml_pos) + 0.1f * is) / max // map max to ml_pos + }; + + for (int opt = 0; opt < 2; ++opt) { + float iscale = iscales[opt]; + + float sumlx = 0, suml2 = 0; + for (int i = 0; i < n; ++i) { + float scaled = x[i] * iscale; + // Branchless nearest level assignment + int best_k = (scaled > bounds[0]) + (scaled > bounds[1]) + (scaled > bounds[2]) + + (scaled > bounds[3]) + (scaled > bounds[4]) + (scaled > bounds[5]) + (scaled > bounds[6]); + float w = weight ? weight[i] : x[i] * x[i]; + sumlx += w * x[i] * mapped_levels[best_k]; + suml2 += w * mapped_levels[best_k] * mapped_levels[best_k]; + } + + if (suml2 > 0 && (first || sumlx * sumlx > best_obj * suml2)) { + float scale = sumlx / suml2; + best_obj = scale * sumlx; + best_scale = scale; + first = false; + // Re-assign L with this iscale + for (int i = 0; i < n; ++i) { + float scaled = x[i] * iscale; + int best_k = (scaled > bounds[0]) + (scaled > bounds[1]) + (scaled > bounds[2]) + + (scaled > bounds[3]) + (scaled > bounds[4]) + (scaled > bounds[5]) + (scaled > bounds[6]); + L[i] = best_k; + } + } + } + } + return best_scale; +} + +static void quantize_row_q3_kpt_impl(const float * GGML_RESTRICT x, + block_q3_kpt * GGML_RESTRICT y, + int64_t n_per_row, + const float * GGML_RESTRICT quant_weights) { + assert(n_per_row % QK_K == 0); + const int nb = n_per_row / QK_K; + const float * levels = q3kpt_get_levels(); + GGML_ASSERT(levels != NULL && "Q3_KPT levels not set - call q3kpt_set_levels() first"); + + // Precompute mapped levels: ml[k] = levels[k] * 7 - 4 + float mapped_levels[Q3KPT_N_LEVELS]; + for (int k = 0; k < Q3KPT_N_LEVELS; ++k) { + mapped_levels[k] = levels[k] * 7.0f - 4.0f; + } + + // Precompute boundaries for branchless nearest-level search + float bounds[Q3KPT_N_LEVELS - 1]; + for (int k = 0; k < Q3KPT_N_LEVELS - 1; ++k) { + bounds[k] = 0.5f * (mapped_levels[k] + mapped_levels[k + 1]); + } + + int8_t L[QK_K]; + float scales[QK_K / 16]; + float weight[16]; + float sw[QK_K / 16]; + int8_t Ls[QK_K / 16]; + + for (int i = 0; i < nb; i++) { + float sumx2 = 0; + for (int j = 0; j < QK_K; ++j) { + sumx2 += x[j] * x[j]; + } + float sigma2 = 2 * sumx2 / QK_K; + + // First pass: find per-sub-block scales optimized for mapped levels + for (int j = 0; j < QK_K / 16; ++j) { + if (quant_weights) { + 
const float * qw = quant_weights + QK_K * i + 16 * j; + for (int l = 0; l < 16; ++l) { + weight[l] = qw[l] * sqrtf(sigma2 + x[16 * j + l] * x[16 * j + l]); + } + } else { + for (int l = 0; l < 16; ++l) { + weight[l] = x[16 * j + l] * x[16 * j + l]; + } + } + float sumw = 0; + for (int l = 0; l < 16; ++l) { + sumw += weight[l]; + } + sw[j] = sumw; + + scales[j] = make_q3kpt_quants(16, x + 16 * j, L + 16 * j, weight, mapped_levels); + } + + // Two-tier scale quantization (identical to Q3_K) + memset(y[i].scales, 0, 12); + float d_block = make_qx_quants(QK_K / 16, 32, scales, Ls, 1, sw); + for (int j = 0; j < QK_K / 16; ++j) { + int l = Ls[j]; + if (j < 8) { + y[i].scales[j] = l & 0xF; + } else { + y[i].scales[j - 8] |= ((l & 0xF) << 4); + } + l >>= 4; + y[i].scales[j % 4 + 8] |= (l << (2 * (j / 4))); + } + y[i].d = GGML_FP32_TO_FP16(d_block); + + // Second pass: level assignment using the quantized scales but + // assigning nearest LEARNED LEVEL instead of nearest integer + int8_t sc; + for (int j = 0; j < QK_K / 16; ++j) { + sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j - 8] >> 4; + sc = (sc | (((y[i].scales[8 + j % 4] >> (2 * (j / 4))) & 3) << 4)) - 32; + float d = GGML_FP16_TO_FP32(y[i].d) * sc; + if (!d) { + // Find level closest to 0 for zero-scale sub-blocks + int zero_k = 0; + float zero_dist = fabsf(mapped_levels[0]); + for (int k = 1; k < Q3KPT_N_LEVELS; ++k) { + if (fabsf(mapped_levels[k]) < zero_dist) { + zero_dist = fabsf(mapped_levels[k]); + zero_k = k; + } + } + for (int ii = 0; ii < 16; ++ii) { + L[16 * j + ii] = zero_k; + } + continue; + } + for (int ii = 0; ii < 16; ++ii) { + float scaled = x[16 * j + ii] / d; + // Branchless nearest level assignment + int best_k = (scaled > bounds[0]) + (scaled > bounds[1]) + (scaled > bounds[2]) + + (scaled > bounds[3]) + (scaled > bounds[4]) + (scaled > bounds[5]) + (scaled > bounds[6]); + L[16 * j + ii] = best_k; + } + } + + // Pack level indices (same bit layout as Q3_K) + memset(y[i].hmask, 0, QK_K / 8); + int m = 0; + uint8_t hm = 1; + for (int j = 0; j < QK_K; ++j) { + if (L[j] > 3) { + y[i].hmask[m] |= hm; + L[j] -= 4; + } + if (++m == QK_K / 8) { + m = 0; + hm <<= 1; + } + } + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) { + y[i].qs[j / 4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6); + } + } + x += QK_K; + } +} + +size_t quantize_q3_kpt(const float * GGML_RESTRICT src, + void * GGML_RESTRICT dst, + int64_t nrow, + int64_t n_per_row, + const float * imatrix) { + size_t row_size = ggml_row_size(GGML_TYPE_Q3_KPT, n_per_row); + char * qrow = (char *) dst; + for (int64_t row = 0; row < nrow; ++row) { + quantize_row_q3_kpt_impl(src, (block_q3_kpt *) qrow, n_per_row, imatrix); + src += n_per_row; + qrow += row_size; + } + return nrow * row_size; +} + +void quantize_row_q3_kpt_ref(const float * GGML_RESTRICT x, block_q3_kpt * GGML_RESTRICT y, int64_t k) { + assert(k % QK_K == 0); + quantize_q3_kpt(x, y, 1, k, NULL); +} + +// Forward declaration needed since quantize_row_iq4_nl_impl is defined later in this file. 
+static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, + const float * GGML_RESTRICT x, + ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l, + float * scales, float * weight, uint8_t * L, + const int8_t * values, + const float * quant_weights, + const int ntry); + +// ====================== Q4_DPT: IQ4_NL with learned per-tensor int8 levels ====================== +// +// Block format: identical to block_iq4_nl (18 bytes per QK4_NL=32 elements) +// d : ggml_half — per-block scale +// qs : QK4_NL/2 bytes — 4-bit indices into the 16-entry level table +// +// The difference from IQ4_NL: instead of the fixed kvalues_iq4nl int8 table, +// we use 16 int8 levels learned per-tensor via weighted Lloyd-Max k-means. +// Normalization: symmetric (x/amax), bin domain [-1, 1]. +// Levels stored in GGUF as "q4_dpt.levels" (int8 array, 16 values per tensor). + +static int8_t q4dpt_levels[Q4DPT_N_LEVELS]; +static bool q4dpt_levels_set = false; + +void q4dpt_set_levels(const int8_t * levels) { + memcpy(q4dpt_levels, levels, Q4DPT_N_LEVELS * sizeof(int8_t)); + q4dpt_levels_set = true; +} + +const int8_t * q4dpt_get_levels(void) { + return q4dpt_levels_set ? q4dpt_levels : NULL; +} + +void q4dpt_free_levels(void) { + q4dpt_levels_set = false; +} + + +// Run Lloyd-Max iterations on a pre-built histogram. +// levels[] (n_levels entries) is updated in-place (and kept sorted). +static void q4dpt_run_lloyd_max(const float * bin_sum_w, const float * bin_sum_wt, + float * levels, int n_levels, int n_bins, float bin_width, int max_iter) { + // sw/swt sized for the max possible n_levels (Q4DPT_N_LEVELS) + float sw[Q4DPT_N_LEVELS] = { 0 }; + float swt[Q4DPT_N_LEVELS] = { 0 }; + for (int iter = 0; iter < max_iter; ++iter) { + for (int k = 0; k < n_levels; ++k) { sw[k] = 0; swt[k] = 0; } + for (int b = 0; b < n_bins; ++b) { + if (bin_sum_w[b] < 1e-12f) { continue; } + float t = -1.0f + (b + 0.5f) * bin_width; + int best = 0; + float bd = (t - levels[0]) * (t - levels[0]); + for (int k = 1; k < n_levels; ++k) { + float d = (t - levels[k]) * (t - levels[k]); + if (d < bd) { bd = d; best = k; } + } + sw[best] += bin_sum_w[b]; + swt[best] += bin_sum_wt[b]; + } + float max_delta = 0.0f; + for (int k = 0; k < n_levels; ++k) { + if (sw[k] > 1e-12f) { + float nl = swt[k] / sw[k]; + max_delta = fmaxf(max_delta, fabsf(nl - levels[k])); + levels[k] = nl; + } + } + if (max_delta < 1e-10f) { break; } + for (int k = 1; k < n_levels; ++k) { + float v = levels[k]; + int m = k - 1; + while (m >= 0 && levels[m] > v) { levels[m+1] = levels[m]; m--; } + levels[m+1] = v; + } + } +} + +// Train 16 Lloyd-Max int8 levels. +// Bins x/amax values from 32-element IQ4_NL-style blocks into [-1,1], +// runs weighted k-means (seeded from IQ4_NL values), then rounds float +// centroids to sorted int8[16] with post-rounding local search. 
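+// Note on the histogram weighting below (a restatement, not new behavior):
+// each element lands at t = x/amax with weight w = w_imatrix * amax^2, so
+//   sum w * (t - level)^2  ==  sum w_imatrix * (x - amax*level)^2,
+// i.e. minimizing error in the normalized [-1,1] domain is equivalent to
+// minimizing the importance-weighted error in the original domain.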
+void q4dpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row, + const float * imatrix, int8_t levels_out[Q4DPT_N_LEVELS]) { + const int N_BINS = 8192; + const float bin_width = 2.0f / N_BINS; + float * bin_sum_w = (float *) calloc(N_BINS, sizeof(float)); + float * bin_sum_wt = (float *) calloc(N_BINS, sizeof(float)); + GGML_ASSERT(bin_sum_w && bin_sum_wt); + + const int64_t n_blocks = n_per_row / QK4_NL; + + // Build weighted histogram: normalize each block by amax, bin into [-1, 1] + for (int64_t row = 0; row < nrow; ++row) { + const float * xrow = data + row * n_per_row; + for (int64_t ib = 0; ib < n_blocks; ++ib) { + const float * xb = xrow + ib * QK4_NL; + float amax = 0.0f; + for (int j = 0; j < QK4_NL; ++j) { + float ax = fabsf(xb[j]); + if (ax > amax) { amax = ax; } + } + if (amax < 1e-10f) { continue; } + const float inv_amax = 1.0f / amax; + for (int j = 0; j < QK4_NL; ++j) { + float w = 1.0f; + if (imatrix) { + w = imatrix[ib * QK4_NL + j]; + if (w < 1e-10f) { w = 1e-10f; } + } + w *= amax * amax; + float t = xb[j] * inv_amax; + int bin_idx = (int)((t + 1.0f) * 0.5f * N_BINS); + if (bin_idx < 0) { bin_idx = 0; } + if (bin_idx >= N_BINS) { bin_idx = N_BINS - 1; } + bin_sum_w[bin_idx] += w; + bin_sum_wt[bin_idx] += w * t; + } + } + } + + // Initialize from IQ4_NL values normalized to [-1, 1], then run Lloyd-Max + float best_levels[Q4DPT_N_LEVELS]; + for (int k = 0; k < Q4DPT_N_LEVELS; ++k) { + best_levels[k] = (float)kvalues_iq4nl[k] / 127.0f; + } + q4dpt_run_lloyd_max(bin_sum_w, bin_sum_wt, best_levels, Q4DPT_N_LEVELS, N_BINS, bin_width, 500); + + // Round float centroids to int8, preserve sort order + int8_t levels_i8[Q4DPT_N_LEVELS]; + for (int k = 0; k < Q4DPT_N_LEVELS; ++k) { + int v = (int)roundf(best_levels[k] * 127.0f); + if (v < -128) { v = -128; } + if (v > 127) { v = 127; } + levels_i8[k] = (int8_t)v; + } + + // Post-rounding local search: try ±1 adjustments to each level greedily. + // The int8 rounding can introduce sub-optimal level placement; this + // hill-climbing on discrete int8 values often recovers a better solution. + for (int pass = 0; pass < 10; ++pass) { + int improved = 0; + for (int k = 0; k < Q4DPT_N_LEVELS; ++k) { + // Evaluate current histogram MSE with int8 levels + float cur_levels[Q4DPT_N_LEVELS]; + for (int i = 0; i < Q4DPT_N_LEVELS; ++i) { + cur_levels[i] = (float)levels_i8[i] / 127.0f; + } + float cur_mse = 0.0f; + for (int b = 0; b < N_BINS; ++b) { + if (bin_sum_w[b] < 1e-12f) { continue; } + float t = -1.0f + (b + 0.5f) * bin_width; + float bd = (t - cur_levels[0]) * (t - cur_levels[0]); + for (int i = 1; i < Q4DPT_N_LEVELS; ++i) { + float d = (t - cur_levels[i]) * (t - cur_levels[i]); + if (d < bd) { bd = d; } + } + cur_mse += bin_sum_w[b] * bd; + } + + int8_t best_val = levels_i8[k]; + int8_t lo = (k > 0) ? (int8_t)(levels_i8[k-1] + 1) : -128; + int8_t hi = (k < Q4DPT_N_LEVELS - 1) ? 
(int8_t)(levels_i8[k+1] - 1) : 127; + for (int delta = -1; delta <= 1; delta += 2) { + int8_t nv = (int8_t)(levels_i8[k] + delta); + if (nv < lo || nv > hi) { continue; } + cur_levels[k] = (float)nv / 127.0f; + float test_mse = 0.0f; + for (int b = 0; b < N_BINS; ++b) { + if (bin_sum_w[b] < 1e-12f) { continue; } + float t = -1.0f + (b + 0.5f) * bin_width; + float bd = (t - cur_levels[0]) * (t - cur_levels[0]); + for (int i = 1; i < Q4DPT_N_LEVELS; ++i) { + float d = (t - cur_levels[i]) * (t - cur_levels[i]); + if (d < bd) { bd = d; } + } + test_mse += bin_sum_w[b] * bd; + } + if (test_mse < cur_mse) { + best_val = nv; + cur_mse = test_mse; + improved = 1; + } + cur_levels[k] = (float)levels_i8[k] / 127.0f; // restore + } + levels_i8[k] = best_val; + } + if (!improved) { break; } + } + + memcpy(levels_out, levels_i8, Q4DPT_N_LEVELS * sizeof(int8_t)); + + q4dpt_set_levels(levels_out); + + free(bin_sum_w); + free(bin_sum_wt); +} + +void dequantize_row_q4_dpt(const block_q4_dpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { + assert(k % QK4_NL == 0); + const int64_t nb = k / QK4_NL; + const int8_t * values = (const int8_t *)levels; + GGML_ASSERT(values != NULL && "Q4_DPT levels not set for tensor"); + + for (int i = 0; i < nb; i++) { + const uint8_t * qs = x[i].qs; + const float d = GGML_FP16_TO_FP32(x[i].d); + for (int j = 0; j < QK4_NL/2; ++j) { + y[j] = d * (float)values[qs[j] & 0xf]; + y[j + QK4_NL/2] = d * (float)values[qs[j] >> 4]; + } + y += QK4_NL; + } +} + +// Quantize one 32-element block using int8 levels and optimal per-block scale. +// IQ4_NL-style scale perturbation with negative-scale support and final re-assignment. +static void quantize_block_q4_dpt(const float * GGML_RESTRICT xb, block_q4_dpt * GGML_RESTRICT out, + const int8_t * values, const float * qw, int ntry) { + float amax = 0.0f, max_val = 0.0f; + for (int j = 0; j < QK4_NL; ++j) { + float ax = fabsf(xb[j]); + if (ax > amax) { amax = ax; max_val = xb[j]; } + } + if (amax < 1e-10f) { + out->d = 0; + memset(out->qs, 0, QK4_NL/2); + return; + } + + // Initial scale: d = -max/values[0] (allows negative d for asymmetric levels) + float d = ntry > 0 ? -max_val / (float)values[0] : max_val / (float)values[0]; + float id = (fabsf(d) > 1e-20f) ? 1.0f / d : 0.0f; + + // Initial assignment + optimal scale via least-squares + uint8_t L[QK4_NL]; + float sumqx = 0.0f, sumq2 = 0.0f; + for (int j = 0; j < QK4_NL; ++j) { + float al = id * xb[j]; + int bk = 0; + float bd = fabsf(al - (float)values[0]); + for (int k = 1; k < Q4DPT_N_LEVELS; ++k) { + float dist = fabsf(al - (float)values[k]); + if (dist < bd) { bd = dist; bk = k; } + } + L[j] = (uint8_t)bk; + float q = (float)values[bk]; + float w = qw ? qw[j] : 1.0f; + sumqx += w * q * xb[j]; + sumq2 += w * q * q; + } + d = (sumq2 > 1e-20f) ? sumqx / sumq2 : d; + float best = d * sumqx; + uint8_t best_L[QK4_NL]; + memcpy(best_L, L, QK4_NL); + float best_d = d; + + // Scale perturbation: id = (itry + values[0]) / max_val (IQ4_NL-style) + for (int itry = -ntry; itry <= ntry; ++itry) { + id = ((float)itry + (float)values[0]) / max_val; + sumqx = sumq2 = 0.0f; + for (int j = 0; j < QK4_NL; ++j) { + float al = id * xb[j]; + int bk = 0; + float bd = fabsf(al - (float)values[0]); + for (int k = 1; k < Q4DPT_N_LEVELS; ++k) { + float dist = fabsf(al - (float)values[k]); + if (dist < bd) { bd = dist; bk = k; } + } + L[j] = (uint8_t)bk; + float q = (float)values[bk]; + float w = qw ? 
qw[j] : 1.0f; + sumqx += w * q * xb[j]; + sumq2 += w * q * q; + } + if (sumq2 > 0.0f && sumqx * sumqx > best * sumq2) { + d = sumqx / sumq2; + best = d * sumqx; + best_d = d; + memcpy(best_L, L, QK4_NL); + } + } + + // Final re-assignment using the best scale + id = (fabsf(best_d) > 1e-20f) ? 1.0f / best_d : 0.0f; + for (int j = 0; j < QK4_NL; ++j) { + float al = id * xb[j]; + int bk = 0; + float bd = fabsf(al - (float)values[0]); + for (int k = 1; k < Q4DPT_N_LEVELS; ++k) { + float dist = fabsf(al - (float)values[k]); + if (dist < bd) { bd = dist; bk = k; } + } + best_L[j] = (uint8_t)bk; + } + + out->d = GGML_FP32_TO_FP16(best_d); + for (int j = 0; j < QK4_NL/2; ++j) { + out->qs[j] = best_L[j] | (best_L[j + QK4_NL/2] << 4); + } +} + +size_t quantize_q4_dpt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, + int64_t nrow, int64_t n_per_row, const float * quant_weights) { + GGML_ASSERT(n_per_row % QK4_NL == 0); + const int8_t * values = q4dpt_get_levels(); + GGML_ASSERT(values != NULL && "Q4_DPT levels not set - call q4dpt_set_levels() first"); + + const int64_t nblock = n_per_row / QK4_NL; + char * qrow = (char *) dst; + + for (int64_t row = 0; row < nrow; ++row) { + block_q4_dpt * q4 = (block_q4_dpt *) qrow; + for (int64_t ibl = 0; ibl < nblock; ++ibl) { + const float * qw = quant_weights ? quant_weights + QK4_NL * ibl : NULL; + quantize_block_q4_dpt(src + QK4_NL * ibl, &q4[ibl], values, qw, 15); + } + src += n_per_row; + qrow += nblock * sizeof(block_q4_dpt); + } + return (size_t) nrow * nblock * sizeof(block_q4_dpt); +} + +void quantize_row_q4_dpt_ref(const float * GGML_RESTRICT x, block_q4_dpt * GGML_RESTRICT y, int64_t k) { + assert(k % QK4_NL == 0); + quantize_q4_dpt(x, y, 1, k, NULL); +} + +//////////////////////////////////////////////////////////////////////////////// +// Q2_DPT - 2-bit per-tensor Lloyd-Max quantization (2.5 bpw) +//////////////////////////////////////////////////////////////////////////////// + +// Global levels (used during quantization for the current tensor) +static int8_t q2dpt_levels[Q2DPT_N_LEVELS]; +static bool q2dpt_levels_set = false; + +void q2dpt_set_levels(const int8_t * levels) { + memcpy(q2dpt_levels, levels, Q2DPT_N_LEVELS * sizeof(int8_t)); + q2dpt_levels_set = true; +} + +const int8_t * q2dpt_get_levels(void) { + return q2dpt_levels_set ? q2dpt_levels : NULL; +} + +void q2dpt_free_levels(void) { + q2dpt_levels_set = false; +} + +// Lloyd-Max k-means for Q2_DPT: train 4 int8 levels from weight data. +// Bins normalized values (x/amax) in [-1,1], runs weighted k-means, rounds to sorted int8[4]. +// Also sets the global levels via q2dpt_set_levels(). +void q2dpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row, + const float * imatrix, int8_t levels_out[Q2DPT_N_LEVELS]) { + GGML_ASSERT(nrow * n_per_row > 0); + GGML_ASSERT(n_per_row % QK2_DPT == 0); + + const int N_BINS = 8192; + const float bin_width = 2.0f / N_BINS; + + // Allocate and clear histogram buffers + float * bin_sum_w = (float *) calloc(N_BINS, sizeof(float)); + float * bin_sum_wt = (float *) calloc(N_BINS, sizeof(float)); + GGML_ASSERT(bin_sum_w && bin_sum_wt); + + // Build histogram: bin normalized values (x/amax), weight by amax^2 + for (int64_t row = 0; row < nrow; ++row) { + const float * row_data = data + row * n_per_row; + const float * row_w = imatrix ? imatrix + row * n_per_row : NULL; + + for (int64_t ibl = 0; ibl < n_per_row / QK2_DPT; ++ibl) { + const float * block = row_data + ibl * QK2_DPT; + const float * w = row_w ? 
row_w + ibl * QK2_DPT : NULL;
+
+            // Find max abs in block
+            float amax = 0.0f;
+            for (int j = 0; j < QK2_DPT; ++j) {
+                float ax = fabsf(block[j]);
+                if (ax > amax) amax = ax;
+            }
+            if (amax < 1e-10f) continue;
+
+            // Bin normalized values
+            for (int j = 0; j < QK2_DPT; ++j) {
+                float x = block[j] / amax;
+                float wt = amax * amax * (w ? w[j] : 1.0f);
+                int bin = (int)((x + 1.0f) / bin_width);
+                bin = (bin < 0) ? 0 : (bin >= N_BINS) ? N_BINS - 1 : bin;
+                bin_sum_w[bin]  += wt;
+                bin_sum_wt[bin] += x * wt;
+            }
+        }
+    }
+
+    // Initialize from Q4_DPT levels (subsample to 4 levels)
+    const int8_t * q4dpt_init = q4dpt_get_levels();
+    float best_levels[Q2DPT_N_LEVELS];
+    if (q4dpt_init) {
+        // Subsample Q4_DPT's 16 levels to 4 levels
+        best_levels[0] = (float)q4dpt_init[0]  / 127.0f;
+        best_levels[1] = (float)q4dpt_init[5]  / 127.0f;
+        best_levels[2] = (float)q4dpt_init[10] / 127.0f;
+        best_levels[3] = (float)q4dpt_init[15] / 127.0f;
+    } else {
+        // Fallback: uniform asymmetric initialization
+        best_levels[0] = -1.0f;
+        best_levels[1] = -0.33f;
+        best_levels[2] =  0.33f;
+        best_levels[3] =  1.0f;
+    }
+
+    // Run Lloyd-Max iterations
+    q4dpt_run_lloyd_max(bin_sum_w, bin_sum_wt, best_levels, Q2DPT_N_LEVELS, N_BINS, bin_width, 500);
+
+    // Round to int8 and enforce sorted order
+    int8_t levels_i8[Q2DPT_N_LEVELS];
+    for (int k = 0; k < Q2DPT_N_LEVELS; ++k) {
+        int v = (int)roundf(best_levels[k] * 127.0f);
+        if (v < -128) v = -128;
+        if (v > 127) v = 127;
+        levels_i8[k] = (int8_t)v;
+    }
+
+    // Greedy local search: try +/-1 adjustments.
+    // Score the baseline from the *rounded* levels, so each trial move is
+    // compared against the state it actually starts from.
+    float base_score = 0.0f;
+    {
+        float cur_levels[Q2DPT_N_LEVELS];
+        for (int i = 0; i < Q2DPT_N_LEVELS; ++i)
+            cur_levels[i] = (float)levels_i8[i] / 127.0f;
+        for (int bin = 0; bin < N_BINS; ++bin) {
+            if (bin_sum_w[bin] > 0) {
+                float x = bin_sum_wt[bin] / bin_sum_w[bin];
+                float best_dist = fabsf(x - cur_levels[0]);
+                for (int k = 1; k < Q2DPT_N_LEVELS; ++k) {
+                    float dist = fabsf(x - cur_levels[k]);
+                    if (dist < best_dist) best_dist = dist;
+                }
+                base_score += best_dist * bin_sum_w[bin];
+            }
+        }
+    }
+
+    for (int pass = 0; pass < 10; ++pass) {
+        bool improved = false;
+        for (int k = 0; k < Q2DPT_N_LEVELS; ++k) {
+            int8_t best_val = levels_i8[k];
+            for (int delta = -1; delta <= 1; delta += 2) {
+                int8_t trial = (int8_t)(best_val + delta);
+                if (k > 0 && trial <= levels_i8[k-1]) continue;
+                if (k < Q2DPT_N_LEVELS - 1 && trial >= levels_i8[k+1]) continue;
+
+                levels_i8[k] = trial;
+                float cur_levels[Q2DPT_N_LEVELS];
+                for (int i = 0; i < Q2DPT_N_LEVELS; ++i)
+                    cur_levels[i] = (float)levels_i8[i] / 127.0f;
+
+                float cur_score = 0.0f;
+                for (int bin = 0; bin < N_BINS; ++bin) {
+                    if (bin_sum_w[bin] > 0) {
+                        float x = bin_sum_wt[bin] / bin_sum_w[bin];
+                        float best_dist = fabsf(x - cur_levels[0]);
+                        for (int i = 1; i < Q2DPT_N_LEVELS; ++i) {
+                            float dist = fabsf(x - cur_levels[i]);
+                            if (dist < best_dist) best_dist = dist;
+                        }
+                        cur_score += best_dist * bin_sum_w[bin];
+                    }
+                }
+
+                if (cur_score < base_score) {
+                    base_score = cur_score;
+                    best_val = trial;
+                    improved = true;
+                }
+                // keep the best value seen so far, so an accepted -1 move is
+                // not discarded by a later rejected +1 move
+                levels_i8[k] = best_val;
+            }
+        }
+        if (!improved) break;
+    }
+
+    memcpy(levels_out, levels_i8, Q2DPT_N_LEVELS * sizeof(int8_t));
+    q2dpt_set_levels(levels_i8);
+
+    free(bin_sum_w);
+    free(bin_sum_wt);
+}
+
+void dequantize_row_q2_dpt(const block_q2_dpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) {
+    assert(k % QK2_DPT == 0);
+    const int64_t nb = k / QK2_DPT;
+    const int8_t * values = (const int8_t *)levels;
+    GGML_ASSERT(values != NULL && "Q2_DPT levels not set for tensor");
+
+    for (int i = 0; i < nb; i++) {
+        const uint8_t * qs = x[i].qs;
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+        for (int j = 0; j < QK2_DPT/4; ++j) {
uint8_t q = qs[j]; + y[j*4 + 0] = d * (float)values[(q >> 0) & 3]; + y[j*4 + 1] = d * (float)values[(q >> 2) & 3]; + y[j*4 + 2] = d * (float)values[(q >> 4) & 3]; + y[j*4 + 3] = d * (float)values[(q >> 6) & 3]; + } + y += QK2_DPT; + } +} + +// Strategy bitmask for quantize_block_q2_dpt (for A/B testing). +// Bit 0: level-anchor CD (approach A) +// Bit 1: boundary sweep+CD (approach B) +// Bit 2: dual-extreme CD (approach C: max_val AND min_val anchors) +// Bit 3: element-anchor CD (approach D: every xb[j]/values[k] as anchor) +// Bit 4: brute-force monotone partition (approach E: exhaustive search, O(n^3) per block) +static int q2dpt_quant_strategy = 0x3; // default: A+B + +void q2dpt_set_quant_strategy(int s) { q2dpt_quant_strategy = s; } + +// Refine d via iterated CD until convergence. Returns best d. +static float q2dpt_cd_refine(const float * GGML_RESTRICT xb, const float * qw, + const int8_t * values, float d) { + for (int iter = 0; iter < 8; ++iter) { + float id = (fabsf(d) > 1e-20f) ? 1.0f / d : 0.0f; + float sumqx = 0.0f, sumq2 = 0.0f; + for (int j = 0; j < QK2_DPT; ++j) { + float al = id * xb[j]; + int bk = 0; + float bd = fabsf(al - (float)values[0]); + for (int m = 1; m < Q2DPT_N_LEVELS; ++m) { + float dist = fabsf(al - (float)values[m]); + if (dist < bd) { bd = dist; bk = m; } + } + float q = (float)values[bk]; + float w = qw ? qw[j] : 1.0f; + sumqx += w * q * xb[j]; + sumq2 += w * q * q; + } + if (sumq2 < 1e-20f) break; + float d_new = sumqx / sumq2; + if (fabsf(d_new - d) < 1e-8f * fabsf(d)) break; + d = d_new; + } + return d; +} + +// Evaluate a candidate d: returns objective, fills L[]. +static float q2dpt_eval(const float * GGML_RESTRICT xb, const float * qw, + const int8_t * values, float d, uint8_t * L) { + float id = (fabsf(d) > 1e-20f) ? 1.0f / d : 0.0f; + float sumqx = 0.0f, sumq2 = 0.0f; + for (int j = 0; j < QK2_DPT; ++j) { + float al = id * xb[j]; + int bk = 0; + float bd = fabsf(al - (float)values[0]); + for (int m = 1; m < Q2DPT_N_LEVELS; ++m) { + float dist = fabsf(al - (float)values[m]); + if (dist < bd) { bd = dist; bk = m; } + } + L[j] = (uint8_t)bk; + float q = (float)values[bk]; + float w = qw ? qw[j] : 1.0f; + sumqx += w * q * xb[j]; + sumq2 += w * q * q; + } + if (sumq2 < 1e-20f) return -1e30f; + return (sumqx / sumq2) * sumqx; +} + +// Helper: try a single starting d, refine via CD, update best if improved. +static inline void q2dpt_try_d(const float * GGML_RESTRICT xb, const float * qw, + const int8_t * values, float d_init, + float * best, float * best_d, uint8_t * best_L) { + uint8_t L[QK2_DPT]; + float d = q2dpt_cd_refine(xb, qw, values, d_init); + float score = q2dpt_eval(xb, qw, values, d, L); + if (score > *best) { + *best = score; *best_d = d; + memcpy(best_L, L, QK2_DPT); + } +} + +// Quantize one 32-element block using 4 int8 levels and optimal per-block scale. 
+// The q2dpt_quant_strategy bitmask selects which search approaches are used:
+// Bit 0 (A): level anchors + CD (d = max_val / values[k] for each k)
+// Bit 1 (B): boundary sweep + CD (d = xb[j] / boundary for each j and boundary)
+// Bit 2 (C): dual-extreme anchors + CD (A using both max_val AND min_val)
+// Bit 3 (D): element-anchor scan + CD (d = xb[j] / values[k] for each j, k)
+// Bit 4 (E): brute-force monotone partition (exhaustive over all C(35,3) partitions)
+static void quantize_block_q2_dpt(const float * GGML_RESTRICT xb, block_q2_dpt * GGML_RESTRICT out,
+                                  const int8_t * values, const float * qw, int ntry) {
+    (void)ntry;
+    const int strat = q2dpt_quant_strategy;
+
+    float amax = 0.0f, max_val = xb[0], min_val = xb[0];
+    for (int j = 0; j < QK2_DPT; ++j) {
+        float ax = fabsf(xb[j]);
+        if (ax > amax) amax = ax;
+        if (xb[j] < min_val) min_val = xb[j];
+        if (xb[j] > max_val) max_val = xb[j];
+    }
+    if (amax < 1e-10f) {
+        out->d = 0;
+        memset(out->qs, 0, QK2_DPT/4);
+        return;
+    }
+
+    uint8_t best_L[QK2_DPT];
+    float best = -1e30f;
+    float best_d = 0.0f;
+
+    // --- A: level-anchor CD (4 starting points) ---
+    if (strat & 0x1) {
+        for (int k = 0; k < Q2DPT_N_LEVELS; ++k) {
+            if (values[k] == 0) continue;
+            q2dpt_try_d(xb, qw, values, max_val / (float)values[k],
+                        &best, &best_d, best_L);
+        }
+    }
+
+    // --- B: boundary-crossing sweep + CD ---
+    if (strat & 0x2) {
+        for (int b = 0; b < Q2DPT_N_LEVELS - 1; ++b) {
+            float bnd = ((float)values[b] + (float)values[b + 1]) * 0.5f;
+            if (fabsf(bnd) < 0.5f) continue;
+            for (int j = 0; j < QK2_DPT; ++j) {
+                if (fabsf(xb[j]) < 1e-12f) continue;
+                q2dpt_try_d(xb, qw, values, xb[j] / bnd,
+                            &best, &best_d, best_L);
+            }
+        }
+    }
+
+    // --- C: dual-extreme anchors + CD (8 starting points) ---
+    if (strat & 0x4) {
+        float extremes[2] = { max_val, min_val };
+        for (int e = 0; e < 2; ++e) {
+            for (int k = 0; k < Q2DPT_N_LEVELS; ++k) {
+                if (values[k] == 0) continue;
+                q2dpt_try_d(xb, qw, values, extremes[e] / (float)values[k],
+                            &best, &best_d, best_L);
+            }
+        }
+    }
+
+    // --- D: element-anchor scan + CD (32 x 4 starting points) ---
+    if (strat & 0x8) {
+        for (int j = 0; j < QK2_DPT; ++j) {
+            if (fabsf(xb[j]) < 1e-12f) continue;
+            for (int k = 0; k < Q2DPT_N_LEVELS; ++k) {
+                if (values[k] == 0) continue;
+                q2dpt_try_d(xb, qw, values, xb[j] / (float)values[k],
+                            &best, &best_d, best_L);
+            }
+        }
+    }
+
+    // --- E: brute-force monotone partition enumeration ---
+    // For a single scale d, the optimal assignment must be monotone on sorted x:
+    // if x_i < x_j then L[i] <= L[j] (for d>0) or L[i] >= L[j] (for d<0).
+    // We enumerate all C(32+3, 3) = C(35,3) = 6545 ways to partition 32 sorted
+    // elements into 4 groups, score each in O(1) using prefix sums, then pick best.
+    if (strat & 0x10) {
+        // Sort elements by value, keeping original indices
+        int idx[QK2_DPT];
+        for (int j = 0; j < QK2_DPT; ++j) idx[j] = j;
+        // Simple insertion sort (only 32 elements)
+        for (int i = 1; i < QK2_DPT; ++i) {
+            int t = idx[i];
+            float tv = xb[t];
+            int j = i - 1;
+            while (j >= 0 && xb[idx[j]] > tv) {
+                idx[j + 1] = idx[j];
+                --j;
+            }
+            idx[j + 1] = t;
+        }
+
+        // Build weighted prefix sums over the sorted order:
+        // swx[i] = sum_{j<i} w*x and sw[i] = sum_{j<i} w, with w, x taken at idx[j]
+        float swx[QK2_DPT + 1], sw[QK2_DPT + 1];
+        swx[0] = 0.0f; sw[0] = 0.0f;
+        for (int j = 0; j < QK2_DPT; ++j) {
+            float w = qw ? qw[idx[j]] : 1.0f;
+            swx[j + 1] = swx[j] + w * xb[idx[j]];
+            sw[j + 1]  = sw[j] + w;
+        }
+
+        // Try both orientations: d>0 maps sorted ascending to values ascending,
+        // d<0 maps sorted ascending to values descending.
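+        // Counting check (illustrative): choosing boundaries 0 <= b1 <= b2 <= b3 <= 32
+        // is a multiset choice of 3 cut points from 33 positions, i.e.
+        // C(33+3-1,3) = C(35,3) = 6545 candidate partitions per orientation,
+        // small enough to enumerate exactly with the O(1) prefix-sum scoring below.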
+ for (int flip = 0; flip < 2; ++flip) { + float v[Q2DPT_N_LEVELS]; + if (flip == 0) { + for (int k = 0; k < Q2DPT_N_LEVELS; ++k) v[k] = (float)values[k]; + } else { + for (int k = 0; k < Q2DPT_N_LEVELS; ++k) v[k] = (float)values[Q2DPT_N_LEVELS - 1 - k]; + } + + // Precompute per-level pair products for scoring + float vv[Q2DPT_N_LEVELS]; + for (int k = 0; k < Q2DPT_N_LEVELS; ++k) vv[k] = v[k] * v[k]; + + float bf_best = -1e30f; + int bf_b1 = 0, bf_b2 = 0, bf_b3 = 0; + + // Enumerate partition boundaries b1 <= b2 <= b3 where group k = [b_k, b_{k+1}) + // b0=0, b4=32. 0 <= b1 <= b2 <= b3 <= 32. + for (int b1 = 0; b1 <= QK2_DPT; ++b1) { + // Segment 0: indices [0, b1) assigned to v[0] + float s0_wx = swx[b1] - swx[0]; + float s0_w = sw[b1] - sw[0]; + for (int b2 = b1; b2 <= QK2_DPT; ++b2) { + // Segment 1: indices [b1, b2) assigned to v[1] + float s01_wx = s0_wx + (swx[b2] - swx[b1]); + float s01_w = s0_w + (sw[b2] - sw[b1]); + // sumqx and sumq2 for segments 0+1 can be expressed but we need all 4. + // For efficiency, compute incrementally for b3. + float partial_sumqx = v[0] * s0_wx + v[1] * (swx[b2] - swx[b1]); + float partial_sumq2 = vv[0] * s0_w + vv[1] * (sw[b2] - sw[b1]); + (void)s01_wx; (void)s01_w; + for (int b3 = b2; b3 <= QK2_DPT; ++b3) { + // Segment 2: [b2, b3) -> v[2], Segment 3: [b3, 32) -> v[3] + float seg2_wx = swx[b3] - swx[b2]; + float seg2_w = sw[b3] - sw[b2]; + float seg3_wx = swx[QK2_DPT] - swx[b3]; + float seg3_w = sw[QK2_DPT] - sw[b3]; + + float sumqx = partial_sumqx + v[2] * seg2_wx + v[3] * seg3_wx; + float sumq2 = partial_sumq2 + vv[2] * seg2_w + vv[3] * seg3_w; + + if (sumq2 < 1e-20f) continue; + // score = d * sumqx = sumqx^2 / sumq2 (only valid when sumqx > 0) + float score = sumqx * sumqx / sumq2; + // d = sumqx / sumq2; for validity we need d > 0 when flip==0, d < 0 when flip==1 + // i.e. sumqx > 0 for flip==0, sumqx < 0 for flip==1 + if (flip == 0 && sumqx <= 0.0f) continue; + if (flip == 1 && sumqx >= 0.0f) continue; + if (score > bf_best) { + bf_best = score; + bf_b1 = b1; bf_b2 = b2; bf_b3 = b3; + } + } + } + } + + if (bf_best > -1e29f) { + // Reconstruct the assignment for the best partition + uint8_t L_bf[QK2_DPT]; + for (int j = 0; j < bf_b1; ++j) { + int orig = idx[j]; + L_bf[orig] = flip == 0 ? 0 : (Q2DPT_N_LEVELS - 1); + } + for (int j = bf_b1; j < bf_b2; ++j) { + int orig = idx[j]; + L_bf[orig] = flip == 0 ? 1 : (Q2DPT_N_LEVELS - 2); + } + for (int j = bf_b2; j < bf_b3; ++j) { + int orig = idx[j]; + L_bf[orig] = flip == 0 ? 2 : (Q2DPT_N_LEVELS - 1 - 2); + } + for (int j = bf_b3; j < QK2_DPT; ++j) { + int orig = idx[j]; + L_bf[orig] = flip == 0 ? 3 : 0; + } + // Compute d from this assignment + float sumqx = 0.0f, sumq2 = 0.0f; + for (int j = 0; j < QK2_DPT; ++j) { + float q = (float)values[L_bf[j]]; + float w = qw ? qw[j] : 1.0f; + sumqx += w * q * xb[j]; + sumq2 += w * q * q; + } + if (sumq2 > 1e-20f) { + float d_bf = sumqx / sumq2; + float score = (sumqx / sumq2) * sumqx; + if (score > best) { + best = score; + best_d = d_bf; + memcpy(best_L, L_bf, QK2_DPT); + } + } + } + } + } + + // Final re-assignment with best scale + float id = (fabsf(best_d) > 1e-20f) ? 
1.0f / best_d : 0.0f;
+    for (int j = 0; j < QK2_DPT; ++j) {
+        float al = id * xb[j];
+        int bk = 0;
+        float bd = fabsf(al - (float)values[0]);
+        for (int k = 1; k < Q2DPT_N_LEVELS; ++k) {
+            float dist = fabsf(al - (float)values[k]);
+            if (dist < bd) { bd = dist; bk = k; }
+        }
+        best_L[j] = (uint8_t)bk;
+    }
+
+    // Pack 2-bit indices: 4 values per byte
+    out->d = GGML_FP32_TO_FP16(best_d);
+    for (int j = 0; j < QK2_DPT/4; ++j) {
+        out->qs[j] = best_L[j*4] | (best_L[j*4+1] << 2) | (best_L[j*4+2] << 4) | (best_L[j*4+3] << 6);
+    }
+}
+
+size_t quantize_q2_dpt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
+                       int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+    GGML_ASSERT(n_per_row % QK2_DPT == 0);
+    const int8_t * values = q2dpt_get_levels();
+    GGML_ASSERT(values != NULL && "Q2_DPT levels not set - call q2dpt_set_levels() first");
+
+    const int64_t nblock = n_per_row / QK2_DPT;
+    char * qrow = (char *) dst;
+
+    for (int64_t row = 0; row < nrow; ++row) {
+        block_q2_dpt * q2 = (block_q2_dpt *) qrow;
+        for (int64_t ibl = 0; ibl < nblock; ++ibl) {
+            const float * qw = quant_weights ? quant_weights + QK2_DPT * ibl : NULL;
+            quantize_block_q2_dpt(src + QK2_DPT * ibl, &q2[ibl], values, qw, 7);
+        }
+        src += n_per_row;
+        qrow += nblock * sizeof(block_q2_dpt);
+    }
+    return (size_t) nrow * nblock * sizeof(block_q2_dpt);
+}
+
+void quantize_row_q2_dpt_ref(const float * GGML_RESTRICT x, block_q2_dpt * GGML_RESTRICT y, int64_t k) {
+    assert(k % QK2_DPT == 0);
+    quantize_q2_dpt(x, y, 1, k, NULL);
+}
+
+// ====================== Q2_KPT: Q2_K with learned per-tensor float levels ======================
+
+static float q2kpt_levels[Q2KPT_N_LEVELS];
+static bool q2kpt_levels_set = false;
+
+// Global level storage for Q2_KPT (per-block levels for last quantized tensor)
+static float *q2kpt_block_levels = NULL;
+static size_t q2kpt_max_levels = 0;
+static size_t q2kpt_cur_levels = 0;
+
+// Prepare the levels buffer for a tensor with given dimensions.
+// This should be called before parallel quantization to pre-allocate storage.
+void q2kpt_prepare_levels(int64_t nrows, int64_t n_per_row) {
+    const int nb = (int)(n_per_row / QK_K);
+    const size_t total_levels = (size_t)nrows * nb * Q2KPT_N_LEVELS;
+    if (total_levels > q2kpt_max_levels) {
+        q2kpt_block_levels = (float *) realloc(q2kpt_block_levels, total_levels * sizeof(float));
+        GGML_ASSERT(q2kpt_block_levels != NULL);
+        q2kpt_max_levels = total_levels;
+    }
+    q2kpt_cur_levels = total_levels;
+}
+
+void q2kpt_set_levels(const float * levels) {
+    memcpy(q2kpt_levels, levels, Q2KPT_N_LEVELS * sizeof(float));
+    q2kpt_levels_set = true;
+}
+
+const float * q2kpt_get_levels(void) {
+    // Return per-block levels if available, otherwise global levels
+    if (q2kpt_block_levels && q2kpt_cur_levels > 0) {
+        return q2kpt_block_levels;
+    }
+    return q2kpt_levels_set ? q2kpt_levels : NULL;
+}
+
+void q2kpt_free_levels(void) {
+    q2kpt_levels_set = false;
+    if (q2kpt_block_levels) {
+        free(q2kpt_block_levels);
+        q2kpt_block_levels = NULL;
+        q2kpt_max_levels = 0;
+        q2kpt_cur_levels = 0;
+    }
+}
+
+// Train 4 Lloyd-Max float levels for a single 256-element block.
+// Normalizes sub-block values to [0,1] using Q2_K-style scale+min estimation, then runs k-means.
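+//
+// Reconstruction model used throughout Q2_KPT (restating the code below):
+//   y ≈ d*sc * (3 * level[idx]) - dmin*m,   with level[idx] in [0,1]
+// so with A = 3*d*sc and mn = dmin*m, the per-element target in level space is
+//   T = (x + mn) / A, weighted by W = w * A^2,
+// which is exactly the "effective-T" histogram the EM cycles below are built on.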
+// Forward declaration (defined later in this file) +static float make_q2kpt_quants(int n, const float * GGML_RESTRICT x, + uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, + const float * mapped_levels, const float * weight); + +// ---- q2kpt_quantize_block_given_levels ---------------------------------------- +// Quantize one QK_K-element block using caller-specified levels (no training). +// block_x: QK_K floats of original data +// y: output block_q2_kpt (filled in place) +// quant_weights: QK_K importance weights (or NULL → use x[i]²) +// sigma2: mean(x²) for the block (for weight formula) +// levels: Q2KPT_N_LEVELS values in [0,1], must be sorted ascending +// ------------------------------------------------------------------------------- +static void q2kpt_quantize_block_given_levels( + const float * GGML_RESTRICT block_x, + block_q2_kpt * GGML_RESTRICT y, + const float * GGML_RESTRICT quant_weights, + float sigma2, + const float levels[Q2KPT_N_LEVELS]) { + + float mapped_levels[Q2KPT_N_LEVELS]; + float bounds[Q2KPT_N_LEVELS - 1]; + for (int k = 0; k < Q2KPT_N_LEVELS; ++k) mapped_levels[k] = levels[k] * 3.0f; + for (int k = 0; k < Q2KPT_N_LEVELS - 1; ++k) + bounds[k] = 0.5f * (mapped_levels[k] + mapped_levels[k + 1]); + + uint8_t L[QK_K]; + float mins[QK_K / 16], scales[QK_K / 16], sw[QK_K / 16]; + float weight[16]; + uint8_t Ls[QK_K / 16], Lm[QK_K / 16]; + + memset(sw, 0, sizeof(sw)); + float sumx2 = sigma2 * QK_K; // reconstitute (or recompute below) + + for (int j = 0; j < QK_K / 16; ++j) { + const float * bx = block_x + 16 * j; + if (quant_weights) { + const float * qw = quant_weights + 16 * j; + for (int l = 0; l < 16; ++l) + weight[l] = qw[l] * sqrtf(sigma2 + bx[l] * bx[l]); + } else { + for (int l = 0; l < 16; ++l) + weight[l] = bx[l] * bx[l]; + } + for (int l = 0; l < 16; ++l) sw[j] += weight[l]; + scales[j] = make_q2kpt_quants(16, bx, L + 16 * j, &mins[j], mapped_levels, weight); + } + + float dm = make_qp_quants(QK_K / 16, 15, scales, Ls, sw); + float mm = make_qp_quants(QK_K / 16, 15, mins, Lm, sw); + + y->d = GGML_FP32_TO_FP16(dm); + y->dmin = GGML_FP32_TO_FP16(mm); + dm = GGML_FP16_TO_FP32(y->d); + mm = GGML_FP16_TO_FP32(y->dmin); + + for (int j = 0; j < QK_K / 16; ++j) y->scales[j] = Ls[j] | (Lm[j] << 4); + + // Second pass: reassign with quantized scales + for (int j = 0; j < QK_K / 16; ++j) { + const float d = dm * (y->scales[j] & 0xF); + if (!d) { + int zero_k = 0; + float zero_d = fabsf(mapped_levels[0]); + for (int k = 1; k < Q2KPT_N_LEVELS; ++k) { + if (fabsf(mapped_levels[k]) < zero_d) { zero_d = fabsf(mapped_levels[k]); zero_k = k; } + } + for (int ii = 0; ii < 16; ++ii) L[16 * j + ii] = zero_k; + continue; + } + const float m = mm * (y->scales[j] >> 4); + for (int ii = 0; ii < 16; ++ii) { + float scaled = (block_x[16 * j + ii] + m) / d; + L[16 * j + ii] = (scaled > bounds[0]) + (scaled > bounds[1]) + (scaled > bounds[2]); + } + } + + // Pack 2-bit indices (Q2_K layout) + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) { + y->qs[j / 4 + l] = L[j + l] | (L[j + l + 32] << 2) + | (L[j + l + 64] << 4) | (L[j + l + 96] << 6); + } + } + + (void)sumx2; +} + +// ---- Histogram Lloyd-Max helper ---------------------------------------------- +// Runs weighted Lloyd-Max iterations on a pre-built histogram. 
+// bin_centers[b]: representative value for bin b (weighted centroid) +// bin_w[b]: total weight of data in bin b +// levels[]: centroids, in/out (must be sorted ascending on entry) +// ------------------------------------------------------------------------------- +static void q2kpt_histogram_lloyd_max( + int n_bins, const float * bin_centers, const float * bin_w, + float * levels, int n_levels, int n_iter) { + + for (int iter = 0; iter < n_iter; ++iter) { + float sum_w[Q2KPT_N_LEVELS] = { 0.0f }; + float sum_wt[Q2KPT_N_LEVELS] = { 0.0f }; + + for (int b = 0; b < n_bins; ++b) { + float w = bin_w[b]; + if (w < 1e-30f) continue; + float t = bin_centers[b]; + int best = 0; + float bd2 = (t - levels[0]) * (t - levels[0]); + for (int k = 1; k < n_levels; ++k) { + float d2 = (t - levels[k]) * (t - levels[k]); + if (d2 < bd2) { bd2 = d2; best = k; } + } + sum_w[best] += w; + sum_wt[best] += w * t; + } + + float new_levels[Q2KPT_N_LEVELS]; + float max_delta = 0.0f; + for (int k = 0; k < n_levels; ++k) { + new_levels[k] = (sum_w[k] > 1e-30f) ? sum_wt[k] / sum_w[k] : levels[k]; + } + // Sort ascending (insertion sort, n_levels=4) + for (int k = 1; k < n_levels; ++k) { + float v = new_levels[k]; int m = k - 1; + while (m >= 0 && new_levels[m] > v) { new_levels[m + 1] = new_levels[m]; --m; } + new_levels[m + 1] = v; + } + for (int k = 0; k < n_levels; ++k) + max_delta = fmaxf(max_delta, fabsf(new_levels[k] - levels[k])); + memcpy(levels, new_levels, n_levels * sizeof(float)); + if (max_delta < 1e-7f) break; + } +} + +// ---- q2kpt_optimize_block_levels ---------------------------------------------- +// Full closed-loop EM training for one QK_K block using histogram binning: +// 1. Init: histogram Lloyd-Max on normalized [0,1] sub-block values +// 2. EM cycles: full E-step → build effective-T histogram → cheap Lloyd-Max +// 3. Final quantize with best levels seen +// +// block_x: QK_K floats of original data +// block_y: workspace / output (holds the best quantization on return) +// quant_weights: QK_K per-element importance weights (or NULL → use x[i]²) +// sigma2: mean(x²) for the block +// levels_out: Q2KPT_N_LEVELS trained levels in [0,1], ascending +// ------------------------------------------------------------------------------- +#define Q2KPT_N_BINS 128 // histogram bins +#define Q2KPT_INIT_LLOYD 10 // Lloyd-Max iters on init histogram +#define Q2KPT_N_EM_CYCLES 4 // number of full E-step calls +#define Q2KPT_LLOYD_PER_CYCLE 10 // cheap histogram Lloyd-Max iters per cycle + +static void q2kpt_optimize_block_levels( + const float * GGML_RESTRICT block_x, + block_q2_kpt * GGML_RESTRICT block_y, + const float * GGML_RESTRICT quant_weights, + float sigma2, + float levels_out[Q2KPT_N_LEVELS]) { + + const float inv_bins = 1.0f / Q2KPT_N_BINS; + + // ---- Build per-element weights and sub-block-normalised values ----------- + float weights[QK_K]; + float norm_vals[QK_K]; + + for (int sb = 0; sb < QK_K / 16; ++sb) { + const float * xsb = block_x + sb * 16; + float xmin = xsb[0], xmax = xsb[0]; + for (int l = 1; l < 16; ++l) { + if (xsb[l] < xmin) xmin = xsb[l]; + if (xsb[l] > xmax) xmax = xsb[l]; + } + if (xmin > 0.0f) xmin = 0.0f; + float range = xmax - xmin; + float inv_range = (range > 1e-10f) ? 1.0f / range : 0.0f; + + for (int l = 0; l < 16; ++l) { + int el = sb * 16 + l; + float t = (range > 1e-10f) ? 
+ fmaxf(0.0f, fminf(1.0f, (xsb[l] - xmin) * inv_range)) : 0.0f; + norm_vals[el] = t; + + float w; + if (quant_weights) { + w = quant_weights[el] * sqrtf(sigma2 + xsb[l] * xsb[l]); + } else { + w = xsb[l] * xsb[l]; + } + // Scale by range² so normalised-space errors match actual-space + weights[el] = fmaxf(w * range * range, 1e-30f); + } + } + + // ---- Phase 1: Init levels via histogram Lloyd-Max on norm_vals ---------- + float bin_w[Q2KPT_N_BINS]; + float bin_wt[Q2KPT_N_BINS]; + float bin_centers[Q2KPT_N_BINS]; + + memset(bin_w, 0, sizeof(bin_w)); + memset(bin_wt, 0, sizeof(bin_wt)); + + for (int i = 0; i < QK_K; ++i) { + float t = norm_vals[i]; + int b = (int)(t * Q2KPT_N_BINS); + if (b >= Q2KPT_N_BINS) b = Q2KPT_N_BINS - 1; + bin_w[b] += weights[i]; + bin_wt[b] += weights[i] * t; + } + for (int b = 0; b < Q2KPT_N_BINS; ++b) + bin_centers[b] = (bin_w[b] > 1e-30f) ? bin_wt[b] / bin_w[b] : (b + 0.5f) * inv_bins; + + float levels[Q2KPT_N_LEVELS]; + for (int k = 0; k < Q2KPT_N_LEVELS; ++k) + levels[k] = (float)k / (Q2KPT_N_LEVELS - 1); + + q2kpt_histogram_lloyd_max(Q2KPT_N_BINS, bin_centers, bin_w, + levels, Q2KPT_N_LEVELS, Q2KPT_INIT_LLOYD); + + // ---- Phase 2: EM cycles ------------------------------------------------- + // Each cycle: full E-step → build effective-T histogram → cheap Lloyd-Max. + // Effective-T: T_i = (x_i + B_i) / A_i, W_i = w_i * A_i² + // The M-step optimal level for class k is the W-weighted mean of T_i in k. + float best_levels[Q2KPT_N_LEVELS]; + memcpy(best_levels, levels, sizeof(levels)); + float best_err = 1e38f; + + float eff_bin_w[Q2KPT_N_BINS]; + float eff_bin_wt[Q2KPT_N_BINS]; + float eff_bin_centers[Q2KPT_N_BINS]; + + for (int cycle = 0; cycle < Q2KPT_N_EM_CYCLES; ++cycle) { + + // Full E-step + q2kpt_quantize_block_given_levels(block_x, block_y, quant_weights, sigma2, levels); + + float d_all_q = GGML_FP16_TO_FP32(block_y->d); + float dmin_q = GGML_FP16_TO_FP32(block_y->dmin); + + memset(eff_bin_w, 0, sizeof(eff_bin_w)); + memset(eff_bin_wt, 0, sizeof(eff_bin_wt)); + float err = 0.0f; + + for (int el = 0; el < QK_K; ++el) { + int sb = el / 16; + int k_j = block_y->scales[sb] & 0xF; + int m_j = block_y->scales[sb] >> 4; + + float A = d_all_q * (float)k_j * 3.0f; + float mn = dmin_q * (float)m_j; + + int qs_byte = (el / 128) * 32 + el % 32; + int shift = ((el % 128) / 32) * 2; + int idx = (block_y->qs[qs_byte] >> shift) & 3; + + float w = quant_weights ? + quant_weights[el] * sqrtf(sigma2 + block_x[el] * block_x[el]) : + block_x[el] * block_x[el]; + w = fmaxf(w, 1e-30f); + + float y_approx = A * levels[idx] - mn; + float diff = y_approx - block_x[el]; + err += w * diff * diff; + + // Build effective-T histogram for histogram Lloyd-Max M-step + if (A > 1e-10f) { + float T = fmaxf(0.0f, fminf(1.0f, (block_x[el] + mn) / A)); + float W = w * A * A; + int b = (int)(T * Q2KPT_N_BINS); + if (b >= Q2KPT_N_BINS) b = Q2KPT_N_BINS - 1; + eff_bin_w[b] += W; + eff_bin_wt[b] += W * T; + } + } + + if (err < best_err) { + best_err = err; + memcpy(best_levels, levels, sizeof(levels)); + } + + for (int b = 0; b < Q2KPT_N_BINS; ++b) + eff_bin_centers[b] = (eff_bin_w[b] > 1e-30f) + ? 
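/* One derivation step behind the effective-T histogram built above: with
 * per-sub-block gain A = d*scale*3 and offset B = dmin*min, the per-element
 * cost is
 *     w * (A*level - B - x)^2 = w*A^2 * (level - (x + B)/A)^2,
 * so choosing a level is a 1-D nearest-neighbour problem in T = (x + B)/A
 * with weight W = w*A^2, which is exactly what the bins accumulate. */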
eff_bin_wt[b] / eff_bin_w[b] + : (b + 0.5f) * inv_bins; + + q2kpt_histogram_lloyd_max(Q2KPT_N_BINS, eff_bin_centers, eff_bin_w, + levels, Q2KPT_N_LEVELS, Q2KPT_LLOYD_PER_CYCLE); + } + + // Final quantize with the best levels found across all cycles + memcpy(levels_out, best_levels, sizeof(best_levels)); + q2kpt_quantize_block_given_levels(block_x, block_y, quant_weights, sigma2, best_levels); +} + +// Train 4 Lloyd-Max float levels from tensor data for Q2_KPT. +// Uses Q2_K-style scale+min estimation to normalize sub-block values to [0,1]. +GGML_API void q2kpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row, + const float * imatrix, float levels_out[Q2KPT_N_LEVELS]) { + const int N_BINS = 8192; + const float bin_width = 1.0f / N_BINS; + float * bin_sum_w = (float *) calloc(N_BINS, sizeof(float)); + float * bin_sum_wt = (float *) calloc(N_BINS, sizeof(float)); + GGML_ASSERT(bin_sum_w && bin_sum_wt); + + const int nb = (int)(n_per_row / QK_K); + + // Single pass: for each 16-element sub-block, estimate scale+min, normalize to [0,1], bin + for (int64_t row = 0; row < nrow; ++row) { + const float * xrow = data + row * n_per_row; + + for (int i = 0; i < nb; i++) { + const float * x = xrow + i * QK_K; + + for (int j = 0; j < QK_K / 16; ++j) { + // Find min and max of sub-block + float xmin = x[16 * j], xmax = x[16 * j]; + for (int l = 1; l < 16; ++l) { + if (x[16 * j + l] < xmin) xmin = x[16 * j + l]; + if (x[16 * j + l] > xmax) xmax = x[16 * j + l]; + } + // Q2_K clamps min to <= 0 + if (xmin > 0) xmin = 0; + float range = xmax - xmin; + if (range < 1e-10f) continue; + + float inv_range = 1.0f / range; + + for (int l = 0; l < 16; ++l) { + // Normalize to [0, 1]: t = (x - min) / range + float t = (x[16 * j + l] - xmin) * inv_range; + if (t < 0.0f) t = 0.0f; + if (t > 1.0f) t = 1.0f; + + int bin_idx = (int)(t * N_BINS); + if (bin_idx >= N_BINS) bin_idx = N_BINS - 1; + + int elem = i * QK_K + 16 * j + l; + float w = imatrix ? 
imatrix[elem] : 1.0f; + if (w < 1e-10f) w = 1e-10f; + // Weight by range² (like Q3_KPT weights by d²) + w *= range * range; + + bin_sum_w[bin_idx] += w; + bin_sum_wt[bin_idx] += w * t; + } + } + } + } + + // Initialize 4 levels uniformly in [0, 1] + float levels[Q2KPT_N_LEVELS]; + for (int k = 0; k < Q2KPT_N_LEVELS; ++k) { + levels[k] = (float)k / (Q2KPT_N_LEVELS - 1); + } + + // Lloyd-Max iterations on bins + for (int iter = 0; iter < 100; ++iter) { + float sum_w[Q2KPT_N_LEVELS] = { 0 }; + float sum_wt[Q2KPT_N_LEVELS] = { 0 }; + + for (int b = 0; b < N_BINS; ++b) { + if (bin_sum_w[b] < 1e-12f) continue; + const float t = (b + 0.5f) * bin_width; + int best = 0; + float bd2 = (t - levels[0]) * (t - levels[0]); + for (int k = 1; k < Q2KPT_N_LEVELS; ++k) { + float d2 = (t - levels[k]) * (t - levels[k]); + if (d2 < bd2) { bd2 = d2; best = k; } + } + sum_w[best] += bin_sum_w[b]; + sum_wt[best] += bin_sum_wt[b]; + } + + float max_delta = 0.0f; + for (int k = 0; k < Q2KPT_N_LEVELS; ++k) { + if (sum_w[k] > 1e-12f) { + float new_level = sum_wt[k] / sum_w[k]; + max_delta = fmaxf(max_delta, fabsf(new_level - levels[k])); + levels[k] = new_level; + } + } + if (max_delta < 1e-10f) break; + + // Sort levels + for (int k = 1; k < Q2KPT_N_LEVELS; ++k) { + float v = levels[k]; + int m = k - 1; + while (m >= 0 && levels[m] > v) { + levels[m + 1] = levels[m]; + m--; + } + levels[m + 1] = v; + } + } + + memcpy(levels_out, levels, Q2KPT_N_LEVELS * sizeof(float)); + q2kpt_set_levels(levels); + free(bin_sum_w); + free(bin_sum_wt); +} + +void dequantize_row_q2_kpt(const block_q2_kpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { + assert(k % QK_K == 0); + const int64_t nb = k / QK_K; + const float * lv = (const float *)levels; + GGML_ASSERT(lv != NULL && "Q2_KPT levels not set for tensor"); + + for (int i = 0; i < nb; i++) { + // Per-block levels: block i uses lv[i*4 + 0..3] + const float * block_lv = lv + i * Q2KPT_N_LEVELS; + + // Precompute mapped levels: ml[k] = levels[k] * 3.0 + float ml[Q2KPT_N_LEVELS]; + for (int j = 0; j < Q2KPT_N_LEVELS; ++j) { + ml[j] = block_lv[j] * 3.0f; + } + + const float d_all = GGML_FP16_TO_FP32(x[i].d); + const float m_all = GGML_FP16_TO_FP32(x[i].dmin); + const uint8_t * q = x[i].qs; + + int is = 0; + for (int n = 0; n < QK_K; n += 128) { + int shift = 0; + for (int j = 0; j < 4; ++j) { + uint8_t sc = x[i].scales[is++]; + float dl = d_all * (sc & 0xF); + float mn = m_all * (sc >> 4); + for (int l = 0; l < 16; ++l) { + int idx = (q[l] >> shift) & 3; + *y++ = dl * ml[idx] - mn; + } + + sc = x[i].scales[is++]; + dl = d_all * (sc & 0xF); + mn = m_all * (sc >> 4); + for (int l = 0; l < 16; ++l) { + int idx = (q[l + 16] >> shift) & 3; + *y++ = dl * ml[idx] - mn; + } + + shift += 2; + } + q += 32; + } + } +} + +// Helper: find optimal (scale, min) for non-uniform mapped levels with offset. +// mapped_levels[k] = levels[k]*3, k=0..3. +// Model: x[i] ≈ scale * ml[L[i]] - min_offset, with min_offset >= 0. +// Returns the per-sub-block scale; *the_min receives the min offset. +// L[i] gets the best level index [0..3]. 
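/*
 * The weighted least-squares refit used inside the grid search below, spelled
 * out (s = scale, m = signed min, l_i = mapped_levels[L[i]], w_i = weight):
 * minimizing sum_i w_i (s*l_i + m - x_i)^2 yields the 2x2 normal equations
 *     s*sum(w*l^2) + m*sum(w*l) = sum(w*l*x)
 *     s*sum(w*l)   + m*sum(w)   = sum(w*x)
 * whose Cramer solution is the D = sum_w*sum_l2 - sum_l^2 form in the code;
 * when the unconstrained m comes out positive it is clamped to 0 and s is
 * refit alone as sum_lx / sum_l2.
 */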
+static float make_q2kpt_quants(int n, + const float * GGML_RESTRICT x, + uint8_t * GGML_RESTRICT L, + float * GGML_RESTRICT the_min, + const float * mapped_levels, + const float * weight) { + // Precompute boundaries for nearest-level assignment + float bounds[Q2KPT_N_LEVELS - 1]; + for (int k = 0; k < Q2KPT_N_LEVELS - 1; ++k) { + bounds[k] = 0.5f * (mapped_levels[k] + mapped_levels[k + 1]); + } + + float xmin = x[0], xmax = x[0]; + for (int i = 1; i < n; ++i) { + if (x[i] < xmin) xmin = x[i]; + if (x[i] > xmax) xmax = x[i]; + } + if (xmin > 0) xmin = 0; + if (xmax <= xmin) { + for (int i = 0; i < n; ++i) L[i] = 0; + *the_min = -xmin; + return 0.f; + } + + float ml_max = mapped_levels[Q2KPT_N_LEVELS - 1]; + + float best_scale = 0, best_min = 0; + float best_obj = 0; + bool first = true; + + // Grid search: try multiple trial scales + for (int is = -9; is <= 36; ++is) { + float iscale = (ml_max + 0.1f * is) / (xmax - xmin); + float trial_min = -xmin; + + float sum_l = 0, sum_l2 = 0, sum_lx = 0; + float sum_x = 0, sum_w = 0; + for (int i = 0; i < n; ++i) { + float scaled = iscale * (x[i] + trial_min); + int best_k = (scaled > bounds[0]) + (scaled > bounds[1]) + (scaled > bounds[2]); + float ml_val = mapped_levels[best_k]; + float w = weight ? weight[i] : x[i] * x[i]; + sum_l += w * ml_val; + sum_l2 += w * ml_val * ml_val; + sum_lx += w * ml_val * x[i]; + sum_x += w * x[i]; + sum_w += w; + } + + float D = sum_w * sum_l2 - sum_l * sum_l; + if (D > 0) { + float this_scale = (sum_w * sum_lx - sum_x * sum_l) / D; + float this_min = (sum_l2 * sum_x - sum_l * sum_lx) / D; + + if (this_min > 0) { + this_min = 0; + this_scale = sum_lx / sum_l2; + } + + float cur_error = 0; + for (int i = 0; i < n; ++i) { + float scaled = (x[i] - this_min) / (this_scale > 1e-15f ? this_scale : 1e-15f); + int best_k = (scaled > bounds[0]) + (scaled > bounds[1]) + (scaled > bounds[2]); + float diff = this_scale * mapped_levels[best_k] + this_min - x[i]; + float w = weight ? weight[i] : x[i] * x[i]; + cur_error += w * diff * diff; + } + + if (first || cur_error < best_obj) { + best_obj = cur_error; + best_scale = this_scale; + best_min = this_min; + first = false; + } + } + } + + // Inner EM refinement from best found by grid search: iterate + // assign→refit→assign until convergence (fixes Problem 3) + for (int refine = 0; refine < 8; ++refine) { + float sum_l = 0, sum_l2 = 0, sum_lx = 0, sum_x = 0, sum_w = 0; + for (int i = 0; i < n; ++i) { + float scaled = (x[i] - best_min) / (best_scale > 1e-15f ? best_scale : 1e-15f); + int best_k = (scaled > bounds[0]) + (scaled > bounds[1]) + (scaled > bounds[2]); + float ml_val = mapped_levels[best_k]; + float w = weight ? weight[i] : x[i] * x[i]; + sum_l += w * ml_val; + sum_l2 += w * ml_val * ml_val; + sum_lx += w * ml_val * x[i]; + sum_x += w * x[i]; + sum_w += w; + } + float D = sum_w * sum_l2 - sum_l * sum_l; + if (D <= 0) break; + float new_scale = (sum_w * sum_lx - sum_x * sum_l) / D; + float new_min = (sum_l2 * sum_x - sum_l * sum_lx) / D; + if (new_min > 0) { + new_min = 0; + new_scale = sum_l2 > 1e-30f ? sum_lx / sum_l2 : 0.f; + } + float cur_error = 0; + for (int i = 0; i < n; ++i) { + float scaled = (x[i] - new_min) / (new_scale > 1e-15f ? new_scale : 1e-15f); + int best_k = (scaled > bounds[0]) + (scaled > bounds[1]) + (scaled > bounds[2]); + float diff = new_scale * mapped_levels[best_k] + new_min - x[i]; + float w = weight ? 
weight[i] : x[i] * x[i]; + cur_error += w * diff * diff; + } + if (cur_error >= best_obj - 1e-12f * best_obj) { break; } + best_obj = cur_error; + best_scale = new_scale; + best_min = new_min; + } + + // Final assignment with best (scale, min) + for (int i = 0; i < n; ++i) { + float scaled = (x[i] - best_min) / (best_scale > 1e-15f ? best_scale : 1e-15f); + L[i] = (scaled > bounds[0]) + (scaled > bounds[1]) + (scaled > bounds[2]); + } + + *the_min = -best_min; + return best_scale; +} + +static void quantize_row_q2_kpt_impl(const float * GGML_RESTRICT x, + block_q2_kpt * GGML_RESTRICT y, + int64_t n_per_row, + const float * GGML_RESTRICT quant_weights, + const float * GGML_RESTRICT imatrix, + float * GGML_RESTRICT block_levels) { + assert(n_per_row % QK_K == 0); + const int nb = n_per_row / QK_K; + GGML_ASSERT(block_levels != NULL && "block_levels buffer must be provided"); + + for (int i = 0; i < nb; ++i) { + const float * block_x = x + i * QK_K; + + // Per-block quant_weights and imatrix slices (fixes the imatrix offset bug: + // previously the full-row imatrix was passed and indexed from 0 for every block) + const float * block_qw = quant_weights ? quant_weights + i * QK_K : NULL; + const float * block_im = imatrix ? imatrix + i * QK_K : NULL; + + float sumx2 = 0.0f; + for (int j = 0; j < QK_K; ++j) sumx2 += block_x[j] * block_x[j]; + float sigma2 = sumx2 / QK_K; + + float block_lv[Q2KPT_N_LEVELS]; + // Runs k-means init + EM loop; fills block_lv AND writes the best + // quantized block into y[i] as a side-effect. + q2kpt_optimize_block_levels(block_x, &y[i], block_qw, sigma2, block_lv); + + memcpy(block_levels + i * Q2KPT_N_LEVELS, block_lv, Q2KPT_N_LEVELS * sizeof(float)); + + (void)block_im; // imatrix is folded into block_qw; retained for future use + } +} + +size_t quantize_q2_kpt(const float * GGML_RESTRICT src, + void * GGML_RESTRICT dst, + int64_t start_row, + int64_t nrow, + int64_t n_per_row, + const float * imatrix) { + size_t row_size = ggml_row_size(GGML_TYPE_Q2_KPT, n_per_row); + char * qrow = (char *) dst; + const int nb = (int)(n_per_row / QK_K); + const size_t total_levels = (size_t)nrow * nb * Q2KPT_N_LEVELS; + const size_t levels_needed = (size_t)(start_row + nrow) * nb * Q2KPT_N_LEVELS; + + // Ensure buffer is large enough (should have been pre-allocated via q2kpt_prepare_levels) + if (levels_needed > q2kpt_max_levels) { + q2kpt_block_levels = (float *) realloc(q2kpt_block_levels, levels_needed * sizeof(float)); + q2kpt_max_levels = levels_needed; + } + q2kpt_cur_levels = levels_needed; + + // Temporary buffer for one row's block levels + float * row_block_levels = (float *) malloc(nb * Q2KPT_N_LEVELS * sizeof(float)); + + for (int64_t row = 0; row < nrow; ++row) { + // Quantize row with per-block trained levels + quantize_row_q2_kpt_impl(src, (block_q2_kpt *) qrow, n_per_row, imatrix, imatrix, row_block_levels); + // Copy this row's block levels to the global buffer at the correct offset + memcpy(q2kpt_block_levels + (start_row + row) * nb * Q2KPT_N_LEVELS, row_block_levels, + nb * Q2KPT_N_LEVELS * sizeof(float)); + src += n_per_row; + qrow += row_size; + } + free(row_block_levels); + return nrow * row_size; +} + +// Train per-row levels for all rows of a tensor and write to out_levels. +// out_levels must be pre-allocated to nrow * Q2KPT_N_LEVELS floats. 
+void q2kpt_train_all_row_levels(const float * data, int64_t nrow, int64_t n_per_row,
+                                const float * imatrix, float * out_levels) {
+    for (int64_t r = 0; r < nrow; ++r) {
+        q2kpt_train_levels(data + r * n_per_row, 1, n_per_row, imatrix,
+                           out_levels + r * Q2KPT_N_LEVELS);
+    }
+}
+
+void quantize_row_q2_kpt_ref(const float * GGML_RESTRICT x, block_q2_kpt * GGML_RESTRICT y, int64_t k) {
+    assert(k % QK_K == 0);
+    quantize_q2_kpt(x, y, 0, 1, k, NULL);
+}
+
+// ============================================================================
+// IQ2_TQ: 2-bit quantization with a per-tensor trained 16-entry grid of
+// asymmetric 4-level sets, one grid entry selected per 8-element group
+// (fp16 d + 16 scale bytes + 64 qs bytes per QK_K = 256 weights: 2.5625 bpw)
+// ============================================================================
+//
+// --- Per-tensor grid state ---
+static int8_t iq2tq_active_grid[16][4];
+static bool   iq2tq_grid_set = false;
+
+void iq2tq_set_grid(const int8_t grid[64]) {
+    memcpy(iq2tq_active_grid, grid, 64);
+    iq2tq_grid_set = true;
+}
+
+const int8_t * iq2tq_get_grid(void) {
+    return iq2tq_grid_set ? (const int8_t *)iq2tq_active_grid : NULL;
+}
+
+// --- Default grid: trained via K-means on optimal per-group level-sets ---
+// recon[j] = d * IQ2TQ_GRID_SCALE * grid[si][qi]
+// Only used when no per-tensor grid is available (fallback)
+static const int8_t iq2tq_grid_default[16][4] = {
+    {-20,  -8,  -2,   6},  //  0: {-2.50, -1.00, -0.25, 0.75}
+    {-14,  -8,  -2,   4},  //  1: {-1.75, -1.00, -0.25, 0.50}
+    {-16, -10,   0,  12},  //  2: {-2.00, -1.25,  0.00, 1.50}
+    {-14,  -4,   2,   8},  //  3: {-1.75, -0.50,  0.25, 1.00}
+    {-20,  -4,   4,  12},  //  4: {-2.50, -0.50,  0.50, 1.50}
+    { -8,  -4,   0,   4},  //  5: {-1.00, -0.50,  0.00, 0.50}
+    { -8,  -4,   0,   8},  //  6: {-1.00, -0.50,  0.00, 1.00}
+    {-12,  -6,   2,  12},  //  7: {-1.50, -0.75,  0.25, 1.50}
+    { -4,  -2,   2,   4},  //  8: {-0.50, -0.25,  0.25, 0.50}
+    {-10,  -2,   4,   8},  //  9: {-1.25, -0.25,  0.50, 1.00}
+    {-16,  -6,   4,  20},  // 10: {-2.00, -0.75,  0.50, 2.50}
+    {-12,  -2,   6,  14},  // 11: {-1.50, -0.25,  0.75, 1.75}
+    { -8,  -2,   4,  14},  // 12: {-1.00, -0.25,  0.50, 1.75}
+    { -4,   0,   4,   8},  // 13: {-0.50,  0.00,  0.50, 1.00}
+    { -8,  -2,   6,  22},  // 14: {-1.00, -0.25,  0.75, 2.75}
+    { -4,   2,   8,  14},  // 15: {-0.50,  0.25,  1.00, 1.75}
+};
+
+// Get current grid: per-tensor if set, else default
+static inline const int8_t (*iq2tq_cur_grid(void))[4] {
+    return iq2tq_grid_set ? (const int8_t (*)[4])iq2tq_active_grid
+                          : (const int8_t (*)[4])iq2tq_grid_default;
+}
+
+// Unpack 2-bit index from qs array: element j is in bits (j%4)*2..(j%4)*2+1 of byte j/4
+static inline int iq2tq_get_qi(const uint8_t * qs, int j) {
+    return (qs[j / 4] >> ((j % 4) * 2)) & 3;
+}
+
+// Pack 2-bit index into qs array
+static inline void iq2tq_set_qi(uint8_t * qs, int j, int val) {
+    int byte_idx = j / 4;
+    int bit_off  = (j % 4) * 2;
+    qs[byte_idx] = (qs[byte_idx] & ~(3 << bit_off)) | ((val & 3) << bit_off);
+}
+
+// Find nearest qi in sorted grid entry via midpoint boundaries
+static inline int iq2tq_nearest_qi(float xn, const int8_t * g) {
+    if (xn <= 0.5f * (g[0] + g[1])) return 0;
+    if (xn <= 0.5f * (g[1] + g[2])) return 1;
+    if (xn <= 0.5f * (g[2] + g[3])) return 2;
+    return 3;
+}
+
+// Dequantization — 2-bit with asymmetric grid per group
+void dequantize_row_iq2_tq(const block_iq2_tq * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) {
+    const int8_t (*grid)[4] = levels ? 
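/* Worked example of the qs layout handled by iq2tq_get_qi/iq2tq_set_qi above:
 * element j = 5 lives in byte j/4 = 1 at bit offset (5%4)*2 = 2, so
 * iq2tq_set_qi(qs, 5, 2) stores (qs[1] & ~0x0C) | (2 << 2), and
 * iq2tq_get_qi(qs, 5) reads it back as (qs[1] >> 2) & 3 == 2. */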
(const int8_t (*)[4])levels + : (const int8_t (*)[4])iq2tq_grid_default; + const int nb = k / QK_K; + + for (int i = 0; i < nb; ++i) { + const block_iq2_tq * b = x + i; + const float dq = GGML_FP16_TO_FP32(b->d) * IQ2TQ_GRID_SCALE; + + for (int g = 0; g < IQ2TQ_N_GROUPS; ++g) { + int si = (b->scales[g / 2] >> (4 * (g % 2))) & 0xF; + const int8_t * ge = grid[si]; + + for (int k2 = 0; k2 < 8; ++k2) { + int j = g * 8 + k2; + int qi = iq2tq_get_qi(b->qs, j); + y[i * QK_K + j] = dq * (float)ge[qi]; + } + } + } +} + +// Reference quantization +void quantize_row_iq2_tq_ref(const float * GGML_RESTRICT x, block_iq2_tq * GGML_RESTRICT y, int64_t k) { + quantize_iq2_tq(x, y, 1, k, NULL); +} + +// Quantize one row — 2-bit with asymmetric grid + OLS d +static void quantize_row_iq2_tq_impl( + const float * GGML_RESTRICT x, + block_iq2_tq * GGML_RESTRICT y, + int64_t n_per_row, + const float * GGML_RESTRICT quant_weights +) { + assert(n_per_row % QK_K == 0); + const int8_t (*grid)[4] = iq2tq_cur_grid(); + const int nb = n_per_row / QK_K; + + for (int bi = 0; bi < nb; ++bi) { + const float * xb = x + bi * QK_K; + block_iq2_tq * yb = y + bi; + memset(yb, 0, sizeof(block_iq2_tq)); + + // Compute sigma2 for importance weighting (higher = more uniform, lower = more magnitude-biased) + float sigma2 = 0; + for (int j = 0; j < QK_K; ++j) sigma2 += xb[j] * xb[j]; + sigma2 = 8.0f * sigma2 / QK_K; + + float amax = 0; + for (int j = 0; j < QK_K; ++j) { + float av = fabsf(xb[j]); + if (av > amax) amax = av; + } + if (amax < 1e-15f) continue; + + // Initial d: max grid value ~24 in int8, recon = d * 0.125 * 24 = 3d → d = amax/3 + float d = amax / 3.0f; + + uint8_t qs[64] = {0}; + uint8_t scales_out[16] = {0}; + int grid_idx[IQ2TQ_N_GROUPS]; // chosen grid entry per group + + // Phase 1: For each group, try all 16 grid entries, pick best + float dq = d * IQ2TQ_GRID_SCALE; + float inv_dq = (fabsf(dq) > 1e-15f) ? 1.0f / dq : 0.0f; + for (int g = 0; g < IQ2TQ_N_GROUPS; ++g) { + float best_err = 1e30f; + int best_si = 3; // default: centered medium + + for (int si = 0; si < 16; ++si) { + const int8_t * ge = grid[si]; + float g_err = 0; + for (int k = 0; k < 8; ++k) { + int j = g * 8 + k; + float xn = xb[j] * inv_dq; + int qi = iq2tq_nearest_qi(xn, ge); + float recon = dq * (float)ge[qi]; + float wk = quant_weights ? quant_weights[bi * QK_K + j] * sqrtf(sigma2 + xb[j] * xb[j]) : 1.0f; + float err = xb[j] - recon; + g_err += wk * err * err; + } + if (g_err < best_err) { best_err = g_err; best_si = si; } + } + grid_idx[g] = best_si; + + // Quantize elements with chosen grid + const int8_t * ge = grid[best_si]; + for (int k = 0; k < 8; ++k) { + int j = g * 8 + k; + iq2tq_set_qi(qs, j, iq2tq_nearest_qi(xb[j] * inv_dq, ge)); + } + } + + // Iterative refinement + for (int iter = 0; iter < 12; ++iter) { + // Re-fit d via weighted OLS: d = sum(w*x*g) / (GRID_SCALE * sum(w*g*g)) + double sumxg = 0, sumgg = 0; + for (int j = 0; j < QK_K; ++j) { + int g = j / 8; + float gval = (float)grid[grid_idx[g]][iq2tq_get_qi(qs, j)]; + float wk = quant_weights ? quant_weights[bi * QK_K + j] * sqrtf(sigma2 + xb[j] * xb[j]) : 1.0f; + sumxg += wk * xb[j] * gval; + sumgg += wk * gval * gval; + } + d = (sumgg > 0) ? (float)(sumxg / (IQ2TQ_GRID_SCALE * sumgg)) : d; + + // Re-optimize grids and re-quantize + dq = d * IQ2TQ_GRID_SCALE; + inv_dq = (fabsf(dq) > 1e-15f) ? 
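/* Derivation of the OLS update computed above: with reconstruction
 * r_j = d * S * g_j (S = IQ2TQ_GRID_SCALE), setting
 * d/dd [ sum_j w_j (x_j - d*S*g_j)^2 ] = 0 gives
 * d = sum_j w_j x_j g_j / (S * sum_j w_j g_j^2), i.e. sumxg / (S * sumgg). */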
1.0f / dq : 0.0f; + memset(scales_out, 0, 16); + for (int g = 0; g < IQ2TQ_N_GROUPS; ++g) { + float best_err = 1e30f; + int best_si = 3; + + for (int si = 0; si < 16; ++si) { + const int8_t * ge = grid[si]; + float g_err = 0; + for (int k = 0; k < 8; ++k) { + int j = g * 8 + k; + float xn = xb[j] * inv_dq; + int qi = iq2tq_nearest_qi(xn, ge); + float recon = dq * (float)ge[qi]; + float wk = quant_weights ? quant_weights[bi * QK_K + j] * sqrtf(sigma2 + xb[j] * xb[j]) : 1.0f; + float err = xb[j] - recon; + g_err += wk * err * err; + } + if (g_err < best_err) { best_err = g_err; best_si = si; } + } + scales_out[g / 2] |= (best_si << (4 * (g % 2))); + grid_idx[g] = best_si; + + const int8_t * ge = grid[best_si]; + for (int k = 0; k < 8; ++k) { + int j = g * 8 + k; + iq2tq_set_qi(qs, j, iq2tq_nearest_qi(xb[j] * inv_dq, ge)); + } + } + } + + // Final OLS d + { + double sumxg = 0, sumgg = 0; + for (int j = 0; j < QK_K; ++j) { + int g = j / 8; + float gval = (float)grid[grid_idx[g]][iq2tq_get_qi(qs, j)]; + float wk = quant_weights ? quant_weights[bi * QK_K + j] * sqrtf(sigma2 + xb[j] * xb[j]) : 1.0f; + sumxg += wk * xb[j] * gval; + sumgg += wk * gval * gval; + } + d = (sumgg > 0) ? (float)(sumxg / (IQ2TQ_GRID_SCALE * sumgg)) : d; + } + + // Multi-d search: try nearby d values and re-optimize grids + { + float best_d = d; + float best_total_err = 1e30f; + uint8_t best_scales[16], best_qs[64]; + int best_grid_idx[IQ2TQ_N_GROUPS]; + static const float d_factors[] = {0.8f, 0.85f, 0.9f, 0.925f, 0.95f, 0.975f, 1.0f, 1.025f, 1.05f, 1.075f, 1.1f, 1.15f, 1.2f}; + static const int n_d_factors = sizeof(d_factors) / sizeof(d_factors[0]); + + for (int df = 0; df < n_d_factors; ++df) { + float td = d * d_factors[df]; + float tdq = td * IQ2TQ_GRID_SCALE; + float tinv = (fabsf(tdq) > 1e-15f) ? 1.0f / tdq : 0.0f; + uint8_t tscales[16] = {0}; + uint8_t tqs[64] = {0}; + int tgrid[IQ2TQ_N_GROUPS]; + float total_err = 0; + + for (int g = 0; g < IQ2TQ_N_GROUPS; ++g) { + float best_err = 1e30f; + int best_si = 3; + for (int si = 0; si < 16; ++si) { + const int8_t * ge = grid[si]; + float g_err = 0; + for (int k = 0; k < 8; ++k) { + int j = g * 8 + k; + float xn = xb[j] * tinv; + int qi = iq2tq_nearest_qi(xn, ge); + float recon = tdq * (float)ge[qi]; + float wk = quant_weights ? quant_weights[bi * QK_K + j] * sqrtf(sigma2 + xb[j] * xb[j]) : 1.0f; + float err = xb[j] - recon; + g_err += wk * err * err; + } + if (g_err < best_err) { best_err = g_err; best_si = si; } + } + tscales[g / 2] |= (best_si << (4 * (g % 2))); + tgrid[g] = best_si; + const int8_t * ge = grid[best_si]; + for (int k = 0; k < 8; ++k) { + int j = g * 8 + k; + iq2tq_set_qi(tqs, j, iq2tq_nearest_qi(xb[j] * tinv, ge)); + } + total_err += best_err; + } + + if (total_err < best_total_err) { + best_total_err = total_err; + best_d = td; + memcpy(best_scales, tscales, 16); + memcpy(best_qs, tqs, 64); + memcpy(best_grid_idx, tgrid, sizeof(tgrid)); + } + } + d = best_d; + memcpy(scales_out, best_scales, 16); + memcpy(qs, best_qs, 64); + memcpy(grid_idx, best_grid_idx, sizeof(grid_idx)); + } + + // Post multi-d refinement: 2 more OLS+grid iterations from the best d + for (int iter = 0; iter < 2; ++iter) { + double sumxg = 0, sumgg = 0; + for (int j = 0; j < QK_K; ++j) { + int g = j / 8; + float gval = (float)grid[grid_idx[g]][iq2tq_get_qi(qs, j)]; + float wk = quant_weights ? quant_weights[bi * QK_K + j] * sqrtf(sigma2 + xb[j] * xb[j]) : 1.0f; + sumxg += wk * xb[j] * gval; + sumgg += wk * gval * gval; + } + d = (sumgg > 0) ? 
(float)(sumxg / (IQ2TQ_GRID_SCALE * sumgg)) : d;
+
+            dq = d * IQ2TQ_GRID_SCALE;
+            inv_dq = (fabsf(dq) > 1e-15f) ? 1.0f / dq : 0.0f;
+            memset(scales_out, 0, 16);
+            for (int g = 0; g < IQ2TQ_N_GROUPS; ++g) {
+                float best_err = 1e30f;
+                int best_si = 3;
+                for (int si = 0; si < 16; ++si) {
+                    const int8_t * ge = grid[si];
+                    float g_err = 0;
+                    for (int k = 0; k < 8; ++k) {
+                        int j = g * 8 + k;
+                        float xn = xb[j] * inv_dq;
+                        int qi = iq2tq_nearest_qi(xn, ge);
+                        float recon = dq * (float)ge[qi];
+                        float wk = quant_weights ? quant_weights[bi * QK_K + j] * sqrtf(sigma2 + xb[j] * xb[j]) : 1.0f;
+                        float err = xb[j] - recon;
+                        g_err += wk * err * err;
+                    }
+                    if (g_err < best_err) { best_err = g_err; best_si = si; }
+                }
+                scales_out[g / 2] |= (best_si << (4 * (g % 2)));
+                grid_idx[g] = best_si;
+                const int8_t * ge = grid[best_si];
+                for (int k = 0; k < 8; ++k) {
+                    int j = g * 8 + k;
+                    iq2tq_set_qi(qs, j, iq2tq_nearest_qi(xb[j] * inv_dq, ge));
+                }
+            }
+        }
+
+        // Write result
+        yb->d = GGML_FP32_TO_FP16(d);
+        memcpy(yb->scales, scales_out, 16);
+        memcpy(yb->qs, qs, 64);
+    }
+}
+
+// Public quantize function
+size_t quantize_iq2_tq(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
+                       int64_t nrows, int64_t n_per_row, const float * imatrix) {
+    size_t row_size = ggml_row_size(GGML_TYPE_IQ2_TQ, n_per_row);
+    char * qrow = (char *)dst;
+    for (int64_t row = 0; row < nrows; ++row) {
+        quantize_row_iq2_tq_impl(src, (block_iq2_tq *)qrow, n_per_row, imatrix);
+        src += n_per_row;
+        qrow += row_size;
+    }
+    return nrows * row_size;
+}
+
+// Train per-tensor grid via K-means on optimal per-group level-sets
+void iq2tq_train_grid(const float * data, int64_t nrow, int64_t n_per_row,
+                      const float * imatrix, int8_t grid_out[64]) {
+    assert(n_per_row % QK_K == 0);
+    const int nb_per_row = n_per_row / QK_K;
+    const int max_groups = 200000;
+
+    // Collect optimal per-group level-sets with importance weights
+    float (*sets)[4] = (float (*)[4])malloc(max_groups * 4 * sizeof(float));
+    float * set_weights = (float *)malloc(max_groups * sizeof(float));
+    int n_sets = 0;
+
+    for (int64_t row = 0; row < nrow && n_sets < max_groups - IQ2TQ_N_GROUPS; row++) {
+        const float * xrow = data + row * n_per_row;
+        for (int bi = 0; bi < nb_per_row && n_sets < max_groups; bi++) {
+            const float * xb = xrow + bi * QK_K;
+
+            // Compute d = amax / 3
+            float amax = 0;
+            for (int j = 0; j < QK_K; j++) {
+                float av = fabsf(xb[j]);
+                if (av > amax) amax = av;
+            }
+            if (amax < 1e-15f) continue;
+            float inv_dq = 1.0f / (amax / 3.0f * IQ2TQ_GRID_SCALE);
+
+            for (int g = 0; g < IQ2TQ_N_GROUPS && n_sets < max_groups; g++) {
+                // Compute importance weight for this group (imatrix is per
+                // column, so offset by this block's position in the row)
+                float gw = 1.0f;
+                if (imatrix) {
+                    gw = 0;
+                    for (int k = 0; k < 8; k++) gw += imatrix[bi * QK_K + g * 8 + k];
+                    gw /= 8.0f;
+                    if (gw < 1e-10f) gw = 1e-10f;
+                }
+
+                // Normalize group values by d*GRID_SCALE
+                float xn[8];
+                for (int k = 0; k < 8; k++) {
+                    xn[k] = xb[g * 8 + k] * inv_dq;
+                }
+
+                // Sort
+                for (int i = 0; i < 7; i++)
+                    for (int j2 = i + 1; j2 < 8; j2++)
+                        if (xn[j2] < xn[i]) { float t = xn[i]; xn[i] = xn[j2]; xn[j2] = t; }
+
+                // Optimal 4-level: split sorted 8 values into 4 pairs, compute centroids
+                // Use Lloyd-Max: iterate assignment + centroid recomputation
+                float L[4] = { xn[0], xn[2], xn[5], xn[7] };
+                for (int iter = 0; iter < 15; iter++) {
+                    float b01 = 0.5f * (L[0] + L[1]);
+                    float b12 = 0.5f * (L[1] + L[2]);
+                    float b23 = 0.5f * (L[2] + L[3]);
+                    float sum[4] = {0}; int cnt[4] = {0};
+                    for (int k = 0; k < 8; k++) {
+                        int qi = (xn[k] <= b01) ? 0 : (xn[k] <= b12) ? 
1 : (xn[k] <= b23) ? 2 : 3; + sum[qi] += xn[k]; cnt[qi]++; + } + int converged = 1; + for (int q = 0; q < 4; q++) { + float newL = (cnt[q] > 0) ? sum[q] / cnt[q] : L[q]; + if (fabsf(newL - L[q]) > 0.01f) converged = 0; + L[q] = newL; + } + if (converged) break; + } + + memcpy(sets[n_sets], L, 4 * sizeof(float)); + set_weights[n_sets] = gw; + n_sets++; + } + } + } + + // K-means clustering of level-sets into 16 clusters + float centroids[16][4]; + // Initialize: spread evenly + for (int i = 0; i < 16; i++) { + int idx = (int)((int64_t)i * n_sets / 16); + memcpy(centroids[i], sets[idx], 4 * sizeof(float)); + } + + int * assign = (int *)calloc(n_sets, sizeof(int)); + for (int iter = 0; iter < 100; iter++) { + int changed = 0; + for (int i = 0; i < n_sets; i++) { + float best_dist = 1e30f; + int best_c = 0; + for (int c = 0; c < 16; c++) { + float dist = 0; + for (int d = 0; d < 4; d++) { + float diff = sets[i][d] - centroids[c][d]; + dist += diff * diff; + } + if (dist < best_dist) { best_dist = dist; best_c = c; } + } + if (assign[i] != best_c) { assign[i] = best_c; changed++; } + } + + double sum[16][4] = {{0}}; + double wcnt[16] = {0}; + for (int i = 0; i < n_sets; i++) { + int c = assign[i]; + float w = set_weights[i]; + for (int d = 0; d < 4; d++) sum[c][d] += w * sets[i][d]; + wcnt[c] += w; + } + for (int c = 0; c < 16; c++) { + if (wcnt[c] > 0) { + for (int d = 0; d < 4; d++) centroids[c][d] = (float)(sum[c][d] / wcnt[c]); + } + } + if (changed == 0) break; + } + + // Sort centroids by sum of levels for consistent ordering + for (int i = 0; i < 15; i++) { + for (int j2 = i + 1; j2 < 16; j2++) { + float si2 = centroids[i][0] + centroids[i][1] + centroids[i][2] + centroids[i][3]; + float sj = centroids[j2][0] + centroids[j2][1] + centroids[j2][2] + centroids[j2][3]; + if (sj < si2) { + float tmp[4]; + memcpy(tmp, centroids[i], 16); + memcpy(centroids[i], centroids[j2], 16); + memcpy(centroids[j2], tmp, 16); + } + } + } + + // Round to int8 and output + for (int i = 0; i < 16; i++) { + for (int d = 0; d < 4; d++) { + float v = centroids[i][d]; + int8_t iv = (int8_t)(v > 0 ? v + 0.5f : v - 0.5f); + grid_out[i * 4 + d] = iv; + } + } + + free(sets); + free(set_weights); + free(assign); +} + +// ============================================================================ +// IQ3_TQ: 3-bit scalar quantization with per-tensor trained grid table +// ============================================================================ + +// --- Per-tensor grid state (16 entries × 8 levels = 128 bytes) --- +static int8_t iq3tq_active_grid[16][IQ3TQ_N_LEVELS]; +static bool iq3tq_grid_set = false; + +void iq3tq_set_grid(const int8_t grid[IQ3TQ_GRID_SIZE]) { + memcpy(iq3tq_active_grid, grid, IQ3TQ_GRID_SIZE); + iq3tq_grid_set = true; +} + +const int8_t * iq3tq_get_grid(void) { + return iq3tq_grid_set ? 
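/*
 * Caller-side sketch of how a per-tensor grid is wired up. The iq3tq_ entry
 * points are the ones in this file (the iq2tq_ flow is analogous); the driver
 * code around them is hypothetical:
 *
 *     int8_t grid[IQ3TQ_GRID_SIZE];
 *     iq3tq_train_grid(f32_data, nrows, n_per_row, imatrix, grid); // K-means fit
 *     iq3tq_set_grid(grid);                                        // activate for quantization
 *     quantize_iq3_tq(f32_data, q_data, nrows, n_per_row, imatrix);
 *     // the grid must be stored alongside the tensor and handed back to
 *     // dequantize_row_iq3_tq() via its `levels` argument
 */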
(const int8_t *)iq3tq_active_grid : NULL; +} + +// Default grid: 16 entries × 8 int8 levels (fallback only, per-tensor training is used in practice) +static const int8_t iq3tq_grid_default[16][IQ3TQ_N_LEVELS] = { + {-24,-18,-12, -6, 0, 6, 12, 18}, + {-20,-15,-10, -5, 0, 5, 10, 15}, + {-16,-12, -8, -4, 0, 4, 8, 12}, + {-12, -8, -4, -2, 0, 2, 4, 8}, + {-24,-16, -8, -2, 2, 6, 10, 14}, + {-14,-10, -6, -2, 2, 8, 16, 24}, + {-20,-14, -8, -4, 0, 4, 10, 18}, + {-18,-10, -4, 0, 4, 8, 14, 20}, + { -8, -6, -4, -2, 0, 2, 4, 6}, + {-10, -6, -4, -2, 2, 4, 6, 10}, + {-22,-14, -6, -2, 2, 6, 14, 22}, + {-16, -8, -4, -2, 0, 4, 8, 16}, + {-24,-20,-16,-12, -8, -4, 0, 4}, + { -4, 0, 4, 8, 12, 16, 20, 24}, + {-20,-16,-10, -4, 4, 10, 16, 20}, + {-12, -8, -6, -2, 2, 6, 8, 12}, +}; + +static inline const int8_t (*iq3tq_cur_grid(void))[IQ3TQ_N_LEVELS] { + return iq3tq_grid_set ? (const int8_t (*)[IQ3TQ_N_LEVELS])iq3tq_active_grid + : (const int8_t (*)[IQ3TQ_N_LEVELS])iq3tq_grid_default; +} + +// 3-bit pack/unpack +static inline int iq3tq_get_qi(const uint8_t * qs, int j) { + int bit_pos = j * 3; + int byte_idx = bit_pos >> 3; + int bit_off = bit_pos & 7; + uint16_t val = qs[byte_idx]; + if (bit_off > 5) val |= ((uint16_t)qs[byte_idx + 1] << 8); + return (val >> bit_off) & 7; +} + +static inline void iq3tq_set_qi(uint8_t * qs, int j, int val) { + int bit_pos = j * 3; + int byte_idx = bit_pos >> 3; + int bit_off = bit_pos & 7; + qs[byte_idx] &= ~((7 << bit_off) & 0xFF); + qs[byte_idx] |= ((val & 7) << bit_off) & 0xFF; + if (bit_off > 5) { + qs[byte_idx + 1] &= ~((7 >> (8 - bit_off)) & 0xFF); + qs[byte_idx + 1] |= ((val & 7) >> (8 - bit_off)) & 0xFF; + } +} + +// Find nearest qi in sorted 8-level grid entry +static inline int iq3tq_nearest_qi(float xn, const int8_t * g) { + for (int i = 0; i < 7; ++i) { + if (xn <= 0.5f * (g[i] + g[i+1])) return i; + } + return 7; +} + +// Dequantization +void dequantize_row_iq3_tq(const block_iq3_tq * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { + const int8_t (*grid)[IQ3TQ_N_LEVELS] = levels ? 
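/* Worked example of the 3-bit packing handled by iq3tq_get_qi/iq3tq_set_qi
 * above: element j = 5 starts at bit 15, i.e. byte 1 at bit offset 7, so its
 * low bit is the top bit of qs[1] and its high two bits are the bottom two
 * bits of qs[2]; since bit_off > 5, the getter assembles the 16-bit value
 * qs[1] | (qs[2] << 8) and shifts right by 7. */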
(const int8_t (*)[IQ3TQ_N_LEVELS])levels + : (const int8_t (*)[IQ3TQ_N_LEVELS])iq3tq_grid_default; + const int nb = k / QK_K; + + for (int i = 0; i < nb; ++i) { + const block_iq3_tq * b = x + i; + const float dq = GGML_FP16_TO_FP32(b->d) * IQ3TQ_GRID_SCALE; + + for (int g = 0; g < IQ3TQ_N_GROUPS; ++g) { + int si = (b->scales[g / 2] >> (4 * (g % 2))) & 0xF; + const int8_t * ge = grid[si]; + + for (int k2 = 0; k2 < 8; ++k2) { + int j = g * 8 + k2; + int qi = iq3tq_get_qi(b->qs, j); + y[i * QK_K + j] = dq * (float)ge[qi]; + } + } + } +} + +void quantize_row_iq3_tq_ref(const float * GGML_RESTRICT x, block_iq3_tq * GGML_RESTRICT y, int64_t k) { + quantize_iq3_tq(x, y, 1, k, NULL); +} + +// Quantize one row — 3-bit with asymmetric grid + OLS d +static void quantize_row_iq3_tq_impl( + const float * GGML_RESTRICT x, + block_iq3_tq * GGML_RESTRICT y, + int64_t n_per_row, + const float * GGML_RESTRICT quant_weights +) { + assert(n_per_row % QK_K == 0); + const int8_t (*grid)[IQ3TQ_N_LEVELS] = iq3tq_cur_grid(); + const int nb = n_per_row / QK_K; + + for (int bi = 0; bi < nb; ++bi) { + const float * xb = x + bi * QK_K; + block_iq3_tq * yb = y + bi; + memset(yb, 0, sizeof(block_iq3_tq)); + + float sigma2 = 0; + for (int j = 0; j < QK_K; ++j) sigma2 += xb[j] * xb[j]; + sigma2 = 8.0f * sigma2 / QK_K; + + float amax = 0; + for (int j = 0; j < QK_K; ++j) { + float av = fabsf(xb[j]); + if (av > amax) amax = av; + } + if (amax < 1e-15f) continue; + + float d = amax / 3.0f; + + uint8_t qs[96] = {0}; + uint8_t scales_out[16] = {0}; + int grid_idx[IQ3TQ_N_GROUPS]; + + // Phase 1: For each group, try all 16 grid entries, pick best + float dq = d * IQ3TQ_GRID_SCALE; + float inv_dq = (fabsf(dq) > 1e-15f) ? 1.0f / dq : 0.0f; + for (int g = 0; g < IQ3TQ_N_GROUPS; ++g) { + float best_err = 1e30f; + int best_si = 0; + + for (int si = 0; si < 16; ++si) { + const int8_t * ge = grid[si]; + float g_err = 0; + for (int k = 0; k < 8; ++k) { + int j = g * 8 + k; + float xn = xb[j] * inv_dq; + int qi = iq3tq_nearest_qi(xn, ge); + float recon = dq * (float)ge[qi]; + float wk = quant_weights ? quant_weights[bi * QK_K + j] * sqrtf(sigma2 + xb[j] * xb[j]) : 1.0f; + float err = xb[j] - recon; + g_err += wk * err * err; + } + if (g_err < best_err) { best_err = g_err; best_si = si; } + } + grid_idx[g] = best_si; + const int8_t * ge = grid[best_si]; + for (int k = 0; k < 8; ++k) { + int j = g * 8 + k; + iq3tq_set_qi(qs, j, iq3tq_nearest_qi(xb[j] * inv_dq, ge)); + } + } + + // Iterative refinement + for (int iter = 0; iter < 12; ++iter) { + double sumxg = 0, sumgg = 0; + for (int j = 0; j < QK_K; ++j) { + int g = j / 8; + float gval = (float)grid[grid_idx[g]][iq3tq_get_qi(qs, j)]; + float wk = quant_weights ? quant_weights[bi * QK_K + j] * sqrtf(sigma2 + xb[j] * xb[j]) : 1.0f; + sumxg += wk * xb[j] * gval; + sumgg += wk * gval * gval; + } + d = (sumgg > 0) ? (float)(sumxg / (IQ3TQ_GRID_SCALE * sumgg)) : d; + + dq = d * IQ3TQ_GRID_SCALE; + inv_dq = (fabsf(dq) > 1e-15f) ? 1.0f / dq : 0.0f; + memset(scales_out, 0, 16); + for (int g = 0; g < IQ3TQ_N_GROUPS; ++g) { + float best_err = 1e30f; + int best_si = 0; + for (int si = 0; si < 16; ++si) { + const int8_t * ge = grid[si]; + float g_err = 0; + for (int k = 0; k < 8; ++k) { + int j = g * 8 + k; + float xn = xb[j] * inv_dq; + int qi = iq3tq_nearest_qi(xn, ge); + float recon = dq * (float)ge[qi]; + float wk = quant_weights ? 
quant_weights[bi * QK_K + j] * sqrtf(sigma2 + xb[j] * xb[j]) : 1.0f; + float err = xb[j] - recon; + g_err += wk * err * err; + } + if (g_err < best_err) { best_err = g_err; best_si = si; } + } + scales_out[g / 2] |= (best_si << (4 * (g % 2))); + grid_idx[g] = best_si; + const int8_t * ge = grid[best_si]; + for (int k = 0; k < 8; ++k) { + int j = g * 8 + k; + iq3tq_set_qi(qs, j, iq3tq_nearest_qi(xb[j] * inv_dq, ge)); + } + } + } + + // Final OLS d + { + double sumxg = 0, sumgg = 0; + for (int j = 0; j < QK_K; ++j) { + int g = j / 8; + float gval = (float)grid[grid_idx[g]][iq3tq_get_qi(qs, j)]; + float wk = quant_weights ? quant_weights[bi * QK_K + j] * sqrtf(sigma2 + xb[j] * xb[j]) : 1.0f; + sumxg += wk * xb[j] * gval; + sumgg += wk * gval * gval; + } + d = (sumgg > 0) ? (float)(sumxg / (IQ3TQ_GRID_SCALE * sumgg)) : d; + } + + // Multi-d search + { + float best_d = d; + float best_total_err = 1e30f; + uint8_t best_scales[16], best_qs[96]; + int best_grid_idx[IQ3TQ_N_GROUPS]; + static const float d_factors[] = {0.8f, 0.85f, 0.9f, 0.925f, 0.95f, 0.975f, 1.0f, 1.025f, 1.05f, 1.075f, 1.1f, 1.15f, 1.2f}; + static const int n_d_factors = sizeof(d_factors) / sizeof(d_factors[0]); + + for (int df = 0; df < n_d_factors; ++df) { + float td = d * d_factors[df]; + float tdq = td * IQ3TQ_GRID_SCALE; + float tinv = (fabsf(tdq) > 1e-15f) ? 1.0f / tdq : 0.0f; + uint8_t tscales[16] = {0}; + uint8_t tqs[96] = {0}; + int tgrid[IQ3TQ_N_GROUPS]; + float total_err = 0; + + for (int g = 0; g < IQ3TQ_N_GROUPS; ++g) { + float best_err = 1e30f; + int best_si = 0; + for (int si = 0; si < 16; ++si) { + const int8_t * ge = grid[si]; + float g_err = 0; + for (int k = 0; k < 8; ++k) { + int j = g * 8 + k; + float xn = xb[j] * tinv; + int qi = iq3tq_nearest_qi(xn, ge); + float recon = tdq * (float)ge[qi]; + float wk = quant_weights ? quant_weights[bi * QK_K + j] * sqrtf(sigma2 + xb[j] * xb[j]) : 1.0f; + float err = xb[j] - recon; + g_err += wk * err * err; + } + if (g_err < best_err) { best_err = g_err; best_si = si; } + } + tscales[g / 2] |= (best_si << (4 * (g % 2))); + tgrid[g] = best_si; + const int8_t * ge = grid[best_si]; + for (int k = 0; k < 8; ++k) { + int j = g * 8 + k; + iq3tq_set_qi(tqs, j, iq3tq_nearest_qi(xb[j] * tinv, ge)); + } + total_err += best_err; + } + + if (total_err < best_total_err) { + best_total_err = total_err; + best_d = td; + memcpy(best_scales, tscales, 16); + memcpy(best_qs, tqs, 96); + memcpy(best_grid_idx, tgrid, sizeof(tgrid)); + } + } + d = best_d; + memcpy(scales_out, best_scales, 16); + memcpy(qs, best_qs, 96); + memcpy(grid_idx, best_grid_idx, sizeof(grid_idx)); + } + + // Post multi-d refinement + for (int iter = 0; iter < 2; ++iter) { + double sumxg = 0, sumgg = 0; + for (int j = 0; j < QK_K; ++j) { + int g = j / 8; + float gval = (float)grid[grid_idx[g]][iq3tq_get_qi(qs, j)]; + float wk = quant_weights ? quant_weights[bi * QK_K + j] * sqrtf(sigma2 + xb[j] * xb[j]) : 1.0f; + sumxg += wk * xb[j] * gval; + sumgg += wk * gval * gval; + } + d = (sumgg > 0) ? (float)(sumxg / (IQ3TQ_GRID_SCALE * sumgg)) : d; + + dq = d * IQ3TQ_GRID_SCALE; + inv_dq = (fabsf(dq) > 1e-15f) ? 1.0f / dq : 0.0f; + memset(scales_out, 0, 16); + for (int g = 0; g < IQ3TQ_N_GROUPS; ++g) { + float best_err = 1e30f; + int best_si = 0; + for (int si = 0; si < 16; ++si) { + const int8_t * ge = grid[si]; + float g_err = 0; + for (int k = 0; k < 8; ++k) { + int j = g * 8 + k; + float xn = xb[j] * inv_dq; + int qi = iq3tq_nearest_qi(xn, ge); + float recon = dq * (float)ge[qi]; + float wk = quant_weights ? 
quant_weights[bi * QK_K + j] * sqrtf(sigma2 + xb[j] * xb[j]) : 1.0f;
+                        float err = xb[j] - recon;
+                        g_err += wk * err * err;
+                    }
+                    if (g_err < best_err) { best_err = g_err; best_si = si; }
+                }
+                scales_out[g / 2] |= (best_si << (4 * (g % 2)));
+                grid_idx[g] = best_si;
+                const int8_t * ge = grid[best_si];
+                for (int k = 0; k < 8; ++k) {
+                    int j = g * 8 + k;
+                    iq3tq_set_qi(qs, j, iq3tq_nearest_qi(xb[j] * inv_dq, ge));
+                }
+            }
+        }
+
+        yb->d = GGML_FP32_TO_FP16(d);
+        memcpy(yb->scales, scales_out, 16);
+        memcpy(yb->qs, qs, 96);
+    }
+}
+
+size_t quantize_iq3_tq(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
+                       int64_t nrows, int64_t n_per_row, const float * imatrix) {
+    size_t row_size = ggml_row_size(GGML_TYPE_IQ3_TQ, n_per_row);
+    char * qrow = (char *)dst;
+    for (int64_t row = 0; row < nrows; ++row) {
+        quantize_row_iq3_tq_impl(src, (block_iq3_tq *)qrow, n_per_row, imatrix);
+        src += n_per_row;
+        qrow += row_size;
+    }
+    return nrows * row_size;
+}
+
+// Train per-tensor grid: 16 entries × 8 levels via K-means on optimal per-group level-sets
+void iq3tq_train_grid(const float * data, int64_t nrow, int64_t n_per_row,
+                      const float * imatrix, int8_t grid_out[IQ3TQ_GRID_SIZE]) {
+    assert(n_per_row % QK_K == 0);
+    const int nb_per_row = n_per_row / QK_K;
+    const int max_groups = 200000;
+    const int NL = IQ3TQ_N_LEVELS; // 8
+
+    float (*sets)[IQ3TQ_N_LEVELS] = (float (*)[IQ3TQ_N_LEVELS])malloc(max_groups * NL * sizeof(float));
+    float * set_weights = (float *)malloc(max_groups * sizeof(float));
+    int n_sets = 0;
+
+    for (int64_t row = 0; row < nrow && n_sets < max_groups - IQ3TQ_N_GROUPS; row++) {
+        const float * xrow = data + row * n_per_row;
+        for (int bi = 0; bi < nb_per_row && n_sets < max_groups; bi++) {
+            const float * xb = xrow + bi * QK_K;
+
+            float amax = 0;
+            for (int j = 0; j < QK_K; j++) {
+                float av = fabsf(xb[j]);
+                if (av > amax) amax = av;
+            }
+            if (amax < 1e-15f) continue;
+            float inv_dq = 1.0f / (amax / 3.0f * IQ3TQ_GRID_SCALE);
+
+            for (int g = 0; g < IQ3TQ_N_GROUPS && n_sets < max_groups; g++) {
+                // Importance weight for this group (imatrix is per column,
+                // so offset by this block's position in the row)
+                float gw = 1.0f;
+                if (imatrix) {
+                    gw = 0;
+                    for (int k = 0; k < 8; k++) gw += imatrix[bi * QK_K + g * 8 + k];
+                    gw /= 8.0f;
+                    if (gw < 1e-10f) gw = 1e-10f;
+                }
+
+                // Normalize and sort group values
+                float xn[8];
+                for (int k = 0; k < 8; k++) xn[k] = xb[g * 8 + k] * inv_dq;
+                for (int a = 0; a < 7; a++)
+                    for (int b2 = a + 1; b2 < 8; b2++)
+                        if (xn[b2] < xn[a]) { float t = xn[a]; xn[a] = xn[b2]; xn[b2] = t; }
+
+                // With 8 elements and 8 levels, optimal assignment is 1:1
+                // But we still run Lloyd-Max to handle cases with duplicates
+                float L[IQ3TQ_N_LEVELS];
+                for (int q = 0; q < NL; q++) L[q] = xn[q];
+                for (int iter = 0; iter < 10; iter++) {
+                    float bounds[7];
+                    for (int q = 0; q < NL - 1; q++) bounds[q] = 0.5f * (L[q] + L[q+1]);
+                    float sum[IQ3TQ_N_LEVELS] = {0}; int cnt[IQ3TQ_N_LEVELS] = {0};
+                    for (int k = 0; k < 8; k++) {
+                        int qi = NL - 1;
+                        for (int q = 0; q < NL - 1; q++) {
+                            if (xn[k] <= bounds[q]) { qi = q; break; }
+                        }
+                        sum[qi] += xn[k]; cnt[qi]++;
+                    }
+                    int converged = 1;
+                    for (int q = 0; q < NL; q++) {
+                        float newL = (cnt[q] > 0) ? 
sum[q] / cnt[q] : L[q]; + if (fabsf(newL - L[q]) > 0.01f) converged = 0; + L[q] = newL; + } + if (converged) break; + } + + memcpy(sets[n_sets], L, NL * sizeof(float)); + set_weights[n_sets] = gw; + n_sets++; + } + } + } + + // K-means clustering into 16 clusters + float centroids[16][IQ3TQ_N_LEVELS]; + for (int i = 0; i < 16; i++) { + int idx = (int)((int64_t)i * n_sets / 16); + memcpy(centroids[i], sets[idx], NL * sizeof(float)); + } + + int * assign = (int *)calloc(n_sets, sizeof(int)); + for (int iter = 0; iter < 100; iter++) { + int changed = 0; + for (int i = 0; i < n_sets; i++) { + float best_dist = 1e30f; + int best_c = 0; + for (int c = 0; c < 16; c++) { + float dist = 0; + for (int dd = 0; dd < NL; dd++) { + float diff = sets[i][dd] - centroids[c][dd]; + dist += diff * diff; + } + if (dist < best_dist) { best_dist = dist; best_c = c; } + } + if (assign[i] != best_c) { assign[i] = best_c; changed++; } + } + + double sum[16][IQ3TQ_N_LEVELS]; + double wcnt[16]; + memset(sum, 0, sizeof(sum)); + memset(wcnt, 0, sizeof(wcnt)); + for (int i = 0; i < n_sets; i++) { + int c = assign[i]; + float w = set_weights[i]; + for (int dd = 0; dd < NL; dd++) sum[c][dd] += w * sets[i][dd]; + wcnt[c] += w; + } + for (int c = 0; c < 16; c++) { + if (wcnt[c] > 0) { + for (int dd = 0; dd < NL; dd++) centroids[c][dd] = (float)(sum[c][dd] / wcnt[c]); + } + } + if (changed == 0) break; + } + + // Sort centroids by sum of levels + for (int i = 0; i < 15; i++) { + for (int j2 = i + 1; j2 < 16; j2++) { + float si2 = 0, sj = 0; + for (int dd = 0; dd < NL; dd++) { si2 += centroids[i][dd]; sj += centroids[j2][dd]; } + if (sj < si2) { + float tmp[IQ3TQ_N_LEVELS]; + memcpy(tmp, centroids[i], NL * sizeof(float)); + memcpy(centroids[i], centroids[j2], NL * sizeof(float)); + memcpy(centroids[j2], tmp, NL * sizeof(float)); + } + } + } + + // Round to int8 + for (int i = 0; i < 16; i++) { + for (int dd = 0; dd < NL; dd++) { + float v = centroids[i][dd]; + grid_out[i * NL + dd] = (int8_t)(v > 0 ? v + 0.5f : v - 0.5f); + } + } + + free(sets); + free(set_weights); + free(assign); +} + +// ===================================================================================== +// IQ1_BN: 8D Vector Quantized with per-tensor trained 4096-entry codebook (1.5625 bpw) +// ===================================================================================== + +// Global state for current tensor's codebook (32768 bytes) +static int8_t iq1bn_active_aux[IQ1BN_AUX_SIZE]; +static bool iq1bn_aux_set = false; + +static const int8_t iq1bn_default_aux[IQ1BN_AUX_SIZE] = {0}; + +void iq1bn_set_aux(const int8_t aux[IQ1BN_AUX_SIZE]) { + memcpy(iq1bn_active_aux, aux, IQ1BN_AUX_SIZE); + iq1bn_aux_set = true; +} + +const int8_t * iq1bn_get_aux(void) { + return iq1bn_aux_set ? iq1bn_active_aux : iq1bn_default_aux; +} + +static inline const int8_t * iq1bn_cur_codebook(void) { + return iq1bn_aux_set ? 
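/* Size accounting behind the 1.5625 bpw figure in the header above, assuming
 * QK_K = 256, IQ1BN_N_GROUPS = 32 and 12-bit codebook indices: the 32 indices
 * pack pairwise into 3 bytes each (48 bytes total, see iq1bn_set_idx below),
 * plus one fp16 scale d, giving 50 bytes per 256 weights = 1.5625 bpw. */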
iq1bn_active_aux : iq1bn_default_aux; +} + +// 12-bit index extraction helpers +static inline int iq1bn_get_idx(const uint8_t * qs, int g) { + int pair = g / 2; + if (g & 1) { + return (qs[3*pair+1] >> 4) | ((int)qs[3*pair+2] << 4); + } else { + return qs[3*pair] | (((int)qs[3*pair+1] & 0x0F) << 8); + } +} + +static inline void iq1bn_set_idx(uint8_t * qs, int pair, int idx0, int idx1) { + qs[3*pair] = idx0 & 0xFF; + qs[3*pair+1] = ((idx0 >> 8) & 0x0F) | ((idx1 & 0x0F) << 4); + qs[3*pair+2] = (idx1 >> 4) & 0xFF; +} + +// Dequantization +void dequantize_row_iq1_bn(const block_iq1_bn * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) { + const int8_t * codebook = levels ? (const int8_t *)levels : iq1bn_cur_codebook(); + const int nb = k / QK_K; + + for (int i = 0; i < nb; ++i) { + const block_iq1_bn * b = x + i; + const float dq = GGML_FP16_TO_FP32(b->d) * IQ1BN_GRID_SCALE; + + for (int g = 0; g < IQ1BN_N_GROUPS; ++g) { + int ci = iq1bn_get_idx(b->qs, g); + const int8_t * cb = codebook + ci * IQ1BN_CODEBOOK_DIM; + + for (int j = 0; j < IQ1BN_CODEBOOK_DIM; ++j) { + y[i * QK_K + g * IQ1BN_GROUP_SIZE + j] = dq * (float)cb[j]; + } + } + } +} + +// Reference quantization +void quantize_row_iq1_bn_ref(const float * GGML_RESTRICT x, block_iq1_bn * GGML_RESTRICT y, int64_t k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + const int8_t * codebook = iq1bn_cur_codebook(); + + for (int i = 0; i < nb; ++i) { + float amax = 0; + for (int j = 0; j < QK_K; ++j) { + float ax = fabsf(x[i * QK_K + j]); + if (ax > amax) amax = ax; + } + if (amax < 1e-15f) { + memset(y + i, 0, sizeof(block_iq1_bn)); + continue; + } + float d = amax / 3.0f; + float inv_dq = 1.0f / (d * IQ1BN_GRID_SCALE); + + uint16_t ci_out[IQ1BN_N_GROUPS]; + for (int g = 0; g < IQ1BN_N_GROUPS; ++g) { + float xn[IQ1BN_CODEBOOK_DIM]; + for (int j = 0; j < IQ1BN_CODEBOOK_DIM; ++j) { + xn[j] = x[i * QK_K + g * IQ1BN_GROUP_SIZE + j] * inv_dq; + } + + float best_err = 1e30f; + int best_ci = 0; + for (int c = 0; c < IQ1BN_CODEBOOK_K; ++c) { + const int8_t * cb = codebook + c * IQ1BN_CODEBOOK_DIM; + float err = 0; + for (int j = 0; j < IQ1BN_CODEBOOK_DIM; ++j) { + float diff = xn[j] - (float)cb[j]; + err += diff * diff; + } + if (err < best_err) { best_err = err; best_ci = c; } + } + ci_out[g] = (uint16_t)best_ci; + } + + y[i].d = GGML_FP32_TO_FP16(d); + for (int g = 0; g < IQ1BN_N_GROUPS; g += 2) { + iq1bn_set_idx(y[i].qs, g/2, ci_out[g], ci_out[g+1]); + } + } +} + +// L2 distance helper for 8D float vectors +#if defined(__AVX2__) +static inline float iq1bn_l2_dist_8d(__m256 va, __m256 vb) { + __m256 vdiff = _mm256_sub_ps(va, vb); + __m256 vd2 = _mm256_mul_ps(vdiff, vdiff); + __m128 hi = _mm256_extractf128_ps(vd2, 1); + __m128 lo = _mm256_castps256_ps128(vd2); + __m128 sum4 = _mm_add_ps(lo, hi); + __m128 sum2 = _mm_add_ps(sum4, _mm_movehl_ps(sum4, sum4)); + __m128 sum1 = _mm_add_ss(sum2, _mm_movehdup_ps(sum2)); + return _mm_cvtss_f32(sum1); +} +#endif + +// Find best codebook entry for a normalized group (weighted) — float codebook version +static inline int iq1bn_find_best(const float * xn, const float * wg, + const float * cb_float, const float * cb_norm2, + int K, float * out_err) { + float best_err = 1e30f; + int best_c = 0; +#if defined(__AVX2__) + __m256 vxn = _mm256_loadu_ps(xn); + __m256 vwg = _mm256_loadu_ps(wg); + for (int c = 0; c < K; ++c) { + if (cb_norm2[c] < 1e-10f) continue; + __m256 vcb = _mm256_loadu_ps(cb_float + c * 8); + __m256 vdiff = _mm256_sub_ps(vxn, vcb); + __m256 vd2 = _mm256_mul_ps(vdiff, 
vdiff); + __m256 vwd2 = _mm256_mul_ps(vwg, vd2); + // Horizontal sum + __m128 hi = _mm256_extractf128_ps(vwd2, 1); + __m128 lo = _mm256_castps256_ps128(vwd2); + __m128 sum4 = _mm_add_ps(lo, hi); + __m128 sum2 = _mm_add_ps(sum4, _mm_movehl_ps(sum4, sum4)); + __m128 sum1 = _mm_add_ss(sum2, _mm_movehdup_ps(sum2)); + float err = _mm_cvtss_f32(sum1); + if (err < best_err) { best_err = err; best_c = c; } + } +#else + for (int c = 0; c < K; ++c) { + if (cb_norm2[c] < 1e-10f) continue; + const float * cb = cb_float + c * IQ1BN_CODEBOOK_DIM; + float err = 0; + for (int j = 0; j < IQ1BN_CODEBOOK_DIM; ++j) { + float diff = xn[j] - cb[j]; + err += wg[j] * diff * diff; + } + if (err < best_err) { best_err = err; best_c = c; } + } +#endif + if (out_err) *out_err = best_err; + return best_c; +} + +// Full quantizer with OLS refinement + multi-d search +static size_t quantize_row_iq1_bn_impl(const float * GGML_RESTRICT x, block_iq1_bn * GGML_RESTRICT y, + int64_t n_per_row, const float * quant_weights) { + const int nb = n_per_row / QK_K; + const int8_t * codebook = iq1bn_cur_codebook(); + + // Precompute float codebook and norms (avoids repeated int8→float conversion) + float * cb_float = (float *)malloc(IQ1BN_CODEBOOK_K * IQ1BN_CODEBOOK_DIM * sizeof(float)); + float * cb_norm2 = (float *)malloc(IQ1BN_CODEBOOK_K * sizeof(float)); + for (int c = 0; c < IQ1BN_CODEBOOK_K; ++c) { + float nn = 0; + const int8_t * cb = codebook + c * IQ1BN_CODEBOOK_DIM; + for (int j = 0; j < IQ1BN_CODEBOOK_DIM; ++j) { + float v = (float)cb[j]; + cb_float[c * IQ1BN_CODEBOOK_DIM + j] = v; + nn += v * v; + } + cb_norm2[c] = nn; + } + + for (int i = 0; i < nb; ++i) { + const float * xb = x + i * QK_K; + const float * wt = quant_weights ? quant_weights + i * QK_K : NULL; + + float amax = 0; + for (int j = 0; j < QK_K; ++j) { + float ax = fabsf(xb[j]); + if (ax > amax) amax = ax; + } + if (amax < 1e-15f) { + memset(y + i, 0, sizeof(block_iq1_bn)); + continue; + } + + float d = amax / 3.0f; + uint16_t ci_out[IQ1BN_N_GROUPS]; + + // OLS iterations + for (int iter = 0; iter < 5; ++iter) { + float inv_dq = 1.0f / (d * IQ1BN_GRID_SCALE); + + for (int g = 0; g < IQ1BN_N_GROUPS; ++g) { + float xn[IQ1BN_CODEBOOK_DIM]; + float wg[IQ1BN_CODEBOOK_DIM]; + for (int j = 0; j < IQ1BN_CODEBOOK_DIM; ++j) { + xn[j] = xb[g * IQ1BN_GROUP_SIZE + j] * inv_dq; + wg[j] = wt ? wt[g * IQ1BN_GROUP_SIZE + j] : 1.0f; + } + ci_out[g] = (uint16_t)iq1bn_find_best(xn, wg, cb_float, cb_norm2, IQ1BN_CODEBOOK_K, NULL); + } + + // OLS: recompute d from current assignments + float sumxg = 0, sumgg = 0; + for (int g = 0; g < IQ1BN_N_GROUPS; ++g) { + const int8_t * cb = codebook + ci_out[g] * IQ1BN_CODEBOOK_DIM; + for (int j = 0; j < IQ1BN_CODEBOOK_DIM; ++j) { + float gval = (float)cb[j]; + float w = wt ? 
wt[g * IQ1BN_GROUP_SIZE + j] : 1.0f; + sumxg += w * xb[g * IQ1BN_GROUP_SIZE + j] * gval; + sumgg += w * gval * gval; + } + } + if (sumgg > 0) { + d = sumxg / (IQ1BN_GRID_SCALE * sumgg); + } + } + + // Multi-d search + static const float d_factors[] = {0.8f, 0.85f, 0.9f, 0.95f, 1.0f, 1.05f, 1.1f, 1.15f, 1.2f}; + float best_d = d; + float best_total_err = 1e30f; + uint16_t best_ci_all[IQ1BN_N_GROUPS]; + + for (int df = 0; df < 9; ++df) { + float td = d * d_factors[df]; + float inv_dq = 1.0f / (td * IQ1BN_GRID_SCALE); + + uint16_t tci[IQ1BN_N_GROUPS]; + float total_err = 0; + + for (int g = 0; g < IQ1BN_N_GROUPS; ++g) { + float xn[IQ1BN_CODEBOOK_DIM]; + float wg[IQ1BN_CODEBOOK_DIM]; + for (int j = 0; j < IQ1BN_CODEBOOK_DIM; ++j) { + xn[j] = xb[g * IQ1BN_GROUP_SIZE + j] * inv_dq; + wg[j] = wt ? wt[g * IQ1BN_GROUP_SIZE + j] : 1.0f; + } + float gerr; + tci[g] = (uint16_t)iq1bn_find_best(xn, wg, cb_float, cb_norm2, IQ1BN_CODEBOOK_K, &gerr); + total_err += gerr; + } + + if (total_err < best_total_err) { + best_total_err = total_err; + best_d = td; + memcpy(best_ci_all, tci, IQ1BN_N_GROUPS * sizeof(uint16_t)); + } + } + + // Post multi-d OLS refinement (2 iterations) + d = best_d; + memcpy(ci_out, best_ci_all, IQ1BN_N_GROUPS * sizeof(uint16_t)); + + for (int post = 0; post < 2; ++post) { + float sumxg = 0, sumgg = 0; + for (int g = 0; g < IQ1BN_N_GROUPS; ++g) { + const int8_t * cb = codebook + ci_out[g] * IQ1BN_CODEBOOK_DIM; + for (int j = 0; j < IQ1BN_CODEBOOK_DIM; ++j) { + float gval = (float)cb[j]; + float w = wt ? wt[g * IQ1BN_GROUP_SIZE + j] : 1.0f; + sumxg += w * xb[g * IQ1BN_GROUP_SIZE + j] * gval; + sumgg += w * gval * gval; + } + } + if (sumgg > 0) { + d = sumxg / (IQ1BN_GRID_SCALE * sumgg); + } + + float inv_dq = 1.0f / (d * IQ1BN_GRID_SCALE); + for (int g = 0; g < IQ1BN_N_GROUPS; ++g) { + float xn[IQ1BN_CODEBOOK_DIM]; + float wg[IQ1BN_CODEBOOK_DIM]; + for (int j = 0; j < IQ1BN_CODEBOOK_DIM; ++j) { + xn[j] = xb[g * IQ1BN_GROUP_SIZE + j] * inv_dq; + wg[j] = wt ? wt[g * IQ1BN_GROUP_SIZE + j] : 1.0f; + } + ci_out[g] = (uint16_t)iq1bn_find_best(xn, wg, cb_float, cb_norm2, IQ1BN_CODEBOOK_K, NULL); + } + } + + // Write block — pack 12-bit indices + y[i].d = GGML_FP32_TO_FP16(d); + for (int g = 0; g < IQ1BN_N_GROUPS; g += 2) { + iq1bn_set_idx(y[i].qs, g/2, ci_out[g], ci_out[g+1]); + } + } + + free(cb_float); + free(cb_norm2); + return nb * sizeof(block_iq1_bn); +} + +// Public quantization entry point +size_t quantize_iq1_bn(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, + int64_t nrows, int64_t n_per_row, const float * imatrix) { + GGML_ASSERT(n_per_row % QK_K == 0); + float * quant_weights = (float *)malloc(n_per_row * sizeof(float)); + size_t total = 0; + + for (int64_t row = 0; row < nrows; ++row) { + const float * row_data = src + row * n_per_row; + + if (imatrix) { + const float * im_row = imatrix; + float sumw = 0; + for (int64_t j = 0; j < n_per_row; ++j) { + quant_weights[j] = im_row[j]; + sumw += im_row[j]; + } + float scale = sumw > 0 ? 
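/* Normalize the imatrix weights so they average 1.0 over the row; this keeps
 * the weighted error objective on a row-independent scale regardless of how
 * the importance matrix was accumulated. */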
n_per_row / sumw : 1.0f;
+            for (int64_t j = 0; j < n_per_row; ++j) {
+                quant_weights[j] *= scale;
+            }
+        } else {
+            float sigma2 = 0;
+            for (int64_t j = 0; j < n_per_row; ++j) {
+                sigma2 += row_data[j] * row_data[j];
+            }
+            sigma2 = 8.0f * sigma2 / n_per_row;
+            if (sigma2 < 1e-15f) sigma2 = 1.0f;
+            for (int64_t j = 0; j < n_per_row; ++j) {
+                float w = row_data[j] * row_data[j] + sigma2;
+                quant_weights[j] = w;
+            }
+        }
+
+        total += quantize_row_iq1_bn_impl(row_data, (block_iq1_bn *)((char *)dst + total),
+                                          n_per_row, quant_weights);
+    }
+
+    free(quant_weights);
+    return total;
+}
+
+// Thread work structs for parallel K-means
+#include <pthread.h>
+
+// Worker for K-means++ min_dist update (parallel over samples)
+typedef struct {
+    const float * samples;
+    float       * min_dist;
+    const float * centroid; // single centroid to compute distance to
+    int i_start;
+    int i_end;
+    int dim;
+} iq1bn_kmpp_work_t;
+
+static void * iq1bn_kmpp_worker(void * arg) {
+    iq1bn_kmpp_work_t * w = (iq1bn_kmpp_work_t *)arg;
+    const float * cc  = w->centroid;
+    const int     dim = w->dim;
+    // Use scalar distance to match original K-means++ pick sequence
+    // (SIMD horizontal sum rounding differences cascade in the pick chain)
+    for (int i = w->i_start; i < w->i_end; ++i) {
+        const float * s = w->samples + i * dim;
+        float dist = 0;
+        for (int j = 0; j < dim; ++j) {
+            float diff = s[j] - cc[j];
+            dist += diff * diff;
+        }
+        if (dist < w->min_dist[i]) w->min_dist[i] = dist;
+    }
+    return NULL;
+}
+
+// Worker for K-means iteration (parallel over samples)
+typedef struct {
+    const float * samples;
+    const float * weights;
+    const float * centroids;
+    int         * assign;
+    float       * csum; // per-thread accumulator [K * dim]
+    float       * cwt;  // per-thread accumulator [K]
+    int i_start;
+    int i_end;
+    int K;
+    int dim;
+    int changed;
+} iq1bn_kmeans_work_t;
+
+static void * iq1bn_kmeans_worker(void * arg) {
+    iq1bn_kmeans_work_t * w = (iq1bn_kmeans_work_t *)arg;
+    const int K   = w->K;
+    const int dim = w->dim;
+    int changed = 0;
+
+    memset(w->csum, 0, K * dim * sizeof(float));
+    memset(w->cwt,  0, K * sizeof(float));
+
+    for (int i = w->i_start; i < w->i_end; ++i) {
+        const float * s = w->samples + i * dim;
+        float best_dist = 1e30f;
+        int   best_c = 0;
+#if defined(__AVX2__)
+        __m256 vs = _mm256_loadu_ps(s);
+        for (int c = 0; c < K; ++c) {
+            float dist = iq1bn_l2_dist_8d(vs, _mm256_loadu_ps(w->centroids + c * dim));
+            if (dist < best_dist) { best_dist = dist; best_c = c; }
+        }
+#else
+        for (int c = 0; c < K; ++c) {
+            const float * cc = w->centroids + c * dim;
+            float dist = 0;
+            for (int j = 0; j < dim; ++j) {
+                float diff = s[j] - cc[j];
+                dist += diff * diff;
+            }
+            if (dist < best_dist) { best_dist = dist; best_c = c; }
+        }
+#endif
+        if (w->assign[i] != best_c) { changed++; w->assign[i] = best_c; }
+        float wt = w->weights[i];
+        for (int j = 0; j < dim; ++j) {
+            w->csum[best_c * dim + j] += wt * s[j];
+        }
+        w->cwt[best_c] += wt;
+    }
+    w->changed = changed;
+    return NULL;
+}
+
+// K-means codebook training (K=4096, K-means++-style init)
+void iq1bn_train_codebook(const float * data, int64_t nrow, int64_t n_per_row,
+                          const float * imatrix, int8_t aux_out[IQ1BN_AUX_SIZE], int nthread) {
+    const int max_samples = 200000;
+    const int dim = IQ1BN_CODEBOOK_DIM;
+    const int K   = IQ1BN_CODEBOOK_K; // 4096
+
+    float * samples = (float *)malloc(max_samples * dim * sizeof(float));
+    float * weights = (float *)malloc(max_samples * sizeof(float));
+    int n_samples = 0;
+
+    // Phase 1: Collect normalized 8D sub-vectors
+    for (int64_t row = 0; row < nrow && n_samples <
max_samples; ++row) { + const float * row_data = data + row * n_per_row; + const int n_blocks = n_per_row / QK_K; + + for (int bi = 0; bi < n_blocks && n_samples < max_samples; ++bi) { + const float * block = row_data + bi * QK_K; + + float amax = 0; + for (int j = 0; j < QK_K; ++j) { + float ax = fabsf(block[j]); + if (ax > amax) amax = ax; + } + if (amax < 1e-15f) continue; + float dd = amax / 3.0f; + float inv_dq = 1.0f / (dd * IQ1BN_GRID_SCALE); + + for (int g = 0; g < IQ1BN_N_GROUPS && n_samples < max_samples; ++g) { + float gw = 1.0f; + if (imatrix) { + gw = 0; + for (int j = 0; j < dim; ++j) { + int idx = bi * QK_K + g * dim + j; + if (idx < n_per_row) gw += imatrix[idx]; + } + gw /= dim; + if (gw < 1e-10f) gw = 1e-10f; + } + + float * s = samples + n_samples * dim; + for (int j = 0; j < dim; ++j) { + s[j] = block[g * dim + j] * inv_dq; + } + weights[n_samples] = gw; + n_samples++; + } + } + } + + if (n_samples == 0) { + memset(aux_out, 0, IQ1BN_AUX_SIZE); + free(samples); free(weights); + return; + } + + fprintf(stderr, "IQ1BN: training K=%d codebook from %d samples\n", K, n_samples); + + // Phase 2: K-means++ initialization + float * centroids = (float *)calloc(K * dim, sizeof(float)); + int * assign = (int *)calloc(n_samples, sizeof(int)); + float * min_dist = (float *)malloc(n_samples * sizeof(float)); + + // Pick first centroid: weighted random + { + float total_w = 0; + for (int i = 0; i < n_samples; ++i) total_w += weights[i]; + float r = ((float)rand() / RAND_MAX) * total_w; + float cumw = 0; + int pick = 0; + for (int i = 0; i < n_samples; ++i) { + cumw += weights[i]; + if (cumw >= r) { pick = i; break; } + } + memcpy(centroids, samples + pick * dim, dim * sizeof(float)); + for (int i = 0; i < n_samples; ++i) min_dist[i] = 1e30f; + } + + // Full K-means++ initialization with parallel min_dist update + if (nthread < 1) nthread = 1; + if (nthread > n_samples) nthread = n_samples; + + // Set up thread pool for K-means++ min_dist update + pthread_t * threads = (pthread_t *)calloc(nthread, sizeof(pthread_t)); + iq1bn_kmpp_work_t * kmpp_workers = (iq1bn_kmpp_work_t *)calloc(nthread, sizeof(iq1bn_kmpp_work_t)); + for (int t = 0; t < nthread; ++t) { + kmpp_workers[t].samples = samples; + kmpp_workers[t].min_dist = min_dist; + kmpp_workers[t].dim = dim; + int chunk = n_samples / nthread; + kmpp_workers[t].i_start = t * chunk; + kmpp_workers[t].i_end = (t == nthread - 1) ? 
n_samples : (t + 1) * chunk; + } + + const int n_kmpp = 256; // full K-means++ for first n_kmpp, stale-dist random for rest + for (int c = 0; c < K; ++c) { + if (c < n_kmpp) { + // Update min distances using SCALAR code (parallel over samples) + // Scalar avoids SIMD rounding differences that cascade in K-means++ pick chain + const float * cc = centroids + c * dim; + for (int t = 0; t < nthread; ++t) { + kmpp_workers[t].centroid = cc; + } + if (nthread > 1) { + for (int t = 0; t < nthread; ++t) { + pthread_create(&threads[t], NULL, iq1bn_kmpp_worker, &kmpp_workers[t]); + } + for (int t = 0; t < nthread; ++t) { + pthread_join(threads[t], NULL); + } + } else { + iq1bn_kmpp_worker(&kmpp_workers[0]); + } + } + + if (c + 1 < K) { + // Pick next proportional to weighted distance + float total = 0; + for (int i = 0; i < n_samples; ++i) total += weights[i] * min_dist[i]; + float r = ((float)rand() / RAND_MAX) * total; + float cumw = 0; + int pick = 0; + for (int i = 0; i < n_samples; ++i) { + cumw += weights[i] * min_dist[i]; + if (cumw >= r) { pick = i; break; } + } + memcpy(centroids + (c + 1) * dim, samples + pick * dim, dim * sizeof(float)); + } + } + free(min_dist); + free(kmpp_workers); + + // Phase 3: Full-batch K-means with SIMD and pthread parallelism + + // Allocate per-thread accumulators (reuse threads array from K-means++ init) + iq1bn_kmeans_work_t * workers = (iq1bn_kmeans_work_t *)calloc(nthread, sizeof(iq1bn_kmeans_work_t)); + for (int t = 0; t < nthread; ++t) { + workers[t].samples = samples; + workers[t].weights = weights; + workers[t].centroids = centroids; + workers[t].assign = assign; + workers[t].csum = (float *)malloc(K * dim * sizeof(float)); + workers[t].cwt = (float *)malloc(K * sizeof(float)); + workers[t].K = K; + workers[t].dim = dim; + int chunk = n_samples / nthread; + workers[t].i_start = t * chunk; + workers[t].i_end = (t == nthread - 1) ? 
n_samples : (t + 1) * chunk; + } + + const int n_iters = 30; + for (int iter = 0; iter < n_iters; ++iter) { + // Launch threads for sample assignment + for (int t = 0; t < nthread; ++t) { + if (nthread > 1) { + pthread_create(&threads[t], NULL, iq1bn_kmeans_worker, &workers[t]); + } else { + iq1bn_kmeans_worker(&workers[0]); + } + } + if (nthread > 1) { + for (int t = 0; t < nthread; ++t) { + pthread_join(threads[t], NULL); + } + } + + // Merge per-thread accumulators + int changed = 0; + for (int t = 0; t < nthread; ++t) { + changed += workers[t].changed; + } + + // Sum csum and cwt across threads + // Use workers[0]'s arrays as the merge target + for (int t = 1; t < nthread; ++t) { + for (int c = 0; c < K; ++c) { + for (int j = 0; j < dim; ++j) { + workers[0].csum[c * dim + j] += workers[t].csum[c * dim + j]; + } + workers[0].cwt[c] += workers[t].cwt[c]; + } + } + + // Update centroids and find most populous cluster + int empty = 0; + int max_pop_c = 0; + float max_pop_w = 0; + for (int c = 0; c < K; ++c) { + if (workers[0].cwt[c] > 0) { + for (int j = 0; j < dim; ++j) { + centroids[c * dim + j] = workers[0].csum[c * dim + j] / workers[0].cwt[c]; + } + if (workers[0].cwt[c] > max_pop_w) { + max_pop_w = workers[0].cwt[c]; + max_pop_c = c; + } + } else { + empty++; + } + } + + // Split most populous cluster to fill empty clusters + if (empty > 0 && max_pop_w > 0) { + for (int c = 0; c < K; ++c) { + if (workers[0].cwt[c] <= 0) { + for (int j = 0; j < dim; ++j) { + float perturb = 0.01f * ((float)(rand() % 201 - 100) / 100.0f); + centroids[c * dim + j] = centroids[max_pop_c * dim + j] + perturb; + } + } + } + } + + if (iter % 5 == 0 || iter == n_iters - 1) { + fprintf(stderr, " iter %2d: changed=%d/%d empty=%d\n", iter, changed, n_samples, empty); + } + if (iter > 5 && changed < n_samples / 1000) break; + } + + // Free per-thread accumulators + for (int t = 0; t < nthread; ++t) { + free(workers[t].csum); + free(workers[t].cwt); + } + free(workers); + free(threads); + + // Round centroids to int8 codebook + int8_t * codebook_out = aux_out; + for (int c = 0; c < K; ++c) { + for (int j = 0; j < dim; ++j) { + float v = centroids[c * dim + j]; + int iv = (int)roundf(v); + if (iv < -127) iv = -127; + if (iv > 127) iv = 127; + codebook_out[c * dim + j] = (int8_t)iv; + } + } + + free(samples); free(weights); + free(centroids); free(assign); +} + +// Also add to validate_row_data switch (done via ops.cpp) + +// Global levels (used during quantization for the current tensor) +static float q3pt_levels[Q3PT_N_LEVELS]; +static bool q3pt_levels_set = false; + +void q3pt_set_levels(const float * levels) { + memcpy(q3pt_levels, levels, Q3PT_N_LEVELS * sizeof(float)); + q3pt_levels_set = true; +} + +const float * q3pt_get_levels(void) { + return q3pt_levels_set ? 
q3pt_levels : NULL; +} + +void q3pt_free_levels(void) { + q3pt_levels_set = false; +} + + +void q3pt_train_levels(const float * data, int64_t nrow, int64_t n_per_row, + const float * imatrix, float levels_out[Q3PT_N_LEVELS]) { + + const int64_t n_sub = n_per_row / 16; // 16-element sub-blocks per row + + // Binning parameters + const int N_BINS = 8192; + const float bin_width = 1.0f / N_BINS; + float * bin_sum_w = (float *)calloc(N_BINS, sizeof(float)); + float * bin_sum_wt = (float *)calloc(N_BINS, sizeof(float)); + GGML_ASSERT(bin_sum_w && bin_sum_wt); + + // First pass: bin the affine-normalized values with their weights + for (int64_t row = 0; row < nrow; ++row) { + const float * xrow = data + row * n_per_row; + for (int64_t ib = 0; ib < n_sub; ++ib) { + const float * xb = xrow + ib * 16; + const int col_base = (int)(ib * 16); + float sb_min = xb[0], sb_max = xb[0]; + for (int j = 1; j < 16; ++j) { + if (xb[j] < sb_min) sb_min = xb[j]; + if (xb[j] > sb_max) sb_max = xb[j]; + } + const float sb_range = sb_max - sb_min; + for (int j = 0; j < 16; ++j) { + float w = 1.0f; + if (imatrix) { + w = imatrix[col_base + j]; + if (w < 1e-10f) w = 1e-10f; + } + if (sb_range > 1e-6f) { + w *= sb_range; + float t = (xb[j] - sb_min) / sb_range; + int bin_idx = (int)(t * N_BINS); + if (bin_idx >= N_BINS) bin_idx = N_BINS - 1; + bin_sum_w[bin_idx] += w; + bin_sum_wt[bin_idx] += w * t; + } + } + } + } + + // Initialize 8 levels uniformly in [0, 1] + float levels[Q3PT_N_LEVELS]; + for (int k = 0; k < Q3PT_N_LEVELS; ++k) { + levels[k] = (float)k / (Q3PT_N_LEVELS - 1); + } + + // Lloyd-Max (weighted k-means) iterations with early convergence + for (int iter = 0; iter < 300; ++iter) { + float sum_w [Q3PT_N_LEVELS] = {0}; + float sum_wt[Q3PT_N_LEVELS] = {0}; + + // Process bins instead of individual values + for (int b = 0; b < N_BINS; ++b) { + if (bin_sum_w[b] < 1e-12f) continue; + const float t = (b + 0.5f) * bin_width; // representative value at bin center + int best = 0; + float best_d2 = (t - levels[0]) * (t - levels[0]); + for (int k = 1; k < Q3PT_N_LEVELS; ++k) { + float d2 = (t - levels[k]) * (t - levels[k]); + if (d2 < best_d2) { best_d2 = d2; best = k; } + } + sum_w [best] += bin_sum_w[b]; + sum_wt[best] += bin_sum_wt[b]; + } + + // Check for early convergence + float max_delta = 0.0f; + for (int k = 0; k < Q3PT_N_LEVELS; ++k) { + if (sum_w[k] > 1e-12f) { + float new_level = sum_wt[k] / sum_w[k]; + max_delta = fmaxf(max_delta, fabsf(new_level - levels[k])); + levels[k] = new_level; + } + } + if (max_delta < 1e-10f) break; + + // Keep levels sorted (insertion sort — 8 elements) + for (int k = 1; k < Q3PT_N_LEVELS; ++k) { + float v = levels[k]; int m = k - 1; + while (m >= 0 && levels[m] > v) { levels[m+1] = levels[m]; m--; } + levels[m+1] = v; + } + } + + memcpy(levels_out, levels, Q3PT_N_LEVELS * sizeof(float)); + q3pt_set_levels(levels); + free(bin_sum_w); + free(bin_sum_wt); +} + +// --- Q3_PT bit-packing helpers --- + +// 6-bit sequential packing: 32 values in 24 bytes (4 values per 3 bytes). +// Indices 0..15 = sub-block ranges, 16..31 = sub-block neg_mins. 
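+//
+// Layout sketch (value i occupies bits [6*i, 6*i + 6) of the byte stream), e.g.:
+//   i = 0 -> byte 0, bits 0..5                      (off = 0, no straddle)
+//   i = 1 -> byte 0, bits 6..7 + byte 1, bits 0..3  (off = 6, straddles)
+// Offsets cycle through {0, 6, 4, 2}, so the straddle branch fires only for off > 2.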
+static inline uint8_t q3pt_sc_get(const uint8_t * GGML_RESTRICT sc, int i) {
+    const int bit  = i * 6;
+    const int byte = bit / 8;
+    const int off  = bit % 8;
+    uint8_t val = (sc[byte] >> off) & 0x3F;
+    if (off > 2) { val |= (uint8_t)((sc[byte+1] << (8 - off)) & 0x3F); }
+    return val;
+}
+
+static inline void q3pt_sc_set(uint8_t * GGML_RESTRICT sc, int i, uint8_t v) {
+    const int bit  = i * 6;
+    const int byte = bit / 8;
+    const int off  = bit % 8;
+    sc[byte] |= (uint8_t)((v & 0x3F) << off);
+    if (off > 2) { sc[byte+1] |= (uint8_t)(v >> (8 - off)); }
+}
+
+// 3-bit sequential packing: 256 values in 96 bytes (8 values per 3 bytes).
+static inline int q3pt_unpack3(const uint8_t * GGML_RESTRICT qs, int k) {
+    const int bit  = k * 3;
+    const int byte = bit / 8;
+    const int off  = bit % 8;
+    int val = (qs[byte] >> off) & 0x7;
+    if (off > 5) { val |= (int)((qs[byte+1] << (8 - off)) & 0x7); }
+    return val;
+}
+
+static inline void q3pt_pack3(uint8_t * GGML_RESTRICT qs, int k, int v) {
+    const int bit  = k * 3;
+    const int byte = bit / 8;
+    const int off  = bit % 8;
+    qs[byte] |= (uint8_t)((v & 0x7) << off);
+    if (off > 5) { qs[byte+1] |= (uint8_t)((v & 0x7) >> (8 - off)); }
+}
+
+void dequantize_row_q3_pt(const block_q3_pt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels) {
+    assert(k % QK_K == 0);
+    const int nb = k / QK_K;
+    const float * L = (const float *)levels;
+    GGML_ASSERT(L != NULL && "Q3_PT levels not set for tensor");
+
+    for (int i = 0; i < nb; i++) {
+        const float d    = GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = GGML_FP16_TO_FP32(x[i].dmin);
+        const uint8_t * sc = x[i].scales;
+        const uint8_t * qs = x[i].qs;
+
+        for (int ib = 0; ib < QK_K/16; ++ib) {
+            const float range   =  d    * (float)q3pt_sc_get(sc, ib);
+            const float sub_min = -dmin * (float)q3pt_sc_get(sc, ib + QK_K/16);
+            for (int j = 0; j < 16; ++j) {
+                const int q = q3pt_unpack3(qs, ib*16 + j);
+                y[ib*16 + j] = L[q] * range + sub_min;
+            }
+        }
+        y += QK_K;
+    }
+}
+
+#define Q3PT_REFINE_ITERS 5
+
+// Find the optimal global d-scale for 6-bit (nmax=63) sub-block range quantization,
+// minimizing Σ_i weights[i] * (vals[i] - d * clamp(round(vals[i]/d), 0, nmax))^2.
+// Tries d = vals[i] / nmax as "anchor" for each sub-block i (O(n^2), n=QK_K/16=16).
+// Without imatrix all weights are equal and the winner is always max/nmax, so this is a no-op.
+// With imatrix it can redirect scale resolution to important sub-blocks at the cost of
+// less important ones that would otherwise dominate via raw max().
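+//
+// Illustrative numbers (not from any model): vals = {31.7, 63.0}, weights = {1000, 0.01},
+// nmax = 63. The max() anchor d = 1.0 rounds 31.7 to 32: weighted error 1000 * 0.3^2 = 90.
+// The anchor d = 31.7/63 ~ 0.503 represents 31.7 exactly and clamps 63.0 down to
+// 63 * d = 31.7: weighted error 0.01 * 31.3^2 ~ 9.8. The second anchor wins whenever the
+// important sub-block dominates the weights.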
+static float q3pt_find_optimal_d(const float * GGML_RESTRICT vals, + const float * GGML_RESTRICT weights, + int n, int nmax) { + float max_val = 0.f; + for (int i = 0; i < n; ++i) { if (vals[i] > max_val) max_val = vals[i]; } + if (max_val < 1e-6f) return 0.f; + float best_d = max_val / (float)nmax, best_err = FLT_MAX; + for (int i = 0; i < n; ++i) { + if (vals[i] < 1e-6f) continue; + const float d_cand = vals[i] / (float)nmax; + float err = 0.f; + for (int j = 0; j < n; ++j) { + int q = (int)(vals[j] / d_cand + 0.5f); + if (q > nmax) q = nmax; + const float delta = vals[j] - d_cand * (float)q; + err += weights[j] * delta * delta; + } + if (err < best_err) { best_err = err; best_d = d_cand; } + } + return best_d; +} + +static void quantize_row_q3_pt_impl(const float * GGML_RESTRICT x, + void * GGML_RESTRICT vy, + int64_t n, + const float * GGML_RESTRICT quant_weights) { + GGML_ASSERT(q3pt_levels_set && "Q3_PT levels not set - call q3pt_set_levels() first"); + GGML_ASSERT(n % QK_K == 0); + + const int64_t nbl = n / QK_K; + block_q3_pt * y = (block_q3_pt *) vy; + const float * L = q3pt_levels; + + for (int ibl = 0; ibl < nbl; ++ibl) { + const float * xbl = x + QK_K * ibl; + block_q3_pt * blk = &y[ibl]; + + float sigma2 = 0; + if (quant_weights) { + for (int i = 0; i < QK_K; ++i) { + sigma2 += xbl[i] * xbl[i]; + } + sigma2 = 2.f * sigma2 / QK_K; + } + + // Per-sub-block importance weights: sum of AWQ weights over 16 elements. + // Used by q3pt_find_optimal_d() to direct scale resolution toward important sub-blocks. + float w_ib[QK_K / 16]; + for (int ib = 0; ib < QK_K / 16; ++ib) { + float wsum = 0.f; + if (quant_weights) { + for (int j = 0; j < 16; ++j) { + const int elem = ib * 16 + j; + wsum += quant_weights[QK_K * ibl + elem] * sqrtf(sigma2 + xbl[elem] * xbl[elem]); + } + } else { + wsum = 16.f; // uniform — find_optimal_d is a no-op (max/63 always wins) + } + w_ib[ib] = wsum; + } + + // Compute per-sub-block ranges and neg_mins from raw min/max + float sub_ranges[QK_K / 16]; + float neg_mins[QK_K / 16]; + for (int ib = 0; ib < QK_K / 16; ++ib) { + const float * xb = xbl + ib * 16; + float sb_min = xb[0], sb_max = xb[0]; + for (int j = 1; j < 16; ++j) { + if (xb[j] < sb_min) { + sb_min = xb[j]; + } + if (xb[j] > sb_max) { + sb_max = xb[j]; + } + } + sub_ranges[ib] = sb_max - sb_min; + neg_mins[ib] = MAX(-sb_min, 0.f); + } + + // Pre-refinement: one weighted-LS pass with continuous (float) ranges before 6-bit + // quantization. Finds better initial (range, neg_min) from the raw min/max assignments, + // avoiding scale quantization noise in the very first set of level assignments. + for (int ib = 0; ib < QK_K / 16; ++ib) { + const float * xb = xbl + ib * 16; + if (sub_ranges[ib] < 1e-6f) { + continue; + } + const float inv_range0 = 1.f / sub_ranges[ib]; + const float sub_min0 = -neg_mins[ib]; + double sA = 0, sB = 0, sC = 0, sD = 0, sE = 0; + for (int j = 0; j < 16; ++j) { + const int elem = ib * 16 + j; + const float xj = xb[j]; + const float w = quant_weights ? 
quant_weights[QK_K * ibl + elem] * sqrtf(sigma2 + xj * xj) : 1.0f; + const float t = (xj - sub_min0) * inv_range0; + int best = 0; + float best_d2 = (t - L[0]) * (t - L[0]); + for (int k = 1; k < Q3PT_N_LEVELS; ++k) { + const float d2 = (t - L[k]) * (t - L[k]); + if (d2 < best_d2) { + best_d2 = d2; + best = k; + } + } + const float lq = L[best]; + sA += (double) w * (double) lq * (double) lq; + sB += (double) w * (double) lq; + sC += (double) w; + sD += (double) w * (double) xj * (double) lq; + sE += (double) w * (double) xj; + } + const double det = sA * sC - sB * sB; + if (det > 1e-20) { + const float nr = (float) ((sD * sC - sE * sB) / det); + const float nm = (float) (-(sE * sA - sD * sB) / det); + if (nr > 0.f) { + sub_ranges[ib] = nr; + } + if (nm > 0.f) { + neg_mins[ib] = nm; + } + } + } + + // Importance-weighted d/dmin search (replaces plain max/63) + float d_val = q3pt_find_optimal_d(sub_ranges, w_ib, QK_K / 16, 63); + float dmin_val = q3pt_find_optimal_d(neg_mins, w_ib, QK_K / 16, 63); + + // Quantize ranges and neg_mins to 6-bit + memset(blk->scales, 0, sizeof(blk->scales)); + memset(blk->qs, 0, sizeof(blk->qs)); + const float inv_d = d_val > 0 ? 1.f / d_val : 0.f; + const float inv_dmin = dmin_val > 0 ? 1.f / dmin_val : 0.f; + for (int ib = 0; ib < QK_K / 16; ++ib) { + uint8_t sc = MIN(63, nearest_int(inv_d * sub_ranges[ib])); + uint8_t sm = MIN(63, nearest_int(inv_dmin * neg_mins[ib])); + q3pt_sc_set(blk->scales, ib, sc); + q3pt_sc_set(blk->scales, ib + QK_K / 16, sm); + } + blk->d = GGML_FP32_TO_FP16(d_val); + blk->dmin = GGML_FP32_TO_FP16(dmin_val); + + // Initial level assignment + for (int ib = 0; ib < QK_K / 16; ++ib) { + const float range = d_val * (float) q3pt_sc_get(blk->scales, ib); + const float sub_min = -dmin_val * (float) q3pt_sc_get(blk->scales, ib + QK_K / 16); + const float inv_range = range > 1e-6f ? 1.f / range : 0.f; + for (int j = 0; j < 16; ++j) { + const int elem = ib * 16 + j; + const float t = (xbl[elem] - sub_min) * inv_range; + int best = 0; + float best_d2 = (t - L[0]) * (t - L[0]); + for (int k = 1; k < Q3PT_N_LEVELS; ++k) { + const float d2 = (t - L[k]) * (t - L[k]); + if (d2 < best_d2) { + best_d2 = d2; + best = k; + } + } + q3pt_pack3(blk->qs, elem, best); + } + } + + // Iterative refinement: weighted LS for (range, neg_min) + importance-weighted d/dmin. + for (int iter = 0; iter < Q3PT_REFINE_ITERS; ++iter) { + for (int ib = 0; ib < QK_K / 16; ++ib) { + double sA = 0, sB = 0, sC = 0, sD = 0, sE = 0; + for (int j = 0; j < 16; ++j) { + const int elem = ib * 16 + j; + const float xj = xbl[elem]; + const float w = quant_weights ? quant_weights[QK_K * ibl + elem] * sqrtf(sigma2 + xj * xj) : 1.0f; + const float lq = L[q3pt_unpack3(blk->qs, elem)]; + sA += (double) w * (double) lq * (double) lq; + sB += (double) w * (double) lq; + sC += (double) w; + sD += (double) w * (double) xj * (double) lq; + sE += (double) w * (double) xj; + } + const double det = sA * sC - sB * sB; + if (det < 1e-20) { + continue; + } + const float new_range = (float) ((sD * sC - sE * sB) / det); + const float new_negmin = (float) (-(sE * sA - sD * sB) / det); + sub_ranges[ib] = new_range > 0.f ? new_range : 0.f; + neg_mins[ib] = new_negmin > 0.f ? new_negmin : 0.f; + } + + // Importance-weighted d/dmin search on updated sub_ranges/neg_mins + d_val = q3pt_find_optimal_d(sub_ranges, w_ib, QK_K / 16, 63); + dmin_val = q3pt_find_optimal_d(neg_mins, w_ib, QK_K / 16, 63); + + // Re-pack scales + memset(blk->scales, 0, sizeof(blk->scales)); + const float inv_d2 = d_val > 0 ? 
1.f / d_val : 0.f; + const float inv_dmin2 = dmin_val > 0 ? 1.f / dmin_val : 0.f; + for (int ib = 0; ib < QK_K / 16; ++ib) { + uint8_t sc = MIN(63, nearest_int(inv_d2 * sub_ranges[ib])); + uint8_t sm = MIN(63, nearest_int(inv_dmin2 * neg_mins[ib])); + q3pt_sc_set(blk->scales, ib, sc); + q3pt_sc_set(blk->scales, ib + QK_K / 16, sm); + } + blk->d = GGML_FP32_TO_FP16(d_val); + blk->dmin = GGML_FP32_TO_FP16(dmin_val); + + // Re-assign levels + memset(blk->qs, 0, sizeof(blk->qs)); + for (int ib = 0; ib < QK_K / 16; ++ib) { + const float range = d_val * (float) q3pt_sc_get(blk->scales, ib); + const float sub_min = -dmin_val * (float) q3pt_sc_get(blk->scales, ib + QK_K / 16); + const float inv_range = range > 1e-6f ? 1.f / range : 0.f; + for (int j = 0; j < 16; ++j) { + const int elem = ib * 16 + j; + const float t = (xbl[elem] - sub_min) * inv_range; + int best = 0; + float best_d2 = (t - L[0]) * (t - L[0]); + for (int k = 1; k < Q3PT_N_LEVELS; ++k) { + const float d2 = (t - L[k]) * (t - L[k]); + if (d2 < best_d2) { + best_d2 = d2; + best = k; + } + } + q3pt_pack3(blk->qs, elem, best); + } + } + } + } +} + +size_t quantize_q3_pt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { + GGML_ASSERT(n_per_row % QK_K == 0); + int64_t nblock = n_per_row / QK_K; + char * qrow = (char *)dst; + for (int64_t row = 0; row < nrow; ++row) { + quantize_row_q3_pt_impl(src, qrow, n_per_row, quant_weights); + src += n_per_row; + qrow += nblock * sizeof(block_q3_pt); + } + return nrow * nblock * sizeof(block_q3_pt); +} + +void quantize_row_q3_pt_ref(const float * GGML_RESTRICT x, block_q3_pt * GGML_RESTRICT y, int64_t k) { + assert(k % QK_K == 0); + quantize_q3_pt(x, y, 1, k, NULL); +} // =================================== 1.5 bpw =================================================== @@ -5473,6 +9523,39 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte { VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb); } break; + case GGML_TYPE_Q3_PT: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_q3_pt, data, nb); + } break; + case GGML_TYPE_Q3_KPT: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_q3_kpt, data, nb); + } break; + case GGML_TYPE_Q4_DPT: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_q4_dpt, data, nb); + } break; + case GGML_TYPE_Q2_DPT: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_q2_dpt, data, nb); + } break; + case GGML_TYPE_Q2_KPT: + { + VALIDATE_ROW_DATA_DM_F16_IMPL(block_q2_kpt, data, nb, d, dmin); + } break; + case GGML_TYPE_IQ2_TQ: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_iq2_tq, data, nb); + } break; + + case GGML_TYPE_IQ3_TQ: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_iq3_tq, data, nb); + } break; + case GGML_TYPE_IQ1_BN: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_iq1_bn, data, nb); + } break; case GGML_TYPE_I8: case GGML_TYPE_I16: @@ -5489,3 +9572,5 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte return true; } + + diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h index d56c86da89..a248e98ceb 100644 --- a/ggml/src/ggml-quants.h +++ b/ggml/src/ggml-quants.h @@ -27,6 +27,7 @@ GGML_API void quantize_row_nvfp4_ref(const float * GGML_RESTRICT x, block_nvfp4 GGML_API void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k); GGML_API void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k); +GGML_API void quantize_row_q3_kpt_ref(const float * GGML_RESTRICT x, block_q3_kpt * GGML_RESTRICT y, int64_t k); 
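+// (reference row quantizer, used as from_float_ref in the type traits; presumably a
+//  single-row quantize_q3_kpt() with no imatrix, like the other *_ref helpers)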
GGML_API void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k); GGML_API void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k); GGML_API void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k); @@ -42,36 +43,37 @@ GGML_API void quantize_row_iq3_s_ref (const float * GGML_RESTRICT x, block_iq3_ GGML_API void quantize_row_iq2_s_ref (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k); // Dequantization -GGML_API void dequantize_row_q1_0(const block_q1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -//GGML_API void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_q1_0(const block_q1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +//GGML_API void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); -GGML_API void dequantize_row_mxfp4(const block_mxfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_nvfp4(const block_nvfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_mxfp4(const block_mxfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void dequantize_row_nvfp4(const block_nvfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); -GGML_API void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, 
float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void dequantize_row_q3_kpt(const block_q3_kpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); -GGML_API void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); -GGML_API void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -GGML_API void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); // Quantization utilizing an importance matrix (a.k.a. 
"Activation aWare Quantization") GGML_API size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); @@ -82,6 +84,14 @@ GGML_API size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RE GGML_API size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); GGML_API size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); GGML_API size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API size_t quantize_q3_kpt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); + +// Q3_KPT level management +GGML_API void q3kpt_set_levels(const float * levels); +GGML_API const float * q3kpt_get_levels(void); +GGML_API void q3kpt_free_levels(void); +GGML_API void q3kpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row, + const float * imatrix, float levels_out[Q3KPT_N_LEVELS]); GGML_API size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); GGML_API size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); @@ -102,6 +112,198 @@ GGML_API size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTR GGML_API size_t quantize_mxfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); GGML_API size_t quantize_nvfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API void quantize_row_q3_pt_ref(const float * GGML_RESTRICT x, block_q3_pt * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_q3_pt(const block_q3_pt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API size_t quantize_q3_pt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); + +// Q3_PT levels management (per-tensor Lloyd-Max levels in [0,1]) +GGML_API void q3pt_set_levels(const float * levels); // set global levels (quantization) +GGML_API const float * q3pt_get_levels(void); +GGML_API void q3pt_free_levels(void); + +// Per-tensor levels registry (inference — range-based lookup by data address) + +// Train 8 Lloyd-Max levels from tensor data via weighted k-means on affine-normalized +// 16-element sub-block values. Also sets the global levels via q3pt_set_levels(). +// data: float array [nrow * n_per_row], imatrix: importance weights [n_per_row] or NULL. 
+GGML_API void q3pt_train_levels(const float * data, int64_t nrow, int64_t n_per_row, + const float * imatrix, float levels_out[8]); + +// Q4_DPT: IQ4_NL with learned per-tensor int8 levels +GGML_API void dequantize_row_q4_dpt(const block_q4_dpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void quantize_row_q4_dpt_ref(const float * GGML_RESTRICT x, block_q4_dpt * GGML_RESTRICT y, int64_t k); +GGML_API size_t quantize_q4_dpt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); + +// Q4_DPT levels management (per-tensor Lloyd-Max int8 levels) +GGML_API void q4dpt_set_levels(const int8_t * levels); +GGML_API const int8_t * q4dpt_get_levels(void); +GGML_API void q4dpt_free_levels(void); + +// Q2_DPT: 2-bit with learned per-tensor int8 levels +GGML_API void dequantize_row_q2_dpt(const block_q2_dpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void quantize_row_q2_dpt_ref(const float * GGML_RESTRICT x, block_q2_dpt * GGML_RESTRICT y, int64_t k); +GGML_API size_t quantize_q2_dpt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); + +// Q2_DPT levels management (per-tensor Lloyd-Max int8 levels) +GGML_API void q2dpt_set_levels(const int8_t * levels); +GGML_API const int8_t * q2dpt_get_levels(void); +GGML_API void q2dpt_free_levels(void); +GGML_API void q2dpt_set_quant_strategy(int s); + +// Train 4 Lloyd-Max int8 levels from tensor data for Q2_DPT. +// Bins normalized values (x/amax) in [-1,1], runs weighted k-means, rounds to sorted int8[4]. +// Also sets the global levels via q2dpt_set_levels(). +GGML_API void q2dpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row, + const float * imatrix, int8_t levels_out[Q2DPT_N_LEVELS]); + +// Q2_KPT: Q2_K with learned per-tensor float levels +GGML_API void dequantize_row_q2_kpt(const block_q2_kpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void quantize_row_q2_kpt_ref(const float * GGML_RESTRICT x, block_q2_kpt * GGML_RESTRICT y, int64_t k); +GGML_API size_t quantize_q2_kpt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t start_row, int64_t nrows, int64_t n_per_row, const float * imatrix); + +// Q2_KPT levels management (per-tensor float levels in [0,1]) +GGML_API void q2kpt_set_levels(const float * levels); +GGML_API const float * q2kpt_get_levels(void); +GGML_API void q2kpt_free_levels(void); +// Prepare levels buffer for a tensor with given dimensions (call before parallel quantization) +GGML_API void q2kpt_prepare_levels(int64_t nrows, int64_t n_per_row); + +// Train 4 Lloyd-Max float levels from tensor data for Q2_KPT. +// Bins normalized sub-block values in [0,1], runs weighted k-means for 4 centroids. +// Also sets the global levels via q2kpt_set_levels(). +GGML_API void q2kpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row, + const float * imatrix, float levels_out[Q2KPT_N_LEVELS]); + +// Train per-row levels for all rows: writes nrow * Q2KPT_N_LEVELS floats to out_levels. 
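+
+// Q3_PT usage sketch (train before quantizing; q3pt_train_levels() installs the result
+// globally via q3pt_set_levels(), and the quantizer asserts levels are set):
+//   float lv[8];
+//   q3pt_train_levels(data, nrow, n_per_row, imatrix, lv);
+//   quantize_q3_pt(data, dst, nrow, n_per_row, imatrix);
+// Dequantization takes the levels explicitly instead (e.g. from tensor->quant_levels).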
+GGML_API void quantize_row_q3_pt_ref(const float * GGML_RESTRICT x, block_q3_pt * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_q3_pt(const block_q3_pt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
+GGML_API size_t quantize_q3_pt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+
+// Q3_PT levels management (per-tensor Lloyd-Max levels in [0,1])
+GGML_API void q3pt_set_levels(const float * levels); // set global levels (quantization)
+GGML_API const float * q3pt_get_levels(void);
+GGML_API void q3pt_free_levels(void);
+
+// Per-tensor levels registry (inference — range-based lookup by data address)
+
+// Train 8 Lloyd-Max levels from tensor data via weighted k-means on affine-normalized
+// 16-element sub-block values.
Also sets the global levels via q3pt_set_levels(). +// data: float array [nrow * n_per_row], imatrix: importance weights [n_per_row] or NULL. +GGML_API void q3pt_train_levels(const float * data, int64_t nrow, int64_t n_per_row, + const float * imatrix, float levels_out[8]); + +// Q4_DPT: IQ4_NL with learned per-tensor int8 levels +GGML_API void dequantize_row_q4_dpt(const block_q4_dpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void quantize_row_q4_dpt_ref(const float * GGML_RESTRICT x, block_q4_dpt * GGML_RESTRICT y, int64_t k); +GGML_API size_t quantize_q4_dpt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); + +// Q4_DPT levels management (per-tensor Lloyd-Max int8 levels) +GGML_API void q4dpt_set_levels(const int8_t * levels); +GGML_API const int8_t * q4dpt_get_levels(void); +GGML_API void q4dpt_free_levels(void); + +// Q2_DPT: 2-bit with learned per-tensor int8 levels +GGML_API void dequantize_row_q2_dpt(const block_q2_dpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void quantize_row_q2_dpt_ref(const float * GGML_RESTRICT x, block_q2_dpt * GGML_RESTRICT y, int64_t k); +GGML_API size_t quantize_q2_dpt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); + +// Q2_DPT levels management (per-tensor Lloyd-Max int8 levels) +GGML_API void q2dpt_set_levels(const int8_t * levels); +GGML_API const int8_t * q2dpt_get_levels(void); +GGML_API void q2dpt_free_levels(void); +GGML_API void q2dpt_set_quant_strategy(int s); + +// Train 4 Lloyd-Max int8 levels from tensor data for Q2_DPT. +// Bins normalized values (x/amax) in [-1,1], runs weighted k-means, rounds to sorted int8[4]. +// Also sets the global levels via q2dpt_set_levels(). +GGML_API void q2dpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row, + const float * imatrix, int8_t levels_out[Q2DPT_N_LEVELS]); + +// Q2_KPT: Q2_K with learned per-tensor float levels +GGML_API void dequantize_row_q2_kpt(const block_q2_kpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void quantize_row_q2_kpt_ref(const float * GGML_RESTRICT x, block_q2_kpt * GGML_RESTRICT y, int64_t k); +GGML_API size_t quantize_q2_kpt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t start_row, int64_t nrows, int64_t n_per_row, const float * imatrix); + +// Q2_KPT levels management (per-tensor float levels in [0,1]) +GGML_API void q2kpt_set_levels(const float * levels); +GGML_API const float * q2kpt_get_levels(void); +GGML_API void q2kpt_free_levels(void); +// Prepare levels buffer for a tensor with given dimensions (call before parallel quantization) +GGML_API void q2kpt_prepare_levels(int64_t nrows, int64_t n_per_row); + +// Train 4 Lloyd-Max float levels from tensor data for Q2_KPT. +// Bins normalized sub-block values in [0,1], runs weighted k-means for 4 centroids. +// Also sets the global levels via q2kpt_set_levels(). +GGML_API void q2kpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row, + const float * imatrix, float levels_out[Q2KPT_N_LEVELS]); + +// Train per-row levels for all rows: writes nrow * Q2KPT_N_LEVELS floats to out_levels. 
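+// Row r's levels live at out_levels + r * Q2KPT_N_LEVELS (contiguous layout implied by
+// the size above), so the buffer must hold at least nrow * Q2KPT_N_LEVELS floats.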
+GGML_API void q2kpt_train_all_row_levels(const float * data, int64_t nrow, int64_t n_per_row, + const float * imatrix, float * out_levels); + +// IQ2_TQ: 2-bit scalar with per-group asymmetric grid (2.5625 bpw) +GGML_API void dequantize_row_iq2_tq(const block_iq2_tq * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void quantize_row_iq2_tq_ref(const float * GGML_RESTRICT x, block_iq2_tq * GGML_RESTRICT y, int64_t k); +GGML_API size_t quantize_iq2_tq(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API void iq2tq_train_grid(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, int8_t grid_out[64]); +GGML_API void iq2tq_set_grid(const int8_t grid[64]); +GGML_API const int8_t * iq2tq_get_grid(void); + +// IQ3_TQ: 3-bit scalar with per-group asymmetric grid (3.5625 bpw) +GGML_API void dequantize_row_iq3_tq(const block_iq3_tq * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void quantize_row_iq3_tq_ref(const float * GGML_RESTRICT x, block_iq3_tq * GGML_RESTRICT y, int64_t k); +GGML_API size_t quantize_iq3_tq(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API void iq3tq_train_grid(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, int8_t grid_out[IQ3TQ_GRID_SIZE]); +GGML_API void iq3tq_set_grid(const int8_t grid[IQ3TQ_GRID_SIZE]); +GGML_API const int8_t * iq3tq_get_grid(void); + +// IQ1_BN: 8D vector quantized with per-tensor trained codebook (1.5625 bpw) +GGML_API void dequantize_row_iq1_bn(const block_iq1_bn * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels); +GGML_API void quantize_row_iq1_bn_ref(const float * GGML_RESTRICT x, block_iq1_bn * GGML_RESTRICT y, int64_t k); +GGML_API size_t quantize_iq1_bn(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API void iq1bn_train_codebook(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, int8_t aux_out[IQ1BN_AUX_SIZE], int nthread); +GGML_API void iq1bn_set_aux(const int8_t aux[IQ1BN_AUX_SIZE]); +GGML_API const int8_t * iq1bn_get_aux(void); + +// Train 16 Lloyd-Max int8 levels from tensor data. +// Bins normalized values (x/amax) in [-1,1], runs weighted k-means, rounds to sorted int8[16]. +// Also sets the global levels via q4dpt_set_levels(). 
+GGML_API void q4dpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row, + const float * imatrix, int8_t levels_out[Q4DPT_N_LEVELS]); + GGML_API void iq2xs_init_impl(enum ggml_type type); GGML_API void iq2xs_free_impl(enum ggml_type type); GGML_API void iq3xs_init_impl(int grid_size); diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 977aff62d8..585b547b48 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -12218,7 +12218,7 @@ static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml ggml_quantize_chunk(quant, from, to, 0, 1, ne, nullptr); } -static void ggml_vk_dequantize_data(const void * from, float * to, size_t ne, ggml_type quant) { +static void ggml_vk_dequantize_data(const void * from, float * to, size_t ne, ggml_type quant, const void * levels = nullptr) { if (quant == GGML_TYPE_F32) { memcpy(to, from, sizeof(float) * ne); return; @@ -12228,7 +12228,7 @@ static void ggml_vk_dequantize_data(const void * from, float * to, size_t ne, gg ggml_to_float_t dequant_fn = tt->to_float; - dequant_fn(from, to, ne); + dequant_fn(from, to, ne, levels); } static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) { diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 0142498d96..fc9b64378c 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -456,6 +456,11 @@ void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) { } } +static void ggml_fp16_to_fp32_row_leveled(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t n, const void * levels) { + GGML_UNUSED(levels); + ggml_fp16_to_fp32_row((const ggml_fp16_t *)x, y, n); +} + void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) { int i = 0; for (; i < n; ++i) { @@ -470,6 +475,11 @@ void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) { } } +static void ggml_bf16_to_fp32_row_leveled(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t n, const void * levels) { + GGML_UNUSED(levels); + ggml_bf16_to_fp32_row((const ggml_bf16_t *)x, y, n); +} + void ggml_fp32_to_bf16_row_ref(const float * x, ggml_bf16_t * y, int64_t n) { for (int i = 0; i < n; i++) { y[i] = ggml_compute_fp32_to_bf16(x[i]); @@ -648,7 +658,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .blck_size = 1, .type_size = sizeof(ggml_fp16_t), .is_quantized = false, - .to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row, + .to_float = ggml_fp16_to_fp32_row_leveled, .from_float_ref = (ggml_from_float_t) ggml_fp32_to_fp16_row, }, [GGML_TYPE_Q1_0] = { @@ -857,7 +867,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .blck_size = 1, .type_size = sizeof(ggml_bf16_t), .is_quantized = false, - .to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row, + .to_float = ggml_bf16_to_fp32_row_leveled, .from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row_ref, }, [31] = { // GGML_TYPE_Q4_0_4_4 @@ -912,6 +922,71 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .type_size = 0, .is_quantized = false, }, + [GGML_TYPE_Q3_PT] = { + .type_name = "q3_pt", + .blck_size = QK_K, + .type_size = sizeof(block_q3_pt), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q3_pt, + .from_float_ref = (ggml_from_float_t) quantize_row_q3_pt_ref, + }, + [GGML_TYPE_Q3_KPT] = { + .type_name = "q3_kpt", + .blck_size = QK_K, + .type_size = sizeof(block_q3_kpt), + .is_quantized = true, + .to_float = (ggml_to_float_t) 
dequantize_row_q3_kpt, + .from_float_ref = (ggml_from_float_t) quantize_row_q3_kpt_ref, + }, + [GGML_TYPE_Q4_DPT] = { + .type_name = "q4_dpt", + .blck_size = QK4_NL, + .type_size = sizeof(block_q4_dpt), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q4_dpt, + .from_float_ref = (ggml_from_float_t) quantize_row_q4_dpt_ref, + }, + [GGML_TYPE_Q2_DPT] = { + .type_name = "q2_dpt", + .blck_size = QK2_DPT, + .type_size = sizeof(block_q2_dpt), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q2_dpt, + .from_float_ref = (ggml_from_float_t) quantize_row_q2_dpt_ref, + }, + [GGML_TYPE_Q2_KPT] = { + .type_name = "q2_kpt", + .blck_size = QK_K, + .type_size = sizeof(block_q2_kpt), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q2_kpt, + .from_float_ref = (ggml_from_float_t) quantize_row_q2_kpt_ref, + .levels_row_stride = 0, // computed dynamically: (ne[0]/256)*4*sizeof(float) + }, + [GGML_TYPE_IQ2_TQ] = { + .type_name = "iq2_tq", + .blck_size = QK_K, + .type_size = sizeof(block_iq2_tq), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq2_tq, + .from_float_ref = (ggml_from_float_t) quantize_row_iq2_tq_ref, + }, + [GGML_TYPE_IQ3_TQ] = { + .type_name = "iq3_tq", + .blck_size = QK_K, + .type_size = sizeof(block_iq3_tq), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq3_tq, + .from_float_ref = (ggml_from_float_t) quantize_row_iq3_tq_ref, + }, + [GGML_TYPE_IQ1_BN] = { + .type_name = "iq1_bn", + .blck_size = QK_K, + .type_size = sizeof(block_iq1_bn), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq1_bn, + .from_float_ref = (ggml_from_float_t) quantize_row_iq1_bn_ref, + }, }; const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) { @@ -1412,6 +1487,10 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break; case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break; case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break; + case GGML_FTYPE_MOSTLY_Q3_PT: wtype = GGML_TYPE_Q3_PT; break; + case GGML_FTYPE_MOSTLY_Q3_KPT: wtype = GGML_TYPE_Q3_KPT; break; + case GGML_FTYPE_MOSTLY_Q4_DPT: wtype = GGML_TYPE_Q4_DPT; break; + case GGML_FTYPE_MOSTLY_Q2_KPT: wtype = GGML_TYPE_Q2_KPT; break; case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break; case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break; } @@ -7607,6 +7686,13 @@ void ggml_quantize_init(enum ggml_type type) { case GGML_TYPE_IQ1_M: iq2xs_init_impl(type); break; case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break; case GGML_TYPE_IQ3_S: iq3xs_init_impl(512); break; + case GGML_TYPE_IQ2_TQ: break; // per-tensor grid stored in tensor->quant_levels + case GGML_TYPE_IQ3_TQ: break; // per-tensor grid stored in tensor->quant_levels + case GGML_TYPE_IQ1_BN: break; // per-tensor codebook stored in tensor->quant_levels + case GGML_TYPE_Q3_PT: break; // levels stored in tensor->quant_levels + case GGML_TYPE_Q3_KPT: break; // levels stored in tensor->quant_levels + case GGML_TYPE_Q4_DPT: break; // levels stored in tensor->quant_levels + case GGML_TYPE_Q2_KPT: break; // levels stored in tensor->quant_levels default: // nothing break; } @@ -7685,6 +7771,13 @@ size_t ggml_quantize_chunk( case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, 
n_per_row, imatrix); break;
        case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q3_PT:  result = quantize_q3_pt  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q3_KPT: result = quantize_q3_kpt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q4_DPT: result = quantize_q4_dpt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q2_KPT: result = quantize_q2_kpt (src + start, (char *) dst + start_row * row_size, start_row, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_IQ2_TQ: result = quantize_iq2_tq (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_IQ3_TQ: result = quantize_iq3_tq (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_IQ1_BN: result = quantize_iq1_bn (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_F16:
            {
                size_t elemsize = sizeof(ggml_fp16_t);
diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
index ab3cc97486..26e88f9a12 100644
--- a/ggml/src/gguf.cpp
+++ b/ggml/src/gguf.cpp
@@ -1331,37 +1331,63 @@ struct gguf_writer_base {
         if (kv.is_array) {
             write(GGUF_TYPE_ARRAY);
-            write(kv.get_type());
+            const enum gguf_type elem_type = kv.get_type();
+            write(elem_type);
             write(ne);
+            // Write array element data based on element type
+            switch (elem_type) {
+                case GGUF_TYPE_UINT8:
+                case GGUF_TYPE_INT8:
+                case GGUF_TYPE_UINT16:
+                case GGUF_TYPE_INT16:
+                case GGUF_TYPE_UINT32:
+                case GGUF_TYPE_INT32:
+                case GGUF_TYPE_FLOAT32:
+                case GGUF_TYPE_UINT64:
+                case GGUF_TYPE_INT64:
+                case GGUF_TYPE_FLOAT64: {
+                    // Write raw bytes inline for array data
+                    for (size_t i = 0; i < kv.data.size(); ++i) {
+                        write(kv.data[i]);
+                    }
+                } break;
+                case GGUF_TYPE_BOOL: {
+                    for (size_t i = 0; i < ne; ++i) {
+                        write(kv.get_val<bool>(i));
+                    }
+                } break;
+                case GGUF_TYPE_STRING: {
+                    for (size_t i = 0; i < ne; ++i) {
+                        write(kv.get_val<std::string>(i));
+                    }
+                } break;
+                case GGUF_TYPE_ARRAY:
+                default: GGML_ABORT("invalid array element type");
+            }
         } else {
             write(kv.get_type());
-        }
-
-        switch (kv.get_type()) {
-            case GGUF_TYPE_UINT8:
-            case GGUF_TYPE_INT8:
-            case GGUF_TYPE_UINT16:
-            case GGUF_TYPE_INT16:
-            case GGUF_TYPE_UINT32:
-            case GGUF_TYPE_INT32:
-            case GGUF_TYPE_FLOAT32:
-            case GGUF_TYPE_UINT64:
-            case GGUF_TYPE_INT64:
-            case GGUF_TYPE_FLOAT64: {
-                write(kv.data);
-            } break;
-            case GGUF_TYPE_BOOL: {
-                for (size_t i = 0; i < ne; ++i) {
-                    write(kv.get_val<bool>(i));
-                }
-            } break;
-            case GGUF_TYPE_STRING: {
-                for (size_t i = 0; i < ne; ++i) {
-                    write(kv.get_val<std::string>(i));
-                }
-            } break;
-            case GGUF_TYPE_ARRAY:
-            default: GGML_ABORT("invalid type");
+            switch (kv.get_type()) {
+                case GGUF_TYPE_UINT8:
+                case GGUF_TYPE_INT8:
+                case GGUF_TYPE_UINT16:
+                case GGUF_TYPE_INT16:
+                case GGUF_TYPE_UINT32:
+                case GGUF_TYPE_INT32:
+                case GGUF_TYPE_FLOAT32:
+                case GGUF_TYPE_UINT64:
+                case GGUF_TYPE_INT64:
+                case GGUF_TYPE_FLOAT64: {
+                    write(kv.data);
+                } break;
+                case GGUF_TYPE_BOOL: {
+                    write(kv.get_val<bool>(0));
+                } break;
+                case GGUF_TYPE_STRING: {
+                    write(kv.get_val<std::string>(0));
+                } break;
+                case GGUF_TYPE_ARRAY:
+                default: GGML_ABORT("invalid type");
+            }
         }
     }
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index c5297a2f44..0933306ca6 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -7,10 +7,10 @@
 from typing
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index c5297a2f44..0933306ca6 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -7,10 +7,10 @@ from typing import Any
 #
 # constants
 #
 
-GGUF_MAGIC             = 0x46554747  # "GGUF"
-GGUF_VERSION           = 3
+GGUF_MAGIC = 0x46554747  # "GGUF"
+GGUF_VERSION = 3
 GGUF_DEFAULT_ALIGNMENT = 32
-GGML_QUANT_VERSION     = 2  # GGML_QNT_VERSION from ggml.h
+GGML_QUANT_VERSION = 2  # GGML_QNT_VERSION from ggml.h
 
 #
 # metadata keys
 #
@@ -19,205 +19,209 @@ GGML_QUANT_VERSION = 2  # GGML_QNT_VERSION from ggml.h
 
 class Keys:
     class General:
-        TYPE                     = "general.type"
-        ARCHITECTURE             = "general.architecture"
-        QUANTIZATION_VERSION     = "general.quantization_version"
-        ALIGNMENT                = "general.alignment"
-        FILE_TYPE                = "general.file_type"
+        TYPE = "general.type"
+        ARCHITECTURE = "general.architecture"
+        QUANTIZATION_VERSION = "general.quantization_version"
+        ALIGNMENT = "general.alignment"
+        FILE_TYPE = "general.file_type"
 
         # Recommended Sampler Parameters
-        SAMPLING_SEQUENCE        = "general.sampling.sequence"
-        SAMPLING_TOP_K           = "general.sampling.top_k"
-        SAMPLING_TOP_P           = "general.sampling.top_p"
-        SAMPLING_MIN_P           = "general.sampling.min_p"
-        SAMPLING_XTC_PROBABILITY = "general.sampling.xtc_probability"
-        SAMPLING_XTC_THRESHOLD   = "general.sampling.xtc_threshold"
-        SAMPLING_TEMP            = "general.sampling.temp"
-        SAMPLING_PENALTY_LAST_N  = "general.sampling.penalty_last_n"
-        SAMPLING_PENALTY_REPEAT  = "general.sampling.penalty_repeat"
-        SAMPLING_MIROSTAT        = "general.sampling.mirostat"
-        SAMPLING_MIROSTAT_TAU    = "general.sampling.mirostat_tau"
-        SAMPLING_MIROSTAT_ETA    = "general.sampling.mirostat_eta"
+        SAMPLING_SEQUENCE = "general.sampling.sequence"
+        SAMPLING_TOP_K = "general.sampling.top_k"
+        SAMPLING_TOP_P = "general.sampling.top_p"
+        SAMPLING_MIN_P = "general.sampling.min_p"
+        SAMPLING_XTC_PROBABILITY = "general.sampling.xtc_probability"
+        SAMPLING_XTC_THRESHOLD = "general.sampling.xtc_threshold"
+        SAMPLING_TEMP = "general.sampling.temp"
+        SAMPLING_PENALTY_LAST_N = "general.sampling.penalty_last_n"
+        SAMPLING_PENALTY_REPEAT = "general.sampling.penalty_repeat"
+        SAMPLING_MIROSTAT = "general.sampling.mirostat"
+        SAMPLING_MIROSTAT_TAU = "general.sampling.mirostat_tau"
+        SAMPLING_MIROSTAT_ETA = "general.sampling.mirostat_eta"
 
         # Authorship Metadata
-        NAME                     = "general.name"
-        AUTHOR                   = "general.author"
-        VERSION                  = "general.version"
-        ORGANIZATION             = "general.organization"
+        NAME = "general.name"
+        AUTHOR = "general.author"
+        VERSION = "general.version"
+        ORGANIZATION = "general.organization"
 
-        FINETUNE                 = "general.finetune"
-        BASENAME                 = "general.basename"
+        FINETUNE = "general.finetune"
+        BASENAME = "general.basename"
 
-        DESCRIPTION              = "general.description"
-        QUANTIZED_BY             = "general.quantized_by"
+        DESCRIPTION = "general.description"
+        QUANTIZED_BY = "general.quantized_by"
 
-        SIZE_LABEL               = "general.size_label"
+        SIZE_LABEL = "general.size_label"
 
         # Licensing details
-        LICENSE                  = "general.license"
-        LICENSE_NAME             = "general.license.name"
-        LICENSE_LINK             = "general.license.link"
+        LICENSE = "general.license"
+        LICENSE_NAME = "general.license.name"
+        LICENSE_LINK = "general.license.link"
 
         # Typically represents the converted GGUF repo (Unless native)
-        URL                      = "general.url"  # Model Website/Paper
-        DOI                      = "general.doi"
-        UUID                     = "general.uuid"
-        REPO_URL                 = "general.repo_url"  # Model Source Repository (git/svn/etc...)
+        URL = "general.url"  # Model Website/Paper
+        DOI = "general.doi"
+        UUID = "general.uuid"
+        REPO_URL = "general.repo_url"  # Model Source Repository (git/svn/etc...)
 
         # Model Source during conversion
-        SOURCE_URL               = "general.source.url"  # Model Website/Paper
-        SOURCE_DOI               = "general.source.doi"
-        SOURCE_UUID              = "general.source.uuid"
-        SOURCE_REPO_URL          = "general.source.repo_url"  # Model Source Repository (git/svn/etc...)
+        SOURCE_URL = "general.source.url"  # Model Website/Paper
+        SOURCE_DOI = "general.source.doi"
+        SOURCE_UUID = "general.source.uuid"
+        SOURCE_REPO_URL = (
+            "general.source.repo_url"  # Model Source Repository (git/svn/etc...)
+        )
 
         # Base Model Source. There can be more than one source if it's a merged
         # model like with 'Mistral-7B-Merge-14-v0.1'. This will assist in
         # tracing linage of models as it is finetuned or merged over time.
-        BASE_MODEL_COUNT         = "general.base_model.count"
-        BASE_MODEL_NAME          = "general.base_model.{id}.name"
-        BASE_MODEL_AUTHOR        = "general.base_model.{id}.author"
-        BASE_MODEL_VERSION       = "general.base_model.{id}.version"
-        BASE_MODEL_ORGANIZATION  = "general.base_model.{id}.organization"
-        BASE_MODEL_DESCRIPTION   = "general.base_model.{id}.description"
-        BASE_MODEL_URL           = "general.base_model.{id}.url"  # Model Website/Paper
-        BASE_MODEL_DOI           = "general.base_model.{id}.doi"
-        BASE_MODEL_UUID          = "general.base_model.{id}.uuid"
-        BASE_MODEL_REPO_URL      = "general.base_model.{id}.repo_url"  # Model Source Repository (git/svn/etc...)
+        BASE_MODEL_COUNT = "general.base_model.count"
+        BASE_MODEL_NAME = "general.base_model.{id}.name"
+        BASE_MODEL_AUTHOR = "general.base_model.{id}.author"
+        BASE_MODEL_VERSION = "general.base_model.{id}.version"
+        BASE_MODEL_ORGANIZATION = "general.base_model.{id}.organization"
+        BASE_MODEL_DESCRIPTION = "general.base_model.{id}.description"
+        BASE_MODEL_URL = "general.base_model.{id}.url"  # Model Website/Paper
+        BASE_MODEL_DOI = "general.base_model.{id}.doi"
+        BASE_MODEL_UUID = "general.base_model.{id}.uuid"
+        BASE_MODEL_REPO_URL = "general.base_model.{id}.repo_url"  # Model Source Repository (git/svn/etc...)
 
         # Dataset Source
-        DATASET_COUNT            = "general.dataset.count"
-        DATASET_NAME             = "general.dataset.{id}.name"
-        DATASET_AUTHOR           = "general.dataset.{id}.author"
-        DATASET_VERSION          = "general.dataset.{id}.version"
-        DATASET_ORGANIZATION     = "general.dataset.{id}.organization"
-        DATASET_DESCRIPTION      = "general.dataset.{id}.description"
-        DATASET_URL              = "general.dataset.{id}.url"  # Model Website/Paper
-        DATASET_DOI              = "general.dataset.{id}.doi"
-        DATASET_UUID             = "general.dataset.{id}.uuid"
-        DATASET_REPO_URL         = "general.dataset.{id}.repo_url"  # Model Source Repository (git/svn/etc...)
+        DATASET_COUNT = "general.dataset.count"
+        DATASET_NAME = "general.dataset.{id}.name"
+        DATASET_AUTHOR = "general.dataset.{id}.author"
+        DATASET_VERSION = "general.dataset.{id}.version"
+        DATASET_ORGANIZATION = "general.dataset.{id}.organization"
+        DATASET_DESCRIPTION = "general.dataset.{id}.description"
+        DATASET_URL = "general.dataset.{id}.url"  # Model Website/Paper
+        DATASET_DOI = "general.dataset.{id}.doi"
+        DATASET_UUID = "general.dataset.{id}.uuid"
+        DATASET_REPO_URL = (
+            "general.dataset.{id}.repo_url"  # Model Source Repository (git/svn/etc...)
+ ) # Array based KV stores - TAGS = "general.tags" - LANGUAGES = "general.languages" + TAGS = "general.tags" + LANGUAGES = "general.languages" class LLM: - VOCAB_SIZE = "{arch}.vocab_size" - CONTEXT_LENGTH = "{arch}.context_length" - EMBEDDING_LENGTH = "{arch}.embedding_length" - EMBEDDING_LENGTH_OUT = "{arch}.embedding_length_out" - FEATURES_LENGTH = "{arch}.features_length" - BLOCK_COUNT = "{arch}.block_count" - LEADING_DENSE_BLOCK_COUNT = "{arch}.leading_dense_block_count" - FEED_FORWARD_LENGTH = "{arch}.feed_forward_length" - EXPERT_FEED_FORWARD_LENGTH = "{arch}.expert_feed_forward_length" + VOCAB_SIZE = "{arch}.vocab_size" + CONTEXT_LENGTH = "{arch}.context_length" + EMBEDDING_LENGTH = "{arch}.embedding_length" + EMBEDDING_LENGTH_OUT = "{arch}.embedding_length_out" + FEATURES_LENGTH = "{arch}.features_length" + BLOCK_COUNT = "{arch}.block_count" + LEADING_DENSE_BLOCK_COUNT = "{arch}.leading_dense_block_count" + FEED_FORWARD_LENGTH = "{arch}.feed_forward_length" + EXPERT_FEED_FORWARD_LENGTH = "{arch}.expert_feed_forward_length" EXPERT_SHARED_FEED_FORWARD_LENGTH = "{arch}.expert_shared_feed_forward_length" - EXPERT_CHUNK_FEED_FORWARD_LENGTH = "{arch}.expert_chunk_feed_forward_length" - USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual" - TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout" - EXPERT_COUNT = "{arch}.expert_count" - EXPERT_USED_COUNT = "{arch}.expert_used_count" - EXPERT_SHARED_COUNT = "{arch}.expert_shared_count" - EXPERT_GROUP_COUNT = "{arch}.expert_group_count" - EXPERT_GROUP_USED_COUNT = "{arch}.expert_group_used_count" - EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale" - EXPERT_WEIGHTS_NORM = "{arch}.expert_weights_norm" - EXPERT_GATING_FUNC = "{arch}.expert_gating_func" - EXPERT_GROUP_SCALE = "{arch}.expert_group_scale" - EXPERTS_PER_GROUP = "{arch}.experts_per_group" - MOE_EVERY_N_LAYERS = "{arch}.moe_every_n_layers" - MOE_LATENT_SIZE = "{arch}.moe_latent_size" - NEXTN_PREDICT_LAYERS = "{arch}.nextn_predict_layers" - NUM_DEEPSTACK_LAYERS = "{arch}.n_deepstack_layers" - POOLING_TYPE = "{arch}.pooling_type" - LOGIT_SCALE = "{arch}.logit_scale" - DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id" - DECODER_BLOCK_COUNT = "{arch}.decoder_block_count" - ATTN_LOGIT_SOFTCAPPING = "{arch}.attn_logit_softcapping" - ROUTER_LOGIT_SOFTCAPPING = "{arch}.router_logit_softcapping" - FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping" - SWIN_NORM = "{arch}.swin_norm" - RESCALE_EVERY_N_LAYERS = "{arch}.rescale_every_n_layers" - TIME_MIX_EXTRA_DIM = "{arch}.time_mix_extra_dim" - TIME_DECAY_EXTRA_DIM = "{arch}.time_decay_extra_dim" - RESIDUAL_SCALE = "{arch}.residual_scale" - EMBEDDING_SCALE = "{arch}.embedding_scale" - TOKEN_SHIFT_COUNT = "{arch}.token_shift_count" - INTERLEAVE_MOE_LAYER_STEP = "{arch}.interleave_moe_layer_step" - FULL_ATTENTION_INTERVAL = "{arch}.full_attention_interval" - ACTIVATION_SPARSITY_SCALE = "{arch}.activation_sparsity_scale" - ALTUP_ACTIVE_IDX = "{arch}.altup.active_idx" - ALTUP_NUM_INPUTS = "{arch}.altup.num_inputs" - EMBD_LENGTH_PER_LAYER_INP = "{arch}.embedding_length_per_layer_input" - SWIGLU_CLAMP_EXP = "{arch}.swiglu_clamp_exp" - SWIGLU_CLAMP_SHEXP = "{arch}.swiglu_clamp_shexp" - DENSE_FEAT_IN_SIZE = "{arch}.{dense}_feat_in" - DENSE_FEAT_OUT_SIZE = "{arch}.{dense}_feat_out" + EXPERT_CHUNK_FEED_FORWARD_LENGTH = "{arch}.expert_chunk_feed_forward_length" + USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual" + TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout" + EXPERT_COUNT = "{arch}.expert_count" + EXPERT_USED_COUNT = 
"{arch}.expert_used_count" + EXPERT_SHARED_COUNT = "{arch}.expert_shared_count" + EXPERT_GROUP_COUNT = "{arch}.expert_group_count" + EXPERT_GROUP_USED_COUNT = "{arch}.expert_group_used_count" + EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale" + EXPERT_WEIGHTS_NORM = "{arch}.expert_weights_norm" + EXPERT_GATING_FUNC = "{arch}.expert_gating_func" + EXPERT_GROUP_SCALE = "{arch}.expert_group_scale" + EXPERTS_PER_GROUP = "{arch}.experts_per_group" + MOE_EVERY_N_LAYERS = "{arch}.moe_every_n_layers" + MOE_LATENT_SIZE = "{arch}.moe_latent_size" + NEXTN_PREDICT_LAYERS = "{arch}.nextn_predict_layers" + NUM_DEEPSTACK_LAYERS = "{arch}.n_deepstack_layers" + POOLING_TYPE = "{arch}.pooling_type" + LOGIT_SCALE = "{arch}.logit_scale" + DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id" + DECODER_BLOCK_COUNT = "{arch}.decoder_block_count" + ATTN_LOGIT_SOFTCAPPING = "{arch}.attn_logit_softcapping" + ROUTER_LOGIT_SOFTCAPPING = "{arch}.router_logit_softcapping" + FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping" + SWIN_NORM = "{arch}.swin_norm" + RESCALE_EVERY_N_LAYERS = "{arch}.rescale_every_n_layers" + TIME_MIX_EXTRA_DIM = "{arch}.time_mix_extra_dim" + TIME_DECAY_EXTRA_DIM = "{arch}.time_decay_extra_dim" + RESIDUAL_SCALE = "{arch}.residual_scale" + EMBEDDING_SCALE = "{arch}.embedding_scale" + TOKEN_SHIFT_COUNT = "{arch}.token_shift_count" + INTERLEAVE_MOE_LAYER_STEP = "{arch}.interleave_moe_layer_step" + FULL_ATTENTION_INTERVAL = "{arch}.full_attention_interval" + ACTIVATION_SPARSITY_SCALE = "{arch}.activation_sparsity_scale" + ALTUP_ACTIVE_IDX = "{arch}.altup.active_idx" + ALTUP_NUM_INPUTS = "{arch}.altup.num_inputs" + EMBD_LENGTH_PER_LAYER_INP = "{arch}.embedding_length_per_layer_input" + SWIGLU_CLAMP_EXP = "{arch}.swiglu_clamp_exp" + SWIGLU_CLAMP_SHEXP = "{arch}.swiglu_clamp_shexp" + DENSE_FEAT_IN_SIZE = "{arch}.{dense}_feat_in" + DENSE_FEAT_OUT_SIZE = "{arch}.{dense}_feat_out" class Attention: - HEAD_COUNT = "{arch}.attention.head_count" - HEAD_COUNT_KV = "{arch}.attention.head_count_kv" - MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias" - CLAMP_KQV = "{arch}.attention.clamp_kqv" - KEY_LENGTH = "{arch}.attention.key_length" - VALUE_LENGTH = "{arch}.attention.value_length" - LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon" - LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon" - GROUPNORM_EPS = "{arch}.attention.group_norm_epsilon" - GROUPNORM_GROUPS = "{arch}.attention.group_norm_groups" - CAUSAL = "{arch}.attention.causal" - Q_LORA_RANK = "{arch}.attention.q_lora_rank" - KV_LORA_RANK = "{arch}.attention.kv_lora_rank" - DECAY_LORA_RANK = "{arch}.attention.decay_lora_rank" - ICLR_LORA_RANK = "{arch}.attention.iclr_lora_rank" + HEAD_COUNT = "{arch}.attention.head_count" + HEAD_COUNT_KV = "{arch}.attention.head_count_kv" + MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias" + CLAMP_KQV = "{arch}.attention.clamp_kqv" + KEY_LENGTH = "{arch}.attention.key_length" + VALUE_LENGTH = "{arch}.attention.value_length" + LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon" + LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon" + GROUPNORM_EPS = "{arch}.attention.group_norm_epsilon" + GROUPNORM_GROUPS = "{arch}.attention.group_norm_groups" + CAUSAL = "{arch}.attention.causal" + Q_LORA_RANK = "{arch}.attention.q_lora_rank" + KV_LORA_RANK = "{arch}.attention.kv_lora_rank" + DECAY_LORA_RANK = "{arch}.attention.decay_lora_rank" + ICLR_LORA_RANK = "{arch}.attention.iclr_lora_rank" VALUE_RESIDUAL_MIX_LORA_RANK = "{arch}.attention.value_residual_mix_lora_rank" - GATE_LORA_RANK = 
"{arch}.attention.gate_lora_rank" - REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count" - SLIDING_WINDOW = "{arch}.attention.sliding_window" - SCALE = "{arch}.attention.scale" - OUTPUT_SCALE = "{arch}.attention.output_scale" - TEMPERATURE_LENGTH = "{arch}.attention.temperature_length" - KEY_LENGTH_MLA = "{arch}.attention.key_length_mla" - VALUE_LENGTH_MLA = "{arch}.attention.value_length_mla" - KEY_LENGTH_SWA = "{arch}.attention.key_length_swa" - VALUE_LENGTH_SWA = "{arch}.attention.value_length_swa" - SHARED_KV_LAYERS = "{arch}.attention.shared_kv_layers" - SLIDING_WINDOW_PATTERN = "{arch}.attention.sliding_window_pattern" - TEMPERATURE_SCALE = "{arch}.attention.temperature_scale" + GATE_LORA_RANK = "{arch}.attention.gate_lora_rank" + REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count" + SLIDING_WINDOW = "{arch}.attention.sliding_window" + SCALE = "{arch}.attention.scale" + OUTPUT_SCALE = "{arch}.attention.output_scale" + TEMPERATURE_LENGTH = "{arch}.attention.temperature_length" + KEY_LENGTH_MLA = "{arch}.attention.key_length_mla" + VALUE_LENGTH_MLA = "{arch}.attention.value_length_mla" + KEY_LENGTH_SWA = "{arch}.attention.key_length_swa" + VALUE_LENGTH_SWA = "{arch}.attention.value_length_swa" + SHARED_KV_LAYERS = "{arch}.attention.shared_kv_layers" + SLIDING_WINDOW_PATTERN = "{arch}.attention.sliding_window_pattern" + TEMPERATURE_SCALE = "{arch}.attention.temperature_scale" class Indexer: HEAD_COUNT = "{arch}.attention.indexer.head_count" KEY_LENGTH = "{arch}.attention.indexer.key_length" - TOP_K = "{arch}.attention.indexer.top_k" + TOP_K = "{arch}.attention.indexer.top_k" class Rope: - DIMENSION_COUNT = "{arch}.rope.dimension_count" - DIMENSION_COUNT_SWA = "{arch}.rope.dimension_count_swa" - DIMENSION_SECTIONS = "{arch}.rope.dimension_sections" - FREQ_BASE = "{arch}.rope.freq_base" - FREQ_BASE_SWA = "{arch}.rope.freq_base_swa" - SCALING_TYPE = "{arch}.rope.scaling.type" - SCALING_FACTOR = "{arch}.rope.scaling.factor" - SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor" - SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length" - SCALING_FINETUNED = "{arch}.rope.scaling.finetuned" - SCALING_YARN_LOG_MUL = "{arch}.rope.scaling.yarn_log_multiplier" - SCALING_YARN_EXT_FACTOR = "{arch}.rope.scaling.yarn_ext_factor" - SCALING_YARN_ATTN_FACTOR = "{arch}.rope.scaling.yarn_attn_factor" - SCALING_YARN_BETA_FAST = "{arch}.rope.scaling.yarn_beta_fast" - SCALING_YARN_BETA_SLOW = "{arch}.rope.scaling.yarn_beta_slow" + DIMENSION_COUNT = "{arch}.rope.dimension_count" + DIMENSION_COUNT_SWA = "{arch}.rope.dimension_count_swa" + DIMENSION_SECTIONS = "{arch}.rope.dimension_sections" + FREQ_BASE = "{arch}.rope.freq_base" + FREQ_BASE_SWA = "{arch}.rope.freq_base_swa" + SCALING_TYPE = "{arch}.rope.scaling.type" + SCALING_FACTOR = "{arch}.rope.scaling.factor" + SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor" + SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length" + SCALING_FINETUNED = "{arch}.rope.scaling.finetuned" + SCALING_YARN_LOG_MUL = "{arch}.rope.scaling.yarn_log_multiplier" + SCALING_YARN_EXT_FACTOR = "{arch}.rope.scaling.yarn_ext_factor" + SCALING_YARN_ATTN_FACTOR = "{arch}.rope.scaling.yarn_attn_factor" + SCALING_YARN_BETA_FAST = "{arch}.rope.scaling.yarn_beta_fast" + SCALING_YARN_BETA_SLOW = "{arch}.rope.scaling.yarn_beta_slow" class Split: - LLM_KV_SPLIT_NO = "split.no" - LLM_KV_SPLIT_COUNT = "split.count" + LLM_KV_SPLIT_NO = "split.no" + LLM_KV_SPLIT_COUNT = "split.count" LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count" class SSM: - 
CONV_KERNEL = "{arch}.ssm.conv_kernel" - INNER_SIZE = "{arch}.ssm.inner_size" - STATE_SIZE = "{arch}.ssm.state_size" + CONV_KERNEL = "{arch}.ssm.conv_kernel" + INNER_SIZE = "{arch}.ssm.inner_size" + STATE_SIZE = "{arch}.ssm.state_size" TIME_STEP_RANK = "{arch}.ssm.time_step_rank" - GROUP_COUNT = "{arch}.ssm.group_count" - DT_B_C_RMS = "{arch}.ssm.dt_b_c_rms" + GROUP_COUNT = "{arch}.ssm.group_count" + DT_B_C_RMS = "{arch}.ssm.dt_b_c_rms" class KDA: HEAD_DIM = "{arch}.kda.head_dim" @@ -227,11 +231,11 @@ class Keys: class PosNet: EMBEDDING_LENGTH = "{arch}.posnet.embedding_length" - BLOCK_COUNT = "{arch}.posnet.block_count" + BLOCK_COUNT = "{arch}.posnet.block_count" class ConvNext: EMBEDDING_LENGTH = "{arch}.convnext.embedding_length" - BLOCK_COUNT = "{arch}.convnext.block_count" + BLOCK_COUNT = "{arch}.convnext.block_count" class Classifier: OUTPUT_LABELS = "{arch}.classifier.output_labels" @@ -240,120 +244,122 @@ class Keys: L_CACHE = "{arch}.shortconv.l_cache" class Tokenizer: - MODEL = "tokenizer.ggml.model" - PRE = "tokenizer.ggml.pre" - LIST = "tokenizer.ggml.tokens" - TOKEN_TYPE = "tokenizer.ggml.token_type" - TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count" # for BERT-style token types - SCORES = "tokenizer.ggml.scores" - MERGES = "tokenizer.ggml.merges" - BOS_ID = "tokenizer.ggml.bos_token_id" - EOS_ID = "tokenizer.ggml.eos_token_id" - EOT_ID = "tokenizer.ggml.eot_token_id" - EOM_ID = "tokenizer.ggml.eom_token_id" - UNK_ID = "tokenizer.ggml.unknown_token_id" - SEP_ID = "tokenizer.ggml.seperator_token_id" - PAD_ID = "tokenizer.ggml.padding_token_id" - MASK_ID = "tokenizer.ggml.mask_token_id" - ADD_BOS = "tokenizer.ggml.add_bos_token" - ADD_EOS = "tokenizer.ggml.add_eos_token" - ADD_SEP = "tokenizer.ggml.add_sep_token" - ADD_PREFIX = "tokenizer.ggml.add_space_prefix" - REMOVE_EXTRA_WS = "tokenizer.ggml.remove_extra_whitespaces" + MODEL = "tokenizer.ggml.model" + PRE = "tokenizer.ggml.pre" + LIST = "tokenizer.ggml.tokens" + TOKEN_TYPE = "tokenizer.ggml.token_type" + TOKEN_TYPE_COUNT = ( + "tokenizer.ggml.token_type_count" # for BERT-style token types + ) + SCORES = "tokenizer.ggml.scores" + MERGES = "tokenizer.ggml.merges" + BOS_ID = "tokenizer.ggml.bos_token_id" + EOS_ID = "tokenizer.ggml.eos_token_id" + EOT_ID = "tokenizer.ggml.eot_token_id" + EOM_ID = "tokenizer.ggml.eom_token_id" + UNK_ID = "tokenizer.ggml.unknown_token_id" + SEP_ID = "tokenizer.ggml.seperator_token_id" + PAD_ID = "tokenizer.ggml.padding_token_id" + MASK_ID = "tokenizer.ggml.mask_token_id" + ADD_BOS = "tokenizer.ggml.add_bos_token" + ADD_EOS = "tokenizer.ggml.add_eos_token" + ADD_SEP = "tokenizer.ggml.add_sep_token" + ADD_PREFIX = "tokenizer.ggml.add_space_prefix" + REMOVE_EXTRA_WS = "tokenizer.ggml.remove_extra_whitespaces" PRECOMPILED_CHARSMAP = "tokenizer.ggml.precompiled_charsmap" - HF_JSON = "tokenizer.huggingface.json" - RWKV = "tokenizer.rwkv.world" - CHAT_TEMPLATE = "tokenizer.chat_template" - CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}" - CHAT_TEMPLATES = "tokenizer.chat_templates" + HF_JSON = "tokenizer.huggingface.json" + RWKV = "tokenizer.rwkv.world" + CHAT_TEMPLATE = "tokenizer.chat_template" + CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}" + CHAT_TEMPLATES = "tokenizer.chat_templates" # FIM/Infill special tokens constants - FIM_PRE_ID = "tokenizer.ggml.fim_pre_token_id" - FIM_SUF_ID = "tokenizer.ggml.fim_suf_token_id" - FIM_MID_ID = "tokenizer.ggml.fim_mid_token_id" - FIM_PAD_ID = "tokenizer.ggml.fim_pad_token_id" - FIM_REP_ID = "tokenizer.ggml.fim_rep_token_id" - FIM_SEP_ID = 
"tokenizer.ggml.fim_sep_token_id" + FIM_PRE_ID = "tokenizer.ggml.fim_pre_token_id" + FIM_SUF_ID = "tokenizer.ggml.fim_suf_token_id" + FIM_MID_ID = "tokenizer.ggml.fim_mid_token_id" + FIM_PAD_ID = "tokenizer.ggml.fim_pad_token_id" + FIM_REP_ID = "tokenizer.ggml.fim_rep_token_id" + FIM_SEP_ID = "tokenizer.ggml.fim_sep_token_id" # deprecated: - PREFIX_ID = "tokenizer.ggml.prefix_token_id" - SUFFIX_ID = "tokenizer.ggml.suffix_token_id" - MIDDLE_ID = "tokenizer.ggml.middle_token_id" + PREFIX_ID = "tokenizer.ggml.prefix_token_id" + SUFFIX_ID = "tokenizer.ggml.suffix_token_id" + MIDDLE_ID = "tokenizer.ggml.middle_token_id" class Adapter: - TYPE = "adapter.type" - LORA_ALPHA = "adapter.lora.alpha" - LORA_TASK_NAME = "adapter.lora.task_name" - LORA_PROMPT_PREFIX = "adapter.lora.prompt_prefix" + TYPE = "adapter.type" + LORA_ALPHA = "adapter.lora.alpha" + LORA_TASK_NAME = "adapter.lora.task_name" + LORA_PROMPT_PREFIX = "adapter.lora.prompt_prefix" ALORA_INVOCATION_TOKENS = "adapter.alora.invocation_tokens" class IMatrix: CHUNK_COUNT = "imatrix.chunk_count" - CHUNK_SIZE = "imatrix.chunk_size" - DATASETS = "imatrix.datasets" + CHUNK_SIZE = "imatrix.chunk_size" + DATASETS = "imatrix.datasets" class Clip: - PROJECTOR_TYPE = "clip.projector_type" - HAS_VISION_ENCODER = "clip.has_vision_encoder" - HAS_AUDIO_ENCODER = "clip.has_audio_encoder" - HAS_LLAVA_PROJECTOR = "clip.has_llava_projector" + PROJECTOR_TYPE = "clip.projector_type" + HAS_VISION_ENCODER = "clip.has_vision_encoder" + HAS_AUDIO_ENCODER = "clip.has_audio_encoder" + HAS_LLAVA_PROJECTOR = "clip.has_llava_projector" class ClipVision: - PROJECTOR_TYPE = "clip.vision.projector_type" # for mixed modality models - IMAGE_SIZE = "clip.vision.image_size" - IMAGE_MIN_PIXELS = "clip.vision.image_min_pixels" - IMAGE_MAX_PIXELS = "clip.vision.image_max_pixels" - PREPROC_MIN_TILES = "clip.vision.preproc_min_tiles" - PREPROC_MAX_TILES = "clip.vision.preproc_max_tiles" - PREPROC_IMAGE_SIZE = "clip.vision.preproc_image_size" - PATCH_SIZE = "clip.vision.patch_size" - EMBEDDING_LENGTH = "clip.vision.embedding_length" + PROJECTOR_TYPE = "clip.vision.projector_type" # for mixed modality models + IMAGE_SIZE = "clip.vision.image_size" + IMAGE_MIN_PIXELS = "clip.vision.image_min_pixels" + IMAGE_MAX_PIXELS = "clip.vision.image_max_pixels" + PREPROC_MIN_TILES = "clip.vision.preproc_min_tiles" + PREPROC_MAX_TILES = "clip.vision.preproc_max_tiles" + PREPROC_IMAGE_SIZE = "clip.vision.preproc_image_size" + PATCH_SIZE = "clip.vision.patch_size" + EMBEDDING_LENGTH = "clip.vision.embedding_length" FEED_FORWARD_LENGTH = "clip.vision.feed_forward_length" - PROJECTION_DIM = "clip.vision.projection_dim" - BLOCK_COUNT = "clip.vision.block_count" - IMAGE_MEAN = "clip.vision.image_mean" - IMAGE_STD = "clip.vision.image_std" - SPATIAL_MERGE_SIZE = "clip.vision.spatial_merge_size" - USE_GELU = "clip.use_gelu" - USE_SILU = "clip.use_silu" - N_WA_PATTERN = "clip.vision.n_wa_pattern" # used by qwen2.5vl - WA_LAYER_INDEXES = "clip.vision.wa_layer_indexes" # used by youtuvl + PROJECTION_DIM = "clip.vision.projection_dim" + BLOCK_COUNT = "clip.vision.block_count" + IMAGE_MEAN = "clip.vision.image_mean" + IMAGE_STD = "clip.vision.image_std" + SPATIAL_MERGE_SIZE = "clip.vision.spatial_merge_size" + USE_GELU = "clip.use_gelu" + USE_SILU = "clip.use_silu" + N_WA_PATTERN = "clip.vision.n_wa_pattern" # used by qwen2.5vl + WA_LAYER_INDEXES = "clip.vision.wa_layer_indexes" # used by youtuvl IS_DEEPSTACK_LAYERS = "clip.vision.is_deepstack_layers" - WINDOW_SIZE = "clip.vision.window_size" + 
WINDOW_SIZE = "clip.vision.window_size" class Attention: - HEAD_COUNT = "clip.vision.attention.head_count" - LAYERNORM_EPS = "clip.vision.attention.layer_norm_epsilon" + HEAD_COUNT = "clip.vision.attention.head_count" + LAYERNORM_EPS = "clip.vision.attention.layer_norm_epsilon" class Projector: - SCALE_FACTOR = "clip.vision.projector.scale_factor" + SCALE_FACTOR = "clip.vision.projector.scale_factor" class SAM: - BLOCK_COUNT = "clip.vision.sam.block_count" - EMBEDDING_LENGTH = "clip.vision.sam.embedding_length" - HEAD_COUNT = "clip.vision.sam.head_count" + BLOCK_COUNT = "clip.vision.sam.block_count" + EMBEDDING_LENGTH = "clip.vision.sam.embedding_length" + HEAD_COUNT = "clip.vision.sam.head_count" class ClipAudio: - PROJECTOR_TYPE = "clip.audio.projector_type" # for mixed modality models - NUM_MEL_BINS = "clip.audio.num_mel_bins" - EMBEDDING_LENGTH = "clip.audio.embedding_length" + PROJECTOR_TYPE = "clip.audio.projector_type" # for mixed modality models + NUM_MEL_BINS = "clip.audio.num_mel_bins" + EMBEDDING_LENGTH = "clip.audio.embedding_length" FEED_FORWARD_LENGTH = "clip.audio.feed_forward_length" - PROJECTION_DIM = "clip.audio.projection_dim" - BLOCK_COUNT = "clip.audio.block_count" + PROJECTION_DIM = "clip.audio.projection_dim" + BLOCK_COUNT = "clip.audio.block_count" class Attention: - HEAD_COUNT = "clip.audio.attention.head_count" - LAYERNORM_EPS = "clip.audio.attention.layer_norm_epsilon" + HEAD_COUNT = "clip.audio.attention.head_count" + LAYERNORM_EPS = "clip.audio.attention.layer_norm_epsilon" class Projector: - STACK_FACTOR = "clip.audio.projector.stack_factor" + STACK_FACTOR = "clip.audio.projector.stack_factor" class Diffusion: - SHIFT_LOGITS = "diffusion.shift_logits" + SHIFT_LOGITS = "diffusion.shift_logits" class xIELU: - ALPHA_P = "xielu.alpha_p" - ALPHA_N = "xielu.alpha_n" - BETA = "xielu.beta" - EPS = "xielu.eps" + ALPHA_P = "xielu.alpha_p" + ALPHA_N = "xielu.alpha_n" + BETA = "xielu.beta" + EPS = "xielu.eps" # @@ -362,981 +368,981 @@ class Keys: class GGUFType: - MODEL = "model" + MODEL = "model" ADAPTER = "adapter" IMATRIX = "imatrix" - MMPROJ = "mmproj" # dummy, unused for now + MMPROJ = "mmproj" # dummy, unused for now class MODEL_ARCH(IntEnum): - MMPROJ = auto() # dummy arch for clip.cpp - LLAMA = auto() - LLAMA4 = auto() - DECI = auto() - FALCON = auto() - FALCON_H1 = auto() - BAICHUAN = auto() - GROK = auto() - GPT2 = auto() - GPTJ = auto() - GPTNEOX = auto() - MPT = auto() - STARCODER = auto() - REFACT = auto() - BERT = auto() - MODERN_BERT = auto() - NOMIC_BERT = auto() - NOMIC_BERT_MOE = auto() - NEO_BERT = auto() - JINA_BERT_V2 = auto() - JINA_BERT_V3 = auto() - EUROBERT = auto() - BLOOM = auto() - STABLELM = auto() - QWEN = auto() - QWEN2 = auto() - QWEN2MOE = auto() - QWEN2VL = auto() - QWEN3 = auto() - QWEN3MOE = auto() - QWEN3NEXT = auto() - QWEN3VL = auto() - QWEN3VLMOE = auto() - QWEN35 = auto() - QWEN35MOE = auto() - PHI2 = auto() - PHI3 = auto() - PHIMOE = auto() - PLAMO = auto() - PLAMO2 = auto() - PLAMO3 = auto() - CODESHELL = auto() - ORION = auto() - INTERNLM2 = auto() - MINICPM = auto() - MINICPM3 = auto() - GEMMA = auto() - GEMMA2 = auto() - GEMMA3 = auto() - GEMMA3N = auto() - GEMMA4 = auto() - GEMMA_EMBEDDING = auto() - STARCODER2 = auto() - RWKV6 = auto() - RWKV6QWEN2 = auto() - RWKV7 = auto() - ARWKV7 = auto() - MAMBA = auto() - MAMBA2 = auto() - JAMBA = auto() - XVERSE = auto() - COMMAND_R = auto() - COHERE2 = auto() - DBRX = auto() - OLMO = auto() - OLMO2 = auto() - OLMOE = auto() - OPENELM = auto() - ARCTIC = auto() - DEEPSEEK = auto() - 
DEEPSEEK2 = auto() - DEEPSEEK2OCR = auto() - CHATGLM = auto() - GLM4 = auto() - GLM4_MOE = auto() - GLM_DSA = auto() - BITNET = auto() - T5 = auto() - T5ENCODER = auto() - JAIS = auto() - JAIS2 = auto() - NEMOTRON = auto() - NEMOTRON_H = auto() - NEMOTRON_H_MOE = auto() - EXAONE = auto() - EXAONE4 = auto() - EXAONE_MOE = auto() - GRANITE = auto() - GRANITE_MOE = auto() - GRANITE_HYBRID = auto() - CHAMELEON = auto() + MMPROJ = auto() # dummy arch for clip.cpp + LLAMA = auto() + LLAMA4 = auto() + DECI = auto() + FALCON = auto() + FALCON_H1 = auto() + BAICHUAN = auto() + GROK = auto() + GPT2 = auto() + GPTJ = auto() + GPTNEOX = auto() + MPT = auto() + STARCODER = auto() + REFACT = auto() + BERT = auto() + MODERN_BERT = auto() + NOMIC_BERT = auto() + NOMIC_BERT_MOE = auto() + NEO_BERT = auto() + JINA_BERT_V2 = auto() + JINA_BERT_V3 = auto() + EUROBERT = auto() + BLOOM = auto() + STABLELM = auto() + QWEN = auto() + QWEN2 = auto() + QWEN2MOE = auto() + QWEN2VL = auto() + QWEN3 = auto() + QWEN3MOE = auto() + QWEN3NEXT = auto() + QWEN3VL = auto() + QWEN3VLMOE = auto() + QWEN35 = auto() + QWEN35MOE = auto() + PHI2 = auto() + PHI3 = auto() + PHIMOE = auto() + PLAMO = auto() + PLAMO2 = auto() + PLAMO3 = auto() + CODESHELL = auto() + ORION = auto() + INTERNLM2 = auto() + MINICPM = auto() + MINICPM3 = auto() + GEMMA = auto() + GEMMA2 = auto() + GEMMA3 = auto() + GEMMA3N = auto() + GEMMA4 = auto() + GEMMA_EMBEDDING = auto() + STARCODER2 = auto() + RWKV6 = auto() + RWKV6QWEN2 = auto() + RWKV7 = auto() + ARWKV7 = auto() + MAMBA = auto() + MAMBA2 = auto() + JAMBA = auto() + XVERSE = auto() + COMMAND_R = auto() + COHERE2 = auto() + DBRX = auto() + OLMO = auto() + OLMO2 = auto() + OLMOE = auto() + OPENELM = auto() + ARCTIC = auto() + DEEPSEEK = auto() + DEEPSEEK2 = auto() + DEEPSEEK2OCR = auto() + CHATGLM = auto() + GLM4 = auto() + GLM4_MOE = auto() + GLM_DSA = auto() + BITNET = auto() + T5 = auto() + T5ENCODER = auto() + JAIS = auto() + JAIS2 = auto() + NEMOTRON = auto() + NEMOTRON_H = auto() + NEMOTRON_H_MOE = auto() + EXAONE = auto() + EXAONE4 = auto() + EXAONE_MOE = auto() + GRANITE = auto() + GRANITE_MOE = auto() + GRANITE_HYBRID = auto() + CHAMELEON = auto() WAVTOKENIZER_DEC = auto() - PLM = auto() - BAILINGMOE = auto() - BAILINGMOE2 = auto() - DOTS1 = auto() - ARCEE = auto() - AFMOE = auto() - ERNIE4_5 = auto() - ERNIE4_5_MOE = auto() - HUNYUAN_MOE = auto() - HUNYUAN_DENSE = auto() - SMOLLM3 = auto() - GPT_OSS = auto() - LFM2 = auto() - LFM2MOE = auto() - DREAM = auto() - SMALLTHINKER = auto() - LLADA = auto() - LLADA_MOE = auto() - SEED_OSS = auto() - GROVEMOE = auto() - APERTUS = auto() - COGVLM = auto() - MINIMAXM2 = auto() - RND1 = auto() - PANGU_EMBED = auto() - MISTRAL3 = auto() - MISTRAL4 = auto() - PADDLEOCR = auto() - MIMO2 = auto() - STEP35 = auto() - LLAMA_EMBED = auto() - MAINCODER = auto() - KIMI_LINEAR = auto() + PLM = auto() + BAILINGMOE = auto() + BAILINGMOE2 = auto() + DOTS1 = auto() + ARCEE = auto() + AFMOE = auto() + ERNIE4_5 = auto() + ERNIE4_5_MOE = auto() + HUNYUAN_MOE = auto() + HUNYUAN_DENSE = auto() + SMOLLM3 = auto() + GPT_OSS = auto() + LFM2 = auto() + LFM2MOE = auto() + DREAM = auto() + SMALLTHINKER = auto() + LLADA = auto() + LLADA_MOE = auto() + SEED_OSS = auto() + GROVEMOE = auto() + APERTUS = auto() + COGVLM = auto() + MINIMAXM2 = auto() + RND1 = auto() + PANGU_EMBED = auto() + MISTRAL3 = auto() + MISTRAL4 = auto() + PADDLEOCR = auto() + MIMO2 = auto() + STEP35 = auto() + LLAMA_EMBED = auto() + MAINCODER = auto() + KIMI_LINEAR = auto() class 
VISION_PROJECTOR_TYPE(IntEnum): - MLP = auto() - LDP = auto() - LDPV2 = auto() + MLP = auto() + LDP = auto() + LDPV2 = auto() RESAMPLER = auto() - GLM_EDGE = auto() - MERGER = auto() - GEMMA3N = auto() - GEMMA3 = auto() - QWEN3VL = auto() - STEP3VL = auto() - COGVLM = auto() + GLM_EDGE = auto() + MERGER = auto() + GEMMA3N = auto() + GEMMA3 = auto() + QWEN3VL = auto() + STEP3VL = auto() + COGVLM = auto() class MODEL_TENSOR(IntEnum): - TOKEN_EMBD = auto() - TOKEN_EMBD_NORM = auto() - TOKEN_TYPES = auto() - POS_EMBD = auto() - OUTPUT = auto() - DENSE_2_OUT = auto() # embeddinggemma 2_Dense - DENSE_3_OUT = auto() # embeddinggemma 3_Dense - OUTPUT_NORM = auto() - ROPE_FREQS = auto() - ROPE_FACTORS_LONG = auto() - ROPE_FACTORS_SHORT = auto() - ATTN_Q = auto() - ATTN_K = auto() - ATTN_V = auto() - ATTN_QKV = auto() - ATTN_OUT = auto() - ATTN_NORM = auto() - ATTN_NORM_2 = auto() - ATTN_OUT_NORM = auto() - ATTN_POST_NORM = auto() - ATTN_ROT_EMBD = auto() - ATTN_SINKS = auto() - ATTN_GATE = auto() - FFN_GATE_INP = auto() - FFN_GATE_INP_SHEXP = auto() - FFN_NORM = auto() - FFN_PRE_NORM = auto() # alias of FFN_NORM - FFN_PRE_NORM_2 = auto() # gemma4 - FFN_POST_NORM = auto() - FFN_POST_NORM_1 = auto() # gemma4 - FFN_POST_NORM_2 = auto() # gemma4 - FFN_GATE = auto() - FFN_DOWN = auto() - FFN_UP = auto() - FFN_ACT = auto() - FFN_NORM_EXP = auto() - FFN_GATE_EXP = auto() - FFN_DOWN_EXP = auto() - FFN_UP_EXP = auto() - FFN_GATE_UP_EXP = auto() - FFN_GATE_SHEXP = auto() - FFN_DOWN_SHEXP = auto() - FFN_UP_SHEXP = auto() - FFN_GATE_CHEXP = auto() - FFN_DOWN_CHEXP = auto() - FFN_UP_CHEXP = auto() - FFN_EXP_PROBS_B = auto() - MOE_LATENT_DOWN = auto() # nemotron 3 super - MOE_LATENT_UP = auto() # nemotron 3 super - ATTN_Q_NORM = auto() - ATTN_K_NORM = auto() - LAYER_OUT_NORM = auto() - LAYER_OUT_SCALE = auto() - PER_LAYER_TOKEN_EMBD = auto() # gemma3n - PER_LAYER_MODEL_PROJ = auto() # gemma3n - PER_LAYER_INP_GATE = auto() # gemma3n - PER_LAYER_PROJ = auto() # gemma3n - PER_LAYER_PROJ_NORM = auto() # gemma3n - PER_LAYER_POST_NORM = auto() # gemma3n - ALTUP_PROJ = auto() # gemma3n - ALTUP_UNEMBD_PROJ = auto() # gemma3n - ALTUP_CORRECT_COEF = auto() # gemma3n - ALTUP_CORRECT_SCALE = auto() # gemma3n - ALTUP_PREDICT_COEF = auto() # gemma3n - ALTUP_ROUTER = auto() # gemma3n - ALTUP_ROUTER_NORM = auto() # gemma3n - LAUREL_L = auto() # gemma3n - LAUREL_R = auto() # gemma3n - LAUREL_POST_NORM = auto() # gemma3n - SSM_IN = auto() - SSM_CONV1D = auto() - SSM_X = auto() - SSM_DT = auto() - SSM_DT_NORM = auto() - SSM_A = auto() - SSM_B_NORM = auto() - SSM_C_NORM = auto() - SSM_D = auto() - SSM_NORM = auto() - SSM_OUT = auto() - SSM_ALPHA = auto() # qwen3.5 - SSM_BETA_ALPHA = auto() # qwen3next - SSM_CONV1D_Q = auto() # Kimi Linear - SSM_CONV1D_K = auto() # Kimi Linear - SSM_CONV1D_V = auto() # Kimi Linear - SSM_F_A = auto() # Kimi Linear - SSM_F_B = auto() # Kimi Linear - SSM_BETA = auto() # Kimi Linear qwen3.5 - SSM_G_A = auto() # Kimi Linear - SSM_G_B = auto() # Kimi Linear - TIME_MIX_W0 = auto() - TIME_MIX_W1 = auto() - TIME_MIX_W2 = auto() - TIME_MIX_A0 = auto() - TIME_MIX_A1 = auto() - TIME_MIX_A2 = auto() - TIME_MIX_V0 = auto() - TIME_MIX_V1 = auto() - TIME_MIX_V2 = auto() - TIME_MIX_G1 = auto() - TIME_MIX_G2 = auto() - TIME_MIX_K_K = auto() - TIME_MIX_K_A = auto() - TIME_MIX_R_K = auto() - TIME_MIX_LERP_X = auto() - TIME_MIX_LERP_K = auto() - TIME_MIX_LERP_V = auto() - TIME_MIX_LERP_R = auto() - TIME_MIX_LERP_G = auto() - TIME_MIX_LERP_FUSED = auto() - TIME_MIX_LERP_W = auto() - TIME_MIX_FIRST = auto() - 
TIME_MIX_DECAY = auto() - TIME_MIX_DECAY_W1 = auto() - TIME_MIX_DECAY_W2 = auto() - TIME_MIX_KEY = auto() - TIME_MIX_VALUE = auto() - TIME_MIX_RECEPTANCE = auto() - TIME_MIX_GATE = auto() - TIME_MIX_LN = auto() - TIME_MIX_OUTPUT = auto() - CHANNEL_MIX_LERP_K = auto() - CHANNEL_MIX_LERP_R = auto() - CHANNEL_MIX_KEY = auto() + TOKEN_EMBD = auto() + TOKEN_EMBD_NORM = auto() + TOKEN_TYPES = auto() + POS_EMBD = auto() + OUTPUT = auto() + DENSE_2_OUT = auto() # embeddinggemma 2_Dense + DENSE_3_OUT = auto() # embeddinggemma 3_Dense + OUTPUT_NORM = auto() + ROPE_FREQS = auto() + ROPE_FACTORS_LONG = auto() + ROPE_FACTORS_SHORT = auto() + ATTN_Q = auto() + ATTN_K = auto() + ATTN_V = auto() + ATTN_QKV = auto() + ATTN_OUT = auto() + ATTN_NORM = auto() + ATTN_NORM_2 = auto() + ATTN_OUT_NORM = auto() + ATTN_POST_NORM = auto() + ATTN_ROT_EMBD = auto() + ATTN_SINKS = auto() + ATTN_GATE = auto() + FFN_GATE_INP = auto() + FFN_GATE_INP_SHEXP = auto() + FFN_NORM = auto() + FFN_PRE_NORM = auto() # alias of FFN_NORM + FFN_PRE_NORM_2 = auto() # gemma4 + FFN_POST_NORM = auto() + FFN_POST_NORM_1 = auto() # gemma4 + FFN_POST_NORM_2 = auto() # gemma4 + FFN_GATE = auto() + FFN_DOWN = auto() + FFN_UP = auto() + FFN_ACT = auto() + FFN_NORM_EXP = auto() + FFN_GATE_EXP = auto() + FFN_DOWN_EXP = auto() + FFN_UP_EXP = auto() + FFN_GATE_UP_EXP = auto() + FFN_GATE_SHEXP = auto() + FFN_DOWN_SHEXP = auto() + FFN_UP_SHEXP = auto() + FFN_GATE_CHEXP = auto() + FFN_DOWN_CHEXP = auto() + FFN_UP_CHEXP = auto() + FFN_EXP_PROBS_B = auto() + MOE_LATENT_DOWN = auto() # nemotron 3 super + MOE_LATENT_UP = auto() # nemotron 3 super + ATTN_Q_NORM = auto() + ATTN_K_NORM = auto() + LAYER_OUT_NORM = auto() + LAYER_OUT_SCALE = auto() + PER_LAYER_TOKEN_EMBD = auto() # gemma3n + PER_LAYER_MODEL_PROJ = auto() # gemma3n + PER_LAYER_INP_GATE = auto() # gemma3n + PER_LAYER_PROJ = auto() # gemma3n + PER_LAYER_PROJ_NORM = auto() # gemma3n + PER_LAYER_POST_NORM = auto() # gemma3n + ALTUP_PROJ = auto() # gemma3n + ALTUP_UNEMBD_PROJ = auto() # gemma3n + ALTUP_CORRECT_COEF = auto() # gemma3n + ALTUP_CORRECT_SCALE = auto() # gemma3n + ALTUP_PREDICT_COEF = auto() # gemma3n + ALTUP_ROUTER = auto() # gemma3n + ALTUP_ROUTER_NORM = auto() # gemma3n + LAUREL_L = auto() # gemma3n + LAUREL_R = auto() # gemma3n + LAUREL_POST_NORM = auto() # gemma3n + SSM_IN = auto() + SSM_CONV1D = auto() + SSM_X = auto() + SSM_DT = auto() + SSM_DT_NORM = auto() + SSM_A = auto() + SSM_B_NORM = auto() + SSM_C_NORM = auto() + SSM_D = auto() + SSM_NORM = auto() + SSM_OUT = auto() + SSM_ALPHA = auto() # qwen3.5 + SSM_BETA_ALPHA = auto() # qwen3next + SSM_CONV1D_Q = auto() # Kimi Linear + SSM_CONV1D_K = auto() # Kimi Linear + SSM_CONV1D_V = auto() # Kimi Linear + SSM_F_A = auto() # Kimi Linear + SSM_F_B = auto() # Kimi Linear + SSM_BETA = auto() # Kimi Linear qwen3.5 + SSM_G_A = auto() # Kimi Linear + SSM_G_B = auto() # Kimi Linear + TIME_MIX_W0 = auto() + TIME_MIX_W1 = auto() + TIME_MIX_W2 = auto() + TIME_MIX_A0 = auto() + TIME_MIX_A1 = auto() + TIME_MIX_A2 = auto() + TIME_MIX_V0 = auto() + TIME_MIX_V1 = auto() + TIME_MIX_V2 = auto() + TIME_MIX_G1 = auto() + TIME_MIX_G2 = auto() + TIME_MIX_K_K = auto() + TIME_MIX_K_A = auto() + TIME_MIX_R_K = auto() + TIME_MIX_LERP_X = auto() + TIME_MIX_LERP_K = auto() + TIME_MIX_LERP_V = auto() + TIME_MIX_LERP_R = auto() + TIME_MIX_LERP_G = auto() + TIME_MIX_LERP_FUSED = auto() + TIME_MIX_LERP_W = auto() + TIME_MIX_FIRST = auto() + TIME_MIX_DECAY = auto() + TIME_MIX_DECAY_W1 = auto() + TIME_MIX_DECAY_W2 = auto() + TIME_MIX_KEY = auto() + TIME_MIX_VALUE 
= auto() + TIME_MIX_RECEPTANCE = auto() + TIME_MIX_GATE = auto() + TIME_MIX_LN = auto() + TIME_MIX_OUTPUT = auto() + CHANNEL_MIX_LERP_K = auto() + CHANNEL_MIX_LERP_R = auto() + CHANNEL_MIX_KEY = auto() CHANNEL_MIX_RECEPTANCE = auto() - CHANNEL_MIX_VALUE = auto() - ATTN_Q_A = auto() - ATTN_Q_B = auto() - ATTN_KV_A_MQA = auto() - ATTN_KV_B = auto() - ATTN_K_B = auto() - ATTN_V_B = auto() - ATTN_Q_A_NORM = auto() - ATTN_KV_A_NORM = auto() - FFN_SUB_NORM = auto() - ATTN_SUB_NORM = auto() - DEC_ATTN_NORM = auto() - DEC_ATTN_Q = auto() - DEC_ATTN_K = auto() - DEC_ATTN_V = auto() - DEC_ATTN_OUT = auto() - DEC_ATTN_REL_B = auto() - DEC_CROSS_ATTN_NORM = auto() - DEC_CROSS_ATTN_Q = auto() - DEC_CROSS_ATTN_K = auto() - DEC_CROSS_ATTN_V = auto() - DEC_CROSS_ATTN_OUT = auto() + CHANNEL_MIX_VALUE = auto() + ATTN_Q_A = auto() + ATTN_Q_B = auto() + ATTN_KV_A_MQA = auto() + ATTN_KV_B = auto() + ATTN_K_B = auto() + ATTN_V_B = auto() + ATTN_Q_A_NORM = auto() + ATTN_KV_A_NORM = auto() + FFN_SUB_NORM = auto() + ATTN_SUB_NORM = auto() + DEC_ATTN_NORM = auto() + DEC_ATTN_Q = auto() + DEC_ATTN_K = auto() + DEC_ATTN_V = auto() + DEC_ATTN_OUT = auto() + DEC_ATTN_REL_B = auto() + DEC_CROSS_ATTN_NORM = auto() + DEC_CROSS_ATTN_Q = auto() + DEC_CROSS_ATTN_K = auto() + DEC_CROSS_ATTN_V = auto() + DEC_CROSS_ATTN_OUT = auto() DEC_CROSS_ATTN_REL_B = auto() - DEC_FFN_NORM = auto() - DEC_FFN_GATE = auto() - DEC_FFN_DOWN = auto() - DEC_FFN_UP = auto() - DEC_OUTPUT_NORM = auto() - ENC_ATTN_NORM = auto() - ENC_ATTN_Q = auto() - ENC_ATTN_K = auto() - ENC_ATTN_V = auto() - ENC_ATTN_OUT = auto() - ENC_ATTN_REL_B = auto() - ENC_FFN_NORM = auto() - ENC_FFN_GATE = auto() - ENC_FFN_DOWN = auto() - ENC_FFN_UP = auto() - ENC_OUTPUT_NORM = auto() - CLS = auto() # classifier - CLS_OUT = auto() # classifier output projection - CLS_NORM = auto() - CONV1D = auto() - CONVNEXT_DW = auto() - CONVNEXT_NORM = auto() - CONVNEXT_PW1 = auto() - CONVNEXT_PW2 = auto() - CONVNEXT_GAMMA = auto() - POSNET_CONV1 = auto() - POSNET_CONV2 = auto() - POSNET_NORM = auto() - POSNET_NORM1 = auto() - POSNET_NORM2 = auto() - POSNET_ATTN_NORM = auto() - POSNET_ATTN_Q = auto() - POSNET_ATTN_K = auto() - POSNET_ATTN_V = auto() - POSNET_ATTN_OUT = auto() - SHORTCONV_CONV = auto() - SHORTCONV_INPROJ = auto() - SHORTCONV_OUTPROJ = auto() - VISEXP_ATTN_QKV = auto() - VISEXP_ATTN_OUT = auto() - VISEXP_GATE = auto() - VISEXP_DOWN = auto() - VISEXP_UP = auto() - INDEXER_K_NORM = auto() - INDEXER_PROJ = auto() - INDEXER_ATTN_K = auto() - INDEXER_ATTN_Q_B = auto() + DEC_FFN_NORM = auto() + DEC_FFN_GATE = auto() + DEC_FFN_DOWN = auto() + DEC_FFN_UP = auto() + DEC_OUTPUT_NORM = auto() + ENC_ATTN_NORM = auto() + ENC_ATTN_Q = auto() + ENC_ATTN_K = auto() + ENC_ATTN_V = auto() + ENC_ATTN_OUT = auto() + ENC_ATTN_REL_B = auto() + ENC_FFN_NORM = auto() + ENC_FFN_GATE = auto() + ENC_FFN_DOWN = auto() + ENC_FFN_UP = auto() + ENC_OUTPUT_NORM = auto() + CLS = auto() # classifier + CLS_OUT = auto() # classifier output projection + CLS_NORM = auto() + CONV1D = auto() + CONVNEXT_DW = auto() + CONVNEXT_NORM = auto() + CONVNEXT_PW1 = auto() + CONVNEXT_PW2 = auto() + CONVNEXT_GAMMA = auto() + POSNET_CONV1 = auto() + POSNET_CONV2 = auto() + POSNET_NORM = auto() + POSNET_NORM1 = auto() + POSNET_NORM2 = auto() + POSNET_ATTN_NORM = auto() + POSNET_ATTN_Q = auto() + POSNET_ATTN_K = auto() + POSNET_ATTN_V = auto() + POSNET_ATTN_OUT = auto() + SHORTCONV_CONV = auto() + SHORTCONV_INPROJ = auto() + SHORTCONV_OUTPROJ = auto() + VISEXP_ATTN_QKV = auto() + VISEXP_ATTN_OUT = auto() + VISEXP_GATE = auto() 
+ VISEXP_DOWN = auto() + VISEXP_UP = auto() + INDEXER_K_NORM = auto() + INDEXER_PROJ = auto() + INDEXER_ATTN_K = auto() + INDEXER_ATTN_Q_B = auto() # vision - V_MMPROJ = auto() - V_MMPROJ_FC = auto() - V_MMPROJ_MLP = auto() - V_MMPROJ_PEG = auto() - V_ENC_EMBD_CLS = auto() - V_ENC_EMBD_PATCH = auto() - V_ENC_EMBD_NORM = auto() - V_ENC_EMBD_POS = auto() - V_ENC_INPUT_NORM = auto() - V_ENC_ATTN_QKV = auto() - V_ENC_ATTN_Q = auto() - V_ENC_ATTN_Q_NORM = auto() - V_ENC_ATTN_K = auto() - V_ENC_ATTN_K_NORM = auto() - V_ENC_ATTN_V = auto() - V_ENC_ATTN_O = auto() - V_ENC_ATTN_O_NORM = auto() + V_MMPROJ = auto() + V_MMPROJ_FC = auto() + V_MMPROJ_MLP = auto() + V_MMPROJ_PEG = auto() + V_ENC_EMBD_CLS = auto() + V_ENC_EMBD_PATCH = auto() + V_ENC_EMBD_NORM = auto() + V_ENC_EMBD_POS = auto() + V_ENC_INPUT_NORM = auto() + V_ENC_ATTN_QKV = auto() + V_ENC_ATTN_Q = auto() + V_ENC_ATTN_Q_NORM = auto() + V_ENC_ATTN_K = auto() + V_ENC_ATTN_K_NORM = auto() + V_ENC_ATTN_V = auto() + V_ENC_ATTN_O = auto() + V_ENC_ATTN_O_NORM = auto() V_ENC_POST_ATTN_NORM = auto() - V_ENC_FFN_UP = auto() - V_ENC_FFN_GATE = auto() - V_ENC_FFN_DOWN = auto() - V_ENC_ATTN_POST_NORM = auto() # gemma4 - V_ENC_FFN_POST_NORM = auto() - V_LAYER_SCALE_1 = auto() - V_LAYER_SCALE_2 = auto() - V_LAYER_OUT_SCALE = auto() - V_PRE_NORM = auto() - V_POST_NORM = auto() - V_MM_PRE_NORM = auto() # hunyuanocr - V_MM_POST_NORM = auto() - V_MM_INP_NORM = auto() - V_MM_INP_PROJ = auto() # gemma3 - V_MM_SOFT_EMB_NORM = auto() # gemma3 - V_MM_EMBEDDING = auto() # gemma3n - V_MM_HARD_EMB_NORM = auto() # gemma3n - V_ENC_CONV_STEM = auto() # gemma3n - V_ENC_CONV_STEM_NORM = auto() # gemma3n - V_ENC_MSFA_EXP = auto() # gemma3n - V_ENC_MSFA_EXP_NORM = auto() # gemma3n - V_ENC_MSFA_PROJ = auto() # gemma3n - V_ENC_MSFA_PROJ_NORM = auto() # gemma3n - V_ENC_MSFA_NORM = auto() # gemma3n - V_RESMPL_POS_EMBD_K = auto() # minicpmv - V_RESMPL_ATTN_Q = auto() # minicpmv - V_RESMPL_ATTN_K = auto() # minicpmv - V_RESMPL_ATTN_V = auto() # minicpmv - V_RESMPL_ATTN_OUT = auto() # minicpmv - V_RESMPL_KV = auto() # minicpmv - V_RESMPL_KV_NORM = auto() # minicpmv - V_RESMPL_POST_NORM = auto() # minicpmv - V_RESMPL_Q_NORM = auto() # minicpmv - V_RESMPL_PROJ = auto() # minicpmv - V_RESMPL_QUERY = auto() # minicpmv - V_TOK_EMBD_IMG_BREAK = auto() # pixtral - V_MM_PATCH_MERGER = auto() # mistral small 3.1 - V_DS_NORM = auto() # qwen3vl - V_DS_FC1 = auto() # qwen3vl - V_DS_FC2 = auto() # qwen3vl - V_MM_POST_FC_NORM = auto() # cogvlm - V_MM_UP = auto() # cogvlm - V_MM_DOWN = auto() # cogvlm - V_MM_GATE = auto() # cogvlm - V_TOK_BOI = auto() # cogvlm - V_TOK_EOI = auto() # cogvlm - V_TOK_IMG_BEGIN = auto() # hunyuanocr - V_TOK_IMG_END = auto() # hunyuanocr - V_STD_BIAS = auto() # gemma4 - V_STD_SCALE = auto() # gemma4 - V_SAM_POS_EMBD = auto() # Deepseek-OCR - V_SAM_PATCH_EMBD = auto() # Deepseek-OCR - V_SAM_PRE_NORM = auto() # Deepseek-OCR - V_SAM_POST_NORM = auto() # Deepseek-OCR - V_SAM_ATTN_POS_H = auto() # Deepseek-OCR - V_SAM_ATTN_POS_W = auto() # Deepseek-OCR - V_SAM_ATTN_QKV = auto() # Deepseek-OCR - V_SAM_ATTN_OUT = auto() # Deepseek-OCR - V_SAM_MLP_LIN_1 = auto() # Deepseek-OCR - V_SAM_MLP_LIN_2 = auto() # Deepseek-OCR - V_SAM_NECK = auto() # Deepseek-OCR - V_SAM_NET_2 = auto() # Deepseek-OCR - V_SAM_NET_3 = auto() # Deepseek-OCR - V_ENC_EMBD_IMGNL = auto() # Deepseek-OCR - V_ENC_EMBD_VSEP = auto() # Deepseek-OCR + V_ENC_FFN_UP = auto() + V_ENC_FFN_GATE = auto() + V_ENC_FFN_DOWN = auto() + V_ENC_ATTN_POST_NORM = auto() # gemma4 + V_ENC_FFN_POST_NORM = auto() + 
V_LAYER_SCALE_1 = auto() + V_LAYER_SCALE_2 = auto() + V_LAYER_OUT_SCALE = auto() + V_PRE_NORM = auto() + V_POST_NORM = auto() + V_MM_PRE_NORM = auto() # hunyuanocr + V_MM_POST_NORM = auto() + V_MM_INP_NORM = auto() + V_MM_INP_PROJ = auto() # gemma3 + V_MM_SOFT_EMB_NORM = auto() # gemma3 + V_MM_EMBEDDING = auto() # gemma3n + V_MM_HARD_EMB_NORM = auto() # gemma3n + V_ENC_CONV_STEM = auto() # gemma3n + V_ENC_CONV_STEM_NORM = auto() # gemma3n + V_ENC_MSFA_EXP = auto() # gemma3n + V_ENC_MSFA_EXP_NORM = auto() # gemma3n + V_ENC_MSFA_PROJ = auto() # gemma3n + V_ENC_MSFA_PROJ_NORM = auto() # gemma3n + V_ENC_MSFA_NORM = auto() # gemma3n + V_RESMPL_POS_EMBD_K = auto() # minicpmv + V_RESMPL_ATTN_Q = auto() # minicpmv + V_RESMPL_ATTN_K = auto() # minicpmv + V_RESMPL_ATTN_V = auto() # minicpmv + V_RESMPL_ATTN_OUT = auto() # minicpmv + V_RESMPL_KV = auto() # minicpmv + V_RESMPL_KV_NORM = auto() # minicpmv + V_RESMPL_POST_NORM = auto() # minicpmv + V_RESMPL_Q_NORM = auto() # minicpmv + V_RESMPL_PROJ = auto() # minicpmv + V_RESMPL_QUERY = auto() # minicpmv + V_TOK_EMBD_IMG_BREAK = auto() # pixtral + V_MM_PATCH_MERGER = auto() # mistral small 3.1 + V_DS_NORM = auto() # qwen3vl + V_DS_FC1 = auto() # qwen3vl + V_DS_FC2 = auto() # qwen3vl + V_MM_POST_FC_NORM = auto() # cogvlm + V_MM_UP = auto() # cogvlm + V_MM_DOWN = auto() # cogvlm + V_MM_GATE = auto() # cogvlm + V_TOK_BOI = auto() # cogvlm + V_TOK_EOI = auto() # cogvlm + V_TOK_IMG_BEGIN = auto() # hunyuanocr + V_TOK_IMG_END = auto() # hunyuanocr + V_STD_BIAS = auto() # gemma4 + V_STD_SCALE = auto() # gemma4 + V_SAM_POS_EMBD = auto() # Deepseek-OCR + V_SAM_PATCH_EMBD = auto() # Deepseek-OCR + V_SAM_PRE_NORM = auto() # Deepseek-OCR + V_SAM_POST_NORM = auto() # Deepseek-OCR + V_SAM_ATTN_POS_H = auto() # Deepseek-OCR + V_SAM_ATTN_POS_W = auto() # Deepseek-OCR + V_SAM_ATTN_QKV = auto() # Deepseek-OCR + V_SAM_ATTN_OUT = auto() # Deepseek-OCR + V_SAM_MLP_LIN_1 = auto() # Deepseek-OCR + V_SAM_MLP_LIN_2 = auto() # Deepseek-OCR + V_SAM_NECK = auto() # Deepseek-OCR + V_SAM_NET_2 = auto() # Deepseek-OCR + V_SAM_NET_3 = auto() # Deepseek-OCR + V_ENC_EMBD_IMGNL = auto() # Deepseek-OCR + V_ENC_EMBD_VSEP = auto() # Deepseek-OCR # audio (mtmd) - A_ENC_EMBD_POS = auto() - A_ENC_EMBD_NORM = auto() - A_ENC_EMBD_TO_LOGITS = auto() # lfm2 - A_ENC_INP_PROJ = auto() # gemma4 - A_ENC_CONV1D = auto() - A_ENC_CONV1D_NORM = auto() # gemma3n - A_ENC_CONV2D = auto() - A_ENC_CONV_OUT = auto() - A_PRE_NORM = auto() - A_POST_NORM = auto() - A_ENC_LAYER_PRE_NORM = auto() # gemma3n - A_ENC_ATTN_Q = auto() - A_ENC_ATTN_K = auto() - A_ENC_ATTN_V = auto() - A_ENC_ATTN_POST_NORM = auto() - A_ENC_ATTN_PRE_NORM = auto() - A_ENC_ATTN_K_REL = auto() # gemma4 - A_ENC_PER_DIM_SCALE = auto() # gemma3n - A_ENC_INPUT_NORM = auto() - A_ENC_OUTPUT = auto() # TODO @ngxson: rename to ATTN_OUT - A_ENC_OUTPUT_NORM = auto() # TODO @ngxson: rename to ATTN_OUT - A_ENC_FFN_UP = auto() - A_ENC_FFN_NORM = auto() - A_ENC_FFN_POST_NORM = auto() # gemma3n - A_ENC_FFN_SCALE = auto() # gemma3n - A_ENC_FFN_GATE = auto() - A_ENC_FFN_DOWN = auto() - A_ENC_FFN_UP_1 = auto() # lfm2, gemma3n - A_ENC_FFN_NORM_1 = auto() # lfm2, gemma3n (pre-norm) - A_ENC_FFN_POST_NORM_1 = auto() # gemma3n - A_ENC_FFN_SCALE_1 = auto() # gemma3n - A_ENC_FFN_GATE_1 = auto() # lfm2, gemma3n - A_ENC_FFN_DOWN_1 = auto() # lfm2, gemma3n - A_MMPROJ = auto() - A_MMPROJ_FC = auto() - A_MM_NORM_PRE = auto() - A_MM_NORM_MID = auto() - A_MM_EMBEDDING = auto() # gemma3n - A_MM_HARD_EMB_NORM = auto() # gemma3n - A_MM_SOFT_EMB_NORM = auto() # gemma3n - 
A_MM_INP_PROJ = auto() # gemma3n - A_PER_DIM_K_SCALE = auto() # gemma4 - A_PER_DIM_SCALE = auto() # gemma4 + A_ENC_EMBD_POS = auto() + A_ENC_EMBD_NORM = auto() + A_ENC_EMBD_TO_LOGITS = auto() # lfm2 + A_ENC_INP_PROJ = auto() # gemma4 + A_ENC_CONV1D = auto() + A_ENC_CONV1D_NORM = auto() # gemma3n + A_ENC_CONV2D = auto() + A_ENC_CONV_OUT = auto() + A_PRE_NORM = auto() + A_POST_NORM = auto() + A_ENC_LAYER_PRE_NORM = auto() # gemma3n + A_ENC_ATTN_Q = auto() + A_ENC_ATTN_K = auto() + A_ENC_ATTN_V = auto() + A_ENC_ATTN_POST_NORM = auto() + A_ENC_ATTN_PRE_NORM = auto() + A_ENC_ATTN_K_REL = auto() # gemma4 + A_ENC_PER_DIM_SCALE = auto() # gemma3n + A_ENC_INPUT_NORM = auto() + A_ENC_OUTPUT = auto() # TODO @ngxson: rename to ATTN_OUT + A_ENC_OUTPUT_NORM = auto() # TODO @ngxson: rename to ATTN_OUT + A_ENC_FFN_UP = auto() + A_ENC_FFN_NORM = auto() + A_ENC_FFN_POST_NORM = auto() # gemma3n + A_ENC_FFN_SCALE = auto() # gemma3n + A_ENC_FFN_GATE = auto() + A_ENC_FFN_DOWN = auto() + A_ENC_FFN_UP_1 = auto() # lfm2, gemma3n + A_ENC_FFN_NORM_1 = auto() # lfm2, gemma3n (pre-norm) + A_ENC_FFN_POST_NORM_1 = auto() # gemma3n + A_ENC_FFN_SCALE_1 = auto() # gemma3n + A_ENC_FFN_GATE_1 = auto() # lfm2, gemma3n + A_ENC_FFN_DOWN_1 = auto() # lfm2, gemma3n + A_MMPROJ = auto() + A_MMPROJ_FC = auto() + A_MM_NORM_PRE = auto() + A_MM_NORM_MID = auto() + A_MM_EMBEDDING = auto() # gemma3n + A_MM_HARD_EMB_NORM = auto() # gemma3n + A_MM_SOFT_EMB_NORM = auto() # gemma3n + A_MM_INP_PROJ = auto() # gemma3n + A_PER_DIM_K_SCALE = auto() # gemma4 + A_PER_DIM_SCALE = auto() # gemma4 # nextn/mtp - NEXTN_EH_PROJ = auto() - NEXTN_EMBED_TOKENS = auto() - NEXTN_ENORM = auto() - NEXTN_HNORM = auto() + NEXTN_EH_PROJ = auto() + NEXTN_EMBED_TOKENS = auto() + NEXTN_ENORM = auto() + NEXTN_HNORM = auto() NEXTN_SHARED_HEAD_HEAD = auto() NEXTN_SHARED_HEAD_NORM = auto() # lfm2 audio - A_ENC_NORM_CONV = auto() - A_ENC_LINEAR_POS = auto() - A_ENC_POS_BIAS_U = auto() - A_ENC_POS_BIAS_V = auto() - A_ENC_OUT = auto() - A_ENC_CONV_DW = auto() # SSM conv - A_ENC_CONV_NORM = auto() # SSM conv - A_ENC_CONV_PW1 = auto() - A_ENC_CONV_PW2 = auto() + A_ENC_NORM_CONV = auto() + A_ENC_LINEAR_POS = auto() + A_ENC_POS_BIAS_U = auto() + A_ENC_POS_BIAS_V = auto() + A_ENC_OUT = auto() + A_ENC_CONV_DW = auto() # SSM conv + A_ENC_CONV_NORM = auto() # SSM conv + A_ENC_CONV_PW1 = auto() + A_ENC_CONV_PW2 = auto() MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { - MODEL_ARCH.MMPROJ: "clip", # dummy arch for clip.cpp - MODEL_ARCH.LLAMA: "llama", - MODEL_ARCH.LLAMA4: "llama4", - MODEL_ARCH.DECI: "deci", - MODEL_ARCH.FALCON: "falcon", - MODEL_ARCH.BAICHUAN: "baichuan", - MODEL_ARCH.GROK: "grok", - MODEL_ARCH.GPT2: "gpt2", - MODEL_ARCH.GPTJ: "gptj", - MODEL_ARCH.GPTNEOX: "gptneox", - MODEL_ARCH.MPT: "mpt", - MODEL_ARCH.STARCODER: "starcoder", - MODEL_ARCH.REFACT: "refact", - MODEL_ARCH.BERT: "bert", - MODEL_ARCH.MODERN_BERT: "modern-bert", - MODEL_ARCH.NOMIC_BERT: "nomic-bert", - MODEL_ARCH.NOMIC_BERT_MOE: "nomic-bert-moe", - MODEL_ARCH.NEO_BERT: "neo-bert", - MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2", - MODEL_ARCH.JINA_BERT_V3: "jina-bert-v3", - MODEL_ARCH.EUROBERT: "eurobert", - MODEL_ARCH.BLOOM: "bloom", - MODEL_ARCH.STABLELM: "stablelm", - MODEL_ARCH.QWEN: "qwen", - MODEL_ARCH.QWEN2: "qwen2", - MODEL_ARCH.QWEN2MOE: "qwen2moe", - MODEL_ARCH.QWEN2VL: "qwen2vl", - MODEL_ARCH.QWEN3: "qwen3", - MODEL_ARCH.QWEN3MOE: "qwen3moe", - MODEL_ARCH.QWEN3NEXT: "qwen3next", - MODEL_ARCH.QWEN3VL: "qwen3vl", - MODEL_ARCH.QWEN3VLMOE: "qwen3vlmoe", - MODEL_ARCH.QWEN35: "qwen35", - MODEL_ARCH.QWEN35MOE: 
"qwen35moe", - MODEL_ARCH.PHI2: "phi2", - MODEL_ARCH.PHI3: "phi3", - MODEL_ARCH.PHIMOE: "phimoe", - MODEL_ARCH.PLAMO: "plamo", - MODEL_ARCH.PLAMO2: "plamo2", - MODEL_ARCH.PLAMO3: "plamo3", - MODEL_ARCH.CODESHELL: "codeshell", - MODEL_ARCH.ORION: "orion", - MODEL_ARCH.INTERNLM2: "internlm2", - MODEL_ARCH.MINICPM: "minicpm", - MODEL_ARCH.MINICPM3: "minicpm3", - MODEL_ARCH.GEMMA: "gemma", - MODEL_ARCH.GEMMA2: "gemma2", - MODEL_ARCH.GEMMA3: "gemma3", - MODEL_ARCH.GEMMA3N: "gemma3n", - MODEL_ARCH.GEMMA4: "gemma4", - MODEL_ARCH.GEMMA_EMBEDDING: "gemma-embedding", - MODEL_ARCH.STARCODER2: "starcoder2", - MODEL_ARCH.RWKV6: "rwkv6", - MODEL_ARCH.RWKV6QWEN2: "rwkv6qwen2", - MODEL_ARCH.RWKV7: "rwkv7", - MODEL_ARCH.ARWKV7: "arwkv7", - MODEL_ARCH.MAMBA: "mamba", - MODEL_ARCH.MAMBA2: "mamba2", - MODEL_ARCH.JAMBA: "jamba", - MODEL_ARCH.XVERSE: "xverse", - MODEL_ARCH.COMMAND_R: "command-r", - MODEL_ARCH.COHERE2: "cohere2", - MODEL_ARCH.DBRX: "dbrx", - MODEL_ARCH.OLMO: "olmo", - MODEL_ARCH.OLMO2: "olmo2", - MODEL_ARCH.OLMOE: "olmoe", - MODEL_ARCH.OPENELM: "openelm", - MODEL_ARCH.ARCTIC: "arctic", - MODEL_ARCH.DEEPSEEK: "deepseek", - MODEL_ARCH.DEEPSEEK2: "deepseek2", - MODEL_ARCH.DEEPSEEK2OCR: "deepseek2-ocr", - MODEL_ARCH.CHATGLM: "chatglm", - MODEL_ARCH.GLM4: "glm4", - MODEL_ARCH.GLM4_MOE: "glm4moe", - MODEL_ARCH.GLM_DSA: "glm-dsa", - MODEL_ARCH.BITNET: "bitnet", - MODEL_ARCH.T5: "t5", - MODEL_ARCH.T5ENCODER: "t5encoder", - MODEL_ARCH.JAIS: "jais", - MODEL_ARCH.JAIS2: "jais2", - MODEL_ARCH.NEMOTRON: "nemotron", - MODEL_ARCH.NEMOTRON_H: "nemotron_h", - MODEL_ARCH.NEMOTRON_H_MOE: "nemotron_h_moe", - MODEL_ARCH.EXAONE: "exaone", - MODEL_ARCH.EXAONE4: "exaone4", - MODEL_ARCH.EXAONE_MOE: "exaone-moe", - MODEL_ARCH.GRANITE: "granite", - MODEL_ARCH.GRANITE_MOE: "granitemoe", - MODEL_ARCH.GRANITE_HYBRID: "granitehybrid", - MODEL_ARCH.CHAMELEON: "chameleon", + MODEL_ARCH.MMPROJ: "clip", # dummy arch for clip.cpp + MODEL_ARCH.LLAMA: "llama", + MODEL_ARCH.LLAMA4: "llama4", + MODEL_ARCH.DECI: "deci", + MODEL_ARCH.FALCON: "falcon", + MODEL_ARCH.BAICHUAN: "baichuan", + MODEL_ARCH.GROK: "grok", + MODEL_ARCH.GPT2: "gpt2", + MODEL_ARCH.GPTJ: "gptj", + MODEL_ARCH.GPTNEOX: "gptneox", + MODEL_ARCH.MPT: "mpt", + MODEL_ARCH.STARCODER: "starcoder", + MODEL_ARCH.REFACT: "refact", + MODEL_ARCH.BERT: "bert", + MODEL_ARCH.MODERN_BERT: "modern-bert", + MODEL_ARCH.NOMIC_BERT: "nomic-bert", + MODEL_ARCH.NOMIC_BERT_MOE: "nomic-bert-moe", + MODEL_ARCH.NEO_BERT: "neo-bert", + MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2", + MODEL_ARCH.JINA_BERT_V3: "jina-bert-v3", + MODEL_ARCH.EUROBERT: "eurobert", + MODEL_ARCH.BLOOM: "bloom", + MODEL_ARCH.STABLELM: "stablelm", + MODEL_ARCH.QWEN: "qwen", + MODEL_ARCH.QWEN2: "qwen2", + MODEL_ARCH.QWEN2MOE: "qwen2moe", + MODEL_ARCH.QWEN2VL: "qwen2vl", + MODEL_ARCH.QWEN3: "qwen3", + MODEL_ARCH.QWEN3MOE: "qwen3moe", + MODEL_ARCH.QWEN3NEXT: "qwen3next", + MODEL_ARCH.QWEN3VL: "qwen3vl", + MODEL_ARCH.QWEN3VLMOE: "qwen3vlmoe", + MODEL_ARCH.QWEN35: "qwen35", + MODEL_ARCH.QWEN35MOE: "qwen35moe", + MODEL_ARCH.PHI2: "phi2", + MODEL_ARCH.PHI3: "phi3", + MODEL_ARCH.PHIMOE: "phimoe", + MODEL_ARCH.PLAMO: "plamo", + MODEL_ARCH.PLAMO2: "plamo2", + MODEL_ARCH.PLAMO3: "plamo3", + MODEL_ARCH.CODESHELL: "codeshell", + MODEL_ARCH.ORION: "orion", + MODEL_ARCH.INTERNLM2: "internlm2", + MODEL_ARCH.MINICPM: "minicpm", + MODEL_ARCH.MINICPM3: "minicpm3", + MODEL_ARCH.GEMMA: "gemma", + MODEL_ARCH.GEMMA2: "gemma2", + MODEL_ARCH.GEMMA3: "gemma3", + MODEL_ARCH.GEMMA3N: "gemma3n", + MODEL_ARCH.GEMMA4: "gemma4", + MODEL_ARCH.GEMMA_EMBEDDING: 
"gemma-embedding", + MODEL_ARCH.STARCODER2: "starcoder2", + MODEL_ARCH.RWKV6: "rwkv6", + MODEL_ARCH.RWKV6QWEN2: "rwkv6qwen2", + MODEL_ARCH.RWKV7: "rwkv7", + MODEL_ARCH.ARWKV7: "arwkv7", + MODEL_ARCH.MAMBA: "mamba", + MODEL_ARCH.MAMBA2: "mamba2", + MODEL_ARCH.JAMBA: "jamba", + MODEL_ARCH.XVERSE: "xverse", + MODEL_ARCH.COMMAND_R: "command-r", + MODEL_ARCH.COHERE2: "cohere2", + MODEL_ARCH.DBRX: "dbrx", + MODEL_ARCH.OLMO: "olmo", + MODEL_ARCH.OLMO2: "olmo2", + MODEL_ARCH.OLMOE: "olmoe", + MODEL_ARCH.OPENELM: "openelm", + MODEL_ARCH.ARCTIC: "arctic", + MODEL_ARCH.DEEPSEEK: "deepseek", + MODEL_ARCH.DEEPSEEK2: "deepseek2", + MODEL_ARCH.DEEPSEEK2OCR: "deepseek2-ocr", + MODEL_ARCH.CHATGLM: "chatglm", + MODEL_ARCH.GLM4: "glm4", + MODEL_ARCH.GLM4_MOE: "glm4moe", + MODEL_ARCH.GLM_DSA: "glm-dsa", + MODEL_ARCH.BITNET: "bitnet", + MODEL_ARCH.T5: "t5", + MODEL_ARCH.T5ENCODER: "t5encoder", + MODEL_ARCH.JAIS: "jais", + MODEL_ARCH.JAIS2: "jais2", + MODEL_ARCH.NEMOTRON: "nemotron", + MODEL_ARCH.NEMOTRON_H: "nemotron_h", + MODEL_ARCH.NEMOTRON_H_MOE: "nemotron_h_moe", + MODEL_ARCH.EXAONE: "exaone", + MODEL_ARCH.EXAONE4: "exaone4", + MODEL_ARCH.EXAONE_MOE: "exaone-moe", + MODEL_ARCH.GRANITE: "granite", + MODEL_ARCH.GRANITE_MOE: "granitemoe", + MODEL_ARCH.GRANITE_HYBRID: "granitehybrid", + MODEL_ARCH.CHAMELEON: "chameleon", MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec", - MODEL_ARCH.PLM: "plm", - MODEL_ARCH.BAILINGMOE: "bailingmoe", - MODEL_ARCH.BAILINGMOE2: "bailingmoe2", - MODEL_ARCH.DOTS1: "dots1", - MODEL_ARCH.ARCEE: "arcee", - MODEL_ARCH.AFMOE: "afmoe", - MODEL_ARCH.ERNIE4_5: "ernie4_5", - MODEL_ARCH.ERNIE4_5_MOE: "ernie4_5-moe", - MODEL_ARCH.FALCON_H1: "falcon-h1", - MODEL_ARCH.HUNYUAN_MOE: "hunyuan-moe", - MODEL_ARCH.HUNYUAN_DENSE: "hunyuan-dense", - MODEL_ARCH.SMOLLM3: "smollm3", - MODEL_ARCH.GPT_OSS: "gpt-oss", - MODEL_ARCH.LFM2: "lfm2", - MODEL_ARCH.LFM2MOE: "lfm2moe", - MODEL_ARCH.DREAM: "dream", - MODEL_ARCH.SMALLTHINKER: "smallthinker", - MODEL_ARCH.LLADA: "llada", - MODEL_ARCH.LLADA_MOE: "llada-moe", - MODEL_ARCH.SEED_OSS: "seed_oss", - MODEL_ARCH.GROVEMOE: "grovemoe", - MODEL_ARCH.APERTUS: "apertus", - MODEL_ARCH.MINIMAXM2: "minimax-m2", - MODEL_ARCH.COGVLM: "cogvlm", - MODEL_ARCH.RND1: "rnd1", - MODEL_ARCH.PANGU_EMBED: "pangu-embedded", - MODEL_ARCH.MISTRAL3: "mistral3", - MODEL_ARCH.MISTRAL4: "mistral4", - MODEL_ARCH.PADDLEOCR: "paddleocr", - MODEL_ARCH.MIMO2: "mimo2", - MODEL_ARCH.STEP35: "step35", - MODEL_ARCH.LLAMA_EMBED: "llama-embed", - MODEL_ARCH.MAINCODER: "maincoder", - MODEL_ARCH.KIMI_LINEAR: "kimi-linear", + MODEL_ARCH.PLM: "plm", + MODEL_ARCH.BAILINGMOE: "bailingmoe", + MODEL_ARCH.BAILINGMOE2: "bailingmoe2", + MODEL_ARCH.DOTS1: "dots1", + MODEL_ARCH.ARCEE: "arcee", + MODEL_ARCH.AFMOE: "afmoe", + MODEL_ARCH.ERNIE4_5: "ernie4_5", + MODEL_ARCH.ERNIE4_5_MOE: "ernie4_5-moe", + MODEL_ARCH.FALCON_H1: "falcon-h1", + MODEL_ARCH.HUNYUAN_MOE: "hunyuan-moe", + MODEL_ARCH.HUNYUAN_DENSE: "hunyuan-dense", + MODEL_ARCH.SMOLLM3: "smollm3", + MODEL_ARCH.GPT_OSS: "gpt-oss", + MODEL_ARCH.LFM2: "lfm2", + MODEL_ARCH.LFM2MOE: "lfm2moe", + MODEL_ARCH.DREAM: "dream", + MODEL_ARCH.SMALLTHINKER: "smallthinker", + MODEL_ARCH.LLADA: "llada", + MODEL_ARCH.LLADA_MOE: "llada-moe", + MODEL_ARCH.SEED_OSS: "seed_oss", + MODEL_ARCH.GROVEMOE: "grovemoe", + MODEL_ARCH.APERTUS: "apertus", + MODEL_ARCH.MINIMAXM2: "minimax-m2", + MODEL_ARCH.COGVLM: "cogvlm", + MODEL_ARCH.RND1: "rnd1", + MODEL_ARCH.PANGU_EMBED: "pangu-embedded", + MODEL_ARCH.MISTRAL3: "mistral3", + MODEL_ARCH.MISTRAL4: "mistral4", + MODEL_ARCH.PADDLEOCR: 
"paddleocr", + MODEL_ARCH.MIMO2: "mimo2", + MODEL_ARCH.STEP35: "step35", + MODEL_ARCH.LLAMA_EMBED: "llama-embed", + MODEL_ARCH.MAINCODER: "maincoder", + MODEL_ARCH.KIMI_LINEAR: "kimi-linear", } VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = { - VISION_PROJECTOR_TYPE.MLP: "mlp", - VISION_PROJECTOR_TYPE.LDP: "ldp", - VISION_PROJECTOR_TYPE.LDPV2: "ldpv2", + VISION_PROJECTOR_TYPE.MLP: "mlp", + VISION_PROJECTOR_TYPE.LDP: "ldp", + VISION_PROJECTOR_TYPE.LDPV2: "ldpv2", VISION_PROJECTOR_TYPE.RESAMPLER: "resampler", - VISION_PROJECTOR_TYPE.GLM_EDGE: "adapter", - VISION_PROJECTOR_TYPE.MERGER: "qwen2vl_merger", - VISION_PROJECTOR_TYPE.GEMMA3: "gemma3", - VISION_PROJECTOR_TYPE.QWEN3VL: "qwen3vl_merger", - VISION_PROJECTOR_TYPE.STEP3VL: "step3vl", + VISION_PROJECTOR_TYPE.GLM_EDGE: "adapter", + VISION_PROJECTOR_TYPE.MERGER: "qwen2vl_merger", + VISION_PROJECTOR_TYPE.GEMMA3: "gemma3", + VISION_PROJECTOR_TYPE.QWEN3VL: "qwen3vl_merger", + VISION_PROJECTOR_TYPE.STEP3VL: "step3vl", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { - MODEL_TENSOR.TOKEN_EMBD: "token_embd", - MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm", - MODEL_TENSOR.TOKEN_TYPES: "token_types", - MODEL_TENSOR.POS_EMBD: "position_embd", - MODEL_TENSOR.OUTPUT_NORM: "output_norm", - MODEL_TENSOR.OUTPUT: "output", - MODEL_TENSOR.DENSE_2_OUT: "dense_2", # embeddinggemma 2_Dense - MODEL_TENSOR.DENSE_3_OUT: "dense_3", # embeddinggemma 2_Dense - MODEL_TENSOR.ROPE_FREQS: "rope_freqs", - MODEL_TENSOR.ROPE_FACTORS_LONG: "rope_factors_long", - MODEL_TENSOR.ROPE_FACTORS_SHORT: "rope_factors_short", - MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", - MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2", - MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv", - MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q", - MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k", - MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v", - MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", - MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd", - MODEL_TENSOR.ATTN_SINKS: "blk.{bid}.attn_sinks", - MODEL_TENSOR.ATTN_GATE: "blk.{bid}.attn_gate", - MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm", - MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm", - MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm", - MODEL_TENSOR.ATTN_POST_NORM: "blk.{bid}.post_attention_norm", - MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp", - MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp", - MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", - MODEL_TENSOR.FFN_PRE_NORM: "blk.{bid}.ffn_norm", - MODEL_TENSOR.FFN_POST_NORM: "blk.{bid}.post_ffw_norm", - MODEL_TENSOR.FFN_PRE_NORM_2: "blk.{bid}.pre_ffw_norm_2", # gemma4 - MODEL_TENSOR.FFN_POST_NORM_1: "blk.{bid}.post_ffw_norm_1", # gemma4 - MODEL_TENSOR.FFN_POST_NORM_2: "blk.{bid}.post_ffw_norm_2", # gemma4 - MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", - MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", - MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", - MODEL_TENSOR.FFN_GATE_SHEXP: "blk.{bid}.ffn_gate_shexp", - MODEL_TENSOR.FFN_DOWN_SHEXP: "blk.{bid}.ffn_down_shexp", - MODEL_TENSOR.FFN_UP_SHEXP: "blk.{bid}.ffn_up_shexp", - MODEL_TENSOR.FFN_GATE_CHEXP: "blk.{bid}.ffn_gate_chexps", - MODEL_TENSOR.FFN_DOWN_CHEXP: "blk.{bid}.ffn_down_chexps", - MODEL_TENSOR.FFN_UP_CHEXP: "blk.{bid}.ffn_up_chexps", - MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn", - MODEL_TENSOR.FFN_NORM_EXP: "blk.{bid}.ffn_norm_exps", - MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps", - MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps", - MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps", - MODEL_TENSOR.FFN_GATE_UP_EXP: 
"blk.{bid}.ffn_gate_up_exps", - MODEL_TENSOR.FFN_EXP_PROBS_B: "blk.{bid}.exp_probs_b", - MODEL_TENSOR.MOE_LATENT_DOWN: "blk.{bid}.ffn_latent_down", # nemotron 3 super - MODEL_TENSOR.MOE_LATENT_UP: "blk.{bid}.ffn_latent_up", # nemotron 3 super - MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm", - MODEL_TENSOR.LAYER_OUT_SCALE: "blk.{bid}.layer_output_scale", - MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: "per_layer_token_embd", # gemma3n - MODEL_TENSOR.PER_LAYER_MODEL_PROJ: "per_layer_model_proj", # gemma3n - MODEL_TENSOR.PER_LAYER_PROJ_NORM: "per_layer_proj_norm", # gemma3n - MODEL_TENSOR.ALTUP_UNEMBD_PROJ: "altup_unembd_proj", # gemma3n - MODEL_TENSOR.ALTUP_PROJ: "altup_proj", # gemma3n - MODEL_TENSOR.PER_LAYER_INP_GATE: "blk.{bid}.inp_gate", # gemma3n - MODEL_TENSOR.PER_LAYER_PROJ: "blk.{bid}.proj", # gemma3n - MODEL_TENSOR.PER_LAYER_POST_NORM: "blk.{bid}.post_norm", # gemma3n - MODEL_TENSOR.ALTUP_CORRECT_COEF: "blk.{bid}.altup_correct_coef", # gemma3n - MODEL_TENSOR.ALTUP_CORRECT_SCALE: "blk.{bid}.altup_correct_scale", # gemma3n - MODEL_TENSOR.ALTUP_PREDICT_COEF: "blk.{bid}.altup_predict_coef", # gemma3n - MODEL_TENSOR.ALTUP_ROUTER: "blk.{bid}.altup_router", # gemma3n - MODEL_TENSOR.ALTUP_ROUTER_NORM: "blk.{bid}.altup_router_norm", # gemma3n - MODEL_TENSOR.LAUREL_L: "blk.{bid}.laurel_l", # gemma3n - MODEL_TENSOR.LAUREL_R: "blk.{bid}.laurel_r", # gemma3n - MODEL_TENSOR.LAUREL_POST_NORM: "blk.{bid}.laurel_post_norm", # gemma3n - MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in", - MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d", - MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x", - MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt", - MODEL_TENSOR.SSM_DT_NORM: "blk.{bid}.ssm_dt_norm", - MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a", - MODEL_TENSOR.SSM_B_NORM: "blk.{bid}.ssm_b_norm", - MODEL_TENSOR.SSM_C_NORM: "blk.{bid}.ssm_c_norm", - MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d", - MODEL_TENSOR.SSM_NORM: "blk.{bid}.ssm_norm", - MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out", - MODEL_TENSOR.SSM_ALPHA: "blk.{bid}.ssm_alpha", # qwen3.5 - MODEL_TENSOR.SSM_BETA_ALPHA: "blk.{bid}.ssm_ba", - MODEL_TENSOR.SSM_CONV1D_Q: "blk.{bid}.ssm_conv1d_q", # Kimi Linear - MODEL_TENSOR.SSM_CONV1D_K: "blk.{bid}.ssm_conv1d_k", # Kimi Linear - MODEL_TENSOR.SSM_CONV1D_V: "blk.{bid}.ssm_conv1d_v", # Kimi Linear - MODEL_TENSOR.SSM_F_A: "blk.{bid}.ssm_f_a", # Kimi Linear - MODEL_TENSOR.SSM_F_B: "blk.{bid}.ssm_f_b", # Kimi Linear - MODEL_TENSOR.SSM_BETA: "blk.{bid}.ssm_beta", # Kimi Linear qwen3.5 - MODEL_TENSOR.SSM_G_A: "blk.{bid}.ssm_g_a", # Kimi Linear - MODEL_TENSOR.SSM_G_B: "blk.{bid}.ssm_g_b", # Kimi Linear - MODEL_TENSOR.TIME_MIX_W0: "blk.{bid}.time_mix_w0", - MODEL_TENSOR.TIME_MIX_W1: "blk.{bid}.time_mix_w1", - MODEL_TENSOR.TIME_MIX_W2: "blk.{bid}.time_mix_w2", - MODEL_TENSOR.TIME_MIX_A0: "blk.{bid}.time_mix_a0", - MODEL_TENSOR.TIME_MIX_A1: "blk.{bid}.time_mix_a1", - MODEL_TENSOR.TIME_MIX_A2: "blk.{bid}.time_mix_a2", - MODEL_TENSOR.TIME_MIX_V0: "blk.{bid}.time_mix_v0", - MODEL_TENSOR.TIME_MIX_V1: "blk.{bid}.time_mix_v1", - MODEL_TENSOR.TIME_MIX_V2: "blk.{bid}.time_mix_v2", - MODEL_TENSOR.TIME_MIX_G1: "blk.{bid}.time_mix_g1", - MODEL_TENSOR.TIME_MIX_G2: "blk.{bid}.time_mix_g2", - MODEL_TENSOR.TIME_MIX_K_K: "blk.{bid}.time_mix_k_k", - MODEL_TENSOR.TIME_MIX_K_A: "blk.{bid}.time_mix_k_a", - MODEL_TENSOR.TIME_MIX_R_K: "blk.{bid}.time_mix_r_k", - MODEL_TENSOR.TIME_MIX_LERP_X: "blk.{bid}.time_mix_lerp_x", - MODEL_TENSOR.TIME_MIX_LERP_K: "blk.{bid}.time_mix_lerp_k", - MODEL_TENSOR.TIME_MIX_LERP_V: "blk.{bid}.time_mix_lerp_v", - MODEL_TENSOR.TIME_MIX_LERP_R: 
"blk.{bid}.time_mix_lerp_r", - MODEL_TENSOR.TIME_MIX_LERP_G: "blk.{bid}.time_mix_lerp_g", - MODEL_TENSOR.TIME_MIX_LERP_FUSED: "blk.{bid}.time_mix_lerp_fused", - MODEL_TENSOR.TIME_MIX_LERP_W: "blk.{bid}.time_mix_lerp_w", - MODEL_TENSOR.TIME_MIX_FIRST: "blk.{bid}.time_mix_first", - MODEL_TENSOR.TIME_MIX_DECAY: "blk.{bid}.time_mix_decay", - MODEL_TENSOR.TIME_MIX_DECAY_W1: "blk.{bid}.time_mix_decay_w1", - MODEL_TENSOR.TIME_MIX_DECAY_W2: "blk.{bid}.time_mix_decay_w2", - MODEL_TENSOR.TIME_MIX_KEY: "blk.{bid}.time_mix_key", - MODEL_TENSOR.TIME_MIX_VALUE: "blk.{bid}.time_mix_value", - MODEL_TENSOR.TIME_MIX_RECEPTANCE: "blk.{bid}.time_mix_receptance", - MODEL_TENSOR.TIME_MIX_GATE: "blk.{bid}.time_mix_gate", - MODEL_TENSOR.TIME_MIX_LN: "blk.{bid}.time_mix_ln", - MODEL_TENSOR.TIME_MIX_OUTPUT: "blk.{bid}.time_mix_output", - MODEL_TENSOR.CHANNEL_MIX_LERP_K: "blk.{bid}.channel_mix_lerp_k", - MODEL_TENSOR.CHANNEL_MIX_LERP_R: "blk.{bid}.channel_mix_lerp_r", - MODEL_TENSOR.CHANNEL_MIX_KEY: "blk.{bid}.channel_mix_key", - MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: "blk.{bid}.channel_mix_receptance", - MODEL_TENSOR.CHANNEL_MIX_VALUE: "blk.{bid}.channel_mix_value", - MODEL_TENSOR.ATTN_Q_A: "blk.{bid}.attn_q_a", - MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b", - MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa", - MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b", - MODEL_TENSOR.ATTN_K_B: "blk.{bid}.attn_k_b", - MODEL_TENSOR.ATTN_V_B: "blk.{bid}.attn_v_b", - MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm", - MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm", - MODEL_TENSOR.ATTN_SUB_NORM: "blk.{bid}.attn_sub_norm", - MODEL_TENSOR.FFN_SUB_NORM: "blk.{bid}.ffn_sub_norm", - MODEL_TENSOR.DEC_ATTN_NORM: "dec.blk.{bid}.attn_norm", - MODEL_TENSOR.DEC_ATTN_Q: "dec.blk.{bid}.attn_q", - MODEL_TENSOR.DEC_ATTN_K: "dec.blk.{bid}.attn_k", - MODEL_TENSOR.DEC_ATTN_V: "dec.blk.{bid}.attn_v", - MODEL_TENSOR.DEC_ATTN_OUT: "dec.blk.{bid}.attn_o", - MODEL_TENSOR.DEC_ATTN_REL_B: "dec.blk.{bid}.attn_rel_b", - MODEL_TENSOR.DEC_CROSS_ATTN_NORM: "dec.blk.{bid}.cross_attn_norm", - MODEL_TENSOR.DEC_CROSS_ATTN_Q: "dec.blk.{bid}.cross_attn_q", - MODEL_TENSOR.DEC_CROSS_ATTN_K: "dec.blk.{bid}.cross_attn_k", - MODEL_TENSOR.DEC_CROSS_ATTN_V: "dec.blk.{bid}.cross_attn_v", - MODEL_TENSOR.DEC_CROSS_ATTN_OUT: "dec.blk.{bid}.cross_attn_o", - MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: "dec.blk.{bid}.cross_attn_rel_b", - MODEL_TENSOR.DEC_FFN_NORM: "dec.blk.{bid}.ffn_norm", - MODEL_TENSOR.DEC_FFN_GATE: "dec.blk.{bid}.ffn_gate", - MODEL_TENSOR.DEC_FFN_DOWN: "dec.blk.{bid}.ffn_down", - MODEL_TENSOR.DEC_FFN_UP: "dec.blk.{bid}.ffn_up", - MODEL_TENSOR.DEC_OUTPUT_NORM: "dec.output_norm", - MODEL_TENSOR.ENC_ATTN_NORM: "enc.blk.{bid}.attn_norm", - MODEL_TENSOR.ENC_ATTN_Q: "enc.blk.{bid}.attn_q", - MODEL_TENSOR.ENC_ATTN_K: "enc.blk.{bid}.attn_k", - MODEL_TENSOR.ENC_ATTN_V: "enc.blk.{bid}.attn_v", - MODEL_TENSOR.ENC_ATTN_OUT: "enc.blk.{bid}.attn_o", - MODEL_TENSOR.ENC_ATTN_REL_B: "enc.blk.{bid}.attn_rel_b", - MODEL_TENSOR.ENC_FFN_NORM: "enc.blk.{bid}.ffn_norm", - MODEL_TENSOR.ENC_FFN_GATE: "enc.blk.{bid}.ffn_gate", - MODEL_TENSOR.ENC_FFN_DOWN: "enc.blk.{bid}.ffn_down", - MODEL_TENSOR.ENC_FFN_UP: "enc.blk.{bid}.ffn_up", - MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm", - MODEL_TENSOR.CLS: "cls", - MODEL_TENSOR.CLS_OUT: "cls.output", - MODEL_TENSOR.CLS_NORM: "cls.norm", - MODEL_TENSOR.CONV1D: "conv1d", - MODEL_TENSOR.CONVNEXT_DW: "convnext.{bid}.dw", - MODEL_TENSOR.CONVNEXT_NORM: "convnext.{bid}.norm", - MODEL_TENSOR.CONVNEXT_PW1: "convnext.{bid}.pw1", - 
MODEL_TENSOR.CONVNEXT_PW2: "convnext.{bid}.pw2", - MODEL_TENSOR.CONVNEXT_GAMMA: "convnext.{bid}.gamma", - MODEL_TENSOR.POSNET_CONV1: "posnet.{bid}.conv1", - MODEL_TENSOR.POSNET_CONV2: "posnet.{bid}.conv2", - MODEL_TENSOR.POSNET_NORM: "posnet.{bid}.norm", - MODEL_TENSOR.POSNET_NORM1: "posnet.{bid}.norm1", - MODEL_TENSOR.POSNET_NORM2: "posnet.{bid}.norm2", - MODEL_TENSOR.POSNET_ATTN_NORM: "posnet.{bid}.attn_norm", - MODEL_TENSOR.POSNET_ATTN_Q: "posnet.{bid}.attn_q", - MODEL_TENSOR.POSNET_ATTN_K: "posnet.{bid}.attn_k", - MODEL_TENSOR.POSNET_ATTN_V: "posnet.{bid}.attn_v", - MODEL_TENSOR.POSNET_ATTN_OUT: "posnet.{bid}.attn_output", - MODEL_TENSOR.SHORTCONV_CONV: "blk.{bid}.shortconv.conv", - MODEL_TENSOR.SHORTCONV_INPROJ: "blk.{bid}.shortconv.in_proj", - MODEL_TENSOR.SHORTCONV_OUTPROJ: "blk.{bid}.shortconv.out_proj", - MODEL_TENSOR.VISEXP_ATTN_QKV: "blk.{bid}.vis_attn_qkv", - MODEL_TENSOR.VISEXP_ATTN_OUT: "blk.{bid}.vis_attn_output", - MODEL_TENSOR.VISEXP_GATE: "blk.{bid}.vis_gate", - MODEL_TENSOR.VISEXP_DOWN: "blk.{bid}.vis_down", - MODEL_TENSOR.VISEXP_UP: "blk.{bid}.vis_up", - MODEL_TENSOR.INDEXER_K_NORM: "blk.{bid}.indexer.k_norm", - MODEL_TENSOR.INDEXER_PROJ: "blk.{bid}.indexer.proj", - MODEL_TENSOR.INDEXER_ATTN_K: "blk.{bid}.indexer.attn_k", - MODEL_TENSOR.INDEXER_ATTN_Q_B: "blk.{bid}.indexer.attn_q_b", + MODEL_TENSOR.TOKEN_EMBD: "token_embd", + MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm", + MODEL_TENSOR.TOKEN_TYPES: "token_types", + MODEL_TENSOR.POS_EMBD: "position_embd", + MODEL_TENSOR.OUTPUT_NORM: "output_norm", + MODEL_TENSOR.OUTPUT: "output", + MODEL_TENSOR.DENSE_2_OUT: "dense_2", # embeddinggemma 2_Dense + MODEL_TENSOR.DENSE_3_OUT: "dense_3", # embeddinggemma 2_Dense + MODEL_TENSOR.ROPE_FREQS: "rope_freqs", + MODEL_TENSOR.ROPE_FACTORS_LONG: "rope_factors_long", + MODEL_TENSOR.ROPE_FACTORS_SHORT: "rope_factors_short", + MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", + MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2", + MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv", + MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q", + MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k", + MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v", + MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", + MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd", + MODEL_TENSOR.ATTN_SINKS: "blk.{bid}.attn_sinks", + MODEL_TENSOR.ATTN_GATE: "blk.{bid}.attn_gate", + MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm", + MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm", + MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm", + MODEL_TENSOR.ATTN_POST_NORM: "blk.{bid}.post_attention_norm", + MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp", + MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp", + MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", + MODEL_TENSOR.FFN_PRE_NORM: "blk.{bid}.ffn_norm", + MODEL_TENSOR.FFN_POST_NORM: "blk.{bid}.post_ffw_norm", + MODEL_TENSOR.FFN_PRE_NORM_2: "blk.{bid}.pre_ffw_norm_2", # gemma4 + MODEL_TENSOR.FFN_POST_NORM_1: "blk.{bid}.post_ffw_norm_1", # gemma4 + MODEL_TENSOR.FFN_POST_NORM_2: "blk.{bid}.post_ffw_norm_2", # gemma4 + MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", + MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", + MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", + MODEL_TENSOR.FFN_GATE_SHEXP: "blk.{bid}.ffn_gate_shexp", + MODEL_TENSOR.FFN_DOWN_SHEXP: "blk.{bid}.ffn_down_shexp", + MODEL_TENSOR.FFN_UP_SHEXP: "blk.{bid}.ffn_up_shexp", + MODEL_TENSOR.FFN_GATE_CHEXP: "blk.{bid}.ffn_gate_chexps", + MODEL_TENSOR.FFN_DOWN_CHEXP: "blk.{bid}.ffn_down_chexps", + MODEL_TENSOR.FFN_UP_CHEXP: "blk.{bid}.ffn_up_chexps", + 
MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn", + MODEL_TENSOR.FFN_NORM_EXP: "blk.{bid}.ffn_norm_exps", + MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps", + MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps", + MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps", + MODEL_TENSOR.FFN_GATE_UP_EXP: "blk.{bid}.ffn_gate_up_exps", + MODEL_TENSOR.FFN_EXP_PROBS_B: "blk.{bid}.exp_probs_b", + MODEL_TENSOR.MOE_LATENT_DOWN: "blk.{bid}.ffn_latent_down", # nemotron 3 super + MODEL_TENSOR.MOE_LATENT_UP: "blk.{bid}.ffn_latent_up", # nemotron 3 super + MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm", + MODEL_TENSOR.LAYER_OUT_SCALE: "blk.{bid}.layer_output_scale", + MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: "per_layer_token_embd", # gemma3n + MODEL_TENSOR.PER_LAYER_MODEL_PROJ: "per_layer_model_proj", # gemma3n + MODEL_TENSOR.PER_LAYER_PROJ_NORM: "per_layer_proj_norm", # gemma3n + MODEL_TENSOR.ALTUP_UNEMBD_PROJ: "altup_unembd_proj", # gemma3n + MODEL_TENSOR.ALTUP_PROJ: "altup_proj", # gemma3n + MODEL_TENSOR.PER_LAYER_INP_GATE: "blk.{bid}.inp_gate", # gemma3n + MODEL_TENSOR.PER_LAYER_PROJ: "blk.{bid}.proj", # gemma3n + MODEL_TENSOR.PER_LAYER_POST_NORM: "blk.{bid}.post_norm", # gemma3n + MODEL_TENSOR.ALTUP_CORRECT_COEF: "blk.{bid}.altup_correct_coef", # gemma3n + MODEL_TENSOR.ALTUP_CORRECT_SCALE: "blk.{bid}.altup_correct_scale", # gemma3n + MODEL_TENSOR.ALTUP_PREDICT_COEF: "blk.{bid}.altup_predict_coef", # gemma3n + MODEL_TENSOR.ALTUP_ROUTER: "blk.{bid}.altup_router", # gemma3n + MODEL_TENSOR.ALTUP_ROUTER_NORM: "blk.{bid}.altup_router_norm", # gemma3n + MODEL_TENSOR.LAUREL_L: "blk.{bid}.laurel_l", # gemma3n + MODEL_TENSOR.LAUREL_R: "blk.{bid}.laurel_r", # gemma3n + MODEL_TENSOR.LAUREL_POST_NORM: "blk.{bid}.laurel_post_norm", # gemma3n + MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in", + MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d", + MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x", + MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt", + MODEL_TENSOR.SSM_DT_NORM: "blk.{bid}.ssm_dt_norm", + MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a", + MODEL_TENSOR.SSM_B_NORM: "blk.{bid}.ssm_b_norm", + MODEL_TENSOR.SSM_C_NORM: "blk.{bid}.ssm_c_norm", + MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d", + MODEL_TENSOR.SSM_NORM: "blk.{bid}.ssm_norm", + MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out", + MODEL_TENSOR.SSM_ALPHA: "blk.{bid}.ssm_alpha", # qwen3.5 + MODEL_TENSOR.SSM_BETA_ALPHA: "blk.{bid}.ssm_ba", + MODEL_TENSOR.SSM_CONV1D_Q: "blk.{bid}.ssm_conv1d_q", # Kimi Linear + MODEL_TENSOR.SSM_CONV1D_K: "blk.{bid}.ssm_conv1d_k", # Kimi Linear + MODEL_TENSOR.SSM_CONV1D_V: "blk.{bid}.ssm_conv1d_v", # Kimi Linear + MODEL_TENSOR.SSM_F_A: "blk.{bid}.ssm_f_a", # Kimi Linear + MODEL_TENSOR.SSM_F_B: "blk.{bid}.ssm_f_b", # Kimi Linear + MODEL_TENSOR.SSM_BETA: "blk.{bid}.ssm_beta", # Kimi Linear qwen3.5 + MODEL_TENSOR.SSM_G_A: "blk.{bid}.ssm_g_a", # Kimi Linear + MODEL_TENSOR.SSM_G_B: "blk.{bid}.ssm_g_b", # Kimi Linear + MODEL_TENSOR.TIME_MIX_W0: "blk.{bid}.time_mix_w0", + MODEL_TENSOR.TIME_MIX_W1: "blk.{bid}.time_mix_w1", + MODEL_TENSOR.TIME_MIX_W2: "blk.{bid}.time_mix_w2", + MODEL_TENSOR.TIME_MIX_A0: "blk.{bid}.time_mix_a0", + MODEL_TENSOR.TIME_MIX_A1: "blk.{bid}.time_mix_a1", + MODEL_TENSOR.TIME_MIX_A2: "blk.{bid}.time_mix_a2", + MODEL_TENSOR.TIME_MIX_V0: "blk.{bid}.time_mix_v0", + MODEL_TENSOR.TIME_MIX_V1: "blk.{bid}.time_mix_v1", + MODEL_TENSOR.TIME_MIX_V2: "blk.{bid}.time_mix_v2", + MODEL_TENSOR.TIME_MIX_G1: "blk.{bid}.time_mix_g1", + MODEL_TENSOR.TIME_MIX_G2: "blk.{bid}.time_mix_g2", + MODEL_TENSOR.TIME_MIX_K_K: "blk.{bid}.time_mix_k_k", + MODEL_TENSOR.TIME_MIX_K_A: 
"blk.{bid}.time_mix_k_a", + MODEL_TENSOR.TIME_MIX_R_K: "blk.{bid}.time_mix_r_k", + MODEL_TENSOR.TIME_MIX_LERP_X: "blk.{bid}.time_mix_lerp_x", + MODEL_TENSOR.TIME_MIX_LERP_K: "blk.{bid}.time_mix_lerp_k", + MODEL_TENSOR.TIME_MIX_LERP_V: "blk.{bid}.time_mix_lerp_v", + MODEL_TENSOR.TIME_MIX_LERP_R: "blk.{bid}.time_mix_lerp_r", + MODEL_TENSOR.TIME_MIX_LERP_G: "blk.{bid}.time_mix_lerp_g", + MODEL_TENSOR.TIME_MIX_LERP_FUSED: "blk.{bid}.time_mix_lerp_fused", + MODEL_TENSOR.TIME_MIX_LERP_W: "blk.{bid}.time_mix_lerp_w", + MODEL_TENSOR.TIME_MIX_FIRST: "blk.{bid}.time_mix_first", + MODEL_TENSOR.TIME_MIX_DECAY: "blk.{bid}.time_mix_decay", + MODEL_TENSOR.TIME_MIX_DECAY_W1: "blk.{bid}.time_mix_decay_w1", + MODEL_TENSOR.TIME_MIX_DECAY_W2: "blk.{bid}.time_mix_decay_w2", + MODEL_TENSOR.TIME_MIX_KEY: "blk.{bid}.time_mix_key", + MODEL_TENSOR.TIME_MIX_VALUE: "blk.{bid}.time_mix_value", + MODEL_TENSOR.TIME_MIX_RECEPTANCE: "blk.{bid}.time_mix_receptance", + MODEL_TENSOR.TIME_MIX_GATE: "blk.{bid}.time_mix_gate", + MODEL_TENSOR.TIME_MIX_LN: "blk.{bid}.time_mix_ln", + MODEL_TENSOR.TIME_MIX_OUTPUT: "blk.{bid}.time_mix_output", + MODEL_TENSOR.CHANNEL_MIX_LERP_K: "blk.{bid}.channel_mix_lerp_k", + MODEL_TENSOR.CHANNEL_MIX_LERP_R: "blk.{bid}.channel_mix_lerp_r", + MODEL_TENSOR.CHANNEL_MIX_KEY: "blk.{bid}.channel_mix_key", + MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: "blk.{bid}.channel_mix_receptance", + MODEL_TENSOR.CHANNEL_MIX_VALUE: "blk.{bid}.channel_mix_value", + MODEL_TENSOR.ATTN_Q_A: "blk.{bid}.attn_q_a", + MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b", + MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa", + MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b", + MODEL_TENSOR.ATTN_K_B: "blk.{bid}.attn_k_b", + MODEL_TENSOR.ATTN_V_B: "blk.{bid}.attn_v_b", + MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm", + MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm", + MODEL_TENSOR.ATTN_SUB_NORM: "blk.{bid}.attn_sub_norm", + MODEL_TENSOR.FFN_SUB_NORM: "blk.{bid}.ffn_sub_norm", + MODEL_TENSOR.DEC_ATTN_NORM: "dec.blk.{bid}.attn_norm", + MODEL_TENSOR.DEC_ATTN_Q: "dec.blk.{bid}.attn_q", + MODEL_TENSOR.DEC_ATTN_K: "dec.blk.{bid}.attn_k", + MODEL_TENSOR.DEC_ATTN_V: "dec.blk.{bid}.attn_v", + MODEL_TENSOR.DEC_ATTN_OUT: "dec.blk.{bid}.attn_o", + MODEL_TENSOR.DEC_ATTN_REL_B: "dec.blk.{bid}.attn_rel_b", + MODEL_TENSOR.DEC_CROSS_ATTN_NORM: "dec.blk.{bid}.cross_attn_norm", + MODEL_TENSOR.DEC_CROSS_ATTN_Q: "dec.blk.{bid}.cross_attn_q", + MODEL_TENSOR.DEC_CROSS_ATTN_K: "dec.blk.{bid}.cross_attn_k", + MODEL_TENSOR.DEC_CROSS_ATTN_V: "dec.blk.{bid}.cross_attn_v", + MODEL_TENSOR.DEC_CROSS_ATTN_OUT: "dec.blk.{bid}.cross_attn_o", + MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: "dec.blk.{bid}.cross_attn_rel_b", + MODEL_TENSOR.DEC_FFN_NORM: "dec.blk.{bid}.ffn_norm", + MODEL_TENSOR.DEC_FFN_GATE: "dec.blk.{bid}.ffn_gate", + MODEL_TENSOR.DEC_FFN_DOWN: "dec.blk.{bid}.ffn_down", + MODEL_TENSOR.DEC_FFN_UP: "dec.blk.{bid}.ffn_up", + MODEL_TENSOR.DEC_OUTPUT_NORM: "dec.output_norm", + MODEL_TENSOR.ENC_ATTN_NORM: "enc.blk.{bid}.attn_norm", + MODEL_TENSOR.ENC_ATTN_Q: "enc.blk.{bid}.attn_q", + MODEL_TENSOR.ENC_ATTN_K: "enc.blk.{bid}.attn_k", + MODEL_TENSOR.ENC_ATTN_V: "enc.blk.{bid}.attn_v", + MODEL_TENSOR.ENC_ATTN_OUT: "enc.blk.{bid}.attn_o", + MODEL_TENSOR.ENC_ATTN_REL_B: "enc.blk.{bid}.attn_rel_b", + MODEL_TENSOR.ENC_FFN_NORM: "enc.blk.{bid}.ffn_norm", + MODEL_TENSOR.ENC_FFN_GATE: "enc.blk.{bid}.ffn_gate", + MODEL_TENSOR.ENC_FFN_DOWN: "enc.blk.{bid}.ffn_down", + MODEL_TENSOR.ENC_FFN_UP: "enc.blk.{bid}.ffn_up", + MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm", + 
MODEL_TENSOR.CLS: "cls", + MODEL_TENSOR.CLS_OUT: "cls.output", + MODEL_TENSOR.CLS_NORM: "cls.norm", + MODEL_TENSOR.CONV1D: "conv1d", + MODEL_TENSOR.CONVNEXT_DW: "convnext.{bid}.dw", + MODEL_TENSOR.CONVNEXT_NORM: "convnext.{bid}.norm", + MODEL_TENSOR.CONVNEXT_PW1: "convnext.{bid}.pw1", + MODEL_TENSOR.CONVNEXT_PW2: "convnext.{bid}.pw2", + MODEL_TENSOR.CONVNEXT_GAMMA: "convnext.{bid}.gamma", + MODEL_TENSOR.POSNET_CONV1: "posnet.{bid}.conv1", + MODEL_TENSOR.POSNET_CONV2: "posnet.{bid}.conv2", + MODEL_TENSOR.POSNET_NORM: "posnet.{bid}.norm", + MODEL_TENSOR.POSNET_NORM1: "posnet.{bid}.norm1", + MODEL_TENSOR.POSNET_NORM2: "posnet.{bid}.norm2", + MODEL_TENSOR.POSNET_ATTN_NORM: "posnet.{bid}.attn_norm", + MODEL_TENSOR.POSNET_ATTN_Q: "posnet.{bid}.attn_q", + MODEL_TENSOR.POSNET_ATTN_K: "posnet.{bid}.attn_k", + MODEL_TENSOR.POSNET_ATTN_V: "posnet.{bid}.attn_v", + MODEL_TENSOR.POSNET_ATTN_OUT: "posnet.{bid}.attn_output", + MODEL_TENSOR.SHORTCONV_CONV: "blk.{bid}.shortconv.conv", + MODEL_TENSOR.SHORTCONV_INPROJ: "blk.{bid}.shortconv.in_proj", + MODEL_TENSOR.SHORTCONV_OUTPROJ: "blk.{bid}.shortconv.out_proj", + MODEL_TENSOR.VISEXP_ATTN_QKV: "blk.{bid}.vis_attn_qkv", + MODEL_TENSOR.VISEXP_ATTN_OUT: "blk.{bid}.vis_attn_output", + MODEL_TENSOR.VISEXP_GATE: "blk.{bid}.vis_gate", + MODEL_TENSOR.VISEXP_DOWN: "blk.{bid}.vis_down", + MODEL_TENSOR.VISEXP_UP: "blk.{bid}.vis_up", + MODEL_TENSOR.INDEXER_K_NORM: "blk.{bid}.indexer.k_norm", + MODEL_TENSOR.INDEXER_PROJ: "blk.{bid}.indexer.proj", + MODEL_TENSOR.INDEXER_ATTN_K: "blk.{bid}.indexer.attn_k", + MODEL_TENSOR.INDEXER_ATTN_Q_B: "blk.{bid}.indexer.attn_q_b", # vision - MODEL_TENSOR.V_MMPROJ: "mm.{bid}", - MODEL_TENSOR.V_MMPROJ_FC: "mm.model.fc", - MODEL_TENSOR.V_MMPROJ_MLP: "mm.model.mlp.{bid}", - MODEL_TENSOR.V_MMPROJ_PEG: "mm.model.peg.{bid}", - MODEL_TENSOR.V_ENC_EMBD_CLS: "v.class_embd", - MODEL_TENSOR.V_ENC_EMBD_PATCH: "v.patch_embd", - MODEL_TENSOR.V_ENC_EMBD_NORM: "v.norm_embd", - MODEL_TENSOR.V_ENC_EMBD_POS: "v.position_embd", - MODEL_TENSOR.V_ENC_ATTN_QKV: "v.blk.{bid}.attn_qkv", - MODEL_TENSOR.V_ENC_ATTN_Q: "v.blk.{bid}.attn_q", - MODEL_TENSOR.V_ENC_ATTN_Q_NORM: "v.blk.{bid}.attn_q_norm", - MODEL_TENSOR.V_ENC_ATTN_K: "v.blk.{bid}.attn_k", - MODEL_TENSOR.V_ENC_ATTN_K_NORM: "v.blk.{bid}.attn_k_norm", - MODEL_TENSOR.V_ENC_ATTN_V: "v.blk.{bid}.attn_v", - MODEL_TENSOR.V_ENC_INPUT_NORM: "v.blk.{bid}.ln1", - MODEL_TENSOR.V_ENC_ATTN_O: "v.blk.{bid}.attn_out", - MODEL_TENSOR.V_ENC_ATTN_O_NORM: "v.blk.{bid}.attn_out_norm", - MODEL_TENSOR.V_ENC_POST_ATTN_NORM: "v.blk.{bid}.ln2", - MODEL_TENSOR.V_ENC_FFN_UP: "v.blk.{bid}.ffn_up", - MODEL_TENSOR.V_ENC_FFN_GATE: "v.blk.{bid}.ffn_gate", - MODEL_TENSOR.V_ENC_FFN_DOWN: "v.blk.{bid}.ffn_down", - MODEL_TENSOR.V_ENC_ATTN_POST_NORM: "v.blk.{bid}.attn_post_norm", - MODEL_TENSOR.V_ENC_FFN_POST_NORM: "v.blk.{bid}.ffn_post_norm", - MODEL_TENSOR.V_LAYER_SCALE_1: "v.blk.{bid}.ls1", - MODEL_TENSOR.V_LAYER_SCALE_2: "v.blk.{bid}.ls2", - MODEL_TENSOR.V_LAYER_OUT_SCALE: "v.blk.{bid}.out_scale", - MODEL_TENSOR.V_PRE_NORM: "v.pre_ln", - MODEL_TENSOR.V_POST_NORM: "v.post_ln", - MODEL_TENSOR.V_MM_POST_NORM: "mm.post_norm", - MODEL_TENSOR.V_MM_INP_PROJ: "mm.input_projection", - MODEL_TENSOR.V_MM_INP_NORM: "mm.input_norm", - MODEL_TENSOR.V_MM_SOFT_EMB_NORM: "mm.soft_emb_norm", # gemma3n - MODEL_TENSOR.V_MM_EMBEDDING: "mm.embedding", # gemma3n - MODEL_TENSOR.V_MM_HARD_EMB_NORM: "mm.hard_emb_norm", # gemma3n - MODEL_TENSOR.V_ENC_CONV_STEM: "v.conv_stem.conv", # gemma3n - MODEL_TENSOR.V_ENC_CONV_STEM_NORM: "v.conv_stem.bn", # gemma3n - 
MODEL_TENSOR.V_ENC_MSFA_EXP: "v.msfa.ffn.pw_exp.conv", # gemma3n - MODEL_TENSOR.V_ENC_MSFA_EXP_NORM: "v.msfa.ffn.pw_exp.bn", # gemma3n - MODEL_TENSOR.V_ENC_MSFA_PROJ: "v.msfa.ffn.pw_proj.conv", # gemma3n - MODEL_TENSOR.V_ENC_MSFA_PROJ_NORM: "v.msfa.ffn.pw_proj.bn", # gemma3n - MODEL_TENSOR.V_ENC_MSFA_NORM: "v.msfa.norm", # gemma3n - MODEL_TENSOR.V_RESMPL_POS_EMBD_K: "resampler.pos_embd_k", - MODEL_TENSOR.V_RESMPL_ATTN_Q: "resampler.attn.q", - MODEL_TENSOR.V_RESMPL_ATTN_K: "resampler.attn.k", - MODEL_TENSOR.V_RESMPL_ATTN_V: "resampler.attn.v", - MODEL_TENSOR.V_RESMPL_ATTN_OUT: "resampler.attn.out", - MODEL_TENSOR.V_RESMPL_KV: "resampler.kv", - MODEL_TENSOR.V_RESMPL_KV_NORM: "resampler.ln_kv", - MODEL_TENSOR.V_RESMPL_POST_NORM: "resampler.ln_post", - MODEL_TENSOR.V_RESMPL_Q_NORM: "resampler.ln_q", - MODEL_TENSOR.V_RESMPL_PROJ: "resampler.proj", - MODEL_TENSOR.V_RESMPL_QUERY: "resampler.query", - MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK: "v.token_embd.img_break", # pixtral - MODEL_TENSOR.V_MM_PATCH_MERGER: "mm.patch_merger", # mistral small 3.1 - MODEL_TENSOR.V_DS_NORM: "v.deepstack.{bid}.norm", - MODEL_TENSOR.V_DS_FC1: "v.deepstack.{bid}.fc1", - MODEL_TENSOR.V_DS_FC2: "v.deepstack.{bid}.fc2", - MODEL_TENSOR.V_MM_POST_FC_NORM: "mm.post_fc_norm", # cogvlm - MODEL_TENSOR.V_MM_UP: "mm.up", - MODEL_TENSOR.V_MM_DOWN: "mm.down", - MODEL_TENSOR.V_MM_GATE: "mm.gate", - MODEL_TENSOR.V_TOK_BOI: "v.boi", - MODEL_TENSOR.V_TOK_EOI: "v.eoi", - MODEL_TENSOR.V_MM_PRE_NORM: "mm.pre_norm", - MODEL_TENSOR.V_TOK_IMG_BEGIN: "mm.image_begin", - MODEL_TENSOR.V_TOK_IMG_END: "mm.image_end", - MODEL_TENSOR.V_STD_BIAS: "v.std_bias", # gemma4 - MODEL_TENSOR.V_STD_SCALE: "v.std_scale", # gemma4 + MODEL_TENSOR.V_MMPROJ: "mm.{bid}", + MODEL_TENSOR.V_MMPROJ_FC: "mm.model.fc", + MODEL_TENSOR.V_MMPROJ_MLP: "mm.model.mlp.{bid}", + MODEL_TENSOR.V_MMPROJ_PEG: "mm.model.peg.{bid}", + MODEL_TENSOR.V_ENC_EMBD_CLS: "v.class_embd", + MODEL_TENSOR.V_ENC_EMBD_PATCH: "v.patch_embd", + MODEL_TENSOR.V_ENC_EMBD_NORM: "v.norm_embd", + MODEL_TENSOR.V_ENC_EMBD_POS: "v.position_embd", + MODEL_TENSOR.V_ENC_ATTN_QKV: "v.blk.{bid}.attn_qkv", + MODEL_TENSOR.V_ENC_ATTN_Q: "v.blk.{bid}.attn_q", + MODEL_TENSOR.V_ENC_ATTN_Q_NORM: "v.blk.{bid}.attn_q_norm", + MODEL_TENSOR.V_ENC_ATTN_K: "v.blk.{bid}.attn_k", + MODEL_TENSOR.V_ENC_ATTN_K_NORM: "v.blk.{bid}.attn_k_norm", + MODEL_TENSOR.V_ENC_ATTN_V: "v.blk.{bid}.attn_v", + MODEL_TENSOR.V_ENC_INPUT_NORM: "v.blk.{bid}.ln1", + MODEL_TENSOR.V_ENC_ATTN_O: "v.blk.{bid}.attn_out", + MODEL_TENSOR.V_ENC_ATTN_O_NORM: "v.blk.{bid}.attn_out_norm", + MODEL_TENSOR.V_ENC_POST_ATTN_NORM: "v.blk.{bid}.ln2", + MODEL_TENSOR.V_ENC_FFN_UP: "v.blk.{bid}.ffn_up", + MODEL_TENSOR.V_ENC_FFN_GATE: "v.blk.{bid}.ffn_gate", + MODEL_TENSOR.V_ENC_FFN_DOWN: "v.blk.{bid}.ffn_down", + MODEL_TENSOR.V_ENC_ATTN_POST_NORM: "v.blk.{bid}.attn_post_norm", + MODEL_TENSOR.V_ENC_FFN_POST_NORM: "v.blk.{bid}.ffn_post_norm", + MODEL_TENSOR.V_LAYER_SCALE_1: "v.blk.{bid}.ls1", + MODEL_TENSOR.V_LAYER_SCALE_2: "v.blk.{bid}.ls2", + MODEL_TENSOR.V_LAYER_OUT_SCALE: "v.blk.{bid}.out_scale", + MODEL_TENSOR.V_PRE_NORM: "v.pre_ln", + MODEL_TENSOR.V_POST_NORM: "v.post_ln", + MODEL_TENSOR.V_MM_POST_NORM: "mm.post_norm", + MODEL_TENSOR.V_MM_INP_PROJ: "mm.input_projection", + MODEL_TENSOR.V_MM_INP_NORM: "mm.input_norm", + MODEL_TENSOR.V_MM_SOFT_EMB_NORM: "mm.soft_emb_norm", # gemma3n + MODEL_TENSOR.V_MM_EMBEDDING: "mm.embedding", # gemma3n + MODEL_TENSOR.V_MM_HARD_EMB_NORM: "mm.hard_emb_norm", # gemma3n + MODEL_TENSOR.V_ENC_CONV_STEM: "v.conv_stem.conv", # gemma3n + 
MODEL_TENSOR.V_ENC_CONV_STEM_NORM: "v.conv_stem.bn", # gemma3n + MODEL_TENSOR.V_ENC_MSFA_EXP: "v.msfa.ffn.pw_exp.conv", # gemma3n + MODEL_TENSOR.V_ENC_MSFA_EXP_NORM: "v.msfa.ffn.pw_exp.bn", # gemma3n + MODEL_TENSOR.V_ENC_MSFA_PROJ: "v.msfa.ffn.pw_proj.conv", # gemma3n + MODEL_TENSOR.V_ENC_MSFA_PROJ_NORM: "v.msfa.ffn.pw_proj.bn", # gemma3n + MODEL_TENSOR.V_ENC_MSFA_NORM: "v.msfa.norm", # gemma3n + MODEL_TENSOR.V_RESMPL_POS_EMBD_K: "resampler.pos_embd_k", + MODEL_TENSOR.V_RESMPL_ATTN_Q: "resampler.attn.q", + MODEL_TENSOR.V_RESMPL_ATTN_K: "resampler.attn.k", + MODEL_TENSOR.V_RESMPL_ATTN_V: "resampler.attn.v", + MODEL_TENSOR.V_RESMPL_ATTN_OUT: "resampler.attn.out", + MODEL_TENSOR.V_RESMPL_KV: "resampler.kv", + MODEL_TENSOR.V_RESMPL_KV_NORM: "resampler.ln_kv", + MODEL_TENSOR.V_RESMPL_POST_NORM: "resampler.ln_post", + MODEL_TENSOR.V_RESMPL_Q_NORM: "resampler.ln_q", + MODEL_TENSOR.V_RESMPL_PROJ: "resampler.proj", + MODEL_TENSOR.V_RESMPL_QUERY: "resampler.query", + MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK: "v.token_embd.img_break", # pixtral + MODEL_TENSOR.V_MM_PATCH_MERGER: "mm.patch_merger", # mistral small 3.1 + MODEL_TENSOR.V_DS_NORM: "v.deepstack.{bid}.norm", + MODEL_TENSOR.V_DS_FC1: "v.deepstack.{bid}.fc1", + MODEL_TENSOR.V_DS_FC2: "v.deepstack.{bid}.fc2", + MODEL_TENSOR.V_MM_POST_FC_NORM: "mm.post_fc_norm", # cogvlm + MODEL_TENSOR.V_MM_UP: "mm.up", + MODEL_TENSOR.V_MM_DOWN: "mm.down", + MODEL_TENSOR.V_MM_GATE: "mm.gate", + MODEL_TENSOR.V_TOK_BOI: "v.boi", + MODEL_TENSOR.V_TOK_EOI: "v.eoi", + MODEL_TENSOR.V_MM_PRE_NORM: "mm.pre_norm", + MODEL_TENSOR.V_TOK_IMG_BEGIN: "mm.image_begin", + MODEL_TENSOR.V_TOK_IMG_END: "mm.image_end", + MODEL_TENSOR.V_STD_BIAS: "v.std_bias", # gemma4 + MODEL_TENSOR.V_STD_SCALE: "v.std_scale", # gemma4 # DeepSeek-OCR SAM - MODEL_TENSOR.V_SAM_POS_EMBD: "v.sam.pos_embd", - MODEL_TENSOR.V_SAM_PATCH_EMBD: "v.sam.patch_embd", - MODEL_TENSOR.V_SAM_PRE_NORM: "v.sam.blk.{bid}.pre_ln", - MODEL_TENSOR.V_SAM_POST_NORM: "v.sam.blk.{bid}.post_ln", - MODEL_TENSOR.V_SAM_ATTN_POS_H: "v.sam.blk.{bid}.attn.pos_h", - MODEL_TENSOR.V_SAM_ATTN_POS_W: "v.sam.blk.{bid}.attn.pos_w", - MODEL_TENSOR.V_SAM_ATTN_QKV: "v.sam.blk.{bid}.attn.qkv", - MODEL_TENSOR.V_SAM_ATTN_OUT: "v.sam.blk.{bid}.attn.out", - MODEL_TENSOR.V_SAM_MLP_LIN_1: "v.sam.blk.{bid}.mlp.lin1", - MODEL_TENSOR.V_SAM_MLP_LIN_2: "v.sam.blk.{bid}.mlp.lin2", - MODEL_TENSOR.V_SAM_NECK: "v.sam.neck.{bid}", - MODEL_TENSOR.V_SAM_NET_2: "v.sam.net_2", - MODEL_TENSOR.V_SAM_NET_3: "v.sam.net_3", - MODEL_TENSOR.V_ENC_EMBD_IMGNL: "v.image_newline", # Deepseek-OCR - MODEL_TENSOR.V_ENC_EMBD_VSEP: "v.view_seperator", # Deepseek-OCR + MODEL_TENSOR.V_SAM_POS_EMBD: "v.sam.pos_embd", + MODEL_TENSOR.V_SAM_PATCH_EMBD: "v.sam.patch_embd", + MODEL_TENSOR.V_SAM_PRE_NORM: "v.sam.blk.{bid}.pre_ln", + MODEL_TENSOR.V_SAM_POST_NORM: "v.sam.blk.{bid}.post_ln", + MODEL_TENSOR.V_SAM_ATTN_POS_H: "v.sam.blk.{bid}.attn.pos_h", + MODEL_TENSOR.V_SAM_ATTN_POS_W: "v.sam.blk.{bid}.attn.pos_w", + MODEL_TENSOR.V_SAM_ATTN_QKV: "v.sam.blk.{bid}.attn.qkv", + MODEL_TENSOR.V_SAM_ATTN_OUT: "v.sam.blk.{bid}.attn.out", + MODEL_TENSOR.V_SAM_MLP_LIN_1: "v.sam.blk.{bid}.mlp.lin1", + MODEL_TENSOR.V_SAM_MLP_LIN_2: "v.sam.blk.{bid}.mlp.lin2", + MODEL_TENSOR.V_SAM_NECK: "v.sam.neck.{bid}", + MODEL_TENSOR.V_SAM_NET_2: "v.sam.net_2", + MODEL_TENSOR.V_SAM_NET_3: "v.sam.net_3", + MODEL_TENSOR.V_ENC_EMBD_IMGNL: "v.image_newline", # Deepseek-OCR + MODEL_TENSOR.V_ENC_EMBD_VSEP: "v.view_seperator", # Deepseek-OCR # audio (mtmd) # note: all audio tensor names must use prefix "a." or "mm.a." 
- MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd", - MODEL_TENSOR.A_ENC_EMBD_NORM: "a.position_embd_norm", - MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS: "a.embd_to_logits", - MODEL_TENSOR.A_ENC_INP_PROJ: "a.input_projection", - MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}", - MODEL_TENSOR.A_ENC_CONV2D: "a.conv2d.{bid}", - MODEL_TENSOR.A_ENC_CONV_OUT: "a.conv_out", - MODEL_TENSOR.A_ENC_CONV1D_NORM: "a.conv1d.{bid}.norm", - MODEL_TENSOR.A_PRE_NORM: "a.pre_ln", - MODEL_TENSOR.A_POST_NORM: "a.post_ln", - MODEL_TENSOR.A_ENC_LAYER_PRE_NORM: "a.blk.{bid}.layer_pre_norm", - MODEL_TENSOR.A_ENC_ATTN_Q: "a.blk.{bid}.attn_q", - MODEL_TENSOR.A_ENC_ATTN_K: "a.blk.{bid}.attn_k", - MODEL_TENSOR.A_ENC_ATTN_V: "a.blk.{bid}.attn_v", - MODEL_TENSOR.A_ENC_ATTN_POST_NORM: "a.blk.{bid}.attn_post_norm", - MODEL_TENSOR.A_ENC_ATTN_PRE_NORM: "a.blk.{bid}.attn_pre_norm", - MODEL_TENSOR.A_ENC_ATTN_K_REL: "a.blk.{bid}.attn_k_rel", - MODEL_TENSOR.A_ENC_PER_DIM_SCALE: "a.blk.{bid}.per_dim_scale", - MODEL_TENSOR.A_ENC_INPUT_NORM: "a.blk.{bid}.ln1", - MODEL_TENSOR.A_ENC_OUTPUT: "a.blk.{bid}.attn_out", - MODEL_TENSOR.A_ENC_OUTPUT_NORM: "a.blk.{bid}.ln2", - MODEL_TENSOR.A_ENC_FFN_NORM: "a.blk.{bid}.ffn_norm", - MODEL_TENSOR.A_ENC_FFN_POST_NORM: "a.blk.{bid}.ffn_post_norm", - MODEL_TENSOR.A_ENC_FFN_SCALE: "a.blk.{bid}.ffn_scale", - MODEL_TENSOR.A_ENC_FFN_UP: "a.blk.{bid}.ffn_up", - MODEL_TENSOR.A_ENC_FFN_GATE: "a.blk.{bid}.ffn_gate", - MODEL_TENSOR.A_ENC_FFN_DOWN: "a.blk.{bid}.ffn_down", - MODEL_TENSOR.A_ENC_FFN_NORM_1: "a.blk.{bid}.ffn_norm_1", - MODEL_TENSOR.A_ENC_FFN_POST_NORM_1: "a.blk.{bid}.ffn_post_norm_1", - MODEL_TENSOR.A_ENC_FFN_SCALE_1: "a.blk.{bid}.ffn_scale_1", - MODEL_TENSOR.A_ENC_FFN_UP_1: "a.blk.{bid}.ffn_up_1", - MODEL_TENSOR.A_ENC_FFN_GATE_1: "a.blk.{bid}.ffn_gate_1", - MODEL_TENSOR.A_ENC_FFN_DOWN_1: "a.blk.{bid}.ffn_down_1", - MODEL_TENSOR.A_MMPROJ: "mm.a.mlp.{bid}", - MODEL_TENSOR.A_MMPROJ_FC: "mm.a.fc", - MODEL_TENSOR.A_MM_NORM_PRE: "mm.a.norm_pre", - MODEL_TENSOR.A_MM_NORM_MID: "mm.a.norm_mid", - MODEL_TENSOR.A_MM_INP_PROJ: "mm.a.input_projection", # gemma3n - MODEL_TENSOR.A_MM_SOFT_EMB_NORM: "mm.a.soft_emb_norm", # gemma3n - MODEL_TENSOR.A_MM_EMBEDDING: "mm.a.embedding", # gemma3n - MODEL_TENSOR.A_MM_HARD_EMB_NORM: "mm.a.hard_emb_norm", # gemma3n - MODEL_TENSOR.A_PER_DIM_K_SCALE: "a.blk.{bid}.per_dim_k_scale", # gemma4 - MODEL_TENSOR.A_PER_DIM_SCALE: "a.blk.{bid}.per_dim_scale", # gemma4 + MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd", + MODEL_TENSOR.A_ENC_EMBD_NORM: "a.position_embd_norm", + MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS: "a.embd_to_logits", + MODEL_TENSOR.A_ENC_INP_PROJ: "a.input_projection", + MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}", + MODEL_TENSOR.A_ENC_CONV2D: "a.conv2d.{bid}", + MODEL_TENSOR.A_ENC_CONV_OUT: "a.conv_out", + MODEL_TENSOR.A_ENC_CONV1D_NORM: "a.conv1d.{bid}.norm", + MODEL_TENSOR.A_PRE_NORM: "a.pre_ln", + MODEL_TENSOR.A_POST_NORM: "a.post_ln", + MODEL_TENSOR.A_ENC_LAYER_PRE_NORM: "a.blk.{bid}.layer_pre_norm", + MODEL_TENSOR.A_ENC_ATTN_Q: "a.blk.{bid}.attn_q", + MODEL_TENSOR.A_ENC_ATTN_K: "a.blk.{bid}.attn_k", + MODEL_TENSOR.A_ENC_ATTN_V: "a.blk.{bid}.attn_v", + MODEL_TENSOR.A_ENC_ATTN_POST_NORM: "a.blk.{bid}.attn_post_norm", + MODEL_TENSOR.A_ENC_ATTN_PRE_NORM: "a.blk.{bid}.attn_pre_norm", + MODEL_TENSOR.A_ENC_ATTN_K_REL: "a.blk.{bid}.attn_k_rel", + MODEL_TENSOR.A_ENC_PER_DIM_SCALE: "a.blk.{bid}.per_dim_scale", + MODEL_TENSOR.A_ENC_INPUT_NORM: "a.blk.{bid}.ln1", + MODEL_TENSOR.A_ENC_OUTPUT: "a.blk.{bid}.attn_out", + MODEL_TENSOR.A_ENC_OUTPUT_NORM: "a.blk.{bid}.ln2", + 
MODEL_TENSOR.A_ENC_FFN_NORM: "a.blk.{bid}.ffn_norm", + MODEL_TENSOR.A_ENC_FFN_POST_NORM: "a.blk.{bid}.ffn_post_norm", + MODEL_TENSOR.A_ENC_FFN_SCALE: "a.blk.{bid}.ffn_scale", + MODEL_TENSOR.A_ENC_FFN_UP: "a.blk.{bid}.ffn_up", + MODEL_TENSOR.A_ENC_FFN_GATE: "a.blk.{bid}.ffn_gate", + MODEL_TENSOR.A_ENC_FFN_DOWN: "a.blk.{bid}.ffn_down", + MODEL_TENSOR.A_ENC_FFN_NORM_1: "a.blk.{bid}.ffn_norm_1", + MODEL_TENSOR.A_ENC_FFN_POST_NORM_1: "a.blk.{bid}.ffn_post_norm_1", + MODEL_TENSOR.A_ENC_FFN_SCALE_1: "a.blk.{bid}.ffn_scale_1", + MODEL_TENSOR.A_ENC_FFN_UP_1: "a.blk.{bid}.ffn_up_1", + MODEL_TENSOR.A_ENC_FFN_GATE_1: "a.blk.{bid}.ffn_gate_1", + MODEL_TENSOR.A_ENC_FFN_DOWN_1: "a.blk.{bid}.ffn_down_1", + MODEL_TENSOR.A_MMPROJ: "mm.a.mlp.{bid}", + MODEL_TENSOR.A_MMPROJ_FC: "mm.a.fc", + MODEL_TENSOR.A_MM_NORM_PRE: "mm.a.norm_pre", + MODEL_TENSOR.A_MM_NORM_MID: "mm.a.norm_mid", + MODEL_TENSOR.A_MM_INP_PROJ: "mm.a.input_projection", # gemma3n + MODEL_TENSOR.A_MM_SOFT_EMB_NORM: "mm.a.soft_emb_norm", # gemma3n + MODEL_TENSOR.A_MM_EMBEDDING: "mm.a.embedding", # gemma3n + MODEL_TENSOR.A_MM_HARD_EMB_NORM: "mm.a.hard_emb_norm", # gemma3n + MODEL_TENSOR.A_PER_DIM_K_SCALE: "a.blk.{bid}.per_dim_k_scale", # gemma4 + MODEL_TENSOR.A_PER_DIM_SCALE: "a.blk.{bid}.per_dim_scale", # gemma4 # lfm2 audio - MODEL_TENSOR.A_ENC_NORM_CONV: "a.blk.{bid}.norm_conv", - MODEL_TENSOR.A_ENC_LINEAR_POS: "a.blk.{bid}.linear_pos", - MODEL_TENSOR.A_ENC_POS_BIAS_U: "a.blk.{bid}.pos_bias_u", - MODEL_TENSOR.A_ENC_POS_BIAS_V: "a.blk.{bid}.pos_bias_v", - MODEL_TENSOR.A_ENC_OUT: "a.pre_encode.out", - MODEL_TENSOR.A_ENC_CONV_DW: "a.blk.{bid}.conv_dw", - MODEL_TENSOR.A_ENC_CONV_NORM: "a.blk.{bid}.conv_norm", - MODEL_TENSOR.A_ENC_CONV_PW1: "a.blk.{bid}.conv_pw1", - MODEL_TENSOR.A_ENC_CONV_PW2: "a.blk.{bid}.conv_pw2", + MODEL_TENSOR.A_ENC_NORM_CONV: "a.blk.{bid}.norm_conv", + MODEL_TENSOR.A_ENC_LINEAR_POS: "a.blk.{bid}.linear_pos", + MODEL_TENSOR.A_ENC_POS_BIAS_U: "a.blk.{bid}.pos_bias_u", + MODEL_TENSOR.A_ENC_POS_BIAS_V: "a.blk.{bid}.pos_bias_v", + MODEL_TENSOR.A_ENC_OUT: "a.pre_encode.out", + MODEL_TENSOR.A_ENC_CONV_DW: "a.blk.{bid}.conv_dw", + MODEL_TENSOR.A_ENC_CONV_NORM: "a.blk.{bid}.conv_norm", + MODEL_TENSOR.A_ENC_CONV_PW1: "a.blk.{bid}.conv_pw1", + MODEL_TENSOR.A_ENC_CONV_PW2: "a.blk.{bid}.conv_pw2", # NextN/MTP - MODEL_TENSOR.NEXTN_EH_PROJ: "blk.{bid}.nextn.eh_proj", - MODEL_TENSOR.NEXTN_EMBED_TOKENS: "blk.{bid}.nextn.embed_tokens", - MODEL_TENSOR.NEXTN_ENORM: "blk.{bid}.nextn.enorm", - MODEL_TENSOR.NEXTN_HNORM: "blk.{bid}.nextn.hnorm", - MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD: "blk.{bid}.nextn.shared_head_head", - MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM: "blk.{bid}.nextn.shared_head_norm", + MODEL_TENSOR.NEXTN_EH_PROJ: "blk.{bid}.nextn.eh_proj", + MODEL_TENSOR.NEXTN_EMBED_TOKENS: "blk.{bid}.nextn.embed_tokens", + MODEL_TENSOR.NEXTN_ENORM: "blk.{bid}.nextn.enorm", + MODEL_TENSOR.NEXTN_HNORM: "blk.{bid}.nextn.hnorm", + MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD: "blk.{bid}.nextn.shared_head_head", + MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM: "blk.{bid}.nextn.shared_head_norm", } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { @@ -1956,7 +1962,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.SSM_NORM, MODEL_TENSOR.SSM_IN, MODEL_TENSOR.SSM_BETA_ALPHA, - MODEL_TENSOR.SSM_OUT + MODEL_TENSOR.SSM_OUT, ], MODEL_ARCH.QWEN3VL: [ MODEL_TENSOR.TOKEN_EMBD, @@ -2015,7 +2021,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.SSM_NORM, MODEL_TENSOR.SSM_BETA, MODEL_TENSOR.SSM_ALPHA, - MODEL_TENSOR.SSM_OUT + 
MODEL_TENSOR.SSM_OUT, ], MODEL_ARCH.QWEN35MOE: [ MODEL_TENSOR.TOKEN_EMBD, @@ -2046,7 +2052,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.SSM_NORM, MODEL_TENSOR.SSM_BETA, MODEL_TENSOR.SSM_ALPHA, - MODEL_TENSOR.SSM_OUT + MODEL_TENSOR.SSM_OUT, ], MODEL_ARCH.PLAMO: [ MODEL_TENSOR.TOKEN_EMBD, @@ -2849,7 +2855,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_UP, MODEL_TENSOR.FFN_DOWN, ], - MODEL_ARCH.CHATGLM : [ + MODEL_ARCH.CHATGLM: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.OUTPUT_NORM, @@ -2864,7 +2870,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], - MODEL_ARCH.GLM4 : [ + MODEL_ARCH.GLM4: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.OUTPUT_NORM, @@ -3421,36 +3427,30 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_ARCH.FALCON_H1: [ # Token embedding MODEL_TENSOR.TOKEN_EMBD, - # Input layernorm MODEL_TENSOR.ATTN_NORM, - # Attention components - MODEL_TENSOR.ATTN_Q, # Query projection - MODEL_TENSOR.ATTN_K, # Key projection - MODEL_TENSOR.ATTN_V, # Value projection - MODEL_TENSOR.ATTN_OUT, # Output projection - + MODEL_TENSOR.ATTN_Q, # Query projection + MODEL_TENSOR.ATTN_K, # Key projection + MODEL_TENSOR.ATTN_V, # Value projection + MODEL_TENSOR.ATTN_OUT, # Output projection # SSM components (Mamba2 specific) - MODEL_TENSOR.SSM_IN, # Input projection for SSM - MODEL_TENSOR.SSM_CONV1D, # Convolution layer - MODEL_TENSOR.SSM_DT, # Delta time projection - MODEL_TENSOR.SSM_A, # A parameter (log form) - MODEL_TENSOR.SSM_D, # D parameter - MODEL_TENSOR.SSM_NORM, # Normalization in SSM - MODEL_TENSOR.SSM_OUT, # Output projection - + MODEL_TENSOR.SSM_IN, # Input projection for SSM + MODEL_TENSOR.SSM_CONV1D, # Convolution layer + MODEL_TENSOR.SSM_DT, # Delta time projection + MODEL_TENSOR.SSM_A, # A parameter (log form) + MODEL_TENSOR.SSM_D, # D parameter + MODEL_TENSOR.SSM_NORM, # Normalization in SSM + MODEL_TENSOR.SSM_OUT, # Output projection # Pre-feedforward layernorm MODEL_TENSOR.FFN_PRE_NORM, - # Feed-forward network components - MODEL_TENSOR.FFN_GATE, # Gate projection (SwiGLU) - MODEL_TENSOR.FFN_DOWN, # Down projection - MODEL_TENSOR.FFN_UP, # Up projection - + MODEL_TENSOR.FFN_GATE, # Gate projection (SwiGLU) + MODEL_TENSOR.FFN_DOWN, # Down projection + MODEL_TENSOR.FFN_UP, # Up projection # Post-feedforward layernorm - MODEL_TENSOR.OUTPUT_NORM, # Final layer norm - MODEL_TENSOR.OUTPUT, # Output projection (lm_head) + MODEL_TENSOR.OUTPUT_NORM, # Final layer norm + MODEL_TENSOR.OUTPUT, # Output projection (lm_head) ], MODEL_ARCH.HUNYUAN_MOE: [ MODEL_TENSOR.TOKEN_EMBD, @@ -3531,7 +3531,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.ATTN_NORM, # operator_norm + MODEL_TENSOR.ATTN_NORM, # operator_norm MODEL_TENSOR.ATTN_Q_NORM, MODEL_TENSOR.ATTN_K_NORM, MODEL_TENSOR.ATTN_Q, @@ -3539,7 +3539,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.ATTN_V, MODEL_TENSOR.ATTN_OUT, MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.DENSE_2_OUT, # LFM2-ColBert-350M + MODEL_TENSOR.DENSE_2_OUT, # LFM2-ColBert-350M ], MODEL_ARCH.LFM2MOE: [ MODEL_TENSOR.TOKEN_EMBD, @@ -3551,7 +3551,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.ATTN_NORM, # operator_norm + MODEL_TENSOR.ATTN_NORM, # operator_norm MODEL_TENSOR.ATTN_Q_NORM, 
MODEL_TENSOR.ATTN_K_NORM, MODEL_TENSOR.ATTN_Q, @@ -3948,69 +3948,77 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { class TokenType(IntEnum): - NORMAL = 1 - UNKNOWN = 2 - CONTROL = 3 + NORMAL = 1 + UNKNOWN = 2 + CONTROL = 3 USER_DEFINED = 4 - UNUSED = 5 - BYTE = 6 + UNUSED = 5 + BYTE = 6 class RopeScalingType(Enum): - NONE = 'none' - LINEAR = 'linear' - YARN = 'yarn' - LONGROPE = 'longrope' + NONE = "none" + LINEAR = "linear" + YARN = "yarn" + LONGROPE = "longrope" class PoolingType(IntEnum): NONE = 0 MEAN = 1 - CLS = 2 + CLS = 2 LAST = 3 RANK = 4 class GGMLQuantizationType(IntEnum): - F32 = 0 - F16 = 1 - Q4_0 = 2 - Q4_1 = 3 - Q5_0 = 6 - Q5_1 = 7 - Q8_0 = 8 - Q8_1 = 9 - Q2_K = 10 - Q3_K = 11 - Q4_K = 12 - Q5_K = 13 - Q6_K = 14 - Q8_K = 15 IQ2_XXS = 16 - IQ2_XS = 17 IQ3_XXS = 18 - IQ1_S = 19 - IQ4_NL = 20 - IQ3_S = 21 - IQ2_S = 22 - IQ4_XS = 23 - I8 = 24 - I16 = 25 - I32 = 26 - I64 = 27 - F64 = 28 - IQ1_M = 29 - BF16 = 30 - TQ1_0 = 34 - TQ2_0 = 35 - MXFP4 = 39 - NVFP4 = 40 - Q1_0 = 41 + F32 = 0 + F16 = 1 + Q4_0 = 2 + Q4_1 = 3 + Q5_0 = 6 + Q5_1 = 7 + Q8_0 = 8 + Q8_1 = 9 + Q2_K = 10 + Q3_K = 11 + Q4_K = 12 + Q5_K = 13 + Q6_K = 14 + Q8_K = 15 IQ2_XXS = 16 + IQ2_XS = 17 IQ3_XXS = 18 + IQ1_S = 19 + IQ4_NL = 20 + IQ3_S = 21 + IQ2_S = 22 + IQ4_XS = 23 + I8 = 24 + I16 = 25 + I32 = 26 + I64 = 27 + F64 = 28 + IQ1_M = 29 + BF16 = 30 + TQ1_0 = 34 + TQ2_0 = 35 + MXFP4 = 39 + NVFP4 = 40 + Q1_0 = 41 + Q3_PT = 42 + Q3_KPT = 43 + Q4_DPT = 44 + Q2_DPT = 45 + Q2_KPT = 46 + IQ2_TQ = 47 + IQ3_TQ = 48 + IQ1_BN = 49
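+# NOTE (editorial): the Q3_PT..IQ1_BN values above (42..49) must mirror the GGML_TYPE_* additions in ggml.h; keep the two enums in sync, or GGUF files will not round-trip.
class ExpertGatingFuncType(IntEnum): - SOFTMAX = 1 - SIGMOID = 2 + SOFTMAX = 1 + SIGMOID = 2 # TODO: add GGMLFileType from ggml_ftype in ggml.h @@ -4019,49 +4027,49 @@ class ExpertGatingFuncType(IntEnum): # from llama_ftype in llama.h # ALL VALUES SHOULD BE THE SAME HERE AS THEY ARE OVER THERE.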
class LlamaFileType(IntEnum): - ALL_F32 = 0 - MOSTLY_F16 = 1 # except 1d tensors - MOSTLY_Q4_0 = 2 # except 1d tensors - MOSTLY_Q4_1 = 3 # except 1d tensors + ALL_F32 = 0 + MOSTLY_F16 = 1 # except 1d tensors + MOSTLY_Q4_0 = 2 # except 1d tensors + MOSTLY_Q4_1 = 3 # except 1d tensors # MOSTLY_Q4_1_SOME_F16 = 4 # tok_embeddings.weight and output.weight are F16 # MOSTLY_Q4_2 = 5 # support has been removed # MOSTLY_Q4_3 = 6 # support has been removed - MOSTLY_Q8_0 = 7 # except 1d tensors - MOSTLY_Q5_0 = 8 # except 1d tensors - MOSTLY_Q5_1 = 9 # except 1d tensors - MOSTLY_Q2_K = 10 # except 1d tensors - MOSTLY_Q3_K_S = 11 # except 1d tensors - MOSTLY_Q3_K_M = 12 # except 1d tensors - MOSTLY_Q3_K_L = 13 # except 1d tensors - MOSTLY_Q4_K_S = 14 # except 1d tensors - MOSTLY_Q4_K_M = 15 # except 1d tensors - MOSTLY_Q5_K_S = 16 # except 1d tensors - MOSTLY_Q5_K_M = 17 # except 1d tensors - MOSTLY_Q6_K = 18 # except 1d tensors - MOSTLY_IQ2_XXS = 19 # except 1d tensors - MOSTLY_IQ2_XS = 20 # except 1d tensors - MOSTLY_Q2_K_S = 21 # except 1d tensors - MOSTLY_IQ3_XS = 22 # except 1d tensors - MOSTLY_IQ3_XXS = 23 # except 1d tensors - MOSTLY_IQ1_S = 24 # except 1d tensors - MOSTLY_IQ4_NL = 25 # except 1d tensors - MOSTLY_IQ3_S = 26 # except 1d tensors - MOSTLY_IQ3_M = 27 # except 1d tensors - MOSTLY_IQ2_S = 28 # except 1d tensors - MOSTLY_IQ2_M = 29 # except 1d tensors - MOSTLY_IQ4_XS = 30 # except 1d tensors - MOSTLY_IQ1_M = 31 # except 1d tensors - MOSTLY_BF16 = 32 # except 1d tensors + MOSTLY_Q8_0 = 7 # except 1d tensors + MOSTLY_Q5_0 = 8 # except 1d tensors + MOSTLY_Q5_1 = 9 # except 1d tensors + MOSTLY_Q2_K = 10 # except 1d tensors + MOSTLY_Q3_K_S = 11 # except 1d tensors + MOSTLY_Q3_K_M = 12 # except 1d tensors + MOSTLY_Q3_K_L = 13 # except 1d tensors + MOSTLY_Q4_K_S = 14 # except 1d tensors + MOSTLY_Q4_K_M = 15 # except 1d tensors + MOSTLY_Q5_K_S = 16 # except 1d tensors + MOSTLY_Q5_K_M = 17 # except 1d tensors + MOSTLY_Q6_K = 18 # except 1d tensors + MOSTLY_IQ2_XXS = 19 # except 1d tensors + MOSTLY_IQ2_XS = 20 # except 1d tensors + MOSTLY_Q2_K_S = 21 # except 1d tensors + MOSTLY_IQ3_XS = 22 # except 1d tensors + MOSTLY_IQ3_XXS = 23 # except 1d tensors + MOSTLY_IQ1_S = 24 # except 1d tensors + MOSTLY_IQ4_NL = 25 # except 1d tensors + MOSTLY_IQ3_S = 26 # except 1d tensors + MOSTLY_IQ3_M = 27 # except 1d tensors + MOSTLY_IQ2_S = 28 # except 1d tensors + MOSTLY_IQ2_M = 29 # except 1d tensors + MOSTLY_IQ4_XS = 30 # except 1d tensors + MOSTLY_IQ1_M = 31 # except 1d tensors + MOSTLY_BF16 = 32 # except 1d tensors # MOSTLY_Q4_0_4_4 = 33 # removed from gguf files, use Q4_0 and runtime repack # MOSTLY_Q4_0_4_8 = 34 # removed from gguf files, use Q4_0 and runtime repack # MOSTLY_Q4_0_8_8 = 35 # removed from gguf files, use Q4_0 and runtime repack - MOSTLY_TQ1_0 = 36 # except 1d tensors - MOSTLY_TQ2_0 = 37 # except 1d tensors - MOSTLY_MXFP4_MOE = 38 # except 1d tensors - MOSTLY_NVFP4 = 39 # except 1d tensors - MOSTLY_Q1_0 = 40 # except 1d tensors + MOSTLY_TQ1_0 = 36 # except 1d tensors + MOSTLY_TQ2_0 = 37 # except 1d tensors + MOSTLY_MXFP4_MOE = 38 # except 1d tensors + MOSTLY_NVFP4 = 39 # except 1d tensors + MOSTLY_Q1_0 = 40 # except 1d tensors - GUESSED = 1024 # not specified in the model file + GUESSED = 1024 # not specified in the model file class GGUFEndian(IntEnum): @@ -4070,18 +4078,18 @@ class GGUFEndian(IntEnum): class GGUFValueType(IntEnum): - UINT8 = 0 - INT8 = 1 - UINT16 = 2 - INT16 = 3 - UINT32 = 4 - INT32 = 5 + UINT8 = 0 + INT8 = 1 + UINT16 = 2 + INT16 = 3 + UINT32 = 4 + INT32 = 5 FLOAT32 = 6 - 
BOOL = 7 - STRING = 8 - ARRAY = 9 - UINT64 = 10 - INT64 = 11 + BOOL = 7 + STRING = 8 + ARRAY = 9 + UINT64 = 10 + INT64 = 11 FLOAT64 = 12 @staticmethod @@ -4117,10 +4125,10 @@ class VisionProjectorType: STEP3VL = "step3vl" ULTRAVOX = "ultravox" INTERNVL = "internvl" - QWEN2A = "qwen2a" # audio - QWEN3A = "qwen3a" # audio - GLMA = "glma" # audio - QWEN25O = "qwen2.5o" # omni + QWEN2A = "qwen2a" # audio + QWEN3A = "qwen3a" # audio + GLMA = "glma" # audio + QWEN25O = "qwen2.5o" # omni VOXTRAL = "voxtral" MERALION = "meralion" # audio: Whisper + gated MLP adaptor LFM2 = "lfm2" @@ -4132,121 +4140,129 @@ class VisionProjectorType: JANUS_PRO = "janus_pro" DOTSOCR = "dots_ocr" DEEPSEEKOCR = "deepseekocr" - LFM2A = "lfm2a" # audio - MUSIC_FLAMINGO = "musicflamingo" # audio + LFM2A = "lfm2a" # audio + MUSIC_FLAMINGO = "musicflamingo" # audio GLM4V = "glm4v" YOUTUVL = "youtuvl" NEMOTRON_V2_VL = "nemotron_v2_vl" - HUNYUANOCR = "hunyuanocr" + HUNYUANOCR = "hunyuanocr" # Items here are (block size, type size) QK_K = 256 GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = { - GGMLQuantizationType.F32: (1, 4), - GGMLQuantizationType.F16: (1, 2), - GGMLQuantizationType.Q4_0: (32, 2 + 16), - GGMLQuantizationType.Q4_1: (32, 2 + 2 + 16), - GGMLQuantizationType.Q5_0: (32, 2 + 4 + 16), - GGMLQuantizationType.Q5_1: (32, 2 + 2 + 4 + 16), - GGMLQuantizationType.Q8_0: (32, 2 + 32), - GGMLQuantizationType.Q8_1: (32, 4 + 4 + 32), - GGMLQuantizationType.Q2_K: (256, 2 + 2 + QK_K // 16 + QK_K // 4), - GGMLQuantizationType.Q3_K: (256, 2 + QK_K // 4 + QK_K // 8 + 12), - GGMLQuantizationType.Q4_K: (256, 2 + 2 + QK_K // 2 + 12), - GGMLQuantizationType.Q5_K: (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12), - GGMLQuantizationType.Q6_K: (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16), - GGMLQuantizationType.Q8_K: (256, 4 + QK_K + QK_K // 8), + GGMLQuantizationType.F32: (1, 4), + GGMLQuantizationType.F16: (1, 2), + GGMLQuantizationType.Q4_0: (32, 2 + 16), + GGMLQuantizationType.Q4_1: (32, 2 + 2 + 16), + GGMLQuantizationType.Q5_0: (32, 2 + 4 + 16), + GGMLQuantizationType.Q5_1: (32, 2 + 2 + 4 + 16), + GGMLQuantizationType.Q8_0: (32, 2 + 32), + GGMLQuantizationType.Q8_1: (32, 4 + 4 + 32), + GGMLQuantizationType.Q2_K: (256, 2 + 2 + QK_K // 16 + QK_K // 4), + GGMLQuantizationType.Q3_K: (256, 2 + QK_K // 4 + QK_K // 8 + 12), + GGMLQuantizationType.Q4_K: (256, 2 + 2 + QK_K // 2 + 12), + GGMLQuantizationType.Q5_K: (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12), + GGMLQuantizationType.Q6_K: (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16), + GGMLQuantizationType.Q8_K: (256, 4 + QK_K + QK_K // 8), GGMLQuantizationType.IQ2_XXS: (256, 2 + QK_K // 4), - GGMLQuantizationType.IQ2_XS: (256, 2 + QK_K // 4 + QK_K // 32), + GGMLQuantizationType.IQ2_XS: (256, 2 + QK_K // 4 + QK_K // 32), GGMLQuantizationType.IQ3_XXS: (256, 2 + QK_K // 4 + QK_K // 8), - GGMLQuantizationType.IQ1_S: (256, 2 + QK_K // 8 + QK_K // 16), - GGMLQuantizationType.IQ4_NL: (32, 2 + 16), - GGMLQuantizationType.IQ3_S: (256, 2 + QK_K // 4 + QK_K // 8 + QK_K // 32 + 4), - GGMLQuantizationType.IQ2_S: (256, 2 + QK_K // 4 + QK_K // 16), - GGMLQuantizationType.IQ4_XS: (256, 2 + 2 + QK_K // 2 + QK_K // 64), - GGMLQuantizationType.I8: (1, 1), - GGMLQuantizationType.I16: (1, 2), - GGMLQuantizationType.I32: (1, 4), - GGMLQuantizationType.I64: (1, 8), - GGMLQuantizationType.F64: (1, 8), - GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32), - GGMLQuantizationType.BF16: (1, 2), - GGMLQuantizationType.TQ1_0: (256, 2 + 4 * 13), - GGMLQuantizationType.TQ2_0: (256, 2 + 64), - 
GGMLQuantizationType.MXFP4: (32, 1 + 16), - GGMLQuantizationType.NVFP4: (64, 4 + 32), - GGMLQuantizationType.Q1_0: (128, 2 + 16), + GGMLQuantizationType.IQ1_S: (256, 2 + QK_K // 8 + QK_K // 16), + GGMLQuantizationType.IQ4_NL: (32, 2 + 16), + GGMLQuantizationType.IQ3_S: (256, 2 + QK_K // 4 + QK_K // 8 + QK_K // 32 + 4), + GGMLQuantizationType.IQ2_S: (256, 2 + QK_K // 4 + QK_K // 16), + GGMLQuantizationType.IQ4_XS: (256, 2 + 2 + QK_K // 2 + QK_K // 64), + GGMLQuantizationType.I8: (1, 1), + GGMLQuantizationType.I16: (1, 2), + GGMLQuantizationType.I32: (1, 4), + GGMLQuantizationType.I64: (1, 8), + GGMLQuantizationType.F64: (1, 8), + GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32), + GGMLQuantizationType.BF16: (1, 2), + GGMLQuantizationType.TQ1_0: (256, 2 + 4 * 13), + GGMLQuantizationType.TQ2_0: (256, 2 + 64), + GGMLQuantizationType.MXFP4: (32, 1 + 16), + GGMLQuantizationType.NVFP4: (64, 4 + 32), + GGMLQuantizationType.Q1_0: (128, 2 + 16), + GGMLQuantizationType.Q3_PT: (256, 124), + GGMLQuantizationType.Q3_KPT: (256, 110), + GGMLQuantizationType.Q4_DPT: (32, 18), + GGMLQuantizationType.Q2_DPT: (32, 10), + GGMLQuantizationType.Q2_KPT: (256, 84), + GGMLQuantizationType.IQ2_TQ: (256, 82), + GGMLQuantizationType.IQ3_TQ: (256, 114), + GGMLQuantizationType.IQ1_BN: (256, 50), } # Aliases for backward compatibility. # general -KEY_GENERAL_ARCHITECTURE = Keys.General.ARCHITECTURE +KEY_GENERAL_ARCHITECTURE = Keys.General.ARCHITECTURE KEY_GENERAL_QUANTIZATION_VERSION = Keys.General.QUANTIZATION_VERSION -KEY_GENERAL_ALIGNMENT = Keys.General.ALIGNMENT -KEY_GENERAL_NAME = Keys.General.NAME -KEY_GENERAL_AUTHOR = Keys.General.AUTHOR -KEY_GENERAL_URL = Keys.General.URL -KEY_GENERAL_DESCRIPTION = Keys.General.DESCRIPTION -KEY_GENERAL_LICENSE = Keys.General.LICENSE -KEY_GENERAL_SOURCE_URL = Keys.General.SOURCE_URL -KEY_GENERAL_FILE_TYPE = Keys.General.FILE_TYPE +KEY_GENERAL_ALIGNMENT = Keys.General.ALIGNMENT +KEY_GENERAL_NAME = Keys.General.NAME +KEY_GENERAL_AUTHOR = Keys.General.AUTHOR +KEY_GENERAL_URL = Keys.General.URL +KEY_GENERAL_DESCRIPTION = Keys.General.DESCRIPTION +KEY_GENERAL_LICENSE = Keys.General.LICENSE +KEY_GENERAL_SOURCE_URL = Keys.General.SOURCE_URL +KEY_GENERAL_FILE_TYPE = Keys.General.FILE_TYPE # LLM -KEY_VOCAB_SIZE = Keys.LLM.VOCAB_SIZE -KEY_CONTEXT_LENGTH = Keys.LLM.CONTEXT_LENGTH -KEY_EMBEDDING_LENGTH = Keys.LLM.EMBEDDING_LENGTH -KEY_BLOCK_COUNT = Keys.LLM.BLOCK_COUNT -KEY_FEED_FORWARD_LENGTH = Keys.LLM.FEED_FORWARD_LENGTH +KEY_VOCAB_SIZE = Keys.LLM.VOCAB_SIZE +KEY_CONTEXT_LENGTH = Keys.LLM.CONTEXT_LENGTH +KEY_EMBEDDING_LENGTH = Keys.LLM.EMBEDDING_LENGTH +KEY_BLOCK_COUNT = Keys.LLM.BLOCK_COUNT +KEY_FEED_FORWARD_LENGTH = Keys.LLM.FEED_FORWARD_LENGTH KEY_USE_PARALLEL_RESIDUAL = Keys.LLM.USE_PARALLEL_RESIDUAL -KEY_TENSOR_DATA_LAYOUT = Keys.LLM.TENSOR_DATA_LAYOUT +KEY_TENSOR_DATA_LAYOUT = Keys.LLM.TENSOR_DATA_LAYOUT # attention -KEY_ATTENTION_HEAD_COUNT = Keys.Attention.HEAD_COUNT -KEY_ATTENTION_HEAD_COUNT_KV = Keys.Attention.HEAD_COUNT_KV -KEY_ATTENTION_MAX_ALIBI_BIAS = Keys.Attention.MAX_ALIBI_BIAS -KEY_ATTENTION_CLAMP_KQV = Keys.Attention.CLAMP_KQV -KEY_ATTENTION_LAYERNORM_EPS = Keys.Attention.LAYERNORM_EPS +KEY_ATTENTION_HEAD_COUNT = Keys.Attention.HEAD_COUNT +KEY_ATTENTION_HEAD_COUNT_KV = Keys.Attention.HEAD_COUNT_KV +KEY_ATTENTION_MAX_ALIBI_BIAS = Keys.Attention.MAX_ALIBI_BIAS +KEY_ATTENTION_CLAMP_KQV = Keys.Attention.CLAMP_KQV +KEY_ATTENTION_LAYERNORM_EPS = Keys.Attention.LAYERNORM_EPS KEY_ATTENTION_LAYERNORM_RMS_EPS = Keys.Attention.LAYERNORM_RMS_EPS # RoPE 
-KEY_ROPE_DIMENSION_COUNT = Keys.Rope.DIMENSION_COUNT -KEY_ROPE_FREQ_BASE = Keys.Rope.FREQ_BASE -KEY_ROPE_SCALING_TYPE = Keys.Rope.SCALING_TYPE -KEY_ROPE_SCALING_FACTOR = Keys.Rope.SCALING_FACTOR -KEY_ROPE_SCALING_ORIG_CTX_LEN = Keys.Rope.SCALING_ORIG_CTX_LEN -KEY_ROPE_SCALING_FINETUNED = Keys.Rope.SCALING_FINETUNED +KEY_ROPE_DIMENSION_COUNT = Keys.Rope.DIMENSION_COUNT +KEY_ROPE_FREQ_BASE = Keys.Rope.FREQ_BASE +KEY_ROPE_SCALING_TYPE = Keys.Rope.SCALING_TYPE +KEY_ROPE_SCALING_FACTOR = Keys.Rope.SCALING_FACTOR +KEY_ROPE_SCALING_ORIG_CTX_LEN = Keys.Rope.SCALING_ORIG_CTX_LEN +KEY_ROPE_SCALING_FINETUNED = Keys.Rope.SCALING_FINETUNED # SSM -KEY_SSM_CONV_KERNEL = Keys.SSM.CONV_KERNEL -KEY_SSM_INNER_SIZE = Keys.SSM.INNER_SIZE -KEY_SSM_STATE_SIZE = Keys.SSM.STATE_SIZE +KEY_SSM_CONV_KERNEL = Keys.SSM.CONV_KERNEL +KEY_SSM_INNER_SIZE = Keys.SSM.INNER_SIZE +KEY_SSM_STATE_SIZE = Keys.SSM.STATE_SIZE KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK -KEY_SSM_GROUP_COUNT = Keys.SSM.GROUP_COUNT -KEY_SSM_DT_B_C_RMS = Keys.SSM.DT_B_C_RMS +KEY_SSM_GROUP_COUNT = Keys.SSM.GROUP_COUNT +KEY_SSM_DT_B_C_RMS = Keys.SSM.DT_B_C_RMS # KDA -KEY_KDA_HEAD_DIM = Keys.KDA.HEAD_DIM +KEY_KDA_HEAD_DIM = Keys.KDA.HEAD_DIM # tokenization -KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL -KEY_TOKENIZER_PRE = Keys.Tokenizer.PRE -KEY_TOKENIZER_LIST = Keys.Tokenizer.LIST +KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL +KEY_TOKENIZER_PRE = Keys.Tokenizer.PRE +KEY_TOKENIZER_LIST = Keys.Tokenizer.LIST KEY_TOKENIZER_TOKEN_TYPE = Keys.Tokenizer.TOKEN_TYPE -KEY_TOKENIZER_SCORES = Keys.Tokenizer.SCORES -KEY_TOKENIZER_MERGES = Keys.Tokenizer.MERGES -KEY_TOKENIZER_BOS_ID = Keys.Tokenizer.BOS_ID -KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID -KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID -KEY_TOKENIZER_EOM_ID = Keys.Tokenizer.EOM_ID -KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID -KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID -KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID -KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID -KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON -KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV +KEY_TOKENIZER_SCORES = Keys.Tokenizer.SCORES +KEY_TOKENIZER_MERGES = Keys.Tokenizer.MERGES +KEY_TOKENIZER_BOS_ID = Keys.Tokenizer.BOS_ID +KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID +KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID +KEY_TOKENIZER_EOM_ID = Keys.Tokenizer.EOM_ID +KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID +KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID +KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID +KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID +KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON +KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV KEY_TOKENIZER_FIM_PRE_ID = Keys.Tokenizer.FIM_PRE_ID KEY_TOKENIZER_FIM_SUF_ID = Keys.Tokenizer.FIM_SUF_ID @@ -4256,6 +4272,6 @@ KEY_TOKENIZER_FIM_REP_ID = Keys.Tokenizer.FIM_REP_ID KEY_TOKENIZER_FIM_SEP_ID = Keys.Tokenizer.FIM_SEP_ID # deprecated -KEY_TOKENIZER_PREFIX_ID = Keys.Tokenizer.PREFIX_ID -KEY_TOKENIZER_SUFFIX_ID = Keys.Tokenizer.SUFFIX_ID -KEY_TOKENIZER_MIDDLE_ID = Keys.Tokenizer.MIDDLE_ID +KEY_TOKENIZER_PREFIX_ID = Keys.Tokenizer.PREFIX_ID +KEY_TOKENIZER_SUFFIX_ID = Keys.Tokenizer.SUFFIX_ID +KEY_TOKENIZER_MIDDLE_ID = Keys.Tokenizer.MIDDLE_ID diff --git a/include/llama.h b/include/llama.h index ac267b5089..434679d3f0 100644 --- a/include/llama.h +++ b/include/llama.h @@ -155,6 +155,14 @@ extern "C" { LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors LLAMA_FTYPE_MOSTLY_NVFP4 = 39, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q1_0 = 40, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q3_PT = 41, // except 1d 
tensors
+    LLAMA_FTYPE_MOSTLY_Q3_KPT        = 42, // except 1d tensors
+    LLAMA_FTYPE_MOSTLY_Q4_DPT        = 43, // except 1d tensors
+    LLAMA_FTYPE_MOSTLY_Q2_KPT        = 44, // except 1d tensors
+    LLAMA_FTYPE_MOSTLY_IQ2_TQ        = 45, // except 1d tensors, trellis quantized with RNG codebook
+    LLAMA_FTYPE_MOSTLY_IQ3_TQ        = 46, // except 1d tensors, 3-bit with per-tensor trained grid
+    LLAMA_FTYPE_MOSTLY_IQ1_BN        = 47, // except 1d tensors, 8D vector quantized with trained codebook
+
     LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
 };
diff --git a/pocs/vdot/q8dot.cpp b/pocs/vdot/q8dot.cpp
index 3df6e1f421..bdf6414aad 100644
--- a/pocs/vdot/q8dot.cpp
+++ b/pocs/vdot/q8dot.cpp
@@ -157,8 +157,8 @@ int main(int argc, char** argv) {
         t1 = std::chrono::high_resolution_clock::now();
         float fs;
-        if (type == 0) funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1);
-        else funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1);
+        if (type == 0) funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1, nullptr);
+        else funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1, nullptr);
         t2 = std::chrono::high_resolution_clock::now();
         t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
         if (iloop > 3) ggml.addResult(fs, t);
diff --git a/pocs/vdot/vdot.cpp b/pocs/vdot/vdot.cpp
index 2dca62848b..a78fabc28c 100644
--- a/pocs/vdot/vdot.cpp
+++ b/pocs/vdot/vdot.cpp
@@ -285,8 +285,8 @@ int main(int argc, char** argv) {
             else {
                 const auto * vdot = ggml_get_type_traits_cpu(funcs_cpu->vec_dot_type);
                 vdot->from_float(y1.data(), q8.data(), kVecSize);
-                if (useQ4_1) funcs_cpu->vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
-                else funcs_cpu->vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
+                if (useQ4_1) funcs_cpu->vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1, nullptr);
+                else funcs_cpu->vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1, nullptr);
             }
             sumq += result;
            t2 = std::chrono::high_resolution_clock::now();
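Bits per weight for the new types fall straight out of the `(block_size, type_size)` pairs registered in `GGML_QUANT_SIZES` above: bpw = type_size × 8 / block_size. A quick sanity check of the new entries (illustrative snippet, not part of the patch):

```python
# bpw implied by the new GGML_QUANT_SIZES entries: type_size * 8 / block_size
new_types = {
    "Q3_PT":  (256, 124),  # -> 3.875 bpw
    "Q3_KPT": (256, 110),  # -> 3.4375 bpw
    "Q2_DPT": (32,  10),   # -> 2.5 bpw
    "Q2_KPT": (256, 84),   # -> 2.625 bpw
    "IQ3_TQ": (256, 114),  # -> 3.5625 bpw
    "IQ1_BN": (256, 50),   # -> 1.5625 bpw
}
for name, (block_size, type_size) in new_types.items():
    print(f"{name}: {type_size * 8 / block_size} bpw")
```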
+""" + +import numpy as np +import struct +import sys +import os + +DATA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "data") + + +def load_f32_tensor(name): + path = os.path.join(DATA_DIR, name) + with open(path, "rb") as f: + nrow, ncol = struct.unpack("qq", f.read(16)) + data = np.frombuffer(f.read(), dtype=np.float32) + assert len(data) == nrow * ncol, f"Expected {nrow * ncol}, got {len(data)}" + return data.reshape(nrow, ncol) + + +def stats(label, arr): + """Print comprehensive statistics for a flat array.""" + a = arr.ravel() + print(f" {label}:") + print(f" shape={arr.shape}, n={len(a)}") + print(f" mean={a.mean():.6f}, std={a.std():.6f}") + print(f" min={a.min():.6f}, max={a.max():.6f}") + print(f" median={np.median(a):.6f}") + print( + f" |mean|/std = {abs(a.mean()) / (a.std() + 1e-10):.4f} (offset-to-spread ratio)" + ) + # Kurtosis (excess) - how heavy-tailed vs Gaussian + kurt = np.mean(((a - a.mean()) / (a.std() + 1e-10)) ** 4) - 3.0 + # Skewness + skew = np.mean(((a - a.mean()) / (a.std() + 1e-10)) ** 3) + print(f" skewness={skew:.4f}, excess_kurtosis={kurt:.4f}") + # Percentile ranges + pcts = np.percentile(a, [0.1, 1, 5, 25, 50, 75, 95, 99, 99.9]) + print( + f" percentiles: 0.1%={pcts[0]:.4f}, 1%={pcts[1]:.4f}, 5%={pcts[2]:.4f}, " + f"25%={pcts[3]:.4f}, 50%={pcts[4]:.4f}, 75%={pcts[5]:.4f}, " + f"95%={pcts[6]:.4f}, 99%={pcts[7]:.4f}, 99.9%={pcts[8]:.4f}" + ) + # Sparsity + near_zero = np.sum(np.abs(a) < 0.001 * a.std()) / len(a) + print(f" fraction |x| < 0.001*std: {near_zero:.4f}") + return { + "mean": a.mean(), + "std": a.std(), + "skew": skew, + "kurt": kurt, + "min": a.min(), + "max": a.max(), + } + + +# ============================================================================ +# 1. BASIC WEIGHT TENSOR COMPARISON +# ============================================================================ +print("=" * 80) +print("SECTION 1: WEIGHT TENSOR GLOBAL STATISTICS") +print("=" * 80) + +tensors = { + "ffn_gate": ("blk_0_ffn_gate_weight.f32bin", "9728x2560 (wide→narrow proj)"), + "ffn_up": ("blk_0_ffn_up_weight.f32bin", "9728x2560 (wide→narrow proj)"), + "ffn_down": ("blk_0_ffn_down_weight.f32bin", "2560x9728 (narrow→wide proj)"), + "attn_q": ("blk_0_attn_q_weight.f32bin", "4096x2560"), + "attn_k": ("blk_0_attn_k_weight.f32bin", "1024x2560"), + "attn_v": ("blk_0_attn_v_weight.f32bin", "1024x2560"), + "attn_out": ("blk_0_attn_output_weight.f32bin", "2560x4096"), +} + +weight_data = {} +for name, (fname, desc) in tensors.items(): + try: + W = load_f32_tensor(fname) + print(f"\n{'─' * 70}") + print(f" {name} [{desc}] — file: {fname}") + weight_data[name] = W + stats(name, W) + except Exception as e: + print(f" {name}: SKIP ({e})") + +# ============================================================================ +# 2. 
ROW-LEVEL STATISTICS (each row is a neuron output) +# ============================================================================ +print("\n" + "=" * 80) +print("SECTION 2: ROW-LEVEL VARIABILITY (per-neuron weight statistics)") +print("=" * 80) +print(" Each row of the weight matrix produces one output dimension.") +print(" High row-to-row variability in mean/std means the quantizer") +print(" must handle very different distributions across rows.\n") + +for name, W in weight_data.items(): + row_means = W.mean(axis=1) + row_stds = W.std(axis=1) + row_ranges = W.max(axis=1) - W.min(axis=1) + + print(f"\n {name} ({W.shape[0]} rows × {W.shape[1]} cols):") + print( + f" Row means: mean={row_means.mean():.6f}, std={row_means.std():.6f}, " + f"range=[{row_means.min():.6f}, {row_means.max():.6f}]" + ) + print( + f" Row stds: mean={row_stds.mean():.6f}, std={row_stds.std():.6f}, " + f"range=[{row_stds.min():.6f}, {row_stds.max():.6f}]" + ) + print(f" Row ranges: mean={row_ranges.mean():.6f}, std={row_ranges.std():.6f}") + print( + f" RowMeans CV (std/mean): {row_means.std() / (abs(row_means.mean()) + 1e-10):.4f}" + ) + print(f" RowStds CV: {row_stds.std() / (row_stds.mean() + 1e-10):.4f}") + +# ============================================================================ +# 3. GROUP-LEVEL ANALYSIS (16-element groups, like Q2_K) +# ============================================================================ +print("\n" + "=" * 80) +print("SECTION 3: GROUP-LEVEL ANALYSIS (16-element groups)") +print("=" * 80) +print(" Quantization works on 16-element groups. Key question:") +print(" How much does each group need its own OFFSET (dmin)?\n") + +GS = 16 + +for name, W in weight_data.items(): + # Look at first 256 rows for speed + nr = min(W.shape[0], 256) + nc = W.shape[1] + + group_means = [] + group_stds = [] + group_ranges = [] + group_offsets = [] # |mean| / range — how important is the offset + + for r in range(nr): + for g_start in range(0, nc, GS): + g = W[r, g_start : g_start + GS] + gm = g.mean() + gs = g.std() + gr = g.max() - g.min() + gmin = g.min() + + group_means.append(gm) + group_stds.append(gs) + group_ranges.append(gr) + # Offset importance: how large is the group mean relative to its range? + # If this is high, offset (dmin) matters a lot + if gr > 1e-10: + group_offsets.append(abs(gm) / gr) + else: + group_offsets.append(0) + + gm = np.array(group_means) + gs = np.array(group_stds) + gr = np.array(group_ranges) + go = np.array(group_offsets) + + print(f"\n {name} ({len(group_means)} groups):") + print( + f" Group mean: mean={gm.mean():.6f}, std={gm.std():.6f}, " + f"range=[{gm.min():.6f}, {gm.max():.6f}]" + ) + print(f" Group std: mean={gs.mean():.6f}, std={gs.std():.6f}") + print(f" Group range: mean={gr.mean():.6f}, std={gr.std():.6f}") + print(f" *** OFFSET IMPORTANCE (|group_mean| / range) ***") + print( + f" mean={go.mean():.4f}, median={np.median(go):.4f}, " + f"p90={np.percentile(go, 90):.4f}, max={go.max():.4f}" + ) + print(f" fraction with offset > 0.1: {np.mean(go > 0.1):.3f}") + print(f" fraction with offset > 0.2: {np.mean(go > 0.2):.3f}") + print(f" fraction with offset > 0.3: {np.mean(go > 0.3):.3f}") + + # How well does zeroing the min (Q2_K style, clamping min to 0) work? 
+    # vs keeping the actual min
+    mse_no_offset = 0    # Assume uniform 4 levels [0,1,2,3] * scale
+    mse_with_offset = 0  # Assume uniform 4 levels [0,1,2,3] * scale + offset
+
+    for r in range(nr):
+        for g_start in range(0, nc, GS):
+            g = W[r, g_start : g_start + GS]
+            gmin = g.min()
+            gmax = g.max()
+            gr = gmax - gmin
+            if gr < 1e-10:
+                continue
+
+            # No offset: levels are scale * {0,1,2,3} with scale = max/3,
+            # i.e. the group min is clamped to 0 and any negative offset is lost
+            scale_no = gmax / 3.0
+
+            # Better: use actual min/max
+            scale_w = gr / 3.0
+            min_w = gmin
+
+            for val in g:
+                # No offset quantization
+                norm_no = val / (scale_no + 1e-10)
+                idx_no = max(0, min(3, int(round(norm_no))))
+                recon_no = scale_no * idx_no
+                mse_no_offset += (val - recon_no) ** 2
+
+                # With offset quantization
+                norm_w = (val - min_w) / (scale_w + 1e-10)
+                idx_w = max(0, min(3, int(round(norm_w))))
+                recon_w = min_w + scale_w * idx_w
+                mse_with_offset += (val - recon_w) ** 2
+
+    total_elements = nr * nc
+    rmse_no = np.sqrt(mse_no_offset / total_elements)
+    rmse_w = np.sqrt(mse_with_offset / total_elements)
+    improvement = (rmse_no - rmse_w) / rmse_no * 100
+    print(f"    Quant RMSE (no offset):   {rmse_no:.6f}")
+    print(f"    Quant RMSE (with offset): {rmse_w:.6f}")
+    print(f"    Offset benefit: {improvement:.1f}% RMSE reduction")
+
+# ============================================================================
+# 4. ACTIVATION ANALYSIS
+# ============================================================================
+print("\n" + "=" * 80)
+print("SECTION 4: ACTIVATION DISTRIBUTION COMPARISON")
+print("=" * 80)
+
+activations = {
+    "ffn_input (gate/up)": "act_blk0_ffn_input.f32bin",
+    "ffn_down_input (swiglu)": "act_blk0_ffn_down_input.f32bin",
+    "attn_input (q/k/v)": "act_blk0_attn_input.f32bin",
+    "attn_output_input": "act_blk0_attn_output_input.f32bin",
+}
+
+act_data = {}
+for name, fname in activations.items():
+    try:
+        A = load_f32_tensor(fname)
+        act_data[name] = A
+        print(f"\n{'─' * 70}")
+        print(f" {name} — {fname}")
+        stats(name, A)
+    except Exception as e:
+        print(f" {name}: SKIP ({e})")
+
+# ============================================================================
+# 5. THE CRITICAL QUESTION: PER-DIMENSION ACTIVATION MAGNITUDE
+# ============================================================================
+print("\n" + "=" * 80)
+print("SECTION 5: PER-DIMENSION ACTIVATION POWER (per-column RMS)")
+print("=" * 80)
+print("  If activation dimensions have very different magnitudes,")
+print("  the quantization error in each weight dimension is weighted differently.")
+print("  Dimensions with high activation power amplify weight errors.\n")
+
+for name, A in act_data.items():
+    col_rms = np.sqrt(np.mean(A**2, axis=0))  # RMS per column (dimension)
+    print(f"\n {name} ({A.shape[1]} dimensions):")
+    print(f"   Col RMS: mean={col_rms.mean():.6f}, std={col_rms.std():.6f}")
+    print(f"   Col RMS range: [{col_rms.min():.6f}, {col_rms.max():.6f}]")
+    print(f"   Col RMS CV (std/mean): {col_rms.std() / (col_rms.mean() + 1e-10):.4f}")
+    print(f"   Max/Min ratio: {col_rms.max() / (col_rms.min() + 1e-10):.1f}x")
+
+    # Top 10 and bottom 10 dimensions by power
+    top10 = np.argsort(col_rms)[-10:][::-1]
+    bot10 = np.argsort(col_rms)[:10]
+    print(
+        f"   Top-10 dims by RMS: {[(int(d), f'{col_rms[d]:.4f}') for d in top10[:5]]}..."
+    )
+    print(
+        f"   Bot-10 dims by RMS: {[(int(d), f'{col_rms[d]:.4f}') for d in bot10[:5]]}..."
+ ) + + # How much do the top 10% of dimensions contribute to total power? + total_power = np.sum(col_rms**2) + sorted_power = np.sort(col_rms**2)[::-1] + top10pct = int(len(col_rms) * 0.1) + top10pct_power = np.sum(sorted_power[:top10pct]) + top1pct = max(1, int(len(col_rms) * 0.01)) + top1pct_power = np.sum(sorted_power[:top1pct]) + print( + f" Top 10% of dims contribute {top10pct_power / total_power * 100:.1f}% of total power" + ) + print( + f" Top 1% of dims contribute {top1pct_power / total_power * 100:.1f}% of total power" + ) + +# ============================================================================ +# 6. CROSS-CORRELATION: WEIGHT ERROR × ACTIVATION POWER +# ============================================================================ +print("\n" + "=" * 80) +print("SECTION 6: WHERE DO WEIGHT ERRORS MEET HIGH ACTIVATION POWER?") +print("=" * 80) +print(" For each weight dimension, compute: activation_rms[dim] × weight_error[dim]") +print(" This tells us which dimensions contribute most to matmul error.\n") + +# Focus on ffn_down vs ffn_gate for comparison +focus = [ + ("ffn_down", "blk_0_ffn_down_weight.f32bin", "act_blk0_ffn_down_input.f32bin"), + ("ffn_gate", "blk_0_ffn_gate_weight.f32bin", "act_blk0_ffn_input.f32bin"), + ("ffn_up", "blk_0_ffn_up_weight.f32bin", "act_blk0_ffn_input.f32bin"), + ("attn_q", "blk_0_attn_q_weight.f32bin", "act_blk0_attn_input.f32bin"), +] + +for name, wfile, afile in focus: + W = load_f32_tensor(wfile) + A = load_f32_tensor(afile) + + if W.shape[1] != A.shape[1]: + print(f" {name}: dim mismatch W={W.shape[1]} vs A={A.shape[1]}, SKIP") + continue + + nc = W.shape[1] + + # Per-column activation RMS + act_rms = np.sqrt(np.mean(A**2, axis=0)) + + # Per-column weight std and range (how "hard" to quantize) + w_std = W.std(axis=0) + w_range = W.max(axis=0) - W.min(axis=0) + + # Per-column weight kurtosis (heavy tails = harder to quantize) + w_kurt = ( + np.mean(((W - W.mean(axis=0)) / (W.std(axis=0) + 1e-10)) ** 4, axis=0) - 3.0 + ) + + # Weight error proxy: with 2-bit uniform quant on 16-element groups + # Higher variance columns → more error + nr = min(W.shape[0], 256) + + # Simple Q2_K-style error estimate per dimension: + # For each group of 16 in the column direction, quantize and measure error + dim_mse = np.zeros(nc) + for g_start in range(0, nc, GS): + g_end = min(g_start + GS, nc) + for r in range(nr): + g = W[r, g_start:g_end] + gmin = min(g.min(), 0) # Q2_K clamps min to ≤0 + gmax = g.max() + gr = gmax - gmin + if gr < 1e-10: + continue + scale = gr / 3.0 + for i, val in enumerate(g): + norm = (val - gmin) / scale + idx = max(0, min(3, int(round(norm)))) + recon = gmin + scale * idx + dim_mse[g_start + i] += (val - recon) ** 2 + + dim_rmse = np.sqrt(dim_mse / nr) + + # The key metric: dimension-level contribution to matmul error + # matmul_error_contribution[d] ≈ act_rms[d] * weight_rmse[d] + matmul_contrib = act_rms * dim_rmse + + print(f"\n {name} ({nc} dimensions):") + print( + f" act_rms: mean={act_rms.mean():.4f}, CV={act_rms.std() / act_rms.mean():.4f}" + ) + print( + f" w_rmse: mean={dim_rmse.mean():.6f}, CV={dim_rmse.std() / (dim_rmse.mean() + 1e-10):.4f}" + ) + print( + f" matmul_contrib: mean={matmul_contrib.mean():.6f}, " + f"std={matmul_contrib.std():.6f}" + ) + + # Correlation between activation power and weight error + corr = np.corrcoef(act_rms, dim_rmse)[0, 1] + print(f" CORRELATION act_rms ↔ weight_rmse: {corr:.4f}") + print(f" (>0 means high-power dims are also hard to quantize — BAD)") + + # Top contributors to matmul error 
+    top_dims = np.argsort(matmul_contrib)[-20:][::-1]
+    print("   Top-5 error-contributing dimensions:")
+    for d in top_dims[:5]:
+        print(
+            f"     dim {d}: act_rms={act_rms[d]:.4f}, w_rmse={dim_rmse[d]:.6f}, "
+            f"contrib={matmul_contrib[d]:.6f}, w_std={w_std[d]:.6f}, w_kurt={w_kurt[d]:.2f}"
+        )
+
+    # Distribution of matmul contributions
+    total_contrib = matmul_contrib.sum()
+    sorted_contrib = np.sort(matmul_contrib)[::-1]
+    for pct in [0.01, 0.05, 0.10, 0.25]:
+        n = max(1, int(nc * pct))
+        print(
+            f"   Top {pct * 100:.0f}% dims: {sorted_contrib[:n].sum() / total_contrib * 100:.1f}% "
+            f"of total matmul error"
+        )
+
+# ============================================================================
+# 7. THE STRUCTURAL ASYMMETRY: COLUMN DIRECTION GROUP ANALYSIS
+# ============================================================================
+print("\n" + "=" * 80)
+print("SECTION 7: STRUCTURAL ASYMMETRY — COLUMN vs ROW GROUPING")
+print("=" * 80)
+print("  Quantization groups along the ROW (inner dim). For ffn_down,")
+print("  each row has 9728 elements (38 groups of 256).")
+print("  For ffn_gate, each row has 2560 elements (10 groups of 256).")
+print("  More groups = more metadata (scales/offsets) relative to data bits.\n")
+
+for name, wfile, afile in focus:
+    W = load_f32_tensor(wfile)
+    nc = W.shape[1]
+    n_groups_per_row = nc // 256  # super-blocks per row
+
+    print(f"\n {name}: {nc} cols → {n_groups_per_row} super-blocks per row")
+    print(f"   Groups per row: {nc // GS} (16-element groups)")
+    print(
+        f"   With Q2_K (2.625 bpw): {n_groups_per_row * 20} scale+offset bytes per row"
+    )
+
+    # How much do group means vary WITHIN a row?
+    nr = min(W.shape[0], 64)
+    intra_row_mean_var = []
+    for r in range(nr):
+        group_means = []
+        for g_start in range(0, nc, GS):
+            group_means.append(W[r, g_start : g_start + GS].mean())
+        group_means = np.array(group_means)
+        intra_row_mean_var.append(group_means.std())
+
+    print(
+        f"   Intra-row group mean variability (avg across rows): "
+        f"mean={np.mean(intra_row_mean_var):.6f}"
+    )
+
+    # How much does the sign of group means vary?
+    pos_frac = 0
+    neg_frac = 0
+    total_groups = 0
+    for r in range(nr):
+        for g_start in range(0, nc, GS):
+            gm = W[r, g_start : g_start + GS].mean()
+            if gm > 0.001:
+                pos_frac += 1
+            elif gm < -0.001:
+                neg_frac += 1
+            total_groups += 1
+    print(
+        f"   Group mean sign: {pos_frac / total_groups * 100:.1f}% positive, "
+        f"{neg_frac / total_groups * 100:.1f}% negative, "
+        f"{(1 - pos_frac / total_groups - neg_frac / total_groups) * 100:.1f}% near-zero"
+    )
+
+# ============================================================================
+# 8.
THE SWIGLU EFFECT: WHY ffn_down INPUT IS SPECIAL +# ============================================================================ +print("\n" + "=" * 80) +print("SECTION 8: THE SWIGLU EFFECT — ffn_down ACTIVATION STRUCTURE") +print("=" * 80) +print(" ffn_down's activation is the SwiGLU output: silu(gate) * up") +print(" This creates a specific activation pattern that differs from") +print(" raw FFN input (RMSNorm output).\n") + +if "ffn_input (gate/up)" in act_data and "ffn_down_input (swiglu)" in act_data: + A_in = act_data["ffn_input (gate/up)"] + A_swiglu = act_data["ffn_down_input (swiglu)"] + + print(f" FFN input (RMSNorm output): {A_in.shape}") + print(f" SwiGLU output: {A_swiglu.shape}") + + # Per-token analysis + for t in range(min(A_swiglu.shape[0], 3)): + tok_in = A_in[t] + tok_sw = A_swiglu[t] + print(f"\n Token {t}:") + print( + f" FFN input: mean={tok_in.mean():.6f}, std={tok_in.std():.6f}, " + f"|max|={np.abs(tok_in).max():.6f}" + ) + print( + f" SwiGLU out: mean={tok_sw.mean():.6f}, std={tok_sw.std():.6f}, " + f"|max|={np.abs(tok_sw).max():.6f}" + ) + + # SwiGLU creates lots of near-zero values (silu suppresses negatives) + frac_nearzero_sw = np.mean(np.abs(tok_sw) < 0.01 * tok_sw.std()) + frac_nearzero_in = np.mean(np.abs(tok_in) < 0.01 * tok_in.std()) + print( + f" Near-zero fraction: FFN input={frac_nearzero_in:.3f}, " + f"SwiGLU={frac_nearzero_sw:.3f}" + ) + + # Sparsity pattern + frac_neg = np.mean(tok_sw < 0) + print(f" SwiGLU negative fraction: {frac_neg:.3f}") + + # Dimension-level analysis of SwiGLU + print(f"\n Dimension-level SwiGLU properties:") + dim_mean_sw = A_swiglu.mean(axis=0) + dim_std_sw = A_swiglu.std(axis=0) + dim_sparsity = np.mean(A_swiglu < 0, axis=0) # fraction of tokens negative per dim + + print(f" Dim mean range: [{dim_mean_sw.min():.6f}, {dim_mean_sw.max():.6f}]") + print(f" Dim std range: [{dim_std_sw.min():.6f}, {dim_std_sw.max():.6f}]") + print( + f" Dim negative fraction: mean={dim_sparsity.mean():.3f}, " + f"range=[{dim_sparsity.min():.3f}, {dim_sparsity.max():.3f}]" + ) + + # Highly sparse dimensions (mostly near-zero after SwiGLU) + high_sparsity = np.sum(dim_sparsity > 0.7) + low_sparsity = np.sum(dim_sparsity < 0.3) + print(f" Dims with >70% negative tokens: {high_sparsity}/{len(dim_sparsity)}") + print(f" Dims with <30% negative tokens: {low_sparsity}/{len(dim_sparsity)}") + +# ============================================================================ +# 9. 
QUANTIZATION NOISE × ACTIVATION POWER: THE MATMUL ERROR DECOMPOSITION
+# ============================================================================
+print("\n" + "=" * 80)
+print("SECTION 9: MATMUL ERROR DECOMPOSITION")
+print("=" * 80)
+print(
+    "  matmul_error ≈ sum over groups of (activation_power_in_group × "
+    "weight_mse_in_group)"
+)
+print(
+    "  If activation power is concentrated in groups with high weight error, "
+    "matmul error explodes.\n"
+)
+
+# For ffn_down specifically, compare where activation power sits vs weight error
+W_down = load_f32_tensor("blk_0_ffn_down_weight.f32bin")
+A_swiglu = load_f32_tensor("act_blk0_ffn_down_input.f32bin")
+
+W_gate = load_f32_tensor("blk_0_ffn_gate_weight.f32bin")
+A_ffn_in = load_f32_tensor("act_blk0_ffn_input.f32bin")
+
+for label, W, A in [("ffn_down", W_down, A_swiglu), ("ffn_gate", W_gate, A_ffn_in)]:
+    nc = W.shape[1]
+    nr = min(W.shape[0], 128)
+
+    # Compute per-superblock (256) activation power and weight error
+    n_sb = nc // 256
+    sb_act_power = np.zeros(n_sb)
+    sb_weight_mse = np.zeros(n_sb)
+
+    for sb in range(n_sb):
+        s = sb * 256
+        e = s + 256
+        # Activation power: mean squared activation in this region
+        sb_act_power[sb] = np.mean(A[:, s:e] ** 2)
+
+        # Weight MSE: Q2_K-style uniform quant error
+        mse = 0
+        count = 0
+        for r in range(nr):
+            for g in range(0, 256, GS):
+                gvals = W[r, s + g : s + g + GS]
+                gmin = min(gvals.min(), 0)  # Q2_K clamps min to ≤0
+                gmax = gvals.max()
+                gr = gmax - gmin
+                if gr < 1e-10:
+                    continue
+                scale = gr / 3.0
+                for v in gvals:
+                    norm = (v - gmin) / scale
+                    idx = max(0, min(3, int(round(norm))))
+                    recon = gmin + scale * idx
+                    mse += (v - recon) ** 2
+                    count += 1
+        sb_weight_mse[sb] = mse / max(count, 1)
+
+    # Correlation between activation power and weight error across super-blocks
+    valid = sb_act_power > 1e-10
+    if valid.sum() > 10:
+        corr = np.corrcoef(np.sqrt(sb_act_power[valid]), np.sqrt(sb_weight_mse[valid]))[
+            0, 1
+        ]
+    else:
+        corr = 0
+
+    print(f"\n {label}:")
+    print(f"   Super-blocks: {n_sb}")
+    print(
+        f"   act_power: mean={sb_act_power.mean():.6f}, "
+        f"std={np.sqrt(sb_act_power.var()):.6f}, "
+        f"range=[{sb_act_power.min():.6f}, {sb_act_power.max():.6f}]"
+    )
+    print(
+        f"   weight_mse: mean={sb_weight_mse.mean():.6f}, "
+        f"range=[{sb_weight_mse.min():.6f}, {sb_weight_mse.max():.6f}]"
+    )
+    print(f"   CORRELATION (act_power ↔ weight_mse): {corr:.4f}")
+
+    # Show top-5 super-blocks by contribution to matmul error
+    contrib = sb_act_power * sb_weight_mse
+    top5 = np.argsort(contrib)[-5:][::-1]
+    print(f"   Top-5 error-contributing super-blocks (of {n_sb}):")
+    for idx in top5:
+        print(
+            f"     SB {idx * 256}-{(idx + 1) * 256 - 1}: act_power={sb_act_power[idx]:.6f}, "
+            f"weight_mse={sb_weight_mse[idx]:.6f}, contrib={contrib[idx]:.6f}"
+        )
+
+print("\n" + "=" * 80)
+print("ANALYSIS COMPLETE")
+print("=" * 80)
diff --git a/scripts/compute-imatrix.py b/scripts/compute-imatrix.py
new file mode 100644
index 0000000000..0b8d394d1b
--- /dev/null
+++ b/scripts/compute-imatrix.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+"""Compute imatrix (importance matrix) from captured activation tensors.
+
+The imatrix is the per-dimension sum-of-squares of the activations.
+It's what upstream llama.cpp uses to weight quantization optimization.
+
+For each activation file act_blkL_*.f32bin, produces imatrix_blkL_<NAME>.f32bin
+where <NAME> matches the weight tensor it multiplies with.
+
+Format: flat float32 array of length n_per_row, one importance value per dimension.
+""" + +import numpy as np +import struct +import os + +DATA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "data") + + +def load_f32_tensor(name): + path = os.path.join(DATA_DIR, name) + with open(path, "rb") as f: + nrow, ncol = struct.unpack("qq", f.read(16)) + data = np.frombuffer(f.read(), dtype=np.float32) + assert len(data) == nrow * ncol + return data.reshape(nrow, ncol) + + +def save_imatrix(name, data): + path = os.path.join(DATA_DIR, name) + data.astype(np.float32).tofile(path) + print( + f" Wrote {path}: {len(data)} dims, " + f"min={data.min():.6f}, max={data.max():.6f}, mean={data.mean():.6f}" + ) + + +# Mapping: activation file → imatrix files for each weight it multiplies with +# Each weight tensor's column dimension matches the activation's column dimension +mappings = [ + { + "act_file": "act_blk0_ffn_input.f32bin", + "imatrix_name": "imatrix_blk0_ffn_gate_up.f32bin", + "description": "ffn_gate and ffn_up (both use ffn_input activation)", + }, + { + "act_file": "act_blk0_ffn_down_input.f32bin", + "imatrix_name": "imatrix_blk0_ffn_down.f32bin", + "description": "ffn_down (uses SwiGLU activation)", + }, + { + "act_file": "act_blk0_attn_input.f32bin", + "imatrix_name": "imatrix_blk0_attn_qkv.f32bin", + "description": "attn_q, attn_k, attn_v (all use attn_input activation)", + }, + { + "act_file": "act_blk0_attn_output_input.f32bin", + "imatrix_name": "imatrix_blk0_attn_output.f32bin", + "description": "attn_output (uses kqv_out activation)", + }, +] + +print("Computing imatrix from captured activations") +print("=" * 60) + +for m in mappings: + try: + A = load_f32_tensor(m["act_file"]) + print(f"\n{m['description']}:") + print(f" Activation: {A.shape[0]} tokens × {A.shape[1]} dims") + + # imatrix = sum over tokens of activation^2 + # This is the standard definition used by llama.cpp + imatrix = np.sum(A**2, axis=0) + + # Also compute per-dim RMS for reference + rms = np.sqrt(np.mean(A**2, axis=0)) + + print( + f" Imatrix stats: min={imatrix.min():.6f}, max={imatrix.max():.6f}, " + f"mean={imatrix.mean():.6f}, std={imatrix.std():.6f}" + ) + print( + f" RMS stats: min={rms.min():.6f}, max={rms.max():.6f}, " + f"mean={rms.mean():.6f}" + ) + + # Concentration metrics + total = imatrix.sum() + sorted_im = np.sort(imatrix)[::-1] + top1pct = max(1, int(len(imatrix) * 0.01)) + top10pct = max(1, int(len(imatrix) * 0.10)) + print(f" Power concentration:") + print( + f" Top 1% dims ({top1pct}): {sorted_im[:top1pct].sum() / total * 100:.1f}% of total" + ) + print( + f" Top 10% dims ({top10pct}): {sorted_im[:top10pct].sum() / total * 100:.1f}% of total" + ) + + save_imatrix(m["imatrix_name"], imatrix) + except Exception as e: + print(f" SKIP: {e}") + +print("\nDone.") diff --git a/scripts/extract-activations.py b/scripts/extract-activations.py new file mode 100644 index 0000000000..bc7d2faf1b --- /dev/null +++ b/scripts/extract-activations.py @@ -0,0 +1,210 @@ +#!/usr/bin/env python3 +"""Extract real activation tensors by running a forward pass through the model. + +Captures the INPUT activations to specific weight tensors (the vectors that get +multiplied by the weight matrix). These are what matter for quantization quality: +quantization error * activation magnitude = output error. + +Usage: + python3 scripts/extract-activations.py MODEL.gguf OUTPUT_DIR [--prompt TEXT] [--layer N] + +Output: + For each target tensor, writes a .f32bin file with header: + int64_t n_rows, int64_t row_len + followed by n_rows * row_len float32 values. 
+ n_rows = number of tokens, row_len = hidden dimension. + +NOTE: This uses a simplified forward pass (no KV cache, single prompt). +Activations are extracted from after the norm layers (the actual matmul inputs). +""" +import sys +import os +import struct +import numpy as np + +script_dir = os.path.dirname(os.path.abspath(__file__)) +repo_root = os.path.dirname(script_dir) +sys.path.insert(0, os.path.join(repo_root, 'gguf-py')) + +from gguf import GGUFReader + + +def bf16_to_f32(raw_bytes): + """Convert raw BF16 bytes to float32 numpy array.""" + bf16 = np.frombuffer(raw_bytes, dtype=np.uint16) + f32_bits = bf16.astype(np.uint32) << 16 + return f32_bits.view(np.float32) + + +def rms_norm(x, weight, eps=1e-6): + """RMS normalization (Qwen3/Llama style).""" + rms = np.sqrt(np.mean(x * x, axis=-1, keepdims=True) + eps) + return (x / rms) * weight + + +def silu(x): + """SiLU activation.""" + return x / (1.0 + np.exp(-np.clip(x, -88, 88))) + + +def softmax(x, axis=-1): + """Numerically stable softmax.""" + x_max = np.max(x, axis=axis, keepdims=True) + e = np.exp(x - x_max) + return e / np.sum(e, axis=axis, keepdims=True) + + +def main(): + if len(sys.argv) < 3: + print(f"Usage: {sys.argv[0]} MODEL.gguf OUTPUT_DIR [--prompt TEXT] [--layer N]") + sys.exit(1) + + model_path = sys.argv[1] + output_dir = sys.argv[2] + prompt_text = "The quick brown fox jumps over the lazy dog. In a distant galaxy, scientists discovered" + target_layer = 16 + + for i in range(3, len(sys.argv)): + if sys.argv[i] == "--prompt" and i + 1 < len(sys.argv): + prompt_text = sys.argv[i + 1] + elif sys.argv[i] == "--layer" and i + 1 < len(sys.argv): + target_layer = int(sys.argv[i + 1]) + + os.makedirs(output_dir, exist_ok=True) + + print(f"Loading {model_path}...") + reader = GGUFReader(model_path) + + # Read model config from metadata + config = {} + for kv in reader.fields.values(): + if hasattr(kv, 'parts') and len(kv.parts) > 0: + name = kv.name + if 'block_count' in name: + config['n_layer'] = int(kv.parts[-1][0]) + elif 'embedding_length' in name: + config['hidden'] = int(kv.parts[-1][0]) + elif 'feed_forward_length' in name: + config['ffn'] = int(kv.parts[-1][0]) + elif 'head_count_kv' in name: + config['n_kv_heads'] = int(kv.parts[-1][0]) + elif 'head_count' in name and 'kv' not in name: + config['n_heads'] = int(kv.parts[-1][0]) + elif 'key_length' in name: + config['head_dim'] = int(kv.parts[-1][0]) + elif 'layer_norm_rms_epsilon' in name: + config['eps'] = float(kv.parts[-1][0]) + + print(f"Config: {config}") + hidden = config['hidden'] + + # Load tensors into a dict + def load_tensor(name): + for t in reader.tensors: + if t.name == name: + raw = bytes(t.data) + shape = [int(s) for s in t.shape] + n_el = int(t.n_elements) + if t.tensor_type.name == 'BF16': + flat = bf16_to_f32(raw) + elif t.tensor_type.name == 'F16': + flat = np.frombuffer(raw, dtype=np.float16).astype(np.float32) + elif t.tensor_type.name == 'F32': + flat = np.frombuffer(raw, dtype=np.float32) + else: + raise ValueError(f"Unsupported type: {t.tensor_type.name}") + assert flat.shape[0] == n_el, f"Expected {n_el} elements, got {flat.shape[0]}" + if len(shape) == 1: + return flat.copy() + return flat.reshape(list(reversed(shape))).copy() + raise KeyError(f"Tensor {name} not found") + + # Create simple token IDs from the prompt (use first few tokens from vocab) + # We just need realistic activations, not perfect tokenization + n_tokens = min(32, len(prompt_text.split())) + print(f"Using {n_tokens} pseudo-tokens for activation extraction") + + # Load 
token embedding and create input
+    print("Loading token_embd...")
+    token_embd = load_tensor("token_embd.weight")  # [vocab, hidden]
+    # Use token IDs 100-131 (arbitrary but avoids special tokens)
+    token_ids = list(range(100, 100 + n_tokens))
+    x = token_embd[token_ids]  # [n_tokens, hidden]
+    print(f"Input shape: {x.shape}")
+
+    # Run forward pass through target layer only (we just need the activations)
+    layer = target_layer
+    print(f"\nProcessing layer {layer}...")
+
+    def save_activation(name, data):
+        """Save activation tensor as f32bin."""
+        if data.ndim == 1:
+            data = data.reshape(1, -1)
+        n_rows, row_len = data.shape
+        fname = os.path.join(output_dir, name + ".f32bin")
+        with open(fname, 'wb') as fp:
+            fp.write(struct.pack('qq', n_rows, row_len))
+            data.astype(np.float32).tofile(fp)
+        print(f"  Saved {name}: {n_rows} x {row_len}")
+
+    # Attention norm → input to attn_q/attn_k/attn_v
+    attn_norm_w = load_tensor(f"blk.{layer}.attn_norm.weight")
+    x_attn = rms_norm(x, attn_norm_w, config.get('eps', 1e-6))
+    save_activation(f"act_blk{layer}_attn_input", x_attn)
+
+    # Attention projections (simplified: no RoPE, no causal mask —
+    # good enough for activation statistics)
+    W_q = load_tensor(f"blk.{layer}.attn_q.weight")
+    W_k = load_tensor(f"blk.{layer}.attn_k.weight")
+    W_v = load_tensor(f"blk.{layer}.attn_v.weight")
+    W_o = load_tensor(f"blk.{layer}.attn_output.weight")
+
+    n_heads = config['n_heads']
+    n_kv_heads = config.get('n_kv_heads', n_heads)
+    head_dim = config.get('head_dim', hidden // n_heads)
+
+    Q_h = (x_attn @ W_q.T).reshape(n_tokens, n_heads, head_dim)
+    K_h = (x_attn @ W_k.T).reshape(n_tokens, n_kv_heads, head_dim)
+    V_h = (x_attn @ W_v.T).reshape(n_tokens, n_kv_heads, head_dim)
+
+    # Expand KV heads for grouped-query attention
+    if n_kv_heads != n_heads:
+        rep = n_heads // n_kv_heads
+        K_h = np.repeat(K_h, rep, axis=1)
+        V_h = np.repeat(V_h, rep, axis=1)
+
+    scores = np.einsum('thd,shd->ths', Q_h, K_h) / np.sqrt(head_dim)
+    attn_w = softmax(scores, axis=-1)
+    attn_out = np.einsum('ths,shd->thd', attn_w, V_h).reshape(n_tokens, -1)
+
+    # attn_output weight input
+    save_activation(f"act_blk{layer}_attn_output_input", attn_out)
+
+    # Project and add residual
+    attn_proj = attn_out @ W_o.T
+    x = x + attn_proj
+
+    # FFN norm → input to ffn_gate/ffn_up
+    ffn_norm_w = load_tensor(f"blk.{layer}.ffn_norm.weight")
+    x_ffn = rms_norm(x, ffn_norm_w, config.get('eps', 1e-6))
+    save_activation(f"act_blk{layer}_ffn_input", x_ffn)
+
+    # FFN: gate and up projections
+    W_gate = load_tensor(f"blk.{layer}.ffn_gate.weight")  # [ffn, hidden]
+    W_up = load_tensor(f"blk.{layer}.ffn_up.weight")      # [ffn, hidden]
+    W_down = load_tensor(f"blk.{layer}.ffn_down.weight")  # [hidden, ffn]
+
+    gate = x_ffn @ W_gate.T
+    up = x_ffn @ W_up.T
+    ffn_act = silu(gate) * up  # SwiGLU activation
+
+    # ffn_down weight input (the SwiGLU output)
+    save_activation(f"act_blk{layer}_ffn_down_input", ffn_act)
+
+    print(f"\nDone! Extracted 4 activation tensors to {output_dir}/")
+
+
+if __name__ == "__main__":
+    main()
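The docstring's `quantization error * activation magnitude = output error` is the identity that makes these captures useful: for y = x·Wᵀ with independent zero-mean weight noise ΔW, the output MSE decomposes per input dimension into Σ_d E[x_d²]·E[ΔW_d²]. A quick numpy check of that decomposition (illustrative, not part of the patch):

```python
import numpy as np

rng = np.random.default_rng(0)
n_out, n_in, n_tok = 64, 128, 512

dW = rng.normal(scale=0.01, size=(n_out, n_in))                    # stand-in for quant error
X  = rng.normal(size=(n_tok, n_in)) * rng.uniform(0.1, 3.0, n_in)  # uneven per-dim power

err_true = ((X @ dW.T) ** 2).mean()
# cross terms vanish for independent noise, leaving a per-dimension sum
err_pred = ((X ** 2).mean(axis=0) * (dW ** 2).mean(axis=0)).sum()
print(err_true, err_pred)  # agree closely
```

This per-dimension weighting is exactly what `compute-imatrix.py` above reduces the captured activations to.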
+""" +import sys +import os +import numpy as np + +# Support running from build/ or repo root +script_dir = os.path.dirname(os.path.abspath(__file__)) +repo_root = os.path.dirname(script_dir) +sys.path.insert(0, os.path.join(repo_root, 'gguf-py')) + +from gguf import GGUFReader + +def main(): + if len(sys.argv) < 3: + print(f"Usage: {sys.argv[0]} MODEL.gguf pattern1 [pattern2 ...]") + print(f" Extracts tensors whose names contain any of the given patterns.") + sys.exit(1) + + model_path = sys.argv[1] + patterns = sys.argv[2:] + + print(f"Reading {model_path}...") + reader = GGUFReader(model_path) + + for tensor in reader.tensors: + if not any(p in tensor.name for p in patterns): + continue + + print(f"\nExtracting: {tensor.name}") + print(f" Shape: {list(tensor.shape)}, type: {tensor.tensor_type.name}") + + # Convert to f32 + raw = np.array(tensor.data, dtype=np.uint8) + + if tensor.tensor_type.name == 'BF16': + bf16_vals = raw.view(np.uint16) + f32_bits = bf16_vals.astype(np.uint32) << 16 + f32_vals = f32_bits.view(np.float32) + elif tensor.tensor_type.name == 'F16': + f16_vals = raw.view(np.float16) + f32_vals = f16_vals.astype(np.float32) + elif tensor.tensor_type.name == 'F32': + f32_vals = raw.view(np.float32) + else: + print(f" SKIP: unsupported type {tensor.tensor_type.name}") + continue + + # Determine layout: GGUF stores shape as [col, row] for 2D + row_len = int(tensor.shape[0]) + n_rows = tensor.n_elements // row_len + + fname = tensor.name.replace(".", "_") + ".f32bin" + with open(fname, 'wb') as fp: + fp.write(np.array([n_rows, row_len], dtype=np.int64).tobytes()) + f32_vals.tofile(fp) + + file_size = os.path.getsize(fname) + print(f" Wrote {fname}: {n_rows} rows x {row_len} cols = {tensor.n_elements} elements") + print(f" File size: {file_size / (1024*1024):.1f} MB") + print(f" Stats: mean={f32_vals.mean():.6f}, std={f32_vals.std():.6f}, " + f"min={f32_vals.min():.6f}, max={f32_vals.max():.6f}") + +if __name__ == "__main__": + main() diff --git a/src/llama-graph.h b/src/llama-graph.h index 29e78451fb..2ba386c494 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -511,6 +511,7 @@ public: std::map samplers; }; + // // llm_graph_result // diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 4e65a45a50..321ed8cc49 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -2,6 +2,7 @@ #include "ggml-alloc.h" #include "ggml.h" +#include "llama.h" #include "gguf.h" #include "llama-hparams.h" @@ -61,6 +62,13 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw"; case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw"; + case LLAMA_FTYPE_MOSTLY_Q3_PT: return "Q3_PT - 3.25 bpw"; + case LLAMA_FTYPE_MOSTLY_Q3_KPT: return "Q3_KPT - Q3_K with learned levels"; + case LLAMA_FTYPE_MOSTLY_Q4_DPT: return "Q4_DPT - IQ4_NL with learned levels"; + case LLAMA_FTYPE_MOSTLY_Q2_KPT: return "Q2_KPT - Q2_K with learned levels"; + case LLAMA_FTYPE_MOSTLY_IQ2_TQ: return "IQ2_TQ - 2.0625 bpw trellis quantized"; + case LLAMA_FTYPE_MOSTLY_IQ3_TQ: return "IQ3_TQ - 3.5625 bpw per-tensor trained grid"; + case LLAMA_FTYPE_MOSTLY_IQ1_BN: return "IQ1_BN - 1.5625 bpw 8D vector quantized"; case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw"; case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw"; case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw"; @@ -758,6 +766,13 @@ llama_model_loader::llama_model_loader( case 
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index d2ffc1f45f..993e8b3ee2 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -21,6 +21,7 @@
 // TODO: tmp until the ggml meta backend matures and becomes public
 #include "../src/ggml-ext.h"
+#include <unordered_map>

 #include <algorithm>
 #include <cassert>
@@ -8247,6 +8248,175 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         }
     }

+    // Load per-tensor quantization auxiliary data (levels/kvalues) from GGUF metadata.
+    // Indexed by weight tensor pointer for direct lookup during inference.
+    {
+        // Build tensor name to tensor pointer map
+        std::unordered_map<std::string, ggml_tensor *> name_to_tensor;
+        for (auto & [ctx, buf_map] : ctx_buf_maps) {
+            for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+                name_to_tensor[ggml_get_name(t)] = t;
+            }
+        }
+
+        struct level_type_info {
+            ggml_type    type;
+            const char * gguf_key;
+            size_t       n_levels;   // number of level values per tensor
+            size_t       elem_bytes; // size of each level value
+        };
+
+        const level_type_info level_types[] = {
+            { GGML_TYPE_Q3_PT,  "q3_pt.levels",  8,  sizeof(float)  },
+            { GGML_TYPE_Q3_KPT, "q3_kpt.levels", 8,  sizeof(float)  },
+            { GGML_TYPE_Q4_DPT, "q4_dpt.levels", 16, sizeof(int8_t) },
+        };
+
+        for (const auto & lt : level_types) {
+            int64_t lv_idx = gguf_find_key(ml.metadata, lt.gguf_key);
+            if (lv_idx < 0) { continue; }
+
+            const uint8_t * lv_raw   = (const uint8_t *)gguf_get_arr_data(ml.metadata, lv_idx);
+            const size_t    lv_arr_n = gguf_get_arr_n(ml.metadata, lv_idx);
+
+            size_t tensor_count = 0;
+
+            // Iterate over GGUF slots to find matching tensors
+            for (size_t gguf_slot = 0; gguf_slot < lv_arr_n / lt.n_levels; ++gguf_slot) {
+                std::string tensor_name = gguf_get_tensor_name(ml.metadata, gguf_slot);
+                auto it = name_to_tensor.find(tensor_name);
+                if (it == name_to_tensor.end()) { continue; }
+
+                ggml_tensor * t = it->second;
+                if (t->type != lt.type) { continue; }
+
+                const size_t gguf_offset = gguf_slot * lt.n_levels;
+
+                // Store directly indexed by tensor pointer
+                auto & aux = tensor_aux_data[t];
+                aux.type = lt.type;
+                aux.host_data.assign(
+                    lv_raw + gguf_offset * lt.elem_bytes,
+                    lv_raw + (gguf_offset + lt.n_levels) * lt.elem_bytes
+                );
+                aux.aux_tensor = nullptr;
+
+                // Set quant_levels directly on the tensor
+                t->quant_levels = aux.host_data.data();
+
+                tensor_count++;
+            }
+
+            if (tensor_count > 0) {
+                LLAMA_LOG_INFO("%s: loaded %zu %s per-tensor level tables\n",
+                               __func__, tensor_count, lt.gguf_key);
+            }
+        }
+
+        // Q2_KPT: per-block levels stored as per-tensor GGUF keys "{tensor_name}.q2kpt_levels"
+        // Each key holds n_blocks * Q2KPT_N_LEVELS floats for that tensor (4 floats per 256-element block).
+ { + size_t q2kpt_loaded = 0; + for (auto & [tname, t] : name_to_tensor) { + if (t->type != GGML_TYPE_Q2_KPT) { continue; } + const std::string key = tname + ".q2kpt_levels"; + int64_t lv_idx = gguf_find_key(ml.metadata, key.c_str()); + if (lv_idx < 0) { continue; } + + const uint8_t * lv_raw = (const uint8_t *)gguf_get_arr_data(ml.metadata, lv_idx); + const size_t lv_n = gguf_get_arr_n(ml.metadata, lv_idx); + + auto & aux = tensor_aux_data[t]; + aux.type = GGML_TYPE_Q2_KPT; + aux.host_data.assign(lv_raw, lv_raw + lv_n * sizeof(float)); + aux.aux_tensor = nullptr; + t->quant_levels = aux.host_data.data(); + q2kpt_loaded++; + } + if (q2kpt_loaded > 0) { + LLAMA_LOG_INFO("%s: loaded %zu Q2_KPT per-block level tables\n", __func__, q2kpt_loaded); + } + } + + // IQ2_TQ: per-tensor trained grid (16 × 4 int8 = 64 bytes) + { + size_t iq2tq_loaded = 0; + for (auto & [tname, t] : name_to_tensor) { + if (t->type != GGML_TYPE_IQ2_TQ) { continue; } + + const std::string grid_key = "iq2tq.grid." + tname; + int64_t grid_idx = gguf_find_key(ml.metadata, grid_key.c_str()); + if (grid_idx < 0) { continue; } + + auto & taux = tensor_aux_data[t]; + taux.type = GGML_TYPE_IQ2_TQ; + taux.host_data.resize(64); + const int8_t * grid_data = (const int8_t *)gguf_get_arr_data(ml.metadata, grid_idx); + memcpy(taux.host_data.data(), grid_data, 64); + + t->quant_levels = taux.host_data.data(); + iq2tq_loaded++; + } + if (iq2tq_loaded > 0) { + LLAMA_LOG_INFO("%s: loaded IQ2_TQ grid for %zu tensors\n", __func__, iq2tq_loaded); + } + } + + // IQ3_TQ: per-tensor trained grid (16 × 8 int8 = 128 bytes) + { + size_t iq3tq_loaded = 0; + for (auto & [tname, t] : name_to_tensor) { + if (t->type != GGML_TYPE_IQ3_TQ) { continue; } + + const std::string grid_key = "iq3tq.grid." + tname; + int64_t grid_idx = gguf_find_key(ml.metadata, grid_key.c_str()); + if (grid_idx < 0) { + // backward compat: try old key name + const std::string old_key = "iq3qt.grid." + tname; + grid_idx = gguf_find_key(ml.metadata, old_key.c_str()); + if (grid_idx < 0) { continue; } + } + + auto & taux = tensor_aux_data[t]; + taux.type = GGML_TYPE_IQ3_TQ; + taux.host_data.resize(128); + const int8_t * grid_data = (const int8_t *)gguf_get_arr_data(ml.metadata, grid_idx); + memcpy(taux.host_data.data(), grid_data, 128); + + t->quant_levels = taux.host_data.data(); + iq3tq_loaded++; + } + if (iq3tq_loaded > 0) { + LLAMA_LOG_INFO("%s: loaded IQ3_TQ grid for %zu tensors\n", __func__, iq3tq_loaded); + } + } + + // IQ1_BN: per-tensor trained codebook (32768 bytes) + { + size_t iq1bn_loaded = 0; + for (auto & [tname, t] : name_to_tensor) { + if (t->type != GGML_TYPE_IQ1_BN) { continue; } + + const std::string aux_key = "iq1bn.aux." 
+ tname;
+                int64_t aux_idx = gguf_find_key(ml.metadata, aux_key.c_str());
+                if (aux_idx < 0) { continue; }
+
+                auto & taux = tensor_aux_data[t];
+                taux.type = GGML_TYPE_IQ1_BN;
+                taux.host_data.resize(32768);
+                const int8_t * aux_data = (const int8_t *)gguf_get_arr_data(ml.metadata, aux_idx);
+                memcpy(taux.host_data.data(), aux_data, 32768);
+
+                t->quant_levels = taux.host_data.data();
+                iq1bn_loaded++;
+            }
+            if (iq1bn_loaded > 0) {
+                LLAMA_LOG_INFO("%s: loaded IQ1_BN codebook for %zu tensors\n", __func__, iq1bn_loaded);
+            }
+        }
+
+    }
+
     if (use_mmap_buffer) {
         for (auto & mapping : ml.mappings) {
             pimpl->mappings.emplace_back(std::move(mapping));
diff --git a/src/llama-model.h b/src/llama-model.h
index bba70012e1..91d03f56e5 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -574,6 +574,24 @@ struct llama_model {
     // for keeping track of associated LoRA adapters
     std::unordered_set<llama_adapter_lora *> loras;

+    // host-side auxiliary data for dynamic quantization types (Q4_DPT, Q3_PT, Q3_KPT)
+    // indexed by weight tensor pointer, allows separate GPU placement of aux data
+    struct tensor_auxiliary {
+        ggml_type            type;       // Quantization type this aux data is for
+        std::vector<uint8_t> host_data;  // Host copy of aux data (levels or kvalues)
+        struct ggml_tensor * aux_tensor; // Separate ggml tensor for backend placement
+    };
+
+    // Hash function for ggml_tensor pointers (reuse existing ggml_hash pattern)
+    struct ggml_tensor_ptr_hash {
+        size_t operator()(const ggml_tensor * t) const noexcept {
+            return (size_t)(uintptr_t)t >> 4; // Same as ggml_hash()
+        }
+    };
+
+    // Per-tensor auxiliary data lookup - indexed by WEIGHT tensor pointer
+    std::unordered_map<const ggml_tensor *, tensor_auxiliary, ggml_tensor_ptr_hash> tensor_aux_data;
+
     // statically allocated context for assigning
     struct llama_meta_device_get_split_state_userdata get_split_state_ud;
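Taken together, the loader recognizes two key shapes: flat per-slot arrays (`q3_pt.levels`, `q3_kpt.levels`, `q4_dpt.levels`) and per-tensor keys (`{name}.q2kpt_levels`, `iq2tq.grid.{name}`, `iq3tq.grid.{name}` with the legacy `iq3qt.grid.{name}` fallback, and `iq1bn.aux.{name}`). A small gguf-py sketch that lists whichever aux entries a file carries, reusing the `reader.fields` access pattern from the scripts above (file name hypothetical):

```python
from gguf import GGUFReader

reader = GGUFReader("model-pt.gguf")  # hypothetical quantized output
flat_keys = {"q3_pt.levels", "q3_kpt.levels", "q4_dpt.levels"}
prefixes = ("iq2tq.grid.", "iq3tq.grid.", "iq3qt.grid.", "iq1bn.aux.")

for field in reader.fields.values():
    if (field.name in flat_keys
            or field.name.startswith(prefixes)
            or field.name.endswith(".q2kpt_levels")):
        print("aux entry:", field.name)
```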
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index f91d795b3e..58f5477695 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -1,6 +1,8 @@
+#include "ggml.h"
 #include "llama-impl.h"
 #include "llama-model.h"
 #include "llama-model-loader.h"
+#include "llama.h"
 #include "llama-ext.h"

 #include <algorithm>
@@ -13,6 +15,98 @@
 #include <mutex>
 #include <thread>

+// Q3_PT levels functions (defined in ggml-quants.c)
+extern "C" {
+    void q3pt_train_levels(const float * data, int64_t nrow, int64_t n_per_row,
+                           const float * imatrix, float levels_out[8]);
+    void q3pt_set_levels(const float * levels);
+}
+
+// Q3_KPT levels functions (defined in ggml-quants.c)
+extern "C" {
+    void q3kpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row,
+                            const float * imatrix, float levels_out[8]);
+    void q3kpt_set_levels(const float * levels);
+}
+
+// Q4_DPT levels functions (defined in ggml-quants.c)
+extern "C" {
+    void q4dpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row,
+                            const float * imatrix, int8_t levels_out[16]);
+    void q4dpt_set_levels(const int8_t * levels);
+}
+
+// Q2_KPT levels are handled internally by quantize_q2_kpt
+#define Q2KPT_N_LEVELS 4
+#define QK_K 256
+extern "C" const float * q2kpt_get_levels(void);
+extern "C" void q2kpt_prepare_levels(int64_t nrows, int64_t n_per_row);
+extern "C" void q2kpt_free_levels(void);
+
+// IQ2_TQ functions — per-tensor trained grid
+extern "C" size_t quantize_iq2_tq(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+extern "C" void iq2tq_train_grid(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, int8_t grid_out[64]);
+extern "C" void iq2tq_set_grid(const int8_t grid[64]);
+extern "C" const int8_t * iq2tq_get_grid(void);
+
+// IQ3_TQ functions — per-tensor trained grid (3-bit, 128 bytes per tensor)
+extern "C" size_t quantize_iq3_tq(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+extern "C" void iq3tq_train_grid(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, int8_t grid_out[128]);
+extern "C" void iq3tq_set_grid(const int8_t grid[128]);
+extern "C" const int8_t * iq3tq_get_grid(void);
+
+// IQ1_BN functions — 8D vector quantized with per-tensor trained 4096-entry codebook (32768 bytes per tensor)
+extern "C" size_t quantize_iq1_bn(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+extern "C" void iq1bn_train_codebook(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, int8_t aux_out[32768], int nthread);
+extern "C" void iq1bn_set_aux(const int8_t aux[32768]);
+extern "C" const int8_t * iq1bn_get_aux(void);
+
 // result of parsing --tensor-type option
 // (changes to this struct must be reflected in tools/quantize/quantize.cpp)
 struct tensor_type_option {
@@ -234,7 +328,7 @@ static void llama_tensor_dequantize_impl(
     } else if (tensor->type == GGML_TYPE_BF16) {
ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements); } else if (ggml_is_quantized(tensor->type)) { - qtype->to_float(tensor->data, f32_output, nelements); + qtype->to_float(tensor->data, f32_output, nelements, tensor->quant_levels); } else { GGML_ABORT("fatal error"); // unreachable } @@ -264,13 +358,14 @@ static void llama_tensor_dequantize_impl( size_t thr_elems = thr_blocks * block_size; // number of elements for this thread size_t thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread - auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) { + const void * quant_levels = tensor->quant_levels; + auto compute = [qtype, quant_levels] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) { if (typ == GGML_TYPE_F16) { ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels); } else if (typ == GGML_TYPE_BF16) { ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels); } else { - qtype->to_float(inbuf, outbuf, nels); + qtype->to_float(inbuf, outbuf, nels, quant_levels); } }; workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems); @@ -480,6 +575,18 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = GGML_TYPE_IQ3_S; } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_PT) { + new_type = GGML_TYPE_IQ4_XS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT) { + new_type = GGML_TYPE_Q4_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_DPT) { + new_type = GGML_TYPE_IQ4_XS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_KPT) { + new_type = GGML_TYPE_Q4_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) { new_type = GGML_TYPE_Q4_K; } @@ -518,13 +625,16 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_PT) { + new_type = GGML_TYPE_Q3_PT; + } else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) { new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { new_type = GGML_TYPE_Q4_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT) { new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; @@ -569,16 +679,17 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str()); int i_layer = info.first, n_layer = info.second; if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_KPT) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) { if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) { new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT) { new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? 
GGML_TYPE_Q4_K - : GGML_TYPE_Q3_K; + : (ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT ? GGML_TYPE_Q3_KPT : GGML_TYPE_Q3_K); } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 || (qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) { @@ -587,6 +698,9 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K; } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_PT) { + new_type = GGML_TYPE_IQ4_XS; + } else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) { if (arch == LLM_ARCH_FALCON) { new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K : @@ -616,13 +730,14 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || - ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) { + ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT || + ftype == LLAMA_FTYPE_MOSTLY_Q2_KPT) { new_type = GGML_TYPE_Q5_K; } } else { - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K; + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_KPT) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ) new_type = GGML_TYPE_Q4_K; } @@ -828,6 +943,14 @@ ggml_type llama_ftype_get_default_type(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_IQ4_XS: return GGML_TYPE_IQ4_XS; case LLAMA_FTYPE_MOSTLY_IQ3_S: case LLAMA_FTYPE_MOSTLY_IQ3_M: return GGML_TYPE_IQ3_S; + case LLAMA_FTYPE_MOSTLY_Q3_PT: return GGML_TYPE_Q3_PT; + case LLAMA_FTYPE_MOSTLY_Q3_KPT: return GGML_TYPE_Q3_KPT; + case LLAMA_FTYPE_MOSTLY_Q4_DPT: return GGML_TYPE_Q4_DPT; + case LLAMA_FTYPE_MOSTLY_Q2_KPT: return GGML_TYPE_Q2_KPT; + case LLAMA_FTYPE_MOSTLY_IQ2_TQ: return GGML_TYPE_IQ2_TQ; + case LLAMA_FTYPE_MOSTLY_IQ3_TQ: return GGML_TYPE_IQ3_TQ; + case LLAMA_FTYPE_MOSTLY_IQ1_BN: return GGML_TYPE_IQ1_BN; + default: return GGML_TYPE_COUNT; } @@ -1098,6 +1221,615 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: ::zeros(fout, meta_size); }; + // Q3_PT two-pass approach: train all per-tensor levels BEFORE opening the output + // file, so the levels KV entry is already populated at the time of the metadata placeholder. 
+    static const size_t Q3PT_N_LEVELS = 8;
+    std::vector<float> q3pt_all_levels; // indexed by position in tensors[]
+    if (ftype == LLAMA_FTYPE_MOSTLY_Q3_PT && !params->dry_run) {
+        LLAMA_LOG_INFO("%s: Q3_PT pass 1: training per-tensor levels...\n", __func__);
+        q3pt_all_levels.assign(tensors.size() * Q3PT_N_LEVELS, 0.0f);
+
+        // Temporary dequant buffer for pass 1 (reuse f32_conv_buf / read_data declared below)
+        std::vector<no_init<uint8_t>> p1_read_data;
+        std::vector<no_init<float>>   p1_f32_buf;
+        std::vector<std::thread>      p1_workers;
+        p1_workers.reserve(nthread);
+
+        for (size_t ti = 0; ti < tensors.size(); ++ti) {
+            ggml_tensor * tensor = tensors[ti]->tensor;
+            const std::string tname = ggml_get_name(tensor);
+
+            // Determine whether this tensor will be Q3_PT (mirror the pass-2 logic)
+            bool quantize = tname.rfind("weight") == tname.size() - 6;
+            quantize &= (ggml_n_dims(tensor) >= 2);
+            quantize &= tname.find("_norm.weight") == std::string::npos;
+            quantize &= tname.find("ffn_gate_inp.weight") == std::string::npos;
+            if (!quantize) { continue; }
+
+            ggml_type new_type = default_type;
+            if (!params->pure) {
+                new_type = llama_tensor_get_type_impl(qs, new_type, tensor, ftype, tensor_get_category(tname));
+            }
+            if (new_type != GGML_TYPE_Q3_PT) { continue; }
+
+            // Load tensor data
+            const size_t tsz = ggml_nbytes(tensor);
+            if (!ml.use_mmap) {
+                if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); }
+                tensor->data = p1_read_data.data();
+            }
+            ml.load_data_for(tensor);
+
+            // Dequantize to f32 if needed
+            const int64_t nelements = ggml_nelements(tensor);
+            float * f32_data;
+            if (tensor->type == GGML_TYPE_F32) {
+                f32_data = (float *) tensor->data;
+            } else {
+                llama_tensor_dequantize_impl(tensor, p1_f32_buf, p1_workers, nelements, nthread);
+                f32_data = (float *) p1_f32_buf.data();
+            }
+
+            // Resolve imatrix
+            const float * imatrix = nullptr;
+            if (imatrix_data) {
+                auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped));
+                if (it2 != imatrix_data->end() &&
+                    it2->second.size() == (size_t)tensor->ne[0] * tensor->ne[2]) {
+                    imatrix = it2->second.data();
+                }
+            }
+
+            const int64_t n_per_row = tensor->ne[0];
+            const int64_t nrows     = tensor->ne[1];
+
+            LLAMA_LOG_INFO("%s: Q3_PT levels for [%zu/%zu] %s\n", __func__, ti+1, tensors.size(), tensor->name);
+            q3pt_train_levels(f32_data, nrows, n_per_row, imatrix,
+                              q3pt_all_levels.data() + ti * Q3PT_N_LEVELS);
+        }
+
+        // All levels ready — store in GGUF metadata before the file is opened
+        for (auto & ctx : ctx_outs) {
+            if (ctx) {
+                gguf_set_arr_data(ctx.get(), "q3_pt.levels", GGUF_TYPE_FLOAT32,
+                                  q3pt_all_levels.data(), q3pt_all_levels.size());
+            }
+        }
+        LLAMA_LOG_INFO("%s: Q3_PT pass 1 complete.\n", __func__);
+    }
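Pass 1 fills the flat `q3_pt.levels` array by position in `tensors[]`, while the loader in llama-model.cpp slices it by GGUF slot index; the two agree only because `tensors[]` follows GGUF slot order. The indexing contract, as a small sketch (names hypothetical):

```python
N_LEVELS = 8  # Q3_PT / Q3_KPT stanza size; Q4_DPT uses 16

def levels_for_slot(all_levels, slot, n_levels=N_LEVELS):
    """Slice one tensor's trained levels out of the flat 'q3_pt.levels' array."""
    off = slot * n_levels
    return all_levels[off:off + n_levels]

# slots that were never trained keep their zero-initialized stanza, so
# any(levels_for_slot(...)) doubles as a "was this tensor Q3_PT" check
```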
(ggml_n_dims(tensor) >= 2); + quantize &= tname.find("_norm.weight") == std::string::npos; + quantize &= tname.find("ffn_gate_inp.weight") == std::string::npos; + if (!quantize) { continue; } + + ggml_type new_type = default_type; + if (!params->pure) { + new_type = llama_tensor_get_type_impl(qs, new_type, tensor, ftype, tensor_get_category(tname)); + } + if (params->token_embedding_type < GGML_TYPE_COUNT && + (tname == "token_embd.weight" || tname == "per_layer_token_embd.weight")) { + new_type = params->token_embedding_type; + } + if (params->output_tensor_type < GGML_TYPE_COUNT && tname == "output.weight") { + new_type = params->output_tensor_type; + } + if (new_type != GGML_TYPE_Q3_KPT) { continue; } + + // Load tensor data + const size_t tsz = ggml_nbytes(tensor); + if (!ml.use_mmap) { + if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); } + tensor->data = p1_read_data.data(); + } + ml.load_data_for(tensor); + + // Dequantize to f32 if needed + const int64_t nelements = ggml_nelements(tensor); + float * f32_data; + if (tensor->type == GGML_TYPE_F32) { + f32_data = (float *) tensor->data; + } else { + llama_tensor_dequantize_impl(tensor, p1_f32_buf, p1_workers, nelements, nthread); + f32_data = (float *) p1_f32_buf.data(); + } + + // Resolve imatrix + const float * imatrix = nullptr; + if (imatrix_data) { + auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped)); + if (it2 != imatrix_data->end() && + it2->second.size() == (size_t)tensor->ne[0] * tensor->ne[2]) { + imatrix = it2->second.data(); + } + } + + const int64_t n_per_row = tensor->ne[0]; + const int64_t nrows = tensor->ne[1]; + + LLAMA_LOG_INFO("%s: Q3_KPT levels for [%zu/%zu] %s\n", __func__, ti+1, tensors.size(), tensor->name); + q3kpt_train_levels(f32_data, nrows, n_per_row, imatrix, + q3kpt_all_levels.data() + ti * Q3KPT_N_LEVELS); + } + + // All levels ready — store in GGUF metadata before the file is opened + for (auto & ctx : ctx_outs) { + if (ctx) { + gguf_set_arr_data(ctx.get(), "q3_kpt.levels", GGUF_TYPE_FLOAT32, + q3kpt_all_levels.data(), q3kpt_all_levels.size()); + } + } + LLAMA_LOG_INFO("%s: Q3_KPT pass 1 complete.\n", __func__); + } + + // Q4_DPT two-pass approach: train all per-tensor int8 levels BEFORE opening the output + // file, so the levels KV entry is already populated at the time of the metadata placeholder. 
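Each pass-1 trainer above and below repeats the same eligibility filter; for readers, here it is factored into a standalone predicate (a hypothetical refactor, not applied by the patch):

// Hypothetical helper equivalent to the repeated pass-1 filter: train only on
// 2D+ tensors whose name ends in "weight", excluding norms and the MoE router.
static bool pass1_wants_tensor(const std::string & tname, const struct ggml_tensor * t) {
    return tname.size() >= 6 && tname.rfind("weight") == tname.size() - 6
        && ggml_n_dims(t) >= 2
        && tname.find("_norm.weight")        == std::string::npos
        && tname.find("ffn_gate_inp.weight") == std::string::npos;
}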
+ static const size_t Q4DPT_N_LEVELS = 16; + std::vector<int8_t> q4dpt_all_levels; // indexed by position in tensors[] + if (ftype == LLAMA_FTYPE_MOSTLY_Q4_DPT && !params->dry_run) { + LLAMA_LOG_INFO("%s: Q4_DPT pass 1: training per-tensor int8 levels...\n", __func__); + q4dpt_all_levels.assign(tensors.size() * Q4DPT_N_LEVELS, (int8_t)0); + + std::vector<no_init<uint8_t>> p1_read_data; + std::vector<no_init<float>> p1_f32_buf; + std::vector<std::thread> p1_workers; + p1_workers.reserve(nthread); + + for (size_t ti = 0; ti < tensors.size(); ++ti) { + ggml_tensor * tensor = tensors[ti]->tensor; + const std::string tname = ggml_get_name(tensor); + + bool quantize = tname.rfind("weight") == tname.size() - 6; + quantize &= (ggml_n_dims(tensor) >= 2); + quantize &= tname.find("_norm.weight") == std::string::npos; + quantize &= tname.find("ffn_gate_inp.weight") == std::string::npos; + if (!quantize) { continue; } + + ggml_type new_type = default_type; + if (!params->pure) { + new_type = llama_tensor_get_type_impl(qs, new_type, tensor, ftype, tensor_get_category(tname)); + } + if (params->token_embedding_type < GGML_TYPE_COUNT && + (tname == "token_embd.weight" || tname == "per_layer_token_embd.weight")) { + new_type = params->token_embedding_type; + } + if (params->output_tensor_type < GGML_TYPE_COUNT && tname == "output.weight") { + new_type = params->output_tensor_type; + } + if (new_type != GGML_TYPE_Q4_DPT) { continue; } + + // Load tensor data + const size_t tsz = ggml_nbytes(tensor); + if (!ml.use_mmap) { + if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); } + tensor->data = p1_read_data.data(); + } + ml.load_data_for(tensor); + + // Dequantize to f32 if needed + const int64_t nelements = ggml_nelements(tensor); + float * f32_data; + if (tensor->type == GGML_TYPE_F32) { + f32_data = (float *) tensor->data; + } else { + llama_tensor_dequantize_impl(tensor, p1_f32_buf, p1_workers, nelements, nthread); + f32_data = (float *) p1_f32_buf.data(); + } + + // Resolve imatrix + const float * imatrix = nullptr; + if (imatrix_data) { + auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped)); + if (it2 != imatrix_data->end() && + it2->second.size() == (size_t)tensor->ne[0] * tensor->ne[2]) { + imatrix = it2->second.data(); + } + } + + const int64_t n_per_row = tensor->ne[0]; + const int64_t nrows = tensor->ne[1]; + + LLAMA_LOG_INFO("%s: Q4_DPT levels for [%zu/%zu] %s\n", __func__, ti+1, tensors.size(), tensor->name); + q4dpt_train_levels(f32_data, nrows, n_per_row, imatrix, + q4dpt_all_levels.data() + ti * Q4DPT_N_LEVELS); + } + + // Store in GGUF metadata before the file is opened + for (auto & ctx : ctx_outs) { + if (ctx) { + gguf_set_arr_data(ctx.get(), "q4_dpt.levels", GGUF_TYPE_INT8, + q4dpt_all_levels.data(), q4dpt_all_levels.size()); + } + } + LLAMA_LOG_INFO("%s: Q4_DPT pass 1 complete.\n", __func__); + } + + // Q2_KPT two-pass approach: train all per-block levels BEFORE opening the output + // file, so the levels KV entry is already populated at the time of the metadata placeholder. + // Per-block levels: 4 floats per 256-element block.
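To put the metadata cost in perspective, my arithmetic, assuming QK_K == 256 and Q2KPT_N_LEVELS == 4 as the comment above states:

// Worked example (not in the patch): levels footprint of one 4096 x 4096 Q2_KPT tensor.
// blocks per row : 4096 / 256             = 16
// levels floats  : 4096 rows * 16 * 4     = 262144
// KV bytes       : 262144 * sizeof(float) = 1 MiB
// i.e. 4 * 32 / 256 = 0.5 extra bits per weight carried as GGUF metadata,
// on top of the 2.625 bpw quantized payload.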
+ struct q2kpt_tensor_levels { + std::string name; + std::vector<float> levels; // nrows * (n_per_row / QK_K) * Q2KPT_N_LEVELS floats + }; + std::vector<q2kpt_tensor_levels> q2kpt_all_levels; + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_KPT && !params->dry_run) { + LLAMA_LOG_INFO("%s: Q2_KPT pass 1: training per-block levels...\n", __func__); + + std::vector<no_init<uint8_t>> p1_read_data; + std::vector<no_init<float>> p1_f32_buf; + std::vector<std::thread> p1_workers; + p1_workers.reserve(nthread); + + for (size_t ti = 0; ti < tensors.size(); ++ti) { + ggml_tensor * tensor = tensors[ti]->tensor; + const std::string tname = ggml_get_name(tensor); + + // Determine whether this tensor will be Q2_KPT (mirror the pass-2 logic) + bool quantize = tname.rfind("weight") == tname.size() - 6; + quantize &= (ggml_n_dims(tensor) >= 2); + quantize &= tname.find("_norm.weight") == std::string::npos; + quantize &= tname.find("ffn_gate_inp.weight") == std::string::npos; + if (!quantize) { continue; } + + ggml_type new_type = default_type; + if (!params->pure) { + new_type = llama_tensor_get_type_impl(qs, new_type, tensor, ftype, tensor_get_category(tname)); + } + if (params->token_embedding_type < GGML_TYPE_COUNT && + (tname == "token_embd.weight" || tname == "per_layer_token_embd.weight")) { + new_type = params->token_embedding_type; + } + if (params->output_tensor_type < GGML_TYPE_COUNT && tname == "output.weight") { + new_type = params->output_tensor_type; + } + if (new_type != GGML_TYPE_Q2_KPT) { continue; } + + // Load tensor data + const size_t tsz = ggml_nbytes(tensor); + if (!ml.use_mmap) { + if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); } + tensor->data = p1_read_data.data(); + } + ml.load_data_for(tensor); + + // Dequantize to f32 if needed + const int64_t nelements = ggml_nelements(tensor); + float * f32_data; + if (tensor->type == GGML_TYPE_F32) { + f32_data = (float *) tensor->data; + } else { + llama_tensor_dequantize_impl(tensor, p1_f32_buf, p1_workers, nelements, nthread); + f32_data = (float *) p1_f32_buf.data(); + } + + // Resolve imatrix + const float * imatrix = nullptr; + if (imatrix_data) { + auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped)); + if (it2 != imatrix_data->end() && + it2->second.size() == (size_t)tensor->ne[0] * tensor->ne[2]) { + imatrix = it2->second.data(); + } + } + + const int64_t n_per_row = tensor->ne[0]; + const int64_t nrows = tensor->ne[1]; + + // Allocate levels buffer for this tensor + const int nb = n_per_row / QK_K; + const size_t n_levels = (size_t)nrows * tensor->ne[2] * nb * Q2KPT_N_LEVELS; + q2kpt_all_levels.push_back({tname, std::vector<float>(n_levels)}); + + LLAMA_LOG_INFO("%s: Q2_KPT levels for [%zu/%zu] %s (%zu floats)\n", + __func__, ti+1, tensors.size(), tensor->name, n_levels); + + // Train levels by running the quantizer internally: + // the f32 -> Q2_KPT pass records the trained per-block levels as a side effect + std::vector<no_init<uint8_t>> p1_qbuf(ggml_nbytes(tensor)); + const size_t row_size = ggml_row_size(GGML_TYPE_Q2_KPT, n_per_row); + + // Prepare levels buffer for this tensor + q2kpt_free_levels(); + q2kpt_prepare_levels(nrows * tensor->ne[2], n_per_row); + + // Quantize each expert slice + const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1]; + for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) { + const float * f32_data_03 = f32_data + i03 * nelements_matrix; + void * q_data_03 = (char *)p1_qbuf.data() + row_size * i03 * nrows; + const float * imatrix_03 = imatrix ? 
imatrix + i03 * n_per_row : nullptr; + + // start_row must be the absolute row index for correct levels indexing + ggml_quantize_chunk(GGML_TYPE_Q2_KPT, f32_data_03, q_data_03, i03 * nrows, nrows, n_per_row, imatrix_03); + } + + // Copy trained levels to our storage + const float * trained_levels = q2kpt_get_levels(); + if (trained_levels) { + memcpy(q2kpt_all_levels.back().levels.data(), trained_levels, n_levels * sizeof(float)); + } + } + + // Store all levels in GGUF metadata before the file is opened + for (const auto & tl : q2kpt_all_levels) { + for (auto & ctx : ctx_outs) { + if (ctx) { + const std::string key = tl.name + ".q2kpt_levels"; + gguf_set_arr_data(ctx.get(), key.c_str(), GGUF_TYPE_FLOAT32, + tl.levels.data(), tl.levels.size()); + } + } + } + LLAMA_LOG_INFO("%s: Q2_KPT pass 1 complete.\n", __func__); + } + + // IQ2_TQ: train per-tensor grid in pass 1 + struct iq2tq_meta { + std::string tensor_name; + int8_t grid[64]; + }; + std::vector<iq2tq_meta> iq2tq_all_meta; + if (params->ftype == LLAMA_FTYPE_MOSTLY_IQ2_TQ && !params->dry_run) { + const int64_t t_start_p1 = ggml_time_us(); + LLAMA_LOG_INFO("%s: IQ2_TQ pass 1: training per-tensor grids...\n", __func__); + + std::vector<no_init<uint8_t>> p1_read_data; + std::vector<no_init<float>> p1_f32_buf; + std::vector<std::thread> p1_workers; + p1_workers.reserve(nthread); + + for (size_t ti = 0; ti < tensors.size(); ++ti) { + ggml_tensor * tensor = tensors[ti]->tensor; + const std::string tname = ggml_get_name(tensor); + + // Mirror pass-2 logic: only quantize 2D+ weight tensors + bool quantize = tname.rfind("weight") == tname.size() - 6; + quantize &= (ggml_n_dims(tensor) >= 2); + quantize &= tname.find("_norm.weight") == std::string::npos; + quantize &= tname.find("ffn_gate_inp.weight") == std::string::npos; + if (!quantize) { continue; } + + ggml_type new_type = default_type; + if (!params->pure) { + new_type = llama_tensor_get_type_impl(qs, new_type, tensor, ftype, tensor_get_category(tname)); + } + if (params->token_embedding_type < GGML_TYPE_COUNT && + (tname == "token_embd.weight" || tname == "per_layer_token_embd.weight")) { + new_type = params->token_embedding_type; + } + if (params->output_tensor_type < GGML_TYPE_COUNT && tname == "output.weight") { + new_type = params->output_tensor_type; + } + if (new_type != GGML_TYPE_IQ2_TQ) { continue; } + + // Load tensor data + const size_t tsz = ggml_nbytes(tensor); + if (!ml.use_mmap) { + if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); } + tensor->data = p1_read_data.data(); + } + ml.load_data_for(tensor); + + // Dequantize to f32 if needed + const int64_t nelements = ggml_nelements(tensor); + float * f32_data; + if (tensor->type == GGML_TYPE_F32) { + f32_data = (float *) tensor->data; + } else { + llama_tensor_dequantize_impl(tensor, p1_f32_buf, p1_workers, nelements, nthread); + f32_data = (float *) p1_f32_buf.data(); + } + + // Resolve imatrix + const float * imatrix = nullptr; + if (imatrix_data) { + auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped)); + if (it2 != imatrix_data->end() && + it2->second.size() == (size_t)tensor->ne[0] * tensor->ne[2]) { + imatrix = it2->second.data(); + } + } + + const int64_t n_per_row = tensor->ne[0]; + const int64_t nrows = tensor->ne[1]; + + LLAMA_LOG_INFO("%s: IQ2_TQ grid for [%zu/%zu] %s\n", __func__, ti+1, tensors.size(), tensor->name); + + iq2tq_meta meta; + meta.tensor_name = tname; + iq2tq_train_grid(f32_data, nrows, n_per_row, imatrix, meta.grid); + iq2tq_all_meta.push_back(meta); + + // Save to GGUF + std::string grid_key = "iq2tq.grid." 
+ tname; + gguf_set_arr_data(ctx_outs[0].get(), grid_key.c_str(), GGUF_TYPE_INT8, meta.grid, 64); + } + const int64_t t_end_p1 = ggml_time_us(); + LLAMA_LOG_INFO("%s: IQ2_TQ pass 1 complete (%zu tensors trained, %.1f s).\n", + __func__, iq2tq_all_meta.size(), (t_end_p1 - t_start_p1) / 1e6); + } + + // IQ3_TQ: train per-tensor grid in pass 1 (16 entries × 8 levels = 128 bytes) + struct iq3tq_meta { + std::string tensor_name; + int8_t grid[128]; + }; + std::vector<iq3tq_meta> iq3tq_all_meta; + if (params->ftype == LLAMA_FTYPE_MOSTLY_IQ3_TQ && !params->dry_run) { + const int64_t t_start_p1 = ggml_time_us(); + LLAMA_LOG_INFO("%s: IQ3_TQ pass 1: training per-tensor grids...\n", __func__); + + std::vector<no_init<uint8_t>> p1_read_data; + std::vector<no_init<float>> p1_f32_buf; + std::vector<std::thread> p1_workers; + p1_workers.reserve(nthread); + + for (size_t ti = 0; ti < tensors.size(); ++ti) { + ggml_tensor * tensor = tensors[ti]->tensor; + const std::string tname = ggml_get_name(tensor); + + bool quantize = tname.rfind("weight") == tname.size() - 6; + quantize &= (ggml_n_dims(tensor) >= 2); + quantize &= tname.find("_norm.weight") == std::string::npos; + quantize &= tname.find("ffn_gate_inp.weight") == std::string::npos; + if (!quantize) { continue; } + + ggml_type new_type = default_type; + if (!params->pure) { + new_type = llama_tensor_get_type_impl(qs, new_type, tensor, ftype, tensor_get_category(tname)); + } + if (params->token_embedding_type < GGML_TYPE_COUNT && + (tname == "token_embd.weight" || tname == "per_layer_token_embd.weight")) { + new_type = params->token_embedding_type; + } + if (params->output_tensor_type < GGML_TYPE_COUNT && tname == "output.weight") { + new_type = params->output_tensor_type; + } + if (new_type != GGML_TYPE_IQ3_TQ) { continue; } + + const size_t tsz = ggml_nbytes(tensor); + if (!ml.use_mmap) { + if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); } + tensor->data = p1_read_data.data(); + } + ml.load_data_for(tensor); + + const int64_t nelements = ggml_nelements(tensor); + float * f32_data; + if (tensor->type == GGML_TYPE_F32) { + f32_data = (float *) tensor->data; + } else { + llama_tensor_dequantize_impl(tensor, p1_f32_buf, p1_workers, nelements, nthread); + f32_data = (float *) p1_f32_buf.data(); + } + + const float * imatrix = nullptr; + if (imatrix_data) { + auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped)); + if (it2 != imatrix_data->end() && + it2->second.size() == (size_t)tensor->ne[0] * tensor->ne[2]) { + imatrix = it2->second.data(); + } + } + + const int64_t n_per_row = tensor->ne[0]; + const int64_t nrows = tensor->ne[1]; + + LLAMA_LOG_INFO("%s: IQ3_TQ grid for [%zu/%zu] %s\n", __func__, ti+1, tensors.size(), tensor->name); + + iq3tq_meta meta; + meta.tensor_name = tname; + iq3tq_train_grid(f32_data, nrows, n_per_row, imatrix, meta.grid); + iq3tq_all_meta.push_back(meta); + + std::string grid_key = "iq3tq.grid." 
+ tname; + gguf_set_arr_data(ctx_outs[0].get(), grid_key.c_str(), GGUF_TYPE_INT8, meta.grid, 128); + } + const int64_t t_end_p1 = ggml_time_us(); + LLAMA_LOG_INFO("%s: IQ3_TQ pass 1 complete (%zu tensors trained, %.1f s).\n", + __func__, iq3tq_all_meta.size(), (t_end_p1 - t_start_p1) / 1e6); + } + + // IQ1_BN: train per-tensor codebook in pass 1 (4096 × 8D centroids = 32768 bytes) + struct iq1bn_meta { + std::string tensor_name; + int8_t aux[32768]; + }; + std::vector<iq1bn_meta> iq1bn_all_meta; + if (params->ftype == LLAMA_FTYPE_MOSTLY_IQ1_BN && !params->dry_run) { + const int64_t t_start_p1 = ggml_time_us(); + LLAMA_LOG_INFO("%s: IQ1_BN pass 1: training per-tensor codebooks...\n", __func__); + + std::vector<no_init<uint8_t>> p1_read_data; + std::vector<no_init<float>> p1_f32_buf; + std::vector<std::thread> p1_workers; + p1_workers.reserve(nthread); + + for (size_t ti = 0; ti < tensors.size(); ++ti) { + ggml_tensor * tensor = tensors[ti]->tensor; + const std::string tname = ggml_get_name(tensor); + + bool quantize = tname.rfind("weight") == tname.size() - 6; + quantize &= (ggml_n_dims(tensor) >= 2); + quantize &= tname.find("_norm.weight") == std::string::npos; + quantize &= tname.find("ffn_gate_inp.weight") == std::string::npos; + if (!quantize) { continue; } + + ggml_type new_type = default_type; + if (!params->pure) { + new_type = llama_tensor_get_type_impl(qs, new_type, tensor, ftype, tensor_get_category(tname)); + } + if (params->token_embedding_type < GGML_TYPE_COUNT && + (tname == "token_embd.weight" || tname == "per_layer_token_embd.weight")) { + new_type = params->token_embedding_type; + } + if (params->output_tensor_type < GGML_TYPE_COUNT && tname == "output.weight") { + new_type = params->output_tensor_type; + } + if (new_type != GGML_TYPE_IQ1_BN) { continue; } + + const size_t tsz = ggml_nbytes(tensor); + if (!ml.use_mmap) { + if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); } + tensor->data = p1_read_data.data(); + } + ml.load_data_for(tensor); + + const int64_t nelements = ggml_nelements(tensor); + float * f32_data; + if (tensor->type == GGML_TYPE_F32) { + f32_data = (float *) tensor->data; + } else { + llama_tensor_dequantize_impl(tensor, p1_f32_buf, p1_workers, nelements, nthread); + f32_data = (float *) p1_f32_buf.data(); + } + + const float * imatrix = nullptr; + if (imatrix_data) { + auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped)); + if (it2 != imatrix_data->end() && + it2->second.size() == (size_t)tensor->ne[0] * tensor->ne[2]) { + imatrix = it2->second.data(); + } + } + + const int64_t n_per_row = tensor->ne[0]; + const int64_t nrows = tensor->ne[1]; + + LLAMA_LOG_INFO("%s: IQ1_BN codebook for [%zu/%zu] %s\n", __func__, ti+1, tensors.size(), tensor->name); + + iq1bn_meta meta; + meta.tensor_name = tname; + iq1bn_train_codebook(f32_data, nrows, n_per_row, imatrix, meta.aux, nthread); + iq1bn_all_meta.push_back(meta); + + std::string aux_key = "iq1bn.aux." 
+ tname; + gguf_set_arr_data(ctx_outs[0].get(), aux_key.c_str(), GGUF_TYPE_INT8, meta.aux, 32768); + } + const int64_t t_end_p1 = ggml_time_us(); + LLAMA_LOG_INFO("%s: IQ1_BN pass 1 complete (%zu tensors trained, %.1f s).\n", + __func__, iq1bn_all_meta.size(), (t_end_p1 - t_start_p1) / 1e6); + } + // no output file for --dry-run if (!params->dry_run) { new_ofstream(0); @@ -1106,6 +1838,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // // main loop: iterate over all weights // + size_t tensor_pass2_idx = 0; // index into tensors[]; pass-1 levels (Q3_PT/Q3_KPT/Q4_DPT) are looked up by this index for (size_t i = 0; i < tensors.size(); ++i) { const auto & weight = *tensors[i]; @@ -1232,6 +1965,75 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size; const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1; + // Q3_PT: set the per-tensor levels (trained in pass 1) as global for quantization + if (new_type == GGML_TYPE_Q3_PT) { + q3pt_set_levels(q3pt_all_levels.data() + tensor_pass2_idx * Q3PT_N_LEVELS); + } + + // Q3_KPT: set the per-tensor levels (trained in pass 1) as global for quantization + if (new_type == GGML_TYPE_Q3_KPT) { + q3kpt_set_levels(q3kpt_all_levels.data() + tensor_pass2_idx * Q3KPT_N_LEVELS); + } + + // Q4_DPT: set the per-tensor levels (trained in pass 1) as global for quantization + if (new_type == GGML_TYPE_Q4_DPT) { + q4dpt_set_levels(q4dpt_all_levels.data() + tensor_pass2_idx * Q4DPT_N_LEVELS); + } + + // IQ2_TQ: set per-tensor trained grid + if (new_type == GGML_TYPE_IQ2_TQ) { + bool found = false; + for (const auto & meta : iq2tq_all_meta) { + if (meta.tensor_name == tm.name) { + iq2tq_set_grid(meta.grid); + found = true; + break; + } + } + if (!found) { + LLAMA_LOG_WARN("%s: WARNING: no trained grid for IQ2_TQ tensor %s\n", __func__, tm.name.c_str()); + } + } + + // IQ3_TQ: set per-tensor trained grid + if (new_type == GGML_TYPE_IQ3_TQ) { + bool found = false; + for (const auto & meta : iq3tq_all_meta) { + if (meta.tensor_name == tm.name) { + iq3tq_set_grid(meta.grid); + found = true; + break; + } + } + if (!found) { + LLAMA_LOG_WARN("%s: WARNING: no trained grid for IQ3_TQ tensor %s\n", __func__, tm.name.c_str()); + } + } + + // IQ1_BN: set per-tensor trained codebook + if (new_type == GGML_TYPE_IQ1_BN) { + bool found = false; + for (const auto & meta : iq1bn_all_meta) { + if (meta.tensor_name == tm.name) { + iq1bn_set_aux(meta.aux); + found = true; + break; + } + } + if (!found) { + LLAMA_LOG_WARN("%s: WARNING: no trained codebook for IQ1_BN tensor %s\n", __func__, tm.name.c_str()); + } + } + + // Q2_KPT: quantize_q2_kpt trains per-block levels internally. + // Levels were already trained and saved to GGUF in pass 1. + // We still need to allocate the levels buffer for quantization to work correctly. 
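A design note on the grid/codebook lookups above: each one does a linear scan of the trained-metadata vector for every tensor, which is quadratic across the model. A name-indexed map would avoid that; a sketch of the alternative (hypothetical, behavior unchanged):

// Hypothetical alternative to the per-tensor linear scans: build a name index
// once, after pass-1 training has finished (so the vector no longer reallocates
// and the stored pointers stay valid).
#include <unordered_map>
static std::unordered_map<std::string, const int8_t *> build_grid_index(
        const std::vector<iq2tq_meta> & metas) {
    std::unordered_map<std::string, const int8_t *> by_name;
    by_name.reserve(metas.size());
    for (const auto & m : metas) {
        by_name.emplace(m.tensor_name, m.grid); // array decays to const int8_t *
    }
    return by_name;
}
// usage in the main loop:
//   static const auto idx = build_grid_index(iq2tq_all_meta);
//   auto it = idx.find(tm.name);
//   if (it != idx.end()) { iq2tq_set_grid(it->second); }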
+ if (new_type == GGML_TYPE_Q2_KPT) { + const int64_t total_rows = nrows * tensor->ne[2]; + q2kpt_free_levels(); // Clear any stale levels from previous tensor + q2kpt_prepare_levels(total_rows, n_per_row); // Allocate for this tensor + } + // quantize each expert separately since they have different importance matrices new_size = 0; for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) { @@ -1255,7 +2057,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: fout.write((const char *) new_data, new_size); zeros(fout, GGML_PAD(new_size, align) - new_size); } // no --dry-run - } // main loop + + tensor_pass2_idx++; + } // iterate over tensors if (!params->dry_run) { close_ofstream(); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index cd4bc5ef1d..a0a29bc7f2 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -257,6 +257,9 @@ if (NOT GGML_BACKEND_DL) llama_build_and_test(test-rope.cpp) endif() +# Quantization laboratory - tests for 2.5 BPW proposals +llama_build_and_test(test-quant-laboratory.cpp) + # libmtmd set(LLAMA_TEST_NAME test-mtmd-c-api) llama_build_and_test(test-mtmd-c-api.c) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index f41558902c..849b0265b2 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -261,7 +261,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) { } else if (t->type == GGML_TYPE_I8) { tv.push_back((float)*(int8_t *) &buf[i]); } else if (quantized) { - tt->to_float(&buf[i], vq.data(), bs); + tt->to_float(&buf[i], vq.data(), bs, nullptr); tv.insert(tv.end(), vq.begin(), vq.end()); } else { GGML_ABORT("fatal error"); } diff --git a/tests/test-quant-laboratory.cpp b/tests/test-quant-laboratory.cpp new file mode 100644 index 0000000000..477461a512 --- /dev/null +++ b/tests/test-quant-laboratory.cpp @@ -0,0 +1,355 @@ +// test-quant-laboratory.cpp +// Reusable testing harness for quantization experiments. +// +// Provides: +// - Synthetic data generators (Gaussian, Laplace, uniform) +// - Real tensor data loading (f32bin format with [nrow, ncol] header) +// - Importance matrix loading (flat f32 array) +// - RMSE computation +// - Multi-approach comparison framework (quantize → dequantize → matmul error) +// - ggml graph-level verification skeleton +// +// To add a new experiment: +// 1. Add an approach function: void approach_xxx(const float *W, float *out, +// int64_t nrow, int64_t ncol, +// const float *imatrix) +// 2. Register it in compare_approaches() +// 3. 
Call test_approach_comparison() from main() + +#include "../ggml/src/ggml-quants.h" +#include "ggml-backend.h" +#include "ggml-alloc.h" +#include "ggml.h" + +#include <algorithm> +#include <cmath> +#include <cstdint> +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <functional> +#include <random> +#include <string> +#include <vector> + +// ============================================================================ +// Helper functions +// ============================================================================ + +static float rmse(const float * a, const float * b, size_t n) { + double sum = 0.0; + for (size_t i = 0; i < n; ++i) { + double d = (double) a[i] - (double) b[i]; + sum += d * d; + } + return (float) sqrt(sum / n); +} + +static void fill_gaussian(float * data, size_t n, std::mt19937 & gen, float sigma = 1.0f) { + std::normal_distribution<float> dist(0.0f, sigma); + for (size_t i = 0; i < n; ++i) { + data[i] = dist(gen); + } +} + +static void fill_laplace(float * data, size_t n, std::mt19937 & gen, float b = 1.0f) { + std::uniform_real_distribution<float> u(-0.5f, 0.5f); + for (size_t i = 0; i < n; ++i) { + float v = u(gen); + data[i] = -b * ((v > 0) - (v < 0)) * logf(1.0f - 2.0f * fabsf(v)); + } +} + +static void fill_uniform(float * data, size_t n, std::mt19937 & gen, float range = 1.0f) { + std::uniform_real_distribution<float> dist(-range, range); + for (size_t i = 0; i < n; ++i) { + data[i] = dist(gen); + } +} + +static void fill_offset_gaussian(float * data, size_t n, std::mt19937 & gen, float sigma = 1.0f, float offset = 2.0f) { + std::normal_distribution<float> dist(offset, sigma); + for (size_t i = 0; i < n; ++i) { + data[i] = dist(gen); + } +} + +// ============================================================================ +// Data loading +// ============================================================================ +static bool load_f32_tensor(const char * path, std::vector<float> & data, int64_t & nrow, int64_t & n_per_row) { + FILE * f = fopen(path, "rb"); + if (!f) { + return false; + } + + int64_t header[2]; + if (fread(header, sizeof(int64_t), 2, f) != 2) { + fclose(f); + return false; + } + nrow = header[0]; + n_per_row = header[1]; + + int64_t total = nrow * n_per_row; + data.resize(total); + size_t nread = fread(data.data(), sizeof(float), total, f); + fclose(f); + if ((int64_t) nread != total) { + return false; + } + return true; +} + +// Load imatrix file (flat f32 array, no header, one importance value per column dimension) +// The imatrix is the sum-of-squares of activations per dimension. 
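For reference, a minimal writer for the f32bin layout expected by load_f32_tensor() above (two int64 header words, then row-major f32 data); a sketch only, not part of the harness:

// Sketch: produce a .f32bin file compatible with load_f32_tensor().
static bool save_f32_tensor(const char * path, const float * data, int64_t nrow, int64_t n_per_row) {
    FILE * f = fopen(path, "wb");
    if (!f) {
        return false;
    }
    const int64_t header[2] = { nrow, n_per_row };          // [nrow, ncol] header
    bool ok = fwrite(header, sizeof(int64_t), 2, f) == 2;
    const size_t total = (size_t) (nrow * n_per_row);
    ok = ok && fwrite(data, sizeof(float), total, f) == total; // row-major payload
    fclose(f);
    return ok;
}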
+static bool load_imatrix(const char * path, std::vector<float> & data, int64_t expected_dims) { + FILE * f = fopen(path, "rb"); + if (!f) { + return false; + } + + // Get file size to determine dimensions + fseek(f, 0, SEEK_END); + long file_size = ftell(f); + fseek(f, 0, SEEK_SET); + + int64_t dims = file_size / sizeof(float); + if (expected_dims > 0 && dims != expected_dims) { + printf(" WARN: imatrix dims %lld != expected %lld\n", (long long) dims, (long long) expected_dims); + fclose(f); + return false; + } + + data.resize(dims); + size_t nread = fread(data.data(), sizeof(float), dims, f); + fclose(f); + if ((int64_t) nread != dims) { + return false; + } + + // Compute stats + float imin = data[0], imax = data[0], isum = 0; + for (int64_t i = 0; i < dims; i++) { + if (data[i] < imin) imin = data[i]; + if (data[i] > imax) imax = data[i]; + isum += data[i]; + } + printf(" Loaded imatrix: %lld dims, min=%.6f, max=%.6f, mean=%.6f\n", + (long long) dims, imin, imax, isum / dims); + + return true; +} + +// ============================================================================ +// Test class +// ============================================================================ + +class QuantLaboratory { + public: + QuantLaboratory() : gen(42) {} + + // ======================================================================== + // MULTI-APPROACH COMPARISON FRAMEWORK + // + // Each "approach" is a function that takes float weights and produces + // dequantized float output. The framework computes: + // - Weight RMSE (dequant vs original) + // - Matmul error (dequant weights x real activations vs f64 reference) + // - Ratio vs first approach (typically Q2_K baseline) + // + // To add a new approach: + // 1. Write: void approach_xxx(const float *W, float *out, + // int64_t nrow, int64_t ncol, + // const float *imatrix) { ... } + // 2. 
Add it to the `approaches` array in compare_approaches() + // ======================================================================== + + // -- Example approach: Q2_K baseline (via ggml library) -- + // Uncomment and adapt for your experiment: + // + // void approach_q2k(const float * W, float * out, int64_t nrow, int64_t ncol, const float * imatrix) { + // size_t rs = ggml_row_size(GGML_TYPE_Q2_K, ncol); + // std::vector<uint8_t> buf(nrow * rs); + // quantize_q2_K(W, buf.data(), nrow, ncol, imatrix); + // auto * tr = ggml_get_type_traits(GGML_TYPE_Q2_K); + // for (int64_t r = 0; r < nrow; r++) { + // tr->to_float(buf.data() + r * rs, out + r * ncol, ncol, NULL); + // } + // } + + void compare_approaches(const float * W, + int64_t w_nrow, + int64_t w_ncol, + const float * A, + int64_t a_nrow, + int64_t a_ncol, + const char * name, + const float * imatrix) { + if (w_ncol != a_ncol) { + return; + } + int64_t nr = std::min(w_nrow, (int64_t) 256); + int64_t nc = w_ncol; + + // Reference matmul (double precision) + std::vector<double> ref(a_nrow * nr); + for (int64_t t = 0; t < a_nrow; t++) { + for (int64_t r = 0; r < nr; r++) { + double s = 0; + for (int64_t c = 0; c < nc; c++) { + s += (double) A[t * a_ncol + c] * (double) W[r * nc + c]; + } + ref[t * nr + r] = s; + } + } + double ref_mag2 = 0; + for (auto v : ref) { + ref_mag2 += v * v; + } + float ref_rms = (float) sqrt(ref_mag2 / (a_nrow * nr)); + (void) ref_rms; + + struct Approach { + const char * name; + float bpw; + std::function<void(const float *, float *, int64_t, int64_t, const float *)> fn; + }; + + // ── Register approaches here ── + Approach approaches[] = { + // { "Q2_K (baseline)", 2.625f, + // [&](auto * W, auto * o, auto nr, auto nc, auto * im) { + // approach_q2k(W, o, nr, nc, im); + // } }, + // Add more approaches... + { "placeholder", 0.0f, nullptr }, // remove once real approaches added + }; + + printf("\n %-28s %5s %10s %10s %7s\n", name, "BPW", "RMSE", "MatmulErr", "vs Q2K"); + printf(" %-28s %5s %10s %10s %7s\n", "---", "---", "---", "---", "---"); + + float baseline_matmul_err = 0; + for (auto & ap : approaches) { + if (!ap.fn) { + continue; + } + std::vector<float> dec(nr * nc); + ap.fn(W, dec.data(), nr, nc, imatrix); + + // Weight RMSE + double werr2 = 0; + for (int64_t i = 0; i < nr * nc; i++) { + double d = W[i] - dec[i]; + werr2 += d * d; + } + float wrmse = (float) sqrt(werr2 / (nr * nc)); + + // Matmul error + double merr2 = 0; + for (int64_t t = 0; t < a_nrow; t++) { + for (int64_t r = 0; r < nr; r++) { + double s = 0; + for (int64_t c = 0; c < nc; c++) { + s += (double) A[t * a_ncol + c] * (double) dec[r * nc + c]; + } + double d = s - ref[t * nr + r]; + merr2 += d * d; + } + } + float matmul_rmse = (float) sqrt(merr2 / (a_nrow * nr)); + + if (baseline_matmul_err == 0) { + baseline_matmul_err = matmul_rmse; + } + float ratio = (baseline_matmul_err > 1e-10f) ? 
matmul_rmse / baseline_matmul_err : 0; + + printf(" %-28s %5.3f %10.6f %10.6f %6.3fx\n", ap.name, ap.bpw, wrmse, matmul_rmse, ratio); + } + } + + // Run comparison on all tensor pairs from data directory + int test_approach_comparison(const char * data_dir) { + printf("\n"); + printf("=======================================================================\n"); + printf(" MULTI-APPROACH COMPARISON (real weights x real activations)\n"); + printf("=======================================================================\n"); + + struct TestPair { + const char * wf; + const char * af; + const char * imf; + const char * name; + } pairs[] = { + { "blk_0_ffn_gate_weight.f32bin", "act_blk0_ffn_input.f32bin", "imatrix_blk0_ffn_gate_up.f32bin", "ffn_gate" }, + { "blk_0_ffn_up_weight.f32bin", "act_blk0_ffn_input.f32bin", "imatrix_blk0_ffn_gate_up.f32bin", "ffn_up" }, + { "blk_0_ffn_down_weight.f32bin", "act_blk0_ffn_down_input.f32bin", "imatrix_blk0_ffn_down.f32bin", "ffn_down" }, + { "blk_0_attn_q_weight.f32bin", "act_blk0_attn_input.f32bin", "imatrix_blk0_attn_qkv.f32bin", "attn_q" }, + }; + + for (auto & p : pairs) { + char wp[512], ap[512], imp[512]; + snprintf(wp, sizeof(wp), "%s/%s", data_dir, p.wf); + snprintf(ap, sizeof(ap), "%s/%s", data_dir, p.af); + snprintf(imp, sizeof(imp), "%s/%s", data_dir, p.imf); + std::vector<float> wd, ad, im; + int64_t wnr, wnc, anr, anc; + if (!load_f32_tensor(wp, wd, wnr, wnc) || !load_f32_tensor(ap, ad, anr, anc)) { + continue; + } + const float * im_ptr = nullptr; + if (load_imatrix(imp, im, wnc)) { + im_ptr = im.data(); + } else { + printf(" [%s] No imatrix found, using uniform weights\n", p.name); + } + compare_approaches(wd.data(), wnr, wnc, ad.data(), anr, anc, p.name, im_ptr); + } + printf("\n"); + return 0; + } + + private: + std::mt19937 gen; +}; + +// ============================================================================ +// Main +// ============================================================================ + +int main(int argc, char ** argv) { + ggml_backend_load_all(); + + QuantLaboratory lab; + int total_fail = 0; + + printf("Quantization Laboratory\n"); + printf("=======================\n"); + + // Real data tests (from data/ directory) + { + const char * data_dir = "data"; + if (argc > 1) { + data_dir = argv[1]; + } + + char probe[512]; + snprintf(probe, sizeof(probe), "%s/blk_0_ffn_gate_weight.f32bin", data_dir); + FILE * fp = fopen(probe, "rb"); + if (fp) { + fclose(fp); + total_fail += lab.test_approach_comparison(data_dir); + } else { + printf("\n=== Real Data Tests SKIPPED ===\n"); + printf(" No data found at %s\n", data_dir); + printf( + " Run: cd data && PYTHONPATH=../gguf-py python3 ../scripts/extract-tensor-data.py MODEL.gguf " + "blk.0.ffn_gate blk.0.ffn_up blk.0.ffn_down blk.0.attn_q\n"); + printf(" And: llama-capture-layer-data -m MODEL.gguf -l 0 -o data\n"); + } + } + + printf("\n\n=== Testing Complete: %d failures ===\n", total_fail); + + return total_fail > 0 ? 
1 : 0; +} diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp index a05fab5042..b313c41e65 100644 --- a/tests/test-quantize-fns.cpp +++ b/tests/test-quantize-fns.cpp @@ -54,7 +54,7 @@ static float total_quantization_error(const ggml_type_traits * qfns, const ggml_ std::vector<float> tmp_out(test_size); qfns_cpu->from_float(test_data, tmp_q.data(), test_size); - qfns->to_float(tmp_q.data(), tmp_out.data(), test_size); + qfns->to_float(tmp_q.data(), tmp_out.data(), test_size, nullptr); return array_rmse(test_data, tmp_out.data(), test_size); } @@ -66,10 +66,10 @@ static float reference_quantization_error(const ggml_type_traits * qfns, const g // FIXME: why is done twice? qfns_cpu->from_float(test_data, tmp_q.data(), test_size); - qfns->to_float(tmp_q.data(), tmp_out.data(), test_size); + qfns->to_float(tmp_q.data(), tmp_out.data(), test_size, nullptr); qfns->from_float_ref(test_data, tmp_q.data(), test_size); - qfns->to_float(tmp_q.data(), tmp_out_ref.data(), test_size); + qfns->to_float(tmp_q.data(), tmp_out_ref.data(), test_size, nullptr); return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size); } @@ -95,7 +95,7 @@ static float dot_product_error(const ggml_type_traits * qfns, const ggml_type_tr vdot->from_float(test_data2, tmp_q2.data(), test_size); float result = INFINITY; - qfns_cpu->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1); + qfns_cpu->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1, nullptr); const float dot_ref = dot_product(test_data1, test_data2, test_size); diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp index cac0782dee..92597a04f4 100644 --- a/tests/test-quantize-perf.cpp +++ b/tests/test-quantize-perf.cpp @@ -309,7 +309,7 @@ int main(int argc, char * argv[]) { for (size_t size : params.test_sizes) { printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); auto quantize_fn = [&](void) -> float { - qfns->to_float(test_q1, test_out, size); + qfns->to_float(test_q1, test_out, size, nullptr); return test_out[0]; }; size_t quantized_size = ggml_row_size(type, size); @@ -341,7 +341,7 @@ int main(int argc, char * argv[]) { printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); auto quantize_fn = [&](void) -> float { float result; - qfns_cpu->vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1); + qfns_cpu->vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1, nullptr); return result; }; size_t quantized_size = ggml_row_size(type, size); diff --git a/tests/test-quantize-stats.cpp b/tests/test-quantize-stats.cpp index de587d456d..c56f101187 100644 --- a/tests/test-quantize-stats.cpp +++ b/tests/test-quantize-stats.cpp @@ -158,7 +158,7 @@ static void test_roundtrip_on_chunk( } else { qfns_cpu.from_float(input_scratch, quantized_scratch, chunk_size); } - qfns.to_float(quantized_scratch, output_scratch, chunk_size); + qfns.to_float(quantized_scratch, output_scratch, chunk_size, nullptr); update_error_stats(chunk_size, input_scratch, output_scratch, stats); } diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index b433c91d85..1d06885e7b 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -38,5 +38,6 @@ else() add_subdirectory(export-lora) endif() add_subdirectory(fit-params) + add_subdirectory(capture-layer-data) add_subdirectory(results) endif() diff --git a/tools/capture-layer-data/CMakeLists.txt b/tools/capture-layer-data/CMakeLists.txt new file mode 100644 index 0000000000..4a81272e2e --- /dev/null +++ b/tools/capture-layer-data/CMakeLists.txt @@ -0,0 
+1,9 @@ +set(TARGET llama-capture-layer-data) +add_executable(${TARGET} capture-layer-data.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_include_directories(${TARGET} PRIVATE ../../common) +target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/tools/capture-layer-data/capture-layer-data.cpp b/tools/capture-layer-data/capture-layer-data.cpp new file mode 100644 index 0000000000..30cfd16b21 --- /dev/null +++ b/tools/capture-layer-data/capture-layer-data.cpp @@ -0,0 +1,251 @@ +// capture-layer-data.cpp +// Captures intermediate activation tensors during model inference +// and saves them as .f32bin files for the quantization laboratory. +// +// Usage: +// llama-capture-layer-data -m MODEL_PATH -l LAYER [-p PROMPT] [-o OUTPUT_DIR] +// +// Example: +// llama-capture-layer-data -m /devel/models/Qwen_Qwen3-4B-Instruct-2507-bf16.gguf -l 0 -o data + +#include "arg.h" +#include "common.h" +#include "ggml-backend.h" +#include "ggml.h" +#include "llama.h" +#include "log.h" + +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <filesystem> +#include <fstream> +#include <string> +#include <vector> + +struct TensorMapping { + const char * graph_name_prefix; + const char * output_suffix; +}; + +static const TensorMapping mappings[] = { + { "attn_norm", "attn_input" }, + { "kqv_out", "attn_output_input" }, + { "ffn_norm", "ffn_input" }, + { "ffn_swiglu", "ffn_down_input" }, +}; +static constexpr int N_MAPPINGS = sizeof(mappings) / sizeof(mappings[0]); + +struct CaptureState { + int target_layer; + std::string output_dir; + int captured_count = 0; + std::string pending_name; + + std::string graph_to_filename(const char * graph_name) const { + for (int i = 0; i < N_MAPPINGS; i++) { + std::string prefix = mappings[i].graph_name_prefix; + if (strncmp(graph_name, prefix.c_str(), prefix.size()) == 0) { + char buf[256]; + snprintf(buf, sizeof(buf), "act_blk%d_%s.f32bin", target_layer, mappings[i].output_suffix); + return std::string(buf); + } + } + return ""; + } +}; + +static CaptureState * g_capture_state = nullptr; + +static void save_tensor_as_f32bin(const ggml_tensor * t, const void * data, const std::string & filepath) { + int64_t n_rows = t->ne[1]; + int64_t row_len = t->ne[0]; + + int64_t total = 1; + for (int i = 0; i < GGML_MAX_DIMS; i++) { + total *= t->ne[i]; + } + + if (!data) { + LOG_ERR("Tensor %s has null data pointer\n", t->name); + return; + } + + std::vector<float> f32_data(total); + + if (t->type == GGML_TYPE_F32) { + memcpy(f32_data.data(), data, total * sizeof(float)); + } else if (t->type == GGML_TYPE_F16) { + const ggml_fp16_t * src = (const ggml_fp16_t *) data; + for (int64_t i = 0; i < total; i++) { + f32_data[i] = ggml_fp16_to_fp32(src[i]); + } + } else if (t->type == GGML_TYPE_BF16) { + const ggml_bf16_t * src = (const ggml_bf16_t *) data; + for (int64_t i = 0; i < total; i++) { + f32_data[i] = ggml_bf16_to_fp32(src[i]); + } + } else { + LOG_ERR("Unsupported tensor type %s for %s\n", ggml_type_name(t->type), t->name); + return; + } + + std::ofstream file(filepath, std::ios::binary); + if (!file) { + LOG_ERR("Failed to open %s for writing\n", filepath.c_str()); + return; + } + + file.write(reinterpret_cast<const char *>(&n_rows), sizeof(int64_t)); + file.write(reinterpret_cast<const char *>(&row_len), sizeof(int64_t)); + file.write(reinterpret_cast<const char *>(f32_data.data()), total * sizeof(float)); + + file.close(); + LOG(" Captured: %s -> %s (%lld x %lld, %s)\n", t->name, filepath.c_str(), (long long) n_rows, (long 
long) row_len, + ggml_type_name(t->type)); +} + +static bool capture_callback(ggml_tensor * t, bool ask, void * user_data) { + auto * state = (CaptureState *) user_data; + + if (ask) { + char target[128]; + for (int i = 0; i < N_MAPPINGS; i++) { + snprintf(target, sizeof(target), "%s-%d", mappings[i].graph_name_prefix, state->target_layer); + if (strcmp(t->name, target) == 0) { + state->pending_name = t->name; + return true; + } + } + return false; + } + + if (state->pending_name.empty()) { + return true; + } + if (strcmp(t->name, state->pending_name.c_str()) != 0) { + return true; + } + + // Make sure we have host-accessible data before writing it out + std::vector<uint8_t> host_copy; + const void * data = t->data; + if (!ggml_backend_buffer_is_host(t->buffer)) { + size_t nbytes = ggml_nbytes(t); + host_copy.resize(nbytes); + ggml_backend_tensor_get(t, host_copy.data(), 0, nbytes); + data = host_copy.data(); + LOG_WRN("Tensor %s is not host-accessible, data copied via backend\n", t->name); + } + + std::string filename = state->graph_to_filename(t->name); + if (!filename.empty()) { + std::filesystem::create_directories(state->output_dir); + std::string filepath = (std::filesystem::path(state->output_dir) / filename).string(); + save_tensor_as_f32bin(t, data, filepath); + state->captured_count++; + } + + state->pending_name.clear(); + return true; +} + +static void print_usage(void) { + LOG("Usage: llama-capture-layer-data -m MODEL_PATH [-l LAYER] [-p PROMPT] [-o OUTPUT_DIR]\n"); + LOG("\n"); + LOG(" -m MODEL Path to GGUF model (BF16/F16 recommended)\n"); + LOG(" -l LAYER Target layer index (default: 0)\n"); + LOG(" -p PROMPT Inference prompt (default: \"The quick brown fox...\")\n"); + LOG(" -o DIR Output directory for .f32bin files (default: data)\n"); +} + +int main(int argc, char ** argv) { + if (argc < 3 || (std::string(argv[1]) == "-h" || std::string(argv[1]) == "--help")) { + print_usage(); + return 1; + } + + common_params params; + int layer = 0; + std::string output_dir = "data"; + std::string prompt = "The quick brown fox jumps over the lazy dog."; + std::string model_path; + + for (int i = 1; i < argc; i++) { + std::string arg = argv[i]; + if (arg == "-m" && i + 1 < argc) { + model_path = argv[++i]; + } else if (arg == "-l" && i + 1 < argc) { + layer = atoi(argv[++i]); + } else if (arg == "-p" && i + 1 < argc) { + prompt = argv[++i]; + } else if (arg == "-o" && i + 1 < argc) { + output_dir = argv[++i]; + } + } + + if (model_path.empty()) { + LOG_ERR("Error: -m MODEL_PATH is required\n\n"); + print_usage(); + return 1; + } + + params.model.path = model_path; + params.prompt = prompt; + params.n_batch = 512; + params.n_ubatch = 512; + params.n_gpu_layers = 0; + params.fit_params = false; + + CaptureState state; + state.target_layer = layer; + state.output_dir = output_dir; + g_capture_state = &state; + + params.cb_eval = capture_callback; + params.cb_eval_user_data = &state; + + LOG("Loading model: %s\n", model_path.c_str()); + LOG("Target layer: %d\n", layer); + LOG("Output directory: %s\n", output_dir.c_str()); + + common_init(); + ggml_backend_load_all(); + llama_backend_init(); + llama_numa_init(params.numa); + + auto llama_init = common_init_from_params(params); + if (!llama_init) { + LOG_ERR("Failed to load model\n"); + return 1; + } + + auto * model = llama_init->model(); + auto * ctx = llama_init->context(); + + if (model == nullptr || ctx == nullptr) { + LOG_ERR("Failed to initialize context\n"); + return 1; + } + + LOG("Model loaded successfully\n"); + + const llama_vocab * vocab = llama_model_get_vocab(model); + const bool add_bos = llama_vocab_get_add_bos(vocab); + std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos); + + 
if (tokens.empty()) { + LOG_ERR("No tokens generated from prompt\n"); + return 1; + } + + LOG("Tokenized prompt: %zu tokens\n", tokens.size()); + LOG("Running inference...\n"); + + if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) { + LOG_ERR("llama_decode failed\n"); + return 1; + } + + LOG("\nDone. Captured %d tensors to %s/\n", state.captured_count, output_dir.c_str()); + + llama_backend_free(); + + return state.captured_count == 0 ? 1 : 0; +} diff --git a/tools/export-lora/export-lora.cpp b/tools/export-lora/export-lora.cpp index e1bc4a2f31..b8637512f0 100644 --- a/tools/export-lora/export-lora.cpp +++ b/tools/export-lora/export-lora.cpp @@ -318,7 +318,7 @@ struct lora_merge_ctx { auto nels = ggml_nelements(inp_base); const auto * qtype = ggml_get_type_traits(base->type); std::vector<uint8_t> dequant_buf(nels * sizeof(float)); - qtype->to_float(read_buf.data(), (float *)dequant_buf.data(), nels); + qtype->to_float(read_buf.data(), (float *)dequant_buf.data(), nels, nullptr); ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size()); } else { ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base)); } diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index a882c78f1b..ac2cb114bc 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -46,6 +46,13 @@ static const std::vector QUANT_OPTIONS = { { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", }, { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", }, { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", }, + { "Q3_PT", LLAMA_FTYPE_MOSTLY_Q3_PT, " 3.875 bpw, per-tensor Lloyd-Max levels", }, + { "Q3_KPT", LLAMA_FTYPE_MOSTLY_Q3_KPT, " Q3_K with learned per-tensor levels" }, + { "Q4_DPT", LLAMA_FTYPE_MOSTLY_Q4_DPT, " IQ4_NL with learned per-tensor int8 levels" }, + { "Q2_KPT", LLAMA_FTYPE_MOSTLY_Q2_KPT, " Q2_K with learned per-tensor float levels" }, + { "IQ2_TQ", LLAMA_FTYPE_MOSTLY_IQ2_TQ, " 2.0625 bpw, trellis quantized" }, + { "IQ3_TQ", LLAMA_FTYPE_MOSTLY_IQ3_TQ, " 3.5625 bpw, per-tensor trained grid" }, + { "IQ1_BN", LLAMA_FTYPE_MOSTLY_IQ1_BN, " 1.5625 bpw, 8D vector quantized" }, + { "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", }, { "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", }, { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" }, @@ -162,6 +169,9 @@ static void usage(const char * executable) { printf(" WARNING: this is an advanced option, use with care.\n"); printf(" --dry-run\n"); printf(" calculate and show the final quantization size without performing quantization\n"); + printf(" --threads n\n"); + printf(" number of threads to use for cross-tensor parallelization (default: 0, use same as within-tensor)\n"); + printf(" when n > 0, enables parallel quantization of multiple tensors simultaneously\n"); printf(" example: llama-quantize --dry-run model-f32.gguf Q4_K\n\n"); printf("note: --include-weights and --exclude-weights cannot be used together\n\n"); printf("-----------------------------------------------------------------------------\n");
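For completeness, the new ftypes are exercised like any other quantization target; pass 1 (level/grid/codebook training) runs automatically before the output file is opened, and is skipped for --dry-run. Example invocations (paths are placeholders):

# train the per-tensor grid (pass 1) and quantize (pass 2) in one run
./llama-quantize --imatrix imatrix.dat model-f32.gguf model-iq2_tq.gguf IQ2_TQ
# size estimate only; no pass-1 training is performed
./llama-quantize --dry-run model-f32.gguf Q3_PT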