This commit is contained in:
Piotr Wilkin (ilintar) 2026-04-13 15:03:55 +02:00 committed by GitHub
commit e840352f97
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
59 changed files with 10652 additions and 1724 deletions

View File

@ -111,13 +111,14 @@ extern "C" {
// Internal types and functions exposed for tests and benchmarks
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
const void * GGML_RESTRICT y, size_t by, int nrc);
const void * GGML_RESTRICT y, size_t by, int nrc, const void * levels);
struct ggml_type_traits_cpu {
ggml_from_float_t from_float;
ggml_vec_dot_t vec_dot;
enum ggml_type vec_dot_type;
int64_t nrows; // number of rows to process simultaneously
int64_t nrows; // number of rows to process simultaneously
size_t levels_row_stride; // bytes to add per row to get next row's quant_levels (0 = per-tensor)
};
GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);

View File

@ -429,7 +429,15 @@ extern "C" {
GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block)
GGML_TYPE_NVFP4 = 40, // NVFP4 (4 blocks, E4M3 scale)
GGML_TYPE_Q1_0 = 41,
GGML_TYPE_COUNT = 42,
GGML_TYPE_Q3_PT = 42, // 3.875 bpw per-tensor Lloyd-Max, 16-elem affine sub-blocks
GGML_TYPE_Q3_KPT = 43, // Q3_K with learned per-tensor levels (3.4375 bpw)
GGML_TYPE_Q4_DPT = 44, // IQ4_NL with learned per-tensor int8 levels (4.125 bpw)
GGML_TYPE_Q2_DPT = 45, // 2-bit with learned per-tensor int8 levels (2.5 bpw)
GGML_TYPE_Q2_KPT = 46, // Q2_K with learned per-tensor float levels (2.625 bpw)
GGML_TYPE_IQ2_TQ = 47, // Trellis quantized with RNG codebook (2.0625 bpw)
GGML_TYPE_IQ3_TQ = 48, // 3-bit with per-tensor trained grid table (3.5625 bpw)
GGML_TYPE_IQ1_BN = 49, // 8D vector quantized with per-tensor trained codebook (1.5625 bpw)
GGML_TYPE_COUNT = 50,
};
// precision
@ -457,6 +465,7 @@ extern "C" {
GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
GGML_FTYPE_MOSTLY_Q3_PT = 26, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ1_S = 18, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ4_NL = 19, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ3_S = 20, // except 1d tensors
@ -465,8 +474,11 @@ extern "C" {
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
GGML_FTYPE_MOSTLY_MXFP4 = 25, // except 1d tensors
GGML_FTYPE_MOSTLY_NVFP4 = 26, // except 1d tensors
GGML_FTYPE_MOSTLY_Q1_0 = 27, // except 1d tensors
GGML_FTYPE_MOSTLY_Q3_KPT = 27, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_DPT = 28, // except 1d tensors
GGML_FTYPE_MOSTLY_Q2_KPT = 29, // except 1d tensors
GGML_FTYPE_MOSTLY_NVFP4 = 30, // except 1d tensors
GGML_FTYPE_MOSTLY_Q1_0 = 31, // except 1d tensors
};
// available tensor operations:
@ -686,9 +698,8 @@ extern "C" {
char name[GGML_MAX_NAME];
void * extra; // extra things e.g. for ggml-cuda.cu
char padding[8];
void * extra; // extra things e.g. for ggml-cuda.cu
void * quant_levels; // per-tensor quantization levels (replaces char padding[8]; same size on 64-bit)
};
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@ -2723,7 +2734,7 @@ extern "C" {
# define GGML_RESTRICT restrict
# endif
#endif
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
struct ggml_type_traits {
@ -2734,6 +2745,7 @@ extern "C" {
bool is_quantized;
ggml_to_float_t to_float;
ggml_from_float_t from_float_ref;
size_t levels_row_stride; // bytes to advance quant_levels per row (0 = per-tensor)
};
GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);

View File

@ -208,6 +208,13 @@ add_library(ggml-base
ggml-quants.h
gguf.cpp)
# Enable native SIMD for ggml-quants.c (needed for K-means training in quantization)
# NOTE(review): -march=native tunes the object to the *build host's* CPU, so the
# resulting binary is not portable and the flag is wrong under cross-compilation
# (check_c_compiler_flag only probes the host compiler). Consider gating this on
# the project's native-build option — confirm the intended distribution model.
include(CheckCCompilerFlag)
check_c_compiler_flag("-march=native" GGML_COMPILER_SUPPORTS_MARCH_NATIVE)
if (GGML_COMPILER_SUPPORTS_MARCH_NATIVE)
    set_source_files_properties(ggml-quants.c PROPERTIES COMPILE_FLAGS "-march=native")
endif()
set_target_properties(ggml-base PROPERTIES
VERSION ${GGML_VERSION}
SOVERSION ${GGML_VERSION_MAJOR}

View File

@ -396,7 +396,7 @@ static ggml_backend_buffer_type_t ggml_backend_meta_device_get_host_buffer_type(
//
struct ggml_backend_meta_buffer_context {
static constexpr size_t nbtc = GGML_TENSOR_SIZE - sizeof(ggml_tensor::padding);
static constexpr size_t nbtc = GGML_TENSOR_SIZE - sizeof(ggml_tensor::quant_levels);
std::map<std::pair<const ggml_tensor *, bool>, std::pair<ggml_backend_meta_split_state, char[nbtc]>> split_state_cache;
std::map< const ggml_tensor *, std::vector<ggml_tensor *>> simple_tensors;

View File

@ -1,5 +1,15 @@
#include "ggml-impl.h"
#include "ggml-blas.h"
// Helper: compute quant_levels stride for a given row.
// For Q2_KPT (per-block levels), stride depends on tensor width.
// Byte stride between consecutive rows of a tensor's quant_levels table.
// Most quant types share one level table for the whole tensor, so the
// type-traits constant stride (typically 0) is returned unchanged. Q2_KPT is
// the exception: it stores 4 float levels per 256-element block, so its stride
// grows with the row width ne0.
static inline size_t ggml_quant_levels_stride(ggml_type type, size_t constant_stride, int64_t ne0) {
    if (type != GGML_TYPE_Q2_KPT) {
        return constant_stride;
    }
    const size_t blocks_per_row = (size_t) (ne0 / 256);
    return blocks_per_row * 4 * sizeof(float);
}
#include "ggml-backend-impl.h"
#include <future>
@ -77,10 +87,11 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
const int min_rows_per_thread = std::max((int)(min_cols_per_thread/ne00), 1);
const int n_threads = std::max(std::min(ctx->n_threads, (int)(ne01/min_rows_per_thread)), 1);
const size_t lrs = ggml_quant_levels_stride(src0->type, ggml_get_type_traits(src0->type)->levels_row_stride, src0->ne[0]);
#ifdef GGML_USE_OPENMP
#pragma omp parallel for num_threads(n_threads)
for (int64_t i01 = 0; i01 < ne01; i01++) {
to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00, (const char*)src0->quant_levels + i01*lrs);
}
#else
for (int i = 1; i < n_threads; i++) {
@ -89,7 +100,7 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
if (start < end) {
ctx->tasks.push_back(std::async(std::launch::async, [=]() {
for (int64_t i01 = start; i01 < end; i01++) {
to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00, (const char*)src0->quant_levels + i01*lrs);
}
}));
}
@ -99,7 +110,7 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
const int64_t start = 0;
const int64_t end = ne01/n_threads;
for (int64_t i01 = start; i01 < end; i01++) {
to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00, (const char*)src0->quant_levels + i01*lrs);
}
}
#endif

View File

@ -298,6 +298,7 @@ typedef struct {
} block_q2_K;
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
// 3-bit quantization
// weight is represented as x = a * q
// 16 blocks of 16 elements each
@ -327,6 +328,12 @@ typedef struct {
} block_q4_K;
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
// Q3_KPT: Q3_K with learned per-tensor levels (3.4375 bpw, same size as Q3_K)
// Reuses the block_q3_K structure unchanged; only the decode differs — the
// 3-bit indices are mapped through a learned 8-entry level table instead of
// the fixed Q3_K values.
typedef block_q3_K block_q3_kpt;
#define Q3KPT_N_LEVELS 8 // entries in the per-tensor trained level table (3 bits -> 8)
// 5-bit quantization
// 8 blocks of 32 elements each
// weight is represented as x = a * q + b
@ -449,6 +456,115 @@ typedef struct {
} block_iq4_xs;
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
// 3.875 bpw - per-tensor Lloyd-Max scalar quantization
// 256 elements = 16 sub-blocks of 16, 8-entry level table trained per tensor
// Layout: 2 (d) + 2 (dmin) + 24 (scales: 32x6-bit) + 96 (qs: 256x3-bit) = 124 bytes
// (124 bytes * 8 bits / 256 weights = 3.875 bpw)
typedef struct {
    ggml_half d;               // 2 bytes: global scale for 16-elem sub-block ranges
    ggml_half dmin;            // 2 bytes: global scale for sub-block neg_mins
    uint8_t scales[3*QK_K/32]; // 24 bytes: 32 x 6-bit (indices 0..15 = ranges, 16..31 = neg_mins)
    uint8_t qs[3*QK_K/8];      // 96 bytes: 256 x 3-bit Lloyd-Max level index, sequential
} block_q3_pt;
static_assert(sizeof(block_q3_pt) == 124, "wrong q3_pt block size");
#define Q3PT_N_LEVELS 8 // entries in the per-tensor trained level table (3 bits -> 8)
// Q4_DPT: IQ4_NL with learned per-tensor int8 levels (4.125 bpw)
// Block format: identical to block_iq4_nl (2 + 16 = 18 bytes per 32 elements);
// the 4-bit indices select from a 16-entry learned int8 table rather than the
// fixed IQ4_NL value set.
typedef block_iq4_nl block_q4_dpt;
#define Q4DPT_N_LEVELS 16 // entries in the per-tensor trained level table (4 bits -> 16)
// Q2_DPT: 2-bit per-tensor Lloyd-Max scalar quantization (2.5 bpw)
// Block format: 2 bytes (FP16 scale) + 8 bytes (2-bit indices for 32 elements) = 10 bytes per block
// (10 bytes * 8 bits / 32 weights = 2.5 bpw)
// 4 learned int8 levels per tensor, optimized via Lloyd-Max k-means
typedef struct {
    ggml_half d;   // 2 bytes: FP16 scale (delta)
    uint8_t qs[8]; // 8 bytes: 2-bit indices (4 values per byte, 32 elements total)
} block_q2_dpt;
static_assert(sizeof(block_q2_dpt) == sizeof(ggml_half) + 8, "wrong q2_dpt block size/padding");
#define QK2_DPT 32        // elements per block
#define Q2DPT_N_LEVELS 4  // entries in the per-tensor trained level table (2 bits -> 4)
// Q2_KPT: Q2_K with learned per-tensor float levels (2.625 bpw, same size as Q2_K)
// Reuses the block_q2_K structure but maps the 2-bit indices through a learned
// level table. Note: unlike the other *PT types, Q2_KPT's levels are stored per
// 256-element block (4 floats each), so its quant_levels row stride depends on
// the row width rather than being a per-tensor constant.
typedef block_q2_K block_q2_kpt;
#define Q2KPT_N_LEVELS 4 // float levels per 256-element block (2 bits -> 4)
// IQ2_TQ: 2-bit scalar quantization with per-tensor trained asymmetric grid table
// 32 groups of 8 elements per 256-element super-block
// Block layout (82 bytes per 256 elements = 2.5625 bpw):
// - ggml_half d (2 bytes): super-block scale
// - uint8_t scales[16] (16 bytes): 32 x 4-bit grid entry index per group
// - uint8_t qs[64] (64 bytes): 256 x 2-bit element index within grid entry
// recon[j] = d * IQ2TQ_GRID_SCALE * grid[group_idx][elem_idx]
//
// NOTE(review): the type's enum comment elsewhere still calls IQ2_TQ "Trellis
// quantized with RNG codebook (2.0625 bpw)" — that describes an older 66-byte
// layout, not the 82-byte grid-table struct below; confirm and update the
// enum comment.
typedef struct {
    ggml_half d;             // Super-block scale (2 bytes)
    uint8_t scales[QK_K/16]; // 32 x 4-bit grid entry index per group (16 bytes)
    uint8_t qs[QK_K/4];      // 256 x 2-bit element index (64 bytes)
} block_iq2_tq;
static_assert(sizeof(block_iq2_tq) == 82, "wrong iq2_tq block size");
// 2 + 16 + 64 = 82 bytes per 256 weights = 2.5625 bpw
#define IQ2TQ_GROUP_SIZE 8                       // Elements per group
#define IQ2TQ_N_GROUPS (QK_K / IQ2TQ_GROUP_SIZE) // 32 groups per super-block
#define IQ2TQ_GRID_SCALE 0.125f                  // Grid value multiplier: recon = d * GRID_SCALE * grid_int8
// IQ3_TQ: 3-bit scalar quantization with per-tensor trained asymmetric grid table (3.5625 bpw)
// 32 groups of 8 elements per 256-element super-block
// Each grid entry has 8 int8 levels (3 bits -> 8 values per element)
// Grid table: 16 entries x 8 int8 = 128 bytes per tensor
// Block layout:
// - ggml_half d (2 bytes): super-block scale
// - uint8_t scales[16] (16 bytes): 32 x 4-bit grid entry index per group
// - uint8_t qs[96] (96 bytes): 256 x 3-bit element index within grid entry
// recon[j] = d * IQ3TQ_GRID_SCALE * grid[group_idx][elem_idx]
typedef struct {
    ggml_half d;             // Super-block scale (2 bytes)
    uint8_t scales[QK_K/16]; // 32 x 4-bit grid entry index per group (16 bytes)
    uint8_t qs[3*QK_K/8];    // 256 x 3-bit element index (96 bytes)
} block_iq3_tq;
static_assert(sizeof(block_iq3_tq) == 114, "wrong iq3_tq block size");
// 2 + 16 + 96 = 114 bytes per 256 weights = 3.5625 bpw
#define IQ3TQ_GROUP_SIZE 8                       // Elements per group
#define IQ3TQ_N_GROUPS (QK_K / IQ3TQ_GROUP_SIZE) // 32 groups per super-block
#define IQ3TQ_N_LEVELS 8                         // 3-bit -> 8 levels per grid entry
#define IQ3TQ_GRID_SCALE 0.125f                  // Grid value multiplier
#define IQ3TQ_GRID_SIZE 128                      // 16 entries x 8 int8 = 128 bytes per tensor
// IQ1_BN: 8D vector quantized with per-tensor trained 4096-entry codebook (1.5625 bpw)
// 32 groups of 8 elements per 256-element super-block
// Each group selects one of 4096 trained 8D vectors via a 12-bit codebook index
// Codebook: 4096 entries x 8 int8 = 32768 bytes per tensor
// Block layout:
// - ggml_half d (2 bytes): super-block scale
// - uint8_t qs[48] (48 bytes): 32 x 12-bit codebook indices packed in pairs
// 12-bit pair packing (groups 2k, 2k+1 -> 3 bytes at qs[3k]), low nibble first:
//     idx_even = qs[3k]        | ((qs[3k+1] & 0x0F) << 8)
//     idx_odd  = (qs[3k+1] >> 4) | (qs[3k+2] << 4)
// recon[g*8+k] = d * IQ1BN_GRID_SCALE * codebook[ci][k]
typedef struct {
    ggml_half d;           // Super-block scale (2 bytes)
    uint8_t qs[3*QK_K/16]; // 32 x 12-bit codebook indices packed in pairs (48 bytes)
} block_iq1_bn;
static_assert(sizeof(block_iq1_bn) == 50, "wrong iq1_bn block size");
// 2 + 48 = 50 bytes per 256 weights = 1.5625 bpw
#define IQ1BN_GROUP_SIZE 8
#define IQ1BN_N_GROUPS (QK_K / IQ1BN_GROUP_SIZE)                 // 32
#define IQ1BN_CODEBOOK_K 4096                                    // number of codebook entries
#define IQ1BN_CODEBOOK_DIM 8                                     // vector dimension (= group size)
#define IQ1BN_GRID_SCALE 0.125f                                  // Grid value multiplier
#define IQ1BN_CODEBOOK_SIZE (IQ1BN_CODEBOOK_K * IQ1BN_CODEBOOK_DIM) // 32768 bytes
#define IQ1BN_AUX_SIZE IQ1BN_CODEBOOK_SIZE                       // 32768 bytes
#endif // GGML_COMMON_DECL
#endif // GGML_COMMON_DECL

View File

@ -33,6 +33,8 @@
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
#define ggml_vec_dot_q3_pt_q8_K_generic ggml_vec_dot_q3_pt_q8_K
#define ggml_vec_dot_q4_dpt_q8_0_generic ggml_vec_dot_q4_dpt_q8_0
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
@ -203,6 +205,15 @@
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
#elif defined(__riscv)
// quants.c
#define quantize_row_q8_K_generic quantize_row_q8_K
#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
#define ggml_vec_dot_q3_pt_q8_K_generic ggml_vec_dot_q3_pt_q8_K
#define ggml_vec_dot_q4_dpt_q8_0_generic ggml_vec_dot_q4_dpt_q8_0
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0
// repack.cpp
@ -307,6 +318,8 @@
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
#define ggml_vec_dot_q3_pt_q8_K_generic ggml_vec_dot_q3_pt_q8_K
#define ggml_vec_dot_q4_dpt_q8_0_generic ggml_vec_dot_q4_dpt_q8_0
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0

View File

@ -137,7 +137,111 @@ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
//===================================== Dot products =================================
void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
// Dot product of one Q1_0 row with a Q8_0 row: *s = sum_i x[i]*y[i].
// Q1_0 packs 128 signs per block (1 bit each; set bit decodes to +1, clear bit
// to -1) with a single fp16 scale, so each Q1_0 block lines up with exactly
// four Q8_0 blocks (4 x 32 = 128 elements). `levels` is unused for this type.
void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    const int qk = QK1_0; // 128
    const int nb = n / qk;

    assert(n % qk == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    GGML_UNUSED(levels); // Q1_0 has no per-tensor level table

    const block_q1_0 * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

    float sumf = 0.0f;

#if defined(__ARM_NEON)
    float32x4_t sumv = vdupq_n_f32(0.0f);
    for (int i = 0; i < nb; i++) {
        const float d0 = GGML_CPU_FP16_TO_FP32(x[i].d);
        // Process 4 Q8_0 blocks (each has 32 elements)
        for (int k = 0; k < 4; k++) {
            const block_q8_0 * GGML_RESTRICT yb = &y[i * 4 + k];
            const float d1 = GGML_CPU_FP16_TO_FP32(yb->d);
            // Get the 4 bytes of bits for this Q8_0 block (32 bits = 4 bytes)
            // Bits are at offset k*4 bytes in x[i].qs
            const uint8_t * bits = &x[i].qs[k * 4];
            // Load 32 int8 values from y
            const int8x16_t y0 = vld1q_s8(yb->qs);
            const int8x16_t y1 = vld1q_s8(yb->qs + 16);
            // Expand each byte of packed bits to 8 per-lane flags via lookup.
            // Byte 0-1: bits for y0[0..15]
            const uint64_t expand0 = table_b2b_0[bits[0]];
            const uint64_t expand1 = table_b2b_0[bits[1]];
            // Byte 2-3: bits for y1[0..15]
            const uint64_t expand2 = table_b2b_0[bits[2]];
            const uint64_t expand3 = table_b2b_0[bits[3]];
            // Build the sign vectors by reinterpreting the table values
            uint8x8_t e0 = vcreate_u8(expand0);
            uint8x8_t e1 = vcreate_u8(expand1);
            uint8x8_t e2 = vcreate_u8(expand2);
            uint8x8_t e3 = vcreate_u8(expand3);
            // Shift right by 4 to get 0 or 1
            // (each expanded lane holds its bit at position 4)
            int8x8_t s0 = vreinterpret_s8_u8(vshr_n_u8(e0, 4));
            int8x8_t s1 = vreinterpret_s8_u8(vshr_n_u8(e1, 4));
            int8x8_t s2 = vreinterpret_s8_u8(vshr_n_u8(e2, 4));
            int8x8_t s3 = vreinterpret_s8_u8(vshr_n_u8(e3, 4));
            // Convert 0/1 to -1/+1: sign = 2*val - 1
            int8x8_t one = vdup_n_s8(1);
            s0 = vsub_s8(vadd_s8(s0, s0), one); // 2*s0 - 1
            s1 = vsub_s8(vadd_s8(s1, s1), one);
            s2 = vsub_s8(vadd_s8(s2, s2), one);
            s3 = vsub_s8(vadd_s8(s3, s3), one);
            // Combine into 16-element vectors
            int8x16_t signs0 = vcombine_s8(s0, s1);
            int8x16_t signs1 = vcombine_s8(s2, s3);
            // Multiply signs with y values and accumulate
            // dot(signs, y) where signs are +1/-1
            int32x4_t p0 = ggml_vdotq_s32(vdupq_n_s32(0), signs0, y0);
            int32x4_t p1 = ggml_vdotq_s32(p0, signs1, y1);
            // Scale by d1 and accumulate
            sumv = vmlaq_n_f32(sumv, vcvtq_f32_s32(p1), d0 * d1);
        }
    }
    sumf = vaddvq_f32(sumv);
#else
    // Scalar fallback
    // NOTE(review): this path uses GGML_FP16_TO_FP32 while the NEON path uses
    // GGML_CPU_FP16_TO_FP32 — confirm both macros are in scope and equivalent.
    for (int i = 0; i < nb; i++) {
        const float d0 = GGML_FP16_TO_FP32(x[i].d);
        // Process 4 Q8_0 blocks
        for (int k = 0; k < 4; k++) {
            const float d1 = GGML_FP16_TO_FP32(y[i*4 + k].d);
            int sumi = 0;
            for (int j = 0; j < QK8_0; j++) {
                // Bit `bit_index` of x[i].qs selects the sign of element j.
                const int bit_index = k * QK8_0 + j;
                const int byte_index = bit_index / 8;
                const int bit_offset = bit_index % 8;
                const int xi = ((x[i].qs[byte_index] >> bit_offset) & 1) ? 1 : -1;
                sumi += xi * y[i*4 + k].qs[j];
            }
            sumf += d0 * d1 * sumi;
        }
    }
#endif
    *s = sumf;
}
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK1_0; // 128
const int nb = n / qk;
@ -240,7 +344,7 @@ void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_0;
const int nb = n / qk;
@ -533,7 +637,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
*s = sumf;
}
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_1;
const int nb = n / qk;
@ -753,12 +857,13 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
*s = sumf;
}
void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
GGML_UNUSED(levels);
assert(n % QK_NVFP4 == 0);
const block_nvfp4 * GGML_RESTRICT x = vx;
@ -837,7 +942,92 @@ void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
*s = sumf;
}
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
// Dot product of one NVFP4 row with a Q8_0 row.
// An NVFP4 super-block holds 64 elements (QK_NVFP4) split into four
// sub-blocks of QK_NVFP4_SUB elements, each with its own UE4M3 scale
// (x[ib].d[0..3]); the packed 4-bit element codes are decoded through the
// kvalues_mxfp4 table. One super-block spans two Q8_0 blocks.
// `levels` is unused for this type.
void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    GGML_UNUSED(levels); // NVFP4 has no per-tensor level table

    assert(n % QK_NVFP4 == 0);

    const block_nvfp4 * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

    // Each NVFP4 super-block (64 elements) spans 2 q8_0 blocks
    const int nb = n / QK_NVFP4;

    float sumf = 0;

#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
    const int8x16_t values = vld1q_s8(kvalues_mxfp4);
    const uint8x16_t m4b = vdupq_n_u8(0x0f);
    float32x4_t acc = vdupq_n_f32(0.0f);
    for (int ib = 0; ib < nb; ++ib) {
        // Decode the 4-bit codes: low/high nibbles interleave element halves,
        // so q8 halves are recombined below to match the nibble lane order
        // (same layout as the scalar decode in the #else branch).
        const uint8x16_t q4bits_0 = vld1q_u8(x[ib].qs);
        const uint8x16_t q4bits_1 = vld1q_u8(x[ib].qs + 16);
        const int8x16_t q4_lo_0 = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits_0, m4b));
        const int8x16_t q4_hi_0 = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits_0, 4));
        const int8x16_t q4_lo_1 = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits_1, m4b));
        const int8x16_t q4_hi_1 = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits_1, 4));
        const int8x16_t q8_0a = vld1q_s8(y[2*ib].qs);
        const int8x16_t q8_0b = vld1q_s8(y[2*ib].qs + 16);
        const int8x16_t q8_lo_0 = vcombine_s8(vget_low_s8(q8_0a), vget_low_s8(q8_0b));
        const int8x16_t q8_hi_0 = vcombine_s8(vget_high_s8(q8_0a), vget_high_s8(q8_0b));
        const int8x16_t q8_1a = vld1q_s8(y[2*ib+1].qs);
        const int8x16_t q8_1b = vld1q_s8(y[2*ib+1].qs + 16);
        const int8x16_t q8_lo_1 = vcombine_s8(vget_low_s8(q8_1a), vget_low_s8(q8_1b));
        const int8x16_t q8_hi_1 = vcombine_s8(vget_high_s8(q8_1a), vget_high_s8(q8_1b));
        // Integer dot products; p0/p1 lanes correspond to the 4 sub-blocks
        // after the pairwise add below.
        const int32x4_t p0 = vaddq_s32(
            ggml_vdotq_s32(vdupq_n_s32(0), q4_lo_0, q8_lo_0),
            ggml_vdotq_s32(vdupq_n_s32(0), q4_hi_0, q8_hi_0));
        const int32x4_t p1 = vaddq_s32(
            ggml_vdotq_s32(vdupq_n_s32(0), q4_lo_1, q8_lo_1),
            ggml_vdotq_s32(vdupq_n_s32(0), q4_hi_1, q8_hi_1));
        const int32x4_t sums = vpaddq_s32(p0, p1);
        // Decode 4 UE4M3 scales to f32 and multiply with q8 scales
        const float dy0 = GGML_CPU_FP16_TO_FP32(y[2*ib].d);
        const float dy1 = GGML_CPU_FP16_TO_FP32(y[2*ib+1].d);
        const float32x4_t nvsc = {
            ggml_ue4m3_to_fp32(x[ib].d[0]),
            ggml_ue4m3_to_fp32(x[ib].d[1]),
            ggml_ue4m3_to_fp32(x[ib].d[2]),
            ggml_ue4m3_to_fp32(x[ib].d[3])
        };
        const float32x4_t scales = vmulq_f32(nvsc, (float32x4_t){dy0, dy0, dy1, dy1});
        acc = vfmaq_f32(acc, vcvtq_f32_s32(sums), scales);
    }
    sumf = vaddvq_f32(acc);
#else
    // Scalar fallback: iterate the 4 scaled sub-blocks of each super-block.
    for (int ib = 0; ib < nb; ++ib) {
        for (int si = 0; si < 4; ++si) {
            const float d = ggml_ue4m3_to_fp32(x[ib].d[si]);
            const int q8b = si / 2;                  // which of the two q8_0 blocks
            const int q8o = (si % 2) * QK_NVFP4_SUB; // element offset inside it
            const float dy = GGML_CPU_FP16_TO_FP32(y[2*ib + q8b].d);
            int sumi_lo = 0, sumi_hi = 0;
            for (int j = 0; j < QK_NVFP4_SUB/2; ++j) {
                // Low nibble -> first half of the sub-block, high nibble -> second half.
                const uint8_t qv = x[ib].qs[si*(QK_NVFP4_SUB/2) + j];
                sumi_lo += y[2*ib + q8b].qs[q8o + j + 0] * kvalues_mxfp4[qv & 0xf];
                sumi_hi += y[2*ib + q8b].qs[q8o + j + QK_NVFP4_SUB/2] * kvalues_mxfp4[qv >> 4];
            }
            sumf += dy * d * (sumi_lo + sumi_hi);
        }
    }
#endif
    *s = sumf;
}
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_0;
const int nb = n / qk;
@ -949,7 +1139,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
*s = sumf;
}
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_1;
const int nb = n / qk;
@ -1067,7 +1257,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
*s = sumf;
}
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_0;
const int nb = n / qk;
@ -3953,6 +4143,10 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
#endif
}
// Q3_PT x Q8_K dot product: no architecture-specific kernel here, so forward
// to the generic implementation. The signature must match ggml_vec_dot_t,
// which now carries a trailing `levels` argument (per-tensor trained level
// table); it is forwarded so the generic kernel can decode the 3-bit indices.
void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
}
void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(n % QK_K == 0);
assert(nrc == 1);

View File

@ -644,7 +644,7 @@ static inline __m128i get_scale_shuffle(int i) {
}
#endif
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_0;
const int nb = n / qk;
@ -772,7 +772,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
*s = sumf;
}
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_1;
const int nb = n / qk;
@ -827,11 +827,11 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(y);
UNUSED(ib);
UNUSED(sumf);
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_0;
const int nb = n / qk;
@ -880,11 +880,11 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(sumf);
UNUSED(x);
UNUSED(y);
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_1;
const int nb = n / qk;
@ -936,11 +936,11 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(sumf);
UNUSED(x);
UNUSED(y);
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_0;
const int nb = n / qk;
@ -983,7 +983,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(sumf);
UNUSED(x);
UNUSED(y);
ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
@ -1956,6 +1956,10 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
#endif
}
// Q3_PT x Q8_K dot product: no SIMD kernel for this architecture, so defer to
// the generic implementation. Updated to the new ggml_vec_dot_t signature —
// the trailing `levels` pointer (per-tensor trained level table) must be
// accepted and passed through, as the other forwarders in this file do.
void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
}
#if defined(__loongarch_asx)
static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
const __m256i a = __lasx_xvmulwev_h_b(x, y);

View File

@ -141,7 +141,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
//===================================== Dot products =================================
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_0;
const int nb = n / qk;
@ -207,11 +207,11 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(y);
UNUSED(ib);
UNUSED(sumf);
ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_1;
const int nb = n / qk;
@ -274,7 +274,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(y);
UNUSED(ib);
UNUSED(sumf);
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
@ -340,11 +340,11 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
UNUSED(y);
UNUSED(ib);
UNUSED(sumf);
ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_0;
const int nb = n / qk;
@ -412,11 +412,11 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(sumf);
UNUSED(x);
UNUSED(y);
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_1;
const int nb = n / qk;
@ -488,11 +488,11 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(sumf);
UNUSED(x);
UNUSED(y);
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_0;
const int nb = n / qk;
@ -557,7 +557,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(y);
UNUSED(ib);
UNUSED(sumf);
ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
@ -2000,6 +2000,10 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
#endif
}
void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
}
void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(n % QK_K == 0);
assert(nrc == 1);
@ -2190,7 +2194,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
UNUSED(nb);
UNUSED(ib);
UNUSED(sumf);
ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}

View File

@ -213,7 +213,7 @@ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
//===================================== Dot products =================================
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
#if defined(__riscv_v)
const int qk = QK8_0;
const int nb = n / qk;
@ -264,11 +264,11 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
*s = sumf;
#else
ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
#if defined(__riscv_v)
const int qk = QK8_1;
const int nb = n / qk;
@ -315,11 +315,11 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
*s = sumf;
#else
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
#if defined(__riscv_v)
const int qk = QK8_0;
const int nb = n / qk;
@ -369,11 +369,11 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
*s = sumf;
#else
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
#if defined(__riscv_v)
const int qk = QK8_1;
const int nb = n / qk;
@ -422,11 +422,11 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
*s = sumf;
#else
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_0;
const int nb = n / qk;
@ -470,7 +470,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(ib);
UNUSED(sumf);
ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
@ -2954,6 +2954,14 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
#endif
}
void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
}
void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
}
#if defined __riscv_v_intrinsic
static void ggml_vec_dot_iq3_xxs_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(n % QK_K == 0);

View File

@ -146,7 +146,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
//===================================== Dot products =================================
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_0;
const int nb = n / qk;
@ -201,11 +201,11 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(y);
UNUSED(ib);
UNUSED(sumf);
ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_1;
const int nb = n / qk;
@ -258,7 +258,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(y);
UNUSED(ib);
UNUSED(sumf);
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
@ -353,11 +353,11 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
UNUSED(y);
UNUSED(ib);
UNUSED(sumf);
ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_0;
const int nb = n / qk;
@ -495,11 +495,11 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(y);
UNUSED(ib);
UNUSED(sumf);
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_1;
const int nb = n / qk;
@ -648,11 +648,11 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(y);
UNUSED(ib);
UNUSED(sumf);
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_0;
const int nb = n / qk;
@ -698,7 +698,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(y);
UNUSED(ib);
UNUSED(sumf);
ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
@ -1388,7 +1388,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
UNUSED(nb);
UNUSED(ib);
UNUSED(sumf);
ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
@ -1463,3 +1463,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
}

View File

@ -229,7 +229,7 @@ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
//===================================== Dot products =================================
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_0;
const int nb = n / qk;
@ -355,7 +355,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
*s = sumf;
}
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_0;
const int nb = n / qk;
@ -442,11 +442,11 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(sumf);
UNUSED(x);
UNUSED(y);
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_1;
const int nb = n / qk;
@ -537,11 +537,11 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(sumf);
UNUSED(x);
UNUSED(y);
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
const int qk = QK8_0;
const int nb = n / qk;
@ -605,7 +605,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(y);
UNUSED(ib);
UNUSED(sumf);
ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
@ -1218,3 +1218,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
}

View File

@ -540,7 +540,8 @@ static inline __m128i get_scale_shuffle(int i) {
}
#endif
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
const int qk = QK8_0;
const int nb = n / qk;
@ -698,7 +699,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
*s = sumf;
}
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
const int qk = QK8_1;
const int nb = n / qk;
@ -753,11 +755,12 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(x);
UNUSED(y);
UNUSED(ib);
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
@ -843,7 +846,8 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
*s = sumf;
}
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
const int qk = QK8_0;
const int nb = n / qk;
@ -919,11 +923,12 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(ib);
UNUSED(x);
UNUSED(y);
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
const int qk = QK8_1;
const int nb = n / qk;
@ -1005,11 +1010,12 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(ib);
UNUSED(x);
UNUSED(y);
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
const int qk = QK8_0;
const int nb = n / qk;
@ -1077,7 +1083,8 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
*s = sumf;
}
void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
@ -1205,11 +1212,12 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
UNUSED(x);
UNUSED(y);
UNUSED(nb);
ggml_vec_dot_tq1_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_tq1_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
@ -1271,11 +1279,12 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
UNUSED(x);
UNUSED(y);
UNUSED(nb);
ggml_vec_dot_tq2_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_tq2_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
@ -1463,11 +1472,12 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(x);
UNUSED(y);
UNUSED(nb);
ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -1735,11 +1745,12 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(x);
UNUSED(y);
UNUSED(nb);
ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -1913,11 +1924,12 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(kmask2);
UNUSED(kmask3);
UNUSED(utmp);
ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -2123,11 +2135,12 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(kmask2);
UNUSED(kmask3);
UNUSED(utmp);
ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -2328,7 +2341,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
UNUSED(x);
UNUSED(y);
UNUSED(nb);
ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
@ -2369,7 +2382,8 @@ static const int8_t keven_signs_q2xs[1024] = {
};
#endif
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -2483,11 +2497,12 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
UNUSED(x);
UNUSED(y);
UNUSED(nb);
ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -2780,11 +2795,12 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
UNUSED(x);
UNUSED(y);
UNUSED(nb);
ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -2965,11 +2981,12 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
UNUSED(x);
UNUSED(y);
UNUSED(nb);
ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -3089,11 +3106,12 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
UNUSED(x);
UNUSED(y);
UNUSED(nb);
ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -3299,11 +3317,17 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
UNUSED(x);
UNUSED(y);
UNUSED(nb);
ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
}
void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -3418,11 +3442,12 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
UNUSED(x);
UNUSED(y);
UNUSED(nb);
ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -3625,11 +3650,12 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
UNUSED(y);
UNUSED(nb);
UNUSED(scale);
ggml_vec_dot_iq1_m_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_iq1_m_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}
void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
@ -3713,7 +3739,185 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
*s = sumf;
}
void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_dpt_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
assert(n % QK4_NL == 0);
static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");
const block_q4_dpt * GGML_RESTRICT x = vx;
const block_q8_0 * GGML_RESTRICT y = vy;
const int nb = n / QK4_NL;
const int8_t * values = (const int8_t *)levels;
GGML_ASSERT(values != NULL && "Q4_DPT levels not set for tensor");
int ib = 0;
float sumf = 0;
#if defined __AVX2__
const __m128i values128 = _mm_loadu_si128((const __m128i*)values);
const __m128i m4b = _mm_set1_epi8(0x0f);
const __m256i mone = _mm256_set1_epi16(1);
__m256 accum1 = _mm256_setzero_ps();
__m256 accum2 = _mm256_setzero_ps();
for (; ib + 1 < nb; ib += 2) {
const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[ib + 0].qs);
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[ib + 1].qs);
const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[ib + 0].qs);
const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[ib + 1].qs);
const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
const __m256i p_2 = _mm256_madd_epi16(p16_2, mone);
accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_CPU_FP16_TO_FP32(x[ib + 0].d)),
_mm256_cvtepi32_ps(p_1), accum1);
accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_CPU_FP16_TO_FP32(x[ib + 1].d)),
_mm256_cvtepi32_ps(p_2), accum2);
}
sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
#elif defined __AVX__
const __m128i values128 = _mm_loadu_si128((const __m128i*)values);
const __m128i m4b = _mm_set1_epi8(0x0f);
__m256 accum = _mm256_setzero_ps();
for (; ib + 1 < nb; ib += 2) {
const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs);
const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1);
const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);
const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b));
const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
const __m256 p = mul_sum_i8_quad_float(q4b_1_0, q4b_1_1, q4b_2_0, q4b_2_1, q8b_1_0, q8b_1_1, q8b_2_0, q8b_2_1);
const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
}
sumf = hsum_float_8(accum);
#endif
for (; ib < nb; ++ib) {
const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
int sumi1 = 0, sumi2 = 0;
for (int j = 0; j < QK4_NL/2; ++j) {
sumi1 += y[ib].qs[j+ 0] * values[x[ib].qs[j] & 0xf];
sumi2 += y[ib].qs[j+QK4_NL/2] * values[x[ib].qs[j] >> 4];
}
sumf += d * (sumi1 + sumi2);
}
*s = sumf;
}
void ggml_vec_dot_q2_dpt_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    // Dot product of one Q2_DPT row (2-bit indices into a learned 4-entry
    // int8 level table) with one Q8_0 row.
    //   s      - output: the scalar dot product over n elements
    //   vx     - Q2_DPT blocks: fp16 scale d + QK2_DPT/4 bytes of packed 2-bit indices
    //   vy     - Q8_0 blocks:   fp16 scale d + QK8_0 int8 values
    //   levels - per-tensor table of 4 int8 quantization levels (must be non-NULL)
    // bs/bx/by are unused strides; only nrc == 1 is supported.
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK2_DPT == 0);
    static_assert(QK2_DPT == QK8_0, "QK2_DPT and QK8_0 must be the same");
    const block_q2_dpt * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;
    const int nb = n / QK2_DPT;
    const int8_t * values = (const int8_t *)levels;
    GGML_ASSERT(values != NULL && "Q2_DPT levels not set for tensor");
    float sumf = 0;
    // NOTE(review): the previous AVX2 path was removed because it did not match
    // the scalar reference below:
    //  - it loaded 16 bytes from x[ib].qs, while the scalar loop only ever reads
    //    QK2_DPT/4 = 8 bytes of packed indices per block (so the vector load
    //    read past the qs data, out of bounds on the last block pair), and
    //  - after splitting the q8 vectors into even/odd bytes, the pairwise byte
    //    products only ever picked up even-indexed qs bytes, so half of the
    //    quantized data never contributed to the result.
    // Until a verified SIMD kernel exists, the scalar loop is the single code
    // path and serves as the layout reference: byte j of qs holds elements
    // 4j..4j+3 in bit pairs 0-1, 2-3, 4-5, 6-7.
    for (int ib = 0; ib < nb; ++ib) {
        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
        int sumi = 0;
        for (int j = 0; j < QK2_DPT/4; ++j) {
            const uint8_t q = x[ib].qs[j];
            sumi += y[ib].qs[j*4 + 0] * values[(q >> 0) & 3];
            sumi += y[ib].qs[j*4 + 1] * values[(q >> 2) & 3];
            sumi += y[ib].qs[j*4 + 2] * values[(q >> 4) & 3];
            sumi += y[ib].qs[j*4 + 3] * values[(q >> 6) & 3];
        }
        sumf += d * sumi;
    }
    *s = sumf;
}
void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
@ -3815,6 +4019,6 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
UNUSED(x);
UNUSED(y);
UNUSED(nb);
ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
#endif
}

View File

@ -3,6 +3,7 @@
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml-quants.h"
#include "traits.h"
#include "ggml-cpu-impl.h"
#include "ggml-impl.h"
@ -396,6 +397,52 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
},
[GGML_TYPE_Q3_PT] = {
// from_float not set — requires codebook initialization via q3pt_set_codebook()
.vec_dot = ggml_vec_dot_q3_pt_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
},
[GGML_TYPE_Q3_KPT] = {
// from_float not set — requires level initialization via q3kpt_set_levels()
.vec_dot = ggml_vec_dot_q3_kpt_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
},
[GGML_TYPE_Q4_DPT] = {
// from_float not set — requires level initialization via q4dpt_set_levels()
.vec_dot = ggml_vec_dot_q4_dpt_q8_0,
.vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1,
},
[GGML_TYPE_Q2_DPT] = {
// from_float not set — requires level initialization via q2dpt_set_levels()
.vec_dot = ggml_vec_dot_q2_dpt_q8_0,
.vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1,
},
[GGML_TYPE_Q2_KPT] = {
// from_float not set — requires level initialization via q2kpt_set_levels()
.vec_dot = ggml_vec_dot_q2_kpt_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
.levels_row_stride = 0, // computed dynamically: (ne0/QK_K)*Q2KPT_N_LEVELS*sizeof(float)
},
[GGML_TYPE_IQ2_TQ] = {
.vec_dot = ggml_vec_dot_iq2_tq_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
},
[GGML_TYPE_IQ3_TQ] = {
.vec_dot = ggml_vec_dot_iq3_tq_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
},
[GGML_TYPE_IQ1_BN] = {
.vec_dot = ggml_vec_dot_iq1_bn_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
},
[GGML_TYPE_I32] = {
.from_float = (ggml_from_float_t) ggml_cpu_fp32_to_i32,
},
@ -1165,8 +1212,15 @@ static void ggml_compute_forward_mul_mat_one_chunk(
const bool src1_cont = ggml_is_contiguous(src1);
ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
// For Q2_KPT, levels are per-block: stride = (ne00 / QK_K) * Q2KPT_N_LEVELS * sizeof(float)
// ne00 is the number of elements per row in src0 (input dimension), NOT ne0 (= ne01 = output rows).
// For non-square matrices (e.g. ffn_up: [hidden, intermediate]) ne00 != ne01, so ne00 is correct.
// For other types, use the static stride from type_traits_cpu
const size_t levels_row_stride = (type == GGML_TYPE_Q2_KPT)
? (ne00 / QK_K) * Q2KPT_N_LEVELS * sizeof(float)
: type_traits_cpu[type].levels_row_stride;
// broadcast factors
const int64_t r2 = ne12 / ne02;
@ -1227,7 +1281,11 @@ static void ggml_compute_forward_mul_mat_one_chunk(
//}
for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) {
vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);
// For Q2_KPT, levels are stored per-expert: [expert0_rows, expert1_rows, ...]
// So for 3D tensors we need to index by (i03 * ne01 + ir0)
const size_t levels_row_idx = (type == GGML_TYPE_Q2_KPT && ne03 > 1) ? (i03 * ne01 + ir0) : ir0;
const void * row_levels = (const char*)src0->quant_levels + levels_row_idx * levels_row_stride;
vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot, row_levels);
}
for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) {
@ -1293,7 +1351,8 @@ void ggml_compute_forward_mul_mat(
nb1/ggml_type_size(dst->type),
src0->type,
src1->type,
dst->type))
dst->type,
src0->quant_levels))
goto UseGgmlGemm1;
return;
}
@ -1361,7 +1420,8 @@ UseGgmlGemm1:;
nb1/ggml_type_size(dst->type),
src0->type,
vec_dot_type,
dst->type))
dst->type,
src0->quant_levels))
goto UseGgmlGemm2;
return;
}
@ -1461,8 +1521,14 @@ static void ggml_compute_forward_mul_mat_id_one_chunk(
const enum ggml_type type = src0->type;
ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
// For Q2_KPT, levels are per-block: stride = (ne00 / QK_K) * Q2KPT_N_LEVELS * sizeof(float)
// ne00 is the input dimension (elements per row in src0), NOT ne0 (= ne01 = output rows).
// For other types, use the static stride from type_traits_cpu
const size_t levels_row_stride = (type == GGML_TYPE_Q2_KPT)
? (ne00 / QK_K) * Q2KPT_N_LEVELS * sizeof(float)
: type_traits_cpu[type].levels_row_stride;
const int64_t blck_0 = 16;
const int64_t blck_1 = 16;
@ -1495,7 +1561,8 @@ static void ggml_compute_forward_mul_mat_id_one_chunk(
float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2));
for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) {
vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1);
const void * row_levels = (const char*)src0->quant_levels + (cur_a * ne01 + ir0) * levels_row_stride;
vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1, row_levels);
}
memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir0_end) - iir0)*sizeof(float));

View File

@ -1356,16 +1356,20 @@ class tinyBLAS_Q0_AVX {
const TA *A, int64_t lda,
const TB *B, int64_t ldb,
TC *C, int64_t ldc,
int ith, int nth)
int ith, int nth,
const int8_t * custom_table = nullptr)
: A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
const int8_t kvalues_iq4nl[16] = {
-127, -104, -83, -65,
-49, -35, -22, -10,
1, 13, 25, 38,
53, 69, 89, 113
};
iq4nlt = _mm_loadu_si128((const __m128i *)kvalues_iq4nl);
if (custom_table) {
iq4nlt = _mm_loadu_si128((const __m128i *)custom_table);
} else {
const int8_t kvalues_iq4nl[16] = {
-127, -104, -83, -65,
-49, -35, -22, -10,
1, 13, 25, 38,
53, 69, 89, 113
};
iq4nlt = _mm_loadu_si128((const __m128i *)kvalues_iq4nl);
}
}
void matmul(int64_t m, int64_t n) {
@ -3684,7 +3688,7 @@ class tinyBLAS_PPC {
*/
bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64_t n, int64_t k,
const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
int64_t ldc, int Atype, int Btype, int Ctype) {
int64_t ldc, int Atype, int Btype, int Ctype, const void * quant_levels) {
assert(m >= 0);
assert(n >= 0);
@ -4024,6 +4028,26 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
#endif
}
case GGML_TYPE_Q4_DPT: {
if (Btype != GGML_TYPE_Q8_0)
return false;
#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
// Q4_DPT has identical block layout to IQ4_NL (block_q4_dpt = block_iq4_nl)
// but uses a per-tensor lookup table instead of the fixed IQ4_NL values.
const int8_t * levels = (const int8_t *)quant_levels;
if (!levels) return false;
tinyBLAS_Q0_AVX<block_iq4_nl, block_q8_0, float> tb{
k, (const block_iq4_nl *)A, lda,
(const block_q8_0 *)B, ldb,
(float *)C, ldc,
params->ith, params->nth, levels};
tb.matmul(m, n);
return true;
#else
return false;
#endif
}
default:
return false;
}

View File

@ -18,7 +18,7 @@ extern "C" {
bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t, int64_t, int64_t,
const void *, int64_t, const void *, int64_t, void *, int64_t,
int, int, int);
int, int, int, const void * quant_levels);
#ifdef __cplusplus
}

View File

@ -8,6 +8,19 @@
#include "unary-ops.h"
#include "vec.h"
// Helper: per-row stride (in bytes) into a tensor's quant_levels buffer.
// Most quant types use the fixed levels_row_stride from the CPU type traits;
// Q2_KPT keeps per-block level tables, so its stride grows with the row width.
static inline size_t ggml_quant_levels_stride(ggml_type type, size_t constant_stride, int64_t ne0) {
    if (type != GGML_TYPE_Q2_KPT) {
        return constant_stride;
    }
    // Q2_KPT: 4 level floats per 256-element block,
    // i.e. stride = (ne0 / 256) * 4 * sizeof(float)
    const size_t n_blocks = (size_t)(ne0 / 256);
    return n_blocks * 4 * sizeof(float);
}
#include <algorithm>
#include <cfloat>
#include <cmath>
@ -517,9 +530,11 @@ static void ggml_compute_forward_dup_from_q(
const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
const size_t q_lrs0 = ggml_quant_levels_stride(src0->type, ggml_get_type_traits_cpu(src0->type)->levels_row_stride, src0->ne[0]);
dequantize_row_q(
(const void *) ((char *) src0->data + x_offset),
(float *) ((char *) dst->data + dst_offset), qk);
(float *) ((char *) dst->data + dst_offset), qk,
(const char*)src0->quant_levels + i01 * q_lrs0);
}
}
@ -639,7 +654,8 @@ static void ggml_compute_forward_add_q_f32(
assert(ne00 % 32 == 0);
// unquantize row from src0 to temp buffer
dequantize_row_q(src0_row, wdata, ne00);
const size_t q_lrs_add = ggml_quant_levels_stride(src0->type, ggml_get_type_traits_cpu(src0->type)->levels_row_stride, src0->ne[0]);
dequantize_row_q(src0_row, wdata, ne00, (const char*)src0->quant_levels + i1 * q_lrs_add);
// add src1
ggml_vec_acc_f32(ne00, wdata, src1_row);
// quantize row to dst
@ -688,6 +704,9 @@ void ggml_compute_forward_add(
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ2_TQ:
case GGML_TYPE_IQ3_TQ:
case GGML_TYPE_IQ1_BN:
{
ggml_compute_forward_add_q_f32(params, dst);
} break;
@ -974,7 +993,8 @@ static void ggml_compute_forward_add1_q_f32(
assert(ne0 % 32 == 0);
// unquantize row from src0 to temp buffer
dequantize_row_q(src0_row, wdata, ne0);
const size_t q_lrs_add = ggml_quant_levels_stride(src0->type, ggml_get_type_traits_cpu(src0->type)->levels_row_stride, src0->ne[0]);
dequantize_row_q(src0_row, wdata, ne00, (const char*)src0->quant_levels + i1 * q_lrs_add);
// add src1
ggml_vec_acc1_f32(ne0, wdata, v);
// quantize row to dst
@ -1139,6 +1159,9 @@ void ggml_compute_forward_add1(
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ2_TQ:
case GGML_TYPE_IQ3_TQ:
case GGML_TYPE_IQ1_BN:
{
ggml_compute_forward_add1_q_f32(params, dst);
} break;
@ -1269,6 +1292,9 @@ void ggml_compute_forward_acc(
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ2_TQ:
case GGML_TYPE_IQ3_TQ:
case GGML_TYPE_IQ1_BN:
default:
{
GGML_ABORT("fatal error");
@ -4321,7 +4347,8 @@ static void ggml_compute_forward_out_prod_q_f32(
float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
dequantize_row_q(s0, wdata, ne0);
const size_t q_lrs_op = ggml_quant_levels_stride(src0->type, ggml_get_type_traits_cpu(src0->type)->levels_row_stride, src0->ne[0]);
dequantize_row_q(s0, wdata, ne0, (const char*)src0->quant_levels + i01 * q_lrs_op);
ggml_vec_mad_f32(ne0, d, wdata, *s1);
}
}
@ -4358,6 +4385,9 @@ void ggml_compute_forward_out_prod(
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ2_TQ:
case GGML_TYPE_IQ3_TQ:
case GGML_TYPE_IQ1_BN:
{
ggml_compute_forward_out_prod_q_f32(params, dst);
} break;
@ -4635,6 +4665,9 @@ void ggml_compute_forward_set(
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ2_TQ:
case GGML_TYPE_IQ3_TQ:
case GGML_TYPE_IQ1_BN:
default:
{
GGML_ABORT("fatal error");
@ -4698,9 +4731,21 @@ static void ggml_compute_forward_get_rows_q(
GGML_ASSERT(i01 >= 0 && i01 < ne01);
const size_t q_lrs_gr = ggml_quant_levels_stride(src0->type, ggml_get_type_traits_cpu(src0->type)->levels_row_stride, src0->ne[0]);
// For Q2_KPT with 3D tensors, levels are indexed by [i12 * ne02 * ne01 + i11 * ne01 + i01]
// For 2D tensors, levels are indexed by [i11 * ne01 + i01] (or just [i01] if ne02 == 1)
size_t levels_row_idx;
if (type == GGML_TYPE_Q2_KPT && ne03 > 1) {
levels_row_idx = (i12 * ne02 + i11) * ne01 + i01;
} else if (type == GGML_TYPE_Q2_KPT) {
levels_row_idx = i11 * ne01 + i01;
} else {
levels_row_idx = i01;
}
dequantize_row_q(
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc,
(const char*)src0->quant_levels + levels_row_idx * q_lrs_gr);
}
}
@ -4859,6 +4904,9 @@ void ggml_compute_forward_get_rows(
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ2_TQ:
case GGML_TYPE_IQ3_TQ:
case GGML_TYPE_IQ1_BN:
{
ggml_compute_forward_get_rows_q(params, dst);
} break;
@ -5436,7 +5484,7 @@ static void ggml_compute_forward_soft_max_ext_back_f32(
// linear runtime, no additional memory
float dot_y_dy = 0;
ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1);
ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1, nullptr);
ggml_vec_cpy_f32 (nc, dx, dy);
ggml_vec_acc1_f32 (nc, dx, -dot_y_dy);
ggml_vec_mul_f32 (nc, dx, dx, y);
@ -5571,6 +5619,8 @@ void ggml_compute_forward_clamp(
case GGML_TYPE_NVFP4:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q3_KPT:
case GGML_TYPE_Q4_DPT:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
@ -5583,6 +5633,12 @@ void ggml_compute_forward_clamp(
case GGML_TYPE_IQ1_M:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_Q3_PT:
case GGML_TYPE_Q2_KPT:
case GGML_TYPE_Q2_DPT:
case GGML_TYPE_IQ2_TQ:
case GGML_TYPE_IQ3_TQ:
case GGML_TYPE_IQ1_BN:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q8_K:
@ -6007,7 +6063,7 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
float v = 0;
ggml_vec_dot_f16(ne02, &v, 0,
(ggml_fp16_t *) wdata_src + i1n, 0,
(ggml_fp16_t *) wdata_kernel + i00*ne02, 0, 1);
(ggml_fp16_t *) wdata_kernel + i00*ne02, 0, 1, nullptr);
dst_data[i10*s0 + i00] += v;
}
}
@ -6095,7 +6151,7 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
float v = 0;
ggml_vec_dot_f32(ne02, &v, 0,
wdata_src + i1n, 0,
wdata_kernel + i00*ne02, 0, 1);
wdata_kernel + i00*ne02, 0, 1, nullptr);
dst_data[i10*s0 + i00] += v;
}
}
@ -7021,11 +7077,11 @@ static void ggml_compute_forward_conv_transpose_2d_impl(
if constexpr (std::is_same_v<kernel_t, ggml_fp16_t>) {
ggml_vec_dot_f16(ne03, &v, 0,
wdata_src + i1n, 0,
wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1);
wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1, nullptr);
} else {
ggml_vec_dot_f32(ne03, &v, 0,
wdata_src + i1n, 0,
wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1);
wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1, nullptr);
}
dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
}
@ -8298,7 +8354,7 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
float s; // KQ value
const char * k_data = (const char *) k->data + ( ic*nbk1 + ik2*nbk2 + ik3*nbk3);
kq_vec_dot(DK, &s, 0, k_data, 0, Q_q, 0, 1);
kq_vec_dot(DK, &s, 0, k_data, 0, Q_q, 0, 1, k->quant_levels);
s = s*scale; // scale KQ value
@ -8345,7 +8401,7 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
// V += v*expf(s - M)
if (v_to_float) {
v_to_float(v_data, V32, DV);
v_to_float(v_data, V32, DV, v->quant_levels);
ggml_vec_mad_f32(DV, VKQ32, V32, vs);
} else {
// V is F32
@ -9058,7 +9114,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
ggml_vec_dot_f32(neq0,
S + i1, 0,
(float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
(float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
(float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1, nullptr);
}
// scale
@ -9172,7 +9228,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
// S = SM * (S - dot(SM, S))
float dot_SM_gradSM = 0;
ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, 0, SM, 0, S, 0, 1);
ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, 0, SM, 0, S, 0, 1, nullptr);
ggml_vec_acc1_f32(M, S, -dot_SM_gradSM);
ggml_vec_mul_f32 (masked_begin, S, S, SM);
@ -10535,7 +10591,7 @@ static void ggml_compute_forward_gated_delta_net_one_chunk(
// delta[j] = sum_i S[i][j] * k[i] = dot(row j of M, k)
for (int64_t j = 0; j < S_v; ++j) {
float sum = 0.0f;
ggml_vec_dot_f32(S_v, &sum, 0, &s_out[j * S_v], 0, k_d, 0, 1);
ggml_vec_dot_f32(S_v, &sum, 0, &s_out[j * S_v], 0, k_d, 0, 1, nullptr);
delta[j] = (v_d[j] - sum) * beta_val;
}
@ -10547,7 +10603,7 @@ static void ggml_compute_forward_gated_delta_net_one_chunk(
// attn_out[j] = sum_i S[i][j] * q[i] = dot(row j of M, q)
for (int64_t j = 0; j < S_v; ++j) {
float sum = 0.0f;
ggml_vec_dot_f32(S_v, &sum, 0, &s_out[j * S_v], 0, q_d, 0, 1);
ggml_vec_dot_f32(S_v, &sum, 0, &s_out[j * S_v], 0, q_d, 0, 1, nullptr);
attn_data[j] = sum * scale;
}

View File

@ -120,7 +120,8 @@ void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * GGML_RESTRI
//===================================== Dot products =================================
void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
const int qk = QK1_0;
const int nb = n / qk;
@ -165,7 +166,8 @@ void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
}
void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
const int qk = QK8_0;
const int nb = n / qk;
@ -202,7 +204,8 @@ void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
}
// TODO: add WASM SIMD
void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
const int qk = QK8_1;
const int nb = n / qk;
@ -238,7 +241,8 @@ void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c
*s = sumf;
}
void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
@ -270,7 +274,8 @@ void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
}
// NVFP4: super-block of 64 elements = 4 sub-blocks of 16 = 2 q8_0 blocks
void ggml_vec_dot_nvfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_nvfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
@ -305,7 +310,8 @@ void ggml_vec_dot_nvfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
*s = sumf;
}
void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
const int qk = QK8_0;
const int nb = n / qk;
@ -348,7 +354,8 @@ void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
*s = sumf;
}
void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
const int qk = QK8_1;
const int nb = n / qk;
@ -391,7 +398,8 @@ void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c
*s = sumf;
}
void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
const int qk = QK8_0;
const int nb = n / qk;
@ -421,7 +429,8 @@ void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
*s = sumf;
}
void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
@ -473,7 +482,8 @@ void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
*s = sumf;
}
void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
@ -505,7 +515,8 @@ void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
*s = sumf;
}
void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
@ -557,7 +568,8 @@ void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
*s = sumf;
}
void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -636,7 +648,8 @@ void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
*s = sumf;
}
void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -710,8 +723,7 @@ void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
for (int l = 0; l < 8; ++l) sumf += sums[l];
*s = sumf;
}
void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -741,6 +753,7 @@ void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
float sumf = 0;
for (int i = 0; i < nb; ++i) {
GGML_UNUSED(levels);
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
const uint8_t * GGML_RESTRICT hm = x[i].qh;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
@ -791,7 +804,8 @@ void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
*s = sumf;
}
void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -846,7 +860,8 @@ void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
*s = sumf;
}
void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -888,7 +903,8 @@ void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs
*s = 0.125f * sumf;
}
void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -938,7 +954,8 @@ void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
*s = 0.125f * sumf;
}
void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -990,7 +1007,8 @@ void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
*s = 0.125f * sumf;
}
void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -1034,7 +1052,8 @@ void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs
*s = 0.25f * sumf;
}
void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -1090,7 +1109,65 @@ void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
*s = sumf;
}
void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
// Q3_PT dot product (generic reference implementation).
// x      : Q3_PT blocks — 3-bit codes indexing an 8-entry per-tensor level
//          table, with per-16-element sub-blocks carrying a 6-bit range scale
//          and a 6-bit negated-min scale.
// y      : Q8_K blocks (int8 quants, per-block scale d, 16-element bsums).
// levels : required per-tensor float table; each 3-bit code q dequantizes as
//          lv[q] scaled by the sub-block range.
// Fix: dropped the contradictory `GGML_UNUSED(levels);` — `levels` is
// mandatory here (cast and NULL-checked below), so marking it unused was a
// misleading no-op.
void ggml_vec_dot_q3_pt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q3_pt * GGML_RESTRICT x = vx;
    const block_q8_K  * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

    const float * lv = (const float *)levels;
    GGML_ASSERT(lv != NULL && "Q3_PT levels not set for tensor");

    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float xd    = GGML_CPU_FP16_TO_FP32(x[i].d);
        const float xdmin = GGML_CPU_FP16_TO_FP32(x[i].dmin);
        const float yd    = y[i].d;

        const uint8_t * sc = x[i].scales;
        const uint8_t * qs = x[i].qs;
        const int8_t  * q8 = y[i].qs;

        float block_sum = 0.f;
        for (int ib = 0; ib < QK_K/16; ++ib) {
            // Inline 6-bit unpack for range scale (index ib) and neg_min scale
            // (index ib + QK_K/16); a 6-bit field starting at bit offset > 2
            // straddles two bytes.
            const int sbit0 = ib * 6, sbyte0 = sbit0 / 8, soff0 = sbit0 % 8;
            const int sbit1 = (ib + QK_K/16) * 6, sbyte1 = sbit1 / 8, soff1 = sbit1 % 8;
            uint8_t qrange = (sc[sbyte0] >> soff0) & 0x3F;
            if (soff0 > 2) { qrange |= (uint8_t)((sc[sbyte0+1] << (8 - soff0)) & 0x3F); }
            uint8_t qnmin = (sc[sbyte1] >> soff1) & 0x3F;
            if (soff1 > 2) { qnmin |= (uint8_t)((sc[sbyte1+1] << (8 - soff1)) & 0x3F); }
            const float range   = xd * (float)qrange;
            const float sub_min = -xdmin * (float)qnmin;

            float sum_lq = 0.f;
            for (int j = 0; j < 16; ++j) {
                // Inline 3-bit unpack; a field starting at offset > 5 straddles
                // two bytes.
                const int qk = ib * 16 + j;
                const int qbit = qk * 3;
                const int qbyte = qbit / 8;
                const int qoff = qbit % 8;
                int q = (qs[qbyte] >> qoff) & 0x7;
                if (qoff > 5) { q |= (int)((qs[qbyte+1] << (8 - qoff)) & 0x7); }
                sum_lq += lv[q] * (float)q8[qk];
            }
            // min contribution uses precomputed 16-element sum from block_q8_K.bsums
            block_sum += sum_lq * range + sub_min * (float)y[i].bsums[ib];
        }
        sumf += block_sum * yd;
    }
    *s = sumf;
}
void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -1133,7 +1210,375 @@ void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
*s = sumf;
}
void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
// Q3_KPT vec_dot - similar to Q3_K but with learned levels.
// x      : Q3_K-layout blocks (2-bit low quants in qs, high bit in hmask,
//          16 packed 6-bit sub-block scales) where the combined 3-bit code
//          indexes the per-tensor float table `levels`.
// levels : required; lv[k] is mapped via lv[k]*7 - 4 into Q3_K's value range
//          (presumably levels are learned in [0,1] — confirm against the
//          quantizer).
// Fix: dropped the contradictory `GGML_UNUSED(levels);` — `levels` is
// mandatory here (cast and NULL-checked below).
void ggml_vec_dot_q3_kpt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q3_kpt * GGML_RESTRICT x = vx;
    const block_q8_K   * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

    const float * lv = (const float *)levels;
    GGML_ASSERT(lv != NULL && "Q3_KPT levels not set for tensor");

    // Masks for unpacking the 12-byte packed 6-bit scales (same as Q3_K).
    const uint32_t kmask1 = 0x03030303;
    const uint32_t kmask2 = 0x0f0f0f0f;

    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float d_all = GGML_CPU_FP16_TO_FP32(x[i].d);
        const float yd    = y[i].d;

        const uint8_t * q  = x[i].qs;
        const uint8_t * hm = x[i].hmask;
        const int8_t  * q8 = y[i].qs;

        uint8_t m = 1; // walking bit selecting the current high-bit plane in hmask

        // Unpack 16 6-bit scales from the packed 12-byte scales field.
        uint32_t aux32[4];
        memcpy(aux32, x[i].scales, 12);
        uint32_t tmp = aux32[2];
        aux32[2] = ((aux32[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
        aux32[3] = ((aux32[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
        aux32[0] = (aux32[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
        aux32[1] = (aux32[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
        const uint8_t * aux = (const uint8_t *)aux32;

        int is = 0;
        float block_sum = 0.f;
        for (int blk = 0; blk < QK_K; blk += 128) {
            int shift = 0;
            for (int j = 0; j < 4; ++j) {
                int sc1 = (int)aux[is]   - 32; // scales are stored with a +32 bias
                int sc2 = (int)aux[is+1] - 32;
                is += 2;
                float dl1 = d_all * sc1;
                float dl2 = d_all * sc2;
                float sum1 = 0.f, sum2 = 0.f;
                for (int l = 0; l < 16; ++l) {
                    // 3-bit code = 2 low bits from qs + 1 high bit from hmask
                    int k_idx = ((q[l+0] >> shift) & 3) + ((hm[l+0] & m) ? 4 : 0);
                    sum1 += (lv[k_idx] * 7.0f - 4.0f) * (float)q8[l+0];
                }
                for (int l = 0; l < 16; ++l) {
                    int k_idx = ((q[l+16] >> shift) & 3) + ((hm[l+16] & m) ? 4 : 0);
                    sum2 += (lv[k_idx] * 7.0f - 4.0f) * (float)q8[l+16];
                }
                block_sum += dl1 * sum1 + dl2 * sum2;
                shift += 2;
                m <<= 1;
                q8 += 32;
            }
            q += 32;
        }
        sumf += block_sum * yd;
    }
    *s = sumf;
}
// Dispatch entry point for the Q3_KPT x Q8_K dot product; forwards directly
// to the generic scalar implementation (no architecture-specific variant is
// selected here).
void ggml_vec_dot_q3_kpt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    ggml_vec_dot_q3_kpt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
}
// Q2_KPT vec_dot - similar to Q2_K but with learned levels.
// x      : Q2_K-layout blocks (2-bit quants, per-16-element scale/min nibbles
//          in `scales`, fp16 super-scales d/dmin).
// y      : Q8_K blocks (int8 quants, scale d, 16-element bsums).
// levels : required; Q2KPT_N_LEVELS floats *per QK_K block* — the caller must
//          pass a pointer already offset for this row (see levels_row_stride).
// NOTE: kept byte-identical; float accumulation order (per-16 partial sums
// folded into fsum) is part of the numeric behavior.
void ggml_vec_dot_q2_kpt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    const block_q2_kpt * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;
    const int nb = n / QK_K;
    const float * lv = (const float *)levels;
    GGML_ASSERT(lv != NULL && "Q2_KPT levels not set for tensor");
    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        // Per-block levels: block i uses lv[i*Q2KPT_N_LEVELS + 0..Q2KPT_N_LEVELS-1]
        // (the "i*4" layout assumes Q2KPT_N_LEVELS == 4).
        const float * block_lv = lv + i * Q2KPT_N_LEVELS;
        // Precompute mapped levels for this block: ml[k] = levels[k] * 3.0
        // (3.0 matches the 2-bit code range 0..3 — confirm against the quantizer).
        float ml[Q2KPT_N_LEVELS];
        for (int k = 0; k < Q2KPT_N_LEVELS; ++k) {
            ml[k] = block_lv[k] * 3.0f;
        }
        const uint8_t * q2 = x[i].qs;
        const int8_t * q8 = y[i].qs;
        const uint8_t * sc = x[i].scales;
        // Min term: accumulate integer bsums * min_scale (same as Q2_K);
        // the min nibble sits in the high 4 bits of each scales byte.
        int summs = 0;
        for (int j = 0; j < 16; ++j) {
            summs += y[i].bsums[j] * (sc[j] >> 4);
        }
        const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
        // Scale term: need floating-point because levels are non-uniform
        int is = 0;
        float fsum = 0;
        // Each 128-element half is stored as 32 packed bytes; `shift` walks the
        // four 2-bit planes within each byte.
        for (int k = 0; k < QK_K/128; ++k) {
            int shift = 0;
            for (int j = 0; j < 4; ++j) {
                int d_sc = sc[is++] & 0xF; // low nibble = sub-block scale
                float suml = 0;
                for (int l = 0; l < 16; ++l) {
                    int idx = (q2[l] >> shift) & 3;
                    suml += ml[idx] * (float)q8[l];
                }
                fsum += d_sc * suml;
                d_sc = sc[is++] & 0xF;
                suml = 0;
                for (int l = 16; l < 32; ++l) {
                    int idx = (q2[l] >> shift) & 3;
                    suml += ml[idx] * (float)q8[l];
                }
                fsum += d_sc * suml;
                shift += 2;
                q8 += 32;
            }
            q2 += 32;
        }
        sumf += dall * fsum - dmin * summs;
    }
    *s = sumf;
}
// Dispatch entry point for the Q2_KPT x Q8_K dot product; forwards directly
// to the generic scalar implementation (no architecture-specific variant is
// selected here).
void ggml_vec_dot_q2_kpt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    ggml_vec_dot_q2_kpt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
}
// IQ2_TQ: 2-bit with asymmetric 4-tuple grid per group
// Default grid table — only used when no per-tensor grid is available
// (ggml_vec_dot_iq2_tq_q8_K falls back to this when `levels` is NULL).
// 16 entries of 4 signed dequantization values each; each 8-element group
// picks one entry via its 4-bit scale index.
static const int8_t iq2tq_grid_cpu[16][4] = {
    {-20, -8, -2, 6}, {-14, -8, -2, 4}, {-16,-10, 0, 12}, {-14, -4, 2, 8},
    {-20, -4, 4, 12}, {-8, -4, 0, 4}, {-8, -4, 0, 8}, {-12, -6, 2, 12},
    {-4, -2, 2, 4}, {-10, -2, 4, 8}, {-16, -6, 4, 20}, {-12, -2, 6, 14},
    {-8, -2, 4, 14}, {-4, 0, 4, 8}, {-8, -2, 6, 22}, {-4, 2, 8, 14},
};
// IQ2_TQ dot product: each 8-element group selects one of 16 signed 4-value
// tuples via a 4-bit index stored in `scales`; `levels`, when non-NULL,
// supplies a per-tensor trained grid overriding the built-in table.
void ggml_vec_dot_iq2_tq_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs);

    // Fall back to the compiled-in grid when no per-tensor grid was provided.
    const int8_t (*entries)[4] = levels != NULL ? (const int8_t (*)[4])levels
                                                : (const int8_t (*)[4])iq2tq_grid_cpu;

    const block_iq2_tq * GGML_RESTRICT xb = vx;
    const block_q8_K   * GGML_RESTRICT yb = vy;

    const int nblocks = n / QK_K;

    float total = 0;
    for (int ib = 0; ib < nblocks; ++ib) {
        const float dx = GGML_CPU_FP16_TO_FP32(xb[ib].d) * IQ2TQ_GRID_SCALE;
        const int8_t * q8 = yb[ib].qs;

        int32_t acc = 0; // pure integer accumulation over the whole block
        for (int grp = 0; grp < IQ2TQ_N_GROUPS; ++grp) {
            // Two 4-bit tuple selectors per scales byte: low nibble = even group.
            const int sel = (xb[ib].scales[grp >> 1] >> ((grp & 1) ? 4 : 0)) & 0xF;
            const int8_t * tuple = entries[sel];
            // One group is 8 2-bit codes = 16 bits = 2 consecutive qs bytes.
            const uint16_t packed = (uint16_t)(xb[ib].qs[2*grp] | (xb[ib].qs[2*grp + 1] << 8));
            for (int k = 0; k < 8; ++k) {
                const int code = (packed >> (2*k)) & 3;
                acc += (int32_t)tuple[code] * (int32_t)q8[grp*8 + k];
            }
        }
        total += dx * yb[ib].d * (float)acc;
    }
    *s = total;
}
// IQ3_TQ default grid (must match ggml-quants.c)
// 16 entries of 8 signed dequantization values each; used as the fallback by
// ggml_vec_dot_iq3_tq_q8_K when no per-tensor trained grid (`levels`) is set.
static const int8_t iq3tq_grid_cpu[16][8] = {
    {-24,-18,-12, -6, 0, 6, 12, 18},
    {-20,-15,-10, -5, 0, 5, 10, 15},
    {-16,-12, -8, -4, 0, 4, 8, 12},
    {-12, -8, -4, -2, 0, 2, 4, 8},
    {-24,-16, -8, -2, 2, 6, 10, 14},
    {-14,-10, -6, -2, 2, 8, 16, 24},
    {-20,-14, -8, -4, 0, 4, 10, 18},
    {-18,-10, -4, 0, 4, 8, 14, 20},
    { -8, -6, -4, -2, 0, 2, 4, 6},
    {-10, -6, -4, -2, 2, 4, 6, 10},
    {-22,-14, -6, -2, 2, 6, 14, 22},
    {-16, -8, -4, -2, 0, 4, 8, 16},
    {-24,-20,-16,-12, -8, -4, 0, 4},
    { -4, 0, 4, 8, 12, 16, 20, 24},
    {-20,-16,-10, -4, 4, 10, 16, 20},
    {-12, -8, -6, -2, 2, 6, 8, 12},
};
// IQ3_TQ dot product: each 8-element group selects one of 16 signed 8-value
// grid rows via a 4-bit index in `scales` and stores eight 3-bit codes;
// `levels`, when non-NULL, supplies a per-tensor trained grid overriding the
// built-in table.
void ggml_vec_dot_iq3_tq_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs);

    // Fall back to the compiled-in grid when no per-tensor grid was provided.
    const int8_t (*entries)[8] = levels != NULL ? (const int8_t (*)[8])levels
                                                : (const int8_t (*)[8])iq3tq_grid_cpu;

    const block_iq3_tq * GGML_RESTRICT xb = vx;
    const block_q8_K   * GGML_RESTRICT yb = vy;

    const int nblocks = n / QK_K;

    float total = 0;
    for (int ib = 0; ib < nblocks; ++ib) {
        const float dx = GGML_CPU_FP16_TO_FP32(xb[ib].d) * IQ3TQ_GRID_SCALE;
        const int8_t * q8 = yb[ib].qs;

        int32_t acc = 0; // pure integer accumulation over the whole block
        for (int grp = 0; grp < IQ3TQ_N_GROUPS; ++grp) {
            // Two 4-bit grid selectors per scales byte: low nibble = even group.
            const int sel = (xb[ib].scales[grp >> 1] >> ((grp & 1) ? 4 : 0)) & 0xF;
            const int8_t * row = entries[sel];
            // One group is 8 3-bit codes = 24 bits = 3 consecutive qs bytes.
            const uint32_t packed = (uint32_t)xb[ib].qs[3*grp]
                                  | ((uint32_t)xb[ib].qs[3*grp + 1] << 8)
                                  | ((uint32_t)xb[ib].qs[3*grp + 2] << 16);
            for (int k = 0; k < 8; ++k) {
                const int code = (packed >> (3*k)) & 7;
                acc += (int32_t)row[code] * (int32_t)q8[grp*8 + k];
            }
        }
        total += dx * yb[ib].d * (float)acc;
    }
    *s = total;
}
// IQ1_BN: 8D vector quantized dot product using a per-tensor codebook passed
// via `levels`. Each group of IQ1BN_GROUP_SIZE elements stores a 12-bit
// codebook index; two indices are packed into 3 bytes (even group: one byte
// plus the low nibble of the middle byte, odd group: the high nibble of the
// middle byte plus the following byte).
// NOTE(review): this was previously described as codebook[256][8] +
// scale_table[16], but the unpack below reads 12-bit indices (up to 4096
// rows) and no scale table is consulted here — confirm the intended codebook
// size against the quantizer.
void ggml_vec_dot_iq1_bn_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs);
    // The codebook is mandatory: the packed indices mean nothing without it.
    GGML_ASSERT(levels && "IQ1_BN requires per-tensor codebook in quant_levels");
    const int8_t * codebook = (const int8_t *)levels;
    const block_iq1_bn * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;
    const int nb = n / QK_K;
    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * IQ1BN_GRID_SCALE;
        const float yd = y[i].d;
        const int8_t * q8 = y[i].qs;
        int32_t block_sum = 0;
        for (int g = 0; g < IQ1BN_N_GROUPS; ++g) {
            // 12-bit index unpack: index pair (2m, 2m+1) lives in bytes 3m..3m+2.
            int ci = (g & 1)
                ? ((x[i].qs[3*(g/2)+1] >> 4) | ((int)x[i].qs[3*(g/2)+2] << 4))
                : (x[i].qs[3*(g/2)] | (((int)x[i].qs[3*(g/2)+1] & 0x0F) << 8));
            const int8_t * cb = codebook + ci * IQ1BN_CODEBOOK_DIM;
            const int8_t * q8g = q8 + g * IQ1BN_GROUP_SIZE;
            for (int k = 0; k < IQ1BN_CODEBOOK_DIM; ++k) {
                block_sum += (int32_t)cb[k] * (int32_t)q8g[k];
            }
        }
        sumf += d * yd * (float)block_sum;
    }
    *s = sumf;
}
// Q4_DPT dot product: IQ4_NL-style 4-bit blocks whose 16 dequantization
// values come from the per-tensor int8 table `levels` instead of the fixed
// IQ4_NL lookup.
// Fix: dropped the contradictory `GGML_UNUSED(levels);` — `levels` is
// mandatory here (cast and NULL-checked below), so marking it unused was a
// misleading no-op.
void ggml_vec_dot_q4_dpt_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK4_NL == 0);
    static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");

    const block_q4_dpt * GGML_RESTRICT x = vx;
    const block_q8_0   * GGML_RESTRICT y = vy;

    const int nb = n / QK4_NL;

    const int8_t * values = (const int8_t *)levels;
    GGML_ASSERT(values != NULL && "Q4_DPT levels not set for tensor");

    float sumf = 0;
    for (int ib = 0; ib < nb; ++ib) {
        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d) * GGML_CPU_FP16_TO_FP32(x[ib].d);
        int32_t blk = 0;
        // Low nibbles hold the first half of the block, high nibbles the second.
        for (int j = 0; j < QK4_NL/2; ++j) {
            blk += (int32_t)y[ib].qs[j+ 0]       * (int32_t)values[x[ib].qs[j] & 0xf];
            blk += (int32_t)y[ib].qs[j+QK4_NL/2] * (int32_t)values[x[ib].qs[j] >>  4];
        }
        sumf += d * (float)blk;
    }
    *s = sumf;
}
// Q2_DPT dot product: 2-bit blocks whose 4 dequantization values come from
// the per-tensor int8 table `levels`.
// Fix: dropped the contradictory `GGML_UNUSED(levels);` — `levels` is
// mandatory here (cast and NULL-checked below), so marking it unused was a
// misleading no-op.
void ggml_vec_dot_q2_dpt_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK2_DPT == 0);
    static_assert(QK2_DPT == QK8_0, "QK2_DPT and QK8_0 must be the same");

    const block_q2_dpt * GGML_RESTRICT x = vx;
    const block_q8_0   * GGML_RESTRICT y = vy;

    const int nb = n / QK2_DPT;

    const int8_t * values = (const int8_t *)levels;
    GGML_ASSERT(values != NULL && "Q2_DPT levels not set for tensor");

    float sumf = 0;
    for (int ib = 0; ib < nb; ++ib) {
        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d) * GGML_CPU_FP16_TO_FP32(x[ib].d);
        int32_t blk = 0;
        // Four consecutive 2-bit codes per packed byte, lowest bits first.
        for (int j = 0; j < QK2_DPT/4; ++j) {
            uint8_t q = x[ib].qs[j];
            blk += (int32_t)y[ib].qs[j*4 + 0] * (int32_t)values[(q >> 0) & 3];
            blk += (int32_t)y[ib].qs[j*4 + 1] * (int32_t)values[(q >> 2) & 3];
            blk += (int32_t)y[ib].qs[j*4 + 2] * (int32_t)values[(q >> 4) & 3];
            blk += (int32_t)y[ib].qs[j*4 + 3] * (int32_t)values[(q >> 6) & 3];
        }
        sumf += d * (float)blk;
    }
    *s = sumf;
}
void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
@ -1194,7 +1639,8 @@ void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
*s = sumf;
}
void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
@ -1223,7 +1669,8 @@ void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
*s = sumf;
}
void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);

View File

@ -37,66 +37,79 @@ void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
// Dot product
void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q3_pt_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q3_kpt_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q3_kpt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q4_dpt_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q4_dpt_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q2_dpt_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q2_dpt_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q2_kpt_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q2_kpt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq2_tq_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq3_tq_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq1_bn_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
// Generic implementation
void quantize_row_q8_0_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
void quantize_row_q8_1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_nvfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_nvfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
void ggml_vec_dot_q3_pt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
#ifdef __cplusplus
}

View File

@ -8,7 +8,8 @@ ggml_fp16_t ggml_table_gelu_f16[1 << 16];
// precomputed quick gelu table for f16 (128 KB)
ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc) {
void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(nrc == 1);
GGML_UNUSED(nrc);
GGML_UNUSED(bx);
@ -136,7 +137,8 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
*s = sumf;
}
void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc) {
void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(nrc == 1);
GGML_UNUSED(nrc);
GGML_UNUSED(bx);
@ -261,7 +263,8 @@ void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t *
*s = sumf;
}
void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc) {
void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc, const void * levels) {
GGML_UNUSED(levels);
assert(nrc == 1);
GGML_UNUSED(nrc);
GGML_UNUSED(bx);

View File

@ -39,9 +39,9 @@ extern ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
// fundamental operations
//
void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc, const void * levels);
void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc, const void * levels);
void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc, const void * levels);
void ggml_vec_silu_f32(const int n, float * y, const float * x);
ggml_float ggml_vec_cvar_f32(const int n, float * y, const float * x, const float mean); //it will also center y ( y = y - mean )
@ -873,7 +873,7 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
}
}
inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); }
// L2 norm of x: s = sqrt(dot(x, x)). The trailing NULL is the (unused for f32)
// per-tensor quant-levels argument of ggml_vec_dot_f32.
inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1, NULL); *s = sqrtf(*s); }
inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
inline static void ggml_vec_sqr_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
for (int i = 0; i < n; ++i) {

View File

@ -1057,6 +1057,27 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ4_NL> {
static constexpr int qi = QI4_NL;
};
// CUDA type traits for Q4_DPT: reuses the IQ4_NL constants, since Q4_DPT is
// addressed with the same block geometry (QK4_NL elements per block) and only
// the nibble -> value mapping differs (per-tensor table q4dpt_levels_cuda).
template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q4_DPT> {
    static constexpr int qk = QK4_NL;
    static constexpr int qr = QR4_NL;
    static constexpr int qi = QI4_NL;
};
// Per-tensor lookup table for Q4_DPT (device global memory).
// Each TU gets its own copy; initialized via cudaGetSymbolAddress + cudaMemcpyAsync before use.
__device__ int8_t q4dpt_levels_cuda[16];
// Per-tensor lookup table for Q2_DPT (4 int8 levels).
__device__ int8_t q2dpt_levels_cuda[4];
// CUDA type traits for Q2_DPT (2-bit codes, per-tensor int8 level table).
template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q2_DPT> {
    static constexpr int qk = QK2_DPT;
    static constexpr int qr = 4; // 4 elements per "quantum" (2-bit)
    static constexpr int qi = 1; // 1 uint32 per block
                                 // NOTE(review): if QK2_DPT is 32 (consistent with
                                 // the 2.5 bpw type description), a block holds
                                 // 8 quant bytes = 2 uint32 — confirm qi = 1 is
                                 // intended and not 2.
};
template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ4_XS> {
static constexpr int qk = QK_K;
@ -1064,6 +1085,38 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ4_XS> {
static constexpr int qi = QI4_XS;
};
// Per-tensor grid for IQ2_TQ (16 × 4 int8 = 64 bytes).
__device__ int8_t iq2tq_grid_cuda[64];
template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ2_TQ> {
static constexpr int qk = QK_K;
static constexpr int qr = 4;
static constexpr int qi = QK_K / (4*4); // 16
};
// Per-tensor grid for IQ3_TQ (16 × 8 int8 = 128 bytes).
__device__ int8_t iq3tq_grid_cuda[128];
// Per-tensor codebook for IQ1_BN (4096 × 8 int8 = 32768 bytes).
__device__ int8_t iq1bn_codebook_cuda[IQ1BN_CODEBOOK_SIZE];
template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ3_TQ> {
static constexpr int qk = QK_K;
static constexpr int qr = 4;
static constexpr int qi = QK_K / (4*4); // 16
};
template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ1_BN> {
static constexpr int qk = QK_K;
static constexpr int qr = 4;
static constexpr int qi = QK_K / (4*4); // 16
};
template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ3_S> {
static constexpr int qk = QK_K;

View File

@ -593,12 +593,187 @@ static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int64_t
dequantize_block_iq1_s<<<nb, 32, 0, stream>>>(vx, y);
}
void ggml_cuda_set_q4dpt_levels(const int8_t * levels, cudaStream_t stream) {
int8_t * d_q4dpt_levels;
CUDA_CHECK(cudaGetSymbolAddress((void **)&d_q4dpt_levels, q4dpt_levels_cuda));
CUDA_CHECK(cudaMemcpyAsync(d_q4dpt_levels, levels, 16, cudaMemcpyDeviceToDevice, stream));
}
void ggml_cuda_set_q2dpt_levels(const int8_t * levels, cudaStream_t stream) {
int8_t * d_q2dpt_levels;
CUDA_CHECK(cudaGetSymbolAddress((void **)&d_q2dpt_levels, q2dpt_levels_cuda));
CUDA_CHECK(cudaMemcpyAsync(d_q2dpt_levels, levels, 4, cudaMemcpyDeviceToDevice, stream));
}
// Copy the IQ2_TQ per-tensor grid (16 entries x 4 int8 = 64 bytes) from host
// memory into this TU's __device__ symbol iq2tq_grid_cuda, async on `stream`.
void ggml_cuda_set_iq2tq_grid(const void * grid, cudaStream_t stream) {
    void * dst = nullptr;
    CUDA_CHECK(cudaGetSymbolAddress(&dst, iq2tq_grid_cuda));
    CUDA_CHECK(cudaMemcpyAsync(dst, grid, sizeof(iq2tq_grid_cuda), cudaMemcpyHostToDevice, stream));
}
// Copy the IQ3_TQ per-tensor grid (16 entries x 8 int8 = 128 bytes) from host
// memory into this TU's __device__ symbol iq3tq_grid_cuda, async on `stream`.
void ggml_cuda_set_iq3tq_grid(const void * grid, cudaStream_t stream) {
    void * dst = nullptr;
    CUDA_CHECK(cudaGetSymbolAddress(&dst, iq3tq_grid_cuda));
    CUDA_CHECK(cudaMemcpyAsync(dst, grid, sizeof(iq3tq_grid_cuda), cudaMemcpyHostToDevice, stream));
}
// Upload the IQ1_BN per-tensor codebook into this TU's __device__ symbol
// iq1bn_codebook_cuda (IQ1BN_CODEBOOK_SIZE bytes, host -> device, async).
// NOTE(review): the header declaration describes the payload as
// "codebook+scale (2064 bytes)", but this copies IQ1BN_CODEBOOK_SIZE bytes
// (4096 x 8 int8 per the symbol's comment) — confirm which size is correct.
void ggml_cuda_set_iq1bn_aux(const void * aux, cudaStream_t stream) {
    int8_t * d_cb;
    CUDA_CHECK(cudaGetSymbolAddress((void **)&d_cb, iq1bn_codebook_cuda));
    CUDA_CHECK(cudaMemcpyAsync(d_cb, aux, IQ1BN_CODEBOOK_SIZE, cudaMemcpyHostToDevice, stream));
}
// Launch IQ4_NL dequantization for a row of k elements: one CUDA block of
// 32 threads per QK_K super-block, rounded up for a partial final block.
template<typename dst_t>
static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = (k + QK_K - 1) / QK_K;
    dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
}
// Dequantize one QK_K super-block of Q4_DPT per CUDA block (32 threads).
// Each thread expands 4 bytes = 8 nibbles of one 32-element block: nibble
// values are mapped through the per-tensor table q4dpt_levels_cuda (must be
// uploaded beforehand, e.g. via ggml_cuda_set_q4dpt_levels) and scaled by the
// per-block fp16 scale d. Low nibbles fill elements 0..15 of the block, high
// nibbles fill elements 16..31, matching the IQ4_NL packing.
template<typename dst_t>
static __global__ void dequantize_block_q4_dpt(const void * __restrict__ vx, dst_t * __restrict__ yy) {
    const int64_t i = blockIdx.x;
    const block_q4_dpt * x = (const block_q4_dpt *) vx + i*(QK_K/QK4_NL);

    const int64_t tid = threadIdx.x;
    const int64_t il = tid/8; // 0...3  (4-byte group within the block)
    const int64_t ib = tid%8; // 0...7  (block within the super-block)
    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
    const uint8_t * q4 = x[ib].qs + 4*il;
    const float d = (float)x[ib].d;
    for (int j = 0; j < 4; ++j) {
        y[j+ 0] = d * q4dpt_levels_cuda[q4[j] & 0xf]; // low nibble -> first half
        y[j+16] = d * q4dpt_levels_cuda[q4[j] >>  4]; // high nibble -> second half
    }
}
// Launch Q4_DPT dequantization for a row of k elements: one CUDA block of
// 32 threads per QK_K super-block, rounded up for a partial final block.
template<typename dst_t>
static void dequantize_row_q4_dpt_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = (k + QK_K - 1) / QK_K;
    dequantize_block_q4_dpt<<<nb, 32, 0, stream>>>(vx, y);
}
// Dequantize one QK_K super-block of Q2_DPT per CUDA block (32 threads).
// 2-bit codes (4 per byte, LSB first) index the per-tensor table
// q2dpt_levels_cuda; the result is scaled by the per-block fp16 scale d.
// Assumes QK2_DPT == 32 (consistent with the 32*ib output stride).
//
// Fix: previously each thread expanded a single byte (4 values), so the 32
// threads wrote only 128 of the QK_K = 256 outputs — elements 16..31 of every
// 32-element block (and qs bytes 4..7) were never touched, leaving half of y
// uninitialized. Each thread now expands two consecutive bytes (8 values),
// covering the whole super-block.
template<typename dst_t>
static __global__ void dequantize_block_q2_dpt(const void * __restrict__ vx, dst_t * __restrict__ yy) {
    const int64_t i = blockIdx.x;
    const block_q2_dpt * x = (const block_q2_dpt *) vx + i*(QK_K/QK2_DPT);

    const int64_t tid = threadIdx.x;
    const int64_t il = tid/8; // 0...3  (byte pair within the block)
    const int64_t ib = tid%8; // 0...7  (block within the super-block)

    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
    const uint8_t * q2 = x[ib].qs + 2*il;
    const float d = (float)x[ib].d;
    for (int j = 0; j < 2; ++j) {
        const uint8_t q = q2[j];
        y[4*j+0] = d * q2dpt_levels_cuda[(q >> 0) & 3];
        y[4*j+1] = d * q2dpt_levels_cuda[(q >> 2) & 3];
        y[4*j+2] = d * q2dpt_levels_cuda[(q >> 4) & 3];
        y[4*j+3] = d * q2dpt_levels_cuda[(q >> 6) & 3];
    }
}
// Launch Q2_DPT dequantization for a row of k elements: one CUDA block of
// 32 threads per QK_K super-block, rounded up for a partial final block.
template<typename dst_t>
static void dequantize_row_q2_dpt_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = (k + QK_K - 1) / QK_K;
    dequantize_block_q2_dpt<<<nb, 32, 0, stream>>>(vx, y);
}
// Dequantize one QK_K super-block of IQ2_TQ per CUDA block: one thread per
// 8-element group, 32 groups. Each group's 4-bit selector (two per scales
// byte, low nibble first) picks one of 16 rows of 4 int8 levels in the
// per-tensor grid iq2tq_grid_cuda; the 2-bit codes in qs (LSB first) index
// into that row. Output scale is the block fp16 d times IQ2TQ_GRID_SCALE.
// The grid must be uploaded beforehand (ggml_cuda_set_iq2tq_grid).
template<typename dst_t>
static __global__ void dequantize_block_iq2_tq(const void * __restrict__ vx, dst_t * __restrict__ yy) {
    const int64_t i = blockIdx.x;
    const block_iq2_tq * bq = (const block_iq2_tq *) vx + i;

    const int g = threadIdx.x; // group index 0..31

    const float dq = __half2float(bq->d) * IQ2TQ_GRID_SCALE;
    const int si = (bq->scales[g / 2] >> (4 * (g & 1))) & 0xF; // 4-bit row selector
    const int8_t * ge = iq2tq_grid_cuda + si * 4;              // selected row of 4 levels

    dst_t * y = yy + i * QK_K + g * 8;
    const uint8_t * qs = bq->qs + g * 2; // 2 bytes = 8 two-bit codes
    y[0] = dq * ge[(qs[0] >> 0) & 3];
    y[1] = dq * ge[(qs[0] >> 2) & 3];
    y[2] = dq * ge[(qs[0] >> 4) & 3];
    y[3] = dq * ge[(qs[0] >> 6) & 3];
    y[4] = dq * ge[(qs[1] >> 0) & 3];
    y[5] = dq * ge[(qs[1] >> 2) & 3];
    y[6] = dq * ge[(qs[1] >> 4) & 3];
    y[7] = dq * ge[(qs[1] >> 6) & 3];
}
// Launch IQ2_TQ dequantization: one CUDA block of 32 threads per QK_K
// super-block. No round-up here (unlike the *_dpt launchers) — assumes k is
// a multiple of QK_K; confirm callers guarantee this.
template<typename dst_t>
static void dequantize_row_iq2_tq_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq2_tq<<<nb, 32, 0, stream>>>(vx, y);
}
// Dequantize one QK_K super-block of IQ3_TQ per CUDA block: one thread per
// 8-element group, 32 groups. The group's 4-bit selector (two per scales
// byte, low nibble first) picks one of 16 rows of 8 int8 levels in the
// per-tensor grid iq3tq_grid_cuda; three qs bytes are assembled into 24 bits
// holding eight 3-bit codes (LSB first) that index into that row. Output
// scale is the block fp16 d times IQ3TQ_GRID_SCALE. The grid must be
// uploaded beforehand (ggml_cuda_set_iq3tq_grid).
template<typename dst_t>
static __global__ void dequantize_block_iq3_tq(const void * __restrict__ vx, dst_t * __restrict__ yy) {
    const int64_t i = blockIdx.x;
    const block_iq3_tq * bq = (const block_iq3_tq *) vx + i;

    const int g = threadIdx.x; // group index 0..31

    const float dq = __half2float(bq->d) * IQ3TQ_GRID_SCALE;
    const int si = (bq->scales[g / 2] >> (4 * (g & 1))) & 0xF; // 4-bit row selector
    const int8_t * ge = iq3tq_grid_cuda + si * 8;              // selected row of 8 levels

    dst_t * y = yy + i * QK_K + g * 8;
    const uint8_t * qs = bq->qs + g * 3; // 3 bytes = 8 three-bit codes
    const uint32_t bits = qs[0] | ((uint32_t)qs[1] << 8) | ((uint32_t)qs[2] << 16);
    y[0] = dq * ge[(bits >>  0) & 7];
    y[1] = dq * ge[(bits >>  3) & 7];
    y[2] = dq * ge[(bits >>  6) & 7];
    y[3] = dq * ge[(bits >>  9) & 7];
    y[4] = dq * ge[(bits >> 12) & 7];
    y[5] = dq * ge[(bits >> 15) & 7];
    y[6] = dq * ge[(bits >> 18) & 7];
    y[7] = dq * ge[(bits >> 21) & 7];
}
// Launch IQ3_TQ dequantization: one CUDA block of 32 threads per QK_K
// super-block. No round-up — assumes k is a multiple of QK_K; confirm
// callers guarantee this.
template<typename dst_t>
static void dequantize_row_iq3_tq_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq3_tq<<<nb, 32, 0, stream>>>(vx, y);
}
// Dequantize one QK_K super-block of IQ1_BN per CUDA block: one thread per
// group of IQ1BN_GROUP_SIZE elements, 32 groups. Each pair of groups packs
// two 12-bit codebook indices into a 3-byte triplet of qs (even group = low
// 12 bits, odd group = high 12 bits). The index selects a row of
// IQ1BN_CODEBOOK_DIM int8 values from the per-tensor codebook
// iq1bn_codebook_cuda (uploaded via ggml_cuda_set_iq1bn_aux), scaled by the
// block fp16 d times IQ1BN_GRID_SCALE.
template<typename dst_t>
static __global__ void dequantize_block_iq1_bn(const void * __restrict__ vx, dst_t * __restrict__ yy) {
    const int64_t i = blockIdx.x;
    const block_iq1_bn * bq = (const block_iq1_bn *) vx + i;

    const int g = threadIdx.x; // group index 0..31

    const float dq = __half2float(bq->d) * IQ1BN_GRID_SCALE;

    // Extract 12-bit codebook index
    const int pair = g / 2;
    int ci;
    if (g & 1) {
        ci = (bq->qs[3*pair+1] >> 4) | ((int)bq->qs[3*pair+2] << 4);
    } else {
        ci = bq->qs[3*pair] | (((int)bq->qs[3*pair+1] & 0x0F) << 8);
    }
    const int8_t * cb = iq1bn_codebook_cuda + ci * IQ1BN_CODEBOOK_DIM;

    dst_t * y = yy + i * QK_K + g * IQ1BN_GROUP_SIZE;
    // Loop over the codebook dimension instead of hard-coding 8 writes, so the
    // element count stays tied to IQ1BN_CODEBOOK_DIM (which must equal
    // IQ1BN_GROUP_SIZE for the output stride to be consistent).
#pragma unroll
    for (int j = 0; j < IQ1BN_CODEBOOK_DIM; ++j) {
        y[j] = dq * cb[j];
    }
}
// Launch IQ1_BN dequantization: one CUDA block of 32 threads per QK_K
// super-block. No round-up — assumes k is a multiple of QK_K; confirm
// callers guarantee this.
template<typename dst_t>
static void dequantize_row_iq1_bn_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq1_bn<<<nb, 32, 0, stream>>>(vx, y);
}
template<typename dst_t>
static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
const int nb = k / QK_K;
@ -748,6 +923,16 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
return dequantize_row_iq1_m_cuda;
case GGML_TYPE_IQ4_NL:
return dequantize_row_iq4_nl_cuda;
case GGML_TYPE_Q4_DPT:
return dequantize_row_q4_dpt_cuda;
case GGML_TYPE_Q2_DPT:
return dequantize_row_q2_dpt_cuda;
case GGML_TYPE_IQ2_TQ:
return dequantize_row_iq2_tq_cuda;
case GGML_TYPE_IQ3_TQ:
return dequantize_row_iq3_tq_cuda;
case GGML_TYPE_IQ1_BN:
return dequantize_row_iq1_bn_cuda;
case GGML_TYPE_IQ4_XS:
return dequantize_row_iq4_xs_cuda;
case GGML_TYPE_IQ3_S:
@ -801,6 +986,16 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
return dequantize_row_iq1_m_cuda;
case GGML_TYPE_IQ4_NL:
return dequantize_row_iq4_nl_cuda;
case GGML_TYPE_Q4_DPT:
return dequantize_row_q4_dpt_cuda;
case GGML_TYPE_Q2_DPT:
return dequantize_row_q2_dpt_cuda;
case GGML_TYPE_IQ2_TQ:
return dequantize_row_iq2_tq_cuda;
case GGML_TYPE_IQ3_TQ:
return dequantize_row_iq3_tq_cuda;
case GGML_TYPE_IQ1_BN:
return dequantize_row_iq1_bn_cuda;
case GGML_TYPE_IQ4_XS:
return dequantize_row_iq4_xs_cuda;
case GGML_TYPE_IQ3_S:

View File

@ -31,6 +31,22 @@ to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type);
to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type);
to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type);
// Upload the Q4_DPT lookup table (16 int8 levels); it is stored in a TU-local __device__ global (q4dpt_levels_cuda), not __constant__ memory.
void ggml_cuda_set_q4dpt_levels(const int8_t * levels, cudaStream_t stream);
// Upload the Q2_DPT lookup table (4 int8 levels); it is stored in a TU-local __device__ global (q2dpt_levels_cuda), not __constant__ memory.
void ggml_cuda_set_q2dpt_levels(const int8_t * levels, cudaStream_t stream);
// Set the IQ2_TQ per-tensor grid (64 bytes: 16 entries × 4 int8 levels).
void ggml_cuda_set_iq2tq_grid(const void * grid, cudaStream_t stream);
// Set the IQ3_TQ per-tensor grid (128 bytes: 16 entries × 8 int8 levels).
void ggml_cuda_set_iq3tq_grid(const void * grid, cudaStream_t stream);
// Set the IQ1_BN per-tensor codebook (IQ1BN_CODEBOOK_SIZE bytes; the implementation copies the full 4096 x 8 int8 codebook, not 2064 bytes).
void ggml_cuda_set_iq1bn_aux(const void * aux, cudaStream_t stream);
template<typename dst_t, typename src_t>
__host__ __device__ inline dst_t ggml_cuda_cast(src_t x) {
if constexpr (std::is_same_v<dst_t, src_t>) {

View File

@ -3,6 +3,7 @@
#include "ggml-backend-impl.h"
#include "ggml-cuda/common.cuh"
#include "ggml-quants.h"
#include "ggml-cuda/acc.cuh"
#include "ggml-cuda/add-id.cuh"
#include "ggml-cuda/arange.cuh"
@ -1426,6 +1427,24 @@ static void ggml_cuda_op_mul_mat_cublas(
row_diff == src0->ne[1] &&
dst->op_params[0] == GGML_PREC_DEFAULT;
// Upload per-tensor grids/levels before any dequantize path (fp16, fp32, or bf16)
if (src0->type == GGML_TYPE_Q4_DPT) {
GGML_ASSERT(src0->quant_levels && "Q4_DPT MUL_MAT requires levels (set tensor->quant_levels)");
ggml_cuda_set_q4dpt_levels((const int8_t *)src0->quant_levels, stream);
}
if (src0->type == GGML_TYPE_IQ2_TQ) {
GGML_ASSERT(src0->quant_levels && "IQ2_TQ MUL_MAT requires grid (set tensor->quant_levels)");
ggml_cuda_set_iq2tq_grid(src0->quant_levels, stream);
}
if (src0->type == GGML_TYPE_IQ3_TQ) {
GGML_ASSERT(src0->quant_levels && "IQ3_TQ MUL_MAT requires grid (set tensor->quant_levels)");
ggml_cuda_set_iq3tq_grid(src0->quant_levels, stream);
}
if (src0->type == GGML_TYPE_IQ1_BN) {
GGML_ASSERT(src0->quant_levels && "IQ1_BN MUL_MAT requires codebook (set tensor->quant_levels)");
ggml_cuda_set_iq1bn_aux(src0->quant_levels, stream);
}
if (supports_bf16 && src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
ggml_cuda_pool_alloc<nv_bfloat16> src1_as_bf16(ctx.pool(id));
if (src1->type != GGML_TYPE_BF16) {
@ -4804,6 +4823,10 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_Q4_DPT:
case GGML_TYPE_IQ2_TQ:
case GGML_TYPE_IQ3_TQ:
case GGML_TYPE_IQ1_BN:
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_BF16:
return true;
@ -4838,7 +4861,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
{
return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16 ||
op->type == GGML_TYPE_Q4_0 || op->type == GGML_TYPE_Q4_1 || op->type == GGML_TYPE_Q5_0 ||
op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_IQ4_NL) &&
op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_IQ4_NL ||
op->type == GGML_TYPE_Q4_DPT) &&
op->src[0]->type == GGML_TYPE_F32 &&
(op->src[1]->type == GGML_TYPE_I64 || op->src[1]->type == GGML_TYPE_I32);
} break;
@ -4891,6 +4915,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_IQ4_NL) {
return true;
}
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_DPT) {
return true;
}
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_I32) {
return true;
}

View File

@ -2,6 +2,8 @@
#include "mmq.cuh"
#include "quantize.cuh"
#include "mmid.cuh"
#include "convert.cuh"
#include "ggml-quants.h"
static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
switch (args.type_x) {
@ -65,6 +67,12 @@ static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, con
case GGML_TYPE_IQ4_NL:
mul_mat_q_case<GGML_TYPE_IQ4_NL>(ctx, args, stream);
break;
case GGML_TYPE_Q4_DPT:
mul_mat_q_case<GGML_TYPE_Q4_DPT>(ctx, args, stream);
break;
case GGML_TYPE_Q2_DPT:
mul_mat_q_case<GGML_TYPE_Q2_DPT>(ctx, args, stream);
break;
default:
GGML_ABORT("fatal error");
break;
@ -82,6 +90,22 @@ void ggml_cuda_mul_mat_q(
cudaStream_t stream = ctx.stream();
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
// Set Q4_DPT lookup table from tensor's quant_levels
if (src0->type == GGML_TYPE_Q4_DPT) {
GGML_ASSERT(src0->quant_levels && "Q4_DPT MUL_MAT requires levels (set tensor->quant_levels)");
int8_t * d_q4dpt_levels;
CUDA_CHECK(cudaGetSymbolAddress((void **)&d_q4dpt_levels, q4dpt_levels_cuda));
CUDA_CHECK(cudaMemcpyAsync(d_q4dpt_levels, src0->quant_levels, Q4DPT_N_LEVELS * sizeof(int8_t), cudaMemcpyHostToDevice, stream));
}
// Set Q2_DPT lookup table from tensor's quant_levels
if (src0->type == GGML_TYPE_Q2_DPT) {
GGML_ASSERT(src0->quant_levels && "Q2_DPT MUL_MAT requires levels (set tensor->quant_levels)");
int8_t * d_q2dpt_levels;
CUDA_CHECK(cudaGetSymbolAddress((void **)&d_q2dpt_levels, q2dpt_levels_cuda));
CUDA_CHECK(cudaMemcpyAsync(d_q2dpt_levels, src0->quant_levels, Q2DPT_N_LEVELS * sizeof(int8_t), cudaMemcpyHostToDevice, stream));
}
const size_t ts_src0 = ggml_type_size(src0->type);
const size_t ts_src1 = ggml_type_size(src1->type);
const size_t ts_dst = ggml_type_size(dst->type);
@ -290,6 +314,8 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_Q4_DPT:
case GGML_TYPE_Q2_DPT:
mmq_supported = true;
break;
default:
@ -367,3 +393,9 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
return (!GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
}
// Q4_DPT must be instantiated in this TU (not a separate template-instance file)
// because it accesses the TU-local __device__ variable q4dpt_levels_cuda,
// which is initialized by the code above.
DECL_MMQ_CASE(GGML_TYPE_Q4_DPT);
DECL_MMQ_CASE(GGML_TYPE_Q2_DPT);

View File

@ -1,6 +1,7 @@
#pragma once
#include "common.cuh"
#include "ggml.h"
#include "vecdotq.cuh"
#include "mma.cuh"
@ -88,6 +89,8 @@ static mmq_q8_1_ds_layout mmq_get_q8_1_ds_layout(const ggml_type type_x) {
return MMQ_Q8_1_DS_LAYOUT_DS4;
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_Q4_DPT:
case GGML_TYPE_Q2_DPT:
return MMQ_Q8_1_DS_LAYOUT_D4;
default:
GGML_ABORT("fatal error");
@ -205,6 +208,8 @@ static constexpr __host__ __device__ tile_x_sizes mmq_get_dp4a_tile_x_sizes(ggml
case GGML_TYPE_IQ1_S: return MMQ_DP4A_TXS_Q8_0;
case GGML_TYPE_IQ4_XS: return MMQ_DP4A_TXS_Q8_0;
case GGML_TYPE_IQ4_NL: return MMQ_DP4A_TXS_Q8_0;
case GGML_TYPE_Q4_DPT: return MMQ_DP4A_TXS_Q8_0;
case GGML_TYPE_Q2_DPT: return MMQ_DP4A_TXS_Q8_0_16;
default: return tile_x_sizes{0, 0, 0};
}
}
@ -250,6 +255,8 @@ static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) {
case GGML_TYPE_IQ1_S: return MMQ_MMA_TILE_X_K_Q8_0;
case GGML_TYPE_IQ4_XS: return MMQ_MMA_TILE_X_K_Q8_0;
case GGML_TYPE_IQ4_NL: return MMQ_MMA_TILE_X_K_Q8_0;
case GGML_TYPE_Q4_DPT: return MMQ_MMA_TILE_X_K_Q8_0;
case GGML_TYPE_Q2_DPT: return MMQ_MMA_TILE_X_K_Q8_0;
default: return 0;
}
}
@ -2763,6 +2770,71 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
}
}
// MMQ tile loader for Q4_DPT. Expands the 4-bit codes to int8 through the
// per-tensor table q4dpt_levels_cuda (via get_int_from_table_16) and stores
// them in the shared-memory tile using the Q8_0 layout, so the generic
// q8_0 x q8_1 dot kernels can consume them. A second pass loads the per-block
// fp16 scales as float. The table must be uploaded before the kernel runs.
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_q4_dpt(
    const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
    constexpr int nwarps = mmq_get_nwarps_device();
    constexpr int warp_size = ggml_cuda_get_physical_warp_size();

#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
    int * x_qs = (int *) x_tile;
    float * x_df = (float *) (x_qs + MMQ_TILE_NE_K*2);
#else
    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_DPT, mmq_y);
    int * x_qs = (int *) x_tile;
    float * x_df = (float *) (x_qs + txs.qs);
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)

    constexpr int threads_per_row = MMQ_ITER_K / (4 * QR4_NL);
    constexpr int nrows = warp_size / threads_per_row;
    const int txi = warp_size > threads_per_row ? threadIdx.x % threads_per_row : threadIdx.x;
    const int kbx = txi / QI4_NL;  // block index along the row
    const int kqsx = txi % QI4_NL; // int (4-byte) index within the block

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nrows*nwarps) {
        int i = i0 + (nrows == 1 ? threadIdx.y : threadIdx.y*nrows + threadIdx.x/threads_per_row);

        if (need_check) {
            i = min(i, i_max); // clamp to the last valid row instead of reading OOB
        }

        const block_q4_dpt * bxi = (const block_q4_dpt *) x + kbx0 + i*stride + kbx;

        const int aux_q4 = get_int_b2(bxi->qs, kqsx);
        // Expand 8 nibbles into two ints of 4 int8 values each via the table.
        const int2 v = get_int_from_table_16(aux_q4, q4dpt_levels_cuda);
        const int k0 = kbx * (2 * QI4_NL) + kqsx;

#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
        x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + k0 + 0] = v.x;
        x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + k0 + QI4_NL] = v.y;
#else
        x_qs[i*(2*MMQ_TILE_NE_K + 1) + k0 + 0] = v.x;
        x_qs[i*(2*MMQ_TILE_NE_K + 1) + k0 + QI4_NL] = v.y;
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
    }

    constexpr int blocks_per_tile_x_row = MMQ_TILE_NE_K / QI4_NL;
    constexpr int rows_per_warp = warp_size / blocks_per_tile_x_row;
    const int kbxd = threadIdx.x % blocks_per_tile_x_row;

#pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * rows_per_warp) {
        int i = i0 + threadIdx.y * rows_per_warp + threadIdx.x / blocks_per_tile_x_row;

        if (need_check) {
            i = min(i, i_max);
        }

        const block_q4_dpt * bxi = (const block_q4_dpt *) x + kbx0 + i*stride + kbxd;

#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
        x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kbxd] = __half2float(bxi->d);
#else
        x_df[i*(MMQ_TILE_NE_K/QI4_NL) + i/QI4_NL + kbxd] = __half2float(bxi->d);
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
    }
}
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_iq2_xxs(
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
constexpr int nwarps = mmq_get_nwarps_device();
@ -3447,6 +3519,22 @@ struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_IQ4_NL> {
static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
};
// MMQ dispatch for Q4_DPT: tiles are loaded pre-expanded to int8 through the
// per-tensor level table (load_tiles_q4_dpt), so the generic q8_0 x q8_1 dot
// kernels are reused for both the MMA and DP4A paths.
template <int mmq_x, int mmq_y, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_Q4_DPT> {
    static constexpr int vdr = VDR_Q4_DPT_Q8_1_MMQ;
    static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_dpt<mmq_y, need_check>;
    static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_D4>;
    static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
};
// MMQ dispatch for Q2_DPT.
// NOTE(review): this reuses load_tiles_q4_dpt, which parses the data as
// block_q4_dpt and expands 4-bit nibbles through q4dpt_levels_cuda — that is
// only correct if Q2_DPT truly shares that block layout, which appears to
// conflict with the 2-bit q2dpt_levels_cuda path used by the dequantize and
// mmvq code; confirm. Also, mmq_get_dp4a_tile_x_sizes returns
// MMQ_DP4A_TXS_Q8_0_16 for Q2_DPT while the reused loader writes with the
// MMQ_DP4A_TXS_Q8_0 stride (2*MMQ_TILE_NE_K + 1) — verify these agree.
template <int mmq_x, int mmq_y, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_Q2_DPT> {
    static constexpr int vdr = VDR_Q2_DPT_Q8_1_MMQ;
    static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_dpt<mmq_y, need_check>; // Reuse Q4_DPT loader (same layout)
    static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_D4>;
    static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
};
template <int mmq_x, int mmq_y, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_IQ4_XS> {
static constexpr int vdr = VDR_IQ4_XS_Q8_1_MMQ;

View File

@ -2,6 +2,8 @@
#include "quantize.cuh"
#include "unary.cuh"
#include "vecdotq.cuh"
#include "convert.cuh"
#include "ggml-quants.h"
#include <cstdint>
@ -28,6 +30,11 @@ static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type)
case GGML_TYPE_IQ1_S: return vec_dot_iq1_s_q8_1;
case GGML_TYPE_IQ1_M: return vec_dot_iq1_m_q8_1;
case GGML_TYPE_IQ4_NL: return vec_dot_iq4_nl_q8_1;
case GGML_TYPE_Q4_DPT: return vec_dot_q4_dpt_q8_1;
case GGML_TYPE_Q2_DPT: return vec_dot_q2_dpt_q8_1;
case GGML_TYPE_IQ2_TQ: return vec_dot_iq2_tq_q8_1;
case GGML_TYPE_IQ3_TQ: return vec_dot_iq3_tq_q8_1;
case GGML_TYPE_IQ1_BN: return vec_dot_iq1_bn_q8_1;
case GGML_TYPE_IQ4_XS: return vec_dot_iq4_xs_q8_1;
case GGML_TYPE_IQ3_S: return vec_dot_iq3_s_q8_1;
default: return nullptr;
@ -54,6 +61,11 @@ static constexpr __host__ __device__ int get_vdr_mmvq(ggml_type type) {
case GGML_TYPE_IQ3_XXS: return VDR_IQ3_XXS_Q8_1_MMVQ;
case GGML_TYPE_IQ3_S: return VDR_IQ3_S_Q8_1_MMVQ;
case GGML_TYPE_IQ4_NL: return VDR_IQ4_NL_Q8_1_MMVQ;
case GGML_TYPE_Q4_DPT: return VDR_Q4_DPT_Q8_1_MMVQ;
case GGML_TYPE_Q2_DPT: return VDR_Q2_DPT_Q8_1_MMVQ;
case GGML_TYPE_IQ2_TQ: return VDR_IQ2_TQ_Q8_1_MMVQ;
case GGML_TYPE_IQ3_TQ: return VDR_IQ3_TQ_Q8_1_MMVQ;
case GGML_TYPE_IQ1_BN: return VDR_IQ1_BN_Q8_1_MMVQ;
case GGML_TYPE_IQ4_XS: return VDR_IQ4_XS_Q8_1_MMVQ;
default: return 1;
}
@ -1000,6 +1012,30 @@ static void mul_mat_vec_q_switch_type(
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
break;
case GGML_TYPE_Q4_DPT:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q4_DPT>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
break;
case GGML_TYPE_IQ2_TQ:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ2_TQ>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
break;
case GGML_TYPE_IQ3_TQ:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ3_TQ>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
break;
case GGML_TYPE_IQ1_BN:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ1_BN>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
break;
case GGML_TYPE_IQ4_XS:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ4_XS>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
@ -1029,6 +1065,45 @@ void ggml_cuda_mul_mat_vec_q(
cudaStream_t stream = ctx.stream();
// Set Q4_DPT lookup table from tensor's quant_levels
if (src0->type == GGML_TYPE_Q4_DPT) {
GGML_ASSERT(src0->quant_levels && "Q4_DPT MUL_MAT requires levels (set tensor->quant_levels)");
int8_t * d_q4dpt_levels;
CUDA_CHECK(cudaGetSymbolAddress((void **)&d_q4dpt_levels, q4dpt_levels_cuda));
CUDA_CHECK(cudaMemcpyAsync(d_q4dpt_levels, src0->quant_levels, Q4DPT_N_LEVELS * sizeof(int8_t), cudaMemcpyHostToDevice, stream));
}
// Set Q2_DPT lookup table from tensor's quant_levels
if (src0->type == GGML_TYPE_Q2_DPT) {
GGML_ASSERT(src0->quant_levels && "Q2_DPT MUL_MAT requires levels (set tensor->quant_levels)");
int8_t * d_q2dpt_levels;
CUDA_CHECK(cudaGetSymbolAddress((void **)&d_q2dpt_levels, q2dpt_levels_cuda));
CUDA_CHECK(cudaMemcpyAsync(d_q2dpt_levels, src0->quant_levels, Q2DPT_N_LEVELS * sizeof(int8_t), cudaMemcpyHostToDevice, stream));
}
// Set IQ2_TQ per-tensor grid
if (src0->type == GGML_TYPE_IQ2_TQ) {
GGML_ASSERT(src0->quant_levels && "IQ2_TQ MUL_MAT requires grid (set tensor->quant_levels)");
int8_t * d_grid;
CUDA_CHECK(cudaGetSymbolAddress((void **)&d_grid, iq2tq_grid_cuda));
CUDA_CHECK(cudaMemcpyAsync(d_grid, src0->quant_levels, 64, cudaMemcpyHostToDevice, stream));
}
// Set IQ3_TQ per-tensor grid
if (src0->type == GGML_TYPE_IQ3_TQ) {
GGML_ASSERT(src0->quant_levels && "IQ3_TQ MUL_MAT requires grid (set tensor->quant_levels)");
int8_t * d_grid;
CUDA_CHECK(cudaGetSymbolAddress((void **)&d_grid, iq3tq_grid_cuda));
CUDA_CHECK(cudaMemcpyAsync(d_grid, src0->quant_levels, 128, cudaMemcpyHostToDevice, stream));
}
// Set IQ1_BN per-tensor codebook+scale
if (src0->type == GGML_TYPE_IQ1_BN) {
GGML_ASSERT(src0->quant_levels && "IQ1_BN MUL_MAT requires codebook (set tensor->quant_levels)");
ggml_cuda_set_iq1bn_aux(src0->quant_levels, stream);
}
const size_t ts_src0 = ggml_type_size(src0->type);
const size_t ts_src1 = ggml_type_size(src1->type);
const size_t ts_dst = ggml_type_size(dst->type);

View File

@ -1240,6 +1240,194 @@ static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
return d * sumi;
}
#define VDR_Q4_DPT_Q8_1_MMVQ 2
#define VDR_Q4_DPT_Q8_1_MMQ 4
// Dot product of a slice of one Q4_DPT block against Q8_1 data.
// Q4_DPT stores 4-bit indices into a learned per-tensor table of 16 int8
// levels; the table lives in the device symbol q4dpt_levels_cuda, which is
// uploaded from tensor->quant_levels before the kernel launch.
// kbx selects the quant block, iqs the starting int32 within its qs[].
static __device__ __forceinline__ float vec_dot_q4_dpt_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
const block_q4_dpt * bq4 = (const block_q4_dpt *) vbq + kbx;
const int * q8 = (const int *) bq8_1->qs + iqs;
int sumi = 0;
#pragma unroll
for (int l = 0; l < VDR_Q4_DPT_Q8_1_MMVQ; ++l) {
// Each int32 of qs carries 8 nibbles; get_int_from_table_16 expands them
// through the level table into two packed int32s (v.x/v.y), dotted against
// q8[l + 0] and q8[l + 4] — presumably low vs high nibbles, same scheme as
// vec_dot_iq4_nl_q8_1 above (TODO confirm against get_int_from_table_16).
const int aux_q4 = get_int_b2(bq4->qs, iqs + l);
const int2 v = get_int_from_table_16(aux_q4, q4dpt_levels_cuda);
sumi = ggml_cuda_dp4a(v.x, q8[l + 0], sumi);
sumi = ggml_cuda_dp4a(v.y, q8[l + 4], sumi);
}
// Combine the per-block fp16 scale with the Q8_1 scale (low half of ds).
const float d = __half2float(bq4->d) * __low2float(bq8_1->ds);
return d * sumi;
}
// Q2_DPT: 2-bit quantization with 4 learned levels.
// Map four 2-bit indices — one taken from the low bits of each byte of q2 —
// through the int8 level table. Each looked-up level is returned sign-extended
// in its own int4 component.
// NOTE(review): dp4a consumers treat each component as 4 packed int8 lanes,
// while this returns a single sign-extended level per component — verify this
// matches the CPU-side Q2_DPT packing.
static __device__ __forceinline__ int4 get_int_from_table_4(const int & q2, const int8_t * table) {
    const int i0 =  q2        & 3;
    const int i1 = (q2 >>  8) & 3;
    const int i2 = (q2 >> 16) & 3;
    const int i3 = (q2 >> 24) & 3;
    return make_int4(table[i0], table[i1], table[i2], table[i3]);
}
#define VDR_Q2_DPT_Q8_1_MMVQ 4
#define VDR_Q2_DPT_Q8_1_MMQ 8
// Dot product of a slice of one Q2_DPT block against Q8_1 data.
// Q2_DPT stores 2-bit indices into a learned per-tensor table of 4 int8
// levels (device symbol q2dpt_levels_cuda, uploaded from tensor->quant_levels
// before the kernel launch).
// NOTE(review): each int4 component from get_int_from_table_4 is a single
// sign-extended level, yet ggml_cuda_dp4a consumes it as 4 packed int8 lanes;
// also get_int_b4(bq2->qs, l) does not involve iqs. Both look intentional only
// if the CPU-side packing replicates levels per lane — confirm against the
// quantize_row_q2_dpt reference implementation.
static __device__ __forceinline__ float vec_dot_q2_dpt_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
const block_q2_dpt * bq2 = (const block_q2_dpt *) vbq + kbx;
const int * q8 = (const int *) bq8_1->qs + iqs;
int sumi = 0;
#pragma unroll
for (int l = 0; l < VDR_Q2_DPT_Q8_1_MMVQ; ++l) {
const int aux_q2 = get_int_b4(bq2->qs, l);
const int4 v = get_int_from_table_4(aux_q2, q2dpt_levels_cuda);
sumi = ggml_cuda_dp4a(v.x, q8[l + 0], sumi);
sumi = ggml_cuda_dp4a(v.y, q8[l + 4], sumi);
sumi = ggml_cuda_dp4a(v.z, q8[l + 8], sumi);
sumi = ggml_cuda_dp4a(v.w, q8[l + 12], sumi);
}
// Combine the per-block fp16 scale with the Q8_1 scale (low half of ds).
const float d = __half2float(bq2->d) * __low2float(bq8_1->ds);
return d * sumi;
}
// IQ2_TQ: 2-bit with per-tensor trained 16×4 grid table.
// Unpack four 2-bit indices from qbyte (low bits first), look each up in the
// selected 4-entry grid row, and pack the four int8 results little-endian
// into one int32 suitable for dp4a.
static __device__ __forceinline__ int iq2tq_grid_lookup4(uint8_t qbyte, const int8_t * grid_entry) {
    uint32_t packed = 0;
#pragma unroll
    for (int j = 0; j < 4; ++j) {
        const uint32_t level = (uint8_t) grid_entry[(qbyte >> (2 * j)) & 3];
        packed |= level << (8 * j);
    }
    return (int) packed;
}
#define VDR_IQ2_TQ_Q8_1_MMVQ 1
#define VDR_IQ2_TQ_Q8_1_MMQ 1
// Dot product of one 16-element portion of an IQ2_TQ block against Q8_1 data.
// The per-tensor grid (16 rows × 4 int8 values, 64 bytes) lives in the device
// symbol iq2tq_grid_cuda, uploaded from tensor->quant_levels before launch.
// Each scales[] byte holds two 4-bit grid-row selectors, one per 8-element
// group; qs holds 2-bit in-row indices, 4 per byte.
static __device__ __forceinline__ float vec_dot_iq2_tq_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
const block_iq2_tq * bq = (const block_iq2_tq *) vbq + kbx;
// iqs selects which 16-element portion (0..15): 2 groups of 8 elements
const int q8b = iqs / 2; // Q8_1 block index (0..7)
const int q8off = (iqs & 1) * 4; // int32 offset within Q8_1 block (0 or 4)
// Grid indices for groups iqs*2 and iqs*2+1
const uint8_t sc = bq->scales[iqs];
const int8_t * ge0 = iq2tq_grid_cuda + (sc & 0xF) * 4; // row for group 0 (low nibble)
const int8_t * ge1 = iq2tq_grid_cuda + (sc >> 4) * 4; // row for group 1 (high nibble)
const uint8_t * qs = bq->qs + iqs * 4; // 4 bytes = 16 two-bit indices
const int * q8 = (const int *)bq8_1[q8b].qs + q8off;
int sumi = 0;
// Group 0: 8 elements = 2 bytes qs, 2 int32 Q8_1
sumi = ggml_cuda_dp4a(iq2tq_grid_lookup4(qs[0], ge0), q8[0], sumi);
sumi = ggml_cuda_dp4a(iq2tq_grid_lookup4(qs[1], ge0), q8[1], sumi);
// Group 1: next 8 elements
sumi = ggml_cuda_dp4a(iq2tq_grid_lookup4(qs[2], ge1), q8[2], sumi);
sumi = ggml_cuda_dp4a(iq2tq_grid_lookup4(qs[3], ge1), q8[3], sumi);
// Block scale × constant grid scale × Q8_1 scale (low half of ds).
return __half2float(bq->d) * IQ2TQ_GRID_SCALE * __low2float(bq8_1[q8b].ds) * sumi;
}
// IQ3_TQ: 3-bit with per-tensor trained 16×8 grid table
// The grid (16 rows × 8 int8 values, 128 bytes) lives in the device symbol
// iq3tq_grid_cuda, uploaded from tensor->quant_levels before launch.
// Each scales[] byte holds two 4-bit row selectors, one per 8-element group;
// each group packs 8 × 3-bit in-row indices into 3 consecutive qs bytes.
#define VDR_IQ3_TQ_Q8_1_MMVQ 1
#define VDR_IQ3_TQ_Q8_1_MMQ 1
static __device__ __forceinline__ float vec_dot_iq3_tq_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
const block_iq3_tq * bq = (const block_iq3_tq *) vbq + kbx;
const int q8b = iqs / 2; // Q8_1 block index
const int q8off = (iqs & 1) * 4; // int32 offset within the Q8_1 block
const uint8_t sc = bq->scales[iqs];
const int8_t * ge0 = iq3tq_grid_cuda + (sc & 0xF) * 8; // row for group 0
const int8_t * ge1 = iq3tq_grid_cuda + (sc >> 4) * 8; // row for group 1
const int * q8 = (const int *)bq8_1[q8b].qs + q8off;
int sumi = 0;
// Group 0: 8 elements, 3 bytes of qs
{
const uint8_t * qs = bq->qs + (iqs * 2) * 3;
// 24 bits = 8 indices of 3 bits each, little-endian within `bits`.
const uint32_t bits = qs[0] | ((uint32_t)qs[1] << 8) | ((uint32_t)qs[2] << 16);
// Pack the 8 looked-up int8 levels into two int32s for dp4a.
int v0 = (uint8_t)ge0[(bits >> 0) & 7] | ((uint32_t)(uint8_t)ge0[(bits >> 3) & 7] << 8)
| ((uint32_t)(uint8_t)ge0[(bits >> 6) & 7] << 16) | ((uint32_t)(uint8_t)ge0[(bits >> 9) & 7] << 24);
sumi = ggml_cuda_dp4a(v0, q8[0], sumi);
int v1 = (uint8_t)ge0[(bits >> 12) & 7] | ((uint32_t)(uint8_t)ge0[(bits >> 15) & 7] << 8)
| ((uint32_t)(uint8_t)ge0[(bits >> 18) & 7] << 16) | ((uint32_t)(uint8_t)ge0[(bits >> 21) & 7] << 24);
sumi = ggml_cuda_dp4a(v1, q8[1], sumi);
}
// Group 1: next 8 elements, next 3 bytes of qs
{
const uint8_t * qs = bq->qs + (iqs * 2 + 1) * 3;
const uint32_t bits = qs[0] | ((uint32_t)qs[1] << 8) | ((uint32_t)qs[2] << 16);
int v0 = (uint8_t)ge1[(bits >> 0) & 7] | ((uint32_t)(uint8_t)ge1[(bits >> 3) & 7] << 8)
| ((uint32_t)(uint8_t)ge1[(bits >> 6) & 7] << 16) | ((uint32_t)(uint8_t)ge1[(bits >> 9) & 7] << 24);
sumi = ggml_cuda_dp4a(v0, q8[2], sumi);
int v1 = (uint8_t)ge1[(bits >> 12) & 7] | ((uint32_t)(uint8_t)ge1[(bits >> 15) & 7] << 8)
| ((uint32_t)(uint8_t)ge1[(bits >> 18) & 7] << 16) | ((uint32_t)(uint8_t)ge1[(bits >> 21) & 7] << 24);
sumi = ggml_cuda_dp4a(v1, q8[3], sumi);
}
// Block scale × constant grid scale × Q8_1 scale (low half of ds).
return __half2float(bq->d) * IQ3TQ_GRID_SCALE * __low2float(bq8_1[q8b].ds) * sumi;
}
// IQ1_BN: 8D vector quantized with per-tensor trained 4096-entry codebook
// The codebook is uploaded via ggml_cuda_set_iq1bn_aux() before launch.
// Each group of 8 weights is one 12-bit codebook index; two indices are
// packed into every 3 bytes of qs.
#define VDR_IQ1_BN_Q8_1_MMVQ 1
#define VDR_IQ1_BN_Q8_1_MMQ 1
static __device__ __forceinline__ float vec_dot_iq1_bn_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
const block_iq1_bn * bq = (const block_iq1_bn *) vbq + kbx;
// iqs = 0..15, each thread handles 2 groups (16 elements)
const int q8b = iqs / 2; // Q8_1 block index
const int q8off = (iqs & 1) * 4; // int32 offset within the Q8_1 block
// Extract two 12-bit codebook indices from qs[3*iqs .. 3*iqs+2]
const uint8_t * qs = bq->qs + 3 * iqs;
const int ci0 = qs[0] | (((int)qs[1] & 0x0F) << 8); // byte0 + low nibble of byte1
const int ci1 = (qs[1] >> 4) | ((int)qs[2] << 4); // high nibble of byte1 + byte2
// Each codebook entry is IQ1BN_CODEBOOK_DIM int8 values — presumably 8,
// read as two int32s below; the int* cast assumes 4-byte-aligned entries
// (TODO confirm IQ1BN_CODEBOOK_DIM and codebook alignment).
const int * cb0 = (const int *)(iq1bn_codebook_cuda + ci0 * IQ1BN_CODEBOOK_DIM);
const int * cb1 = (const int *)(iq1bn_codebook_cuda + ci1 * IQ1BN_CODEBOOK_DIM);
const int * q8 = (const int *)bq8_1[q8b].qs + q8off;
int sumi = 0;
sumi = ggml_cuda_dp4a(cb0[0], q8[0], sumi);
sumi = ggml_cuda_dp4a(cb0[1], q8[1], sumi);
sumi = ggml_cuda_dp4a(cb1[0], q8[2], sumi);
sumi = ggml_cuda_dp4a(cb1[1], q8[3], sumi);
// Block scale × constant codebook scale × Q8_1 scale (low half of ds).
return __half2float(bq->d) * IQ1BN_GRID_SCALE * __low2float(bq8_1[q8b].ds) * (float)sumi;
}
#define VDR_IQ4_XS_Q8_1_MMVQ 4
#define VDR_IQ4_XS_Q8_1_MMQ 4

File diff suppressed because it is too large Load Diff

View File

@ -27,6 +27,7 @@ GGML_API void quantize_row_nvfp4_ref(const float * GGML_RESTRICT x, block_nvfp4
GGML_API void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
GGML_API void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
GGML_API void quantize_row_q3_kpt_ref(const float * GGML_RESTRICT x, block_q3_kpt * GGML_RESTRICT y, int64_t k);
GGML_API void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
GGML_API void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
GGML_API void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
@ -42,36 +43,37 @@ GGML_API void quantize_row_iq3_s_ref (const float * GGML_RESTRICT x, block_iq3_
GGML_API void quantize_row_iq2_s_ref (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
// Dequantization
GGML_API void dequantize_row_q1_0(const block_q1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
//GGML_API void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_q1_0(const block_q1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
//GGML_API void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_mxfp4(const block_mxfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_nvfp4(const block_nvfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_mxfp4(const block_mxfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_nvfp4(const block_nvfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_q3_kpt(const block_q3_kpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
GGML_API size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
@ -82,6 +84,14 @@ GGML_API size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RE
GGML_API size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
GGML_API size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
GGML_API size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
GGML_API size_t quantize_q3_kpt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
// Q3_KPT level management
GGML_API void q3kpt_set_levels(const float * levels);
GGML_API const float * q3kpt_get_levels(void);
GGML_API void q3kpt_free_levels(void);
GGML_API void q3kpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row,
const float * imatrix, float levels_out[Q3KPT_N_LEVELS]);
GGML_API size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
GGML_API size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
@ -102,6 +112,198 @@ GGML_API size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTR
GGML_API size_t quantize_mxfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
GGML_API size_t quantize_nvfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
GGML_API void quantize_row_q3_pt_ref(const float * GGML_RESTRICT x, block_q3_pt * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_q3_pt(const block_q3_pt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API size_t quantize_q3_pt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
// Q3_PT levels management (per-tensor Lloyd-Max levels in [0,1])
GGML_API void q3pt_set_levels(const float * levels); // set global levels (quantization)
GGML_API const float * q3pt_get_levels(void);
GGML_API void q3pt_free_levels(void);
// Per-tensor levels registry (inference — range-based lookup by data address)
// Train 8 Lloyd-Max levels from tensor data via weighted k-means on affine-normalized
// 16-element sub-block values. Also sets the global levels via q3pt_set_levels().
// data: float array [nrow * n_per_row], imatrix: importance weights [n_per_row] or NULL.
GGML_API void q3pt_train_levels(const float * data, int64_t nrow, int64_t n_per_row,
const float * imatrix, float levels_out[8]);
// Q4_DPT: IQ4_NL with learned per-tensor int8 levels
GGML_API void dequantize_row_q4_dpt(const block_q4_dpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void quantize_row_q4_dpt_ref(const float * GGML_RESTRICT x, block_q4_dpt * GGML_RESTRICT y, int64_t k);
GGML_API size_t quantize_q4_dpt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
// Q4_DPT levels management (per-tensor Lloyd-Max int8 levels)
GGML_API void q4dpt_set_levels(const int8_t * levels);
GGML_API const int8_t * q4dpt_get_levels(void);
GGML_API void q4dpt_free_levels(void);
// Q2_DPT: 2-bit with learned per-tensor int8 levels
GGML_API void dequantize_row_q2_dpt(const block_q2_dpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void quantize_row_q2_dpt_ref(const float * GGML_RESTRICT x, block_q2_dpt * GGML_RESTRICT y, int64_t k);
GGML_API size_t quantize_q2_dpt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
// Q2_DPT levels management (per-tensor Lloyd-Max int8 levels)
GGML_API void q2dpt_set_levels(const int8_t * levels);
GGML_API const int8_t * q2dpt_get_levels(void);
GGML_API void q2dpt_free_levels(void);
GGML_API void q2dpt_set_quant_strategy(int s);
// Train 4 Lloyd-Max int8 levels from tensor data for Q2_DPT.
// Bins normalized values (x/amax) in [-1,1], runs weighted k-means, rounds to sorted int8[4].
// Also sets the global levels via q2dpt_set_levels().
GGML_API void q2dpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row,
const float * imatrix, int8_t levels_out[Q2DPT_N_LEVELS]);
// Q2_KPT: Q2_K with learned per-tensor float levels
GGML_API void dequantize_row_q2_kpt(const block_q2_kpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void quantize_row_q2_kpt_ref(const float * GGML_RESTRICT x, block_q2_kpt * GGML_RESTRICT y, int64_t k);
GGML_API size_t quantize_q2_kpt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t start_row, int64_t nrows, int64_t n_per_row, const float * imatrix);
// Q2_KPT levels management (per-tensor float levels in [0,1])
GGML_API void q2kpt_set_levels(const float * levels);
GGML_API const float * q2kpt_get_levels(void);
GGML_API void q2kpt_free_levels(void);
// Prepare levels buffer for a tensor with given dimensions (call before parallel quantization)
GGML_API void q2kpt_prepare_levels(int64_t nrows, int64_t n_per_row);
// Train 4 Lloyd-Max float levels from tensor data for Q2_KPT.
// Bins normalized sub-block values in [0,1], runs weighted k-means for 4 centroids.
// Also sets the global levels via q2kpt_set_levels().
GGML_API void q2kpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row,
const float * imatrix, float levels_out[Q2KPT_N_LEVELS]);
// Train per-row levels for all rows: writes nrow * Q2KPT_N_LEVELS floats to out_levels.
GGML_API void q2kpt_train_all_row_levels(const float * data, int64_t nrow, int64_t n_per_row,
const float * imatrix, float * out_levels);
// IQ2_TQ: 2-bit scalar with per-group asymmetric grid (2.5625 bpw)
GGML_API void dequantize_row_iq2_tq(const block_iq2_tq * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void quantize_row_iq2_tq_ref(const float * GGML_RESTRICT x, block_iq2_tq * GGML_RESTRICT y, int64_t k);
GGML_API size_t quantize_iq2_tq(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
GGML_API void iq2tq_train_grid(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, int8_t grid_out[64]);
GGML_API void iq2tq_set_grid(const int8_t grid[64]);
GGML_API const int8_t * iq2tq_get_grid(void);
// IQ3_TQ: 3-bit scalar with per-group asymmetric grid (3.5625 bpw)
GGML_API void dequantize_row_iq3_tq(const block_iq3_tq * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void quantize_row_iq3_tq_ref(const float * GGML_RESTRICT x, block_iq3_tq * GGML_RESTRICT y, int64_t k);
GGML_API size_t quantize_iq3_tq(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
GGML_API void iq3tq_train_grid(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, int8_t grid_out[IQ3TQ_GRID_SIZE]);
GGML_API void iq3tq_set_grid(const int8_t grid[IQ3TQ_GRID_SIZE]);
GGML_API const int8_t * iq3tq_get_grid(void);
// IQ1_BN: 8D vector quantized with per-tensor trained codebook (1.5625 bpw)
GGML_API void dequantize_row_iq1_bn(const block_iq1_bn * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
GGML_API void quantize_row_iq1_bn_ref(const float * GGML_RESTRICT x, block_iq1_bn * GGML_RESTRICT y, int64_t k);
GGML_API size_t quantize_iq1_bn(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
GGML_API void iq1bn_train_codebook(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, int8_t aux_out[IQ1BN_AUX_SIZE], int nthread);
GGML_API void iq1bn_set_aux(const int8_t aux[IQ1BN_AUX_SIZE]);
GGML_API const int8_t * iq1bn_get_aux(void);
// Train 16 Lloyd-Max int8 levels from tensor data.
// Bins normalized values (x/amax) in [-1,1], runs weighted k-means, rounds to sorted int8[16].
// Also sets the global levels via q4dpt_set_levels().
GGML_API void q4dpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row,
const float * imatrix, int8_t levels_out[Q4DPT_N_LEVELS]);
// NOTE: this span previously repeated, byte for byte, the q3_pt / q4_dpt /
// q2_dpt / q2_kpt / iq2_tq / iq3_tq / iq1_bn prototypes and their comments
// that already appear immediately above. The redundant second copy has been
// removed; all declarations remain available from the first copy.
GGML_API void iq2xs_init_impl(enum ggml_type type);
GGML_API void iq2xs_free_impl(enum ggml_type type);
GGML_API void iq3xs_init_impl(int grid_size);

View File

@ -12273,7 +12273,7 @@ static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml
ggml_quantize_chunk(quant, from, to, 0, 1, ne, nullptr);
}
static void ggml_vk_dequantize_data(const void * from, float * to, size_t ne, ggml_type quant) {
static void ggml_vk_dequantize_data(const void * from, float * to, size_t ne, ggml_type quant, const void * levels = nullptr) {
if (quant == GGML_TYPE_F32) {
memcpy(to, from, sizeof(float) * ne);
return;
@ -12283,7 +12283,7 @@ static void ggml_vk_dequantize_data(const void * from, float * to, size_t ne, gg
ggml_to_float_t dequant_fn = tt->to_float;
dequant_fn(from, to, ne);
dequant_fn(from, to, ne, levels);
}
static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) {

View File

@ -456,6 +456,11 @@ void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
}
}
static void ggml_fp16_to_fp32_row_leveled(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t n, const void * levels) {
GGML_UNUSED(levels);
ggml_fp16_to_fp32_row((const ggml_fp16_t *)x, y, n);
}
void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
int i = 0;
for (; i < n; ++i) {
@ -470,6 +475,11 @@ void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
}
}
static void ggml_bf16_to_fp32_row_leveled(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t n, const void * levels) {
GGML_UNUSED(levels);
ggml_bf16_to_fp32_row((const ggml_bf16_t *)x, y, n);
}
void ggml_fp32_to_bf16_row_ref(const float * x, ggml_bf16_t * y, int64_t n) {
for (int i = 0; i < n; i++) {
y[i] = ggml_compute_fp32_to_bf16(x[i]);
@ -648,7 +658,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
.blck_size = 1,
.type_size = sizeof(ggml_fp16_t),
.is_quantized = false,
.to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row,
.to_float = ggml_fp16_to_fp32_row_leveled,
.from_float_ref = (ggml_from_float_t) ggml_fp32_to_fp16_row,
},
[GGML_TYPE_Q1_0] = {
@ -857,7 +867,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
.blck_size = 1,
.type_size = sizeof(ggml_bf16_t),
.is_quantized = false,
.to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row,
.to_float = ggml_bf16_to_fp32_row_leveled,
.from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row_ref,
},
[31] = { // GGML_TYPE_Q4_0_4_4
@ -912,6 +922,71 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
.type_size = 0,
.is_quantized = false,
},
[GGML_TYPE_Q3_PT] = {
.type_name = "q3_pt",
.blck_size = QK_K,
.type_size = sizeof(block_q3_pt),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q3_pt,
.from_float_ref = (ggml_from_float_t) quantize_row_q3_pt_ref,
},
[GGML_TYPE_Q3_KPT] = {
.type_name = "q3_kpt",
.blck_size = QK_K,
.type_size = sizeof(block_q3_kpt),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q3_kpt,
.from_float_ref = (ggml_from_float_t) quantize_row_q3_kpt_ref,
},
[GGML_TYPE_Q4_DPT] = {
.type_name = "q4_dpt",
.blck_size = QK4_NL,
.type_size = sizeof(block_q4_dpt),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q4_dpt,
.from_float_ref = (ggml_from_float_t) quantize_row_q4_dpt_ref,
},
[GGML_TYPE_Q2_DPT] = {
.type_name = "q2_dpt",
.blck_size = QK2_DPT,
.type_size = sizeof(block_q2_dpt),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q2_dpt,
.from_float_ref = (ggml_from_float_t) quantize_row_q2_dpt_ref,
},
// Q2_K layout with learned per-tensor float levels (2.625 bpw); levels are per-row.
[GGML_TYPE_Q2_KPT] = {
.type_name = "q2_kpt",
.blck_size = QK_K,
.type_size = sizeof(block_q2_kpt),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q2_kpt,
.from_float_ref = (ggml_from_float_t) quantize_row_q2_kpt_ref,
// NOTE(review): levels_row_stride is declared on struct ggml_type_traits_cpu
// (ggml-cpu.h), not on struct ggml_type_traits — confirm this field exists on
// this struct, otherwise this designated initializer will not compile.
.levels_row_stride = 0, // computed dynamically: (ne[0]/256)*4*sizeof(float)
},
[GGML_TYPE_IQ2_TQ] = {
.type_name = "iq2_tq",
.blck_size = QK_K,
.type_size = sizeof(block_iq2_tq),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq2_tq,
.from_float_ref = (ggml_from_float_t) quantize_row_iq2_tq_ref,
},
[GGML_TYPE_IQ3_TQ] = {
.type_name = "iq3_tq",
.blck_size = QK_K,
.type_size = sizeof(block_iq3_tq),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq3_tq,
.from_float_ref = (ggml_from_float_t) quantize_row_iq3_tq_ref,
},
[GGML_TYPE_IQ1_BN] = {
.type_name = "iq1_bn",
.blck_size = QK_K,
.type_size = sizeof(block_iq1_bn),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_iq1_bn,
.from_float_ref = (ggml_from_float_t) quantize_row_iq1_bn_ref,
},
};
const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) {
@ -1412,6 +1487,10 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break;
case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break;
case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break;
case GGML_FTYPE_MOSTLY_Q3_PT: wtype = GGML_TYPE_Q3_PT; break;
case GGML_FTYPE_MOSTLY_Q3_KPT: wtype = GGML_TYPE_Q3_KPT; break;
case GGML_FTYPE_MOSTLY_Q4_DPT: wtype = GGML_TYPE_Q4_DPT; break;
case GGML_FTYPE_MOSTLY_Q2_KPT: wtype = GGML_TYPE_Q2_KPT; break;
case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
}
@ -7607,6 +7686,13 @@ void ggml_quantize_init(enum ggml_type type) {
case GGML_TYPE_IQ1_M: iq2xs_init_impl(type); break;
case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
case GGML_TYPE_IQ3_S: iq3xs_init_impl(512); break;
case GGML_TYPE_IQ2_TQ: break; // per-tensor grid stored in tensor->quant_levels
case GGML_TYPE_IQ3_TQ: break; // per-tensor grid stored in tensor->quant_levels
case GGML_TYPE_IQ1_BN: break; // per-tensor codebook stored in tensor->quant_levels
case GGML_TYPE_Q3_PT: break; // levels stored in tensor->quant_levels
case GGML_TYPE_Q3_KPT: break; // levels stored in tensor->quant_levels
case GGML_TYPE_Q4_DPT: break; // levels stored in tensor->quant_levels
case GGML_TYPE_Q2_KPT: break; // levels stored in tensor->quant_levels
default: // nothing
break;
}
@ -7685,6 +7771,13 @@ size_t ggml_quantize_chunk(
case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q3_PT: result = quantize_q3_pt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q3_KPT: result = quantize_q3_kpt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_DPT: result = quantize_q4_dpt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q2_KPT: result = quantize_q2_kpt (src + start, (char *) dst + start_row * row_size, start_row, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ2_TQ: result = quantize_iq2_tq (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ3_TQ: result = quantize_iq3_tq (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ1_BN: result = quantize_iq1_bn (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_F16:
{
size_t elemsize = sizeof(ggml_fp16_t);

View File

@ -1331,37 +1331,63 @@ struct gguf_writer_base {
if (kv.is_array) {
write(GGUF_TYPE_ARRAY);
write(kv.get_type());
const enum gguf_type elem_type = kv.get_type();
write(elem_type);
write(ne);
// Write array element data based on element type
switch (elem_type) {
case GGUF_TYPE_UINT8:
case GGUF_TYPE_INT8:
case GGUF_TYPE_UINT16:
case GGUF_TYPE_INT16:
case GGUF_TYPE_UINT32:
case GGUF_TYPE_INT32:
case GGUF_TYPE_FLOAT32:
case GGUF_TYPE_UINT64:
case GGUF_TYPE_INT64:
case GGUF_TYPE_FLOAT64: {
// Write raw bytes inline for array data
for (size_t i = 0; i < kv.data.size(); ++i) {
write(kv.data[i]);
}
} break;
case GGUF_TYPE_BOOL: {
for (size_t i = 0; i < ne; ++i) {
write(kv.get_val<bool>(i));
}
} break;
case GGUF_TYPE_STRING: {
for (size_t i = 0; i < ne; ++i) {
write(kv.get_val<std::string>(i));
}
} break;
case GGUF_TYPE_ARRAY:
default: GGML_ABORT("invalid array element type");
}
} else {
write(kv.get_type());
}
switch (kv.get_type()) {
case GGUF_TYPE_UINT8:
case GGUF_TYPE_INT8:
case GGUF_TYPE_UINT16:
case GGUF_TYPE_INT16:
case GGUF_TYPE_UINT32:
case GGUF_TYPE_INT32:
case GGUF_TYPE_FLOAT32:
case GGUF_TYPE_UINT64:
case GGUF_TYPE_INT64:
case GGUF_TYPE_FLOAT64: {
write(kv.data);
} break;
case GGUF_TYPE_BOOL: {
for (size_t i = 0; i < ne; ++i) {
write(kv.get_val<bool>(i));
}
} break;
case GGUF_TYPE_STRING: {
for (size_t i = 0; i < ne; ++i) {
write(kv.get_val<std::string>(i));
}
} break;
case GGUF_TYPE_ARRAY:
default: GGML_ABORT("invalid type");
switch (kv.get_type()) {
case GGUF_TYPE_UINT8:
case GGUF_TYPE_INT8:
case GGUF_TYPE_UINT16:
case GGUF_TYPE_INT16:
case GGUF_TYPE_UINT32:
case GGUF_TYPE_INT32:
case GGUF_TYPE_FLOAT32:
case GGUF_TYPE_UINT64:
case GGUF_TYPE_INT64:
case GGUF_TYPE_FLOAT64: {
write(kv.data);
} break;
case GGUF_TYPE_BOOL: {
write(kv.get_val<bool>(0));
} break;
case GGUF_TYPE_STRING: {
write(kv.get_val<std::string>(0));
} break;
case GGUF_TYPE_ARRAY:
default: GGML_ABORT("invalid type");
}
}
}

File diff suppressed because it is too large Load Diff

View File

@ -155,6 +155,14 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors
LLAMA_FTYPE_MOSTLY_NVFP4 = 39, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q1_0 = 40, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q3_PT = 41, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q3_KPT = 42, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_DPT = 43, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q2_KPT = 44, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ2_TQ = 45, // except 1d tensors, trellis quantized with RNG codebook
LLAMA_FTYPE_MOSTLY_IQ3_TQ = 46, // except 1d tensors, 3-bit with per-tensor trained grid
LLAMA_FTYPE_MOSTLY_IQ1_BN = 47, // except 1d tensors, 8D vector quantized with trained codebook
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
};

View File

@ -157,8 +157,8 @@ int main(int argc, char** argv) {
t1 = std::chrono::high_resolution_clock::now();
float fs;
if (type == 0) funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1);
else funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1);
if (type == 0) funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1, nullptr);
else funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1, nullptr);
t2 = std::chrono::high_resolution_clock::now();
t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
if (iloop > 3) ggml.addResult(fs, t);

View File

@ -285,8 +285,8 @@ int main(int argc, char** argv) {
else {
const auto * vdot = ggml_get_type_traits_cpu(funcs_cpu->vec_dot_type);
vdot->from_float(y1.data(), q8.data(), kVecSize);
if (useQ4_1) funcs_cpu->vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
else funcs_cpu->vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
if (useQ4_1) funcs_cpu->vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1, nullptr);
else funcs_cpu->vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1, nullptr);
}
sumq += result;
t2 = std::chrono::high_resolution_clock::now();

604
scripts/analyze-ffn-down.py Normal file
View File

@ -0,0 +1,604 @@
#!/usr/bin/env python3
"""Deep analysis of WHY ffn_down is hard to quantize.
Compares structural properties of all weight and activation tensors.
"""
import numpy as np
import struct
import sys
import os
DATA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "data")
def load_f32_tensor(name):
    """Load a binary tensor from DATA_DIR/name.

    File format: two little-endian int64 values (nrow, ncol) followed by
    nrow*ncol float32 values.

    Returns:
        np.ndarray of shape (nrow, ncol), dtype float32.
    """
    path = os.path.join(DATA_DIR, name)
    with open(path, "rb") as f:
        header = f.read(16)
        # Guard against a truncated header before unpacking ('qq' needs 16 bytes).
        assert len(header) == 16, f"{name}: truncated header ({len(header)} bytes)"
        nrow, ncol = struct.unpack("qq", header)
        data = np.frombuffer(f.read(), dtype=np.float32)
    # Include the filename so a size mismatch is traceable to a specific dump.
    assert len(data) == nrow * ncol, f"{name}: expected {nrow * ncol}, got {len(data)}"
    return data.reshape(nrow, ncol)
def stats(label, arr):
    """Print comprehensive statistics for an array and return a summary dict.

    Args:
        label: section header printed before the statistics.
        arr: numpy array of any shape; flattened before analysis.

    Returns:
        dict with keys "mean", "std", "skew", "kurt", "min", "max".
    """
    a = arr.ravel()
    # Hoist the moments: the original recomputed a.mean()/a.std() on every use.
    mean = a.mean()
    std = a.std()
    # Standardized values, guarded against constant arrays (std == 0).
    z = (a - mean) / (std + 1e-10)
    print(f" {label}:")
    print(f" shape={arr.shape}, n={len(a)}")
    print(f" mean={mean:.6f}, std={std:.6f}")
    print(f" min={a.min():.6f}, max={a.max():.6f}")
    print(f" median={np.median(a):.6f}")
    print(
        f" |mean|/std = {abs(mean) / (std + 1e-10):.4f} (offset-to-spread ratio)"
    )
    # Excess kurtosis: heavy-tailedness relative to a Gaussian (which scores 0).
    kurt = np.mean(z ** 4) - 3.0
    # Skewness: asymmetry of the distribution around its mean.
    skew = np.mean(z ** 3)
    print(f" skewness={skew:.4f}, excess_kurtosis={kurt:.4f}")
    # Percentile ranges
    pcts = np.percentile(a, [0.1, 1, 5, 25, 50, 75, 95, 99, 99.9])
    print(
        f" percentiles: 0.1%={pcts[0]:.4f}, 1%={pcts[1]:.4f}, 5%={pcts[2]:.4f}, "
        f"25%={pcts[3]:.4f}, 50%={pcts[4]:.4f}, 75%={pcts[5]:.4f}, "
        f"95%={pcts[6]:.4f}, 99%={pcts[7]:.4f}, 99.9%={pcts[8]:.4f}"
    )
    # Sparsity: fraction of entries that are near zero relative to the spread.
    near_zero = np.sum(np.abs(a) < 0.001 * std) / len(a)
    print(f" fraction |x| < 0.001*std: {near_zero:.4f}")
    return {
        "mean": mean,
        "std": std,
        "skew": skew,
        "kurt": kurt,
        "min": a.min(),
        "max": a.max(),
    }
# ============================================================================
# 1. BASIC WEIGHT TENSOR COMPARISON
# ============================================================================
print("=" * 80)
print("SECTION 1: WEIGHT TENSOR GLOBAL STATISTICS")
print("=" * 80)
tensors = {
"ffn_gate": ("blk_0_ffn_gate_weight.f32bin", "9728x2560 (wide→narrow proj)"),
"ffn_up": ("blk_0_ffn_up_weight.f32bin", "9728x2560 (wide→narrow proj)"),
"ffn_down": ("blk_0_ffn_down_weight.f32bin", "2560x9728 (narrow→wide proj)"),
"attn_q": ("blk_0_attn_q_weight.f32bin", "4096x2560"),
"attn_k": ("blk_0_attn_k_weight.f32bin", "1024x2560"),
"attn_v": ("blk_0_attn_v_weight.f32bin", "1024x2560"),
"attn_out": ("blk_0_attn_output_weight.f32bin", "2560x4096"),
}
weight_data = {}
for name, (fname, desc) in tensors.items():
try:
W = load_f32_tensor(fname)
print(f"\n{'' * 70}")
print(f" {name} [{desc}] — file: {fname}")
weight_data[name] = W
stats(name, W)
except Exception as e:
print(f" {name}: SKIP ({e})")
# ============================================================================
# 2. ROW-LEVEL STATISTICS (each row is a neuron output)
# ============================================================================
print("\n" + "=" * 80)
print("SECTION 2: ROW-LEVEL VARIABILITY (per-neuron weight statistics)")
print("=" * 80)
print(" Each row of the weight matrix produces one output dimension.")
print(" High row-to-row variability in mean/std means the quantizer")
print(" must handle very different distributions across rows.\n")
for name, W in weight_data.items():
row_means = W.mean(axis=1)
row_stds = W.std(axis=1)
row_ranges = W.max(axis=1) - W.min(axis=1)
print(f"\n {name} ({W.shape[0]} rows × {W.shape[1]} cols):")
print(
f" Row means: mean={row_means.mean():.6f}, std={row_means.std():.6f}, "
f"range=[{row_means.min():.6f}, {row_means.max():.6f}]"
)
print(
f" Row stds: mean={row_stds.mean():.6f}, std={row_stds.std():.6f}, "
f"range=[{row_stds.min():.6f}, {row_stds.max():.6f}]"
)
print(f" Row ranges: mean={row_ranges.mean():.6f}, std={row_ranges.std():.6f}")
print(
f" RowMeans CV (std/mean): {row_means.std() / (abs(row_means.mean()) + 1e-10):.4f}"
)
print(f" RowStds CV: {row_stds.std() / (row_stds.mean() + 1e-10):.4f}")
# ============================================================================
# 3. GROUP-LEVEL ANALYSIS (16-element groups, like Q2_K)
# ============================================================================
print("\n" + "=" * 80)
print("SECTION 3: GROUP-LEVEL ANALYSIS (16-element groups)")
print("=" * 80)
print(" Quantization works on 16-element groups. Key question:")
print(" How much does each group need its own OFFSET (dmin)?\n")
GS = 16
for name, W in weight_data.items():
# Look at first 256 rows for speed
nr = min(W.shape[0], 256)
nc = W.shape[1]
group_means = []
group_stds = []
group_ranges = []
group_offsets = [] # |mean| / range — how important is the offset
for r in range(nr):
for g_start in range(0, nc, GS):
g = W[r, g_start : g_start + GS]
gm = g.mean()
gs = g.std()
gr = g.max() - g.min()
gmin = g.min()
group_means.append(gm)
group_stds.append(gs)
group_ranges.append(gr)
# Offset importance: how large is the group mean relative to its range?
# If this is high, offset (dmin) matters a lot
if gr > 1e-10:
group_offsets.append(abs(gm) / gr)
else:
group_offsets.append(0)
gm = np.array(group_means)
gs = np.array(group_stds)
gr = np.array(group_ranges)
go = np.array(group_offsets)
print(f"\n {name} ({len(group_means)} groups):")
print(
f" Group mean: mean={gm.mean():.6f}, std={gm.std():.6f}, "
f"range=[{gm.min():.6f}, {gm.max():.6f}]"
)
print(f" Group std: mean={gs.mean():.6f}, std={gs.std():.6f}")
print(f" Group range: mean={gr.mean():.6f}, std={gr.std():.6f}")
print(f" *** OFFSET IMPORTANCE (|group_mean| / range) ***")
print(
f" mean={go.mean():.4f}, median={np.median(go):.4f}, "
f"p90={np.percentile(go, 90):.4f}, max={go.max():.4f}"
)
print(f" fraction with offset > 0.1: {np.mean(go > 0.1):.3f}")
print(f" fraction with offset > 0.2: {np.mean(go > 0.2):.3f}")
print(f" fraction with offset > 0.3: {np.mean(go > 0.3):.3f}")
# How well does zeroing the min (Q2_K style, clamping min to 0) work?
# vs keeping the actual min
mse_no_offset = 0 # Assume uniform 4 levels [0,1,2,3] * scale
mse_with_offset = 0 # Assume uniform 4 levels [0,1,2,3] * scale + offset
for r in range(nr):
for g_start in range(0, nc, GS):
g = W[r, g_start : g_start + GS]
gmin = g.min()
gmax = g.max()
gr = gmax - gmin
if gr < 1e-10:
continue
# No offset: clamp min to 0, scale = max/3
if gmin > 0:
scale_no = gmax / 3.0
min_no = 0
else:
scale_no = gmax / 3.0
min_no = 0 # lose the negative offset
# Actually use (gmax - 0)/3 but we're clamping gmin to 0
# Better: use actual min/max
scale_w = gr / 3.0
min_w = gmin
for val in g:
# No offset quantization
norm_no = val / (scale_no + 1e-10)
idx_no = max(0, min(3, int(round(norm_no))))
recon_no = scale_no * idx_no
mse_no_offset += (val - recon_no) ** 2
# With offset quantization
norm_w = (val - min_w) / (scale_w + 1e-10)
idx_w = max(0, min(3, int(round(norm_w))))
recon_w = min_w + scale_w * idx_w
mse_with_offset += (val - recon_w) ** 2
total_elements = nr * nc
rmse_no = np.sqrt(mse_no_offset / total_elements)
rmse_w = np.sqrt(mse_with_offset / total_elements)
improvement = (rmse_no - rmse_w) / rmse_no * 100
print(f" Quant RMSE (no offset): {rmse_no:.6f}")
print(f" Quant RMSE (with offset): {rmse_w:.6f}")
print(f" Offset benefit: {improvement:.1f}% RMSE reduction")
# ============================================================================
# 4. ACTIVATION ANALYSIS
# ============================================================================
print("\n" + "=" * 80)
print("SECTION 4: ACTIVATION DISTRIBUTION COMPARISON")
print("=" * 80)
activations = {
"ffn_input (gate/up)": "act_blk0_ffn_input.f32bin",
"ffn_down_input (swiglu)": "act_blk0_ffn_down_input.f32bin",
"attn_input (q/k/v)": "act_blk0_attn_input.f32bin",
"attn_output_input": "act_blk0_attn_output_input.f32bin",
}
act_data = {}
for name, fname in activations.items():
    try:
        A = load_f32_tensor(fname)
        act_data[name] = A
        # Fix: the separator was `'' * 70` (empty string repeated — prints nothing);
        # the rule character was evidently lost in encoding. Use a visible dash rule.
        print(f"\n{'-' * 70}")
        print(f" {name} — {fname}")
        stats(name, A)
    except Exception as e:
        # Missing/short dump files are expected in partial captures — report and continue.
        print(f" {name}: SKIP ({e})")
# ============================================================================
# 5. THE CRITICAL QUESTION: PER-DIMENSION ACTIVATION MAGNITUDE
# ============================================================================
print("\n" + "=" * 80)
print("SECTION 5: PER-DIMENSION ACTIVATION POWER (per-column RMS)")
print("=" * 80)
print(" If activation dimensions have very different magnitudes,")
print(" the quantization error in each weight dimension is weighted differently.")
print(" Dimensions with high activation power amplify weight errors.\n")
for name, A in act_data.items():
col_rms = np.sqrt(np.mean(A**2, axis=0)) # RMS per column (dimension)
print(f"\n {name} ({A.shape[1]} dimensions):")
print(f" Col RMS: mean={col_rms.mean():.6f}, std={col_rms.std():.6f}")
print(f" Col RMS range: [{col_rms.min():.6f}, {col_rms.max():.6f}]")
print(f" Col RMS CV (std/mean): {col_rms.std() / (col_rms.mean() + 1e-10):.4f}")
print(f" Max/Min ratio: {col_rms.max() / (col_rms.min() + 1e-10):.1f}x")
# Top 10 and bottom 10 dimensions by power
top10 = np.argsort(col_rms)[-10:][::-1]
bot10 = np.argsort(col_rms)[:10]
print(
f" Top-10 dims by RMS: {[(int(d), f'{col_rms[d]:.4f}') for d in top10[:5]]}..."
)
print(
f" Bot-10 dims by RMS: {[(int(d), f'{col_rms[d]:.4f}') for d in bot10[:5]]}..."
)
# How much do the top 10% of dimensions contribute to total power?
total_power = np.sum(col_rms**2)
sorted_power = np.sort(col_rms**2)[::-1]
top10pct = int(len(col_rms) * 0.1)
top10pct_power = np.sum(sorted_power[:top10pct])
top1pct = max(1, int(len(col_rms) * 0.01))
top1pct_power = np.sum(sorted_power[:top1pct])
print(
f" Top 10% of dims contribute {top10pct_power / total_power * 100:.1f}% of total power"
)
print(
f" Top 1% of dims contribute {top1pct_power / total_power * 100:.1f}% of total power"
)
# ============================================================================
# 6. CROSS-CORRELATION: WEIGHT ERROR × ACTIVATION POWER
# ============================================================================
print("\n" + "=" * 80)
print("SECTION 6: WHERE DO WEIGHT ERRORS MEET HIGH ACTIVATION POWER?")
print("=" * 80)
print(" For each weight dimension, compute: activation_rms[dim] × weight_error[dim]")
print(" This tells us which dimensions contribute most to matmul error.\n")
# Focus on ffn_down vs ffn_gate for comparison
focus = [
("ffn_down", "blk_0_ffn_down_weight.f32bin", "act_blk0_ffn_down_input.f32bin"),
("ffn_gate", "blk_0_ffn_gate_weight.f32bin", "act_blk0_ffn_input.f32bin"),
("ffn_up", "blk_0_ffn_up_weight.f32bin", "act_blk0_ffn_input.f32bin"),
("attn_q", "blk_0_attn_q_weight.f32bin", "act_blk0_attn_input.f32bin"),
]
for name, wfile, afile in focus:
W = load_f32_tensor(wfile)
A = load_f32_tensor(afile)
if W.shape[1] != A.shape[1]:
print(f" {name}: dim mismatch W={W.shape[1]} vs A={A.shape[1]}, SKIP")
continue
nc = W.shape[1]
# Per-column activation RMS
act_rms = np.sqrt(np.mean(A**2, axis=0))
# Per-column weight std and range (how "hard" to quantize)
w_std = W.std(axis=0)
w_range = W.max(axis=0) - W.min(axis=0)
# Per-column weight kurtosis (heavy tails = harder to quantize)
w_kurt = (
np.mean(((W - W.mean(axis=0)) / (W.std(axis=0) + 1e-10)) ** 4, axis=0) - 3.0
)
# Weight error proxy: with 2-bit uniform quant on 16-element groups
# Higher variance columns → more error
nr = min(W.shape[0], 256)
# Simple Q2_K-style error estimate per dimension:
# For each group of 16 in the column direction, quantize and measure error
dim_mse = np.zeros(nc)
for g_start in range(0, nc, GS):
g_end = min(g_start + GS, nc)
for r in range(nr):
g = W[r, g_start:g_end]
gmin = min(g.min(), 0) # Q2_K clamps min to ≤0
gmax = g.max()
gr = gmax - gmin
if gr < 1e-10:
continue
scale = gr / 3.0
for i, val in enumerate(g):
norm = (val - gmin) / scale
idx = max(0, min(3, int(round(norm))))
recon = gmin + scale * idx
dim_mse[g_start + i] += (val - recon) ** 2
dim_rmse = np.sqrt(dim_mse / nr)
# The key metric: dimension-level contribution to matmul error
# matmul_error_contribution[d] ≈ act_rms[d] * weight_rmse[d]
matmul_contrib = act_rms * dim_rmse
print(f"\n {name} ({nc} dimensions):")
print(
f" act_rms: mean={act_rms.mean():.4f}, CV={act_rms.std() / act_rms.mean():.4f}"
)
print(
f" w_rmse: mean={dim_rmse.mean():.6f}, CV={dim_rmse.std() / (dim_rmse.mean() + 1e-10):.4f}"
)
print(
f" matmul_contrib: mean={matmul_contrib.mean():.6f}, "
f"std={matmul_contrib.std():.6f}"
)
# Correlation between activation power and weight error
corr = np.corrcoef(act_rms, dim_rmse)[0, 1]
print(f" CORRELATION act_rms ↔ weight_rmse: {corr:.4f}")
print(f" (>0 means high-power dims are also hard to quantize — BAD)")
# Top contributors to matmul error
top_dims = np.argsort(matmul_contrib)[-20:][::-1]
print(f" Top-5 error-contributing dimensions:")
for d in top_dims[:5]:
print(
f" dim {d}: act_rms={act_rms[d]:.4f}, w_rmse={dim_rmse[d]:.6f}, "
f"contrib={matmul_contrib[d]:.6f}, w_std={w_std[d]:.6f}, w_kurt={w_kurt[d]:.2f}"
)
# Distribution of matmul contributions
total_contrib = matmul_contrib.sum()
sorted_contrib = np.sort(matmul_contrib)[::-1]
for pct in [0.01, 0.05, 0.10, 0.25]:
n = max(1, int(nc * pct))
print(
f" Top {pct * 100:.0f}% dims: {sorted_contrib[:n].sum() / total_contrib * 100:.1f}% "
f"of total matmul error"
)
# ============================================================================
# 7. THE STRUCTURAL ASYMMETRY: COLUMN DIRECTION GROUP ANALYSIS
# ============================================================================
print("\n" + "=" * 80)
print("SECTION 7: STRUCTURAL ASYMMETRY — COLUMN vs ROW GROUPING")
print("=" * 80)
print(" Quantization groups along the ROW (inner dim). For ffn_down,")
print(" each row has 9728 elements (38 groups of 256).")
print(" For ffn_gate, each row has 2560 elements (10 groups of 256).")
print(" More groups = more metadata (scales/offsets) relative to data bits.\n")
for name, wfile, afile in focus:
W = load_f32_tensor(wfile)
nc = W.shape[1]
n_groups_per_row = nc // 256 # super-blocks per row
print(f"\n {name}: {nc} cols → {n_groups_per_row} super-blocks per row")
print(f" Groups per row: {nc // GS} (16-element groups)")
print(
f" With Q2_K (2.625 bpw): {n_groups_per_row * 2} scale+offset bytes per row"
)
# How much do group means vary WITHIN a row?
nr = min(W.shape[0], 64)
intra_row_mean_var = []
for r in range(nr):
group_means = []
for g_start in range(0, nc, GS):
group_means.append(W[r, g_start : g_start + GS].mean())
group_means = np.array(group_means)
intra_row_mean_var.append(group_means.std())
print(
f" Intra-row group mean variability (avg across rows): "
f"mean={np.mean(intra_row_mean_var):.6f}"
)
# How much does the sign of group means vary?
pos_frac = 0
neg_frac = 0
total_groups = 0
for r in range(nr):
for g_start in range(0, nc, GS):
gm = W[r, g_start : g_start + GS].mean()
if gm > 0.001:
pos_frac += 1
elif gm < -0.001:
neg_frac += 1
total_groups += 1
print(
f" Group mean sign: {pos_frac / total_groups * 100:.1f}% positive, "
f"{neg_frac / total_groups * 100:.1f}% negative, "
f"{(1 - pos_frac / total_groups - neg_frac / total_groups) * 100:.1f}% near-zero"
)
# ============================================================================
# 8. THE SWIGLU EFFECT: WHY ffn_down INPUT IS SPECIAL
# ============================================================================
print("\n" + "=" * 80)
print("SECTION 8: THE SWIGLU EFFECT — ffn_down ACTIVATION STRUCTURE")
print("=" * 80)
print(" ffn_down's activation is the SwiGLU output: silu(gate) * up")
print(" This creates a specific activation pattern that differs from")
print(" raw FFN input (RMSNorm output).\n")
if "ffn_input (gate/up)" in act_data and "ffn_down_input (swiglu)" in act_data:
A_in = act_data["ffn_input (gate/up)"]
A_swiglu = act_data["ffn_down_input (swiglu)"]
print(f" FFN input (RMSNorm output): {A_in.shape}")
print(f" SwiGLU output: {A_swiglu.shape}")
# Per-token analysis
for t in range(min(A_swiglu.shape[0], 3)):
tok_in = A_in[t]
tok_sw = A_swiglu[t]
print(f"\n Token {t}:")
print(
f" FFN input: mean={tok_in.mean():.6f}, std={tok_in.std():.6f}, "
f"|max|={np.abs(tok_in).max():.6f}"
)
print(
f" SwiGLU out: mean={tok_sw.mean():.6f}, std={tok_sw.std():.6f}, "
f"|max|={np.abs(tok_sw).max():.6f}"
)
# SwiGLU creates lots of near-zero values (silu suppresses negatives)
frac_nearzero_sw = np.mean(np.abs(tok_sw) < 0.01 * tok_sw.std())
frac_nearzero_in = np.mean(np.abs(tok_in) < 0.01 * tok_in.std())
print(
f" Near-zero fraction: FFN input={frac_nearzero_in:.3f}, "
f"SwiGLU={frac_nearzero_sw:.3f}"
)
# Sparsity pattern
frac_neg = np.mean(tok_sw < 0)
print(f" SwiGLU negative fraction: {frac_neg:.3f}")
# Dimension-level analysis of SwiGLU
print(f"\n Dimension-level SwiGLU properties:")
dim_mean_sw = A_swiglu.mean(axis=0)
dim_std_sw = A_swiglu.std(axis=0)
dim_sparsity = np.mean(A_swiglu < 0, axis=0) # fraction of tokens negative per dim
print(f" Dim mean range: [{dim_mean_sw.min():.6f}, {dim_mean_sw.max():.6f}]")
print(f" Dim std range: [{dim_std_sw.min():.6f}, {dim_std_sw.max():.6f}]")
print(
f" Dim negative fraction: mean={dim_sparsity.mean():.3f}, "
f"range=[{dim_sparsity.min():.3f}, {dim_sparsity.max():.3f}]"
)
# Highly sparse dimensions (mostly near-zero after SwiGLU)
high_sparsity = np.sum(dim_sparsity > 0.7)
low_sparsity = np.sum(dim_sparsity < 0.3)
print(f" Dims with >70% negative tokens: {high_sparsity}/{len(dim_sparsity)}")
print(f" Dims with <30% negative tokens: {low_sparsity}/{len(dim_sparsity)}")
# ============================================================================
# 9. QUANTIZATION NOISE × ACTIVATION POWER: THE MATMUL ERROR DECOMPOSITION
# ============================================================================
print("\n" + "=" * 80)
print("SECTION 9: MATMUL ERROR DECOMPOSITION")
print("=" * 80)
print(
" matmul_error ≈ sum over groups of (activation_power_in_group × "
"weight_mse_in_group)"
)
print(
" If activation power is concentrated in groups with high weight error, "
"matmul error explodes.\n"
)
# For ffn_down specifically, compare where activation power sits vs weight error
W_down = load_f32_tensor("blk_0_ffn_down_weight.f32bin")
A_swiglu = load_f32_tensor("act_blk0_ffn_down_input.f32bin")
W_gate = load_f32_tensor("blk_0_ffn_gate_weight.f32bin")
A_ffn_in = load_f32_tensor("act_blk0_ffn_input.f32bin")
for label, W, A in [("ffn_down", W_down, A_swiglu), ("ffn_gate", W_gate, A_ffn_in)]:
nc = W.shape[1]
nr = min(W.shape[0], 128)
# Compute per-superblock (256) activation power and weight error
n_sb = nc // 256
sb_act_power = np.zeros(n_sb)
sb_weight_mse = np.zeros(n_sb)
for sb in range(n_sb):
s = sb * 256
e = s + 256
# Activation power: mean squared activation in this region
sb_act_power[sb] = np.mean(A[:, s:e] ** 2)
# Weight MSE: Q2_K-style uniform quant error
mse = 0
count = 0
for r in range(nr):
for g in range(0, 256, GS):
gvals = W[r, s + g : s + g + GS]
gmin = min(gvals.min(), 0)
gmax = gvals.max()
gr = gmax - gmin
if gr < 1e-10:
continue
scale = gr / 3.0
for v in gvals:
norm = (v - gmin) / scale
idx = max(0, min(3, int(round(norm))))
recon = gmin + scale * idx
mse += (v - recon) ** 2
count += 1
sb_weight_mse[sb] = mse / max(count, 1)
# Correlation between activation power and weight error across super-blocks
valid = sb_act_power > 1e-10
if valid.sum() > 10:
corr = np.corrcoef(np.sqrt(sb_act_power[valid]), np.sqrt(sb_weight_mse[valid]))[
0, 1
]
else:
corr = 0
print(f"\n {label}:")
print(f" Super-blocks: {n_sb}")
print(
f" act_power: mean={sb_act_power.mean():.6f}, "
f"std={np.sqrt(sb_act_power.var()):.6f}, "
f"range=[{sb_act_power.min():.6f}, {sb_act_power.max():.6f}]"
)
print(
f" weight_mse: mean={sb_weight_mse.mean():.6f}, "
f"range=[{sb_weight_mse.min():.6f}, {sb_weight_mse.max():.6f}]"
)
print(f" CORRELATION (act_power ↔ weight_mse): {corr:.4f}")
# Show top-5 super-blocks by contribution to matmul error
contrib = sb_act_power * sb_weight_mse
top5 = np.argsort(contrib)[-5:][::-1]
print(f" Top-5 error-contributing super-blocks (of {n_sb}):")
for idx in top5:
print(
f" SB {idx * 256}-{(idx + 1) * 256 - 1}: act_power={sb_act_power[idx]:.6f}, "
f"weight_mse={sb_weight_mse[idx]:.6f}, contrib={contrib[idx]:.6f}"
)
print("\n" + "=" * 80)
print("ANALYSIS COMPLETE")
print("=" * 80)

105
scripts/compute-imatrix.py Normal file
View File

@ -0,0 +1,105 @@
#!/usr/bin/env python3
"""Compute imatrix (importance matrix) from captured activation tensors.
The imatrix is the per-dimension sum-of-squares of the activations.
It's what upstream llama.cpp uses to weight quantization optimization.
For each activation file act_blkL_*.f32bin, produces imatrix_blkL_<role>.f32bin
where <role> matches the weight tensor it multiplies with.
Format: flat float32 array of length n_per_row, one importance value per dimension.
"""
import numpy as np
import struct
import os
DATA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "data")
def load_f32_tensor(name):
    """Load a .f32bin tensor from DATA_DIR.

    File layout: two little-endian int64 values (nrow, ncol) followed by
    nrow*ncol float32 values. Returns a (nrow, ncol) float32 array.
    """
    full_path = os.path.join(DATA_DIR, name)
    with open(full_path, "rb") as fh:
        n_rows, n_cols = struct.unpack("qq", fh.read(16))
        payload = np.frombuffer(fh.read(), dtype=np.float32)
    assert len(payload) == n_rows * n_cols
    return payload.reshape(n_rows, n_cols)
def save_imatrix(name, data):
    """Write `data` as a flat float32 binary file into DATA_DIR and log basic stats."""
    out_path = os.path.join(DATA_DIR, name)
    data.astype(np.float32).tofile(out_path)
    stats = f"min={data.min():.6f}, max={data.max():.6f}, mean={data.mean():.6f}"
    print(f"  Wrote {out_path}: {len(data)} dims, " + stats)
# Mapping: activation file → imatrix files for each weight it multiplies with
# Each weight tensor's column dimension matches the activation's column dimension
mappings = [
    {
        "act_file": "act_blk0_ffn_input.f32bin",
        "imatrix_name": "imatrix_blk0_ffn_gate_up.f32bin",
        "description": "ffn_gate and ffn_up (both use ffn_input activation)",
    },
    {
        "act_file": "act_blk0_ffn_down_input.f32bin",
        "imatrix_name": "imatrix_blk0_ffn_down.f32bin",
        "description": "ffn_down (uses SwiGLU activation)",
    },
    {
        "act_file": "act_blk0_attn_input.f32bin",
        "imatrix_name": "imatrix_blk0_attn_qkv.f32bin",
        "description": "attn_q, attn_k, attn_v (all use attn_input activation)",
    },
    {
        "act_file": "act_blk0_attn_output_input.f32bin",
        "imatrix_name": "imatrix_blk0_attn_output.f32bin",
        "description": "attn_output (uses kqv_out activation)",
    },
]
print("Computing imatrix from captured activations")
print("=" * 60)
for m in mappings:
    # Missing activation files are skipped with a message rather than aborting,
    # so this script works on a partial capture.
    try:
        A = load_f32_tensor(m["act_file"])  # rows = tokens, cols = hidden dims
        print(f"\n{m['description']}:")
        print(f"  Activation: {A.shape[0]} tokens × {A.shape[1]} dims")
        # imatrix = sum over tokens of activation^2
        # This is the standard definition used by llama.cpp
        imatrix = np.sum(A**2, axis=0)
        # Also compute per-dim RMS for reference
        rms = np.sqrt(np.mean(A**2, axis=0))
        print(
            f"  Imatrix stats: min={imatrix.min():.6f}, max={imatrix.max():.6f}, "
            f"mean={imatrix.mean():.6f}, std={imatrix.std():.6f}"
        )
        print(
            f"  RMS stats: min={rms.min():.6f}, max={rms.max():.6f}, "
            f"mean={rms.mean():.6f}"
        )
        # Concentration metrics: how much of the total importance mass sits in
        # the top 1% / 10% of dimensions (sorted descending).
        total = imatrix.sum()
        sorted_im = np.sort(imatrix)[::-1]
        top1pct = max(1, int(len(imatrix) * 0.01))
        top10pct = max(1, int(len(imatrix) * 0.10))
        print(f"  Power concentration:")
        print(
            f"    Top 1% dims ({top1pct}): {sorted_im[:top1pct].sum() / total * 100:.1f}% of total"
        )
        print(
            f"    Top 10% dims ({top10pct}): {sorted_im[:top10pct].sum() / total * 100:.1f}% of total"
        )
        save_imatrix(m["imatrix_name"], imatrix)
    except Exception as e:
        print(f"  SKIP: {e}")
print("\nDone.")

View File

@ -0,0 +1,210 @@
#!/usr/bin/env python3
"""Extract real activation tensors by running a forward pass through the model.
Captures the INPUT activations to specific weight tensors (the vectors that get
multiplied by the weight matrix). These are what matter for quantization quality:
quantization error * activation magnitude = output error.
Usage:
python3 scripts/extract-activations.py MODEL.gguf OUTPUT_DIR [--prompt TEXT] [--layer N]
Output:
For each target tensor, writes a .f32bin file with header:
int64_t n_rows, int64_t row_len
followed by n_rows * row_len float32 values.
n_rows = number of tokens, row_len = hidden dimension.
NOTE: This uses a simplified forward pass (no KV cache, single prompt).
Activations are extracted from after the norm layers (the actual matmul inputs).
"""
import sys
import os
import struct
import numpy as np
script_dir = os.path.dirname(os.path.abspath(__file__))
repo_root = os.path.dirname(script_dir)
sys.path.insert(0, os.path.join(repo_root, 'gguf-py'))
from gguf import GGUFReader
def bf16_to_f32(raw_bytes):
    """Reinterpret a buffer of bfloat16 values as a float32 numpy array.

    bfloat16 is the high 16 bits of an IEEE-754 float32, so widening each
    value to 32 bits and shifting it into the upper half reconstructs the
    exact float32 bit pattern.
    """
    halves = np.frombuffer(raw_bytes, dtype=np.uint16)
    widened = np.left_shift(halves.astype(np.uint32), 16)
    return widened.view(np.float32)
def rms_norm(x, weight, eps=1e-6):
    """Root-mean-square normalization over the last axis (Qwen3/Llama style).

    Divides by sqrt(mean(x^2) + eps), then applies the per-dimension gain.
    """
    mean_sq = np.mean(np.square(x), axis=-1, keepdims=True)
    scale = np.sqrt(mean_sq + eps)
    return (x / scale) * weight
def silu(x):
    """SiLU (swish) activation: x * sigmoid(x).

    The exponent argument is clipped to ±88 so np.exp never overflows float64.
    """
    neg_exp = np.exp(-np.clip(x, -88, 88))
    return x / (1.0 + neg_exp)
def softmax(x, axis=-1):
    """Numerically stable softmax along `axis`.

    Subtracting the per-slice maximum before exponentiation prevents overflow
    without changing the result.
    """
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    total = np.sum(exps, axis=axis, keepdims=True)
    return exps / total
def main():
    # Run a simplified forward pass through ONE layer of a GGUF model and dump
    # the matmul input activations for that layer as .f32bin files.
    # argv: MODEL.gguf OUTPUT_DIR [--prompt TEXT] [--layer N]
    if len(sys.argv) < 3:
        print(f"Usage: {sys.argv[0]} MODEL.gguf OUTPUT_DIR [--prompt TEXT] [--layer N]")
        sys.exit(1)
    model_path = sys.argv[1]
    output_dir = sys.argv[2]
    # Defaults; overridable via --prompt / --layer below.
    prompt_text = "The quick brown fox jumps over the lazy dog. In a distant galaxy, scientists discovered"
    target_layer = 16
    # Minimal hand-rolled flag parsing (deliberately no argparse dependency).
    for i in range(3, len(sys.argv)):
        if sys.argv[i] == "--prompt" and i + 1 < len(sys.argv):
            prompt_text = sys.argv[i + 1]
        elif sys.argv[i] == "--layer" and i + 1 < len(sys.argv):
            target_layer = int(sys.argv[i + 1])
    os.makedirs(output_dir, exist_ok=True)
    print(f"Loading {model_path}...")
    reader = GGUFReader(model_path)
    # Read model config from metadata.
    # NOTE(review): keys are matched by substring; this assumes each metadata key
    # contains exactly one of these fragments — confirm for the target architecture.
    config = {}
    for kv in reader.fields.values():
        if hasattr(kv, 'parts') and len(kv.parts) > 0:
            name = kv.name
            if 'block_count' in name:
                config['n_layer'] = int(kv.parts[-1][0])
            elif 'embedding_length' in name:
                config['hidden'] = int(kv.parts[-1][0])
            elif 'feed_forward_length' in name:
                config['ffn'] = int(kv.parts[-1][0])
            elif 'head_count_kv' in name:
                config['n_kv_heads'] = int(kv.parts[-1][0])
            elif 'head_count' in name and 'kv' not in name:
                # Checked after head_count_kv so the plain head-count key wins
                # only for non-KV keys.
                config['n_heads'] = int(kv.parts[-1][0])
            elif 'key_length' in name:
                # Per-head K dimension, used as head_dim for Q/K/V below.
                config['head_dim'] = int(kv.parts[-1][0])
            elif 'layer_norm_rms_epsilon' in name:
                config['eps'] = float(kv.parts[-1][0])
    print(f"Config: {config}")
    hidden = config['hidden']  # NOTE(review): assigned but unused below
    # Load tensors into a dict
    def load_tensor(name):
        # Linear scan over the GGUF tensor list; converts BF16/F16/F32 to float32.
        # 2D+ tensors are reshaped with dims reversed (GGUF stores innermost-first).
        for t in reader.tensors:
            if t.name == name:
                raw = bytes(t.data)
                shape = [int(s) for s in t.shape]
                n_el = int(t.n_elements)
                if t.tensor_type.name == 'BF16':
                    flat = bf16_to_f32(raw)
                elif t.tensor_type.name == 'F16':
                    flat = np.frombuffer(raw, dtype=np.float16).astype(np.float32)
                elif t.tensor_type.name == 'F32':
                    flat = np.frombuffer(raw, dtype=np.float32)
                else:
                    raise ValueError(f"Unsupported type: {t.tensor_type.name}")
                assert flat.shape[0] == n_el, f"Expected {n_el} elements, got {flat.shape[0]}"
                if len(shape) == 1:
                    return flat.copy()
                return flat.reshape(list(reversed(shape))).copy()
        raise KeyError(f"Tensor {name} not found")
    # Create simple token IDs from the prompt (use first few tokens from vocab)
    # We just need realistic activations, not perfect tokenization
    n_tokens = min(32, len(prompt_text.split()))
    print(f"Using {n_tokens} pseudo-tokens for activation extraction")
    # Load token embedding and create input
    print("Loading token_embd...")
    token_embd = load_tensor("token_embd.weight")  # [vocab, hidden]
    # Use token IDs 100-131 (arbitrary but avoids special tokens)
    token_ids = list(range(100, 100 + n_tokens))
    x = token_embd[token_ids]  # [n_tokens, hidden]
    print(f"Input shape: {x.shape}")
    # Run forward pass through target layer only (we just need the activations)
    # NOTE(review): the residual stream entering this layer is the raw embedding,
    # not the output of layers 0..target_layer-1 — acceptable for rough activations.
    layer = target_layer
    print(f"\nProcessing layer {layer}...")
    def save_activation(name, data):
        """Save activation tensor as f32bin."""
        # Layout: little-endian int64 n_rows, int64 row_len, then f32 payload.
        if data.ndim == 1:
            data = data.reshape(1, -1)
        n_rows, row_len = data.shape
        fname = os.path.join(output_dir, name + ".f32bin")
        with open(fname, 'wb') as fp:
            fp.write(struct.pack('<qq', n_rows, row_len))
            data.astype(np.float32).tofile(fp)
        print(f"  Saved {fname}: {n_rows} x {row_len} ({os.path.getsize(fname) / 1024:.1f} KB)")
    # Attention norm → input to attn_q/k/v
    attn_norm_w = load_tensor(f"blk.{layer}.attn_norm.weight")
    x_normed = rms_norm(x, attn_norm_w, config.get('eps', 1e-6))
    save_activation(f"act_blk{layer}_attn_input", x_normed)
    # Compute Q, K, V to get post-attention residual
    W_q = load_tensor(f"blk.{layer}.attn_q.weight")  # [n_heads*head_dim, hidden]
    W_k = load_tensor(f"blk.{layer}.attn_k.weight")  # [n_kv_heads*head_dim, hidden]
    W_v = load_tensor(f"blk.{layer}.attn_v.weight")  # [n_kv_heads*head_dim, hidden]
    W_o = load_tensor(f"blk.{layer}.attn_output.weight")  # [hidden, n_heads*head_dim]
    Q = x_normed @ W_q.T  # [n_tokens, n_heads*head_dim]
    K = x_normed @ W_k.T
    V = x_normed @ W_v.T
    # Simplified attention (no RoPE, no mask, no GQA — just need rough activations)
    n_heads = config['n_heads']
    head_dim = config['head_dim']
    Q_h = Q.reshape(n_tokens, n_heads, head_dim)
    K_h = K.reshape(n_tokens, config['n_kv_heads'], head_dim)
    V_h = V.reshape(n_tokens, config['n_kv_heads'], head_dim)
    # Repeat KV heads for GQA
    rep = n_heads // config['n_kv_heads']
    K_h = np.repeat(K_h, rep, axis=1)
    V_h = np.repeat(V_h, rep, axis=1)
    # Attention scores and output
    # t = query token, s = key token, h = head, d = head dim
    scores = np.einsum('thd,shd->ths', Q_h, K_h) / np.sqrt(head_dim)
    attn_w = softmax(scores, axis=-1)
    attn_out = np.einsum('ths,shd->thd', attn_w, V_h).reshape(n_tokens, -1)
    # attn_output weight input
    save_activation(f"act_blk{layer}_attn_output_input", attn_out)
    # Project and add residual
    attn_proj = attn_out @ W_o.T
    x = x + attn_proj
    # FFN norm → input to ffn_gate/ffn_up
    ffn_norm_w = load_tensor(f"blk.{layer}.ffn_norm.weight")
    x_ffn = rms_norm(x, ffn_norm_w, config.get('eps', 1e-6))
    save_activation(f"act_blk{layer}_ffn_input", x_ffn)
    # FFN: gate and up projections
    W_gate = load_tensor(f"blk.{layer}.ffn_gate.weight")  # [ffn, hidden]
    W_up = load_tensor(f"blk.{layer}.ffn_up.weight")  # [ffn, hidden]
    W_down = load_tensor(f"blk.{layer}.ffn_down.weight")  # [hidden, ffn] — NOTE(review): loaded but unused
    gate = x_ffn @ W_gate.T
    up = x_ffn @ W_up.T
    ffn_act = silu(gate) * up  # SwiGLU activation
    # ffn_down weight input (the SwiGLU output)
    save_activation(f"act_blk{layer}_ffn_down_input", ffn_act)
    print(f"\nDone! Extracted 4 activation tensors to {output_dir}/")
if __name__ == "__main__":
    main()

View File

@ -0,0 +1,74 @@
#!/usr/bin/env python3
"""Extract tensor data from GGUF as raw f32 binary files for C++ testing.
Usage:
python3 scripts/extract-tensor-data.py MODEL.gguf pattern1 [pattern2 ...]
Output:
For each matching tensor, writes a .f32bin file with header:
int64_t n_rows, int64_t row_len
followed by n_rows * row_len float32 values.
"""
import sys
import os
import numpy as np
# Support running from build/ or repo root
script_dir = os.path.dirname(os.path.abspath(__file__))
repo_root = os.path.dirname(script_dir)
sys.path.insert(0, os.path.join(repo_root, 'gguf-py'))
from gguf import GGUFReader
def main():
    # Extract GGUF tensors whose names contain any of the given substrings and
    # write each as a raw .f32bin file (int64 n_rows, int64 row_len, f32 payload).
    # argv: MODEL.gguf pattern1 [pattern2 ...]
    if len(sys.argv) < 3:
        print(f"Usage: {sys.argv[0]} MODEL.gguf pattern1 [pattern2 ...]")
        print(f"  Extracts tensors whose names contain any of the given patterns.")
        sys.exit(1)
    model_path = sys.argv[1]
    patterns = sys.argv[2:]
    print(f"Reading {model_path}...")
    reader = GGUFReader(model_path)
    for tensor in reader.tensors:
        # Substring match against any requested pattern.
        if not any(p in tensor.name for p in patterns):
            continue
        print(f"\nExtracting: {tensor.name}")
        print(f"  Shape: {list(tensor.shape)}, type: {tensor.tensor_type.name}")
        # Convert to f32
        # NOTE(review): np.array(..., dtype=np.uint8) value-casts if tensor.data is
        # already a typed non-uint8 array; this assumes GGUFReader exposes raw bytes
        # here — confirm against gguf-py.
        raw = np.array(tensor.data, dtype=np.uint8)
        if tensor.tensor_type.name == 'BF16':
            # Widen each 16-bit value into the high half of a float32 bit pattern.
            bf16_vals = raw.view(np.uint16)
            f32_bits = bf16_vals.astype(np.uint32) << 16
            f32_vals = f32_bits.view(np.float32)
        elif tensor.tensor_type.name == 'F16':
            f16_vals = raw.view(np.float16)
            f32_vals = f16_vals.astype(np.float32)
        elif tensor.tensor_type.name == 'F32':
            f32_vals = raw.view(np.float32)
        else:
            print(f"  SKIP: unsupported type {tensor.tensor_type.name}")
            continue
        # Determine layout: GGUF stores shape as [col, row] for 2D
        row_len = int(tensor.shape[0])
        n_rows = tensor.n_elements // row_len
        fname = tensor.name.replace(".", "_") + ".f32bin"
        with open(fname, 'wb') as fp:
            fp.write(np.array([n_rows, row_len], dtype=np.int64).tobytes())
            f32_vals.tofile(fp)
        file_size = os.path.getsize(fname)
        print(f"  Wrote {fname}: {n_rows} rows x {row_len} cols = {tensor.n_elements} elements")
        print(f"  File size: {file_size / (1024*1024):.1f} MB")
        print(f"  Stats: mean={f32_vals.mean():.6f}, std={f32_vals.std():.6f}, "
              f"min={f32_vals.min():.6f}, max={f32_vals.max():.6f}")
if __name__ == "__main__":
    main()

View File

@ -511,6 +511,7 @@ public:
std::map<llama_seq_id, llama_sampler *> samplers;
};
//
// llm_graph_result
//

View File

@ -2,6 +2,7 @@
#include "ggml-alloc.h"
#include "ggml.h"
#include "llama.h"
#include "gguf.h"
#include "llama-hparams.h"
@ -61,6 +62,13 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
case LLAMA_FTYPE_MOSTLY_Q3_PT: return "Q3_PT - 3.25 bpw";
case LLAMA_FTYPE_MOSTLY_Q3_KPT: return "Q3_KPT - Q3_K with learned levels";
case LLAMA_FTYPE_MOSTLY_Q4_DPT: return "Q4_DPT - IQ4_NL with learned levels";
case LLAMA_FTYPE_MOSTLY_Q2_KPT: return "Q2_KPT - Q2_K with learned levels";
case LLAMA_FTYPE_MOSTLY_IQ2_TQ: return "IQ2_TQ - 2.0625 bpw trellis quantized";
case LLAMA_FTYPE_MOSTLY_IQ3_TQ: return "IQ3_TQ - 3.5625 bpw per-tensor trained grid";
case LLAMA_FTYPE_MOSTLY_IQ1_BN: return "IQ1_BN - 1.5625 bpw 8D vector quantized";
case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw";
case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw";
case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw";
@ -758,6 +766,13 @@ llama_model_loader::llama_model_loader(
case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
case GGML_TYPE_Q3_PT: ftype = LLAMA_FTYPE_MOSTLY_Q3_PT; break;
case GGML_TYPE_Q3_KPT: ftype = LLAMA_FTYPE_MOSTLY_Q3_KPT; break;
case GGML_TYPE_Q4_DPT: ftype = LLAMA_FTYPE_MOSTLY_Q4_DPT; break;
case GGML_TYPE_Q2_KPT: ftype = LLAMA_FTYPE_MOSTLY_Q2_KPT; break;
case GGML_TYPE_IQ2_TQ: ftype = LLAMA_FTYPE_MOSTLY_IQ2_TQ; break;
case GGML_TYPE_IQ3_TQ: ftype = LLAMA_FTYPE_MOSTLY_IQ3_TQ; break;
case GGML_TYPE_IQ1_BN: ftype = LLAMA_FTYPE_MOSTLY_IQ1_BN; break;
case GGML_TYPE_NVFP4: ftype = LLAMA_FTYPE_MOSTLY_NVFP4; break;
case GGML_TYPE_Q1_0: ftype = LLAMA_FTYPE_MOSTLY_Q1_0; break;
default:

View File

@ -21,6 +21,7 @@
// TODO: tmp until the ggml meta backend matures and becomes public
#include "../src/ggml-ext.h"
#include <algorithm>
#include <cassert>
#include <cfloat>
@ -8247,6 +8248,175 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}
}
// Load per-tensor quantization auxiliary data (levels/kvalues) from GGUF metadata.
// Indexed by weight tensor pointer for direct lookup during inference.
{
// Build tensor name to tensor pointer map
std::unordered_map<std::string, ggml_tensor*> name_to_tensor;
for (auto & [ctx, buf_map] : ctx_buf_maps) {
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
name_to_tensor[ggml_get_name(t)] = t;
}
}
struct level_type_info {
ggml_type type;
const char * gguf_key;
size_t n_levels; // number of level values per tensor
size_t elem_bytes; // size of each level value
};
const level_type_info level_types[] = {
{ GGML_TYPE_Q3_PT, "q3_pt.levels", 8, sizeof(float) },
{ GGML_TYPE_Q3_KPT, "q3_kpt.levels", 8, sizeof(float) },
{ GGML_TYPE_Q4_DPT, "q4_dpt.levels", 16, sizeof(int8_t) },
};
for (const auto & lt : level_types) {
int64_t lv_idx = gguf_find_key(ml.metadata, lt.gguf_key);
if (lv_idx < 0) { continue; }
const uint8_t * lv_raw = (const uint8_t *)gguf_get_arr_data(ml.metadata, lv_idx);
const size_t lv_arr_n = gguf_get_arr_n(ml.metadata, lv_idx);
size_t tensor_count = 0;
// Iterate over GGUF slots to find matching tensors
for (size_t gguf_slot = 0; gguf_slot < lv_arr_n / lt.n_levels; ++gguf_slot) {
std::string tensor_name = gguf_get_tensor_name(ml.metadata, gguf_slot);
auto it = name_to_tensor.find(tensor_name);
if (it == name_to_tensor.end()) { continue; }
ggml_tensor* t = it->second;
if (t->type != lt.type) { continue; }
const size_t gguf_offset = gguf_slot * lt.n_levels;
// Store directly indexed by tensor pointer
auto & aux = tensor_aux_data[t];
aux.type = lt.type;
aux.host_data.assign(
lv_raw + gguf_offset * lt.elem_bytes,
lv_raw + (gguf_offset + lt.n_levels) * lt.elem_bytes
);
aux.aux_tensor = nullptr;
// Set quant_levels directly on the tensor
t->quant_levels = aux.host_data.data();
tensor_count++;
}
if (tensor_count > 0) {
LLAMA_LOG_INFO("%s: loaded %zu %s per-tensor level tables\n",
__func__, tensor_count, lt.gguf_key);
}
}
// Q2_KPT: per-block levels stored as per-tensor GGUF keys "{tensor_name}.q2kpt_levels"
// Each key holds n_blocks * Q2KPT_N_LEVELS floats for that tensor (4 floats per 256-element block).
{
size_t q2kpt_loaded = 0;
for (auto & [tname, t] : name_to_tensor) {
if (t->type != GGML_TYPE_Q2_KPT) { continue; }
const std::string key = tname + ".q2kpt_levels";
int64_t lv_idx = gguf_find_key(ml.metadata, key.c_str());
if (lv_idx < 0) { continue; }
const uint8_t * lv_raw = (const uint8_t *)gguf_get_arr_data(ml.metadata, lv_idx);
const size_t lv_n = gguf_get_arr_n(ml.metadata, lv_idx);
auto & aux = tensor_aux_data[t];
aux.type = GGML_TYPE_Q2_KPT;
aux.host_data.assign(lv_raw, lv_raw + lv_n * sizeof(float));
aux.aux_tensor = nullptr;
t->quant_levels = aux.host_data.data();
q2kpt_loaded++;
}
if (q2kpt_loaded > 0) {
LLAMA_LOG_INFO("%s: loaded %zu Q2_KPT per-block level tables\n", __func__, q2kpt_loaded);
}
}
// IQ2_TQ: per-tensor trained grid (16 × 4 int8 = 64 bytes)
{
size_t iq2tq_loaded = 0;
for (auto & [tname, t] : name_to_tensor) {
if (t->type != GGML_TYPE_IQ2_TQ) { continue; }
const std::string grid_key = "iq2tq.grid." + tname;
int64_t grid_idx = gguf_find_key(ml.metadata, grid_key.c_str());
if (grid_idx < 0) { continue; }
auto & taux = tensor_aux_data[t];
taux.type = GGML_TYPE_IQ2_TQ;
taux.host_data.resize(64);
const int8_t * grid_data = (const int8_t *)gguf_get_arr_data(ml.metadata, grid_idx);
memcpy(taux.host_data.data(), grid_data, 64);
t->quant_levels = taux.host_data.data();
iq2tq_loaded++;
}
if (iq2tq_loaded > 0) {
LLAMA_LOG_INFO("%s: loaded IQ2_TQ grid for %zu tensors\n", __func__, iq2tq_loaded);
}
}
// IQ3_TQ: per-tensor trained grid (16 × 8 int8 = 128 bytes)
{
size_t iq3tq_loaded = 0;
for (auto & [tname, t] : name_to_tensor) {
if (t->type != GGML_TYPE_IQ3_TQ) { continue; }
const std::string grid_key = "iq3tq.grid." + tname;
int64_t grid_idx = gguf_find_key(ml.metadata, grid_key.c_str());
if (grid_idx < 0) {
// backward compat: try old key name
const std::string old_key = "iq3qt.grid." + tname;
grid_idx = gguf_find_key(ml.metadata, old_key.c_str());
if (grid_idx < 0) { continue; }
}
auto & taux = tensor_aux_data[t];
taux.type = GGML_TYPE_IQ3_TQ;
taux.host_data.resize(128);
const int8_t * grid_data = (const int8_t *)gguf_get_arr_data(ml.metadata, grid_idx);
memcpy(taux.host_data.data(), grid_data, 128);
t->quant_levels = taux.host_data.data();
iq3tq_loaded++;
}
if (iq3tq_loaded > 0) {
LLAMA_LOG_INFO("%s: loaded IQ3_TQ grid for %zu tensors\n", __func__, iq3tq_loaded);
}
}
// IQ1_BN: per-tensor trained codebook (32768 bytes)
{
size_t iq1bn_loaded = 0;
for (auto & [tname, t] : name_to_tensor) {
if (t->type != GGML_TYPE_IQ1_BN) { continue; }
const std::string aux_key = "iq1bn.aux." + tname;
int64_t aux_idx = gguf_find_key(ml.metadata, aux_key.c_str());
if (aux_idx < 0) { continue; }
auto & taux = tensor_aux_data[t];
taux.type = GGML_TYPE_IQ1_BN;
taux.host_data.resize(32768);
const int8_t * aux_data = (const int8_t *)gguf_get_arr_data(ml.metadata, aux_idx);
memcpy(taux.host_data.data(), aux_data, 32768);
t->quant_levels = taux.host_data.data();
iq1bn_loaded++;
}
if (iq1bn_loaded > 0) {
LLAMA_LOG_INFO("%s: loaded IQ1_BN codebook for %zu tensors\n", __func__, iq1bn_loaded);
}
}
}
if (use_mmap_buffer) {
for (auto & mapping : ml.mappings) {
pimpl->mappings.emplace_back(std::move(mapping));

View File

@ -574,6 +574,24 @@ struct llama_model {
// for keeping track of associated LoRA adapters
std::unordered_set<llama_adapter_lora *> loras;
    // host-side auxiliary data for dynamic quantization types (Q4_DPT, Q3_PT, Q3_KPT)
    // indexed by weight tensor pointer, allows separate GPU placement of aux data
    struct tensor_auxiliary {
        ggml_type type; // Quantization type this aux data is for
        std::vector<uint8_t> host_data; // Host copy of aux data (levels or kvalues)
        struct ggml_tensor * aux_tensor; // Separate ggml tensor for backend placement
        // NOTE(review): members have no in-class initializers; confirm every
        // creation site fills all three fields (aux_tensor in particular).
    };
// Hash function for ggml_tensor pointers (reuse existing ggml_hash pattern)
struct ggml_tensor_ptr_hash {
size_t operator()(const ggml_tensor* t) const noexcept {
return (size_t)(uintptr_t)t >> 4; // Same as ggml_hash()
}
};
// Per-tensor auxiliary data lookup - indexed by WEIGHT tensor pointer
std::unordered_map<const ggml_tensor*, tensor_auxiliary, ggml_tensor_ptr_hash> tensor_aux_data;
// statically allocated context for assigning
struct llama_meta_device_get_split_state_userdata get_split_state_ud;

View File

@ -1,6 +1,8 @@
#include "ggml.h"
#include "llama-impl.h"
#include "llama-model.h"
#include "llama-model-loader.h"
#include "llama.h"
#include "llama-ext.h"
#include <algorithm>
@ -13,6 +15,98 @@
#include <thread>
#include <unordered_map>
// Per-tensor quantization helper entry points (defined in ggml-quants.c).
// NOTE(review): this declaration set previously appeared twice verbatim
// (including repeated Q2KPT_N_LEVELS/QK_K defines); the duplicate copy has
// been removed — declarations and identical macro redefinitions are harmless
// but noisy.

// Q3_PT levels functions (defined in ggml-quants.c)
extern "C" {
void q3pt_train_levels(const float * data, int64_t nrow, int64_t n_per_row,
                       const float * imatrix, float levels_out[8]);
void q3pt_set_levels(const float * levels);
}
// Q3_KPT levels functions (defined in ggml-quants.c)
extern "C" {
void q3kpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row,
                        const float * imatrix, float levels_out[8]);
void q3kpt_set_levels(const float * levels);
}
// Q4_DPT levels functions (defined in ggml-quants.c)
extern "C" {
void q4dpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row,
                        const float * imatrix, int8_t levels_out[16]);
void q4dpt_set_levels(const int8_t * levels);
}
// Q2_KPT levels are handled internally by quantize_q2_kpt
#define Q2KPT_N_LEVELS 4
#define QK_K 256
extern "C" const float * q2kpt_get_levels(void);
extern "C" void q2kpt_prepare_levels(int64_t nrows, int64_t n_per_row);
extern "C" void q2kpt_free_levels(void);
// IQ2_TQ functions — per-tensor trained grid
extern "C" size_t quantize_iq2_tq(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
extern "C" void iq2tq_train_grid(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, int8_t grid_out[64]);
extern "C" void iq2tq_set_grid(const int8_t grid[64]);
extern "C" const int8_t * iq2tq_get_grid(void);
// IQ3_TQ functions — per-tensor trained grid (3-bit, 128 bytes per tensor)
extern "C" size_t quantize_iq3_tq(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
extern "C" void iq3tq_train_grid(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, int8_t grid_out[128]);
extern "C" void iq3tq_set_grid(const int8_t grid[128]);
extern "C" const int8_t * iq3tq_get_grid(void);
// IQ1_BN functions — 8D vector quantized with per-tensor trained 4096-entry codebook (32768 bytes per tensor)
extern "C" size_t quantize_iq1_bn(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
extern "C" void iq1bn_train_codebook(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, int8_t aux_out[32768], int nthread);
extern "C" void iq1bn_set_aux(const int8_t aux[32768]);
extern "C" const int8_t * iq1bn_get_aux(void);
// result of parsing --tensor-type option
// (changes to this struct must be reflected in tools/quantize/quantize.cpp)
struct tensor_type_option {
@ -234,7 +328,7 @@ static void llama_tensor_dequantize_impl(
} else if (tensor->type == GGML_TYPE_BF16) {
ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
} else if (ggml_is_quantized(tensor->type)) {
qtype->to_float(tensor->data, f32_output, nelements);
qtype->to_float(tensor->data, f32_output, nelements, tensor->quant_levels);
} else {
GGML_ABORT("fatal error"); // unreachable
}
@ -264,13 +358,14 @@ static void llama_tensor_dequantize_impl(
size_t thr_elems = thr_blocks * block_size; // number of elements for this thread
size_t thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
const void * quant_levels = tensor->quant_levels;
auto compute = [qtype, quant_levels] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
if (typ == GGML_TYPE_F16) {
ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
} else if (typ == GGML_TYPE_BF16) {
ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
} else {
qtype->to_float(inbuf, outbuf, nels);
qtype->to_float(inbuf, outbuf, nels, quant_levels);
}
};
workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
@ -480,6 +575,18 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
new_type = GGML_TYPE_IQ3_S;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_PT) {
new_type = GGML_TYPE_IQ4_XS;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT) {
new_type = GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_DPT) {
new_type = GGML_TYPE_IQ4_XS;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_KPT) {
new_type = GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
new_type = GGML_TYPE_Q4_K;
}
@ -518,13 +625,16 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_PT) {
new_type = GGML_TYPE_Q3_PT;
}
else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
new_type = GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
new_type = GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT) {
new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
@ -569,16 +679,17 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type
auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
int i_layer = info.first, n_layer = info.second;
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_KPT) new_type = GGML_TYPE_Q3_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT) {
new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
: arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
: GGML_TYPE_Q3_K;
: (ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT ? GGML_TYPE_Q3_KPT : GGML_TYPE_Q3_K);
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
(qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
@ -587,6 +698,9 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_PT) {
new_type = GGML_TYPE_IQ4_XS;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
if (arch == LLM_ARCH_FALCON) {
new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
@ -616,13 +730,14 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) {
ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT ||
ftype == LLAMA_FTYPE_MOSTLY_Q2_KPT) {
new_type = GGML_TYPE_Q5_K;
}
} else {
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_KPT) new_type = GGML_TYPE_Q3_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT) new_type = GGML_TYPE_Q4_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ) new_type = GGML_TYPE_Q4_K;
}
@ -828,6 +943,14 @@ ggml_type llama_ftype_get_default_type(llama_ftype ftype) {
case LLAMA_FTYPE_MOSTLY_IQ4_XS: return GGML_TYPE_IQ4_XS;
case LLAMA_FTYPE_MOSTLY_IQ3_S:
case LLAMA_FTYPE_MOSTLY_IQ3_M: return GGML_TYPE_IQ3_S;
case LLAMA_FTYPE_MOSTLY_Q3_PT: return GGML_TYPE_Q3_PT;
case LLAMA_FTYPE_MOSTLY_Q3_KPT: return GGML_TYPE_Q3_KPT;
case LLAMA_FTYPE_MOSTLY_Q4_DPT: return GGML_TYPE_Q4_DPT;
case LLAMA_FTYPE_MOSTLY_Q2_KPT: return GGML_TYPE_Q2_KPT;
case LLAMA_FTYPE_MOSTLY_IQ2_TQ: return GGML_TYPE_IQ2_TQ;
case LLAMA_FTYPE_MOSTLY_IQ3_TQ: return GGML_TYPE_IQ3_TQ;
case LLAMA_FTYPE_MOSTLY_IQ1_BN: return GGML_TYPE_IQ1_BN;
default: return GGML_TYPE_COUNT;
}
@ -1098,6 +1221,615 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
::zeros(fout, meta_size);
};
// Q3_PT two-pass approach: train all per-tensor levels BEFORE opening the output
// file, so the levels KV entry is already populated at the time of the metadata placeholder.
static const size_t Q3PT_N_LEVELS = 8;
std::vector<float> q3pt_all_levels; // indexed by position in tensors[]
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_PT && !params->dry_run) {
LLAMA_LOG_INFO("%s: Q3_PT pass 1: training per-tensor levels...\n", __func__);
q3pt_all_levels.assign(tensors.size() * Q3PT_N_LEVELS, 0.0f);
// Temporary dequant buffer for pass 1 (reuse f32_conv_buf / read_data declared below)
std::vector<no_init<uint8_t>> p1_read_data;
std::vector<no_init<float>> p1_f32_buf;
std::vector<std::thread> p1_workers;
p1_workers.reserve(nthread);
for (size_t ti = 0; ti < tensors.size(); ++ti) {
ggml_tensor * tensor = tensors[ti]->tensor;
const std::string tname = ggml_get_name(tensor);
// Determine whether this tensor will be Q3_PT (mirror the pass-2 logic)
bool quantize = tname.rfind("weight") == tname.size() - 6;
quantize &= (ggml_n_dims(tensor) >= 2);
quantize &= tname.find("_norm.weight") == std::string::npos;
quantize &= tname.find("ffn_gate_inp.weight") == std::string::npos;
if (!quantize) { continue; }
ggml_type new_type = default_type;
if (!params->pure) {
new_type = llama_tensor_get_type_impl(qs, new_type, tensor, ftype, tensor_get_category(tname));
}
if (new_type != GGML_TYPE_Q3_PT) { continue; }
// Load tensor data
const size_t tsz = ggml_nbytes(tensor);
if (!ml.use_mmap) {
if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); }
tensor->data = p1_read_data.data();
}
ml.load_data_for(tensor);
// Dequantize to f32 if needed
const int64_t nelements = ggml_nelements(tensor);
float * f32_data;
if (tensor->type == GGML_TYPE_F32) {
f32_data = (float *) tensor->data;
} else {
llama_tensor_dequantize_impl(tensor, p1_f32_buf, p1_workers, nelements, nthread);
f32_data = (float *) p1_f32_buf.data();
}
// Resolve imatrix
const float * imatrix = nullptr;
if (imatrix_data) {
auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped));
if (it2 != imatrix_data->end() &&
it2->second.size() == (size_t)tensor->ne[0] * tensor->ne[2]) {
imatrix = it2->second.data();
}
}
const int64_t n_per_row = tensor->ne[0];
const int64_t nrows = tensor->ne[1];
LLAMA_LOG_INFO("%s: Q3_PT levels for [%zu/%zu] %s\n", __func__, ti+1, tensors.size(), tensor->name);
q3pt_train_levels(f32_data, nrows, n_per_row, imatrix,
q3pt_all_levels.data() + ti * Q3PT_N_LEVELS);
}
// All levels ready — store in GGUF metadata before the file is opened
for (auto & ctx : ctx_outs) {
if (ctx) {
gguf_set_arr_data(ctx.get(), "q3_pt.levels", GGUF_TYPE_FLOAT32,
q3pt_all_levels.data(), q3pt_all_levels.size());
}
}
LLAMA_LOG_INFO("%s: Q3_PT pass 1 complete.\n", __func__);
}
// Q3_KPT two-pass approach: train all per-tensor levels BEFORE opening the output
static const size_t Q3KPT_N_LEVELS = 8;
std::vector<float> q3kpt_all_levels; // indexed by position in tensors[]
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT && !params->dry_run) {
LLAMA_LOG_INFO("%s: Q3_KPT pass 1: training per-tensor levels...\n", __func__);
q3kpt_all_levels.assign(tensors.size() * Q3KPT_N_LEVELS, 0.0f);
// Temporary dequant buffer for pass 1
std::vector<no_init<uint8_t>> p1_read_data;
std::vector<no_init<float>> p1_f32_buf;
std::vector<std::thread> p1_workers;
p1_workers.reserve(nthread);
for (size_t ti = 0; ti < tensors.size(); ++ti) {
ggml_tensor * tensor = tensors[ti]->tensor;
const std::string tname = ggml_get_name(tensor);
// Determine whether this tensor will be Q3_KPT (mirror the pass-2 logic)
bool quantize = tname.rfind("weight") == tname.size() - 6;
quantize &= (ggml_n_dims(tensor) >= 2);
quantize &= tname.find("_norm.weight") == std::string::npos;
quantize &= tname.find("ffn_gate_inp.weight") == std::string::npos;
if (!quantize) { continue; }
ggml_type new_type = default_type;
if (!params->pure) {
new_type = llama_tensor_get_type_impl(qs, new_type, tensor, ftype, tensor_get_category(tname));
}
if (params->token_embedding_type < GGML_TYPE_COUNT &&
(tname == "token_embd.weight" || tname == "per_layer_token_embd.weight")) {
new_type = params->token_embedding_type;
}
if (params->output_tensor_type < GGML_TYPE_COUNT && tname == "output.weight") {
new_type = params->output_tensor_type;
}
if (new_type != GGML_TYPE_Q3_KPT) { continue; }
// Load tensor data
const size_t tsz = ggml_nbytes(tensor);
if (!ml.use_mmap) {
if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); }
tensor->data = p1_read_data.data();
}
ml.load_data_for(tensor);
// Dequantize to f32 if needed
const int64_t nelements = ggml_nelements(tensor);
float * f32_data;
if (tensor->type == GGML_TYPE_F32) {
f32_data = (float *) tensor->data;
} else {
llama_tensor_dequantize_impl(tensor, p1_f32_buf, p1_workers, nelements, nthread);
f32_data = (float *) p1_f32_buf.data();
}
// Resolve imatrix
const float * imatrix = nullptr;
if (imatrix_data) {
auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped));
if (it2 != imatrix_data->end() &&
it2->second.size() == (size_t)tensor->ne[0] * tensor->ne[2]) {
imatrix = it2->second.data();
}
}
const int64_t n_per_row = tensor->ne[0];
const int64_t nrows = tensor->ne[1];
LLAMA_LOG_INFO("%s: Q3_KPT levels for [%zu/%zu] %s\n", __func__, ti+1, tensors.size(), tensor->name);
q3kpt_train_levels(f32_data, nrows, n_per_row, imatrix,
q3kpt_all_levels.data() + ti * Q3KPT_N_LEVELS);
}
// All levels ready — store in GGUF metadata before the file is opened
for (auto & ctx : ctx_outs) {
if (ctx) {
gguf_set_arr_data(ctx.get(), "q3_kpt.levels", GGUF_TYPE_FLOAT32,
q3kpt_all_levels.data(), q3kpt_all_levels.size());
}
}
LLAMA_LOG_INFO("%s: Q3_KPT pass 1 complete.\n", __func__);
}
// Q4_DPT two-pass approach: train all per-tensor int8 levels BEFORE opening the output
// file, so the levels KV entry is already populated at the time of the metadata placeholder.
static const size_t Q4DPT_N_LEVELS = 16;
std::vector<int8_t> q4dpt_all_levels; // indexed by position in tensors[]
if (ftype == LLAMA_FTYPE_MOSTLY_Q4_DPT && !params->dry_run) {
LLAMA_LOG_INFO("%s: Q4_DPT pass 1: training per-tensor int8 levels...\n", __func__);
q4dpt_all_levels.assign(tensors.size() * Q4DPT_N_LEVELS, (int8_t)0);
std::vector<no_init<uint8_t>> p1_read_data;
std::vector<no_init<float>> p1_f32_buf;
std::vector<std::thread> p1_workers;
p1_workers.reserve(nthread);
for (size_t ti = 0; ti < tensors.size(); ++ti) {
ggml_tensor * tensor = tensors[ti]->tensor;
const std::string tname = ggml_get_name(tensor);
bool quantize = tname.rfind("weight") == tname.size() - 6;
quantize &= (ggml_n_dims(tensor) >= 2);
quantize &= tname.find("_norm.weight") == std::string::npos;
quantize &= tname.find("ffn_gate_inp.weight") == std::string::npos;
if (!quantize) { continue; }
ggml_type new_type = default_type;
if (!params->pure) {
new_type = llama_tensor_get_type_impl(qs, new_type, tensor, ftype, tensor_get_category(tname));
}
if (params->token_embedding_type < GGML_TYPE_COUNT &&
(tname == "token_embd.weight" || tname == "per_layer_token_embd.weight")) {
new_type = params->token_embedding_type;
}
if (params->output_tensor_type < GGML_TYPE_COUNT && tname == "output.weight") {
new_type = params->output_tensor_type;
}
if (new_type != GGML_TYPE_Q4_DPT) { continue; }
// Load tensor data
const size_t tsz = ggml_nbytes(tensor);
if (!ml.use_mmap) {
if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); }
tensor->data = p1_read_data.data();
}
ml.load_data_for(tensor);
// Dequantize to f32 if needed
const int64_t nelements = ggml_nelements(tensor);
float * f32_data;
if (tensor->type == GGML_TYPE_F32) {
f32_data = (float *) tensor->data;
} else {
llama_tensor_dequantize_impl(tensor, p1_f32_buf, p1_workers, nelements, nthread);
f32_data = (float *) p1_f32_buf.data();
}
// Resolve imatrix
const float * imatrix = nullptr;
if (imatrix_data) {
auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped));
if (it2 != imatrix_data->end() &&
it2->second.size() == (size_t)tensor->ne[0] * tensor->ne[2]) {
imatrix = it2->second.data();
}
}
const int64_t n_per_row = tensor->ne[0];
const int64_t nrows = tensor->ne[1];
LLAMA_LOG_INFO("%s: Q4_DPT levels for [%zu/%zu] %s\n", __func__, ti+1, tensors.size(), tensor->name);
q4dpt_train_levels(f32_data, nrows, n_per_row, imatrix,
q4dpt_all_levels.data() + ti * Q4DPT_N_LEVELS);
}
// Store in GGUF metadata before the file is opened
for (auto & ctx : ctx_outs) {
if (ctx) {
gguf_set_arr_data(ctx.get(), "q4_dpt.levels", GGUF_TYPE_INT8,
q4dpt_all_levels.data(), q4dpt_all_levels.size());
}
}
LLAMA_LOG_INFO("%s: Q4_DPT pass 1 complete.\n", __func__);
}
// Q2_KPT two-pass approach: train all per-block levels BEFORE opening the output
// file, so the levels KV entry is already populated at the time of the metadata placeholder.
// Per-block levels: 4 floats per 256-element block.
struct q2kpt_tensor_levels {
std::string name;
std::vector<float> levels; // nrows * (n_per_row / QK_K) * Q2KPT_N_LEVELS floats
};
std::vector<q2kpt_tensor_levels> q2kpt_all_levels;
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_KPT && !params->dry_run) {
LLAMA_LOG_INFO("%s: Q2_KPT pass 1: training per-block levels...\n", __func__);
std::vector<no_init<uint8_t>> p1_read_data;
std::vector<no_init<float>> p1_f32_buf;
std::vector<std::thread> p1_workers;
p1_workers.reserve(nthread);
for (size_t ti = 0; ti < tensors.size(); ++ti) {
ggml_tensor * tensor = tensors[ti]->tensor;
const std::string tname = ggml_get_name(tensor);
// Determine whether this tensor will be Q2_KPT (mirror the pass-2 logic)
bool quantize = tname.rfind("weight") == tname.size() - 6;
quantize &= (ggml_n_dims(tensor) >= 2);
quantize &= tname.find("_norm.weight") == std::string::npos;
quantize &= tname.find("ffn_gate_inp.weight") == std::string::npos;
if (!quantize) { continue; }
ggml_type new_type = default_type;
if (!params->pure) {
new_type = llama_tensor_get_type_impl(qs, new_type, tensor, ftype, tensor_get_category(tname));
}
if (params->token_embedding_type < GGML_TYPE_COUNT &&
(tname == "token_embd.weight" || tname == "per_layer_token_embd.weight")) {
new_type = params->token_embedding_type;
}
if (params->output_tensor_type < GGML_TYPE_COUNT && tname == "output.weight") {
new_type = params->output_tensor_type;
}
if (new_type != GGML_TYPE_Q2_KPT) { continue; }
// Load tensor data
const size_t tsz = ggml_nbytes(tensor);
if (!ml.use_mmap) {
if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); }
tensor->data = p1_read_data.data();
}
ml.load_data_for(tensor);
// Dequantize to f32 if needed
const int64_t nelements = ggml_nelements(tensor);
float * f32_data;
if (tensor->type == GGML_TYPE_F32) {
f32_data = (float *) tensor->data;
} else {
llama_tensor_dequantize_impl(tensor, p1_f32_buf, p1_workers, nelements, nthread);
f32_data = (float *) p1_f32_buf.data();
}
// Resolve imatrix
const float * imatrix = nullptr;
if (imatrix_data) {
auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped));
if (it2 != imatrix_data->end() &&
it2->second.size() == (size_t)tensor->ne[0] * tensor->ne[2]) {
imatrix = it2->second.data();
}
}
const int64_t n_per_row = tensor->ne[0];
const int64_t nrows = tensor->ne[1];
// Allocate levels buffer for this tensor
const int nb = n_per_row / QK_K;
const size_t n_levels = (size_t)nrows * tensor->ne[2] * nb * Q2KPT_N_LEVELS;
q2kpt_all_levels.push_back({tname, std::vector<float>(n_levels)});
LLAMA_LOG_INFO("%s: Q2_KPT levels for [%zu/%zu] %s (%zu floats)\n",
__func__, ti+1, tensors.size(), tensor->name, n_levels);
// Train levels by running quantization internally
// We need to quantize to f32 -> Q2_KPT -> f32 to get the trained levels
std::vector<no_init<uint8_t>> p1_qbuf(ggml_nbytes(tensor));
const size_t row_size = ggml_row_size(GGML_TYPE_Q2_KPT, n_per_row);
// Prepare levels buffer for this tensor
q2kpt_free_levels();
q2kpt_prepare_levels(nrows * tensor->ne[2], n_per_row);
// Quantize each expert slice
const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
const float * f32_data_03 = f32_data + i03 * nelements_matrix;
void * q_data_03 = (char *)p1_qbuf.data() + row_size * i03 * nrows;
const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
// start_row must be the absolute row index for correct levels indexing
ggml_quantize_chunk(GGML_TYPE_Q2_KPT, f32_data_03, q_data_03, i03 * nrows, nrows, n_per_row, imatrix_03);
}
// Copy trained levels to our storage
const float * trained_levels = q2kpt_get_levels();
if (trained_levels) {
memcpy(q2kpt_all_levels.back().levels.data(), trained_levels, n_levels * sizeof(float));
}
}
// Store all levels in GGUF metadata before the file is opened
for (const auto & tl : q2kpt_all_levels) {
for (auto & ctx : ctx_outs) {
if (ctx) {
const std::string key = tl.name + ".q2kpt_levels";
gguf_set_arr_data(ctx.get(), key.c_str(), GGUF_TYPE_FLOAT32,
tl.levels.data(), tl.levels.size());
}
}
}
LLAMA_LOG_INFO("%s: Q2_KPT pass 1 complete.\n", __func__);
}
// IQ2_TQ: train per-tensor grid in pass 1
struct iq2tq_meta {
std::string tensor_name;
int8_t grid[64];
};
std::vector<iq2tq_meta> iq2tq_all_meta;
if (params->ftype == LLAMA_FTYPE_MOSTLY_IQ2_TQ) {
const int64_t t_start_p1 = ggml_time_us();
LLAMA_LOG_INFO("%s: IQ2_TQ pass 1: training per-tensor grids...\n", __func__);
std::vector<no_init<uint8_t>> p1_read_data;
std::vector<no_init<float>> p1_f32_buf;
std::vector<std::thread> p1_workers;
p1_workers.reserve(nthread);
for (size_t ti = 0; ti < tensors.size(); ++ti) {
ggml_tensor * tensor = tensors[ti]->tensor;
const std::string tname = ggml_get_name(tensor);
// Mirror pass-2 logic: only quantize 2D+ weight tensors
bool quantize = tname.rfind("weight") == tname.size() - 6;
quantize &= (ggml_n_dims(tensor) >= 2);
quantize &= tname.find("_norm.weight") == std::string::npos;
quantize &= tname.find("ffn_gate_inp.weight") == std::string::npos;
if (!quantize) { continue; }
ggml_type new_type = default_type;
if (!params->pure) {
new_type = llama_tensor_get_type_impl(qs, new_type, tensor, ftype, tensor_get_category(tname));
}
if (params->token_embedding_type < GGML_TYPE_COUNT &&
(tname == "token_embd.weight" || tname == "per_layer_token_embd.weight")) {
new_type = params->token_embedding_type;
}
if (params->output_tensor_type < GGML_TYPE_COUNT && tname == "output.weight") {
new_type = params->output_tensor_type;
}
if (new_type != GGML_TYPE_IQ2_TQ) { continue; }
// Load tensor data
const size_t tsz = ggml_nbytes(tensor);
if (!ml.use_mmap) {
if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); }
tensor->data = p1_read_data.data();
}
ml.load_data_for(tensor);
// Dequantize to f32 if needed
const int64_t nelements = ggml_nelements(tensor);
float * f32_data;
if (tensor->type == GGML_TYPE_F32) {
f32_data = (float *) tensor->data;
} else {
llama_tensor_dequantize_impl(tensor, p1_f32_buf, p1_workers, nelements, nthread);
f32_data = (float *) p1_f32_buf.data();
}
// Resolve imatrix
const float * imatrix = nullptr;
if (imatrix_data) {
auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped));
if (it2 != imatrix_data->end() &&
it2->second.size() == (size_t)tensor->ne[0] * tensor->ne[2]) {
imatrix = it2->second.data();
}
}
const int64_t n_per_row = tensor->ne[0];
const int64_t nrows = tensor->ne[1];
LLAMA_LOG_INFO("%s: IQ2_TQ grid for [%zu/%zu] %s\n", __func__, ti+1, tensors.size(), tensor->name);
iq2tq_meta meta;
meta.tensor_name = tname;
iq2tq_train_grid(f32_data, nrows, n_per_row, imatrix, meta.grid);
iq2tq_all_meta.push_back(meta);
// Save to GGUF
std::string grid_key = "iq2tq.grid." + tname;
gguf_set_arr_data(ctx_outs[0].get(), grid_key.c_str(), GGUF_TYPE_INT8, meta.grid, 64);
}
const int64_t t_end_p1 = ggml_time_us();
LLAMA_LOG_INFO("%s: IQ2_TQ pass 1 complete (%zu tensors trained, %.1f s).\n",
__func__, iq2tq_all_meta.size(), (t_end_p1 - t_start_p1) / 1e6);
}
// IQ3_TQ: train per-tensor grid in pass 1 (16 entries × 8 levels = 128 bytes)
struct iq3tq_meta {
std::string tensor_name;
int8_t grid[128];
};
std::vector<iq3tq_meta> iq3tq_all_meta;
if (params->ftype == LLAMA_FTYPE_MOSTLY_IQ3_TQ) {
const int64_t t_start_p1 = ggml_time_us();
LLAMA_LOG_INFO("%s: IQ3_TQ pass 1: training per-tensor grids...\n", __func__);
std::vector<no_init<uint8_t>> p1_read_data;
std::vector<no_init<float>> p1_f32_buf;
std::vector<std::thread> p1_workers;
p1_workers.reserve(nthread);
for (size_t ti = 0; ti < tensors.size(); ++ti) {
ggml_tensor * tensor = tensors[ti]->tensor;
const std::string tname = ggml_get_name(tensor);
bool quantize = tname.rfind("weight") == tname.size() - 6;
quantize &= (ggml_n_dims(tensor) >= 2);
quantize &= tname.find("_norm.weight") == std::string::npos;
quantize &= tname.find("ffn_gate_inp.weight") == std::string::npos;
if (!quantize) { continue; }
ggml_type new_type = default_type;
if (!params->pure) {
new_type = llama_tensor_get_type_impl(qs, new_type, tensor, ftype, tensor_get_category(tname));
}
if (params->token_embedding_type < GGML_TYPE_COUNT &&
(tname == "token_embd.weight" || tname == "per_layer_token_embd.weight")) {
new_type = params->token_embedding_type;
}
if (params->output_tensor_type < GGML_TYPE_COUNT && tname == "output.weight") {
new_type = params->output_tensor_type;
}
if (new_type != GGML_TYPE_IQ3_TQ) { continue; }
const size_t tsz = ggml_nbytes(tensor);
if (!ml.use_mmap) {
if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); }
tensor->data = p1_read_data.data();
}
ml.load_data_for(tensor);
const int64_t nelements = ggml_nelements(tensor);
float * f32_data;
if (tensor->type == GGML_TYPE_F32) {
f32_data = (float *) tensor->data;
} else {
llama_tensor_dequantize_impl(tensor, p1_f32_buf, p1_workers, nelements, nthread);
f32_data = (float *) p1_f32_buf.data();
}
const float * imatrix = nullptr;
if (imatrix_data) {
auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped));
if (it2 != imatrix_data->end() &&
it2->second.size() == (size_t)tensor->ne[0] * tensor->ne[2]) {
imatrix = it2->second.data();
}
}
const int64_t n_per_row = tensor->ne[0];
const int64_t nrows = tensor->ne[1];
LLAMA_LOG_INFO("%s: IQ3_TQ grid for [%zu/%zu] %s\n", __func__, ti+1, tensors.size(), tensor->name);
iq3tq_meta meta;
meta.tensor_name = tname;
iq3tq_train_grid(f32_data, nrows, n_per_row, imatrix, meta.grid);
iq3tq_all_meta.push_back(meta);
std::string grid_key = "iq3tq.grid." + tname;
gguf_set_arr_data(ctx_outs[0].get(), grid_key.c_str(), GGUF_TYPE_INT8, meta.grid, 128);
}
const int64_t t_end_p1 = ggml_time_us();
LLAMA_LOG_INFO("%s: IQ3_TQ pass 1 complete (%zu tensors trained, %.1f s).\n",
__func__, iq3tq_all_meta.size(), (t_end_p1 - t_start_p1) / 1e6);
}
// IQ1_BN: train per-tensor codebook in pass 1 (4096 × 8D centroids = 32768 bytes)
struct iq1bn_meta {
std::string tensor_name;
int8_t aux[32768];
};
std::vector<iq1bn_meta> iq1bn_all_meta;
if (params->ftype == LLAMA_FTYPE_MOSTLY_IQ1_BN) {
const int64_t t_start_p1 = ggml_time_us();
LLAMA_LOG_INFO("%s: IQ1_BN pass 1: training per-tensor codebooks...\n", __func__);
std::vector<no_init<uint8_t>> p1_read_data;
std::vector<no_init<float>> p1_f32_buf;
std::vector<std::thread> p1_workers;
p1_workers.reserve(nthread);
for (size_t ti = 0; ti < tensors.size(); ++ti) {
ggml_tensor * tensor = tensors[ti]->tensor;
const std::string tname = ggml_get_name(tensor);
bool quantize = tname.rfind("weight") == tname.size() - 6;
quantize &= (ggml_n_dims(tensor) >= 2);
quantize &= tname.find("_norm.weight") == std::string::npos;
quantize &= tname.find("ffn_gate_inp.weight") == std::string::npos;
if (!quantize) { continue; }
ggml_type new_type = default_type;
if (!params->pure) {
new_type = llama_tensor_get_type_impl(qs, new_type, tensor, ftype, tensor_get_category(tname));
}
if (params->token_embedding_type < GGML_TYPE_COUNT &&
(tname == "token_embd.weight" || tname == "per_layer_token_embd.weight")) {
new_type = params->token_embedding_type;
}
if (params->output_tensor_type < GGML_TYPE_COUNT && tname == "output.weight") {
new_type = params->output_tensor_type;
}
if (new_type != GGML_TYPE_IQ1_BN) { continue; }
const size_t tsz = ggml_nbytes(tensor);
if (!ml.use_mmap) {
if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); }
tensor->data = p1_read_data.data();
}
ml.load_data_for(tensor);
const int64_t nelements = ggml_nelements(tensor);
float * f32_data;
if (tensor->type == GGML_TYPE_F32) {
f32_data = (float *) tensor->data;
} else {
llama_tensor_dequantize_impl(tensor, p1_f32_buf, p1_workers, nelements, nthread);
f32_data = (float *) p1_f32_buf.data();
}
const float * imatrix = nullptr;
if (imatrix_data) {
auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped));
if (it2 != imatrix_data->end() &&
it2->second.size() == (size_t)tensor->ne[0] * tensor->ne[2]) {
imatrix = it2->second.data();
}
}
const int64_t n_per_row = tensor->ne[0];
const int64_t nrows = tensor->ne[1];
LLAMA_LOG_INFO("%s: IQ1_BN codebook for [%zu/%zu] %s\n", __func__, ti+1, tensors.size(), tensor->name);
iq1bn_meta meta;
meta.tensor_name = tname;
iq1bn_train_codebook(f32_data, nrows, n_per_row, imatrix, meta.aux, nthread);
iq1bn_all_meta.push_back(meta);
std::string aux_key = "iq1bn.aux." + tname;
gguf_set_arr_data(ctx_outs[0].get(), aux_key.c_str(), GGUF_TYPE_INT8, meta.aux, 32768);
}
const int64_t t_end_p1 = ggml_time_us();
LLAMA_LOG_INFO("%s: IQ1_BN pass 1 complete (%zu tensors trained, %.1f s).\n",
__func__, iq1bn_all_meta.size(), (t_end_p1 - t_start_p1) / 1e6);
}
// no output file for --dry-run
if (!params->dry_run) {
new_ofstream(0);
@ -1106,6 +1838,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
//
// main loop: iterate over all weights
//
size_t tensor_pass2_idx = 0; // index into tensors[], used for Q3_PT levels lookup
for (size_t i = 0; i < tensors.size(); ++i) {
const auto & weight = *tensors[i];
@ -1232,6 +1965,75 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
// Q3_PT: set the per-tensor levels (trained in pass 1) as global for quantization
if (new_type == GGML_TYPE_Q3_PT) {
q3pt_set_levels(q3pt_all_levels.data() + tensor_pass2_idx * Q3PT_N_LEVELS);
}
// Q3_KPT: set the per-tensor levels (trained in pass 1) as global for quantization
if (new_type == GGML_TYPE_Q3_KPT) {
q3kpt_set_levels(q3kpt_all_levels.data() + tensor_pass2_idx * Q3KPT_N_LEVELS);
}
// Q4_DPT: set the per-tensor levels (trained in pass 1) as global for quantization
if (new_type == GGML_TYPE_Q4_DPT) {
q4dpt_set_levels(q4dpt_all_levels.data() + tensor_pass2_idx * Q4DPT_N_LEVELS);
}
// IQ2_TQ: set per-tensor trained grid
if (new_type == GGML_TYPE_IQ2_TQ) {
bool found = false;
for (const auto & meta : iq2tq_all_meta) {
if (meta.tensor_name == tm.name) {
iq2tq_set_grid(meta.grid);
found = true;
break;
}
}
if (!found) {
LLAMA_LOG_WARN("%s: WARNING: no trained grid for IQ2_TQ tensor %s\n", __func__, tm.name.c_str());
}
}
// IQ3_TQ: set per-tensor trained grid
if (new_type == GGML_TYPE_IQ3_TQ) {
bool found = false;
for (const auto & meta : iq3tq_all_meta) {
if (meta.tensor_name == tm.name) {
iq3tq_set_grid(meta.grid);
found = true;
break;
}
}
if (!found) {
LLAMA_LOG_WARN("%s: WARNING: no trained grid for IQ3_TQ tensor %s\n", __func__, tm.name.c_str());
}
}
// IQ1_BN: set per-tensor trained codebook
if (new_type == GGML_TYPE_IQ1_BN) {
bool found = false;
for (const auto & meta : iq1bn_all_meta) {
if (meta.tensor_name == tm.name) {
iq1bn_set_aux(meta.aux);
found = true;
break;
}
}
if (!found) {
LLAMA_LOG_WARN("%s: WARNING: no trained codebook for IQ1_BN tensor %s\n", __func__, tm.name.c_str());
}
}
// Q2_KPT: quantize_q2_kpt trains per-block levels internally.
// Levels were already trained and saved to GGUF in pass 1.
// We still need to allocate the levels buffer for quantization to work correctly.
if (new_type == GGML_TYPE_Q2_KPT) {
const int64_t total_rows = nrows * tensor->ne[2];
q2kpt_free_levels(); // Clear any stale levels from previous tensor
q2kpt_prepare_levels(total_rows, n_per_row); // Allocate for this tensor
}
// quantize each expert separately since they have different importance matrices
new_size = 0;
for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
@ -1255,7 +2057,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
fout.write((const char *) new_data, new_size);
zeros(fout, GGML_PAD(new_size, align) - new_size);
} // no --dry-run
} // main loop
tensor_pass2_idx++;
} // iterate over tensors
if (!params->dry_run) {
close_ofstream();

View File

@ -257,6 +257,9 @@ if (NOT GGML_BACKEND_DL)
llama_build_and_test(test-rope.cpp)
endif()
# Quantization laboratory - tests for 2.5 BPW proposals
llama_build_and_test(test-quant-laboratory.cpp)
# libmtmd
set(LLAMA_TEST_NAME test-mtmd-c-api)
llama_build_and_test(test-mtmd-c-api.c)

View File

@ -261,7 +261,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
} else if (t->type == GGML_TYPE_I8) {
tv.push_back((float)*(int8_t *) &buf[i]);
} else if (quantized) {
tt->to_float(&buf[i], vq.data(), bs);
tt->to_float(&buf[i], vq.data(), bs, nullptr);
tv.insert(tv.end(), vq.begin(), vq.end());
} else {
GGML_ABORT("fatal error");

View File

@ -0,0 +1,355 @@
// test-quant-laboratory.cpp
// Reusable testing harness for quantization experiments.
//
// Provides:
// - Synthetic data generators (Gaussian, Laplace, uniform)
// - Real tensor data loading (f32bin format with [nrow, ncol] header)
// - Importance matrix loading (flat f32 array)
// - RMSE computation
// - Multi-approach comparison framework (quantize → dequantize → matmul error)
// - ggml graph-level verification skeleton
//
// To add a new experiment:
// 1. Add an approach function: void approach_xxx(const float *W, float *out,
// int64_t nrow, int64_t ncol,
// const float *imatrix)
// 2. Register it in compare_approaches()
// 3. Call test_approach_comparison() from main()
#include "../ggml/src/ggml-quants.h"
#include "ggml-backend.h"
#include "ggml-alloc.h"
#include "ggml.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <functional>
#include <random>
#include <string>
#include <vector>
// ============================================================================
// Helper functions
// ============================================================================
// Root-mean-square error between two float arrays of length n.
// Accumulates in double to limit rounding error.
// Returns 0 for empty input; the previous code divided by zero and
// returned NaN when n == 0.
static float rmse(const float * a, const float * b, size_t n) {
    if (n == 0) {
        return 0.0f;
    }
    double sum = 0.0;
    for (size_t i = 0; i < n; ++i) {
        const double d = (double) a[i] - (double) b[i];
        sum += d * d;
    }
    return (float) sqrt(sum / (double) n);
}
// Fill `data` with n samples drawn from N(0, sigma^2) using `gen`.
static void fill_gaussian(float * data, size_t n, std::mt19937 & gen, float sigma = 1.0f) {
    std::normal_distribution<float> dist(0.0f, sigma);
    std::generate_n(data, n, [&] { return dist(gen); });
}
// Fill `data` with n samples from a Laplace(0, b) distribution via
// inverse-CDF sampling of a uniform variate on (-0.5, 0.5).
static void fill_laplace(float * data, size_t n, std::mt19937 & gen, float b = 1.0f) {
    std::uniform_real_distribution<float> u(-0.5f, 0.5f);
    for (size_t i = 0; i < n; ++i) {
        const float v = u(gen);
        const float sign = (v > 0) ? 1.0f : ((v < 0) ? -1.0f : 0.0f);
        data[i] = -b * sign * logf(1.0f - 2.0f * fabsf(v));
    }
}
// Fill `data` with n samples drawn uniformly from [-range, range).
static void fill_uniform(float * data, size_t n, std::mt19937 & gen, float range = 1.0f) {
    std::uniform_real_distribution<float> dist(-range, range);
    std::generate_n(data, n, [&] { return dist(gen); });
}
// Fill `data` with n samples from N(offset, sigma^2) — a Gaussian whose
// mean is shifted away from zero.
static void fill_offset_gaussian(float * data, size_t n, std::mt19937 & gen, float sigma = 1.0f, float offset = 2.0f) {
    std::normal_distribution<float> dist(offset, sigma);
    std::generate_n(data, n, [&] { return dist(gen); });
}
// ============================================================================
// Data loading
// ============================================================================
// Load an f32bin tensor: a file starting with an [nrow, n_per_row] int64
// header followed by nrow * n_per_row raw float32 values.
// On success fills `data`, `nrow`, `n_per_row` and returns true.
// Returns false on: missing file, short header, corrupt header (negative
// dimensions or a product that would overflow int64 — previously such
// headers were passed straight to resize()), or a short data read.
static bool load_f32_tensor(const char * path, std::vector<float> & data, int64_t & nrow, int64_t & n_per_row) {
    FILE * f = fopen(path, "rb");
    if (!f) {
        return false;
    }
    int64_t header[2];
    if (fread(header, sizeof(int64_t), 2, f) != 2) {
        fclose(f);
        return false;
    }
    nrow      = header[0];
    n_per_row = header[1];
    // Validate the header before sizing the buffer: negative dimensions or an
    // overflowing product must not reach data.resize().
    if (nrow < 0 || n_per_row < 0 ||
        (n_per_row > 0 && nrow > INT64_MAX / n_per_row)) {
        fclose(f);
        return false;
    }
    const int64_t total = nrow * n_per_row;
    data.resize((size_t) total);
    const size_t nread = fread(data.data(), sizeof(float), (size_t) total, f);
    fclose(f);
    return (int64_t) nread == total;
}
// Load imatrix file (flat f32 array, no header, one importance value per column dimension)
// The imatrix is the sum-of-squares of activations per dimension.
// Load an imatrix file: a flat f32 array (no header), one importance value
// per column dimension; values are the sum-of-squares of activations.
// The dimension count is inferred from the file size.
// Returns false on: missing file, empty file or ftell() failure (previously
// an empty file fell through and read data[0] on an empty vector — undefined
// behavior), a dimension mismatch when expected_dims > 0, or a short read.
// On success prints min/max/mean stats and returns true.
static bool load_imatrix(const char * path, std::vector<float> & data, int64_t expected_dims) {
    FILE * f = fopen(path, "rb");
    if (!f) {
        return false;
    }
    // Get file size to determine dimensions
    fseek(f, 0, SEEK_END);
    long file_size = ftell(f);
    fseek(f, 0, SEEK_SET);
    if (file_size <= 0) {
        // empty file or ftell() error: there is nothing valid to load
        fclose(f);
        return false;
    }
    int64_t dims = file_size / (int64_t) sizeof(float);
    if (expected_dims > 0 && dims != expected_dims) {
        printf(" WARN: imatrix dims %lld != expected %lld\n", (long long) dims, (long long) expected_dims);
        fclose(f);
        return false;
    }
    data.resize((size_t) dims);
    size_t nread = fread(data.data(), sizeof(float), (size_t) dims, f);
    fclose(f);
    if ((int64_t) nread != dims) {
        return false;
    }
    // Compute stats — dims >= 1 is guaranteed above, so data[0] is valid
    float imin = data[0], imax = data[0], isum = 0;
    for (int64_t i = 0; i < dims; i++) {
        if (data[i] < imin) imin = data[i];
        if (data[i] > imax) imax = data[i];
        isum += data[i];
    }
    printf(" Loaded imatrix: %lld dims, min=%.6f, max=%.6f, mean=%.6f\n",
           (long long) dims, imin, imax, isum / dims);
    return true;
}
// ============================================================================
// Test class
// ============================================================================
class QuantLaboratory {
public:
// Fixed seed (42) so synthetic data — and hence all results — are
// reproducible across runs.
QuantLaboratory() : gen(42) {}
// ========================================================================
// MULTI-APPROACH COMPARISON FRAMEWORK
//
// Each "approach" is a function that takes float weights and produces
// dequantized float output. The framework computes:
// - Weight RMSE (dequant vs original)
// - Matmul error (dequant weights x real activations vs f64 reference)
// - Ratio vs first approach (typically Q2_K baseline)
//
// To add a new approach:
// 1. Write: void approach_xxx(const float *W, float *out,
// int64_t nrow, int64_t ncol,
// const float *imatrix) { ... }
// 2. Add it to the `approaches` array in compare_approaches()
// ========================================================================
// -- Example approach: Q2_K baseline (via ggml library) --
// Uncomment and adapt for your experiment:
//
// void approach_q2k(const float * W, float * out, int64_t nrow, int64_t ncol, const float * imatrix) {
// size_t rs = ggml_row_size(GGML_TYPE_Q2_K, ncol);
// std::vector<uint8_t> buf(nrow * rs);
// quantize_q2_K(W, buf.data(), nrow, ncol, imatrix);
// auto * tr = ggml_get_type_traits(GGML_TYPE_Q2_K);
// for (int64_t r = 0; r < nrow; r++) {
// tr->to_float(buf.data() + r * rs, out + r * ncol, ncol, NULL);
// }
// }
// Compare every registered quantization approach on one weight/activation
// pair and print a summary table to stdout.
//
// For each approach, the framework quantizes+dequantizes the first
// min(w_nrow, 256) rows of W and reports:
//   - BPW       : bits per weight claimed by the approach
//   - RMSE      : weight reconstruction error (dequantized vs original)
//   - MatmulErr : RMSE of (A x dequantized-W) against a double-precision
//                 reference product
//   - ratio     : MatmulErr relative to the first registered approach
//                 (intended to be the Q2_K baseline)
//
// W       : row-major weights, w_nrow x w_ncol
// A       : row-major activations, a_nrow x a_ncol (a_ncol must equal w_ncol)
// name    : label printed in the table header
// imatrix : optional per-column importance values forwarded to each
//           approach; may be null
//
// Silently returns when the shapes are incompatible.
void compare_approaches(const float * W,
                        int64_t w_nrow,
                        int64_t w_ncol,
                        const float * A,
                        int64_t a_nrow,
                        int64_t a_ncol,
                        const char * name,
                        const float * imatrix) {
    if (w_ncol != a_ncol) {
        return;
    }
    // Cap the number of weight rows so the O(a_nrow * nr * nc) reference
    // matmul stays cheap.
    int64_t nr = std::min(w_nrow, (int64_t) 256);
    int64_t nc = w_ncol;
    // Reference matmul (double precision)
    std::vector<double> ref(a_nrow * nr);
    for (int64_t t = 0; t < a_nrow; t++) {
        for (int64_t r = 0; r < nr; r++) {
            double s = 0;
            for (int64_t c = 0; c < nc; c++) {
                s += (double) A[t * a_ncol + c] * (double) W[r * nc + c];
            }
            ref[t * nr + r] = s;
        }
    }
    // RMS magnitude of the reference output; currently unused but kept for
    // future relative-error reporting.
    double ref_mag2 = 0;
    for (auto v : ref) {
        ref_mag2 += v * v;
    }
    float ref_rms = (float) sqrt(ref_mag2 / (a_nrow * nr));
    (void) ref_rms;
    // One candidate quantization scheme: display name, bits-per-weight, and
    // a callable that dequantizes W into a caller-provided float buffer.
    struct Approach {
        const char * name;
        float bpw;
        std::function<void(const float *, float *, int64_t, int64_t, const float *)> fn;
    };
    // ── Register approaches here ──
    Approach approaches[] = {
        // { "Q2_K (baseline)", 2.625f,
        //   [&](auto * W, auto * o, auto nr, auto nc, auto * im) {
        //       approach_q2k(W, o, nr, nc, im);
        //   } },
        // Add more approaches...
        { "placeholder", 0.0f, nullptr }, // remove once real approaches added
    };
    printf("\n %-28s %5s %10s %10s %7s\n", name, "BPW", "RMSE", "MatmulErr", "vs Q2K");
    printf(" %-28s %5s %10s %10s %7s\n", "---", "---", "---", "---", "---");
    // MatmulErr of the first evaluated approach; later rows are reported
    // relative to it.
    float baseline_matmul_err = 0;
    for (auto & ap : approaches) {
        if (!ap.fn) {
            continue;   // skip unregistered placeholder entries
        }
        std::vector<float> dec(nr * nc);
        ap.fn(W, dec.data(), nr, nc, imatrix);
        // Weight RMSE
        double werr2 = 0;
        for (int64_t i = 0; i < nr * nc; i++) {
            double d = W[i] - dec[i];
            werr2 += d * d;
        }
        float wrmse = (float) sqrt(werr2 / (nr * nc));
        // Matmul error
        double merr2 = 0;
        for (int64_t t = 0; t < a_nrow; t++) {
            for (int64_t r = 0; r < nr; r++) {
                double s = 0;
                for (int64_t c = 0; c < nc; c++) {
                    s += (double) A[t * a_ncol + c] * (double) dec[r * nc + c];
                }
                double d = s - ref[t * nr + r];
                merr2 += d * d;
            }
        }
        float matmul_rmse = (float) sqrt(merr2 / (a_nrow * nr));
        if (baseline_matmul_err == 0) {
            baseline_matmul_err = matmul_rmse;
        }
        float ratio = (baseline_matmul_err > 1e-10f) ? matmul_rmse / baseline_matmul_err : 0;
        printf(" %-28s %5.3f %10.6f %10.6f %6.3fx\n", ap.name, ap.bpw, wrmse, matmul_rmse, ratio);
    }
}
// Run comparison on all tensor pairs from data directory
// Run the multi-approach comparison on every weight/activation pair that can
// be loaded from the given data directory; pairs with missing files are
// skipped silently, a missing imatrix only downgrades to uniform weighting.
// Always returns 0 (the comparison itself is informational, not pass/fail).
int test_approach_comparison(const char * data_dir) {
    printf("\n");
    printf("=======================================================================\n");
    printf(" MULTI-APPROACH COMPARISON (real weights x real activations)\n");
    printf("=======================================================================\n");
    // Each entry: weight tensor file, activation capture file, optional
    // importance-matrix file, and a human-readable label for the report.
    struct TestPair {
        const char * wf;
        const char * af;
        const char * imf;
        const char * name;
    };
    static const TestPair pairs[] = {
        { "blk_0_ffn_gate_weight.f32bin", "act_blk0_ffn_input.f32bin", "imatrix_blk0_ffn_gate_up.f32bin", "ffn_gate" },
        { "blk_0_ffn_up_weight.f32bin", "act_blk0_ffn_input.f32bin", "imatrix_blk0_ffn_gate_up.f32bin", "ffn_up" },
        { "blk_0_ffn_down_weight.f32bin", "act_blk0_ffn_down_input.f32bin", "imatrix_blk0_ffn_down.f32bin", "ffn_down" },
        { "blk_0_attn_q_weight.f32bin", "act_blk0_attn_input.f32bin", "imatrix_blk0_attn_qkv.f32bin", "attn_q" },
    };
    for (const auto & tp : pairs) {
        char w_path[512], a_path[512], im_path[512];
        snprintf(w_path, sizeof(w_path), "%s/%s", data_dir, tp.wf);
        snprintf(a_path, sizeof(a_path), "%s/%s", data_dir, tp.af);
        snprintf(im_path, sizeof(im_path), "%s/%s", data_dir, tp.imf);
        std::vector<float> weights, acts, imat;
        int64_t w_rows, w_cols, a_rows, a_cols;
        // Short-circuit: do not try the activations if the weights are absent.
        const bool loaded = load_f32_tensor(w_path, weights, w_rows, w_cols) &&
                            load_f32_tensor(a_path, acts, a_rows, a_cols);
        if (!loaded) {
            continue;
        }
        const float * im_ptr = nullptr;
        if (load_imatrix(im_path, imat, w_cols)) {
            im_ptr = imat.data();
        } else {
            printf(" [%s] No imatrix found, using uniform weights\n", tp.name);
        }
        compare_approaches(weights.data(), w_rows, w_cols, acts.data(), a_rows, a_cols, tp.name, im_ptr);
    }
    printf("\n");
    return 0;
}
private:
std::mt19937 gen;
};
// ============================================================================
// Main
// ============================================================================
// Entry point: runs the real-data comparison suite when capture data is
// present (directory from argv[1], default "data"), otherwise prints the
// instructions for generating it. Exit status is non-zero on any failure.
int main(int argc, char ** argv) {
    ggml_backend_load_all();
    QuantLaboratory lab;
    int total_fail = 0;
    printf("Quantization Laboratory\n");
    printf("=======================\n");
    // Real data tests (from data/ directory)
    {
        const char * data_dir = (argc > 1) ? argv[1] : "data";
        // Probe for one known capture file to decide whether data exists.
        char probe[512];
        snprintf(probe, sizeof(probe), "%s/blk_0_ffn_gate_weight.f32bin", data_dir);
        if (FILE * fp = fopen(probe, "rb")) {
            fclose(fp);
            total_fail += lab.test_approach_comparison(data_dir);
        } else {
            printf("\n=== Real Data Tests SKIPPED ===\n");
            printf(" No data found at %s\n", data_dir);
            printf(
                " Run: cd data && PYTHONPATH=../gguf-py python3 ../scripts/extract-tensor-data.py MODEL.gguf "
                "blk.0.ffn_gate blk.0.ffn_up blk.0.ffn_down blk.0.attn_q\n");
            printf(" And: llama-capture-layer-data -m MODEL.gguf -l 0 -o data\n");
        }
    }
    printf("\n\n=== Testing Complete: %d failures ===\n", total_fail);
    return total_fail > 0 ? 1 : 0;
}

View File

@ -54,7 +54,7 @@ static float total_quantization_error(const ggml_type_traits * qfns, const ggml_
std::vector<float> tmp_out(test_size);
qfns_cpu->from_float(test_data, tmp_q.data(), test_size);
qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
qfns->to_float(tmp_q.data(), tmp_out.data(), test_size, nullptr);
return array_rmse(test_data, tmp_out.data(), test_size);
}
@ -66,10 +66,10 @@ static float reference_quantization_error(const ggml_type_traits * qfns, const g
// FIXME: why is done twice?
qfns_cpu->from_float(test_data, tmp_q.data(), test_size);
qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
qfns->to_float(tmp_q.data(), tmp_out.data(), test_size, nullptr);
qfns->from_float_ref(test_data, tmp_q.data(), test_size);
qfns->to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
qfns->to_float(tmp_q.data(), tmp_out_ref.data(), test_size, nullptr);
return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
}
@ -95,7 +95,7 @@ static float dot_product_error(const ggml_type_traits * qfns, const ggml_type_tr
vdot->from_float(test_data2, tmp_q2.data(), test_size);
float result = INFINITY;
qfns_cpu->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
qfns_cpu->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1, nullptr);
const float dot_ref = dot_product(test_data1, test_data2, test_size);

View File

@ -309,7 +309,7 @@ int main(int argc, char * argv[]) {
for (size_t size : params.test_sizes) {
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
auto quantize_fn = [&](void) -> float {
qfns->to_float(test_q1, test_out, size);
qfns->to_float(test_q1, test_out, size, nullptr);
return test_out[0];
};
size_t quantized_size = ggml_row_size(type, size);
@ -341,7 +341,7 @@ int main(int argc, char * argv[]) {
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
auto quantize_fn = [&](void) -> float {
float result;
qfns_cpu->vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
qfns_cpu->vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1, nullptr);
return result;
};
size_t quantized_size = ggml_row_size(type, size);

View File

@ -158,7 +158,7 @@ static void test_roundtrip_on_chunk(
} else {
qfns_cpu.from_float(input_scratch, quantized_scratch, chunk_size);
}
qfns.to_float(quantized_scratch, output_scratch, chunk_size);
qfns.to_float(quantized_scratch, output_scratch, chunk_size, nullptr);
update_error_stats(chunk_size, input_scratch, output_scratch, stats);
}

View File

@ -38,5 +38,6 @@ else()
add_subdirectory(export-lora)
endif()
add_subdirectory(fit-params)
add_subdirectory(capture-layer-data)
add_subdirectory(results)
endif()

View File

@ -0,0 +1,9 @@
# Build llama-capture-layer-data: dumps intermediate activation tensors from a
# model run into .f32bin files for the quantization laboratory.
set(TARGET llama-capture-layer-data)
add_executable(${TARGET} capture-layer-data.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
# The tool includes headers from common/ directly (arg.h, common.h, log.h).
target_include_directories(${TARGET} PRIVATE ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_17)
# Install only when the project's tool installation switch is on.
if(LLAMA_TOOLS_INSTALL)
install(TARGETS ${TARGET} RUNTIME)
endif()

View File

@ -0,0 +1,251 @@
// capture-layer-data.cpp
// Captures intermediate activation tensors during model inference
// and saves them as .f32bin files for the quantization laboratory.
//
// Usage:
// llama-capture-layer-data -m MODEL_PATH -l LAYER [-p PROMPT] [-o OUTPUT_DIR]
//
// Example:
// llama-capture-layer-data -m /devel/models/Qwen_Qwen3-4B-Instruct-2507-bf16.gguf -l 0 -o data
#include "arg.h"
#include "common.h"
#include "ggml-backend.h"
#include "ggml.h"
#include "llama.h"
#include "log.h"
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <filesystem>
#include <fstream>
#include <string>
#include <vector>
// One entry of the capture table: a graph tensor-name prefix and the suffix
// used when naming the output file.
struct TensorMapping {
    const char * graph_name_prefix;
    const char * output_suffix;
};
// Tensors we capture, keyed by the prefix of their name in the compute graph.
static const TensorMapping mappings[] = {
    { "attn_norm", "attn_input" },
    { "kqv_out", "attn_output_input" },
    { "ffn_norm", "ffn_input" },
    { "ffn_swiglu", "ffn_down_input" },
};
static constexpr int N_MAPPINGS = sizeof(mappings) / sizeof(mappings[0]);
// Mutable state threaded through the eval callback: which layer to capture,
// where to write files, how many tensors were saved, and the name of the
// tensor flagged during the callback's "ask" phase.
struct CaptureState {
    int target_layer;
    std::string output_dir;
    int captured_count = 0;
    std::string pending_name;
    // Translate a graph tensor name into its capture filename, e.g.
    // "attn_norm-3" -> "act_blk3_attn_input.f32bin". Returns "" when the name
    // matches none of the registered prefixes.
    std::string graph_to_filename(const char * graph_name) const {
        const std::string gname = graph_name;
        for (const auto & m : mappings) {
            const std::string prefix = m.graph_name_prefix;
            if (gname.compare(0, prefix.size(), prefix) == 0) {
                return "act_blk" + std::to_string(target_layer) + "_" + m.output_suffix + ".f32bin";
            }
        }
        return "";
    }
};
static CaptureState * g_capture_state = nullptr;
static void save_tensor_as_f32bin(const ggml_tensor * t, const std::string & filepath) {
int64_t n_rows = t->ne[1];
int64_t row_len = t->ne[0];
int64_t total = 1;
for (int i = 0; i < GGML_MAX_DIMS; i++) {
total *= t->ne[i];
}
std::vector<float> f32_data(total);
if (t->type == GGML_TYPE_F32) {
const float * src = (const float *) t->data;
if (!src) {
LOG_ERR("Tensor %s has null data pointer\n", t->name);
return;
}
memcpy(f32_data.data(), src, total * sizeof(float));
} else if (t->type == GGML_TYPE_F16) {
const ggml_fp16_t * src = (const ggml_fp16_t *) t->data;
for (int64_t i = 0; i < total; i++) {
f32_data[i] = ggml_fp16_to_fp32(src[i]);
}
} else if (t->type == GGML_TYPE_BF16) {
const ggml_bf16_t * src = (const ggml_bf16_t *) t->data;
for (int64_t i = 0; i < total; i++) {
f32_data[i] = ggml_bf16_to_fp32(src[i]);
}
} else {
LOG_ERR("Unsupported tensor type %s for %s\n", ggml_type_name(t->type), t->name);
return;
}
std::ofstream file(filepath, std::ios::binary);
if (!file) {
LOG_ERR("Failed to open %s for writing\n", filepath.c_str());
return;
}
file.write(reinterpret_cast<const char *>(&n_rows), sizeof(int64_t));
file.write(reinterpret_cast<const char *>(&row_len), sizeof(int64_t));
file.write(reinterpret_cast<const char *>(f32_data.data()), total * sizeof(float));
file.close();
LOG(" Captured: %s -> %s (%lld x %lld, %s)\n", t->name, filepath.c_str(), (long long) n_rows, (long long) row_len,
ggml_type_name(t->type));
}
// ggml eval callback (two-phase protocol):
//  - ask == true : return whether we want to observe this tensor; we match
//    the exact graph name "<prefix>-<layer>" for each registered mapping and
//    remember the match in state->pending_name.
//  - ask == false: the tensor has been computed; save it if it is the one we
//    flagged, then clear the pending name. Always returns true to continue
//    graph evaluation.
//
// Fix: removed a dead ggml_backend_tensor_get copy in the non-host branch —
// the copied bytes were discarded, so it only wasted work. The warning is
// kept.
static bool capture_callback(ggml_tensor * t, bool ask, void * user_data) {
    auto * state = (CaptureState *) user_data;
    if (ask) {
        // Selection phase: does this tensor's exact name match any
        // "<prefix>-<target_layer>" pattern?
        char target[128];
        for (int i = 0; i < N_MAPPINGS; i++) {
            snprintf(target, sizeof(target), "%s-%d", mappings[i].graph_name_prefix, state->target_layer);
            if (strcmp(t->name, target) == 0) {
                state->pending_name = t->name;
                return true;
            }
        }
        return false;
    }
    if (state->pending_name.empty()) {
        return true; // nothing flagged for capture
    }
    if (strcmp(t->name, state->pending_name.c_str()) != 0) {
        return true; // not the tensor we flagged
    }
    if (!ggml_backend_buffer_is_host(t->buffer)) {
        // NOTE(review): the writer must fetch the bytes of non-host tensors
        // through the backend (ggml_backend_tensor_get) rather than read
        // t->data directly — verify save_tensor_as_f32bin does so.
        LOG_WRN("Tensor %s is not host-accessible, data copied via backend\n", t->name);
    }
    std::string filename = state->graph_to_filename(t->name);
    if (!filename.empty()) {
        std::filesystem::create_directories(state->output_dir);
        std::string filepath = (std::filesystem::path(state->output_dir) / filename).string();
        save_tensor_as_f32bin(t, filepath);
        state->captured_count++;
    }
    state->pending_name.clear();
    return true;
}
// Print the command-line help for this tool via the project logger.
static void print_usage(void) {
    LOG("Usage: llama-capture-layer-data -m MODEL_PATH [-l LAYER] [-p PROMPT] [-o OUTPUT_DIR]\n");
    LOG("\n");
    LOG(" -m MODEL Path to GGUF model (BF16/F16 recommended)\n");
    LOG(" -l LAYER Target layer index (default: 0)\n");
    LOG(" -p PROMPT Inference prompt (default: \"The quick brown fox...\")\n");
    LOG(" -o DIR Output directory for .f32bin files (default: data)\n");
}
// Entry point: load a model, hook the eval callback on layer `-l`, run one
// prompt through llama_decode, and write the captured activations as .f32bin
// files into the output directory. Returns non-zero on error or when nothing
// was captured.
int main(int argc, char ** argv) {
    // -m plus its value requires at least 3 argv entries; also honor -h/--help.
    if (argc < 3 || (std::string(argv[1]) == "-h" || std::string(argv[1]) == "--help")) {
        print_usage();
        return 1;
    }
    common_params params;
    int layer = 0;                   // target layer index (-l)
    std::string output_dir = "data"; // where .f32bin files go (-o)
    std::string prompt = "The quick brown fox jumps over the lazy dog.";
    std::string model_path;
    // Minimal hand-rolled flag parsing; unrecognized arguments are ignored.
    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];
        if (arg == "-m" && i + 1 < argc) {
            model_path = argv[++i];
        } else if (arg == "-l" && i + 1 < argc) {
            layer = atoi(argv[++i]);
        } else if (arg == "-p" && i + 1 < argc) {
            prompt = argv[++i];
        } else if (arg == "-o" && i + 1 < argc) {
            output_dir = argv[++i];
        }
    }
    if (model_path.empty()) {
        LOG_ERR("Error: -m MODEL_PATH is required\n\n");
        print_usage();
        return 1;
    }
    params.model.path = model_path;
    params.prompt = prompt;
    params.n_batch = 512;
    params.n_ubatch = 512;
    // n_gpu_layers = 0 keeps all layers on the CPU.
    params.n_gpu_layers = 0;
    params.fit_params = false;
    CaptureState state;
    state.target_layer = layer;
    state.output_dir = output_dir;
    g_capture_state = &state;
    // Register the eval callback so every computed graph tensor is offered
    // to capture_callback (ask/save phases).
    params.cb_eval = capture_callback;
    params.cb_eval_user_data = &state;
    LOG("Loading model: %s\n", model_path.c_str());
    LOG("Target layer: %d\n", layer);
    LOG("Output directory: %s\n", output_dir.c_str());
    // Backend/NUMA initialization must precede model loading.
    common_init();
    ggml_backend_load_all();
    llama_backend_init();
    llama_numa_init(params.numa);
    auto llama_init = common_init_from_params(params);
    if (!llama_init) {
        LOG_ERR("Failed to load model\n");
        return 1;
    }
    auto * model = llama_init->model();
    auto * ctx = llama_init->context();
    if (model == nullptr || ctx == nullptr) {
        LOG_ERR("Failed to initialize context\n");
        return 1;
    }
    LOG("Model loaded successfully\n");
    const llama_vocab * vocab = llama_model_get_vocab(model);
    const bool add_bos = llama_vocab_get_add_bos(vocab);
    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);
    if (tokens.empty()) {
        LOG_ERR("No tokens generated from prompt\n");
        return 1;
    }
    LOG("Tokenizing prompt: %zu tokens\n", tokens.size());
    LOG("Running inference...\n");
    // A single decode of the whole prompt triggers the eval callback for
    // every tensor in the graph; captures happen as a side effect.
    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
        LOG_ERR("llama_decode failed\n");
        return 1;
    }
    LOG("\nDone. Captured %d tensors to %s/\n", state.captured_count, output_dir.c_str());
    llama_backend_free();
    // Capturing nothing is treated as failure so CI scripts notice.
    return state.captured_count == 0 ? 1 : 0;
}

View File

@ -318,7 +318,7 @@ struct lora_merge_ctx {
auto nels = ggml_nelements(inp_base);
const auto * qtype = ggml_get_type_traits(base->type);
std::vector<uint8_t> dequant_buf(nels * sizeof(float));
qtype->to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
qtype->to_float(read_buf.data(), (float *)dequant_buf.data(), nels, nullptr);
ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size());
} else {
ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base));

View File

@ -46,6 +46,13 @@ static const std::vector<quant_option> QUANT_OPTIONS = {
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", },
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", },
{ "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", },
{ "Q3_PT", LLAMA_FTYPE_MOSTLY_Q3_PT, " 3.25 bpw quantization", },
{ "Q3_KPT", LLAMA_FTYPE_MOSTLY_Q3_KPT, " Q3_K with learned per-tensor levels" },
{ "Q4_DPT", LLAMA_FTYPE_MOSTLY_Q4_DPT, " IQ4_NL with learned per-tensor int8 levels" },
{ "Q2_KPT", LLAMA_FTYPE_MOSTLY_Q2_KPT, " Q2_K with learned per-tensor float levels" },
{ "IQ2_TQ", LLAMA_FTYPE_MOSTLY_IQ2_TQ, " 2.0625 bpw, trellis quantized" },
{ "IQ3_TQ", LLAMA_FTYPE_MOSTLY_IQ3_TQ, " 3.5625 bpw, per-tensor trained grid" },
{ "IQ1_BN", LLAMA_FTYPE_MOSTLY_IQ1_BN, " 1.5625 bpw, 8D vector quantized" },
{ "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", },
{ "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", },
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
@ -162,6 +169,9 @@ static void usage(const char * executable) {
printf(" WARNING: this is an advanced option, use with care.\n");
printf(" --dry-run\n");
printf(" calculate and show the final quantization size without performing quantization\n");
printf(" --threads n\n");
printf(" number of threads to use for cross-tensor parallelization (default: 0, use same as within-tensor)\n");
printf(" when n > 0, enables parallel quantization of multiple tensors simultaneously\n");
printf(" example: llama-quantize --dry-run model-f32.gguf Q4_K\n\n");
printf("note: --include-weights and --exclude-weights cannot be used together\n\n");
printf("-----------------------------------------------------------------------------\n");
@ -565,6 +575,8 @@ int main(int argc, char ** argv) {
}
} else if (strcmp(argv[arg_idx], "--keep-split") == 0) {
params.keep_split = true;
} else if (strcmp(argv[arg_idx], "--keep-split") == 0) {
params.keep_split = true;
} else {
usage(argv[0]);
}