Merge 6c0bec52f4 into 75f3bc94e6
This commit is contained in:
commit
e840352f97
|
|
@ -111,13 +111,14 @@ extern "C" {
|
|||
// Internal types and functions exposed for tests and benchmarks
|
||||
|
||||
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
|
||||
const void * GGML_RESTRICT y, size_t by, int nrc);
|
||||
const void * GGML_RESTRICT y, size_t by, int nrc, const void * levels);
|
||||
|
||||
struct ggml_type_traits_cpu {
|
||||
ggml_from_float_t from_float;
|
||||
ggml_vec_dot_t vec_dot;
|
||||
enum ggml_type vec_dot_type;
|
||||
int64_t nrows; // number of rows to process simultaneously
|
||||
int64_t nrows; // number of rows to process simultaneously
|
||||
size_t levels_row_stride; // bytes to add per row to get next row's quant_levels (0 = per-tensor)
|
||||
};
|
||||
|
||||
GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
|
||||
|
|
|
|||
|
|
@ -429,7 +429,15 @@ extern "C" {
|
|||
GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block)
|
||||
GGML_TYPE_NVFP4 = 40, // NVFP4 (4 blocks, E4M3 scale)
|
||||
GGML_TYPE_Q1_0 = 41,
|
||||
GGML_TYPE_COUNT = 42,
|
||||
GGML_TYPE_Q3_PT = 42, // 3.875 bpw per-tensor Lloyd-Max, 16-elem affine sub-blocks
|
||||
GGML_TYPE_Q3_KPT = 43, // Q3_K with learned per-tensor levels (3.4375 bpw)
|
||||
GGML_TYPE_Q4_DPT = 44, // IQ4_NL with learned per-tensor int8 levels (4.125 bpw)
|
||||
GGML_TYPE_Q2_DPT = 45, // 2-bit with learned per-tensor int8 levels (2.5 bpw)
|
||||
GGML_TYPE_Q2_KPT = 46, // Q2_K with learned per-tensor float levels (2.625 bpw)
|
||||
GGML_TYPE_IQ2_TQ = 47, // Trellis quantized with RNG codebook (2.0625 bpw)
|
||||
GGML_TYPE_IQ3_TQ = 48, // 3-bit with per-tensor trained grid table (3.5625 bpw)
|
||||
GGML_TYPE_IQ1_BN = 49, // 8D vector quantized with per-tensor trained codebook (1.5625 bpw)
|
||||
GGML_TYPE_COUNT = 50,
|
||||
};
|
||||
|
||||
// precision
|
||||
|
|
@ -457,6 +465,7 @@ extern "C" {
|
|||
GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_Q3_PT = 26, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_IQ1_S = 18, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_IQ4_NL = 19, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_IQ3_S = 20, // except 1d tensors
|
||||
|
|
@ -465,8 +474,11 @@ extern "C" {
|
|||
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_MXFP4 = 25, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_NVFP4 = 26, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_Q1_0 = 27, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_Q3_KPT = 27, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_Q4_DPT = 28, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_Q2_KPT = 29, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_NVFP4 = 30, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_Q1_0 = 31, // except 1d tensors
|
||||
};
|
||||
|
||||
// available tensor operations:
|
||||
|
|
@ -686,9 +698,8 @@ extern "C" {
|
|||
|
||||
char name[GGML_MAX_NAME];
|
||||
|
||||
void * extra; // extra things e.g. for ggml-cuda.cu
|
||||
|
||||
char padding[8];
|
||||
void * extra; // extra things e.g. for ggml-cuda.cu
|
||||
void * quant_levels; // per-tensor quantization levels (replaces char padding[8]; same size on 64-bit)
|
||||
};
|
||||
|
||||
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
|
||||
|
|
@ -2723,7 +2734,7 @@ extern "C" {
|
|||
# define GGML_RESTRICT restrict
|
||||
# endif
|
||||
#endif
|
||||
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
struct ggml_type_traits {
|
||||
|
|
@ -2734,6 +2745,7 @@ extern "C" {
|
|||
bool is_quantized;
|
||||
ggml_to_float_t to_float;
|
||||
ggml_from_float_t from_float_ref;
|
||||
size_t levels_row_stride; // bytes to advance quant_levels per row (0 = per-tensor)
|
||||
};
|
||||
|
||||
GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);
|
||||
|
|
|
|||
|
|
@ -208,6 +208,13 @@ add_library(ggml-base
|
|||
ggml-quants.h
|
||||
gguf.cpp)
|
||||
|
||||
# Enable native SIMD for ggml-quants.c (needed for K-means training in quantization)
|
||||
include(CheckCCompilerFlag)
|
||||
check_c_compiler_flag("-march=native" GGML_COMPILER_SUPPORTS_MARCH_NATIVE)
|
||||
if (GGML_COMPILER_SUPPORTS_MARCH_NATIVE)
|
||||
set_source_files_properties(ggml-quants.c PROPERTIES COMPILE_FLAGS "-march=native")
|
||||
endif()
|
||||
|
||||
set_target_properties(ggml-base PROPERTIES
|
||||
VERSION ${GGML_VERSION}
|
||||
SOVERSION ${GGML_VERSION_MAJOR}
|
||||
|
|
|
|||
|
|
@ -396,7 +396,7 @@ static ggml_backend_buffer_type_t ggml_backend_meta_device_get_host_buffer_type(
|
|||
//
|
||||
|
||||
struct ggml_backend_meta_buffer_context {
|
||||
static constexpr size_t nbtc = GGML_TENSOR_SIZE - sizeof(ggml_tensor::padding);
|
||||
static constexpr size_t nbtc = GGML_TENSOR_SIZE - sizeof(ggml_tensor::quant_levels);
|
||||
|
||||
std::map<std::pair<const ggml_tensor *, bool>, std::pair<ggml_backend_meta_split_state, char[nbtc]>> split_state_cache;
|
||||
std::map< const ggml_tensor *, std::vector<ggml_tensor *>> simple_tensors;
|
||||
|
|
|
|||
|
|
@ -1,5 +1,15 @@
|
|||
#include "ggml-impl.h"
|
||||
#include "ggml-blas.h"
|
||||
|
||||
// Helper: compute quant_levels stride for a given row.
|
||||
// For Q2_KPT (per-block levels), stride depends on tensor width.
|
||||
static inline size_t ggml_quant_levels_stride(ggml_type type, size_t constant_stride, int64_t ne0) {
|
||||
if (type == GGML_TYPE_Q2_KPT) {
|
||||
return (size_t)(ne0 / 256) * 4 * sizeof(float);
|
||||
}
|
||||
return constant_stride;
|
||||
}
|
||||
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
#include <future>
|
||||
|
|
@ -77,10 +87,11 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
|
|||
const int min_rows_per_thread = std::max((int)(min_cols_per_thread/ne00), 1);
|
||||
const int n_threads = std::max(std::min(ctx->n_threads, (int)(ne01/min_rows_per_thread)), 1);
|
||||
|
||||
const size_t lrs = ggml_quant_levels_stride(src0->type, ggml_get_type_traits(src0->type)->levels_row_stride, src0->ne[0]);
|
||||
#ifdef GGML_USE_OPENMP
|
||||
#pragma omp parallel for num_threads(n_threads)
|
||||
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
||||
to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
|
||||
to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00, (const char*)src0->quant_levels + i01*lrs);
|
||||
}
|
||||
#else
|
||||
for (int i = 1; i < n_threads; i++) {
|
||||
|
|
@ -89,7 +100,7 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
|
|||
if (start < end) {
|
||||
ctx->tasks.push_back(std::async(std::launch::async, [=]() {
|
||||
for (int64_t i01 = start; i01 < end; i01++) {
|
||||
to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
|
||||
to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00, (const char*)src0->quant_levels + i01*lrs);
|
||||
}
|
||||
}));
|
||||
}
|
||||
|
|
@ -99,7 +110,7 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
|
|||
const int64_t start = 0;
|
||||
const int64_t end = ne01/n_threads;
|
||||
for (int64_t i01 = start; i01 < end; i01++) {
|
||||
to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
|
||||
to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00, (const char*)src0->quant_levels + i01*lrs);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -298,6 +298,7 @@ typedef struct {
|
|||
} block_q2_K;
|
||||
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
|
||||
|
||||
|
||||
// 3-bit quantization
|
||||
// weight is represented as x = a * q
|
||||
// 16 blocks of 16 elements each
|
||||
|
|
@ -327,6 +328,12 @@ typedef struct {
|
|||
} block_q4_K;
|
||||
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
|
||||
|
||||
// Q3_KPT: Q3_K with learned per-tensor levels
|
||||
// Reuses block_q3_K structure but maps 3-bit indices through learned level table
|
||||
typedef block_q3_K block_q3_kpt;
|
||||
#define Q3KPT_N_LEVELS 8
|
||||
|
||||
|
||||
// 5-bit quantization
|
||||
// 8 blocks of 32 elements each
|
||||
// weight is represented as x = a * q + b
|
||||
|
|
@ -449,6 +456,115 @@ typedef struct {
|
|||
} block_iq4_xs;
|
||||
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
|
||||
|
||||
// 3.875 bpw - per-tensor Lloyd-Max scalar quantization
|
||||
// 256 elements = 16 sub-blocks of 16, 8-entry level table trained per tensor
|
||||
// Layout: 2 (d) + 2 (dmin) + 24 (scales: 32x6-bit) + 96 (qs: 256x3-bit) = 124 bytes
|
||||
typedef struct {
|
||||
ggml_half d; // 2 bytes: global scale for 16-elem sub-block ranges
|
||||
ggml_half dmin; // 2 bytes: global scale for sub-block neg_mins
|
||||
uint8_t scales[3*QK_K/32]; // 24 bytes: 32 x 6-bit (indices 0..15 = ranges, 16..31 = neg_mins)
|
||||
uint8_t qs[3*QK_K/8]; // 96 bytes: 256 x 3-bit Lloyd-Max level index, sequential
|
||||
} block_q3_pt;
|
||||
static_assert(sizeof(block_q3_pt) == 124, "wrong q3_pt block size");
|
||||
|
||||
#define Q3PT_N_LEVELS 8
|
||||
|
||||
// Q4_DPT: IQ4_NL with learned per-tensor int8 levels (4.125 bpw)
|
||||
// Block format: identical to block_iq4_nl (2 + 16 = 18 bytes per 32 elements)
|
||||
typedef block_iq4_nl block_q4_dpt;
|
||||
#define Q4DPT_N_LEVELS 16
|
||||
|
||||
// Q2_DPT: 2-bit per-tensor Lloyd-Max scalar quantization (2.5 bpw)
|
||||
// Block format: 2 bytes (FP16 scale) + 8 bytes (2-bit indices for 32 elements) = 10 bytes per block
|
||||
// 4 learned int8 levels per tensor, optimized via Lloyd-Max k-means
|
||||
typedef struct {
|
||||
ggml_half d; // 2 bytes: FP16 scale (delta)
|
||||
uint8_t qs[8]; // 8 bytes: 2-bit indices (4 values per byte, 32 elements total)
|
||||
} block_q2_dpt;
|
||||
static_assert(sizeof(block_q2_dpt) == sizeof(ggml_half) + 8, "wrong q2_dpt block size/padding");
|
||||
|
||||
#define QK2_DPT 32
|
||||
#define Q2DPT_N_LEVELS 4
|
||||
|
||||
// Q2_KPT: Q2_K with learned per-tensor float levels (2.625 bpw)
|
||||
// Reuses block_q2_K structure but maps 2-bit indices through learned level table
|
||||
typedef block_q2_K block_q2_kpt;
|
||||
#define Q2KPT_N_LEVELS 4
|
||||
|
||||
// IQ2_TQ: Trellis Quantized with RNG codebook (2.0625 bpw)
|
||||
//
|
||||
// Reconstruction: y[i] = d * hash(seed, block_idx, position, trellis_state, qs_idx)
|
||||
// where hash is a deterministic function mapping to [-1, 1]
|
||||
// and trellis_state evolves as: next = (state + idx + 1) & 7
|
||||
//
|
||||
// Block layout (66 bytes per 256 elements):
|
||||
// IQ2_TQ: 2-bit scalar quantization with per-tensor trained asymmetric grid table
|
||||
// 32 groups of 8 elements per 256-element super-block
|
||||
// - ggml_half d (2 bytes): super-block scale
|
||||
// - uint8_t scales[16] (16 bytes): 32 × 4-bit grid entry index per group
|
||||
// - uint8_t qs[64] (64 bytes): 256 × 2-bit element index within grid entry
|
||||
// recon[j] = d * IQ2TQ_GRID_SCALE * grid[group_idx][elem_idx]
|
||||
typedef struct {
|
||||
ggml_half d; // Super-block scale (2 bytes)
|
||||
uint8_t scales[QK_K/16]; // 32 × 4-bit grid entry index per group (16 bytes)
|
||||
uint8_t qs[QK_K/4]; // 256 × 2-bit element index (64 bytes)
|
||||
} block_iq2_tq;
|
||||
static_assert(sizeof(block_iq2_tq) == 82, "wrong iq2_tq block size");
|
||||
// 2 + 16 + 64 = 82 bytes per 256 weights = 2.5625 bpw
|
||||
|
||||
#define IQ2TQ_GROUP_SIZE 8 // Elements per group
|
||||
#define IQ2TQ_N_GROUPS (QK_K / IQ2TQ_GROUP_SIZE) // 32 groups per super-block
|
||||
#define IQ2TQ_GRID_SCALE 0.125f // Grid value multiplier: recon = d * GRID_SCALE * grid_int8
|
||||
|
||||
// IQ3_TQ: 3-bit scalar quantization with per-tensor trained asymmetric grid table (3.5625 bpw)
|
||||
// 32 groups of 8 elements per 256-element super-block
|
||||
// Each grid entry has 8 int8 levels (3 bits → 8 values per element)
|
||||
// Grid table: 16 entries × 8 int8 = 128 bytes per tensor
|
||||
// Block layout:
|
||||
// - ggml_half d (2 bytes): super-block scale
|
||||
// - uint8_t scales[16] (16 bytes): 32 × 4-bit grid entry index per group
|
||||
// - uint8_t qs[96] (96 bytes): 256 × 3-bit element index within grid entry
|
||||
// recon[j] = d * IQ3TQ_GRID_SCALE * grid[group_idx][elem_idx]
|
||||
typedef struct {
|
||||
ggml_half d; // Super-block scale (2 bytes)
|
||||
uint8_t scales[QK_K/16]; // 32 × 4-bit grid entry index per group (16 bytes)
|
||||
uint8_t qs[3*QK_K/8]; // 256 × 3-bit element index (96 bytes)
|
||||
} block_iq3_tq;
|
||||
static_assert(sizeof(block_iq3_tq) == 114, "wrong iq3_tq block size");
|
||||
// 2 + 16 + 96 = 114 bytes per 256 weights = 3.5625 bpw
|
||||
|
||||
#define IQ3TQ_GROUP_SIZE 8 // Elements per group
|
||||
#define IQ3TQ_N_GROUPS (QK_K / IQ3TQ_GROUP_SIZE) // 32 groups per super-block
|
||||
#define IQ3TQ_N_LEVELS 8 // 3-bit → 8 levels per grid entry
|
||||
#define IQ3TQ_GRID_SCALE 0.125f // Grid value multiplier
|
||||
#define IQ3TQ_GRID_SIZE 128 // 16 entries × 8 int8 = 128 bytes per tensor
|
||||
|
||||
// IQ1_BN: 8D vector quantized with per-tensor trained 4096-entry codebook (1.5625 bpw)
|
||||
// 32 groups of 8 elements per 256-element super-block
|
||||
// Each group selects one of 4096 trained 8D vectors via 12-bit codebook index
|
||||
// Codebook: 4096 entries × 8 int8 = 32768 bytes per tensor
|
||||
// Block layout:
|
||||
// - ggml_half d (2 bytes): super-block scale
|
||||
// - uint8_t qs[48] (48 bytes): 32 × 12-bit codebook indices packed in pairs
|
||||
// 12-bit pair packing (groups 2k, 2k+1 → 3 bytes at qs[3k]):
|
||||
// idx_even = qs[3k] | ((qs[3k+1] & 0x0F) << 8)
|
||||
// idx_odd = (qs[3k+1] >> 4) | (qs[3k+2] << 4)
|
||||
// recon[g*8+k] = d * IQ1BN_GRID_SCALE * codebook[ci][k]
|
||||
typedef struct {
|
||||
ggml_half d; // Super-block scale (2 bytes)
|
||||
uint8_t qs[3*QK_K/16]; // 32 × 12-bit codebook indices packed in pairs (48 bytes)
|
||||
} block_iq1_bn;
|
||||
static_assert(sizeof(block_iq1_bn) == 50, "wrong iq1_bn block size");
|
||||
// 2 + 48 = 50 bytes per 256 weights = 1.5625 bpw
|
||||
|
||||
#define IQ1BN_GROUP_SIZE 8
|
||||
#define IQ1BN_N_GROUPS (QK_K / IQ1BN_GROUP_SIZE) // 32
|
||||
#define IQ1BN_CODEBOOK_K 4096 // number of codebook entries
|
||||
#define IQ1BN_CODEBOOK_DIM 8 // vector dimension (= group size)
|
||||
#define IQ1BN_GRID_SCALE 0.125f // Grid value multiplier
|
||||
#define IQ1BN_CODEBOOK_SIZE (IQ1BN_CODEBOOK_K * IQ1BN_CODEBOOK_DIM) // 32768 bytes
|
||||
#define IQ1BN_AUX_SIZE IQ1BN_CODEBOOK_SIZE // 32768 bytes
|
||||
|
||||
#endif // GGML_COMMON_DECL
|
||||
#endif // GGML_COMMON_DECL
|
||||
|
||||
|
|
|
|||
|
|
@ -33,6 +33,8 @@
|
|||
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
|
||||
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
|
||||
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
|
||||
#define ggml_vec_dot_q3_pt_q8_K_generic ggml_vec_dot_q3_pt_q8_K
|
||||
#define ggml_vec_dot_q4_dpt_q8_0_generic ggml_vec_dot_q4_dpt_q8_0
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
||||
|
|
@ -203,6 +205,15 @@
|
|||
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
|
||||
#elif defined(__riscv)
|
||||
// quants.c
|
||||
#define quantize_row_q8_K_generic quantize_row_q8_K
|
||||
#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
|
||||
#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
|
||||
#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
|
||||
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
|
||||
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
|
||||
#define ggml_vec_dot_q3_pt_q8_K_generic ggml_vec_dot_q3_pt_q8_K
|
||||
#define ggml_vec_dot_q4_dpt_q8_0_generic ggml_vec_dot_q4_dpt_q8_0
|
||||
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
|
||||
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
|
||||
#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0
|
||||
// repack.cpp
|
||||
|
|
@ -307,6 +318,8 @@
|
|||
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
|
||||
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
|
||||
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
|
||||
#define ggml_vec_dot_q3_pt_q8_K_generic ggml_vec_dot_q3_pt_q8_K
|
||||
#define ggml_vec_dot_q4_dpt_q8_0_generic ggml_vec_dot_q4_dpt_q8_0
|
||||
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
|
||||
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
|
||||
#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0
|
||||
|
|
|
|||
|
|
@ -137,7 +137,111 @@ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
|
|||
|
||||
//===================================== Dot products =================================
|
||||
|
||||
void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
const int qk = QK1_0; // 128
|
||||
const int nb = n / qk;
|
||||
|
||||
assert(n % qk == 0);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
UNUSED(bx);
|
||||
UNUSED(by);
|
||||
UNUSED(bs);
|
||||
GGML_UNUSED(levels);
|
||||
|
||||
const block_q1_0 * GGML_RESTRICT x = vx;
|
||||
const block_q8_0 * GGML_RESTRICT y = vy;
|
||||
|
||||
float sumf = 0.0f;
|
||||
|
||||
#if defined(__ARM_NEON)
|
||||
float32x4_t sumv = vdupq_n_f32(0.0f);
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
const float d0 = GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
|
||||
// Process 4 Q8_0 blocks (each has 32 elements)
|
||||
for (int k = 0; k < 4; k++) {
|
||||
const block_q8_0 * GGML_RESTRICT yb = &y[i * 4 + k];
|
||||
const float d1 = GGML_CPU_FP16_TO_FP32(yb->d);
|
||||
|
||||
// Get the 4 bytes of bits for this Q8_0 block (32 bits = 4 bytes)
|
||||
// Bits are at offset k*4 bytes in x[i].qs
|
||||
const uint8_t * bits = &x[i].qs[k * 4];
|
||||
|
||||
// Load 32 int8 values from y
|
||||
const int8x16_t y0 = vld1q_s8(yb->qs);
|
||||
const int8x16_t y1 = vld1q_s8(yb->qs + 16);
|
||||
|
||||
// Byte 0-1: bits for y0[0..15]
|
||||
const uint64_t expand0 = table_b2b_0[bits[0]];
|
||||
const uint64_t expand1 = table_b2b_0[bits[1]];
|
||||
// Byte 2-3: bits for y1[0..15]
|
||||
const uint64_t expand2 = table_b2b_0[bits[2]];
|
||||
const uint64_t expand3 = table_b2b_0[bits[3]];
|
||||
|
||||
// Build the sign vectors by reinterpreting the table values
|
||||
uint8x8_t e0 = vcreate_u8(expand0);
|
||||
uint8x8_t e1 = vcreate_u8(expand1);
|
||||
uint8x8_t e2 = vcreate_u8(expand2);
|
||||
uint8x8_t e3 = vcreate_u8(expand3);
|
||||
|
||||
// Shift right by 4 to get 0 or 1
|
||||
int8x8_t s0 = vreinterpret_s8_u8(vshr_n_u8(e0, 4));
|
||||
int8x8_t s1 = vreinterpret_s8_u8(vshr_n_u8(e1, 4));
|
||||
int8x8_t s2 = vreinterpret_s8_u8(vshr_n_u8(e2, 4));
|
||||
int8x8_t s3 = vreinterpret_s8_u8(vshr_n_u8(e3, 4));
|
||||
|
||||
// Convert 0/1 to -1/+1: sign = 2*val - 1
|
||||
int8x8_t one = vdup_n_s8(1);
|
||||
s0 = vsub_s8(vadd_s8(s0, s0), one); // 2*s0 - 1
|
||||
s1 = vsub_s8(vadd_s8(s1, s1), one);
|
||||
s2 = vsub_s8(vadd_s8(s2, s2), one);
|
||||
s3 = vsub_s8(vadd_s8(s3, s3), one);
|
||||
|
||||
// Combine into 16-element vectors
|
||||
int8x16_t signs0 = vcombine_s8(s0, s1);
|
||||
int8x16_t signs1 = vcombine_s8(s2, s3);
|
||||
|
||||
// Multiply signs with y values and accumulate
|
||||
// dot(signs, y) where signs are +1/-1
|
||||
int32x4_t p0 = ggml_vdotq_s32(vdupq_n_s32(0), signs0, y0);
|
||||
int32x4_t p1 = ggml_vdotq_s32(p0, signs1, y1);
|
||||
|
||||
// Scale by d1 and accumulate
|
||||
sumv = vmlaq_n_f32(sumv, vcvtq_f32_s32(p1), d0 * d1);
|
||||
}
|
||||
}
|
||||
|
||||
sumf = vaddvq_f32(sumv);
|
||||
#else
|
||||
// Scalar fallback
|
||||
for (int i = 0; i < nb; i++) {
|
||||
const float d0 = GGML_FP16_TO_FP32(x[i].d);
|
||||
|
||||
// Process 4 Q8_0 blocks
|
||||
for (int k = 0; k < 4; k++) {
|
||||
const float d1 = GGML_FP16_TO_FP32(y[i*4 + k].d);
|
||||
|
||||
int sumi = 0;
|
||||
for (int j = 0; j < QK8_0; j++) {
|
||||
const int bit_index = k * QK8_0 + j;
|
||||
const int byte_index = bit_index / 8;
|
||||
const int bit_offset = bit_index % 8;
|
||||
|
||||
const int xi = ((x[i].qs[byte_index] >> bit_offset) & 1) ? 1 : -1;
|
||||
sumi += xi * y[i*4 + k].qs[j];
|
||||
}
|
||||
sumf += d0 * d1 * sumi;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
*s = sumf;
|
||||
}
|
||||
|
||||
|
||||
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
const int qk = QK1_0; // 128
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -240,7 +344,7 @@ void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
}
|
||||
|
||||
|
||||
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -533,7 +637,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
const int qk = QK8_1;
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -753,12 +857,13 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
UNUSED(bx);
|
||||
UNUSED(by);
|
||||
UNUSED(bs);
|
||||
GGML_UNUSED(levels);
|
||||
assert(n % QK_NVFP4 == 0);
|
||||
|
||||
const block_nvfp4 * GGML_RESTRICT x = vx;
|
||||
|
|
@ -837,7 +942,92 @@ void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
UNUSED(bx);
|
||||
UNUSED(by);
|
||||
UNUSED(bs);
|
||||
GGML_UNUSED(levels);
|
||||
assert(n % QK_NVFP4 == 0);
|
||||
|
||||
const block_nvfp4 * GGML_RESTRICT x = vx;
|
||||
const block_q8_0 * GGML_RESTRICT y = vy;
|
||||
|
||||
// Each NVFP4 super-block (64 elements) spans 2 q8_0 blocks
|
||||
const int nb = n / QK_NVFP4;
|
||||
|
||||
float sumf = 0;
|
||||
|
||||
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
|
||||
const int8x16_t values = vld1q_s8(kvalues_mxfp4);
|
||||
const uint8x16_t m4b = vdupq_n_u8(0x0f);
|
||||
float32x4_t acc = vdupq_n_f32(0.0f);
|
||||
|
||||
for (int ib = 0; ib < nb; ++ib) {
|
||||
const uint8x16_t q4bits_0 = vld1q_u8(x[ib].qs);
|
||||
const uint8x16_t q4bits_1 = vld1q_u8(x[ib].qs + 16);
|
||||
|
||||
const int8x16_t q4_lo_0 = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits_0, m4b));
|
||||
const int8x16_t q4_hi_0 = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits_0, 4));
|
||||
const int8x16_t q4_lo_1 = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits_1, m4b));
|
||||
const int8x16_t q4_hi_1 = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits_1, 4));
|
||||
|
||||
const int8x16_t q8_0a = vld1q_s8(y[2*ib].qs);
|
||||
const int8x16_t q8_0b = vld1q_s8(y[2*ib].qs + 16);
|
||||
const int8x16_t q8_lo_0 = vcombine_s8(vget_low_s8(q8_0a), vget_low_s8(q8_0b));
|
||||
const int8x16_t q8_hi_0 = vcombine_s8(vget_high_s8(q8_0a), vget_high_s8(q8_0b));
|
||||
|
||||
const int8x16_t q8_1a = vld1q_s8(y[2*ib+1].qs);
|
||||
const int8x16_t q8_1b = vld1q_s8(y[2*ib+1].qs + 16);
|
||||
const int8x16_t q8_lo_1 = vcombine_s8(vget_low_s8(q8_1a), vget_low_s8(q8_1b));
|
||||
const int8x16_t q8_hi_1 = vcombine_s8(vget_high_s8(q8_1a), vget_high_s8(q8_1b));
|
||||
|
||||
const int32x4_t p0 = vaddq_s32(
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), q4_lo_0, q8_lo_0),
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), q4_hi_0, q8_hi_0));
|
||||
const int32x4_t p1 = vaddq_s32(
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), q4_lo_1, q8_lo_1),
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), q4_hi_1, q8_hi_1));
|
||||
|
||||
const int32x4_t sums = vpaddq_s32(p0, p1);
|
||||
|
||||
// Decode 4 UE4M3 scales to f32 and multiply with q8 scales
|
||||
const float dy0 = GGML_CPU_FP16_TO_FP32(y[2*ib].d);
|
||||
const float dy1 = GGML_CPU_FP16_TO_FP32(y[2*ib+1].d);
|
||||
const float32x4_t nvsc = {
|
||||
ggml_ue4m3_to_fp32(x[ib].d[0]),
|
||||
ggml_ue4m3_to_fp32(x[ib].d[1]),
|
||||
ggml_ue4m3_to_fp32(x[ib].d[2]),
|
||||
ggml_ue4m3_to_fp32(x[ib].d[3])
|
||||
};
|
||||
const float32x4_t scales = vmulq_f32(nvsc, (float32x4_t){dy0, dy0, dy1, dy1});
|
||||
|
||||
acc = vfmaq_f32(acc, vcvtq_f32_s32(sums), scales);
|
||||
}
|
||||
sumf = vaddvq_f32(acc);
|
||||
#else
|
||||
for (int ib = 0; ib < nb; ++ib) {
|
||||
for (int si = 0; si < 4; ++si) {
|
||||
const float d = ggml_ue4m3_to_fp32(x[ib].d[si]);
|
||||
const int q8b = si / 2;
|
||||
const int q8o = (si % 2) * QK_NVFP4_SUB;
|
||||
const float dy = GGML_CPU_FP16_TO_FP32(y[2*ib + q8b].d);
|
||||
|
||||
int sumi_lo = 0, sumi_hi = 0;
|
||||
for (int j = 0; j < QK_NVFP4_SUB/2; ++j) {
|
||||
const uint8_t qv = x[ib].qs[si*(QK_NVFP4_SUB/2) + j];
|
||||
sumi_lo += y[2*ib + q8b].qs[q8o + j + 0] * kvalues_mxfp4[qv & 0xf];
|
||||
sumi_hi += y[2*ib + q8b].qs[q8o + j + QK_NVFP4_SUB/2] * kvalues_mxfp4[qv >> 4];
|
||||
}
|
||||
sumf += dy * d * (sumi_lo + sumi_hi);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -949,7 +1139,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
const int qk = QK8_1;
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -1067,7 +1257,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -3953,6 +4143,10 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
}
|
||||
|
||||
void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
assert(n % QK_K == 0);
|
||||
assert(nrc == 1);
|
||||
|
|
|
|||
|
|
@ -644,7 +644,7 @@ static inline __m128i get_scale_shuffle(int i) {
|
|||
}
|
||||
#endif
|
||||
|
||||
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -772,7 +772,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
const int qk = QK8_1;
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -827,11 +827,11 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
UNUSED(y);
|
||||
UNUSED(ib);
|
||||
UNUSED(sumf);
|
||||
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -880,11 +880,11 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
UNUSED(sumf);
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
const int qk = QK8_1;
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -936,11 +936,11 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
UNUSED(sumf);
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -983,7 +983,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
UNUSED(sumf);
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -1956,6 +1956,10 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
#endif
|
||||
}
|
||||
|
||||
// Q3_PT (per-tensor Lloyd-Max) dot product with Q8_K activations.
// No arch-specific implementation exists yet, so always dispatch to the
// generic scalar path. Signature must match the ggml_vec_dot_t typedef,
// which now carries a trailing `levels` pointer (per-tensor quant levels
// table; forwarded untouched to the generic kernel).
void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
}
|
||||
|
||||
#if defined(__loongarch_asx)
|
||||
static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
|
||||
const __m256i a = __lasx_xvmulwev_h_b(x, y);
|
||||
|
|
|
|||
|
|
@ -141,7 +141,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
|||
|
||||
//===================================== Dot products =================================
|
||||
|
||||
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -207,11 +207,11 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
UNUSED(y);
|
||||
UNUSED(ib);
|
||||
UNUSED(sumf);
|
||||
ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
const int qk = QK8_1;
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -274,7 +274,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
UNUSED(y);
|
||||
UNUSED(ib);
|
||||
UNUSED(sumf);
|
||||
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -340,11 +340,11 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
UNUSED(y);
|
||||
UNUSED(ib);
|
||||
UNUSED(sumf);
|
||||
ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -412,11 +412,11 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
UNUSED(sumf);
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
const int qk = QK8_1;
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -488,11 +488,11 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
UNUSED(sumf);
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -557,7 +557,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
UNUSED(y);
|
||||
UNUSED(ib);
|
||||
UNUSED(sumf);
|
||||
ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -2000,6 +2000,10 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
#endif
|
||||
}
|
||||
|
||||
// Q3_PT x Q8_K dot product: no SIMD path for this arch yet, fall back to
// the generic scalar kernel. The trailing `levels` argument (per-tensor
// quantization levels table, required by the updated ggml_vec_dot_t
// typedef) is passed through unchanged.
void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
}
|
||||
|
||||
void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
assert(n % QK_K == 0);
|
||||
assert(nrc == 1);
|
||||
|
|
@ -2190,7 +2194,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|||
UNUSED(nb);
|
||||
UNUSED(ib);
|
||||
UNUSED(sumf);
|
||||
ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -213,7 +213,7 @@ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
|
|||
|
||||
//===================================== Dot products =================================
|
||||
|
||||
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
#if defined(__riscv_v)
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
|
@ -264,11 +264,11 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
|
||||
*s = sumf;
|
||||
#else
|
||||
ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
#if defined(__riscv_v)
|
||||
const int qk = QK8_1;
|
||||
const int nb = n / qk;
|
||||
|
|
@ -315,11 +315,11 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
|
||||
*s = sumf;
|
||||
#else
|
||||
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
#if defined(__riscv_v)
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
|
@ -369,11 +369,11 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
|
||||
*s = sumf;
|
||||
#else
|
||||
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
#if defined(__riscv_v)
|
||||
const int qk = QK8_1;
|
||||
const int nb = n / qk;
|
||||
|
|
@ -422,11 +422,11 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
|
||||
*s = sumf;
|
||||
#else
|
||||
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -470,7 +470,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
UNUSED(ib);
|
||||
UNUSED(sumf);
|
||||
|
||||
ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -2954,6 +2954,14 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
#endif
|
||||
}
|
||||
|
||||
// Q3_PT x Q8_K dot product wrapper.
// NOTE(review): the merge left TWO identical definitions of this function
// back-to-back — a C redefinition error. Collapsed to a single definition.
// Also brought the signature in line with the updated ggml_vec_dot_t
// typedef by adding the trailing `levels` pointer (per-tensor quant levels
// table), which is forwarded to the generic scalar kernel — the only
// implementation available for this arch.
void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
}
|
||||
|
||||
#if defined __riscv_v_intrinsic
|
||||
static void ggml_vec_dot_iq3_xxs_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
assert(n % QK_K == 0);
|
||||
|
|
|
|||
|
|
@ -146,7 +146,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
|||
|
||||
//===================================== Dot products =================================
|
||||
|
||||
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -201,11 +201,11 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
UNUSED(y);
|
||||
UNUSED(ib);
|
||||
UNUSED(sumf);
|
||||
ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
const int qk = QK8_1;
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -258,7 +258,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
UNUSED(y);
|
||||
UNUSED(ib);
|
||||
UNUSED(sumf);
|
||||
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -353,11 +353,11 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
UNUSED(y);
|
||||
UNUSED(ib);
|
||||
UNUSED(sumf);
|
||||
ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -495,11 +495,11 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
UNUSED(y);
|
||||
UNUSED(ib);
|
||||
UNUSED(sumf);
|
||||
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
const int qk = QK8_1;
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -648,11 +648,11 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
UNUSED(y);
|
||||
UNUSED(ib);
|
||||
UNUSED(sumf);
|
||||
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -698,7 +698,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
UNUSED(y);
|
||||
UNUSED(ib);
|
||||
UNUSED(sumf);
|
||||
ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -1388,7 +1388,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|||
UNUSED(nb);
|
||||
UNUSED(ib);
|
||||
UNUSED(sumf);
|
||||
ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -1463,3 +1463,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|||
ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Q3_PT x Q8_K dot product: this arch has no vectorized implementation,
// so delegate to the generic scalar kernel. Takes the trailing `levels`
// pointer required by the updated ggml_vec_dot_t typedef and forwards it
// unchanged.
void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
}
|
||||
|
|
|
|||
|
|
@ -229,7 +229,7 @@ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
|
|||
|
||||
//===================================== Dot products =================================
|
||||
|
||||
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -355,7 +355,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -442,11 +442,11 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
UNUSED(sumf);
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
const int qk = QK8_1;
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -537,11 +537,11 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
UNUSED(sumf);
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -605,7 +605,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
UNUSED(y);
|
||||
UNUSED(ib);
|
||||
UNUSED(sumf);
|
||||
ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -1218,3 +1218,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Q3_PT x Q8_K dot product: generic-only dispatch for this arch.
// Signature updated to match the ggml_vec_dot_t typedef, which now ends
// with a `levels` pointer (per-tensor quantization levels table); it is
// forwarded verbatim to the generic kernel.
void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
}
|
||||
|
|
|
|||
|
|
@ -540,7 +540,8 @@ static inline __m128i get_scale_shuffle(int i) {
|
|||
}
|
||||
#endif
|
||||
|
||||
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -698,7 +699,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
const int qk = QK8_1;
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -753,11 +755,12 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(ib);
|
||||
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
UNUSED(bx);
|
||||
|
|
@ -843,7 +846,8 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -919,11 +923,12 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
UNUSED(ib);
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
const int qk = QK8_1;
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -1005,11 +1010,12 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
UNUSED(ib);
|
||||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -1077,7 +1083,8 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
UNUSED(bx);
|
||||
|
|
@ -1205,11 +1212,12 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_tq1_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_tq1_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
UNUSED(bx);
|
||||
|
|
@ -1271,11 +1279,12 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_tq2_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_tq2_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
UNUSED(bx);
|
||||
|
|
@ -1463,11 +1472,12 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
assert(n % QK_K == 0);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
|
|
@ -1735,11 +1745,12 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
assert(n % QK_K == 0);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
|
|
@ -1913,11 +1924,12 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
UNUSED(kmask2);
|
||||
UNUSED(kmask3);
|
||||
UNUSED(utmp);
|
||||
ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
assert(n % QK_K == 0);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
|
|
@ -2123,11 +2135,12 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
UNUSED(kmask2);
|
||||
UNUSED(kmask3);
|
||||
UNUSED(utmp);
|
||||
ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
assert(n % QK_K == 0);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
|
|
@ -2328,7 +2341,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -2369,7 +2382,8 @@ static const int8_t keven_signs_q2xs[1024] = {
|
|||
};
|
||||
#endif
|
||||
|
||||
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
assert(n % QK_K == 0);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
|
|
@ -2483,11 +2497,12 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
|||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
assert(n % QK_K == 0);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
|
|
@ -2780,11 +2795,12 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
assert(n % QK_K == 0);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
|
|
@ -2965,11 +2981,12 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
assert(n % QK_K == 0);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
|
|
@ -3089,11 +3106,12 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
|||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
assert(n % QK_K == 0);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
|
|
@ -3299,11 +3317,17 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
// Dot product of a Q3_PT row against a Q8_K row.
// Thin dispatch wrapper: Q3_PT has no SIMD specialization yet, so forward
// directly to the generic scalar kernel. `levels` carries the per-tensor
// learned codebook and is consumed by the generic implementation.
// (The previous body marked `levels` with GGML_UNUSED and then used it;
// the misleading macro is removed.)
void ggml_vec_dot_q3_pt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    ggml_vec_dot_q3_pt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
}
|
||||
|
||||
void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
assert(n % QK_K == 0);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
|
|
@ -3418,11 +3442,12 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
assert(n % QK_K == 0);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
|
|
@ -3625,11 +3650,12 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
UNUSED(scale);
|
||||
ggml_vec_dot_iq1_m_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_iq1_m_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
UNUSED(bx);
|
||||
|
|
@ -3713,7 +3739,185 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
// Dot product of a Q4_DPT row (IQ4_NL block layout, but with a learned
// per-tensor 16-entry int8 codebook) against a Q8_0 row.
//
// Parameters follow the ggml_vec_dot_t contract; `levels` must point to the
// 16 int8 reconstruction values for this tensor (asserted non-NULL below).
// Fix vs. previous version: the misleading `GGML_UNUSED(levels)` is removed —
// `levels` is required and used on every path.
void ggml_vec_dot_q4_dpt_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK4_NL == 0);
    static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");

    const block_q4_dpt * GGML_RESTRICT x = vx;
    const block_q8_0   * GGML_RESTRICT y = vy;

    const int nb = n / QK4_NL;

    // per-tensor learned codebook: 16 int8 reconstruction values
    const int8_t * values = (const int8_t *)levels;
    GGML_ASSERT(values != NULL && "Q4_DPT levels not set for tensor");

    int ib = 0;
    float sumf = 0;

#if defined __AVX2__

    // the 16-entry codebook fits exactly in one 128-bit register,
    // so pshufb performs the index -> value lookup
    const __m128i values128 = _mm_loadu_si128((const __m128i*)values);
    const __m128i m4b       = _mm_set1_epi8(0x0f);
    const __m256i mone      = _mm256_set1_epi16(1);

    __m256 accum1 = _mm256_setzero_ps();
    __m256 accum2 = _mm256_setzero_ps();
    for (; ib + 1 < nb; ib += 2) {
        const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[ib + 0].qs);
        const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[ib + 1].qs);
        const __m256i q8b_1    = _mm256_loadu_si256((const __m256i *)y[ib + 0].qs);
        const __m256i q8b_2    = _mm256_loadu_si256((const __m256i *)y[ib + 1].qs);
        // low nibbles decode the first half of the block, high nibbles the second
        const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
                                              _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
        const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
                                              _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
        const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
        const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
        const __m256i p_1   = _mm256_madd_epi16(p16_1, mone);
        const __m256i p_2   = _mm256_madd_epi16(p16_2, mone);
        accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_CPU_FP16_TO_FP32(x[ib + 0].d)),
                _mm256_cvtepi32_ps(p_1), accum1);
        accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_CPU_FP16_TO_FP32(x[ib + 1].d)),
                _mm256_cvtepi32_ps(p_2), accum2);
    }

    sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));

#elif defined __AVX__
    const __m128i values128 = _mm_loadu_si128((const __m128i*)values);
    const __m128i m4b       = _mm_set1_epi8(0x0f);

    __m256 accum = _mm256_setzero_ps();
    for (; ib + 1 < nb; ib += 2) {
        const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
        const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
        const __m128i q8b_1_0  = _mm_loadu_si128((const __m128i *)y[ib + 0].qs);
        const __m128i q8b_1_1  = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1);
        const __m128i q8b_2_0  = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
        const __m128i q8b_2_1  = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);

        const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b));
        const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
        const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
        const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));

        const __m256 p      = mul_sum_i8_quad_float(q4b_1_0, q4b_1_1, q4b_2_0, q4b_2_1, q8b_1_0, q8b_1_1, q8b_2_0, q8b_2_1);
        const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
        accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
    }

    sumf = hsum_float_8(accum);

#endif
    // scalar reference path; also handles the odd trailing block after SIMD
    for (; ib < nb; ++ib) {
        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
        int sumi1 = 0, sumi2 = 0;
        for (int j = 0; j < QK4_NL/2; ++j) {
            sumi1 += y[ib].qs[j+ 0]       * values[x[ib].qs[j] & 0xf];
            sumi2 += y[ib].qs[j+QK4_NL/2] * values[x[ib].qs[j] >>  4];
        }
        sumf += d * (sumi1 + sumi2);
    }
    *s = sumf;
}
|
||||
|
||||
// Dot product of a Q2_DPT row (32-element blocks: fp16 scale + 8 bytes of
// 2-bit indices into a learned 4-entry int8 codebook) against a Q8_0 row.
// `levels` must point to the tensor's 4 int8 codebook values.
//
// NOTE(review): the previous AVX2 fast path was removed because it was
// unsound relative to the scalar reference below:
//  * it issued 128-bit loads from the 8-byte x[ib].qs array and from the
//    4-entry codebook, reading past the end of both buffers;
//  * it zero-extended signed q8 bytes with `& 0x00ff`, discarding their sign
//    before the multiply;
//  * its lane layout decoded 64 indices per 32-element block and paired the
//    codebook value of element 4k with the q8 byte of element 2k.
// A validated SIMD path can be reintroduced later; the scalar loop is the
// reference implementation.
void ggml_vec_dot_q2_dpt_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK2_DPT == 0);
    static_assert(QK2_DPT == QK8_0, "QK2_DPT and QK8_0 must be the same");

    const block_q2_dpt * GGML_RESTRICT x = vx;
    const block_q8_0   * GGML_RESTRICT y = vy;

    const int nb = n / QK2_DPT;

    // per-tensor learned codebook: 4 int8 reconstruction values
    const int8_t * values = (const int8_t *)levels;
    GGML_ASSERT(values != NULL && "Q2_DPT levels not set for tensor");

    float sumf = 0;
    for (int ib = 0; ib < nb; ++ib) {
        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
        int sumi = 0;
        // each qs byte packs 4 consecutive 2-bit indices, LSB first
        for (int j = 0; j < QK2_DPT/4; ++j) {
            const uint8_t q = x[ib].qs[j];
            sumi += y[ib].qs[j*4 + 0] * values[(q >> 0) & 3];
            sumi += y[ib].qs[j*4 + 1] * values[(q >> 2) & 3];
            sumi += y[ib].qs[j*4 + 2] * values[(q >> 4) & 3];
            sumi += y[ib].qs[j*4 + 3] * values[(q >> 6) & 3];
        }
        sumf += d * sumi;
    }
    *s = sumf;
}
|
||||
|
||||
void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
UNUSED(bx);
|
||||
|
|
@ -3815,6 +4019,6 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|||
UNUSED(x);
|
||||
UNUSED(y);
|
||||
UNUSED(nb);
|
||||
ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
||||
ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
|
||||
#endif
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@
|
|||
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-quants.h"
|
||||
#include "traits.h"
|
||||
#include "ggml-cpu-impl.h"
|
||||
#include "ggml-impl.h"
|
||||
|
|
@ -396,6 +397,52 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
|
|||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_Q3_PT] = {
|
||||
// from_float not set — requires codebook initialization via q3pt_set_codebook()
|
||||
.vec_dot = ggml_vec_dot_q3_pt_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_Q3_KPT] = {
|
||||
// from_float not set — requires level initialization via q3kpt_set_levels()
|
||||
.vec_dot = ggml_vec_dot_q3_kpt_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_Q4_DPT] = {
|
||||
// from_float not set — requires level initialization via q4dpt_set_levels()
|
||||
.vec_dot = ggml_vec_dot_q4_dpt_q8_0,
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_Q2_DPT] = {
|
||||
// from_float not set — requires level initialization via q2dpt_set_levels()
|
||||
.vec_dot = ggml_vec_dot_q2_dpt_q8_0,
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_Q2_KPT] = {
|
||||
// from_float not set — requires level initialization via q2kpt_set_levels()
|
||||
.vec_dot = ggml_vec_dot_q2_kpt_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
.levels_row_stride = 0, // computed dynamically: (ne0/QK_K)*Q2KPT_N_LEVELS*sizeof(float)
|
||||
},
|
||||
[GGML_TYPE_IQ2_TQ] = {
|
||||
.vec_dot = ggml_vec_dot_iq2_tq_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_IQ3_TQ] = {
|
||||
.vec_dot = ggml_vec_dot_iq3_tq_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_IQ1_BN] = {
|
||||
.vec_dot = ggml_vec_dot_iq1_bn_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_I32] = {
|
||||
.from_float = (ggml_from_float_t) ggml_cpu_fp32_to_i32,
|
||||
},
|
||||
|
|
@ -1165,8 +1212,15 @@ static void ggml_compute_forward_mul_mat_one_chunk(
|
|||
|
||||
const bool src1_cont = ggml_is_contiguous(src1);
|
||||
|
||||
ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
|
||||
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
|
||||
ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
|
||||
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
|
||||
// For Q2_KPT, levels are per-block: stride = (ne00 / QK_K) * Q2KPT_N_LEVELS * sizeof(float)
|
||||
// ne00 is the number of elements per row in src0 (input dimension), NOT ne0 (= ne01 = output rows).
|
||||
// For non-square matrices (e.g. ffn_up: [hidden, intermediate]) ne00 != ne01, so ne00 is correct.
|
||||
// For other types, use the static stride from type_traits_cpu
|
||||
const size_t levels_row_stride = (type == GGML_TYPE_Q2_KPT)
|
||||
? (ne00 / QK_K) * Q2KPT_N_LEVELS * sizeof(float)
|
||||
: type_traits_cpu[type].levels_row_stride;
|
||||
|
||||
// broadcast factors
|
||||
const int64_t r2 = ne12 / ne02;
|
||||
|
|
@ -1227,7 +1281,11 @@ static void ggml_compute_forward_mul_mat_one_chunk(
|
|||
//}
|
||||
|
||||
for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) {
|
||||
vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);
|
||||
// For Q2_KPT, levels are stored per-expert: [expert0_rows, expert1_rows, ...]
|
||||
// So for 3D tensors we need to index by (i03 * ne01 + ir0)
|
||||
const size_t levels_row_idx = (type == GGML_TYPE_Q2_KPT && ne03 > 1) ? (i03 * ne01 + ir0) : ir0;
|
||||
const void * row_levels = (const char*)src0->quant_levels + levels_row_idx * levels_row_stride;
|
||||
vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot, row_levels);
|
||||
}
|
||||
|
||||
for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) {
|
||||
|
|
@ -1293,7 +1351,8 @@ void ggml_compute_forward_mul_mat(
|
|||
nb1/ggml_type_size(dst->type),
|
||||
src0->type,
|
||||
src1->type,
|
||||
dst->type))
|
||||
dst->type,
|
||||
src0->quant_levels))
|
||||
goto UseGgmlGemm1;
|
||||
return;
|
||||
}
|
||||
|
|
@ -1361,7 +1420,8 @@ UseGgmlGemm1:;
|
|||
nb1/ggml_type_size(dst->type),
|
||||
src0->type,
|
||||
vec_dot_type,
|
||||
dst->type))
|
||||
dst->type,
|
||||
src0->quant_levels))
|
||||
goto UseGgmlGemm2;
|
||||
return;
|
||||
}
|
||||
|
|
@ -1461,8 +1521,14 @@ static void ggml_compute_forward_mul_mat_id_one_chunk(
|
|||
|
||||
const enum ggml_type type = src0->type;
|
||||
|
||||
ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
|
||||
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
|
||||
ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
|
||||
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
|
||||
// For Q2_KPT, levels are per-block: stride = (ne00 / QK_K) * Q2KPT_N_LEVELS * sizeof(float)
|
||||
// ne00 is the input dimension (elements per row in src0), NOT ne0 (= ne01 = output rows).
|
||||
// For other types, use the static stride from type_traits_cpu
|
||||
const size_t levels_row_stride = (type == GGML_TYPE_Q2_KPT)
|
||||
? (ne00 / QK_K) * Q2KPT_N_LEVELS * sizeof(float)
|
||||
: type_traits_cpu[type].levels_row_stride;
|
||||
|
||||
const int64_t blck_0 = 16;
|
||||
const int64_t blck_1 = 16;
|
||||
|
|
@ -1495,7 +1561,8 @@ static void ggml_compute_forward_mul_mat_id_one_chunk(
|
|||
float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2));
|
||||
|
||||
for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) {
|
||||
vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1);
|
||||
const void * row_levels = (const char*)src0->quant_levels + (cur_a * ne01 + ir0) * levels_row_stride;
|
||||
vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1, row_levels);
|
||||
}
|
||||
|
||||
memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir0_end) - iir0)*sizeof(float));
|
||||
|
|
|
|||
|
|
@ -1356,16 +1356,20 @@ class tinyBLAS_Q0_AVX {
|
|||
const TA *A, int64_t lda,
|
||||
const TB *B, int64_t ldb,
|
||||
TC *C, int64_t ldc,
|
||||
int ith, int nth)
|
||||
int ith, int nth,
|
||||
const int8_t * custom_table = nullptr)
|
||||
: A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
|
||||
const int8_t kvalues_iq4nl[16] = {
|
||||
-127, -104, -83, -65,
|
||||
-49, -35, -22, -10,
|
||||
1, 13, 25, 38,
|
||||
53, 69, 89, 113
|
||||
};
|
||||
|
||||
iq4nlt = _mm_loadu_si128((const __m128i *)kvalues_iq4nl);
|
||||
if (custom_table) {
|
||||
iq4nlt = _mm_loadu_si128((const __m128i *)custom_table);
|
||||
} else {
|
||||
const int8_t kvalues_iq4nl[16] = {
|
||||
-127, -104, -83, -65,
|
||||
-49, -35, -22, -10,
|
||||
1, 13, 25, 38,
|
||||
53, 69, 89, 113
|
||||
};
|
||||
iq4nlt = _mm_loadu_si128((const __m128i *)kvalues_iq4nl);
|
||||
}
|
||||
}
|
||||
|
||||
void matmul(int64_t m, int64_t n) {
|
||||
|
|
@ -3684,7 +3688,7 @@ class tinyBLAS_PPC {
|
|||
*/
|
||||
bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64_t n, int64_t k,
|
||||
const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
|
||||
int64_t ldc, int Atype, int Btype, int Ctype) {
|
||||
int64_t ldc, int Atype, int Btype, int Ctype, const void * quant_levels) {
|
||||
|
||||
assert(m >= 0);
|
||||
assert(n >= 0);
|
||||
|
|
@ -4024,6 +4028,26 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
|
|||
#endif
|
||||
}
|
||||
|
||||
case GGML_TYPE_Q4_DPT: {
|
||||
if (Btype != GGML_TYPE_Q8_0)
|
||||
return false;
|
||||
#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
|
||||
// Q4_DPT has identical block layout to IQ4_NL (block_q4_dpt = block_iq4_nl)
|
||||
// but uses a per-tensor lookup table instead of the fixed IQ4_NL values.
|
||||
const int8_t * levels = (const int8_t *)quant_levels;
|
||||
if (!levels) return false;
|
||||
tinyBLAS_Q0_AVX<block_iq4_nl, block_q8_0, float> tb{
|
||||
k, (const block_iq4_nl *)A, lda,
|
||||
(const block_q8_0 *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
params->ith, params->nth, levels};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@ extern "C" {
|
|||
|
||||
bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t, int64_t, int64_t,
|
||||
const void *, int64_t, const void *, int64_t, void *, int64_t,
|
||||
int, int, int);
|
||||
int, int, int, const void * quant_levels);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
|||
|
|
@ -8,6 +8,19 @@
|
|||
#include "unary-ops.h"
|
||||
#include "vec.h"
|
||||
|
||||
// Helper: compute quant_levels stride for a given row.
|
||||
// For most types this is the constant levels_row_stride from type_traits.
|
||||
// For Q2_KPT (per-block levels), stride depends on tensor width (ne[0]).
|
||||
// Per-row stride, in bytes, into a tensor's quant_levels buffer.
// Most quantization types use the constant stride recorded in the CPU type
// traits. Q2_KPT is the exception: its levels are stored per 256-element
// block (4 floats each), so the stride scales with the row width ne0.
static inline size_t ggml_quant_levels_stride(ggml_type type, size_t constant_stride, int64_t ne0) {
    if (type != GGML_TYPE_Q2_KPT) {
        return constant_stride;
    }
    // (ne0 / 256) blocks per row, 4 float levels per block
    const size_t blocks_per_row = (size_t)(ne0 / 256);
    return blocks_per_row * 4 * sizeof(float);
}
|
||||
|
||||
|
||||
#include <algorithm>
|
||||
#include <cfloat>
|
||||
#include <cmath>
|
||||
|
|
@ -517,9 +530,11 @@ static void ggml_compute_forward_dup_from_q(
|
|||
const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
|
||||
const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
|
||||
|
||||
const size_t q_lrs0 = ggml_quant_levels_stride(src0->type, ggml_get_type_traits_cpu(src0->type)->levels_row_stride, src0->ne[0]);
|
||||
dequantize_row_q(
|
||||
(const void *) ((char *) src0->data + x_offset),
|
||||
(float *) ((char *) dst->data + dst_offset), qk);
|
||||
(float *) ((char *) dst->data + dst_offset), qk,
|
||||
(const char*)src0->quant_levels + i01 * q_lrs0);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -639,7 +654,8 @@ static void ggml_compute_forward_add_q_f32(
|
|||
assert(ne00 % 32 == 0);
|
||||
|
||||
// unquantize row from src0 to temp buffer
|
||||
dequantize_row_q(src0_row, wdata, ne00);
|
||||
const size_t q_lrs_add = ggml_quant_levels_stride(src0->type, ggml_get_type_traits_cpu(src0->type)->levels_row_stride, src0->ne[0]);
|
||||
dequantize_row_q(src0_row, wdata, ne00, (const char*)src0->quant_levels + i1 * q_lrs_add);
|
||||
// add src1
|
||||
ggml_vec_acc_f32(ne00, wdata, src1_row);
|
||||
// quantize row to dst
|
||||
|
|
@ -688,6 +704,9 @@ void ggml_compute_forward_add(
|
|||
case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_IQ2_TQ:
|
||||
case GGML_TYPE_IQ3_TQ:
|
||||
case GGML_TYPE_IQ1_BN:
|
||||
{
|
||||
ggml_compute_forward_add_q_f32(params, dst);
|
||||
} break;
|
||||
|
|
@ -974,7 +993,8 @@ static void ggml_compute_forward_add1_q_f32(
|
|||
assert(ne0 % 32 == 0);
|
||||
|
||||
// unquantize row from src0 to temp buffer
|
||||
dequantize_row_q(src0_row, wdata, ne0);
|
||||
const size_t q_lrs_add = ggml_quant_levels_stride(src0->type, ggml_get_type_traits_cpu(src0->type)->levels_row_stride, src0->ne[0]);
|
||||
dequantize_row_q(src0_row, wdata, ne00, (const char*)src0->quant_levels + i1 * q_lrs_add);
|
||||
// add src1
|
||||
ggml_vec_acc1_f32(ne0, wdata, v);
|
||||
// quantize row to dst
|
||||
|
|
@ -1139,6 +1159,9 @@ void ggml_compute_forward_add1(
|
|||
case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_IQ2_TQ:
|
||||
case GGML_TYPE_IQ3_TQ:
|
||||
case GGML_TYPE_IQ1_BN:
|
||||
{
|
||||
ggml_compute_forward_add1_q_f32(params, dst);
|
||||
} break;
|
||||
|
|
@ -1269,6 +1292,9 @@ void ggml_compute_forward_acc(
|
|||
case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_IQ2_TQ:
|
||||
case GGML_TYPE_IQ3_TQ:
|
||||
case GGML_TYPE_IQ1_BN:
|
||||
default:
|
||||
{
|
||||
GGML_ABORT("fatal error");
|
||||
|
|
@ -4321,7 +4347,8 @@ static void ggml_compute_forward_out_prod_q_f32(
|
|||
float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
|
||||
float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
|
||||
|
||||
dequantize_row_q(s0, wdata, ne0);
|
||||
const size_t q_lrs_op = ggml_quant_levels_stride(src0->type, ggml_get_type_traits_cpu(src0->type)->levels_row_stride, src0->ne[0]);
|
||||
dequantize_row_q(s0, wdata, ne0, (const char*)src0->quant_levels + i01 * q_lrs_op);
|
||||
ggml_vec_mad_f32(ne0, d, wdata, *s1);
|
||||
}
|
||||
}
|
||||
|
|
@ -4358,6 +4385,9 @@ void ggml_compute_forward_out_prod(
|
|||
case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_IQ2_TQ:
|
||||
case GGML_TYPE_IQ3_TQ:
|
||||
case GGML_TYPE_IQ1_BN:
|
||||
{
|
||||
ggml_compute_forward_out_prod_q_f32(params, dst);
|
||||
} break;
|
||||
|
|
@ -4635,6 +4665,9 @@ void ggml_compute_forward_set(
|
|||
case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_IQ2_TQ:
|
||||
case GGML_TYPE_IQ3_TQ:
|
||||
case GGML_TYPE_IQ1_BN:
|
||||
default:
|
||||
{
|
||||
GGML_ABORT("fatal error");
|
||||
|
|
@ -4698,9 +4731,21 @@ static void ggml_compute_forward_get_rows_q(
|
|||
|
||||
GGML_ASSERT(i01 >= 0 && i01 < ne01);
|
||||
|
||||
const size_t q_lrs_gr = ggml_quant_levels_stride(src0->type, ggml_get_type_traits_cpu(src0->type)->levels_row_stride, src0->ne[0]);
|
||||
// For Q2_KPT with 3D tensors, levels are indexed by [i12 * ne02 * ne01 + i11 * ne01 + i01]
|
||||
// For 2D tensors, levels are indexed by [i11 * ne01 + i01] (or just [i01] if ne02 == 1)
|
||||
size_t levels_row_idx;
|
||||
if (type == GGML_TYPE_Q2_KPT && ne03 > 1) {
|
||||
levels_row_idx = (i12 * ne02 + i11) * ne01 + i01;
|
||||
} else if (type == GGML_TYPE_Q2_KPT) {
|
||||
levels_row_idx = i11 * ne01 + i01;
|
||||
} else {
|
||||
levels_row_idx = i01;
|
||||
}
|
||||
dequantize_row_q(
|
||||
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
|
||||
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
|
||||
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc,
|
||||
(const char*)src0->quant_levels + levels_row_idx * q_lrs_gr);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -4859,6 +4904,9 @@ void ggml_compute_forward_get_rows(
|
|||
case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_IQ2_TQ:
|
||||
case GGML_TYPE_IQ3_TQ:
|
||||
case GGML_TYPE_IQ1_BN:
|
||||
{
|
||||
ggml_compute_forward_get_rows_q(params, dst);
|
||||
} break;
|
||||
|
|
@ -5436,7 +5484,7 @@ static void ggml_compute_forward_soft_max_ext_back_f32(
|
|||
|
||||
// linear runtime, no additional memory
|
||||
float dot_y_dy = 0;
|
||||
ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1);
|
||||
ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1, nullptr);
|
||||
ggml_vec_cpy_f32 (nc, dx, dy);
|
||||
ggml_vec_acc1_f32 (nc, dx, -dot_y_dy);
|
||||
ggml_vec_mul_f32 (nc, dx, dx, y);
|
||||
|
|
@ -5571,6 +5619,8 @@ void ggml_compute_forward_clamp(
|
|||
case GGML_TYPE_NVFP4:
|
||||
case GGML_TYPE_Q2_K:
|
||||
case GGML_TYPE_Q3_K:
|
||||
case GGML_TYPE_Q3_KPT:
|
||||
case GGML_TYPE_Q4_DPT:
|
||||
case GGML_TYPE_Q4_K:
|
||||
case GGML_TYPE_Q5_K:
|
||||
case GGML_TYPE_Q6_K:
|
||||
|
|
@ -5583,6 +5633,12 @@ void ggml_compute_forward_clamp(
|
|||
case GGML_TYPE_IQ1_M:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_Q3_PT:
|
||||
case GGML_TYPE_Q2_KPT:
|
||||
case GGML_TYPE_Q2_DPT:
|
||||
case GGML_TYPE_IQ2_TQ:
|
||||
case GGML_TYPE_IQ3_TQ:
|
||||
case GGML_TYPE_IQ1_BN:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_Q8_K:
|
||||
|
|
@ -6007,7 +6063,7 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
|
|||
float v = 0;
|
||||
ggml_vec_dot_f16(ne02, &v, 0,
|
||||
(ggml_fp16_t *) wdata_src + i1n, 0,
|
||||
(ggml_fp16_t *) wdata_kernel + i00*ne02, 0, 1);
|
||||
(ggml_fp16_t *) wdata_kernel + i00*ne02, 0, 1, nullptr);
|
||||
dst_data[i10*s0 + i00] += v;
|
||||
}
|
||||
}
|
||||
|
|
@ -6095,7 +6151,7 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
|
|||
float v = 0;
|
||||
ggml_vec_dot_f32(ne02, &v, 0,
|
||||
wdata_src + i1n, 0,
|
||||
wdata_kernel + i00*ne02, 0, 1);
|
||||
wdata_kernel + i00*ne02, 0, 1, nullptr);
|
||||
dst_data[i10*s0 + i00] += v;
|
||||
}
|
||||
}
|
||||
|
|
@ -7021,11 +7077,11 @@ static void ggml_compute_forward_conv_transpose_2d_impl(
|
|||
if constexpr (std::is_same_v<kernel_t, ggml_fp16_t>) {
|
||||
ggml_vec_dot_f16(ne03, &v, 0,
|
||||
wdata_src + i1n, 0,
|
||||
wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1);
|
||||
wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1, nullptr);
|
||||
} else {
|
||||
ggml_vec_dot_f32(ne03, &v, 0,
|
||||
wdata_src + i1n, 0,
|
||||
wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1);
|
||||
wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1, nullptr);
|
||||
}
|
||||
dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
|
||||
}
|
||||
|
|
@ -8298,7 +8354,7 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
|
|||
float s; // KQ value
|
||||
|
||||
const char * k_data = (const char *) k->data + ( ic*nbk1 + ik2*nbk2 + ik3*nbk3);
|
||||
kq_vec_dot(DK, &s, 0, k_data, 0, Q_q, 0, 1);
|
||||
kq_vec_dot(DK, &s, 0, k_data, 0, Q_q, 0, 1, k->quant_levels);
|
||||
|
||||
s = s*scale; // scale KQ value
|
||||
|
||||
|
|
@ -8345,7 +8401,7 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
|
|||
|
||||
// V += v*expf(s - M)
|
||||
if (v_to_float) {
|
||||
v_to_float(v_data, V32, DV);
|
||||
v_to_float(v_data, V32, DV, v->quant_levels);
|
||||
ggml_vec_mad_f32(DV, VKQ32, V32, vs);
|
||||
} else {
|
||||
// V is F32
|
||||
|
|
@ -9058,7 +9114,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
|
|||
ggml_vec_dot_f32(neq0,
|
||||
S + i1, 0,
|
||||
(float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
|
||||
(float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
|
||||
(float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1, nullptr);
|
||||
}
|
||||
|
||||
// scale
|
||||
|
|
@ -9172,7 +9228,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
|
|||
|
||||
// S = SM * (S - dot(SM, S))
|
||||
float dot_SM_gradSM = 0;
|
||||
ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, 0, SM, 0, S, 0, 1);
|
||||
ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, 0, SM, 0, S, 0, 1, nullptr);
|
||||
ggml_vec_acc1_f32(M, S, -dot_SM_gradSM);
|
||||
ggml_vec_mul_f32 (masked_begin, S, S, SM);
|
||||
|
||||
|
|
@ -10535,7 +10591,7 @@ static void ggml_compute_forward_gated_delta_net_one_chunk(
|
|||
// delta[j] = sum_i S[i][j] * k[i] = dot(row j of M, k)
|
||||
for (int64_t j = 0; j < S_v; ++j) {
|
||||
float sum = 0.0f;
|
||||
ggml_vec_dot_f32(S_v, &sum, 0, &s_out[j * S_v], 0, k_d, 0, 1);
|
||||
ggml_vec_dot_f32(S_v, &sum, 0, &s_out[j * S_v], 0, k_d, 0, 1, nullptr);
|
||||
delta[j] = (v_d[j] - sum) * beta_val;
|
||||
}
|
||||
|
||||
|
|
@ -10547,7 +10603,7 @@ static void ggml_compute_forward_gated_delta_net_one_chunk(
|
|||
// attn_out[j] = sum_i S[i][j] * q[i] = dot(row j of M, q)
|
||||
for (int64_t j = 0; j < S_v; ++j) {
|
||||
float sum = 0.0f;
|
||||
ggml_vec_dot_f32(S_v, &sum, 0, &s_out[j * S_v], 0, q_d, 0, 1);
|
||||
ggml_vec_dot_f32(S_v, &sum, 0, &s_out[j * S_v], 0, q_d, 0, 1, nullptr);
|
||||
attn_data[j] = sum * scale;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -120,7 +120,8 @@ void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * GGML_RESTRI
|
|||
|
||||
//===================================== Dot products =================================
|
||||
|
||||
void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
const int qk = QK1_0;
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -165,7 +166,8 @@ void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
|
|||
}
|
||||
|
||||
|
||||
void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -202,7 +204,8 @@ void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
|
|||
}
|
||||
|
||||
// TODO: add WASM SIMD
|
||||
void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
const int qk = QK8_1;
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -238,7 +241,8 @@ void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c
|
|||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
UNUSED(bx);
|
||||
|
|
@ -270,7 +274,8 @@ void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|||
}
|
||||
|
||||
// NVFP4: super-block of 64 elements = 4 sub-blocks of 16 = 2 q8_0 blocks
|
||||
void ggml_vec_dot_nvfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_nvfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
UNUSED(bx);
|
||||
|
|
@ -305,7 +310,8 @@ void ggml_vec_dot_nvfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -348,7 +354,8 @@ void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
|
|||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
const int qk = QK8_1;
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -391,7 +398,8 @@ void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c
|
|||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
const int qk = QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
||||
|
|
@ -421,7 +429,8 @@ void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
|
|||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
UNUSED(bx);
|
||||
|
|
@ -473,7 +482,8 @@ void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
UNUSED(bx);
|
||||
|
|
@ -505,7 +515,8 @@ void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
UNUSED(bx);
|
||||
|
|
@ -557,7 +568,8 @@ void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
|
|||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
assert(n % QK_K == 0);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
|
|
@ -636,7 +648,8 @@ void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
|
|||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
assert(n % QK_K == 0);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
|
|
@ -710,8 +723,7 @@ void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
|
|||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
assert(n % QK_K == 0);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
|
|
@ -741,6 +753,7 @@ void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
|
|||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
GGML_UNUSED(levels);
|
||||
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT hm = x[i].qh;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
|
|
@ -791,7 +804,8 @@ void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
|
|||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
assert(n % QK_K == 0);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
|
|
@ -846,7 +860,8 @@ void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
|
|||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
assert(n % QK_K == 0);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
|
|
@ -888,7 +903,8 @@ void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs
|
|||
*s = 0.125f * sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
assert(n % QK_K == 0);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
|
|
@ -938,7 +954,8 @@ void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|||
*s = 0.125f * sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
assert(n % QK_K == 0);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
|
|
@ -990,7 +1007,8 @@ void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|||
*s = 0.125f * sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
assert(n % QK_K == 0);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
|
|
@ -1034,7 +1052,8 @@ void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs
|
|||
*s = 0.25f * sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
assert(n % QK_K == 0);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
|
|
@ -1090,7 +1109,65 @@ void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
// Q3_PT x Q8_K dot product: 3-bit per-tensor Lloyd-Max quantization with
// 16-element affine sub-blocks. `levels` points to the tensor's trained table
// of 8 float reconstruction levels, indexed directly by the 3-bit quant and
// mapped into each sub-block's affine span [sub_min, sub_min + range].
// `levels` is mandatory for this type (hence no GGML_UNUSED on it).
void ggml_vec_dot_q3_pt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q3_pt * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

    const float * lv = (const float *)levels;
    GGML_ASSERT(lv != NULL && "Q3_PT levels not set for tensor");

    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float xd    = GGML_CPU_FP16_TO_FP32(x[i].d);
        const float xdmin = GGML_CPU_FP16_TO_FP32(x[i].dmin);
        const float yd    = y[i].d;
        const uint8_t * sc = x[i].scales;
        const uint8_t * qs = x[i].qs;
        const int8_t  * q8 = y[i].qs;

        float block_sum = 0.f;
        for (int ib = 0; ib < QK_K/16; ++ib) {
            // Inline 6-bit unpack for range scale (index ib) and neg_min scale (index ib + QK_K/16);
            // a 6-bit field crosses into the next byte whenever its start offset > 2.
            const int sbit0 = ib * 6, sbyte0 = sbit0 / 8, soff0 = sbit0 % 8;
            const int sbit1 = (ib + QK_K/16) * 6, sbyte1 = sbit1 / 8, soff1 = sbit1 % 8;
            uint8_t qrange = (sc[sbyte0] >> soff0) & 0x3F;
            if (soff0 > 2) { qrange |= (uint8_t)((sc[sbyte0+1] << (8 - soff0)) & 0x3F); }
            uint8_t qnmin = (sc[sbyte1] >> soff1) & 0x3F;
            if (soff1 > 2) { qnmin |= (uint8_t)((sc[sbyte1+1] << (8 - soff1)) & 0x3F); }
            const float range   = xd * (float)qrange;
            const float sub_min = -xdmin * (float)qnmin; // dmin stores the negated minimum

            float sum_lq = 0.f;
            for (int j = 0; j < 16; ++j) {
                // Inline 3-bit unpack; a 3-bit field spills into the next byte when offset > 5
                const int qk = ib * 16 + j;
                const int qbit = qk * 3;
                const int qbyte = qbit / 8;
                const int qoff = qbit % 8;
                int q = (qs[qbyte] >> qoff) & 0x7;
                if (qoff > 5) { q |= (int)((qs[qbyte+1] << (8 - qoff)) & 0x7); }
                sum_lq += lv[q] * (float)q8[qk];
            }
            // min contribution uses precomputed 16-element sum from block_q8_K.bsums
            block_sum += sum_lq * range + sub_min * (float)y[i].bsums[ib];
        }
        sumf += block_sum * yd;
    }
    *s = sumf;
}
|
||||
|
||||
void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
assert(n % QK_K == 0);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
|
|
@ -1133,7 +1210,375 @@ void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
// Q3_KPT vec_dot - similar to Q3_K but with learned levels
|
||||
void ggml_vec_dot_q3_kpt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
assert(n % QK_K == 0);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
UNUSED(bx);
|
||||
UNUSED(by);
|
||||
UNUSED(bs);
|
||||
|
||||
const block_q3_kpt * GGML_RESTRICT x = vx;
|
||||
const block_q8_K * GGML_RESTRICT y = vy;
|
||||
|
||||
const int nb = n / QK_K;
|
||||
|
||||
const float * lv = (const float *)levels;
|
||||
GGML_ASSERT(lv != NULL && "Q3_KPT levels not set for tensor");
|
||||
|
||||
const uint32_t kmask1 = 0x03030303;
|
||||
const uint32_t kmask2 = 0x0f0f0f0f;
|
||||
|
||||
float sumf = 0.f;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d_all = GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
const float yd = y[i].d;
|
||||
const uint8_t * q = x[i].qs;
|
||||
const uint8_t * hm = x[i].hmask;
|
||||
const int8_t * q8 = y[i].qs;
|
||||
uint8_t m = 1;
|
||||
|
||||
uint32_t aux32[4];
|
||||
memcpy(aux32, x[i].scales, 12);
|
||||
uint32_t tmp = aux32[2];
|
||||
aux32[2] = ((aux32[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
|
||||
aux32[3] = ((aux32[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
|
||||
aux32[0] = (aux32[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
|
||||
aux32[1] = (aux32[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
|
||||
const uint8_t * aux = (const uint8_t *)aux32;
|
||||
|
||||
int is = 0;
|
||||
float block_sum = 0.f;
|
||||
for (int blk = 0; blk < QK_K; blk += 128) {
|
||||
int shift = 0;
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
int sc1 = (int)aux[is] - 32;
|
||||
int sc2 = (int)aux[is+1] - 32;
|
||||
is += 2;
|
||||
float dl1 = d_all * sc1;
|
||||
float dl2 = d_all * sc2;
|
||||
|
||||
float sum1 = 0.f, sum2 = 0.f;
|
||||
for (int l = 0; l < 16; ++l) {
|
||||
int k_idx = ((q[l+0] >> shift) & 3) + ((hm[l+0] & m) ? 4 : 0);
|
||||
sum1 += (lv[k_idx] * 7.0f - 4.0f) * (float)q8[l+0];
|
||||
}
|
||||
for (int l = 0; l < 16; ++l) {
|
||||
int k_idx = ((q[l+16] >> shift) & 3) + ((hm[l+16] & m) ? 4 : 0);
|
||||
sum2 += (lv[k_idx] * 7.0f - 4.0f) * (float)q8[l+16];
|
||||
}
|
||||
block_sum += dl1 * sum1 + dl2 * sum2;
|
||||
|
||||
shift += 2;
|
||||
m <<= 1;
|
||||
q8 += 32;
|
||||
}
|
||||
q += 32;
|
||||
}
|
||||
sumf += block_sum * yd;
|
||||
}
|
||||
*s = sumf;
|
||||
}
|
||||
|
||||
// Dispatch entry point for the Q3_KPT x Q8_K dot product.
// Currently always forwards to the generic scalar kernel; kept as a separate
// symbol so a SIMD specialization can be slotted in without touching callers.
void ggml_vec_dot_q3_kpt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    ggml_vec_dot_q3_kpt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
}
|
||||
|
||||
// Q2_KPT vec_dot - similar to Q2_K but with learned levels.
// Layout matches Q2_K (2-bit indices, per-16-element 4-bit scale + 4-bit min),
// but the four reconstruction points come from `levels` instead of {0,1,2,3}.
// Levels are per-block here: block i reads Q2KPT_N_LEVELS floats starting at
// lv[i * Q2KPT_N_LEVELS] (so `levels` must cover the whole row).
// `levels` is mandatory: asserted non-NULL below.
void ggml_vec_dot_q2_kpt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q2_kpt * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

    const float * lv = (const float *)levels;
    GGML_ASSERT(lv != NULL && "Q2_KPT levels not set for tensor");

    float sumf = 0;

    for (int i = 0; i < nb; ++i) {
        // Per-block levels: block i uses lv[i*4 + 0..3]
        const float * block_lv = lv + i * Q2KPT_N_LEVELS;

        // Precompute mapped levels for this block: ml[k] = levels[k] * 3.0
        // (the *3 maps the learned levels onto Q2_K's 0..3 quant range)
        float ml[Q2KPT_N_LEVELS];
        for (int k = 0; k < Q2KPT_N_LEVELS; ++k) {
            ml[k] = block_lv[k] * 3.0f;
        }

        const uint8_t * q2 = x[i].qs;
        const int8_t * q8 = y[i].qs;
        const uint8_t * sc = x[i].scales;

        // Min term: accumulate integer bsums * min_scale (same as Q2_K)
        int summs = 0;
        for (int j = 0; j < 16; ++j) {
            summs += y[i].bsums[j] * (sc[j] >> 4);
        }

        // combined super-block scales: dall for quants, dmin for the min offsets
        const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);

        // Scale term: need floating-point because levels are non-uniform
        int is = 0;
        float fsum = 0;
        // QK_K elements processed as 128-element chunks: 4 bit-planes (shift 0/2/4/6),
        // each plane covering two 16-element sub-blocks with their own 4-bit scale
        for (int k = 0; k < QK_K/128; ++k) {
            int shift = 0;
            for (int j = 0; j < 4; ++j) {
                int d_sc = sc[is++] & 0xF;
                float suml = 0;
                for (int l = 0; l < 16; ++l) {
                    int idx = (q2[l] >> shift) & 3;
                    suml += ml[idx] * (float)q8[l];
                }
                fsum += d_sc * suml;

                d_sc = sc[is++] & 0xF;
                suml = 0;
                for (int l = 16; l < 32; ++l) {
                    int idx = (q2[l] >> shift) & 3;
                    suml += ml[idx] * (float)q8[l];
                }
                fsum += d_sc * suml;

                shift += 2;
                q8 += 32;
            }
            q2 += 32;
        }
        sumf += dall * fsum - dmin * summs;
    }
    *s = sumf;
}
|
||||
|
||||
// Dispatch entry point for the Q2_KPT x Q8_K dot product.
// Currently always forwards to the generic scalar kernel; kept as a separate
// symbol so a SIMD specialization can be slotted in without touching callers.
void ggml_vec_dot_q2_kpt_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    ggml_vec_dot_q2_kpt_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc, levels);
}
|
||||
|
||||
// IQ2_TQ: 2-bit with asymmetric 4-tuple grid per group
// Default grid table — only used when no per-tensor grid is available
// (a trained per-tensor grid arrives through the `levels` vec_dot argument).
// Each row is one 4-entry reconstruction tuple; a 4-bit selector per
// 8-element group picks the row, and the 2-bit quant picks the entry.
// Values are integers, scaled at runtime by d * IQ2TQ_GRID_SCALE.
// NOTE(review): this table presumably mirrors the quantizer's default grid —
// keep the two in sync if either changes.
static const int8_t iq2tq_grid_cpu[16][4] = {
    {-20, -8, -2, 6}, {-14, -8, -2, 4}, {-16,-10, 0, 12}, {-14, -4, 2, 8},
    {-20, -4, 4, 12}, {-8, -4, 0, 4}, {-8, -4, 0, 8}, {-12, -6, 2, 12},
    {-4, -2, 2, 4}, {-10, -2, 4, 8}, {-16, -6, 4, 20}, {-12, -2, 6, 14},
    {-8, -2, 4, 14}, {-4, 0, 4, 8}, {-8, -2, 6, 22}, {-4, 2, 8, 14},
};
|
||||
|
||||
// IQ2_TQ x Q8_K dot product. Every 8-element group carries a 4-bit selector
// choosing one of 16 asymmetric 4-entry tuples (per-tensor trained grid via
// `levels`, or the built-in default); the 2-bit quants index into that tuple.
// The integer accumulation is scaled by d * IQ2TQ_GRID_SCALE * y.d per block.
void ggml_vec_dot_iq2_tq_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs);

    // fall back to the compiled-in grid when no per-tensor grid was provided
    const int8_t (*tuples)[4] = levels ? (const int8_t (*)[4])levels : (const int8_t (*)[4])iq2tq_grid_cpu;
    const block_iq2_tq * GGML_RESTRICT xb = vx;
    const block_q8_K   * GGML_RESTRICT yb = vy;
    const int nblocks = n / QK_K;

    float acc = 0;

    for (int ib = 0; ib < nblocks; ++ib) {
        const int8_t * q8 = yb[ib].qs;
        int32_t isum = 0;

        for (int g = 0; g < IQ2TQ_N_GROUPS; ++g) {
            // two 4-bit tuple selectors per scales byte, low nibble first
            const int sel = (xb[ib].scales[g / 2] >> ((g % 2) ? 4 : 0)) & 0xF;
            const int8_t * tup = tuples[sel];
            const int8_t * yq  = q8 + 8*g;

            for (int k = 0; k < 8; ++k) {
                const int j  = 8*g + k;
                // four 2-bit indices per qs byte
                const int qi = (xb[ib].qs[j / 4] >> ((j % 4) * 2)) & 3;
                isum += (int32_t)tup[qi] * (int32_t)yq[k];
            }
        }

        acc += GGML_CPU_FP16_TO_FP32(xb[ib].d) * IQ2TQ_GRID_SCALE * yb[ib].d * (float)isum;
    }

    *s = acc;
}
|
||||
|
||||
// IQ3_TQ default grid (must match ggml-quants.c)
// Only used when no per-tensor trained grid is supplied through the `levels`
// vec_dot argument. Each row is an 8-entry reconstruction tuple; a 4-bit
// selector per 8-element group picks the row, and the 3-bit quant picks the
// entry. Values are integers, scaled at runtime by d * IQ3TQ_GRID_SCALE.
static const int8_t iq3tq_grid_cpu[16][8] = {
    {-24,-18,-12, -6, 0, 6, 12, 18},
    {-20,-15,-10, -5, 0, 5, 10, 15},
    {-16,-12, -8, -4, 0, 4, 8, 12},
    {-12, -8, -4, -2, 0, 2, 4, 8},
    {-24,-16, -8, -2, 2, 6, 10, 14},
    {-14,-10, -6, -2, 2, 8, 16, 24},
    {-20,-14, -8, -4, 0, 4, 10, 18},
    {-18,-10, -4, 0, 4, 8, 14, 20},
    { -8, -6, -4, -2, 0, 2, 4, 6},
    {-10, -6, -4, -2, 2, 4, 6, 10},
    {-22,-14, -6, -2, 2, 6, 14, 22},
    {-16, -8, -4, -2, 0, 4, 8, 16},
    {-24,-20,-16,-12, -8, -4, 0, 4},
    { -4, 0, 4, 8, 12, 16, 20, 24},
    {-20,-16,-10, -4, 4, 10, 16, 20},
    {-12, -8, -6, -2, 2, 6, 8, 12},
};
|
||||
|
||||
// IQ3_TQ x Q8_K dot product. Every 8-element group carries a 4-bit selector
// choosing one of 16 8-entry tuples (per-tensor trained grid via `levels`,
// or the built-in default); the packed 3-bit quants index into that tuple.
// The integer accumulation is scaled by d * IQ3TQ_GRID_SCALE * y.d per block.
void ggml_vec_dot_iq3_tq_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs);

    // fall back to the compiled-in grid when no per-tensor grid was provided
    const int8_t (*tuples)[8] = levels ? (const int8_t (*)[8])levels : (const int8_t (*)[8])iq3tq_grid_cpu;
    const block_iq3_tq * GGML_RESTRICT xb = vx;
    const block_q8_K   * GGML_RESTRICT yb = vy;
    const int nblocks = n / QK_K;

    float acc = 0;

    for (int ib = 0; ib < nblocks; ++ib) {
        const int8_t * q8 = yb[ib].qs;
        int32_t isum = 0;

        for (int g = 0; g < IQ3TQ_N_GROUPS; ++g) {
            // two 4-bit tuple selectors per scales byte, low nibble first
            const int sel = (xb[ib].scales[g / 2] >> (4 * (g % 2))) & 0xF;
            const int8_t * tup = tuples[sel];
            const int8_t * yq  = q8 + 8*g;

            for (int k = 0; k < 8; ++k) {
                const int j       = 8*g + k;
                const int bit_pos = 3*j;
                const int byte_i  = bit_pos >> 3;
                const int bit_off = bit_pos & 7;
                // a 3-bit field spills into the next byte when its offset > 5
                uint16_t bits = xb[ib].qs[byte_i];
                if (bit_off > 5) {
                    bits |= (uint16_t)xb[ib].qs[byte_i + 1] << 8;
                }
                isum += (int32_t)tup[(bits >> bit_off) & 7] * (int32_t)yq[k];
            }
        }

        acc += GGML_CPU_FP16_TO_FP32(xb[ib].d) * IQ3TQ_GRID_SCALE * yb[ib].d * (float)isum;
    }

    *s = acc;
}
|
||||
|
||||
// IQ1_BN: 8D vector quantized — codebook[256][8] + scale_table[16]
|
||||
void ggml_vec_dot_iq1_bn_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
assert(n % QK_K == 0);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs);
|
||||
|
||||
GGML_ASSERT(levels && "IQ1_BN requires per-tensor codebook in quant_levels");
|
||||
const int8_t * codebook = (const int8_t *)levels;
|
||||
const block_iq1_bn * GGML_RESTRICT x = vx;
|
||||
const block_q8_K * GGML_RESTRICT y = vy;
|
||||
const int nb = n / QK_K;
|
||||
|
||||
float sumf = 0;
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * IQ1BN_GRID_SCALE;
|
||||
const float yd = y[i].d;
|
||||
const int8_t * q8 = y[i].qs;
|
||||
|
||||
int32_t block_sum = 0;
|
||||
|
||||
for (int g = 0; g < IQ1BN_N_GROUPS; ++g) {
|
||||
int ci = (g & 1)
|
||||
? ((x[i].qs[3*(g/2)+1] >> 4) | ((int)x[i].qs[3*(g/2)+2] << 4))
|
||||
: (x[i].qs[3*(g/2)] | (((int)x[i].qs[3*(g/2)+1] & 0x0F) << 8));
|
||||
const int8_t * cb = codebook + ci * IQ1BN_CODEBOOK_DIM;
|
||||
const int8_t * q8g = q8 + g * IQ1BN_GROUP_SIZE;
|
||||
|
||||
for (int k = 0; k < IQ1BN_CODEBOOK_DIM; ++k) {
|
||||
block_sum += (int32_t)cb[k] * (int32_t)q8g[k];
|
||||
}
|
||||
}
|
||||
|
||||
sumf += d * yd * (float)block_sum;
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
}
|
||||
|
||||
// Q4_DPT x Q8_0 dot product: IQ4_NL block layout (nibble-packed 4-bit quants,
// fp16 scale) but the 16 reconstruction values are a learned per-tensor int8
// table passed through `levels` instead of the fixed non-linear kvalues.
// `levels` is mandatory for this type (hence no GGML_UNUSED on it).
void ggml_vec_dot_q4_dpt_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK4_NL == 0);
    static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");

    const block_q4_dpt * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

    const int nb = n / QK4_NL;

    const int8_t * values = (const int8_t *)levels;
    GGML_ASSERT(values != NULL && "Q4_DPT levels not set for tensor");

    float sumf = 0;
    for (int ib = 0; ib < nb; ++ib) {
        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d) * GGML_CPU_FP16_TO_FP32(x[ib].d);
        int32_t blk = 0;
        for (int j = 0; j < QK4_NL/2; ++j) {
            // low nibbles map to the first half of the block, high nibbles to the second
            blk += (int32_t)y[ib].qs[j+ 0] * (int32_t)values[x[ib].qs[j] & 0xf];
            blk += (int32_t)y[ib].qs[j+QK4_NL/2] * (int32_t)values[x[ib].qs[j] >> 4];
        }
        sumf += d * (float)blk;
    }
    *s = sumf;
}
|
||||
|
||||
// Dot product of one Q2_DPT row (2-bit indices into a per-tensor learned 4-entry
// level table) with one Q8_0 row.
//  n      - number of elements; must be a multiple of QK2_DPT
//  s      - out: scalar dot product
//  vx     - block_q2_dpt data
//  vy     - block_q8_0 data
//  levels - per-tensor table of 4 int8 dequantization levels (required, asserted non-NULL)
// bs/bx/by/nrc are unused in this generic single-row path.
void ggml_vec_dot_q2_dpt_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK2_DPT == 0);
    static_assert(QK2_DPT == QK8_0, "QK2_DPT and QK8_0 must be the same");

    const block_q2_dpt * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

    const int nb = n / QK2_DPT;

    // `levels` is the per-tensor codebook and is required here, so it must NOT be
    // marked GGML_UNUSED (the previous marker contradicted the actual use below).
    const int8_t * values = (const int8_t *)levels;
    GGML_ASSERT(values != NULL && "Q2_DPT levels not set for tensor");

    float sumf = 0;
    for (int ib = 0; ib < nb; ++ib) {
        // combined fp16 scale of the quantized block pair
        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d) * GGML_CPU_FP16_TO_FP32(x[ib].d);
        int32_t blk = 0;
        for (int j = 0; j < QK2_DPT/4; ++j) {
            // each packed byte holds 4 consecutive 2-bit indices, LSB first
            uint8_t q = x[ib].qs[j];
            blk += (int32_t)y[ib].qs[j*4 + 0] * (int32_t)values[(q >> 0) & 3];
            blk += (int32_t)y[ib].qs[j*4 + 1] * (int32_t)values[(q >> 2) & 3];
            blk += (int32_t)y[ib].qs[j*4 + 2] * (int32_t)values[(q >> 4) & 3];
            blk += (int32_t)y[ib].qs[j*4 + 3] * (int32_t)values[(q >> 6) & 3];
        }
        sumf += d * (float)blk;
    }
    *s = sumf;
}
|
||||
|
||||
void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
assert(n % QK_K == 0);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
|
|
@ -1194,7 +1639,8 @@ void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
UNUSED(bx);
|
||||
|
|
@ -1223,7 +1669,8 @@ void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
|||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
UNUSED(bx);
|
||||
|
|
|
|||
|
|
@ -37,66 +37,79 @@ void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
|
|||
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
// Dot product
|
||||
void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
|
||||
void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
|
||||
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
|
||||
void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
|
||||
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_q3_pt_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_q3_kpt_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_q3_kpt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_q4_dpt_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_q4_dpt_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_q2_dpt_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_q2_dpt_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_q2_kpt_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_q2_kpt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_iq2_tq_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_iq3_tq_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_iq1_bn_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
|
||||
// Generic implementation
|
||||
void quantize_row_q8_0_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||
void quantize_row_q8_1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||
void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
|
||||
void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_nvfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_nvfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
|
||||
void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
|
||||
void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_q3_pt_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc, const void * levels);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
|||
|
|
@ -8,7 +8,8 @@ ggml_fp16_t ggml_table_gelu_f16[1 << 16];
|
|||
// precomputed quick gelu table for f16 (128 KB)
|
||||
ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
|
||||
|
||||
void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc) {
|
||||
void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
assert(nrc == 1);
|
||||
GGML_UNUSED(nrc);
|
||||
GGML_UNUSED(bx);
|
||||
|
|
@ -136,7 +137,8 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
|
|||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc) {
|
||||
void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
assert(nrc == 1);
|
||||
GGML_UNUSED(nrc);
|
||||
GGML_UNUSED(bx);
|
||||
|
|
@ -261,7 +263,8 @@ void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t *
|
|||
*s = sumf;
|
||||
}
|
||||
|
||||
void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc) {
|
||||
void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
assert(nrc == 1);
|
||||
GGML_UNUSED(nrc);
|
||||
GGML_UNUSED(bx);
|
||||
|
|
|
|||
|
|
@ -39,9 +39,9 @@ extern ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
|
|||
// fundamental operations
|
||||
//
|
||||
|
||||
void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
|
||||
void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
|
||||
void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
|
||||
void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc, const void * levels);
|
||||
void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc, const void * levels);
|
||||
|
||||
void ggml_vec_silu_f32(const int n, float * y, const float * x);
|
||||
ggml_float ggml_vec_cvar_f32(const int n, float * y, const float * x, const float mean); //it will also center y ( y = y - mean )
|
||||
|
|
@ -873,7 +873,7 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
|
|||
}
|
||||
}
|
||||
|
||||
inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); }
|
||||
// Euclidean (L2) norm: s = sqrt(dot(x, x)). Passes NULL for `levels` since f32 has no quant table.
inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1, NULL); *s = sqrtf(*s); }
|
||||
inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
|
||||
inline static void ggml_vec_sqr_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
|
||||
for (int i = 0; i < n; ++i) {
|
||||
|
|
|
|||
|
|
@ -1057,6 +1057,27 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ4_NL> {
|
|||
static constexpr int qi = QI4_NL;
|
||||
};
|
||||
|
||||
// Q4_DPT reuses the IQ4_NL block geometry (same qk/qr/qi); only the level table differs.
template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q4_DPT> {
    static constexpr int qk = QK4_NL;
    static constexpr int qr = QR4_NL;
    static constexpr int qi = QI4_NL;
};
|
||||
|
||||
// Per-tensor lookup table for Q4_DPT (device global memory).
|
||||
// Each TU gets its own copy; initialized via cudaGetSymbolAddress + cudaMemcpyAsync before use.
|
||||
__device__ int8_t q4dpt_levels_cuda[16];
|
||||
|
||||
// Per-tensor lookup table for Q2_DPT (4 int8 levels).
|
||||
__device__ int8_t q2dpt_levels_cuda[4];
|
||||
|
||||
// CUDA block-geometry traits for Q2_DPT (2-bit indices, per-tensor int8 levels).
template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q2_DPT> {
    static constexpr int qk = QK2_DPT;
    static constexpr int qr = 4; // 4 elements per "quantum" (2-bit)
    static constexpr int qi = 1; // 1 uint32 per block
};
|
||||
|
||||
template<>
|
||||
struct ggml_cuda_type_traits<GGML_TYPE_IQ4_XS> {
|
||||
static constexpr int qk = QK_K;
|
||||
|
|
@ -1064,6 +1085,38 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ4_XS> {
|
|||
static constexpr int qi = QI4_XS;
|
||||
};
|
||||
|
||||
// Per-tensor grid for IQ2_TQ (16 × 4 int8 = 64 bytes).
|
||||
__device__ int8_t iq2tq_grid_cuda[64];
|
||||
|
||||
// CUDA block-geometry traits for IQ2_TQ (super-blocks of QK_K values, per-tensor grid).
template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ2_TQ> {
    static constexpr int qk = QK_K;
    static constexpr int qr = 4;
    static constexpr int qi = QK_K / (4*4); // 16
};
|
||||
|
||||
// Per-tensor grid for IQ3_TQ (16 × 8 int8 = 128 bytes).
|
||||
__device__ int8_t iq3tq_grid_cuda[128];
|
||||
|
||||
|
||||
// Per-tensor codebook for IQ1_BN (4096 × 8 int8 = 32768 bytes).
|
||||
__device__ int8_t iq1bn_codebook_cuda[IQ1BN_CODEBOOK_SIZE];
|
||||
|
||||
// CUDA block-geometry traits for IQ3_TQ (super-blocks of QK_K values, per-tensor grid).
template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ3_TQ> {
    static constexpr int qk = QK_K;
    static constexpr int qr = 4;
    static constexpr int qi = QK_K / (4*4); // 16
};
|
||||
|
||||
|
||||
// CUDA block-geometry traits for IQ1_BN (super-blocks of QK_K values, per-tensor codebook).
template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ1_BN> {
    static constexpr int qk = QK_K;
    static constexpr int qr = 4;
    static constexpr int qi = QK_K / (4*4); // 16
};
|
||||
|
||||
template<>
|
||||
struct ggml_cuda_type_traits<GGML_TYPE_IQ3_S> {
|
||||
static constexpr int qk = QK_K;
|
||||
|
|
|
|||
|
|
@ -593,12 +593,187 @@ static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int64_t
|
|||
dequantize_block_iq1_s<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
// Upload the 16-entry Q4_DPT level table into the device-global symbol q4dpt_levels_cuda
// (asynchronous on `stream`; must complete before any kernel reads the symbol).
// NOTE(review): copy kind here is cudaMemcpyDeviceToDevice, while the IQ2_TQ/IQ3_TQ/IQ1_BN
// setters use cudaMemcpyHostToDevice — confirm `levels` really is a device pointer here.
void ggml_cuda_set_q4dpt_levels(const int8_t * levels, cudaStream_t stream) {
    int8_t * d_q4dpt_levels;
    CUDA_CHECK(cudaGetSymbolAddress((void **)&d_q4dpt_levels, q4dpt_levels_cuda));
    CUDA_CHECK(cudaMemcpyAsync(d_q4dpt_levels, levels, 16, cudaMemcpyDeviceToDevice, stream));
}
|
||||
|
||||
// Upload the 4-entry Q2_DPT level table into the device-global symbol q2dpt_levels_cuda
// (asynchronous on `stream`; must complete before any kernel reads the symbol).
// NOTE(review): copy kind here is cudaMemcpyDeviceToDevice, while the IQ2_TQ/IQ3_TQ/IQ1_BN
// setters use cudaMemcpyHostToDevice — confirm `levels` really is a device pointer here.
void ggml_cuda_set_q2dpt_levels(const int8_t * levels, cudaStream_t stream) {
    int8_t * d_q2dpt_levels;
    CUDA_CHECK(cudaGetSymbolAddress((void **)&d_q2dpt_levels, q2dpt_levels_cuda));
    CUDA_CHECK(cudaMemcpyAsync(d_q2dpt_levels, levels, 4, cudaMemcpyDeviceToDevice, stream));
}
|
||||
|
||||
// Upload the per-tensor IQ2_TQ grid (64 bytes) from host memory into the
// device-global symbol iq2tq_grid_cuda (asynchronous on `stream`).
void ggml_cuda_set_iq2tq_grid(const void * grid, cudaStream_t stream) {
    int8_t * d_grid;
    CUDA_CHECK(cudaGetSymbolAddress((void **)&d_grid, iq2tq_grid_cuda));
    CUDA_CHECK(cudaMemcpyAsync(d_grid, grid, 64, cudaMemcpyHostToDevice, stream));
}
|
||||
|
||||
void ggml_cuda_set_iq3tq_grid(const void * grid, cudaStream_t stream) {
|
||||
int8_t * d_grid;
|
||||
CUDA_CHECK(cudaGetSymbolAddress((void **)&d_grid, iq3tq_grid_cuda));
|
||||
CUDA_CHECK(cudaMemcpyAsync(d_grid, grid, 128, cudaMemcpyHostToDevice, stream));
|
||||
}
|
||||
|
||||
|
||||
void ggml_cuda_set_iq1bn_aux(const void * aux, cudaStream_t stream) {
|
||||
int8_t * d_cb;
|
||||
CUDA_CHECK(cudaGetSymbolAddress((void **)&d_cb, iq1bn_codebook_cuda));
|
||||
CUDA_CHECK(cudaMemcpyAsync(d_cb, aux, IQ1BN_CODEBOOK_SIZE, cudaMemcpyHostToDevice, stream));
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = (k + QK_K - 1) / QK_K;
|
||||
dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_q4_dpt(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
const int64_t i = blockIdx.x;
|
||||
const block_q4_dpt * x = (const block_q4_dpt *) vx + i*(QK_K/QK4_NL);
|
||||
|
||||
const int64_t tid = threadIdx.x;
|
||||
const int64_t il = tid/8; // 0...3
|
||||
const int64_t ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 4*il;
|
||||
const uint8_t * q4 = x[ib].qs + 4*il;
|
||||
const float d = (float)x[ib].d;
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
y[j+ 0] = d * q4dpt_levels_cuda[q4[j] & 0xf];
|
||||
y[j+16] = d * q4dpt_levels_cuda[q4[j] >> 4];
|
||||
}
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_q4_dpt_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = (k + QK_K - 1) / QK_K;
|
||||
dequantize_block_q4_dpt<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_q2_dpt(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
const int64_t i = blockIdx.x;
|
||||
const block_q2_dpt * x = (const block_q2_dpt *) vx + i*(QK_K/QK2_DPT);
|
||||
|
||||
const int64_t tid = threadIdx.x;
|
||||
const int64_t il = tid/8; // 0...3
|
||||
const int64_t ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 4*il;
|
||||
const uint8_t * q2 = x[ib].qs + il;
|
||||
const float d = (float)x[ib].d;
|
||||
uint8_t q = q2[0];
|
||||
y[ 0] = d * q2dpt_levels_cuda[(q >> 0) & 3];
|
||||
y[ 1] = d * q2dpt_levels_cuda[(q >> 2) & 3];
|
||||
y[ 2] = d * q2dpt_levels_cuda[(q >> 4) & 3];
|
||||
y[ 3] = d * q2dpt_levels_cuda[(q >> 6) & 3];
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_q2_dpt_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = (k + QK_K - 1) / QK_K;
|
||||
dequantize_block_q2_dpt<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq2_tq(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
const int64_t i = blockIdx.x;
|
||||
const block_iq2_tq * bq = (const block_iq2_tq *) vx + i;
|
||||
const int g = threadIdx.x; // group index 0..31
|
||||
|
||||
const float dq = __half2float(bq->d) * IQ2TQ_GRID_SCALE;
|
||||
|
||||
const int si = (bq->scales[g / 2] >> (4 * (g & 1))) & 0xF;
|
||||
const int8_t * ge = iq2tq_grid_cuda + si * 4;
|
||||
|
||||
dst_t * y = yy + i * QK_K + g * 8;
|
||||
const uint8_t * qs = bq->qs + g * 2;
|
||||
|
||||
y[0] = dq * ge[(qs[0] >> 0) & 3];
|
||||
y[1] = dq * ge[(qs[0] >> 2) & 3];
|
||||
y[2] = dq * ge[(qs[0] >> 4) & 3];
|
||||
y[3] = dq * ge[(qs[0] >> 6) & 3];
|
||||
y[4] = dq * ge[(qs[1] >> 0) & 3];
|
||||
y[5] = dq * ge[(qs[1] >> 2) & 3];
|
||||
y[6] = dq * ge[(qs[1] >> 4) & 3];
|
||||
y[7] = dq * ge[(qs[1] >> 6) & 3];
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_iq2_tq_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
dequantize_block_iq2_tq<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq3_tq(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
const int64_t i = blockIdx.x;
|
||||
const block_iq3_tq * bq = (const block_iq3_tq *) vx + i;
|
||||
const int g = threadIdx.x; // group index 0..31
|
||||
|
||||
const float dq = __half2float(bq->d) * IQ3TQ_GRID_SCALE;
|
||||
|
||||
const int si = (bq->scales[g / 2] >> (4 * (g & 1))) & 0xF;
|
||||
const int8_t * ge = iq3tq_grid_cuda + si * 8;
|
||||
|
||||
dst_t * y = yy + i * QK_K + g * 8;
|
||||
const uint8_t * qs = bq->qs + g * 3;
|
||||
const uint32_t bits = qs[0] | ((uint32_t)qs[1] << 8) | ((uint32_t)qs[2] << 16);
|
||||
|
||||
y[0] = dq * ge[(bits >> 0) & 7];
|
||||
y[1] = dq * ge[(bits >> 3) & 7];
|
||||
y[2] = dq * ge[(bits >> 6) & 7];
|
||||
y[3] = dq * ge[(bits >> 9) & 7];
|
||||
y[4] = dq * ge[(bits >> 12) & 7];
|
||||
y[5] = dq * ge[(bits >> 15) & 7];
|
||||
y[6] = dq * ge[(bits >> 18) & 7];
|
||||
y[7] = dq * ge[(bits >> 21) & 7];
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_iq3_tq_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
dequantize_block_iq3_tq<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq1_bn(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
const int64_t i = blockIdx.x;
|
||||
const block_iq1_bn * bq = (const block_iq1_bn *) vx + i;
|
||||
const int g = threadIdx.x; // group index 0..31
|
||||
|
||||
const float dq = __half2float(bq->d) * IQ1BN_GRID_SCALE;
|
||||
|
||||
// Extract 12-bit codebook index
|
||||
const int pair = g / 2;
|
||||
int ci;
|
||||
if (g & 1) {
|
||||
ci = (bq->qs[3*pair+1] >> 4) | ((int)bq->qs[3*pair+2] << 4);
|
||||
} else {
|
||||
ci = bq->qs[3*pair] | (((int)bq->qs[3*pair+1] & 0x0F) << 8);
|
||||
}
|
||||
const int8_t * cb = iq1bn_codebook_cuda + ci * IQ1BN_CODEBOOK_DIM;
|
||||
|
||||
dst_t * y = yy + i * QK_K + g * IQ1BN_GROUP_SIZE;
|
||||
y[0] = dq * cb[0];
|
||||
y[1] = dq * cb[1];
|
||||
y[2] = dq * cb[2];
|
||||
y[3] = dq * cb[3];
|
||||
y[4] = dq * cb[4];
|
||||
y[5] = dq * cb[5];
|
||||
y[6] = dq * cb[6];
|
||||
y[7] = dq * cb[7];
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_iq1_bn_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
dequantize_block_iq1_bn<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
|
|
@ -748,6 +923,16 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
|||
return dequantize_row_iq1_m_cuda;
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
return dequantize_row_iq4_nl_cuda;
|
||||
case GGML_TYPE_Q4_DPT:
|
||||
return dequantize_row_q4_dpt_cuda;
|
||||
case GGML_TYPE_Q2_DPT:
|
||||
return dequantize_row_q2_dpt_cuda;
|
||||
case GGML_TYPE_IQ2_TQ:
|
||||
return dequantize_row_iq2_tq_cuda;
|
||||
case GGML_TYPE_IQ3_TQ:
|
||||
return dequantize_row_iq3_tq_cuda;
|
||||
case GGML_TYPE_IQ1_BN:
|
||||
return dequantize_row_iq1_bn_cuda;
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
return dequantize_row_iq4_xs_cuda;
|
||||
case GGML_TYPE_IQ3_S:
|
||||
|
|
@ -801,6 +986,16 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
|||
return dequantize_row_iq1_m_cuda;
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
return dequantize_row_iq4_nl_cuda;
|
||||
case GGML_TYPE_Q4_DPT:
|
||||
return dequantize_row_q4_dpt_cuda;
|
||||
case GGML_TYPE_Q2_DPT:
|
||||
return dequantize_row_q2_dpt_cuda;
|
||||
case GGML_TYPE_IQ2_TQ:
|
||||
return dequantize_row_iq2_tq_cuda;
|
||||
case GGML_TYPE_IQ3_TQ:
|
||||
return dequantize_row_iq3_tq_cuda;
|
||||
case GGML_TYPE_IQ1_BN:
|
||||
return dequantize_row_iq1_bn_cuda;
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
return dequantize_row_iq4_xs_cuda;
|
||||
case GGML_TYPE_IQ3_S:
|
||||
|
|
|
|||
|
|
@ -31,6 +31,22 @@ to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type);
|
|||
to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type);
|
||||
to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type);
|
||||
|
||||
// Set the Q4_DPT lookup table in device constant memory.
|
||||
void ggml_cuda_set_q4dpt_levels(const int8_t * levels, cudaStream_t stream);
|
||||
|
||||
// Set the Q2_DPT lookup table in device constant memory.
|
||||
void ggml_cuda_set_q2dpt_levels(const int8_t * levels, cudaStream_t stream);
|
||||
|
||||
// Set the IQ2_TQ per-tensor grid (64 bytes: 16 entries × 4 int8 levels).
|
||||
void ggml_cuda_set_iq2tq_grid(const void * grid, cudaStream_t stream);
|
||||
|
||||
// Set the IQ3_TQ per-tensor grid (128 bytes: 16 entries × 8 int8 levels).
|
||||
void ggml_cuda_set_iq3tq_grid(const void * grid, cudaStream_t stream);
|
||||
|
||||
|
||||
// Set the IQ1_BN per-tensor codebook+scale (2064 bytes).
|
||||
void ggml_cuda_set_iq1bn_aux(const void * aux, cudaStream_t stream);
|
||||
|
||||
template<typename dst_t, typename src_t>
|
||||
__host__ __device__ inline dst_t ggml_cuda_cast(src_t x) {
|
||||
if constexpr (std::is_same_v<dst_t, src_t>) {
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@
|
|||
#include "ggml-backend-impl.h"
|
||||
|
||||
#include "ggml-cuda/common.cuh"
|
||||
#include "ggml-quants.h"
|
||||
#include "ggml-cuda/acc.cuh"
|
||||
#include "ggml-cuda/add-id.cuh"
|
||||
#include "ggml-cuda/arange.cuh"
|
||||
|
|
@ -1426,6 +1427,24 @@ static void ggml_cuda_op_mul_mat_cublas(
|
|||
row_diff == src0->ne[1] &&
|
||||
dst->op_params[0] == GGML_PREC_DEFAULT;
|
||||
|
||||
// Upload per-tensor grids/levels before any dequantize path (fp16, fp32, or bf16)
|
||||
if (src0->type == GGML_TYPE_Q4_DPT) {
|
||||
GGML_ASSERT(src0->quant_levels && "Q4_DPT MUL_MAT requires levels (set tensor->quant_levels)");
|
||||
ggml_cuda_set_q4dpt_levels((const int8_t *)src0->quant_levels, stream);
|
||||
}
|
||||
if (src0->type == GGML_TYPE_IQ2_TQ) {
|
||||
GGML_ASSERT(src0->quant_levels && "IQ2_TQ MUL_MAT requires grid (set tensor->quant_levels)");
|
||||
ggml_cuda_set_iq2tq_grid(src0->quant_levels, stream);
|
||||
}
|
||||
if (src0->type == GGML_TYPE_IQ3_TQ) {
|
||||
GGML_ASSERT(src0->quant_levels && "IQ3_TQ MUL_MAT requires grid (set tensor->quant_levels)");
|
||||
ggml_cuda_set_iq3tq_grid(src0->quant_levels, stream);
|
||||
}
|
||||
if (src0->type == GGML_TYPE_IQ1_BN) {
|
||||
GGML_ASSERT(src0->quant_levels && "IQ1_BN MUL_MAT requires codebook (set tensor->quant_levels)");
|
||||
ggml_cuda_set_iq1bn_aux(src0->quant_levels, stream);
|
||||
}
|
||||
|
||||
if (supports_bf16 && src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
|
||||
ggml_cuda_pool_alloc<nv_bfloat16> src1_as_bf16(ctx.pool(id));
|
||||
if (src1->type != GGML_TYPE_BF16) {
|
||||
|
|
@ -4804,6 +4823,10 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
|||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_Q4_DPT:
|
||||
case GGML_TYPE_IQ2_TQ:
|
||||
case GGML_TYPE_IQ3_TQ:
|
||||
case GGML_TYPE_IQ1_BN:
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_BF16:
|
||||
return true;
|
||||
|
|
@ -4838,7 +4861,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
|||
{
|
||||
return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16 ||
|
||||
op->type == GGML_TYPE_Q4_0 || op->type == GGML_TYPE_Q4_1 || op->type == GGML_TYPE_Q5_0 ||
|
||||
op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_IQ4_NL) &&
|
||||
op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_IQ4_NL ||
|
||||
op->type == GGML_TYPE_Q4_DPT) &&
|
||||
op->src[0]->type == GGML_TYPE_F32 &&
|
||||
(op->src[1]->type == GGML_TYPE_I64 || op->src[1]->type == GGML_TYPE_I32);
|
||||
} break;
|
||||
|
|
@ -4891,6 +4915,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
|||
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_IQ4_NL) {
|
||||
return true;
|
||||
}
|
||||
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_DPT) {
|
||||
return true;
|
||||
}
|
||||
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_I32) {
|
||||
return true;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,6 +2,8 @@
|
|||
#include "mmq.cuh"
|
||||
#include "quantize.cuh"
|
||||
#include "mmid.cuh"
|
||||
#include "convert.cuh"
|
||||
#include "ggml-quants.h"
|
||||
|
||||
static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
|
||||
switch (args.type_x) {
|
||||
|
|
@ -65,6 +67,12 @@ static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, con
|
|||
case GGML_TYPE_IQ4_NL:
|
||||
mul_mat_q_case<GGML_TYPE_IQ4_NL>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q4_DPT:
|
||||
mul_mat_q_case<GGML_TYPE_Q4_DPT>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q2_DPT:
|
||||
mul_mat_q_case<GGML_TYPE_Q2_DPT>(ctx, args, stream);
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
break;
|
||||
|
|
@ -82,6 +90,22 @@ void ggml_cuda_mul_mat_q(
|
|||
cudaStream_t stream = ctx.stream();
|
||||
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
|
||||
// Set Q4_DPT lookup table from tensor's quant_levels
|
||||
if (src0->type == GGML_TYPE_Q4_DPT) {
|
||||
GGML_ASSERT(src0->quant_levels && "Q4_DPT MUL_MAT requires levels (set tensor->quant_levels)");
|
||||
int8_t * d_q4dpt_levels;
|
||||
CUDA_CHECK(cudaGetSymbolAddress((void **)&d_q4dpt_levels, q4dpt_levels_cuda));
|
||||
CUDA_CHECK(cudaMemcpyAsync(d_q4dpt_levels, src0->quant_levels, Q4DPT_N_LEVELS * sizeof(int8_t), cudaMemcpyHostToDevice, stream));
|
||||
}
|
||||
|
||||
// Set Q2_DPT lookup table from tensor's quant_levels
|
||||
if (src0->type == GGML_TYPE_Q2_DPT) {
|
||||
GGML_ASSERT(src0->quant_levels && "Q2_DPT MUL_MAT requires levels (set tensor->quant_levels)");
|
||||
int8_t * d_q2dpt_levels;
|
||||
CUDA_CHECK(cudaGetSymbolAddress((void **)&d_q2dpt_levels, q2dpt_levels_cuda));
|
||||
CUDA_CHECK(cudaMemcpyAsync(d_q2dpt_levels, src0->quant_levels, Q2DPT_N_LEVELS * sizeof(int8_t), cudaMemcpyHostToDevice, stream));
|
||||
}
|
||||
|
||||
const size_t ts_src0 = ggml_type_size(src0->type);
|
||||
const size_t ts_src1 = ggml_type_size(src1->type);
|
||||
const size_t ts_dst = ggml_type_size(dst->type);
|
||||
|
|
@ -290,6 +314,8 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
|
|||
case GGML_TYPE_IQ1_S:
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_Q4_DPT:
|
||||
case GGML_TYPE_Q2_DPT:
|
||||
mmq_supported = true;
|
||||
break;
|
||||
default:
|
||||
|
|
@ -367,3 +393,9 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
|
|||
|
||||
return (!GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
|
||||
}
|
||||
|
||||
// Q4_DPT must be instantiated in this TU (not a separate template-instance file)
|
||||
// because it accesses the TU-local __device__ variable q4dpt_levels_cuda,
|
||||
// which is initialized by the code above.
|
||||
DECL_MMQ_CASE(GGML_TYPE_Q4_DPT);
|
||||
DECL_MMQ_CASE(GGML_TYPE_Q2_DPT);
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
#pragma once
|
||||
|
||||
#include "common.cuh"
|
||||
#include "ggml.h"
|
||||
#include "vecdotq.cuh"
|
||||
#include "mma.cuh"
|
||||
|
||||
|
|
@ -88,6 +89,8 @@ static mmq_q8_1_ds_layout mmq_get_q8_1_ds_layout(const ggml_type type_x) {
|
|||
return MMQ_Q8_1_DS_LAYOUT_DS4;
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_Q4_DPT:
|
||||
case GGML_TYPE_Q2_DPT:
|
||||
return MMQ_Q8_1_DS_LAYOUT_D4;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
|
|
@ -205,6 +208,8 @@ static constexpr __host__ __device__ tile_x_sizes mmq_get_dp4a_tile_x_sizes(ggml
|
|||
case GGML_TYPE_IQ1_S: return MMQ_DP4A_TXS_Q8_0;
|
||||
case GGML_TYPE_IQ4_XS: return MMQ_DP4A_TXS_Q8_0;
|
||||
case GGML_TYPE_IQ4_NL: return MMQ_DP4A_TXS_Q8_0;
|
||||
case GGML_TYPE_Q4_DPT: return MMQ_DP4A_TXS_Q8_0;
|
||||
case GGML_TYPE_Q2_DPT: return MMQ_DP4A_TXS_Q8_0_16;
|
||||
default: return tile_x_sizes{0, 0, 0};
|
||||
}
|
||||
}
|
||||
|
|
@ -250,6 +255,8 @@ static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) {
|
|||
case GGML_TYPE_IQ1_S: return MMQ_MMA_TILE_X_K_Q8_0;
|
||||
case GGML_TYPE_IQ4_XS: return MMQ_MMA_TILE_X_K_Q8_0;
|
||||
case GGML_TYPE_IQ4_NL: return MMQ_MMA_TILE_X_K_Q8_0;
|
||||
case GGML_TYPE_Q4_DPT: return MMQ_MMA_TILE_X_K_Q8_0;
|
||||
case GGML_TYPE_Q2_DPT: return MMQ_MMA_TILE_X_K_Q8_0;
|
||||
default: return 0;
|
||||
}
|
||||
}
|
||||
|
|
@ -2763,6 +2770,71 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
|
|||
}
|
||||
}
|
||||
|
||||
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_q4_dpt(
|
||||
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + MMQ_TILE_NE_K*2);
|
||||
#else
|
||||
constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_DPT, mmq_y);
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + txs.qs);
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
|
||||
constexpr int threads_per_row = MMQ_ITER_K / (4 * QR4_NL);
|
||||
constexpr int nrows = warp_size / threads_per_row;
|
||||
const int txi = warp_size > threads_per_row ? threadIdx.x % threads_per_row : threadIdx.x;
|
||||
const int kbx = txi / QI4_NL;
|
||||
const int kqsx = txi % QI4_NL;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += nrows*nwarps) {
|
||||
int i = i0 + (nrows == 1 ? threadIdx.y : threadIdx.y*nrows + threadIdx.x/threads_per_row);
|
||||
|
||||
if (need_check) {
|
||||
i = min(i, i_max);
|
||||
}
|
||||
|
||||
const block_q4_dpt * bxi = (const block_q4_dpt *) x + kbx0 + i*stride + kbx;
|
||||
|
||||
const int aux_q4 = get_int_b2(bxi->qs, kqsx);
|
||||
const int2 v = get_int_from_table_16(aux_q4, q4dpt_levels_cuda);
|
||||
const int k0 = kbx * (2 * QI4_NL) + kqsx;
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + k0 + 0] = v.x;
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + k0 + QI4_NL] = v.y;
|
||||
#else
|
||||
x_qs[i*(2*MMQ_TILE_NE_K + 1) + k0 + 0] = v.x;
|
||||
x_qs[i*(2*MMQ_TILE_NE_K + 1) + k0 + QI4_NL] = v.y;
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
}
|
||||
|
||||
constexpr int blocks_per_tile_x_row = MMQ_TILE_NE_K / QI4_NL;
|
||||
constexpr int rows_per_warp = warp_size / blocks_per_tile_x_row;
|
||||
const int kbxd = threadIdx.x % blocks_per_tile_x_row;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * rows_per_warp) {
|
||||
int i = i0 + threadIdx.y * rows_per_warp + threadIdx.x / blocks_per_tile_x_row;
|
||||
|
||||
if (need_check) {
|
||||
i = min(i, i_max);
|
||||
}
|
||||
|
||||
const block_q4_dpt * bxi = (const block_q4_dpt *) x + kbx0 + i*stride + kbxd;
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kbxd] = __half2float(bxi->d);
|
||||
#else
|
||||
x_df[i*(MMQ_TILE_NE_K/QI4_NL) + i/QI4_NL + kbxd] = __half2float(bxi->d);
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
}
|
||||
}
|
||||
|
||||
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_iq2_xxs(
|
||||
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
|
|
@ -3447,6 +3519,22 @@ struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_IQ4_NL> {
|
|||
static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
|
||||
};
|
||||
|
||||
template <int mmq_x, int mmq_y, bool need_check>
|
||||
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_Q4_DPT> {
|
||||
static constexpr int vdr = VDR_Q4_DPT_Q8_1_MMQ;
|
||||
static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_dpt<mmq_y, need_check>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_D4>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
|
||||
};
|
||||
|
||||
template <int mmq_x, int mmq_y, bool need_check>
|
||||
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_Q2_DPT> {
|
||||
static constexpr int vdr = VDR_Q2_DPT_Q8_1_MMQ;
|
||||
static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_dpt<mmq_y, need_check>; // Reuse Q4_DPT loader (same layout)
|
||||
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_D4>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
|
||||
};
|
||||
|
||||
template <int mmq_x, int mmq_y, bool need_check>
|
||||
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_IQ4_XS> {
|
||||
static constexpr int vdr = VDR_IQ4_XS_Q8_1_MMQ;
|
||||
|
|
|
|||
|
|
@ -2,6 +2,8 @@
|
|||
#include "quantize.cuh"
|
||||
#include "unary.cuh"
|
||||
#include "vecdotq.cuh"
|
||||
#include "convert.cuh"
|
||||
#include "ggml-quants.h"
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
|
|
@ -28,6 +30,11 @@ static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type)
|
|||
case GGML_TYPE_IQ1_S: return vec_dot_iq1_s_q8_1;
|
||||
case GGML_TYPE_IQ1_M: return vec_dot_iq1_m_q8_1;
|
||||
case GGML_TYPE_IQ4_NL: return vec_dot_iq4_nl_q8_1;
|
||||
case GGML_TYPE_Q4_DPT: return vec_dot_q4_dpt_q8_1;
|
||||
case GGML_TYPE_Q2_DPT: return vec_dot_q2_dpt_q8_1;
|
||||
case GGML_TYPE_IQ2_TQ: return vec_dot_iq2_tq_q8_1;
|
||||
case GGML_TYPE_IQ3_TQ: return vec_dot_iq3_tq_q8_1;
|
||||
case GGML_TYPE_IQ1_BN: return vec_dot_iq1_bn_q8_1;
|
||||
case GGML_TYPE_IQ4_XS: return vec_dot_iq4_xs_q8_1;
|
||||
case GGML_TYPE_IQ3_S: return vec_dot_iq3_s_q8_1;
|
||||
default: return nullptr;
|
||||
|
|
@ -54,6 +61,11 @@ static constexpr __host__ __device__ int get_vdr_mmvq(ggml_type type) {
|
|||
case GGML_TYPE_IQ3_XXS: return VDR_IQ3_XXS_Q8_1_MMVQ;
|
||||
case GGML_TYPE_IQ3_S: return VDR_IQ3_S_Q8_1_MMVQ;
|
||||
case GGML_TYPE_IQ4_NL: return VDR_IQ4_NL_Q8_1_MMVQ;
|
||||
case GGML_TYPE_Q4_DPT: return VDR_Q4_DPT_Q8_1_MMVQ;
|
||||
case GGML_TYPE_Q2_DPT: return VDR_Q2_DPT_Q8_1_MMVQ;
|
||||
case GGML_TYPE_IQ2_TQ: return VDR_IQ2_TQ_Q8_1_MMVQ;
|
||||
case GGML_TYPE_IQ3_TQ: return VDR_IQ3_TQ_Q8_1_MMVQ;
|
||||
case GGML_TYPE_IQ1_BN: return VDR_IQ1_BN_Q8_1_MMVQ;
|
||||
case GGML_TYPE_IQ4_XS: return VDR_IQ4_XS_Q8_1_MMVQ;
|
||||
default: return 1;
|
||||
}
|
||||
|
|
@ -1000,6 +1012,30 @@ static void mul_mat_vec_q_switch_type(
|
|||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q4_DPT:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q4_DPT>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ2_TQ:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ2_TQ>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ3_TQ:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ3_TQ>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ1_BN:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ1_BN>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ4_XS>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
|
|
@ -1029,6 +1065,45 @@ void ggml_cuda_mul_mat_vec_q(
|
|||
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
// Set Q4_DPT lookup table from tensor's quant_levels
|
||||
if (src0->type == GGML_TYPE_Q4_DPT) {
|
||||
GGML_ASSERT(src0->quant_levels && "Q4_DPT MUL_MAT requires levels (set tensor->quant_levels)");
|
||||
int8_t * d_q4dpt_levels;
|
||||
CUDA_CHECK(cudaGetSymbolAddress((void **)&d_q4dpt_levels, q4dpt_levels_cuda));
|
||||
CUDA_CHECK(cudaMemcpyAsync(d_q4dpt_levels, src0->quant_levels, Q4DPT_N_LEVELS * sizeof(int8_t), cudaMemcpyHostToDevice, stream));
|
||||
}
|
||||
|
||||
// Set Q2_DPT lookup table from tensor's quant_levels
|
||||
if (src0->type == GGML_TYPE_Q2_DPT) {
|
||||
GGML_ASSERT(src0->quant_levels && "Q2_DPT MUL_MAT requires levels (set tensor->quant_levels)");
|
||||
int8_t * d_q2dpt_levels;
|
||||
CUDA_CHECK(cudaGetSymbolAddress((void **)&d_q2dpt_levels, q2dpt_levels_cuda));
|
||||
CUDA_CHECK(cudaMemcpyAsync(d_q2dpt_levels, src0->quant_levels, Q2DPT_N_LEVELS * sizeof(int8_t), cudaMemcpyHostToDevice, stream));
|
||||
}
|
||||
|
||||
// Set IQ2_TQ per-tensor grid
|
||||
if (src0->type == GGML_TYPE_IQ2_TQ) {
|
||||
GGML_ASSERT(src0->quant_levels && "IQ2_TQ MUL_MAT requires grid (set tensor->quant_levels)");
|
||||
int8_t * d_grid;
|
||||
CUDA_CHECK(cudaGetSymbolAddress((void **)&d_grid, iq2tq_grid_cuda));
|
||||
CUDA_CHECK(cudaMemcpyAsync(d_grid, src0->quant_levels, 64, cudaMemcpyHostToDevice, stream));
|
||||
}
|
||||
|
||||
// Set IQ3_TQ per-tensor grid
|
||||
if (src0->type == GGML_TYPE_IQ3_TQ) {
|
||||
GGML_ASSERT(src0->quant_levels && "IQ3_TQ MUL_MAT requires grid (set tensor->quant_levels)");
|
||||
int8_t * d_grid;
|
||||
CUDA_CHECK(cudaGetSymbolAddress((void **)&d_grid, iq3tq_grid_cuda));
|
||||
CUDA_CHECK(cudaMemcpyAsync(d_grid, src0->quant_levels, 128, cudaMemcpyHostToDevice, stream));
|
||||
}
|
||||
|
||||
|
||||
// Set IQ1_BN per-tensor codebook+scale
|
||||
if (src0->type == GGML_TYPE_IQ1_BN) {
|
||||
GGML_ASSERT(src0->quant_levels && "IQ1_BN MUL_MAT requires codebook (set tensor->quant_levels)");
|
||||
ggml_cuda_set_iq1bn_aux(src0->quant_levels, stream);
|
||||
}
|
||||
|
||||
const size_t ts_src0 = ggml_type_size(src0->type);
|
||||
const size_t ts_src1 = ggml_type_size(src1->type);
|
||||
const size_t ts_dst = ggml_type_size(dst->type);
|
||||
|
|
|
|||
|
|
@ -1240,6 +1240,194 @@ static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
|
|||
return d * sumi;
|
||||
}
|
||||
|
||||
#define VDR_Q4_DPT_Q8_1_MMVQ 2
|
||||
#define VDR_Q4_DPT_Q8_1_MMQ 4
|
||||
|
||||
static __device__ __forceinline__ float vec_dot_q4_dpt_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
|
||||
|
||||
const block_q4_dpt * bq4 = (const block_q4_dpt *) vbq + kbx;
|
||||
|
||||
const int * q8 = (const int *) bq8_1->qs + iqs;
|
||||
|
||||
int sumi = 0;
|
||||
#pragma unroll
|
||||
for (int l = 0; l < VDR_Q4_DPT_Q8_1_MMVQ; ++l) {
|
||||
const int aux_q4 = get_int_b2(bq4->qs, iqs + l);
|
||||
const int2 v = get_int_from_table_16(aux_q4, q4dpt_levels_cuda);
|
||||
|
||||
sumi = ggml_cuda_dp4a(v.x, q8[l + 0], sumi);
|
||||
sumi = ggml_cuda_dp4a(v.y, q8[l + 4], sumi);
|
||||
}
|
||||
|
||||
const float d = __half2float(bq4->d) * __low2float(bq8_1->ds);
|
||||
return d * sumi;
|
||||
}
|
||||
|
||||
// Q2_DPT: 2-bit quantization with 4 learned levels
|
||||
// Helper: lookup 4 int8 levels using 2-bit indices packed in a 32-bit int
|
||||
static __device__ __forceinline__ int4 get_int_from_table_4(const int & q2, const int8_t * table) {
|
||||
int4 result;
|
||||
result.x = table[(q2 >> 0) & 3];
|
||||
result.y = table[(q2 >> 8) & 3];
|
||||
result.z = table[(q2 >> 16) & 3];
|
||||
result.w = table[(q2 >> 24) & 3];
|
||||
return result;
|
||||
}
|
||||
|
||||
#define VDR_Q2_DPT_Q8_1_MMVQ 4
|
||||
#define VDR_Q2_DPT_Q8_1_MMQ 8
|
||||
|
||||
static __device__ __forceinline__ float vec_dot_q2_dpt_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
|
||||
|
||||
const block_q2_dpt * bq2 = (const block_q2_dpt *) vbq + kbx;
|
||||
|
||||
const int * q8 = (const int *) bq8_1->qs + iqs;
|
||||
|
||||
int sumi = 0;
|
||||
#pragma unroll
|
||||
for (int l = 0; l < VDR_Q2_DPT_Q8_1_MMVQ; ++l) {
|
||||
const int aux_q2 = get_int_b4(bq2->qs, l);
|
||||
const int4 v = get_int_from_table_4(aux_q2, q2dpt_levels_cuda);
|
||||
|
||||
sumi = ggml_cuda_dp4a(v.x, q8[l + 0], sumi);
|
||||
sumi = ggml_cuda_dp4a(v.y, q8[l + 4], sumi);
|
||||
sumi = ggml_cuda_dp4a(v.z, q8[l + 8], sumi);
|
||||
sumi = ggml_cuda_dp4a(v.w, q8[l + 12], sumi);
|
||||
}
|
||||
|
||||
const float d = __half2float(bq2->d) * __low2float(bq8_1->ds);
|
||||
return d * sumi;
|
||||
}
|
||||
|
||||
// IQ2_TQ: 2-bit with per-tensor trained 16×4 grid table
|
||||
// Grid lookup helper: 4 × 2-bit indices packed in a byte → 4 grid values packed as int32
|
||||
static __device__ __forceinline__ int iq2tq_grid_lookup4(uint8_t qbyte, const int8_t * grid_entry) {
|
||||
uint32_t r = (uint32_t)(uint8_t)grid_entry[(qbyte >> 0) & 3];
|
||||
r |= (uint32_t)(uint8_t)grid_entry[(qbyte >> 2) & 3] << 8;
|
||||
r |= (uint32_t)(uint8_t)grid_entry[(qbyte >> 4) & 3] << 16;
|
||||
r |= (uint32_t)(uint8_t)grid_entry[(qbyte >> 6) & 3] << 24;
|
||||
return (int)r;
|
||||
}
|
||||
|
||||
#define VDR_IQ2_TQ_Q8_1_MMVQ 1
|
||||
#define VDR_IQ2_TQ_Q8_1_MMQ 1
|
||||
|
||||
// Dot product of a 16-element slice of one IQ2_TQ block with the matching
// Q8_1 activations, scaled by both block scales.
// vbq:   base pointer to the IQ2_TQ blocks; kbx selects the block.
// bq8_1: Q8_1 blocks covering the same elements (32 elements each).
// iqs:   which 16-element slice of the 256-element block this call handles (0..15).
static __device__ __forceinline__ float vec_dot_iq2_tq_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {

    const block_iq2_tq * bq = (const block_iq2_tq *) vbq + kbx;

    // iqs selects which 16-element portion (0..15): 2 groups of 8 elements
    const int q8b = iqs / 2;         // Q8_1 block index (0..7)
    const int q8off = (iqs & 1) * 4; // int32 offset within Q8_1 block (0 or 4)

    // Grid indices for groups iqs*2 and iqs*2+1: one nibble of scales[iqs]
    // selects a 4-value entry in the trained grid for each group of 8.
    const uint8_t sc = bq->scales[iqs];
    const int8_t * ge0 = iq2tq_grid_cuda + (sc & 0xF) * 4;
    const int8_t * ge1 = iq2tq_grid_cuda + (sc >> 4) * 4;

    const uint8_t * qs = bq->qs + iqs * 4;          // 4 bytes = 16 × 2-bit indices
    const int * q8 = (const int *)bq8_1[q8b].qs + q8off;

    int sumi = 0;

    // Group 0: 8 elements = 2 bytes qs, 2 int32 Q8_1
    sumi = ggml_cuda_dp4a(iq2tq_grid_lookup4(qs[0], ge0), q8[0], sumi);
    sumi = ggml_cuda_dp4a(iq2tq_grid_lookup4(qs[1], ge0), q8[1], sumi);

    // Group 1: next 8 elements
    sumi = ggml_cuda_dp4a(iq2tq_grid_lookup4(qs[2], ge1), q8[2], sumi);
    sumi = ggml_cuda_dp4a(iq2tq_grid_lookup4(qs[3], ge1), q8[3], sumi);

    // Combine: IQ2_TQ block scale × fixed grid scale × Q8_1 scale × int dot.
    return __half2float(bq->d) * IQ2TQ_GRID_SCALE * __low2float(bq8_1[q8b].ds) * sumi;
}
|
||||
|
||||
// IQ3_TQ: 3-bit with per-tensor trained 16×8 grid table
|
||||
#define VDR_IQ3_TQ_Q8_1_MMVQ 1
|
||||
#define VDR_IQ3_TQ_Q8_1_MMQ 1
|
||||
|
||||
// Dot product of a 16-element slice of one IQ3_TQ block with the matching
// Q8_1 activations, scaled by both block scales.
// vbq:   base pointer to the IQ3_TQ blocks; kbx selects the block.
// bq8_1: Q8_1 blocks covering the same elements (32 elements each).
// iqs:   which 16-element slice of the block this call handles (0..15).
static __device__ __forceinline__ float vec_dot_iq3_tq_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {

    const block_iq3_tq * bq = (const block_iq3_tq *) vbq + kbx;

    const int q8b = iqs / 2;         // Q8_1 block index (0..7)
    const int q8off = (iqs & 1) * 4; // int32 offset within Q8_1 block (0 or 4)

    // One nibble of scales[iqs] selects an 8-value grid entry per group of 8.
    const uint8_t sc = bq->scales[iqs];
    const int8_t * ge0 = iq3tq_grid_cuda + (sc & 0xF) * 8;
    const int8_t * ge1 = iq3tq_grid_cuda + (sc >> 4) * 8;

    const int * q8 = (const int *)bq8_1[q8b].qs + q8off;

    int sumi = 0;

    // Group 0: 8 elements, 3 bytes of qs (8 × 3-bit indices)
    {
        const uint8_t * qs = bq->qs + (iqs * 2) * 3;
        // Gather the 24 quant bits into one word so indices can be sliced out.
        const uint32_t bits = qs[0] | ((uint32_t)qs[1] << 8) | ((uint32_t)qs[2] << 16);

        // Pack grid values for elements 0..3 into an int32 for dp4a.
        int v0 = (uint8_t)ge0[(bits >> 0) & 7] | ((uint32_t)(uint8_t)ge0[(bits >> 3) & 7] << 8)
            | ((uint32_t)(uint8_t)ge0[(bits >> 6) & 7] << 16) | ((uint32_t)(uint8_t)ge0[(bits >> 9) & 7] << 24);
        sumi = ggml_cuda_dp4a(v0, q8[0], sumi);

        // Elements 4..7.
        int v1 = (uint8_t)ge0[(bits >> 12) & 7] | ((uint32_t)(uint8_t)ge0[(bits >> 15) & 7] << 8)
            | ((uint32_t)(uint8_t)ge0[(bits >> 18) & 7] << 16) | ((uint32_t)(uint8_t)ge0[(bits >> 21) & 7] << 24);
        sumi = ggml_cuda_dp4a(v1, q8[1], sumi);
    }

    // Group 1: next 8 elements, next 3 bytes of qs
    {
        const uint8_t * qs = bq->qs + (iqs * 2 + 1) * 3;
        const uint32_t bits = qs[0] | ((uint32_t)qs[1] << 8) | ((uint32_t)qs[2] << 16);

        int v0 = (uint8_t)ge1[(bits >> 0) & 7] | ((uint32_t)(uint8_t)ge1[(bits >> 3) & 7] << 8)
            | ((uint32_t)(uint8_t)ge1[(bits >> 6) & 7] << 16) | ((uint32_t)(uint8_t)ge1[(bits >> 9) & 7] << 24);
        sumi = ggml_cuda_dp4a(v0, q8[2], sumi);

        int v1 = (uint8_t)ge1[(bits >> 12) & 7] | ((uint32_t)(uint8_t)ge1[(bits >> 15) & 7] << 8)
            | ((uint32_t)(uint8_t)ge1[(bits >> 18) & 7] << 16) | ((uint32_t)(uint8_t)ge1[(bits >> 21) & 7] << 24);
        sumi = ggml_cuda_dp4a(v1, q8[3], sumi);
    }

    // Combine: IQ3_TQ block scale × fixed grid scale × Q8_1 scale × int dot.
    return __half2float(bq->d) * IQ3TQ_GRID_SCALE * __low2float(bq8_1[q8b].ds) * sumi;
}
|
||||
|
||||
|
||||
// IQ1_BN: 8D vector quantized with per-tensor trained 4096-entry codebook
|
||||
#define VDR_IQ1_BN_Q8_1_MMVQ 1
|
||||
#define VDR_IQ1_BN_Q8_1_MMQ 1
|
||||
|
||||
// Dot product of a 16-element slice of one IQ1_BN block with the matching
// Q8_1 activations. Each 8-element group is reconstructed from a 12-bit
// index into the per-tensor trained codebook.
// vbq:   base pointer to the IQ1_BN blocks; kbx selects the block.
// bq8_1: Q8_1 blocks covering the same elements (32 elements each).
// iqs:   which 16-element slice this call handles (0..15).
static __device__ __forceinline__ float vec_dot_iq1_bn_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {

    const block_iq1_bn * bq = (const block_iq1_bn *) vbq + kbx;

    // iqs = 0..15, each thread handles 2 groups (16 elements)
    const int q8b = iqs / 2;         // Q8_1 block index (0..7)
    const int q8off = (iqs & 1) * 4; // int32 offset within Q8_1 block (0 or 4)

    // Extract two 12-bit codebook indices from qs[3*iqs .. 3*iqs+2]
    // (3 bytes = 24 bits, split low-nibble-first).
    const uint8_t * qs = bq->qs + 3 * iqs;
    const int ci0 = qs[0] | (((int)qs[1] & 0x0F) << 8);
    const int ci1 = (qs[1] >> 4) | ((int)qs[2] << 4);

    // Each codebook entry is IQ1BN_CODEBOOK_DIM int8 values, read as 2 int32s.
    const int * cb0 = (const int *)(iq1bn_codebook_cuda + ci0 * IQ1BN_CODEBOOK_DIM);
    const int * cb1 = (const int *)(iq1bn_codebook_cuda + ci1 * IQ1BN_CODEBOOK_DIM);

    const int * q8 = (const int *)bq8_1[q8b].qs + q8off;

    int sumi = 0;
    sumi = ggml_cuda_dp4a(cb0[0], q8[0], sumi);
    sumi = ggml_cuda_dp4a(cb0[1], q8[1], sumi);
    sumi = ggml_cuda_dp4a(cb1[0], q8[2], sumi);
    sumi = ggml_cuda_dp4a(cb1[1], q8[3], sumi);

    // Combine: IQ1_BN block scale × fixed grid scale × Q8_1 scale × int dot.
    return __half2float(bq->d) * IQ1BN_GRID_SCALE * __low2float(bq8_1[q8b].ds) * (float)sumi;
}
|
||||
|
||||
#define VDR_IQ4_XS_Q8_1_MMVQ 4
|
||||
#define VDR_IQ4_XS_Q8_1_MMQ 4
|
||||
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
|
|
@ -27,6 +27,7 @@ GGML_API void quantize_row_nvfp4_ref(const float * GGML_RESTRICT x, block_nvfp4
|
|||
|
||||
GGML_API void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void quantize_row_q3_kpt_ref(const float * GGML_RESTRICT x, block_q3_kpt * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
|
||||
|
|
@ -42,36 +43,37 @@ GGML_API void quantize_row_iq3_s_ref (const float * GGML_RESTRICT x, block_iq3_
|
|||
GGML_API void quantize_row_iq2_s_ref (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
// Dequantization
|
||||
GGML_API void dequantize_row_q1_0(const block_q1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
//GGML_API void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void dequantize_row_q1_0(const block_q1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
GGML_API void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
GGML_API void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
GGML_API void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
GGML_API void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
GGML_API void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
//GGML_API void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
|
||||
GGML_API void dequantize_row_mxfp4(const block_mxfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void dequantize_row_nvfp4(const block_nvfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void dequantize_row_mxfp4(const block_mxfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
GGML_API void dequantize_row_nvfp4(const block_nvfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
|
||||
GGML_API void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
GGML_API void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
GGML_API void dequantize_row_q3_kpt(const block_q3_kpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
GGML_API void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
GGML_API void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
GGML_API void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
GGML_API void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
|
||||
GGML_API void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
GGML_API void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
|
||||
GGML_API void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
GGML_API void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
GGML_API void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
GGML_API void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
GGML_API void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
GGML_API void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
GGML_API void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
GGML_API void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
GGML_API void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
|
||||
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
||||
GGML_API size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
|
|
@ -82,6 +84,14 @@ GGML_API size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RE
|
|||
GGML_API size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
GGML_API size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
GGML_API size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
GGML_API size_t quantize_q3_kpt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
|
||||
// Q3_KPT level management
|
||||
GGML_API void q3kpt_set_levels(const float * levels);
|
||||
GGML_API const float * q3kpt_get_levels(void);
|
||||
GGML_API void q3kpt_free_levels(void);
|
||||
GGML_API void q3kpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row,
|
||||
const float * imatrix, float levels_out[Q3KPT_N_LEVELS]);
|
||||
GGML_API size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
|
||||
GGML_API size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
|
|
@ -102,6 +112,198 @@ GGML_API size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTR
|
|||
GGML_API size_t quantize_mxfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
GGML_API size_t quantize_nvfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
|
||||
GGML_API void quantize_row_q3_pt_ref(const float * GGML_RESTRICT x, block_q3_pt * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void dequantize_row_q3_pt(const block_q3_pt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
GGML_API size_t quantize_q3_pt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
|
||||
// Q3_PT levels management (per-tensor Lloyd-Max levels in [0,1])
|
||||
GGML_API void q3pt_set_levels(const float * levels); // set global levels (quantization)
|
||||
GGML_API const float * q3pt_get_levels(void);
|
||||
GGML_API void q3pt_free_levels(void);
|
||||
|
||||
// Per-tensor levels registry (inference — range-based lookup by data address)
|
||||
|
||||
// Train 8 Lloyd-Max levels from tensor data via weighted k-means on affine-normalized
|
||||
// 16-element sub-block values. Also sets the global levels via q3pt_set_levels().
|
||||
// data: float array [nrow * n_per_row], imatrix: importance weights [n_per_row] or NULL.
|
||||
GGML_API void q3pt_train_levels(const float * data, int64_t nrow, int64_t n_per_row,
|
||||
const float * imatrix, float levels_out[8]);
|
||||
|
||||
// Q4_DPT: IQ4_NL with learned per-tensor int8 levels
|
||||
GGML_API void dequantize_row_q4_dpt(const block_q4_dpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
GGML_API void quantize_row_q4_dpt_ref(const float * GGML_RESTRICT x, block_q4_dpt * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API size_t quantize_q4_dpt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
|
||||
// Q4_DPT levels management (per-tensor Lloyd-Max int8 levels)
|
||||
GGML_API void q4dpt_set_levels(const int8_t * levels);
|
||||
GGML_API const int8_t * q4dpt_get_levels(void);
|
||||
GGML_API void q4dpt_free_levels(void);
|
||||
|
||||
// Q2_DPT: 2-bit with learned per-tensor int8 levels
|
||||
GGML_API void dequantize_row_q2_dpt(const block_q2_dpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
GGML_API void quantize_row_q2_dpt_ref(const float * GGML_RESTRICT x, block_q2_dpt * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API size_t quantize_q2_dpt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
|
||||
// Q2_DPT levels management (per-tensor Lloyd-Max int8 levels)
|
||||
GGML_API void q2dpt_set_levels(const int8_t * levels);
|
||||
GGML_API const int8_t * q2dpt_get_levels(void);
|
||||
GGML_API void q2dpt_free_levels(void);
|
||||
GGML_API void q2dpt_set_quant_strategy(int s);
|
||||
|
||||
// Train 4 Lloyd-Max int8 levels from tensor data for Q2_DPT.
|
||||
// Bins normalized values (x/amax) in [-1,1], runs weighted k-means, rounds to sorted int8[4].
|
||||
// Also sets the global levels via q2dpt_set_levels().
|
||||
GGML_API void q2dpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row,
|
||||
const float * imatrix, int8_t levels_out[Q2DPT_N_LEVELS]);
|
||||
|
||||
// Q2_KPT: Q2_K with learned per-tensor float levels
|
||||
GGML_API void dequantize_row_q2_kpt(const block_q2_kpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
GGML_API void quantize_row_q2_kpt_ref(const float * GGML_RESTRICT x, block_q2_kpt * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API size_t quantize_q2_kpt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t start_row, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
|
||||
// Q2_KPT levels management (per-tensor float levels in [0,1])
|
||||
GGML_API void q2kpt_set_levels(const float * levels);
|
||||
GGML_API const float * q2kpt_get_levels(void);
|
||||
GGML_API void q2kpt_free_levels(void);
|
||||
// Prepare levels buffer for a tensor with given dimensions (call before parallel quantization)
|
||||
GGML_API void q2kpt_prepare_levels(int64_t nrows, int64_t n_per_row);
|
||||
|
||||
// Train 4 Lloyd-Max float levels from tensor data for Q2_KPT.
|
||||
// Bins normalized sub-block values in [0,1], runs weighted k-means for 4 centroids.
|
||||
// Also sets the global levels via q2kpt_set_levels().
|
||||
GGML_API void q2kpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row,
|
||||
const float * imatrix, float levels_out[Q2KPT_N_LEVELS]);
|
||||
|
||||
// Train per-row levels for all rows: writes nrow * Q2KPT_N_LEVELS floats to out_levels.
|
||||
GGML_API void q2kpt_train_all_row_levels(const float * data, int64_t nrow, int64_t n_per_row,
|
||||
const float * imatrix, float * out_levels);
|
||||
|
||||
// IQ2_TQ: 2-bit scalar with per-group asymmetric grid (2.5625 bpw)
|
||||
GGML_API void dequantize_row_iq2_tq(const block_iq2_tq * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
GGML_API void quantize_row_iq2_tq_ref(const float * GGML_RESTRICT x, block_iq2_tq * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API size_t quantize_iq2_tq(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
GGML_API void iq2tq_train_grid(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, int8_t grid_out[64]);
|
||||
GGML_API void iq2tq_set_grid(const int8_t grid[64]);
|
||||
GGML_API const int8_t * iq2tq_get_grid(void);
|
||||
|
||||
// IQ3_TQ: 3-bit scalar with per-group asymmetric grid (3.5625 bpw)
|
||||
GGML_API void dequantize_row_iq3_tq(const block_iq3_tq * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
GGML_API void quantize_row_iq3_tq_ref(const float * GGML_RESTRICT x, block_iq3_tq * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API size_t quantize_iq3_tq(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
GGML_API void iq3tq_train_grid(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, int8_t grid_out[IQ3TQ_GRID_SIZE]);
|
||||
GGML_API void iq3tq_set_grid(const int8_t grid[IQ3TQ_GRID_SIZE]);
|
||||
GGML_API const int8_t * iq3tq_get_grid(void);
|
||||
|
||||
// IQ1_BN: 8D vector quantized with per-tensor trained codebook (1.5625 bpw)
|
||||
GGML_API void dequantize_row_iq1_bn(const block_iq1_bn * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
GGML_API void quantize_row_iq1_bn_ref(const float * GGML_RESTRICT x, block_iq1_bn * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API size_t quantize_iq1_bn(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
GGML_API void iq1bn_train_codebook(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, int8_t aux_out[IQ1BN_AUX_SIZE], int nthread);
|
||||
GGML_API void iq1bn_set_aux(const int8_t aux[IQ1BN_AUX_SIZE]);
|
||||
GGML_API const int8_t * iq1bn_get_aux(void);
|
||||
|
||||
// Train 16 Lloyd-Max int8 levels from tensor data.
|
||||
// Bins normalized values (x/amax) in [-1,1], runs weighted k-means, rounds to sorted int8[16].
|
||||
// Also sets the global levels via q4dpt_set_levels().
|
||||
GGML_API void q4dpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row,
|
||||
const float * imatrix, int8_t levels_out[Q4DPT_N_LEVELS]);
|
||||
|
||||
GGML_API void quantize_row_q3_pt_ref(const float * GGML_RESTRICT x, block_q3_pt * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API void dequantize_row_q3_pt(const block_q3_pt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
GGML_API size_t quantize_q3_pt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
|
||||
// Q3_PT levels management (per-tensor Lloyd-Max levels in [0,1])
|
||||
GGML_API void q3pt_set_levels(const float * levels); // set global levels (quantization)
|
||||
GGML_API const float * q3pt_get_levels(void);
|
||||
GGML_API void q3pt_free_levels(void);
|
||||
|
||||
// Per-tensor levels registry (inference — range-based lookup by data address)
|
||||
|
||||
// Train 8 Lloyd-Max levels from tensor data via weighted k-means on affine-normalized
|
||||
// 16-element sub-block values. Also sets the global levels via q3pt_set_levels().
|
||||
// data: float array [nrow * n_per_row], imatrix: importance weights [n_per_row] or NULL.
|
||||
GGML_API void q3pt_train_levels(const float * data, int64_t nrow, int64_t n_per_row,
|
||||
const float * imatrix, float levels_out[8]);
|
||||
|
||||
// Q4_DPT: IQ4_NL with learned per-tensor int8 levels
|
||||
GGML_API void dequantize_row_q4_dpt(const block_q4_dpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
GGML_API void quantize_row_q4_dpt_ref(const float * GGML_RESTRICT x, block_q4_dpt * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API size_t quantize_q4_dpt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
|
||||
// Q4_DPT levels management (per-tensor Lloyd-Max int8 levels)
|
||||
GGML_API void q4dpt_set_levels(const int8_t * levels);
|
||||
GGML_API const int8_t * q4dpt_get_levels(void);
|
||||
GGML_API void q4dpt_free_levels(void);
|
||||
|
||||
// Q2_DPT: 2-bit with learned per-tensor int8 levels
|
||||
GGML_API void dequantize_row_q2_dpt(const block_q2_dpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
GGML_API void quantize_row_q2_dpt_ref(const float * GGML_RESTRICT x, block_q2_dpt * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API size_t quantize_q2_dpt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
|
||||
// Q2_DPT levels management (per-tensor Lloyd-Max int8 levels)
|
||||
GGML_API void q2dpt_set_levels(const int8_t * levels);
|
||||
GGML_API const int8_t * q2dpt_get_levels(void);
|
||||
GGML_API void q2dpt_free_levels(void);
|
||||
GGML_API void q2dpt_set_quant_strategy(int s);
|
||||
|
||||
// Train 4 Lloyd-Max int8 levels from tensor data for Q2_DPT.
|
||||
// Bins normalized values (x/amax) in [-1,1], runs weighted k-means, rounds to sorted int8[4].
|
||||
// Also sets the global levels via q2dpt_set_levels().
|
||||
GGML_API void q2dpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row,
|
||||
const float * imatrix, int8_t levels_out[Q2DPT_N_LEVELS]);
|
||||
|
||||
// Q2_KPT: Q2_K with learned per-tensor float levels
|
||||
GGML_API void dequantize_row_q2_kpt(const block_q2_kpt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
GGML_API void quantize_row_q2_kpt_ref(const float * GGML_RESTRICT x, block_q2_kpt * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API size_t quantize_q2_kpt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t start_row, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
|
||||
// Q2_KPT levels management (per-tensor float levels in [0,1])
|
||||
GGML_API void q2kpt_set_levels(const float * levels);
|
||||
GGML_API const float * q2kpt_get_levels(void);
|
||||
GGML_API void q2kpt_free_levels(void);
|
||||
// Prepare levels buffer for a tensor with given dimensions (call before parallel quantization)
|
||||
GGML_API void q2kpt_prepare_levels(int64_t nrows, int64_t n_per_row);
|
||||
|
||||
// Train 4 Lloyd-Max float levels from tensor data for Q2_KPT.
|
||||
// Bins normalized sub-block values in [0,1], runs weighted k-means for 4 centroids.
|
||||
// Also sets the global levels via q2kpt_set_levels().
|
||||
GGML_API void q2kpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row,
|
||||
const float * imatrix, float levels_out[Q2KPT_N_LEVELS]);
|
||||
|
||||
// Train per-row levels for all rows: writes nrow * Q2KPT_N_LEVELS floats to out_levels.
|
||||
GGML_API void q2kpt_train_all_row_levels(const float * data, int64_t nrow, int64_t n_per_row,
|
||||
const float * imatrix, float * out_levels);
|
||||
|
||||
// IQ2_TQ: 2-bit scalar with per-group asymmetric grid (2.5625 bpw)
|
||||
GGML_API void dequantize_row_iq2_tq(const block_iq2_tq * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
GGML_API void quantize_row_iq2_tq_ref(const float * GGML_RESTRICT x, block_iq2_tq * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API size_t quantize_iq2_tq(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
GGML_API void iq2tq_train_grid(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, int8_t grid_out[64]);
|
||||
GGML_API void iq2tq_set_grid(const int8_t grid[64]);
|
||||
GGML_API const int8_t * iq2tq_get_grid(void);
|
||||
|
||||
// IQ3_TQ: 3-bit scalar with per-group asymmetric grid (3.5625 bpw)
|
||||
GGML_API void dequantize_row_iq3_tq(const block_iq3_tq * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
GGML_API void quantize_row_iq3_tq_ref(const float * GGML_RESTRICT x, block_iq3_tq * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API size_t quantize_iq3_tq(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
GGML_API void iq3tq_train_grid(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, int8_t grid_out[IQ3TQ_GRID_SIZE]);
|
||||
GGML_API void iq3tq_set_grid(const int8_t grid[IQ3TQ_GRID_SIZE]);
|
||||
GGML_API const int8_t * iq3tq_get_grid(void);
|
||||
|
||||
// IQ1_BN: 8D vector quantized with per-tensor trained codebook (1.5625 bpw)
|
||||
GGML_API void dequantize_row_iq1_bn(const block_iq1_bn * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k, const void * levels);
|
||||
GGML_API void quantize_row_iq1_bn_ref(const float * GGML_RESTRICT x, block_iq1_bn * GGML_RESTRICT y, int64_t k);
|
||||
GGML_API size_t quantize_iq1_bn(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
GGML_API void iq1bn_train_codebook(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, int8_t aux_out[IQ1BN_AUX_SIZE], int nthread);
|
||||
GGML_API void iq1bn_set_aux(const int8_t aux[IQ1BN_AUX_SIZE]);
|
||||
GGML_API const int8_t * iq1bn_get_aux(void);
|
||||
|
||||
// Train 16 Lloyd-Max int8 levels from tensor data.
|
||||
// Bins normalized values (x/amax) in [-1,1], runs weighted k-means, rounds to sorted int8[16].
|
||||
// Also sets the global levels via q4dpt_set_levels().
|
||||
GGML_API void q4dpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row,
|
||||
const float * imatrix, int8_t levels_out[Q4DPT_N_LEVELS]);
|
||||
|
||||
GGML_API void iq2xs_init_impl(enum ggml_type type);
|
||||
GGML_API void iq2xs_free_impl(enum ggml_type type);
|
||||
GGML_API void iq3xs_init_impl(int grid_size);
|
||||
|
|
|
|||
|
|
@ -12273,7 +12273,7 @@ static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml
|
|||
ggml_quantize_chunk(quant, from, to, 0, 1, ne, nullptr);
|
||||
}
|
||||
|
||||
static void ggml_vk_dequantize_data(const void * from, float * to, size_t ne, ggml_type quant) {
|
||||
static void ggml_vk_dequantize_data(const void * from, float * to, size_t ne, ggml_type quant, const void * levels = nullptr) {
|
||||
if (quant == GGML_TYPE_F32) {
|
||||
memcpy(to, from, sizeof(float) * ne);
|
||||
return;
|
||||
|
|
@ -12283,7 +12283,7 @@ static void ggml_vk_dequantize_data(const void * from, float * to, size_t ne, gg
|
|||
|
||||
ggml_to_float_t dequant_fn = tt->to_float;
|
||||
|
||||
dequant_fn(from, to, ne);
|
||||
dequant_fn(from, to, ne, levels);
|
||||
}
|
||||
|
||||
static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) {
|
||||
|
|
|
|||
|
|
@ -456,6 +456,11 @@ void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
|
|||
}
|
||||
}
|
||||
|
||||
static void ggml_fp16_to_fp32_row_leveled(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t n, const void * levels) {
|
||||
GGML_UNUSED(levels);
|
||||
ggml_fp16_to_fp32_row((const ggml_fp16_t *)x, y, n);
|
||||
}
|
||||
|
||||
void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
|
||||
int i = 0;
|
||||
for (; i < n; ++i) {
|
||||
|
|
@ -470,6 +475,11 @@ void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
|
|||
}
|
||||
}
|
||||
|
||||
// Adapter matching the levels-aware to_float signature: bf16 has no
// per-tensor quantization levels, so the extra argument is ignored and the
// plain bf16 -> fp32 row conversion is forwarded.
static void ggml_bf16_to_fp32_row_leveled(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t n, const void * levels) {
    GGML_UNUSED(levels);
    const ggml_bf16_t * src = (const ggml_bf16_t *) x;
    ggml_bf16_to_fp32_row(src, y, n);
}
|
||||
|
||||
void ggml_fp32_to_bf16_row_ref(const float * x, ggml_bf16_t * y, int64_t n) {
|
||||
for (int i = 0; i < n; i++) {
|
||||
y[i] = ggml_compute_fp32_to_bf16(x[i]);
|
||||
|
|
@ -648,7 +658,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
|
|||
.blck_size = 1,
|
||||
.type_size = sizeof(ggml_fp16_t),
|
||||
.is_quantized = false,
|
||||
.to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row,
|
||||
.to_float = ggml_fp16_to_fp32_row_leveled,
|
||||
.from_float_ref = (ggml_from_float_t) ggml_fp32_to_fp16_row,
|
||||
},
|
||||
[GGML_TYPE_Q1_0] = {
|
||||
|
|
@ -857,7 +867,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
|
|||
.blck_size = 1,
|
||||
.type_size = sizeof(ggml_bf16_t),
|
||||
.is_quantized = false,
|
||||
.to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row,
|
||||
.to_float = ggml_bf16_to_fp32_row_leveled,
|
||||
.from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row_ref,
|
||||
},
|
||||
[31] = { // GGML_TYPE_Q4_0_4_4
|
||||
|
|
@ -912,6 +922,71 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
|
|||
.type_size = 0,
|
||||
.is_quantized = false,
|
||||
},
|
||||
[GGML_TYPE_Q3_PT] = {
|
||||
.type_name = "q3_pt",
|
||||
.blck_size = QK_K,
|
||||
.type_size = sizeof(block_q3_pt),
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_q3_pt,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_q3_pt_ref,
|
||||
},
|
||||
[GGML_TYPE_Q3_KPT] = {
|
||||
.type_name = "q3_kpt",
|
||||
.blck_size = QK_K,
|
||||
.type_size = sizeof(block_q3_kpt),
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_q3_kpt,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_q3_kpt_ref,
|
||||
},
|
||||
[GGML_TYPE_Q4_DPT] = {
|
||||
.type_name = "q4_dpt",
|
||||
.blck_size = QK4_NL,
|
||||
.type_size = sizeof(block_q4_dpt),
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_q4_dpt,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_q4_dpt_ref,
|
||||
},
|
||||
[GGML_TYPE_Q2_DPT] = {
|
||||
.type_name = "q2_dpt",
|
||||
.blck_size = QK2_DPT,
|
||||
.type_size = sizeof(block_q2_dpt),
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_q2_dpt,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_q2_dpt_ref,
|
||||
},
|
||||
[GGML_TYPE_Q2_KPT] = {
|
||||
.type_name = "q2_kpt",
|
||||
.blck_size = QK_K,
|
||||
.type_size = sizeof(block_q2_kpt),
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_q2_kpt,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_q2_kpt_ref,
|
||||
.levels_row_stride = 0, // computed dynamically: (ne[0]/256)*4*sizeof(float)
|
||||
},
|
||||
[GGML_TYPE_IQ2_TQ] = {
|
||||
.type_name = "iq2_tq",
|
||||
.blck_size = QK_K,
|
||||
.type_size = sizeof(block_iq2_tq),
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_iq2_tq,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_iq2_tq_ref,
|
||||
},
|
||||
[GGML_TYPE_IQ3_TQ] = {
|
||||
.type_name = "iq3_tq",
|
||||
.blck_size = QK_K,
|
||||
.type_size = sizeof(block_iq3_tq),
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_iq3_tq,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_iq3_tq_ref,
|
||||
},
|
||||
[GGML_TYPE_IQ1_BN] = {
|
||||
.type_name = "iq1_bn",
|
||||
.blck_size = QK_K,
|
||||
.type_size = sizeof(block_iq1_bn),
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_iq1_bn,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_iq1_bn_ref,
|
||||
},
|
||||
};
|
||||
|
||||
const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) {
|
||||
|
|
@ -1412,6 +1487,10 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
|
|||
case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break;
|
||||
case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break;
|
||||
case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break;
|
||||
case GGML_FTYPE_MOSTLY_Q3_PT: wtype = GGML_TYPE_Q3_PT; break;
|
||||
case GGML_FTYPE_MOSTLY_Q3_KPT: wtype = GGML_TYPE_Q3_KPT; break;
|
||||
case GGML_FTYPE_MOSTLY_Q4_DPT: wtype = GGML_TYPE_Q4_DPT; break;
|
||||
case GGML_FTYPE_MOSTLY_Q2_KPT: wtype = GGML_TYPE_Q2_KPT; break;
|
||||
case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
|
||||
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
|
||||
}
|
||||
|
|
@ -7607,6 +7686,13 @@ void ggml_quantize_init(enum ggml_type type) {
|
|||
case GGML_TYPE_IQ1_M: iq2xs_init_impl(type); break;
|
||||
case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
|
||||
case GGML_TYPE_IQ3_S: iq3xs_init_impl(512); break;
|
||||
case GGML_TYPE_IQ2_TQ: break; // per-tensor grid stored in tensor->quant_levels
|
||||
case GGML_TYPE_IQ3_TQ: break; // per-tensor grid stored in tensor->quant_levels
|
||||
case GGML_TYPE_IQ1_BN: break; // per-tensor codebook stored in tensor->quant_levels
|
||||
case GGML_TYPE_Q3_PT: break; // levels stored in tensor->quant_levels
|
||||
case GGML_TYPE_Q3_KPT: break; // levels stored in tensor->quant_levels
|
||||
case GGML_TYPE_Q4_DPT: break; // levels stored in tensor->quant_levels
|
||||
case GGML_TYPE_Q2_KPT: break; // levels stored in tensor->quant_levels
|
||||
default: // nothing
|
||||
break;
|
||||
}
|
||||
|
|
@ -7685,6 +7771,13 @@ size_t ggml_quantize_chunk(
|
|||
case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_Q3_PT: result = quantize_q3_pt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_Q3_KPT: result = quantize_q3_kpt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_Q4_DPT: result = quantize_q4_dpt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_Q2_KPT: result = quantize_q2_kpt (src + start, (char *) dst + start_row * row_size, start_row, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_IQ2_TQ: result = quantize_iq2_tq (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_IQ3_TQ: result = quantize_iq3_tq (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_IQ1_BN: result = quantize_iq1_bn (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
size_t elemsize = sizeof(ggml_fp16_t);
|
||||
|
|
|
|||
|
|
@ -1331,37 +1331,63 @@ struct gguf_writer_base {
|
|||
|
||||
if (kv.is_array) {
|
||||
write(GGUF_TYPE_ARRAY);
|
||||
write(kv.get_type());
|
||||
const enum gguf_type elem_type = kv.get_type();
|
||||
write(elem_type);
|
||||
write(ne);
|
||||
// Write array element data based on element type
|
||||
switch (elem_type) {
|
||||
case GGUF_TYPE_UINT8:
|
||||
case GGUF_TYPE_INT8:
|
||||
case GGUF_TYPE_UINT16:
|
||||
case GGUF_TYPE_INT16:
|
||||
case GGUF_TYPE_UINT32:
|
||||
case GGUF_TYPE_INT32:
|
||||
case GGUF_TYPE_FLOAT32:
|
||||
case GGUF_TYPE_UINT64:
|
||||
case GGUF_TYPE_INT64:
|
||||
case GGUF_TYPE_FLOAT64: {
|
||||
// Write raw bytes inline for array data
|
||||
for (size_t i = 0; i < kv.data.size(); ++i) {
|
||||
write(kv.data[i]);
|
||||
}
|
||||
} break;
|
||||
case GGUF_TYPE_BOOL: {
|
||||
for (size_t i = 0; i < ne; ++i) {
|
||||
write(kv.get_val<bool>(i));
|
||||
}
|
||||
} break;
|
||||
case GGUF_TYPE_STRING: {
|
||||
for (size_t i = 0; i < ne; ++i) {
|
||||
write(kv.get_val<std::string>(i));
|
||||
}
|
||||
} break;
|
||||
case GGUF_TYPE_ARRAY:
|
||||
default: GGML_ABORT("invalid array element type");
|
||||
}
|
||||
} else {
|
||||
write(kv.get_type());
|
||||
}
|
||||
|
||||
switch (kv.get_type()) {
|
||||
case GGUF_TYPE_UINT8:
|
||||
case GGUF_TYPE_INT8:
|
||||
case GGUF_TYPE_UINT16:
|
||||
case GGUF_TYPE_INT16:
|
||||
case GGUF_TYPE_UINT32:
|
||||
case GGUF_TYPE_INT32:
|
||||
case GGUF_TYPE_FLOAT32:
|
||||
case GGUF_TYPE_UINT64:
|
||||
case GGUF_TYPE_INT64:
|
||||
case GGUF_TYPE_FLOAT64: {
|
||||
write(kv.data);
|
||||
} break;
|
||||
case GGUF_TYPE_BOOL: {
|
||||
for (size_t i = 0; i < ne; ++i) {
|
||||
write(kv.get_val<bool>(i));
|
||||
}
|
||||
} break;
|
||||
case GGUF_TYPE_STRING: {
|
||||
for (size_t i = 0; i < ne; ++i) {
|
||||
write(kv.get_val<std::string>(i));
|
||||
}
|
||||
} break;
|
||||
case GGUF_TYPE_ARRAY:
|
||||
default: GGML_ABORT("invalid type");
|
||||
switch (kv.get_type()) {
|
||||
case GGUF_TYPE_UINT8:
|
||||
case GGUF_TYPE_INT8:
|
||||
case GGUF_TYPE_UINT16:
|
||||
case GGUF_TYPE_INT16:
|
||||
case GGUF_TYPE_UINT32:
|
||||
case GGUF_TYPE_INT32:
|
||||
case GGUF_TYPE_FLOAT32:
|
||||
case GGUF_TYPE_UINT64:
|
||||
case GGUF_TYPE_INT64:
|
||||
case GGUF_TYPE_FLOAT64: {
|
||||
write(kv.data);
|
||||
} break;
|
||||
case GGUF_TYPE_BOOL: {
|
||||
write(kv.get_val<bool>(0));
|
||||
} break;
|
||||
case GGUF_TYPE_STRING: {
|
||||
write(kv.get_val<std::string>(0));
|
||||
} break;
|
||||
case GGUF_TYPE_ARRAY:
|
||||
default: GGML_ABORT("invalid type");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
|
|
@ -155,6 +155,14 @@ extern "C" {
|
|||
LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_NVFP4 = 39, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q1_0 = 40, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q3_PT = 41, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q3_KPT = 42, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_DPT = 43, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q2_KPT = 44, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ2_TQ = 45, // except 1d tensors, trellis quantized with RNG codebook
|
||||
LLAMA_FTYPE_MOSTLY_IQ3_TQ = 46, // except 1d tensors, 3-bit with per-tensor trained grid
|
||||
LLAMA_FTYPE_MOSTLY_IQ1_BN = 47, // except 1d tensors, 8D vector quantized with trained codebook
|
||||
|
||||
|
||||
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
||||
};
|
||||
|
|
|
|||
|
|
@ -157,8 +157,8 @@ int main(int argc, char** argv) {
|
|||
|
||||
t1 = std::chrono::high_resolution_clock::now();
|
||||
float fs;
|
||||
if (type == 0) funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1);
|
||||
else funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1);
|
||||
if (type == 0) funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1, nullptr);
|
||||
else funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1, nullptr);
|
||||
t2 = std::chrono::high_resolution_clock::now();
|
||||
t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
|
||||
if (iloop > 3) ggml.addResult(fs, t);
|
||||
|
|
|
|||
|
|
@ -285,8 +285,8 @@ int main(int argc, char** argv) {
|
|||
else {
|
||||
const auto * vdot = ggml_get_type_traits_cpu(funcs_cpu->vec_dot_type);
|
||||
vdot->from_float(y1.data(), q8.data(), kVecSize);
|
||||
if (useQ4_1) funcs_cpu->vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
|
||||
else funcs_cpu->vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
|
||||
if (useQ4_1) funcs_cpu->vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1, nullptr);
|
||||
else funcs_cpu->vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1, nullptr);
|
||||
}
|
||||
sumq += result;
|
||||
t2 = std::chrono::high_resolution_clock::now();
|
||||
|
|
|
|||
|
|
@ -0,0 +1,604 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Deep analysis of WHY ffn_down is hard to quantize.
|
||||
Compares structural properties of all weight and activation tensors.
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import struct
|
||||
import sys
|
||||
import os
|
||||
|
||||
DATA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "data")
|
||||
|
||||
|
||||
def load_f32_tensor(name):
|
||||
path = os.path.join(DATA_DIR, name)
|
||||
with open(path, "rb") as f:
|
||||
nrow, ncol = struct.unpack("qq", f.read(16))
|
||||
data = np.frombuffer(f.read(), dtype=np.float32)
|
||||
assert len(data) == nrow * ncol, f"Expected {nrow * ncol}, got {len(data)}"
|
||||
return data.reshape(nrow, ncol)
|
||||
|
||||
|
||||
def stats(label, arr):
|
||||
"""Print comprehensive statistics for a flat array."""
|
||||
a = arr.ravel()
|
||||
print(f" {label}:")
|
||||
print(f" shape={arr.shape}, n={len(a)}")
|
||||
print(f" mean={a.mean():.6f}, std={a.std():.6f}")
|
||||
print(f" min={a.min():.6f}, max={a.max():.6f}")
|
||||
print(f" median={np.median(a):.6f}")
|
||||
print(
|
||||
f" |mean|/std = {abs(a.mean()) / (a.std() + 1e-10):.4f} (offset-to-spread ratio)"
|
||||
)
|
||||
# Kurtosis (excess) - how heavy-tailed vs Gaussian
|
||||
kurt = np.mean(((a - a.mean()) / (a.std() + 1e-10)) ** 4) - 3.0
|
||||
# Skewness
|
||||
skew = np.mean(((a - a.mean()) / (a.std() + 1e-10)) ** 3)
|
||||
print(f" skewness={skew:.4f}, excess_kurtosis={kurt:.4f}")
|
||||
# Percentile ranges
|
||||
pcts = np.percentile(a, [0.1, 1, 5, 25, 50, 75, 95, 99, 99.9])
|
||||
print(
|
||||
f" percentiles: 0.1%={pcts[0]:.4f}, 1%={pcts[1]:.4f}, 5%={pcts[2]:.4f}, "
|
||||
f"25%={pcts[3]:.4f}, 50%={pcts[4]:.4f}, 75%={pcts[5]:.4f}, "
|
||||
f"95%={pcts[6]:.4f}, 99%={pcts[7]:.4f}, 99.9%={pcts[8]:.4f}"
|
||||
)
|
||||
# Sparsity
|
||||
near_zero = np.sum(np.abs(a) < 0.001 * a.std()) / len(a)
|
||||
print(f" fraction |x| < 0.001*std: {near_zero:.4f}")
|
||||
return {
|
||||
"mean": a.mean(),
|
||||
"std": a.std(),
|
||||
"skew": skew,
|
||||
"kurt": kurt,
|
||||
"min": a.min(),
|
||||
"max": a.max(),
|
||||
}
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# 1. BASIC WEIGHT TENSOR COMPARISON
|
||||
# ============================================================================
|
||||
print("=" * 80)
|
||||
print("SECTION 1: WEIGHT TENSOR GLOBAL STATISTICS")
|
||||
print("=" * 80)
|
||||
|
||||
tensors = {
|
||||
"ffn_gate": ("blk_0_ffn_gate_weight.f32bin", "9728x2560 (wide→narrow proj)"),
|
||||
"ffn_up": ("blk_0_ffn_up_weight.f32bin", "9728x2560 (wide→narrow proj)"),
|
||||
"ffn_down": ("blk_0_ffn_down_weight.f32bin", "2560x9728 (narrow→wide proj)"),
|
||||
"attn_q": ("blk_0_attn_q_weight.f32bin", "4096x2560"),
|
||||
"attn_k": ("blk_0_attn_k_weight.f32bin", "1024x2560"),
|
||||
"attn_v": ("blk_0_attn_v_weight.f32bin", "1024x2560"),
|
||||
"attn_out": ("blk_0_attn_output_weight.f32bin", "2560x4096"),
|
||||
}
|
||||
|
||||
weight_data = {}
|
||||
for name, (fname, desc) in tensors.items():
|
||||
try:
|
||||
W = load_f32_tensor(fname)
|
||||
print(f"\n{'─' * 70}")
|
||||
print(f" {name} [{desc}] — file: {fname}")
|
||||
weight_data[name] = W
|
||||
stats(name, W)
|
||||
except Exception as e:
|
||||
print(f" {name}: SKIP ({e})")
|
||||
|
||||
# ============================================================================
|
||||
# 2. ROW-LEVEL STATISTICS (each row is a neuron output)
|
||||
# ============================================================================
|
||||
print("\n" + "=" * 80)
|
||||
print("SECTION 2: ROW-LEVEL VARIABILITY (per-neuron weight statistics)")
|
||||
print("=" * 80)
|
||||
print(" Each row of the weight matrix produces one output dimension.")
|
||||
print(" High row-to-row variability in mean/std means the quantizer")
|
||||
print(" must handle very different distributions across rows.\n")
|
||||
|
||||
for name, W in weight_data.items():
|
||||
row_means = W.mean(axis=1)
|
||||
row_stds = W.std(axis=1)
|
||||
row_ranges = W.max(axis=1) - W.min(axis=1)
|
||||
|
||||
print(f"\n {name} ({W.shape[0]} rows × {W.shape[1]} cols):")
|
||||
print(
|
||||
f" Row means: mean={row_means.mean():.6f}, std={row_means.std():.6f}, "
|
||||
f"range=[{row_means.min():.6f}, {row_means.max():.6f}]"
|
||||
)
|
||||
print(
|
||||
f" Row stds: mean={row_stds.mean():.6f}, std={row_stds.std():.6f}, "
|
||||
f"range=[{row_stds.min():.6f}, {row_stds.max():.6f}]"
|
||||
)
|
||||
print(f" Row ranges: mean={row_ranges.mean():.6f}, std={row_ranges.std():.6f}")
|
||||
print(
|
||||
f" RowMeans CV (std/mean): {row_means.std() / (abs(row_means.mean()) + 1e-10):.4f}"
|
||||
)
|
||||
print(f" RowStds CV: {row_stds.std() / (row_stds.mean() + 1e-10):.4f}")
|
||||
|
||||
# ============================================================================
|
||||
# 3. GROUP-LEVEL ANALYSIS (16-element groups, like Q2_K)
|
||||
# ============================================================================
|
||||
print("\n" + "=" * 80)
|
||||
print("SECTION 3: GROUP-LEVEL ANALYSIS (16-element groups)")
|
||||
print("=" * 80)
|
||||
print(" Quantization works on 16-element groups. Key question:")
|
||||
print(" How much does each group need its own OFFSET (dmin)?\n")
|
||||
|
||||
GS = 16
|
||||
|
||||
for name, W in weight_data.items():
|
||||
# Look at first 256 rows for speed
|
||||
nr = min(W.shape[0], 256)
|
||||
nc = W.shape[1]
|
||||
|
||||
group_means = []
|
||||
group_stds = []
|
||||
group_ranges = []
|
||||
group_offsets = [] # |mean| / range — how important is the offset
|
||||
|
||||
for r in range(nr):
|
||||
for g_start in range(0, nc, GS):
|
||||
g = W[r, g_start : g_start + GS]
|
||||
gm = g.mean()
|
||||
gs = g.std()
|
||||
gr = g.max() - g.min()
|
||||
gmin = g.min()
|
||||
|
||||
group_means.append(gm)
|
||||
group_stds.append(gs)
|
||||
group_ranges.append(gr)
|
||||
# Offset importance: how large is the group mean relative to its range?
|
||||
# If this is high, offset (dmin) matters a lot
|
||||
if gr > 1e-10:
|
||||
group_offsets.append(abs(gm) / gr)
|
||||
else:
|
||||
group_offsets.append(0)
|
||||
|
||||
gm = np.array(group_means)
|
||||
gs = np.array(group_stds)
|
||||
gr = np.array(group_ranges)
|
||||
go = np.array(group_offsets)
|
||||
|
||||
print(f"\n {name} ({len(group_means)} groups):")
|
||||
print(
|
||||
f" Group mean: mean={gm.mean():.6f}, std={gm.std():.6f}, "
|
||||
f"range=[{gm.min():.6f}, {gm.max():.6f}]"
|
||||
)
|
||||
print(f" Group std: mean={gs.mean():.6f}, std={gs.std():.6f}")
|
||||
print(f" Group range: mean={gr.mean():.6f}, std={gr.std():.6f}")
|
||||
print(f" *** OFFSET IMPORTANCE (|group_mean| / range) ***")
|
||||
print(
|
||||
f" mean={go.mean():.4f}, median={np.median(go):.4f}, "
|
||||
f"p90={np.percentile(go, 90):.4f}, max={go.max():.4f}"
|
||||
)
|
||||
print(f" fraction with offset > 0.1: {np.mean(go > 0.1):.3f}")
|
||||
print(f" fraction with offset > 0.2: {np.mean(go > 0.2):.3f}")
|
||||
print(f" fraction with offset > 0.3: {np.mean(go > 0.3):.3f}")
|
||||
|
||||
# How well does zeroing the min (Q2_K style, clamping min to 0) work?
|
||||
# vs keeping the actual min
|
||||
mse_no_offset = 0 # Assume uniform 4 levels [0,1,2,3] * scale
|
||||
mse_with_offset = 0 # Assume uniform 4 levels [0,1,2,3] * scale + offset
|
||||
|
||||
for r in range(nr):
|
||||
for g_start in range(0, nc, GS):
|
||||
g = W[r, g_start : g_start + GS]
|
||||
gmin = g.min()
|
||||
gmax = g.max()
|
||||
gr = gmax - gmin
|
||||
if gr < 1e-10:
|
||||
continue
|
||||
|
||||
# No offset: clamp min to 0, scale = max/3
|
||||
if gmin > 0:
|
||||
scale_no = gmax / 3.0
|
||||
min_no = 0
|
||||
else:
|
||||
scale_no = gmax / 3.0
|
||||
min_no = 0 # lose the negative offset
|
||||
# Actually use (gmax - 0)/3 but we're clamping gmin to 0
|
||||
|
||||
# Better: use actual min/max
|
||||
scale_w = gr / 3.0
|
||||
min_w = gmin
|
||||
|
||||
for val in g:
|
||||
# No offset quantization
|
||||
norm_no = val / (scale_no + 1e-10)
|
||||
idx_no = max(0, min(3, int(round(norm_no))))
|
||||
recon_no = scale_no * idx_no
|
||||
mse_no_offset += (val - recon_no) ** 2
|
||||
|
||||
# With offset quantization
|
||||
norm_w = (val - min_w) / (scale_w + 1e-10)
|
||||
idx_w = max(0, min(3, int(round(norm_w))))
|
||||
recon_w = min_w + scale_w * idx_w
|
||||
mse_with_offset += (val - recon_w) ** 2
|
||||
|
||||
total_elements = nr * nc
|
||||
rmse_no = np.sqrt(mse_no_offset / total_elements)
|
||||
rmse_w = np.sqrt(mse_with_offset / total_elements)
|
||||
improvement = (rmse_no - rmse_w) / rmse_no * 100
|
||||
print(f" Quant RMSE (no offset): {rmse_no:.6f}")
|
||||
print(f" Quant RMSE (with offset): {rmse_w:.6f}")
|
||||
print(f" Offset benefit: {improvement:.1f}% RMSE reduction")
|
||||
|
||||
# ============================================================================
|
||||
# 4. ACTIVATION ANALYSIS
|
||||
# ============================================================================
|
||||
print("\n" + "=" * 80)
|
||||
print("SECTION 4: ACTIVATION DISTRIBUTION COMPARISON")
|
||||
print("=" * 80)
|
||||
|
||||
activations = {
|
||||
"ffn_input (gate/up)": "act_blk0_ffn_input.f32bin",
|
||||
"ffn_down_input (swiglu)": "act_blk0_ffn_down_input.f32bin",
|
||||
"attn_input (q/k/v)": "act_blk0_attn_input.f32bin",
|
||||
"attn_output_input": "act_blk0_attn_output_input.f32bin",
|
||||
}
|
||||
|
||||
act_data = {}
|
||||
for name, fname in activations.items():
|
||||
try:
|
||||
A = load_f32_tensor(fname)
|
||||
act_data[name] = A
|
||||
print(f"\n{'─' * 70}")
|
||||
print(f" {name} — {fname}")
|
||||
stats(name, A)
|
||||
except Exception as e:
|
||||
print(f" {name}: SKIP ({e})")
|
||||
|
||||
# ============================================================================
|
||||
# 5. THE CRITICAL QUESTION: PER-DIMENSION ACTIVATION MAGNITUDE
|
||||
# ============================================================================
|
||||
print("\n" + "=" * 80)
|
||||
print("SECTION 5: PER-DIMENSION ACTIVATION POWER (per-column RMS)")
|
||||
print("=" * 80)
|
||||
print(" If activation dimensions have very different magnitudes,")
|
||||
print(" the quantization error in each weight dimension is weighted differently.")
|
||||
print(" Dimensions with high activation power amplify weight errors.\n")
|
||||
|
||||
for name, A in act_data.items():
|
||||
col_rms = np.sqrt(np.mean(A**2, axis=0)) # RMS per column (dimension)
|
||||
print(f"\n {name} ({A.shape[1]} dimensions):")
|
||||
print(f" Col RMS: mean={col_rms.mean():.6f}, std={col_rms.std():.6f}")
|
||||
print(f" Col RMS range: [{col_rms.min():.6f}, {col_rms.max():.6f}]")
|
||||
print(f" Col RMS CV (std/mean): {col_rms.std() / (col_rms.mean() + 1e-10):.4f}")
|
||||
print(f" Max/Min ratio: {col_rms.max() / (col_rms.min() + 1e-10):.1f}x")
|
||||
|
||||
# Top 10 and bottom 10 dimensions by power
|
||||
top10 = np.argsort(col_rms)[-10:][::-1]
|
||||
bot10 = np.argsort(col_rms)[:10]
|
||||
print(
|
||||
f" Top-10 dims by RMS: {[(int(d), f'{col_rms[d]:.4f}') for d in top10[:5]]}..."
|
||||
)
|
||||
print(
|
||||
f" Bot-10 dims by RMS: {[(int(d), f'{col_rms[d]:.4f}') for d in bot10[:5]]}..."
|
||||
)
|
||||
|
||||
# How much do the top 10% of dimensions contribute to total power?
|
||||
total_power = np.sum(col_rms**2)
|
||||
sorted_power = np.sort(col_rms**2)[::-1]
|
||||
top10pct = int(len(col_rms) * 0.1)
|
||||
top10pct_power = np.sum(sorted_power[:top10pct])
|
||||
top1pct = max(1, int(len(col_rms) * 0.01))
|
||||
top1pct_power = np.sum(sorted_power[:top1pct])
|
||||
print(
|
||||
f" Top 10% of dims contribute {top10pct_power / total_power * 100:.1f}% of total power"
|
||||
)
|
||||
print(
|
||||
f" Top 1% of dims contribute {top1pct_power / total_power * 100:.1f}% of total power"
|
||||
)
|
||||
|
||||
# ============================================================================
|
||||
# 6. CROSS-CORRELATION: WEIGHT ERROR × ACTIVATION POWER
|
||||
# ============================================================================
|
||||
print("\n" + "=" * 80)
|
||||
print("SECTION 6: WHERE DO WEIGHT ERRORS MEET HIGH ACTIVATION POWER?")
|
||||
print("=" * 80)
|
||||
print(" For each weight dimension, compute: activation_rms[dim] × weight_error[dim]")
|
||||
print(" This tells us which dimensions contribute most to matmul error.\n")
|
||||
|
||||
# Focus on ffn_down vs ffn_gate for comparison
|
||||
focus = [
|
||||
("ffn_down", "blk_0_ffn_down_weight.f32bin", "act_blk0_ffn_down_input.f32bin"),
|
||||
("ffn_gate", "blk_0_ffn_gate_weight.f32bin", "act_blk0_ffn_input.f32bin"),
|
||||
("ffn_up", "blk_0_ffn_up_weight.f32bin", "act_blk0_ffn_input.f32bin"),
|
||||
("attn_q", "blk_0_attn_q_weight.f32bin", "act_blk0_attn_input.f32bin"),
|
||||
]
|
||||
|
||||
for name, wfile, afile in focus:
|
||||
W = load_f32_tensor(wfile)
|
||||
A = load_f32_tensor(afile)
|
||||
|
||||
if W.shape[1] != A.shape[1]:
|
||||
print(f" {name}: dim mismatch W={W.shape[1]} vs A={A.shape[1]}, SKIP")
|
||||
continue
|
||||
|
||||
nc = W.shape[1]
|
||||
|
||||
# Per-column activation RMS
|
||||
act_rms = np.sqrt(np.mean(A**2, axis=0))
|
||||
|
||||
# Per-column weight std and range (how "hard" to quantize)
|
||||
w_std = W.std(axis=0)
|
||||
w_range = W.max(axis=0) - W.min(axis=0)
|
||||
|
||||
# Per-column weight kurtosis (heavy tails = harder to quantize)
|
||||
w_kurt = (
|
||||
np.mean(((W - W.mean(axis=0)) / (W.std(axis=0) + 1e-10)) ** 4, axis=0) - 3.0
|
||||
)
|
||||
|
||||
# Weight error proxy: with 2-bit uniform quant on 16-element groups
|
||||
# Higher variance columns → more error
|
||||
nr = min(W.shape[0], 256)
|
||||
|
||||
# Simple Q2_K-style error estimate per dimension:
|
||||
# For each group of 16 in the column direction, quantize and measure error
|
||||
dim_mse = np.zeros(nc)
|
||||
for g_start in range(0, nc, GS):
|
||||
g_end = min(g_start + GS, nc)
|
||||
for r in range(nr):
|
||||
g = W[r, g_start:g_end]
|
||||
gmin = min(g.min(), 0) # Q2_K clamps min to ≤0
|
||||
gmax = g.max()
|
||||
gr = gmax - gmin
|
||||
if gr < 1e-10:
|
||||
continue
|
||||
scale = gr / 3.0
|
||||
for i, val in enumerate(g):
|
||||
norm = (val - gmin) / scale
|
||||
idx = max(0, min(3, int(round(norm))))
|
||||
recon = gmin + scale * idx
|
||||
dim_mse[g_start + i] += (val - recon) ** 2
|
||||
|
||||
dim_rmse = np.sqrt(dim_mse / nr)
|
||||
|
||||
# The key metric: dimension-level contribution to matmul error
|
||||
# matmul_error_contribution[d] ≈ act_rms[d] * weight_rmse[d]
|
||||
matmul_contrib = act_rms * dim_rmse
|
||||
|
||||
print(f"\n {name} ({nc} dimensions):")
|
||||
print(
|
||||
f" act_rms: mean={act_rms.mean():.4f}, CV={act_rms.std() / act_rms.mean():.4f}"
|
||||
)
|
||||
print(
|
||||
f" w_rmse: mean={dim_rmse.mean():.6f}, CV={dim_rmse.std() / (dim_rmse.mean() + 1e-10):.4f}"
|
||||
)
|
||||
print(
|
||||
f" matmul_contrib: mean={matmul_contrib.mean():.6f}, "
|
||||
f"std={matmul_contrib.std():.6f}"
|
||||
)
|
||||
|
||||
# Correlation between activation power and weight error
|
||||
corr = np.corrcoef(act_rms, dim_rmse)[0, 1]
|
||||
print(f" CORRELATION act_rms ↔ weight_rmse: {corr:.4f}")
|
||||
print(f" (>0 means high-power dims are also hard to quantize — BAD)")
|
||||
|
||||
# Top contributors to matmul error
|
||||
top_dims = np.argsort(matmul_contrib)[-20:][::-1]
|
||||
print(f" Top-5 error-contributing dimensions:")
|
||||
for d in top_dims[:5]:
|
||||
print(
|
||||
f" dim {d}: act_rms={act_rms[d]:.4f}, w_rmse={dim_rmse[d]:.6f}, "
|
||||
f"contrib={matmul_contrib[d]:.6f}, w_std={w_std[d]:.6f}, w_kurt={w_kurt[d]:.2f}"
|
||||
)
|
||||
|
||||
# Distribution of matmul contributions
|
||||
total_contrib = matmul_contrib.sum()
|
||||
sorted_contrib = np.sort(matmul_contrib)[::-1]
|
||||
for pct in [0.01, 0.05, 0.10, 0.25]:
|
||||
n = max(1, int(nc * pct))
|
||||
print(
|
||||
f" Top {pct * 100:.0f}% dims: {sorted_contrib[:n].sum() / total_contrib * 100:.1f}% "
|
||||
f"of total matmul error"
|
||||
)
|
||||
|
||||
# ============================================================================
|
||||
# 7. THE STRUCTURAL ASYMMETRY: COLUMN DIRECTION GROUP ANALYSIS
|
||||
# ============================================================================
|
||||
print("\n" + "=" * 80)
|
||||
print("SECTION 7: STRUCTURAL ASYMMETRY — COLUMN vs ROW GROUPING")
|
||||
print("=" * 80)
|
||||
print(" Quantization groups along the ROW (inner dim). For ffn_down,")
|
||||
print(" each row has 9728 elements (38 groups of 256).")
|
||||
print(" For ffn_gate, each row has 2560 elements (10 groups of 256).")
|
||||
print(" More groups = more metadata (scales/offsets) relative to data bits.\n")
|
||||
|
||||
for name, wfile, afile in focus:
|
||||
W = load_f32_tensor(wfile)
|
||||
nc = W.shape[1]
|
||||
n_groups_per_row = nc // 256 # super-blocks per row
|
||||
|
||||
print(f"\n {name}: {nc} cols → {n_groups_per_row} super-blocks per row")
|
||||
print(f" Groups per row: {nc // GS} (16-element groups)")
|
||||
print(
|
||||
f" With Q2_K (2.625 bpw): {n_groups_per_row * 2} scale+offset bytes per row"
|
||||
)
|
||||
|
||||
# How much do group means vary WITHIN a row?
|
||||
nr = min(W.shape[0], 64)
|
||||
intra_row_mean_var = []
|
||||
for r in range(nr):
|
||||
group_means = []
|
||||
for g_start in range(0, nc, GS):
|
||||
group_means.append(W[r, g_start : g_start + GS].mean())
|
||||
group_means = np.array(group_means)
|
||||
intra_row_mean_var.append(group_means.std())
|
||||
|
||||
print(
|
||||
f" Intra-row group mean variability (avg across rows): "
|
||||
f"mean={np.mean(intra_row_mean_var):.6f}"
|
||||
)
|
||||
|
||||
# How much does the sign of group means vary?
|
||||
pos_frac = 0
|
||||
neg_frac = 0
|
||||
total_groups = 0
|
||||
for r in range(nr):
|
||||
for g_start in range(0, nc, GS):
|
||||
gm = W[r, g_start : g_start + GS].mean()
|
||||
if gm > 0.001:
|
||||
pos_frac += 1
|
||||
elif gm < -0.001:
|
||||
neg_frac += 1
|
||||
total_groups += 1
|
||||
print(
|
||||
f" Group mean sign: {pos_frac / total_groups * 100:.1f}% positive, "
|
||||
f"{neg_frac / total_groups * 100:.1f}% negative, "
|
||||
f"{(1 - pos_frac / total_groups - neg_frac / total_groups) * 100:.1f}% near-zero"
|
||||
)
|
||||
|
||||
# ============================================================================
|
||||
# 8. THE SWIGLU EFFECT: WHY ffn_down INPUT IS SPECIAL
|
||||
# ============================================================================
|
||||
print("\n" + "=" * 80)
|
||||
print("SECTION 8: THE SWIGLU EFFECT — ffn_down ACTIVATION STRUCTURE")
|
||||
print("=" * 80)
|
||||
print(" ffn_down's activation is the SwiGLU output: silu(gate) * up")
|
||||
print(" This creates a specific activation pattern that differs from")
|
||||
print(" raw FFN input (RMSNorm output).\n")
|
||||
|
||||
if "ffn_input (gate/up)" in act_data and "ffn_down_input (swiglu)" in act_data:
|
||||
A_in = act_data["ffn_input (gate/up)"]
|
||||
A_swiglu = act_data["ffn_down_input (swiglu)"]
|
||||
|
||||
print(f" FFN input (RMSNorm output): {A_in.shape}")
|
||||
print(f" SwiGLU output: {A_swiglu.shape}")
|
||||
|
||||
# Per-token analysis
|
||||
for t in range(min(A_swiglu.shape[0], 3)):
|
||||
tok_in = A_in[t]
|
||||
tok_sw = A_swiglu[t]
|
||||
print(f"\n Token {t}:")
|
||||
print(
|
||||
f" FFN input: mean={tok_in.mean():.6f}, std={tok_in.std():.6f}, "
|
||||
f"|max|={np.abs(tok_in).max():.6f}"
|
||||
)
|
||||
print(
|
||||
f" SwiGLU out: mean={tok_sw.mean():.6f}, std={tok_sw.std():.6f}, "
|
||||
f"|max|={np.abs(tok_sw).max():.6f}"
|
||||
)
|
||||
|
||||
# SwiGLU creates lots of near-zero values (silu suppresses negatives)
|
||||
frac_nearzero_sw = np.mean(np.abs(tok_sw) < 0.01 * tok_sw.std())
|
||||
frac_nearzero_in = np.mean(np.abs(tok_in) < 0.01 * tok_in.std())
|
||||
print(
|
||||
f" Near-zero fraction: FFN input={frac_nearzero_in:.3f}, "
|
||||
f"SwiGLU={frac_nearzero_sw:.3f}"
|
||||
)
|
||||
|
||||
# Sparsity pattern
|
||||
frac_neg = np.mean(tok_sw < 0)
|
||||
print(f" SwiGLU negative fraction: {frac_neg:.3f}")
|
||||
|
||||
# Dimension-level analysis of SwiGLU
|
||||
print(f"\n Dimension-level SwiGLU properties:")
|
||||
dim_mean_sw = A_swiglu.mean(axis=0)
|
||||
dim_std_sw = A_swiglu.std(axis=0)
|
||||
dim_sparsity = np.mean(A_swiglu < 0, axis=0) # fraction of tokens negative per dim
|
||||
|
||||
print(f" Dim mean range: [{dim_mean_sw.min():.6f}, {dim_mean_sw.max():.6f}]")
|
||||
print(f" Dim std range: [{dim_std_sw.min():.6f}, {dim_std_sw.max():.6f}]")
|
||||
print(
|
||||
f" Dim negative fraction: mean={dim_sparsity.mean():.3f}, "
|
||||
f"range=[{dim_sparsity.min():.3f}, {dim_sparsity.max():.3f}]"
|
||||
)
|
||||
|
||||
# Highly sparse dimensions (mostly near-zero after SwiGLU)
|
||||
high_sparsity = np.sum(dim_sparsity > 0.7)
|
||||
low_sparsity = np.sum(dim_sparsity < 0.3)
|
||||
print(f" Dims with >70% negative tokens: {high_sparsity}/{len(dim_sparsity)}")
|
||||
print(f" Dims with <30% negative tokens: {low_sparsity}/{len(dim_sparsity)}")
|
||||
|
||||
# ============================================================================
|
||||
# 9. QUANTIZATION NOISE × ACTIVATION POWER: THE MATMUL ERROR DECOMPOSITION
|
||||
# ============================================================================
|
||||
print("\n" + "=" * 80)
|
||||
print("SECTION 9: MATMUL ERROR DECOMPOSITION")
|
||||
print("=" * 80)
|
||||
print(
|
||||
" matmul_error ≈ sum over groups of (activation_power_in_group × "
|
||||
"weight_mse_in_group)"
|
||||
)
|
||||
print(
|
||||
" If activation power is concentrated in groups with high weight error, "
|
||||
"matmul error explodes.\n"
|
||||
)
|
||||
|
||||
# For ffn_down specifically, compare where activation power sits vs weight error
|
||||
W_down = load_f32_tensor("blk_0_ffn_down_weight.f32bin")
|
||||
A_swiglu = load_f32_tensor("act_blk0_ffn_down_input.f32bin")
|
||||
|
||||
W_gate = load_f32_tensor("blk_0_ffn_gate_weight.f32bin")
|
||||
A_ffn_in = load_f32_tensor("act_blk0_ffn_input.f32bin")
|
||||
|
||||
for label, W, A in [("ffn_down", W_down, A_swiglu), ("ffn_gate", W_gate, A_ffn_in)]:
|
||||
nc = W.shape[1]
|
||||
nr = min(W.shape[0], 128)
|
||||
|
||||
# Compute per-superblock (256) activation power and weight error
|
||||
n_sb = nc // 256
|
||||
sb_act_power = np.zeros(n_sb)
|
||||
sb_weight_mse = np.zeros(n_sb)
|
||||
|
||||
for sb in range(n_sb):
|
||||
s = sb * 256
|
||||
e = s + 256
|
||||
# Activation power: mean squared activation in this region
|
||||
sb_act_power[sb] = np.mean(A[:, s:e] ** 2)
|
||||
|
||||
# Weight MSE: Q2_K-style uniform quant error
|
||||
mse = 0
|
||||
count = 0
|
||||
for r in range(nr):
|
||||
for g in range(0, 256, GS):
|
||||
gvals = W[r, s + g : s + g + GS]
|
||||
gmin = min(gvals.min(), 0)
|
||||
gmax = gvals.max()
|
||||
gr = gmax - gmin
|
||||
if gr < 1e-10:
|
||||
continue
|
||||
scale = gr / 3.0
|
||||
for v in gvals:
|
||||
norm = (v - gmin) / scale
|
||||
idx = max(0, min(3, int(round(norm))))
|
||||
recon = gmin + scale * idx
|
||||
mse += (v - recon) ** 2
|
||||
count += 1
|
||||
sb_weight_mse[sb] = mse / max(count, 1)
|
||||
|
||||
# Correlation between activation power and weight error across super-blocks
|
||||
valid = sb_act_power > 1e-10
|
||||
if valid.sum() > 10:
|
||||
corr = np.corrcoef(np.sqrt(sb_act_power[valid]), np.sqrt(sb_weight_mse[valid]))[
|
||||
0, 1
|
||||
]
|
||||
else:
|
||||
corr = 0
|
||||
|
||||
print(f"\n {label}:")
|
||||
print(f" Super-blocks: {n_sb}")
|
||||
print(
|
||||
f" act_power: mean={sb_act_power.mean():.6f}, "
|
||||
f"std={np.sqrt(sb_act_power.var()):.6f}, "
|
||||
f"range=[{sb_act_power.min():.6f}, {sb_act_power.max():.6f}]"
|
||||
)
|
||||
print(
|
||||
f" weight_mse: mean={sb_weight_mse.mean():.6f}, "
|
||||
f"range=[{sb_weight_mse.min():.6f}, {sb_weight_mse.max():.6f}]"
|
||||
)
|
||||
print(f" CORRELATION (act_power ↔ weight_mse): {corr:.4f}")
|
||||
|
||||
# Show top-5 super-blocks by contribution to matmul error
|
||||
contrib = sb_act_power * sb_weight_mse
|
||||
top5 = np.argsort(contrib)[-5:][::-1]
|
||||
print(f" Top-5 error-contributing super-blocks (of {n_sb}):")
|
||||
for idx in top5:
|
||||
print(
|
||||
f" SB {idx * 256}-{(idx + 1) * 256 - 1}: act_power={sb_act_power[idx]:.6f}, "
|
||||
f"weight_mse={sb_weight_mse[idx]:.6f}, contrib={contrib[idx]:.6f}"
|
||||
)
|
||||
|
||||
print("\n" + "=" * 80)
|
||||
print("ANALYSIS COMPLETE")
|
||||
print("=" * 80)
|
||||
|
|
@ -0,0 +1,105 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Compute imatrix (importance matrix) from captured activation tensors.
|
||||
|
||||
The imatrix is the per-dimension sum-of-squares of the activations.
|
||||
It's what upstream llama.cpp uses to weight quantization optimization.
|
||||
|
||||
For each activation file act_blkL_*.f32bin, produces imatrix_blkL_<role>.f32bin
|
||||
where <role> matches the weight tensor it multiplies with.
|
||||
|
||||
Format: flat float32 array of length n_per_row, one importance value per dimension.
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import struct
|
||||
import os
|
||||
|
||||
DATA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "data")
|
||||
|
||||
|
||||
def load_f32_tensor(name):
|
||||
path = os.path.join(DATA_DIR, name)
|
||||
with open(path, "rb") as f:
|
||||
nrow, ncol = struct.unpack("qq", f.read(16))
|
||||
data = np.frombuffer(f.read(), dtype=np.float32)
|
||||
assert len(data) == nrow * ncol
|
||||
return data.reshape(nrow, ncol)
|
||||
|
||||
|
||||
def save_imatrix(name, data):
|
||||
path = os.path.join(DATA_DIR, name)
|
||||
data.astype(np.float32).tofile(path)
|
||||
print(
|
||||
f" Wrote {path}: {len(data)} dims, "
|
||||
f"min={data.min():.6f}, max={data.max():.6f}, mean={data.mean():.6f}"
|
||||
)
|
||||
|
||||
|
||||
# Mapping: activation file → imatrix files for each weight it multiplies with
|
||||
# Each weight tensor's column dimension matches the activation's column dimension
|
||||
mappings = [
|
||||
{
|
||||
"act_file": "act_blk0_ffn_input.f32bin",
|
||||
"imatrix_name": "imatrix_blk0_ffn_gate_up.f32bin",
|
||||
"description": "ffn_gate and ffn_up (both use ffn_input activation)",
|
||||
},
|
||||
{
|
||||
"act_file": "act_blk0_ffn_down_input.f32bin",
|
||||
"imatrix_name": "imatrix_blk0_ffn_down.f32bin",
|
||||
"description": "ffn_down (uses SwiGLU activation)",
|
||||
},
|
||||
{
|
||||
"act_file": "act_blk0_attn_input.f32bin",
|
||||
"imatrix_name": "imatrix_blk0_attn_qkv.f32bin",
|
||||
"description": "attn_q, attn_k, attn_v (all use attn_input activation)",
|
||||
},
|
||||
{
|
||||
"act_file": "act_blk0_attn_output_input.f32bin",
|
||||
"imatrix_name": "imatrix_blk0_attn_output.f32bin",
|
||||
"description": "attn_output (uses kqv_out activation)",
|
||||
},
|
||||
]
|
||||
|
||||
print("Computing imatrix from captured activations")
|
||||
print("=" * 60)
|
||||
|
||||
for m in mappings:
|
||||
try:
|
||||
A = load_f32_tensor(m["act_file"])
|
||||
print(f"\n{m['description']}:")
|
||||
print(f" Activation: {A.shape[0]} tokens × {A.shape[1]} dims")
|
||||
|
||||
# imatrix = sum over tokens of activation^2
|
||||
# This is the standard definition used by llama.cpp
|
||||
imatrix = np.sum(A**2, axis=0)
|
||||
|
||||
# Also compute per-dim RMS for reference
|
||||
rms = np.sqrt(np.mean(A**2, axis=0))
|
||||
|
||||
print(
|
||||
f" Imatrix stats: min={imatrix.min():.6f}, max={imatrix.max():.6f}, "
|
||||
f"mean={imatrix.mean():.6f}, std={imatrix.std():.6f}"
|
||||
)
|
||||
print(
|
||||
f" RMS stats: min={rms.min():.6f}, max={rms.max():.6f}, "
|
||||
f"mean={rms.mean():.6f}"
|
||||
)
|
||||
|
||||
# Concentration metrics
|
||||
total = imatrix.sum()
|
||||
sorted_im = np.sort(imatrix)[::-1]
|
||||
top1pct = max(1, int(len(imatrix) * 0.01))
|
||||
top10pct = max(1, int(len(imatrix) * 0.10))
|
||||
print(f" Power concentration:")
|
||||
print(
|
||||
f" Top 1% dims ({top1pct}): {sorted_im[:top1pct].sum() / total * 100:.1f}% of total"
|
||||
)
|
||||
print(
|
||||
f" Top 10% dims ({top10pct}): {sorted_im[:top10pct].sum() / total * 100:.1f}% of total"
|
||||
)
|
||||
|
||||
save_imatrix(m["imatrix_name"], imatrix)
|
||||
except Exception as e:
|
||||
print(f" SKIP: {e}")
|
||||
|
||||
print("\nDone.")
|
||||
|
|
@ -0,0 +1,210 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Extract real activation tensors by running a forward pass through the model.
|
||||
|
||||
Captures the INPUT activations to specific weight tensors (the vectors that get
|
||||
multiplied by the weight matrix). These are what matter for quantization quality:
|
||||
quantization error * activation magnitude = output error.
|
||||
|
||||
Usage:
|
||||
python3 scripts/extract-activations.py MODEL.gguf OUTPUT_DIR [--prompt TEXT] [--layer N]
|
||||
|
||||
Output:
|
||||
For each target tensor, writes a .f32bin file with header:
|
||||
int64_t n_rows, int64_t row_len
|
||||
followed by n_rows * row_len float32 values.
|
||||
n_rows = number of tokens, row_len = hidden dimension.
|
||||
|
||||
NOTE: This uses a simplified forward pass (no KV cache, single prompt).
|
||||
Activations are extracted from after the norm layers (the actual matmul inputs).
|
||||
"""
|
||||
import sys
|
||||
import os
|
||||
import struct
|
||||
import numpy as np
|
||||
|
||||
script_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
repo_root = os.path.dirname(script_dir)
|
||||
sys.path.insert(0, os.path.join(repo_root, 'gguf-py'))
|
||||
|
||||
from gguf import GGUFReader
|
||||
|
||||
|
||||
def bf16_to_f32(raw_bytes):
|
||||
"""Convert raw BF16 bytes to float32 numpy array."""
|
||||
bf16 = np.frombuffer(raw_bytes, dtype=np.uint16)
|
||||
f32_bits = bf16.astype(np.uint32) << 16
|
||||
return f32_bits.view(np.float32)
|
||||
|
||||
|
||||
def rms_norm(x, weight, eps=1e-6):
|
||||
"""RMS normalization (Qwen3/Llama style)."""
|
||||
rms = np.sqrt(np.mean(x * x, axis=-1, keepdims=True) + eps)
|
||||
return (x / rms) * weight
|
||||
|
||||
|
||||
def silu(x):
|
||||
"""SiLU activation."""
|
||||
return x / (1.0 + np.exp(-np.clip(x, -88, 88)))
|
||||
|
||||
|
||||
def softmax(x, axis=-1):
|
||||
"""Numerically stable softmax."""
|
||||
x_max = np.max(x, axis=axis, keepdims=True)
|
||||
e = np.exp(x - x_max)
|
||||
return e / np.sum(e, axis=axis, keepdims=True)
|
||||
|
||||
|
||||
def main():
|
||||
if len(sys.argv) < 3:
|
||||
print(f"Usage: {sys.argv[0]} MODEL.gguf OUTPUT_DIR [--prompt TEXT] [--layer N]")
|
||||
sys.exit(1)
|
||||
|
||||
model_path = sys.argv[1]
|
||||
output_dir = sys.argv[2]
|
||||
prompt_text = "The quick brown fox jumps over the lazy dog. In a distant galaxy, scientists discovered"
|
||||
target_layer = 16
|
||||
|
||||
for i in range(3, len(sys.argv)):
|
||||
if sys.argv[i] == "--prompt" and i + 1 < len(sys.argv):
|
||||
prompt_text = sys.argv[i + 1]
|
||||
elif sys.argv[i] == "--layer" and i + 1 < len(sys.argv):
|
||||
target_layer = int(sys.argv[i + 1])
|
||||
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
|
||||
print(f"Loading {model_path}...")
|
||||
reader = GGUFReader(model_path)
|
||||
|
||||
# Read model config from metadata
|
||||
config = {}
|
||||
for kv in reader.fields.values():
|
||||
if hasattr(kv, 'parts') and len(kv.parts) > 0:
|
||||
name = kv.name
|
||||
if 'block_count' in name:
|
||||
config['n_layer'] = int(kv.parts[-1][0])
|
||||
elif 'embedding_length' in name:
|
||||
config['hidden'] = int(kv.parts[-1][0])
|
||||
elif 'feed_forward_length' in name:
|
||||
config['ffn'] = int(kv.parts[-1][0])
|
||||
elif 'head_count_kv' in name:
|
||||
config['n_kv_heads'] = int(kv.parts[-1][0])
|
||||
elif 'head_count' in name and 'kv' not in name:
|
||||
config['n_heads'] = int(kv.parts[-1][0])
|
||||
elif 'key_length' in name:
|
||||
config['head_dim'] = int(kv.parts[-1][0])
|
||||
elif 'layer_norm_rms_epsilon' in name:
|
||||
config['eps'] = float(kv.parts[-1][0])
|
||||
|
||||
print(f"Config: {config}")
|
||||
hidden = config['hidden']
|
||||
|
||||
# Load tensors into a dict
|
||||
def load_tensor(name):
|
||||
for t in reader.tensors:
|
||||
if t.name == name:
|
||||
raw = bytes(t.data)
|
||||
shape = [int(s) for s in t.shape]
|
||||
n_el = int(t.n_elements)
|
||||
if t.tensor_type.name == 'BF16':
|
||||
flat = bf16_to_f32(raw)
|
||||
elif t.tensor_type.name == 'F16':
|
||||
flat = np.frombuffer(raw, dtype=np.float16).astype(np.float32)
|
||||
elif t.tensor_type.name == 'F32':
|
||||
flat = np.frombuffer(raw, dtype=np.float32)
|
||||
else:
|
||||
raise ValueError(f"Unsupported type: {t.tensor_type.name}")
|
||||
assert flat.shape[0] == n_el, f"Expected {n_el} elements, got {flat.shape[0]}"
|
||||
if len(shape) == 1:
|
||||
return flat.copy()
|
||||
return flat.reshape(list(reversed(shape))).copy()
|
||||
raise KeyError(f"Tensor {name} not found")
|
||||
|
||||
# Create simple token IDs from the prompt (use first few tokens from vocab)
|
||||
# We just need realistic activations, not perfect tokenization
|
||||
n_tokens = min(32, len(prompt_text.split()))
|
||||
print(f"Using {n_tokens} pseudo-tokens for activation extraction")
|
||||
|
||||
# Load token embedding and create input
|
||||
print("Loading token_embd...")
|
||||
token_embd = load_tensor("token_embd.weight") # [vocab, hidden]
|
||||
# Use token IDs 100-131 (arbitrary but avoids special tokens)
|
||||
token_ids = list(range(100, 100 + n_tokens))
|
||||
x = token_embd[token_ids] # [n_tokens, hidden]
|
||||
print(f"Input shape: {x.shape}")
|
||||
|
||||
# Run forward pass through target layer only (we just need the activations)
|
||||
layer = target_layer
|
||||
print(f"\nProcessing layer {layer}...")
|
||||
|
||||
def save_activation(name, data):
|
||||
"""Save activation tensor as f32bin."""
|
||||
if data.ndim == 1:
|
||||
data = data.reshape(1, -1)
|
||||
n_rows, row_len = data.shape
|
||||
fname = os.path.join(output_dir, name + ".f32bin")
|
||||
with open(fname, 'wb') as fp:
|
||||
fp.write(struct.pack('<qq', n_rows, row_len))
|
||||
data.astype(np.float32).tofile(fp)
|
||||
print(f" Saved {fname}: {n_rows} x {row_len} ({os.path.getsize(fname) / 1024:.1f} KB)")
|
||||
|
||||
# Attention norm → input to attn_q/k/v
|
||||
attn_norm_w = load_tensor(f"blk.{layer}.attn_norm.weight")
|
||||
x_normed = rms_norm(x, attn_norm_w, config.get('eps', 1e-6))
|
||||
save_activation(f"act_blk{layer}_attn_input", x_normed)
|
||||
|
||||
# Compute Q, K, V to get post-attention residual
|
||||
W_q = load_tensor(f"blk.{layer}.attn_q.weight") # [n_heads*head_dim, hidden]
|
||||
W_k = load_tensor(f"blk.{layer}.attn_k.weight") # [n_kv_heads*head_dim, hidden]
|
||||
W_v = load_tensor(f"blk.{layer}.attn_v.weight") # [n_kv_heads*head_dim, hidden]
|
||||
W_o = load_tensor(f"blk.{layer}.attn_output.weight") # [hidden, n_heads*head_dim]
|
||||
|
||||
Q = x_normed @ W_q.T # [n_tokens, n_heads*head_dim]
|
||||
K = x_normed @ W_k.T
|
||||
V = x_normed @ W_v.T
|
||||
|
||||
# Simplified attention (no RoPE, no mask, no GQA — just need rough activations)
|
||||
n_heads = config['n_heads']
|
||||
head_dim = config['head_dim']
|
||||
Q_h = Q.reshape(n_tokens, n_heads, head_dim)
|
||||
K_h = K.reshape(n_tokens, config['n_kv_heads'], head_dim)
|
||||
V_h = V.reshape(n_tokens, config['n_kv_heads'], head_dim)
|
||||
|
||||
# Repeat KV heads for GQA
|
||||
rep = n_heads // config['n_kv_heads']
|
||||
K_h = np.repeat(K_h, rep, axis=1)
|
||||
V_h = np.repeat(V_h, rep, axis=1)
|
||||
|
||||
# Attention scores and output
|
||||
scores = np.einsum('thd,shd->ths', Q_h, K_h) / np.sqrt(head_dim)
|
||||
attn_w = softmax(scores, axis=-1)
|
||||
attn_out = np.einsum('ths,shd->thd', attn_w, V_h).reshape(n_tokens, -1)
|
||||
|
||||
# attn_output weight input
|
||||
save_activation(f"act_blk{layer}_attn_output_input", attn_out)
|
||||
|
||||
# Project and add residual
|
||||
attn_proj = attn_out @ W_o.T
|
||||
x = x + attn_proj
|
||||
|
||||
# FFN norm → input to ffn_gate/ffn_up
|
||||
ffn_norm_w = load_tensor(f"blk.{layer}.ffn_norm.weight")
|
||||
x_ffn = rms_norm(x, ffn_norm_w, config.get('eps', 1e-6))
|
||||
save_activation(f"act_blk{layer}_ffn_input", x_ffn)
|
||||
|
||||
# FFN: gate and up projections
|
||||
W_gate = load_tensor(f"blk.{layer}.ffn_gate.weight") # [ffn, hidden]
|
||||
W_up = load_tensor(f"blk.{layer}.ffn_up.weight") # [ffn, hidden]
|
||||
W_down = load_tensor(f"blk.{layer}.ffn_down.weight") # [hidden, ffn]
|
||||
|
||||
gate = x_ffn @ W_gate.T
|
||||
up = x_ffn @ W_up.T
|
||||
ffn_act = silu(gate) * up # SwiGLU activation
|
||||
|
||||
# ffn_down weight input (the SwiGLU output)
|
||||
save_activation(f"act_blk{layer}_ffn_down_input", ffn_act)
|
||||
|
||||
print(f"\nDone! Extracted 4 activation tensors to {output_dir}/")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -0,0 +1,74 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Extract tensor data from GGUF as raw f32 binary files for C++ testing.
|
||||
|
||||
Usage:
|
||||
python3 scripts/extract-tensor-data.py MODEL.gguf pattern1 [pattern2 ...]
|
||||
|
||||
Output:
|
||||
For each matching tensor, writes a .f32bin file with header:
|
||||
int64_t n_rows, int64_t row_len
|
||||
followed by n_rows * row_len float32 values.
|
||||
"""
|
||||
import sys
|
||||
import os
|
||||
import numpy as np
|
||||
|
||||
# Support running from build/ or repo root
|
||||
script_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
repo_root = os.path.dirname(script_dir)
|
||||
sys.path.insert(0, os.path.join(repo_root, 'gguf-py'))
|
||||
|
||||
from gguf import GGUFReader
|
||||
|
||||
def main():
|
||||
if len(sys.argv) < 3:
|
||||
print(f"Usage: {sys.argv[0]} MODEL.gguf pattern1 [pattern2 ...]")
|
||||
print(f" Extracts tensors whose names contain any of the given patterns.")
|
||||
sys.exit(1)
|
||||
|
||||
model_path = sys.argv[1]
|
||||
patterns = sys.argv[2:]
|
||||
|
||||
print(f"Reading {model_path}...")
|
||||
reader = GGUFReader(model_path)
|
||||
|
||||
for tensor in reader.tensors:
|
||||
if not any(p in tensor.name for p in patterns):
|
||||
continue
|
||||
|
||||
print(f"\nExtracting: {tensor.name}")
|
||||
print(f" Shape: {list(tensor.shape)}, type: {tensor.tensor_type.name}")
|
||||
|
||||
# Convert to f32
|
||||
raw = np.array(tensor.data, dtype=np.uint8)
|
||||
|
||||
if tensor.tensor_type.name == 'BF16':
|
||||
bf16_vals = raw.view(np.uint16)
|
||||
f32_bits = bf16_vals.astype(np.uint32) << 16
|
||||
f32_vals = f32_bits.view(np.float32)
|
||||
elif tensor.tensor_type.name == 'F16':
|
||||
f16_vals = raw.view(np.float16)
|
||||
f32_vals = f16_vals.astype(np.float32)
|
||||
elif tensor.tensor_type.name == 'F32':
|
||||
f32_vals = raw.view(np.float32)
|
||||
else:
|
||||
print(f" SKIP: unsupported type {tensor.tensor_type.name}")
|
||||
continue
|
||||
|
||||
# Determine layout: GGUF stores shape as [col, row] for 2D
|
||||
row_len = int(tensor.shape[0])
|
||||
n_rows = tensor.n_elements // row_len
|
||||
|
||||
fname = tensor.name.replace(".", "_") + ".f32bin"
|
||||
with open(fname, 'wb') as fp:
|
||||
fp.write(np.array([n_rows, row_len], dtype=np.int64).tobytes())
|
||||
f32_vals.tofile(fp)
|
||||
|
||||
file_size = os.path.getsize(fname)
|
||||
print(f" Wrote {fname}: {n_rows} rows x {row_len} cols = {tensor.n_elements} elements")
|
||||
print(f" File size: {file_size / (1024*1024):.1f} MB")
|
||||
print(f" Stats: mean={f32_vals.mean():.6f}, std={f32_vals.std():.6f}, "
|
||||
f"min={f32_vals.min():.6f}, max={f32_vals.max():.6f}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -511,6 +511,7 @@ public:
|
|||
std::map<llama_seq_id, llama_sampler *> samplers;
|
||||
};
|
||||
|
||||
|
||||
//
|
||||
// llm_graph_result
|
||||
//
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@
|
|||
|
||||
#include "ggml-alloc.h"
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
#include "gguf.h"
|
||||
#include "llama-hparams.h"
|
||||
|
||||
|
|
@ -61,6 +62,13 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
|
|||
case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_Q3_PT: return "Q3_PT - 3.25 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_Q3_KPT: return "Q3_KPT - Q3_K with learned levels";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_DPT: return "Q4_DPT - IQ4_NL with learned levels";
|
||||
case LLAMA_FTYPE_MOSTLY_Q2_KPT: return "Q2_KPT - Q2_K with learned levels";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ2_TQ: return "IQ2_TQ - 2.0625 bpw trellis quantized";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_TQ: return "IQ3_TQ - 3.5625 bpw per-tensor trained grid";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ1_BN: return "IQ1_BN - 1.5625 bpw 8D vector quantized";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw";
|
||||
|
|
@ -758,6 +766,13 @@ llama_model_loader::llama_model_loader(
|
|||
case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
|
||||
case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
|
||||
case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
|
||||
case GGML_TYPE_Q3_PT: ftype = LLAMA_FTYPE_MOSTLY_Q3_PT; break;
|
||||
case GGML_TYPE_Q3_KPT: ftype = LLAMA_FTYPE_MOSTLY_Q3_KPT; break;
|
||||
case GGML_TYPE_Q4_DPT: ftype = LLAMA_FTYPE_MOSTLY_Q4_DPT; break;
|
||||
case GGML_TYPE_Q2_KPT: ftype = LLAMA_FTYPE_MOSTLY_Q2_KPT; break;
|
||||
case GGML_TYPE_IQ2_TQ: ftype = LLAMA_FTYPE_MOSTLY_IQ2_TQ; break;
|
||||
case GGML_TYPE_IQ3_TQ: ftype = LLAMA_FTYPE_MOSTLY_IQ3_TQ; break;
|
||||
case GGML_TYPE_IQ1_BN: ftype = LLAMA_FTYPE_MOSTLY_IQ1_BN; break;
|
||||
case GGML_TYPE_NVFP4: ftype = LLAMA_FTYPE_MOSTLY_NVFP4; break;
|
||||
case GGML_TYPE_Q1_0: ftype = LLAMA_FTYPE_MOSTLY_Q1_0; break;
|
||||
default:
|
||||
|
|
|
|||
|
|
@ -21,6 +21,7 @@
|
|||
// TODO: tmp until the ggml meta backend matures and becomes public
|
||||
#include "../src/ggml-ext.h"
|
||||
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cfloat>
|
||||
|
|
@ -8247,6 +8248,175 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
}
|
||||
}
|
||||
|
||||
// Load per-tensor quantization auxiliary data (levels/kvalues) from GGUF metadata.
|
||||
// Indexed by weight tensor pointer for direct lookup during inference.
|
||||
{
|
||||
// Build tensor name to tensor pointer map
|
||||
std::unordered_map<std::string, ggml_tensor*> name_to_tensor;
|
||||
for (auto & [ctx, buf_map] : ctx_buf_maps) {
|
||||
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
||||
name_to_tensor[ggml_get_name(t)] = t;
|
||||
}
|
||||
}
|
||||
|
||||
struct level_type_info {
|
||||
ggml_type type;
|
||||
const char * gguf_key;
|
||||
size_t n_levels; // number of level values per tensor
|
||||
size_t elem_bytes; // size of each level value
|
||||
};
|
||||
|
||||
const level_type_info level_types[] = {
|
||||
{ GGML_TYPE_Q3_PT, "q3_pt.levels", 8, sizeof(float) },
|
||||
{ GGML_TYPE_Q3_KPT, "q3_kpt.levels", 8, sizeof(float) },
|
||||
{ GGML_TYPE_Q4_DPT, "q4_dpt.levels", 16, sizeof(int8_t) },
|
||||
};
|
||||
|
||||
for (const auto & lt : level_types) {
|
||||
int64_t lv_idx = gguf_find_key(ml.metadata, lt.gguf_key);
|
||||
if (lv_idx < 0) { continue; }
|
||||
|
||||
const uint8_t * lv_raw = (const uint8_t *)gguf_get_arr_data(ml.metadata, lv_idx);
|
||||
const size_t lv_arr_n = gguf_get_arr_n(ml.metadata, lv_idx);
|
||||
|
||||
size_t tensor_count = 0;
|
||||
|
||||
// Iterate over GGUF slots to find matching tensors
|
||||
for (size_t gguf_slot = 0; gguf_slot < lv_arr_n / lt.n_levels; ++gguf_slot) {
|
||||
std::string tensor_name = gguf_get_tensor_name(ml.metadata, gguf_slot);
|
||||
auto it = name_to_tensor.find(tensor_name);
|
||||
if (it == name_to_tensor.end()) { continue; }
|
||||
|
||||
ggml_tensor* t = it->second;
|
||||
if (t->type != lt.type) { continue; }
|
||||
|
||||
const size_t gguf_offset = gguf_slot * lt.n_levels;
|
||||
|
||||
// Store directly indexed by tensor pointer
|
||||
auto & aux = tensor_aux_data[t];
|
||||
aux.type = lt.type;
|
||||
aux.host_data.assign(
|
||||
lv_raw + gguf_offset * lt.elem_bytes,
|
||||
lv_raw + (gguf_offset + lt.n_levels) * lt.elem_bytes
|
||||
);
|
||||
aux.aux_tensor = nullptr;
|
||||
|
||||
// Set quant_levels directly on the tensor
|
||||
t->quant_levels = aux.host_data.data();
|
||||
|
||||
tensor_count++;
|
||||
}
|
||||
|
||||
if (tensor_count > 0) {
|
||||
LLAMA_LOG_INFO("%s: loaded %zu %s per-tensor level tables\n",
|
||||
__func__, tensor_count, lt.gguf_key);
|
||||
}
|
||||
}
|
||||
|
||||
// Q2_KPT: per-block levels stored as per-tensor GGUF keys "{tensor_name}.q2kpt_levels"
|
||||
// Each key holds n_blocks * Q2KPT_N_LEVELS floats for that tensor (4 floats per 256-element block).
|
||||
{
|
||||
size_t q2kpt_loaded = 0;
|
||||
for (auto & [tname, t] : name_to_tensor) {
|
||||
if (t->type != GGML_TYPE_Q2_KPT) { continue; }
|
||||
const std::string key = tname + ".q2kpt_levels";
|
||||
int64_t lv_idx = gguf_find_key(ml.metadata, key.c_str());
|
||||
if (lv_idx < 0) { continue; }
|
||||
|
||||
const uint8_t * lv_raw = (const uint8_t *)gguf_get_arr_data(ml.metadata, lv_idx);
|
||||
const size_t lv_n = gguf_get_arr_n(ml.metadata, lv_idx);
|
||||
|
||||
auto & aux = tensor_aux_data[t];
|
||||
aux.type = GGML_TYPE_Q2_KPT;
|
||||
aux.host_data.assign(lv_raw, lv_raw + lv_n * sizeof(float));
|
||||
aux.aux_tensor = nullptr;
|
||||
t->quant_levels = aux.host_data.data();
|
||||
q2kpt_loaded++;
|
||||
}
|
||||
if (q2kpt_loaded > 0) {
|
||||
LLAMA_LOG_INFO("%s: loaded %zu Q2_KPT per-block level tables\n", __func__, q2kpt_loaded);
|
||||
}
|
||||
}
|
||||
|
||||
// IQ2_TQ: per-tensor trained grid (16 × 4 int8 = 64 bytes)
|
||||
{
|
||||
size_t iq2tq_loaded = 0;
|
||||
for (auto & [tname, t] : name_to_tensor) {
|
||||
if (t->type != GGML_TYPE_IQ2_TQ) { continue; }
|
||||
|
||||
const std::string grid_key = "iq2tq.grid." + tname;
|
||||
int64_t grid_idx = gguf_find_key(ml.metadata, grid_key.c_str());
|
||||
if (grid_idx < 0) { continue; }
|
||||
|
||||
auto & taux = tensor_aux_data[t];
|
||||
taux.type = GGML_TYPE_IQ2_TQ;
|
||||
taux.host_data.resize(64);
|
||||
const int8_t * grid_data = (const int8_t *)gguf_get_arr_data(ml.metadata, grid_idx);
|
||||
memcpy(taux.host_data.data(), grid_data, 64);
|
||||
|
||||
t->quant_levels = taux.host_data.data();
|
||||
iq2tq_loaded++;
|
||||
}
|
||||
if (iq2tq_loaded > 0) {
|
||||
LLAMA_LOG_INFO("%s: loaded IQ2_TQ grid for %zu tensors\n", __func__, iq2tq_loaded);
|
||||
}
|
||||
}
|
||||
|
||||
// IQ3_TQ: per-tensor trained grid (16 × 8 int8 = 128 bytes)
|
||||
{
|
||||
size_t iq3tq_loaded = 0;
|
||||
for (auto & [tname, t] : name_to_tensor) {
|
||||
if (t->type != GGML_TYPE_IQ3_TQ) { continue; }
|
||||
|
||||
const std::string grid_key = "iq3tq.grid." + tname;
|
||||
int64_t grid_idx = gguf_find_key(ml.metadata, grid_key.c_str());
|
||||
if (grid_idx < 0) {
|
||||
// backward compat: try old key name
|
||||
const std::string old_key = "iq3qt.grid." + tname;
|
||||
grid_idx = gguf_find_key(ml.metadata, old_key.c_str());
|
||||
if (grid_idx < 0) { continue; }
|
||||
}
|
||||
|
||||
auto & taux = tensor_aux_data[t];
|
||||
taux.type = GGML_TYPE_IQ3_TQ;
|
||||
taux.host_data.resize(128);
|
||||
const int8_t * grid_data = (const int8_t *)gguf_get_arr_data(ml.metadata, grid_idx);
|
||||
memcpy(taux.host_data.data(), grid_data, 128);
|
||||
|
||||
t->quant_levels = taux.host_data.data();
|
||||
iq3tq_loaded++;
|
||||
}
|
||||
if (iq3tq_loaded > 0) {
|
||||
LLAMA_LOG_INFO("%s: loaded IQ3_TQ grid for %zu tensors\n", __func__, iq3tq_loaded);
|
||||
}
|
||||
}
|
||||
|
||||
// IQ1_BN: per-tensor trained codebook (32768 bytes)
|
||||
{
|
||||
size_t iq1bn_loaded = 0;
|
||||
for (auto & [tname, t] : name_to_tensor) {
|
||||
if (t->type != GGML_TYPE_IQ1_BN) { continue; }
|
||||
|
||||
const std::string aux_key = "iq1bn.aux." + tname;
|
||||
int64_t aux_idx = gguf_find_key(ml.metadata, aux_key.c_str());
|
||||
if (aux_idx < 0) { continue; }
|
||||
|
||||
auto & taux = tensor_aux_data[t];
|
||||
taux.type = GGML_TYPE_IQ1_BN;
|
||||
taux.host_data.resize(32768);
|
||||
const int8_t * aux_data = (const int8_t *)gguf_get_arr_data(ml.metadata, aux_idx);
|
||||
memcpy(taux.host_data.data(), aux_data, 32768);
|
||||
|
||||
t->quant_levels = taux.host_data.data();
|
||||
iq1bn_loaded++;
|
||||
}
|
||||
if (iq1bn_loaded > 0) {
|
||||
LLAMA_LOG_INFO("%s: loaded IQ1_BN codebook for %zu tensors\n", __func__, iq1bn_loaded);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if (use_mmap_buffer) {
|
||||
for (auto & mapping : ml.mappings) {
|
||||
pimpl->mappings.emplace_back(std::move(mapping));
|
||||
|
|
|
|||
|
|
@ -574,6 +574,24 @@ struct llama_model {
|
|||
// for keeping track of associated LoRA adapters
|
||||
std::unordered_set<llama_adapter_lora *> loras;
|
||||
|
||||
// host-side auxiliary data for dynamic quantization types (Q4_DPT, Q3_PT, Q3_KPT)
|
||||
// indexed by weight tensor pointer, allows separate GPU placement of aux data
|
||||
struct tensor_auxiliary {
|
||||
ggml_type type; // Quantization type this aux data is for
|
||||
std::vector<uint8_t> host_data; // Host copy of aux data (levels or kvalues)
|
||||
struct ggml_tensor * aux_tensor; // Separate ggml tensor for backend placement
|
||||
};
|
||||
|
||||
// Hash function for ggml_tensor pointers (reuse existing ggml_hash pattern)
|
||||
struct ggml_tensor_ptr_hash {
|
||||
size_t operator()(const ggml_tensor* t) const noexcept {
|
||||
return (size_t)(uintptr_t)t >> 4; // Same as ggml_hash()
|
||||
}
|
||||
};
|
||||
|
||||
// Per-tensor auxiliary data lookup - indexed by WEIGHT tensor pointer
|
||||
std::unordered_map<const ggml_tensor*, tensor_auxiliary, ggml_tensor_ptr_hash> tensor_aux_data;
|
||||
|
||||
// statically allocated context for assigning
|
||||
struct llama_meta_device_get_split_state_userdata get_split_state_ud;
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,8 @@
|
|||
#include "ggml.h"
|
||||
#include "llama-impl.h"
|
||||
#include "llama-model.h"
|
||||
#include "llama-model-loader.h"
|
||||
#include "llama.h"
|
||||
#include "llama-ext.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
|
@ -13,6 +15,98 @@
|
|||
#include <thread>
|
||||
#include <unordered_map>
|
||||
|
||||
// Q3_PT levels functions (defined in ggml-quants.c)
|
||||
extern "C" {
|
||||
void q3pt_train_levels(const float * data, int64_t nrow, int64_t n_per_row,
|
||||
const float * imatrix, float levels_out[8]);
|
||||
void q3pt_set_levels(const float * levels);
|
||||
}
|
||||
|
||||
// Q3_KPT levels functions (defined in ggml-quants.c)
|
||||
extern "C" {
|
||||
void q3kpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row,
|
||||
const float * imatrix, float levels_out[8]);
|
||||
void q3kpt_set_levels(const float * levels);
|
||||
}
|
||||
|
||||
// Q4_DPT levels functions (defined in ggml-quants.c)
|
||||
extern "C" {
|
||||
void q4dpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row,
|
||||
const float * imatrix, int8_t levels_out[16]);
|
||||
void q4dpt_set_levels(const int8_t * levels);
|
||||
}
|
||||
|
||||
// Q2_KPT levels are handled internally by quantize_q2_kpt
|
||||
#define Q2KPT_N_LEVELS 4
|
||||
#define QK_K 256
|
||||
extern "C" const float * q2kpt_get_levels(void);
|
||||
extern "C" void q2kpt_prepare_levels(int64_t nrows, int64_t n_per_row);
|
||||
extern "C" void q2kpt_free_levels(void);
|
||||
|
||||
// IQ2_TQ functions — per-tensor trained grid
|
||||
extern "C" size_t quantize_iq2_tq(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
extern "C" void iq2tq_train_grid(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, int8_t grid_out[64]);
|
||||
extern "C" void iq2tq_set_grid(const int8_t grid[64]);
|
||||
extern "C" const int8_t * iq2tq_get_grid(void);
|
||||
|
||||
// IQ3_TQ functions — per-tensor trained grid (3-bit, 128 bytes per tensor)
|
||||
extern "C" size_t quantize_iq3_tq(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
extern "C" void iq3tq_train_grid(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, int8_t grid_out[128]);
|
||||
extern "C" void iq3tq_set_grid(const int8_t grid[128]);
|
||||
extern "C" const int8_t * iq3tq_get_grid(void);
|
||||
|
||||
// IQ1_BN functions — 8D vector quantized with per-tensor trained 4096-entry codebook (32768 bytes per tensor)
|
||||
extern "C" size_t quantize_iq1_bn(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
extern "C" void iq1bn_train_codebook(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, int8_t aux_out[32768], int nthread);
|
||||
extern "C" void iq1bn_set_aux(const int8_t aux[32768]);
|
||||
extern "C" const int8_t * iq1bn_get_aux(void);
|
||||
|
||||
// Q3_PT levels functions (defined in ggml-quants.c)
|
||||
extern "C" {
|
||||
void q3pt_train_levels(const float * data, int64_t nrow, int64_t n_per_row,
|
||||
const float * imatrix, float levels_out[8]);
|
||||
void q3pt_set_levels(const float * levels);
|
||||
}
|
||||
|
||||
// Q3_KPT levels functions (defined in ggml-quants.c)
|
||||
extern "C" {
|
||||
void q3kpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row,
|
||||
const float * imatrix, float levels_out[8]);
|
||||
void q3kpt_set_levels(const float * levels);
|
||||
}
|
||||
|
||||
// Q4_DPT levels functions (defined in ggml-quants.c)
|
||||
extern "C" {
|
||||
void q4dpt_train_levels(const float * data, int64_t nrow, int64_t n_per_row,
|
||||
const float * imatrix, int8_t levels_out[16]);
|
||||
void q4dpt_set_levels(const int8_t * levels);
|
||||
}
|
||||
|
||||
// Q2_KPT levels are handled internally by quantize_q2_kpt
|
||||
#define Q2KPT_N_LEVELS 4
|
||||
#define QK_K 256
|
||||
extern "C" const float * q2kpt_get_levels(void);
|
||||
extern "C" void q2kpt_prepare_levels(int64_t nrows, int64_t n_per_row);
|
||||
extern "C" void q2kpt_free_levels(void);
|
||||
|
||||
// IQ2_TQ functions — per-tensor trained grid
|
||||
extern "C" size_t quantize_iq2_tq(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
extern "C" void iq2tq_train_grid(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, int8_t grid_out[64]);
|
||||
extern "C" void iq2tq_set_grid(const int8_t grid[64]);
|
||||
extern "C" const int8_t * iq2tq_get_grid(void);
|
||||
|
||||
// IQ3_TQ functions — per-tensor trained grid (3-bit, 128 bytes per tensor)
|
||||
extern "C" size_t quantize_iq3_tq(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
extern "C" void iq3tq_train_grid(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, int8_t grid_out[128]);
|
||||
extern "C" void iq3tq_set_grid(const int8_t grid[128]);
|
||||
extern "C" const int8_t * iq3tq_get_grid(void);
|
||||
|
||||
// IQ1_BN functions — 8D vector quantized with per-tensor trained 4096-entry codebook (32768 bytes per tensor)
|
||||
extern "C" size_t quantize_iq1_bn(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
extern "C" void iq1bn_train_codebook(const float * data, int64_t nrow, int64_t n_per_row, const float * imatrix, int8_t aux_out[32768], int nthread);
|
||||
extern "C" void iq1bn_set_aux(const int8_t aux[32768]);
|
||||
extern "C" const int8_t * iq1bn_get_aux(void);
|
||||
|
||||
// result of parsing --tensor-type option
|
||||
// (changes to this struct must be reflected in tools/quantize/quantize.cpp)
|
||||
struct tensor_type_option {
|
||||
|
|
@ -234,7 +328,7 @@ static void llama_tensor_dequantize_impl(
|
|||
} else if (tensor->type == GGML_TYPE_BF16) {
|
||||
ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
|
||||
} else if (ggml_is_quantized(tensor->type)) {
|
||||
qtype->to_float(tensor->data, f32_output, nelements);
|
||||
qtype->to_float(tensor->data, f32_output, nelements, tensor->quant_levels);
|
||||
} else {
|
||||
GGML_ABORT("fatal error"); // unreachable
|
||||
}
|
||||
|
|
@ -264,13 +358,14 @@ static void llama_tensor_dequantize_impl(
|
|||
size_t thr_elems = thr_blocks * block_size; // number of elements for this thread
|
||||
size_t thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
|
||||
|
||||
auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
|
||||
const void * quant_levels = tensor->quant_levels;
|
||||
auto compute = [qtype, quant_levels] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
|
||||
if (typ == GGML_TYPE_F16) {
|
||||
ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
|
||||
} else if (typ == GGML_TYPE_BF16) {
|
||||
ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
|
||||
} else {
|
||||
qtype->to_float(inbuf, outbuf, nels);
|
||||
qtype->to_float(inbuf, outbuf, nels, quant_levels);
|
||||
}
|
||||
};
|
||||
workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
|
||||
|
|
@ -480,6 +575,18 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type
|
|||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
||||
new_type = GGML_TYPE_IQ3_S;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_PT) {
|
||||
new_type = GGML_TYPE_IQ4_XS;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT) {
|
||||
new_type = GGML_TYPE_Q4_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_DPT) {
|
||||
new_type = GGML_TYPE_IQ4_XS;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_KPT) {
|
||||
new_type = GGML_TYPE_Q4_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
|
||||
new_type = GGML_TYPE_Q4_K;
|
||||
}
|
||||
|
|
@ -518,13 +625,16 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type
|
|||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
||||
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_PT) {
|
||||
new_type = GGML_TYPE_Q3_PT;
|
||||
}
|
||||
else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
|
||||
new_type = GGML_TYPE_Q4_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
|
||||
new_type = GGML_TYPE_Q4_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT) {
|
||||
new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
||||
|
|
@ -569,16 +679,17 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type
|
|||
auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
|
||||
int i_layer = info.first, n_layer = info.second;
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_KPT) new_type = GGML_TYPE_Q3_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
|
||||
if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
|
||||
new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT) {
|
||||
new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
|
||||
: arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
|
||||
: GGML_TYPE_Q3_K;
|
||||
: (ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT ? GGML_TYPE_Q3_KPT : GGML_TYPE_Q3_K);
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
|
||||
(qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
|
||||
|
|
@ -587,6 +698,9 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type
|
|||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
|
||||
new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_PT) {
|
||||
new_type = GGML_TYPE_IQ4_XS;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
|
||||
if (arch == LLM_ARCH_FALCON) {
|
||||
new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
|
||||
|
|
@ -616,13 +730,14 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type
|
|||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
|
||||
ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
|
||||
ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
|
||||
ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) {
|
||||
ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT ||
|
||||
ftype == LLAMA_FTYPE_MOSTLY_Q2_KPT) {
|
||||
new_type = GGML_TYPE_Q5_K;
|
||||
}
|
||||
} else {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_KPT) new_type = GGML_TYPE_Q3_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT) new_type = GGML_TYPE_Q4_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ) new_type = GGML_TYPE_Q4_K;
|
||||
}
|
||||
|
|
@ -828,6 +943,14 @@ ggml_type llama_ftype_get_default_type(llama_ftype ftype) {
|
|||
case LLAMA_FTYPE_MOSTLY_IQ4_XS: return GGML_TYPE_IQ4_XS;
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_S:
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_M: return GGML_TYPE_IQ3_S;
|
||||
case LLAMA_FTYPE_MOSTLY_Q3_PT: return GGML_TYPE_Q3_PT;
|
||||
case LLAMA_FTYPE_MOSTLY_Q3_KPT: return GGML_TYPE_Q3_KPT;
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_DPT: return GGML_TYPE_Q4_DPT;
|
||||
case LLAMA_FTYPE_MOSTLY_Q2_KPT: return GGML_TYPE_Q2_KPT;
|
||||
case LLAMA_FTYPE_MOSTLY_IQ2_TQ: return GGML_TYPE_IQ2_TQ;
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_TQ: return GGML_TYPE_IQ3_TQ;
|
||||
case LLAMA_FTYPE_MOSTLY_IQ1_BN: return GGML_TYPE_IQ1_BN;
|
||||
|
||||
|
||||
default: return GGML_TYPE_COUNT;
|
||||
}
|
||||
|
|
@ -1098,6 +1221,615 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
::zeros(fout, meta_size);
|
||||
};
|
||||
|
||||
// Q3_PT two-pass approach: train all per-tensor levels BEFORE opening the output
|
||||
// file, so the levels KV entry is already populated at the time of the metadata placeholder.
|
||||
static const size_t Q3PT_N_LEVELS = 8;
|
||||
std::vector<float> q3pt_all_levels; // indexed by position in tensors[]
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_PT && !params->dry_run) {
|
||||
LLAMA_LOG_INFO("%s: Q3_PT pass 1: training per-tensor levels...\n", __func__);
|
||||
q3pt_all_levels.assign(tensors.size() * Q3PT_N_LEVELS, 0.0f);
|
||||
|
||||
// Temporary dequant buffer for pass 1 (reuse f32_conv_buf / read_data declared below)
|
||||
std::vector<no_init<uint8_t>> p1_read_data;
|
||||
std::vector<no_init<float>> p1_f32_buf;
|
||||
std::vector<std::thread> p1_workers;
|
||||
p1_workers.reserve(nthread);
|
||||
|
||||
for (size_t ti = 0; ti < tensors.size(); ++ti) {
|
||||
ggml_tensor * tensor = tensors[ti]->tensor;
|
||||
const std::string tname = ggml_get_name(tensor);
|
||||
|
||||
// Determine whether this tensor will be Q3_PT (mirror the pass-2 logic)
|
||||
bool quantize = tname.rfind("weight") == tname.size() - 6;
|
||||
quantize &= (ggml_n_dims(tensor) >= 2);
|
||||
quantize &= tname.find("_norm.weight") == std::string::npos;
|
||||
quantize &= tname.find("ffn_gate_inp.weight") == std::string::npos;
|
||||
if (!quantize) { continue; }
|
||||
|
||||
ggml_type new_type = default_type;
|
||||
if (!params->pure) {
|
||||
new_type = llama_tensor_get_type_impl(qs, new_type, tensor, ftype, tensor_get_category(tname));
|
||||
}
|
||||
if (new_type != GGML_TYPE_Q3_PT) { continue; }
|
||||
|
||||
// Load tensor data
|
||||
const size_t tsz = ggml_nbytes(tensor);
|
||||
if (!ml.use_mmap) {
|
||||
if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); }
|
||||
tensor->data = p1_read_data.data();
|
||||
}
|
||||
ml.load_data_for(tensor);
|
||||
|
||||
// Dequantize to f32 if needed
|
||||
const int64_t nelements = ggml_nelements(tensor);
|
||||
float * f32_data;
|
||||
if (tensor->type == GGML_TYPE_F32) {
|
||||
f32_data = (float *) tensor->data;
|
||||
} else {
|
||||
llama_tensor_dequantize_impl(tensor, p1_f32_buf, p1_workers, nelements, nthread);
|
||||
f32_data = (float *) p1_f32_buf.data();
|
||||
}
|
||||
|
||||
// Resolve imatrix
|
||||
const float * imatrix = nullptr;
|
||||
if (imatrix_data) {
|
||||
auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped));
|
||||
if (it2 != imatrix_data->end() &&
|
||||
it2->second.size() == (size_t)tensor->ne[0] * tensor->ne[2]) {
|
||||
imatrix = it2->second.data();
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t n_per_row = tensor->ne[0];
|
||||
const int64_t nrows = tensor->ne[1];
|
||||
|
||||
LLAMA_LOG_INFO("%s: Q3_PT levels for [%zu/%zu] %s\n", __func__, ti+1, tensors.size(), tensor->name);
|
||||
q3pt_train_levels(f32_data, nrows, n_per_row, imatrix,
|
||||
q3pt_all_levels.data() + ti * Q3PT_N_LEVELS);
|
||||
}
|
||||
|
||||
// All levels ready — store in GGUF metadata before the file is opened
|
||||
for (auto & ctx : ctx_outs) {
|
||||
if (ctx) {
|
||||
gguf_set_arr_data(ctx.get(), "q3_pt.levels", GGUF_TYPE_FLOAT32,
|
||||
q3pt_all_levels.data(), q3pt_all_levels.size());
|
||||
}
|
||||
}
|
||||
LLAMA_LOG_INFO("%s: Q3_PT pass 1 complete.\n", __func__);
|
||||
}
|
||||
|
||||
// Q3_KPT two-pass approach: train all per-tensor levels BEFORE opening the output
|
||||
static const size_t Q3KPT_N_LEVELS = 8;
|
||||
std::vector<float> q3kpt_all_levels; // indexed by position in tensors[]
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_KPT && !params->dry_run) {
|
||||
LLAMA_LOG_INFO("%s: Q3_KPT pass 1: training per-tensor levels...\n", __func__);
|
||||
q3kpt_all_levels.assign(tensors.size() * Q3KPT_N_LEVELS, 0.0f);
|
||||
|
||||
// Temporary dequant buffer for pass 1
|
||||
std::vector<no_init<uint8_t>> p1_read_data;
|
||||
std::vector<no_init<float>> p1_f32_buf;
|
||||
std::vector<std::thread> p1_workers;
|
||||
p1_workers.reserve(nthread);
|
||||
|
||||
for (size_t ti = 0; ti < tensors.size(); ++ti) {
|
||||
ggml_tensor * tensor = tensors[ti]->tensor;
|
||||
const std::string tname = ggml_get_name(tensor);
|
||||
|
||||
// Determine whether this tensor will be Q3_KPT (mirror the pass-2 logic)
|
||||
bool quantize = tname.rfind("weight") == tname.size() - 6;
|
||||
quantize &= (ggml_n_dims(tensor) >= 2);
|
||||
quantize &= tname.find("_norm.weight") == std::string::npos;
|
||||
quantize &= tname.find("ffn_gate_inp.weight") == std::string::npos;
|
||||
if (!quantize) { continue; }
|
||||
|
||||
ggml_type new_type = default_type;
|
||||
if (!params->pure) {
|
||||
new_type = llama_tensor_get_type_impl(qs, new_type, tensor, ftype, tensor_get_category(tname));
|
||||
}
|
||||
if (params->token_embedding_type < GGML_TYPE_COUNT &&
|
||||
(tname == "token_embd.weight" || tname == "per_layer_token_embd.weight")) {
|
||||
new_type = params->token_embedding_type;
|
||||
}
|
||||
if (params->output_tensor_type < GGML_TYPE_COUNT && tname == "output.weight") {
|
||||
new_type = params->output_tensor_type;
|
||||
}
|
||||
if (new_type != GGML_TYPE_Q3_KPT) { continue; }
|
||||
|
||||
// Load tensor data
|
||||
const size_t tsz = ggml_nbytes(tensor);
|
||||
if (!ml.use_mmap) {
|
||||
if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); }
|
||||
tensor->data = p1_read_data.data();
|
||||
}
|
||||
ml.load_data_for(tensor);
|
||||
|
||||
// Dequantize to f32 if needed
|
||||
const int64_t nelements = ggml_nelements(tensor);
|
||||
float * f32_data;
|
||||
if (tensor->type == GGML_TYPE_F32) {
|
||||
f32_data = (float *) tensor->data;
|
||||
} else {
|
||||
llama_tensor_dequantize_impl(tensor, p1_f32_buf, p1_workers, nelements, nthread);
|
||||
f32_data = (float *) p1_f32_buf.data();
|
||||
}
|
||||
|
||||
// Resolve imatrix
|
||||
const float * imatrix = nullptr;
|
||||
if (imatrix_data) {
|
||||
auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped));
|
||||
if (it2 != imatrix_data->end() &&
|
||||
it2->second.size() == (size_t)tensor->ne[0] * tensor->ne[2]) {
|
||||
imatrix = it2->second.data();
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t n_per_row = tensor->ne[0];
|
||||
const int64_t nrows = tensor->ne[1];
|
||||
|
||||
LLAMA_LOG_INFO("%s: Q3_KPT levels for [%zu/%zu] %s\n", __func__, ti+1, tensors.size(), tensor->name);
|
||||
q3kpt_train_levels(f32_data, nrows, n_per_row, imatrix,
|
||||
q3kpt_all_levels.data() + ti * Q3KPT_N_LEVELS);
|
||||
}
|
||||
|
||||
// All levels ready — store in GGUF metadata before the file is opened
|
||||
for (auto & ctx : ctx_outs) {
|
||||
if (ctx) {
|
||||
gguf_set_arr_data(ctx.get(), "q3_kpt.levels", GGUF_TYPE_FLOAT32,
|
||||
q3kpt_all_levels.data(), q3kpt_all_levels.size());
|
||||
}
|
||||
}
|
||||
LLAMA_LOG_INFO("%s: Q3_KPT pass 1 complete.\n", __func__);
|
||||
}
|
||||
|
||||
// Q4_DPT two-pass approach: train all per-tensor int8 levels BEFORE opening the output
|
||||
// file, so the levels KV entry is already populated at the time of the metadata placeholder.
|
||||
static const size_t Q4DPT_N_LEVELS = 16;
|
||||
std::vector<int8_t> q4dpt_all_levels; // indexed by position in tensors[]
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q4_DPT && !params->dry_run) {
|
||||
LLAMA_LOG_INFO("%s: Q4_DPT pass 1: training per-tensor int8 levels...\n", __func__);
|
||||
q4dpt_all_levels.assign(tensors.size() * Q4DPT_N_LEVELS, (int8_t)0);
|
||||
|
||||
std::vector<no_init<uint8_t>> p1_read_data;
|
||||
std::vector<no_init<float>> p1_f32_buf;
|
||||
std::vector<std::thread> p1_workers;
|
||||
p1_workers.reserve(nthread);
|
||||
|
||||
for (size_t ti = 0; ti < tensors.size(); ++ti) {
|
||||
ggml_tensor * tensor = tensors[ti]->tensor;
|
||||
const std::string tname = ggml_get_name(tensor);
|
||||
|
||||
bool quantize = tname.rfind("weight") == tname.size() - 6;
|
||||
quantize &= (ggml_n_dims(tensor) >= 2);
|
||||
quantize &= tname.find("_norm.weight") == std::string::npos;
|
||||
quantize &= tname.find("ffn_gate_inp.weight") == std::string::npos;
|
||||
if (!quantize) { continue; }
|
||||
|
||||
ggml_type new_type = default_type;
|
||||
if (!params->pure) {
|
||||
new_type = llama_tensor_get_type_impl(qs, new_type, tensor, ftype, tensor_get_category(tname));
|
||||
}
|
||||
if (params->token_embedding_type < GGML_TYPE_COUNT &&
|
||||
(tname == "token_embd.weight" || tname == "per_layer_token_embd.weight")) {
|
||||
new_type = params->token_embedding_type;
|
||||
}
|
||||
if (params->output_tensor_type < GGML_TYPE_COUNT && tname == "output.weight") {
|
||||
new_type = params->output_tensor_type;
|
||||
}
|
||||
if (new_type != GGML_TYPE_Q4_DPT) { continue; }
|
||||
|
||||
// Load tensor data
|
||||
const size_t tsz = ggml_nbytes(tensor);
|
||||
if (!ml.use_mmap) {
|
||||
if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); }
|
||||
tensor->data = p1_read_data.data();
|
||||
}
|
||||
ml.load_data_for(tensor);
|
||||
|
||||
// Dequantize to f32 if needed
|
||||
const int64_t nelements = ggml_nelements(tensor);
|
||||
float * f32_data;
|
||||
if (tensor->type == GGML_TYPE_F32) {
|
||||
f32_data = (float *) tensor->data;
|
||||
} else {
|
||||
llama_tensor_dequantize_impl(tensor, p1_f32_buf, p1_workers, nelements, nthread);
|
||||
f32_data = (float *) p1_f32_buf.data();
|
||||
}
|
||||
|
||||
// Resolve imatrix
|
||||
const float * imatrix = nullptr;
|
||||
if (imatrix_data) {
|
||||
auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped));
|
||||
if (it2 != imatrix_data->end() &&
|
||||
it2->second.size() == (size_t)tensor->ne[0] * tensor->ne[2]) {
|
||||
imatrix = it2->second.data();
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t n_per_row = tensor->ne[0];
|
||||
const int64_t nrows = tensor->ne[1];
|
||||
|
||||
LLAMA_LOG_INFO("%s: Q4_DPT levels for [%zu/%zu] %s\n", __func__, ti+1, tensors.size(), tensor->name);
|
||||
q4dpt_train_levels(f32_data, nrows, n_per_row, imatrix,
|
||||
q4dpt_all_levels.data() + ti * Q4DPT_N_LEVELS);
|
||||
}
|
||||
|
||||
// Store in GGUF metadata before the file is opened
|
||||
for (auto & ctx : ctx_outs) {
|
||||
if (ctx) {
|
||||
gguf_set_arr_data(ctx.get(), "q4_dpt.levels", GGUF_TYPE_INT8,
|
||||
q4dpt_all_levels.data(), q4dpt_all_levels.size());
|
||||
}
|
||||
}
|
||||
LLAMA_LOG_INFO("%s: Q4_DPT pass 1 complete.\n", __func__);
|
||||
}
|
||||
|
||||
// Q2_KPT two-pass approach: train all per-block levels BEFORE opening the output
|
||||
// file, so the levels KV entry is already populated at the time of the metadata placeholder.
|
||||
// Per-block levels: 4 floats per 256-element block.
|
||||
struct q2kpt_tensor_levels {
|
||||
std::string name;
|
||||
std::vector<float> levels; // nrows * (n_per_row / QK_K) * Q2KPT_N_LEVELS floats
|
||||
};
|
||||
std::vector<q2kpt_tensor_levels> q2kpt_all_levels;
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_KPT && !params->dry_run) {
|
||||
LLAMA_LOG_INFO("%s: Q2_KPT pass 1: training per-block levels...\n", __func__);
|
||||
|
||||
std::vector<no_init<uint8_t>> p1_read_data;
|
||||
std::vector<no_init<float>> p1_f32_buf;
|
||||
std::vector<std::thread> p1_workers;
|
||||
p1_workers.reserve(nthread);
|
||||
|
||||
for (size_t ti = 0; ti < tensors.size(); ++ti) {
|
||||
ggml_tensor * tensor = tensors[ti]->tensor;
|
||||
const std::string tname = ggml_get_name(tensor);
|
||||
|
||||
// Determine whether this tensor will be Q2_KPT (mirror the pass-2 logic)
|
||||
bool quantize = tname.rfind("weight") == tname.size() - 6;
|
||||
quantize &= (ggml_n_dims(tensor) >= 2);
|
||||
quantize &= tname.find("_norm.weight") == std::string::npos;
|
||||
quantize &= tname.find("ffn_gate_inp.weight") == std::string::npos;
|
||||
if (!quantize) { continue; }
|
||||
|
||||
ggml_type new_type = default_type;
|
||||
if (!params->pure) {
|
||||
new_type = llama_tensor_get_type_impl(qs, new_type, tensor, ftype, tensor_get_category(tname));
|
||||
}
|
||||
if (params->token_embedding_type < GGML_TYPE_COUNT &&
|
||||
(tname == "token_embd.weight" || tname == "per_layer_token_embd.weight")) {
|
||||
new_type = params->token_embedding_type;
|
||||
}
|
||||
if (params->output_tensor_type < GGML_TYPE_COUNT && tname == "output.weight") {
|
||||
new_type = params->output_tensor_type;
|
||||
}
|
||||
if (new_type != GGML_TYPE_Q2_KPT) { continue; }
|
||||
|
||||
// Load tensor data
|
||||
const size_t tsz = ggml_nbytes(tensor);
|
||||
if (!ml.use_mmap) {
|
||||
if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); }
|
||||
tensor->data = p1_read_data.data();
|
||||
}
|
||||
ml.load_data_for(tensor);
|
||||
|
||||
// Dequantize to f32 if needed
|
||||
const int64_t nelements = ggml_nelements(tensor);
|
||||
float * f32_data;
|
||||
if (tensor->type == GGML_TYPE_F32) {
|
||||
f32_data = (float *) tensor->data;
|
||||
} else {
|
||||
llama_tensor_dequantize_impl(tensor, p1_f32_buf, p1_workers, nelements, nthread);
|
||||
f32_data = (float *) p1_f32_buf.data();
|
||||
}
|
||||
|
||||
// Resolve imatrix
|
||||
const float * imatrix = nullptr;
|
||||
if (imatrix_data) {
|
||||
auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped));
|
||||
if (it2 != imatrix_data->end() &&
|
||||
it2->second.size() == (size_t)tensor->ne[0] * tensor->ne[2]) {
|
||||
imatrix = it2->second.data();
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t n_per_row = tensor->ne[0];
|
||||
const int64_t nrows = tensor->ne[1];
|
||||
|
||||
// Allocate levels buffer for this tensor
|
||||
const int nb = n_per_row / QK_K;
|
||||
const size_t n_levels = (size_t)nrows * tensor->ne[2] * nb * Q2KPT_N_LEVELS;
|
||||
q2kpt_all_levels.push_back({tname, std::vector<float>(n_levels)});
|
||||
|
||||
LLAMA_LOG_INFO("%s: Q2_KPT levels for [%zu/%zu] %s (%zu floats)\n",
|
||||
__func__, ti+1, tensors.size(), tensor->name, n_levels);
|
||||
|
||||
// Train levels by running quantization internally
|
||||
// We need to quantize to f32 -> Q2_KPT -> f32 to get the trained levels
|
||||
std::vector<no_init<uint8_t>> p1_qbuf(ggml_nbytes(tensor));
|
||||
const size_t row_size = ggml_row_size(GGML_TYPE_Q2_KPT, n_per_row);
|
||||
|
||||
// Prepare levels buffer for this tensor
|
||||
q2kpt_free_levels();
|
||||
q2kpt_prepare_levels(nrows * tensor->ne[2], n_per_row);
|
||||
|
||||
// Quantize each expert slice
|
||||
const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
|
||||
for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
|
||||
const float * f32_data_03 = f32_data + i03 * nelements_matrix;
|
||||
void * q_data_03 = (char *)p1_qbuf.data() + row_size * i03 * nrows;
|
||||
const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
|
||||
|
||||
// start_row must be the absolute row index for correct levels indexing
|
||||
ggml_quantize_chunk(GGML_TYPE_Q2_KPT, f32_data_03, q_data_03, i03 * nrows, nrows, n_per_row, imatrix_03);
|
||||
}
|
||||
|
||||
// Copy trained levels to our storage
|
||||
const float * trained_levels = q2kpt_get_levels();
|
||||
if (trained_levels) {
|
||||
memcpy(q2kpt_all_levels.back().levels.data(), trained_levels, n_levels * sizeof(float));
|
||||
}
|
||||
}
|
||||
|
||||
// Store all levels in GGUF metadata before the file is opened
|
||||
for (const auto & tl : q2kpt_all_levels) {
|
||||
for (auto & ctx : ctx_outs) {
|
||||
if (ctx) {
|
||||
const std::string key = tl.name + ".q2kpt_levels";
|
||||
gguf_set_arr_data(ctx.get(), key.c_str(), GGUF_TYPE_FLOAT32,
|
||||
tl.levels.data(), tl.levels.size());
|
||||
}
|
||||
}
|
||||
}
|
||||
LLAMA_LOG_INFO("%s: Q2_KPT pass 1 complete.\n", __func__);
|
||||
}
|
||||
|
||||
// IQ2_TQ: train per-tensor grid in pass 1
|
||||
struct iq2tq_meta {
|
||||
std::string tensor_name;
|
||||
int8_t grid[64];
|
||||
};
|
||||
std::vector<iq2tq_meta> iq2tq_all_meta;
|
||||
if (params->ftype == LLAMA_FTYPE_MOSTLY_IQ2_TQ) {
|
||||
const int64_t t_start_p1 = ggml_time_us();
|
||||
LLAMA_LOG_INFO("%s: IQ2_TQ pass 1: training per-tensor grids...\n", __func__);
|
||||
|
||||
std::vector<no_init<uint8_t>> p1_read_data;
|
||||
std::vector<no_init<float>> p1_f32_buf;
|
||||
std::vector<std::thread> p1_workers;
|
||||
p1_workers.reserve(nthread);
|
||||
|
||||
for (size_t ti = 0; ti < tensors.size(); ++ti) {
|
||||
ggml_tensor * tensor = tensors[ti]->tensor;
|
||||
const std::string tname = ggml_get_name(tensor);
|
||||
|
||||
// Mirror pass-2 logic: only quantize 2D+ weight tensors
|
||||
bool quantize = tname.rfind("weight") == tname.size() - 6;
|
||||
quantize &= (ggml_n_dims(tensor) >= 2);
|
||||
quantize &= tname.find("_norm.weight") == std::string::npos;
|
||||
quantize &= tname.find("ffn_gate_inp.weight") == std::string::npos;
|
||||
if (!quantize) { continue; }
|
||||
|
||||
ggml_type new_type = default_type;
|
||||
if (!params->pure) {
|
||||
new_type = llama_tensor_get_type_impl(qs, new_type, tensor, ftype, tensor_get_category(tname));
|
||||
}
|
||||
if (params->token_embedding_type < GGML_TYPE_COUNT &&
|
||||
(tname == "token_embd.weight" || tname == "per_layer_token_embd.weight")) {
|
||||
new_type = params->token_embedding_type;
|
||||
}
|
||||
if (params->output_tensor_type < GGML_TYPE_COUNT && tname == "output.weight") {
|
||||
new_type = params->output_tensor_type;
|
||||
}
|
||||
if (new_type != GGML_TYPE_IQ2_TQ) { continue; }
|
||||
|
||||
// Load tensor data
|
||||
const size_t tsz = ggml_nbytes(tensor);
|
||||
if (!ml.use_mmap) {
|
||||
if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); }
|
||||
tensor->data = p1_read_data.data();
|
||||
}
|
||||
ml.load_data_for(tensor);
|
||||
|
||||
// Dequantize to f32 if needed
|
||||
const int64_t nelements = ggml_nelements(tensor);
|
||||
float * f32_data;
|
||||
if (tensor->type == GGML_TYPE_F32) {
|
||||
f32_data = (float *) tensor->data;
|
||||
} else {
|
||||
llama_tensor_dequantize_impl(tensor, p1_f32_buf, p1_workers, nelements, nthread);
|
||||
f32_data = (float *) p1_f32_buf.data();
|
||||
}
|
||||
|
||||
// Resolve imatrix
|
||||
const float * imatrix = nullptr;
|
||||
if (imatrix_data) {
|
||||
auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped));
|
||||
if (it2 != imatrix_data->end() &&
|
||||
it2->second.size() == (size_t)tensor->ne[0] * tensor->ne[2]) {
|
||||
imatrix = it2->second.data();
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t n_per_row = tensor->ne[0];
|
||||
const int64_t nrows = tensor->ne[1];
|
||||
|
||||
LLAMA_LOG_INFO("%s: IQ2_TQ grid for [%zu/%zu] %s\n", __func__, ti+1, tensors.size(), tensor->name);
|
||||
|
||||
iq2tq_meta meta;
|
||||
meta.tensor_name = tname;
|
||||
iq2tq_train_grid(f32_data, nrows, n_per_row, imatrix, meta.grid);
|
||||
iq2tq_all_meta.push_back(meta);
|
||||
|
||||
// Save to GGUF
|
||||
std::string grid_key = "iq2tq.grid." + tname;
|
||||
gguf_set_arr_data(ctx_outs[0].get(), grid_key.c_str(), GGUF_TYPE_INT8, meta.grid, 64);
|
||||
}
|
||||
const int64_t t_end_p1 = ggml_time_us();
|
||||
LLAMA_LOG_INFO("%s: IQ2_TQ pass 1 complete (%zu tensors trained, %.1f s).\n",
|
||||
__func__, iq2tq_all_meta.size(), (t_end_p1 - t_start_p1) / 1e6);
|
||||
}
|
||||
|
||||
// IQ3_TQ: train per-tensor grid in pass 1 (16 entries × 8 levels = 128 bytes)
|
||||
struct iq3tq_meta {
|
||||
std::string tensor_name;
|
||||
int8_t grid[128];
|
||||
};
|
||||
std::vector<iq3tq_meta> iq3tq_all_meta;
|
||||
if (params->ftype == LLAMA_FTYPE_MOSTLY_IQ3_TQ) {
|
||||
const int64_t t_start_p1 = ggml_time_us();
|
||||
LLAMA_LOG_INFO("%s: IQ3_TQ pass 1: training per-tensor grids...\n", __func__);
|
||||
|
||||
std::vector<no_init<uint8_t>> p1_read_data;
|
||||
std::vector<no_init<float>> p1_f32_buf;
|
||||
std::vector<std::thread> p1_workers;
|
||||
p1_workers.reserve(nthread);
|
||||
|
||||
for (size_t ti = 0; ti < tensors.size(); ++ti) {
|
||||
ggml_tensor * tensor = tensors[ti]->tensor;
|
||||
const std::string tname = ggml_get_name(tensor);
|
||||
|
||||
bool quantize = tname.rfind("weight") == tname.size() - 6;
|
||||
quantize &= (ggml_n_dims(tensor) >= 2);
|
||||
quantize &= tname.find("_norm.weight") == std::string::npos;
|
||||
quantize &= tname.find("ffn_gate_inp.weight") == std::string::npos;
|
||||
if (!quantize) { continue; }
|
||||
|
||||
ggml_type new_type = default_type;
|
||||
if (!params->pure) {
|
||||
new_type = llama_tensor_get_type_impl(qs, new_type, tensor, ftype, tensor_get_category(tname));
|
||||
}
|
||||
if (params->token_embedding_type < GGML_TYPE_COUNT &&
|
||||
(tname == "token_embd.weight" || tname == "per_layer_token_embd.weight")) {
|
||||
new_type = params->token_embedding_type;
|
||||
}
|
||||
if (params->output_tensor_type < GGML_TYPE_COUNT && tname == "output.weight") {
|
||||
new_type = params->output_tensor_type;
|
||||
}
|
||||
if (new_type != GGML_TYPE_IQ3_TQ) { continue; }
|
||||
|
||||
const size_t tsz = ggml_nbytes(tensor);
|
||||
if (!ml.use_mmap) {
|
||||
if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); }
|
||||
tensor->data = p1_read_data.data();
|
||||
}
|
||||
ml.load_data_for(tensor);
|
||||
|
||||
const int64_t nelements = ggml_nelements(tensor);
|
||||
float * f32_data;
|
||||
if (tensor->type == GGML_TYPE_F32) {
|
||||
f32_data = (float *) tensor->data;
|
||||
} else {
|
||||
llama_tensor_dequantize_impl(tensor, p1_f32_buf, p1_workers, nelements, nthread);
|
||||
f32_data = (float *) p1_f32_buf.data();
|
||||
}
|
||||
|
||||
const float * imatrix = nullptr;
|
||||
if (imatrix_data) {
|
||||
auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped));
|
||||
if (it2 != imatrix_data->end() &&
|
||||
it2->second.size() == (size_t)tensor->ne[0] * tensor->ne[2]) {
|
||||
imatrix = it2->second.data();
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t n_per_row = tensor->ne[0];
|
||||
const int64_t nrows = tensor->ne[1];
|
||||
|
||||
LLAMA_LOG_INFO("%s: IQ3_TQ grid for [%zu/%zu] %s\n", __func__, ti+1, tensors.size(), tensor->name);
|
||||
|
||||
iq3tq_meta meta;
|
||||
meta.tensor_name = tname;
|
||||
iq3tq_train_grid(f32_data, nrows, n_per_row, imatrix, meta.grid);
|
||||
iq3tq_all_meta.push_back(meta);
|
||||
|
||||
std::string grid_key = "iq3tq.grid." + tname;
|
||||
gguf_set_arr_data(ctx_outs[0].get(), grid_key.c_str(), GGUF_TYPE_INT8, meta.grid, 128);
|
||||
}
|
||||
const int64_t t_end_p1 = ggml_time_us();
|
||||
LLAMA_LOG_INFO("%s: IQ3_TQ pass 1 complete (%zu tensors trained, %.1f s).\n",
|
||||
__func__, iq3tq_all_meta.size(), (t_end_p1 - t_start_p1) / 1e6);
|
||||
}
|
||||
|
||||
// IQ1_BN: train per-tensor codebook in pass 1 (4096 × 8D centroids = 32768 bytes)
|
||||
struct iq1bn_meta {
|
||||
std::string tensor_name;
|
||||
int8_t aux[32768];
|
||||
};
|
||||
std::vector<iq1bn_meta> iq1bn_all_meta;
|
||||
if (params->ftype == LLAMA_FTYPE_MOSTLY_IQ1_BN) {
|
||||
const int64_t t_start_p1 = ggml_time_us();
|
||||
LLAMA_LOG_INFO("%s: IQ1_BN pass 1: training per-tensor codebooks...\n", __func__);
|
||||
|
||||
std::vector<no_init<uint8_t>> p1_read_data;
|
||||
std::vector<no_init<float>> p1_f32_buf;
|
||||
std::vector<std::thread> p1_workers;
|
||||
p1_workers.reserve(nthread);
|
||||
|
||||
for (size_t ti = 0; ti < tensors.size(); ++ti) {
|
||||
ggml_tensor * tensor = tensors[ti]->tensor;
|
||||
const std::string tname = ggml_get_name(tensor);
|
||||
|
||||
bool quantize = tname.rfind("weight") == tname.size() - 6;
|
||||
quantize &= (ggml_n_dims(tensor) >= 2);
|
||||
quantize &= tname.find("_norm.weight") == std::string::npos;
|
||||
quantize &= tname.find("ffn_gate_inp.weight") == std::string::npos;
|
||||
if (!quantize) { continue; }
|
||||
|
||||
ggml_type new_type = default_type;
|
||||
if (!params->pure) {
|
||||
new_type = llama_tensor_get_type_impl(qs, new_type, tensor, ftype, tensor_get_category(tname));
|
||||
}
|
||||
if (params->token_embedding_type < GGML_TYPE_COUNT &&
|
||||
(tname == "token_embd.weight" || tname == "per_layer_token_embd.weight")) {
|
||||
new_type = params->token_embedding_type;
|
||||
}
|
||||
if (params->output_tensor_type < GGML_TYPE_COUNT && tname == "output.weight") {
|
||||
new_type = params->output_tensor_type;
|
||||
}
|
||||
if (new_type != GGML_TYPE_IQ1_BN) { continue; }
|
||||
|
||||
const size_t tsz = ggml_nbytes(tensor);
|
||||
if (!ml.use_mmap) {
|
||||
if (p1_read_data.size() < tsz) { p1_read_data.resize(tsz); }
|
||||
tensor->data = p1_read_data.data();
|
||||
}
|
||||
ml.load_data_for(tensor);
|
||||
|
||||
const int64_t nelements = ggml_nelements(tensor);
|
||||
float * f32_data;
|
||||
if (tensor->type == GGML_TYPE_F32) {
|
||||
f32_data = (float *) tensor->data;
|
||||
} else {
|
||||
llama_tensor_dequantize_impl(tensor, p1_f32_buf, p1_workers, nelements, nthread);
|
||||
f32_data = (float *) p1_f32_buf.data();
|
||||
}
|
||||
|
||||
const float * imatrix = nullptr;
|
||||
if (imatrix_data) {
|
||||
auto it2 = imatrix_data->find(remap_imatrix(tensor->name, mapped));
|
||||
if (it2 != imatrix_data->end() &&
|
||||
it2->second.size() == (size_t)tensor->ne[0] * tensor->ne[2]) {
|
||||
imatrix = it2->second.data();
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t n_per_row = tensor->ne[0];
|
||||
const int64_t nrows = tensor->ne[1];
|
||||
|
||||
LLAMA_LOG_INFO("%s: IQ1_BN codebook for [%zu/%zu] %s\n", __func__, ti+1, tensors.size(), tensor->name);
|
||||
|
||||
iq1bn_meta meta;
|
||||
meta.tensor_name = tname;
|
||||
iq1bn_train_codebook(f32_data, nrows, n_per_row, imatrix, meta.aux, nthread);
|
||||
iq1bn_all_meta.push_back(meta);
|
||||
|
||||
std::string aux_key = "iq1bn.aux." + tname;
|
||||
gguf_set_arr_data(ctx_outs[0].get(), aux_key.c_str(), GGUF_TYPE_INT8, meta.aux, 32768);
|
||||
}
|
||||
const int64_t t_end_p1 = ggml_time_us();
|
||||
LLAMA_LOG_INFO("%s: IQ1_BN pass 1 complete (%zu tensors trained, %.1f s).\n",
|
||||
__func__, iq1bn_all_meta.size(), (t_end_p1 - t_start_p1) / 1e6);
|
||||
}
|
||||
|
||||
// no output file for --dry-run
|
||||
if (!params->dry_run) {
|
||||
new_ofstream(0);
|
||||
|
|
@ -1106,6 +1838,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
//
|
||||
// main loop: iterate over all weights
|
||||
//
|
||||
size_t tensor_pass2_idx = 0; // index into tensors[], used for Q3_PT levels lookup
|
||||
|
||||
for (size_t i = 0; i < tensors.size(); ++i) {
|
||||
const auto & weight = *tensors[i];
|
||||
|
|
@ -1232,6 +1965,75 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
|
||||
const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
|
||||
|
||||
// Q3_PT: set the per-tensor levels (trained in pass 1) as global for quantization
|
||||
if (new_type == GGML_TYPE_Q3_PT) {
|
||||
q3pt_set_levels(q3pt_all_levels.data() + tensor_pass2_idx * Q3PT_N_LEVELS);
|
||||
}
|
||||
|
||||
// Q3_KPT: set the per-tensor levels (trained in pass 1) as global for quantization
|
||||
if (new_type == GGML_TYPE_Q3_KPT) {
|
||||
q3kpt_set_levels(q3kpt_all_levels.data() + tensor_pass2_idx * Q3KPT_N_LEVELS);
|
||||
}
|
||||
|
||||
// Q4_DPT: set the per-tensor levels (trained in pass 1) as global for quantization
|
||||
if (new_type == GGML_TYPE_Q4_DPT) {
|
||||
q4dpt_set_levels(q4dpt_all_levels.data() + tensor_pass2_idx * Q4DPT_N_LEVELS);
|
||||
}
|
||||
|
||||
// IQ2_TQ: set per-tensor trained grid
|
||||
if (new_type == GGML_TYPE_IQ2_TQ) {
|
||||
bool found = false;
|
||||
for (const auto & meta : iq2tq_all_meta) {
|
||||
if (meta.tensor_name == tm.name) {
|
||||
iq2tq_set_grid(meta.grid);
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found) {
|
||||
LLAMA_LOG_WARN("%s: WARNING: no trained grid for IQ2_TQ tensor %s\n", __func__, tm.name.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
// IQ3_TQ: set per-tensor trained grid
|
||||
if (new_type == GGML_TYPE_IQ3_TQ) {
|
||||
bool found = false;
|
||||
for (const auto & meta : iq3tq_all_meta) {
|
||||
if (meta.tensor_name == tm.name) {
|
||||
iq3tq_set_grid(meta.grid);
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found) {
|
||||
LLAMA_LOG_WARN("%s: WARNING: no trained grid for IQ3_TQ tensor %s\n", __func__, tm.name.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
// IQ1_BN: set per-tensor trained codebook
|
||||
if (new_type == GGML_TYPE_IQ1_BN) {
|
||||
bool found = false;
|
||||
for (const auto & meta : iq1bn_all_meta) {
|
||||
if (meta.tensor_name == tm.name) {
|
||||
iq1bn_set_aux(meta.aux);
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found) {
|
||||
LLAMA_LOG_WARN("%s: WARNING: no trained codebook for IQ1_BN tensor %s\n", __func__, tm.name.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
// Q2_KPT: quantize_q2_kpt trains per-block levels internally.
|
||||
// Levels were already trained and saved to GGUF in pass 1.
|
||||
// We still need to allocate the levels buffer for quantization to work correctly.
|
||||
if (new_type == GGML_TYPE_Q2_KPT) {
|
||||
const int64_t total_rows = nrows * tensor->ne[2];
|
||||
q2kpt_free_levels(); // Clear any stale levels from previous tensor
|
||||
q2kpt_prepare_levels(total_rows, n_per_row); // Allocate for this tensor
|
||||
}
|
||||
|
||||
// quantize each expert separately since they have different importance matrices
|
||||
new_size = 0;
|
||||
for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
|
||||
|
|
@ -1255,7 +2057,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
fout.write((const char *) new_data, new_size);
|
||||
zeros(fout, GGML_PAD(new_size, align) - new_size);
|
||||
} // no --dry-run
|
||||
} // main loop
|
||||
|
||||
tensor_pass2_idx++;
|
||||
} // iterate over tensors
|
||||
|
||||
if (!params->dry_run) {
|
||||
close_ofstream();
|
||||
|
|
|
|||
|
|
@ -257,6 +257,9 @@ if (NOT GGML_BACKEND_DL)
|
|||
llama_build_and_test(test-rope.cpp)
|
||||
endif()
|
||||
|
||||
# Quantization laboratory - tests for 2.5 BPW proposals
|
||||
llama_build_and_test(test-quant-laboratory.cpp)
|
||||
|
||||
# libmtmd
|
||||
set(LLAMA_TEST_NAME test-mtmd-c-api)
|
||||
llama_build_and_test(test-mtmd-c-api.c)
|
||||
|
|
|
|||
|
|
@ -261,7 +261,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
|
|||
} else if (t->type == GGML_TYPE_I8) {
|
||||
tv.push_back((float)*(int8_t *) &buf[i]);
|
||||
} else if (quantized) {
|
||||
tt->to_float(&buf[i], vq.data(), bs);
|
||||
tt->to_float(&buf[i], vq.data(), bs, nullptr);
|
||||
tv.insert(tv.end(), vq.begin(), vq.end());
|
||||
} else {
|
||||
GGML_ABORT("fatal error");
|
||||
|
|
|
|||
|
|
@ -0,0 +1,355 @@
|
|||
// test-quant-laboratory.cpp
|
||||
// Reusable testing harness for quantization experiments.
|
||||
//
|
||||
// Provides:
|
||||
// - Synthetic data generators (Gaussian, Laplace, uniform)
|
||||
// - Real tensor data loading (f32bin format with [nrow, ncol] header)
|
||||
// - Importance matrix loading (flat f32 array)
|
||||
// - RMSE computation
|
||||
// - Multi-approach comparison framework (quantize → dequantize → matmul error)
|
||||
// - ggml graph-level verification skeleton
|
||||
//
|
||||
// To add a new experiment:
|
||||
// 1. Add an approach function: void approach_xxx(const float *W, float *out,
|
||||
// int64_t nrow, int64_t ncol,
|
||||
// const float *imatrix)
|
||||
// 2. Register it in compare_approaches()
|
||||
// 3. Call test_approach_comparison() from main()
|
||||
|
||||
#include "../ggml/src/ggml-quants.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-alloc.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <functional>
|
||||
#include <random>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// ============================================================================
|
||||
// Helper functions
|
||||
// ============================================================================
|
||||
|
||||
// Root-mean-square error between arrays `a` and `b` of length `n`.
// Accumulates in double to limit rounding error.
// Returns 0.0f for n == 0 (previously this computed 0.0/0 -> NaN).
static float rmse(const float * a, const float * b, size_t n) {
    if (n == 0) {
        return 0.0f;
    }
    double sum = 0.0;
    for (size_t i = 0; i < n; ++i) {
        double d = (double) a[i] - (double) b[i];
        sum += d * d;
    }
    return (float) sqrt(sum / n);
}
|
||||
|
||||
// Fill `data` with n samples drawn from N(0, sigma^2) using `gen`.
static void fill_gaussian(float * data, size_t n, std::mt19937 & gen, float sigma = 1.0f) {
    std::normal_distribution<float> dist(0.0f, sigma);
    for (float * dst = data, * end = data + n; dst != end; ++dst) {
        *dst = dist(gen);
    }
}
|
||||
|
||||
// Fill `data` with n samples from a Laplace(0, b) distribution via the
// inverse-CDF transform of a uniform variate on (-0.5, 0.5).
static void fill_laplace(float * data, size_t n, std::mt19937 & gen, float b = 1.0f) {
    std::uniform_real_distribution<float> u(-0.5f, 0.5f);
    for (size_t idx = 0; idx < n; ++idx) {
        const float v    = u(gen);
        const float sign = (float) ((v > 0) - (v < 0));
        data[idx] = -b * sign * logf(1.0f - 2.0f * fabsf(v));
    }
}
|
||||
|
||||
// Fill `data` with n samples drawn uniformly from (-range, range).
static void fill_uniform(float * data, size_t n, std::mt19937 & gen, float range = 1.0f) {
    std::uniform_real_distribution<float> dist(-range, range);
    size_t i = 0;
    while (i < n) {
        data[i++] = dist(gen);
    }
}
|
||||
|
||||
// Fill `data` with n samples from N(offset, sigma^2) — a mean-shifted Gaussian,
// useful for testing quantizers on non-zero-centered weight distributions.
static void fill_offset_gaussian(float * data, size_t n, std::mt19937 & gen, float sigma = 1.0f, float offset = 2.0f) {
    std::normal_distribution<float> dist(offset, sigma);
    for (float * dst = data, * end = data + n; dst != end; ++dst) {
        *dst = dist(gen);
    }
}
|
||||
|
||||
// ============================================================================
|
||||
// Data loading
|
||||
// ============================================================================
|
||||
// Load a tensor stored in the laboratory's .f32bin format:
//   [int64 nrow][int64 n_per_row][row-major f32 data]
// On success fills `data`, `nrow`, `n_per_row` and returns true.
// Returns false on open/read failure or a corrupt header (non-positive or
// overflowing dimensions) — previously a bad header flowed straight into
// data.resize() with a bogus element count.
static bool load_f32_tensor(const char * path, std::vector<float> & data, int64_t & nrow, int64_t & n_per_row) {
    FILE * f = fopen(path, "rb");
    if (!f) {
        return false;
    }

    int64_t header[2];
    if (fread(header, sizeof(int64_t), 2, f) != 2) {
        fclose(f);
        return false;
    }
    nrow      = header[0];
    n_per_row = header[1];

    // Validate before allocating: reject non-positive dims and products that
    // would overflow int64 (and thus wrap into a huge size_t on resize).
    if (nrow <= 0 || n_per_row <= 0 || nrow > INT64_MAX / n_per_row) {
        fclose(f);
        return false;
    }

    const int64_t total = nrow * n_per_row;
    data.resize(total);
    size_t nread = fread(data.data(), sizeof(float), total, f);
    fclose(f);
    return (int64_t) nread == total;
}
|
||||
|
||||
// Load imatrix file (flat f32 array, no header, one importance value per column dimension)
// The imatrix is the sum-of-squares of activations per dimension.
// Returns false on open/read failure, a dimension mismatch against
// `expected_dims` (when > 0), or an empty/unreadable file — previously an
// ftell() error (-1) or an empty file produced a non-positive `dims` that
// flowed into resize() and the stats loop (data[0] on an empty vector).
static bool load_imatrix(const char * path, std::vector<float> & data, int64_t expected_dims) {
    FILE * f = fopen(path, "rb");
    if (!f) {
        return false;
    }

    // Get file size to determine dimensions
    fseek(f, 0, SEEK_END);
    long file_size = ftell(f);
    fseek(f, 0, SEEK_SET);
    if (file_size <= 0) {   // ftell error (-1L) or empty file
        fclose(f);
        return false;
    }

    int64_t dims = file_size / (int64_t) sizeof(float);
    if (expected_dims > 0 && dims != expected_dims) {
        printf(" WARN: imatrix dims %lld != expected %lld\n", (long long) dims, (long long) expected_dims);
        fclose(f);
        return false;
    }

    data.resize(dims);
    size_t nread = fread(data.data(), sizeof(float), dims, f);
    fclose(f);
    if ((int64_t) nread != dims) {
        return false;
    }

    // Compute stats (dims >= 1 is guaranteed by the size check above)
    float imin = data[0], imax = data[0], isum = 0;
    for (int64_t i = 0; i < dims; i++) {
        if (data[i] < imin) imin = data[i];
        if (data[i] > imax) imax = data[i];
        isum += data[i];
    }
    printf(" Loaded imatrix: %lld dims, min=%.6f, max=%.6f, mean=%.6f\n",
           (long long) dims, imin, imax, isum / dims);

    return true;
}
|
||||
|
||||
// ============================================================================
|
||||
// Test class
|
||||
// ============================================================================
|
||||
|
||||
// Harness object for quantization experiments: owns a fixed-seed RNG and
// runs the multi-approach comparison suite against real tensor data.
class QuantLaboratory {
  public:
    // Fixed seed so every run sees identical synthetic data.
    QuantLaboratory() : gen(42) {}

    // ========================================================================
    // MULTI-APPROACH COMPARISON FRAMEWORK
    //
    // Each "approach" is a function that takes float weights and produces
    // dequantized float output. The framework computes:
    //   - Weight RMSE (dequant vs original)
    //   - Matmul error (dequant weights x real activations vs f64 reference)
    //   - Ratio vs first approach (typically Q2_K baseline)
    //
    // To add a new approach:
    //   1. Write: void approach_xxx(const float *W, float *out,
    //                               int64_t nrow, int64_t ncol,
    //                               const float *imatrix) { ... }
    //   2. Add it to the `approaches` array in compare_approaches()
    // ========================================================================

    // -- Example approach: Q2_K baseline (via ggml library) --
    // Uncomment and adapt for your experiment:
    //
    // void approach_q2k(const float * W, float * out, int64_t nrow, int64_t ncol, const float * imatrix) {
    //     size_t rs = ggml_row_size(GGML_TYPE_Q2_K, ncol);
    //     std::vector<uint8_t> buf(nrow * rs);
    //     quantize_q2_K(W, buf.data(), nrow, ncol, imatrix);
    //     auto * tr = ggml_get_type_traits(GGML_TYPE_Q2_K);
    //     for (int64_t r = 0; r < nrow; r++) {
    //         tr->to_float(buf.data() + r * rs, out + r * ncol, ncol, NULL);
    //     }
    // }

    // Run every registered approach on weights W [w_nrow x w_ncol] against
    // activations A [a_nrow x a_ncol] and print a comparison table.
    // Requires w_ncol == a_ncol (shared inner dimension); silently returns
    // otherwise. Only the first 256 weight rows are used to bound runtime.
    // `imatrix` may be null (approaches then run without importance weights).
    void compare_approaches(const float * W,
                            int64_t w_nrow,
                            int64_t w_ncol,
                            const float * A,
                            int64_t a_nrow,
                            int64_t a_ncol,
                            const char * name,
                            const float * imatrix) {
        if (w_ncol != a_ncol) {
            return;
        }
        int64_t nr = std::min(w_nrow, (int64_t) 256);
        int64_t nc = w_ncol;

        // Reference matmul (double precision)
        std::vector<double> ref(a_nrow * nr);
        for (int64_t t = 0; t < a_nrow; t++) {
            for (int64_t r = 0; r < nr; r++) {
                double s = 0;
                for (int64_t c = 0; c < nc; c++) {
                    s += (double) A[t * a_ncol + c] * (double) W[r * nc + c];
                }
                ref[t * nr + r] = s;
            }
        }
        // RMS magnitude of the reference output; computed but currently unused
        // (kept for future relative-error reporting).
        double ref_mag2 = 0;
        for (auto v : ref) {
            ref_mag2 += v * v;
        }
        float ref_rms = (float) sqrt(ref_mag2 / (a_nrow * nr));
        (void) ref_rms;

        // One candidate quantization scheme: display name, bits-per-weight,
        // and a quantize->dequantize function (null fn = skipped).
        struct Approach {
            const char * name;
            float bpw;
            std::function<void(const float *, float *, int64_t, int64_t, const float *)> fn;
        };

        // ── Register approaches here ──
        Approach approaches[] = {
            // { "Q2_K (baseline)",      2.625f,
            //   [&](auto * W, auto * o, auto nr, auto nc, auto * im) {
            //       approach_q2k(W, o, nr, nc, im);
            //   } },
            // Add more approaches...
            { "placeholder", 0.0f, nullptr }, // remove once real approaches added
        };

        printf("\n %-28s %5s %10s %10s %7s\n", name, "BPW", "RMSE", "MatmulErr", "vs Q2K");
        printf(" %-28s %5s %10s %10s %7s\n", "---", "---", "---", "---", "---");

        // First approach with a non-null fn becomes the baseline for the ratio column.
        float baseline_matmul_err = 0;
        for (auto & ap : approaches) {
            if (!ap.fn) {
                continue;
            }
            std::vector<float> dec(nr * nc);
            ap.fn(W, dec.data(), nr, nc, imatrix);

            // Weight RMSE
            double werr2 = 0;
            for (int64_t i = 0; i < nr * nc; i++) {
                double d = W[i] - dec[i];
                werr2 += d * d;
            }
            float wrmse = (float) sqrt(werr2 / (nr * nc));

            // Matmul error
            double merr2 = 0;
            for (int64_t t = 0; t < a_nrow; t++) {
                for (int64_t r = 0; r < nr; r++) {
                    double s = 0;
                    for (int64_t c = 0; c < nc; c++) {
                        s += (double) A[t * a_ncol + c] * (double) dec[r * nc + c];
                    }
                    double d = s - ref[t * nr + r];
                    merr2 += d * d;
                }
            }
            float matmul_rmse = (float) sqrt(merr2 / (a_nrow * nr));

            if (baseline_matmul_err == 0) {
                baseline_matmul_err = matmul_rmse;
            }
            float ratio = (baseline_matmul_err > 1e-10f) ? matmul_rmse / baseline_matmul_err : 0;

            printf(" %-28s %5.3f %10.6f %10.6f %6.3fx\n", ap.name, ap.bpw, wrmse, matmul_rmse, ratio);
        }
    }

    // Run comparison on all tensor pairs from data directory.
    // Silently skips pairs whose weight/activation files are missing;
    // runs without an imatrix when that file is absent. Always returns 0
    // (the comparison is informational, not pass/fail).
    int test_approach_comparison(const char * data_dir) {
        printf("\n");
        printf("=======================================================================\n");
        printf(" MULTI-APPROACH COMPARISON (real weights x real activations)\n");
        printf("=======================================================================\n");

        // Weight file, activation file, imatrix file, display name.
        struct TestPair {
            const char * wf;
            const char * af;
            const char * imf;
            const char * name;
        } pairs[] = {
            { "blk_0_ffn_gate_weight.f32bin", "act_blk0_ffn_input.f32bin", "imatrix_blk0_ffn_gate_up.f32bin", "ffn_gate" },
            { "blk_0_ffn_up_weight.f32bin", "act_blk0_ffn_input.f32bin", "imatrix_blk0_ffn_gate_up.f32bin", "ffn_up" },
            { "blk_0_ffn_down_weight.f32bin", "act_blk0_ffn_down_input.f32bin", "imatrix_blk0_ffn_down.f32bin", "ffn_down" },
            { "blk_0_attn_q_weight.f32bin", "act_blk0_attn_input.f32bin", "imatrix_blk0_attn_qkv.f32bin", "attn_q" },
        };

        for (auto & p : pairs) {
            char wp[512], ap[512], imp[512];
            snprintf(wp, sizeof(wp), "%s/%s", data_dir, p.wf);
            snprintf(ap, sizeof(ap), "%s/%s", data_dir, p.af);
            snprintf(imp, sizeof(imp), "%s/%s", data_dir, p.imf);
            std::vector<float> wd, ad, im;
            int64_t wnr, wnc, anr, anc;
            if (!load_f32_tensor(wp, wd, wnr, wnc) || !load_f32_tensor(ap, ad, anr, anc)) {
                continue;
            }
            const float * im_ptr = nullptr;
            if (load_imatrix(imp, im, wnc)) {
                im_ptr = im.data();
            } else {
                printf(" [%s] No imatrix found, using uniform weights\n", p.name);
            }
            compare_approaches(wd.data(), wnr, wnc, ad.data(), anr, anc, p.name, im_ptr);
        }
        printf("\n");
        return 0;
    }

  private:
    std::mt19937 gen; // shared RNG for the synthetic-data generators
};
|
||||
|
||||
// ============================================================================
|
||||
// Main
|
||||
// ============================================================================
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
ggml_backend_load_all();
|
||||
|
||||
QuantLaboratory lab;
|
||||
int total_fail = 0;
|
||||
|
||||
printf("Quantization Laboratory\n");
|
||||
printf("=======================\n");
|
||||
|
||||
// Real data tests (from data/ directory)
|
||||
{
|
||||
const char * data_dir = "data";
|
||||
if (argc > 1) {
|
||||
data_dir = argv[1];
|
||||
}
|
||||
|
||||
char probe[512];
|
||||
snprintf(probe, sizeof(probe), "%s/blk_0_ffn_gate_weight.f32bin", data_dir);
|
||||
FILE * fp = fopen(probe, "rb");
|
||||
if (fp) {
|
||||
fclose(fp);
|
||||
total_fail += lab.test_approach_comparison(data_dir);
|
||||
} else {
|
||||
printf("\n=== Real Data Tests SKIPPED ===\n");
|
||||
printf(" No data found at %s\n", data_dir);
|
||||
printf(
|
||||
" Run: cd data && PYTHONPATH=../gguf-py python3 ../scripts/extract-tensor-data.py MODEL.gguf "
|
||||
"blk.0.ffn_gate blk.0.ffn_up blk.0.ffn_down blk.0.attn_q\n");
|
||||
printf(" And: llama-capture-layer-data -m MODEL.gguf -l 0 -o data\n");
|
||||
}
|
||||
}
|
||||
|
||||
printf("\n\n=== Testing Complete: %d failures ===\n", total_fail);
|
||||
|
||||
return total_fail > 0 ? 1 : 0;
|
||||
}
|
||||
|
|
@ -54,7 +54,7 @@ static float total_quantization_error(const ggml_type_traits * qfns, const ggml_
|
|||
std::vector<float> tmp_out(test_size);
|
||||
|
||||
qfns_cpu->from_float(test_data, tmp_q.data(), test_size);
|
||||
qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
|
||||
qfns->to_float(tmp_q.data(), tmp_out.data(), test_size, nullptr);
|
||||
return array_rmse(test_data, tmp_out.data(), test_size);
|
||||
}
|
||||
|
||||
|
|
@ -66,10 +66,10 @@ static float reference_quantization_error(const ggml_type_traits * qfns, const g
|
|||
|
||||
// FIXME: why is done twice?
|
||||
qfns_cpu->from_float(test_data, tmp_q.data(), test_size);
|
||||
qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
|
||||
qfns->to_float(tmp_q.data(), tmp_out.data(), test_size, nullptr);
|
||||
|
||||
qfns->from_float_ref(test_data, tmp_q.data(), test_size);
|
||||
qfns->to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
|
||||
qfns->to_float(tmp_q.data(), tmp_out_ref.data(), test_size, nullptr);
|
||||
|
||||
return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
|
||||
}
|
||||
|
|
@ -95,7 +95,7 @@ static float dot_product_error(const ggml_type_traits * qfns, const ggml_type_tr
|
|||
vdot->from_float(test_data2, tmp_q2.data(), test_size);
|
||||
|
||||
float result = INFINITY;
|
||||
qfns_cpu->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
|
||||
qfns_cpu->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1, nullptr);
|
||||
|
||||
const float dot_ref = dot_product(test_data1, test_data2, test_size);
|
||||
|
||||
|
|
|
|||
|
|
@ -309,7 +309,7 @@ int main(int argc, char * argv[]) {
|
|||
for (size_t size : params.test_sizes) {
|
||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||
auto quantize_fn = [&](void) -> float {
|
||||
qfns->to_float(test_q1, test_out, size);
|
||||
qfns->to_float(test_q1, test_out, size, nullptr);
|
||||
return test_out[0];
|
||||
};
|
||||
size_t quantized_size = ggml_row_size(type, size);
|
||||
|
|
@ -341,7 +341,7 @@ int main(int argc, char * argv[]) {
|
|||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||
auto quantize_fn = [&](void) -> float {
|
||||
float result;
|
||||
qfns_cpu->vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
|
||||
qfns_cpu->vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1, nullptr);
|
||||
return result;
|
||||
};
|
||||
size_t quantized_size = ggml_row_size(type, size);
|
||||
|
|
|
|||
|
|
@ -158,7 +158,7 @@ static void test_roundtrip_on_chunk(
|
|||
} else {
|
||||
qfns_cpu.from_float(input_scratch, quantized_scratch, chunk_size);
|
||||
}
|
||||
qfns.to_float(quantized_scratch, output_scratch, chunk_size);
|
||||
qfns.to_float(quantized_scratch, output_scratch, chunk_size, nullptr);
|
||||
|
||||
update_error_stats(chunk_size, input_scratch, output_scratch, stats);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -38,5 +38,6 @@ else()
|
|||
add_subdirectory(export-lora)
|
||||
endif()
|
||||
add_subdirectory(fit-params)
|
||||
add_subdirectory(capture-layer-data)
|
||||
add_subdirectory(results)
|
||||
endif()
|
||||
|
|
|
|||
|
|
@ -0,0 +1,9 @@
|
|||
set(TARGET llama-capture-layer-data)
|
||||
add_executable(${TARGET} capture-layer-data.cpp)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_include_directories(${TARGET} PRIVATE ../../common)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
if(LLAMA_TOOLS_INSTALL)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
endif()
|
||||
|
|
@ -0,0 +1,251 @@
|
|||
// capture-layer-data.cpp
|
||||
// Captures intermediate activation tensors during model inference
|
||||
// and saves them as .f32bin files for the quantization laboratory.
|
||||
//
|
||||
// Usage:
|
||||
// llama-capture-layer-data -m MODEL_PATH -l LAYER [-p PROMPT] [-o OUTPUT_DIR]
|
||||
//
|
||||
// Example:
|
||||
// llama-capture-layer-data -m /devel/models/Qwen_Qwen3-4B-Instruct-2507-bf16.gguf -l 0 -o data
|
||||
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
#include "log.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// Maps a graph tensor-name prefix (as it appears in the compute graph) to the
// suffix used in the captured activation filename.
struct TensorMapping {
    const char * graph_name_prefix;
    const char * output_suffix;
};

// Activations captured per layer: attention input, attention-output input,
// FFN input, and FFN-down input.
static const TensorMapping mappings[] = {
    { "attn_norm", "attn_input" },
    { "kqv_out", "attn_output_input" },
    { "ffn_norm", "ffn_input" },
    { "ffn_swiglu", "ffn_down_input" },
};
static constexpr int N_MAPPINGS = sizeof(mappings) / sizeof(mappings[0]);

// Mutable state shared with the graph-eval callback via user_data.
struct CaptureState {
    int target_layer;            // layer index whose activations are captured
    std::string output_dir;      // directory receiving the .f32bin files
    int captured_count = 0;      // number of tensors successfully saved
    std::string pending_name;    // tensor name accepted in the ask-phase, awaiting data

    // Translate a graph tensor name (e.g. "ffn_norm-0") into the output
    // filename "act_blk<layer>_<suffix>.f32bin"; returns "" if the name
    // matches no registered mapping prefix.
    std::string graph_to_filename(const char * graph_name) const {
        for (int i = 0; i < N_MAPPINGS; i++) {
            std::string prefix = mappings[i].graph_name_prefix;
            if (strncmp(graph_name, prefix.c_str(), prefix.size()) == 0) {
                char buf[256];
                snprintf(buf, sizeof(buf), "act_blk%d_%s.f32bin", target_layer, mappings[i].output_suffix);
                return std::string(buf);
            }
        }
        return "";
    }
};

// NOTE(review): assigned in main() but never read anywhere in this file —
// the callback receives its state through user_data; candidate for removal.
static CaptureState * g_capture_state = nullptr;
|
||||
|
||||
// Serialize a tensor to the laboratory's .f32bin format:
//   [int64 n_rows][int64 row_len][f32 data]
// F32 data is copied as-is; F16/BF16 are converted to F32. Other tensor
// types are rejected with an error log and nothing is written.
static void save_tensor_as_f32bin(const ggml_tensor * t, const std::string & filepath) {
    int64_t n_rows = t->ne[1];
    int64_t row_len = t->ne[0];

    // Total element count across all 4 dims.
    // NOTE(review): the header only records ne[1] x ne[0]; if ne[2]*ne[3] > 1
    // the data written exceeds what the header describes — presumably only 2D
    // activations are captured here, confirm against the callback's tensors.
    int64_t total = 1;
    for (int i = 0; i < GGML_MAX_DIMS; i++) {
        total *= t->ne[i];
    }

    std::vector<float> f32_data(total);

    if (t->type == GGML_TYPE_F32) {
        const float * src = (const float *) t->data;
        if (!src) {
            LOG_ERR("Tensor %s has null data pointer\n", t->name);
            return;
        }
        memcpy(f32_data.data(), src, total * sizeof(float));
    } else if (t->type == GGML_TYPE_F16) {
        const ggml_fp16_t * src = (const ggml_fp16_t *) t->data;
        for (int64_t i = 0; i < total; i++) {
            f32_data[i] = ggml_fp16_to_fp32(src[i]);
        }
    } else if (t->type == GGML_TYPE_BF16) {
        const ggml_bf16_t * src = (const ggml_bf16_t *) t->data;
        for (int64_t i = 0; i < total; i++) {
            f32_data[i] = ggml_bf16_to_fp32(src[i]);
        }
    } else {
        LOG_ERR("Unsupported tensor type %s for %s\n", ggml_type_name(t->type), t->name);
        return;
    }

    std::ofstream file(filepath, std::ios::binary);
    if (!file) {
        LOG_ERR("Failed to open %s for writing\n", filepath.c_str());
        return;
    }

    // Header: [n_rows, row_len] as two int64, matching load_f32_tensor's layout.
    file.write(reinterpret_cast<const char *>(&n_rows), sizeof(int64_t));
    file.write(reinterpret_cast<const char *>(&row_len), sizeof(int64_t));
    file.write(reinterpret_cast<const char *>(f32_data.data()), total * sizeof(float));

    file.close();
    LOG(" Captured: %s -> %s (%lld x %lld, %s)\n", t->name, filepath.c_str(), (long long) n_rows, (long long) row_len,
        ggml_type_name(t->type));
}
|
||||
|
||||
// Graph-eval callback (wired via common_params::cb_eval). The runtime calls
// it twice per tensor: first with ask=true to decide whether the tensor's
// data should be made available, then with ask=false once it is, at which
// point the tensor is saved to disk. Returning true continues evaluation.
static bool capture_callback(ggml_tensor * t, bool ask, void * user_data) {
    auto * state = (CaptureState *) user_data;

    if (ask) {
        // Accept only tensors named "<mapping-prefix>-<target_layer>",
        // e.g. "ffn_norm-0"; remember the name for the data-phase call.
        char target[128];
        for (int i = 0; i < N_MAPPINGS; i++) {
            snprintf(target, sizeof(target), "%s-%d", mappings[i].graph_name_prefix, state->target_layer);
            if (strcmp(t->name, target) == 0) {
                state->pending_name = t->name;
                return true;
            }
        }
        return false;
    }

    // Data phase: ignore any tensor other than the one accepted above.
    if (state->pending_name.empty()) {
        return true;
    }
    if (strcmp(t->name, state->pending_name.c_str()) != 0) {
        return true;
    }

    if (!ggml_backend_buffer_is_host(t->buffer)) {
        size_t nbytes = ggml_nbytes(t);
        std::vector<uint8_t> tmp(nbytes);
        ggml_backend_tensor_get(t, tmp.data(), 0, nbytes);
        // NOTE(review): `tmp` is discarded here, yet save_tensor_as_f32bin()
        // below still reads t->data directly — for a non-host buffer that
        // pointer may not be dereferenceable. The copied bytes should be what
        // gets saved; confirm and fix the data path.
        LOG_WRN("Tensor %s is not host-accessible, data copied via backend\n", t->name);
    }

    std::string filename = state->graph_to_filename(t->name);
    if (!filename.empty()) {
        std::filesystem::create_directories(state->output_dir);
        std::string filepath = (std::filesystem::path(state->output_dir) / filename).string();
        save_tensor_as_f32bin(t, filepath);
        state->captured_count++;
    }

    // One capture per accepted name; clear so later graph nodes are ignored.
    state->pending_name.clear();
    return true;
}
|
||||
|
||||
// Print command-line help for the capture tool to the log.
static void print_usage(void) {
    LOG("Usage: llama-capture-layer-data -m MODEL_PATH [-l LAYER] [-p PROMPT] [-o OUTPUT_DIR]\n");
    LOG("\n");
    LOG(" -m MODEL Path to GGUF model (BF16/F16 recommended)\n");
    LOG(" -l LAYER Target layer index (default: 0)\n");
    LOG(" -p PROMPT Inference prompt (default: \"The quick brown fox...\")\n");
    LOG(" -o DIR Output directory for .f32bin files (default: data)\n");
}
|
||||
|
||||
// Tool entry point: load a model, register the capture callback for one
// layer, run a single decode over the prompt, and write the captured
// activations as .f32bin files. Exit code 1 on any failure or if nothing
// was captured; 0 otherwise.
int main(int argc, char ** argv) {
    // Minimum viable invocation is "-m PATH" (two args), hence argc < 3.
    if (argc < 3 || (std::string(argv[1]) == "-h" || std::string(argv[1]) == "--help")) {
        print_usage();
        return 1;
    }

    common_params params;
    int layer = 0;
    std::string output_dir = "data";
    std::string prompt = "The quick brown fox jumps over the lazy dog.";
    std::string model_path;

    // Minimal flag parsing; unknown flags are silently ignored.
    // NOTE(review): atoi() reports no errors — a malformed "-l" silently
    // becomes layer 0; consider strtol if stricter parsing is wanted.
    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];
        if (arg == "-m" && i + 1 < argc) {
            model_path = argv[++i];
        } else if (arg == "-l" && i + 1 < argc) {
            layer = atoi(argv[++i]);
        } else if (arg == "-p" && i + 1 < argc) {
            prompt = argv[++i];
        } else if (arg == "-o" && i + 1 < argc) {
            output_dir = argv[++i];
        }
    }

    if (model_path.empty()) {
        LOG_ERR("Error: -m MODEL_PATH is required\n\n");
        print_usage();
        return 1;
    }

    // CPU-only evaluation (n_gpu_layers = 0) so tensor data stays host-readable.
    params.model.path = model_path;
    params.prompt = prompt;
    params.n_batch = 512;
    params.n_ubatch = 512;
    params.n_gpu_layers = 0;
    params.fit_params = false;

    CaptureState state;
    state.target_layer = layer;
    state.output_dir = output_dir;
    g_capture_state = &state; // NOTE(review): never read; state flows via user_data below

    // Hook the capture callback into graph evaluation.
    params.cb_eval = capture_callback;
    params.cb_eval_user_data = &state;

    LOG("Loading model: %s\n", model_path.c_str());
    LOG("Target layer: %d\n", layer);
    LOG("Output directory: %s\n", output_dir.c_str());

    common_init();
    ggml_backend_load_all();
    llama_backend_init();
    llama_numa_init(params.numa);

    auto llama_init = common_init_from_params(params);
    if (!llama_init) {
        LOG_ERR("Failed to load model\n");
        return 1;
    }

    auto * model = llama_init->model();
    auto * ctx = llama_init->context();

    if (model == nullptr || ctx == nullptr) {
        LOG_ERR("Failed to initialize context\n");
        return 1;
    }

    LOG("Model loaded successfully\n");

    const llama_vocab * vocab = llama_model_get_vocab(model);
    const bool add_bos = llama_vocab_get_add_bos(vocab);
    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);

    if (tokens.empty()) {
        LOG_ERR("No tokens generated from prompt\n");
        return 1;
    }

    LOG("Tokenizing prompt: %zu tokens\n", tokens.size());
    LOG("Running inference...\n");

    // A single prompt-processing decode is enough: the callback fires for
    // every graph node during this evaluation.
    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
        LOG_ERR("llama_decode failed\n");
        return 1;
    }

    LOG("\nDone. Captured %d tensors to %s/\n", state.captured_count, output_dir.c_str());

    llama_backend_free();

    return state.captured_count == 0 ? 1 : 0;
}
|
||||
|
|
@ -318,7 +318,7 @@ struct lora_merge_ctx {
|
|||
auto nels = ggml_nelements(inp_base);
|
||||
const auto * qtype = ggml_get_type_traits(base->type);
|
||||
std::vector<uint8_t> dequant_buf(nels * sizeof(float));
|
||||
qtype->to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
|
||||
qtype->to_float(read_buf.data(), (float *)dequant_buf.data(), nels, nullptr);
|
||||
ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size());
|
||||
} else {
|
||||
ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base));
|
||||
|
|
|
|||
|
|
@ -46,6 +46,13 @@ static const std::vector<quant_option> QUANT_OPTIONS = {
|
|||
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", },
|
||||
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", },
|
||||
{ "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", },
|
||||
{ "Q3_PT", LLAMA_FTYPE_MOSTLY_Q3_PT, " 3.25 bpw quantization", },
|
||||
{ "Q3_KPT", LLAMA_FTYPE_MOSTLY_Q3_KPT, " Q3_K with learned per-tensor levels" },
|
||||
{ "Q4_DPT", LLAMA_FTYPE_MOSTLY_Q4_DPT, " IQ4_NL with learned per-tensor int8 levels" },
|
||||
{ "Q2_KPT", LLAMA_FTYPE_MOSTLY_Q2_KPT, " Q2_K with learned per-tensor float levels" },
|
||||
{ "IQ2_TQ", LLAMA_FTYPE_MOSTLY_IQ2_TQ, " 2.0625 bpw, trellis quantized" },
|
||||
{ "IQ3_TQ", LLAMA_FTYPE_MOSTLY_IQ3_TQ, " 3.5625 bpw, per-tensor trained grid" },
|
||||
{ "IQ1_BN", LLAMA_FTYPE_MOSTLY_IQ1_BN, " 1.5625 bpw, 8D vector quantized" },
|
||||
{ "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", },
|
||||
{ "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", },
|
||||
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
|
||||
|
|
@ -162,6 +169,9 @@ static void usage(const char * executable) {
|
|||
printf(" WARNING: this is an advanced option, use with care.\n");
|
||||
printf(" --dry-run\n");
|
||||
printf(" calculate and show the final quantization size without performing quantization\n");
|
||||
printf(" --threads n\n");
|
||||
printf(" number of threads to use for cross-tensor parallelization (default: 0, use same as within-tensor)\n");
|
||||
printf(" when n > 0, enables parallel quantization of multiple tensors simultaneously\n");
|
||||
printf(" example: llama-quantize --dry-run model-f32.gguf Q4_K\n\n");
|
||||
printf("note: --include-weights and --exclude-weights cannot be used together\n\n");
|
||||
printf("-----------------------------------------------------------------------------\n");
|
||||
|
|
@ -565,6 +575,8 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
} else if (strcmp(argv[arg_idx], "--keep-split") == 0) {
|
||||
params.keep_split = true;
|
||||
} else if (strcmp(argv[arg_idx], "--keep-split") == 0) {
|
||||
params.keep_split = true;
|
||||
} else {
|
||||
usage(argv[0]);
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue