// test-quant-laboratory.cpp
//
// Reusable testing harness for quantization experiments.
//
// Provides:
//   - Synthetic data generators (Gaussian, Laplace, uniform)
//   - Real tensor data loading (f32bin format with [nrow, ncol] header)
//   - Importance matrix loading (flat f32 array)
//   - RMSE computation
//   - Multi-approach comparison framework (quantize → dequantize → matmul error)
//   - ggml graph-level verification skeleton
//
// To add a new experiment:
//   1. Add an approach function: void approach_xxx(const float *W, float *out,
//                                                  int64_t nrow, int64_t ncol,
//                                                  const float *imatrix)
//   2. Register it in compare_approaches()
//   3. Call test_approach_comparison() from main()
|
|
#include "../ggml/src/ggml-quants.h"
|
|
#include "ggml-backend.h"
|
|
#include "ggml-alloc.h"
|
|
#include "ggml.h"
|
|
|
|
#include <algorithm>
|
|
#include <cassert>
|
|
#include <cmath>
|
|
#include <cstdio>
|
|
#include <cstdlib>
|
|
#include <cstring>
|
|
#include <functional>
|
|
#include <random>
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
// ============================================================================
// Helper functions
// ============================================================================
// Root-mean-square error between two float arrays of length n.
// Accumulates in double to avoid precision loss on long arrays.
// Returns 0 for n == 0 (the original 0/0 division produced NaN).
static float rmse(const float * a, const float * b, size_t n) {
    if (n == 0) {
        return 0.0f;
    }
    double sum = 0.0;
    for (size_t i = 0; i < n; ++i) {
        double d = (double) a[i] - (double) b[i];
        sum += d * d;
    }
    return (float) sqrt(sum / n);
}
// Fill `data` with n samples drawn from N(0, sigma^2) using `gen`.
static void fill_gaussian(float * data, size_t n, std::mt19937 & gen, float sigma = 1.0f) {
    std::normal_distribution<float> dist(0.0f, sigma);
    std::generate(data, data + n, [&]() { return dist(gen); });
}
// Fill `data` with n samples from a Laplace(0, b) distribution using `gen`,
// via inverse-CDF sampling: x = -b * sign(u) * ln(1 - 2|u|), u ~ U(-0.5, 0.5).
//
// Fix: uniform_real_distribution's lower bound is inclusive, so u can be
// exactly -0.5, which made the original compute logf(0) = -inf and emit a
// non-finite sample. Clamp the log argument to a tiny positive value so every
// sample is finite.
static void fill_laplace(float * data, size_t n, std::mt19937 & gen, float b = 1.0f) {
    std::uniform_real_distribution<float> u(-0.5f, 0.5f);
    for (size_t i = 0; i < n; ++i) {
        float v = u(gen);
        float sign = (float) ((v > 0) - (v < 0));
        float arg = 1.0f - 2.0f * fabsf(v);
        if (arg <= 0.0f) {
            arg = 1e-30f; // avoids logf(0) when v hits the inclusive -0.5 bound
        }
        data[i] = -b * sign * logf(arg);
    }
}
// Fill `data` with n samples drawn uniformly from [-range, range) using `gen`.
static void fill_uniform(float * data, size_t n, std::mt19937 & gen, float range = 1.0f) {
    std::uniform_real_distribution<float> dist(-range, range);
    std::generate(data, data + n, [&]() { return dist(gen); });
}
// Fill `data` with n samples from N(offset, sigma^2) — a Gaussian whose mean
// is shifted away from zero.
static void fill_offset_gaussian(float * data, size_t n, std::mt19937 & gen, float sigma = 1.0f, float offset = 2.0f) {
    std::normal_distribution<float> dist(offset, sigma);
    std::generate(data, data + n, [&]() { return dist(gen); });
}
// ============================================================================
// Data loading
// ============================================================================
// Load a tensor in "f32bin" format: an [nrow, n_per_row] int64 header
// followed by nrow * n_per_row floats, row-major.
//
// On success fills `data`, `nrow`, `n_per_row` and returns true.
// Returns false on open/read failure or a nonsensical header.
static bool load_f32_tensor(const char * path, std::vector<float> & data, int64_t & nrow, int64_t & n_per_row) {
    FILE * f = fopen(path, "rb");
    if (!f) {
        return false;
    }

    int64_t header[2];
    if (fread(header, sizeof(int64_t), 2, f) != 2) {
        fclose(f);
        return false;
    }
    nrow      = header[0];
    n_per_row = header[1];

    // Reject corrupt headers before resize(): a non-positive or overflowing
    // product would otherwise become an enormous (or UB) allocation.
    if (nrow <= 0 || n_per_row <= 0 || nrow > INT64_MAX / n_per_row) {
        fclose(f);
        return false;
    }

    int64_t total = nrow * n_per_row;
    data.resize(total);
    size_t nread = fread(data.data(), sizeof(float), total, f);
    fclose(f);
    return (int64_t) nread == total;
}
// Load imatrix file (flat f32 array, no header, one importance value per
// column dimension). The imatrix is the sum-of-squares of activations per
// dimension. The dimension count is inferred from the file size; if
// expected_dims > 0 and the file disagrees, a warning is printed and false
// is returned. Prints min/max/mean stats on success.
static bool load_imatrix(const char * path, std::vector<float> & data, int64_t expected_dims) {
    FILE * f = fopen(path, "rb");
    if (!f) {
        return false;
    }

    // Get file size to determine dimensions
    fseek(f, 0, SEEK_END);
    long file_size = ftell(f);
    fseek(f, 0, SEEK_SET);

    // ftell() returns -1 on error, and an empty file has no dims. Bail out
    // early — the original read data[0] from an empty vector (out-of-bounds)
    // and mixed a possibly-negative long with unsigned sizeof() arithmetic.
    if (file_size <= 0) {
        fclose(f);
        return false;
    }

    int64_t dims = file_size / (int64_t) sizeof(float);
    if (expected_dims > 0 && dims != expected_dims) {
        printf(" WARN: imatrix dims %lld != expected %lld\n", (long long) dims, (long long) expected_dims);
        fclose(f);
        return false;
    }

    data.resize(dims);
    size_t nread = fread(data.data(), sizeof(float), dims, f);
    fclose(f);
    if ((int64_t) nread != dims) {
        return false;
    }

    // Compute stats (dims >= 1 is guaranteed by the file_size check above)
    float imin = data[0], imax = data[0], isum = 0;
    for (int64_t i = 0; i < dims; i++) {
        if (data[i] < imin) imin = data[i];
        if (data[i] > imax) imax = data[i];
        isum += data[i];
    }
    printf(" Loaded imatrix: %lld dims, min=%.6f, max=%.6f, mean=%.6f\n",
           (long long) dims, imin, imax, isum / dims);

    return true;
}
// ============================================================================
// Test class
// ============================================================================
// Experiment harness: owns a seeded RNG for the synthetic-data generators and
// bundles the multi-approach comparison framework driven from main().
class QuantLaboratory {
  public:
    // Fixed seed so synthetic-data experiments are reproducible run-to-run.
    QuantLaboratory() : gen(42) {}

    // ========================================================================
    // MULTI-APPROACH COMPARISON FRAMEWORK
    //
    // Each "approach" is a function that takes float weights and produces
    // dequantized float output. The framework computes:
    //   - Weight RMSE (dequant vs original)
    //   - Matmul error (dequant weights x real activations vs f64 reference)
    //   - Ratio vs first approach (typically Q2_K baseline)
    //
    // To add a new approach:
    //   1. Write: void approach_xxx(const float *W, float *out,
    //                               int64_t nrow, int64_t ncol,
    //                               const float *imatrix) { ... }
    //   2. Add it to the `approaches` array in compare_approaches()
    // ========================================================================

    // -- Example approach: Q2_K baseline (via ggml library) --
    // Uncomment and adapt for your experiment:
    //
    // void approach_q2k(const float * W, float * out, int64_t nrow, int64_t ncol, const float * imatrix) {
    //     size_t rs = ggml_row_size(GGML_TYPE_Q2_K, ncol);
    //     std::vector<uint8_t> buf(nrow * rs);
    //     quantize_q2_K(W, buf.data(), nrow, ncol, imatrix);
    //     auto * tr = ggml_get_type_traits(GGML_TYPE_Q2_K);
    //     for (int64_t r = 0; r < nrow; r++) {
    //         tr->to_float(buf.data() + r * rs, out + r * ncol, ncol, NULL);
    //     }
    // }

    // Run every registered approach on one (weights, activations) pair and
    // print a table row per approach.
    //
    // W       : row-major weights, w_nrow rows of w_ncol floats
    // A       : row-major activations, a_nrow rows of a_ncol floats;
    //           a_ncol must equal w_ncol (each activation row is dotted
    //           against each weight row) or the call is a silent no-op
    // name    : label printed in the table header
    // imatrix : optional per-column importance values (may be NULL)
    void compare_approaches(const float * W,
                            int64_t w_nrow,
                            int64_t w_ncol,
                            const float * A,
                            int64_t a_nrow,
                            int64_t a_ncol,
                            const char * name,
                            const float * imatrix) {
        // Mismatched shapes would make the dot products below meaningless.
        if (w_ncol != a_ncol) {
            return;
        }
        // Cap weight rows at 256 to bound the O(a_nrow * nr * nc) reference
        // matmul; that is enough rows for a stable error comparison.
        int64_t nr = std::min(w_nrow, (int64_t) 256);
        int64_t nc = w_ncol;

        // Reference matmul (double precision): ref[t*nr + r] = A_t . W_r
        std::vector<double> ref(a_nrow * nr);
        for (int64_t t = 0; t < a_nrow; t++) {
            for (int64_t r = 0; r < nr; r++) {
                double s = 0;
                for (int64_t c = 0; c < nc; c++) {
                    s += (double) A[t * a_ncol + c] * (double) W[r * nc + c];
                }
                ref[t * nr + r] = s;
            }
        }
        // RMS magnitude of the reference output — useful when judging whether
        // the absolute matmul errors below are large or small.
        double ref_mag2 = 0;
        for (auto v : ref) {
            ref_mag2 += v * v;
        }
        float ref_rms = (float) sqrt(ref_mag2 / (a_nrow * nr));
        (void) ref_rms; // currently unused; kept for ad-hoc debugging

        struct Approach {
            const char * name; // label shown in the table
            float bpw;         // nominal bits-per-weight of the scheme
            std::function<void(const float *, float *, int64_t, int64_t, const float *)> fn;
        };

        // ── Register approaches here ──
        Approach approaches[] = {
            // { "Q2_K (baseline)", 2.625f,
            //   [&](auto * W, auto * o, auto nr, auto nc, auto * im) {
            //       approach_q2k(W, o, nr, nc, im);
            //   } },
            // Add more approaches...
            { "placeholder", 0.0f, nullptr }, // remove once real approaches added
        };

        printf("\n %-28s %5s %10s %10s %7s\n", name, "BPW", "RMSE", "MatmulErr", "vs Q2K");
        printf(" %-28s %5s %10s %10s %7s\n", "---", "---", "---", "---", "---");

        float baseline_matmul_err = 0;
        for (auto & ap : approaches) {
            // Entries with a null fn (e.g. the placeholder) are skipped.
            if (!ap.fn) {
                continue;
            }
            std::vector<float> dec(nr * nc);
            ap.fn(W, dec.data(), nr, nc, imatrix);

            // Weight RMSE (dequantized weights vs originals)
            double werr2 = 0;
            for (int64_t i = 0; i < nr * nc; i++) {
                double d = W[i] - dec[i];
                werr2 += d * d;
            }
            float wrmse = (float) sqrt(werr2 / (nr * nc));

            // Matmul error: dequantized weights x activations vs f64 reference
            double merr2 = 0;
            for (int64_t t = 0; t < a_nrow; t++) {
                for (int64_t r = 0; r < nr; r++) {
                    double s = 0;
                    for (int64_t c = 0; c < nc; c++) {
                        s += (double) A[t * a_ncol + c] * (double) dec[r * nc + c];
                    }
                    double d = s - ref[t * nr + r];
                    merr2 += d * d;
                }
            }
            float matmul_rmse = (float) sqrt(merr2 / (a_nrow * nr));

            // The first approach that runs becomes the baseline for the ratio
            // column (intended to be the Q2_K baseline once registered).
            if (baseline_matmul_err == 0) {
                baseline_matmul_err = matmul_rmse;
            }
            float ratio = (baseline_matmul_err > 1e-10f) ? matmul_rmse / baseline_matmul_err : 0;

            printf(" %-28s %5.3f %10.6f %10.6f %6.3fx\n", ap.name, ap.bpw, wrmse, matmul_rmse, ratio);
        }
    }

    // Run comparison on all tensor pairs from data directory.
    // Pairs whose files fail to load are skipped silently; a missing imatrix
    // only downgrades that pair to uniform importance (imatrix == NULL).
    // Returns 0 (reserved as a failure count for future use).
    int test_approach_comparison(const char * data_dir) {
        printf("\n");
        printf("=======================================================================\n");
        printf(" MULTI-APPROACH COMPARISON (real weights x real activations)\n");
        printf("=======================================================================\n");

        // Each pair: weight file, activation file, imatrix file, display name.
        struct TestPair {
            const char * wf;
            const char * af;
            const char * imf;
            const char * name;
        } pairs[] = {
            { "blk_0_ffn_gate_weight.f32bin", "act_blk0_ffn_input.f32bin", "imatrix_blk0_ffn_gate_up.f32bin", "ffn_gate" },
            { "blk_0_ffn_up_weight.f32bin", "act_blk0_ffn_input.f32bin", "imatrix_blk0_ffn_gate_up.f32bin", "ffn_up" },
            { "blk_0_ffn_down_weight.f32bin", "act_blk0_ffn_down_input.f32bin", "imatrix_blk0_ffn_down.f32bin", "ffn_down" },
            { "blk_0_attn_q_weight.f32bin", "act_blk0_attn_input.f32bin", "imatrix_blk0_attn_qkv.f32bin", "attn_q" },
        };

        for (auto & p : pairs) {
            char wp[512], ap[512], imp[512];
            snprintf(wp, sizeof(wp), "%s/%s", data_dir, p.wf);
            snprintf(ap, sizeof(ap), "%s/%s", data_dir, p.af);
            snprintf(imp, sizeof(imp), "%s/%s", data_dir, p.imf);
            std::vector<float> wd, ad, im;
            int64_t wnr, wnc, anr, anc;
            // Both the weight and activation tensors must load; otherwise skip
            // this pair (it simply was not extracted).
            if (!load_f32_tensor(wp, wd, wnr, wnc) || !load_f32_tensor(ap, ad, anr, anc)) {
                continue;
            }
            const float * im_ptr = nullptr;
            if (load_imatrix(imp, im, wnc)) {
                im_ptr = im.data();
            } else {
                printf(" [%s] No imatrix found, using uniform weights\n", p.name);
            }
            compare_approaches(wd.data(), wnr, wnc, ad.data(), anr, anc, p.name, im_ptr);
        }
        printf("\n");
        return 0;
    }

  private:
    std::mt19937 gen; // shared RNG for the synthetic-data generators
};
// ============================================================================
// Main
// ============================================================================
int main(int argc, char ** argv) {
|
|
ggml_backend_load_all();
|
|
|
|
QuantLaboratory lab;
|
|
int total_fail = 0;
|
|
|
|
printf("Quantization Laboratory\n");
|
|
printf("=======================\n");
|
|
|
|
// Real data tests (from data/ directory)
|
|
{
|
|
const char * data_dir = "data";
|
|
if (argc > 1) {
|
|
data_dir = argv[1];
|
|
}
|
|
|
|
char probe[512];
|
|
snprintf(probe, sizeof(probe), "%s/blk_0_ffn_gate_weight.f32bin", data_dir);
|
|
FILE * fp = fopen(probe, "rb");
|
|
if (fp) {
|
|
fclose(fp);
|
|
total_fail += lab.test_approach_comparison(data_dir);
|
|
} else {
|
|
printf("\n=== Real Data Tests SKIPPED ===\n");
|
|
printf(" No data found at %s\n", data_dir);
|
|
printf(
|
|
" Run: cd data && PYTHONPATH=../gguf-py python3 ../scripts/extract-tensor-data.py MODEL.gguf "
|
|
"blk.0.ffn_gate blk.0.ffn_up blk.0.ffn_down blk.0.attn_q\n");
|
|
printf(" And: llama-capture-layer-data -m MODEL.gguf -l 0 -o data\n");
|
|
}
|
|
}
|
|
|
|
printf("\n\n=== Testing Complete: %d failures ===\n", total_fail);
|
|
|
|
return total_fail > 0 ? 1 : 0;
|
|
}
|