// test-quant-laboratory.cpp // Reusable testing harness for quantization experiments. // // Provides: // - Synthetic data generators (Gaussian, Laplace, uniform) // - Real tensor data loading (f32bin format with [nrow, ncol] header) // - Importance matrix loading (flat f32 array) // - RMSE computation // - Multi-approach comparison framework (quantize → dequantize → matmul error) // - ggml graph-level verification skeleton // // To add a new experiment: // 1. Add an approach function: void approach_xxx(const float *W, float *out, // int64_t nrow, int64_t ncol, // const float *imatrix) // 2. Register it in compare_approaches() // 3. Call test_approach_comparison() from main() #include "../ggml/src/ggml-quants.h" #include "ggml-backend.h" #include "ggml-alloc.h" #include "ggml.h" #include #include #include #include #include #include #include #include #include #include // ============================================================================ // Helper functions // ============================================================================ static float rmse(const float * a, const float * b, size_t n) { double sum = 0.0; for (size_t i = 0; i < n; ++i) { double d = (double) a[i] - (double) b[i]; sum += d * d; } return (float) sqrt(sum / n); } static void fill_gaussian(float * data, size_t n, std::mt19937 & gen, float sigma = 1.0f) { std::normal_distribution dist(0.0f, sigma); for (size_t i = 0; i < n; ++i) { data[i] = dist(gen); } } static void fill_laplace(float * data, size_t n, std::mt19937 & gen, float b = 1.0f) { std::uniform_real_distribution u(-0.5f, 0.5f); for (size_t i = 0; i < n; ++i) { float v = u(gen); data[i] = -b * ((v > 0) - (v < 0)) * logf(1.0f - 2.0f * fabsf(v)); } } static void fill_uniform(float * data, size_t n, std::mt19937 & gen, float range = 1.0f) { std::uniform_real_distribution dist(-range, range); for (size_t i = 0; i < n; ++i) { data[i] = dist(gen); } } static void fill_offset_gaussian(float * data, size_t n, std::mt19937 & gen, float sigma = 1.0f, float offset = 2.0f) { std::normal_distribution dist(offset, sigma); for (size_t i = 0; i < n; ++i) { data[i] = dist(gen); } } // ============================================================================ // Data loading // ============================================================================ static bool load_f32_tensor(const char * path, std::vector & data, int64_t & nrow, int64_t & n_per_row) { FILE * f = fopen(path, "rb"); if (!f) { return false; } int64_t header[2]; if (fread(header, sizeof(int64_t), 2, f) != 2) { fclose(f); return false; } nrow = header[0]; n_per_row = header[1]; int64_t total = nrow * n_per_row; data.resize(total); size_t nread = fread(data.data(), sizeof(float), total, f); fclose(f); if ((int64_t) nread != total) { return false; } return true; } // Load imatrix file (flat f32 array, no header, one importance value per column dimension) // The imatrix is the sum-of-squares of activations per dimension. static bool load_imatrix(const char * path, std::vector & data, int64_t expected_dims) { FILE * f = fopen(path, "rb"); if (!f) { return false; } // Get file size to determine dimensions fseek(f, 0, SEEK_END); long file_size = ftell(f); fseek(f, 0, SEEK_SET); int64_t dims = file_size / sizeof(float); if (expected_dims > 0 && dims != expected_dims) { printf(" WARN: imatrix dims %lld != expected %lld\n", (long long) dims, (long long) expected_dims); fclose(f); return false; } data.resize(dims); size_t nread = fread(data.data(), sizeof(float), dims, f); fclose(f); if ((int64_t) nread != dims) { return false; } // Compute stats float imin = data[0], imax = data[0], isum = 0; for (int64_t i = 0; i < dims; i++) { if (data[i] < imin) imin = data[i]; if (data[i] > imax) imax = data[i]; isum += data[i]; } printf(" Loaded imatrix: %lld dims, min=%.6f, max=%.6f, mean=%.6f\n", (long long) dims, imin, imax, isum / dims); return true; } // ============================================================================ // Test class // ============================================================================ class QuantLaboratory { public: QuantLaboratory() : gen(42) {} // ======================================================================== // MULTI-APPROACH COMPARISON FRAMEWORK // // Each "approach" is a function that takes float weights and produces // dequantized float output. The framework computes: // - Weight RMSE (dequant vs original) // - Matmul error (dequant weights x real activations vs f64 reference) // - Ratio vs first approach (typically Q2_K baseline) // // To add a new approach: // 1. Write: void approach_xxx(const float *W, float *out, // int64_t nrow, int64_t ncol, // const float *imatrix) { ... } // 2. Add it to the `approaches` array in compare_approaches() // ======================================================================== // -- Example approach: Q2_K baseline (via ggml library) -- // Uncomment and adapt for your experiment: // // void approach_q2k(const float * W, float * out, int64_t nrow, int64_t ncol, const float * imatrix) { // size_t rs = ggml_row_size(GGML_TYPE_Q2_K, ncol); // std::vector buf(nrow * rs); // quantize_q2_K(W, buf.data(), nrow, ncol, imatrix); // auto * tr = ggml_get_type_traits(GGML_TYPE_Q2_K); // for (int64_t r = 0; r < nrow; r++) { // tr->to_float(buf.data() + r * rs, out + r * ncol, ncol, NULL); // } // } void compare_approaches(const float * W, int64_t w_nrow, int64_t w_ncol, const float * A, int64_t a_nrow, int64_t a_ncol, const char * name, const float * imatrix) { if (w_ncol != a_ncol) { return; } int64_t nr = std::min(w_nrow, (int64_t) 256); int64_t nc = w_ncol; // Reference matmul (double precision) std::vector ref(a_nrow * nr); for (int64_t t = 0; t < a_nrow; t++) { for (int64_t r = 0; r < nr; r++) { double s = 0; for (int64_t c = 0; c < nc; c++) { s += (double) A[t * a_ncol + c] * (double) W[r * nc + c]; } ref[t * nr + r] = s; } } double ref_mag2 = 0; for (auto v : ref) { ref_mag2 += v * v; } float ref_rms = (float) sqrt(ref_mag2 / (a_nrow * nr)); (void) ref_rms; struct Approach { const char * name; float bpw; std::function fn; }; // ── Register approaches here ── Approach approaches[] = { // { "Q2_K (baseline)", 2.625f, // [&](auto * W, auto * o, auto nr, auto nc, auto * im) { // approach_q2k(W, o, nr, nc, im); // } }, // Add more approaches... { "placeholder", 0.0f, nullptr }, // remove once real approaches added }; printf("\n %-28s %5s %10s %10s %7s\n", name, "BPW", "RMSE", "MatmulErr", "vs Q2K"); printf(" %-28s %5s %10s %10s %7s\n", "---", "---", "---", "---", "---"); float baseline_matmul_err = 0; for (auto & ap : approaches) { if (!ap.fn) { continue; } std::vector dec(nr * nc); ap.fn(W, dec.data(), nr, nc, imatrix); // Weight RMSE double werr2 = 0; for (int64_t i = 0; i < nr * nc; i++) { double d = W[i] - dec[i]; werr2 += d * d; } float wrmse = (float) sqrt(werr2 / (nr * nc)); // Matmul error double merr2 = 0; for (int64_t t = 0; t < a_nrow; t++) { for (int64_t r = 0; r < nr; r++) { double s = 0; for (int64_t c = 0; c < nc; c++) { s += (double) A[t * a_ncol + c] * (double) dec[r * nc + c]; } double d = s - ref[t * nr + r]; merr2 += d * d; } } float matmul_rmse = (float) sqrt(merr2 / (a_nrow * nr)); if (baseline_matmul_err == 0) { baseline_matmul_err = matmul_rmse; } float ratio = (baseline_matmul_err > 1e-10f) ? matmul_rmse / baseline_matmul_err : 0; printf(" %-28s %5.3f %10.6f %10.6f %6.3fx\n", ap.name, ap.bpw, wrmse, matmul_rmse, ratio); } } // Run comparison on all tensor pairs from data directory int test_approach_comparison(const char * data_dir) { printf("\n"); printf("=======================================================================\n"); printf(" MULTI-APPROACH COMPARISON (real weights x real activations)\n"); printf("=======================================================================\n"); struct TestPair { const char * wf; const char * af; const char * imf; const char * name; } pairs[] = { { "blk_0_ffn_gate_weight.f32bin", "act_blk0_ffn_input.f32bin", "imatrix_blk0_ffn_gate_up.f32bin", "ffn_gate" }, { "blk_0_ffn_up_weight.f32bin", "act_blk0_ffn_input.f32bin", "imatrix_blk0_ffn_gate_up.f32bin", "ffn_up" }, { "blk_0_ffn_down_weight.f32bin", "act_blk0_ffn_down_input.f32bin", "imatrix_blk0_ffn_down.f32bin", "ffn_down" }, { "blk_0_attn_q_weight.f32bin", "act_blk0_attn_input.f32bin", "imatrix_blk0_attn_qkv.f32bin", "attn_q" }, }; for (auto & p : pairs) { char wp[512], ap[512], imp[512]; snprintf(wp, sizeof(wp), "%s/%s", data_dir, p.wf); snprintf(ap, sizeof(ap), "%s/%s", data_dir, p.af); snprintf(imp, sizeof(imp), "%s/%s", data_dir, p.imf); std::vector wd, ad, im; int64_t wnr, wnc, anr, anc; if (!load_f32_tensor(wp, wd, wnr, wnc) || !load_f32_tensor(ap, ad, anr, anc)) { continue; } const float * im_ptr = nullptr; if (load_imatrix(imp, im, wnc)) { im_ptr = im.data(); } else { printf(" [%s] No imatrix found, using uniform weights\n", p.name); } compare_approaches(wd.data(), wnr, wnc, ad.data(), anr, anc, p.name, im_ptr); } printf("\n"); return 0; } private: std::mt19937 gen; }; // ============================================================================ // Main // ============================================================================ int main(int argc, char ** argv) { ggml_backend_load_all(); QuantLaboratory lab; int total_fail = 0; printf("Quantization Laboratory\n"); printf("=======================\n"); // Real data tests (from data/ directory) { const char * data_dir = "data"; if (argc > 1) { data_dir = argv[1]; } char probe[512]; snprintf(probe, sizeof(probe), "%s/blk_0_ffn_gate_weight.f32bin", data_dir); FILE * fp = fopen(probe, "rb"); if (fp) { fclose(fp); total_fail += lab.test_approach_comparison(data_dir); } else { printf("\n=== Real Data Tests SKIPPED ===\n"); printf(" No data found at %s\n", data_dir); printf( " Run: cd data && PYTHONPATH=../gguf-py python3 ../scripts/extract-tensor-data.py MODEL.gguf " "blk.0.ffn_gate blk.0.ffn_up blk.0.ffn_down blk.0.attn_q\n"); printf(" And: llama-capture-layer-data -m MODEL.gguf -l 0 -o data\n"); } } printf("\n\n=== Testing Complete: %d failures ===\n", total_fail); return total_fail > 0 ? 1 : 0; }