llama.cpp/tests/test-quantize-fns.cpp

1118 lines
45 KiB
C++

// Unit tests for quantization specific functions - quantize, dequantize and dot product
#include "ggml.h"
#include "ggml-cpu.h"
#include "ggml-quants.h"
#define GGML_COMMON_DECL_CPP
#define GGML_COMMON_IMPL_CPP
#include "ggml-common.h"
#undef NDEBUG
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <string>
#include <vector>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
constexpr float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001f;
constexpr float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f;
constexpr float MAX_QUANTIZATION_TOTAL_ERROR_TERNARY = 0.01f;
constexpr float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f;
constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f;
constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS = 0.0050f;
constexpr float MAX_QUANTIZATION_TOTAL_ERROR_FP4 = 0.0030f;
constexpr float MAX_QUANTIZATION_TOTAL_ERROR_MXFP4 = 0.0070f;
constexpr float MAX_QUANTIZATION_TOTAL_ERROR_MXFP6 = 0.0040f;
constexpr float MAX_QUANTIZATION_TOTAL_ERROR_MXFP8 = 0.0020f;
// MXFP Hadamard pipeline thresholds (mxfp_rmse, which computes sqrt(sum/n)).
// These represent actual RMSE through the full KV cache write/read path.
constexpr float MAX_MXFP_PIPELINE_ERROR_MXFP4 = 0.40f;
constexpr float MAX_MXFP_PIPELINE_ERROR_MXFP8 = 0.08f;
constexpr float MAX_MXFP_PIPELINE_ERROR_MXFP6 = 0.30f;
constexpr float MAX_DOT_PRODUCT_ERROR = 0.02f;
constexpr float MAX_DOT_PRODUCT_ERROR_LOWBIT = 0.04f;
constexpr float MAX_DOT_PRODUCT_ERROR_FP4 = 0.03f;
constexpr float MAX_DOT_PRODUCT_ERROR_MXFP = 0.04f;
constexpr float MAX_DOT_PRODUCT_ERROR_TERNARY = 0.15f;
static const char* RESULT_STR[] = {"ok", "FAILED"};
// Generate synthetic data
static void generate_data(float offset, size_t n, float * dst) {
for (size_t i = 0; i < n; i++) {
dst[i] = 0.1 + 2*cosf(i + offset);
}
}
// Calculate RMSE between two float arrays
static float array_rmse(const float * a1, const float * a2, size_t n) {
double sum = 0;
for (size_t i = 0; i < n; i++) {
double diff = a1[i] - a2[i];
sum += diff * diff;
}
return sqrtf(sum) / n;
}
// MXFP RMSE: sqrt(sum/n), used with MAX_MXFP_PIPELINE_ERROR_* thresholds
static float mxfp_rmse(const float * a1, const float * a2, size_t n) {
double sum = 0;
for (size_t i = 0; i < n; i++) {
double diff = a1[i] - a2[i];
sum += diff * diff;
}
return sqrtf((float)(sum / n));
}
// Total quantization error on test data
static float total_quantization_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data) {
std::vector<uint8_t> tmp_q(2*test_size);
std::vector<float> tmp_out(test_size);
qfns_cpu->from_float(test_data, tmp_q.data(), test_size);
qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
return array_rmse(test_data, tmp_out.data(), test_size);
}
// Total quantization error on test data
static float reference_quantization_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data) {
std::vector<uint8_t> tmp_q(2*test_size);
std::vector<float> tmp_out(test_size);
std::vector<float> tmp_out_ref(test_size);
// FIXME: why is done twice?
qfns_cpu->from_float(test_data, tmp_q.data(), test_size);
qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
qfns->from_float_ref(test_data, tmp_q.data(), test_size);
qfns->to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
}
static float dot_product(const float * a1, const float * a2, size_t test_size) {
double sum = 0;
for (size_t i = 0; i < test_size; i++) {
sum += a1[i] * a2[i];
}
return sum;
}
// Total dot product error
static float dot_product_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data1, const float * test_data2) {
GGML_UNUSED(qfns);
std::vector<uint8_t> tmp_q1(2*test_size);
std::vector<uint8_t> tmp_q2(2*test_size);
const auto * vdot = ggml_get_type_traits_cpu(qfns_cpu->vec_dot_type);
qfns_cpu->from_float(test_data1, tmp_q1.data(), test_size);
vdot->from_float(test_data2, tmp_q2.data(), test_size);
float result = INFINITY;
qfns_cpu->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
const float dot_ref = dot_product(test_data1, test_data2, test_size);
return fabsf(result - dot_ref) / test_size;
}
int main(int argc, char * argv[]) {
bool verbose = false;
const size_t test_size = 32 * 128;
std::string arg;
for (int i = 1; i < argc; i++) {
arg = argv[i];
if (arg == "-v") {
verbose = true;
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
return 1;
}
}
std::vector<float> test_data(test_size);
std::vector<float> test_data2(test_size);
generate_data(0.0, test_data.size(), test_data.data());
generate_data(1.0, test_data2.size(), test_data2.data());
ggml_cpu_init();
int num_failed = 0;
bool failed = false;
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
ggml_type type = (ggml_type) i;
const auto * qfns = ggml_get_type_traits(type);
const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
// deprecated - skip
if (qfns->blck_size == 0) {
continue;
}
const ggml_type ei = (ggml_type)i;
printf("Testing %s\n", ggml_type_name((ggml_type) i));
ggml_quantize_init(ei);
if (qfns_cpu->from_float && qfns->to_float) {
const float total_error = total_quantization_error(qfns, qfns_cpu, test_size, test_data.data());
const float max_quantization_error =
type == GGML_TYPE_TQ1_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :
type == GGML_TYPE_TQ2_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :
type == GGML_TYPE_Q2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
type == GGML_TYPE_IQ2_S ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
type == GGML_TYPE_Q3_K ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS :
type == GGML_TYPE_IQ3_S ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS :
type == GGML_TYPE_IQ3_XXS ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS :
type == GGML_TYPE_NVFP4 ? MAX_QUANTIZATION_TOTAL_ERROR_FP4 :
type == GGML_TYPE_MXFP4 ? MAX_QUANTIZATION_TOTAL_ERROR_MXFP4 :
type == GGML_TYPE_MXFP6 ? MAX_QUANTIZATION_TOTAL_ERROR_MXFP6 :
type == GGML_TYPE_MXFP8 ? MAX_QUANTIZATION_TOTAL_ERROR_MXFP8 : MAX_QUANTIZATION_TOTAL_ERROR;
failed = !(total_error < max_quantization_error);
num_failed += failed;
if (failed || verbose) {
printf("%5s absolute quantization error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], total_error);
}
const float reference_error = reference_quantization_error(qfns, qfns_cpu, test_size, test_data.data());
failed = !(reference_error < MAX_QUANTIZATION_REFERENCE_ERROR);
num_failed += failed;
if (failed || verbose) {
printf("%5s reference implementation error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], reference_error);
}
const float vec_dot_error = dot_product_error(qfns, qfns_cpu, test_size, test_data.data(), test_data2.data());
const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS ||
type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S || type == GGML_TYPE_IQ2_S
? MAX_DOT_PRODUCT_ERROR_LOWBIT
: type == GGML_TYPE_TQ1_0 || type == GGML_TYPE_TQ2_0
? MAX_DOT_PRODUCT_ERROR_TERNARY
: type == GGML_TYPE_NVFP4
? MAX_DOT_PRODUCT_ERROR_FP4
: type == GGML_TYPE_MXFP4 || type == GGML_TYPE_MXFP6 || type == GGML_TYPE_MXFP8
? MAX_DOT_PRODUCT_ERROR_MXFP
: MAX_DOT_PRODUCT_ERROR;
failed = !(vec_dot_error < max_allowed_error);
num_failed += failed;
if (failed || verbose) {
printf("%5s dot product error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], vec_dot_error);
}
}
}
// MXFP SoA roundtrip via traits
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
ggml_type type = (ggml_type) i;
const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
if (!qfns_cpu->from_float_soa || !qfns_cpu->to_float_soa) {
continue;
}
const size_t buf_size = ggml_row_size(type, test_size);
std::vector<uint8_t> tmp_q(buf_size);
std::vector<float> tmp_out(test_size);
qfns_cpu->from_float_soa(test_data.data(), tmp_q.data(), test_size);
qfns_cpu->to_float_soa(tmp_q.data(), tmp_out.data(), test_size);
const float soa_error = array_rmse(test_data.data(), tmp_out.data(), test_size);
const float max_soa_error =
type == GGML_TYPE_MXFP4 ? MAX_QUANTIZATION_TOTAL_ERROR_MXFP4 :
type == GGML_TYPE_MXFP6 ? MAX_QUANTIZATION_TOTAL_ERROR_MXFP6 :
type == GGML_TYPE_MXFP8 ? MAX_QUANTIZATION_TOTAL_ERROR_MXFP8 : MAX_QUANTIZATION_TOTAL_ERROR;
failed = !(soa_error < max_soa_error);
num_failed += failed;
if (failed || verbose) {
printf("%5s SoA quantization error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], soa_error);
}
}
// MXFP traits: SoA required, MXFP6/MXFP8 are KV-cache-only (no AoS dequant)
{
const ggml_type all_mxfp_types[] = { GGML_TYPE_MXFP4, GGML_TYPE_MXFP8, GGML_TYPE_MXFP6 };
for (ggml_type type : all_mxfp_types) {
const auto * cpu = ggml_get_type_traits_cpu(type);
failed = !(cpu->from_float_soa && cpu->to_float_soa);
num_failed += failed;
if (failed || verbose) {
printf("%5s SoA traits present: %s\n", ggml_type_name(type), RESULT_STR[failed]);
}
}
// KV-cache-only types: no AoS dequant
const ggml_type kv_only_types[] = { GGML_TYPE_MXFP8, GGML_TYPE_MXFP6 };
for (ggml_type type : kv_only_types) {
const auto * cpu = ggml_get_type_traits_cpu(type);
failed = (cpu->to_float != nullptr);
num_failed += failed;
if (failed || verbose) {
printf("%5s AoS CPU to_float absent: %s\n", ggml_type_name(type), RESULT_STR[failed]);
}
}
}
// Hadamard self-inverse: H(H(x)) == x
{
float original[32], transformed[32];
for (int i = 0; i < 32; i++) {
original[i] = 0.1f + 2.0f * cosf(i + 0.5f);
transformed[i] = original[i];
}
ggml_hadamard_32_inplace(transformed);
ggml_hadamard_32_inplace(transformed); // apply twice = identity
float max_err = 0.0f;
for (int i = 0; i < 32; i++) {
float err = fabsf(transformed[i] - original[i]);
if (err > max_err) max_err = err;
}
// floating-point rounding tolerance
failed = !(max_err < 1e-5f);
num_failed += failed;
if (failed || verbose) {
printf("hadamard H(H(x))==x roundtrip: %s (max_err=%.2e)\n", RESULT_STR[failed], max_err);
}
}
// SoA SIMD vs scalar dequant
{
struct soa_cross_check {
ggml_type type;
void (*ref_dequant)(const void *, float *, int64_t);
};
const soa_cross_check checks[] = {
{ GGML_TYPE_MXFP4, dequantize_row_mxfp4_soa },
{ GGML_TYPE_MXFP8, dequantize_row_mxfp8_soa },
{ GGML_TYPE_MXFP6, dequantize_row_mxfp6_soa },
};
for (const auto & c : checks) {
const auto * cpu = ggml_get_type_traits_cpu(c.type);
if (!cpu->from_float_soa || !cpu->to_float_soa) continue;
const size_t buf_size = ggml_row_size(c.type, test_size);
std::vector<uint8_t> tmp_q(buf_size);
std::vector<float> out_ref(test_size);
std::vector<float> out_simd(test_size);
// Quantize with SoA
cpu->from_float_soa(test_data.data(), tmp_q.data(), test_size);
// Dequant with scalar reference
c.ref_dequant(tmp_q.data(), out_ref.data(), test_size);
// Dequant with CPU/SIMD path
cpu->to_float_soa(tmp_q.data(), out_simd.data(), test_size);
// Compare bitwise
int mismatches = 0;
for (size_t j = 0; j < test_size; j++) {
uint32_t a, b;
memcpy(&a, &out_ref[j], 4);
memcpy(&b, &out_simd[j], 4);
if (a != b) mismatches++;
}
failed = (mismatches > 0);
num_failed += failed;
if (failed || verbose) {
printf("%5s SoA SIMD vs scalar ref: %s (%zu/%zu match)\n",
ggml_type_name(c.type), RESULT_STR[failed],
test_size - mismatches, test_size);
}
}
}
// element converters vs canonical LUT values
{
struct lut_test {
const char * name;
const float * lut;
int count;
float (*converter)(uint8_t);
};
const lut_test lut_tests[] = {
{ "fp8_e4m3", kvalues_mxfp8_e4m3, 256, fp8_e4m3_to_float },
{ "fp8_e5m2", kvalues_mxfp8_e5m2, 256, fp8_e5m2_to_float },
{ "fp6_e2m3", kvalues_mxfp6_e2m3, 64, fp6_e2m3_to_float },
{ "fp6_e3m2", kvalues_mxfp6_e3m2, 64, fp6_e3m2_to_float },
};
for (const auto & t : lut_tests) {
int mismatches = 0;
for (int i = 0; i < t.count; i++) {
const float converter_val = t.converter((uint8_t)i);
const float lut_val = t.lut[i];
// both NaN = match
if (isnan(converter_val) && isnan(lut_val)) continue;
if (converter_val != lut_val) {
if (mismatches == 0 || verbose) {
printf(" %s LUT mismatch at [%d]: converter=%.8g, lut=%.8g\n",
t.name, i, converter_val, lut_val);
}
mismatches++;
}
}
failed = (mismatches > 0);
num_failed += failed;
if (failed || verbose) {
printf("%5s converter vs LUT: %s (%d/%d values match)\n",
t.name, RESULT_STR[failed], t.count - mismatches, t.count);
}
}
// FP4 E2M1
{
int mismatches = 0;
for (int i = 0; i < 16; i++) {
const float converter_val = ggml_mxfp_fp4_e2m1_to_float((uint8_t)i);
const float lut_val = kvalues_mxfp4_float[i];
if (converter_val != lut_val) {
if (mismatches == 0 || verbose) {
printf(" fp4_e2m1 LUT mismatch at [%d]: converter=%.8g, lut=%.8g\n",
i, converter_val, lut_val);
}
mismatches++;
}
}
failed = (mismatches > 0);
num_failed += failed;
if (failed || verbose) {
printf("fp4_e2m1 converter vs LUT: %s (%d/16 values match)\n",
RESULT_STR[failed], 16 - mismatches);
}
}
}
// element converter edge cases (expected values validated against LUTs)
{
struct conv_check {
const char * name;
float input;
uint8_t expected_bits;
bool is_saturation; // true = input overflows, expected_bits is max finite
const float * lut; // canonical LUT to validate expected_bits against (NULL for FP4)
float (*to_float)(uint8_t);
uint8_t (*to_quant)(float);
};
const conv_check checks[] = {
// FP4 E2M1 -[S(1)|E(2)|M(1)], bias=0
{ "fp4 zero", 0.0f, 0x00, false, nullptr, nullptr, nullptr },
{ "fp4 sub 0.5", 0.5f, 0x01, false, nullptr, nullptr, nullptr },
{ "fp4 norm 1.0", 1.0f, 0x02, false, nullptr, nullptr, nullptr },
{ "fp4 max 6.0", 6.0f, 0x07, false, nullptr, nullptr, nullptr },
{ "fp4 neg -3.0", -3.0f, 0x0D, false, nullptr, nullptr, nullptr },
{ "fp4 sat 100", 100.0f, 0x07, true, nullptr, nullptr, nullptr },
// FP8 E4M3 -[S(1)|E(4)|M(3)], bias=7
{ "e4m3 zero", 0.0f, 0x00, false, kvalues_mxfp8_e4m3, fp8_e4m3_to_float, float_to_fp8_e4m3_rn },
{ "e4m3 sub", 1.f/512, 0x01, false, kvalues_mxfp8_e4m3, fp8_e4m3_to_float, float_to_fp8_e4m3_rn },
{ "e4m3 max 448", 448.0f, 0x7E, false, kvalues_mxfp8_e4m3, fp8_e4m3_to_float, float_to_fp8_e4m3_rn },
{ "e4m3 sat 500", 500.0f, 0x7E, true, kvalues_mxfp8_e4m3, fp8_e4m3_to_float, float_to_fp8_e4m3_rn },
{ "e4m3 neg -1", -1.0f, 0xB8, false, kvalues_mxfp8_e4m3, fp8_e4m3_to_float, float_to_fp8_e4m3_rn },
// FP6 E2M3 -[S(1)|E(2)|M(3)], no NaN/Inf
{ "e2m3 zero", 0.0f, 0x00, false, kvalues_mxfp6_e2m3, fp6_e2m3_to_float, float_to_fp6_e2m3_rn },
{ "e2m3 sub", 0.125f, 0x01, false, kvalues_mxfp6_e2m3, fp6_e2m3_to_float, float_to_fp6_e2m3_rn },
{ "e2m3 max 7.5", 7.5f, 0x1F, false, kvalues_mxfp6_e2m3, fp6_e2m3_to_float, float_to_fp6_e2m3_rn },
{ "e2m3 sat 100", 100.0f, 0x1F, true, kvalues_mxfp6_e2m3, fp6_e2m3_to_float, float_to_fp6_e2m3_rn },
// FP6 E3M2 -[S(1)|E(3)|M(2)], no NaN/Inf, exp=7 is NORMAL
{ "e3m2 zero", 0.0f, 0x00, false, kvalues_mxfp6_e3m2, fp6_e3m2_to_float, float_to_fp6_e3m2_rn },
{ "e3m2 sub", 0.0625f, 0x01, false, kvalues_mxfp6_e3m2, fp6_e3m2_to_float, float_to_fp6_e3m2_rn },
{ "e3m2 max 28.0", 28.0f, 0x1F, false, kvalues_mxfp6_e3m2, fp6_e3m2_to_float, float_to_fp6_e3m2_rn },
{ "e3m2 exp7 16", 16.0f, 0x1C, false, kvalues_mxfp6_e3m2, fp6_e3m2_to_float, float_to_fp6_e3m2_rn },
// FP8 E5M2 -[S(1)|E(5)|M(2)], bias=15
{ "e5m2 zero", 0.0f, 0x00, false, kvalues_mxfp8_e5m2, fp8_e5m2_to_float, float_to_fp8_e5m2_rn },
{ "e5m2 max", 57344.f, 0x7B, false, kvalues_mxfp8_e5m2, fp8_e5m2_to_float, float_to_fp8_e5m2_rn },
};
int conv_bad = 0;
// validate expected_bits against LUTs
for (const auto & c : checks) {
if (c.lut && !c.is_saturation) {
float lut_val = c.lut[c.expected_bits];
if (c.input != lut_val && !(c.input == 0.0f && lut_val == 0.0f)) {
printf(" TEST BUG %s: expected_bits=0x%02X → LUT=%.8g, but input=%.8g\n",
c.name, c.expected_bits, lut_val, c.input);
conv_bad++;
}
} else if (!c.lut && !c.is_saturation) {
float lut_val = kvalues_mxfp4_float[c.expected_bits];
if (c.input != lut_val && !(c.input == 0.0f && lut_val == 0.0f)) {
printf(" TEST BUG %s: expected_bits=0x%02X → LUT=%.8g, but input=%.8g\n",
c.name, c.expected_bits, lut_val, c.input);
conv_bad++;
}
}
}
// Now test the quantize direction
for (const auto & c : checks) {
uint8_t got;
if (c.to_quant) {
got = c.to_quant(c.input);
} else {
got = ggml_mxfp_float_to_fp4_e2m1(c.input);
}
if (got != c.expected_bits) {
if (conv_bad == 0 || verbose) {
printf(" %s: quantize(%.6g) = 0x%02X, expected 0x%02X\n",
c.name, c.input, got, c.expected_bits);
}
conv_bad++;
}
}
// FP8 E4M3: 0x7F must dequantize to NaN
{
float nan_val = fp8_e4m3_to_float(0x7F);
if (!isnan(nan_val)) {
if (conv_bad == 0 || verbose) {
printf(" e4m3 0x7F dequant: expected NaN, got %.6g\n", nan_val);
}
conv_bad++;
}
}
// FP6 E3M2: exp=7 must dequant to valid float (NOT Inf/NaN)
{
float exp7_val = fp6_e3m2_to_float(0x1F); // max: exp=7, mant=3 → 28.0
if (isnan(exp7_val) || exp7_val != 28.0f) {
if (conv_bad == 0 || verbose) {
printf(" e3m2 0x1F dequant: expected 28.0, got %.6g\n", exp7_val);
}
conv_bad++;
}
}
failed = (conv_bad > 0);
num_failed += failed;
if (failed || verbose) {
printf(" element converter edge cases: %s (%d/%d passed)\n",
RESULT_STR[failed],
(int)(sizeof(checks)/sizeof(checks[0])) + 2 - conv_bad,
(int)(sizeof(checks)/sizeof(checks[0])) + 2);
}
}
// FP6 pack/unpack round-trip
{
int pack_bad = 0;
// Test all 64 possible 6-bit values in each of the 4 positions
for (int pos = 0; pos < 4; pos++) {
for (int val = 0; val < 64; val++) {
uint8_t in[4] = {0, 0, 0, 0};
in[pos] = (uint8_t)val;
uint8_t packed[3], out[4];
pack_fp6x4(in, packed);
unpack_fp6x4(packed, out);
if (out[pos] != (uint8_t)val) {
if (pack_bad == 0 || verbose) {
printf(" fp6 pack roundtrip: pos=%d val=0x%02X → got 0x%02X\n",
pos, val, out[pos]);
}
pack_bad++;
}
// no crosstalk
for (int k = 0; k < 4; k++) {
if (k != pos && out[k] != 0) {
if (pack_bad == 0 || verbose) {
printf(" fp6 pack crosstalk: pos=%d val=0x%02X leaked to pos=%d (0x%02X)\n",
pos, val, k, out[k]);
}
pack_bad++;
}
}
}
}
// known-answer: [0x3F, 0x00, 0x3F, 0x00] -> {0x3F, 0xF0, 0x03}
{
uint8_t in[4] = {0x3F, 0x00, 0x3F, 0x00};
uint8_t packed[3];
pack_fp6x4(in, packed);
uint8_t expected[3] = {0x3F, 0xF0, 0x03};
if (packed[0] != expected[0] || packed[1] != expected[1] || packed[2] != expected[2]) {
if (pack_bad == 0 || verbose) {
printf(" fp6 known-answer: packed [%02X,%02X,%02X] expected [%02X,%02X,%02X]\n",
packed[0], packed[1], packed[2], expected[0], expected[1], expected[2]);
}
pack_bad++;
}
}
failed = (pack_bad > 0);
num_failed += failed;
if (failed || verbose) {
printf(" fp6 pack/unpack round-trip: %s\n", RESULT_STR[failed]);
}
}
// E8M0 known-answer decode + HALF vs FULL (MXFP4 uses HALF, MXFP6/8 use FULL)
{
int e8m0_bad = 0;
// Known-answer E8M0 decodes
struct { uint8_t e; float expected; } e8m0_known[] = {
{ 127, 1.0f }, // 2^(127-127) = 2^0 = 1.0
{ 128, 2.0f }, // 2^(128-127) = 2^1 = 2.0
{ 126, 0.5f }, // 2^(126-127) = 2^(-1) = 0.5
{ 254, 1.70141183e+38f }, // 2^127 (max representable)
{ 1, 1.17549435e-38f }, // 2^(-126) (min normal)
};
for (const auto & t : e8m0_known) {
float got = ggml_mxfp_e8m0_to_fp32(t.e);
if (got != t.expected) {
if (e8m0_bad == 0 || verbose) {
printf(" E8M0 decode e=%d: got %.8g, expected %.8g\n", t.e, got, t.expected);
}
e8m0_bad++;
}
}
// HALF must be exactly half of FULL for all valid exponents
for (int e = 2; e < 255; e++) {
float full = ggml_mxfp_e8m0_to_fp32((uint8_t)e);
float half = ggml_mxfp_e8m0_to_fp32_half((uint8_t)e);
if (half != full * 0.5f) {
if (e8m0_bad == 0 || verbose) {
printf(" E8M0 HALF!=FULL/2 at e=%d: half=%.8g, full/2=%.8g\n", e, half, full * 0.5f);
}
e8m0_bad++;
break; // one failure is enough to flag the pattern
}
}
failed = (e8m0_bad > 0);
num_failed += failed;
if (failed || verbose) {
printf(" E8M0 known-answer + HALF/FULL: %s\n", RESULT_STR[failed]);
}
}
// E8M0 rounding at sqrt(2) threshold
{
int round_bad = 0;
// amax=1.0: floor_log2=0, mantissa=0 → no round → e_base = 0 - 0 + 127 = 127
{
int e = ggml_mxfp_e8m0_base_estimate(1.0f, 0);
if (e != 127) {
printf(" E8M0 round: amax=1.0 → e=%d, expected 127\n", e);
round_bad++;
}
}
// amax=2.0: floor_log2=1, mantissa=0 → no round → e_base = 1 + 127 = 128
{
int e = ggml_mxfp_e8m0_base_estimate(2.0f, 0);
if (e != 128) {
printf(" E8M0 round: amax=2.0 → e=%d, expected 128\n", e);
round_bad++;
}
}
// amax just below sqrt(2): mantissa < 0x3504F3 → floor only → e=127
{
// 1.41421 has IEEE mantissa just below 0x3504F3
float below = 1.4142f;
int e = ggml_mxfp_e8m0_base_estimate(below, 0);
if (e != 127) {
printf(" E8M0 round: amax=%.6f → e=%d, expected 127 (no round)\n", below, e);
round_bad++;
}
}
// amax at sqrt(2): mantissa >= 0x3504F3 → rounds up → e=128
{
float at_sqrt2 = 1.41422f;
int e = ggml_mxfp_e8m0_base_estimate(at_sqrt2, 0);
if (e != 128) {
printf(" E8M0 round: amax=%.6f → e=%d, expected 128 (rounds up)\n", at_sqrt2, e);
round_bad++;
}
}
// Verify emax_offset shifts the result
{
int e_no_off = ggml_mxfp_e8m0_base_estimate(448.0f, 0);
int e_e4m3 = ggml_mxfp_e8m0_base_estimate(448.0f, MXFP8_E4M3_EMAX_OFFSET);
if (e_no_off - e_e4m3 != MXFP8_E4M3_EMAX_OFFSET) {
printf(" E8M0 emax_offset: diff=%d, expected %d\n",
e_no_off - e_e4m3, MXFP8_E4M3_EMAX_OFFSET);
round_bad++;
}
}
failed = (round_bad > 0);
num_failed += failed;
if (failed || verbose) {
printf(" E8M0 rounding boundary: %s\n", RESULT_STR[failed]);
}
}
// Element converter exhaustive round-trip: quantize(dequantize(i)) == i for all valid bit patterns.
// Catches asymmetries between the to_float and to_quant paths.
{
struct rt_test {
const char * name;
int count;
float (*to_float)(uint8_t);
uint8_t (*to_quant)(float);
uint8_t nan_bits; // bit pattern for NaN (0 = no NaN in format)
};
const rt_test rt_tests[] = {
{ "fp8_e4m3", 256, fp8_e4m3_to_float, float_to_fp8_e4m3_rn, 0x7F },
{ "fp8_e5m2", 256, fp8_e5m2_to_float, float_to_fp8_e5m2_rn, 0 },
{ "fp6_e2m3", 64, fp6_e2m3_to_float, float_to_fp6_e2m3_rn, 0 },
{ "fp6_e3m2", 64, fp6_e3m2_to_float, float_to_fp6_e3m2_rn, 0 },
};
for (const auto & t : rt_tests) {
int rt_bad = 0;
for (int i = 0; i < t.count; i++) {
if ((uint8_t)i == t.nan_bits) continue; // skip NaN -quantize(NaN) is implementation-defined
float f = t.to_float((uint8_t)i);
if (isnan(f) || isinf(f)) continue; // E5M2 Inf/NaN
uint8_t back = t.to_quant(f);
// Negative zero may round-trip to positive zero -both are valid
if (back != (uint8_t)i && !(f == 0.0f && t.to_float(back) == 0.0f)) {
if (rt_bad == 0 || verbose) {
printf(" %s roundtrip: 0x%02X → %.6g → 0x%02X\n",
t.name, i, f, back);
}
rt_bad++;
}
}
failed = (rt_bad > 0);
num_failed += failed;
if (failed || verbose) {
printf("%5s converter round-trip: %s (%d/%d survived)\n",
t.name, RESULT_STR[failed], t.count - rt_bad, t.count);
}
}
// FP4 E2M1: uses static inline converters (not GGML_API wrappers), only 16 values
{
int rt_bad = 0;
for (int i = 0; i < 16; i++) {
float f = ggml_mxfp_fp4_e2m1_to_float((uint8_t)i);
uint8_t back = ggml_mxfp_float_to_fp4_e2m1(f);
if (back != (uint8_t)i && !(f == 0.0f && ggml_mxfp_fp4_e2m1_to_float(back) == 0.0f)) {
if (rt_bad == 0 || verbose) {
printf(" fp4_e2m1 roundtrip: 0x%02X → %.6g → 0x%02X\n", i, f, back);
}
rt_bad++;
}
}
failed = (rt_bad > 0);
num_failed += failed;
if (failed || verbose) {
printf("fp4_e2m1 converter round-trip: %s (%d/16 survived)\n",
RESULT_STR[failed], 16 - rt_bad);
}
}
}
// E8M0 scale computation: verify base exponent is reasonable for various amax values
{
const float test_amax[] = { 0.001f, 0.1f, 1.0f, 6.0f, 100.0f, 448.0f, 10000.0f };
int bad = 0;
for (float amax : test_amax) {
// ggml_mxfp_e8m0_base_estimate returns unclamped e_base
int e_base = ggml_mxfp_e8m0_base_estimate(amax, 0);
if (e_base < 1 || e_base > 254) {
if (bad == 0 || verbose) {
printf(" E8M0 bad e_base=%d for amax=%.4f\n", e_base, amax);
}
bad++;
continue;
}
float scale = ggml_mxfp_e8m0_to_fp32((uint8_t)e_base);
// Scale should be within 2x of amax (rough sanity check)
float ratio = amax / scale;
if (ratio < 0.25f || ratio > 4.0f) {
if (bad == 0 || verbose) {
printf(" E8M0 scale=%.6g for amax=%.4f, ratio=%.4f (expected ~1)\n",
scale, amax, ratio);
}
bad++;
}
}
failed = (bad > 0);
num_failed += failed;
if (failed || verbose) {
printf(" E8M0 scale sanity check: %s (%d/%d passed)\n",
RESULT_STR[failed], (int)(sizeof(test_amax)/sizeof(test_amax[0])) - bad,
(int)(sizeof(test_amax)/sizeof(test_amax[0])));
}
}
// SoA layout: verify offset macros produce correct byte positions
{
const struct { ggml_type type; int qs_per_block; } soa_types[] = {
{ GGML_TYPE_MXFP4, MXFP4_SOA_QS_PER_BLOCK },
{ GGML_TYPE_MXFP8, MXFP8_SOA_QS_PER_BLOCK },
{ GGML_TYPE_MXFP6, MXFP6_SOA_QS_PER_BLOCK },
};
for (const auto & st : soa_types) {
for (int nblocks : { 1, 4, 8, 32 }) {
size_t expected_e8m0_off = (size_t)nblocks * st.qs_per_block;
size_t actual_e8m0_off = MXFP_SOA_E8M0_OFFSET(nblocks, st.qs_per_block);
size_t total = actual_e8m0_off + nblocks; // e8m0 region = 1 byte per block
size_t row_size = ggml_row_size(st.type, nblocks * 32);
bool offset_ok = (actual_e8m0_off == expected_e8m0_off);
bool size_ok = (total == row_size);
if (!offset_ok || !size_ok) {
failed = true;
num_failed++;
if (verbose) {
printf(" %s SoA layout nblocks=%d: e8m0_off=%zu (expected %zu), total=%zu (row_size=%zu)\n",
ggml_type_name(st.type), nblocks, actual_e8m0_off, expected_e8m0_off, total, row_size);
}
}
}
}
if (verbose) {
printf(" SoA layout offset check: %s\n", RESULT_STR[0]); // only prints failures above
}
}
// block size consistency
{
failed = !(QK_MXFP4 == 32 && QK_MXFP8 == 32 && QK_MXFP6 == 32);
num_failed += failed;
if (failed || verbose) {
printf(" MXFP block size == 32: %s (QK4=%d, QK8=%d, QK6=%d)\n",
RESULT_STR[failed], QK_MXFP4, QK_MXFP8, QK_MXFP6);
}
}
// EMAX_OFFSET produces valid E8M0 for each format's max finite value
{
struct emax_check {
const char * name;
int emax_offset;
float max_finite; // from LUT / converter
};
const emax_check emax_checks[] = {
{ "fp4_e2m1", MXFP4_E2M1_EMAX_OFFSET, 6.0f },
{ "fp6_e2m3", MXFP6_E2M3_EMAX_OFFSET, 7.5f },
{ "fp6_e3m2", MXFP6_E3M2_EMAX_OFFSET, 28.0f },
{ "fp8_e4m3", MXFP8_E4M3_EMAX_OFFSET, 448.0f },
{ "fp8_e5m2", MXFP8_E5M2_EMAX_OFFSET, 57344.0f },
};
int emax_bad = 0;
for (const auto & e : emax_checks) {
// When amax == max_finite, the base estimate must produce a valid E8M0 (1..254)
int e_base = ggml_mxfp_e8m0_base_estimate(e.max_finite, e.emax_offset);
if (e_base < 1 || e_base > 254) {
if (emax_bad == 0 || verbose) {
printf(" %s emax_offset=%d: max_finite=%.1f gives e_base=%d (out of range)\n",
e.name, e.emax_offset, e.max_finite, e_base);
}
emax_bad++;
}
}
failed = (emax_bad > 0);
num_failed += failed;
if (failed || verbose) {
printf(" EMAX_OFFSET vs format max: %s\n", RESULT_STR[failed]);
}
}
// MXFP4 AoS vs SoA: two independent code paths, same result
{
const int nelems = 64; // 2 blocks
float input[64];
for (int i = 0; i < 64; i++) {
input[i] = 0.5f + 2.0f * sinf(i * 0.7f + 0.3f);
}
// Quantize and dequant via AoS (block_mxfp4 structs)
std::vector<block_mxfp4> aos_q(nelems / QK_MXFP4);
std::vector<float> aos_out(nelems);
quantize_row_mxfp4_ref(input, aos_q.data(), nelems);
dequantize_row_mxfp4(aos_q.data(), aos_out.data(), nelems);
// Quantize and dequant via SoA
const size_t soa_buf_size = ggml_row_size(GGML_TYPE_MXFP4, nelems);
std::vector<uint8_t> soa_q(soa_buf_size);
std::vector<float> soa_out(nelems);
quantize_row_mxfp4_soa(input, soa_q.data(), nelems);
dequantize_row_mxfp4_soa(soa_q.data(), soa_out.data(), nelems);
// Compare: both paths should produce identical results
int mismatches = 0;
for (int i = 0; i < nelems; i++) {
uint32_t a, b;
memcpy(&a, &aos_out[i], 4);
memcpy(&b, &soa_out[i], 4);
if (a != b) {
if (mismatches == 0 || verbose) {
printf(" mxfp4 AoS/SoA mismatch at [%d]: AoS=%.8g, SoA=%.8g\n",
i, aos_out[i], soa_out[i]);
}
mismatches++;
}
}
failed = (mismatches > 0);
num_failed += failed;
if (failed || verbose) {
printf("mxfp4 AoS vs SoA cross-check: %s (%d/%d match)\n",
RESULT_STR[failed], nelems - mismatches, nelems);
}
}
// Hadamard + quantize + dequant + Hadamard roundtrip (KV cache write/read path)
{
struct hadamard_pipeline_check {
const char * name;
ggml_type type;
float max_err;
};
const hadamard_pipeline_check pipeline_checks[] = {
{ "mxfp4", GGML_TYPE_MXFP4, MAX_MXFP_PIPELINE_ERROR_MXFP4 },
{ "mxfp8", GGML_TYPE_MXFP8, MAX_MXFP_PIPELINE_ERROR_MXFP8 },
{ "mxfp6", GGML_TYPE_MXFP6, MAX_MXFP_PIPELINE_ERROR_MXFP6 },
};
for (const auto & p : pipeline_checks) {
const auto * cpu = ggml_get_type_traits_cpu(p.type);
std::vector<float> original(test_size);
std::vector<float> rotated(test_size);
std::vector<float> recovered(test_size);
generate_data(2.0, test_size, original.data());
// Write path: Hadamard each block, then quantize
memcpy(rotated.data(), original.data(), test_size * sizeof(float));
for (size_t b = 0; b < test_size / 32; b++) {
ggml_hadamard_32_inplace(&rotated[b * 32]);
}
const size_t buf_size = ggml_row_size(p.type, test_size);
std::vector<uint8_t> qbuf(buf_size);
cpu->from_float_soa(rotated.data(), qbuf.data(), test_size);
// Read path: dequant, then Hadamard each block (self-inverse)
cpu->to_float_soa(qbuf.data(), recovered.data(), test_size);
for (size_t b = 0; b < test_size / 32; b++) {
ggml_hadamard_32_inplace(&recovered[b * 32]);
}
float err = mxfp_rmse(original.data(), recovered.data(), test_size);
failed = !(err < p.max_err);
num_failed += failed;
if (failed || verbose) {
printf("%5s Hadamard pipeline roundtrip: %s (err=%.6f, max=%.6f)\n",
p.name, RESULT_STR[failed], err, p.max_err);
}
}
}
// Hadamard known output: H([1,0,...,0]) = [1/sqrt(32), ...]
{
float unit[32] = {};
unit[0] = 1.0f;
ggml_hadamard_32_inplace(unit);
const float expected = MXFP_HADAMARD_32_NORM; // 1/sqrt(32)
float max_err = 0.0f;
for (int i = 0; i < 32; i++) {
float err = fabsf(unit[i] - expected);
if (err > max_err) max_err = err;
}
failed = !(max_err < 1e-7f);
num_failed += failed;
if (failed || verbose) {
printf("hadamard unit vector: %s (max_err=%.2e, expected %.8f)\n",
RESULT_STR[failed], max_err, expected);
}
}
// zero block produces E8M0=0
{
float zeros[32] = {};
const size_t buf_size = ggml_row_size(GGML_TYPE_MXFP8, 32);
std::vector<uint8_t> buf(buf_size, 0xFF); // fill with 0xFF to detect non-writes
quantize_row_mxfp8_soa(zeros, buf.data(), 32);
// E8M0 scale is at offset MXFP8_SOA_QS_PER_BLOCK (32) for 1 block
uint8_t e8m0 = buf[MXFP8_SOA_QS_PER_BLOCK];
failed = (e8m0 != 0);
num_failed += failed;
if (failed || verbose) {
printf(" zero block E8M0: %s (e8m0=%d, expected 0)\n",
RESULT_STR[failed], e8m0);
}
}
// SoA format spec: quantize, manually walk raw bytes, compare against reference dequant
{
// 2 blocks, asymmetric data
const int nblocks = 2;
const int nelems = nblocks * 32;
float input[64];
for (int i = 0; i < 64; i++) {
// Block 0: small values, Block 1: large values -different E8M0 scales
input[i] = (i < 32) ? 0.1f * sinf(i + 0.5f) : 3.0f * cosf(i + 0.5f);
}
// MXFP4
{
const size_t buf_size = ggml_row_size(GGML_TYPE_MXFP4, nelems);
std::vector<uint8_t> buf(buf_size);
std::vector<float> ref_out(nelems);
std::vector<float> manual_out(nelems);
quantize_row_mxfp4_soa(input, buf.data(), nelems);
dequantize_row_mxfp4_soa(buf.data(), ref_out.data(), nelems);
// manual dequant from raw bytes
const uint8_t * qs = buf.data();
const uint8_t * e8m0 = buf.data() + MXFP_SOA_E8M0_OFFSET(nblocks, MXFP4_SOA_QS_PER_BLOCK);
for (int b = 0; b < nblocks; b++) {
const float d = ggml_mxfp_e8m0_to_fp32_half(e8m0[b]);
const uint8_t * block_qs = qs + MXFP_SOA_QS_OFFSET(b, MXFP4_SOA_QS_PER_BLOCK);
for (int j = 0; j < 16; j++) {
// low nibble = first half, high nibble = second half
int8_t v_lo = kvalues_mxfp4[block_qs[j] & 0x0F];
int8_t v_hi = kvalues_mxfp4[block_qs[j] >> 4];
manual_out[b*32 + j] = v_lo * d;
manual_out[b*32 + j + 16] = v_hi * d;
}
}
int mismatches = 0;
for (int i = 0; i < nelems; i++) {
uint32_t a, b;
memcpy(&a, &ref_out[i], 4);
memcpy(&b, &manual_out[i], 4);
if (a != b) mismatches++;
}
failed = (mismatches > 0);
num_failed += failed;
if (failed || verbose) {
printf("mxfp4 SoA format spec: %s (%d/%d match)\n",
RESULT_STR[failed], nelems - mismatches, nelems);
}
}
// MXFP8
{
const size_t buf_size = ggml_row_size(GGML_TYPE_MXFP8, nelems);
std::vector<uint8_t> buf(buf_size);
std::vector<float> ref_out(nelems);
std::vector<float> manual_out(nelems);
quantize_row_mxfp8_soa(input, buf.data(), nelems);
dequantize_row_mxfp8_soa(buf.data(), ref_out.data(), nelems);
const uint8_t * qs = buf.data();
const uint8_t * e8m0 = buf.data() + MXFP_SOA_E8M0_OFFSET(nblocks, MXFP8_SOA_QS_PER_BLOCK);
for (int b = 0; b < nblocks; b++) {
const float d = ggml_mxfp_e8m0_to_fp32(e8m0[b]);
const uint8_t * block_qs = qs + MXFP_SOA_QS_OFFSET(b, MXFP8_SOA_QS_PER_BLOCK);
for (int j = 0; j < 32; j++) {
// one byte per element
manual_out[b*32 + j] = fp8_e4m3_to_float(block_qs[j]) * d;
}
}
int mismatches = 0;
for (int i = 0; i < nelems; i++) {
uint32_t a, b;
memcpy(&a, &ref_out[i], 4);
memcpy(&b, &manual_out[i], 4);
if (a != b) mismatches++;
}
failed = (mismatches > 0);
num_failed += failed;
if (failed || verbose) {
printf("mxfp8 SoA format spec: %s (%d/%d match)\n",
RESULT_STR[failed], nelems - mismatches, nelems);
}
}
// MXFP6
{
const size_t buf_size = ggml_row_size(GGML_TYPE_MXFP6, nelems);
std::vector<uint8_t> buf(buf_size);
std::vector<float> ref_out(nelems);
std::vector<float> manual_out(nelems);
quantize_row_mxfp6_soa(input, buf.data(), nelems);
dequantize_row_mxfp6_soa(buf.data(), ref_out.data(), nelems);
const uint8_t * qs = buf.data();
const uint8_t * e8m0 = buf.data() + MXFP_SOA_E8M0_OFFSET(nblocks, MXFP6_SOA_QS_PER_BLOCK);
for (int b = 0; b < nblocks; b++) {
const float d = ggml_mxfp_e8m0_to_fp32(e8m0[b]);
const uint8_t * block_qs = qs + MXFP_SOA_QS_OFFSET(b, MXFP6_SOA_QS_PER_BLOCK);
for (int j = 0; j < 32; j += 4) {
// 4 elements packed into 3 bytes
uint8_t vals[4];
unpack_fp6x4(&block_qs[j * 3 / 4], vals);
for (int k = 0; k < 4; k++) {
manual_out[b*32 + j + k] = fp6_e2m3_to_float(vals[k]) * d;
}
}
}
int mismatches = 0;
for (int i = 0; i < nelems; i++) {
uint32_t a, b;
memcpy(&a, &ref_out[i], 4);
memcpy(&b, &manual_out[i], 4);
if (a != b) mismatches++;
}
failed = (mismatches > 0);
num_failed += failed;
if (failed || verbose) {
printf("mxfp6 SoA format spec: %s (%d/%d match)\n",
RESULT_STR[failed], nelems - mismatches, nelems);
}
}
}
if (num_failed || verbose) {
printf("%d tests failed\n", num_failed);
}
return num_failed > 0;
}