llama.cpp/tests/test-nvfp4-repack.cpp

218 lines
7.8 KiB
C++

#include "ggml.h"
#include "ggml-cuda/repack_nvfp4.cuh"
#include <cstdio>
#include <cstring>
#include <vector>
// Writes the low 4 bits of `value` into 4-bit slot `idx` of the packed array `qs`.
// Even slots live in the low nibble of byte idx / 2, odd slots in the high nibble;
// the neighbouring nibble in that byte is preserved.
static void set_q4(uint8_t * qs, int idx, uint8_t value) {
    const int     shift = (idx & 1) ? 4 : 0;             // bit position of the target nibble
    const uint8_t keep  = (uint8_t) (0xf0 >> shift);     // mask selecting the nibble to preserve
    uint8_t *     slot  = qs + idx / 2;

    *slot = (uint8_t) ((*slot & keep) | ((value & 0x0f) << shift));
}
// Fills one block_nvfp4 with a deterministic pattern derived from (seed, row, block).
// The block is zeroed first; then every 4-bit entry of `qs` and every byte of `d`
// is assigned. The `d` bytes always land in the range [0x30, 0x5f].
static void fill_block(block_nvfp4 & blk, uint32_t seed, int row, int block) {
    memset(&blk, 0, sizeof(blk));

    // Portion of the qs pattern that does not depend on the element index.
    const uint32_t q_base = seed + (uint32_t) row * 19U + (uint32_t) block * 11U;
    for (int q = 0; q < QK_NVFP4; ++q) {
        const uint32_t pattern = q_base + (uint32_t) q * 5U + (uint32_t) (q / 3);
        set_q4(blk.qs, q, (uint8_t) (pattern & 0x0f));
    }

    // One `d` entry per QK_NVFP4_SUB elements.
    const int      n_d    = QK_NVFP4 / QK_NVFP4_SUB;
    const uint32_t d_base = seed + 0x31U + (uint32_t) row * 7U + (uint32_t) block * 13U;
    for (int s = 0; s < n_d; ++s) {
        const uint32_t pattern = d_base + (uint32_t) s * 9U;
        blk.d[s] = (uint8_t) (0x30 + pattern % 0x30);
    }
}
static void fill_rows(std::vector<uint8_t> & rows, int64_t ne0, int64_t nrows, uint32_t seed) {
const size_t row_size = ggml_row_size(GGML_TYPE_NVFP4, ne0);
const int blocks_per_row = (int) (ne0 / QK_NVFP4);
rows.assign(row_size * nrows, 0);
for (int64_t row = 0; row < nrows; ++row) {
block_nvfp4 * dst = (block_nvfp4 *) (rows.data() + row * row_size);
for (int block = 0; block < blocks_per_row; ++block) {
fill_block(dst[block], seed, (int) row, block);
}
}
}
static void fill_layout_row(std::vector<uint8_t> & rows) {
rows.assign(ggml_row_size(GGML_TYPE_NVFP4, QK_K), 0);
block_nvfp4 * dst = (block_nvfp4 *) rows.data();
for (int lane = 0; lane < 4; ++lane) {
for (int i = 0; i < 32; ++i) {
dst[lane].qs[i] = (uint8_t) ((lane * 0x31 + i * 0x17 + 0x12) & 0xff);
}
for (int i = 0; i < 4; ++i) {
dst[lane].d[i] = (uint8_t) (0x31 + lane * 0x0d + i * 0x09);
}
}
}
// Compares two byte buffers.
// Returns true when they have the same size and identical contents. Otherwise prints
// a diagnostic prefixed with `name` (either the size mismatch or the first differing
// byte) and returns false. Nothing is printed on success.
static bool expect_equal(const char * name, const std::vector<uint8_t> & expected, const std::vector<uint8_t> & actual) {
    const size_t n_expected = expected.size();
    const size_t n_actual   = actual.size();

    if (n_expected != n_actual) {
        std::printf("%s: size mismatch (%zu != %zu)\n", name, n_expected, n_actual);
        return false;
    }

    // Advance to the first differing byte, or to the end if there is none.
    size_t pos = 0;
    while (pos < n_expected && expected[pos] == actual[pos]) {
        ++pos;
    }

    if (pos == n_expected) {
        return true;
    }

    std::printf("%s: first mismatch at byte %zu (expected 0x%02x, got 0x%02x)\n",
                name, pos, expected[pos], actual[pos]);
    return false;
}
static bool check_roundtrip(const char * name, int64_t ne0, int64_t nrows, uint32_t seed) {
const size_t row_size = ggml_row_size(GGML_TYPE_NVFP4, ne0);
std::vector<uint8_t> input;
std::vector<uint8_t> packed(row_size * nrows);
std::vector<uint8_t> output(row_size * nrows);
fill_rows(input, ne0, nrows, seed);
ggml_cuda_repack_rows_nvfp4(ne0, nrows, input.data(), packed.data());
ggml_cuda_unpack_rows_nvfp4(ne0, nrows, packed.data(), output.data());
return expect_equal(name, input, output);
}
// Partial-update test: checks that a byte range of an already repacked buffer can be
// patched by unpacking only the rows the range touches, overwriting the bytes, and
// repacking those rows.
//
//   name   - label used in diagnostics
//   ne0    - elements per row
//   nrows  - number of rows in the buffer
//   offset - byte offset of the patch, measured in the original (unpacked) layout
//   size   - patch length in bytes
//   seed   - seed for the deterministic input and patch contents
//
// Returns true when unpacking the patched buffer yields the input with the patch
// applied. Returns false on a mismatch, or when [offset, offset + size) does not fit
// inside the buffer (previously that case overran the buffers in memcpy, and a zero
// row size divided by zero).
static bool check_partial_patch(const char * name, int64_t ne0, int64_t nrows, size_t offset, size_t size, uint32_t seed) {
    const size_t row_size   = ggml_row_size(GGML_TYPE_NVFP4, ne0);
    const size_t total_size = row_size * nrows;

    // Written so that offset + size cannot overflow before it is compared.
    if (row_size == 0 || offset > total_size || size > total_size - offset) {
        std::printf("%s: invalid patch range (offset %zu, size %zu, buffer %zu bytes)\n",
                    name, offset, size, total_size);
        return false;
    }

    std::vector<uint8_t> input;
    std::vector<uint8_t> expected;
    std::vector<uint8_t> packed(total_size);
    std::vector<uint8_t> output(total_size);
    std::vector<uint8_t> patch(size);

    fill_rows(input, ne0, nrows, seed);
    expected = input;

    for (size_t i = 0; i < patch.size(); ++i) {
        patch[i] = (uint8_t) (seed + 0x5bU + (uint32_t) i * 23U + (uint32_t) (i / 5));
    }

    // Reference result: the patch applied directly to the unpacked data.
    memcpy(expected.data() + offset, patch.data(), size);

    ggml_cuda_repack_rows_nvfp4(ne0, nrows, input.data(), packed.data());

    // Widen the patch range to whole rows, since repack/unpack operate on full rows.
    const size_t  aligned_offset = offset / row_size * row_size;
    const size_t  aligned_end    = (offset + size + row_size - 1) / row_size * row_size;
    const size_t  aligned_size   = aligned_end - aligned_offset;
    const int64_t aligned_rows   = (int64_t) (aligned_size / row_size);
    const size_t  inner_offset   = offset - aligned_offset;

    // Unpack the affected rows, overwrite the patched bytes, and repack them in place.
    std::vector<uint8_t> rows(aligned_size);
    ggml_cuda_unpack_rows_nvfp4(ne0, aligned_rows, packed.data() + aligned_offset, rows.data());
    memcpy(rows.data() + inner_offset, patch.data(), size);
    ggml_cuda_repack_rows_nvfp4(ne0, aligned_rows, rows.data(), packed.data() + aligned_offset);

    ggml_cuda_unpack_rows_nvfp4(ne0, nrows, packed.data(), output.data());
    return expect_equal(name, expected, output);
}
// Layout test: repacks one row of QK_K elements and checks where the bytes of the four
// source block_nvfp4 entries end up in the repacked row:
//   - 32-bit word `pack` of lane `lane` at byte offset lane * 32 + pack * 4 must equal
//     ggml_cuda_nvfp4_pack(src[lane].qs, pack);
//   - the 4 `d` bytes of lane `lane` must appear verbatim at byte offset 128 + lane * 4.
// On success a hex dump of the source blocks and the repacked row is printed and true
// is returned; on the first mismatch a diagnostic is printed and false is returned.
// `seed` is unused; the input pattern is fixed.
static bool check_4x64_to_256_layout(const char * name, uint32_t seed) {
    GGML_UNUSED(seed);

    const int n_lanes     = 4;    // source blocks per repacked row
    const int qs_bytes    = 32;   // qs bytes per source block
    const int n_packs     = 8;    // 32-bit words per lane in the repacked row
    const int n_scales    = 4;    // d bytes per source block
    const int scales_base = 128;  // byte offset of the first lane's d bytes

    std::vector<uint8_t> input;
    std::vector<uint8_t> packed(ggml_row_size(GGML_TYPE_NVFP4, QK_K));
    fill_layout_row(input);
    ggml_cuda_repack_rows_nvfp4(QK_K, 1, input.data(), packed.data());

    const block_nvfp4 * src = (const block_nvfp4 *) input.data();

    // Reads one 32-bit word of a lane from the repacked row (memcpy avoids alignment assumptions).
    const auto load_word = [&packed, qs_bytes](int lane, int pack) -> uint32_t {
        uint32_t word = 0;
        memcpy(&word, packed.data() + lane * qs_bytes + pack * sizeof(word), sizeof(word));
        return word;
    };

    // --- verification ---
    for (int lane = 0; lane < n_lanes; ++lane) {
        for (int pack = 0; pack < n_packs; ++pack) {
            const uint32_t got      = load_word(lane, pack);
            const uint32_t expected = ggml_cuda_nvfp4_pack(src[lane].qs, pack);
            if (got != expected) {
                std::printf("%s: lane %d pack %d mismatch (expected 0x%08x, got 0x%08x)\n", name, lane, pack, expected, got);
                return false;
            }
        }

        const uint8_t * packed_scales = packed.data() + scales_base + lane * n_scales;
        if (memcmp(packed_scales, src[lane].d, n_scales) != 0) {
            std::printf("%s: lane %d scales mismatch\n", name, lane);
            return false;
        }
    }

    // --- dump of the source blocks ---
    std::printf("%s:\n", name);
    std::printf(" 4 x block_nvfp4 (64 weights each) -> 1 x block_nvfp4_cuda (256 weights total)\n");

    for (int lane = 0; lane < n_lanes; ++lane) {
        std::printf(" block_nvfp4[%d] qs:", lane);
        for (int i = 0; i < qs_bytes; ++i) {
            std::printf(" %02x", src[lane].qs[i]);
        }
        std::printf("\n");
    }

    for (int lane = 0; lane < n_lanes; ++lane) {
        std::printf(" block_nvfp4[%d] scales:", lane);
        for (int i = 0; i < n_scales; ++i) {
            std::printf(" %02x", src[lane].d[i]);
        }
        std::printf("\n");
    }

    // --- dump of the repacked row ---
    std::printf(" block_nvfp4_cuda:\n");

    for (int lane = 0; lane < n_lanes; ++lane) {
        std::printf(" lane %d qs @0x%02x:", lane, lane * qs_bytes);
        for (int pack = 0; pack < n_packs; ++pack) {
            std::printf(" %08x", load_word(lane, pack));
        }
        std::printf("\n");
    }

    for (int lane = 0; lane < n_lanes; ++lane) {
        const int scales_offset = scales_base + lane * n_scales;
        std::printf(" lane %d scales @0x%02x:", lane, scales_offset);
        for (int i = 0; i < n_scales; ++i) {
            std::printf(" %02x", packed[scales_offset + i]);
        }
        std::printf("\n");
    }

    return true;
}
int main() {
int total = 0;
int passed = 0;
const struct { const char * name; int64_t ne0, nrows; uint32_t seed; } roundtrip_cases[] = {
{ "roundtrip-ne0-64", 64, 3, 0x1001U }, { "roundtrip-ne0-128", 128, 3, 0x1002U },
{ "roundtrip-ne0-192", 192, 3, 0x1003U }, { "roundtrip-ne0-256", 256, 2, 0x1004U },
{ "roundtrip-ne0-320", 320, 3, 0x1005U },
};
for (const auto & test : roundtrip_cases) {
total += 1;
passed += check_roundtrip(test.name, test.ne0, test.nrows, test.seed);
}
total += 1;
passed += check_partial_patch("partial-cross-row-128", 128, 3, ggml_row_size(GGML_TYPE_NVFP4, 128) - 11, 27, 0x2001U);
total += 1;
passed += check_partial_patch("partial-cross-row-320", 320, 2, ggml_row_size(GGML_TYPE_NVFP4, 320) - 19, 41, 0x2002U);
total += 1;
passed += check_4x64_to_256_layout("layout-4x64-to-256", 0x3001U);
std::printf("test-nvfp4-repack: %d/%d passed\n", passed, total);
return passed == total ? 0 : 1;
}