#include "ggml.h"
#include "ggml-cuda/repack_nvfp4.cuh"

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

// Host-side test for the NVFP4 row repack/unpack helpers:
//   - round trip: repack followed by unpack must reproduce the input bytes
//   - partial patch: unpack a row-aligned window, overwrite some bytes, repack,
//     and verify the whole buffer
//   - layout: verify where repack places the quant words and scales of a single
//     QK_K-wide row

// Store the low 4 bits of `value` into nibble `idx` of `qs`.
// Even indices go to the low nibble of byte idx/2, odd indices to the high nibble;
// the other nibble of that byte is preserved.
static void set_q4(uint8_t * qs, int idx, uint8_t value) {
    uint8_t & byte = qs[idx / 2];
    if (idx & 1) {
        byte = (uint8_t) ((byte & 0x0f) | ((value & 0x0f) << 4));
    } else {
        byte = (uint8_t) ((byte & 0xf0) | (value & 0x0f));
    }
}

// Fill one block_nvfp4 with a deterministic pattern derived from (seed, row, block).
// The block is zeroed first, then QK_NVFP4 nibbles and QK_NVFP4 / QK_NVFP4_SUB scale
// bytes are written. Scale bytes are kept in the range [0x30, 0x5f].
static void fill_block(block_nvfp4 & blk, uint32_t seed, int row, int block) {
    memset(&blk, 0, sizeof(blk));

    for (int i = 0; i < QK_NVFP4; ++i) {
        const uint32_t value =
            seed + (uint32_t) row * 19U + (uint32_t) block * 11U + (uint32_t) i * 5U + (uint32_t) (i / 3);
        set_q4(blk.qs, i, (uint8_t) (value & 0x0f));
    }

    for (int i = 0; i < QK_NVFP4 / QK_NVFP4_SUB; ++i) {
        const uint32_t value = seed + 0x31U + (uint32_t) row * 7U + (uint32_t) block * 13U + (uint32_t) i * 9U;
        blk.d[i] = (uint8_t) (0x30 + value % 0x30);
    }
}

// Resize `rows` to hold `nrows` rows of `ne0` NVFP4 weights each and fill every block
// with the fill_block() pattern.
static void fill_rows(std::vector<uint8_t> & rows, int64_t ne0, int64_t nrows, uint32_t seed) {
    const size_t row_size       = ggml_row_size(GGML_TYPE_NVFP4, ne0);
    const int    blocks_per_row = (int) (ne0 / QK_NVFP4);

    rows.assign(row_size * nrows, 0);

    for (int64_t row = 0; row < nrows; ++row) {
        block_nvfp4 * dst = (block_nvfp4 *) (rows.data() + row * row_size);
        for (int block = 0; block < blocks_per_row; ++block) {
            fill_block(dst[block], seed, (int) row, block);
        }
    }
}

// Build a single row of QK_K weights with a fixed, seed-independent byte pattern.
// NOTE(review): writes 4 blocks with 32 qs bytes and 4 scale bytes each; this assumes
// QK_K / QK_NVFP4 == 4, sizeof(qs) == 32 and 4 scales per block - confirm against the
// block_nvfp4 definition.
static void fill_layout_row(std::vector<uint8_t> & rows) {
    rows.assign(ggml_row_size(GGML_TYPE_NVFP4, QK_K), 0);

    block_nvfp4 * dst = (block_nvfp4 *) rows.data();
    for (int lane = 0; lane < 4; ++lane) {
        for (int i = 0; i < 32; ++i) {
            dst[lane].qs[i] = (uint8_t) ((lane * 0x31 + i * 0x17 + 0x12) & 0xff);
        }
        for (int i = 0; i < 4; ++i) {
            dst[lane].d[i] = (uint8_t) (0x31 + lane * 0x0d + i * 0x09);
        }
    }
}

// Compare two byte buffers. On a size mismatch or at the first differing byte, print a
// diagnostic prefixed with `name` and return false; return true when they are identical.
static bool expect_equal(const char * name, const std::vector<uint8_t> & expected, const std::vector<uint8_t> & actual) {
    if (expected.size() != actual.size()) {
        std::printf("%s: size mismatch (%zu != %zu)\n", name, expected.size(), actual.size());
        return false;
    }

    for (size_t i = 0; i < expected.size(); ++i) {
        if (expected[i] != actual[i]) {
            // %x expects an unsigned int, so widen the bytes explicitly
            std::printf("%s: first mismatch at byte %zu (expected 0x%02x, got 0x%02x)\n", name, i,
                        (unsigned) expected[i], (unsigned) actual[i]);
            return false;
        }
    }

    return true;
}

// Repack `nrows` rows of `ne0` weights, unpack them again, and check that the result is
// byte-identical to the input.
static bool check_roundtrip(const char * name, int64_t ne0, int64_t nrows, uint32_t seed) {
    const size_t row_size = ggml_row_size(GGML_TYPE_NVFP4, ne0);

    std::vector<uint8_t> input;
    std::vector<uint8_t> packed(row_size * nrows);
    std::vector<uint8_t> output(row_size * nrows);

    fill_rows(input, ne0, nrows, seed);

    ggml_cuda_repack_rows_nvfp4(ne0, nrows, input.data(), packed.data());
    ggml_cuda_unpack_rows_nvfp4(ne0, nrows, packed.data(), output.data());

    return expect_equal(name, input, output);
}

// Overwrite `size` bytes at byte `offset` of an already repacked buffer and verify the result.
//
// The patch is applied by unpacking the smallest row-aligned window that covers
// [offset, offset + size), overwriting the bytes in the unpacked window, and repacking that
// window in place. The full buffer is then unpacked and compared against the input with the
// same patch applied directly.
//
// Returns false (after printing a diagnostic) if the patch range does not fit inside the
// buffer or the row size is zero, instead of reading/writing out of bounds.
static bool check_partial_patch(const char * name, int64_t ne0, int64_t nrows, size_t offset, size_t size, uint32_t seed) {
    const size_t row_size   = ggml_row_size(GGML_TYPE_NVFP4, ne0);
    const size_t total_size = row_size * nrows;

    // written so that offset + size cannot overflow
    if (row_size == 0 || size > total_size || offset > total_size - size) {
        std::printf("%s: invalid patch range (offset %zu, size %zu, total %zu)\n", name, offset, size, total_size);
        return false;
    }

    std::vector<uint8_t> input;
    std::vector<uint8_t> expected;
    std::vector<uint8_t> packed(total_size);
    std::vector<uint8_t> output(total_size);
    std::vector<uint8_t> patch(size);

    fill_rows(input, ne0, nrows, seed);
    expected = input;

    for (size_t i = 0; i < patch.size(); ++i) {
        patch[i] = (uint8_t) (seed + 0x5bU + (uint32_t) i * 23U + (uint32_t) (i / 5));
    }
    memcpy(expected.data() + offset, patch.data(), size);

    ggml_cuda_repack_rows_nvfp4(ne0, nrows, input.data(), packed.data());

    // round the patched range outwards to whole rows
    const size_t  aligned_offset = offset / row_size * row_size;
    const size_t  aligned_end    = (offset + size + row_size - 1) / row_size * row_size;
    const size_t  aligned_size   = aligned_end - aligned_offset;
    const int64_t aligned_rows   = (int64_t) (aligned_size / row_size);
    const size_t  inner_offset   = offset - aligned_offset;

    std::vector<uint8_t> rows(aligned_size);
    ggml_cuda_unpack_rows_nvfp4(ne0, aligned_rows, packed.data() + aligned_offset, rows.data());
    memcpy(rows.data() + inner_offset, patch.data(), size);
    ggml_cuda_repack_rows_nvfp4(ne0, aligned_rows, rows.data(), packed.data() + aligned_offset);

    ggml_cuda_unpack_rows_nvfp4(ne0, nrows, packed.data(), output.data());

    return expect_equal(name, expected, output);
}

// Repack one QK_K-wide row and check where the data lands in the packed buffer:
//   - for lane L and word P, the 32-bit word at byte L * 32 + P * 4 must equal
//     ggml_cuda_nvfp4_pack(src[L].qs, P)
//   - the 4 scale bytes of lane L must be at byte 128 + L * 4
// On success the source blocks and the packed buffer are dumped as hex. `seed` is unused.
static bool check_4x64_to_256_layout(const char * name, uint32_t seed) {
    GGML_UNUSED(seed);

    // Offsets checked by this test, kept as int so they can be passed to %d / %x unchanged.
    const int n_lanes       = 4;    // source blocks in the row
    const int n_packs       = 8;    // 32-bit quant words per lane
    const int lane_qs_bytes = 32;   // quant bytes per lane
    const int n_scales      = 4;    // scale bytes per lane
    const int scales_offset = 128;  // byte offset of the first scale in the packed row

    std::vector<uint8_t> input;
    std::vector<uint8_t> packed(ggml_row_size(GGML_TYPE_NVFP4, QK_K));

    fill_layout_row(input);
    ggml_cuda_repack_rows_nvfp4(QK_K, 1, input.data(), packed.data());

    const block_nvfp4 * src = (const block_nvfp4 *) input.data();

    for (int lane = 0; lane < n_lanes; ++lane) {
        for (int pack = 0; pack < n_packs; ++pack) {
            uint32_t got = 0;
            memcpy(&got, packed.data() + lane * lane_qs_bytes + pack * sizeof(got), sizeof(got));

            const uint32_t expected = ggml_cuda_nvfp4_pack(src[lane].qs, pack);
            if (got != expected) {
                std::printf("%s: lane %d pack %d mismatch (expected 0x%08x, got 0x%08x)\n", name, lane, pack, expected, got);
                return false;
            }
        }

        if (memcmp(packed.data() + scales_offset + lane * n_scales, src[lane].d, n_scales) != 0) {
            std::printf("%s: lane %d scales mismatch\n", name, lane);
            return false;
        }
    }

    std::printf("%s:\n", name);
    std::printf(" 4 x block_nvfp4 (64 weights each) -> 1 x block_nvfp4_cuda (256 weights total)\n");

    for (int lane = 0; lane < n_lanes; ++lane) {
        std::printf(" block_nvfp4[%d] qs:", lane);
        for (int i = 0; i < lane_qs_bytes; ++i) {
            std::printf(" %02x", src[lane].qs[i]);
        }
        std::printf("\n");
    }

    for (int lane = 0; lane < n_lanes; ++lane) {
        std::printf(" block_nvfp4[%d] scales:", lane);
        for (int i = 0; i < n_scales; ++i) {
            std::printf(" %02x", src[lane].d[i]);
        }
        std::printf("\n");
    }

    std::printf(" block_nvfp4_cuda:\n");

    for (int lane = 0; lane < n_lanes; ++lane) {
        std::printf(" lane %d qs @0x%02x:", lane, lane * lane_qs_bytes);
        for (int pack = 0; pack < n_packs; ++pack) {
            uint32_t word = 0;
            memcpy(&word, packed.data() + lane * lane_qs_bytes + pack * sizeof(word), sizeof(word));
            std::printf(" %08x", word);
        }
        std::printf("\n");
    }

    for (int lane = 0; lane < n_lanes; ++lane) {
        std::printf(" lane %d scales @0x%02x:", lane, scales_offset + lane * n_scales);
        for (int i = 0; i < n_scales; ++i) {
            std::printf(" %02x", packed[scales_offset + lane * n_scales + i]);
        }
        std::printf("\n");
    }

    return true;
}

// Run every test case, print a pass count, and return 0 only if all of them passed.
int main() {
    int total  = 0;
    int passed = 0;

    const struct {
        const char * name;
        int64_t      ne0, nrows;
        uint32_t     seed;
    } roundtrip_cases[] = {
        { "roundtrip-ne0-64",   64, 3, 0x1001U },
        { "roundtrip-ne0-128", 128, 3, 0x1002U },
        { "roundtrip-ne0-192", 192, 3, 0x1003U },
        { "roundtrip-ne0-256", 256, 2, 0x1004U },
        { "roundtrip-ne0-320", 320, 3, 0x1005U },
    };

    for (const auto & test : roundtrip_cases) {
        total  += 1;
        passed += check_roundtrip(test.name, test.ne0, test.nrows, test.seed);
    }

    // both patches start shortly before the end of row 0 and extend into row 1
    total  += 1;
    passed += check_partial_patch("partial-cross-row-128", 128, 3, ggml_row_size(GGML_TYPE_NVFP4, 128) - 11, 27, 0x2001U);

    total  += 1;
    passed += check_partial_patch("partial-cross-row-320", 320, 2, ggml_row_size(GGML_TYPE_NVFP4, 320) - 19, 41, 0x2002U);

    total  += 1;
    passed += check_4x64_to_256_layout("layout-4x64-to-256", 0x3001U);

    std::printf("test-nvfp4-repack: %d/%d passed\n", passed, total);

    return passed == total ? 0 : 1;
}