diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index ce7a80acde..964cf70a29 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -36,6 +36,7 @@ #include "ggml-cuda/pad.cuh" #include "ggml-cuda/pool2d.cuh" #include "ggml-cuda/quantize.cuh" +#include "ggml-cuda/repack_nvfp4.cuh" #include "ggml-cuda/rope.cuh" #include "ggml-cuda/roll.cuh" #include "ggml-cuda/scale.cuh" @@ -641,10 +642,86 @@ static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer_t buffer, CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); } +struct ggml_cuda_nvfp4_row_span { + size_t row_size; + size_t aligned_offset; + size_t aligned_size; + size_t inner_offset; + int64_t row_begin; + int64_t nrows; +}; + +static ggml_cuda_nvfp4_row_span ggml_cuda_get_nvfp4_row_span(const ggml_tensor * tensor, size_t offset, size_t size) { + GGML_ASSERT(tensor != nullptr); + GGML_ASSERT(ggml_is_contiguous(tensor) && "NVFP4 partial repack needs contiguous tensors"); + GGML_ASSERT(offset <= ggml_nbytes(tensor)); + GGML_ASSERT(size <= ggml_nbytes(tensor) - offset); + + const size_t row_size = ggml_row_size(GGML_TYPE_NVFP4, tensor->ne[0]); + if (size == 0) { + return { + row_size, + offset, + 0, + 0, + (int64_t) (offset / row_size), + 0, + }; + } + + const size_t aligned_offset = offset / row_size * row_size; + const size_t aligned_end = (offset + size + row_size - 1) / row_size * row_size; + + GGML_ASSERT(aligned_end <= ggml_nbytes(tensor)); + + return { + row_size, + aligned_offset, + aligned_end - aligned_offset, + offset - aligned_offset, + (int64_t) (aligned_offset / row_size), + (int64_t) ((aligned_end - aligned_offset) / row_size), + }; +} + static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; + if (size == 0) { + return; + } + 
ggml_cuda_set_device(ctx->device); + if (tensor->type == GGML_TYPE_NVFP4) { + const ggml_cuda_nvfp4_row_span span = ggml_cuda_get_nvfp4_row_span(tensor, offset, size); + + std::vector<uint8_t> rows(span.aligned_size); // Pull full rows for partial updates + if (span.inner_offset != 0 || size != span.aligned_size) { + std::vector<uint8_t> packed(span.aligned_size); + CUDA_CHECK(cudaMemcpyAsync( + packed.data(), + (const char *) tensor->data + span.aligned_offset, + span.aligned_size, + cudaMemcpyDeviceToHost, + cudaStreamPerThread)); + CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); + ggml_cuda_unpack_rows_nvfp4(tensor->ne[0], span.nrows, packed.data(), rows.data()); + } + + memcpy(rows.data() + span.inner_offset, data, size); + + std::vector<uint8_t> packed(span.aligned_size); + ggml_cuda_repack_rows_nvfp4(tensor->ne[0], span.nrows, rows.data(), packed.data()); + CUDA_CHECK(cudaMemcpyAsync( + (char *) tensor->data + span.aligned_offset, + packed.data(), + span.aligned_size, + cudaMemcpyHostToDevice, + cudaStreamPerThread)); + CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); + return; + } + + CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread)); CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); } @@ -652,7 +729,34 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, gg static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; + if (size == 0) { + return; + } + ggml_cuda_set_device(ctx->device); + if (tensor->type == GGML_TYPE_NVFP4) { + const ggml_cuda_nvfp4_row_span span = ggml_cuda_get_nvfp4_row_span(tensor, offset, size); + + std::vector<uint8_t> packed(span.aligned_size); + CUDA_CHECK(cudaMemcpyAsync( + packed.data(), + (const char *) tensor->data + span.aligned_offset, + span.aligned_size, + 
cudaMemcpyDeviceToHost, + cudaStreamPerThread)); + CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); + + if (span.inner_offset == 0 && size == span.aligned_size) { + ggml_cuda_unpack_rows_nvfp4(tensor->ne[0], span.nrows, packed.data(), data); + return; + } + + std::vector<uint8_t> rows(span.aligned_size); + ggml_cuda_unpack_rows_nvfp4(tensor->ne[0], span.nrows, packed.data(), rows.data()); + memcpy(data, rows.data() + span.inner_offset, size); + return; + } + + CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cudaStreamPerThread)); CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); } @@ -916,6 +1020,76 @@ static enum ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_ } static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + if (size == 0) { + return; + } + + if (tensor->type == GGML_TYPE_NVFP4) { + GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors"); + + ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context; + + const int64_t ne0 = tensor->ne[0]; + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra; + const ggml_cuda_nvfp4_row_span span = ggml_cuda_get_nvfp4_row_span(tensor, offset, size); + + for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { + int64_t row_low, row_high; + get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); + + const int64_t copy_row_low = std::max(row_low, span.row_begin); + const int64_t copy_row_high = std::min(row_high, span.row_begin + span.nrows); + const int64_t nrows_copy = copy_row_high - copy_row_low; + if (nrows_copy == 0) { + continue; + } + + const size_t row_size = span.row_size; + const size_t offset_dst = (copy_row_low - row_low) * row_size; + const size_t size_copy = nrows_copy * row_size; + 
const size_t aligned_offset = (size_t) copy_row_low * row_size; + + std::vector<uint8_t> rows(size_copy); + if (span.inner_offset != 0 || size != span.aligned_size) { + std::vector<uint8_t> packed(size_copy); + ggml_cuda_set_device(id); + CUDA_CHECK(cudaMemcpyAsync( + packed.data(), + (const char *) extra->data_device[id] + offset_dst, + size_copy, + cudaMemcpyDeviceToHost, + cudaStreamPerThread)); + CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); + ggml_cuda_unpack_rows_nvfp4(ne0, nrows_copy, packed.data(), rows.data()); + } + + const size_t overlap_begin = std::max(offset, aligned_offset); + const size_t overlap_end = std::min(offset + size, aligned_offset + size_copy); + if (overlap_begin < overlap_end) { + memcpy( + rows.data() + (overlap_begin - aligned_offset), + (const char *) data + (overlap_begin - offset), + overlap_end - overlap_begin); + } + + std::vector<uint8_t> packed(size_copy); + ggml_cuda_repack_rows_nvfp4(ne0, nrows_copy, rows.data(), packed.data()); + ggml_cuda_set_device(id); + CUDA_CHECK(cudaMemcpyAsync( + (char *) extra->data_device[id] + offset_dst, + packed.data(), + size_copy, + cudaMemcpyHostToDevice, + cudaStreamPerThread)); + } + + for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { + ggml_cuda_set_device(id); + CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); + } + return; + } + // split tensors must always be set in their entirety at once GGML_ASSERT(offset == 0); GGML_ASSERT(size == ggml_nbytes(tensor)); @@ -955,6 +1129,60 @@ static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buff } static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + if (size == 0) { + return; + } + + if (tensor->type == GGML_TYPE_NVFP4) { + GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors"); + + ggml_backend_cuda_split_buffer_type_context * buft_ctx = 
(ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context; + + const int64_t ne0 = tensor->ne[0]; + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra; + const ggml_cuda_nvfp4_row_span span = ggml_cuda_get_nvfp4_row_span(tensor, offset, size); + char * host_dst = (char *) data; + std::vector<char> host_aligned; + if (span.inner_offset != 0 || size != span.aligned_size) { + host_aligned.resize(span.aligned_size); + host_dst = host_aligned.data(); + } + + for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { + int64_t row_low, row_high; + get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); + + const int64_t copy_row_low = std::max(row_low, span.row_begin); + const int64_t copy_row_high = std::min(row_high, span.row_begin + span.nrows); + const int64_t nrows_copy = copy_row_high - copy_row_low; + if (nrows_copy == 0) { + continue; + } + + const size_t row_size = span.row_size; + const size_t offset_dst = (copy_row_low - span.row_begin) * row_size; + const size_t offset_src = (copy_row_low - row_low) * row_size; + const size_t size_copy = nrows_copy * row_size; + + std::vector<uint8_t> packed(size_copy); + ggml_cuda_set_device(id); + CUDA_CHECK(cudaMemcpyAsync( + packed.data(), + (const char *) extra->data_device[id] + offset_src, + size_copy, + cudaMemcpyDeviceToHost, + cudaStreamPerThread)); + CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); + + ggml_cuda_unpack_rows_nvfp4(ne0, nrows_copy, packed.data(), host_dst + offset_dst); + } + + if (host_dst != data) { + memcpy(data, host_dst + span.inner_offset, size); + } + return; + } + // split tensors must always be set in their entirety at once GGML_ASSERT(offset == 0); GGML_ASSERT(size == ggml_nbytes(tensor)); @@ -2844,6 +3072,16 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_backend_buffer_t buf = tensor->view_src ? 
tensor->view_src->buffer : tensor->buffer; + if (size == 0) { + return; + } + + if (tensor->type == GGML_TYPE_NVFP4) { + CUDA_CHECK(cudaStreamSynchronize(cuda_ctx->stream())); + buf->iface.set_tensor(buf, tensor, data, offset, size); + return; + } + GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cuda_ctx->stream())); @@ -2853,6 +3091,16 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; + if (size == 0) { + return; + } + + if (tensor->type == GGML_TYPE_NVFP4) { + CUDA_CHECK(cudaStreamSynchronize(cuda_ctx->stream())); + buf->iface.get_tensor(buf, tensor, data, offset, size); + return; + } + GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cuda_ctx->stream())); diff --git a/ggml/src/ggml-cuda/repack_nvfp4.cu b/ggml/src/ggml-cuda/repack_nvfp4.cu new file mode 100644 index 0000000000..94c5ed8d68 --- /dev/null +++ b/ggml/src/ggml-cuda/repack_nvfp4.cu @@ -0,0 +1,111 @@ +#include <cstring> + +#include "ggml.h" +#include "repack_nvfp4.cuh" + +static void ggml_cuda_repack_row_nvfp4(const block_nvfp4 * src, uint8_t * dst, int64_t ne0) { + GGML_ASSERT(ne0 % QK_NVFP4 == 0); + + const int lanes_per_cuda_block = QK_K / QK_NVFP4; + const int64_t n_upstream_blocks = ne0 / QK_NVFP4; + const int64_t n_cuda_blocks = n_upstream_blocks / lanes_per_cuda_block; + const int64_t tail_lanes = n_upstream_blocks % lanes_per_cuda_block; + + for (int64_t ib = 0; ib < n_cuda_blocks; ++ib) { + const block_nvfp4 * in = src + ib * lanes_per_cuda_block; + uint8_t * out = dst + ib * 
sizeof(block_nvfp4_cuda); + + for (int lane = 0; lane < lanes_per_cuda_block; ++lane) { + uint8_t * out_qs = out + lane * sizeof(in[lane].qs); + uint8_t * out_scales = out + lanes_per_cuda_block * sizeof(in[lane].qs) + lane * sizeof(in[lane].d); + + for (int pack = 0; pack < 8; ++pack) { + const uint32_t packed = ggml_cuda_nvfp4_pack(in[lane].qs, pack); + memcpy(out_qs + pack * sizeof(packed), &packed, sizeof(packed)); + } + memcpy(out_scales, in[lane].d, sizeof(in[lane].d)); + } + } + + if (tail_lanes > 0) { + const block_nvfp4 * in_tail = src + n_cuda_blocks * lanes_per_cuda_block; + uint8_t * tail = dst + n_cuda_blocks * sizeof(block_nvfp4_cuda); // Last short block stays compact too + + for (int64_t lane = 0; lane < tail_lanes; ++lane) { + uint8_t * tail_qs = tail + lane * sizeof(in_tail[lane].qs); + uint8_t * tail_scales = tail + tail_lanes * sizeof(in_tail[lane].qs) + lane * sizeof(in_tail[lane].d); + + for (int pack = 0; pack < 8; ++pack) { + const uint32_t packed = ggml_cuda_nvfp4_pack(in_tail[lane].qs, pack); + memcpy(tail_qs + pack * sizeof(packed), &packed, sizeof(packed)); + } + memcpy(tail_scales, in_tail[lane].d, sizeof(in_tail[lane].d)); + } + } +} + +static void ggml_cuda_unpack_weights_nvfp4(const uint8_t * src, uint8_t * dst) { + for (int scale = 0; scale < 4; ++scale) { + uint32_t packed_lo; + uint32_t packed_hi; + memcpy(&packed_lo, src + (scale * 2 + 0) * sizeof(packed_lo), sizeof(packed_lo)); + memcpy(&packed_hi, src + (scale * 2 + 1) * sizeof(packed_hi), sizeof(packed_hi)); + + for (int value = 0; value < 8; ++value) { + dst[scale * 8 + value] = + ggml_cuda_nvfp4_unpack(packed_lo, value) | + (ggml_cuda_nvfp4_unpack(packed_hi, value) << 4); + } + } +} + +static void ggml_cuda_unpack_row_nvfp4(const uint8_t * src, block_nvfp4 * dst, int64_t ne0) { + GGML_ASSERT(ne0 % QK_NVFP4 == 0); + + const int lanes_per_cuda_block = QK_K / QK_NVFP4; + const int64_t n_upstream_blocks = ne0 / QK_NVFP4; + const int64_t n_cuda_blocks = n_upstream_blocks / 
lanes_per_cuda_block; + const int64_t tail_lanes = n_upstream_blocks % lanes_per_cuda_block; + + for (int64_t ib = 0; ib < n_cuda_blocks; ++ib) { + const uint8_t * in = src + ib * sizeof(block_nvfp4_cuda); + block_nvfp4 * out = dst + ib * lanes_per_cuda_block; + + for (int lane = 0; lane < lanes_per_cuda_block; ++lane) { + const uint8_t * in_qs = in + lane * sizeof(out[lane].qs); + const uint8_t * in_scales = in + lanes_per_cuda_block * sizeof(out[lane].qs) + lane * sizeof(out[lane].d); + + ggml_cuda_unpack_weights_nvfp4(in_qs, out[lane].qs); + memcpy(out[lane].d, in_scales, sizeof(out[lane].d)); + } + } + + if (tail_lanes > 0) { + const uint8_t * tail = src + n_cuda_blocks * sizeof(block_nvfp4_cuda); + block_nvfp4 * out_tail = dst + n_cuda_blocks * lanes_per_cuda_block; // Same compact tail on unpack + + for (int64_t lane = 0; lane < tail_lanes; ++lane) { + const uint8_t * tail_qs = tail + lane * sizeof(out_tail[lane].qs); + const uint8_t * tail_scales = tail + tail_lanes * sizeof(out_tail[lane].qs) + lane * sizeof(out_tail[lane].d); + + ggml_cuda_unpack_weights_nvfp4(tail_qs, out_tail[lane].qs); + memcpy(out_tail[lane].d, tail_scales, sizeof(out_tail[lane].d)); + } + } +} + +void ggml_cuda_repack_rows_nvfp4(int64_t ne0, int64_t nrows, const void * src, void * dst) { + const size_t row_size = ggml_row_size(GGML_TYPE_NVFP4, ne0); + + for (int64_t row = 0; row < nrows; ++row) { + ggml_cuda_repack_row_nvfp4((const block_nvfp4 *) ((const uint8_t *) src + row * row_size), (uint8_t *) dst + row * row_size, ne0); + } +} + +void ggml_cuda_unpack_rows_nvfp4(int64_t ne0, int64_t nrows, const void * src, void * dst) { + const size_t row_size = ggml_row_size(GGML_TYPE_NVFP4, ne0); + + for (int64_t row = 0; row < nrows; ++row) { + ggml_cuda_unpack_row_nvfp4((const uint8_t *) src + row * row_size, (block_nvfp4 *) ((uint8_t *) dst + row * row_size), ne0); + } +} diff --git a/ggml/src/ggml-cuda/repack_nvfp4.cuh b/ggml/src/ggml-cuda/repack_nvfp4.cuh new file mode 100644 index 
0000000000..54ea1a8435 --- /dev/null +++ b/ggml/src/ggml-cuda/repack_nvfp4.cuh @@ -0,0 +1,50 @@ +#pragma once + +#include <cstddef> +#include <cstdint> + +#if !defined(GGML_COMMON_DECL) +#if defined(__CUDACC__) || defined(__HIPCC__) || defined(__HIP_DEVICE_COMPILE__) +#define GGML_COMMON_DECL_CUDA +#else +#define GGML_COMMON_DECL_CPP +#endif +#endif +#include "ggml-common.h" + +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) +#define GGML_ALIGN_16 __align__(16) +#else +#define GGML_ALIGN_16 alignas(16) +#endif + +static constexpr int GGML_NVFP4_CUDA_LANES = QK_K / QK_NVFP4; // 4 packs in one 256 block +static_assert(GGML_NVFP4_CUDA_LANES == 4, "unexpected NVFP4 CUDA lane count"); + +// Full groups use 4 x 64 lane blocks +// Tails keep the compact row size +struct GGML_ALIGN_16 block_nvfp4_cuda { + uint8_t qs[GGML_NVFP4_CUDA_LANES][QK_NVFP4 / 2]; + uint8_t scales[GGML_NVFP4_CUDA_LANES][QK_NVFP4 / QK_NVFP4_SUB]; +}; + +static_assert(sizeof(block_nvfp4_cuda) == 144, "unexpected nvfp4 cuda block size"); +static_assert(alignof(block_nvfp4_cuda) == 16, "nvfp4 cuda block must be 16B aligned"); + +static inline uint32_t ggml_cuda_nvfp4_pack(const uint8_t src[32], int pack) { + const int scale = pack >> 1; + const int nibble_shift = (pack & 1) << 2; + uint32_t out = 0; + for (int value = 0; value < 8; ++value) { + const uint32_t nibble = (uint32_t) ((src[scale * 8 + value] >> nibble_shift) & 0x0F); // 8 fp4 codes into one u32 + out |= (nibble << (4 * value)); + } + return out; +} + +static inline uint8_t ggml_cuda_nvfp4_unpack(uint32_t packed, int value) { + return (packed >> (4 * value)) & 0x0F; +} + +void ggml_cuda_repack_rows_nvfp4(int64_t ne0, int64_t nrows, const void * src, void * dst); +void ggml_cuda_unpack_rows_nvfp4(int64_t ne0, int64_t nrows, const void * src, void * dst); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 9582164b58..e8d5b6d35d 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -285,5 +285,10 @@ 
target_link_libraries(${TEST_TARGET} PRIVATE llama) llama_build_and_test(test-alloc.cpp) target_include_directories(test-alloc PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src) +if (GGML_CUDA AND NOT GGML_BACKEND_DL) + llama_build_and_test(test-nvfp4-repack.cpp) + target_include_directories(test-nvfp4-repack PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src) +endif() + llama_build(export-graph-ops.cpp) target_include_directories(export-graph-ops PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src) diff --git a/tests/test-nvfp4-repack.cpp b/tests/test-nvfp4-repack.cpp new file mode 100644 index 0000000000..96d04bb544 --- /dev/null +++ b/tests/test-nvfp4-repack.cpp @@ -0,0 +1,217 @@ +#include "ggml.h" +#include "ggml-cuda/repack_nvfp4.cuh" + +#include <cstdio> +#include <cstring> +#include <vector> + +static void set_q4(uint8_t * qs, int idx, uint8_t value) { + uint8_t & byte = qs[idx / 2]; + + if (idx & 1) { + byte = (uint8_t) ((byte & 0x0f) | ((value & 0x0f) << 4)); + } else { + byte = (uint8_t) ((byte & 0xf0) | (value & 0x0f)); + } +} + +static void fill_block(block_nvfp4 & blk, uint32_t seed, int row, int block) { + memset(&blk, 0, sizeof(blk)); + + for (int i = 0; i < QK_NVFP4; ++i) { + const uint32_t value = seed + (uint32_t) row * 19U + (uint32_t) block * 11U + (uint32_t) i * 5U + (uint32_t) (i / 3); + set_q4(blk.qs, i, (uint8_t) (value & 0x0f)); + } + + for (int i = 0; i < QK_NVFP4 / QK_NVFP4_SUB; ++i) { + const uint32_t value = seed + 0x31U + (uint32_t) row * 7U + (uint32_t) block * 13U + (uint32_t) i * 9U; + blk.d[i] = (uint8_t) (0x30 + value % 0x30); + } +} + +static void fill_rows(std::vector<uint8_t> & rows, int64_t ne0, int64_t nrows, uint32_t seed) { + const size_t row_size = ggml_row_size(GGML_TYPE_NVFP4, ne0); + const int blocks_per_row = (int) (ne0 / QK_NVFP4); + + rows.assign(row_size * nrows, 0); + + for (int64_t row = 0; row < nrows; ++row) { + block_nvfp4 * dst = (block_nvfp4 *) (rows.data() + row * row_size); + for (int block = 0; block < blocks_per_row; ++block) { + fill_block(dst[block], seed, (int) row, block); 
+ } + } +} + +static void fill_layout_row(std::vector<uint8_t> & rows) { + rows.assign(ggml_row_size(GGML_TYPE_NVFP4, QK_K), 0); + + block_nvfp4 * dst = (block_nvfp4 *) rows.data(); + for (int lane = 0; lane < 4; ++lane) { + for (int i = 0; i < 32; ++i) { + dst[lane].qs[i] = (uint8_t) ((lane * 0x31 + i * 0x17 + 0x12) & 0xff); + } + for (int i = 0; i < 4; ++i) { + dst[lane].d[i] = (uint8_t) (0x31 + lane * 0x0d + i * 0x09); + } + } +} + +static bool expect_equal(const char * name, const std::vector<uint8_t> & expected, const std::vector<uint8_t> & actual) { + if (expected.size() != actual.size()) { + std::printf("%s: size mismatch (%zu != %zu)\n", name, expected.size(), actual.size()); + return false; + } + + for (size_t i = 0; i < expected.size(); ++i) { + if (expected[i] != actual[i]) { + std::printf("%s: first mismatch at byte %zu (expected 0x%02x, got 0x%02x)\n", + name, i, expected[i], actual[i]); + return false; + } + } + + return true; +} + +static bool check_roundtrip(const char * name, int64_t ne0, int64_t nrows, uint32_t seed) { + const size_t row_size = ggml_row_size(GGML_TYPE_NVFP4, ne0); + + std::vector<uint8_t> input; + std::vector<uint8_t> packed(row_size * nrows); + std::vector<uint8_t> output(row_size * nrows); + + fill_rows(input, ne0, nrows, seed); + ggml_cuda_repack_rows_nvfp4(ne0, nrows, input.data(), packed.data()); + ggml_cuda_unpack_rows_nvfp4(ne0, nrows, packed.data(), output.data()); + + return expect_equal(name, input, output); +} + +static bool check_partial_patch(const char * name, int64_t ne0, int64_t nrows, size_t offset, size_t size, uint32_t seed) { + const size_t row_size = ggml_row_size(GGML_TYPE_NVFP4, ne0); + const size_t total_size = row_size * nrows; + + std::vector<uint8_t> input; + std::vector<uint8_t> expected; + std::vector<uint8_t> packed(total_size); + std::vector<uint8_t> output(total_size); + std::vector<uint8_t> patch(size); + + fill_rows(input, ne0, nrows, seed); + expected = input; + + for (size_t i = 0; i < patch.size(); ++i) { + patch[i] = (uint8_t) (seed + 0x5bU + (uint32_t) i * 23U + (uint32_t) (i / 5)); + } + 
+ memcpy(expected.data() + offset, patch.data(), size); + + ggml_cuda_repack_rows_nvfp4(ne0, nrows, input.data(), packed.data()); + + const size_t aligned_offset = offset / row_size * row_size; + const size_t aligned_end = (offset + size + row_size - 1) / row_size * row_size; + const size_t aligned_size = aligned_end - aligned_offset; + const int64_t aligned_rows = (int64_t) (aligned_size / row_size); + const size_t inner_offset = offset - aligned_offset; + + std::vector<uint8_t> rows(aligned_size); + ggml_cuda_unpack_rows_nvfp4(ne0, aligned_rows, packed.data() + aligned_offset, rows.data()); + memcpy(rows.data() + inner_offset, patch.data(), size); + ggml_cuda_repack_rows_nvfp4(ne0, aligned_rows, rows.data(), packed.data() + aligned_offset); + ggml_cuda_unpack_rows_nvfp4(ne0, nrows, packed.data(), output.data()); + + return expect_equal(name, expected, output); +} + +static bool check_4x64_to_256_layout(const char * name, uint32_t seed) { + GGML_UNUSED(seed); + + std::vector<uint8_t> input; + std::vector<uint8_t> packed(ggml_row_size(GGML_TYPE_NVFP4, QK_K)); + + fill_layout_row(input); + ggml_cuda_repack_rows_nvfp4(QK_K, 1, input.data(), packed.data()); + + const block_nvfp4 * src = (const block_nvfp4 *) input.data(); + for (int lane = 0; lane < 4; ++lane) { + for (int pack = 0; pack < 8; ++pack) { + uint32_t got = 0; + memcpy(&got, packed.data() + lane * 32 + pack * sizeof(got), sizeof(got)); + const uint32_t expected = ggml_cuda_nvfp4_pack(src[lane].qs, pack); + if (got != expected) { + std::printf("%s: lane %d pack %d mismatch (expected 0x%08x, got 0x%08x)\n", name, lane, pack, expected, got); + return false; + } + } + + if (memcmp(packed.data() + 128 + lane * 4, src[lane].d, 4) != 0) { + std::printf("%s: lane %d scales mismatch\n", name, lane); + return false; + } + } + + std::printf("%s:\n", name); + std::printf(" 4 x block_nvfp4 (64 weights each) -> 1 x block_nvfp4_cuda (256 weights total)\n"); + for (int lane = 0; lane < 4; ++lane) { + std::printf(" block_nvfp4[%d] qs:", lane); + for 
(int i = 0; i < 32; ++i) { + std::printf(" %02x", src[lane].qs[i]); + } + std::printf("\n"); + } + for (int lane = 0; lane < 4; ++lane) { + std::printf(" block_nvfp4[%d] scales:", lane); + for (int i = 0; i < 4; ++i) { + std::printf(" %02x", src[lane].d[i]); + } + std::printf("\n"); + } + + std::printf(" block_nvfp4_cuda:\n"); + for (int lane = 0; lane < 4; ++lane) { + std::printf(" lane %d qs @0x%02x:", lane, lane * 32); + for (int pack = 0; pack < 8; ++pack) { + uint32_t word = 0; + memcpy(&word, packed.data() + lane * 32 + pack * sizeof(word), sizeof(word)); + std::printf(" %08x", word); + } + std::printf("\n"); + } + for (int lane = 0; lane < 4; ++lane) { + std::printf(" lane %d scales @0x%02x:", lane, 128 + lane * 4); + for (int i = 0; i < 4; ++i) { + std::printf(" %02x", packed[128 + lane * 4 + i]); + } + std::printf("\n"); + } + return true; +} + +int main() { + int total = 0; + int passed = 0; + + const struct { const char * name; int64_t ne0, nrows; uint32_t seed; } roundtrip_cases[] = { + { "roundtrip-ne0-64", 64, 3, 0x1001U }, { "roundtrip-ne0-128", 128, 3, 0x1002U }, + { "roundtrip-ne0-192", 192, 3, 0x1003U }, { "roundtrip-ne0-256", 256, 2, 0x1004U }, + { "roundtrip-ne0-320", 320, 3, 0x1005U }, + }; + + for (const auto & test : roundtrip_cases) { + total += 1; + passed += check_roundtrip(test.name, test.ne0, test.nrows, test.seed); + } + + total += 1; + passed += check_partial_patch("partial-cross-row-128", 128, 3, ggml_row_size(GGML_TYPE_NVFP4, 128) - 11, 27, 0x2001U); + + total += 1; + passed += check_partial_patch("partial-cross-row-320", 320, 2, ggml_row_size(GGML_TYPE_NVFP4, 320) - 19, 41, 0x2002U); + + total += 1; + passed += check_4x64_to_256_layout("layout-4x64-to-256", 0x3001U); + + std::printf("test-nvfp4-repack: %d/%d passed\n", passed, total); + return passed == total ? 0 : 1; +}