WIP Cuda NVFP4 Repacker Helpers
This commit is contained in:
parent
d23355afc3
commit
7cc56df86c
|
|
@ -36,6 +36,7 @@
|
|||
#include "ggml-cuda/pad.cuh"
|
||||
#include "ggml-cuda/pool2d.cuh"
|
||||
#include "ggml-cuda/quantize.cuh"
|
||||
#include "ggml-cuda/repack_nvfp4.cuh"
|
||||
#include "ggml-cuda/rope.cuh"
|
||||
#include "ggml-cuda/roll.cuh"
|
||||
#include "ggml-cuda/scale.cuh"
|
||||
|
|
@ -641,10 +642,86 @@ static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer_t buffer,
|
|||
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
|
||||
}
|
||||
|
||||
// Describes how a byte range [offset, offset + size) of a contiguous NVFP4
// tensor maps onto whole rows, which are the unit of (un)packing.
struct ggml_cuda_nvfp4_row_span {
    size_t row_size;       // bytes per row (ggml_row_size for GGML_TYPE_NVFP4)
    size_t aligned_offset; // range start rounded down to a row boundary
    size_t aligned_size;   // row-aligned byte count covering the whole range
    size_t inner_offset;   // offset - aligned_offset (start position inside the first row)
    int64_t row_begin;     // index of the first row touched
    int64_t nrows;         // number of whole rows covered by the span
};
|
||||
|
||||
// Expands the byte range [offset, offset + size) of a contiguous NVFP4 tensor
// to whole rows and returns the resulting span description.
static ggml_cuda_nvfp4_row_span ggml_cuda_get_nvfp4_row_span(const ggml_tensor * tensor, size_t offset, size_t size) {
    GGML_ASSERT(tensor != nullptr);
    GGML_ASSERT(ggml_is_contiguous(tensor) && "NVFP4 partial repack needs contiguous tensors");
    GGML_ASSERT(offset <= ggml_nbytes(tensor));
    GGML_ASSERT(size <= ggml_nbytes(tensor) - offset);

    const size_t row_size = ggml_row_size(GGML_TYPE_NVFP4, tensor->ne[0]);

    // Empty request: degenerate zero-row span anchored at the row containing `offset`.
    if (size == 0) {
        return { row_size, offset, 0, 0, (int64_t) (offset / row_size), 0 };
    }

    const size_t span_begin = offset / row_size * row_size;                         // round down to row start
    const size_t span_end   = (offset + size + row_size - 1) / row_size * row_size; // round up to row end

    GGML_ASSERT(span_end <= ggml_nbytes(tensor));

    const size_t span_bytes = span_end - span_begin;
    return {
        row_size,
        span_begin,
        span_bytes,
        offset - span_begin,
        (int64_t) (span_begin / row_size),
        (int64_t) (span_bytes / row_size),
    };
}
|
||||
|
||||
// Copies host `data` into the device tensor at [offset, offset + size).
// NVFP4 tensors are stored on device in the repacked CUDA layout (see
// ggml_cuda_repack_rows_nvfp4), so writes are staged through host buffers:
// read back the touched rows, unpack, patch, repack, and upload again.
static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;

    if (size == 0) {
        return;
    }

    ggml_cuda_set_device(ctx->device);
    if (tensor->type == GGML_TYPE_NVFP4) {
        // Widen the byte range to whole rows; (un)packing works row-at-a-time.
        const ggml_cuda_nvfp4_row_span span = ggml_cuda_get_nvfp4_row_span(tensor, offset, size);

        std::vector<char> rows(span.aligned_size); // Pull full rows for partial updates
        if (span.inner_offset != 0 || size != span.aligned_size) {
            // Partial update: fetch and unpack the existing device rows so the
            // bytes outside [offset, offset + size) are preserved.
            std::vector<char> packed(span.aligned_size);
            CUDA_CHECK(cudaMemcpyAsync(
                packed.data(),
                (const char *) tensor->data + span.aligned_offset,
                span.aligned_size,
                cudaMemcpyDeviceToHost,
                cudaStreamPerThread));
            CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
            ggml_cuda_unpack_rows_nvfp4(tensor->ne[0], span.nrows, packed.data(), rows.data());
        }

        // Splice the incoming bytes into the unpacked row image.
        memcpy(rows.data() + span.inner_offset, data, size);

        // Repack the full rows and push them back to the device.
        std::vector<char> packed(span.aligned_size);
        ggml_cuda_repack_rows_nvfp4(tensor->ne[0], span.nrows, rows.data(), packed.data());
        CUDA_CHECK(cudaMemcpyAsync(
            (char *) tensor->data + span.aligned_offset,
            packed.data(),
            span.aligned_size,
            cudaMemcpyHostToDevice,
            cudaStreamPerThread));
        CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
        return;
    }

    // All other types: plain byte copy, synchronized before returning.
    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
    CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
}
|
||||
|
|
@ -652,7 +729,34 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
|
|||
// Copies [offset, offset + size) of the device tensor into host `data`.
// NVFP4 tensors are stored on device in the repacked CUDA layout, so reads
// download the covering rows, unpack them on the host, and then return the
// requested byte range in the upstream (block_nvfp4) layout.
static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;

    if (size == 0) {
        return;
    }

    ggml_cuda_set_device(ctx->device);
    if (tensor->type == GGML_TYPE_NVFP4) {
        // Widen the byte range to whole rows; (un)packing works row-at-a-time.
        const ggml_cuda_nvfp4_row_span span = ggml_cuda_get_nvfp4_row_span(tensor, offset, size);

        std::vector<char> packed(span.aligned_size);
        CUDA_CHECK(cudaMemcpyAsync(
            packed.data(),
            (const char *) tensor->data + span.aligned_offset,
            span.aligned_size,
            cudaMemcpyDeviceToHost,
            cudaStreamPerThread));
        CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));

        // Fast path: request is exactly row-aligned, unpack straight into `data`.
        if (span.inner_offset == 0 && size == span.aligned_size) {
            ggml_cuda_unpack_rows_nvfp4(tensor->ne[0], span.nrows, packed.data(), data);
            return;
        }

        // Partial read: unpack into a staging buffer and copy out the slice.
        std::vector<char> rows(span.aligned_size);
        ggml_cuda_unpack_rows_nvfp4(tensor->ne[0], span.nrows, packed.data(), rows.data());
        memcpy(data, rows.data() + span.inner_offset, size);
        return;
    }

    // All other types: plain byte copy, synchronized before returning.
    CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
    CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
}
|
||||
|
|
@ -916,6 +1020,76 @@ static enum ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_
|
|||
}
|
||||
|
||||
static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
if (size == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (tensor->type == GGML_TYPE_NVFP4) {
|
||||
GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
|
||||
|
||||
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
|
||||
|
||||
const int64_t ne0 = tensor->ne[0];
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
|
||||
const ggml_cuda_nvfp4_row_span span = ggml_cuda_get_nvfp4_row_span(tensor, offset, size);
|
||||
|
||||
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
||||
int64_t row_low, row_high;
|
||||
get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
|
||||
|
||||
const int64_t copy_row_low = std::max<int64_t>(row_low, span.row_begin);
|
||||
const int64_t copy_row_high = std::min<int64_t>(row_high, span.row_begin + span.nrows);
|
||||
const int64_t nrows_copy = copy_row_high - copy_row_low;
|
||||
if (nrows_copy == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const size_t row_size = span.row_size;
|
||||
const size_t offset_dst = (copy_row_low - row_low) * row_size;
|
||||
const size_t size_copy = nrows_copy * row_size;
|
||||
const size_t aligned_offset = (size_t) copy_row_low * row_size;
|
||||
|
||||
std::vector<char> rows(size_copy);
|
||||
if (span.inner_offset != 0 || size != span.aligned_size) {
|
||||
std::vector<char> packed(size_copy);
|
||||
ggml_cuda_set_device(id);
|
||||
CUDA_CHECK(cudaMemcpyAsync(
|
||||
packed.data(),
|
||||
(const char *) extra->data_device[id] + offset_dst,
|
||||
size_copy,
|
||||
cudaMemcpyDeviceToHost,
|
||||
cudaStreamPerThread));
|
||||
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
|
||||
ggml_cuda_unpack_rows_nvfp4(ne0, nrows_copy, packed.data(), rows.data());
|
||||
}
|
||||
|
||||
const size_t overlap_begin = std::max(offset, aligned_offset);
|
||||
const size_t overlap_end = std::min(offset + size, aligned_offset + size_copy);
|
||||
if (overlap_begin < overlap_end) {
|
||||
memcpy(
|
||||
rows.data() + (overlap_begin - aligned_offset),
|
||||
(const char *) data + (overlap_begin - offset),
|
||||
overlap_end - overlap_begin);
|
||||
}
|
||||
|
||||
std::vector<char> packed(size_copy);
|
||||
ggml_cuda_repack_rows_nvfp4(ne0, nrows_copy, rows.data(), packed.data());
|
||||
ggml_cuda_set_device(id);
|
||||
CUDA_CHECK(cudaMemcpyAsync(
|
||||
(char *) extra->data_device[id] + offset_dst,
|
||||
packed.data(),
|
||||
size_copy,
|
||||
cudaMemcpyHostToDevice,
|
||||
cudaStreamPerThread));
|
||||
}
|
||||
|
||||
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
||||
ggml_cuda_set_device(id);
|
||||
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// split tensors must always be set in their entirety at once
|
||||
GGML_ASSERT(offset == 0);
|
||||
GGML_ASSERT(size == ggml_nbytes(tensor));
|
||||
|
|
@ -955,6 +1129,60 @@ static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buff
|
|||
}
|
||||
|
||||
static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
if (size == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (tensor->type == GGML_TYPE_NVFP4) {
|
||||
GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
|
||||
|
||||
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
|
||||
|
||||
const int64_t ne0 = tensor->ne[0];
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
|
||||
const ggml_cuda_nvfp4_row_span span = ggml_cuda_get_nvfp4_row_span(tensor, offset, size);
|
||||
char * host_dst = (char *) data;
|
||||
std::vector<char> host_aligned;
|
||||
if (span.inner_offset != 0 || size != span.aligned_size) {
|
||||
host_aligned.resize(span.aligned_size);
|
||||
host_dst = host_aligned.data();
|
||||
}
|
||||
|
||||
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
||||
int64_t row_low, row_high;
|
||||
get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
|
||||
|
||||
const int64_t copy_row_low = std::max<int64_t>(row_low, span.row_begin);
|
||||
const int64_t copy_row_high = std::min<int64_t>(row_high, span.row_begin + span.nrows);
|
||||
const int64_t nrows_copy = copy_row_high - copy_row_low;
|
||||
if (nrows_copy == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const size_t row_size = span.row_size;
|
||||
const size_t offset_dst = (copy_row_low - span.row_begin) * row_size;
|
||||
const size_t offset_src = (copy_row_low - row_low) * row_size;
|
||||
const size_t size_copy = nrows_copy * row_size;
|
||||
|
||||
std::vector<char> packed(size_copy);
|
||||
ggml_cuda_set_device(id);
|
||||
CUDA_CHECK(cudaMemcpyAsync(
|
||||
packed.data(),
|
||||
(const char *) extra->data_device[id] + offset_src,
|
||||
size_copy,
|
||||
cudaMemcpyDeviceToHost,
|
||||
cudaStreamPerThread));
|
||||
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
|
||||
|
||||
ggml_cuda_unpack_rows_nvfp4(ne0, nrows_copy, packed.data(), host_dst + offset_dst);
|
||||
}
|
||||
|
||||
if (host_dst != data) {
|
||||
memcpy(data, host_dst + span.inner_offset, size);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// split tensors must always be set in their entirety at once
|
||||
GGML_ASSERT(offset == 0);
|
||||
GGML_ASSERT(size == ggml_nbytes(tensor));
|
||||
|
|
@ -2844,6 +3072,16 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens
|
|||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
||||
|
||||
if (size == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (tensor->type == GGML_TYPE_NVFP4) {
|
||||
CUDA_CHECK(cudaStreamSynchronize(cuda_ctx->stream()));
|
||||
buf->iface.set_tensor(buf, tensor, data, offset, size);
|
||||
return;
|
||||
}
|
||||
|
||||
GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
||||
|
||||
CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cuda_ctx->stream()));
|
||||
|
|
@ -2853,6 +3091,16 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm
|
|||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
||||
|
||||
if (size == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (tensor->type == GGML_TYPE_NVFP4) {
|
||||
CUDA_CHECK(cudaStreamSynchronize(cuda_ctx->stream()));
|
||||
buf->iface.get_tensor(buf, tensor, data, offset, size);
|
||||
return;
|
||||
}
|
||||
|
||||
GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
||||
|
||||
CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cuda_ctx->stream()));
|
||||
|
|
|
|||
|
|
@ -0,0 +1,111 @@
|
|||
#include <cstring>
|
||||
|
||||
#include "ggml.h"
|
||||
#include "repack_nvfp4.cuh"
|
||||
|
||||
// Packs `nlanes` upstream NVFP4 blocks into one contiguous output group:
// first the 8 packed u32 words of 4-bit codes for each lane, then the
// per-lane scale bytes. `nlanes` is QK_K / QK_NVFP4 for full groups and the
// remainder for a short tail group (the tail keeps the compact row size).
static void ggml_cuda_repack_lanes_nvfp4(const block_nvfp4 * in, uint8_t * out, int64_t nlanes) {
    for (int64_t lane = 0; lane < nlanes; ++lane) {
        uint8_t * out_qs     = out + lane * sizeof(in[lane].qs);
        uint8_t * out_scales = out + nlanes * sizeof(in[lane].qs) + lane * sizeof(in[lane].d);

        for (int pack = 0; pack < 8; ++pack) {
            const uint32_t packed = ggml_cuda_nvfp4_pack(in[lane].qs, pack);
            memcpy(out_qs + pack * sizeof(packed), &packed, sizeof(packed));
        }
        memcpy(out_scales, in[lane].d, sizeof(in[lane].d));
    }
}

// Repacks one row of `ne0` NVFP4 weights from the upstream block_nvfp4 layout
// into the CUDA block_nvfp4_cuda layout. The main loop and the tail share the
// same lane-packing helper; the tail group simply has fewer lanes.
static void ggml_cuda_repack_row_nvfp4(const block_nvfp4 * src, uint8_t * dst, int64_t ne0) {
    GGML_ASSERT(ne0 % QK_NVFP4 == 0);

    const int lanes_per_cuda_block  = QK_K / QK_NVFP4;
    const int64_t n_upstream_blocks = ne0 / QK_NVFP4;
    const int64_t n_cuda_blocks     = n_upstream_blocks / lanes_per_cuda_block;
    const int64_t tail_lanes        = n_upstream_blocks % lanes_per_cuda_block;

    for (int64_t ib = 0; ib < n_cuda_blocks; ++ib) {
        ggml_cuda_repack_lanes_nvfp4(
            src + ib * lanes_per_cuda_block,
            dst + ib * sizeof(block_nvfp4_cuda),
            lanes_per_cuda_block);
    }

    if (tail_lanes > 0) {
        // Last short block stays compact too
        ggml_cuda_repack_lanes_nvfp4(
            src + n_cuda_blocks * lanes_per_cuda_block,
            dst + n_cuda_blocks * sizeof(block_nvfp4_cuda),
            tail_lanes);
    }
}
|
||||
|
||||
// Reverses ggml_cuda_nvfp4_pack for one lane: rebuilds 32 bytes of paired
// 4-bit codes from 8 packed u32 words. Each group of 8 output bytes combines
// one "low nibble" word with the following "high nibble" word.
static void ggml_cuda_unpack_weights_nvfp4(const uint8_t * src, uint8_t * dst) {
    for (int group = 0; group < 4; ++group) {
        uint32_t lo;
        uint32_t hi;
        memcpy(&lo, src + (group * 2 + 0) * sizeof(lo), sizeof(lo));
        memcpy(&hi, src + (group * 2 + 1) * sizeof(hi), sizeof(hi));

        uint8_t * out = dst + group * 8;
        for (int i = 0; i < 8; ++i) {
            out[i] = ggml_cuda_nvfp4_unpack(lo, i) | (ggml_cuda_nvfp4_unpack(hi, i) << 4);
        }
    }
}
|
||||
|
||||
// Unpacks `nlanes` lanes of one CUDA-layout group back into upstream
// block_nvfp4 blocks. Mirrors the packing layout: all per-lane qs words come
// first, followed by all per-lane scale bytes.
static void ggml_cuda_unpack_lanes_nvfp4(const uint8_t * in, block_nvfp4 * out, int64_t nlanes) {
    for (int64_t lane = 0; lane < nlanes; ++lane) {
        const uint8_t * in_qs     = in + lane * sizeof(out[lane].qs);
        const uint8_t * in_scales = in + nlanes * sizeof(out[lane].qs) + lane * sizeof(out[lane].d);

        ggml_cuda_unpack_weights_nvfp4(in_qs, out[lane].qs);
        memcpy(out[lane].d, in_scales, sizeof(out[lane].d));
    }
}

// Unpacks one row of `ne0` NVFP4 weights from the CUDA block_nvfp4_cuda
// layout back into the upstream block_nvfp4 layout. The main loop and the
// tail share the same lane-unpacking helper; the tail simply has fewer lanes.
static void ggml_cuda_unpack_row_nvfp4(const uint8_t * src, block_nvfp4 * dst, int64_t ne0) {
    GGML_ASSERT(ne0 % QK_NVFP4 == 0);

    const int lanes_per_cuda_block  = QK_K / QK_NVFP4;
    const int64_t n_upstream_blocks = ne0 / QK_NVFP4;
    const int64_t n_cuda_blocks     = n_upstream_blocks / lanes_per_cuda_block;
    const int64_t tail_lanes        = n_upstream_blocks % lanes_per_cuda_block;

    for (int64_t ib = 0; ib < n_cuda_blocks; ++ib) {
        ggml_cuda_unpack_lanes_nvfp4(
            src + ib * sizeof(block_nvfp4_cuda),
            dst + ib * lanes_per_cuda_block,
            lanes_per_cuda_block);
    }

    if (tail_lanes > 0) {
        // Same compact tail on unpack
        ggml_cuda_unpack_lanes_nvfp4(
            src + n_cuda_blocks * sizeof(block_nvfp4_cuda),
            dst + n_cuda_blocks * lanes_per_cuda_block,
            tail_lanes);
    }
}
|
||||
|
||||
// Repacks `nrows` rows of `ne0` NVFP4 weights from the upstream layout into
// the CUDA layout. Source and destination rows have identical byte sizes.
void ggml_cuda_repack_rows_nvfp4(int64_t ne0, int64_t nrows, const void * src, void * dst) {
    const size_t row_size = ggml_row_size(GGML_TYPE_NVFP4, ne0);

    const uint8_t * src_row = (const uint8_t *) src;
    uint8_t *       dst_row = (uint8_t *) dst;
    for (int64_t row = 0; row < nrows; ++row, src_row += row_size, dst_row += row_size) {
        ggml_cuda_repack_row_nvfp4((const block_nvfp4 *) src_row, dst_row, ne0);
    }
}
|
||||
|
||||
// Unpacks `nrows` rows of `ne0` NVFP4 weights from the CUDA layout back into
// the upstream layout. Inverse of ggml_cuda_repack_rows_nvfp4.
void ggml_cuda_unpack_rows_nvfp4(int64_t ne0, int64_t nrows, const void * src, void * dst) {
    const size_t row_size = ggml_row_size(GGML_TYPE_NVFP4, ne0);

    const uint8_t * src_row = (const uint8_t *) src;
    uint8_t *       dst_row = (uint8_t *) dst;
    for (int64_t row = 0; row < nrows; ++row, src_row += row_size, dst_row += row_size) {
        ggml_cuda_unpack_row_nvfp4(src_row, (block_nvfp4 *) dst_row, ne0);
    }
}
|
||||
|
|
@ -0,0 +1,50 @@
|
|||
#pragma once
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
|
||||
#if !defined(GGML_COMMON_DECL)
|
||||
#if defined(__CUDACC__) || defined(__HIPCC__) || defined(__HIP_DEVICE_COMPILE__)
|
||||
#define GGML_COMMON_DECL_CUDA
|
||||
#else
|
||||
#define GGML_COMMON_DECL_CPP
|
||||
#endif
|
||||
#endif
|
||||
#include "ggml-common.h"
|
||||
|
||||
#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
|
||||
#define GGML_ALIGN_16 __align__(16)
|
||||
#else
|
||||
#define GGML_ALIGN_16 alignas(16)
|
||||
#endif
|
||||
|
||||
// Number of upstream QK_NVFP4-weight blocks fused into one CUDA-layout block
// (QK_K weights total).
static constexpr int GGML_NVFP4_CUDA_LANES = QK_K / QK_NVFP4; // 4 packs in one 256 block
static_assert(GGML_NVFP4_CUDA_LANES == 4, "unexpected NVFP4 CUDA lane count");

// Full groups use 4 x 64 lane blocks
// Tails keep the compact row size
// Layout: the packed 4-bit code words of all lanes are stored first, followed
// by all per-lane scale bytes (see ggml_cuda_repack_rows_nvfp4).
struct GGML_ALIGN_16 block_nvfp4_cuda {
    uint8_t qs[GGML_NVFP4_CUDA_LANES][QK_NVFP4 / 2];                // packed 4-bit codes, two per byte
    uint8_t scales[GGML_NVFP4_CUDA_LANES][QK_NVFP4 / QK_NVFP4_SUB]; // one scale byte per sub-block
};

static_assert(sizeof(block_nvfp4_cuda) == 144, "unexpected nvfp4 cuda block size");
static_assert(alignof(block_nvfp4_cuda) == 16, "nvfp4 cuda block must be 16B aligned");
|
||||
|
||||
// Gathers one nibble (low nibble for even `pack`, high nibble for odd) from 8
// consecutive source bytes and concatenates them little-nibble-endian into a
// single u32: the nibble from src[base + 0] lands in bits [3:0].
static inline uint32_t ggml_cuda_nvfp4_pack(const uint8_t src[32], int pack) {
    const int base  = (pack >> 1) * 8;
    const int shift = (pack & 1) ? 4 : 0;

    uint32_t out = 0;
    for (int i = 7; i >= 0; --i) {
        out = (out << 4) | (uint32_t) ((src[base + i] >> shift) & 0x0F);
    }
    return out;
}
|
||||
|
||||
// Extracts the `value`-th 4-bit field (little-nibble-endian) from a packed
// word; inverse of the per-nibble placement done by ggml_cuda_nvfp4_pack.
static inline uint8_t ggml_cuda_nvfp4_unpack(uint32_t packed, int value) {
    const uint32_t shifted = packed >> (value * 4);
    return (uint8_t) (shifted & 0x0F);
}
|
||||
|
||||
void ggml_cuda_repack_rows_nvfp4(int64_t ne0, int64_t nrows, const void * src, void * dst);
|
||||
void ggml_cuda_unpack_rows_nvfp4(int64_t ne0, int64_t nrows, const void * src, void * dst);
|
||||
|
|
@ -285,5 +285,10 @@ target_link_libraries(${TEST_TARGET} PRIVATE llama)
|
|||
llama_build_and_test(test-alloc.cpp)
|
||||
target_include_directories(test-alloc PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
|
||||
|
||||
# NVFP4 repack helper test; built only for CUDA builds without GGML_BACKEND_DL
# (the test includes ggml/src headers directly — TODO confirm it also needs the
# statically linked CUDA backend objects).
if (GGML_CUDA AND NOT GGML_BACKEND_DL)
    llama_build_and_test(test-nvfp4-repack.cpp)
    target_include_directories(test-nvfp4-repack PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
endif()
|
||||
|
||||
llama_build(export-graph-ops.cpp)
|
||||
target_include_directories(export-graph-ops PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,217 @@
|
|||
#include "ggml.h"
|
||||
#include "ggml-cuda/repack_nvfp4.cuh"
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
|
||||
// Writes one 4-bit value into a packed nibble array: two values share a byte,
// even indices occupy the low nibble, odd indices the high nibble.
static void set_q4(uint8_t * qs, int idx, uint8_t value) {
    uint8_t & slot = qs[idx / 2];
    const uint8_t v = value & 0x0f;

    if ((idx & 1) == 0) {
        slot = (uint8_t) ((slot & 0xf0) | v);
    } else {
        slot = (uint8_t) ((slot & 0x0f) | (v << 4));
    }
}
|
||||
|
||||
// Deterministically fills one NVFP4 block from (seed, row, block) so the
// tests are reproducible without an RNG.
static void fill_block(block_nvfp4 & blk, uint32_t seed, int row, int block) {
    memset(&blk, 0, sizeof(blk));

    // 4-bit weight codes.
    for (int i = 0; i < QK_NVFP4; ++i) {
        const uint32_t mixed = seed + (uint32_t) row * 19U + (uint32_t) block * 11U + (uint32_t) i * 5U + (uint32_t) (i / 3);
        set_q4(blk.qs, i, (uint8_t) (mixed & 0x0f));
    }

    // Per-sub-block scale bytes, kept within [0x30, 0x60).
    for (int i = 0; i < QK_NVFP4 / QK_NVFP4_SUB; ++i) {
        const uint32_t mixed = seed + 0x31U + (uint32_t) row * 7U + (uint32_t) block * 13U + (uint32_t) i * 9U;
        blk.d[i] = (uint8_t) (0x30 + mixed % 0x30);
    }
}
|
||||
|
||||
static void fill_rows(std::vector<uint8_t> & rows, int64_t ne0, int64_t nrows, uint32_t seed) {
|
||||
const size_t row_size = ggml_row_size(GGML_TYPE_NVFP4, ne0);
|
||||
const int blocks_per_row = (int) (ne0 / QK_NVFP4);
|
||||
|
||||
rows.assign(row_size * nrows, 0);
|
||||
|
||||
for (int64_t row = 0; row < nrows; ++row) {
|
||||
block_nvfp4 * dst = (block_nvfp4 *) (rows.data() + row * row_size);
|
||||
for (int block = 0; block < blocks_per_row; ++block) {
|
||||
fill_block(dst[block], seed, (int) row, block);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void fill_layout_row(std::vector<uint8_t> & rows) {
|
||||
rows.assign(ggml_row_size(GGML_TYPE_NVFP4, QK_K), 0);
|
||||
|
||||
block_nvfp4 * dst = (block_nvfp4 *) rows.data();
|
||||
for (int lane = 0; lane < 4; ++lane) {
|
||||
for (int i = 0; i < 32; ++i) {
|
||||
dst[lane].qs[i] = (uint8_t) ((lane * 0x31 + i * 0x17 + 0x12) & 0xff);
|
||||
}
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
dst[lane].d[i] = (uint8_t) (0x31 + lane * 0x0d + i * 0x09);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Byte-wise buffer comparison: logs the first deviation (size or content)
// under `name` and returns whether the two buffers are identical.
static bool expect_equal(const char * name, const std::vector<uint8_t> & expected, const std::vector<uint8_t> & actual) {
    if (expected.size() != actual.size()) {
        std::printf("%s: size mismatch (%zu != %zu)\n", name, expected.size(), actual.size());
        return false;
    }

    size_t i = 0;
    while (i < expected.size() && expected[i] == actual[i]) {
        ++i;
    }

    if (i < expected.size()) {
        std::printf("%s: first mismatch at byte %zu (expected 0x%02x, got 0x%02x)\n",
                name, i, expected[i], actual[i]);
        return false;
    }

    return true;
}
|
||||
|
||||
static bool check_roundtrip(const char * name, int64_t ne0, int64_t nrows, uint32_t seed) {
|
||||
const size_t row_size = ggml_row_size(GGML_TYPE_NVFP4, ne0);
|
||||
|
||||
std::vector<uint8_t> input;
|
||||
std::vector<uint8_t> packed(row_size * nrows);
|
||||
std::vector<uint8_t> output(row_size * nrows);
|
||||
|
||||
fill_rows(input, ne0, nrows, seed);
|
||||
ggml_cuda_repack_rows_nvfp4(ne0, nrows, input.data(), packed.data());
|
||||
ggml_cuda_unpack_rows_nvfp4(ne0, nrows, packed.data(), output.data());
|
||||
|
||||
return expect_equal(name, input, output);
|
||||
}
|
||||
|
||||
// Simulates a partial byte-range write into an NVFP4 tensor: replays the
// unpack-patch-repack protocol used by the CUDA backend's set_tensor on the
// rows covering [offset, offset + size), then verifies the full unpacked
// buffer matches a plain memcpy patch of the original data.
static bool check_partial_patch(const char * name, int64_t ne0, int64_t nrows, size_t offset, size_t size, uint32_t seed) {
    const size_t row_size   = ggml_row_size(GGML_TYPE_NVFP4, ne0);
    const size_t total_size = row_size * nrows;

    std::vector<uint8_t> input;
    std::vector<uint8_t> expected;
    std::vector<uint8_t> packed(total_size);
    std::vector<uint8_t> output(total_size);
    std::vector<uint8_t> patch(size);

    fill_rows(input, ne0, nrows, seed);
    expected = input;

    // Deterministic patch bytes, distinct from the fill pattern.
    for (size_t i = 0; i < patch.size(); ++i) {
        patch[i] = (uint8_t) (seed + 0x5bU + (uint32_t) i * 23U + (uint32_t) (i / 5));
    }

    // Reference result: patch applied directly to the upstream-layout bytes.
    memcpy(expected.data() + offset, patch.data(), size);

    ggml_cuda_repack_rows_nvfp4(ne0, nrows, input.data(), packed.data());

    // Row-align the patched range, mirroring ggml_cuda_get_nvfp4_row_span.
    const size_t aligned_offset = offset / row_size * row_size;
    const size_t aligned_end    = (offset + size + row_size - 1) / row_size * row_size;
    const size_t aligned_size   = aligned_end - aligned_offset;
    const int64_t aligned_rows  = (int64_t) (aligned_size / row_size);
    const size_t inner_offset   = offset - aligned_offset;

    // Unpack only the touched rows, splice in the patch, repack in place.
    std::vector<uint8_t> rows(aligned_size);
    ggml_cuda_unpack_rows_nvfp4(ne0, aligned_rows, packed.data() + aligned_offset, rows.data());
    memcpy(rows.data() + inner_offset, patch.data(), size);
    ggml_cuda_repack_rows_nvfp4(ne0, aligned_rows, rows.data(), packed.data() + aligned_offset);
    ggml_cuda_unpack_rows_nvfp4(ne0, nrows, packed.data(), output.data());

    return expect_equal(name, expected, output);
}
|
||||
|
||||
// Verifies the 4x64 -> 256 packing layout on a single QK_K-wide row: for each
// lane, the 8 packed qs words must sit at byte offset 32 * lane and the 4
// scale bytes at 128 + 4 * lane. On success, dumps the input blocks and the
// packed layout for manual inspection.
static bool check_4x64_to_256_layout(const char * name, uint32_t seed) {
    GGML_UNUSED(seed);

    std::vector<uint8_t> input;
    std::vector<uint8_t> packed(ggml_row_size(GGML_TYPE_NVFP4, QK_K));

    fill_layout_row(input);
    ggml_cuda_repack_rows_nvfp4(QK_K, 1, input.data(), packed.data());

    const block_nvfp4 * src = (const block_nvfp4 *) input.data();
    for (int lane = 0; lane < 4; ++lane) {
        // Each lane's 8 packed u32 words start at byte 32 * lane.
        for (int pack = 0; pack < 8; ++pack) {
            uint32_t got = 0;
            memcpy(&got, packed.data() + lane * 32 + pack * sizeof(got), sizeof(got));
            const uint32_t expected = ggml_cuda_nvfp4_pack(src[lane].qs, pack);
            if (got != expected) {
                std::printf("%s: lane %d pack %d mismatch (expected 0x%08x, got 0x%08x)\n", name, lane, pack, expected, got);
                return false;
            }
        }

        // Scale bytes follow all qs words, at byte 128 + 4 * lane.
        if (memcmp(packed.data() + 128 + lane * 4, src[lane].d, 4) != 0) {
            std::printf("%s: lane %d scales mismatch\n", name, lane);
            return false;
        }
    }

    // Informational dump of the verified layout.
    std::printf("%s:\n", name);
    std::printf(" 4 x block_nvfp4 (64 weights each) -> 1 x block_nvfp4_cuda (256 weights total)\n");
    for (int lane = 0; lane < 4; ++lane) {
        std::printf(" block_nvfp4[%d] qs:", lane);
        for (int i = 0; i < 32; ++i) {
            std::printf(" %02x", src[lane].qs[i]);
        }
        std::printf("\n");
    }
    for (int lane = 0; lane < 4; ++lane) {
        std::printf(" block_nvfp4[%d] scales:", lane);
        for (int i = 0; i < 4; ++i) {
            std::printf(" %02x", src[lane].d[i]);
        }
        std::printf("\n");
    }

    std::printf(" block_nvfp4_cuda:\n");
    for (int lane = 0; lane < 4; ++lane) {
        std::printf(" lane %d qs @0x%02x:", lane, lane * 32);
        for (int pack = 0; pack < 8; ++pack) {
            uint32_t word = 0;
            memcpy(&word, packed.data() + lane * 32 + pack * sizeof(word), sizeof(word));
            std::printf(" %08x", word);
        }
        std::printf("\n");
    }
    for (int lane = 0; lane < 4; ++lane) {
        std::printf(" lane %d scales @0x%02x:", lane, 128 + lane * 4);
        for (int i = 0; i < 4; ++i) {
            std::printf(" %02x", packed[128 + lane * 4 + i]);
        }
        std::printf("\n");
    }
    return true;
}
|
||||
|
||||
int main() {
|
||||
int total = 0;
|
||||
int passed = 0;
|
||||
|
||||
const struct { const char * name; int64_t ne0, nrows; uint32_t seed; } roundtrip_cases[] = {
|
||||
{ "roundtrip-ne0-64", 64, 3, 0x1001U }, { "roundtrip-ne0-128", 128, 3, 0x1002U },
|
||||
{ "roundtrip-ne0-192", 192, 3, 0x1003U }, { "roundtrip-ne0-256", 256, 2, 0x1004U },
|
||||
{ "roundtrip-ne0-320", 320, 3, 0x1005U },
|
||||
};
|
||||
|
||||
for (const auto & test : roundtrip_cases) {
|
||||
total += 1;
|
||||
passed += check_roundtrip(test.name, test.ne0, test.nrows, test.seed);
|
||||
}
|
||||
|
||||
total += 1;
|
||||
passed += check_partial_patch("partial-cross-row-128", 128, 3, ggml_row_size(GGML_TYPE_NVFP4, 128) - 11, 27, 0x2001U);
|
||||
|
||||
total += 1;
|
||||
passed += check_partial_patch("partial-cross-row-320", 320, 2, ggml_row_size(GGML_TYPE_NVFP4, 320) - 19, 41, 0x2002U);
|
||||
|
||||
total += 1;
|
||||
passed += check_4x64_to_256_layout("layout-4x64-to-256", 0x3001U);
|
||||
|
||||
std::printf("test-nvfp4-repack: %d/%d passed\n", passed, total);
|
||||
return passed == total ? 0 : 1;
|
||||
}
|
||||
Loading…
Reference in New Issue