WIP Cuda NVFP4 Repacker Helpers
This commit is contained in:
parent
d23355afc3
commit
7cc56df86c
|
|
@ -36,6 +36,7 @@
|
|||
#include "ggml-cuda/pad.cuh"
|
||||
#include "ggml-cuda/pool2d.cuh"
|
||||
#include "ggml-cuda/quantize.cuh"
|
||||
#include "ggml-cuda/repack_nvfp4.cuh"
|
||||
#include "ggml-cuda/rope.cuh"
|
||||
#include "ggml-cuda/roll.cuh"
|
||||
#include "ggml-cuda/scale.cuh"
|
||||
|
|
@ -641,10 +642,86 @@ static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer_t buffer,
|
|||
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
|
||||
}
|
||||
|
||||
// Describes how a byte range [offset, offset + size) of a contiguous NVFP4
// tensor maps onto whole rows, which are the unit of (un)packing.
struct ggml_cuda_nvfp4_row_span {
    size_t row_size;       // bytes per row (ggml_row_size for GGML_TYPE_NVFP4)
    size_t aligned_offset; // range start rounded down to a row boundary
    size_t aligned_size;   // row-aligned byte count covering the whole range
    size_t inner_offset;   // offset - aligned_offset (start position inside the first row)
    int64_t row_begin;     // index of the first row touched
    int64_t nrows;         // number of whole rows covered by the span
};
|
||||
|
||||
// Expands the byte range [offset, offset + size) of a contiguous NVFP4 tensor
// to whole rows and returns the resulting span description.
static ggml_cuda_nvfp4_row_span ggml_cuda_get_nvfp4_row_span(const ggml_tensor * tensor, size_t offset, size_t size) {
    GGML_ASSERT(tensor != nullptr);
    GGML_ASSERT(ggml_is_contiguous(tensor) && "NVFP4 partial repack needs contiguous tensors");
    GGML_ASSERT(offset <= ggml_nbytes(tensor));
    GGML_ASSERT(size <= ggml_nbytes(tensor) - offset);

    const size_t row_size = ggml_row_size(GGML_TYPE_NVFP4, tensor->ne[0]);

    // Empty request: degenerate zero-row span anchored at the row containing `offset`.
    if (size == 0) {
        return { row_size, offset, 0, 0, (int64_t) (offset / row_size), 0 };
    }

    const size_t span_begin = offset / row_size * row_size;                         // round down to row start
    const size_t span_end   = (offset + size + row_size - 1) / row_size * row_size; // round up to row end

    GGML_ASSERT(span_end <= ggml_nbytes(tensor));

    const size_t span_bytes = span_end - span_begin;
    return {
        row_size,
        span_begin,
        span_bytes,
        offset - span_begin,
        (int64_t) (span_begin / row_size),
        (int64_t) (span_bytes / row_size),
    };
}
|
||||
|
||||
// Copies host `data` into the device tensor at [offset, offset + size).
// NVFP4 tensors are stored on device in the repacked CUDA layout (see
// ggml_cuda_repack_rows_nvfp4), so writes are staged through host buffers:
// read back the touched rows, unpack, patch, repack, and upload again.
static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;

    if (size == 0) {
        return;
    }

    ggml_cuda_set_device(ctx->device);
    if (tensor->type == GGML_TYPE_NVFP4) {
        // Widen the byte range to whole rows; (un)packing works row-at-a-time.
        const ggml_cuda_nvfp4_row_span span = ggml_cuda_get_nvfp4_row_span(tensor, offset, size);

        std::vector<char> rows(span.aligned_size); // Pull full rows for partial updates
        if (span.inner_offset != 0 || size != span.aligned_size) {
            // Partial update: fetch and unpack the existing device rows so the
            // bytes outside [offset, offset + size) are preserved.
            std::vector<char> packed(span.aligned_size);
            CUDA_CHECK(cudaMemcpyAsync(
                packed.data(),
                (const char *) tensor->data + span.aligned_offset,
                span.aligned_size,
                cudaMemcpyDeviceToHost,
                cudaStreamPerThread));
            CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
            ggml_cuda_unpack_rows_nvfp4(tensor->ne[0], span.nrows, packed.data(), rows.data());
        }

        // Splice the incoming bytes into the unpacked row image.
        memcpy(rows.data() + span.inner_offset, data, size);

        // Repack the full rows and push them back to the device.
        std::vector<char> packed(span.aligned_size);
        ggml_cuda_repack_rows_nvfp4(tensor->ne[0], span.nrows, rows.data(), packed.data());
        CUDA_CHECK(cudaMemcpyAsync(
            (char *) tensor->data + span.aligned_offset,
            packed.data(),
            span.aligned_size,
            cudaMemcpyHostToDevice,
            cudaStreamPerThread));
        CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
        return;
    }

    // All other types: plain byte copy, synchronized before returning.
    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
    CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
}
|
||||
|
|
@ -652,7 +729,34 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
|
|||
// Copies [offset, offset + size) of the device tensor into host `data`.
// NVFP4 tensors are stored on device in the repacked CUDA layout, so reads
// download the covering rows, unpack them on the host, and then return the
// requested byte range in the upstream (block_nvfp4) layout.
static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;

    if (size == 0) {
        return;
    }

    ggml_cuda_set_device(ctx->device);
    if (tensor->type == GGML_TYPE_NVFP4) {
        // Widen the byte range to whole rows; (un)packing works row-at-a-time.
        const ggml_cuda_nvfp4_row_span span = ggml_cuda_get_nvfp4_row_span(tensor, offset, size);

        std::vector<char> packed(span.aligned_size);
        CUDA_CHECK(cudaMemcpyAsync(
            packed.data(),
            (const char *) tensor->data + span.aligned_offset,
            span.aligned_size,
            cudaMemcpyDeviceToHost,
            cudaStreamPerThread));
        CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));

        // Fast path: request is exactly row-aligned, unpack straight into `data`.
        if (span.inner_offset == 0 && size == span.aligned_size) {
            ggml_cuda_unpack_rows_nvfp4(tensor->ne[0], span.nrows, packed.data(), data);
            return;
        }

        // Partial read: unpack into a staging buffer and copy out the slice.
        std::vector<char> rows(span.aligned_size);
        ggml_cuda_unpack_rows_nvfp4(tensor->ne[0], span.nrows, packed.data(), rows.data());
        memcpy(data, rows.data() + span.inner_offset, size);
        return;
    }

    // All other types: plain byte copy, synchronized before returning.
    CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cudaStreamPerThread));
    CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
}
|
||||
|
|
@ -916,6 +1020,76 @@ static enum ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_
|
|||
}
|
||||
|
||||
static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
if (size == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (tensor->type == GGML_TYPE_NVFP4) {
|
||||
GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
|
||||
|
||||
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
|
||||
|
||||
const int64_t ne0 = tensor->ne[0];
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
|
||||
const ggml_cuda_nvfp4_row_span span = ggml_cuda_get_nvfp4_row_span(tensor, offset, size);
|
||||
|
||||
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
||||
int64_t row_low, row_high;
|
||||
get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
|
||||
|
||||
const int64_t copy_row_low = std::max<int64_t>(row_low, span.row_begin);
|
||||
const int64_t copy_row_high = std::min<int64_t>(row_high, span.row_begin + span.nrows);
|
||||
const int64_t nrows_copy = copy_row_high - copy_row_low;
|
||||
if (nrows_copy == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const size_t row_size = span.row_size;
|
||||
const size_t offset_dst = (copy_row_low - row_low) * row_size;
|
||||
const size_t size_copy = nrows_copy * row_size;
|
||||
const size_t aligned_offset = (size_t) copy_row_low * row_size;
|
||||
|
||||
std::vector<char> rows(size_copy);
|
||||
if (span.inner_offset != 0 || size != span.aligned_size) {
|
||||
std::vector<char> packed(size_copy);
|
||||
ggml_cuda_set_device(id);
|
||||
CUDA_CHECK(cudaMemcpyAsync(
|
||||
packed.data(),
|
||||
(const char *) extra->data_device[id] + offset_dst,
|
||||
size_copy,
|
||||
cudaMemcpyDeviceToHost,
|
||||
cudaStreamPerThread));
|
||||
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
|
||||
ggml_cuda_unpack_rows_nvfp4(ne0, nrows_copy, packed.data(), rows.data());
|
||||
}
|
||||
|
||||
const size_t overlap_begin = std::max(offset, aligned_offset);
|
||||
const size_t overlap_end = std::min(offset + size, aligned_offset + size_copy);
|
||||
if (overlap_begin < overlap_end) {
|
||||
memcpy(
|
||||
rows.data() + (overlap_begin - aligned_offset),
|
||||
(const char *) data + (overlap_begin - offset),
|
||||
overlap_end - overlap_begin);
|
||||
}
|
||||
|
||||
std::vector<char> packed(size_copy);
|
||||
ggml_cuda_repack_rows_nvfp4(ne0, nrows_copy, rows.data(), packed.data());
|
||||
ggml_cuda_set_device(id);
|
||||
CUDA_CHECK(cudaMemcpyAsync(
|
||||
(char *) extra->data_device[id] + offset_dst,
|
||||
packed.data(),
|
||||
size_copy,
|
||||
cudaMemcpyHostToDevice,
|
||||
cudaStreamPerThread));
|
||||
}
|
||||
|
||||
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
||||
ggml_cuda_set_device(id);
|
||||
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// split tensors must always be set in their entirety at once
|
||||
GGML_ASSERT(offset == 0);
|
||||
GGML_ASSERT(size == ggml_nbytes(tensor));
|
||||
|
|
@ -955,6 +1129,60 @@ static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buff
|
|||
}
|
||||
|
||||
static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
if (size == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (tensor->type == GGML_TYPE_NVFP4) {
|
||||
GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
|
||||
|
||||
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
|
||||
|
||||
const int64_t ne0 = tensor->ne[0];
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
|
||||
const ggml_cuda_nvfp4_row_span span = ggml_cuda_get_nvfp4_row_span(tensor, offset, size);
|
||||
char * host_dst = (char *) data;
|
||||
std::vector<char> host_aligned;
|
||||
if (span.inner_offset != 0 || size != span.aligned_size) {
|
||||
host_aligned.resize(span.aligned_size);
|
||||
host_dst = host_aligned.data();
|
||||
}
|
||||
|
||||
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
|
||||
int64_t row_low, row_high;
|
||||
get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
|
||||
|
||||
const int64_t copy_row_low = std::max<int64_t>(row_low, span.row_begin);
|
||||
const int64_t copy_row_high = std::min<int64_t>(row_high, span.row_begin + span.nrows);
|
||||
const int64_t nrows_copy = copy_row_high - copy_row_low;
|
||||
if (nrows_copy == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const size_t row_size = span.row_size;
|
||||
const size_t offset_dst = (copy_row_low - span.row_begin) * row_size;
|
||||
const size_t offset_src = (copy_row_low - row_low) * row_size;
|
||||
const size_t size_copy = nrows_copy * row_size;
|
||||
|
||||
std::vector<char> packed(size_copy);
|
||||
ggml_cuda_set_device(id);
|
||||
CUDA_CHECK(cudaMemcpyAsync(
|
||||
packed.data(),
|
||||
(const char *) extra->data_device[id] + offset_src,
|
||||
size_copy,
|
||||
cudaMemcpyDeviceToHost,
|
||||
cudaStreamPerThread));
|
||||
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
|
||||
|
||||
ggml_cuda_unpack_rows_nvfp4(ne0, nrows_copy, packed.data(), host_dst + offset_dst);
|
||||
}
|
||||
|
||||
if (host_dst != data) {
|
||||
memcpy(data, host_dst + span.inner_offset, size);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// split tensors must always be set in their entirety at once
|
||||
GGML_ASSERT(offset == 0);
|
||||
GGML_ASSERT(size == ggml_nbytes(tensor));
|
||||
|
|
@ -2844,6 +3072,16 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens
|
|||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
||||
|
||||
if (size == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (tensor->type == GGML_TYPE_NVFP4) {
|
||||
CUDA_CHECK(cudaStreamSynchronize(cuda_ctx->stream()));
|
||||
buf->iface.set_tensor(buf, tensor, data, offset, size);
|
||||
return;
|
||||
}
|
||||
|
||||
GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
||||
|
||||
CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cuda_ctx->stream()));
|
||||
|
|
@ -2853,6 +3091,16 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm
|
|||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
||||
|
||||
if (size == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (tensor->type == GGML_TYPE_NVFP4) {
|
||||
CUDA_CHECK(cudaStreamSynchronize(cuda_ctx->stream()));
|
||||
buf->iface.get_tensor(buf, tensor, data, offset, size);
|
||||
return;
|
||||
}
|
||||
|
||||
GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
||||
|
||||
CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cuda_ctx->stream()));
|
||||
|
|
|
|||
|
|
@ -0,0 +1,111 @@
|
|||
#include <cstring>
|
||||
|
||||
#include "ggml.h"
|
||||
#include "repack_nvfp4.cuh"
|
||||
|
||||
// Packs `nlanes` upstream NVFP4 blocks into one contiguous output group:
// first the 8 packed u32 words of 4-bit codes for each lane, then the
// per-lane scale bytes. `nlanes` is QK_K / QK_NVFP4 for full groups and the
// remainder for a short tail group (the tail keeps the compact row size).
static void ggml_cuda_repack_lanes_nvfp4(const block_nvfp4 * in, uint8_t * out, int64_t nlanes) {
    for (int64_t lane = 0; lane < nlanes; ++lane) {
        uint8_t * out_qs     = out + lane * sizeof(in[lane].qs);
        uint8_t * out_scales = out + nlanes * sizeof(in[lane].qs) + lane * sizeof(in[lane].d);

        for (int pack = 0; pack < 8; ++pack) {
            const uint32_t packed = ggml_cuda_nvfp4_pack(in[lane].qs, pack);
            memcpy(out_qs + pack * sizeof(packed), &packed, sizeof(packed));
        }
        memcpy(out_scales, in[lane].d, sizeof(in[lane].d));
    }
}

// Repacks one row of `ne0` NVFP4 weights from the upstream block_nvfp4 layout
// into the CUDA block_nvfp4_cuda layout. The main loop and the tail share the
// same lane-packing helper; the tail group simply has fewer lanes.
static void ggml_cuda_repack_row_nvfp4(const block_nvfp4 * src, uint8_t * dst, int64_t ne0) {
    GGML_ASSERT(ne0 % QK_NVFP4 == 0);

    const int lanes_per_cuda_block  = QK_K / QK_NVFP4;
    const int64_t n_upstream_blocks = ne0 / QK_NVFP4;
    const int64_t n_cuda_blocks     = n_upstream_blocks / lanes_per_cuda_block;
    const int64_t tail_lanes        = n_upstream_blocks % lanes_per_cuda_block;

    for (int64_t ib = 0; ib < n_cuda_blocks; ++ib) {
        ggml_cuda_repack_lanes_nvfp4(
            src + ib * lanes_per_cuda_block,
            dst + ib * sizeof(block_nvfp4_cuda),
            lanes_per_cuda_block);
    }

    if (tail_lanes > 0) {
        // Last short block stays compact too
        ggml_cuda_repack_lanes_nvfp4(
            src + n_cuda_blocks * lanes_per_cuda_block,
            dst + n_cuda_blocks * sizeof(block_nvfp4_cuda),
            tail_lanes);
    }
}
|
||||
|
||||
// Reverses ggml_cuda_nvfp4_pack for one lane: rebuilds 32 bytes of paired
// 4-bit codes from 8 packed u32 words. Each group of 8 output bytes combines
// one "low nibble" word with the following "high nibble" word.
static void ggml_cuda_unpack_weights_nvfp4(const uint8_t * src, uint8_t * dst) {
    for (int group = 0; group < 4; ++group) {
        uint32_t lo;
        uint32_t hi;
        memcpy(&lo, src + (group * 2 + 0) * sizeof(lo), sizeof(lo));
        memcpy(&hi, src + (group * 2 + 1) * sizeof(hi), sizeof(hi));

        uint8_t * out = dst + group * 8;
        for (int i = 0; i < 8; ++i) {
            out[i] = ggml_cuda_nvfp4_unpack(lo, i) | (ggml_cuda_nvfp4_unpack(hi, i) << 4);
        }
    }
}
|
||||
|
||||
// Unpacks `nlanes` lanes of one CUDA-layout group back into upstream
// block_nvfp4 blocks. Mirrors the packing layout: all per-lane qs words come
// first, followed by all per-lane scale bytes.
static void ggml_cuda_unpack_lanes_nvfp4(const uint8_t * in, block_nvfp4 * out, int64_t nlanes) {
    for (int64_t lane = 0; lane < nlanes; ++lane) {
        const uint8_t * in_qs     = in + lane * sizeof(out[lane].qs);
        const uint8_t * in_scales = in + nlanes * sizeof(out[lane].qs) + lane * sizeof(out[lane].d);

        ggml_cuda_unpack_weights_nvfp4(in_qs, out[lane].qs);
        memcpy(out[lane].d, in_scales, sizeof(out[lane].d));
    }
}

// Unpacks one row of `ne0` NVFP4 weights from the CUDA block_nvfp4_cuda
// layout back into the upstream block_nvfp4 layout. The main loop and the
// tail share the same lane-unpacking helper; the tail simply has fewer lanes.
static void ggml_cuda_unpack_row_nvfp4(const uint8_t * src, block_nvfp4 * dst, int64_t ne0) {
    GGML_ASSERT(ne0 % QK_NVFP4 == 0);

    const int lanes_per_cuda_block  = QK_K / QK_NVFP4;
    const int64_t n_upstream_blocks = ne0 / QK_NVFP4;
    const int64_t n_cuda_blocks     = n_upstream_blocks / lanes_per_cuda_block;
    const int64_t tail_lanes        = n_upstream_blocks % lanes_per_cuda_block;

    for (int64_t ib = 0; ib < n_cuda_blocks; ++ib) {
        ggml_cuda_unpack_lanes_nvfp4(
            src + ib * sizeof(block_nvfp4_cuda),
            dst + ib * lanes_per_cuda_block,
            lanes_per_cuda_block);
    }

    if (tail_lanes > 0) {
        // Same compact tail on unpack
        ggml_cuda_unpack_lanes_nvfp4(
            src + n_cuda_blocks * sizeof(block_nvfp4_cuda),
            dst + n_cuda_blocks * lanes_per_cuda_block,
            tail_lanes);
    }
}
|
||||
|
||||
// Repacks `nrows` rows of `ne0` NVFP4 weights from the upstream layout into
// the CUDA layout. Source and destination rows have identical byte sizes.
void ggml_cuda_repack_rows_nvfp4(int64_t ne0, int64_t nrows, const void * src, void * dst) {
    const size_t row_size = ggml_row_size(GGML_TYPE_NVFP4, ne0);

    const uint8_t * src_row = (const uint8_t *) src;
    uint8_t *       dst_row = (uint8_t *) dst;
    for (int64_t row = 0; row < nrows; ++row, src_row += row_size, dst_row += row_size) {
        ggml_cuda_repack_row_nvfp4((const block_nvfp4 *) src_row, dst_row, ne0);
    }
}
|
||||
|
||||
// Unpacks `nrows` rows of `ne0` NVFP4 weights from the CUDA layout back into
// the upstream layout. Inverse of ggml_cuda_repack_rows_nvfp4.
void ggml_cuda_unpack_rows_nvfp4(int64_t ne0, int64_t nrows, const void * src, void * dst) {
    const size_t row_size = ggml_row_size(GGML_TYPE_NVFP4, ne0);

    const uint8_t * src_row = (const uint8_t *) src;
    uint8_t *       dst_row = (uint8_t *) dst;
    for (int64_t row = 0; row < nrows; ++row, src_row += row_size, dst_row += row_size) {
        ggml_cuda_unpack_row_nvfp4(src_row, (block_nvfp4 *) dst_row, ne0);
    }
}
|
||||
|
|
@ -0,0 +1,50 @@
|
|||
#pragma once
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
|
||||
#if !defined(GGML_COMMON_DECL)
|
||||
#if defined(__CUDACC__) || defined(__HIPCC__) || defined(__HIP_DEVICE_COMPILE__)
|
||||
#define GGML_COMMON_DECL_CUDA
|
||||
#else
|
||||
#define GGML_COMMON_DECL_CPP
|
||||
#endif
|
||||
#endif
|
||||
#include "ggml-common.h"
|
||||
|
||||
#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
|
||||
#define GGML_ALIGN_16 __align__(16)
|
||||
#else
|
||||
#define GGML_ALIGN_16 alignas(16)
|
||||
#endif
|
||||
|
||||
// Number of upstream QK_NVFP4-weight blocks fused into one CUDA-layout block
// (QK_K weights total).
static constexpr int GGML_NVFP4_CUDA_LANES = QK_K / QK_NVFP4; // 4 packs in one 256 block
static_assert(GGML_NVFP4_CUDA_LANES == 4, "unexpected NVFP4 CUDA lane count");

// Full groups use 4 x 64 lane blocks
// Tails keep the compact row size
// Layout: the packed 4-bit code words of all lanes are stored first, followed
// by all per-lane scale bytes (see ggml_cuda_repack_rows_nvfp4).
struct GGML_ALIGN_16 block_nvfp4_cuda {
    uint8_t qs[GGML_NVFP4_CUDA_LANES][QK_NVFP4 / 2];                // packed 4-bit codes, two per byte
    uint8_t scales[GGML_NVFP4_CUDA_LANES][QK_NVFP4 / QK_NVFP4_SUB]; // one scale byte per sub-block
};

static_assert(sizeof(block_nvfp4_cuda) == 144, "unexpected nvfp4 cuda block size");
static_assert(alignof(block_nvfp4_cuda) == 16, "nvfp4 cuda block must be 16B aligned");
|
||||
|
||||
// Gathers one nibble (low nibble for even `pack`, high nibble for odd) from 8
// consecutive source bytes and concatenates them little-nibble-endian into a
// single u32: the nibble from src[base + 0] lands in bits [3:0].
static inline uint32_t ggml_cuda_nvfp4_pack(const uint8_t src[32], int pack) {
    const int base  = (pack >> 1) * 8;
    const int shift = (pack & 1) ? 4 : 0;

    uint32_t out = 0;
    for (int i = 7; i >= 0; --i) {
        out = (out << 4) | (uint32_t) ((src[base + i] >> shift) & 0x0F);
    }
    return out;
}
|
||||
|
||||
// Extracts the `value`-th 4-bit field (little-nibble-endian) from a packed
// word; inverse of the per-nibble placement done by ggml_cuda_nvfp4_pack.
static inline uint8_t ggml_cuda_nvfp4_unpack(uint32_t packed, int value) {
    const uint32_t shifted = packed >> (value * 4);
    return (uint8_t) (shifted & 0x0F);
}
|
||||
|
||||
void ggml_cuda_repack_rows_nvfp4(int64_t ne0, int64_t nrows, const void * src, void * dst);
|
||||
void ggml_cuda_unpack_rows_nvfp4(int64_t ne0, int64_t nrows, const void * src, void * dst);
|
||||
|
|
@ -285,5 +285,10 @@ target_link_libraries(${TEST_TARGET} PRIVATE llama)
|
|||
llama_build_and_test(test-alloc.cpp)
|
||||
target_include_directories(test-alloc PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
|
||||
|
||||
# NVFP4 repack helper test; built only for CUDA builds without GGML_BACKEND_DL
# (the test includes ggml/src headers directly — TODO confirm it also needs the
# statically linked CUDA backend objects).
if (GGML_CUDA AND NOT GGML_BACKEND_DL)
    llama_build_and_test(test-nvfp4-repack.cpp)
    target_include_directories(test-nvfp4-repack PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
endif()
|
||||
|
||||
llama_build(export-graph-ops.cpp)
|
||||
target_include_directories(export-graph-ops PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,217 @@
|
|||
#include "ggml.h"
|
||||
#include "ggml-cuda/repack_nvfp4.cuh"
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
|
||||
// Writes one 4-bit value into a packed nibble array: two values share a byte,
// even indices occupy the low nibble, odd indices the high nibble.
static void set_q4(uint8_t * qs, int idx, uint8_t value) {
    uint8_t & slot = qs[idx / 2];
    const uint8_t v = value & 0x0f;

    if ((idx & 1) == 0) {
        slot = (uint8_t) ((slot & 0xf0) | v);
    } else {
        slot = (uint8_t) ((slot & 0x0f) | (v << 4));
    }
}
|
||||
|
||||
// Deterministically fills one NVFP4 block from (seed, row, block) so the
// tests are reproducible without an RNG.
static void fill_block(block_nvfp4 & blk, uint32_t seed, int row, int block) {
    memset(&blk, 0, sizeof(blk));

    // 4-bit weight codes.
    for (int i = 0; i < QK_NVFP4; ++i) {
        const uint32_t mixed = seed + (uint32_t) row * 19U + (uint32_t) block * 11U + (uint32_t) i * 5U + (uint32_t) (i / 3);
        set_q4(blk.qs, i, (uint8_t) (mixed & 0x0f));
    }

    // Per-sub-block scale bytes, kept within [0x30, 0x60).
    for (int i = 0; i < QK_NVFP4 / QK_NVFP4_SUB; ++i) {
        const uint32_t mixed = seed + 0x31U + (uint32_t) row * 7U + (uint32_t) block * 13U + (uint32_t) i * 9U;
        blk.d[i] = (uint8_t) (0x30 + mixed % 0x30);
    }
}
|
||||
|
||||
static void fill_rows(std::vector<uint8_t> & rows, int64_t ne0, int64_t nrows, uint32_t seed) {
|
||||
const size_t row_size = ggml_row_size(GGML_TYPE_NVFP4, ne0);
|
||||
const int blocks_per_row = (int) (ne0 / QK_NVFP4);
|
||||
|
||||
rows.assign(row_size * nrows, 0);
|
||||
|
||||
for (int64_t row = 0; row < nrows; ++row) {
|
||||
block_nvfp4 * dst = (block_nvfp4 *) (rows.data() + row * row_size);
|
||||
for (int block = 0; block < blocks_per_row; ++block) {
|
||||
fill_block(dst[block], seed, (int) row, block);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void fill_layout_row(std::vector<uint8_t> & rows) {
|
||||
rows.assign(ggml_row_size(GGML_TYPE_NVFP4, QK_K), 0);
|
||||
|
||||
block_nvfp4 * dst = (block_nvfp4 *) rows.data();
|
||||
for (int lane = 0; lane < 4; ++lane) {
|
||||
for (int i = 0; i < 32; ++i) {
|
||||
dst[lane].qs[i] = (uint8_t) ((lane * 0x31 + i * 0x17 + 0x12) & 0xff);
|
||||
}
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
dst[lane].d[i] = (uint8_t) (0x31 + lane * 0x0d + i * 0x09);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Byte-wise buffer comparison: logs the first deviation (size or content)
// under `name` and returns whether the two buffers are identical.
static bool expect_equal(const char * name, const std::vector<uint8_t> & expected, const std::vector<uint8_t> & actual) {
    if (expected.size() != actual.size()) {
        std::printf("%s: size mismatch (%zu != %zu)\n", name, expected.size(), actual.size());
        return false;
    }

    size_t i = 0;
    while (i < expected.size() && expected[i] == actual[i]) {
        ++i;
    }

    if (i < expected.size()) {
        std::printf("%s: first mismatch at byte %zu (expected 0x%02x, got 0x%02x)\n",
                name, i, expected[i], actual[i]);
        return false;
    }

    return true;
}
|
||||
|
||||
static bool check_roundtrip(const char * name, int64_t ne0, int64_t nrows, uint32_t seed) {
|
||||
const size_t row_size = ggml_row_size(GGML_TYPE_NVFP4, ne0);
|
||||
|
||||
std::vector<uint8_t> input;
|
||||
std::vector<uint8_t> packed(row_size * nrows);
|
||||
std::vector<uint8_t> output(row_size * nrows);
|
||||
|
||||
fill_rows(input, ne0, nrows, seed);
|
||||
ggml_cuda_repack_rows_nvfp4(ne0, nrows, input.data(), packed.data());
|
||||
ggml_cuda_unpack_rows_nvfp4(ne0, nrows, packed.data(), output.data());
|
||||
|
||||
return expect_equal(name, input, output);
|
||||
}
|
||||
|
||||
// Simulates a partial byte-range write into an NVFP4 tensor: replays the
// unpack-patch-repack protocol used by the CUDA backend's set_tensor on the
// rows covering [offset, offset + size), then verifies the full unpacked
// buffer matches a plain memcpy patch of the original data.
static bool check_partial_patch(const char * name, int64_t ne0, int64_t nrows, size_t offset, size_t size, uint32_t seed) {
    const size_t row_size   = ggml_row_size(GGML_TYPE_NVFP4, ne0);
    const size_t total_size = row_size * nrows;

    std::vector<uint8_t> input;
    std::vector<uint8_t> expected;
    std::vector<uint8_t> packed(total_size);
    std::vector<uint8_t> output(total_size);
    std::vector<uint8_t> patch(size);

    fill_rows(input, ne0, nrows, seed);
    expected = input;

    // Deterministic patch bytes, distinct from the fill pattern.
    for (size_t i = 0; i < patch.size(); ++i) {
        patch[i] = (uint8_t) (seed + 0x5bU + (uint32_t) i * 23U + (uint32_t) (i / 5));
    }

    // Reference result: patch applied directly to the upstream-layout bytes.
    memcpy(expected.data() + offset, patch.data(), size);

    ggml_cuda_repack_rows_nvfp4(ne0, nrows, input.data(), packed.data());

    // Row-align the patched range, mirroring ggml_cuda_get_nvfp4_row_span.
    const size_t aligned_offset = offset / row_size * row_size;
    const size_t aligned_end    = (offset + size + row_size - 1) / row_size * row_size;
    const size_t aligned_size   = aligned_end - aligned_offset;
    const int64_t aligned_rows  = (int64_t) (aligned_size / row_size);
    const size_t inner_offset   = offset - aligned_offset;

    // Unpack only the touched rows, splice in the patch, repack in place.
    std::vector<uint8_t> rows(aligned_size);
    ggml_cuda_unpack_rows_nvfp4(ne0, aligned_rows, packed.data() + aligned_offset, rows.data());
    memcpy(rows.data() + inner_offset, patch.data(), size);
    ggml_cuda_repack_rows_nvfp4(ne0, aligned_rows, rows.data(), packed.data() + aligned_offset);
    ggml_cuda_unpack_rows_nvfp4(ne0, nrows, packed.data(), output.data());

    return expect_equal(name, expected, output);
}
|
||||
|
||||
// Verifies the 4x64 -> 256 packing layout on a single QK_K-wide row: for each
// lane, the 8 packed qs words must sit at byte offset 32 * lane and the 4
// scale bytes at 128 + 4 * lane. On success, dumps the input blocks and the
// packed layout for manual inspection.
static bool check_4x64_to_256_layout(const char * name, uint32_t seed) {
    GGML_UNUSED(seed);

    std::vector<uint8_t> input;
    std::vector<uint8_t> packed(ggml_row_size(GGML_TYPE_NVFP4, QK_K));

    fill_layout_row(input);
    ggml_cuda_repack_rows_nvfp4(QK_K, 1, input.data(), packed.data());

    const block_nvfp4 * src = (const block_nvfp4 *) input.data();
    for (int lane = 0; lane < 4; ++lane) {
        // Each lane's 8 packed u32 words start at byte 32 * lane.
        for (int pack = 0; pack < 8; ++pack) {
            uint32_t got = 0;
            memcpy(&got, packed.data() + lane * 32 + pack * sizeof(got), sizeof(got));
            const uint32_t expected = ggml_cuda_nvfp4_pack(src[lane].qs, pack);
            if (got != expected) {
                std::printf("%s: lane %d pack %d mismatch (expected 0x%08x, got 0x%08x)\n", name, lane, pack, expected, got);
                return false;
            }
        }

        // Scale bytes follow all qs words, at byte 128 + 4 * lane.
        if (memcmp(packed.data() + 128 + lane * 4, src[lane].d, 4) != 0) {
            std::printf("%s: lane %d scales mismatch\n", name, lane);
            return false;
        }
    }

    // Informational dump of the verified layout.
    std::printf("%s:\n", name);
    std::printf(" 4 x block_nvfp4 (64 weights each) -> 1 x block_nvfp4_cuda (256 weights total)\n");
    for (int lane = 0; lane < 4; ++lane) {
        std::printf(" block_nvfp4[%d] qs:", lane);
        for (int i = 0; i < 32; ++i) {
            std::printf(" %02x", src[lane].qs[i]);
        }
        std::printf("\n");
    }
    for (int lane = 0; lane < 4; ++lane) {
        std::printf(" block_nvfp4[%d] scales:", lane);
        for (int i = 0; i < 4; ++i) {
            std::printf(" %02x", src[lane].d[i]);
        }
        std::printf("\n");
    }

    std::printf(" block_nvfp4_cuda:\n");
    for (int lane = 0; lane < 4; ++lane) {
        std::printf(" lane %d qs @0x%02x:", lane, lane * 32);
        for (int pack = 0; pack < 8; ++pack) {
            uint32_t word = 0;
            memcpy(&word, packed.data() + lane * 32 + pack * sizeof(word), sizeof(word));
            std::printf(" %08x", word);
        }
        std::printf("\n");
    }
    for (int lane = 0; lane < 4; ++lane) {
        std::printf(" lane %d scales @0x%02x:", lane, 128 + lane * 4);
        for (int i = 0; i < 4; ++i) {
            std::printf(" %02x", packed[128 + lane * 4 + i]);
        }
        std::printf("\n");
    }
    return true;
}
|
||||
|
||||
int main() {
|
||||
int total = 0;
|
||||
int passed = 0;
|
||||
|
||||
const struct { const char * name; int64_t ne0, nrows; uint32_t seed; } roundtrip_cases[] = {
|
||||
{ "roundtrip-ne0-64", 64, 3, 0x1001U }, { "roundtrip-ne0-128", 128, 3, 0x1002U },
|
||||
{ "roundtrip-ne0-192", 192, 3, 0x1003U }, { "roundtrip-ne0-256", 256, 2, 0x1004U },
|
||||
{ "roundtrip-ne0-320", 320, 3, 0x1005U },
|
||||
};
|
||||
|
||||
for (const auto & test : roundtrip_cases) {
|
||||
total += 1;
|
||||
passed += check_roundtrip(test.name, test.ne0, test.nrows, test.seed);
|
||||
}
|
||||
|
||||
total += 1;
|
||||
passed += check_partial_patch("partial-cross-row-128", 128, 3, ggml_row_size(GGML_TYPE_NVFP4, 128) - 11, 27, 0x2001U);
|
||||
|
||||
total += 1;
|
||||
passed += check_partial_patch("partial-cross-row-320", 320, 2, ggml_row_size(GGML_TYPE_NVFP4, 320) - 19, 41, 0x2002U);
|
||||
|
||||
total += 1;
|
||||
passed += check_4x64_to_256_layout("layout-4x64-to-256", 0x3001U);
|
||||
|
||||
std::printf("test-nvfp4-repack: %d/%d passed\n", passed, total);
|
||||
return passed == total ? 0 : 1;
|
||||
}
|
||||
Loading…
Reference in New Issue