[SYCL] Add Q8_0 reorder optimization (~3x tg speedup on Intel Arc) (#21527)
Extend the existing reorder optimization to Q8_0. The reorder separates scale factors from weight data for coalesced memory access — it was implemented for Q4_0/Q4_K/Q6_K, but Q8_0 was missing. On Arc Pro B70 (Xe2), Q8_0 tg goes from 4.88 to 15.24 t/s (3.1x) on Qwen3.5-27B. Bandwidth utilization: 21% -> 66%. The key fix beyond the kernels: Q8_0 was missing from the type check in ggml_backend_sycl_buffer_init_tensor() that allocates the extra struct carrying the reorder flag, so the optimization was silently skipped. AI (Claude) was used to assist with root-cause investigation and with writing the kernel code. All code was human-reviewed and tested on real hardware. Fixes: #21517
This commit is contained in:
parent
0033f53a07
commit
0988accf82
|
|
@ -143,6 +143,22 @@ static __dpct_inline__ void dequantize_q5_1(const void *vx, const int64_t ib,
|
|||
#endif // GGML_SYCL_F16
|
||||
}
|
||||
|
||||
// Dequantize two consecutive Q8_0 values from the reordered layout, where all
// half-precision scale factors are stored contiguously after the quantized
// weights instead of interleaved per block.
//   d_ptr : base of the scale array (one sycl::half per block)
//   ib    : block index — selects the scale
//   qs    : base of the contiguous int8 weight array
//   iqs   : index within qs of the first of the two weights
//   v     : out — the two dequantized (scaled) values
// Idiom fix vs. original: C-style casts replaced with named casts.
static __dpct_inline__ void dequantize_q8_0_reorder(const void *d_ptr, const int64_t ib, const void *qs,
                                                    const int iqs, dfloat2 &v) {
    const dfloat d = static_cast<dfloat>(*(static_cast<const sycl::half *>(d_ptr) + ib));

    v.x() = static_cast<const int8_t *>(qs)[iqs + 0];
    v.y() = static_cast<const int8_t *>(qs)[iqs + 1];

#ifdef GGML_SYCL_F16
    v.s0() *= d;
    v.s1() *= d;
#else
    v.x() *= d;
    v.y() *= d;
#endif // GGML_SYCL_F16
}
|
||||
|
||||
static __dpct_inline__ void dequantize_q8_0(const void *vx, const int64_t ib,
|
||||
const int iqs, dfloat2 &v) {
|
||||
const block_q8_0 * x = (const block_q8_0 *) vx;
|
||||
|
|
|
|||
|
|
@ -972,6 +972,103 @@ static void dequantize_mul_mat_vec_q5_1_sycl(const void *vx, const dfloat *y,
|
|||
}
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q8_0_sycl_reorder(const void *vx, const dfloat *y,
|
||||
float *dst, const int ncols,
|
||||
const int nrows,
|
||||
dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
|
||||
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
// Q8_0 reorder layout: [all qs (ncols*nrows bytes)][all d values]
|
||||
// Cannot reuse dequantize_mul_mat_vec_reorder template because it has
|
||||
// Q4_0-specific constants hardcoded (d_ptr offset and qs stride).
|
||||
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
||||
item_ct1.get_local_id(1);
|
||||
if (row >= nrows) return;
|
||||
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
const int iter_stride = 8*2*GGML_SYCL_DMMV_X;
|
||||
const int vals_per_iter = iter_stride / WARP_SIZE;
|
||||
const int ncols_left = ncols % (QK8_0*WARP_SIZE);
|
||||
const int ncols_align = ncols - ncols_left;
|
||||
|
||||
#ifdef GGML_SYCL_F16
|
||||
sycl::half2 tmp = {0.0f, 0.0f};
|
||||
#else
|
||||
float tmp = 0.0f;
|
||||
#endif
|
||||
const char *d_ptr = (const char*)vx + ncols*nrows; // d after all qs
|
||||
|
||||
int i = 0;
|
||||
for (i = 0; i < ncols_align; i += iter_stride) {
|
||||
const int col = i + vals_per_iter*tid;
|
||||
const int ib = (row*ncols + col)/QK8_0;
|
||||
const int iqs = col % QK8_0;
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < vals_per_iter; j += 2) {
|
||||
dfloat2 v;
|
||||
dequantize_q8_0_reorder((const void *)d_ptr, ib, (const void *)vx,
|
||||
ib * QK8_0 + iqs + j, v);
|
||||
|
||||
#ifdef GGML_SYCL_F16
|
||||
dfloat2 t1{y[col + j + 0], y[col + j + 1]};
|
||||
tmp += v * t1;
|
||||
#else
|
||||
tmp += v.x() * y[col + j + 0];
|
||||
tmp += v.y() * y[col + j + 1];
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
// handle remaining columns
|
||||
for (; i < ncols; i += iter_stride) {
|
||||
if (tid >= ncols_left/QK8_0) continue;
|
||||
const int col = i + vals_per_iter*tid;
|
||||
const int ib = (row*ncols + col)/QK8_0;
|
||||
const int iqs = col % QK8_0;
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < vals_per_iter; j += 2) {
|
||||
dfloat2 v;
|
||||
dequantize_q8_0_reorder((const void *)d_ptr, ib, (const void *)vx,
|
||||
ib * QK8_0 + iqs + j, v);
|
||||
|
||||
#ifdef GGML_SYCL_F16
|
||||
dfloat2 t1{y[col + j + 0], y[col + j + 1]};
|
||||
tmp += v * t1;
|
||||
#else
|
||||
tmp += v.x() * y[col + j + 0];
|
||||
tmp += v.y() * y[col + j + 1];
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
// reduce
|
||||
const int mask_start = ncols > GGML_SYCL_DMMV_X ? WARP_SIZE >> 1 : WARP_SIZE >> 2;
|
||||
for (int mask = mask_start; mask > 0; mask >>= 1) {
|
||||
tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||
}
|
||||
|
||||
if (tid == 0) {
|
||||
#ifdef GGML_SYCL_F16
|
||||
dst[row] = tmp.x() + tmp.y();
|
||||
#else
|
||||
dst[row] = tmp;
|
||||
#endif
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q8_0_sycl(const void *vx, const dfloat *y,
|
||||
float *dst, const int ncols,
|
||||
const int nrows,
|
||||
|
|
@ -1122,7 +1219,12 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
|
|||
dequantize_mul_mat_vec_q5_1_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
dequantize_mul_mat_vec_q8_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
||||
if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
|
||||
((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
|
||||
dequantize_mul_mat_vec_q8_0_sycl_reorder(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
||||
} else {
|
||||
dequantize_mul_mat_vec_q8_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
||||
}
|
||||
break;
|
||||
case GGML_TYPE_Q2_K:
|
||||
dequantize_mul_mat_vec_q2_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
|
||||
|
|
|
|||
|
|
@ -411,7 +411,7 @@ ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
|||
assert(tensor->view_src->buffer->buft == buffer->buft);
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
if ((tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_K || tensor->type == GGML_TYPE_Q6_K) &&
|
||||
if ((tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q8_0 || tensor->type == GGML_TYPE_Q4_K || tensor->type == GGML_TYPE_Q6_K) &&
|
||||
!g_ggml_sycl_disable_optimize) {
|
||||
ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
|
||||
tensor->extra = extra;
|
||||
|
|
@ -3254,6 +3254,7 @@ inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
|
|||
inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q8_0:
|
||||
return true;
|
||||
case GGML_TYPE_Q4_K:
|
||||
case GGML_TYPE_Q6_K:
|
||||
|
|
@ -3266,6 +3267,7 @@ inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
|
|||
inline bool ggml_sycl_supports_reorder_dmmv(enum ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q8_0:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
|
|
@ -3275,6 +3277,7 @@ inline bool ggml_sycl_supports_reorder_dmmv(enum ggml_type type) {
|
|||
inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q4_K:
|
||||
case GGML_TYPE_Q6_K:
|
||||
return true;
|
||||
|
|
@ -3364,6 +3367,40 @@ static void reorder_qw_q4_0(uint8_t * data_device, const int ncols, const int nr
|
|||
sycl_ext_free(stream, tmp_buf);
|
||||
}
|
||||
|
||||
static void reorder_qw_q8_0(uint8_t * data_device, const int ncols, const int nrows, size_t size, size_t offset,
|
||||
dpct::queue_ptr stream) {
|
||||
uint8_t * tmp_buf = static_cast<uint8_t *>(sycl_ext_malloc_device(stream, size));
|
||||
|
||||
sycl::event copy_event;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
|
||||
if (!g_ggml_sycl_use_async_mem_op) {
|
||||
copy_event.wait();
|
||||
}
|
||||
|
||||
GGML_ASSERT((size % sizeof(block_q8_0) == 0));
|
||||
GGML_ASSERT((offset % sizeof(block_q8_0) == 0));
|
||||
int offset_blks = offset / sizeof(block_q8_0);
|
||||
auto qs_ptr = data_device + offset_blks * QK8_0;
|
||||
auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows) + offset_blks;
|
||||
|
||||
auto reorder_event = stream->parallel_for(
|
||||
size / sizeof(block_q8_0),
|
||||
[=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
const block_q8_0* x = (const block_q8_0*)tmp_buf;
|
||||
const int ib = i;
|
||||
|
||||
for (int j = 0; j < QK8_0; j++)
|
||||
{
|
||||
*((int8_t*)qs_ptr + ib * QK8_0 + j) = x[ib].qs[j];
|
||||
}
|
||||
*(d_ptr + ib) = x[ib].d;
|
||||
});
|
||||
if (!g_ggml_sycl_use_async_mem_op) {
|
||||
reorder_event.wait_and_throw();
|
||||
}
|
||||
sycl_ext_free(stream, tmp_buf);
|
||||
}
|
||||
|
||||
static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(size % sizeof(block_q4_K) == 0);
|
||||
GGML_ASSERT(offset % sizeof(block_q4_K) == 0);
|
||||
|
|
@ -3460,6 +3497,9 @@ static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
|
|||
case GGML_TYPE_Q4_0:
|
||||
reorder_qw_q4_0(data_device, ncols, nrows, size, 0, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
reorder_qw_q8_0(data_device, ncols, nrows, size, 0, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q4_K:
|
||||
reorder_qw_q4_k(data_device, size, 0, stream);
|
||||
break;
|
||||
|
|
|
|||
|
|
@ -679,6 +679,25 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
|
|||
}
|
||||
}
|
||||
|
||||
static void reorder_mul_mat_vec_q8_0_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
|
||||
const int nrows, dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK8_0 == 0);
|
||||
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y);
|
||||
constexpr size_t num_subgroups = 16;
|
||||
GGML_ASSERT(block_num_y % num_subgroups == 0);
|
||||
|
||||
const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, (block_num_y * WARP_SIZE));
|
||||
const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
|
||||
|
||||
stream->submit([&](sycl::handler & cgh) {
|
||||
cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
|
||||
[=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q8_0>>(vx, vy, dst, ncols, nrows,
|
||||
nd_item);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
float *dst, const int ncols,
|
||||
const int nrows,
|
||||
|
|
@ -1101,7 +1120,13 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
|
|||
mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
|
||||
((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
|
||||
GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q8_0_q8_1_sycl\n");
|
||||
reorder_mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
} else {
|
||||
mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
}
|
||||
break;
|
||||
case GGML_TYPE_Q2_K:
|
||||
mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
|
|
|
|||
|
|
@ -105,6 +105,27 @@ template <> struct block_q_t<GGML_TYPE_Q6_K> {
|
|||
static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
|
||||
};
|
||||
|
||||
template <> struct block_q_t<GGML_TYPE_Q8_0> {
|
||||
struct traits {
|
||||
static constexpr uint32_t qk = QK8_0; // 32
|
||||
static constexpr uint32_t qi = QI8_0; // 8
|
||||
static constexpr uint32_t qr = QR8_0; // 1
|
||||
static constexpr uint32_t vdr_mmvq = 4;
|
||||
};
|
||||
|
||||
// Q8_0 reorder layout: [qs0|qs1|...|qsN][d0|d1|...|dN]
|
||||
// Each block has 32 int8 weights (32 bytes) followed by all scales
|
||||
static constexpr std::pair<int, int> get_block_offset(const int block_index, const int /* nblocks */) {
|
||||
return { block_index * QK8_0, 0 };
|
||||
}
|
||||
|
||||
static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
|
||||
return { (ncols * nrows) + block_index * sizeof(ggml_half), 0 };
|
||||
}
|
||||
|
||||
static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; } // 1
|
||||
};
|
||||
|
||||
} // namespace ggml_sycl_reordered
|
||||
|
||||
#endif // GGML_SYCL_QUANTS_HPP
|
||||
|
|
|
|||
|
|
@ -351,6 +351,46 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0> {
|
|||
};
|
||||
};
|
||||
|
||||
template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q8_0> {
|
||||
static constexpr ggml_type gtype = GGML_TYPE_Q8_0;
|
||||
|
||||
using q8_0_block = ggml_sycl_reordered::block_q_t<GGML_TYPE_Q8_0>;
|
||||
using q8_0_traits = typename q8_0_block::traits;
|
||||
|
||||
__dpct_inline__ float vec_dot_q8_0_q8_1_impl(const int * v, const int * u, const float & d8_0, const sycl::half2 & ds8) {
|
||||
int sumi = 0;
|
||||
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < q8_0_traits::vdr_mmvq; ++i) {
|
||||
// Q8_0 values are signed int8, no nibble extraction needed
|
||||
// Direct dp4a: each int packs 4 int8 values
|
||||
sumi = dpct::dp4a(v[i], u[i], sumi);
|
||||
}
|
||||
|
||||
const sycl::float2 ds8f = ds8.convert<float, sycl::rounding_mode::automatic>();
|
||||
|
||||
// Q8_0 has no bias term (values are signed), so just scale
|
||||
return d8_0 * sumi * ds8f.x();
|
||||
}
|
||||
|
||||
__dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
|
||||
const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr,
|
||||
const sycl::half2 * q8_1_ds, const int & iqs) {
|
||||
const int8_t * bq8_0 = static_cast<const int8_t *>(vbq) + ibx_offset.first;
|
||||
const ggml_half d = *(reinterpret_cast<const ggml_half *>(static_cast<const uint8_t *>(vbq) + d_offset.first));
|
||||
int v[q8_0_traits::vdr_mmvq];
|
||||
int u[q8_0_traits::vdr_mmvq];
|
||||
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < q8_0_traits::vdr_mmvq; ++i) {
|
||||
v[i] = get_int_from_int8(bq8_0, iqs + i);
|
||||
u[i] = get_int_from_int8_aligned(q8_1_quant_ptr, iqs + i);
|
||||
}
|
||||
|
||||
return vec_dot_q8_0_q8_1_impl(v, u, d, *q8_1_ds);
|
||||
};
|
||||
};
|
||||
|
||||
static inline float vec_dot_q4_K_q8_1_common(const int * __restrict__ q4, const uint16_t * __restrict__ scales,
|
||||
const ggml_half2 & dm, const block_q8_1 * __restrict__ bq8_1,
|
||||
const int & iqs) {
|
||||
|
|
|
|||
Loading…
Reference in New Issue