[SYCL] Add Q8_0 reorder optimization (~3x tg speedup on Intel Arc) (#21527)

Extend the existing reorder optimization to Q8_0. The reorder
separates scale factors from weight data for coalesced memory
access; it was implemented for Q4_0/Q4_K/Q6_K, but Q8_0 was missing.

On Arc Pro B70 (Xe2), Q8_0 tg goes from 4.88 to 15.24 t/s (3.1x)
on Qwen3.5-27B. BW utilization: 21% -> 66%.

The key fix beyond the kernels: Q8_0 was missing from the type
check in ggml_backend_sycl_buffer_init_tensor() that allocates
the extra struct carrying the reorder flag -- so the optimization
was silently skipped.

AI (Claude) was used to assist with root cause investigation and
writing the kernel code. All code was human-reviewed and tested
on real hardware.

Fixes: #21517
This commit is contained in:
PMZFX 2026-04-07 04:12:49 -04:00 committed by GitHub
parent 0033f53a07
commit 0988accf82
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 247 additions and 3 deletions

View File

@ -143,6 +143,22 @@ static __dpct_inline__ void dequantize_q5_1(const void *vx, const int64_t ib,
#endif // GGML_SYCL_F16
}
// Dequantize two consecutive Q8_0 values from the reordered layout, where the
// per-block scales are stored separately from the quantized weights.
// d_ptr: base of the half-precision scale array (one scale per block)
// ib:    block index selecting the scale
// qs:    base of the flat int8 weight array
// iqs:   index of the first of the two weights to dequantize
// v:     receives the two dequantized values
static __dpct_inline__ void dequantize_q8_0_reorder(const void *d_ptr, const int64_t ib, const void *qs,
                                                    const int iqs, dfloat2 &v) {
    const sycl::half * scales = (const sycl::half *) d_ptr;
    const int8_t *     quants = (const int8_t *) qs;
    const dfloat       d      = (const dfloat) scales[ib];

    v.x() = quants[iqs + 0];
    v.y() = quants[iqs + 1];

#ifdef GGML_SYCL_F16
    v.s0() *= d;
    v.s1() *= d;
#else
    v.x() *= d;
    v.y() *= d;
#endif // GGML_SYCL_F16
}
static __dpct_inline__ void dequantize_q8_0(const void *vx, const int64_t ib,
const int iqs, dfloat2 &v) {
const block_q8_0 * x = (const block_q8_0 *) vx;

View File

@ -972,6 +972,103 @@ static void dequantize_mul_mat_vec_q5_1_sycl(const void *vx, const dfloat *y,
}
}
// Dequantizing mat-vec product dst = vx * y for Q8_0 weights stored in the
// reordered layout produced by reorder_qw_q8_0 ([all qs][all d]).
// vx:    reordered weight matrix (ncols x nrows)
// y:     dense input vector
// dst:   output vector (one float per row)
// Each work-group covers GGML_SYCL_MMV_Y rows; WARP_SIZE threads cooperate on
// each row and the partial sums are reduced across the sub-group.
static void dequantize_mul_mat_vec_q8_0_sycl_reorder(const void *vx, const dfloat *y,
                                                     float *dst, const int ncols,
                                                     const int nrows,
                                                     dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
        // The GGML_SYCL_F16 path accumulates in sycl::half2, so fp16 support
        // is required on the device.
        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});
        stream->parallel_for(
            sycl::nd_range<3>(block_nums * block_dims, block_dims),
            [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                // Q8_0 reorder layout: [all qs (ncols*nrows bytes)][all d values]
                // Cannot reuse dequantize_mul_mat_vec_reorder template because it has
                // Q4_0-specific constants hardcoded (d_ptr offset and qs stride).
                const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
                                item_ct1.get_local_id(1);
                if (row >= nrows) return;

                const int tid = item_ct1.get_local_id(2);
                // Each main-loop iteration covers iter_stride columns of the
                // row, with every thread handling vals_per_iter consecutive values.
                const int iter_stride = 8*2*GGML_SYCL_DMMV_X;
                const int vals_per_iter = iter_stride / WARP_SIZE;
                // Columns past the largest QK8_0*WARP_SIZE multiple are left
                // for the tail loop below.
                const int ncols_left = ncols % (QK8_0*WARP_SIZE);
                const int ncols_align = ncols - ncols_left;
#ifdef GGML_SYCL_F16
                sycl::half2 tmp = {0.0f, 0.0f};  // per-thread partial sum (two lanes)
#else
                float tmp = 0.0f;                // per-thread partial sum
#endif
                const char *d_ptr = (const char*)vx + ncols*nrows; // d after all qs

                int i = 0;
                // Main loop over the aligned portion of the row.
                for (i = 0; i < ncols_align; i += iter_stride) {
                    const int col = i + vals_per_iter*tid;
                    const int ib = (row*ncols + col)/QK8_0;  // block index in the flat qs array
                    const int iqs = col % QK8_0;             // position inside the block
#pragma unroll
                    for (int j = 0; j < vals_per_iter; j += 2) {
                        // Dequantize two values per call (dfloat2 width).
                        dfloat2 v;
                        dequantize_q8_0_reorder((const void *)d_ptr, ib, (const void *)vx,
                                                ib * QK8_0 + iqs + j, v);
#ifdef GGML_SYCL_F16
                        dfloat2 t1{y[col + j + 0], y[col + j + 1]};
                        tmp += v * t1;
#else
                        tmp += v.x() * y[col + j + 0];
                        tmp += v.y() * y[col + j + 1];
#endif
                    }
                }
                // handle remaining columns
                for (; i < ncols; i += iter_stride) {
                    // Only the first ncols_left/QK8_0 threads still have data.
                    if (tid >= ncols_left/QK8_0) continue;
                    const int col = i + vals_per_iter*tid;
                    const int ib = (row*ncols + col)/QK8_0;
                    const int iqs = col % QK8_0;
#pragma unroll
                    for (int j = 0; j < vals_per_iter; j += 2) {
                        dfloat2 v;
                        dequantize_q8_0_reorder((const void *)d_ptr, ib, (const void *)vx,
                                                ib * QK8_0 + iqs + j, v);
#ifdef GGML_SYCL_F16
                        dfloat2 t1{y[col + j + 0], y[col + j + 1]};
                        tmp += v * t1;
#else
                        tmp += v.x() * y[col + j + 0];
                        tmp += v.y() * y[col + j + 1];
#endif
                    }
                }
                // reduce partial sums across the sub-group via XOR shuffle
                const int mask_start = ncols > GGML_SYCL_DMMV_X ? WARP_SIZE >> 1 : WARP_SIZE >> 2;
                for (int mask = mask_start; mask > 0; mask >>= 1) {
                    tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
                }
                // Lane 0 writes the final dot product for this row.
                if (tid == 0) {
#ifdef GGML_SYCL_F16
                    dst[row] = tmp.x() + tmp.y();
#else
                    dst[row] = tmp;
#endif
                }
            });
    }
}
static void dequantize_mul_mat_vec_q8_0_sycl(const void *vx, const dfloat *y,
float *dst, const int ncols,
const int nrows,
@ -1122,7 +1219,12 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
dequantize_mul_mat_vec_q5_1_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
break;
case GGML_TYPE_Q8_0:
dequantize_mul_mat_vec_q8_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
dequantize_mul_mat_vec_q8_0_sycl_reorder(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
} else {
dequantize_mul_mat_vec_q8_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
}
break;
case GGML_TYPE_Q2_K:
dequantize_mul_mat_vec_q2_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);

View File

@ -411,7 +411,7 @@ ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
assert(tensor->view_src->buffer->buft == buffer->buft);
return GGML_STATUS_SUCCESS;
}
if ((tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_K || tensor->type == GGML_TYPE_Q6_K) &&
if ((tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q8_0 || tensor->type == GGML_TYPE_Q4_K || tensor->type == GGML_TYPE_Q6_K) &&
!g_ggml_sycl_disable_optimize) {
ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
tensor->extra = extra;
@ -3254,6 +3254,7 @@ inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
switch (type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q8_0:
return true;
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q6_K:
@ -3266,6 +3267,7 @@ inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
inline bool ggml_sycl_supports_reorder_dmmv(enum ggml_type type) {
switch (type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q8_0:
return true;
default:
return false;
@ -3275,6 +3277,7 @@ inline bool ggml_sycl_supports_reorder_dmmv(enum ggml_type type) {
inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
switch (type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q6_K:
return true;
@ -3364,6 +3367,40 @@ static void reorder_qw_q4_0(uint8_t * data_device, const int ncols, const int nr
sycl_ext_free(stream, tmp_buf);
}
// Rearrange a Q8_0 tensor on the device from the interleaved block layout
// ([qs|d][qs|d]...) to the reordered layout ([all qs][all d]) so the
// quantized weights and the scales are each contiguous for coalesced access.
// data_device: device pointer; the data is reordered in place
// ncols/nrows: tensor dimensions in elements
// size:        byte size of the Q8_0 data to reorder
// offset:      byte offset of this tensor's data within the buffer
static void reorder_qw_q8_0(uint8_t * data_device, const int ncols, const int nrows, size_t size, size_t offset,
                            dpct::queue_ptr stream) {
    // Validate the inputs before allocating scratch memory or issuing the
    // copy (the original code asserted only after both had been started).
    GGML_ASSERT((size % sizeof(block_q8_0) == 0));
    GGML_ASSERT((offset % sizeof(block_q8_0) == 0));

    // Stage the original interleaved blocks in a scratch buffer so the
    // reorder kernel can write back into data_device in place.
    uint8_t * tmp_buf = static_cast<uint8_t *>(sycl_ext_malloc_device(stream, size));

    sycl::event copy_event;
    SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
    if (!g_ggml_sycl_use_async_mem_op) {
        copy_event.wait();
    }

    const int offset_blks = offset / sizeof(block_q8_0);
    auto qs_ptr = data_device + offset_blks * QK8_0;
    // Scales start right after all quantized values; index by block.
    auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows) + offset_blks;

    // One work-item per block: copy its 32 int8 weights and its scale to
    // their split destinations.
    auto reorder_event = stream->parallel_for(
        size / sizeof(block_q8_0),
        [=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
            const block_q8_0* x = (const block_q8_0*)tmp_buf;
            const int ib = i;

            for (int j = 0; j < QK8_0; j++)
            {
                *((int8_t*)qs_ptr + ib * QK8_0 + j) = x[ib].qs[j];
            }

            *(d_ptr + ib) = x[ib].d;
        });
    if (!g_ggml_sycl_use_async_mem_op) {
        reorder_event.wait_and_throw();
    }
    sycl_ext_free(stream, tmp_buf);
}
static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
GGML_ASSERT(size % sizeof(block_q4_K) == 0);
GGML_ASSERT(offset % sizeof(block_q4_K) == 0);
@ -3460,6 +3497,9 @@ static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
case GGML_TYPE_Q4_0:
reorder_qw_q4_0(data_device, ncols, nrows, size, 0, stream);
break;
case GGML_TYPE_Q8_0:
reorder_qw_q8_0(data_device, ncols, nrows, size, 0, stream);
break;
case GGML_TYPE_Q4_K:
reorder_qw_q4_k(data_device, size, 0, stream);
break;

View File

@ -679,6 +679,25 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
}
}
static void reorder_mul_mat_vec_q8_0_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
const int nrows, dpct::queue_ptr stream) {
GGML_ASSERT(ncols % QK8_0 == 0);
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y);
constexpr size_t num_subgroups = 16;
GGML_ASSERT(block_num_y % num_subgroups == 0);
const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, (block_num_y * WARP_SIZE));
const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
stream->submit([&](sycl::handler & cgh) {
cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
[=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q8_0>>(vx, vy, dst, ncols, nrows,
nd_item);
});
});
}
static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
float *dst, const int ncols,
const int nrows,
@ -1101,7 +1120,13 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
break;
case GGML_TYPE_Q8_0:
mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q8_0_q8_1_sycl\n");
reorder_mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
} else {
mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
}
break;
case GGML_TYPE_Q2_K:
mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);

View File

@ -105,6 +105,27 @@ template <> struct block_q_t<GGML_TYPE_Q6_K> {
static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
};
// Descriptor for Q8_0 blocks in the reordered layout, where the quantized
// weights and the per-block scales have been split into two contiguous
// regions (see reorder_qw_q8_0).
template <> struct block_q_t<GGML_TYPE_Q8_0> {
    struct traits {
        static constexpr uint32_t qk = QK8_0;     // 32 weights per block
        static constexpr uint32_t qi = QI8_0;     // 8
        static constexpr uint32_t qr = QR8_0;     // 1: one int8 per weight, no packing
        static constexpr uint32_t vdr_mmvq = 4;   // packed ints consumed per vec-dot call
    };

    // Q8_0 reorder layout: [qs0|qs1|...|qsN][d0|d1|...|dN]
    // Each block contributes 32 int8 weights (32 bytes) to the qs region;
    // all half-precision scales follow after the last block's weights.

    // Byte offset of a block's quantized values within the qs region.
    static constexpr std::pair<int, int> get_block_offset(const int block_index, const int /* nblocks */) {
        return { block_index * QK8_0, 0 };
    }

    // Byte offset of a block's scale: past all qs (ncols*nrows bytes),
    // then block_index half-precision scales.
    static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
        return { (ncols * nrows) + block_index * sizeof(ggml_half), 0 };
    }

    static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; } // 1
};
} // namespace ggml_sycl_reordered
#endif // GGML_SYCL_QUANTS_HPP

View File

@ -351,6 +351,46 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0> {
};
};
// Vec-dot specialization for Q8_0 weights in the reordered layout against a
// Q8_1-quantized activation block.
template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q8_0> {
    static constexpr ggml_type gtype = GGML_TYPE_Q8_0;

    using q8_0_block = ggml_sycl_reordered::block_q_t<GGML_TYPE_Q8_0>;
    using q8_0_traits = typename q8_0_block::traits;

    // Integer inner product of vdr_mmvq packed ints (4 int8 lanes each) from
    // the weight block (v) and the activation block (u), scaled by both
    // block scales. d8_0 is the Q8_0 scale; only the first component of the
    // Q8_1 {d, s} pair ds8 is used, since Q8_0 needs no bias correction.
    __dpct_inline__ float vec_dot_q8_0_q8_1_impl(const int * v, const int * u, const float & d8_0, const sycl::half2 & ds8) {
        int sumi = 0;

#pragma unroll
        for (size_t i = 0; i < q8_0_traits::vdr_mmvq; ++i) {
            // Q8_0 values are signed int8, no nibble extraction needed
            // Direct dp4a: each int packs 4 int8 values
            sumi = dpct::dp4a(v[i], u[i], sumi);
        }

        const sycl::float2 ds8f = ds8.convert<float, sycl::rounding_mode::automatic>();

        // Q8_0 has no bias term (values are signed), so just scale
        return d8_0 * sumi * ds8f.x();
    }

    // ibx_offset / d_offset: byte offsets of this block's qs and d within the
    // reordered weight buffer (see block_q_t<GGML_TYPE_Q8_0>::get_block_offset
    // and get_d_offset); iqs indexes packed ints within the block.
    __dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
                                     const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr,
                                     const sycl::half2 * q8_1_ds, const int & iqs) {
        const int8_t * bq8_0 = static_cast<const int8_t *>(vbq) + ibx_offset.first;
        // The scale lives in the trailing d region of the same buffer.
        const ggml_half d = *(reinterpret_cast<const ggml_half *>(static_cast<const uint8_t *>(vbq) + d_offset.first));
        int v[q8_0_traits::vdr_mmvq];
        int u[q8_0_traits::vdr_mmvq];

#pragma unroll
        for (size_t i = 0; i < q8_0_traits::vdr_mmvq; ++i) {
            v[i] = get_int_from_int8(bq8_0, iqs + i);
            u[i] = get_int_from_int8_aligned(q8_1_quant_ptr, iqs + i);
        }

        return vec_dot_q8_0_q8_1_impl(v, u, d, *q8_1_ds);
    };
};
static inline float vec_dot_q4_K_q8_1_common(const int * __restrict__ q4, const uint16_t * __restrict__ scales,
const ggml_half2 & dm, const block_q8_1 * __restrict__ bq8_1,
const int & iqs) {