ggml : fix TurboQuant CPU review issues

Nikodem Eluszkiewicz 2026-03-29 20:24:13 +02:00
parent f96df927eb
commit 0aae7d78c7
4 changed files with 52 additions and 16 deletions


@@ -72,6 +72,9 @@
 #define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
 #define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
 #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
+// quants.c
+#define ggml_vec_dot_tbq3_0_q8_K_generic ggml_vec_dot_tbq3_0_q8_K
+#define ggml_vec_dot_tbq4_0_q8_K_generic ggml_vec_dot_tbq4_0_q8_K
 // repack.cpp
 #define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
 #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
@@ -207,6 +210,8 @@
 #define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
 #elif defined(__riscv)
 // quants.c
+#define ggml_vec_dot_tbq3_0_q8_K_generic ggml_vec_dot_tbq3_0_q8_K
+#define ggml_vec_dot_tbq4_0_q8_K_generic ggml_vec_dot_tbq4_0_q8_K
 #define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x1_generic ggml_quantize_mat_q8_0_4x1
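Note: these *_generic defines are the usual ggml arch-fallback mechanism. As a minimal sketch (hypothetical names, not the actual kernels), the alias makes the portable C implementation compile under the public symbol name on targets that have no hand-written TBQ kernel, so callers link against the same function name on every architecture:

// Minimal illustration of the aliasing pattern (hypothetical names):
// the #define renames the generic definition to the public symbol at compile time.
#include <cstdio>

#define my_vec_dot_generic my_vec_dot   // same trick as the *_generic defines above

// Textually this defines my_vec_dot_generic, but after preprocessing it
// defines the public symbol my_vec_dot.
static void my_vec_dot_generic(void) {
    std::puts("generic fallback kernel");
}

int main() {
    my_vec_dot();   // resolves to the generic implementation
    return 0;
}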


@@ -11,6 +11,7 @@
 #include <algorithm>
 #include <cfloat>
 #include <cmath>
+#include <type_traits>
 #include <vector>
 // ggml_compute_forward_dup
@@ -529,8 +530,6 @@ static void ggml_compute_forward_dup_from_q(
     const int ir0 = dr*ith;
     const int ir1 = MIN(ir0 + dr, nr);
-    std::vector<float> tmp(qk);
     for (int64_t ir = ir0; ir < ir1; ++ir) {
         uint32_t i = ir * qk;
@@ -547,11 +546,19 @@ static void ggml_compute_forward_dup_from_q(
         const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
         const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
-        dequantize_row_q(
-                (const void *) ((char *) src0->data + x_offset),
-                tmp.data(), qk);
+        if constexpr (std::is_same_v<dst_t, float>) {
+            dequantize_row_q(
+                    (const void *) ((char *) src0->data + x_offset),
+                    (float *) ((char *) dst->data + dst_offset), qk);
+        } else {
+            std::vector<float> tmp(qk);
-        ggml_dup_from_float_row(tmp.data(), (dst_t *) ((char *) dst->data + dst_offset), qk);
+            dequantize_row_q(
+                    (const void *) ((char *) src0->data + x_offset),
+                    tmp.data(), qk);
+            ggml_dup_from_float_row(tmp.data(), (dst_t *) ((char *) dst->data + dst_offset), qk);
+        }
     }
 }
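For context, the hunk above changes ggml_compute_forward_dup_from_q so that a float destination is dequantized directly into dst, and only non-float destinations go through a per-row float scratch buffer. A self-contained sketch of that dispatch (plain arrays instead of ggml tensors; dequant_row and store_row are stand-ins for dequantize_row_q and ggml_dup_from_float_row):

// Simplified model of the new control flow (assumed stand-ins, not the real ggml API).
#include <cstdint>
#include <type_traits>
#include <vector>

static void dequant_row(const float * src, float * dst, int64_t k) {
    for (int64_t i = 0; i < k; ++i) dst[i] = src[i];                     // stand-in for dequantize_row_q
}

template <typename dst_t>
static void store_row(const float * src, dst_t * dst, int64_t k) {
    for (int64_t i = 0; i < k; ++i) dst[i] = static_cast<dst_t>(src[i]); // stand-in for ggml_dup_from_float_row
}

template <typename dst_t>
void copy_row(const float * src, dst_t * dst, int64_t k) {
    if constexpr (std::is_same_v<dst_t, float>) {
        dequant_row(src, dst, k);      // float dst: write directly, no temporary
    } else {
        std::vector<float> tmp(k);     // scratch row only in the non-float branch
        dequant_row(src, tmp.data(), k);
        store_row(tmp.data(), dst, k);
    }
}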


@@ -2944,10 +2944,15 @@ llama_context * llama_init_from_model(
     if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_k)) {
         const uint32_t blck_size = ggml_blck_size(params.type_k);
+        const bool is_tbq_k = params.type_k == GGML_TYPE_TBQ3_0 || params.type_k == GGML_TYPE_TBQ4_0;
         for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
-            if (model->hparams.n_embd_head_k(il) % blck_size != 0) {
-                LLAMA_LOG_ERROR("%s: K cache type %s with block size %u does not divide n_embd_head_k=%u\n",
-                        __func__, ggml_type_name(params.type_k), blck_size, model->hparams.n_embd_head_k(il));
+            const uint32_t n_embd_k = is_tbq_k ? model->hparams.n_embd_k_gqa(il) : model->hparams.n_embd_head_k(il);
+            if (n_embd_k % blck_size != 0) {
+                LLAMA_LOG_ERROR("%s: K cache type %s with block size %u does not divide %s=%u\n",
+                        __func__, ggml_type_name(params.type_k), blck_size,
+                        is_tbq_k ? "n_embd_k_gqa" : "n_embd_head_k", n_embd_k);
                 return nullptr;
             }
         }
@@ -2955,10 +2960,15 @@ llama_context * llama_init_from_model(
     if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_v)) {
         const uint32_t blck_size = ggml_blck_size(params.type_v);
+        const bool is_tbq_v = params.type_v == GGML_TYPE_TBQ3_0 || params.type_v == GGML_TYPE_TBQ4_0;
         for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
-            if (model->hparams.n_embd_head_v(il) % blck_size != 0) {
-                LLAMA_LOG_ERROR("%s: V cache type %s with block size %u does not divide n_embd_head_v=%u\n",
-                        __func__, ggml_type_name(params.type_v), blck_size, model->hparams.n_embd_head_v(il));
+            const uint32_t n_embd_v = is_tbq_v ? model->hparams.n_embd_v_gqa(il) : model->hparams.n_embd_head_v(il);
+            if (n_embd_v % blck_size != 0) {
+                LLAMA_LOG_ERROR("%s: V cache type %s with block size %u does not divide %s=%u\n",
+                        __func__, ggml_type_name(params.type_v), blck_size,
+                        is_tbq_v ? "n_embd_v_gqa" : "n_embd_head_v", n_embd_v);
                 return nullptr;
             }
         }
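Stripped of the hparams plumbing, both checks above reduce to the following predicate (a sketch with scalar inputs and a helper name of our own; the real code reads the values from llama_context_params and model->hparams): TBQ cache types are validated against the full per-layer width (n_embd_k_gqa / n_embd_v_gqa), all other quantized cache types against the per-head width.

// Sketch of the per-layer validation with scalar inputs (hypothetical helper).
#include <cstdint>
#include <cstdio>

static bool cache_type_fits(bool is_tbq, uint32_t blck_size,
                            uint32_t n_embd_head, uint32_t n_embd_gqa) {
    // TBQ types: check the per-layer width; other quantized types: per-head width.
    const uint32_t n_embd = is_tbq ? n_embd_gqa : n_embd_head;
    if (n_embd % blck_size != 0) {
        std::fprintf(stderr, "cache block size %u does not divide %u\n", blck_size, n_embd);
        return false;   // llama_init_from_model returns nullptr in this case
    }
    return true;
}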


@@ -7804,15 +7804,29 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
         }
     }
     for (ggml_type type_src : all_types) {
-        for (ggml_type type_dst : {GGML_TYPE_F32}) {
+        if (!ggml_is_quantized(type_src)) {
+            continue;
+        }
+        test_cases.emplace_back(new test_cpy(type_src, GGML_TYPE_F32, {256, 4, 4, 4}));
+        test_cases.emplace_back(new test_cpy(type_src, GGML_TYPE_F32, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
+    }
+    for (ggml_type type : turboq_types) {
+        test_cases.emplace_back(new test_cpy(type, GGML_TYPE_F32, {256, 4, 4, 4}));
+        test_cases.emplace_back(new test_cpy(type, GGML_TYPE_F32, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
+    }
+    for (ggml_type type_src : all_types) {
+        if (!ggml_is_quantized(type_src)) {
+            continue;
+        }
+        for (ggml_type type_dst : {GGML_TYPE_F16, GGML_TYPE_BF16}) {
             test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
             test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
         }
     }
-    for (ggml_type type_src : all_types) {
+    for (ggml_type type : turboq_types) {
         for (ggml_type type_dst : {GGML_TYPE_F16, GGML_TYPE_BF16}) {
-            test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
-            test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
+            test_cases.emplace_back(new test_cpy(type, type_dst, {256, 4, 4, 4}));
+            test_cases.emplace_back(new test_cpy(type, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
         }
     }
     for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {