ggml : fix TurboQuant CPU review issues
This commit is contained in:
parent
f96df927eb
commit
0aae7d78c7
|
|
@ -72,6 +72,9 @@
|
||||||
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
|
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
|
||||||
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
|
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
|
||||||
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
|
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
|
||||||
|
// quants.c
|
||||||
|
#define ggml_vec_dot_tbq3_0_q8_K_generic ggml_vec_dot_tbq3_0_q8_K
|
||||||
|
#define ggml_vec_dot_tbq4_0_q8_K_generic ggml_vec_dot_tbq4_0_q8_K
|
||||||
// repack.cpp
|
// repack.cpp
|
||||||
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
|
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
|
||||||
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
||||||
|
|
@ -207,6 +210,8 @@
|
||||||
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
|
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
|
||||||
#elif defined(__riscv)
|
#elif defined(__riscv)
|
||||||
// quants.c
|
// quants.c
|
||||||
|
#define ggml_vec_dot_tbq3_0_q8_K_generic ggml_vec_dot_tbq3_0_q8_K
|
||||||
|
#define ggml_vec_dot_tbq4_0_q8_K_generic ggml_vec_dot_tbq4_0_q8_K
|
||||||
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
|
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
|
||||||
// repack.cpp
|
// repack.cpp
|
||||||
#define ggml_quantize_mat_q8_0_4x1_generic ggml_quantize_mat_q8_0_4x1
|
#define ggml_quantize_mat_q8_0_4x1_generic ggml_quantize_mat_q8_0_4x1
|
||||||
|
|
|
||||||
|
|
@ -11,6 +11,7 @@
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <cfloat>
|
#include <cfloat>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
#include <type_traits>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
// ggml_compute_forward_dup
|
// ggml_compute_forward_dup
|
||||||
|
|
@ -529,8 +530,6 @@ static void ggml_compute_forward_dup_from_q(
|
||||||
const int ir0 = dr*ith;
|
const int ir0 = dr*ith;
|
||||||
const int ir1 = MIN(ir0 + dr, nr);
|
const int ir1 = MIN(ir0 + dr, nr);
|
||||||
|
|
||||||
std::vector<float> tmp(qk);
|
|
||||||
|
|
||||||
for (int64_t ir = ir0; ir < ir1; ++ir) {
|
for (int64_t ir = ir0; ir < ir1; ++ir) {
|
||||||
|
|
||||||
uint32_t i = ir * qk;
|
uint32_t i = ir * qk;
|
||||||
|
|
@ -547,11 +546,19 @@ static void ggml_compute_forward_dup_from_q(
|
||||||
const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
|
const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
|
||||||
const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
|
const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
|
||||||
|
|
||||||
dequantize_row_q(
|
if constexpr (std::is_same_v<dst_t, float>) {
|
||||||
(const void *) ((char *) src0->data + x_offset),
|
dequantize_row_q(
|
||||||
tmp.data(), qk);
|
(const void *) ((char *) src0->data + x_offset),
|
||||||
|
(float *) ((char *) dst->data + dst_offset), qk);
|
||||||
|
} else {
|
||||||
|
std::vector<float> tmp(qk);
|
||||||
|
|
||||||
ggml_dup_from_float_row(tmp.data(), (dst_t *) ((char *) dst->data + dst_offset), qk);
|
dequantize_row_q(
|
||||||
|
(const void *) ((char *) src0->data + x_offset),
|
||||||
|
tmp.data(), qk);
|
||||||
|
|
||||||
|
ggml_dup_from_float_row(tmp.data(), (dst_t *) ((char *) dst->data + dst_offset), qk);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2944,10 +2944,15 @@ llama_context * llama_init_from_model(
|
||||||
|
|
||||||
if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_k)) {
|
if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_k)) {
|
||||||
const uint32_t blck_size = ggml_blck_size(params.type_k);
|
const uint32_t blck_size = ggml_blck_size(params.type_k);
|
||||||
|
const bool is_tbq_k = params.type_k == GGML_TYPE_TBQ3_0 || params.type_k == GGML_TYPE_TBQ4_0;
|
||||||
|
|
||||||
for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
|
for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
|
||||||
if (model->hparams.n_embd_head_k(il) % blck_size != 0) {
|
const uint32_t n_embd_k = is_tbq_k ? model->hparams.n_embd_k_gqa(il) : model->hparams.n_embd_head_k(il);
|
||||||
LLAMA_LOG_ERROR("%s: K cache type %s with block size %u does not divide n_embd_head_k=%u\n",
|
|
||||||
__func__, ggml_type_name(params.type_k), blck_size, model->hparams.n_embd_head_k(il));
|
if (n_embd_k % blck_size != 0) {
|
||||||
|
LLAMA_LOG_ERROR("%s: K cache type %s with block size %u does not divide %s=%u\n",
|
||||||
|
__func__, ggml_type_name(params.type_k), blck_size,
|
||||||
|
is_tbq_k ? "n_embd_k_gqa" : "n_embd_head_k", n_embd_k);
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -2955,10 +2960,15 @@ llama_context * llama_init_from_model(
|
||||||
|
|
||||||
if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_v)) {
|
if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_v)) {
|
||||||
const uint32_t blck_size = ggml_blck_size(params.type_v);
|
const uint32_t blck_size = ggml_blck_size(params.type_v);
|
||||||
|
const bool is_tbq_v = params.type_v == GGML_TYPE_TBQ3_0 || params.type_v == GGML_TYPE_TBQ4_0;
|
||||||
|
|
||||||
for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
|
for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
|
||||||
if (model->hparams.n_embd_head_v(il) % blck_size != 0) {
|
const uint32_t n_embd_v = is_tbq_v ? model->hparams.n_embd_v_gqa(il) : model->hparams.n_embd_head_v(il);
|
||||||
LLAMA_LOG_ERROR("%s: V cache type %s with block size %u does not divide n_embd_head_v=%u\n",
|
|
||||||
__func__, ggml_type_name(params.type_v), blck_size, model->hparams.n_embd_head_v(il));
|
if (n_embd_v % blck_size != 0) {
|
||||||
|
LLAMA_LOG_ERROR("%s: V cache type %s with block size %u does not divide %s=%u\n",
|
||||||
|
__func__, ggml_type_name(params.type_v), blck_size,
|
||||||
|
is_tbq_v ? "n_embd_v_gqa" : "n_embd_head_v", n_embd_v);
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -7804,15 +7804,29 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (ggml_type type_src : all_types) {
|
for (ggml_type type_src : all_types) {
|
||||||
for (ggml_type type_dst : {GGML_TYPE_F32}) {
|
if (!ggml_is_quantized(type_src)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
test_cases.emplace_back(new test_cpy(type_src, GGML_TYPE_F32, {256, 4, 4, 4}));
|
||||||
|
test_cases.emplace_back(new test_cpy(type_src, GGML_TYPE_F32, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
|
||||||
|
}
|
||||||
|
for (ggml_type type : turboq_types) {
|
||||||
|
test_cases.emplace_back(new test_cpy(type, GGML_TYPE_F32, {256, 4, 4, 4}));
|
||||||
|
test_cases.emplace_back(new test_cpy(type, GGML_TYPE_F32, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
|
||||||
|
}
|
||||||
|
for (ggml_type type_src : all_types) {
|
||||||
|
if (!ggml_is_quantized(type_src)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
for (ggml_type type_dst : {GGML_TYPE_F16, GGML_TYPE_BF16}) {
|
||||||
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
|
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
|
||||||
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
|
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (ggml_type type_src : all_types) {
|
for (ggml_type type : turboq_types) {
|
||||||
for (ggml_type type_dst : {GGML_TYPE_F16, GGML_TYPE_BF16}) {
|
for (ggml_type type_dst : {GGML_TYPE_F16, GGML_TYPE_BF16}) {
|
||||||
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
|
test_cases.emplace_back(new test_cpy(type, type_dst, {256, 4, 4, 4}));
|
||||||
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
|
test_cases.emplace_back(new test_cpy(type, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
|
for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue