diff --git a/ggml/src/ggml-cpu/arch-fallback.h b/ggml/src/ggml-cpu/arch-fallback.h
index 724ac84f1d..74b886b054 100644
--- a/ggml/src/ggml-cpu/arch-fallback.h
+++ b/ggml/src/ggml-cpu/arch-fallback.h
@@ -72,6 +72,9 @@
 #define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
 #define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
 #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
+// quants.c
+#define ggml_vec_dot_tbq3_0_q8_K_generic ggml_vec_dot_tbq3_0_q8_K
+#define ggml_vec_dot_tbq4_0_q8_K_generic ggml_vec_dot_tbq4_0_q8_K
 // repack.cpp
 #define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
 #define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
@@ -207,6 +210,8 @@
 #define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
 #elif defined(__riscv)
 // quants.c
+#define ggml_vec_dot_tbq3_0_q8_K_generic ggml_vec_dot_tbq3_0_q8_K
+#define ggml_vec_dot_tbq4_0_q8_K_generic ggml_vec_dot_tbq4_0_q8_K
 #define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x1_generic ggml_quantize_mat_q8_0_4x1
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 6efdcb36b3..8933acf800 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include <type_traits>
 #include

 // ggml_compute_forward_dup
@@ -529,8 +530,6 @@ static void ggml_compute_forward_dup_from_q(
     const int ir0 = dr*ith;
     const int ir1 = MIN(ir0 + dr, nr);

-    std::vector<float> tmp(qk);
-
     for (int64_t ir = ir0; ir < ir1; ++ir) {
         uint32_t i = ir * qk;

@@ -547,11 +546,19 @@
         const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
         const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13;

-        dequantize_row_q(
-                (const void *) ((char *) src0->data + x_offset),
-                tmp.data(), qk);
+        if constexpr (std::is_same_v<dst_t, float>) {
+            dequantize_row_q(
+                    (const void *) ((char *) src0->data + x_offset),
+                    (float *) ((char *) dst->data + dst_offset), qk);
+        } else {
+            std::vector<float> tmp(qk);

-        ggml_dup_from_float_row(tmp.data(), (dst_t *) ((char *) dst->data + dst_offset), qk);
+            dequantize_row_q(
+                    (const void *) ((char *) src0->data + x_offset),
+                    tmp.data(), qk);
+
+            ggml_dup_from_float_row(tmp.data(), (dst_t *) ((char *) dst->data + dst_offset), qk);
+        }
     }
 }
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index f6ce2817a8..c14fb98a5d 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -2944,10 +2944,15 @@ llama_context * llama_init_from_model(
     if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_k)) {
         const uint32_t blck_size = ggml_blck_size(params.type_k);

+        const bool is_tbq_k = params.type_k == GGML_TYPE_TBQ3_0 || params.type_k == GGML_TYPE_TBQ4_0;
+
         for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
-            if (model->hparams.n_embd_head_k(il) % blck_size != 0) {
-                LLAMA_LOG_ERROR("%s: K cache type %s with block size %u does not divide n_embd_head_k=%u\n",
-                        __func__, ggml_type_name(params.type_k), blck_size, model->hparams.n_embd_head_k(il));
+            const uint32_t n_embd_k = is_tbq_k ? model->hparams.n_embd_k_gqa(il) : model->hparams.n_embd_head_k(il);
+
+            if (n_embd_k % blck_size != 0) {
+                LLAMA_LOG_ERROR("%s: K cache type %s with block size %u does not divide %s=%u\n",
+                        __func__, ggml_type_name(params.type_k), blck_size,
+                        is_tbq_k ? "n_embd_k_gqa" : "n_embd_head_k", n_embd_k);
                 return nullptr;
             }
         }
@@ -2955,10 +2960,15 @@ llama_context * llama_init_from_model(
     if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_v)) {
         const uint32_t blck_size = ggml_blck_size(params.type_v);

+        const bool is_tbq_v = params.type_v == GGML_TYPE_TBQ3_0 || params.type_v == GGML_TYPE_TBQ4_0;
+
         for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
-            if (model->hparams.n_embd_head_v(il) % blck_size != 0) {
-                LLAMA_LOG_ERROR("%s: V cache type %s with block size %u does not divide n_embd_head_v=%u\n",
-                        __func__, ggml_type_name(params.type_v), blck_size, model->hparams.n_embd_head_v(il));
+            const uint32_t n_embd_v = is_tbq_v ? model->hparams.n_embd_v_gqa(il) : model->hparams.n_embd_head_v(il);
+
+            if (n_embd_v % blck_size != 0) {
+                LLAMA_LOG_ERROR("%s: V cache type %s with block size %u does not divide %s=%u\n",
+                        __func__, ggml_type_name(params.type_v), blck_size,
+                        is_tbq_v ? "n_embd_v_gqa" : "n_embd_head_v", n_embd_v);
                 return nullptr;
             }
         }
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index f8ce1cf314..75e2ae22d2 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -7804,15 +7804,29 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
         }
     }
     for (ggml_type type_src : all_types) {
-        for (ggml_type type_dst : {GGML_TYPE_F32}) {
+        if (!ggml_is_quantized(type_src)) {
+            continue;
+        }
+        test_cases.emplace_back(new test_cpy(type_src, GGML_TYPE_F32, {256, 4, 4, 4}));
+        test_cases.emplace_back(new test_cpy(type_src, GGML_TYPE_F32, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
+    }
+    for (ggml_type type : turboq_types) {
+        test_cases.emplace_back(new test_cpy(type, GGML_TYPE_F32, {256, 4, 4, 4}));
+        test_cases.emplace_back(new test_cpy(type, GGML_TYPE_F32, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
+    }
+    for (ggml_type type_src : all_types) {
+        if (!ggml_is_quantized(type_src)) {
+            continue;
+        }
+        for (ggml_type type_dst : {GGML_TYPE_F16, GGML_TYPE_BF16}) {
             test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
             test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
         }
     }
-    for (ggml_type type_src : all_types) {
+    for (ggml_type type : turboq_types) {
         for (ggml_type type_dst : {GGML_TYPE_F16, GGML_TYPE_BF16}) {
-            test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
-            test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
+            test_cases.emplace_back(new test_cpy(type, type_dst, {256, 4, 4, 4}));
+            test_cases.emplace_back(new test_cpy(type, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
         }
     }
     for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {