Use Q8_0_C for token embd, lm_head, and for 5-bit and 6-bit quants

This commit is contained in:
Yu, Zijun 2025-12-29 15:27:50 +08:00 committed by Mustafa Cavus
parent 67c9720e49
commit 4e451778d3
1 changed file with 5 additions and 9 deletions

View File

@@ -164,23 +164,19 @@ clEnqueueMemcpyINTEL_fn ggml_openvino_get_clEnqueueMemcpyINTEL() {
// Get requantization type for a tensor type (returns nullopt if no requant needed)
std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor * tensor) {
if (!ggml_openvino_is_npu()) {
return std::nullopt;
}
// NPU requantization rules
if (strncmp(tensor->name, "token_embd.weight", 17) == 0) {
return ExtraQuantType::F16;
return ExtraQuantType::Q8_0_C;
}
if (strncmp(tensor->name, "output.weight", 13) == 0) {
return ExtraQuantType::Q8_0_C;
}
if (ggml_openvino_is_npu()) {
return ExtraQuantType::Q4_0_128;
}
switch (tensor->type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q6_K:
case GGML_TYPE_Q5_K:
return ExtraQuantType::Q4_0_128;
return ExtraQuantType::Q8_0_C;
default:
return std::nullopt;
}