requantize Q6_K token embeddings (token_embd.weight) to F16 on NPU

This commit is contained in:
Mustafa Cavus 2026-01-12 10:47:16 -08:00
parent a40a5dfc60
commit a81b202f57
1 changed file with 1 addition and 1 deletion

View File

@ -165,7 +165,7 @@ clEnqueueMemcpyINTEL_fn ggml_openvino_get_clEnqueueMemcpyINTEL() {
// Get requantization type for a tensor type (returns nullopt if no requant needed)
std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor * tensor) {
if (strncmp(tensor->name, "token_embd.weight", 17) == 0) {
return (ggml_openvino_is_npu() ? ExtraQuantType::F16 : ExtraQuantType::Q8_0_C);
return ((ggml_openvino_is_npu() && tensor->type == GGML_TYPE_Q6_K) ? ExtraQuantType::F16 : ExtraQuantType::Q8_0_C);
}
if (strncmp(tensor->name, "output.weight", 13) == 0) {
return ExtraQuantType::Q8_0_C;