Use Q8_0_C for token embd, lm_head, and for 5- and 6-bit quants
This commit is contained in:
parent
67c9720e49
commit
4e451778d3
|
|
@ -164,23 +164,19 @@ clEnqueueMemcpyINTEL_fn ggml_openvino_get_clEnqueueMemcpyINTEL() {
|
||||||
|
|
||||||
// Get requantization type for a tensor type (returns nullopt if no requant needed)
|
// Get requantization type for a tensor type (returns nullopt if no requant needed)
|
||||||
std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor * tensor) {
|
std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor * tensor) {
|
||||||
if (!ggml_openvino_is_npu()) {
|
|
||||||
return std::nullopt;
|
|
||||||
}
|
|
||||||
// NPU requantization rules
|
|
||||||
if (strncmp(tensor->name, "token_embd.weight", 17) == 0) {
|
if (strncmp(tensor->name, "token_embd.weight", 17) == 0) {
|
||||||
return ExtraQuantType::F16;
|
return ExtraQuantType::Q8_0_C;
|
||||||
}
|
}
|
||||||
if (strncmp(tensor->name, "output.weight", 13) == 0) {
|
if (strncmp(tensor->name, "output.weight", 13) == 0) {
|
||||||
|
return ExtraQuantType::Q8_0_C;
|
||||||
|
}
|
||||||
|
if (ggml_openvino_is_npu()) {
|
||||||
return ExtraQuantType::Q4_0_128;
|
return ExtraQuantType::Q4_0_128;
|
||||||
}
|
}
|
||||||
switch (tensor->type) {
|
switch (tensor->type) {
|
||||||
case GGML_TYPE_Q4_0:
|
|
||||||
case GGML_TYPE_Q4_1:
|
|
||||||
case GGML_TYPE_Q4_K:
|
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
case GGML_TYPE_Q5_K:
|
case GGML_TYPE_Q5_K:
|
||||||
return ExtraQuantType::Q4_0_128;
|
return ExtraQuantType::Q8_0_C;
|
||||||
default:
|
default:
|
||||||
return std::nullopt;
|
return std::nullopt;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue