NPU always requant to q4_0_128

Yu, Zijun 2025-12-26 15:18:30 +08:00 committed by Mustafa Cavus
parent 52a44012c0
commit c1142ddb7c
4 changed files with 12 additions and 44 deletions

View File

@@ -3,6 +3,7 @@
 #include "ggml-impl.h"
 #include "ggml.h"
+#include <cstring>
 #include <openvino/runtime/intel_gpu/ocl/ocl.hpp>
 #include <openvino/runtime/intel_npu/level_zero/level_zero.hpp>
@@ -162,19 +163,24 @@ clEnqueueMemcpyINTEL_fn ggml_openvino_get_clEnqueueMemcpyINTEL() {
 }
 // Get requantization type for a tensor type (returns nullopt if no requant needed)
-std::optional<ExtraQuantType> ggml_openvino_get_requant_type(ggml_type type) {
+std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor * tensor) {
     if (!ggml_openvino_is_npu()) {
         return std::nullopt;
     }
     // NPU requantization rules
-    switch (type) {
+    if (strncmp(tensor->name, "token_embd.weight", 17) == 0) {
+        return ExtraQuantType::F16;
+    }
+    if (strncmp(tensor->name, "output.weight", 13) == 0) {
+        return ExtraQuantType::Q4_0_128;
+    }
+    switch (tensor->type) {
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q4_K:
             return ExtraQuantType::Q4_0_128;
         case GGML_TYPE_Q6_K:
         case GGML_TYPE_Q5_K:
-            return ExtraQuantType::F16;
+            return ExtraQuantType::Q4_0_128;
         default:
             return std::nullopt;
     }
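Net effect: on NPU, Q5_K and Q6_K weights are no longer requantized to F16; every quantized type handled here (Q4_0, Q4_1, Q4_K, Q5_K, Q6_K) now ends up as Q4_0_128, with two name-based exceptions (token_embd.weight stays F16, output.weight is always Q4_0_128). A minimal sketch restating the same decision table over plain C strings so it compiles without a ggml context; npu_requant_rule, RequantKind, and the string type tags are illustrative and not part of the patch:

```cpp
#include <cassert>
#include <cstring>
#include <optional>

// Illustrative restatement of the new NPU requant rules (not part of the patch).
enum class RequantKind { F16, Q4_0_128 };

std::optional<RequantKind> npu_requant_rule(const char * name, const char * ggml_type) {
    // Name-based overrides, mirroring the strncmp prefix checks in the patched function.
    if (std::strncmp(name, "token_embd.weight", 17) == 0) return RequantKind::F16;
    if (std::strncmp(name, "output.weight", 13) == 0)     return RequantKind::Q4_0_128;
    // Everything else that the NPU path requantizes now becomes Q4_0_128.
    for (const char * t : {"q4_0", "q4_1", "q4_k", "q5_k", "q6_k"}) {
        if (std::strcmp(ggml_type, t) == 0) return RequantKind::Q4_0_128;
    }
    return std::nullopt;  // e.g. f32/f16 weights: no requantization
}

int main() {
    assert(npu_requant_rule("token_embd.weight", "q4_0") == RequantKind::F16);
    assert(npu_requant_rule("blk.0.ffn_down.weight", "q6_k") == RequantKind::Q4_0_128);
    assert(!npu_requant_rule("blk.0.attn_norm.weight", "f32").has_value());
}
```

The patched function keys on the real ggml_type enum and the ggml_tensor name field, as shown in the hunk above.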
@@ -200,7 +206,7 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
     const size_t alignment = 64; // Good for SIMD
     // Check if requantization is needed (NPU-specific)
-    auto requant_type = ggml_openvino_get_requant_type(tensor->type);
+    auto requant_type = ggml_openvino_get_requant_type(tensor);
     if (requant_type.has_value()) {
         layout.is_requant = true;
         layout.requant_type = requant_type;

View File

@@ -83,7 +83,7 @@ const std::string & ggml_openvino_get_device_name();
 bool ggml_openvino_is_npu();
 // Get requantization type for a tensor type (returns nullopt if no requant needed)
-std::optional<ExtraQuantType> ggml_openvino_get_requant_type(ggml_type type);
+std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor * tensor);
 // =====================================================
 // OpenVINO Tensor Extra Types
View File

@@ -535,40 +535,6 @@ std::shared_ptr<ov::Node> requantize_to_buffers(const ggml_tensor * tensor,
     return result;
 }
-std::shared_ptr<ov::Node> requantize(const ggml_tensor * tensor, ExtraQuantType requant_type) {
-    ov::Shape node_shape = {(uint64_t) (tensor->ne[1]), (uint64_t) (tensor->ne[0])};
-    // FIXME hardcoded workaround to fix the case where token_emb.weight is q4_0 (instead of q6_k)
-    // (In some q4_0 models which use two different weight for token_emb and output, token_emb is q4_0)
-    std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : "";
-    if (device == "NPU" && std::string(tensor->name) == "token_embd.weight") {
-        requant_type = ExtraQuantType::F16;
-    }
-    // Determine block size
-    int64_t block_size = node_shape[1];
-    if (requant_type == ExtraQuantType::Q4_0_128) {
-        block_size = 128;
-    } else if (requant_type == ExtraQuantType::Q8_0_32) {
-        block_size = 32;
-    }
-    // Allocate tensors
-    ov::Tensor weights, scales, biases;
-    if (requant_type == ExtraQuantType::F16) {
-        weights = ov::Tensor(ov::element::f16, node_shape);
-    } else {
-        bool is_u4 = (requant_type == ExtraQuantType::Q4_0_C || requant_type == ExtraQuantType::Q4_0_128);
-        ov::element::Type weight_type = is_u4 ? ov::element::u4 : ov::element::u8;
-        ov::Shape scales_shape = {node_shape[0], node_shape[1] / block_size};
-        weights = ov::Tensor(weight_type, node_shape);
-        scales = ov::Tensor(ov::element::f16, scales_shape);
-        biases = ov::Tensor(ov::element::f16, scales_shape);
-    }
-    return requantize_to_buffers(tensor, tensor->data, requant_type, block_size, weights, scales, biases);
-}
 std::shared_ptr<ov::Node> process_weight_tensor(const ggml_tensor * tensor, const void * data, void * output_base_ptr) {
     GGML_ASSERT(tensor != nullptr);
     GGML_ASSERT(data != nullptr);
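For reference, Q4_0_128 stores weights as 128-element groups of packed 4-bit (u4) values with one f16 scale and one f16 bias per group; the removed wrapper sized its buffers accordingly. A self-contained sketch of those shapes, assuming made-up dimensions of 4096 x 11008 (rows x cols) and following the shape math of the deleted function:

```cpp
#include <openvino/openvino.hpp>
#include <iostream>

int main() {
    // Made-up weight dimensions, purely illustrative: rows = ne[1], cols = ne[0].
    const size_t rows = 4096, cols = 11008;
    const size_t block_size = 128;  // Q4_0_128 group size

    ov::Tensor weights(ov::element::u4,  ov::Shape{rows, cols});               // packed 4-bit weights
    ov::Tensor scales (ov::element::f16, ov::Shape{rows, cols / block_size});  // one f16 scale per group
    ov::Tensor biases (ov::element::f16, ov::Shape{rows, cols / block_size});  // one f16 bias per group

    std::cout << "groups per row: " << cols / block_size << "\n";              // 86 for these dimensions
    std::cout << "weights bytes:  " << weights.get_byte_size() << "\n";        // 4 bits per element
    return 0;
}
```

In the patched flow, requantize_to_buffers remains the function that fills buffers like these.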

View File

@@ -52,10 +52,6 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight,
                                        ov::Tensor& biases,
                                        size_t group_size = GGML_QUANTIZATION_GROUP_SIZE);
 // ExtraQuantType is defined in ggml-openvino-extra.h
-std::shared_ptr<ov::Node> requantize(const ggml_tensor* tensor, ExtraQuantType requant_type);
 // Extract quantized weights from tensor and create weight subgraph
 // If weights/scales/biases are provided (non-empty), uses them as output buffers
 // Otherwise allocates new ov::Tensors internally
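The declaration that the last three comment lines belong to is cut off in this view. Purely to illustrate the convention they describe (non-empty output buffers are used as-is, otherwise the callee allocates), a hedged sketch with an invented extract_weights helper; a default-constructed ov::Tensor is uninitialized, which is the "empty" case:

```cpp
#include <openvino/openvino.hpp>

// Hypothetical helper, only to illustrate the "non-empty buffers are used,
// otherwise allocate internally" convention described in the comments above.
ov::Tensor extract_weights(const ov::Shape & shape, ov::Tensor weights = {}) {
    if (!weights) {
        weights = ov::Tensor(ov::element::u4, shape);  // no buffer provided: allocate here
    }
    // ... fill `weights` from the source tensor ...
    return weights;
}

int main() {
    ov::Shape shape{4096, 11008};
    ov::Tensor preallocated(ov::element::u4, shape);
    auto a = extract_weights(shape, preallocated);  // caller-provided output buffer
    auto b = extract_weights(shape);                // allocated internally
    return 0;
}
```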