NPU always requant to q4_0_128
parent 52a44012c0
commit c1142ddb7c
@@ -3,6 +3,7 @@
 #include "ggml-impl.h"
 #include "ggml.h"
 
+#include <cstring>
 #include <openvino/runtime/intel_gpu/ocl/ocl.hpp>
 #include <openvino/runtime/intel_npu/level_zero/level_zero.hpp>
 
@@ -162,19 +163,24 @@ clEnqueueMemcpyINTEL_fn ggml_openvino_get_clEnqueueMemcpyINTEL() {
 }
 
 // Get requantization type for a tensor type (returns nullopt if no requant needed)
-std::optional<ExtraQuantType> ggml_openvino_get_requant_type(ggml_type type) {
+std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor * tensor) {
     if (!ggml_openvino_is_npu()) {
         return std::nullopt;
     }
-    // NPU requantization rules
-    switch (type) {
+    if (strncmp(tensor->name, "token_embd.weight", 17) == 0) {
+        return ExtraQuantType::F16;
+    }
+    if (strncmp(tensor->name, "output.weight", 13) == 0) {
+        return ExtraQuantType::Q4_0_128;
+    }
+    switch (tensor->type) {
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q4_K:
             return ExtraQuantType::Q4_0_128;
         case GGML_TYPE_Q6_K:
         case GGML_TYPE_Q5_K:
-            return ExtraQuantType::F16;
+            return ExtraQuantType::Q4_0_128;
         default:
             return std::nullopt;
     }
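Net effect of the new rules on NPU: token_embd.weight always becomes F16, output.weight always becomes Q4_0_128, and every Q4_0/Q4_1/Q4_K/Q5_K/Q6_K weight now goes to Q4_0_128 (Q5_K/Q6_K previously went to F16, hence the commit title). The name checks are why <cstring> is added above. A standalone sketch of the decision table, using a stripped-down stand-in for ggml_tensor (illustrative only, not the backend code):

#include <cstdio>
#include <cstring>
#include <optional>
#include <string>

enum class ExtraQuantType { F16, Q4_0_128 };

struct Tensor {           // stripped-down stand-in for ggml_tensor
    const char * name;
    std::string  type;    // quant type as a string: "q4_0", "q6_k", ...
};

static std::optional<ExtraQuantType> requant_type(const Tensor & t) {
    // Name rules run before the type switch, so they win when both match.
    if (std::strncmp(t.name, "token_embd.weight", 17) == 0) return ExtraQuantType::F16;
    if (std::strncmp(t.name, "output.weight", 13) == 0)     return ExtraQuantType::Q4_0_128;
    if (t.type == "q4_0" || t.type == "q4_1" || t.type == "q4_k" ||
        t.type == "q5_k" || t.type == "q6_k")               return ExtraQuantType::Q4_0_128;
    return std::nullopt;  // f16/f32 and anything else: no requant
}

int main() {
    const Tensor cases[] = {
        { "token_embd.weight",      "q4_0" },  // name rule wins -> F16
        { "blk.0.attn_q.weight",    "q6_k" },  // type rule -> Q4_0_128 (was F16 before this commit)
        { "blk.0.attn_norm.weight", "f32"  },  // untouched
    };
    for (const Tensor & t : cases) {
        std::optional<ExtraQuantType> r = requant_type(t);
        std::printf("%-24s -> %s\n", t.name,
                    !r ? "none" : (*r == ExtraQuantType::F16 ? "F16" : "Q4_0_128"));
    }
    return 0;
}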
@@ -200,7 +206,7 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor
     const size_t alignment = 64;  // Good for SIMD
 
     // Check if requantization is needed (NPU-specific)
-    auto requant_type = ggml_openvino_get_requant_type(tensor->type);
+    auto requant_type = ggml_openvino_get_requant_type(tensor);
     if (requant_type.has_value()) {
         layout.is_requant = true;
         layout.requant_type = requant_type;
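The call-site change just threads the whole tensor through so the name-based rules can fire. For a sense of what Q4_0_128 costs in memory, a back-of-the-envelope check for a hypothetical 4096x4096 weight, assuming the buffer shapes visible in the removed requantize() further down (u4 weights plus an f16 scale and f16 bias per 128-element group):

#include <cstdio>

int main() {
    const size_t rows = 4096, cols = 4096, group = 128;
    const size_t weight_bytes = rows * cols / 2;            // u4: two values per byte
    const size_t scale_bytes  = rows * (cols / group) * 2;  // one f16 scale per group
    const size_t bias_bytes   = scale_bytes;                // one f16 bias per group
    std::printf("Q4_0_128: %.2f MiB  (vs F16: %.2f MiB)\n",
                (weight_bytes + scale_bytes + bias_bytes) / 1048576.0,
                rows * cols * 2 / 1048576.0);
    // -> Q4_0_128: 8.50 MiB  (vs F16: 32.00 MiB)
    return 0;
}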
@@ -83,7 +83,7 @@ const std::string & ggml_openvino_get_device_name();
 bool ggml_openvino_is_npu();
 
 // Get requantization type for a tensor type (returns nullopt if no requant needed)
-std::optional<ExtraQuantType> ggml_openvino_get_requant_type(ggml_type type);
+std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor * tensor);
 
 // =====================================================
 // OpenVINO Tensor Extra Types
@@ -535,40 +535,6 @@ std::shared_ptr<ov::Node> requantize_to_buffers(const ggml_tensor * tensor,
     return result;
 }
 
-std::shared_ptr<ov::Node> requantize(const ggml_tensor * tensor, ExtraQuantType requant_type) {
-    ov::Shape node_shape = {(uint64_t) (tensor->ne[1]), (uint64_t) (tensor->ne[0])};
-
-    // FIXME hardcoded workaround to fix the case where token_embd.weight is q4_0 (instead of q6_k)
-    // (In some q4_0 models which use two different weights for token_embd and output, token_embd is q4_0)
-    std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : "";
-    if (device == "NPU" && std::string(tensor->name) == "token_embd.weight") {
-        requant_type = ExtraQuantType::F16;
-    }
-
-    // Determine block size
-    int64_t block_size = node_shape[1];
-    if (requant_type == ExtraQuantType::Q4_0_128) {
-        block_size = 128;
-    } else if (requant_type == ExtraQuantType::Q8_0_32) {
-        block_size = 32;
-    }
-
-    // Allocate tensors
-    ov::Tensor weights, scales, biases;
-    if (requant_type == ExtraQuantType::F16) {
-        weights = ov::Tensor(ov::element::f16, node_shape);
-    } else {
-        bool is_u4 = (requant_type == ExtraQuantType::Q4_0_C || requant_type == ExtraQuantType::Q4_0_128);
-        ov::element::Type weight_type = is_u4 ? ov::element::u4 : ov::element::u8;
-        ov::Shape scales_shape = {node_shape[0], node_shape[1] / block_size};
-        weights = ov::Tensor(weight_type, node_shape);
-        scales = ov::Tensor(ov::element::f16, scales_shape);
-        biases = ov::Tensor(ov::element::f16, scales_shape);
-    }
-
-    return requantize_to_buffers(tensor, tensor->data, requant_type, block_size, weights, scales, biases);
-}
-
 std::shared_ptr<ov::Node> process_weight_tensor(const ggml_tensor * tensor, const void * data, void * output_base_ptr) {
     GGML_ASSERT(tensor != nullptr);
     GGML_ASSERT(data != nullptr);
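The deleted requantize() wrapper also carried the hardcoded token_embd workaround; the new ggml_openvino_get_requant_type() expresses that as an ordinary rule instead. A caller that still wants a one-off requant would now allocate the buffers itself and call requantize_to_buffers() directly. A minimal sketch for the Q4_0_128 case, mirroring the removed body (the helper name requant_q4_0_128 is hypothetical; requantize_to_buffers and ExtraQuantType come from this diff, so this fragment only compiles inside the backend):

// Sketch: requantize one ggml weight tensor to Q4_0_128 without the wrapper.
std::shared_ptr<ov::Node> requant_q4_0_128(const ggml_tensor * tensor) {
    ov::Shape node_shape = {(uint64_t) tensor->ne[1], (uint64_t) tensor->ne[0]};
    const int64_t block_size = 128;  // Q4_0_128 group size
    ov::Shape scales_shape = {node_shape[0], node_shape[1] / block_size};

    ov::Tensor weights(ov::element::u4, node_shape);    // two nibbles per byte
    ov::Tensor scales(ov::element::f16, scales_shape);  // one scale per group
    ov::Tensor biases(ov::element::f16, scales_shape);  // one bias per group

    return requantize_to_buffers(tensor, tensor->data, ExtraQuantType::Q4_0_128,
                                 block_size, weights, scales, biases);
}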
@@ -52,10 +52,6 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight,
                                        ov::Tensor& biases,
                                        size_t group_size = GGML_QUANTIZATION_GROUP_SIZE);
 
-// ExtraQuantType is defined in ggml-openvino-extra.h
-
-std::shared_ptr<ov::Node> requantize(const ggml_tensor* tensor, ExtraQuantType requant_type);
-
 // Extract quantized weights from tensor and create weight subgraph
 // If weights/scales/biases are provided (non-empty), uses them as output buffers
 // Otherwise allocates new ov::Tensors internally