Add custom quant type: q8_1_c, q4_0_128
This commit is contained in: parent b593428eb3, commit 6926655f5b
@@ -25,6 +25,7 @@
#include <openvino/op/parameter.hpp>
#include <openvino/op/unsqueeze.hpp>
#include <openvino/runtime/tensor.hpp>
#include <optional>
#include <ostream>
#include <set>
#include <stdexcept>
@@ -371,7 +372,7 @@ std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const
}

std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(
    struct ggml_cgraph* cgraph, std::set<ggml_type> types_to_dequantize) {
    struct ggml_cgraph* cgraph, std::map<ggml_type, ExtraQuantType> types_to_requantize) {
    std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
    static std::mutex weights_mutex;
    auto* nodes = cgraph->nodes;
@@ -396,7 +397,10 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
            }
        }
        if (should_create) {
            auto weight_node = create_weight_node(src, types_to_dequantize.count(src->type) > 0);
            auto requant_type = types_to_requantize.count(src->type) ?
                                    std::optional<ExtraQuantType>(types_to_requantize.at(src->type)) :
                                    std::nullopt;
            auto weight_node = create_weight_node(src, requant_type);
            weight_node->set_friendly_name(src_name);
            {
                std::lock_guard<std::mutex> lock(weights_mutex);
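The new lines above replace the boolean to_dequantize flag with a per-type lookup that yields a std::optional<ExtraQuantType>. A standalone sketch of that lookup idiom, with a placeholder enum standing in for ggml_type (illustrative only, not part of the patch):

```cpp
#include <map>
#include <optional>

enum class ExtraQuantType { F16, Q4_0_C, Q8_1_C, Q4_0_128 };
enum class TensorType { Q4_0, Q6_K };  // placeholder for ggml_type

// Present in the map -> wrap the mapped value; absent -> nullopt (native handling).
std::optional<ExtraQuantType> lookup_requant(const std::map<TensorType, ExtraQuantType>& types_to_requantize,
                                             TensorType t) {
    auto it = types_to_requantize.find(t);
    return it != types_to_requantize.end() ? std::optional<ExtraQuantType>(it->second) : std::nullopt;
}
```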
@@ -410,7 +414,8 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
    return model_weights;
}

std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor, bool to_dequantize) {
std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
                                                            std::optional<ExtraQuantType> requant_type) {
    std::set<ggml_type> weight_types = {
        GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K};
    if (weight_types.find(tensor->type) == weight_types.end()) {
@@ -443,21 +448,15 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
        tensor->extra == nullptr,
        "Unsupported weight tensor: " + std::string(tensor->name) + " Possibly this is a repacked quantized weights");

    if (to_dequantize) {
        std::vector<float> weights_f32(ne_total);
        ggml_get_type_traits(tensor->type)->to_float(tensor->data, weights_f32.data(), ggml_nelements(tensor));
        ov::Tensor weights(ov::element::f16, node_shape);
        ggml_get_type_traits(GGML_TYPE_F16)->from_float_ref(weights_f32.data(), weights.data(), ggml_nelements(tensor));
        std::shared_ptr<ov::Node> weight_node = std::make_shared<ov::op::v0::Constant>(weights);
        weight_node->set_friendly_name(tensor->name);
        return weight_node;
    if (requant_type.has_value()) {
        return requantize(tensor, requant_type.value());
    }

    uint64_t weights_per_byte;
    ov::element::Type weight_type;
    if (tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_1 || tensor->type == GGML_TYPE_Q4_K) {
        weights_per_byte = 2;
        weight_type = ov::element::u4;
    } else {  // tensor.type == GGUF_TYPE_Q8_0 || tensor.type == GGUF_TYPE_Q6_K
        weights_per_byte = 1;
        weight_type = ov::element::u8;
    }

    uint64_t weights_per_block;
@@ -474,15 +473,12 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
                    " has incompatible last dim shape: ",
                    node_shape.back());

    auto weights_shape = node_shape;
    weights_shape.back() /= (weights_per_byte * 4);  // means u32 type can store 8 q4 or 4 q8

    ov::Tensor weights(ov::element::u32, weights_shape);
    // For scales and bias
    ov::Tensor weights(weight_type, node_shape);
    // For scales and biases
    node_shape[node_shape.size() - 1] = node_shape[node_shape.size() - 1] / weights_per_block;

    ov::Tensor scales(ov::element::f16, node_shape);
    ov::Tensor biases(ov::element::f16, node_shape);

    ov::Output<ov::Node> weight_node;
    if (tensor->type == GGML_TYPE_Q4_0) {
        extract_q4_0_data(tensor, weights, scales, biases);
@@ -494,7 +490,6 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
        extract_q8_0_data(tensor, weights, scales, biases);
        weight_node = make_int8_weights(weights, scales, biases, weights_per_block);
    } else if (tensor->type == GGML_TYPE_Q6_K) {
        // due to WA #2135, this case will not be used, extract_q6_k_data temporarily disabled.
        extract_q6_k_data(tensor, weights, scales, biases);
        weight_node = make_int8_weights(weights, scales, biases, weights_per_block);
    } else if (tensor->type == GGML_TYPE_Q4_K) {
@@ -503,15 +498,8 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
    }

    OPENVINO_ASSERT(weight_node.get_shape().size() == 2, "Weight should be 2D");
    // weight_node = std::make_shared<ov::op::v0::Unsqueeze>(
    //     weight_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {0}));

    weight_node.get_node_shared_ptr()->set_friendly_name(tensor->name);
    // GGML_LOG_DEBUG("Created weight node: %s %s %s%s\n",
    //                tensor->name,
    //                ggml_type_name(tensor->type),
    //                weight_node.get_element_type().get_type_name().c_str(),
    //                weight_node.get_partial_shape().to_string().c_str());
    return weight_node.get_node_shared_ptr();
}
@@ -4,8 +4,10 @@
#include <map>
#include <memory>
#include <openvino/core/partial_shape.hpp>
#include <optional>
#include <vector>

#include "ggml-quants.hpp"
#include "ggml.h"
#include "openvino/decoder.hpp"
@@ -117,9 +119,10 @@ public:

    static void dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename);

    static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor, bool to_dequantize);
    static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor,
                                                        std::optional<ExtraQuantType> requant_type = std::nullopt);
    static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(
        struct ggml_cgraph* cgraph, std::set<ggml_type> types_to_dequantize = {});
        struct ggml_cgraph* cgraph, std::map<ggml_type, ExtraQuantType> types_to_requantize = {});

    const ggml_tensor* get_tensor_used_op(const ggml_tensor* tensor) const;
    const ggml_tensor* get_tensor_from_name(const std::string& name) const;
@@ -1,15 +1,20 @@
#include "ggml-quants.hpp"

#include <cstdint>
#include <limits>
#include <memory>
#include <openvino/core/parallel.hpp>
#include <openvino/core/type/element_type_traits.hpp>
#include <openvino/core/type/float16.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/convert.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/reshape.hpp>
#include <openvino/op/subtract.hpp>
#include <openvino/runtime/tensor.hpp>
#include <string>

#include "ggml-impl.h"
#include "ggml.h"

void unpack_32_4(const uint8_t* data, uint8_t* dst) {
@@ -203,20 +208,24 @@ void extract_q6_k_data(const ggml_tensor* tensor,
// TODO Reorder for make_intX_weights

ov::Output<ov::Node> make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, ov::Tensor& biases, size_t group_size) {

    // Reshape weight to (num_heads, -1, group_size)
    ov::Shape orig_shape = weight.get_shape();
    orig_shape[1] *= sizeof(uint32_t) / sizeof(uint8_t);
    size_t num_groups = orig_shape[1] / group_size;

    // Expand dimensions for scales and biases
    auto scale_shape = scales.get_shape();
    scale_shape.push_back(1);
    scales.set_shape(scale_shape);
    biases.set_shape(scale_shape);

    ov::Shape packed_shape = {orig_shape[0], orig_shape[1] / group_size, group_size};

    if (packed_shape[1] == 1) {
        packed_shape.erase(packed_shape.begin() + 1);
    } else {
        scale_shape.push_back(1);
        scales.set_shape(scale_shape);
        biases.set_shape(scale_shape);
    }

    // Create graph nodes
    auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u8, ov::Shape{orig_shape[0], num_groups, group_size}, static_cast<uint8_t*>(weight.data()), nullptr);
    auto weights_node = std::make_shared<ov::op::v0::Constant>(
        ov::element::u8, packed_shape, static_cast<uint8_t*>(weight.data()), nullptr);
    weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
    auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
    ov::Tensor biases_u8(ov::element::u8, scale_shape);
@@ -242,32 +251,24 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, o
    auto w_zp = std::make_shared<ov::op::v1::Subtract>(
        weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY
    );
    auto w_zp_s = std::make_shared<ov::op::v1::Multiply>(
        w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY
    );
    ov::Output<ov::Node> w_zp_s =
        std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);

    // Reshape back to original dimensions
    auto final_shape = std::make_shared<ov::op::v0::Constant>(
        ov::element::i64, ov::Shape{orig_shape.size()}, orig_shape
    );
    auto w_zp_s_r = std::make_shared<ov::op::v1::Reshape>(
        w_zp_s, final_shape, false
    );
    if (packed_shape.size() != 2) {
        // If not requantized channel-wise case, reshape back to original shape
        auto final_shape =
            std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{orig_shape.size()}, orig_shape);
        w_zp_s = std::make_shared<ov::op::v1::Reshape>(w_zp_s, final_shape, false);
    }

    return std::make_shared<ov::op::v0::Convert>(w_zp_s_r, ov::element::f32);
    return std::make_shared<ov::op::v0::Convert>(w_zp_s, ov::element::f32);
}
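With the change above, make_int8_weights now derives packed_shape = {rows, cols / group_size, group_size} and, when group_size spans a whole row (the channel-wise requantized case), drops the middle axis so the constant stays 2D, the trailing unit axis is not added to the scales, and the final Reshape is skipped. A small self-contained sketch of that shape arithmetic, with example dimensions that are not from the patch:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Mirrors the packed-shape handling: rows x cols weight, grouped along cols.
std::vector<std::size_t> packed_shape_for(std::size_t rows, std::size_t cols, std::size_t group_size) {
    std::vector<std::size_t> packed = {rows, cols / group_size, group_size};
    if (packed[1] == 1) {
        // Channel-wise: one group per row, keep the weight 2D ({rows, cols}).
        packed.erase(packed.begin() + 1);
    }
    return packed;
}

int main() {
    // Grouped case (e.g. 32-wide groups): {4096, 128, 32}.
    assert((packed_shape_for(4096, 4096, 32) == std::vector<std::size_t>{4096, 128, 32}));
    // Channel-wise case (group spans the whole row): collapses to {4096, 4096}.
    assert((packed_shape_for(4096, 4096, 4096) == std::vector<std::size_t>{4096, 4096}));
    return 0;
}
```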

ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, ov::Tensor& biases, size_t group_size) {

    // Convert weight to uint8 view and adjust shape
    ov::Shape orig_weight_shape = weight.get_shape();
    orig_weight_shape[1] *= sizeof(uint32_t) / sizeof(uint8_t) * 2;  // Double number of columns for 4-bit representation

    // Expand dimensions for scales and biases
    ov::Shape scale_bias_shape = scales.get_shape();
    scale_bias_shape.push_back(1);  // Add new axis at the end
    scales.set_shape(scale_bias_shape);
    biases.set_shape(scale_bias_shape);

    // Create INT4 weight tensor
    ov::Shape packed_shape = {
|
@ -276,8 +277,17 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, o
|
|||
group_size
|
||||
};
|
||||
|
||||
// Requantized channel-wise case
|
||||
if (packed_shape[1] == 1) {
|
||||
packed_shape.erase(packed_shape.begin() + 1);
|
||||
} else {
|
||||
scale_bias_shape.push_back(1);
|
||||
scales.set_shape(scale_bias_shape);
|
||||
biases.set_shape(scale_bias_shape);
|
||||
}
|
||||
|
||||
auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u4, packed_shape, static_cast<uint8_t*>(weight.data()), nullptr);
|
||||
weights_node->get_rt_info()["__gguf_tensor_holde"] = weight;
|
||||
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
|
||||
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
|
||||
|
||||
// Pack zero points: two subsequent values into one
|
||||
|
|
@@ -304,15 +314,129 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, o
    auto w_zp = std::make_shared<ov::op::v1::Subtract>(
        weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY);

    auto w_zp_s = std::make_shared<ov::op::v1::Multiply>(
        w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
    ov::Output<ov::Node> w_zp_s =
        std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);

    // Reshape back to original shape
    auto final_shape = std::make_shared<ov::op::v0::Constant>(
        ov::element::i64, ov::Shape{orig_weight_shape.size()}, orig_weight_shape);
    if (packed_shape.size() != 2) {
        // If not requantized channel-wise case, reshape back to original shape
        auto final_shape = std::make_shared<ov::op::v0::Constant>(
            ov::element::i64, ov::Shape{orig_weight_shape.size()}, orig_weight_shape);

        auto w_zp_s_r = std::make_shared<ov::op::v1::Reshape>(
            w_zp_s, final_shape, false);
        w_zp_s = std::make_shared<ov::op::v1::Reshape>(w_zp_s, final_shape, false);
    }

    return std::make_shared<ov::op::v0::Convert>(w_zp_s_r, ov::element::f32);
    return std::make_shared<ov::op::v0::Convert>(w_zp_s, ov::element::f32);
}

std::shared_ptr<ov::Node> requantize(const ggml_tensor* tensor, ExtraQuantType requant_type) {
    std::vector<float> weights_f32(tensor->ne[0] * tensor->ne[1]);
    ggml_get_type_traits(tensor->type)->to_float(tensor->data, weights_f32.data(), ggml_nelements(tensor));

    std::shared_ptr<ov::Node> weight_node;
    ov::Shape node_shape = {(uint64_t) (tensor->ne[1]), (uint64_t) (tensor->ne[0])};

    if (requant_type == ExtraQuantType::F16) {
        ov::Tensor weights(ov::element::f16, node_shape);
        ggml_get_type_traits(GGML_TYPE_F16)->from_float_ref(weights_f32.data(), weights.data(), ggml_nelements(tensor));
        std::shared_ptr<ov::Node> weight_node = std::make_shared<ov::op::v0::Constant>(weights);
        weight_node->set_friendly_name(tensor->name);
        return weight_node;
    }

    int64_t block_size = node_shape[1];
    if (requant_type == ExtraQuantType::Q4_0_128) {
        block_size = 128;
    }
    auto scales_shape = ov::Shape{node_shape[0], node_shape[1] / block_size};

    ov::Tensor weights;
    ov::Tensor scales(ov::element::f16, scales_shape);
    ov::Tensor bias(ov::element::f16, scales_shape);

    if (requant_type == ExtraQuantType::Q4_0_C) {
        weights = ov::Tensor(ov::element::u4, node_shape);
        quantize_q4_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
        weight_node = make_int4_weights(weights, scales, bias, block_size).get_node_shared_ptr();
    } else if (requant_type == ExtraQuantType::Q8_1_C) {
        weights = ov::Tensor(ov::element::u8, node_shape);
        quantize_q8_1(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
        weight_node = make_int8_weights(weights, scales, bias, block_size).get_node_shared_ptr();
    } else if (requant_type == ExtraQuantType::Q4_0_128) {
        weights = ov::Tensor(ov::element::u4, node_shape);
        quantize_q4_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
        weight_node = make_int4_weights(weights, scales, bias, block_size).get_node_shared_ptr();
    }

    weight_node->set_friendly_name(tensor->name);
    return weight_node;
}
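To make the block_size choice above concrete: for the channel-wise variants (Q4_0_C, Q8_1_C) the whole row is a single quantization group, so scales_shape collapses to {rows, 1}, whereas Q4_0_128 keeps one scale and bias per 128 columns. As a hypothetical example (dimensions not from the patch), a weight with ne = {4096, 11008} gives node_shape = {11008, 4096}; its scales would be {11008, 32} under Q4_0_128 and {11008, 1} under Q8_1_C or Q4_0_C.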

void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
                   int64_t qk) {
    assert(k % qk == 0);
    const int nb = k / qk;

    auto* weights = static_cast<uint8_t*>(weights_arr.data());
    auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
    auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
    for (int i = 0; i < nb; i++) {
        float amax = 0.0f;  // absolute max
        float max = 0.0f;

        for (int j = 0; j < qk; j++) {
            const float v = x[i * qk + j];
            if (amax < fabsf(v)) {
                amax = fabsf(v);
                max = v;
            }
        }

        const float d = max / -8;
        const float id = d ? 1.0f / d : 0.0f;
        scales[i] = ov::float16(d);
        biases[i] = ov::float16(-8.f * d);

        for (int j = 0; j < qk / 2; ++j) {
            const float x0 = x[i * qk + 2 * j] * id;
            const float x1 = x[i * qk + 2 * j + 1] * id;
            const uint8_t xi0 = MIN(15, (int8_t) (x0 + 8.5f));
            const uint8_t xi1 = MIN(15, (int8_t) (x1 + 8.5f));
            weights[i * qk / 2 + j] = xi0 | (xi1 << 4);
        }
    }
}
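quantize_q4_0 above stores, per group, a scale d = max / -8 (max is the value with the largest magnitude, keeping its sign) and a bias of -8 * d, and maps each weight to an unsigned 4-bit code via q = min(15, (int)(x / d + 8.5)); reconstruction is then x ≈ q * d + bias. A minimal round-trip sketch for a single value (illustrative numbers, not from the patch):

```cpp
#include <algorithm>
#include <cstdio>

int main() {
    // One group whose largest-magnitude value is -1.6, as quantize_q4_0 would see it.
    const float max = -1.6f;           // signed value with the largest |x| in the group
    const float d = max / -8.0f;       // stored scale: 0.2
    const float bias = -8.0f * d;      // stored bias: -1.6
    const float id = d ? 1.0f / d : 0.0f;

    const float x = -1.6f;             // a weight from the group
    const int q = std::min(15, (int) (x * id + 8.5f));  // unsigned 4-bit code -> 0
    const float x_hat = q * d + bias;  // dequantized value -> -1.6

    std::printf("q = %d, x_hat = %.3f\n", q, x_hat);
    return 0;
}
```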

void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
                   int64_t qk) {
    assert(k % qk == 0);
    const int nb = k / qk;

    auto* weights = static_cast<uint8_t*>(weights_arr.data());
    auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
    auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
    for (int i = 0; i < nb; i++) {
        float min = std::numeric_limits<float>::max();
        float max = std::numeric_limits<float>::lowest();

        for (int j = 0; j < qk; j++) {
            const float v = x[i * qk + j];
            if (v < min) {
                min = v;
            }
            if (v > max) {
                max = v;
            }
        }

        const float d = (max - min) / ((1 << 8) - 1);
        const float id = d ? 1.0f / d : 0.0f;
        scales[i] = ov::float16(d);
        biases[i] = ov::float16(min);

        for (int j = 0; j < qk; ++j) {
            const float x0 = (x[i * qk + j] - min) * id;
            const uint8_t xi0 = roundf(x0);
            weights[i * qk + j] = xi0;
        }
    }
}
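quantize_q8_1 above is the asymmetric counterpart: per group it stores d = (max - min) / 255 and the bias min, encodes q = round((x - min) / d), and reconstruction is x ≈ q * d + min, so the group's full range maps onto the 0..255 codes. For example, a group spanning [-0.5, 0.5] gets d ≈ 1/255, the value 0.0 encodes to q = 128, and it decodes back to roughly 0.002.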
@@ -1,3 +1,4 @@
#pragma once
#include <cstdint>
#include <openvino/op/constant.hpp>
#include <openvino/runtime/tensor.hpp>

@@ -45,6 +46,15 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight,
                                       ov::Tensor& biases,
                                       size_t group_size = GGML_QUANTIZATION_GROUP_SIZE);

enum class ExtraQuantType { F16, Q4_0_C, Q8_1_C, Q4_0_128 };

std::shared_ptr<ov::Node> requantize(const ggml_tensor* tensor, ExtraQuantType requant_type);

void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
                   int64_t qk);
void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
                   int64_t qk);
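Reading the new declarations together: F16 simply dequantizes the tensor to half precision; Q4_0_C and Q8_1_C requantize channel-wise, i.e. one quantization group per row of the 2D weight (symmetric 4-bit in the Q4_0 style vs. asymmetric 8-bit in the Q8_1 style); Q4_0_128 is the same Q4_0-style 4-bit quantization with a fixed group size of 128.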

namespace ov {
namespace op {
namespace util {
@@ -130,11 +130,21 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
        compile_end_time = conversion_end_time;
    } else {
        std::shared_ptr<ov::Model> model;
        std::set<ggml_type> types_to_dequantize;
        std::map<ggml_type, ExtraQuantType> types_to_requantize;
        if (is_static) {
            types_to_dequantize = {GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K};
            types_to_requantize = {
                {GGML_TYPE_Q4_0, ExtraQuantType::Q4_0_128},
                {GGML_TYPE_Q4_1, ExtraQuantType::Q4_0_128},
                {GGML_TYPE_Q4_K, ExtraQuantType::Q4_0_128},
                {GGML_TYPE_Q6_K, ExtraQuantType::Q8_1_C },
            };
        } else if (device == "GPU") {
            types_to_requantize = {
                // CVS-166739
                {GGML_TYPE_Q6_K, ExtraQuantType::Q8_1_C},
            };
        }
        auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, types_to_dequantize);
        auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, types_to_requantize);

        if (is_static) {
            ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, true);
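Net effect of this hunk: on the static-shape path all supported 4-bit formats (Q4_0, Q4_1, Q4_K) are requantized to the uniform Q4_0_128 layout and Q6_K to channel-wise Q8_1; on GPU only Q6_K is rewritten (the CVS-166739 reference above); every other type still goes through the native extraction path in create_weight_node.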