stateful masking fix (#38)
Fixes accuracy issues in stateful mode and the cl_out_of_resources error that occurs on GPU in stateful mode with larger context sizes.
This commit is contained in:
parent
e0590152ff
commit
0d74aba277
|
|
@ -16,6 +16,7 @@
|
|||
#include <openvino/op/broadcast.hpp>
|
||||
#include <openvino/op/concat.hpp>
|
||||
#include <openvino/op/convert.hpp>
|
||||
#include <openvino/op/convert_like.hpp>
|
||||
#include <openvino/op/cos.hpp>
|
||||
#include <openvino/op/divide.hpp>
|
||||
#include <openvino/op/gather.hpp>
|
||||
|
|
@ -89,12 +90,14 @@ void add_sliced_mask(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) {
|
|||
auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0,0});
|
||||
auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,1});
|
||||
auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
|
||||
auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
|
||||
auto three_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {3});
|
||||
auto neg_one_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1});
|
||||
auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {-2,-1});
|
||||
auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr();
|
||||
auto shape_of_inp_pos = std::make_shared<ov::op::v3::ShapeOf>(inp_pos);
|
||||
auto gather_inp_pos = std::make_shared<ov::op::v8::Gather>(shape_of_inp_pos, two_1d, zero_1d);
|
||||
auto stop = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{token_len_per_seq, gather_inp_pos}, 0);
|
||||
auto gather_inp_pos = std::make_shared<ov::op::v8::Gather>(inp_pos, neg_one_1d, three_1d);
|
||||
auto reshaped_inp_pos = std::make_shared<ov::op::v1::Reshape>(gather_inp_pos, ov::op::v0::Constant::create(ov::element::i64, {1}, {1}), false);
|
||||
auto inp_pos_incremented = std::make_shared<ov::op::v1::Add>(reshaped_inp_pos, ov::op::v0::Constant::create(ov::element::i32, ov::Shape{1}, {1}));
|
||||
auto stop = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{token_len_per_seq, std::make_shared<v1::ConvertLike>(inp_pos_incremented, token_len_per_seq)}, 0);
|
||||
mask_sliced =
|
||||
std::make_shared<ov::op::v8::Slice>(mask, zero_2d, stop, one_2d, axes);
|
||||
mask_sliced = std::make_shared<ov::op::v0::Convert>(mask_sliced, ov::element::f16);
|
||||
|
|
|
|||
|
|
@ -57,6 +57,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin
|
|||
auto & core = ov_singleton_core();
|
||||
const auto & config = ggml_openvino_get_compile_config();
|
||||
static auto is_static = false;
|
||||
static size_t stateful_kv_size = 0;
|
||||
|
||||
// if (is_naive(cgraph)) {
|
||||
// return naive_compute(cgraph, core, device, config);
|
||||
|
|
@ -106,12 +107,27 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin
|
|||
}
|
||||
ggml_decoder->add_extra_inputs();
|
||||
infer_request = infer_request_cache.at(key);
|
||||
|
||||
if (stateful) {
|
||||
const auto * inp_pos = get_inp_pos_tensor(cgraph);
|
||||
int32_t * pos_data = (int32_t *) inp_pos->data;
|
||||
auto pos_shape = ggml_decoder->get_shape(inp_pos);
|
||||
if (pos_data[0] == 0) {
|
||||
infer_request->reset_state();
|
||||
}
|
||||
stateful_kv_size = pos_shape[3];
|
||||
} else if (stateful_kv_size == pos_data[0]) {
|
||||
stateful_kv_size += pos_shape[3];
|
||||
} else {
|
||||
auto states = infer_request->query_state();
|
||||
for (auto state : states) {
|
||||
auto state_tensor = state.get_state();
|
||||
ov::Coordinate begin = {0, 0, 0, 0};
|
||||
ov::Coordinate end = {state_tensor.get_shape()[0], static_cast<uint32_t>(pos_data[0]), state_tensor.get_shape()[2], state_tensor.get_shape()[3]};
|
||||
ov::Tensor new_state_tensor(state_tensor, begin, end);
|
||||
state.set_state(new_state_tensor);
|
||||
}
|
||||
stateful_kv_size = pos_data[0] + 1;
|
||||
}
|
||||
}
|
||||
|
||||
decoder_end_time = ggml_time_us();
|
||||
|
|
|
|||
Loading…
Reference in New Issue