stateful masking fix (#38)

Fixes accuracy issues in stateful mode and the cl_out_of_resources error that occurs on GPU in stateful mode with larger context sizes.
2026-02-11 16:31:06 -08:00 · 2026-02-11 16:31:06 -08:00 · 0d74aba277
parent e0590152ff
commit 0d74aba277
2 changed files with 24 additions and 5 deletions
--- a/ggml/src/ggml-openvino/openvino/translate_session.cpp
+++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp
@ -16,6 +16,7 @@
 #include <openvino/op/broadcast.hpp>
 #include <openvino/op/concat.hpp>
 #include <openvino/op/convert.hpp>
+#include <openvino/op/convert_like.hpp>
 #include <openvino/op/cos.hpp>
 #include <openvino/op/divide.hpp>
 #include <openvino/op/gather.hpp>
@ -89,12 +90,14 @@ void add_sliced_mask(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) {
                auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0,0});
                auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,1});
                auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
-                auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
+                auto three_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {3});
+                auto neg_one_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1});
                auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {-2,-1});
                auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr();
-                auto shape_of_inp_pos = std::make_shared<ov::op::v3::ShapeOf>(inp_pos);
-                auto gather_inp_pos = std::make_shared<ov::op::v8::Gather>(shape_of_inp_pos, two_1d, zero_1d);
-                auto stop = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{token_len_per_seq, gather_inp_pos}, 0);
+                auto gather_inp_pos = std::make_shared<ov::op::v8::Gather>(inp_pos, neg_one_1d, three_1d);
+                auto reshaped_inp_pos = std::make_shared<ov::op::v1::Reshape>(gather_inp_pos, ov::op::v0::Constant::create(ov::element::i64, {1}, {1}), false);
+                auto inp_pos_incremented = std::make_shared<ov::op::v1::Add>(reshaped_inp_pos, ov::op::v0::Constant::create(ov::element::i32, ov::Shape{1}, {1}));
+                auto stop = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{token_len_per_seq, std::make_shared<v1::ConvertLike>(inp_pos_incremented, token_len_per_seq)}, 0);
                mask_sliced =
                    std::make_shared<ov::op::v8::Slice>(mask, zero_2d, stop, one_2d, axes);
                mask_sliced = std::make_shared<ov::op::v0::Convert>(mask_sliced, ov::element::f16);
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@ -57,6 +57,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin
    auto & core = ov_singleton_core();
    const auto & config = ggml_openvino_get_compile_config();
    static auto is_static = false;
+    static size_t stateful_kv_size = 0;

    // if (is_naive(cgraph)) {
    //     return naive_compute(cgraph, core, device, config);
@ -106,12 +107,27 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin
            }
            ggml_decoder->add_extra_inputs();
            infer_request = infer_request_cache.at(key);
+
            if (stateful) {
                const auto * inp_pos = get_inp_pos_tensor(cgraph);
                int32_t * pos_data = (int32_t *) inp_pos->data;
+                auto pos_shape = ggml_decoder->get_shape(inp_pos);
                if (pos_data[0] == 0) {
                    infer_request->reset_state();
-                }
+                    stateful_kv_size = pos_shape[3];
+                } else if (stateful_kv_size == pos_data[0]) {
+                    stateful_kv_size += pos_shape[3];
+                } else {
+                    auto states = infer_request->query_state();
+                    for (auto state : states) {
+                        auto state_tensor = state.get_state();
+                        ov::Coordinate begin = {0, 0, 0, 0};
+                        ov::Coordinate end = {state_tensor.get_shape()[0], static_cast<uint32_t>(pos_data[0]), state_tensor.get_shape()[2], state_tensor.get_shape()[3]};
+                        ov::Tensor new_state_tensor(state_tensor, begin, end);
+                        state.set_state(new_state_tensor);
+                    }
+                    stateful_kv_size = pos_data[0] + 1;
+                 }
            }

            decoder_end_time = ggml_time_us();