diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 4b9429740c..8796c23abd 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -95,9 +95,6 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::mapn_nodes; node_n++) { auto * cur_node = cgraph->nodes[node_n]; - if (cur_node->op == GGML_OP_NONE) { - continue; - } set_input_output(cur_node, true); } for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { @@ -110,6 +107,9 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map data_addr_map; std::unordered_set output_name_set; for (const auto & node_info : m_node_info_list) { + if (node_info.node->op == GGML_OP_NONE) { + continue; + } for (const auto & it : node_info.node_inputs) { const auto & src_name = it.first; const auto & src_node = it.second; @@ -164,6 +164,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { if (src->flags & GGML_TENSOR_FLAG_INPUT) { src_name = get_graph_input_ov_name(src, node); } + m_inputs[src_name] = src; current_node_info.node_inputs[src_name] = src; current_node_info.node_inputs_names.push_back(src_name); @@ -193,7 +194,6 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { if (m_model_inputs.find(src_name) != m_model_inputs.end()) { continue; } - m_inputs[src_name] = src; assert(stateful_kv_shape.rank().is_static()); ov::PartialShape param_shape = (stateful_kv_shape.rank().get_length() != 0) ? stateful_kv_shape : get_graph_input_shape(node, src); @@ -264,7 +264,7 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const { } else { op_case = 3; } - } else if (node->src[0]->src[0]->op == GGML_OP_ROPE || node->src[0]->src[0]->src[0]->op == GGML_OP_ROPE) { + } else { // rope'ed query tensor op_case = 4; } @@ -839,6 +839,9 @@ int32_t * GgmlOvDecoder::get_output_op_params(int node_idx) const { void GgmlOvDecoder::visit_subgraph(std::function, int node_idx)> node_visitor) const { for (int node_idx = 0; node_idx < m_cgraph->n_nodes; node_idx++) { + if (m_cgraph->nodes[node_idx]->op == GGML_OP_NONE) { + continue; + } node_visitor(std::make_shared(*this), node_idx); } } diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index efd399fe3f..801d9ad5c4 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -113,8 +113,8 @@ struct ggml_backend_openvino_buffer_context { ~ggml_backend_openvino_buffer_context() { // Clean up all tensor extras - GGML_LOG_DEBUG("Deleting OpenVINO buffer context #%zu for device %d, size %zu MB\n", id, device, - size / 1024 / 1024); + // GGML_LOG_DEBUG("Deleting OpenVINO buffer context #%zu for device %d, size %zu MB\n", id, device, + // size / 1024 / 1024); for (auto & pair : tensor_extras) { delete pair.second; } @@ -454,9 +454,9 @@ static size_t ggml_backend_openvino_buffer_type_get_alloc_size(ggml_backend_buff if (ggml_is_quantized(tensor->type) && tensor->ne[2] == 1 && tensor->ne[3] == 1) { ggml_openvino_extracted_layout layout = ggml_openvino_get_extracted_layout(tensor); if (layout.total_size > 0) { - GGML_LOG_DEBUG("%s: tensor %s needs %zu bytes (original %zu, extracted: weights=%zu scales=%zu zp=%zu)\n", - __func__, tensor->name, layout.total_size, ggml_nbytes(tensor), layout.weights_size, - layout.scales_size, layout.zp_size); + // GGML_LOG_DEBUG("%s: tensor %s needs %zu bytes (original %zu, extracted: weights=%zu scales=%zu zp=%zu)\n", + // __func__, tensor->name, layout.total_size, ggml_nbytes(tensor), layout.weights_size, + // layout.scales_size, layout.zp_size); return layout.total_size; } } @@ -763,8 +763,36 @@ static ggml_backend_buffer_type_t ggml_backend_openvino_device_get_host_buffer_t return ggml_backend_openvino_host_buffer_type(ctx->device); } +static bool has_view_input(const ggml_tensor * op) { + for (int i = 0; i < GGML_MAX_SRC; i++) { + if (op->src[i] == nullptr) { + break; + } + if (op->src[i]->op == GGML_OP_VIEW) { + return true; + } + } + return false; +} + static bool is_op_unsupported_case(const ggml_tensor * op) { switch (op->op) { + case GGML_OP_GET_ROWS: + case GGML_OP_SET_ROWS: { + if (op->ne[3] != 1) { + return true; + } + break; + } + case GGML_OP_ADD: + case GGML_OP_MUL: { + for (int i = 0; i < 4; i++) { + if (op->src[0]->ne[i] != op->src[1]->ne[i] && (op->src[0]->ne[i] != 1 && op->src[1]->ne[i] != 1)) { + return true; + } + } + break; + } case GGML_OP_SOFT_MAX: { if (op->src[2] != nullptr) { GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with sinks\n"); @@ -876,7 +904,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con GGML_TYPE_Q5_K, GGML_TYPE_Q8_0, GGML_TYPE_Q6_K}; static const std::set supported_ops{GGML_OP_NONE, GGML_OP_ADD, GGML_OP_MUL, GGML_OP_MUL_MAT, GGML_OP_VIEW, - GGML_OP_CONT, GGML_OP_RESHAPE, GGML_OP_PERMUTE, GGML_OP_TRANSPOSE, + /*GGML_OP_CONT,*/ GGML_OP_RESHAPE, GGML_OP_PERMUTE, GGML_OP_TRANSPOSE, GGML_OP_GET_ROWS, GGML_OP_ROPE, GGML_OP_RMS_NORM, GGML_OP_SCALE, // softmax is not updated due to replaced by flash_attn_ext // GGML_OP_SOFT_MAX, @@ -896,6 +924,11 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con GGML_LOG_WARN("OpenVINO backend does not support unary op %s\n", ggml_unary_op_name(ggml_get_unary_op(op))); return false; } + if (has_view_input(op)) { + GGML_LOG_WARN("OpenVINO backend does not support unary op %s with view input\n", + ggml_unary_op_name(ggml_get_unary_op(op))); + return false; + } break; } case GGML_OP_GLU: { @@ -904,6 +937,11 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con GGML_LOG_WARN("OpenVINO backend does not support GLU op %s\n", ggml_glu_op_name(ggml_get_glu_op(op))); return false; } + if (has_view_input(op)) { + GGML_LOG_WARN("OpenVINO backend does not support unary op %s with view input\n", + ggml_glu_op_name(ggml_get_glu_op(op))); + return false; + } break; } default: { @@ -912,6 +950,14 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con GGML_LOG_WARN("OpenVINO backend does not support op %s\n", ggml_op_name(op->op)); return false; } + static std::set ops_not_support_view_input{ + GGML_OP_GET_ROWS, + GGML_OP_RMS_NORM, + }; + if (ops_not_support_view_input.find(op->op) != ops_not_support_view_input.end() && has_view_input(op)) { + GGML_LOG_WARN("OpenVINO backend does not support op %s with view input\n", ggml_op_name(op->op)); + return false; + } } } diff --git a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp index d6e7a35534..cdc0ec58b0 100644 --- a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp @@ -34,9 +34,18 @@ OutputVector translate_get_rows(const NodeContext & context) { indices = std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); if (data.get_partial_shape().rank() == 4) { - auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1}); - data = std::make_shared(data, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); - res = std::make_shared(data, indices, axis, 1); + if (data.get_partial_shape()[1].get_length() == 1) { + // Work-around for a bug in ov cpu plugin for test-backend-ops + data = std::make_shared(data, + ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); + auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {0}); + res = std::make_shared(data, indices, axis); + } else { + auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1}); + data = + std::make_shared(data, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); + res = std::make_shared(data, indices, axis, 1); + } } else if (context.is_stateful() && data.get_partial_shape().rank() == 3) { auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1}); res = std::make_shared(data, indices, axis, 1); diff --git a/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp b/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp index 8be9e8deb0..124003911b 100644 --- a/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +++ b/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp @@ -9,7 +9,6 @@ #include #include #include -#include namespace ov { namespace frontend { @@ -25,11 +24,23 @@ OutputVector translate_glu_geglu(const NodeContext & context) { src0 = context.get_input(0); src1 = context.get_input(1); } else { + // GGML splits along ne[0] (OV last axis) using floor division: nc = ne[0] / 2. + // Both halves are nc elements; if the dimension is odd, the last element is dropped. + // Use Slice instead of Split to handle odd dimensions correctly. auto combined = context.get_input(0); - auto split_axis = ov::op::v0::Constant::create(ov::element::i64, {}, {-1}); - auto split = std::make_shared(combined, split_axis, 2); - src0 = split->output(0); - src1 = split->output(1); + auto combined_shape = combined.get_partial_shape(); + int64_t last_dim_val = combined_shape[combined_shape.rank().get_length() - 1].get_length(); + int64_t nc = last_dim_val / 2; + + auto axis = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1}); + auto step = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto start0 = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto stop0 = ov::op::v0::Constant::create(ov::element::i64, {1}, {nc}); + auto start1 = ov::op::v0::Constant::create(ov::element::i64, {1}, {nc}); + auto stop1 = ov::op::v0::Constant::create(ov::element::i64, {1}, {2 * nc}); + + src0 = std::make_shared(combined, start0, stop0, step, axis); + src1 = std::make_shared(combined, start1, stop1, step, axis); } int32_t * params = context.get_output_op_params(); diff --git a/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp index 6e0b85517e..10d1b39438 100644 --- a/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +++ b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp @@ -9,7 +9,6 @@ #include #include #include -#include namespace ov { namespace frontend { @@ -25,11 +24,23 @@ OutputVector translate_glu_swiglu(const NodeContext & context) { src0 = context.get_input(0); src1 = context.get_input(1); } else { + // GGML splits along ne[0] (OV last axis) using floor division: nc = ne[0] / 2. + // Both halves are nc elements; if the dimension is odd, the last element is dropped. + // Use Slice instead of Split to handle odd dimensions correctly. auto combined = context.get_input(0); - auto split_axis = ov::op::v0::Constant::create(ov::element::i64, {}, {-1}); - auto split = std::make_shared(combined, split_axis, 2); - src0 = split->output(0); - src1 = split->output(1); + auto combined_shape = combined.get_partial_shape(); + int64_t last_dim_val = combined_shape[combined_shape.rank().get_length() - 1].get_length(); + int64_t nc = last_dim_val / 2; + + auto axis = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1}); + auto step = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto start0 = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto stop0 = ov::op::v0::Constant::create(ov::element::i64, {1}, {nc}); + auto start1 = ov::op::v0::Constant::create(ov::element::i64, {1}, {nc}); + auto stop1 = ov::op::v0::Constant::create(ov::element::i64, {1}, {2 * nc}); + + src0 = std::make_shared(combined, start0, stop0, step, axis); + src1 = std::make_shared(combined, start1, stop1, step, axis); } int32_t * params = context.get_output_op_params(); diff --git a/ggml/src/ggml-openvino/openvino/op/scale.cpp b/ggml/src/ggml-openvino/openvino/op/scale.cpp index 01e59cedd9..ceb18ddb48 100644 --- a/ggml/src/ggml-openvino/openvino/op/scale.cpp +++ b/ggml/src/ggml-openvino/openvino/op/scale.cpp @@ -2,6 +2,7 @@ #include "../op_table.hpp" #include "../utils.hpp" +#include #include #include #include @@ -15,10 +16,21 @@ OutputVector translate_scale(const NodeContext & context) { num_inputs_check(context, 1, 1); float scale; - memcpy(&scale, context.get_output_op_params(), sizeof(float)); - auto scale_node = std::make_shared(ov::element::f32, ov::Shape{}, std::vector{scale}); + float bias; + memcpy(&scale, (float *) context.get_output_op_params() + 0, sizeof(float)); + memcpy(&bias, (float *) context.get_output_op_params() + 1, sizeof(float)); - auto res = std::make_shared(context.get_input(0), scale_node); + auto scale_node = std::make_shared(ov::element::f32, ov::Shape{}, std::vector{scale}); + auto scaled = std::make_shared(context.get_input(0), scale_node); + + std::shared_ptr res; + if (bias != 0.0f) { + auto bias_node = + std::make_shared(ov::element::f32, ov::Shape{}, std::vector{bias}); + res = std::make_shared(scaled, bias_node); + } else { + res = scaled; + } return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 55ea4eb355..a370043dd7 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -59,9 +59,9 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin static auto is_static = false; static size_t stateful_kv_size = 0; - // if (is_naive(cgraph)) { - // return naive_compute(cgraph, core, device, config); - // } + if (is_naive(cgraph)) { + return naive_compute(cgraph, core, device, config); + } auto start_time = ggml_time_us(); @@ -438,7 +438,13 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) { bool is_naive(ggml_cgraph * cgraph) { constexpr int naive_graph_size_threshold = 20; - return cgraph->n_nodes < naive_graph_size_threshold; + int count = 0; + for (int i = 0; i < cgraph->n_nodes; i++) { + if (cgraph->nodes[i]->op != GGML_OP_NONE) { + count++; + } + } + return count < naive_graph_size_threshold; } enum ggml_status naive_compute(ggml_cgraph * cgraph,