From 40893e58c69238465563405e9b78daf228cc310d Mon Sep 17 00:00:00 2001 From: Hongrui Chen Date: Tue, 7 Oct 2025 22:44:02 +0800 Subject: [PATCH] Squashed commit of the following: commit 8e0c6ec42a6436e037a7cc43418fa50baa130ca2 Author: Hongrui Chen Date: Tue Oct 7 21:56:37 2025 +0800 wip commit 30186bb8894c620168797d4d42261d555a27eed6 Author: Hongrui Chen Date: Tue Oct 7 21:46:34 2025 +0800 wip commit 3e75f5dc1dd6e64e0db2624f2e5e894f46317eed Author: Hongrui Chen Date: Tue Oct 7 21:45:14 2025 +0800 fix comment commit fe1090c8181fac9be935d606325d406a34b78a11 Author: Hongrui Chen Date: Tue Oct 7 21:36:53 2025 +0800 revert changes at tester commit ac0c2a4022e8fb300f66677edc181aec49faf171 Author: Hongrui Chen Date: Tue Oct 7 20:33:54 2025 +0800 try enable mul thread in rope commit 9f8ca968c28c2e320a03ffbe8adeb54f27811f21 Author: Hongrui Chen Date: Tue Oct 7 20:13:13 2025 +0800 disable multi thread at rope commit eed97ca12a5c35f5697cd2f4796611915acf47d8 Author: chraac Date: Tue Oct 7 16:35:28 2025 +0800 add tests commit c3ad7229bf1e5fb33ec119f6ddc5a63fe342b54f Author: chraac Date: Tue Oct 7 16:35:19 2025 +0800 wip --- ggml/src/ggml-qnn/npu/device/op/op_rope.cpp | 49 +++++---------------- 1 file changed, 12 insertions(+), 37 deletions(-) diff --git a/ggml/src/ggml-qnn/npu/device/op/op_rope.cpp b/ggml/src/ggml-qnn/npu/device/op/op_rope.cpp index deba9800f1..1edcf16480 100644 --- a/ggml/src/ggml-qnn/npu/device/op/op_rope.cpp +++ b/ggml/src/ggml-qnn/npu/device/op/op_rope.cpp @@ -187,11 +187,12 @@ bool rope_impl(hexagon::tensor * out, hexagon::compute_params * params) { freq_factors = src2->get_read_buffer_as(); } - const int64_t total_rows = out->get_ne(3) * out->get_ne(2) * out->get_ne(1); - const auto start_end_row = params->get_work_slice(total_rows); + const int64_t total_planes = out->get_ne(3) * out->get_ne(2); const auto start_end_plane = - std::pair{ start_end_row.first / out->get_ne(1), - (start_end_row.second + out->get_ne(1) - 1) / out->get_ne(1) }; + params->get_work_slice(total_planes); // TODO: figure out how to use row slice for inplace rope + if (start_end_plane.first >= start_end_plane.second) { + return true; + } DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_MULTI_SUB_PROC(out, params->get_thread_index(), rope); @@ -206,46 +207,22 @@ bool rope_impl(hexagon::tensor * out, hexagon::compute_params * params) { if constexpr (!_IsMrope) { DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(rope, 0, cache); const int64_t p = pos[i2]; - rope_cache_init(p, - freq_scale, - freq_factors, - corr_dims, - out->get_ne(0), - ext_factor, - attn_factor, - cache, - sin_sign, - theta_scale); + rope_cache_init(p, freq_scale, freq_factors, corr_dims, out->get_ne(0), ext_factor, attn_factor, cache, + sin_sign, theta_scale); } else { DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(rope, 0, cache); const int64_t p_t = pos[i2]; const int64_t p_h = pos[i2 + out->get_ne(2)]; const int64_t p_w = pos[i2 + out->get_ne(2) * 2]; const int64_t p_e = pos[i2 + out->get_ne(2) * 3]; - mrope_cache_init(p_t, - p_h, - p_w, - p_e, - sections, - _IsVision, - freq_scale, - freq_factors, - corr_dims, - out->get_ne(0), - ext_factor, - attn_factor, - cache, - sin_sign, - theta_scale); + mrope_cache_init(p_t, p_h, p_w, p_e, sections, _IsVision, freq_scale, freq_factors, corr_dims, + out->get_ne(0), ext_factor, attn_factor, cache, sin_sign, theta_scale); } DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(rope, 1, loop); const uint8_t * src0_plane = src0_data_ptr + i3 * src0->get_nb(3) + i2 * src0->get_nb(2); uint8_t * dst_plane = dst_data_ptr + i3 * out->get_nb(3) + i2 * out->get_nb(2); - const int64_t start_row = ip == start_end_plane.first ? (start_end_row.first % out->get_ne(1)) : 0; - const int64_t end_row = ip == start_end_plane.second ? (start_end_row.second % out->get_ne(1)) : - out->get_ne(1); // end row is exclusive - for (int64_t i1 = start_row; i1 < end_row; i1++) { // attn-heads + for (int64_t i1 = 0; i1 < out->get_ne(1); i1++) { // attn-heads const uint8_t * src0_row = src0_plane + i1 * src0->get_nb(1); uint8_t * dst_row = dst_plane + i1 * out->get_nb(1); if constexpr (_IsNeoX || _IsMrope) { @@ -385,10 +362,8 @@ bool is_rope_supported(const npu_device_tensor_op_spec * op_spec, const auto & src0 = srcs[0]; if (src0.type != dst->type) { - DEVICE_LOG_DEBUG("[%s]src0 type is not the same as dst type: %s vs %s\n", - op_get_name(op), - get_type_name(src0.type), - get_type_name(dst->type)); + DEVICE_LOG_DEBUG("[%s]src0 type is not the same as dst type: %s vs %s\n", op_get_name(op), + get_type_name(src0.type), get_type_name(dst->type)); return false; // unsupported src0 type }