From 67e3f6f60155870d4b5ce727515aa81e5a7b4753 Mon Sep 17 00:00:00 2001
From: Chenguang Li <757486878@qq.com>
Date: Mon, 5 Jan 2026 15:38:18 +0800
Subject: [PATCH] CANN: add operator fusion support for ADD + RMS_NORM (#17512)

This commit implements operator fusion for ADD + RMS_NORM operations in
the CANN backend to reduce memory access overhead and improve
performance. The fusion is controlled by the GGML_CANN_OPERATOR_FUSION
environment variable (default: false).

Changes:
- Implement ggml_cann_op_add_rms_norm_fused() using ACLNN AddRmsNorm
- Add ggml_cann_can_fuse() to check fusion eligibility
- Integrate fusion logic into computation graph evaluation
- Add test cases for ADD + RMS_NORM fusion
- Update documentation with new environment variable

The fusion combines ADD and RMS_NORM into a single kernel call, which
is more efficient than executing them separately.
---
 docs/backend/CANN.md             |  4 +++
 ggml/src/ggml-cann/aclnn_ops.cpp | 55 ++++++++++++++++++++++++++++
 ggml/src/ggml-cann/aclnn_ops.h   | 14 ++++++++
 ggml/src/ggml-cann/ggml-cann.cpp | 44 +++++++++++++++++++++++
 tests/test-backend-ops.cpp       | 62 ++++++++++++++++++++++++++++++++
 5 files changed, 179 insertions(+)

diff --git a/docs/backend/CANN.md b/docs/backend/CANN.md
index 37dcfaef9a..b03c2a122c 100755
--- a/docs/backend/CANN.md
+++ b/docs/backend/CANN.md
@@ -327,3 +327,7 @@ Maximum number of compiled CANN graphs kept in the LRU cache, default is 12. Whe
 ### GGML_CANN_PREFILL_USE_GRAPH
 
 Enable ACL graph execution during the prefill stage, default is false. This option is only effective when FA is enabled.
+
+### GGML_CANN_OPERATOR_FUSION
+
+Enable operator fusion during computation, default is false. This option fuses compatible operators (e.g., ADD + RMS_NORM) to reduce overhead and improve performance.
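
For reference, the fused kernel computes rms_norm(x1 + x2) in a single pass over the data instead of writing the ADD result to memory and reading it back for the RMS_NORM. A minimal scalar sketch of the per-row semantics follows (illustrative only, not part of the patch; the helper name add_rms_norm_row_ref is hypothetical):

    #include <cmath>
    #include <cstddef>

    // Per-row reference for fused ADD + RMS_NORM:
    //   x_out = x1 + x2
    //   y     = x_out / sqrt(mean(x_out^2) + eps) * gamma
    static void add_rms_norm_row_ref(const float * x1, const float * x2, const float * gamma,
                                     float * x_out, float * y, size_t n, float eps) {
        double sum_sq = 0.0;
        for (size_t i = 0; i < n; i++) {
            x_out[i] = x1[i] + x2[i];                 // ADD result (also an output)
            sum_sq  += (double) x_out[i] * x_out[i];
        }
        // rstd is the reciprocal RMS; the ACLNN op exposes it as rstdOut.
        const float rstd = 1.0f / std::sqrt((float) (sum_sq / (double) n) + eps);
        for (size_t i = 0; i < n; i++) {
            y[i] = x_out[i] * rstd * gamma[i];        // RMS_NORM with gamma scaling
        }
    }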
diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp
index 2180a06fd0..50b6bd00e4 100644
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -26,6 +26,7 @@
 #include "ggml.h"
 
 #include <...>
+#include <aclnnop/aclnn_add_rms_norm.h>
 #include <...>
 #include <...>
 #include <...>
@@ -3805,3 +3806,57 @@ void ggml_cann_ssm_conv(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
         cubeMathType);
 }
 
+void ggml_cann_op_add_rms_norm_fused(ggml_backend_cann_context & ctx,
+                                     ggml_tensor *               add_node,
+                                     ggml_tensor *               rms_norm_node) {
+    // Get the two input tensors for the ADD operation
+    ggml_tensor * x1 = add_node->src[0];
+    ggml_tensor * x2 = add_node->src[1];
+
+    // Create ACL tensors for the two ADD inputs
+    acl_tensor_ptr acl_x1 = ggml_cann_create_tensor(x1);
+    acl_tensor_ptr acl_x2 = ggml_cann_create_tensor(x2);
+
+    // Get the epsilon parameter from the RMS_NORM node
+    float eps;
+    memcpy(&eps, rms_norm_node->op_params, sizeof(float));
+
+    // Build gamma tensor (RMS normalization scaling factor)
+    // Gamma should match the normalized dimension (the last dimension of x1)
+    size_t acl_gamma_nb[GGML_MAX_DIMS];
+    acl_gamma_nb[0] = ggml_type_size(rms_norm_node->type);
+    for (int i = 1; i < GGML_MAX_DIMS; i++) {
+        acl_gamma_nb[i] = acl_gamma_nb[i - 1] * x1->ne[i - 1];
+    }
+    acl_tensor_ptr acl_gamma =
+        get_cache_acl_tensor(ctx, &ctx.rms_norm_one_tensor_cache.cache, ctx.rms_norm_one_tensor_cache.size, x1->ne,
+                             acl_gamma_nb, rms_norm_node->type,
+                             1,    // dims - only the last dimension
+                             1.0f  // value
+        );
+
+    // Build rstdOut tensor (holds the reciprocal RMS value per row)
+    // Shape should be the dimensions that are NOT normalized
+    int64_t acl_rstd_ne[] = { 1, x1->ne[1], x1->ne[2], x1->ne[3] };
+    size_t  acl_rstd_nb[GGML_MAX_DIMS];
+    acl_rstd_nb[0] = sizeof(float);
+    for (int i = 1; i < GGML_MAX_DIMS; i++) {
+        acl_rstd_nb[i] = acl_rstd_nb[i - 1] * acl_rstd_ne[i - 1];
+    }
+    acl_tensor_ptr acl_rstd =
+        get_cache_acl_tensor(ctx, &ctx.rms_norm_zero_tensor_cache.cache, ctx.rms_norm_zero_tensor_cache.size,
+                             acl_rstd_ne, acl_rstd_nb, GGML_TYPE_F32, GGML_MAX_DIMS,
+                             0.0f  // value
+        );
+
+    // Create xOut tensor (the ADD result, written back to add_node)
+    acl_tensor_ptr acl_xout = ggml_cann_create_tensor(add_node);
+
+    // Create yOut tensor (final output after RMS normalization)
+    acl_tensor_ptr acl_yout = ggml_cann_create_tensor(rms_norm_node);
+
+    // Call the fused ADD + RMS_NORM operator
+    GGML_CANN_CALL_ACLNN_OP(ctx, AddRmsNorm, acl_x1.get(), acl_x2.get(), acl_gamma.get(),
+                            eps,  // epsilon (a double in the ACLNN API)
+                            acl_yout.get(), acl_rstd.get(), acl_xout.get());
+}
diff --git a/ggml/src/ggml-cann/aclnn_ops.h b/ggml/src/ggml-cann/aclnn_ops.h
index a6ea016c54..08ee7b1fbd 100644
--- a/ggml/src/ggml-cann/aclnn_ops.h
+++ b/ggml/src/ggml-cann/aclnn_ops.h
@@ -935,6 +935,20 @@ template <typename... Args> void register_acl_resources(std::vector<...
     ...stream()));
 }
 
+/**
+ * @brief Perform the fused ADD + RMS_NORM operation.
+ *
+ * Computes RMS_NORM(add_node->src[0] + add_node->src[1]) with a single
+ * ACLNN AddRmsNorm kernel call, writing the ADD result back to add_node
+ * and the normalized result to rms_norm_node.
+ *
+ * @param ctx The CANN backend context.
+ * @param add_node The ADD node in the computation graph.
+ * @param rms_norm_node The RMS_NORM node that consumes the ADD result.
+ */
+void ggml_cann_op_add_rms_norm_fused(ggml_backend_cann_context & ctx,
+                                     ggml_tensor *               add_node,
+                                     ggml_tensor *               rms_norm_node);
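
To make the auxiliary tensors above concrete: for an F32 input with ne = {64, 5, 4, 3} (ggml order, ne[0] is the innermost, normalized dimension), gamma is a cached vector of 64 ones and rstd holds one reciprocal-RMS value per row, i.e. ne = {1, 5, 4, 3}. A standalone sketch of the contiguous-stride computation mirrored from the loops above (the shape is an assumed example; this is not backend code):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t ne[4] = { 64, 5, 4, 3 };            // example input shape (ggml order)

        int64_t rstd_ne[4] = { 1, ne[1], ne[2], ne[3] };  // one rstd value per row
        size_t  rstd_nb[4];
        rstd_nb[0] = sizeof(float);                       // innermost stride in bytes
        for (int i = 1; i < 4; i++) {
            rstd_nb[i] = rstd_nb[i - 1] * rstd_ne[i - 1]; // contiguous byte strides
        }
        // Prints: rstd nb = {4, 4, 20, 80} -> 5*4*3 = 60 rstd values in total.
        printf("rstd nb = {%zu, %zu, %zu, %zu}\n", rstd_nb[0], rstd_nb[1], rstd_nb[2], rstd_nb[3]);
        return 0;
    }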
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -2063,3 +2063,38 @@
+
+/**
+ * @brief Check if the CANN backend can fuse the specified operation sequence.
+ *
+ * This function determines whether an operation sequence starting from the specified node
+ * can be fused into an optimized operation in the CANN backend. Operator fusion can reduce
+ * memory access overhead and improve computational efficiency.
+ *
+ * @param cgraph Pointer to the computation graph
+ * @param node_idx Index of the starting node in the computation graph
+ * @param ops Sequence of operation types to check for fusion
+ * @return true if the operations can be fused
+ * @return false if the operations cannot be fused
+ */
+static bool ggml_cann_can_fuse(const struct ggml_cgraph * cgraph,
+                               int node_idx,
+                               std::initializer_list<enum ggml_op> ops) {
+    if (!ggml_can_fuse(cgraph, node_idx, ops)) {
+        return false;
+    }
+
+    // The CANN backend supports fusing ADD + RMS_NORM operations
+    if ((ops.size() == 2) && ops.begin()[0] == GGML_OP_ADD && ops.begin()[1] == GGML_OP_RMS_NORM) {
+        ggml_tensor * add_node = cgraph->nodes[node_idx];
+        // TODO: support broadcast for ADD + RMS_NORM
+        if (add_node->src[0]->ne[0] != add_node->src[1]->ne[0] || add_node->src[0]->ne[1] != add_node->src[1]->ne[1] ||
+            add_node->src[0]->ne[2] != add_node->src[1]->ne[2] || add_node->src[0]->ne[3] != add_node->src[1]->ne[3]) {
+            return false;
+        }
+        return true;
+    }
+
+    return false;
+}
+
 /**
  * @brief Evaluate the computation graph and optionally capture or execute it using CANN graph API.
  *
@@ -2101,9 +2136,18 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx
 #endif  // USE_ACL_GRAPH
 
     // Only perform the graph execution if CANN graphs are not enabled, or we are capturing the graph.
    // With the use of CANN graphs, the execution will be performed by the graph launch.
+    static bool opt_fusion = parse_bool(get_env("GGML_CANN_OPERATOR_FUSION").value_or(""));
+
     if (!use_cann_graph || cann_graph_capture_required) {
         for (int i = 0; i < cgraph->n_nodes; i++) {
             ggml_tensor * node = cgraph->nodes[i];
+            if (opt_fusion) {
+                if (ggml_cann_can_fuse(cgraph, i, { GGML_OP_ADD, GGML_OP_RMS_NORM })) {
+                    ggml_cann_op_add_rms_norm_fused(*cann_ctx, node, cgraph->nodes[i + 1]);
+                    i++;
+                    continue;
+                }
+            }
             if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE ||
                 node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
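
One detail worth noting in the hunk above: opt_fusion is a function-local static, so GGML_CANN_OPERATOR_FUSION is sampled once, on the first graph evaluation, and cannot flip mid-run. A self-contained sketch of the equivalent behavior (get_env_sketch and parse_bool_sketch are illustrative stand-ins for the backend's own get_env()/parse_bool() helpers; the accepted truthy spellings here are an assumption):

    #include <cstdlib>
    #include <optional>
    #include <string>

    // Stand-in for the backend's get_env(): wrap getenv() in std::optional.
    static std::optional<std::string> get_env_sketch(const char * name) {
        const char * v = std::getenv(name);
        return v ? std::optional<std::string>(v) : std::nullopt;
    }

    // Stand-in for parse_bool(): accept common truthy spellings.
    static bool parse_bool_sketch(const std::string & v) {
        return v == "1" || v == "on" || v == "yes" || v == "true";
    }

    void evaluate_graph_sketch() {
        // Function-local static: initialized on the first call only, so
        // toggling the environment variable later has no effect.
        static const bool opt_fusion =
            parse_bool_sketch(get_env_sketch("GGML_CANN_OPERATOR_FUSION").value_or(""));
        (void) opt_fusion;
    }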
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 76abfdaf0a..fa6e80e3fc 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -3431,6 +3431,65 @@ struct test_rms_norm_mul_add : public test_case {
     }
 };
 
+// GGML_OP_ADD + GGML_OP_RMS_NORM (fused operation)
+struct test_add_rms_norm : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+    const float eps;
+    const bool broadcast;
+
+    std::string op_desc(ggml_tensor * t) override {
+        GGML_UNUSED(t);
+        return "ADD_RMS_NORM";
+    }
+
+    bool run_whole_graph() override { return true; }
+
+    std::string vars() override {
+        return VARS_TO_STR4(type, ne, eps, broadcast);
+    }
+
+    test_add_rms_norm(ggml_type type = GGML_TYPE_F32,
+                      std::array<int64_t, 4> ne = {64, 5, 4, 3},
+                      float eps = 1e-6f, bool broadcast = false)
+        : type(type), ne(ne), eps(eps), broadcast(broadcast) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        std::array<int64_t, 4> broadcast_dims = {ne[0]*2, ne[1]*3, ne[2]*3, ne[3]*4};
+
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, broadcast ? broadcast_dims.data() : ne.data());
+        ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
+
+        ggml_set_param(a);
+        ggml_set_name(a, "a");
+        ggml_set_param(b);
+        ggml_set_name(b, "b");
+
+        // ADD operation followed by RMS_NORM
+        ggml_tensor * add_result = ggml_add(ctx, a, b);
+        ggml_set_name(add_result, "add_result");
+
+        ggml_tensor * out = ggml_rms_norm(ctx, add_result, eps);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            init_tensor_uniform(t, -10.f, 10.f);
+        }
+    }
+
+    float grad_eps() override {
+        return 1.0f;
+    }
+
+    bool grad_precise() override {
+        return true;
+    }
+};
+
 // GGML_OP_SSM_CONV
 struct test_ssm_conv : public test_case {
     const ggml_type type;
@@ -7393,11 +7452,14 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
         test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps, true));
         test_cases.emplace_back(new test_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps, false));
         test_cases.emplace_back(new test_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps, true));
+        test_cases.emplace_back(new test_add_rms_norm(GGML_TYPE_F32, {64, 5, 4, 3}, eps, false));
+        test_cases.emplace_back(new test_add_rms_norm(GGML_TYPE_F32, {64, 5, 4, 3}, eps, true));
     }
     for (uint32_t n : {1, 511, 1025, 8192, 33*512}) {
         for (bool multi_add : {false, true}) {
             test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, {n, 1, 1, 1}, 1e-6f, false, multi_add));
         }
+        test_cases.emplace_back(new test_add_rms_norm(GGML_TYPE_F32, {n, 1, 1, 1}, 1e-6f, false));
     }
 
     for (auto multi_add : {false, true}) {