diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp
index 8287cadeae..533b127b48 100644
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -25,6 +25,17 @@
 #include "ggml-impl.h"
 #include "ggml.h"
 
+// Forward-declare InplaceFillDiagonal because aclnn_fill_diagonal.h has a
+// broken include guard (OP_API_INC_ADD_H_) that conflicts with aclnn_add.h.
+extern "C" {
+aclnnStatus aclnnInplaceFillDiagonalGetWorkspaceSize(
+    aclTensor * selfRef, const aclScalar * fillValue, bool wrap,
+    uint64_t * workspaceSize, aclOpExecutor ** executor);
+aclnnStatus aclnnInplaceFillDiagonal(
+    void * workspace, uint64_t workspaceSize, aclOpExecutor * executor,
+    aclrtStream stream);
+}
+
 #include
 #include
 #include
@@ -75,6 +86,8 @@
 #include
 #include
 #include
+#include  // FIXME(review): include target lost (angle brackets stripped in paste) — restore from upstream
+#include  // FIXME(review): include target lost (angle brackets stripped in paste) — restore from upstream
 #include
 #include
 #include
@@ -670,6 +683,107 @@ void ggml_cann_sum(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
     aclnn_reduce_sum(ctx, dst, reduce_dims, 4);
 }
 
+void ggml_cann_cumsum(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+    ggml_tensor * src = dst->src[0];
+    acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
+    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
+    // GGML cumsum operates along dim 0 (innermost / ne[0]).
+    // ggml_cann_create_tensor reverses dimensions to [ne3,ne2,ne1,ne0],
+    // so GGML dim 0 maps to CANN dim 3 (the last dim of the 4-D tensor).
+    GGML_CANN_CALL_ACLNN_OP(ctx, Cumsum, acl_src.get(), (int64_t)3,
+                            ggml_cann_type_mapping(dst->type), acl_dst.get());
+}
+
+void ggml_cann_tri(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+    ggml_tensor * src = dst->src[0];
+
+    const int64_t S       = src->ne[0];
+    const int64_t n_batch = src->ne[2] * src->ne[3];
+    const size_t  nb_f32  = sizeof(float);
+    const size_t  nb_bool = sizeof(uint8_t);
+    const size_t  buf_sz  = n_batch * S * S * nb_f32;
+    const size_t  bool_sz = n_batch * S * S * nb_bool;
+
+    int64_t ne3d[3]      = { S, S, n_batch };
+    size_t  nb3d[3]      = { nb_f32, S * nb_f32, S * S * nb_f32 };
+    size_t  nb3d_bool[3] = { nb_bool, S * nb_bool, S * S * nb_bool };
+
+    const ggml_tri_type ttype = (ggml_tri_type) ggml_get_op_params_i32(dst, 0);
+
+    acl_tensor_ptr acl_src = ggml_cann_create_tensor(src->data, ACL_FLOAT, nb_f32, ne3d, nb3d, 3);
+    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst->data, ACL_FLOAT, nb_f32, ne3d, nb3d, 3);
+
+    // LOWER: Tril(-1) directly gives strict-lower triangle (CANN dim reversal
+    // makes Tril(-1) equivalent to GGML's col < row).
+    if (ttype == GGML_TRI_TYPE_LOWER) {
+        GGML_CANN_CALL_ACLNN_OP(ctx, Tril, acl_src.get(), (int64_t)-1, acl_dst.get());
+        return;
+    }
+
+    // For other types: copy src→dst, build a BOOL mask of positions to zero,
+    // then use MaskedFillScalar to zero those positions.
+    GGML_CANN_CALL_ACLNN_OP(ctx, InplaceCopy, acl_dst.get(), acl_src.get());
+
+    // Build lower-strict float mask (1s below diagonal, 0s elsewhere).
+    ggml_cann_pool_alloc ones_alloc(ctx.pool(), buf_sz);
+    void * ones_buf = ones_alloc.get();
+    acl_tensor_ptr acl_ones = ggml_cann_create_tensor(ones_buf, ACL_FLOAT, nb_f32, ne3d, nb3d, 3);
+    {
+        float one_val = 1.0f;
+        acl_scalar_ptr acl_one = ggml_cann_create_scalar(&one_val, ACL_FLOAT);
+        GGML_CANN_CALL_ACLNN_OP(ctx, InplaceFillScalar, acl_ones.get(), acl_one.get());
+    }
+
+    ggml_cann_pool_alloc mask_f_alloc(ctx.pool(), buf_sz);
+    void * mask_f_buf = mask_f_alloc.get();
+    acl_tensor_ptr acl_mask_f = ggml_cann_create_tensor(mask_f_buf, ACL_FLOAT, nb_f32, ne3d, nb3d, 3);
+    GGML_CANN_CALL_ACLNN_OP(ctx, Tril, acl_ones.get(), (int64_t)-1, acl_mask_f.get());
+
+    // For LOWER_DIAG and UPPER: extend mask to include diagonal via strided
+    // diagonal view copy (Tril(0) is buggy on CANN, giving same result as Tril(-1)).
+    if (ttype == GGML_TRI_TYPE_LOWER_DIAG || ttype == GGML_TRI_TYPE_UPPER) {
+        int64_t ne_diag[2] = { S, n_batch };
+        size_t  nb_diag[2] = { (S + 1) * nb_f32, S * S * nb_f32 };
+        acl_tensor_ptr acl_ones_diag = ggml_cann_create_tensor(ones_buf, ACL_FLOAT, nb_f32, ne_diag, nb_diag, 2);
+        acl_tensor_ptr acl_mask_diag = ggml_cann_create_tensor(mask_f_buf, ACL_FLOAT, nb_f32, ne_diag, nb_diag, 2);
+        GGML_CANN_CALL_ACLNN_OP(ctx, InplaceCopy, acl_mask_diag.get(), acl_ones_diag.get());
+    }
+
+    // Cast float mask to BOOL.
+    ggml_cann_pool_alloc mask_b_alloc(ctx.pool(), bool_sz);
+    void * mask_b_buf = mask_b_alloc.get();
+    acl_tensor_ptr acl_mask_b = ggml_cann_create_tensor(mask_b_buf, ACL_BOOL, nb_bool, ne3d, nb3d_bool, 3);
+    GGML_CANN_CALL_ACLNN_OP(ctx, Cast, acl_mask_f.get(), ACL_BOOL, acl_mask_b.get());
+
+    // Select which BOOL mask to pass to MaskedFillScalar (True positions get zeroed).
+    // LOWER_DIAG: invert lower_diag → upper_strict mask.
+    // UPPER_DIAG: use lower_strict mask directly.
+    // UPPER: use lower_diag mask directly.
+    ggml_cann_pool_alloc mask_inv_alloc(ctx.pool(), bool_sz);
+    void * mask_inv_buf = mask_inv_alloc.get();
+    acl_tensor_ptr acl_mask_inv = ggml_cann_create_tensor(mask_inv_buf, ACL_BOOL, nb_bool, ne3d, nb3d_bool, 3);
+
+    aclTensor * fill_mask = nullptr;
+    switch (ttype) {
+        case GGML_TRI_TYPE_LOWER_DIAG:
+            GGML_CANN_CALL_ACLNN_OP(ctx, LogicalNot, acl_mask_b.get(), acl_mask_inv.get());
+            fill_mask = acl_mask_inv.get();
+            break;
+        case GGML_TRI_TYPE_UPPER_DIAG:
+            fill_mask = acl_mask_b.get();
+            break;
+        case GGML_TRI_TYPE_UPPER:
+            fill_mask = acl_mask_b.get();
+            break;
+        default:
+            GGML_ABORT("unsupported tri type");
+    }
+
+    float zero_val = 0.0f;
+    acl_scalar_ptr acl_zero = ggml_cann_create_scalar(&zero_val, ACL_FLOAT);
+    GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMaskedFillScalar, acl_dst.get(), fill_mask, acl_zero.get());
+}
+
 void ggml_cann_upsample_nearest2d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
     ggml_tensor * src = dst->src[0];
     acl_tensor_ptr acl_src = ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
diff --git a/ggml/src/ggml-cann/aclnn_ops.h b/ggml/src/ggml-cann/aclnn_ops.h
index 19d1d65bf0..3926525a95 100644
--- a/ggml/src/ggml-cann/aclnn_ops.h
+++ b/ggml/src/ggml-cann/aclnn_ops.h
@@ -32,6 +32,9 @@
 #include
 #include
 #include
+#include  // FIXME(review): include target lost (angle brackets stripped in paste) — restore from upstream
+#include  // FIXME(review): include target lost (angle brackets stripped in paste) — restore from upstream
+#include  // FIXME(review): include target lost (angle brackets stripped in paste) — restore from upstream
 #include
 #include
 #include
@@ -325,6 +328,24 @@
 void ggml_cann_sum_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 
 void ggml_cann_sum(ggml_backend_cann_context & ctx, ggml_tensor * dst);
 
+/**
+ * @brief Computes the cumulative sum of a ggml tensor along dim 0 using the
+ * CANN backend.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor. dst->op is `GGML_OP_CUMSUM`.
+ */
+void ggml_cann_cumsum(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
+/**
+ * @brief Computes a triangular mask (tril/triu) of a square ggml tensor
+ * using the CANN backend.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor. dst->op is `GGML_OP_TRI`.
+ */
+void ggml_cann_tri(ggml_backend_cann_context & ctx, ggml_tensor * dst);
+
 /**
  * @brief Upsamples a ggml tensor using nearest neighbor interpolation using
  * the CANN backend.
diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h
index 5f960548cd..58067142f6 100644
--- a/ggml/src/ggml-cann/common.h
+++ b/ggml/src/ggml-cann/common.h
@@ -277,10 +277,7 @@ struct ggml_graph_node_properties {
             }
         }
 
-        if (node->op == GGML_OP_SCALE || node->op == GGML_OP_UNARY || node->op == GGML_OP_GLU || node->op == GGML_OP_ROPE){
-            return memcmp(this->op_params, node->op_params, GGML_MAX_OP_PARAMS) == 0;
-        }
-        return true;
+        return memcmp(this->op_params, node->op_params, GGML_MAX_OP_PARAMS) == 0;
     }
 };
 
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index 2f87c649b9..5e3d7d53f3 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -1908,6 +1908,12 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context & ctx, struct ggml_tensor * dst) {
         case GGML_OP_GATED_DELTA_NET:
             ggml_cann_gated_delta_net(ctx, dst);
             break;
+        case GGML_OP_CUMSUM:
+            ggml_cann_cumsum(ctx, dst);
+            break;
+        case GGML_OP_TRI:
+            ggml_cann_tri(ctx, dst);
+            break;
         default:
             return false;
     }
@@ -2591,6 +2597,10 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
                    && ggml_is_contiguous(beta)
                    && q->type == GGML_TYPE_F32;
             }
+        case GGML_OP_CUMSUM:
+            return op->src[0]->type == GGML_TYPE_F32;
+        case GGML_OP_TRI:
+            return op->src[0]->type == GGML_TYPE_F32;
         default:
             return false;
     }