diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index a950475fc3..a4510a771c 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -25,6 +25,17 @@ #include "ggml-impl.h" #include "ggml.h" +// Forward-declare InplaceFillDiagonal because aclnn_fill_diagonal.h has a +// broken include guard (OP_API_INC_ADD_H_) that conflicts with aclnn_add.h. +extern "C" { +aclnnStatus aclnnInplaceFillDiagonalGetWorkspaceSize( + aclTensor * selfRef, const aclScalar * fillValue, bool wrap, + uint64_t * workspaceSize, aclOpExecutor ** executor); +aclnnStatus aclnnInplaceFillDiagonal( + void * workspace, uint64_t workspaceSize, aclOpExecutor * executor, + aclrtStream stream); +} + #include #include #include @@ -62,6 +73,7 @@ #include #include #include +#include #include #include #include @@ -73,7 +85,10 @@ #include #include #include +#include #include +#include +#include #include #include #include @@ -589,6 +604,33 @@ void ggml_cann_group_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst) { acl_mean_out.get(), acl_rstd_out.get()); } +void ggml_cann_set(ggml_backend_cann_context & ctx, ggml_tensor * dst) { + ggml_tensor * src0 = dst->src[0]; + ggml_tensor * src1 = dst->src[1]; + + size_t nb1 = ((int32_t *) dst->op_params)[0]; + size_t nb2 = ((int32_t *) dst->op_params)[1]; + size_t nb3 = ((int32_t *) dst->op_params)[2]; + size_t offset = ((int32_t *) dst->op_params)[3]; + bool inplace = (bool) ((int32_t *) dst->op_params)[4]; + + size_t param_nb[] = { ggml_element_size(src0), nb1, nb2, nb3 }; + + // Create a view of dst at the target offset with src1's dimensions + acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, src1->ne, param_nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset); + acl_tensor_ptr acl_src1 = ggml_cann_create_tensor(src1); + + if (!inplace) { + // First copy src0 to dst entirely + size_t cpy_size = ggml_nbytes(dst); + ACL_CHECK( + aclrtMemcpyAsync(dst->data, cpy_size, src0->data, cpy_size, ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream())); + } + + // Copy src1 into the target region of dst + GGML_CANN_CALL_ACLNN_OP(ctx, InplaceCopy, acl_dst.get(), acl_src1.get()); +} + void ggml_cann_acc(ggml_backend_cann_context & ctx, ggml_tensor * dst) { ggml_tensor * src0 = dst->src[0]; ggml_tensor * src1 = dst->src[1]; @@ -652,6 +694,166 @@ void ggml_cann_sum(ggml_backend_cann_context & ctx, ggml_tensor * dst) { aclnn_reduce_sum(ctx, dst, reduce_dims, 4); } +void ggml_cann_cumsum(ggml_backend_cann_context & ctx, ggml_tensor * dst) { + ggml_tensor * src = dst->src[0]; + acl_tensor_ptr acl_src = ggml_cann_create_tensor(src); + acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst); + // GGML cumsum operates along dim 0 (innermost / ne[0]). + // ggml_cann_create_tensor reverses dimensions to [ne3,ne2,ne1,ne0], + // so GGML dim 0 maps to CANN dim 3 (the last dim of the 4-D tensor). + GGML_CANN_CALL_ACLNN_OP(ctx, Cumsum, acl_src.get(), (int64_t)3, + ggml_cann_type_mapping(dst->type), acl_dst.get()); +} + +void ggml_cann_solve_tri(ggml_backend_cann_context & ctx, ggml_tensor * dst) { + ggml_tensor * src0 = dst->src[0]; // A: [N, N, B2, B3] lower triangular + ggml_tensor * src1 = dst->src[1]; // B: [K, N, B2, B3] + + acl_tensor_ptr acl_a = ggml_cann_create_tensor(src0); + acl_tensor_ptr acl_b = ggml_cann_create_tensor(src1); + acl_tensor_ptr acl_x = ggml_cann_create_tensor(dst); + + // mOut: triangular copy of A (required output), same shape as A. + const size_t a_bytes = ggml_nbytes(src0); + ggml_cann_pool_alloc m_alloc(ctx.pool(), a_bytes); + acl_tensor_ptr acl_m = ggml_cann_create_tensor( + m_alloc.get(), ggml_cann_type_mapping(src0->type), + ggml_type_size(src0->type), src0->ne, src0->nb, GGML_MAX_DIMS); + + // Solve AX = B: upper=false (lower tri), transpose=false, unitriangular=false. + GGML_CANN_CALL_ACLNN_OP(ctx, TriangularSolve, + acl_b.get(), acl_a.get(), false, false, false, + acl_x.get(), acl_m.get()); +} + +void ggml_cann_diag(ggml_backend_cann_context & ctx, ggml_tensor * dst) { + ggml_tensor * src = dst->src[0]; + + GGML_ASSERT(src->ne[1] == 1); + + const int64_t N = src->ne[0]; + const int64_t n_batch = src->ne[2] * src->ne[3]; + const size_t nb_f32 = sizeof(float); + + // Fill dst with zeros. + acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst); + { + float zero = 0.0f; + acl_scalar_ptr acl_zero = ggml_cann_create_scalar(&zero, ACL_FLOAT); + GGML_CANN_CALL_ACLNN_OP(ctx, InplaceFillScalar, acl_dst.get(), acl_zero.get()); + } + + // Copy src vector onto the diagonal of dst via strided views. + // src viewed as [N, n_batch], contiguous strides. + int64_t ne_vec[2] = { N, n_batch }; + size_t nb_src_vec[2] = { nb_f32, N * nb_f32 }; + // dst diagonal view: stride (N+1)*4 steps along the diagonal. + size_t nb_dst_diag[2] = { (N + 1) * nb_f32, N * N * nb_f32 }; + + acl_tensor_ptr acl_src_vec = ggml_cann_create_tensor(src->data, ACL_FLOAT, nb_f32, ne_vec, nb_src_vec, 2); + acl_tensor_ptr acl_dst_diag = ggml_cann_create_tensor(dst->data, ACL_FLOAT, nb_f32, ne_vec, nb_dst_diag, 2); + + GGML_CANN_CALL_ACLNN_OP(ctx, InplaceCopy, acl_dst_diag.get(), acl_src_vec.get()); +} + +void ggml_cann_fill(ggml_backend_cann_context & ctx, ggml_tensor * dst) { + float c = ggml_get_op_params_f32(dst, 0); + + acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst); + acl_scalar_ptr acl_c = ggml_cann_create_scalar(&c, ACL_FLOAT); + GGML_CANN_CALL_ACLNN_OP(ctx, InplaceFillScalar, acl_dst.get(), acl_c.get()); +} + +void ggml_cann_tri(ggml_backend_cann_context & ctx, ggml_tensor * dst) { + ggml_tensor * src = dst->src[0]; + + const int64_t S = src->ne[0]; + const int64_t n_batch = src->ne[2] * src->ne[3]; + const size_t nb_f32 = sizeof(float); + const size_t nb_bool = sizeof(uint8_t); + const size_t buf_sz = n_batch * S * S * nb_f32; + const size_t bool_sz = n_batch * S * S * nb_bool; + + int64_t ne3d[3] = { S, S, n_batch }; + size_t nb3d[3] = { nb_f32, S * nb_f32, S * S * nb_f32 }; + size_t nb3d_bool[3] = { nb_bool, S * nb_bool, S * S * nb_bool }; + + const ggml_tri_type ttype = (ggml_tri_type) ggml_get_op_params_i32(dst, 0); + + acl_tensor_ptr acl_src = ggml_cann_create_tensor(src->data, ACL_FLOAT, nb_f32, ne3d, nb3d, 3); + acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst->data, ACL_FLOAT, nb_f32, ne3d, nb3d, 3); + + // LOWER: Tril(-1) directly gives strict-lower triangle (CANN dim reversal + // makes Tril(-1) equivalent to GGML's col < row). + if (ttype == GGML_TRI_TYPE_LOWER) { + GGML_CANN_CALL_ACLNN_OP(ctx, Tril, acl_src.get(), (int64_t)-1, acl_dst.get()); + return; + } + + // For other types: copy src→dst, build a BOOL mask of positions to zero, + // then use MaskedFillScalar to zero those positions. + GGML_CANN_CALL_ACLNN_OP(ctx, InplaceCopy, acl_dst.get(), acl_src.get()); + + // Build lower-strict float mask (1s below diagonal, 0s elsewhere). + ggml_cann_pool_alloc ones_alloc(ctx.pool(), buf_sz); + void * ones_buf = ones_alloc.get(); + acl_tensor_ptr acl_ones = ggml_cann_create_tensor(ones_buf, ACL_FLOAT, nb_f32, ne3d, nb3d, 3); + { + float one_val = 1.0f; + acl_scalar_ptr acl_one = ggml_cann_create_scalar(&one_val, ACL_FLOAT); + GGML_CANN_CALL_ACLNN_OP(ctx, InplaceFillScalar, acl_ones.get(), acl_one.get()); + } + + ggml_cann_pool_alloc mask_f_alloc(ctx.pool(), buf_sz); + void * mask_f_buf = mask_f_alloc.get(); + acl_tensor_ptr acl_mask_f = ggml_cann_create_tensor(mask_f_buf, ACL_FLOAT, nb_f32, ne3d, nb3d, 3); + GGML_CANN_CALL_ACLNN_OP(ctx, Tril, acl_ones.get(), (int64_t)-1, acl_mask_f.get()); + + // For LOWER_DIAG and UPPER: extend mask to include diagonal via strided + // diagonal view copy (Tril(0) is buggy on CANN, giving same result as Tril(-1)). + if (ttype == GGML_TRI_TYPE_LOWER_DIAG || ttype == GGML_TRI_TYPE_UPPER) { + int64_t ne_diag[2] = { S, n_batch }; + size_t nb_diag[2] = { (S + 1) * nb_f32, S * S * nb_f32 }; + acl_tensor_ptr acl_ones_diag = ggml_cann_create_tensor(ones_buf, ACL_FLOAT, nb_f32, ne_diag, nb_diag, 2); + acl_tensor_ptr acl_mask_diag = ggml_cann_create_tensor(mask_f_buf, ACL_FLOAT, nb_f32, ne_diag, nb_diag, 2); + GGML_CANN_CALL_ACLNN_OP(ctx, InplaceCopy, acl_mask_diag.get(), acl_ones_diag.get()); + } + + // Cast float mask to BOOL. + ggml_cann_pool_alloc mask_b_alloc(ctx.pool(), bool_sz); + void * mask_b_buf = mask_b_alloc.get(); + acl_tensor_ptr acl_mask_b = ggml_cann_create_tensor(mask_b_buf, ACL_BOOL, nb_bool, ne3d, nb3d_bool, 3); + GGML_CANN_CALL_ACLNN_OP(ctx, Cast, acl_mask_f.get(), ACL_BOOL, acl_mask_b.get()); + + // Select which BOOL mask to pass to MaskedFillScalar (True positions get zeroed). + // LOWER_DIAG: invert lower_diag → upper_strict mask. + // UPPER_DIAG: use lower_strict mask directly. + // UPPER: use lower_diag mask directly. + ggml_cann_pool_alloc mask_inv_alloc(ctx.pool(), bool_sz); + void * mask_inv_buf = mask_inv_alloc.get(); + acl_tensor_ptr acl_mask_inv = ggml_cann_create_tensor(mask_inv_buf, ACL_BOOL, nb_bool, ne3d, nb3d_bool, 3); + + aclTensor * fill_mask = nullptr; + switch (ttype) { + case GGML_TRI_TYPE_LOWER_DIAG: + GGML_CANN_CALL_ACLNN_OP(ctx, LogicalNot, acl_mask_b.get(), acl_mask_inv.get()); + fill_mask = acl_mask_inv.get(); + break; + case GGML_TRI_TYPE_UPPER_DIAG: + fill_mask = acl_mask_b.get(); + break; + case GGML_TRI_TYPE_UPPER: + fill_mask = acl_mask_b.get(); + break; + default: + GGML_ABORT("unsupported tri type"); + } + + float zero_val = 0.0f; + acl_scalar_ptr acl_zero = ggml_cann_create_scalar(&zero_val, ACL_FLOAT); + GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMaskedFillScalar, acl_dst.get(), fill_mask, acl_zero.get()); +} + void ggml_cann_upsample_nearest2d(ggml_backend_cann_context & ctx, ggml_tensor * dst) { ggml_tensor * src = dst->src[0]; acl_tensor_ptr acl_src = ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW); @@ -4170,3 +4372,211 @@ void ggml_cann_gated_linear_attn(ggml_backend_cann_context & ctx, ggml_tensor * } } } + +// ggml_cann_gated_delta_net +// +// Head-parallel implementation of the Gated Delta Net recurrence. +// +// CANN's aclnnBatchMatMul accepts rank-3 tensors only: [batch, M, K] @ [batch, K, N]. +// The n_seqs sequences have non-uniform strides across the batch dimension when +// viewed as [n_seqs*H, S, S] (seq boundary stride ≠ head stride), so we keep a +// thin outer loop over n_seqs and batch all H heads per sequence using 3-D BMM. +// +// Per sequence s, per timestep t: +// Step 1 – Decay M[H,S,S] *= exp(g) +// KDA: g_exp[H,S] broadcast as [H,1,S] → M[h,j,i] *= exp(g[h,i]) +// Scalar: g_exp[H] broadcast as [H,1,1] → M[h,:,:] *= exp(g[h]) +// Step 2 – Mk = M @ k_col [H,S,S] @ [H,S,1] → [H,S,1] +// Step 3 – delta = (v - Mk) * beta → [H,S] +// Step 4 – M += outer(delta, k) [H,S,1] @ [H,1,S] → [H,S,S] +// Step 5 – o = M @ q * scale [H,S,S] @ [H,S,1] → [H,S,1] +// +// Kernel launches: ~6 * n_seqs * n_tokens +// vs. naive: ~6 * n_seqs * H * n_tokens (H× reduction) +// +// n_seqs is typically 1–4 in practice, so the outer loop is negligible. +// +// GGML→CANN convention: ne[] is REVERSED by create_tensor. +// ne=[S,S,H] → CANN [H,S,S], ne=[1,S,H] → CANN [H,S,1], etc. +// +// Preconditions (checked by caller): +// - no GQA: neq1==H, nek1==H, neq3==n_seqs, nek3==n_seqs +// - F32 contiguous q, k, v, g, beta +void ggml_cann_gated_delta_net(ggml_backend_cann_context & ctx, ggml_tensor * dst) { + ggml_tensor * src_q = dst->src[0]; + ggml_tensor * src_k = dst->src[1]; + ggml_tensor * src_v = dst->src[2]; + ggml_tensor * src_g = dst->src[3]; + ggml_tensor * src_beta = dst->src[4]; + ggml_tensor * src_state = dst->src[5]; + + const int64_t S_v = src_v->ne[0]; + const int64_t H = src_v->ne[1]; + const int64_t n_tokens = src_v->ne[2]; + const int64_t n_seqs = src_v->ne[3]; + const bool kda = (src_g->ne[0] == S_v); + const float scale = 1.0f / sqrtf((float)S_v); + const size_t F32 = sizeof(float); + + // Output: [attn_scores | new_states] + // attn: [S_v, H, n_tokens, n_seqs] = S_v*H*n_tokens*n_seqs floats + // state: [S_v, S_v, H, n_seqs] starts after attn + const size_t state_off = (size_t)(S_v * H * n_tokens * n_seqs) * F32; + + // Copy input state → output state region (updated in-place below) + { + int64_t ne_flat[1] = { S_v * S_v * H * n_seqs }; + size_t nb_flat[1] = { F32 }; + auto acl_sin = ggml_cann_create_tensor(src_state->data, ACL_FLOAT, F32, ne_flat, nb_flat, 1); + auto acl_sout = ggml_cann_create_tensor(dst->data, ACL_FLOAT, F32, ne_flat, nb_flat, 1, + ACL_FORMAT_ND, state_off); + cann_copy(ctx, acl_sin.get(), acl_sout.get()); + } + + // ── Temporary buffers (pre-allocated once, reused every (s,t)) ────────── + // g_exp: [H * (kda ? S_v : 1)] – exp(g) for current (s,t) + // mk: [H * S_v] – result of M @ k + // delta: [H * S_v] – (v - mk) * beta + // outer: [H * S_v * S_v] – rank-1 update delta ⊗ k^T + ggml_cann_pool_alloc g_exp_alloc(ctx.pool(), (size_t)H * (kda ? S_v : 1) * F32); + ggml_cann_pool_alloc mk_alloc (ctx.pool(), (size_t)H * S_v * F32); + ggml_cann_pool_alloc delta_alloc(ctx.pool(), (size_t)H * S_v * F32); + ggml_cann_pool_alloc outer_alloc(ctx.pool(), (size_t)H * S_v * S_v * F32); + + // ── 3-D shape/stride descriptors (GGML order; reversed by create_tensor) ─ + // + // ne=[S,S,H] → CANN [H,S,S] (state matrix, batch=H) + // ne=[1,S,H] → CANN [H,S,1] (column vec, batch=H) + // ne=[S,1,H] → CANN [H,1,S] (row vec, batch=H) + // ne=[S, H] → CANN [H,S] (flat vec, batch=H) + // ne=[1, H] → CANN [H,1] (scalar per head, batch=H) + // + // Stride derivation examples (elem strides after reversal → CANN strides): + // ne=[1,S,H], nb=[F32, F32, S*F32]: + // elem [1,1,S] → rev → [S,1,1] for [H,S,1]: k[h][i][0] at h*S+i ✓ + // ne=[S,1,H], nb=[F32, S*F32, S*F32]: + // elem [1,S,S] → rev → [S,S,1] for [H,1,S]: k[h][0][j] at h*S+j ✓ + + int64_t ne_M[3] = { S_v, S_v, H }; + size_t nb_M[3] = { F32, (size_t)S_v*F32, (size_t)S_v*S_v*F32 }; + int64_t ne_col[3] = { 1, S_v, H }; + size_t nb_col[3] = { F32, F32, (size_t)S_v*F32 }; + int64_t ne_row[3] = { S_v, 1, H }; + size_t nb_row[3] = { F32, (size_t)S_v*F32, (size_t)S_v*F32 }; + int64_t ne_vec[2] = { S_v, H }; + size_t nb_vec[2] = { F32, (size_t)S_v*F32 }; + + for (int64_t s = 0; s < n_seqs; s++) { + // State M for seq s: CANN [H, S_v, S_v] starting at s_base + const size_t s_base = state_off + (size_t)(s * H * S_v * S_v) * F32; + + for (int64_t t = 0; t < n_tokens; t++) { + + // ── Step 1: Decay M_h *= exp(g_h) ────────────────────────────── + { + const size_t g_off = (size_t)(s * src_g->nb[3] + t * src_g->nb[2]); + + if (kda) { + // g slice [H, S_v] at (s,t) + int64_t ne_g[2] = { S_v, H }; + size_t nb_g_src[2] = { (size_t)src_g->nb[0], (size_t)src_g->nb[1] }; + size_t nb_g_tmp[2] = { F32, (size_t)S_v*F32 }; + auto acl_g_src = ggml_cann_create_tensor(src_g->data, ACL_FLOAT, F32, + ne_g, nb_g_src, 2, ACL_FORMAT_ND, g_off); + auto acl_g_exp = ggml_cann_create_tensor(g_exp_alloc.get(), ACL_FLOAT, F32, + ne_g, nb_g_tmp, 2); + cann_copy(ctx, acl_g_src.get(), acl_g_exp.get()); + aclnn_exp(ctx, acl_g_exp.get()); + // Broadcast as CANN [H,1,S] → M[h,j,i] *= exp(g[h,i]) + auto acl_g_bc = ggml_cann_create_tensor(g_exp_alloc.get(), ACL_FLOAT, F32, + ne_row, nb_row, 3); + auto acl_M = ggml_cann_create_tensor(dst->data, ACL_FLOAT, F32, + ne_M, nb_M, 3, ACL_FORMAT_ND, s_base); + aclnn_mul(ctx, acl_M.get(), acl_g_bc.get(), nullptr); + } else { + // g slice [H, 1] at (s,t), one scalar per head + int64_t ne_g[2] = { 1, H }; + size_t nb_g_src[2] = { (size_t)src_g->nb[0], (size_t)src_g->nb[1] }; + size_t nb_g_tmp[2] = { F32, F32 }; + auto acl_g_src = ggml_cann_create_tensor(src_g->data, ACL_FLOAT, F32, + ne_g, nb_g_src, 2, ACL_FORMAT_ND, g_off); + auto acl_g_exp = ggml_cann_create_tensor(g_exp_alloc.get(), ACL_FLOAT, F32, + ne_g, nb_g_tmp, 2); + cann_copy(ctx, acl_g_src.get(), acl_g_exp.get()); + aclnn_exp(ctx, acl_g_exp.get()); + // Broadcast as CANN [H,1,1] → M_h *= exp(g_h) + int64_t ne_g_bc[3] = { 1, 1, H }; + size_t nb_g_bc[3] = { F32, F32, F32 }; + auto acl_g_bc = ggml_cann_create_tensor(g_exp_alloc.get(), ACL_FLOAT, F32, + ne_g_bc, nb_g_bc, 3); + auto acl_M = ggml_cann_create_tensor(dst->data, ACL_FLOAT, F32, + ne_M, nb_M, 3, ACL_FORMAT_ND, s_base); + aclnn_mul(ctx, acl_M.get(), acl_g_bc.get(), nullptr); + } + } + + // ── Step 2: Mk = M @ k_col [H,S,S]@[H,S,1] → [H,S,1] ───────── + { + const size_t k_off = (size_t)(s * src_k->nb[3] + t * src_k->nb[2]); + size_t nb_k_col[3] = { F32, (size_t)src_k->nb[0], (size_t)src_k->nb[1] }; + auto acl_M = ggml_cann_create_tensor(dst->data, ACL_FLOAT, F32, + ne_M, nb_M, 3, ACL_FORMAT_ND, s_base); + auto acl_k = ggml_cann_create_tensor(src_k->data, ACL_FLOAT, F32, + ne_col, nb_k_col, 3, ACL_FORMAT_ND, k_off); + auto acl_Mk = ggml_cann_create_tensor(mk_alloc.get(), ACL_FLOAT, F32, + ne_col, nb_col, 3); + GGML_CANN_CALL_ACLNN_OP(ctx, BatchMatMul, acl_M.get(), acl_k.get(), acl_Mk.get(), 2); + } + + // ── Step 3: delta = (v - Mk) * beta [H,S] ────────────────────── + { + const size_t v_off = (size_t)(s * src_v->nb[3] + t * src_v->nb[2]); + const size_t beta_off = (size_t)(s * src_beta->nb[3] + t * src_beta->nb[2]); + size_t nb_v[2] = { (size_t)src_v->nb[0], (size_t)src_v->nb[1] }; + int64_t ne_beta[2] = { 1, H }; + size_t nb_beta[2] = { (size_t)src_beta->nb[0], (size_t)src_beta->nb[1] }; + auto acl_v = ggml_cann_create_tensor(src_v->data, ACL_FLOAT, F32, + ne_vec, nb_v, 2, ACL_FORMAT_ND, v_off); + auto acl_Mk_sq = ggml_cann_create_tensor(mk_alloc.get(), ACL_FLOAT, F32, + ne_vec, nb_vec, 2); + auto acl_delta = ggml_cann_create_tensor(delta_alloc.get(), ACL_FLOAT, F32, + ne_vec, nb_vec, 2); + auto acl_beta = ggml_cann_create_tensor(src_beta->data, ACL_FLOAT, F32, + ne_beta, nb_beta, 2, ACL_FORMAT_ND, beta_off); + aclnn_sub(ctx, acl_v.get(), acl_Mk_sq.get(), acl_delta.get()); + aclnn_mul(ctx, acl_delta.get(), acl_beta.get(), nullptr); + } + + // ── Step 4: M += outer(delta, k) [H,S,1]@[H,1,S] → [H,S,S] ──── + { + const size_t k_off = (size_t)(s * src_k->nb[3] + t * src_k->nb[2]); + auto acl_d_col = ggml_cann_create_tensor(delta_alloc.get(), ACL_FLOAT, F32, + ne_col, nb_col, 3); + auto acl_k_row = ggml_cann_create_tensor(src_k->data, ACL_FLOAT, F32, + ne_row, nb_row, 3, ACL_FORMAT_ND, k_off); + auto acl_outer = ggml_cann_create_tensor(outer_alloc.get(), ACL_FLOAT, F32, + ne_M, nb_M, 3); + GGML_CANN_CALL_ACLNN_OP(ctx, BatchMatMul, acl_d_col.get(), acl_k_row.get(), acl_outer.get(), 2); + auto acl_M = ggml_cann_create_tensor(dst->data, ACL_FLOAT, F32, + ne_M, nb_M, 3, ACL_FORMAT_ND, s_base); + aclnn_add(ctx, acl_M.get(), acl_outer.get(), nullptr); + } + + // ── Step 5: o = M @ q * scale [H,S,S]@[H,S,1] → [H,S,1] ─────── + { + const size_t q_off = (size_t)(s * src_q->nb[3] + t * src_q->nb[2]); + const size_t attn_off = (size_t)(s * n_tokens * H + t * H) * S_v * F32; + size_t nb_q_col[3] = { F32, (size_t)src_q->nb[0], (size_t)src_q->nb[1] }; + auto acl_M = ggml_cann_create_tensor(dst->data, ACL_FLOAT, F32, + ne_M, nb_M, 3, ACL_FORMAT_ND, s_base); + auto acl_q = ggml_cann_create_tensor(src_q->data, ACL_FLOAT, F32, + ne_col, nb_q_col, 3, ACL_FORMAT_ND, q_off); + auto acl_out = ggml_cann_create_tensor(dst->data, ACL_FLOAT, F32, + ne_col, nb_col, 3, ACL_FORMAT_ND, attn_off); + GGML_CANN_CALL_ACLNN_OP(ctx, BatchMatMul, acl_M.get(), acl_q.get(), acl_out.get(), 2); + aclnn_muls(ctx, acl_out.get(), scale, nullptr, true); + } + } + } +} + diff --git a/ggml/src/ggml-cann/aclnn_ops.h b/ggml/src/ggml-cann/aclnn_ops.h index 7f5ba4d330..f5aadb38ae 100644 --- a/ggml/src/ggml-cann/aclnn_ops.h +++ b/ggml/src/ggml-cann/aclnn_ops.h @@ -32,6 +32,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -47,6 +50,7 @@ #include #include #include +#include #include #include #include @@ -325,6 +329,48 @@ void ggml_cann_sum_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst); void ggml_cann_sum(ggml_backend_cann_context & ctx, ggml_tensor * dst); +/** + * @brief Computes the cumulative sum of a ggml tensor along dim 0 using the + * CANN backend. + * + * @param ctx The CANN context used for operations. + * @param dst The destination tensor. dst->op is `GGML_OP_CUMSUM`. + */ +void ggml_cann_cumsum(ggml_backend_cann_context & ctx, ggml_tensor * dst); + +/** + * @brief Computes a triangular mask (tril/triu) of a square ggml tensor + * using the CANN backend. + * + * @param ctx The CANN context used for operations. + * @param dst The destination tensor. dst->op is `GGML_OP_TRI`. + */ +void ggml_cann_tri(ggml_backend_cann_context & ctx, ggml_tensor * dst); + +/** + * @brief Solves a triangular linear system AX=B using the CANN backend. + * + * @param ctx The CANN context used for operations. + * @param dst The destination tensor. dst->op is `GGML_OP_SOLVE_TRI`. + */ +void ggml_cann_solve_tri(ggml_backend_cann_context & ctx, ggml_tensor * dst); + +/** + * @brief Creates a diagonal matrix from a vector using the CANN backend. + * + * @param ctx The CANN context used for operations. + * @param dst The destination tensor. dst->op is `GGML_OP_DIAG`. + */ +void ggml_cann_diag(ggml_backend_cann_context & ctx, ggml_tensor * dst); + +/** + * @brief Fills a tensor with a constant scalar value using the CANN backend. + * + * @param ctx The CANN context used for operations. + * @param dst The destination tensor. dst->op is `GGML_OP_FILL`. + */ +void ggml_cann_fill(ggml_backend_cann_context & ctx, ggml_tensor * dst); + /** * @brief Upsamples a ggml tensor using nearest neighbor interpolation using * the CANN backend. @@ -461,6 +507,9 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context & ctx, ggml_tensor * // @see ggml_cann_dup. void ggml_cann_cpy(ggml_backend_cann_context & ctx, ggml_tensor * dst); +// @see ggml_cann_acc, but copies src1 into dst instead of adding. +void ggml_cann_set(ggml_backend_cann_context & ctx, ggml_tensor * dst); + /** * @brief Computes the softmax activation with optional masking. * @@ -844,6 +893,27 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context & ctx, ggml_tensor * dst */ void ggml_cann_gated_linear_attn(ggml_backend_cann_context & ctx, ggml_tensor * dst); +/** + * @brief Forward Gated Delta Net on the CANN backend. + * + * Expects dst->src[0..5] = {q, k, v, g, beta, state} with shape conventions: + * q, k: [S_v, H_q/H_k, n_tokens, n_seqs] (contiguous rows) + * v: [S_v, H, n_tokens, n_seqs] + * g: [1, H, n_tokens, n_seqs] (scalar gate) or [S_v, H, n_tokens, n_seqs] (KDA) + * beta: [1, H, n_tokens, n_seqs] + * state:[S_v, S_v, H, n_seqs] + * + * Per token recurrence: + * S_t = exp(g_t) * S_{t-1} + k_t * (v_t - S_{t-1}^T k_t)^T * beta_t + * out_t = S_t^T q_t / sqrt(S_v) + * + * dst holds both attention outputs and updated state. + * + * @param ctx Backend context providing stream/allocator utilities. + * @param dst Output tensor; src deps are q, k, v, g, beta, state as above. + */ +void ggml_cann_gated_delta_net(ggml_backend_cann_context & ctx, ggml_tensor * dst); + /** * @brief Launches an asynchronous task using the memory allocator. * diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index 40fe3d82ec..6fa2b79ce7 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -1428,6 +1428,22 @@ static bool ggml_backend_cann_buffer_cpy_tensor(ggml_backend_buffer_t buffer, return false; } +/** + * @brief Set a region of a tensor's device memory to a specified value. + * + * @param buffer The CANN buffer containing the tensor. + * @param tensor Pointer to the tensor whose memory will be set. + * @param value The value to which each byte in the region will be set. + * @param offset Byte offset within the tensor's data to start setting. + * @param size Number of bytes to set. + */ +static void ggml_backend_cann_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { + ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context; + + ggml_cann_set_device(ctx->device); + ACL_CHECK(aclrtMemset((char *) tensor->data + offset, size, value, size)); +} + /** * @brief Clear a CANN buffer by setting all its memory to a specified value. * @@ -1454,7 +1470,7 @@ static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = { /* .free_buffer = */ ggml_backend_cann_buffer_free_buffer, /* .get_base = */ ggml_backend_cann_buffer_get_base, /* .init_tensor = */ ggml_backend_cann_buffer_init_tensor, - /* .memset_tensor = */ NULL, + /* .memset_tensor = */ ggml_backend_cann_buffer_memset_tensor, /* .set_tensor = */ ggml_backend_cann_buffer_set_tensor, /* .get_tensor = */ ggml_backend_cann_buffer_get_tensor, /* .cpy_tensor = */ ggml_backend_cann_buffer_cpy_tensor, @@ -1833,6 +1849,20 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context & ctx, struct gg case GGML_UNARY_OP_STEP: ggml_cann_step(ctx, dst); break; + case GGML_UNARY_OP_SOFTPLUS: + { + auto lambda = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) { + float beta_val = 1.0f; + float threshold_val = 20.0f; + aclScalar * beta = aclCreateScalar(&beta_val, aclDataType::ACL_FLOAT); + aclScalar * threshold = aclCreateScalar(&threshold_val, aclDataType::ACL_FLOAT); + GGML_CANN_CALL_ACLNN_OP(ctx, Softplus, acl_src, beta, threshold, acl_dst); + aclDestroyScalar(beta); + aclDestroyScalar(threshold); + }; + ggml_cann_op_unary(lambda, ctx, dst); + } + break; default: return false; } @@ -1918,6 +1948,9 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context & ctx, struct gg case GGML_OP_CPY: ggml_cann_cpy(ctx, dst); break; + case GGML_OP_SET: + ggml_cann_set(ctx, dst); + break; case GGML_OP_CONT: ggml_cann_dup(ctx, dst); break; @@ -1987,6 +2020,24 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context & ctx, struct gg case GGML_OP_SSM_CONV: ggml_cann_ssm_conv(ctx, dst); break; + case GGML_OP_GATED_DELTA_NET: + ggml_cann_gated_delta_net(ctx, dst); + break; + case GGML_OP_CUMSUM: + ggml_cann_cumsum(ctx, dst); + break; + case GGML_OP_TRI: + ggml_cann_tri(ctx, dst); + break; + case GGML_OP_FILL: + ggml_cann_fill(ctx, dst); + break; + case GGML_OP_DIAG: + ggml_cann_diag(ctx, dst); + break; + case GGML_OP_SOLVE_TRI: + ggml_cann_solve_tri(ctx, dst); + break; default: return false; } @@ -2322,6 +2373,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend, if (use_cann_graph) { // If no matching graph is found, the graph needs to be recaptured. graph_capture_required = !cann_ctx->graph_lru_cache.find_and_move_to_front(cgraph); + if (graph_capture_required) { // If no matching graph is found, add a new ACL graph. ggml_cann_graph * new_graph = ggml_cann_graph::create_from_cgraph(cgraph); @@ -2380,6 +2432,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten case GGML_UNARY_OP_SGN: case GGML_UNARY_OP_STEP: case GGML_UNARY_OP_GELU_ERF: + case GGML_UNARY_OP_SOFTPLUS: return true; default: return false; @@ -2570,6 +2623,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten case GGML_OP_SUM_ROWS: case GGML_OP_ARGSORT: case GGML_OP_ACC: + case GGML_OP_SET: case GGML_OP_GROUP_NORM: return true; case GGML_OP_PAD: @@ -2647,6 +2701,38 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten } case GGML_OP_SSM_CONV: return true; + case GGML_OP_GATED_DELTA_NET: + { + // Only the batched path (BatchMatMul over all heads) is efficient. + // Non-contiguous / GQA / non-F32 cases fall back to CPU. + const ggml_tensor * q = op->src[0]; + const ggml_tensor * k = op->src[1]; + const ggml_tensor * v = op->src[2]; + const ggml_tensor * g = op->src[3]; + const ggml_tensor * beta = op->src[4]; + const int64_t H = v->ne[1]; + const int64_t n_seqs = v->ne[3]; + return q->ne[1] == H + && k->ne[1] == H + && q->ne[3] == n_seqs + && k->ne[3] == n_seqs + && ggml_is_contiguous(q) + && ggml_is_contiguous(k) + && ggml_is_contiguous(v) + && ggml_is_contiguous(g) + && ggml_is_contiguous(beta) + && q->type == GGML_TYPE_F32; + } + case GGML_OP_CUMSUM: + return op->src[0]->type == GGML_TYPE_F32; + case GGML_OP_TRI: + return op->src[0]->type == GGML_TYPE_F32; + case GGML_OP_FILL: + return op->src[0]->type == GGML_TYPE_F32; + case GGML_OP_DIAG: + return op->src[0]->type == GGML_TYPE_F32; + case GGML_OP_SOLVE_TRI: + return op->src[0]->type == GGML_TYPE_F32; default: return false; } diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 781c621d93..f904db40b9 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -3689,6 +3689,20 @@ struct test_gated_delta_net : public test_case { : type(type), head_count(head_count), head_size(head_size), n_seq_tokens(n_seq_tokens), n_seqs(n_seqs), v_repeat(v_repeat), permuted(permuted), kda(kda) {} + double max_nmse_err() override { + return 1e-7; + } + + double max_nmse_err(ggml_backend_t backend) override { + // Accelerator backends (CANN, etc.) use batched matmul/hardware ops that + // accumulate FP32 rounding differently from CPU scalar loops. Allow up + // to 1e-6 (roughly 8 ULPs of float32 epsilon) for those backends. + if (strncmp(ggml_backend_name(backend), "CANN", 4) == 0) { + return 1e-6; + } + return max_nmse_err(); + } + ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * q; ggml_tensor * k;