From df6a560c1097c87ffaa88fb76db28e97c62c36d6 Mon Sep 17 00:00:00 2001 From: Aleksei Lobanov <2401213370@stu.pku.edu.cn> Date: Thu, 20 Nov 2025 14:30:42 +0800 Subject: [PATCH 1/3] CANN: implement SSM_CONV operator Co-authored-by: Aleksei Lobanov, Co-authored-by: Sujin Kang, --- ggml/src/ggml-cann/aclnn_ops.cpp | 123 +++++++++++++++++++++++++++++++ ggml/src/ggml-cann/aclnn_ops.h | 2 + ggml/src/ggml-cann/ggml-cann.cpp | 4 + tests/test-backend-ops.cpp | 8 ++ 4 files changed, 137 insertions(+) diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 48f4b7db69..1d6fd5c00b 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -3484,3 +3484,126 @@ void ggml_cann_out_prod(ggml_backend_cann_context & ctx, ggml_tensor * dst) { break; } } + +void ggml_cann_ssm_conv(ggml_backend_cann_context & ctx, ggml_tensor * dst) { + ggml_tensor * src0 = dst->src[0]; // conv_x + ggml_tensor * src1 = dst->src[1]; // conv1d.weight + + // This op is currently defined only for F32 in ggml_cpu + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + // Shapes follow ggml_compute_forward_ssm_conv_f32 + const int64_t nc = src1->ne[0]; // d_conv + const int64_t ncs = src0->ne[0]; // d_conv - 1 + n_t + const int64_t nr = src0->ne[1]; // d_inner + const int64_t n_s = src0->ne[2]; // n_seqs + + const int64_t n_t = dst->ne[1]; // tokens per sequence + + GGML_ASSERT(dst->ne[0] == nr); // dst: {d_inner, n_t, n_s} + GGML_ASSERT(src1->ne[1] == nr); // weight: {d_conv, d_inner} + GGML_ASSERT(ncs == nc - 1 + n_t); // conv_x: {d_conv - 1 + n_t, d_inner, n_s} + GGML_ASSERT(src0->nb[0] == sizeof(float)); + GGML_ASSERT(src1->nb[0] == sizeof(float)); + + // --- Build CANN tensors --- + + // 1) Input: conv_x as NCL + // + // src0->ne = { ncs, nr, n_s, 1 } // {L_in, C, N} + // Passing ACL_FORMAT_NCL here means: + // reversed dims -> [N, C, L_in] = [n_s, nr, ncs] + acl_tensor_ptr acl_x = ggml_cann_create_tensor(src0, src0->ne, src0->nb, 3, ACL_FORMAT_NCL); + + // 2) Weights: depthwise conv kernel, view src1 as {K, 1, C} + // + // src1 original: ne = { nc, nr, 1, 1 } // [K, C, 1, 1] + // we want a view: ne_w = { nc, 1, nr } // [K, 1, C] + // so that reversed dims -> [C, 1, K] which matches + // [out_channels, in_channels/groups, kernel_size] + int64_t w_ne[GGML_MAX_DIMS] = { 0 }; + size_t w_nb[GGML_MAX_DIMS] = { 0 }; + + w_ne[0] = nc; // K + w_ne[1] = 1; // 1 input channel per group + w_ne[2] = nr; // C groups + w_ne[3] = 1; + + // Layout: src1 data is [K, C] with + // offset(k, c) = k*nb0 + c*nb1 + // We want offset_w(k, 0, c) = k*nb0 + c*nb1, + // so we can reuse nb0 and nb1, and set nb2 = nb1. + w_nb[0] = src1->nb[0]; // sizeof(float) + w_nb[1] = src1->nb[1]; // nc * sizeof(float) + w_nb[2] = src1->nb[1]; // same stride for each (fake) "channel" + w_nb[3] = src1->nb[3]; + + acl_tensor_ptr acl_w = ggml_cann_create_tensor( + src1->data, ggml_cann_type_mapping(src1->type), ggml_type_size(src1->type), w_ne, w_nb, 3, ACL_FORMAT_NCL); + + // 3) Output: dst is { d_inner, n_t, n_s } (CLN) + // + // We need an NCL view of the same buffer: + // desired NCL logical shape: { L_out = n_t, C = nr, N = n_s } + // + // Original CLN layout: + // dst->ne = { nr, n_t, n_s } + // dst->nb[0] = sizeof(float) + // dst->nb[1] = nr * sizeof(float) + // dst->nb[2] = nr * n_t * sizeof(float) + // + // We want offset_new(L, C, N) = offset_orig(C, L, N). + // Choose: + // nb_y[0] = nr * sizeof(float); // step in L + // nb_y[1] = sizeof(float); // step in C + // nb_y[2] = nr * n_t * sizeof(float); // step in N + int64_t y_ne[GGML_MAX_DIMS] = { 0 }; + size_t y_nb[GGML_MAX_DIMS] = { 0 }; + + y_ne[0] = n_t; // L_out + y_ne[1] = nr; // C + y_ne[2] = n_s; // N + y_ne[3] = 1; + + y_nb[0] = dst->ne[0] * sizeof(float); // nr * sizeof(float) + y_nb[1] = sizeof(float); + y_nb[2] = dst->ne[0] * dst->ne[1] * sizeof(float); // nr * n_t * sizeof(float) + y_nb[3] = dst->nb[3]; + + acl_tensor_ptr acl_y = ggml_cann_create_tensor( + dst->data, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type), y_ne, y_nb, 3, ACL_FORMAT_NCL); + + // --- Conv1d parameters: depthwise, stride 1, no padding ("valid") --- + int64_t strideVal[1] = { 1 }; + int64_t paddingVal[1] = { 0 }; + int64_t dilationVal[1] = { 1 }; + + acl_int_array_ptr stride = ggml_cann_create_int_array(strideVal, 1); + acl_int_array_ptr padding = ggml_cann_create_int_array(paddingVal, 1); + acl_int_array_ptr dilation = ggml_cann_create_int_array(dilationVal, 1); + + const bool transposed = false; + const int64_t groups = nr; // depthwise: one group per inner dim + int8_t cubeMathType = 0; + +#ifdef ASCEND_310P + cubeMathType = 1; +#endif + + GGML_CANN_CALL_ACLNN_OP(ctx, + Convolution, + acl_x.get(), // input: N, C, L_in = ncs + acl_w.get(), // weight: [C, 1, K] with groups=nr + nullptr, // bias + stride.get(), + padding.get(), + dilation.get(), + transposed, + padding.get(), // output padding (unused for non-transposed) + groups, + acl_y.get(), + cubeMathType); +} + diff --git a/ggml/src/ggml-cann/aclnn_ops.h b/ggml/src/ggml-cann/aclnn_ops.h index 1ebbc769c7..16389cb16e 100644 --- a/ggml/src/ggml-cann/aclnn_ops.h +++ b/ggml/src/ggml-cann/aclnn_ops.h @@ -1032,6 +1032,8 @@ void ggml_cann_op_unary(std::function 0.000000100 + double max_nmse_err() override { + return 1e-6; + } }; // GGML_OP_SSM_SCAN From eb07456eef8452d45595c2ac5b96ab128dc67acc Mon Sep 17 00:00:00 2001 From: Aleksei Lobanov <2401213370@stu.pku.edu.cn> Date: Thu, 4 Dec 2025 17:02:24 +0800 Subject: [PATCH 2/3] CANN: remove custom error limit for SSM_CONV --- tests/test-backend-ops.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index f454bda18e..9645d0b390 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -3379,14 +3379,6 @@ struct test_ssm_conv : public test_case { ggml_tensor * out = ggml_ssm_conv(ctx, a, b); return out; } - - // for CANN Ascend310P3: - // this card requires setting cubeMathType=1 (ALLOW_FP32_DOWN_PRECISION) - // so the inputs are converted from f32 - // and tests fail with NMSE = 0.000000114 > 0.000000100 - double max_nmse_err() override { - return 1e-6; - } }; // GGML_OP_SSM_SCAN From a70e4c87e490e41b2265809f36b0c8183959d531 Mon Sep 17 00:00:00 2001 From: Aleksei Lobanov <2401213370@stu.pku.edu.cn> Date: Thu, 4 Dec 2025 18:10:43 +0800 Subject: [PATCH 3/3] CANN: merge SSM_CONV tensor shape/strides into one line --- ggml/src/ggml-cann/aclnn_ops.cpp | 28 ++++------------------------ 1 file changed, 4 insertions(+), 24 deletions(-) diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 1d6fd5c00b..a59b2585d1 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -3523,22 +3523,12 @@ void ggml_cann_ssm_conv(ggml_backend_cann_context & ctx, ggml_tensor * dst) { // we want a view: ne_w = { nc, 1, nr } // [K, 1, C] // so that reversed dims -> [C, 1, K] which matches // [out_channels, in_channels/groups, kernel_size] - int64_t w_ne[GGML_MAX_DIMS] = { 0 }; - size_t w_nb[GGML_MAX_DIMS] = { 0 }; - - w_ne[0] = nc; // K - w_ne[1] = 1; // 1 input channel per group - w_ne[2] = nr; // C groups - w_ne[3] = 1; - + int64_t w_ne[GGML_MAX_DIMS] = { nc, 1, nr, 1 }; // [K, 1 input ch. per group, C groups] // Layout: src1 data is [K, C] with // offset(k, c) = k*nb0 + c*nb1 // We want offset_w(k, 0, c) = k*nb0 + c*nb1, // so we can reuse nb0 and nb1, and set nb2 = nb1. - w_nb[0] = src1->nb[0]; // sizeof(float) - w_nb[1] = src1->nb[1]; // nc * sizeof(float) - w_nb[2] = src1->nb[1]; // same stride for each (fake) "channel" - w_nb[3] = src1->nb[3]; + size_t w_nb[GGML_MAX_DIMS] = { src1->nb[0], src1->nb[1], src1->nb[1], src1->nb[3] }; // same as src1 acl_tensor_ptr acl_w = ggml_cann_create_tensor( src1->data, ggml_cann_type_mapping(src1->type), ggml_type_size(src1->type), w_ne, w_nb, 3, ACL_FORMAT_NCL); @@ -3559,18 +3549,8 @@ void ggml_cann_ssm_conv(ggml_backend_cann_context & ctx, ggml_tensor * dst) { // nb_y[0] = nr * sizeof(float); // step in L // nb_y[1] = sizeof(float); // step in C // nb_y[2] = nr * n_t * sizeof(float); // step in N - int64_t y_ne[GGML_MAX_DIMS] = { 0 }; - size_t y_nb[GGML_MAX_DIMS] = { 0 }; - - y_ne[0] = n_t; // L_out - y_ne[1] = nr; // C - y_ne[2] = n_s; // N - y_ne[3] = 1; - - y_nb[0] = dst->ne[0] * sizeof(float); // nr * sizeof(float) - y_nb[1] = sizeof(float); - y_nb[2] = dst->ne[0] * dst->ne[1] * sizeof(float); // nr * n_t * sizeof(float) - y_nb[3] = dst->nb[3]; + int64_t y_ne[GGML_MAX_DIMS] = { n_t, nr, n_s, 1 }; // [L_out, C, N] + size_t y_nb[GGML_MAX_DIMS] = { dst->ne[0] * sizeof(float), sizeof(float), dst->ne[0] * dst->ne[1] * sizeof(float), dst->nb[3] }; // [nr, 1, nr * n_t] acl_tensor_ptr acl_y = ggml_cann_create_tensor( dst->data, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type), y_ne, y_nb, 3, ACL_FORMAT_NCL);