tests: set tensor usage as weight for weight tensors

only for mul_mat and mul_mat_id ops

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
Aaron Teo 2025-12-14 18:05:51 +08:00
parent 1926e07e1a
commit 61ee32dec3
1 changed file with 116 additions and 9 deletions
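
Before the diff itself, here is the allocation pattern the change applies, pulled out as a minimal standalone sketch. Everything used below is the public ggml API (ggml.h, ggml-alloc.h, ggml-backend.h); the alloc_weights helper and its setup are illustrative, not part of the test harness:

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include <vector>

// Place a set of weight tensors into one dedicated backend buffer that is
// flagged with GGML_BACKEND_BUFFER_USAGE_WEIGHTS. Tensors allocated here are
// skipped by a later ggml_backend_alloc_ctx_tensors() call on the same context.
static ggml_backend_buffer_t alloc_weights(ggml_backend_t backend,
                                           const std::vector<ggml_tensor *> & weights) {
    ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);

    // total size, padding each tensor to the buffer alignment,
    // as ggml_tallocr_alloc does internally
    size_t size = 0;
    for (ggml_tensor * t : weights) {
        size += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t),
                         ggml_backend_buft_get_alignment(buft));
    }

    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, size);
    if (buf == NULL) {
        return NULL; // caller decides how to report the failure
    }
    ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);

    // sub-allocate each tensor from the buffer (sets t->buffer and t->data)
    ggml_tallocr talloc = ggml_tallocr_new(buf);
    for (ggml_tensor * t : weights) {
        ggml_tallocr_alloc(&talloc, t);
    }
    return buf;
}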


@@ -1160,6 +1160,9 @@ struct test_case {
     std::vector<ggml_tensor *> sentinels;

+    // Track weight tensors for separate buffer allocation with GGML_BACKEND_BUFFER_USAGE_WEIGHTS
+    std::vector<ggml_tensor *> weight_tensors;
+
     std::string current_op_name;

     void add_sentinel(ggml_context * ctx) {
@@ -1238,6 +1241,8 @@ struct test_case {
                            const char * op_names_filter,
                            printer * output_printer) {
         mode = MODE_TEST;
+        weight_tensors.clear();
+        sentinels.clear();

         ggml_init_params params = {
             /* .mem_size = */ ggml_tensor_overhead()*128 + ggml_graph_overhead(),
@@ -1288,10 +1293,35 @@ struct test_case {
         // post-graph sentinel
         add_sentinel(ctx);

-        // allocate
+        // allocate weight tensors in a separate buffer with GGML_BACKEND_BUFFER_USAGE_WEIGHTS
+        ggml_backend_buffer_t weights_buf = nullptr;
+        if (!weight_tensors.empty()) {
+            ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend1);
+            // Calculate total size needed for the weight tensors, padding each
+            // allocation to the buffer alignment as ggml_tallocr_alloc does
+            size_t weight_size = 0;
+            for (ggml_tensor * wt : weight_tensors) {
+                weight_size += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, wt), ggml_backend_buft_get_alignment(buft));
+            }
+            weights_buf = ggml_backend_buft_alloc_buffer(buft, weight_size);
+            if (weights_buf == NULL) {
+                printf("failed to allocate weight tensors [%s] ", ggml_backend_name(backend1));
+                ggml_free(ctx);
+                return test_status_t::FAIL;
+            }
+            ggml_backend_buffer_set_usage(weights_buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+
+            // Allocate each weight tensor in the weights buffer
+            ggml_tallocr weights_talloc = ggml_tallocr_new(weights_buf);
+            for (ggml_tensor * wt : weight_tensors) {
+                ggml_tallocr_alloc(&weights_talloc, wt);
+            }
+        }
+
+        // allocate remaining tensors; buf is NULL either on failure or when every
+        // tensor was already placed in weights_buf, so only fail if both are NULL
         ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend1);
-        if (buf == NULL) {
+        if (buf == NULL && weights_buf == NULL) {
             printf("failed to allocate tensors [%s] ", ggml_backend_name(backend1));
             ggml_free(ctx);
             return test_status_t::FAIL;
@@ -1385,6 +1415,9 @@ struct test_case {
         const bool cmp_ok = ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud, run_whole_graph() ? out : nullptr);

+        if (weights_buf) {
+            ggml_backend_buffer_free(weights_buf);
+        }
         ggml_backend_buffer_free(buf);

         ggml_free(ctx);
@ -1404,6 +1437,7 @@ struct test_case {
bool eval_perf(ggml_backend_t backend, const char * op_names_filter, printer * output_printer) { bool eval_perf(ggml_backend_t backend, const char * op_names_filter, printer * output_printer) {
mode = MODE_PERF; mode = MODE_PERF;
weight_tensors.clear();
static const size_t graph_nodes = 8192; static const size_t graph_nodes = 8192;
@@ -1432,10 +1466,34 @@ struct test_case {
             return true;
         }

-        // allocate
+        // allocate weight tensors in a separate buffer with GGML_BACKEND_BUFFER_USAGE_WEIGHTS
+        ggml_backend_buffer_ptr weights_buf(nullptr); // smart ptr
+        if (!weight_tensors.empty()) {
+            ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
+            // Calculate total size needed for the weight tensors, padding each
+            // allocation to the buffer alignment as ggml_tallocr_alloc does
+            size_t weight_size = 0;
+            for (ggml_tensor * wt : weight_tensors) {
+                weight_size += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, wt), ggml_backend_buft_get_alignment(buft));
+            }
+            weights_buf.reset(ggml_backend_buft_alloc_buffer(buft, weight_size));
+            if (weights_buf == NULL) {
+                printf("failed to allocate weight tensors\n");
+                return false;
+            }
+            ggml_backend_buffer_set_usage(weights_buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+
+            // Allocate each weight tensor in the weights buffer
+            ggml_tallocr weights_talloc = ggml_tallocr_new(weights_buf.get());
+            for (ggml_tensor * wt : weight_tensors) {
+                ggml_tallocr_alloc(&weights_talloc, wt);
+            }
+        }
+
+        // allocate remaining tensors
         ggml_backend_buffer_ptr buf(ggml_backend_alloc_ctx_tensors(ctx.get(), backend)); // smart ptr
-        if (buf == NULL) {
+        if (buf == NULL && weights_buf == NULL) {
             printf("failed to allocate tensors\n");
             return false;
         }
@@ -1534,6 +1592,7 @@ struct test_case {
     bool eval_support(ggml_backend_t backend, const char * op_names_filter, printer * output_printer) {
         mode = MODE_SUPPORT;
+        weight_tensors.clear();

         static const size_t graph_nodes = 8192;
@@ -1569,6 +1628,7 @@ struct test_case {
     bool eval_grad(ggml_backend_t backend, const char * op_names_filter, printer * output_printer) {
         mode = MODE_GRAD;
+        weight_tensors.clear();

         const std::vector<float> expect = grad_expect();

         ggml_init_params params = {
@@ -1679,9 +1739,35 @@ struct test_case {
             return true;
         }

-        // allocate
+        // allocate weight tensors in a separate buffer with GGML_BACKEND_BUFFER_USAGE_WEIGHTS
+        ggml_backend_buffer_ptr weights_buf(nullptr); // smart ptr
+        if (!weight_tensors.empty()) {
+            ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
+            // Calculate total size needed for the weight tensors, padding each
+            // allocation to the buffer alignment as ggml_tallocr_alloc does
+            size_t weight_size = 0;
+            for (ggml_tensor * wt : weight_tensors) {
+                weight_size += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, wt), ggml_backend_buft_get_alignment(buft));
+            }
+            weights_buf.reset(ggml_backend_buft_alloc_buffer(buft, weight_size));
+            if (weights_buf == NULL) {
+                test_operation_info info(op_desc(out), vars(), ggml_backend_name(backend));
+                info.set_error("weight allocation", "");
+                output_printer->print_operation(info);
+                return false;
+            }
+            ggml_backend_buffer_set_usage(weights_buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+
+            // Allocate each weight tensor in the weights buffer
+            ggml_tallocr weights_talloc = ggml_tallocr_new(weights_buf.get());
+            for (ggml_tensor * wt : weight_tensors) {
+                ggml_tallocr_alloc(&weights_talloc, wt);
+            }
+        }
+
+        // allocate remaining tensors
         ggml_backend_buffer_ptr buf(ggml_backend_alloc_ctx_tensors(ctx.get(), backend)); // smart ptr
-        if (buf == NULL) {
+        if (buf == NULL && weights_buf == NULL) {
             test_operation_info info(op_desc(out), vars(), ggml_backend_name(backend));
             info.set_error("allocation", "");
             output_printer->print_operation(info);
@ -3606,6 +3692,7 @@ struct test_mul_mat : public test_case {
a = ggml_new_tensor_4d(ctx, type_a, ne_a[per[0]], ne_a[per[1]], ne_a[per[2]], ne_a[per[3]]); a = ggml_new_tensor_4d(ctx, type_a, ne_a[per[0]], ne_a[per[1]], ne_a[per[2]], ne_a[per[3]]);
b = ggml_new_tensor_4d(ctx, type_b, ne_b[per[0]], ne_b[per[1]], ne_b[per[2]], ne_b[per[3]]); b = ggml_new_tensor_4d(ctx, type_b, ne_b[per[0]], ne_b[per[1]], ne_b[per[2]], ne_b[per[3]]);
weight_tensors.push_back(a); // Track weight tensor for GGML_BACKEND_BUFFER_USAGE_WEIGHTS
if (!ggml_is_quantized(type_a)) { if (!ggml_is_quantized(type_a)) {
if (bs[1] == 1 && nr[1] == 1) { if (bs[1] == 1 && nr[1] == 1) {
ggml_set_param(a); ggml_set_param(a);
@@ -3623,6 +3710,7 @@ struct test_mul_mat : public test_case {
             const int64_t k_physical = k_v == 0 ? k : k_v;
             a = ggml_new_tensor_4d(ctx, type_a, k_physical, m, bs[0], bs[1]);
             b = ggml_new_tensor_4d(ctx, type_b, k_physical, n, bs[0]*nr[0], bs[1]*nr[1]);
+            weight_tensors.push_back(a); // Track weight tensor for GGML_BACKEND_BUFFER_USAGE_WEIGHTS
             if (!ggml_is_quantized(type_a)) {
                 if (bs[1] == 1 && nr[1] == 1) {
@ -3716,6 +3804,7 @@ struct test_mul_mat_id : public test_case {
// C^T = A * B^T: (k, m) * (k, n) => (m, n) // C^T = A * B^T: (k, m) * (k, n) => (m, n)
ggml_tensor * as = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats); ggml_tensor * as = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats);
ggml_set_name(as, "as"); ggml_set_name(as, "as");
weight_tensors.push_back(as); // Track weight tensor for GGML_BACKEND_BUFFER_USAGE_WEIGHTS
ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n); ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n);
ggml_set_name(ids, "ids"); ggml_set_name(ids, "ids");
@@ -3776,6 +3865,7 @@ struct test_mul_mat_id_fusion : public test_case {
         // C^T = A * B^T: (k, m) * (k, n) => (m, n)
         ggml_tensor * as = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats);
         ggml_set_name(as, "as");
+        weight_tensors.push_back(as); // Track weight tensor for GGML_BACKEND_BUFFER_USAGE_WEIGHTS

         ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n);
         ggml_set_name(ids, "ids");
@@ -3792,6 +3882,7 @@ struct test_mul_mat_id_fusion : public test_case {
         for (uint32_t i = 1; i < o; ++i) {
             ggml_tensor * a2 = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats);
+            weight_tensors.push_back(a2); // Track weight tensor for GGML_BACKEND_BUFFER_USAGE_WEIGHTS
             ggml_tensor * out2 = ggml_mul_mat_id(ctx, a2, b, ids);
             ggml_set_name(out2, "out2");
             out = ggml_add(ctx, out, out2);
@@ -7861,9 +7952,24 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 30, 30, 7, 1 }, { 8, 30, 7, 1 }));
     test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 42, 42, 5, 2 }, { 10, 42, 5, 2 }));
     test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 2, 2 }, { 10, 64, 2, 2 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 2, 2 }, { 64, 64, 2, 2 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 79, 79, 5, 3 }, { 417, 79, 5, 3 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 2 }, { 32, 128, 4, 2 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 80, 80, 2, 8 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 79, 80, 2, 8 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 81, 80, 2, 8 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 80, 80, 8, 8 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 79, 80, 8, 8 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 81, 80, 8, 8 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 84, 84, 4, 4 }, { 32, 84, 4, 4 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 95, 95, 8, 8 }, { 40, 95, 8, 8 }));
     test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 100, 100, 4, 4 }, { 41, 100, 4, 4 }));
     test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 4 }, { 31, 128, 4, 4 }));
-    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 300, 64, 4, 4 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 4 }, { 32, 128, 4, 4 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 3, 4 }, { 32, 128, 3, 4 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 1 }, { 32, 128, 4, 1 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 200, 64, 4, 4 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 384, 64, 4, 4 }));

     for (bool v : {false, true}) {
         for (bool circular : {false, true}) {
@@ -8064,12 +8170,13 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 16416, 1, 128, {8, 1}, {4, 1}, {0, 2, 1, 3}));
     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 1, 16416, {8, 1}, {4, 1}, {0, 1, 2, 3}, 2*16416));

-    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 2 }, { 6, 64, 4, 2 }));
-    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 1 }, { 8, 128, 4, 1 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 32, 64, 4, 4 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 2 }, { 32, 128, 4, 2 }));
     // qwen3next with CHUNK_SIZE 64
     test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 8, 32 }, { 64, 64, 8, 32 }));
     // qwen3next with CHUNK_SIZE 128
     test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 32 }, { 128, 128, 4, 32 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 256, 256, 4, 2 }, { 128, 256, 4, 2 }));

     test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_LOWER, GGML_TYPE_F32, { 256, 256, 4, 4 }));
     test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_UPPER_DIAG, GGML_TYPE_F32, { 1024, 1024, 8, 4 }));
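
For context on why the tests set this usage flag: ggml's scheduler uses GGML_BACKEND_BUFFER_USAGE_WEIGHTS to assign operations to the backend whose buffer holds their weights, and backends may take different mul_mat / mul_mat_id code paths for weight buffers, so allocating the a/as operands this way exercises those paths. A small sketch of querying the flag through the public ggml-backend API; the tensor_is_weight helper is illustrative, not part of this commit:

#include "ggml.h"
#include "ggml-backend.h"

// Illustrative helper: true if the tensor was allocated in a buffer that is
// flagged as holding model weights.
static bool tensor_is_weight(const struct ggml_tensor * t) {
    return t->buffer != NULL &&
           ggml_backend_buffer_get_usage(t->buffer) == GGML_BACKEND_BUFFER_USAGE_WEIGHTS;
}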