diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 7be1f66038..f876a54b70 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -1160,6 +1160,9 @@ struct test_case {
 
     std::vector<ggml_tensor *> sentinels;
 
+    // Track weight tensors for separate buffer allocation with GGML_BACKEND_BUFFER_USAGE_WEIGHTS
+    std::vector<ggml_tensor *> weight_tensors;
+
     std::string current_op_name;
 
     void add_sentinel(ggml_context * ctx) {
@@ -1238,6 +1241,8 @@ struct test_case {
               const char * op_names_filter, printer * output_printer) {
         mode = MODE_TEST;
 
+        weight_tensors.clear();
+        sentinels.clear();
 
         ggml_init_params params = {
             /* .mem_size = */ ggml_tensor_overhead()*128 + ggml_graph_overhead(),
@@ -1288,10 +1293,35 @@ struct test_case {
         // post-graph sentinel
         add_sentinel(ctx);
 
-        // allocate
+        // allocate weight tensors in a separate buffer with GGML_BACKEND_BUFFER_USAGE_WEIGHTS
+        ggml_backend_buffer_t weights_buf = nullptr;
+        if (!weight_tensors.empty()) {
+            // Calculate total size needed for weight tensors
+            size_t weight_size = 0;
+            for (ggml_tensor * wt : weight_tensors) {
+                // ggml_tallocr_alloc pads each allocation to the buffer alignment, so sum the padded sizes
+                weight_size += GGML_PAD(ggml_backend_buft_get_alloc_size(ggml_backend_get_default_buffer_type(backend1), wt),
+                                        ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend1)));
+            }
+
+            weights_buf = ggml_backend_buft_alloc_buffer(ggml_backend_get_default_buffer_type(backend1), weight_size);
+            if (weights_buf == NULL) {
+                printf("failed to allocate weight tensors [%s] ", ggml_backend_name(backend1));
+                ggml_free(ctx);
+                return test_status_t::FAIL;
+            }
+            ggml_backend_buffer_set_usage(weights_buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+
+            // Allocate each weight tensor in the weights buffer
+            ggml_tallocr weights_talloc = ggml_tallocr_new(weights_buf);
+            for (ggml_tensor * wt : weight_tensors) {
+                ggml_tallocr_alloc(&weights_talloc, wt);
+            }
+        }
+
+        // allocate remaining tensors
         ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend1);
-        if (buf == NULL) {
+        if (buf == NULL && weights_buf == NULL) {
             printf("failed to allocate tensors [%s] ", ggml_backend_name(backend1));
             ggml_free(ctx);
             return test_status_t::FAIL;
@@ -1385,6 +1415,9 @@ struct test_case {
 
         const bool cmp_ok = ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud, run_whole_graph() ? out : nullptr);
 
+        if (weights_buf) {
+            ggml_backend_buffer_free(weights_buf);
+        }
         ggml_backend_buffer_free(buf);
 
         ggml_free(ctx);
@@ -1404,6 +1437,7 @@ struct test_case {
     bool eval_perf(ggml_backend_t backend, const char * op_names_filter, printer * output_printer) {
         mode = MODE_PERF;
+        weight_tensors.clear();
 
         static const size_t graph_nodes = 8192;
@@ -1432,10 +1466,34 @@ struct test_case {
             return true;
         }
 
-        // allocate
+        // allocate weight tensors in a separate buffer with GGML_BACKEND_BUFFER_USAGE_WEIGHTS
+        ggml_backend_buffer_ptr weights_buf(nullptr); // smart ptr
+        if (!weight_tensors.empty()) {
+            // Calculate total size needed for weight tensors
+            size_t weight_size = 0;
+            for (ggml_tensor * wt : weight_tensors) {
+                // ggml_tallocr_alloc pads each allocation to the buffer alignment, so sum the padded sizes
+                weight_size += GGML_PAD(ggml_backend_buft_get_alloc_size(ggml_backend_get_default_buffer_type(backend), wt),
+                                        ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend)));
+            }
+
+            weights_buf.reset(ggml_backend_buft_alloc_buffer(ggml_backend_get_default_buffer_type(backend), weight_size));
+            if (weights_buf == NULL) {
+                printf("failed to allocate weight tensors\n");
+                return false;
+            }
+            ggml_backend_buffer_set_usage(weights_buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+
+            // Allocate each weight tensor in the weights buffer
+            ggml_tallocr weights_talloc = ggml_tallocr_new(weights_buf.get());
+            for (ggml_tensor * wt : weight_tensors) {
+                ggml_tallocr_alloc(&weights_talloc, wt);
+            }
+        }
+
+        // allocate remaining tensors
         ggml_backend_buffer_ptr buf(ggml_backend_alloc_ctx_tensors(ctx.get(), backend)); // smart ptr
-        if (buf == NULL) {
+        if (buf == NULL && weights_buf == NULL) {
             printf("failed to allocate tensors\n");
             return false;
         }
@@ -1534,6 +1592,7 @@ struct test_case {
     bool eval_support(ggml_backend_t backend, const char * op_names_filter, printer * output_printer) {
         mode = MODE_SUPPORT;
+        weight_tensors.clear();
 
         static const size_t graph_nodes = 8192;
@@ -1569,6 +1628,7 @@ struct test_case {
     bool eval_grad(ggml_backend_t backend, const char * op_names_filter, printer * output_printer) {
         mode = MODE_GRAD;
+        weight_tensors.clear();
 
         const std::vector<float> expect = grad_expect();
         ggml_init_params params = {
@@ -1679,9 +1739,35 @@ struct test_case {
             return true;
         }
 
-        // allocate
+        // allocate weight tensors in a separate buffer with GGML_BACKEND_BUFFER_USAGE_WEIGHTS
+        ggml_backend_buffer_ptr weights_buf(nullptr); // smart ptr
+        if (!weight_tensors.empty()) {
+            // Calculate total size needed for weight tensors
+            size_t weight_size = 0;
+            for (ggml_tensor * wt : weight_tensors) {
+                // ggml_tallocr_alloc pads each allocation to the buffer alignment, so sum the padded sizes
+                weight_size += GGML_PAD(ggml_backend_buft_get_alloc_size(ggml_backend_get_default_buffer_type(backend), wt),
+                                        ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend)));
+            }
+
+            weights_buf.reset(ggml_backend_buft_alloc_buffer(ggml_backend_get_default_buffer_type(backend), weight_size));
+            if (weights_buf == NULL) {
+                test_operation_info info(op_desc(out), vars(), ggml_backend_name(backend));
+                info.set_error("weight allocation", "");
+                output_printer->print_operation(info);
+                return false;
+            }
+            ggml_backend_buffer_set_usage(weights_buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+
+            // Allocate each weight tensor in the weights buffer
+            ggml_tallocr weights_talloc = ggml_tallocr_new(weights_buf.get());
+            for (ggml_tensor * wt : weight_tensors) {
+                ggml_tallocr_alloc(&weights_talloc, wt);
+            }
+        }
+
+        // allocate remaining tensors
         ggml_backend_buffer_ptr buf(ggml_backend_alloc_ctx_tensors(ctx.get(), backend)); // smart ptr
-        if (buf == NULL) {
+        if (buf == NULL && weights_buf == NULL) {
             test_operation_info info(op_desc(out), vars(), ggml_backend_name(backend));
             info.set_error("allocation", "");
             output_printer->print_operation(info);
@@ -3606,6 +3692,7 @@ struct test_mul_mat : public test_case {
             a = ggml_new_tensor_4d(ctx, type_a, ne_a[per[0]], ne_a[per[1]], ne_a[per[2]], ne_a[per[3]]);
             b = ggml_new_tensor_4d(ctx, type_b, ne_b[per[0]], ne_b[per[1]], ne_b[per[2]], ne_b[per[3]]);
+            weight_tensors.push_back(a); // Track weight tensor for GGML_BACKEND_BUFFER_USAGE_WEIGHTS
             if (!ggml_is_quantized(type_a)) {
                 if (bs[1] == 1 && nr[1] == 1) {
                     ggml_set_param(a);
@@ -3623,6 +3710,7 @@ struct test_mul_mat : public test_case {
             const int64_t k_physical = k_v == 0 ? k : k_v;
             a = ggml_new_tensor_4d(ctx, type_a, k_physical, m, bs[0], bs[1]);
             b = ggml_new_tensor_4d(ctx, type_b, k_physical, n, bs[0]*nr[0], bs[1]*nr[1]);
+            weight_tensors.push_back(a); // Track weight tensor for GGML_BACKEND_BUFFER_USAGE_WEIGHTS
 
             if (!ggml_is_quantized(type_a)) {
                 if (bs[1] == 1 && nr[1] == 1) {
@@ -3716,6 +3804,7 @@ struct test_mul_mat_id : public test_case {
         // C^T = A * B^T: (k, m) * (k, n) => (m, n)
         ggml_tensor * as = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats);
         ggml_set_name(as, "as");
+        weight_tensors.push_back(as); // Track weight tensor for GGML_BACKEND_BUFFER_USAGE_WEIGHTS
 
         ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n);
         ggml_set_name(ids, "ids");
@@ -3776,6 +3865,7 @@ struct test_mul_mat_id_fusion : public test_case {
         // C^T = A * B^T: (k, m) * (k, n) => (m, n)
         ggml_tensor * as = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats);
         ggml_set_name(as, "as");
+        weight_tensors.push_back(as); // Track weight tensor for GGML_BACKEND_BUFFER_USAGE_WEIGHTS
 
         ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n);
         ggml_set_name(ids, "ids");
@@ -3792,6 +3882,7 @@ struct test_mul_mat_id_fusion : public test_case {
 
         for (uint32_t i = 1; i < o; ++i) {
             ggml_tensor * a2 = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats);
+            weight_tensors.push_back(a2); // Track weight tensor for GGML_BACKEND_BUFFER_USAGE_WEIGHTS
             ggml_tensor * out2 = ggml_mul_mat_id(ctx, a2, b, ids);
             ggml_set_name(out2, "out2");
             out = ggml_add(ctx, out, out2);
@@ -7861,9 +7952,24 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 30, 30, 7, 1 }, { 8, 30, 7, 1 }));
     test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 42, 42, 5, 2 }, { 10, 42, 5, 2 }));
     test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 2, 2 }, { 10, 64, 2, 2 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 2, 2 }, { 64, 64, 2, 2 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 79, 79, 5, 3 }, { 417, 79, 5, 3 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 2 }, { 32, 128, 4, 2 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 80, 80, 2, 8 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 79, 80, 2, 8 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 81, 80, 2, 8 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 80, 80, 8, 8 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 79, 80, 8, 8 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 81, 80, 8, 8 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 84, 84, 4, 4 }, { 32, 84, 4, 4 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 95, 95, 8, 8 }, { 40, 95, 8, 8 }));
     test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 100, 100, 4, 4 }, { 41, 100, 4, 4 }));
     test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 4 }, { 31, 128, 4, 4 }));
-    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 300, 64, 4, 4 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 4 }, { 32, 128, 4, 4 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 3, 4 }, { 32, 128, 3, 4 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 1 }, { 32, 128, 4, 1 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 200, 64, 4, 4 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 384, 64, 4, 4 }));
 
     for (bool v : {false, true}) {
         for (bool circular : {false, true}) {
@@ -8064,12 +8170,13 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 16416, 1, 128, {8, 1}, {4, 1}, {0, 2, 1, 3}));
     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 1, 16416, {8, 1}, {4, 1}, {0, 1, 2, 3}, 2*16416));
 
-    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 2 }, { 6, 64, 4, 2 }));
-    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 1 }, { 8, 128, 4, 1 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 32, 64, 4, 4 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 2 }, { 32, 128, 4, 2 }));
     // qwen3next with CHUNK_SIZE 64
     test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 8, 32 }, { 64, 64, 8, 32 }));
     // qwen3next with CHUNK_SIZE 128
     test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 32 }, { 128, 128, 4, 32 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 256, 256, 4, 2 }, { 128, 256, 4, 2 }));
 
     test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_LOWER, GGML_TYPE_F32, { 256, 256, 4, 4 }));
     test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_UPPER_DIAG, GGML_TYPE_F32, { 1024, 1024, 8, 4 }));
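
Note: the weights-buffer allocation block above is repeated nearly verbatim in eval(), eval_perf() and eval_grad(). A follow-up could factor it into a shared helper; below is a minimal sketch of that idea. This is an illustration only: alloc_weights_buffer is a hypothetical name, not part of this patch, and the sketch uses only the ggml-backend/ggml-alloc calls the patch already relies on.

    // Sketch only; assumes the headers test-backend-ops.cpp already includes
    // ("ggml.h", "ggml-alloc.h", "ggml-backend.h") plus <vector>.
    // Returns nullptr when there are no weight tensors or when allocation fails;
    // callers that need to tell the two cases apart check weight_tensors.empty() first.
    static ggml_backend_buffer_t alloc_weights_buffer(ggml_backend_t backend,
                                                      const std::vector<ggml_tensor *> & weight_tensors) {
        if (weight_tensors.empty()) {
            return nullptr;
        }

        ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
        const size_t alignment = ggml_backend_buft_get_alignment(buft);

        // ggml_tallocr_alloc rounds every allocation up to the buffer-type
        // alignment, so the buffer must hold the sum of the padded sizes
        size_t size = 0;
        for (ggml_tensor * wt : weight_tensors) {
            size += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, wt), alignment);
        }

        ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, size);
        if (buf == nullptr) {
            return nullptr; // caller reports the failure in its own format
        }

        // mark the buffer so backends treat its contents as model weights
        ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);

        // place each tracked weight tensor inside the buffer
        ggml_tallocr talloc = ggml_tallocr_new(buf);
        for (ggml_tensor * wt : weight_tensors) {
            ggml_tallocr_alloc(&talloc, wt);
        }
        return buf;
    }

Each call site would then reduce to weights_buf.reset(alloc_weights_buffer(backend, weight_tensors)); (or a plain assignment in eval()), followed by its mode-specific error reporting.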