tests: set tensor usage as weight for weight tensors

only for mul_mat and mul_mat_id ops

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
Aaron Teo 2025-12-14 18:05:51 +08:00
parent 1926e07e1a
commit 61ee32dec3
1 changed file with 116 additions and 9 deletions
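
Before the diff itself, here is the allocation pattern the change applies, pulled out as a minimal standalone sketch. Everything used below is the public ggml API (ggml.h, ggml-alloc.h, ggml-backend.h); the alloc_weights helper and its setup are illustrative, not part of the test harness:

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include <vector>

// Place a set of weight tensors into one dedicated backend buffer that is
// flagged with GGML_BACKEND_BUFFER_USAGE_WEIGHTS. Tensors allocated here are
// skipped by a later ggml_backend_alloc_ctx_tensors() call on the same context.
static ggml_backend_buffer_t alloc_weights(ggml_backend_t backend,
                                           const std::vector<ggml_tensor *> & weights) {
    ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);

    // total size, padding each tensor to the buffer alignment,
    // as ggml_tallocr_alloc does internally
    size_t size = 0;
    for (ggml_tensor * t : weights) {
        size += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t),
                         ggml_backend_buft_get_alignment(buft));
    }

    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, size);
    if (buf == NULL) {
        return NULL; // caller decides how to report the failure
    }
    ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);

    // sub-allocate each tensor from the buffer (sets t->buffer and t->data)
    ggml_tallocr talloc = ggml_tallocr_new(buf);
    for (ggml_tensor * t : weights) {
        ggml_tallocr_alloc(&talloc, t);
    }
    return buf;
}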


@@ -1160,6 +1160,9 @@ struct test_case {
     std::vector<ggml_tensor *> sentinels;

+    // Track weight tensors for separate buffer allocation with GGML_BACKEND_BUFFER_USAGE_WEIGHTS
+    std::vector<ggml_tensor *> weight_tensors;
+
     std::string current_op_name;

     void add_sentinel(ggml_context * ctx) {
@@ -1238,6 +1241,8 @@ struct test_case {
                            const char * op_names_filter,
                            printer * output_printer) {
         mode = MODE_TEST;
+        weight_tensors.clear();
+        sentinels.clear();

         ggml_init_params params = {
             /* .mem_size = */ ggml_tensor_overhead()*128 + ggml_graph_overhead(),
@@ -1288,10 +1293,35 @@ struct test_case {
         // post-graph sentinel
         add_sentinel(ctx);

-        // allocate
+        // allocate weight tensors in a separate buffer with GGML_BACKEND_BUFFER_USAGE_WEIGHTS
+        ggml_backend_buffer_t weights_buf = nullptr;
+        if (!weight_tensors.empty()) {
+            ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend1);
+            // Calculate total size needed for the weight tensors, padding each
+            // allocation to the buffer alignment as ggml_tallocr_alloc does
+            size_t weight_size = 0;
+            for (ggml_tensor * wt : weight_tensors) {
+                weight_size += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, wt), ggml_backend_buft_get_alignment(buft));
+            }
+            weights_buf = ggml_backend_buft_alloc_buffer(buft, weight_size);
+            if (weights_buf == NULL) {
+                printf("failed to allocate weight tensors [%s] ", ggml_backend_name(backend1));
+                ggml_free(ctx);
+                return test_status_t::FAIL;
+            }
+            ggml_backend_buffer_set_usage(weights_buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+
+            // Allocate each weight tensor in the weights buffer
+            ggml_tallocr weights_talloc = ggml_tallocr_new(weights_buf);
+            for (ggml_tensor * wt : weight_tensors) {
+                ggml_tallocr_alloc(&weights_talloc, wt);
+            }
+        }
+
+        // allocate remaining tensors; buf is NULL either on failure or when every
+        // tensor was already placed in weights_buf, so only fail if both are NULL
         ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend1);
-        if (buf == NULL) {
+        if (buf == NULL && weights_buf == NULL) {
             printf("failed to allocate tensors [%s] ", ggml_backend_name(backend1));
             ggml_free(ctx);
             return test_status_t::FAIL;
@@ -1385,6 +1415,9 @@ struct test_case {
         const bool cmp_ok = ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud, run_whole_graph() ? out : nullptr);

+        if (weights_buf) {
+            ggml_backend_buffer_free(weights_buf);
+        }
         ggml_backend_buffer_free(buf);

         ggml_free(ctx);
@ -1404,6 +1437,7 @@ struct test_case {
bool eval_perf(ggml_backend_t backend, const char * op_names_filter, printer * output_printer) { bool eval_perf(ggml_backend_t backend, const char * op_names_filter, printer * output_printer) {
mode = MODE_PERF; mode = MODE_PERF;
weight_tensors.clear();
static const size_t graph_nodes = 8192; static const size_t graph_nodes = 8192;
@@ -1432,10 +1466,34 @@ struct test_case {
             return true;
         }

-        // allocate
+        // allocate weight tensors in a separate buffer with GGML_BACKEND_BUFFER_USAGE_WEIGHTS
+        ggml_backend_buffer_ptr weights_buf(nullptr); // smart ptr
+        if (!weight_tensors.empty()) {
+            ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
+            // Calculate total size needed for the weight tensors, padding each
+            // allocation to the buffer alignment as ggml_tallocr_alloc does
+            size_t weight_size = 0;
+            for (ggml_tensor * wt : weight_tensors) {
+                weight_size += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, wt), ggml_backend_buft_get_alignment(buft));
+            }
+            weights_buf.reset(ggml_backend_buft_alloc_buffer(buft, weight_size));
+            if (weights_buf == NULL) {
+                printf("failed to allocate weight tensors\n");
+                return false;
+            }
+            ggml_backend_buffer_set_usage(weights_buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+
+            // Allocate each weight tensor in the weights buffer
+            ggml_tallocr weights_talloc = ggml_tallocr_new(weights_buf.get());
+            for (ggml_tensor * wt : weight_tensors) {
+                ggml_tallocr_alloc(&weights_talloc, wt);
+            }
+        }
+
+        // allocate remaining tensors
         ggml_backend_buffer_ptr buf(ggml_backend_alloc_ctx_tensors(ctx.get(), backend)); // smart ptr
-        if (buf == NULL) {
+        if (buf == NULL && weights_buf == NULL) {
             printf("failed to allocate tensors\n");
             return false;
         }
@@ -1534,6 +1592,7 @@ struct test_case {
     bool eval_support(ggml_backend_t backend, const char * op_names_filter, printer * output_printer) {
         mode = MODE_SUPPORT;
+        weight_tensors.clear();

         static const size_t graph_nodes = 8192;
@@ -1569,6 +1628,7 @@ struct test_case {
     bool eval_grad(ggml_backend_t backend, const char * op_names_filter, printer * output_printer) {
         mode = MODE_GRAD;
+        weight_tensors.clear();

         const std::vector<float> expect = grad_expect();

         ggml_init_params params = {
@@ -1679,9 +1739,35 @@ struct test_case {
             return true;
         }

-        // allocate
+        // allocate weight tensors in a separate buffer with GGML_BACKEND_BUFFER_USAGE_WEIGHTS
+        ggml_backend_buffer_ptr weights_buf(nullptr); // smart ptr
+        if (!weight_tensors.empty()) {
+            ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
+            // Calculate total size needed for the weight tensors, padding each
+            // allocation to the buffer alignment as ggml_tallocr_alloc does
+            size_t weight_size = 0;
+            for (ggml_tensor * wt : weight_tensors) {
+                weight_size += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, wt), ggml_backend_buft_get_alignment(buft));
+            }
+            weights_buf.reset(ggml_backend_buft_alloc_buffer(buft, weight_size));
+            if (weights_buf == NULL) {
+                test_operation_info info(op_desc(out), vars(), ggml_backend_name(backend));
+                info.set_error("weight allocation", "");
+                output_printer->print_operation(info);
+                return false;
+            }
+            ggml_backend_buffer_set_usage(weights_buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+
+            // Allocate each weight tensor in the weights buffer
+            ggml_tallocr weights_talloc = ggml_tallocr_new(weights_buf.get());
+            for (ggml_tensor * wt : weight_tensors) {
+                ggml_tallocr_alloc(&weights_talloc, wt);
+            }
+        }
+
+        // allocate remaining tensors
         ggml_backend_buffer_ptr buf(ggml_backend_alloc_ctx_tensors(ctx.get(), backend)); // smart ptr
-        if (buf == NULL) {
+        if (buf == NULL && weights_buf == NULL) {
             test_operation_info info(op_desc(out), vars(), ggml_backend_name(backend));
             info.set_error("allocation", "");
             output_printer->print_operation(info);
@ -3606,6 +3692,7 @@ struct test_mul_mat : public test_case {
a = ggml_new_tensor_4d(ctx, type_a, ne_a[per[0]], ne_a[per[1]], ne_a[per[2]], ne_a[per[3]]); a = ggml_new_tensor_4d(ctx, type_a, ne_a[per[0]], ne_a[per[1]], ne_a[per[2]], ne_a[per[3]]);
b = ggml_new_tensor_4d(ctx, type_b, ne_b[per[0]], ne_b[per[1]], ne_b[per[2]], ne_b[per[3]]); b = ggml_new_tensor_4d(ctx, type_b, ne_b[per[0]], ne_b[per[1]], ne_b[per[2]], ne_b[per[3]]);
weight_tensors.push_back(a); // Track weight tensor for GGML_BACKEND_BUFFER_USAGE_WEIGHTS
if (!ggml_is_quantized(type_a)) { if (!ggml_is_quantized(type_a)) {
if (bs[1] == 1 && nr[1] == 1) { if (bs[1] == 1 && nr[1] == 1) {
ggml_set_param(a); ggml_set_param(a);
@@ -3623,6 +3710,7 @@ struct test_mul_mat : public test_case {
             const int64_t k_physical = k_v == 0 ? k : k_v;
             a = ggml_new_tensor_4d(ctx, type_a, k_physical, m, bs[0], bs[1]);
             b = ggml_new_tensor_4d(ctx, type_b, k_physical, n, bs[0]*nr[0], bs[1]*nr[1]);
+            weight_tensors.push_back(a); // Track weight tensor for GGML_BACKEND_BUFFER_USAGE_WEIGHTS
             if (!ggml_is_quantized(type_a)) {
                 if (bs[1] == 1 && nr[1] == 1) {
@ -3716,6 +3804,7 @@ struct test_mul_mat_id : public test_case {
// C^T = A * B^T: (k, m) * (k, n) => (m, n) // C^T = A * B^T: (k, m) * (k, n) => (m, n)
ggml_tensor * as = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats); ggml_tensor * as = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats);
ggml_set_name(as, "as"); ggml_set_name(as, "as");
weight_tensors.push_back(as); // Track weight tensor for GGML_BACKEND_BUFFER_USAGE_WEIGHTS
ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n); ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n);
ggml_set_name(ids, "ids"); ggml_set_name(ids, "ids");
@@ -3776,6 +3865,7 @@ struct test_mul_mat_id_fusion : public test_case {
         // C^T = A * B^T: (k, m) * (k, n) => (m, n)
         ggml_tensor * as = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats);
         ggml_set_name(as, "as");
+        weight_tensors.push_back(as); // Track weight tensor for GGML_BACKEND_BUFFER_USAGE_WEIGHTS

         ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n);
         ggml_set_name(ids, "ids");
@@ -3792,6 +3882,7 @@ struct test_mul_mat_id_fusion : public test_case {
         for (uint32_t i = 1; i < o; ++i) {
             ggml_tensor * a2 = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats);
+            weight_tensors.push_back(a2); // Track weight tensor for GGML_BACKEND_BUFFER_USAGE_WEIGHTS
             ggml_tensor * out2 = ggml_mul_mat_id(ctx, a2, b, ids);
             ggml_set_name(out2, "out2");
             out = ggml_add(ctx, out, out2);
@@ -7861,9 +7952,24 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 30, 30, 7, 1 }, { 8, 30, 7, 1 }));
     test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 42, 42, 5, 2 }, { 10, 42, 5, 2 }));
     test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 2, 2 }, { 10, 64, 2, 2 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 2, 2 }, { 64, 64, 2, 2 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 79, 79, 5, 3 }, { 417, 79, 5, 3 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 2 }, { 32, 128, 4, 2 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 80, 80, 2, 8 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 79, 80, 2, 8 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 81, 80, 2, 8 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 80, 80, 8, 8 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 79, 80, 8, 8 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 81, 80, 8, 8 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 84, 84, 4, 4 }, { 32, 84, 4, 4 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 95, 95, 8, 8 }, { 40, 95, 8, 8 }));
     test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 100, 100, 4, 4 }, { 41, 100, 4, 4 }));
     test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 4 }, { 31, 128, 4, 4 }));
-    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 300, 64, 4, 4 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 4 }, { 32, 128, 4, 4 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 3, 4 }, { 32, 128, 3, 4 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 1 }, { 32, 128, 4, 1 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 200, 64, 4, 4 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 384, 64, 4, 4 }));

     for (bool v : {false, true}) {
         for (bool circular : {false, true}) {
@@ -8064,12 +8170,13 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 16416, 1, 128, {8, 1}, {4, 1}, {0, 2, 1, 3}));
     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 1, 16416, {8, 1}, {4, 1}, {0, 1, 2, 3}, 2*16416));

-    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 2 }, { 6, 64, 4, 2 }));
-    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 1 }, { 8, 128, 4, 1 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 32, 64, 4, 4 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 2 }, { 32, 128, 4, 2 }));
     // qwen3next with CHUNK_SIZE 64
     test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 8, 32 }, { 64, 64, 8, 32 }));
     // qwen3next with CHUNK_SIZE 128
     test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 32 }, { 128, 128, 4, 32 }));
+    test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 256, 256, 4, 2 }, { 128, 256, 4, 2 }));

     test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_LOWER, GGML_TYPE_F32, { 256, 256, 4, 4 }));
     test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_UPPER_DIAG, GGML_TYPE_F32, { 1024, 1024, 8, 4 }));
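
For context on why the tests set this usage flag: ggml's scheduler uses GGML_BACKEND_BUFFER_USAGE_WEIGHTS to assign operations to the backend whose buffer holds their weights, and backends may take different mul_mat / mul_mat_id code paths for weight buffers, so allocating the a/as operands this way exercises those paths. A small sketch of querying the flag through the public ggml-backend API; the tensor_is_weight helper is illustrative, not part of this commit:

#include "ggml.h"
#include "ggml-backend.h"

// Illustrative helper: true if the tensor was allocated in a buffer that is
// flagged as holding model weights.
static bool tensor_is_weight(const struct ggml_tensor * t) {
    return t->buffer != NULL &&
           ggml_backend_buffer_get_usage(t->buffer) == GGML_BACKEND_BUFFER_USAGE_WEIGHTS;
}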