Fix mul_mat_id stats, add throughput stat, add env-var trigger, fix concurrent-mode reporting

This commit is contained in:
Piotr Wilkin 2026-03-29 22:52:33 +02:00
parent 64e438465f
commit 9da7e4e74f
12 changed files with 459 additions and 93 deletions

View File

@ -251,11 +251,17 @@ int main(int argc, char ** argv) {
if (params.profiling_output.empty()) {
ggml_backend_sched_print_profiling(sched);
} else {
int ret = ggml_backend_sched_export_profiling_json(sched, params.profiling_output.c_str());
if (ret == 0) {
LOG("\nProfiling data exported to: %s\n", params.profiling_output.c_str());
const std::string & path = params.profiling_output;
int ret;
if (path.size() >= 4 && path.compare(path.size() - 4, 4, ".txt") == 0) {
ret = ggml_backend_sched_export_profiling_text(sched, path.c_str());
} else {
LOG_ERR("\nFailed to export profiling data to: %s\n", params.profiling_output.c_str());
ret = ggml_backend_sched_export_profiling_json(sched, path.c_str());
}
if (ret == 0) {
LOG("\nProfiling data exported to: %s\n", path.c_str());
} else {
LOG_ERR("\nFailed to export profiling data to: %s\n", path.c_str());
}
}
}

View File

@ -28,7 +28,7 @@ extern "C" {
void * profiling_context;
// callback for recording a profile record from C code (set by backend when profiling)
// params: context, type, name, split_id, start_ns, end_ns, bytes, extra, ne_src0[4], ne_src1[4]
// params: context, type, name, split_id, start_ns, end_ns, bytes, extra, ne_src0[4], ne_src1[4], ne_src2[4]
void (*profiling_record_fn)(void * context,
int type,
const char * name,
@ -38,7 +38,8 @@ extern "C" {
uint64_t bytes,
const char * extra,
const int64_t ne_src0[4],
const int64_t ne_src1[4]);
const int64_t ne_src1[4],
const int64_t ne_src2[4]);
};
// numa strategies

View File

@ -29,6 +29,7 @@ typedef struct ggml_profile_record {
const char * extra; // fusion name for fused ops, or NULL
int64_t ne_src0[4]; // src[0] tensor dimensions (e.g. weight matrix for MUL_MAT)
int64_t ne_src1[4]; // src[1] tensor dimensions (e.g. input matrix for MUL_MAT)
int64_t ne_src2[4]; // src[2] tensor dimensions (e.g. ids for MUL_MAT_ID)
} ggml_profile_record;
// Backend profiler interface - each backend optionally implements this
@ -99,6 +100,13 @@ GGML_API int ggml_backend_sched_export_profiling_json(ggml_backend_sched_t sched
// Export profiling data as JSON to a FILE pointer
GGML_API int ggml_backend_sched_write_profiling_json(ggml_backend_sched_t sched, FILE * fp);
// Export profiling data as plain text statistics to a file
// Returns 0 on success, -1 on error
GGML_API int ggml_backend_sched_export_profiling_text(ggml_backend_sched_t sched, const char * filepath);
// Export profiling data as plain text statistics to a FILE pointer
GGML_API int ggml_backend_sched_write_profiling_text(ggml_backend_sched_t sched, FILE * fp);
#ifdef __cplusplus
}
#endif

View File

@ -21,6 +21,7 @@
#include <stdlib.h>
#include <string.h>
#include <algorithm>
#include <string>
#include <vector>
#ifdef __APPLE__
@ -749,8 +750,17 @@ struct ggml_backend_sched {
// profiling
bool profiling_enabled;
std::vector<ggml_profile_record> copy_records; // copy events recorded by the scheduler
std::vector<ggml_profile_record> profiling_records; // merged records from all sources
std::string profiling_env_path; // GGML_PROFILE env var value (for auto-export on free)
std::vector<ggml_profile_record> copy_records; // copy events recorded by the scheduler
std::vector<ggml_profile_record> profiling_records; // merged records from all sources
// Cached backend metadata for safe access during auto-export (backends may be freed first)
struct backend_meta {
std::string name;
std::string device;
int device_type;
};
std::vector<backend_meta> profiling_backend_meta;
};
#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
@ -1515,7 +1525,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
}
sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id, split_id,
copy_start, copy_end, ggml_nbytes(input), NULL, {0}, {0} });
copy_start, copy_end, ggml_nbytes(input), NULL, {0}, {0}, {0} });
} else {
ggml_backend_tensor_copy(input, input_cpy);
}
@ -1635,7 +1645,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id,
split_id, moe_copy_start, moe_copy_end,
(uint64_t) total_copied_bytes, NULL, {0}, {0} });
(uint64_t) total_copied_bytes, NULL, {0}, {0}, {0} });
}
} else {
// try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
@ -1671,7 +1681,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
}
sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id,
split_id, copy_start, copy_end, ggml_nbytes(input), NULL, {0}, {0} });
split_id, copy_start, copy_end, ggml_nbytes(input), NULL, {0}, {0}, {0} });
} else {
ggml_backend_tensor_copy(input, input_cpy);
}
@ -1691,7 +1701,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
}
sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id,
split_id, copy_start, copy_end, ggml_nbytes(input), NULL, {0}, {0} });
split_id, copy_start, copy_end, ggml_nbytes(input), NULL, {0}, {0}, {0} });
}
}
}
@ -1837,7 +1847,24 @@ ggml_backend_sched_t ggml_backend_sched_new(
sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
sched->op_offload = op_offload;
sched->profiling_enabled = (getenv("GGML_PROFILE") != NULL);
const char * profile_env = getenv("GGML_PROFILE");
if (profile_env != NULL) {
sched->profiling_enabled = true;
sched->profiling_env_path = profile_env;
}
// Cache backend metadata for safe access during auto-export
for (int b = 0; b < n_backends; b++) {
ggml_backend_sched::backend_meta meta;
meta.name = ggml_backend_name(backends[b]);
meta.device = "unknown";
meta.device_type = 0;
if (backends[b]->device != NULL) {
meta.device = ggml_backend_dev_name(backends[b]->device);
meta.device_type = (int) ggml_backend_dev_type(backends[b]->device);
}
sched->profiling_backend_meta.push_back(std::move(meta));
}
ggml_backend_sched_reset(sched);
@ -1848,6 +1875,33 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
if (sched == NULL) {
return;
}
// Auto-export profiling data if enabled via GGML_PROFILE env var
// GGML_PROFILE=1 or GGML_PROFILE="" → print to stdout
// GGML_PROFILE=file.json → export JSON
// GGML_PROFILE=file.txt → export text
if (!sched->profiling_records.empty() && getenv("GGML_PROFILE") != NULL) {
const std::string & path = sched->profiling_env_path;
if (path.empty() || path == "1" || path == "stdout") {
ggml_backend_sched_print_profiling(sched);
} else if (path.size() >= 4 && path.compare(path.size() - 4, 4, ".txt") == 0) {
int ret = ggml_backend_sched_export_profiling_text(sched, path.c_str());
if (ret == 0) {
GGML_LOG_INFO("[profiler] Data exported to: %s\n", path.c_str());
} else {
GGML_LOG_ERROR("[profiler] Failed to export data to: %s\n", path.c_str());
}
} else {
// Default to JSON for any other path (including .json)
int ret = ggml_backend_sched_export_profiling_json(sched, path.c_str());
if (ret == 0) {
GGML_LOG_INFO("[profiler] Data exported to: %s\n", path.c_str());
} else {
GGML_LOG_ERROR("[profiler] Failed to export data to: %s\n", path.c_str());
}
}
}
for (int b = 0; b < sched->n_backends; b++) {
for (int c = 0; c < sched->n_copies; c++) {
ggml_backend_event_free(sched->events[b][c]);
@ -2518,18 +2572,16 @@ void ggml_backend_sched_print_profiling(ggml_backend_sched_t sched) {
double min_us = (double) s.min_ns / 1000.0;
double max_us = (double) s.max_ns / 1000.0;
if (s.type == GGML_PROFILE_EVENT_COPY) {
GGML_LOG_INFO(
" [%s] backend %d %-28s %7.1f%% count=%-6d total=%8.2f ms avg=%8.2f us min=%8.2f us max=%8.2f us",
type_str[s.type], s.backend_id, s.name, pct, s.count, (double) s.total_ns / 1e6, avg_us, min_us, max_us);
if (s.total_bytes > 0 && s.total_ns > 0) {
double bw_gbps = (double) s.total_bytes / (double) s.total_ns;
GGML_LOG_INFO(
" [%s] backend %d %-28s %7.1f%% count=%-6d total=%8.2f ms avg=%8.2f us min=%8.2f us max=%8.2f us "
" %8.2f GB/s",
type_str[s.type], s.backend_id, s.name, pct, s.count, (double) s.total_ns / 1e6, avg_us, min_us, max_us,
bw_gbps);
} else {
GGML_LOG_INFO(
" [%s] backend %d %-28s %7.1f%% count=%-6d total=%8.2f ms avg=%8.2f us min=%8.2f us max=%8.2f us",
type_str[s.type], s.backend_id, s.name, pct, s.count, (double) s.total_ns / 1e6, avg_us, min_us,
max_us);
if (bw_gbps >= 1000.0) {
GGML_LOG_INFO(" %6.2f TB/s", bw_gbps / 1000.0);
} else {
GGML_LOG_INFO(" %6.2f GB/s", bw_gbps);
}
}
// Print representative tensor shape (first record's ne)
if (s.representative_ne[0] > 0 || s.representative_ne[1] > 0) {
@ -2565,16 +2617,22 @@ int ggml_backend_sched_write_profiling_json(ggml_backend_sched_t sched, FILE * f
fprintf(fp, " \"total_records\": %d,\n", (int) sched->profiling_records.size());
fprintf(fp, " \"total_ns\": %llu,\n", (unsigned long long) total_ns);
// Backend metadata
// Backend metadata (use cached data if available, fall back to live pointers)
fprintf(fp, " \"backends\": [\n");
for (int b = 0; b < sched->n_backends; b++) {
ggml_backend_t backend = sched->backends[b];
const char * name = ggml_backend_name(backend);
const char * dev_name = "unknown";
int dev_type = 0;
if (backend->device != NULL) {
dev_name = ggml_backend_dev_name(backend->device);
dev_type = (int) ggml_backend_dev_type(backend->device);
const char * name = "unknown";
const char * dev_name = "unknown";
int dev_type = 0;
if (b < (int) sched->profiling_backend_meta.size()) {
name = sched->profiling_backend_meta[b].name.c_str();
dev_name = sched->profiling_backend_meta[b].device.c_str();
dev_type = sched->profiling_backend_meta[b].device_type;
} else if (sched->backends[b] != NULL) {
name = ggml_backend_name(sched->backends[b]);
if (sched->backends[b]->device != NULL) {
dev_name = ggml_backend_dev_name(sched->backends[b]->device);
dev_type = (int) ggml_backend_dev_type(sched->backends[b]->device);
}
}
fprintf(fp, " {\"id\": %d, \"name\": \"%s\", \"device\": \"%s\", \"device_type\": %d}%s\n", b, name,
dev_name, dev_type, (b < sched->n_backends - 1) ? "," : "");
@ -2600,11 +2658,13 @@ int ggml_backend_sched_write_profiling_json(ggml_backend_sched_t sched, FILE * f
fprintf(fp, "null");
}
// Tensor dimensions (both source tensors)
// Tensor dimensions (all source tensors)
fprintf(fp, ", \"ne_src0\": [%lld, %lld, %lld, %lld]", (long long) rec.ne_src0[0], (long long) rec.ne_src0[1],
(long long) rec.ne_src0[2], (long long) rec.ne_src0[3]);
fprintf(fp, ", \"ne_src1\": [%lld, %lld, %lld, %lld]", (long long) rec.ne_src1[0], (long long) rec.ne_src1[1],
(long long) rec.ne_src1[2], (long long) rec.ne_src1[3]);
fprintf(fp, ", \"ne_src2\": [%lld, %lld, %lld, %lld]", (long long) rec.ne_src2[0], (long long) rec.ne_src2[1],
(long long) rec.ne_src2[2], (long long) rec.ne_src2[3]);
fprintf(fp, "}%s\n", (i < (int) sched->profiling_records.size() - 1) ? "," : "");
}
@ -2630,3 +2690,254 @@ int ggml_backend_sched_export_profiling_json(ggml_backend_sched_t sched, const c
return ret;
}
// Helper: format ne dimensions as string, e.g. "[4096, 4096, 1]"
// Writes an empty string for an all-zero ne (no shape recorded). Trailing
// dimensions of size <= 1 are dropped, but at least one dimension is printed.
// Output is truncated safely when it does not fit in bufsize bytes: the
// original code could advance pos past bufsize on truncation, making
// `bufsize - pos` wrap around as size_t and write out of bounds.
static void fmt_ne(char * buf, size_t bufsize, const int64_t ne[4]) {
    if (ne[0] == 0 && ne[1] == 0 && ne[2] == 0 && ne[3] == 0) {
        buf[0] = '\0';
        return;
    }
    // drop trailing singleton/empty dims, always keep ne[0]
    int ndims = 4;
    while (ndims > 1 && ne[ndims - 1] <= 1) {
        ndims--;
    }
    int pos = snprintf(buf, bufsize, "[");
    for (int i = 0; i < ndims; i++) {
        // snprintf returns the would-be length, so pos may reach or pass the
        // buffer end on truncation; stop before indexing out of bounds
        if (pos < 0 || pos >= (int) bufsize - 1) {
            return; // truncated: buffer is already NUL-terminated by snprintf
        }
        int n = snprintf(buf + pos, bufsize - pos, "%s%lld", i > 0 ? ", " : "", (long long) ne[i]);
        if (n < 0) {
            return; // encoding error: keep whatever was written so far
        }
        pos += n;
    }
    if (pos >= 0 && pos < (int) bufsize) {
        snprintf(buf + pos, bufsize - pos, "]");
    }
}
// Helper: format bandwidth as string
// Leaves buf empty when there is nothing meaningful to report (zero bytes or
// zero elapsed time); callers test buf[0] to decide whether to print it.
static void fmt_bandwidth(char * buf, size_t bufsize, uint64_t bytes, uint64_t ns) {
    if (bytes == 0 || ns == 0) {
        buf[0] = '\0';
        return;
    }
    // bytes per nanosecond is numerically equal to GB/s
    const double gbps    = (double) bytes / (double) ns;
    const bool   use_tb  = gbps >= 1000.0;
    snprintf(buf, bufsize, "%.2f %s", use_tb ? gbps / 1000.0 : gbps, use_tb ? "TB/s" : "GB/s");
}
// Write aggregated profiling statistics as human-readable plain text to fp.
// Emits three sections: an overall summary, a per-backend breakdown, and a
// per-operation table sorted by total time. Returns 0 on success, including
// the no-data case (a placeholder line is printed instead).
// NOTE(review): fp is assumed to be open for writing; fprintf errors are not
// checked here.
int ggml_backend_sched_write_profiling_text(ggml_backend_sched_t sched, FILE * fp) {
GGML_ASSERT(sched);
GGML_ASSERT(fp != NULL);
if (sched->profiling_records.empty()) {
fprintf(fp, "No profiling data available.\n");
return 0;
}
// Aggregate by (name, type, backend_id)
struct op_stats {
const char * name;                     // op name; borrowed pointer into the record, not owned
enum ggml_profile_event_type type;     // OP vs COPY event
int backend_id;
uint64_t total_ns;
uint64_t min_ns;
uint64_t max_ns;
int count;
uint64_t total_bytes;                  // summed bytes, used for throughput/bandwidth
int64_t representative_ne_src0[4];     // shapes from the first record seen for this key
int64_t representative_ne_src1[4];
int64_t representative_ne_src2[4];
};
// Linear-scan aggregation: O(records * unique keys). The number of unique
// (name, type, backend) keys is small in practice, so no map is needed.
std::vector<op_stats> stats;
for (const auto & rec : sched->profiling_records) {
// guard against clock skew producing end < start
uint64_t dur = (rec.end_ns > rec.start_ns) ? (rec.end_ns - rec.start_ns) : 0;
bool found = false;
for (auto & s : stats) {
if (s.type == rec.type && s.backend_id == rec.backend_id && strcmp(s.name, rec.name) == 0) {
s.total_ns += dur;
s.min_ns = std::min(s.min_ns, dur);
s.max_ns = std::max(s.max_ns, dur);
s.count++;
s.total_bytes += rec.bytes;
found = true;
break;
}
}
if (!found) {
// first record for this key: seed all stats from it
op_stats s = {};
s.name = rec.name;
s.type = rec.type;
s.backend_id = rec.backend_id;
s.total_ns = dur;
s.min_ns = dur;
s.max_ns = dur;
s.count = 1;
s.total_bytes = rec.bytes;
memcpy(s.representative_ne_src0, rec.ne_src0, sizeof(s.representative_ne_src0));
memcpy(s.representative_ne_src1, rec.ne_src1, sizeof(s.representative_ne_src1));
memcpy(s.representative_ne_src2, rec.ne_src2, sizeof(s.representative_ne_src2));
stats.push_back(s);
}
}
// slowest ops first
std::sort(stats.begin(), stats.end(),
[](const op_stats & a, const op_stats & b) { return a.total_ns > b.total_ns; });
uint64_t grand_total = 0;
for (const auto & s : stats) {
grand_total += s.total_ns;
}
// --- Section 1: Overall summary ---
fprintf(fp, "=== Profiling Summary ===\n");
fprintf(fp, "Total time: %.2f ms\n", (double) grand_total / 1e6);
fprintf(fp, "Total records: %d\n", (int) sched->profiling_records.size());
fprintf(fp, "Unique ops: %d\n\n", (int) stats.size());
// --- Section 2: Per-backend breakdown ---
fprintf(fp, "=== Per-Backend Summary ===\n");
{
// second-level aggregation: fold the per-op stats down to one row per
// backend, split into compute (OP) and transfer (COPY) totals
struct backend_stats {
int backend_id;
int op_count;
int copy_count;
uint64_t op_ns;
uint64_t copy_ns;
uint64_t op_bytes;
uint64_t copy_bytes;
};
std::vector<backend_stats> bstats;
for (const auto & s : stats) {
bool found = false;
for (auto & bs : bstats) {
if (bs.backend_id == s.backend_id) {
// any non-OP event is counted as a copy here
if (s.type == GGML_PROFILE_EVENT_OP) {
bs.op_count += s.count;
bs.op_ns += s.total_ns;
bs.op_bytes += s.total_bytes;
} else {
bs.copy_count += s.count;
bs.copy_ns += s.total_ns;
bs.copy_bytes += s.total_bytes;
}
found = true;
break;
}
}
if (!found) {
backend_stats bs = {};
bs.backend_id = s.backend_id;
if (s.type == GGML_PROFILE_EVENT_OP) {
bs.op_count = s.count;
bs.op_ns = s.total_ns;
bs.op_bytes = s.total_bytes;
} else {
bs.copy_count = s.count;
bs.copy_ns = s.total_ns;
bs.copy_bytes = s.total_bytes;
}
bstats.push_back(bs);
}
}
// busiest backends first (compute + copy time combined)
std::sort(bstats.begin(), bstats.end(),
[](const backend_stats & a, const backend_stats & b) {
return (a.op_ns + a.copy_ns) > (b.op_ns + b.copy_ns);
});
for (const auto & bs : bstats) {
uint64_t total = bs.op_ns + bs.copy_ns;
double pct = grand_total > 0 ? 100.0 * (double) total / (double) grand_total : 0;
// prefer cached metadata: live backend pointers may already be freed
// when this runs during auto-export at sched free time
const char * bname = "unknown";
if (bs.backend_id >= 0 && bs.backend_id < (int) sched->profiling_backend_meta.size()) {
bname = sched->profiling_backend_meta[bs.backend_id].name.c_str();
} else if (bs.backend_id >= 0 && bs.backend_id < sched->n_backends && sched->backends[bs.backend_id] != NULL) {
bname = ggml_backend_name(sched->backends[bs.backend_id]);
}
fprintf(fp, " Backend %d (%s): %.2f ms (%.1f%%)\n", bs.backend_id, bname, (double) total / 1e6, pct);
if (bs.op_count > 0) {
char bw_buf[32];
fmt_bandwidth(bw_buf, sizeof(bw_buf), bs.op_bytes, bs.op_ns);
fprintf(fp, " OPs: %d calls, %.2f ms", bs.op_count, (double) bs.op_ns / 1e6);
if (bw_buf[0]) {
fprintf(fp, ", %s", bw_buf);
}
fprintf(fp, "\n");
}
if (bs.copy_count > 0) {
char bw_buf[32];
fmt_bandwidth(bw_buf, sizeof(bw_buf), bs.copy_bytes, bs.copy_ns);
fprintf(fp, " COPYs: %d calls, %.2f ms", bs.copy_count, (double) bs.copy_ns / 1e6);
if (bw_buf[0]) {
fprintf(fp, ", %s", bw_buf);
}
fprintf(fp, "\n");
}
}
}
fprintf(fp, "\n");
// --- Section 3: Detailed operation table ---
fprintf(fp, "=== Operations (sorted by total time) ===\n");
fprintf(fp, "%-5s %4s %-28s %7s %6s %10s %10s %10s %10s %12s %s\n",
"TYPE", "BKND", "Operation", "%Time", "Count", "Total(ms)", "Avg(us)", "Min(us)", "Max(us)", "Bandwidth", "Tensors");
fprintf(fp, "%-5s %4s %-28s %7s %6s %10s %10s %10s %10s %12s %s\n",
"-----", "----", "----------------------------", "-------", "------",
"----------", "----------", "----------", "----------", "------------", "-------");
// indexed by ggml_profile_event_type; assumes only OP=0 and COPY=1 occur
const char * type_str[] = { "OP", "COPY" };
for (const auto & s : stats) {
double pct = grand_total > 0 ? 100.0 * (double) s.total_ns / (double) grand_total : 0;
double avg_us = (double) s.total_ns / (double) s.count / 1000.0;
double min_us = (double) s.min_ns / 1000.0;
double max_us = (double) s.max_ns / 1000.0;
char bw_buf[32] = "";
fmt_bandwidth(bw_buf, sizeof(bw_buf), s.total_bytes, s.total_ns);
char ne0_buf[64];
char ne1_buf[64];
char ne2_buf[64];
fmt_ne(ne0_buf, sizeof(ne0_buf), s.representative_ne_src0);
fmt_ne(ne1_buf, sizeof(ne1_buf), s.representative_ne_src1);
fmt_ne(ne2_buf, sizeof(ne2_buf), s.representative_ne_src2);
// Build tensor shapes string, e.g. "[4096, 4096] x [4096, 1] x [8, 1]"
char tensors_buf[256] = "";
int tpos = 0;
if (ne0_buf[0]) {
tpos += snprintf(tensors_buf + tpos, sizeof(tensors_buf) - tpos, "%s", ne0_buf);
}
if (ne1_buf[0]) {
tpos += snprintf(tensors_buf + tpos, sizeof(tensors_buf) - tpos, " x %s", ne1_buf);
}
if (ne2_buf[0]) {
tpos += snprintf(tensors_buf + tpos, sizeof(tensors_buf) - tpos, " x %s", ne2_buf);
}
fprintf(fp, "%-5s %4d %-28s %6.1f%% %6d %10.2f %10.2f %10.2f %10.2f %12s %s\n",
type_str[s.type], s.backend_id, s.name, pct, s.count,
(double) s.total_ns / 1e6, avg_us, min_us, max_us,
bw_buf, tensors_buf);
}
fprintf(fp, "\nTotal: %.2f ms (%d records, %d unique ops)\n", (double) grand_total / 1e6,
(int) sched->profiling_records.size(), (int) stats.size());
return 0;
}
// Export plain-text profiling statistics to the given file path.
// Returns 0 on success, -1 if the file could not be opened or the
// underlying writer reported an error.
int ggml_backend_sched_export_profiling_text(ggml_backend_sched_t sched, const char * filepath) {
    GGML_ASSERT(sched);
    GGML_ASSERT(filepath != NULL);
    FILE * out = fopen(filepath, "w");
    if (!out) {
        GGML_LOG_ERROR("%s: failed to open %s for writing\n", __func__, filepath);
        return -1;
    }
    const int result = ggml_backend_sched_write_profiling_text(sched, out);
    fclose(out);
    return result;
}

View File

@ -276,6 +276,7 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend,
rec.extra = NULL;
if (node->src[0]) { memcpy(rec.ne_src0, node->src[0]->ne, sizeof(rec.ne_src0)); } else { memset(rec.ne_src0, 0, sizeof(rec.ne_src0)); }
if (node->src[1]) { memcpy(rec.ne_src1, node->src[1]->ne, sizeof(rec.ne_src1)); } else { memset(rec.ne_src1, 0, sizeof(rec.ne_src1)); }
if (node->op == GGML_OP_MUL_MAT_ID && node->src[2]) { memcpy(rec.ne_src2, node->src[2]->ne, sizeof(rec.ne_src2)); } else { memset(rec.ne_src2, 0, sizeof(rec.ne_src2)); }
ctx->profiling_records.push_back(rec);
}
}

View File

@ -3009,9 +3009,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
static const int64_t zero_ne[4] = {0, 0, 0, 0};
const int64_t * src0_ne = node->src[0] ? node->src[0]->ne : zero_ne;
const int64_t * src1_ne = node->src[1] ? node->src[1]->ne : zero_ne;
const int64_t * src2_ne = (node->op == GGML_OP_MUL_MAT_ID && node->src[2]) ? node->src[2]->ne : zero_ne;
cplan->profiling_record_fn(cplan->profiling_context, 0 /* GGML_PROFILE_EVENT_OP */,
ggml_op_name(node->op), -1, t_start, t_end, ggml_nbytes(node), NULL,
src0_ne, src1_ne);
src0_ne, src1_ne, src2_ne);
}
}

View File

@ -183,7 +183,8 @@ static void ggml_cpu_profiler_record_callback(void * context,
uint64_t bytes,
const char * extra,
const int64_t ne_src0[4],
const int64_t ne_src1[4]) {
const int64_t ne_src1[4],
const int64_t ne_src2[4]) {
auto * cpu_ctx = (ggml_backend_cpu_context *) context;
ggml_profile_record rec;
rec.type = (enum ggml_profile_event_type) type;
@ -204,6 +205,11 @@ static void ggml_cpu_profiler_record_callback(void * context,
} else {
memset(rec.ne_src1, 0, sizeof(rec.ne_src1));
}
if (ne_src2) {
memcpy(rec.ne_src2, ne_src2, sizeof(rec.ne_src2));
} else {
memset(rec.ne_src2, 0, sizeof(rec.ne_src2));
}
cpu_ctx->profiling_records.push_back(rec);
}

View File

@ -138,7 +138,7 @@ struct ggml_cuda_profiler_state {
}
void record_end(const char * name, int backend_id, int split_id, uint64_t bytes, const char * extra,
const int64_t ne_src0[4], const int64_t ne_src1[4]) {
const int64_t ne_src0[4], const int64_t ne_src1[4], const int64_t ne_src2[4]) {
cudaEvent_t ev;
(void) cudaEventCreate(&ev);
(void) cudaEventRecord(ev, stream);
@ -156,6 +156,7 @@ struct ggml_cuda_profiler_state {
rec.extra = extra;
if (ne_src0) { memcpy(rec.ne_src0, ne_src0, sizeof(rec.ne_src0)); } else { memset(rec.ne_src0, 0, sizeof(rec.ne_src0)); }
if (ne_src1) { memcpy(rec.ne_src1, ne_src1, sizeof(rec.ne_src1)); } else { memset(rec.ne_src1, 0, sizeof(rec.ne_src1)); }
if (ne_src2) { memcpy(rec.ne_src2, ne_src2, sizeof(rec.ne_src2)); } else { memset(rec.ne_src2, 0, sizeof(rec.ne_src2)); }
records.push_back(rec);
}
@ -4137,7 +4138,8 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
ggml_nbytes(node),
nullptr,
node->src[0] ? node->src[0]->ne : nullptr,
node->src[1] ? node->src[1]->ne : nullptr
node->src[1] ? node->src[1]->ne : nullptr,
(node->op == GGML_OP_MUL_MAT_ID && node->src[2]) ? node->src[2]->ne : nullptr
);
}

View File

@ -14682,6 +14682,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
static const int64_t zero_ne[4] = {0, 0, 0, 0};
const int64_t * src0_ne = node->src[0] ? node->src[0]->ne : zero_ne;
const int64_t * src1_ne = node->src[1] ? node->src[1]->ne : zero_ne;
const int64_t * src2_ne = (node->op == GGML_OP_MUL_MAT_ID && node->src[2]) ? node->src[2]->ne : zero_ne;
uint64_t cpu_ts = (i < (int)ctx->profiler_state->cpu_timestamps.size())
? ctx->profiler_state->cpu_timestamps[i] : 0;
@ -14696,6 +14697,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
rec.extra = name; // fusion name or NULL
memcpy(rec.ne_src0, src0_ne, sizeof(rec.ne_src0));
memcpy(rec.ne_src1, src1_ne, sizeof(rec.ne_src1));
memcpy(rec.ne_src2, src2_ne, sizeof(rec.ne_src2));
ctx->profiler_state->records.push_back(rec);
}
}
@ -14724,27 +14726,31 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
if (has_profiler && !nodes.empty()) {
uint64_t cpu_ts = (i < (int)ctx->profiler_state->cpu_timestamps.size())
? ctx->profiler_state->cpu_timestamps[i] : 0;
// In concurrent mode, distribute duration evenly across ops in group
uint64_t per_op_ns = duration_ns / nodes.size();
for (size_t j = 0; j < nodes.size(); j++) {
auto * node = nodes[j];
static const int64_t zero_ne[4] = {0, 0, 0, 0};
const int64_t * src0_ne = node->src[0] ? node->src[0]->ne : zero_ne;
const int64_t * src1_ne = node->src[1] ? node->src[1]->ne : zero_ne;
// In concurrent mode, report the group as a single combined operation
auto * node = nodes[0];
static const int64_t zero_ne[4] = {0, 0, 0, 0};
const int64_t * src0_ne = node->src[0] ? node->src[0]->ne : zero_ne;
const int64_t * src1_ne = node->src[1] ? node->src[1]->ne : zero_ne;
const int64_t * src2_ne = (node->op == GGML_OP_MUL_MAT_ID && node->src[2]) ? node->src[2]->ne : zero_ne;
ggml_profile_record rec;
rec.type = GGML_PROFILE_EVENT_OP;
rec.name = ggml_op_name(node->op);
rec.backend_id = -1;
rec.split_id = ctx->profiler_state->split_id;
rec.start_ns = cpu_ts + j * per_op_ns;
rec.end_ns = cpu_ts + (j + 1) * per_op_ns;
rec.bytes = ggml_nbytes(node);
rec.extra = names[j];
memcpy(rec.ne_src0, src0_ne, sizeof(rec.ne_src0));
memcpy(rec.ne_src1, src1_ne, sizeof(rec.ne_src1));
ctx->profiler_state->records.push_back(rec);
uint64_t total_bytes = 0;
for (size_t j = 0; j < nodes.size(); j++) {
total_bytes += ggml_nbytes(nodes[j]);
}
ggml_profile_record rec;
rec.type = GGML_PROFILE_EVENT_OP;
rec.name = ggml_op_name(node->op);
rec.backend_id = -1;
rec.split_id = ctx->profiler_state->split_id;
rec.start_ns = cpu_ts;
rec.end_ns = cpu_ts + duration_ns;
rec.bytes = total_bytes;
rec.extra = names[0]; // fusion name of first op, or NULL
memcpy(rec.ne_src0, src0_ne, sizeof(rec.ne_src0));
memcpy(rec.ne_src1, src1_ne, sizeof(rec.ne_src1));
memcpy(rec.ne_src2, src2_ne, sizeof(rec.ne_src2));
ctx->profiler_state->records.push_back(rec);
}
}
}

View File

@ -651,11 +651,17 @@ int main(int argc, char ** argv) {
if (params.profiling_output.empty()) {
ggml_backend_sched_print_profiling(sched);
} else {
int ret = ggml_backend_sched_export_profiling_json(sched, params.profiling_output.c_str());
if (ret == 0) {
console::log("\nProfiling data exported to: %s\n", params.profiling_output.c_str());
const std::string & path = params.profiling_output;
int ret;
if (path.size() >= 4 && path.compare(path.size() - 4, 4, ".txt") == 0) {
ret = ggml_backend_sched_export_profiling_text(sched, path.c_str());
} else {
console::error("\nFailed to export profiling data to: %s\n", params.profiling_output.c_str());
ret = ggml_backend_sched_export_profiling_json(sched, path.c_str());
}
if (ret == 0) {
console::log("\nProfiling data exported to: %s\n", path.c_str());
} else {
console::error("\nFailed to export profiling data to: %s\n", path.c_str());
}
}
}

View File

@ -998,11 +998,17 @@ int main(int argc, char ** argv) {
if (params.profiling_output.empty()) {
ggml_backend_sched_print_profiling(sched);
} else {
int ret = ggml_backend_sched_export_profiling_json(sched, params.profiling_output.c_str());
if (ret == 0) {
LOG("\nProfiling data exported to: %s\n", params.profiling_output.c_str());
const std::string & path = params.profiling_output;
int ret;
if (path.size() >= 4 && path.compare(path.size() - 4, 4, ".txt") == 0) {
ret = ggml_backend_sched_export_profiling_text(sched, path.c_str());
} else {
LOG_ERR("\nFailed to export profiling data to: %s\n", params.profiling_output.c_str());
ret = ggml_backend_sched_export_profiling_json(sched, path.c_str());
}
if (ret == 0) {
LOG("\nProfiling data exported to: %s\n", path.c_str());
} else {
LOG_ERR("\nFailed to export profiling data to: %s\n", path.c_str());
}
}
}

View File

@ -33,6 +33,7 @@ class ProfileRecord:
extra: Optional[str]
ne_src0: list[int] = field(default_factory=lambda: [0, 0, 0, 0])
ne_src1: list[int] = field(default_factory=lambda: [0, 0, 0, 0])
ne_src2: list[int] = field(default_factory=lambda: [0, 0, 0, 0])
@property
def type_name(self) -> str:
@ -48,7 +49,7 @@ class ProfileRecord:
@property
def bandwidth_gbps(self) -> float:
"""Bandwidth in GB/s (only meaningful for copy events)."""
"""Bandwidth in GB/s."""
if self.duration_ns == 0 or self.bytes == 0:
return 0.0
return self.bytes / self.duration_ns
@ -62,12 +63,12 @@ class ProfileRecord:
@property
def shape_str(self) -> str:
"""Human-readable tensor shapes, e.g. '[4096, 4096] x [4096, 1]'."""
"""Human-readable tensor shapes, e.g. '[4096, 4096] x [4096, 1] x [8, 1]'."""
s0 = self._fmt_ne(self.ne_src0)
s1 = self._fmt_ne(self.ne_src1)
if s0 and s1:
return s0 + " x " + s1
return s0 or s1
s2 = self._fmt_ne(self.ne_src2)
parts = [s for s in (s0, s1, s2) if s]
return " x ".join(parts)
def to_dict(self) -> dict:
return {
@ -81,6 +82,7 @@ class ProfileRecord:
"extra": self.extra,
"ne_src0": self.ne_src0,
"ne_src1": self.ne_src1,
"ne_src2": self.ne_src2,
}
@ -160,6 +162,7 @@ class ProfileData:
# Support both old "ne" format and new "ne_src0"/"ne_src1" format
ne_src0 = _pad_ne(r.get("ne_src0", r.get("ne", [0, 0, 0, 0])))
ne_src1 = _pad_ne(r.get("ne_src1", [0, 0, 0, 0]))
ne_src2 = _pad_ne(r.get("ne_src2", [0, 0, 0, 0]))
records.append(ProfileRecord(
type=r.get("type", 0),
name=r.get("name", "unknown"),
@ -171,6 +174,7 @@ class ProfileData:
extra=r.get("extra"),
ne_src0=ne_src0,
ne_src1=ne_src1,
ne_src2=ne_src2,
))
backends_raw = data.get("backends", [])
@ -267,9 +271,9 @@ class ProfileData:
return
print(f" {'TYPE':<5} {'BKND':>4} {'Operation':<28} {'%Time':>7} {'Count':>6} "
f"{'Total':>10} {'Avg':>10} {'Min':>10} {'Max':>10} {'Bytes':>10}")
f"{'Total':>10} {'Avg':>10} {'Min':>10} {'Max':>10} {'Bandwidth':>12}")
print(f" {'':->5} {'':->4} {'':->28} {'':->7} {'':->6} "
f"{'(ms)':>10} {'(us)':>10} {'(us)':>10} {'(us)':>10} {'':->10}")
f"{'(ms)':>10} {'(us)':>10} {'(us)':>10} {'(us)':>10} {'':->12}")
for s in stats:
pct = 100.0 * s.total_ns / self.total_ns if self.total_ns > 0 else 0
@ -278,15 +282,14 @@ class ProfileData:
f"{s.count:>6} {s.total_ms:>10.2f} {s.avg_us:>10.2f} "
f"{s.min_us:>10.2f} {s.max_us:>10.2f}")
if s.total_bytes > 0:
if s.total_bytes > 0 and s.total_ns > 0:
bw = s.bandwidth_gbps
bytes_str = f"{s.total_bytes / 1e6:.1f} MB"
if s.event_type == COPY_EVENT:
line += f" {bw:>8.2f} GB/s"
if bw >= 1000.0:
line += f" {bw / 1000.0:>9.2f} TB/s"
else:
line += f" {bytes_str:>10}"
line += f" {bw:>9.2f} GB/s"
else:
line += f" {'':>10}"
line += f" {'':>12}"
# Tensor shape from longest call
shape_dims = [n for n in s.representative_ne if n > 0]
@ -807,19 +810,19 @@ function buildStats(){
var ops={};
for(var i=0;i<EVENTS.length;i++){
var ev=EVENTS[i];
if(!ops[ev.n])ops[ev.n]={name:ev.n,d:0,count:0,min:Infinity,max:0,backends:{}};
if(!ops[ev.n])ops[ev.n]={name:ev.n,d:0,count:0,min:Infinity,max:0,bytes:0,backends:{}};
var op=ops[ev.n];
op.d+=ev.d;op.count++;
op.d+=ev.d;op.count++;op.bytes+=(ev.b||0);
if(ev.d<op.min)op.min=ev.d;if(ev.d>op.max)op.max=ev.d;
var bk=String(ev.bid);
if(!op.backends[bk])op.backends[bk]={bid:ev.bid,d:0,count:0,min:Infinity,max:0,shapes:{}};
if(!op.backends[bk])op.backends[bk]={bid:ev.bid,d:0,count:0,min:Infinity,max:0,bytes:0,shapes:{}};
var b=op.backends[bk];
b.d+=ev.d;b.count++;
b.d+=ev.d;b.count++;b.bytes+=(ev.b||0);
if(ev.d<b.min)b.min=ev.d;if(ev.d>b.max)b.max=ev.d;
var sh=ev.s||'\u2014';
if(!b.shapes[sh])b.shapes[sh]={d:0,count:0,min:Infinity,max:0};
if(!b.shapes[sh])b.shapes[sh]={d:0,count:0,min:Infinity,max:0,bytes:0};
var s=b.shapes[sh];
s.d+=ev.d;s.count++;
s.d+=ev.d;s.count++;s.bytes+=(ev.b||0);
if(ev.d<s.min)s.min=ev.d;if(ev.d>s.max)s.max=ev.d;
}
var sorted=[];for(var k in ops)sorted.push(ops[k]);
@ -832,7 +835,7 @@ function buildStats(){
var opId=rid++;
var bkeys=[];for(var bk in op.backends)bkeys.push(bk);
bkeys.sort(function(a,b){return op.backends[b].d-op.backends[a].d;});
rows.push({id:opId,p:-1,lv:0,name:op.name,d:op.d,count:op.count,
rows.push({id:opId,p:-1,lv:0,name:op.name,d:op.d,count:op.count,bytes:op.bytes,
min:op.min,max:op.max,pct:op.d/TOTAL_US*100,ch:bkeys.length>0});
for(var bi=0;bi<bkeys.length;bi++){
var bdata=op.backends[bkeys[bi]];
@ -840,25 +843,33 @@ function buildStats(){
var bname=BACKENDS[bdata.bid]||('B'+bdata.bid);
var skeys=[];for(var sk in bdata.shapes)skeys.push(sk);
skeys.sort(function(a,b){return bdata.shapes[b].d-bdata.shapes[a].d;});
rows.push({id:bId,p:opId,lv:1,name:bname,d:bdata.d,count:bdata.count,
rows.push({id:bId,p:opId,lv:1,name:bname,d:bdata.d,count:bdata.count,bytes:bdata.bytes,
min:bdata.min,max:bdata.max,pct:bdata.d/TOTAL_US*100,ch:skeys.length>0});
for(var si=0;si<skeys.length;si++){
var sdata=bdata.shapes[skeys[si]];
var sId=rid++;
rows.push({id:sId,p:bId,lv:2,name:skeys[si],d:sdata.d,count:sdata.count,
rows.push({id:sId,p:bId,lv:2,name:skeys[si],d:sdata.d,count:sdata.count,bytes:sdata.bytes,
min:sdata.min,max:sdata.max,pct:sdata.d/TOTAL_US*100,ch:false});
}
}
}
// Render
var h='<table><thead><tr><th style="width:30%">Operation</th>'
+'<th class="r" style="width:12%">% Time</th>'
+'<th class="r" style="width:12%">Total</th>'
+'<th class="r" style="width:10%">Count</th>'
+'<th class="r" style="width:12%">Avg</th>'
+'<th class="r" style="width:12%">Min</th>'
+'<th class="r" style="width:12%">Max</th>'
function fmtBW(bytes,d_us){
  // No bytes or no elapsed time: show an em-dash placeholder.
  if(!bytes||!d_us){return '\u2014';}
  // bytes / (microseconds * 1000) yields GB/s; pick a readable unit.
  var gb=bytes/(d_us*1000);
  if(gb>=1000){return (gb/1000).toFixed(2)+' TB/s';}
  return gb>=1 ? gb.toFixed(2)+' GB/s' : (gb*1000).toFixed(1)+' MB/s';
}
var h='<table><thead><tr><th style="width:26%">Operation</th>'
+'<th class="r" style="width:10%">% Time</th>'
+'<th class="r" style="width:10%">Total</th>'
+'<th class="r" style="width:8%">Count</th>'
+'<th class="r" style="width:10%">Avg</th>'
+'<th class="r" style="width:10%">Min</th>'
+'<th class="r" style="width:10%">Max</th>'
+'<th class="r" style="width:10%">Bandwidth</th>'
+'</tr></thead><tbody>';
for(var ri=0;ri<rows.length;ri++){
@ -883,6 +894,7 @@ function buildStats(){
h+='<td class="r">'+fmtT(r.d/r.count)+'</td>';
h+='<td class="r">'+fmtT(r.min)+'</td>';
h+='<td class="r">'+fmtT(r.max)+'</td>';
h+='<td class="r">'+fmtBW(r.bytes,r.d)+'</td>';
h+='</tr>';
}
h+='</tbody></table>';