diff --git a/examples/debug/debug.cpp b/examples/debug/debug.cpp index d855d33477..fe2b883c12 100644 --- a/examples/debug/debug.cpp +++ b/examples/debug/debug.cpp @@ -251,11 +251,17 @@ int main(int argc, char ** argv) { if (params.profiling_output.empty()) { ggml_backend_sched_print_profiling(sched); } else { - int ret = ggml_backend_sched_export_profiling_json(sched, params.profiling_output.c_str()); - if (ret == 0) { - LOG("\nProfiling data exported to: %s\n", params.profiling_output.c_str()); + const std::string & path = params.profiling_output; + int ret; + if (path.size() >= 4 && path.compare(path.size() - 4, 4, ".txt") == 0) { + ret = ggml_backend_sched_export_profiling_text(sched, path.c_str()); } else { - LOG_ERR("\nFailed to export profiling data to: %s\n", params.profiling_output.c_str()); + ret = ggml_backend_sched_export_profiling_json(sched, path.c_str()); + } + if (ret == 0) { + LOG("\nProfiling data exported to: %s\n", path.c_str()); + } else { + LOG_ERR("\nFailed to export profiling data to: %s\n", path.c_str()); } } } diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h index 43885350cf..cbbac0b663 100644 --- a/ggml/include/ggml-cpu.h +++ b/ggml/include/ggml-cpu.h @@ -28,7 +28,7 @@ extern "C" { void * profiling_context; // callback for recording a profile record from C code (set by backend when profiling) - // params: context, type, name, split_id, start_ns, end_ns, bytes, extra, ne_src0[4], ne_src1[4] + // params: context, type, name, split_id, start_ns, end_ns, bytes, extra, ne_src0[4], ne_src1[4], ne_src2[4] void (*profiling_record_fn)(void * context, int type, const char * name, @@ -38,7 +38,8 @@ extern "C" { uint64_t bytes, const char * extra, const int64_t ne_src0[4], - const int64_t ne_src1[4]); + const int64_t ne_src1[4], + const int64_t ne_src2[4]); }; // numa strategies diff --git a/ggml/include/ggml-profiler.h b/ggml/include/ggml-profiler.h index 5026d45001..2328f6b49f 100644 --- a/ggml/include/ggml-profiler.h +++ b/ggml/include/ggml-profiler.h @@ -29,6 +29,7 @@ typedef struct ggml_profile_record { const char * extra; // fusion name for fused ops, or NULL int64_t ne_src0[4]; // src[0] tensor dimensions (e.g. weight matrix for MUL_MAT) int64_t ne_src1[4]; // src[1] tensor dimensions (e.g. input matrix for MUL_MAT) + int64_t ne_src2[4]; // src[2] tensor dimensions (e.g. ids for MUL_MAT_ID) } ggml_profile_record; // Backend profiler interface - each backend optionally implements this @@ -99,6 +100,13 @@ GGML_API int ggml_backend_sched_export_profiling_json(ggml_backend_sched_t sched // Export profiling data as JSON to a FILE pointer GGML_API int ggml_backend_sched_write_profiling_json(ggml_backend_sched_t sched, FILE * fp); +// Export profiling data as plain text statistics to a file +// Returns 0 on success, -1 on error +GGML_API int ggml_backend_sched_export_profiling_text(ggml_backend_sched_t sched, const char * filepath); + +// Export profiling data as plain text statistics to a FILE pointer +GGML_API int ggml_backend_sched_write_profiling_text(ggml_backend_sched_t sched, FILE * fp); + #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 577cdeb3b5..73961cd52d 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #ifdef __APPLE__ @@ -749,8 +750,17 @@ struct ggml_backend_sched { // profiling bool profiling_enabled; - std::vector copy_records; // copy events recorded by the scheduler - std::vector profiling_records; // merged records from all sources + std::string profiling_env_path; // GGML_PROFILE env var value (for auto-export on free) + std::vector copy_records; // copy events recorded by the scheduler + std::vector profiling_records; // merged records from all sources + + // Cached backend metadata for safe access during auto-export (backends may be freed first) + struct backend_meta { + std::string name; + std::string device; + int device_type; + }; + std::vector profiling_backend_meta; }; #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor) @@ -1515,7 +1525,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id, split_id, - copy_start, copy_end, ggml_nbytes(input), NULL, {0}, {0} }); + copy_start, copy_end, ggml_nbytes(input), NULL, {0}, {0}, {0} }); } else { ggml_backend_tensor_copy(input, input_cpy); } @@ -1635,7 +1645,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id, split_id, moe_copy_start, moe_copy_end, - (uint64_t) total_copied_bytes, NULL, {0}, {0} }); + (uint64_t) total_copied_bytes, NULL, {0}, {0}, {0} }); } } else { // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events @@ -1671,7 +1681,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id, - split_id, copy_start, copy_end, ggml_nbytes(input), NULL, {0}, {0} }); + split_id, copy_start, copy_end, ggml_nbytes(input), NULL, {0}, {0}, {0} }); } else { ggml_backend_tensor_copy(input, input_cpy); } @@ -1691,7 +1701,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id, - split_id, copy_start, copy_end, ggml_nbytes(input), NULL, {0}, {0} }); + split_id, copy_start, copy_end, ggml_nbytes(input), NULL, {0}, {0}, {0} }); } } } @@ -1837,7 +1847,24 @@ ggml_backend_sched_t ggml_backend_sched_new( sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends); sched->op_offload = op_offload; - sched->profiling_enabled = (getenv("GGML_PROFILE") != NULL); + const char * profile_env = getenv("GGML_PROFILE"); + if (profile_env != NULL) { + sched->profiling_enabled = true; + sched->profiling_env_path = profile_env; + } + + // Cache backend metadata for safe access during auto-export + for (int b = 0; b < n_backends; b++) { + ggml_backend_sched::backend_meta meta; + meta.name = ggml_backend_name(backends[b]); + meta.device = "unknown"; + meta.device_type = 0; + if (backends[b]->device != NULL) { + meta.device = ggml_backend_dev_name(backends[b]->device); + meta.device_type = (int) ggml_backend_dev_type(backends[b]->device); + } + sched->profiling_backend_meta.push_back(std::move(meta)); + } ggml_backend_sched_reset(sched); @@ -1848,6 +1875,33 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { if (sched == NULL) { return; } + + // Auto-export profiling data if enabled via GGML_PROFILE env var + // GGML_PROFILE=1 or GGML_PROFILE="" → print to stdout + // GGML_PROFILE=file.json → export JSON + // GGML_PROFILE=file.txt → export text + if (!sched->profiling_records.empty() && getenv("GGML_PROFILE") != NULL) { + const std::string & path = sched->profiling_env_path; + if (path.empty() || path == "1" || path == "stdout") { + ggml_backend_sched_print_profiling(sched); + } else if (path.size() >= 4 && path.compare(path.size() - 4, 4, ".txt") == 0) { + int ret = ggml_backend_sched_export_profiling_text(sched, path.c_str()); + if (ret == 0) { + GGML_LOG_INFO("[profiler] Data exported to: %s\n", path.c_str()); + } else { + GGML_LOG_ERROR("[profiler] Failed to export data to: %s\n", path.c_str()); + } + } else { + // Default to JSON for any other path (including .json) + int ret = ggml_backend_sched_export_profiling_json(sched, path.c_str()); + if (ret == 0) { + GGML_LOG_INFO("[profiler] Data exported to: %s\n", path.c_str()); + } else { + GGML_LOG_ERROR("[profiler] Failed to export data to: %s\n", path.c_str()); + } + } + } + for (int b = 0; b < sched->n_backends; b++) { for (int c = 0; c < sched->n_copies; c++) { ggml_backend_event_free(sched->events[b][c]); @@ -2518,18 +2572,16 @@ void ggml_backend_sched_print_profiling(ggml_backend_sched_t sched) { double min_us = (double) s.min_ns / 1000.0; double max_us = (double) s.max_ns / 1000.0; - if (s.type == GGML_PROFILE_EVENT_COPY) { + GGML_LOG_INFO( + " [%s] backend %d %-28s %7.1f%% count=%-6d total=%8.2f ms avg=%8.2f us min=%8.2f us max=%8.2f us", + type_str[s.type], s.backend_id, s.name, pct, s.count, (double) s.total_ns / 1e6, avg_us, min_us, max_us); + if (s.total_bytes > 0 && s.total_ns > 0) { double bw_gbps = (double) s.total_bytes / (double) s.total_ns; - GGML_LOG_INFO( - " [%s] backend %d %-28s %7.1f%% count=%-6d total=%8.2f ms avg=%8.2f us min=%8.2f us max=%8.2f us " - " %8.2f GB/s", - type_str[s.type], s.backend_id, s.name, pct, s.count, (double) s.total_ns / 1e6, avg_us, min_us, max_us, - bw_gbps); - } else { - GGML_LOG_INFO( - " [%s] backend %d %-28s %7.1f%% count=%-6d total=%8.2f ms avg=%8.2f us min=%8.2f us max=%8.2f us", - type_str[s.type], s.backend_id, s.name, pct, s.count, (double) s.total_ns / 1e6, avg_us, min_us, - max_us); + if (bw_gbps >= 1000.0) { + GGML_LOG_INFO(" %6.2f TB/s", bw_gbps / 1000.0); + } else { + GGML_LOG_INFO(" %6.2f GB/s", bw_gbps); + } } // Print representative tensor shape (first record's ne) if (s.representative_ne[0] > 0 || s.representative_ne[1] > 0) { @@ -2565,16 +2617,22 @@ int ggml_backend_sched_write_profiling_json(ggml_backend_sched_t sched, FILE * f fprintf(fp, " \"total_records\": %d,\n", (int) sched->profiling_records.size()); fprintf(fp, " \"total_ns\": %llu,\n", (unsigned long long) total_ns); - // Backend metadata + // Backend metadata (use cached data if available, fall back to live pointers) fprintf(fp, " \"backends\": [\n"); for (int b = 0; b < sched->n_backends; b++) { - ggml_backend_t backend = sched->backends[b]; - const char * name = ggml_backend_name(backend); - const char * dev_name = "unknown"; - int dev_type = 0; - if (backend->device != NULL) { - dev_name = ggml_backend_dev_name(backend->device); - dev_type = (int) ggml_backend_dev_type(backend->device); + const char * name = "unknown"; + const char * dev_name = "unknown"; + int dev_type = 0; + if (b < (int) sched->profiling_backend_meta.size()) { + name = sched->profiling_backend_meta[b].name.c_str(); + dev_name = sched->profiling_backend_meta[b].device.c_str(); + dev_type = sched->profiling_backend_meta[b].device_type; + } else if (sched->backends[b] != NULL) { + name = ggml_backend_name(sched->backends[b]); + if (sched->backends[b]->device != NULL) { + dev_name = ggml_backend_dev_name(sched->backends[b]->device); + dev_type = (int) ggml_backend_dev_type(sched->backends[b]->device); + } } fprintf(fp, " {\"id\": %d, \"name\": \"%s\", \"device\": \"%s\", \"device_type\": %d}%s\n", b, name, dev_name, dev_type, (b < sched->n_backends - 1) ? "," : ""); @@ -2600,11 +2658,13 @@ int ggml_backend_sched_write_profiling_json(ggml_backend_sched_t sched, FILE * f fprintf(fp, "null"); } - // Tensor dimensions (both source tensors) + // Tensor dimensions (all source tensors) fprintf(fp, ", \"ne_src0\": [%lld, %lld, %lld, %lld]", (long long) rec.ne_src0[0], (long long) rec.ne_src0[1], (long long) rec.ne_src0[2], (long long) rec.ne_src0[3]); fprintf(fp, ", \"ne_src1\": [%lld, %lld, %lld, %lld]", (long long) rec.ne_src1[0], (long long) rec.ne_src1[1], (long long) rec.ne_src1[2], (long long) rec.ne_src1[3]); + fprintf(fp, ", \"ne_src2\": [%lld, %lld, %lld, %lld]", (long long) rec.ne_src2[0], (long long) rec.ne_src2[1], + (long long) rec.ne_src2[2], (long long) rec.ne_src2[3]); fprintf(fp, "}%s\n", (i < (int) sched->profiling_records.size() - 1) ? "," : ""); } @@ -2630,3 +2690,254 @@ int ggml_backend_sched_export_profiling_json(ggml_backend_sched_t sched, const c return ret; } + +// Helper: format ne dimensions as string, e.g. "[4096, 4096, 1]" +static void fmt_ne(char * buf, size_t bufsize, const int64_t ne[4]) { + if (ne[0] == 0 && ne[1] == 0 && ne[2] == 0 && ne[3] == 0) { + buf[0] = '\0'; + return; + } + int ndims = 4; + while (ndims > 1 && ne[ndims - 1] <= 1) { + ndims--; + } + int pos = snprintf(buf, bufsize, "["); + for (int i = 0; i < ndims && pos < (int) bufsize - 1; i++) { + pos += snprintf(buf + pos, bufsize - pos, "%s%lld", i > 0 ? ", " : "", (long long) ne[i]); + } + snprintf(buf + pos, bufsize - pos, "]"); +} + +// Helper: format bandwidth as string +static void fmt_bandwidth(char * buf, size_t bufsize, uint64_t bytes, uint64_t ns) { + if (ns == 0 || bytes == 0) { + buf[0] = '\0'; + return; + } + double bw_gbps = (double) bytes / (double) ns; + if (bw_gbps >= 1000.0) { + snprintf(buf, bufsize, "%.2f TB/s", bw_gbps / 1000.0); + } else { + snprintf(buf, bufsize, "%.2f GB/s", bw_gbps); + } +} + +int ggml_backend_sched_write_profiling_text(ggml_backend_sched_t sched, FILE * fp) { + GGML_ASSERT(sched); + GGML_ASSERT(fp != NULL); + + if (sched->profiling_records.empty()) { + fprintf(fp, "No profiling data available.\n"); + return 0; + } + + // Aggregate by (name, type, backend_id) + struct op_stats { + const char * name; + enum ggml_profile_event_type type; + int backend_id; + uint64_t total_ns; + uint64_t min_ns; + uint64_t max_ns; + int count; + uint64_t total_bytes; + int64_t representative_ne_src0[4]; + int64_t representative_ne_src1[4]; + int64_t representative_ne_src2[4]; + }; + + std::vector stats; + for (const auto & rec : sched->profiling_records) { + uint64_t dur = (rec.end_ns > rec.start_ns) ? (rec.end_ns - rec.start_ns) : 0; + bool found = false; + for (auto & s : stats) { + if (s.type == rec.type && s.backend_id == rec.backend_id && strcmp(s.name, rec.name) == 0) { + s.total_ns += dur; + s.min_ns = std::min(s.min_ns, dur); + s.max_ns = std::max(s.max_ns, dur); + s.count++; + s.total_bytes += rec.bytes; + found = true; + break; + } + } + if (!found) { + op_stats s = {}; + s.name = rec.name; + s.type = rec.type; + s.backend_id = rec.backend_id; + s.total_ns = dur; + s.min_ns = dur; + s.max_ns = dur; + s.count = 1; + s.total_bytes = rec.bytes; + memcpy(s.representative_ne_src0, rec.ne_src0, sizeof(s.representative_ne_src0)); + memcpy(s.representative_ne_src1, rec.ne_src1, sizeof(s.representative_ne_src1)); + memcpy(s.representative_ne_src2, rec.ne_src2, sizeof(s.representative_ne_src2)); + stats.push_back(s); + } + } + + std::sort(stats.begin(), stats.end(), + [](const op_stats & a, const op_stats & b) { return a.total_ns > b.total_ns; }); + + uint64_t grand_total = 0; + for (const auto & s : stats) { + grand_total += s.total_ns; + } + + // --- Section 1: Overall summary --- + fprintf(fp, "=== Profiling Summary ===\n"); + fprintf(fp, "Total time: %.2f ms\n", (double) grand_total / 1e6); + fprintf(fp, "Total records: %d\n", (int) sched->profiling_records.size()); + fprintf(fp, "Unique ops: %d\n\n", (int) stats.size()); + + // --- Section 2: Per-backend breakdown --- + fprintf(fp, "=== Per-Backend Summary ===\n"); + { + struct backend_stats { + int backend_id; + int op_count; + int copy_count; + uint64_t op_ns; + uint64_t copy_ns; + uint64_t op_bytes; + uint64_t copy_bytes; + }; + std::vector bstats; + for (const auto & s : stats) { + bool found = false; + for (auto & bs : bstats) { + if (bs.backend_id == s.backend_id) { + if (s.type == GGML_PROFILE_EVENT_OP) { + bs.op_count += s.count; + bs.op_ns += s.total_ns; + bs.op_bytes += s.total_bytes; + } else { + bs.copy_count += s.count; + bs.copy_ns += s.total_ns; + bs.copy_bytes += s.total_bytes; + } + found = true; + break; + } + } + if (!found) { + backend_stats bs = {}; + bs.backend_id = s.backend_id; + if (s.type == GGML_PROFILE_EVENT_OP) { + bs.op_count = s.count; + bs.op_ns = s.total_ns; + bs.op_bytes = s.total_bytes; + } else { + bs.copy_count = s.count; + bs.copy_ns = s.total_ns; + bs.copy_bytes = s.total_bytes; + } + bstats.push_back(bs); + } + } + std::sort(bstats.begin(), bstats.end(), + [](const backend_stats & a, const backend_stats & b) { + return (a.op_ns + a.copy_ns) > (b.op_ns + b.copy_ns); + }); + + for (const auto & bs : bstats) { + uint64_t total = bs.op_ns + bs.copy_ns; + double pct = grand_total > 0 ? 100.0 * (double) total / (double) grand_total : 0; + + const char * bname = "unknown"; + if (bs.backend_id >= 0 && bs.backend_id < (int) sched->profiling_backend_meta.size()) { + bname = sched->profiling_backend_meta[bs.backend_id].name.c_str(); + } else if (bs.backend_id >= 0 && bs.backend_id < sched->n_backends && sched->backends[bs.backend_id] != NULL) { + bname = ggml_backend_name(sched->backends[bs.backend_id]); + } + + fprintf(fp, " Backend %d (%s): %.2f ms (%.1f%%)\n", bs.backend_id, bname, (double) total / 1e6, pct); + if (bs.op_count > 0) { + char bw_buf[32]; + fmt_bandwidth(bw_buf, sizeof(bw_buf), bs.op_bytes, bs.op_ns); + fprintf(fp, " OPs: %d calls, %.2f ms", bs.op_count, (double) bs.op_ns / 1e6); + if (bw_buf[0]) { + fprintf(fp, ", %s", bw_buf); + } + fprintf(fp, "\n"); + } + if (bs.copy_count > 0) { + char bw_buf[32]; + fmt_bandwidth(bw_buf, sizeof(bw_buf), bs.copy_bytes, bs.copy_ns); + fprintf(fp, " COPYs: %d calls, %.2f ms", bs.copy_count, (double) bs.copy_ns / 1e6); + if (bw_buf[0]) { + fprintf(fp, ", %s", bw_buf); + } + fprintf(fp, "\n"); + } + } + } + fprintf(fp, "\n"); + + // --- Section 3: Detailed operation table --- + fprintf(fp, "=== Operations (sorted by total time) ===\n"); + fprintf(fp, "%-5s %4s %-28s %7s %6s %10s %10s %10s %10s %12s %s\n", + "TYPE", "BKND", "Operation", "%Time", "Count", "Total(ms)", "Avg(us)", "Min(us)", "Max(us)", "Bandwidth", "Tensors"); + fprintf(fp, "%-5s %4s %-28s %7s %6s %10s %10s %10s %10s %12s %s\n", + "-----", "----", "----------------------------", "-------", "------", + "----------", "----------", "----------", "----------", "------------", "-------"); + + const char * type_str[] = { "OP", "COPY" }; + for (const auto & s : stats) { + double pct = grand_total > 0 ? 100.0 * (double) s.total_ns / (double) grand_total : 0; + double avg_us = (double) s.total_ns / (double) s.count / 1000.0; + double min_us = (double) s.min_ns / 1000.0; + double max_us = (double) s.max_ns / 1000.0; + + char bw_buf[32] = ""; + fmt_bandwidth(bw_buf, sizeof(bw_buf), s.total_bytes, s.total_ns); + + char ne0_buf[64]; + char ne1_buf[64]; + char ne2_buf[64]; + fmt_ne(ne0_buf, sizeof(ne0_buf), s.representative_ne_src0); + fmt_ne(ne1_buf, sizeof(ne1_buf), s.representative_ne_src1); + fmt_ne(ne2_buf, sizeof(ne2_buf), s.representative_ne_src2); + + // Build tensor shapes string + char tensors_buf[256] = ""; + int tpos = 0; + if (ne0_buf[0]) { + tpos += snprintf(tensors_buf + tpos, sizeof(tensors_buf) - tpos, "%s", ne0_buf); + } + if (ne1_buf[0]) { + tpos += snprintf(tensors_buf + tpos, sizeof(tensors_buf) - tpos, " x %s", ne1_buf); + } + if (ne2_buf[0]) { + tpos += snprintf(tensors_buf + tpos, sizeof(tensors_buf) - tpos, " x %s", ne2_buf); + } + + fprintf(fp, "%-5s %4d %-28s %6.1f%% %6d %10.2f %10.2f %10.2f %10.2f %12s %s\n", + type_str[s.type], s.backend_id, s.name, pct, s.count, + (double) s.total_ns / 1e6, avg_us, min_us, max_us, + bw_buf, tensors_buf); + } + + fprintf(fp, "\nTotal: %.2f ms (%d records, %d unique ops)\n", (double) grand_total / 1e6, + (int) sched->profiling_records.size(), (int) stats.size()); + + return 0; +} + +int ggml_backend_sched_export_profiling_text(ggml_backend_sched_t sched, const char * filepath) { + GGML_ASSERT(sched); + GGML_ASSERT(filepath != NULL); + + FILE * fp = fopen(filepath, "w"); + if (fp == NULL) { + GGML_LOG_ERROR("%s: failed to open %s for writing\n", __func__, filepath); + return -1; + } + + int ret = ggml_backend_sched_write_profiling_text(sched, fp); + fclose(fp); + + return ret; +} diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp index 7d2f7b7682..b68f45452e 100644 --- a/ggml/src/ggml-blas/ggml-blas.cpp +++ b/ggml/src/ggml-blas/ggml-blas.cpp @@ -276,6 +276,7 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, rec.extra = NULL; if (node->src[0]) { memcpy(rec.ne_src0, node->src[0]->ne, sizeof(rec.ne_src0)); } else { memset(rec.ne_src0, 0, sizeof(rec.ne_src0)); } if (node->src[1]) { memcpy(rec.ne_src1, node->src[1]->ne, sizeof(rec.ne_src1)); } else { memset(rec.ne_src1, 0, sizeof(rec.ne_src1)); } + if (node->op == GGML_OP_MUL_MAT_ID && node->src[2]) { memcpy(rec.ne_src2, node->src[2]->ne, sizeof(rec.ne_src2)); } else { memset(rec.ne_src2, 0, sizeof(rec.ne_src2)); } ctx->profiling_records.push_back(rec); } } diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index c101e5ece7..d0fc4228ff 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -3009,9 +3009,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { static const int64_t zero_ne[4] = {0, 0, 0, 0}; const int64_t * src0_ne = node->src[0] ? node->src[0]->ne : zero_ne; const int64_t * src1_ne = node->src[1] ? node->src[1]->ne : zero_ne; + const int64_t * src2_ne = (node->op == GGML_OP_MUL_MAT_ID && node->src[2]) ? node->src[2]->ne : zero_ne; cplan->profiling_record_fn(cplan->profiling_context, 0 /* GGML_PROFILE_EVENT_OP */, ggml_op_name(node->op), -1, t_start, t_end, ggml_nbytes(node), NULL, - src0_ne, src1_ne); + src0_ne, src1_ne, src2_ne); } } diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index f5fe903a0f..6fb9e1987c 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -183,7 +183,8 @@ static void ggml_cpu_profiler_record_callback(void * context, uint64_t bytes, const char * extra, const int64_t ne_src0[4], - const int64_t ne_src1[4]) { + const int64_t ne_src1[4], + const int64_t ne_src2[4]) { auto * cpu_ctx = (ggml_backend_cpu_context *) context; ggml_profile_record rec; rec.type = (enum ggml_profile_event_type) type; @@ -204,6 +205,11 @@ static void ggml_cpu_profiler_record_callback(void * context, } else { memset(rec.ne_src1, 0, sizeof(rec.ne_src1)); } + if (ne_src2) { + memcpy(rec.ne_src2, ne_src2, sizeof(rec.ne_src2)); + } else { + memset(rec.ne_src2, 0, sizeof(rec.ne_src2)); + } cpu_ctx->profiling_records.push_back(rec); } diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index c0c7749162..85d1941dca 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -138,7 +138,7 @@ struct ggml_cuda_profiler_state { } void record_end(const char * name, int backend_id, int split_id, uint64_t bytes, const char * extra, - const int64_t ne_src0[4], const int64_t ne_src1[4]) { + const int64_t ne_src0[4], const int64_t ne_src1[4], const int64_t ne_src2[4]) { cudaEvent_t ev; (void) cudaEventCreate(&ev); (void) cudaEventRecord(ev, stream); @@ -156,6 +156,7 @@ struct ggml_cuda_profiler_state { rec.extra = extra; if (ne_src0) { memcpy(rec.ne_src0, ne_src0, sizeof(rec.ne_src0)); } else { memset(rec.ne_src0, 0, sizeof(rec.ne_src0)); } if (ne_src1) { memcpy(rec.ne_src1, ne_src1, sizeof(rec.ne_src1)); } else { memset(rec.ne_src1, 0, sizeof(rec.ne_src1)); } + if (ne_src2) { memcpy(rec.ne_src2, ne_src2, sizeof(rec.ne_src2)); } else { memset(rec.ne_src2, 0, sizeof(rec.ne_src2)); } records.push_back(rec); } @@ -4137,7 +4138,8 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud ggml_nbytes(node), nullptr, node->src[0] ? node->src[0]->ne : nullptr, - node->src[1] ? node->src[1]->ne : nullptr + node->src[1] ? node->src[1]->ne : nullptr, + (node->op == GGML_OP_MUL_MAT_ID && node->src[2]) ? node->src[2]->ne : nullptr ); } diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index bde4d3b8e2..499e384b9d 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -14682,6 +14682,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg static const int64_t zero_ne[4] = {0, 0, 0, 0}; const int64_t * src0_ne = node->src[0] ? node->src[0]->ne : zero_ne; const int64_t * src1_ne = node->src[1] ? node->src[1]->ne : zero_ne; + const int64_t * src2_ne = (node->op == GGML_OP_MUL_MAT_ID && node->src[2]) ? node->src[2]->ne : zero_ne; uint64_t cpu_ts = (i < (int)ctx->profiler_state->cpu_timestamps.size()) ? ctx->profiler_state->cpu_timestamps[i] : 0; @@ -14696,6 +14697,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg rec.extra = name; // fusion name or NULL memcpy(rec.ne_src0, src0_ne, sizeof(rec.ne_src0)); memcpy(rec.ne_src1, src1_ne, sizeof(rec.ne_src1)); + memcpy(rec.ne_src2, src2_ne, sizeof(rec.ne_src2)); ctx->profiler_state->records.push_back(rec); } } @@ -14724,27 +14726,31 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg if (has_profiler && !nodes.empty()) { uint64_t cpu_ts = (i < (int)ctx->profiler_state->cpu_timestamps.size()) ? ctx->profiler_state->cpu_timestamps[i] : 0; - // In concurrent mode, distribute duration evenly across ops in group - uint64_t per_op_ns = duration_ns / nodes.size(); - for (size_t j = 0; j < nodes.size(); j++) { - auto * node = nodes[j]; - static const int64_t zero_ne[4] = {0, 0, 0, 0}; - const int64_t * src0_ne = node->src[0] ? node->src[0]->ne : zero_ne; - const int64_t * src1_ne = node->src[1] ? node->src[1]->ne : zero_ne; + // In concurrent mode, report the group as a single combined operation + auto * node = nodes[0]; + static const int64_t zero_ne[4] = {0, 0, 0, 0}; + const int64_t * src0_ne = node->src[0] ? node->src[0]->ne : zero_ne; + const int64_t * src1_ne = node->src[1] ? node->src[1]->ne : zero_ne; + const int64_t * src2_ne = (node->op == GGML_OP_MUL_MAT_ID && node->src[2]) ? node->src[2]->ne : zero_ne; - ggml_profile_record rec; - rec.type = GGML_PROFILE_EVENT_OP; - rec.name = ggml_op_name(node->op); - rec.backend_id = -1; - rec.split_id = ctx->profiler_state->split_id; - rec.start_ns = cpu_ts + j * per_op_ns; - rec.end_ns = cpu_ts + (j + 1) * per_op_ns; - rec.bytes = ggml_nbytes(node); - rec.extra = names[j]; - memcpy(rec.ne_src0, src0_ne, sizeof(rec.ne_src0)); - memcpy(rec.ne_src1, src1_ne, sizeof(rec.ne_src1)); - ctx->profiler_state->records.push_back(rec); + uint64_t total_bytes = 0; + for (size_t j = 0; j < nodes.size(); j++) { + total_bytes += ggml_nbytes(nodes[j]); } + + ggml_profile_record rec; + rec.type = GGML_PROFILE_EVENT_OP; + rec.name = ggml_op_name(node->op); + rec.backend_id = -1; + rec.split_id = ctx->profiler_state->split_id; + rec.start_ns = cpu_ts; + rec.end_ns = cpu_ts + duration_ns; + rec.bytes = total_bytes; + rec.extra = names[0]; // fusion name of first op, or NULL + memcpy(rec.ne_src0, src0_ne, sizeof(rec.ne_src0)); + memcpy(rec.ne_src1, src1_ne, sizeof(rec.ne_src1)); + memcpy(rec.ne_src2, src2_ne, sizeof(rec.ne_src2)); + ctx->profiler_state->records.push_back(rec); } } } diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index 51145b81a5..224e9c2514 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -651,11 +651,17 @@ int main(int argc, char ** argv) { if (params.profiling_output.empty()) { ggml_backend_sched_print_profiling(sched); } else { - int ret = ggml_backend_sched_export_profiling_json(sched, params.profiling_output.c_str()); - if (ret == 0) { - console::log("\nProfiling data exported to: %s\n", params.profiling_output.c_str()); + const std::string & path = params.profiling_output; + int ret; + if (path.size() >= 4 && path.compare(path.size() - 4, 4, ".txt") == 0) { + ret = ggml_backend_sched_export_profiling_text(sched, path.c_str()); } else { - console::error("\nFailed to export profiling data to: %s\n", params.profiling_output.c_str()); + ret = ggml_backend_sched_export_profiling_json(sched, path.c_str()); + } + if (ret == 0) { + console::log("\nProfiling data exported to: %s\n", path.c_str()); + } else { + console::error("\nFailed to export profiling data to: %s\n", path.c_str()); } } } diff --git a/tools/completion/completion.cpp b/tools/completion/completion.cpp index e68b7bb14a..caba5255cf 100644 --- a/tools/completion/completion.cpp +++ b/tools/completion/completion.cpp @@ -998,11 +998,17 @@ int main(int argc, char ** argv) { if (params.profiling_output.empty()) { ggml_backend_sched_print_profiling(sched); } else { - int ret = ggml_backend_sched_export_profiling_json(sched, params.profiling_output.c_str()); - if (ret == 0) { - LOG("\nProfiling data exported to: %s\n", params.profiling_output.c_str()); + const std::string & path = params.profiling_output; + int ret; + if (path.size() >= 4 && path.compare(path.size() - 4, 4, ".txt") == 0) { + ret = ggml_backend_sched_export_profiling_text(sched, path.c_str()); } else { - LOG_ERR("\nFailed to export profiling data to: %s\n", params.profiling_output.c_str()); + ret = ggml_backend_sched_export_profiling_json(sched, path.c_str()); + } + if (ret == 0) { + LOG("\nProfiling data exported to: %s\n", path.c_str()); + } else { + LOG_ERR("\nFailed to export profiling data to: %s\n", path.c_str()); } } } diff --git a/tools/profiler/profiler.py b/tools/profiler/profiler.py index fd26f35f4e..a739a830f4 100644 --- a/tools/profiler/profiler.py +++ b/tools/profiler/profiler.py @@ -33,6 +33,7 @@ class ProfileRecord: extra: Optional[str] ne_src0: list[int] = field(default_factory=lambda: [0, 0, 0, 0]) ne_src1: list[int] = field(default_factory=lambda: [0, 0, 0, 0]) + ne_src2: list[int] = field(default_factory=lambda: [0, 0, 0, 0]) @property def type_name(self) -> str: @@ -48,7 +49,7 @@ class ProfileRecord: @property def bandwidth_gbps(self) -> float: - """Bandwidth in GB/s (only meaningful for copy events).""" + """Bandwidth in GB/s.""" if self.duration_ns == 0 or self.bytes == 0: return 0.0 return self.bytes / self.duration_ns @@ -62,12 +63,12 @@ class ProfileRecord: @property def shape_str(self) -> str: - """Human-readable tensor shapes, e.g. '[4096, 4096] x [4096, 1]'.""" + """Human-readable tensor shapes, e.g. '[4096, 4096] x [4096, 1] x [8, 1]'.""" s0 = self._fmt_ne(self.ne_src0) s1 = self._fmt_ne(self.ne_src1) - if s0 and s1: - return s0 + " x " + s1 - return s0 or s1 + s2 = self._fmt_ne(self.ne_src2) + parts = [s for s in (s0, s1, s2) if s] + return " x ".join(parts) def to_dict(self) -> dict: return { @@ -81,6 +82,7 @@ class ProfileRecord: "extra": self.extra, "ne_src0": self.ne_src0, "ne_src1": self.ne_src1, + "ne_src2": self.ne_src2, } @@ -160,6 +162,7 @@ class ProfileData: # Support both old "ne" format and new "ne_src0"/"ne_src1" format ne_src0 = _pad_ne(r.get("ne_src0", r.get("ne", [0, 0, 0, 0]))) ne_src1 = _pad_ne(r.get("ne_src1", [0, 0, 0, 0])) + ne_src2 = _pad_ne(r.get("ne_src2", [0, 0, 0, 0])) records.append(ProfileRecord( type=r.get("type", 0), name=r.get("name", "unknown"), @@ -171,6 +174,7 @@ class ProfileData: extra=r.get("extra"), ne_src0=ne_src0, ne_src1=ne_src1, + ne_src2=ne_src2, )) backends_raw = data.get("backends", []) @@ -267,9 +271,9 @@ class ProfileData: return print(f" {'TYPE':<5} {'BKND':>4} {'Operation':<28} {'%Time':>7} {'Count':>6} " - f"{'Total':>10} {'Avg':>10} {'Min':>10} {'Max':>10} {'Bytes':>10}") + f"{'Total':>10} {'Avg':>10} {'Min':>10} {'Max':>10} {'Bandwidth':>12}") print(f" {'':->5} {'':->4} {'':->28} {'':->7} {'':->6} " - f"{'(ms)':>10} {'(us)':>10} {'(us)':>10} {'(us)':>10} {'':->10}") + f"{'(ms)':>10} {'(us)':>10} {'(us)':>10} {'(us)':>10} {'':->12}") for s in stats: pct = 100.0 * s.total_ns / self.total_ns if self.total_ns > 0 else 0 @@ -278,15 +282,14 @@ class ProfileData: f"{s.count:>6} {s.total_ms:>10.2f} {s.avg_us:>10.2f} " f"{s.min_us:>10.2f} {s.max_us:>10.2f}") - if s.total_bytes > 0: + if s.total_bytes > 0 and s.total_ns > 0: bw = s.bandwidth_gbps - bytes_str = f"{s.total_bytes / 1e6:.1f} MB" - if s.event_type == COPY_EVENT: - line += f" {bw:>8.2f} GB/s" + if bw >= 1000.0: + line += f" {bw / 1000.0:>9.2f} TB/s" else: - line += f" {bytes_str:>10}" + line += f" {bw:>9.2f} GB/s" else: - line += f" {'':>10}" + line += f" {'':>12}" # Tensor shape from longest call shape_dims = [n for n in s.representative_ne if n > 0] @@ -807,19 +810,19 @@ function buildStats(){ var ops={}; for(var i=0;iop.max)op.max=ev.d; var bk=String(ev.bid); - if(!op.backends[bk])op.backends[bk]={bid:ev.bid,d:0,count:0,min:Infinity,max:0,shapes:{}}; + if(!op.backends[bk])op.backends[bk]={bid:ev.bid,d:0,count:0,min:Infinity,max:0,bytes:0,shapes:{}}; var b=op.backends[bk]; - b.d+=ev.d;b.count++; + b.d+=ev.d;b.count++;b.bytes+=(ev.b||0); if(ev.db.max)b.max=ev.d; var sh=ev.s||'\u2014'; - if(!b.shapes[sh])b.shapes[sh]={d:0,count:0,min:Infinity,max:0}; + if(!b.shapes[sh])b.shapes[sh]={d:0,count:0,min:Infinity,max:0,bytes:0}; var s=b.shapes[sh]; - s.d+=ev.d;s.count++; + s.d+=ev.d;s.count++;s.bytes+=(ev.b||0); if(ev.ds.max)s.max=ev.d; } var sorted=[];for(var k in ops)sorted.push(ops[k]); @@ -832,7 +835,7 @@ function buildStats(){ var opId=rid++; var bkeys=[];for(var bk in op.backends)bkeys.push(bk); bkeys.sort(function(a,b){return op.backends[b].d-op.backends[a].d;}); - rows.push({id:opId,p:-1,lv:0,name:op.name,d:op.d,count:op.count, + rows.push({id:opId,p:-1,lv:0,name:op.name,d:op.d,count:op.count,bytes:op.bytes, min:op.min,max:op.max,pct:op.d/TOTAL_US*100,ch:bkeys.length>0}); for(var bi=0;bi0}); for(var si=0;si% Time' - +'Total' - +'Count' - +'Avg' - +'Min' - +'Max' + function fmtBW(bytes,d_us){ + if(!bytes||!d_us)return'\u2014'; + var gbps=bytes/(d_us*1000); + if(gbps>=1000)return(gbps/1000).toFixed(2)+' TB/s'; + if(gbps>=1)return gbps.toFixed(2)+' GB/s'; + return(gbps*1000).toFixed(1)+' MB/s'; + } + var h='' + +'' + +'' + +'' + +'' + +'' + +'' + +'' +''; for(var ri=0;ri'; h+=''; h+=''; + h+=''; h+=''; } h+='
Operation% TimeTotalCountAvgMinMaxBandwidth
'+fmtT(r.min)+''+fmtT(r.max)+''+fmtBW(r.bytes,r.d)+'
';