diff --git a/common/arg.cpp b/common/arg.cpp index 538d2a4b0a..632c89edce 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1073,6 +1073,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.completion = true; } )); + add_opt(common_arg( + {"--profile"}, + "enable cross-backend profiling (CPU, BLAS, CUDA)", + [](common_params & params) { + params.profiling = true; + } + ).set_examples({LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_DEBUG})); + add_opt(common_arg( + {"--profile-output"}, "FNAME", + "write profiling JSON output to FNAME (default: stdout)", + [](common_params & params, const std::string & value) { + params.profiling = true; + params.profiling_output = value; + } + ).set_examples({LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_DEBUG})); add_opt(common_arg( {"--verbose-prompt"}, string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? 
"true" : "false"), diff --git a/common/common.cpp b/common/common.cpp index a9bd494191..b7b979e8b5 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2,6 +2,7 @@ #include "gguf.h" #include "common.h" +#include "ggml-profiler.h" #include "log.h" #include "llama.h" #include "sampling.h" @@ -1231,6 +1232,14 @@ common_init_result::common_init_result(common_params & params) : return; } + if (params.profiling) { + ggml_backend_sched_t sched = llama_context_get_sched(lctx); + if (sched != nullptr) { + ggml_backend_sched_set_profiling(sched, true); + LOG_INF("%s: profiling enabled\n", __func__); + } + } + pimpl->context.reset(lctx); } diff --git a/common/common.h b/common/common.h index 17dc3fb232..72f6320348 100644 --- a/common/common.h +++ b/common/common.h @@ -3,6 +3,7 @@ #pragma once #include "ggml-opt.h" +#include "ggml-profiler.h" #include "ggml.h" #include "llama-cpp.h" @@ -669,6 +670,10 @@ struct common_params { bool spm_infill = false; // suffix/prefix/middle pattern for infill + // profiling + bool profiling = false; // enable cross-backend profiling + std::string profiling_output; // path to write profiling JSON output (empty = stdout) + // batched-bench params bool batched_bench_output_jsonl = false; diff --git a/examples/debug/debug.cpp b/examples/debug/debug.cpp index 88947acbd3..951c71d056 100644 --- a/examples/debug/debug.cpp +++ b/examples/debug/debug.cpp @@ -244,6 +244,23 @@ int main(int argc, char ** argv) { return 1; } + // Export profiling data if profiling was enabled + if (params.profiling) { + ggml_backend_sched_t sched = llama_context_get_sched(ctx); + if (sched != nullptr) { + if (params.profiling_output.empty()) { + ggml_backend_sched_print_profiling(sched); + } else { + int ret = ggml_backend_sched_export_profiling_json(sched, params.profiling_output.c_str()); + if (ret == 0) { + LOG("\nProfiling data exported to: %s\n", params.profiling_output.c_str()); + } else { + LOG_ERR("\nFailed to export profiling data to: %s\n", 
params.profiling_output.c_str()); + } + } + } + } + LOG("\n"); llama_perf_context_print(ctx); diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h index e3e067c916..76305d71de 100644 --- a/ggml/include/ggml-cpu.h +++ b/ggml/include/ggml-cpu.h @@ -22,6 +22,22 @@ extern "C" { // use only reference implementations bool use_ref; + + // profiler context (set by backend when profiling is enabled, NULL otherwise) + // when non-NULL, the compute loop will record per-node timing + void * profiling_context; + + // callback for recording a profile record from C code (set by backend when profiling) + // params: context, type (0=OP, 1=COPY), name, split_id, start_ns, end_ns, bytes, extra, ne[4] + void (*profiling_record_fn)(void * context, + int type, + const char * name, + int split_id, + uint64_t start_ns, + uint64_t end_ns, + uint64_t bytes, + const char * extra, + const int64_t ne[4]); }; // numa strategies diff --git a/ggml/include/ggml-profiler.h b/ggml/include/ggml-profiler.h new file mode 100644 index 0000000000..773b0635f1 --- /dev/null +++ b/ggml/include/ggml-profiler.h @@ -0,0 +1,103 @@ +#pragma once + +#include "ggml-backend.h" +#include "ggml.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// +// Profiler +// + +// Profile event types +enum ggml_profile_event_type { + GGML_PROFILE_EVENT_OP, // single operation execution (computation kernel) + GGML_PROFILE_EVENT_COPY, // data transfer between devices +}; + +// A single profiling record representing a timed interval +typedef struct ggml_profile_record { + enum ggml_profile_event_type type; + const char * name; // operation name (e.g., "mul_mat", "copy_H2D") + int backend_id; // scheduler's backend index (0 = highest priority) + int split_id; // which graph split (0..n_splits-1) + uint64_t start_ns; // start timestamp in nanoseconds + uint64_t end_ns; // end timestamp in nanoseconds + uint64_t bytes; // bytes transferred (for copy) or tensor size (for ops) + const char * extra; // fusion name for fused 
ops, or NULL + int64_t ne[4]; // output tensor dimensions [ne0, ne1, ne2, ne3] +} ggml_profile_record; + +// Backend profiler interface - each backend optionally implements this +// to provide fine-grained operation timing +struct ggml_backend_profiler { + void * context; // backend-specific profiler context + + // Enable or disable profiling on this backend + void (*enable)(void * context, bool enable); + + // Clear all recorded data + void (*reset)(void * context); + + // Set the current split ID (called by scheduler before graph_compute) + void (*set_split_id)(void * context, int split_id); + + // Get recorded profiling data + // Returns the number of records; sets *out to point to internal storage + // The returned pointer remains valid until the next reset or disable call + int (*get_records)(void * context, const ggml_profile_record ** out); + + // Free the profiler context + void (*free_context)(void * context); +}; + +typedef struct ggml_backend_profiler * ggml_backend_profiler_t; + +// Register a profiler on a backend (called by backend during init) +// The profiler is owned by the backend and will be freed when the backend is freed +GGML_API void ggml_backend_set_profiler(ggml_backend_t backend, ggml_backend_profiler_t profiler); + +// Get the profiler associated with a backend (returns NULL if none) +GGML_API ggml_backend_profiler_t ggml_backend_get_profiler(ggml_backend_t backend); + +// +// Scheduler profiling API +// + +// Enable or disable profiling on a scheduler +// When enabled, the scheduler will: +// - Time data copy operations between backends +// - Enable profiling on all backends that support it +// - Collect profiling records from all backends after each graph compute +GGML_API void ggml_backend_sched_set_profiling(ggml_backend_sched_t sched, bool enable); + +// Check if profiling is enabled on a scheduler +GGML_API bool ggml_backend_sched_get_profiling(ggml_backend_sched_t sched); + +// Get profiling data from the last graph compute +// 
Records are owned by the scheduler; valid until the next compute or reset +// Returns the number of records +GGML_API int ggml_backend_sched_get_profiling_records(ggml_backend_sched_t sched, const ggml_profile_record ** records); + +// Print a human-readable summary of the last profiling run to stdout +// Groups records by operation name and shows total/count/min/max/avg time +GGML_API void ggml_backend_sched_print_profiling(ggml_backend_sched_t sched); + +// Reset profiling data (clear all recorded data) +GGML_API void ggml_backend_sched_reset_profiling(ggml_backend_sched_t sched); + +// Get current time in nanoseconds (for manual profiling if needed) +GGML_API uint64_t ggml_profiler_time_ns(void); + +// Export profiling data as JSON to a file +// Returns 0 on success, -1 on error +GGML_API int ggml_backend_sched_export_profiling_json(ggml_backend_sched_t sched, const char * filepath); + +// Export profiling data as JSON to a FILE pointer +GGML_API int ggml_backend_sched_write_profiling_json(ggml_backend_sched_t sched, FILE * fp); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 78853304d9..cf8f4d0788 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -195,12 +195,14 @@ add_library(ggml-base ../include/ggml-backend.h ../include/ggml-cpp.h ../include/ggml-opt.h + ../include/ggml-profiler.h ../include/gguf.h ggml.c ggml.cpp ggml-alloc.c ggml-backend.cpp ggml-opt.cpp + ggml-profiler.cpp ggml-threading.cpp ggml-threading.h ggml-quants.c diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h index 59190b7c46..80feaaf384 100644 --- a/ggml/src/ggml-backend-impl.h +++ b/ggml/src/ggml-backend-impl.h @@ -3,6 +3,7 @@ // ggml-backend internal header #include "ggml-backend.h" +#include "ggml-profiler.h" #ifdef __cplusplus extern "C" { @@ -124,6 +125,9 @@ extern "C" { struct ggml_backend_i iface; ggml_backend_dev_t device; void * context; + + // Optional profiler (set by backend during 
init, NULL if not profiling) + ggml_backend_profiler_t profiler; }; struct ggml_backend_event { diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 22c656996c..d341b245af 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -12,6 +12,7 @@ #include "ggml-backend-impl.h" #include "ggml-alloc.h" #include "ggml-impl.h" +#include "ggml-profiler.h" #include #include @@ -231,6 +232,15 @@ void ggml_backend_free(ggml_backend_t backend) { return; } + // Clean up profiler if present (before backend frees its context) + if (backend->profiler != NULL) { + if (backend->profiler->free_context != NULL) { + backend->profiler->free_context(backend->profiler->context); + } + delete backend->profiler; + backend->profiler = NULL; + } + backend->iface.free(backend); } @@ -736,6 +746,11 @@ struct ggml_backend_sched { int debug_realloc; int debug_graph_size; int debug_prev_graph_size; + + // profiling + bool profiling_enabled; + std::vector copy_records; // copy events recorded by the scheduler + std::vector profiling_records; // merged records from all sources }; #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor) @@ -1450,11 +1465,28 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s std::vector ids; std::vector used_ids; + // Profiling: reset copy records for this compute pass + if (sched->profiling_enabled) { + sched->copy_records.clear(); + } + for (int split_id = 0; split_id < sched->n_splits; split_id++) { struct ggml_backend_sched_split * split = &splits[split_id]; int split_backend_id = split->backend_id; ggml_backend_t split_backend = sched->backends[split_backend_id]; + // Profiling: set split ID and enable backend profiling + if (sched->profiling_enabled) { + if (split_backend->profiler != NULL) { + if (split_backend->profiler->enable != NULL) { + split_backend->profiler->enable(split_backend->profiler->context, true); + } + if (split_backend->profiler->set_split_id != NULL) { + 
split_backend->profiler->set_split_id(split_backend->profiler->context, split_id); + } + } + } + // copy the input tensors to the split backend for (int input_id = 0; input_id < split->n_inputs; input_id++) { ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[input_id]); @@ -1468,7 +1500,25 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } else { ggml_backend_synchronize(split_backend); } - ggml_backend_tensor_copy(input, input_cpy); + if (sched->profiling_enabled) { + uint64_t copy_start = ggml_profiler_time_ns(); + ggml_backend_tensor_copy(input, input_cpy); + uint64_t copy_end = ggml_profiler_time_ns(); + + enum ggml_backend_dev_type src_type = ggml_backend_dev_type(input_backend->device); + enum ggml_backend_dev_type dst_type = ggml_backend_dev_type(split_backend->device); + const char * copy_dir = "copy_D2D"; + if (src_type == GGML_BACKEND_DEVICE_TYPE_CPU && dst_type != GGML_BACKEND_DEVICE_TYPE_CPU) { + copy_dir = "copy_H2D"; + } else if (src_type != GGML_BACKEND_DEVICE_TYPE_CPU && dst_type == GGML_BACKEND_DEVICE_TYPE_CPU) { + copy_dir = "copy_D2H"; + } + + sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id, split_id, + copy_start, copy_end, ggml_nbytes(input), NULL, {0} }); + } else { + ggml_backend_tensor_copy(input, input_cpy); + } } else { // wait for the split backend to finish using the input before overwriting it if (sched->events[split_backend_id][sched->cur_copy] != NULL) { @@ -1572,7 +1622,46 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } else { ggml_backend_synchronize(split_backend); } - ggml_backend_tensor_copy(input, input_cpy); + if (sched->profiling_enabled) { + uint64_t copy_start = ggml_profiler_time_ns(); + ggml_backend_tensor_copy(input, input_cpy); + uint64_t copy_end = ggml_profiler_time_ns(); + + enum ggml_backend_dev_type src_type = ggml_backend_dev_type(input_backend->device); + enum 
ggml_backend_dev_type dst_type = ggml_backend_dev_type(split_backend->device); + const char * copy_dir = "copy_D2D"; + if (src_type == GGML_BACKEND_DEVICE_TYPE_CPU && dst_type != GGML_BACKEND_DEVICE_TYPE_CPU) { + copy_dir = "copy_H2D"; + } else if (src_type != GGML_BACKEND_DEVICE_TYPE_CPU && + dst_type == GGML_BACKEND_DEVICE_TYPE_CPU) { + copy_dir = "copy_D2H"; + } + + sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id, + split_id, copy_start, copy_end, ggml_nbytes(input), NULL, {0} }); + } else { + ggml_backend_tensor_copy(input, input_cpy); + } + } else { + // async copy completed - record it with available timing + if (sched->profiling_enabled) { + uint64_t copy_start = ggml_profiler_time_ns(); + // The async copy was already initiated; we just record the launch time + uint64_t copy_end = ggml_profiler_time_ns(); + + enum ggml_backend_dev_type src_type = ggml_backend_dev_type(input_backend->device); + enum ggml_backend_dev_type dst_type = ggml_backend_dev_type(split_backend->device); + const char * copy_dir = "copy_D2D"; + if (src_type == GGML_BACKEND_DEVICE_TYPE_CPU && dst_type != GGML_BACKEND_DEVICE_TYPE_CPU) { + copy_dir = "copy_H2D"; + } else if (src_type != GGML_BACKEND_DEVICE_TYPE_CPU && + dst_type == GGML_BACKEND_DEVICE_TYPE_CPU) { + copy_dir = "copy_D2H"; + } + + sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id, + split_id, copy_start, copy_end, ggml_nbytes(input), NULL, {0} }); + } } } } @@ -1625,6 +1714,34 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } } + // Profiling: collect records from all backends and append to accumulated records + if (sched->profiling_enabled) { + // NOTE: copy_records must NOT be cleared here - they were recorded during this compute pass and are appended to profiling_records below (they are reset at the start of the next compute pass) + + // Collect backend operation records + for (int b = 0; b < sched->n_backends; b++) { + ggml_backend_t backend = sched->backends[b]; + if (backend->profiler != NULL && backend->profiler->get_records != NULL) { + const ggml_profile_record * 
backend_recs = NULL; + int count = backend->profiler->get_records(backend->profiler->context, &backend_recs); + for (int r = 0; r < count; r++) { + ggml_profile_record rec = backend_recs[r]; + rec.backend_id = b; // stamp correct scheduler backend index + sched->profiling_records.push_back(rec); + } + // Reset backend records (but keep profiling enabled for next compute) + if (backend->profiler->reset != NULL) { + backend->profiler->reset(backend->profiler->context); + } + } + } + + // Append copy records + for (const auto & rec : sched->copy_records) { + sched->profiling_records.push_back(rec); + } + } + return GGML_STATUS_SUCCESS; } @@ -1691,6 +1808,7 @@ ggml_backend_sched_t ggml_backend_sched_new( sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends); sched->op_offload = op_offload; + sched->profiling_enabled = (getenv("GGML_PROFILE") != NULL); ggml_backend_sched_reset(sched); @@ -2268,3 +2386,216 @@ ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned"); return ggml_backend_buffer_init(ggml_backend_cpu_buffer_from_ptr_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size); } + +// +// Scheduler profiling +// + +void ggml_backend_sched_set_profiling(ggml_backend_sched_t sched, bool enable) { + GGML_ASSERT(sched); + sched->profiling_enabled = enable; + + if (!enable) { + ggml_backend_sched_reset_profiling(sched); + } +} + +bool ggml_backend_sched_get_profiling(ggml_backend_sched_t sched) { + GGML_ASSERT(sched); + return sched->profiling_enabled; +} + +int ggml_backend_sched_get_profiling_records(ggml_backend_sched_t sched, const ggml_profile_record ** records) { + GGML_ASSERT(sched); + GGML_ASSERT(records != NULL); + + *records = sched->profiling_records.data(); + return (int) sched->profiling_records.size(); +} + +void ggml_backend_sched_reset_profiling(ggml_backend_sched_t sched) { + GGML_ASSERT(sched); + sched->profiling_records.clear(); + 
sched->copy_records.clear(); +} + +void ggml_backend_sched_print_profiling(ggml_backend_sched_t sched) { + GGML_ASSERT(sched); + + if (sched->profiling_records.empty()) { + GGML_LOG_INFO("[profiler] No profiling data available\n"); + return; + } + + GGML_LOG_INFO("\n=== Profiling Summary ===\n"); + + // Aggregate by (name, type, backend_id) + struct op_stats { + const char * name; + enum ggml_profile_event_type type; + int backend_id; + uint64_t total_ns; + uint64_t min_ns; + uint64_t max_ns; + int count; + uint64_t total_bytes; + int64_t representative_ne[4]; + }; + + std::vector stats; + for (const auto & rec : sched->profiling_records) { + bool found = false; + for (auto & s : stats) { + if (s.type == rec.type && s.backend_id == rec.backend_id && strcmp(s.name, rec.name) == 0) { + uint64_t dur = (rec.end_ns > rec.start_ns) ? (rec.end_ns - rec.start_ns) : 0; + s.total_ns += dur; + s.min_ns = std::min(s.min_ns, dur); + s.max_ns = std::max(s.max_ns, dur); + s.count++; + s.total_bytes += rec.bytes; + found = true; + break; + } + } + if (!found) { + uint64_t dur = (rec.end_ns > rec.start_ns) ? 
(rec.end_ns - rec.start_ns) : 0; + op_stats s; + s.name = rec.name; + s.type = rec.type; + s.backend_id = rec.backend_id; + s.total_ns = dur; + s.min_ns = dur; + s.max_ns = dur; + s.count = 1; + s.total_bytes = rec.bytes; + memcpy(s.representative_ne, rec.ne, sizeof(s.representative_ne)); + stats.push_back(s); + } + } + + // Sort by total time descending + std::sort(stats.begin(), stats.end(), + [](const op_stats & a, const op_stats & b) { return a.total_ns > b.total_ns; }); + + uint64_t grand_total = 0; + for (const auto & s : stats) { + grand_total += s.total_ns; + } + + const char * type_str[] = { "OP ", "COPY" }; + for (const auto & s : stats) { + double pct = 100.0 * (double) s.total_ns / (double) grand_total; + double avg_us = (double) s.total_ns / (double) s.count / 1000.0; + double min_us = (double) s.min_ns / 1000.0; + double max_us = (double) s.max_ns / 1000.0; + + if (s.type == GGML_PROFILE_EVENT_COPY) { + double bw_gbps = (double) s.total_bytes / (double) s.total_ns; + GGML_LOG_INFO( + " [%s] backend %d %-28s %7.1f%% count=%-6d total=%8.2f ms avg=%8.2f us min=%8.2f us max=%8.2f us " + " %8.2f GB/s", + type_str[s.type], s.backend_id, s.name, pct, s.count, (double) s.total_ns / 1e6, avg_us, min_us, max_us, + bw_gbps); + } else { + GGML_LOG_INFO( + " [%s] backend %d %-28s %7.1f%% count=%-6d total=%8.2f ms avg=%8.2f us min=%8.2f us max=%8.2f us", + type_str[s.type], s.backend_id, s.name, pct, s.count, (double) s.total_ns / 1e6, avg_us, min_us, + max_us); + } + // Print representative tensor shape (first record's ne) + if (s.representative_ne[0] > 0 || s.representative_ne[1] > 0) { + GGML_LOG_INFO(" [%lld x %lld", (long long) s.representative_ne[0], (long long) s.representative_ne[1]); + if (s.representative_ne[2] > 1) { + GGML_LOG_INFO(" x %lld", (long long) s.representative_ne[2]); + } + if (s.representative_ne[3] > 1) { + GGML_LOG_INFO(" x %lld", (long long) s.representative_ne[3]); + } + GGML_LOG_INFO("]"); + } + GGML_LOG_INFO("\n"); + } + + 
GGML_LOG_INFO(" ---\n"); + GGML_LOG_INFO(" Total: %.2f ms (%d records, %d unique ops)\n\n", (double) grand_total / 1e6, + (int) sched->profiling_records.size(), (int) stats.size()); +} + +int ggml_backend_sched_write_profiling_json(ggml_backend_sched_t sched, FILE * fp) { + GGML_ASSERT(sched); + GGML_ASSERT(fp != NULL); + + uint64_t total_ns = 0; + for (const auto & rec : sched->profiling_records) { + total_ns += (rec.end_ns > rec.start_ns) ? (rec.end_ns - rec.start_ns) : 0; + } + + fprintf(fp, "{\n"); + fprintf(fp, " \"version\": 2,\n"); + fprintf(fp, " \"profiler\": \"ggml\",\n"); + fprintf(fp, " \"total_records\": %d,\n", (int) sched->profiling_records.size()); + fprintf(fp, " \"total_ns\": %llu,\n", (unsigned long long) total_ns); + + // Backend metadata + fprintf(fp, " \"backends\": [\n"); + for (int b = 0; b < sched->n_backends; b++) { + ggml_backend_t backend = sched->backends[b]; + const char * name = ggml_backend_name(backend); + const char * dev_name = "unknown"; + int dev_type = 0; + if (backend->device != NULL) { + dev_name = ggml_backend_dev_name(backend->device); + dev_type = (int) ggml_backend_dev_type(backend->device); + } + fprintf(fp, " {\"id\": %d, \"name\": \"%s\", \"device\": \"%s\", \"device_type\": %d}%s\n", b, name, + dev_name, dev_type, (b < sched->n_backends - 1) ? "," : ""); + } + fprintf(fp, " ],\n"); + + // Records + fprintf(fp, " \"records\": [\n"); + + for (int i = 0; i < (int) sched->profiling_records.size(); i++) { + const auto & rec = sched->profiling_records[i]; + uint64_t duration_ns = (rec.end_ns > rec.start_ns) ? (rec.end_ns - rec.start_ns) : 0; + + fprintf(fp, + " {\"type\": %d, \"name\": \"%s\", \"backend_id\": %d, \"split_id\": %d, " + "\"start_ns\": %llu, \"duration_ns\": %llu, \"bytes\": %llu, \"extra\": ", + (int) rec.type, rec.name ? 
rec.name : "unknown", rec.backend_id, rec.split_id, + (unsigned long long) rec.start_ns, (unsigned long long) duration_ns, (unsigned long long) rec.bytes); + + if (rec.extra != NULL) { + fprintf(fp, "\"%s\"", rec.extra); + } else { + fprintf(fp, "null"); + } + + // Tensor dimensions + fprintf(fp, ", \"ne\": [%lld, %lld, %lld, %lld]", (long long) rec.ne[0], (long long) rec.ne[1], + (long long) rec.ne[2], (long long) rec.ne[3]); + + fprintf(fp, "}%s\n", (i < (int) sched->profiling_records.size() - 1) ? "," : ""); + } + + fprintf(fp, " ]\n"); + fprintf(fp, "}\n"); + + return 0; +} + +int ggml_backend_sched_export_profiling_json(ggml_backend_sched_t sched, const char * filepath) { + GGML_ASSERT(sched); + GGML_ASSERT(filepath != NULL); + + FILE * fp = fopen(filepath, "w"); + if (fp == NULL) { + GGML_LOG_ERROR("%s: failed to open %s for writing\n", __func__, filepath); + return -1; + } + + int ret = ggml_backend_sched_write_profiling_json(sched, fp); + fclose(fp); + + return ret; +} diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp index e7a1763b54..b52f7f90f7 100644 --- a/ggml/src/ggml-blas/ggml-blas.cpp +++ b/ggml/src/ggml-blas/ggml-blas.cpp @@ -1,6 +1,7 @@ #include "ggml-impl.h" #include "ggml-blas.h" #include "ggml-backend-impl.h" +#include "ggml-profiler.h" #include #include @@ -25,6 +26,11 @@ struct ggml_backend_blas_context { #ifndef GGML_USE_OPENMP std::vector> tasks; #endif + + // Profiling state + bool profiling_enabled = false; + int profiling_split_id = -1; + std::vector profiling_records; }; static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { @@ -232,6 +238,18 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, continue; } + // Skip view/identity ops + if (node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_VIEW || + node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE) { + continue; + } + + // Profiling: time this 
operation + uint64_t t_start = 0; + if (ctx->profiling_enabled) { + t_start = ggml_profiler_time_ns(); + } + switch (node->op) { case GGML_OP_MUL_MAT: ggml_backend_blas_mul_mat(ctx, node); @@ -241,16 +259,24 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, ggml_backend_blas_out_prod(ctx, node); break; - case GGML_OP_NONE: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - break; - default: GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node)); } + + if (ctx->profiling_enabled) { + uint64_t t_end = ggml_profiler_time_ns(); + ggml_profile_record rec; + rec.type = GGML_PROFILE_EVENT_OP; + rec.name = ggml_op_name(node->op); + rec.backend_id = 0; + rec.split_id = ctx->profiling_split_id; + rec.start_ns = t_start; + rec.end_ns = t_end; + rec.bytes = ggml_nbytes(node); + rec.extra = NULL; + memcpy(rec.ne, node->ne, sizeof(rec.ne)); + ctx->profiling_records.push_back(rec); + } } return GGML_STATUS_SUCCESS; @@ -284,10 +310,11 @@ ggml_backend_t ggml_backend_blas_init(void) { ggml_backend_blas_context * ctx = new ggml_backend_blas_context; ggml_backend_t backend = new ggml_backend { - /* .guid = */ ggml_backend_blas_guid(), - /* .iface = */ blas_backend_i, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0), - /* .context = */ ctx, + /* .guid = */ ggml_backend_blas_guid(), + /* .iface = */ blas_backend_i, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0), + /* .context = */ ctx, + /* .profiler = */ nullptr, }; #if defined(GGML_BLAS_USE_OPENBLAS) && defined(GGML_USE_OPENMP) @@ -300,6 +327,44 @@ ggml_backend_t ggml_backend_blas_init(void) { GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__); #endif + // Register profiler + ggml_backend_blas_context * blas_ctx = ctx; // ctx is already defined above + + static auto blas_prof_enable = [](void * ctx, bool enable) { + auto * bctx = 
(ggml_backend_blas_context *) ctx; + bctx->profiling_enabled = enable; + if (!enable) { + bctx->profiling_records.clear(); + } + }; + static auto blas_prof_reset = [](void * ctx) { + auto * bctx = (ggml_backend_blas_context *) ctx; + bctx->profiling_records.clear(); + bctx->profiling_split_id = -1; + }; + static auto blas_prof_set_split_id = [](void * ctx, int split_id) { + auto * bctx = (ggml_backend_blas_context *) ctx; + bctx->profiling_split_id = split_id; + }; + static auto blas_prof_get_records = [](void * ctx, const ggml_profile_record ** out) -> int { + auto * bctx = (ggml_backend_blas_context *) ctx; + *out = bctx->profiling_records.data(); + return (int) bctx->profiling_records.size(); + }; + static auto blas_prof_free = [](void * ctx) { + (void) ctx; + }; + + auto * profiler = new ggml_backend_profiler{ + /* .context = */ blas_ctx, + /* .enable = */ blas_prof_enable, + /* .reset = */ blas_prof_reset, + /* .set_split_id = */ blas_prof_set_split_id, + /* .get_records = */ blas_prof_get_records, + /* .free_context = */ blas_prof_free, + }; + ggml_backend_set_profiler(backend, profiler); + return backend; } diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index df17cc5530..5e9d0cdc76 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -6,6 +6,7 @@ #include "traits.h" #include "ggml-cpu-impl.h" #include "ggml-impl.h" +#include "ggml-profiler.h" #include "quants.h" #include "ggml-threading.h" #include "unary-ops.h" @@ -1159,8 +1160,8 @@ static void ggml_compute_forward_mul_mat_one_chunk( const bool src1_cont = ggml_is_contiguous(src1); - ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot; - enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; + const ggml_vec_dot_t vec_dot = type_traits_cpu[type].vec_dot; + const enum ggml_type vec_dot_type = type_traits_cpu[type].vec_dot_type; // broadcast factors const int64_t r2 = ne12 / ne02; @@ -1244,9 +1245,9 @@ void 
ggml_compute_forward_mul_mat( const int ith = params->ith; const int nth = params->nth; - enum ggml_type const vec_dot_type = type_traits_cpu[src0->type].vec_dot_type; - ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float; - int64_t const vec_dot_num_rows = type_traits_cpu[src0->type].nrows; + const enum ggml_type vec_dot_type = type_traits_cpu[src0->type].vec_dot_type; + const ggml_from_float_t from_float = type_traits_cpu[vec_dot_type].from_float; + const int64_t vec_dot_num_rows = type_traits_cpu[src0->type].nrows; GGML_ASSERT(ne0 == ne01); GGML_ASSERT(ne1 == ne11); @@ -1455,8 +1456,8 @@ static void ggml_compute_forward_mul_mat_id_one_chunk( const enum ggml_type type = src0->type; - ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot; - enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; + const ggml_vec_dot_t vec_dot = type_traits_cpu[type].vec_dot; + const enum ggml_type vec_dot_type = type_traits_cpu[type].vec_dot_type; const int64_t blck_0 = 16; const int64_t blck_1 = 16; @@ -1523,8 +1524,8 @@ static void ggml_compute_forward_mul_mat_id( const bool src1_cont = ggml_is_contiguous(src1); - enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; - ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float; + const enum ggml_type vec_dot_type = type_traits_cpu[type].vec_dot_type; + const ggml_from_float_t from_float = type_traits_cpu[vec_dot_type].from_float; // we don't support permuted src0 or src1 GGML_ASSERT(nb00 == ggml_type_size(type)); @@ -2973,28 +2974,67 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { GGML_PRINT_DEBUG("thread #%d compute-start cplan %p last-graph %d\n", state->ith, (const void *)cplan, state->last_graph); #endif - for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) { - struct ggml_tensor * node = cgraph->nodes[node_n]; + // Profiling state + if 
(cplan->profiling_context != NULL && cplan->profiling_record_fn != NULL) { + for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) { + struct ggml_tensor * node = cgraph->nodes[node_n]; - if (ggml_op_is_empty(node->op)) { - // skip NOPs - continue; + if (ggml_op_is_empty(node->op)) { + continue; + } + + if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) { + continue; + } + + // Only thread 0 records timing (after barrier = total node time) + uint64_t t_start = 0; + if (state->ith == 0) { + t_start = ggml_profiler_time_ns(); + } + + ggml_compute_forward(¶ms, node); + + if (node_n + 1 < cgraph->n_nodes) { + ggml_barrier(state->threadpool); + } + + if (state->ith == 0) { + uint64_t t_end = ggml_profiler_time_ns(); + cplan->profiling_record_fn(cplan->profiling_context, 0 /* GGML_PROFILE_EVENT_OP */, + ggml_op_name(node->op), -1, t_start, t_end, ggml_nbytes(node), NULL, + node->ne); + } + + if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed); + tp->ec = GGML_STATUS_ABORTED; + } } + } else { + for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) { + struct ggml_tensor * node = cgraph->nodes[node_n]; - if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) { - continue; - } + if (ggml_op_is_empty(node->op)) { + // skip NOPs + continue; + } - ggml_compute_forward(¶ms, node); + if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) { + continue; + } - if (state->ith == 0 && cplan->abort_callback && - cplan->abort_callback(cplan->abort_callback_data)) { - atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed); - tp->ec = GGML_STATUS_ABORTED; - } + ggml_compute_forward(¶ms, node); - if (node_n + 1 < cgraph->n_nodes) { - ggml_barrier(state->threadpool); + if (state->ith == 0 && cplan->abort_callback && + 
cplan->abort_callback(cplan->abort_callback_data)) { + atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed); + tp->ec = GGML_STATUS_ABORTED; + } + + if (node_n + 1 < cgraph->n_nodes) { + ggml_barrier(state->threadpool); + } } } diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index ddf1737a31..2ee638fe02 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -1,6 +1,7 @@ #include "ggml-backend.h" #include "ggml-backend-impl.h" #include "ggml-cpu.h" +#include "ggml-profiler.h" #include "repack.h" #include "traits.h" #include "ggml-impl.h" @@ -107,6 +108,11 @@ struct ggml_backend_cpu_context { void * abort_callback_data; bool use_ref; // use reference implementation + + // Profiling state + bool profiling_enabled; + int profiling_split_id; + std::vector profiling_records; }; static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) { @@ -167,6 +173,34 @@ static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backe GGML_UNUSED(backend); } +// Callback function for recording CPU profiling events from C code (ggml-cpu.c) +static void ggml_cpu_profiler_record_callback(void * context, + int type, + const char * name, + int split_id, + uint64_t start_ns, + uint64_t end_ns, + uint64_t bytes, + const char * extra, + const int64_t ne[4]) { + auto * cpu_ctx = (ggml_backend_cpu_context *) context; + ggml_profile_record rec; + rec.type = (enum ggml_profile_event_type) type; + rec.name = name; + rec.backend_id = 0; // will be overwritten by scheduler + rec.split_id = split_id != -1 ? 
split_id : cpu_ctx->profiling_split_id; + rec.start_ns = start_ns; + rec.end_ns = end_ns; + rec.bytes = bytes; + rec.extra = extra; + if (ne) { + memcpy(rec.ne, ne, sizeof(rec.ne)); + } else { + memset(rec.ne, 0, sizeof(rec.ne)); + } + cpu_ctx->profiling_records.push_back(rec); +} + static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; @@ -187,6 +221,9 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s cplan.abort_callback_data = cpu_ctx->abort_callback_data; cplan.use_ref = cpu_ctx->use_ref; + cplan.profiling_context = cpu_ctx->profiling_enabled ? cpu_ctx : NULL; + cplan.profiling_record_fn = cpu_ctx->profiling_enabled ? ggml_cpu_profiler_record_callback : NULL; + return ggml_graph_compute(cgraph, &cplan); } @@ -228,12 +265,15 @@ ggml_backend_t ggml_backend_cpu_init(void) { ctx->abort_callback = NULL; ctx->abort_callback_data = NULL; ctx->use_ref = false; + ctx->profiling_enabled = false; + ctx->profiling_split_id = -1; ggml_backend_t cpu_backend = new ggml_backend { - /* .guid = */ ggml_backend_cpu_guid(), - /* .iface = */ ggml_backend_cpu_i, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), - /* .context = */ ctx, + /* .guid = */ ggml_backend_cpu_guid(), + /* .iface = */ ggml_backend_cpu_i, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), + /* .context = */ ctx, + /* .profiler = */ nullptr, }; if (cpu_backend == NULL) { @@ -241,6 +281,43 @@ ggml_backend_t ggml_backend_cpu_init(void) { return NULL; } + // Register profiler + static auto cpu_prof_enable = [](void * ctx, bool enable) { + auto * cpu_ctx = (ggml_backend_cpu_context *) ctx; + cpu_ctx->profiling_enabled = enable; + if (!enable) { + cpu_ctx->profiling_records.clear(); + } + }; + static auto cpu_prof_reset = [](void * ctx) { + auto * cpu_ctx = (ggml_backend_cpu_context *) ctx; + 
cpu_ctx->profiling_records.clear(); + cpu_ctx->profiling_split_id = -1; + }; + static auto cpu_prof_set_split_id = [](void * ctx, int split_id) { + auto * cpu_ctx = (ggml_backend_cpu_context *) ctx; + cpu_ctx->profiling_split_id = split_id; + }; + static auto cpu_prof_get_records = [](void * ctx, const ggml_profile_record ** out) -> int { + auto * cpu_ctx = (ggml_backend_cpu_context *) ctx; + *out = cpu_ctx->profiling_records.data(); + return (int) cpu_ctx->profiling_records.size(); + }; + static auto cpu_prof_free = [](void * ctx) { + // Nothing to free - records are in the CPU context's vector + (void) ctx; + }; + + auto * profiler = new ggml_backend_profiler{ + /* .context = */ ctx, + /* .enable = */ cpu_prof_enable, + /* .reset = */ cpu_prof_reset, + /* .set_split_id = */ cpu_prof_set_split_id, + /* .get_records = */ cpu_prof_get_records, + /* .free_context = */ cpu_prof_free, + }; + ggml_backend_set_profiler(cpu_backend, profiler); + return cpu_backend; } diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 7d7f20af3a..45500ba1af 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -1339,6 +1339,9 @@ struct ggml_cuda_stream_context { } }; +// Forward declaration for profiler state (defined in ggml-cuda.cu) +struct ggml_cuda_profiler_state; + struct ggml_backend_cuda_context { int device; std::string name; @@ -1434,6 +1437,9 @@ struct ggml_backend_cuda_context { ggml_cuda_pool & pool() { return pool(device); } + + // Profiling + ggml_cuda_profiler_state * profiler_state = nullptr; }; struct ggml_cuda_mm_fusion_args_host { diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index d1239b1c5f..147b3efe80 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -1,6 +1,7 @@ #include "ggml-cuda.h" #include "ggml-impl.h" #include "ggml-backend-impl.h" +#include "ggml-profiler.h" #include "ggml-cuda/common.cuh" #include "ggml-cuda/acc.cuh" @@ -86,6 +87,90 
@@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); +// CUDA profiler state +struct ggml_cuda_profiler_state { + bool enabled = false; + int split_id = -1; + cudaStream_t stream = nullptr; + + static constexpr int MAX_PENDING_EVENTS = 4096; + std::vector start_events; + std::vector end_events; + int event_count = 0; + + std::vector records; + std::vector record_event_indices; + + void init(cudaStream_t stream) { + this->stream = stream; + start_events.reserve(MAX_PENDING_EVENTS); + end_events.reserve(MAX_PENDING_EVENTS); + } + + void reset() { + for (auto & ev : start_events) { + cudaEventDestroy(ev); + } + for (auto & ev : end_events) { + cudaEventDestroy(ev); + } + start_events.clear(); + end_events.clear(); + event_count = 0; + records.clear(); + record_event_indices.clear(); + } + + ~ggml_cuda_profiler_state() { + reset(); + } + + void record_start() { + cudaEvent_t ev; + cudaEventCreate(&ev); + cudaEventRecord(ev, stream); + start_events.push_back(ev); + event_count++; + } + + void record_end(const char * name, int backend_id, int split_id, uint64_t bytes, const char * extra, const int64_t ne[4]) { + cudaEvent_t ev; + cudaEventCreate(&ev); + cudaEventRecord(ev, stream); + end_events.push_back(ev); + record_event_indices.push_back(records.size()); + + ggml_profile_record rec; + rec.type = GGML_PROFILE_EVENT_OP; + rec.name = name; + rec.backend_id = backend_id; + rec.split_id = split_id; + rec.start_ns = 0; + rec.end_ns = 0; + rec.bytes = bytes; + rec.extra = extra; + if (ne) { + memcpy(rec.ne, ne, sizeof(rec.ne)); + } else { + memset(rec.ne, 0, sizeof(rec.ne)); + } + records.push_back(rec); + } + + void finalize() { + cudaStreamSynchronize(stream); + + for (int i = 0; i < (int)record_event_indices.size(); i++) { + float ms = 0.0f; + cudaEventElapsedTime(&ms, start_events[i], end_events[i]); + uint64_t ns = (uint64_t)(ms * 1e6f); + int rec_idx = record_event_indices[i]; + records[rec_idx].start_ns = 0; + records[rec_idx].end_ns = ns; + } + 
} +}; + [[noreturn]] void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) { int id = -1; // in case cudaGetDevice fails @@ -4035,8 +4120,23 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud #else GGML_UNUSED(integrated); #endif // NDEBUG + if (cuda_ctx->profiler_state != nullptr && cuda_ctx->profiler_state->enabled) { + cuda_ctx->profiler_state->record_start(); + } bool ok = ggml_cuda_compute_forward(*cuda_ctx, node); + + if (cuda_ctx->profiler_state != nullptr && cuda_ctx->profiler_state->enabled) { + cuda_ctx->profiler_state->record_end( + ggml_op_name(node->op), + -1, + cuda_ctx->profiler_state->split_id, + ggml_nbytes(node), + nullptr, + node->ne + ); + } + if (!ok) { GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); } @@ -4107,6 +4207,19 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cuda_set_device(cuda_ctx->device); + // Disable CUDA graphs when profiling (we need per-node timing) + bool was_graph_enabled = false; + if (cuda_ctx->profiler_state != nullptr && cuda_ctx->profiler_state->enabled) { +#ifdef USE_CUDA_GRAPH + const void * graph_key = ggml_cuda_graph_get_key(cgraph); + ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key); + was_graph_enabled = graph->is_enabled(); + if (was_graph_enabled) { + graph->disable_due_to_gpu_arch = true; + } +#endif + } + bool use_cuda_graph = false; bool cuda_graph_update_required = false; const void * graph_key = nullptr; @@ -4158,6 +4271,15 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cuda_graph_evaluate_and_capture(cuda_ctx, cgraph, use_cuda_graph, cuda_graph_update_required, graph_key); + // Restore CUDA graph enabled state after profiling + if (was_graph_enabled) { +#ifdef USE_CUDA_GRAPH + const void * graph_key_prof = ggml_cuda_graph_get_key(cgraph); + ggml_cuda_graph * graph = 
cuda_ctx->cuda_graph(graph_key_prof); + graph->disable_due_to_gpu_arch = false; +#endif + } + return GGML_STATUS_SUCCESS; } @@ -5306,12 +5428,68 @@ ggml_backend_t ggml_backend_cuda_init(int device) { } ggml_backend_t cuda_backend = new ggml_backend { - /* .guid = */ ggml_backend_cuda_guid(), - /* .iface = */ ggml_backend_cuda_interface, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device), - /* .context = */ ctx, + /* .guid = */ ggml_backend_cuda_guid(), + /* .iface = */ ggml_backend_cuda_interface, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device), + /* .context = */ ctx, + /* .profiler = */ nullptr, }; + // Register profiler + auto * prof_state = new ggml_cuda_profiler_state(); + prof_state->init(ctx->stream()); + ctx->profiler_state = prof_state; + + static auto cuda_prof_enable = [](void * ctx, bool enable) { + auto * cuda_ctx = (ggml_backend_cuda_context *)ctx; + if (cuda_ctx->profiler_state != nullptr) { + cuda_ctx->profiler_state->enabled = enable; + if (!enable) { + cuda_ctx->profiler_state->reset(); + } + } + }; + static auto cuda_prof_reset = [](void * ctx) { + auto * cuda_ctx = (ggml_backend_cuda_context *)ctx; + if (cuda_ctx->profiler_state != nullptr) { + cuda_ctx->profiler_state->reset(); + cuda_ctx->profiler_state->split_id = -1; + } + }; + static auto cuda_prof_set_split_id = [](void * ctx, int split_id) { + auto * cuda_ctx = (ggml_backend_cuda_context *)ctx; + if (cuda_ctx->profiler_state != nullptr) { + cuda_ctx->profiler_state->split_id = split_id; + } + }; + static auto cuda_prof_get_records = [](void * ctx, const ggml_profile_record ** out) -> int { + auto * cuda_ctx = (ggml_backend_cuda_context *)ctx; + if (cuda_ctx->profiler_state != nullptr) { + cuda_ctx->profiler_state->finalize(); + *out = cuda_ctx->profiler_state->records.data(); + return (int)cuda_ctx->profiler_state->records.size(); + } + *out = nullptr; + return 0; + }; + static auto cuda_prof_free = [](void * ctx) { + auto * cuda_ctx = 
(ggml_backend_cuda_context *)ctx; + if (cuda_ctx->profiler_state != nullptr) { + delete cuda_ctx->profiler_state; + cuda_ctx->profiler_state = nullptr; + } + }; + + auto * profiler = new ggml_backend_profiler { + /* .context = */ ctx, + /* .enable = */ cuda_prof_enable, + /* .reset = */ cuda_prof_reset, + /* .set_split_id = */ cuda_prof_set_split_id, + /* .get_records = */ cuda_prof_get_records, + /* .free_context = */ cuda_prof_free, + }; + ggml_backend_set_profiler(cuda_backend, profiler); + return cuda_backend; } diff --git a/ggml/src/ggml-profiler.cpp b/ggml/src/ggml-profiler.cpp new file mode 100644 index 0000000000..7d5d4c2ca1 --- /dev/null +++ b/ggml/src/ggml-profiler.cpp @@ -0,0 +1,74 @@ +#include "ggml-profiler.h" + +#include "ggml-backend-impl.h" +#include "ggml-impl.h" + +#include +#include + +#ifdef _WIN32 +# define WIN32_LEAN_AND_MEAN +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include +#else +# include +# include +#endif + +// +// Time utilities +// + +uint64_t ggml_profiler_time_ns(void) { +#ifdef _WIN32 + LARGE_INTEGER freq, count; + QueryPerformanceFrequency(&freq); + QueryPerformanceCounter(&count); + return (uint64_t) (count.QuadPart * 1000000000ULL / freq.QuadPart); +#elif defined(__APPLE__) + clock_serv_t cclock; + mach_timespec_t mts; + host_get_clock_service(mach_host_self(), SYSTEM_CLOCK, &cclock); + clock_get_time(cclock, &mts); + mach_port_deallocate(mach_task_self(), cclock); + return (uint64_t) mts.tv_sec * 1000000000ULL + (uint64_t) mts.tv_nsec; +#elif defined(CLOCK_MONOTONIC_RAW) + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC_RAW, &ts); + return (uint64_t) ts.tv_sec * 1000000000ULL + (uint64_t) ts.tv_nsec; +#else + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (uint64_t) ts.tv_sec * 1000000000ULL + (uint64_t) ts.tv_nsec; +#endif +} + +// +// Backend profiler registration +// + +void ggml_backend_set_profiler(ggml_backend_t backend, ggml_backend_profiler_t profiler) { + if (backend == NULL) { + return; 
+ } + + // Free any existing profiler + if (backend->profiler != NULL) { + if (backend->profiler->free_context != NULL) { + backend->profiler->free_context(backend->profiler->context); + } + delete backend->profiler; + backend->profiler = NULL; + } + + backend->profiler = profiler; +} + +ggml_backend_profiler_t ggml_backend_get_profiler(ggml_backend_t backend) { + if (backend == NULL) { + return NULL; + } + return backend->profiler; +} diff --git a/include/llama.h b/include/llama.h index 60e4b6b2ef..367a51b081 100644 --- a/include/llama.h +++ b/include/llama.h @@ -550,6 +550,8 @@ extern "C" { LLAMA_API llama_memory_t llama_get_memory (const struct llama_context * ctx); LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type + LLAMA_API struct ggml_backend_sched * llama_context_get_sched(const struct llama_context * ctx); + LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model); LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model); diff --git a/src/llama-context.cpp b/src/llama-context.cpp index f6ce2817a8..5f34b6c20b 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1,5 +1,6 @@ #include "llama-context.h" +#include "ggml-profiler.h" #include "llama-arch.h" #include "llama-impl.h" #include "llama-batch.h" @@ -2189,6 +2190,11 @@ ggml_status llama_context::graph_compute( LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status); } + // If profiling is enabled, synchronize to ensure records are complete + if (ggml_backend_sched_get_profiling(sched.get())) { + ggml_backend_sched_synchronize(sched.get()); + } + // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(sched)); return status; @@ -3025,6 +3031,10 @@ enum llama_pooling_type llama_pooling_type(const llama_context * ctx) { return ctx->pooling_type(); } +ggml_backend_sched_t 
llama_context_get_sched(const llama_context * ctx) { + return ctx->get_sched(); +} + void llama_attach_threadpool( llama_context * ctx, ggml_threadpool_t threadpool, diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index c58fda83e2..6aae2fbb31 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -644,6 +644,23 @@ int main(int argc, char ** argv) { ctx_cli.ctx_server.terminate(); inference_thread.join(); + // Export profiling data if profiling was enabled + if (params.profiling) { + ggml_backend_sched_t sched = llama_context_get_sched(ctx_cli.ctx_server.get_llama_context()); + if (sched != nullptr) { + if (params.profiling_output.empty()) { + ggml_backend_sched_print_profiling(sched); + } else { + int ret = ggml_backend_sched_export_profiling_json(sched, params.profiling_output.c_str()); + if (ret == 0) { + console::log("\nProfiling data exported to: %s\n", params.profiling_output.c_str()); + } else { + console::error("\nFailed to export profiling data to: %s\n", params.profiling_output.c_str()); + } + } + } + } + // bump the log level to display timings common_log_set_verbosity_thold(LOG_LEVEL_INFO); llama_memory_breakdown_print(ctx_cli.ctx_server.get_llama_context()); diff --git a/tools/completion/completion.cpp b/tools/completion/completion.cpp index 716a30fe9a..380658ff83 100644 --- a/tools/completion/completion.cpp +++ b/tools/completion/completion.cpp @@ -997,6 +997,23 @@ int main(int argc, char ** argv) { llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); } + // Export profiling data if profiling was enabled + if (params.profiling) { + ggml_backend_sched_t sched = llama_context_get_sched(ctx); + if (sched != nullptr) { + if (params.profiling_output.empty()) { + ggml_backend_sched_print_profiling(sched); + } else { + int ret = ggml_backend_sched_export_profiling_json(sched, params.profiling_output.c_str()); + if (ret == 0) { + LOG("\nProfiling data exported to: %s\n", params.profiling_output.c_str()); + } 
else { + LOG_ERR("\nFailed to export profiling data to: %s\n", params.profiling_output.c_str()); + } + } + } + } + LOG("\n\n"); common_perf_print(ctx, smpl); diff --git a/tools/profiler/__init__.py b/tools/profiler/__init__.py new file mode 100644 index 0000000000..cdd8dd543e --- /dev/null +++ b/tools/profiler/__init__.py @@ -0,0 +1 @@ +# llama.cpp profiler analysis tools diff --git a/tools/profiler/profiler.py b/tools/profiler/profiler.py new file mode 100644 index 0000000000..a958a278ee --- /dev/null +++ b/tools/profiler/profiler.py @@ -0,0 +1,986 @@ +#!/usr/bin/env python3 +"""llama.cpp cross-backend profiler analysis tool. + +Usage: + python -m tools.profiler.profiler profile.json + python -m tools.profiler.profiler profile.json --chrome-trace trace.json +""" + +from __future__ import annotations + +import json +import sys +from dataclasses import dataclass, field +from pathlib import Path +from typing import Optional + + +OP_EVENT = 0 +COPY_EVENT = 1 + +TYPE_NAMES = {0: "OP", 1: "COPY"} + + +@dataclass +class ProfileRecord: + type: int + name: str + backend_id: int + split_id: int + start_ns: int + duration_ns: int + bytes: int + extra: Optional[str] + ne: list[int] = field(default_factory=lambda: [0, 0, 0, 0]) + + @property + def type_name(self) -> str: + return TYPE_NAMES.get(self.type, f"UNKNOWN({self.type})") + + @property + def duration_us(self) -> float: + return self.duration_ns / 1000.0 + + @property + def duration_ms(self) -> float: + return self.duration_ns / 1_000_000.0 + + @property + def bandwidth_gbps(self) -> float: + """Bandwidth in GB/s (only meaningful for copy events).""" + if self.duration_ns == 0 or self.bytes == 0: + return 0.0 + return self.bytes / self.duration_ns + + @property + def shape_str(self) -> str: + """Human-readable tensor shape string, e.g. 
'[4096, 4096]'.""" + dims = [n for n in self.ne if n > 0] + if not dims: + return "" + return "[" + ", ".join(str(d) for d in dims) + "]" + + @property + def ne_elements(self) -> int: + """Total number of elements.""" + result = 1 + for n in self.ne: + if n > 0: + result *= n + return result + + def to_dict(self) -> dict: + return { + "type": self.type, + "name": self.name, + "backend_id": self.backend_id, + "split_id": self.split_id, + "start_ns": self.start_ns, + "duration_ns": self.duration_ns, + "bytes": self.bytes, + "extra": self.extra, + "ne": self.ne, + } + + +@dataclass +class OpStats: + name: str + event_type: int + backend_id: int + count: int = 0 + total_ns: int = 0 + min_ns: int = 0 + max_ns: int = 0 + total_bytes: int = 0 + representative_ne: list[int] = field(default_factory=lambda: [0, 0, 0, 0]) + + @property + def avg_ns(self) -> float: + return self.total_ns / self.count if self.count > 0 else 0 + + @property + def avg_us(self) -> float: + return self.avg_ns / 1000.0 + + @property + def total_ms(self) -> float: + return self.total_ns / 1_000_000.0 + + @property + def min_us(self) -> float: + return self.min_ns / 1000.0 + + @property + def max_us(self) -> float: + return self.max_ns / 1000.0 + + @property + def bandwidth_gbps(self) -> float: + if self.total_ns == 0 or self.total_bytes == 0: + return 0.0 + return self.total_bytes / self.total_ns + + @property + def time_per_byte_ns(self) -> float: + """Time per byte (lower = more efficient).""" + if self.total_bytes == 0: + return float("inf") + return self.total_ns / self.total_bytes + + @property + def type_name(self) -> str: + return TYPE_NAMES.get(self.event_type, f"UNKNOWN({self.event_type})") + + +class ProfileData: + def __init__(self, records: list[ProfileRecord], metadata: dict): + self.records = records + self.metadata = metadata + + @classmethod + def load(cls, filepath: str | Path) -> ProfileData: + """Load a profiler JSON file.""" + with open(filepath, "r") as f: + data = json.load(f) + 
+ if data.get("profiler") != "ggml": + print(f"Warning: file may not be a ggml profiler output (profiler={data.get('profiler')})") + + records = [] + for r in data.get("records", []): + ne = r.get("ne", [0, 0, 0, 0]) + if isinstance(ne, list) and len(ne) < 4: + ne = ne + [0] * (4 - len(ne)) + elif not isinstance(ne, list): + ne = [0, 0, 0, 0] + records.append(ProfileRecord( + type=r.get("type", 0), + name=r.get("name", "unknown"), + backend_id=r.get("backend_id", 0), + split_id=r.get("split_id", 0), + start_ns=r.get("start_ns", 0), + duration_ns=r.get("duration_ns", 0), + bytes=r.get("bytes", 0), + extra=r.get("extra"), + ne=ne, + )) + + backends_raw = data.get("backends", []) + backends = [] + for b in backends_raw: + backends.append({ + "id": b.get("id", 0), + "name": b.get("name", "unknown"), + "device": b.get("device", "unknown"), + "device_type": b.get("device_type", 0), + }) + + metadata = { + "version": data.get("version", 0), + "total_records": data.get("total_records", len(records)), + "total_ns": data.get("total_ns", sum(r.duration_ns for r in records)), + "backends": backends, + } + + return cls(records, metadata) + + @property + def total_ns(self) -> int: + return sum(r.duration_ns for r in self.records) + + @property + def total_ms(self) -> float: + return self.total_ns / 1_000_000.0 + + def stats(self) -> list[OpStats]: + """Aggregate stats grouped by (name, type, backend_id).""" + groups: dict[tuple, OpStats] = {} + for rec in self.records: + key = (rec.name, rec.type, rec.backend_id) + if key not in groups: + groups[key] = OpStats( + name=rec.name, + event_type=rec.type, + backend_id=rec.backend_id, + min_ns=rec.duration_ns, + max_ns=rec.duration_ns, + representative_ne=list(rec.ne), + ) + s = groups[key] + s.count += 1 + s.total_ns += rec.duration_ns + s.min_ns = min(s.min_ns, rec.duration_ns) + s.max_ns = max(s.max_ns, rec.duration_ns) + s.total_bytes += rec.bytes + + # Track the ne from the longest individual call + if rec.duration_ns >= 
s.max_ns: + s.representative_ne = list(rec.ne) + + return sorted(groups.values(), key=lambda s: s.total_ns, reverse=True) + + def top_operations(self, n: int = 10) -> list[OpStats]: + """Return the N most time-consuming operations (aggregated).""" + return self.stats()[:n] + + def top_kernels(self, n: int = 10) -> list[ProfileRecord]: + """Return the N longest individual kernel executions.""" + return sorted(self.records, key=lambda r: r.duration_ns, reverse=True)[:n] + + def by_backend(self) -> dict[int, list[ProfileRecord]]: + """Group records by backend ID.""" + groups: dict[int, list[ProfileRecord]] = {} + for rec in self.records: + groups.setdefault(rec.backend_id, []).append(rec) + return dict(sorted(groups.items())) + + def timeline(self) -> list[ProfileRecord]: + """Return records sorted by start_ns for timeline visualization.""" + return sorted(self.records, key=lambda r: r.start_ns) + + def inefficiency_ranking(self, n: int = 10) -> list[OpStats]: + """Rank operations by time per byte (inefficiency). 
Lower is better.""" + all_stats = [s for s in self.stats() if s.total_bytes > 0 and s.event_type == OP_EVENT] + return sorted(all_stats, key=lambda s: s.time_per_byte_ns, reverse=True)[:n] + + def summary(self) -> None: + """Print a formatted summary table to stdout.""" + print(f"\n{'='*80}") + print(f" ggml Profiler Summary") + print(f"{'='*80}") + print(f" Total records: {len(self.records)}") + print(f" Total time: {self.total_ms:.2f} ms") + print(f" Unique ops: {len(set((r.name, r.type, r.backend_id) for r in self.records))}") + print(f"{'='*80}\n") + + stats = self.stats() + if not stats: + print(" No profiling data.\n") + return + + print(f" {'TYPE':<5} {'BKND':>4} {'Operation':<28} {'%Time':>7} {'Count':>6} " + f"{'Total':>10} {'Avg':>10} {'Min':>10} {'Max':>10} {'Bytes':>10}") + print(f" {'':->5} {'':->4} {'':->28} {'':->7} {'':->6} " + f"{'(ms)':>10} {'(us)':>10} {'(us)':>10} {'(us)':>10} {'':->10}") + + for s in stats: + pct = 100.0 * s.total_ns / self.total_ns if self.total_ns > 0 else 0 + + line = (f" {s.type_name:<5} {s.backend_id:>4} {s.name:<28} {pct:>6.1f}% " + f"{s.count:>6} {s.total_ms:>10.2f} {s.avg_us:>10.2f} " + f"{s.min_us:>10.2f} {s.max_us:>10.2f}") + + if s.total_bytes > 0: + bw = s.bandwidth_gbps + bytes_str = f"{s.total_bytes / 1e6:.1f} MB" + if s.event_type == COPY_EVENT: + line += f" {bw:>8.2f} GB/s" + else: + line += f" {bytes_str:>10}" + else: + line += f" {'':>10}" + + # Tensor shape from longest call + shape_dims = [n for n in s.representative_ne if n > 0] + if shape_dims: + line += f" [{', '.join(str(d) for d in shape_dims)}]" + + print(line) + + backend_groups = self.by_backend() + if len(backend_groups) > 1: + print(f"\n --- By Backend ---") + for bid, recs in sorted(backend_groups.items()): + bk_total = sum(r.duration_ns for r in recs) + bk_pct = 100.0 * bk_total / self.total_ns if self.total_ns > 0 else 0 + print(f" Backend {bid}: {bk_total / 1e6:.2f} ms ({bk_pct:.1f}%) — {len(recs)} records") + + inef = 
self.inefficiency_ranking(5) + if inef: + print(f"\n --- Top 5 Inefficient Operations (time/byte) ---") + for s in inef: + print(f" {s.name:<28} {s.time_per_byte_ns / 1000:.2f} us/byte " + f"({s.count} calls, {s.total_bytes / 1e6:.1f} MB)") + + top_k = self.top_kernels(5) + print(f"\n --- Top 5 Longest Kernels ---") + for rec in top_k: + shape = f" {rec.shape_str}" if rec.shape_str else "" + print(f" {rec.type_name:<5} {rec.name:<28} {rec.duration_us:>10.2f} us{shape} " + f"(split={rec.split_id}, backend={rec.backend_id})") + + print() + + def export_chrome_trace(self, filepath: str | Path) -> None: + """Export as Chrome Trace Event format for chrome://tracing.""" + events = [] + + # Build backend name mapping and remap to non-negative PIDs + # (Chrome cannot handle negative PIDs) + backend_ids = sorted(set(rec.backend_id for rec in self.records)) + backend_names: dict[int, str] = {} + pid_map: dict[int, int] = {} + + # Use metadata from JSON if available + metadata_backends = self.metadata.get("backends", []) + backend_by_id: dict[int, dict] = {b["id"]: b for b in metadata_backends} + + device_type_names = {0: "CPU", 1: "GPU", 2: "ACCEL"} + for idx, bid in enumerate(backend_ids): + pid_map[bid] = idx + if bid in backend_by_id: + binfo = backend_by_id[bid] + dev_type = binfo.get("device_type", 0) + dev_name = binfo.get("device", "") + type_name = device_type_names.get(dev_type, "Device") + if dev_name and dev_name != "unknown": + backend_names[bid] = f"{type_name}: {dev_name}" + else: + backend_names[bid] = f"{type_name}: {binfo.get('name', f'Backend {bid}')}" + else: + backend_names[bid] = f"Backend {bid}" + + # Process metadata events + for bid in backend_ids: + pid = pid_map[bid] + events.append({ + "ph": "M", # metadata + "pid": pid, + "name": "process_name", + "args": {"name": backend_names[bid]}, + }) + + # Group records by (backend_id, split_id) and lay them out sequentially + # since we don't have reliable global timestamps across backends. 
+ # Within each group, events are cumulative. + from collections import OrderedDict + groups: OrderedDict[tuple, list[ProfileRecord]] = OrderedDict() + for rec in self.records: + key = (rec.backend_id, rec.split_id) + groups.setdefault(key, []).append(rec) + + # Assign timestamps: each group starts after the previous one, + # and events within a group are sequential (cumulative duration). + global_ts = 0.0 # microseconds + for key, recs in groups.items(): + backend_id, split_id = key + pid = pid_map[backend_id] + tid = f"split_{split_id}" + + for rec in recs: + cat = "copy" if rec.type == COPY_EVENT else "compute" + events.append({ + "ph": "X", # complete event + "pid": pid, + "tid": tid, + "name": rec.name, + "ts": global_ts, + "dur": rec.duration_ns / 1000.0, # us + "cat": cat, + "args": { + "bytes": rec.bytes, + "duration_us": rec.duration_ns / 1000.0, + "shape": rec.shape_str, + }, + }) + global_ts += rec.duration_ns / 1000.0 + + # Add a small gap between groups for visual separation + global_ts += 1.0 + + trace = {"traceEvents": events} + with open(filepath, "w") as f: + json.dump(trace, f, indent=2) + + print(f"Chrome trace exported to: {filepath}") + print(f"Open chrome://tracing in Chrome/Edge and load this file.") + + def export_html_viewer(self, filepath: str | Path, max_records: int = 0) -> None: + """Export a self-contained interactive HTML timeline viewer using Canvas.""" + import json as json_mod + + metadata_backends = self.metadata.get("backends", []) + backend_by_id: dict[int, dict] = {b["id"]: b for b in metadata_backends} + + backend_names: dict[int, str] = {} + for bid in sorted(set(rec.backend_id for rec in self.records)): + binfo = backend_by_id.get(bid, {}) + name = binfo.get("name", f"Backend {bid}") + device = binfo.get("device", "") + backend_names[bid] = device if device and device != "unknown" else name + + events: list[dict] = [] + cum_us = 0.0 + for rec in self.records: + dur_us = rec.duration_ns / 1000.0 + events.append({ + "n": 
rec.name, + "d": dur_us, + "s": rec.shape_str, + "b": rec.bytes, + "t": rec.type, + "bid": rec.backend_id, + "start": cum_us, + }) + cum_us += dur_us + total_us = cum_us + + if max_records > 0 and len(events) > max_records: + stride = len(events) // max_records + events = events[::stride][:max_records] + + if total_us == 0: + print("No profiling data to export.") + return + + header_stats = str(len(events)) + ' events | ' + f'{total_us/1000:.1f}' + ' ms' + + # Build backend name map with string keys for JSON + bn_str = {str(k): v for k, v in backend_names.items()} + + # --- HTML --- + html = ( + '\n' + 'ggml Profiler\n\n' + '

ggml Profiler Timeline

' + '' + header_stats + '
\n' + '
' + '' + '' + '' + '' + '' + '' + '
\n' + '
\n' + '
\n' + '
\n' + '
\n' + '
\n' + '
\n' + '' + + with open(filepath, "w") as f: + f.write(html) + + print(f"HTML viewer exported to: {filepath}") + print(f"Open in browser: file://{Path(filepath).resolve()}") + + +def load(filepath: str | Path) -> ProfileData: + """Load a profiler JSON file.""" + return ProfileData.load(filepath) + + +def main() -> None: + import argparse + + parser = argparse.ArgumentParser( + description="llama.cpp profiler analysis tool", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + python -m tools.profiler.profiler profile.json + python -m tools.profiler.profiler profile.json --chrome-trace trace.json + python -m tools.profiler.profiler profile.json --top-ops 20 + """, + ) + parser.add_argument("profile", help="Path to profiler JSON file") + parser.add_argument("--chrome-trace", metavar="FILE", + help="Export as Chrome Trace Event format") + parser.add_argument("--html-viewer", metavar="FILE", + help="Export as interactive HTML timeline viewer") + parser.add_argument("--html-max-records", type=int, default=5000, + help="Max records per backend in HTML viewer (0=unlimited, downsample to reduce file size)") + parser.add_argument("--top-ops", type=int, default=0, + help="Show top N operations (0 = show summary)") + parser.add_argument("--top-kernels", type=int, default=0, + help="Show top N longest kernels") + parser.add_argument("--inefficiency", action="store_true", + help="Show inefficiency ranking") + + args = parser.parse_args() + + data = load(args.profile) + + if args.chrome_trace: + data.export_chrome_trace(args.chrome_trace) + + if args.html_viewer: + data.export_html_viewer(args.html_viewer, max_records=args.html_max_records) + + if args.top_ops > 0: + print(f"\nTop {args.top_ops} operations by total time:\n") + for s in data.top_operations(args.top_ops): + pct = 100.0 * s.total_ns / data.total_ns if data.total_ns > 0 else 0 + print(f" {s.type_name:<5} {s.backend_id:>4} {s.name:<28} {pct:>6.1f}% " + f"{s.count:>6}x {s.total_ms:>10.2f} 
ms avg={s.avg_us:.2f} us") + print() + + if args.top_kernels > 0: + print(f"\nTop {args.top_kernels} longest kernels:\n") + for rec in data.top_kernels(args.top_kernels): + print(f" {rec.type_name:<5} {rec.backend_id:>4} {rec.name:<28} " + f"{rec.duration_us:>10.2f} us split={rec.split_id}") + print() + + if args.inefficiency: + print("\nInefficiency ranking (time/byte for operations with data):\n") + for s in data.inefficiency_ranking(10): + print(f" {s.name:<28} {s.time_per_byte_ns / 1000:>10.2f} us/byte " + f"{s.count:>6} calls {s.total_bytes / 1e6:.1f} MB") + print() + + if args.top_ops == 0 and args.top_kernels == 0 and not args.inefficiency and not args.chrome_trace and not args.html_viewer: + data.summary() + + +if __name__ == "__main__": + main()