diff --git a/common/arg.cpp b/common/arg.cpp index 538d2a4b0a..632c89edce 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1073,6 +1073,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.completion = true; } )); + add_opt(common_arg( + {"--profile"}, + "enable cross-backend profiling (CPU, BLAS, CUDA)", + [](common_params & params) { + params.profiling = true; + } + ).set_examples({LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_DEBUG})); + add_opt(common_arg( + {"--profile-output"}, "FNAME", + "write profiling JSON output to FNAME (default: stdout)", + [](common_params & params, const std::string & value) { + params.profiling = true; + params.profiling_output = value; + } + ).set_examples({LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_DEBUG})); add_opt(common_arg( {"--verbose-prompt"}, string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? 
"true" : "false"), diff --git a/common/common.cpp b/common/common.cpp index 60396af1f8..17e22cec64 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2,6 +2,7 @@ #include "gguf.h" #include "common.h" +#include "ggml-profiler.h" #include "log.h" #include "llama.h" #include "sampling.h" @@ -1236,6 +1237,14 @@ common_init_result::common_init_result(common_params & params) : return; } + if (params.profiling) { + ggml_backend_sched_t sched = llama_context_get_sched(lctx); + if (sched != nullptr) { + ggml_backend_sched_set_profiling(sched, true); + LOG_INF("%s: profiling enabled\n", __func__); + } + } + pimpl->context.reset(lctx); } diff --git a/common/common.h b/common/common.h index 17dc3fb232..72f6320348 100644 --- a/common/common.h +++ b/common/common.h @@ -3,6 +3,7 @@ #pragma once #include "ggml-opt.h" +#include "ggml-profiler.h" #include "ggml.h" #include "llama-cpp.h" @@ -669,6 +670,10 @@ struct common_params { bool spm_infill = false; // suffix/prefix/middle pattern for infill + // profiling + bool profiling = false; // enable cross-backend profiling + std::string profiling_output; // path to write profiling JSON output (empty = stdout) + // batched-bench params bool batched_bench_output_jsonl = false; diff --git a/docs/cross-profiler.md b/docs/cross-profiler.md new file mode 100644 index 0000000000..ad90b9e20a --- /dev/null +++ b/docs/cross-profiler.md @@ -0,0 +1,225 @@ +# Cross-Backend Profiler + +llama.cpp includes a built-in cross-backend profiler that captures per-operation timing, data transfer costs, and tensor shapes across all compute backends. It works with any application built on the ggml scheduler — no source changes needed. 
+ +## Supported Backends + +| Backend | Status | Timing method | +|---------|--------|---------------| +| CPU | Supported | Wall-clock (`CLOCK_MONOTONIC_RAW`) | +| CUDA | Supported | `cudaEvent` GPU timestamps | +| Vulkan | Supported | GPU timestamp queries | +| BLAS | Supported | Wall-clock | +| Metal | Not yet supported | — | +| OpenCL | Not yet supported | — | + +The scheduler also profiles **data copies** (H2D, D2H, D2D) between backends regardless of which backends have native profiler support. + +## Enabling the Profiler + +There are two independent ways to enable profiling. They can be used separately or together. + +### CLI flags (`--profile`, `--profile-output`) + +Available in `llama-cli`, `llama-completion`, `llama-server`, and `debug`: + +```bash +# Print summary to stdout +llama-completion -m model.gguf --profile -p "Hello world" + +# Export to JSON +llama-completion -m model.gguf --profile --profile-output profile.json -p "Hello world" + +# Export to plain text +llama-completion -m model.gguf --profile --profile-output profile.txt -p "Hello world" +``` + +The output format is chosen by file extension: `.json` for JSON, `.txt` for plain text. Any other extension defaults to JSON. + +### Environment variable (`GGML_PROFILE`) + +The `GGML_PROFILE` environment variable enables profiling at the ggml scheduler level. This works with **any** application that uses the scheduler — including third-party tools like `sd.cpp` — without CLI flag support. 
+ +```bash +# Print summary to stdout +GGML_PROFILE=1 llama-completion -m model.gguf -p "Hello world" + +# Export JSON +GGML_PROFILE=profile.json llama-completion -m model.gguf -p "Hello world" + +# Export plain text +GGML_PROFILE=profile.txt llama-completion -m model.gguf -p "Hello world" + +# Works with any ggml-based application +GGML_PROFILE=1 sd -m model.gguf -p "a cat" +``` + +| Value | Behavior | +|-------|----------| +| `1`, `stdout`, or empty | Print summary to stdout | +| `path.json` | Export JSON to file | +| `path.txt` | Export plain text to file | +| Any other path | Export JSON to file | + +The export happens automatically when the scheduler is freed (typically at program exit). + +## Output Formats + +### Console summary (stdout) + +The default when `--profile` is used without `--profile-output`, or `GGML_PROFILE=1`: + +``` +=== Profiling Summary === + [OP ] backend 0 MUL_MAT 45.2% count=1200 total= 120.50 ms avg= 100.42 us ... 12.30 GB/s [4096 x 4096] + [OP ] backend 1 MUL_MAT_ID 30.1% count= 600 total= 80.20 ms avg= 133.67 us ... 0.08 GB/s [2688 x 1856 x 128] + [COPY] backend 0 copy_H2D 5.3% count= 200 total= 14.10 ms avg= 70.50 us ... 2.50 GB/s + ... +``` + +Each line shows: event type (OP or COPY), backend index, operation name, percentage of total time, call count, timing stats, bandwidth, and representative tensor shape. + +### Plain text (`.txt`) + +A more detailed report with three sections: + +1. **Profiling Summary** — total time, record count, unique ops +2. **Per-Backend Summary** — ops and copies per backend with aggregate bandwidth +3. **Operations table** — full breakdown with bandwidth and tensor shapes for all source tensors + +### JSON (`.json`) + +Machine-readable format suitable for the Python analysis tool. 
Contains: + +- `version`: Format version (currently `2`) +- `backends[]`: Backend metadata (name, device, device type) +- `records[]`: Every profiling event with: + - `type`: `0` = OP, `1` = COPY + - `name`: Operation name (e.g. `"MUL_MAT"`, `"copy_H2D"`) + - `backend_id`, `split_id`: Scheduler indices + - `start_ns`, `duration_ns`: Timing in nanoseconds + - `bytes`: Output tensor size (OPs) or transfer size (COPYs) + - `extra`: Fusion name for fused ops, or `null` + - `ne_src0`, `ne_src1`, `ne_src2`: Source tensor dimensions (4-element arrays) + +`ne_src2` is populated only for `MUL_MAT_ID` (expert selection indices); it is `[0,0,0,0]` for all other ops. + +## Python Analysis Tool + +The `tools/profiler/profiler.py` script reads JSON exports and produces analysis reports and visualizations. + +### Basic usage + +```bash +# Print summary +python -m tools.profiler.profiler profile.json + +# Show top 10 operations by time +python -m tools.profiler.profiler profile.json --top-ops 10 + +# Show top 10 longest individual kernels +python -m tools.profiler.profiler profile.json --top-kernels 10 + +# Show inefficiency ranking (highest time-per-byte) +python -m tools.profiler.profiler profile.json --inefficiency +``` + +### Export visualizations + +```bash +# Interactive HTML timeline (self-contained, no dependencies) +python -m tools.profiler.profiler profile.json --html-viewer timeline.html + +# Chrome Trace format (open in chrome://tracing or Perfetto) +python -m tools.profiler.profiler profile.json --chrome-trace trace.json + +# Downsample large traces for the HTML viewer +python -m tools.profiler.profiler profile.json --html-viewer timeline.html --html-max-records 50000 +``` + +Multiple exports can be combined in a single invocation: + +```bash +python -m tools.profiler.profiler profile.json --html-viewer timeline.html --chrome-trace trace.json --top-ops 20 +``` + +### CLI reference + +| Argument | Description | +|----------|-------------| +| `profile` (positional) | 
Path to profiler JSON file | +| `--chrome-trace FILE` | Export Chrome Trace Event format | +| `--html-viewer FILE` | Export interactive HTML timeline | +| `--html-max-records N` | Limit records in HTML output (0 = unlimited) | +| `--top-ops N` | Show top N operations by total time | +| `--top-kernels N` | Show top N longest individual kernels | +| `--inefficiency` | Rank operations by time per byte (higher = worse) | + +### HTML viewer features + +The HTML viewer is a self-contained file with no external dependencies: + +- **Canvas timeline** with per-backend lanes and color-coded operations +- **Zoom controls** (1s / 100ms / 1ms / 100us) and mouse drag navigation +- **Minimap** showing the full trace with a viewport indicator +- **Hover tooltips** with operation name, duration, shape, and bytes +- **Stats table** with collapsible tree: Operation → Backend → Tensor shape, showing % time, count, avg/min/max, and bandwidth +- **Legend** showing the most frequent operation types + +## What Gets Measured + +### OP events + +Every tensor operation (MUL_MAT, ADD, UNARY, FLASH_ATTN_EXT, etc.) is recorded with: + +- **Timing**: Start/end timestamps (nanosecond precision) +- **Bytes**: Output tensor size (`ggml_nbytes(node)`) +- **Tensor shapes**: Dimensions of `src[0]`, `src[1]`, and `src[2]` (when applicable) +- **Bandwidth**: Computed as `bytes / duration` — useful for identifying memory-bound vs compute-bound operations + +### COPY events + +Data transfers between backends: + +- **Direction**: `copy_H2D` (host→device), `copy_D2H` (device→host), `copy_D2D` (device→device) +- **Bytes**: Exact transfer size +- **Bandwidth**: Transfer throughput + +### MoE weight copies + +When `--cpu-moe` is used, the scheduler selectively copies only the active experts. These partial copies are recorded as individual COPY events with the actual bytes transferred. 
+ +## Programmatic API + +For custom applications, the profiler can be controlled through the C API defined in `ggml/include/ggml-profiler.h`: + +```c +// Enable profiling on a scheduler +ggml_backend_sched_set_profiling(sched, true); + +// ... run inference ... + +// Get raw records +const ggml_profile_record * records; +int n = ggml_backend_sched_get_profiling_records(sched, &records); + +// Or export directly +ggml_backend_sched_print_profiling(sched); // stdout +ggml_backend_sched_export_profiling_json(sched, "profile.json"); // JSON file +ggml_backend_sched_export_profiling_text(sched, "profile.txt"); // text file +ggml_backend_sched_write_profiling_json(sched, fp); // JSON to FILE* +ggml_backend_sched_write_profiling_text(sched, fp); // text to FILE* + +// Reset for next measurement window +ggml_backend_sched_reset_profiling(sched); +``` + +Records accumulate across multiple `graph_compute` calls until explicitly reset or the scheduler is freed. + +## Tips + +- **Prompt eval vs generation**: The profiler captures all graph computes. During prompt evaluation you'll see larger batch sizes in tensor shapes; during generation, batch size is typically 1-2. +- **Vulkan concurrent mode**: When Vulkan dispatches multiple operations concurrently, they are reported as a single combined record spanning the full GPU time interval. +- **Bandwidth interpretation**: For compute ops, bandwidth = `output_bytes / duration`. This is not memory bandwidth — it's a proxy for throughput. MUL_MAT with low bandwidth typically indicates compute-bound behavior; high bandwidth indicates memory-bound. +- **Large traces**: For long inference runs, the JSON can be large. Use `--html-max-records` to downsample the HTML viewer, or use Chrome Trace format which handles large files well. +- **Multiple backends**: Backend IDs in the output correspond to the scheduler's priority order (0 = highest priority, typically GPU; last = CPU). 
diff --git a/examples/debug/debug.cpp b/examples/debug/debug.cpp index ec80be19ba..fe2b883c12 100644 --- a/examples/debug/debug.cpp +++ b/examples/debug/debug.cpp @@ -244,6 +244,29 @@ int main(int argc, char ** argv) { return 1; } + // Export profiling data if profiling was enabled + if (params.profiling) { + ggml_backend_sched_t sched = llama_context_get_sched(ctx); + if (sched != nullptr) { + if (params.profiling_output.empty()) { + ggml_backend_sched_print_profiling(sched); + } else { + const std::string & path = params.profiling_output; + int ret; + if (path.size() >= 4 && path.compare(path.size() - 4, 4, ".txt") == 0) { + ret = ggml_backend_sched_export_profiling_text(sched, path.c_str()); + } else { + ret = ggml_backend_sched_export_profiling_json(sched, path.c_str()); + } + if (ret == 0) { + LOG("\nProfiling data exported to: %s\n", path.c_str()); + } else { + LOG_ERR("\nFailed to export profiling data to: %s\n", path.c_str()); + } + } + } + } + LOG("\n"); llama_perf_context_print(ctx); diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h index e3e067c916..cbbac0b663 100644 --- a/ggml/include/ggml-cpu.h +++ b/ggml/include/ggml-cpu.h @@ -22,6 +22,24 @@ extern "C" { // use only reference implementations bool use_ref; + + // profiler context (set by backend when profiling is enabled, NULL otherwise) + // when non-NULL, the compute loop will record per-node timing + void * profiling_context; + + // callback for recording a profile record from C code (set by backend when profiling) + // params: context, type, name, split_id, start_ns, end_ns, bytes, extra, ne_src0[4], ne_src1[4], ne_src2[4] + void (*profiling_record_fn)(void * context, + int type, + const char * name, + int split_id, + uint64_t start_ns, + uint64_t end_ns, + uint64_t bytes, + const char * extra, + const int64_t ne_src0[4], + const int64_t ne_src1[4], + const int64_t ne_src2[4]); }; // numa strategies diff --git a/ggml/include/ggml-profiler.h b/ggml/include/ggml-profiler.h new file mode 
100644 index 0000000000..2328f6b49f --- /dev/null +++ b/ggml/include/ggml-profiler.h @@ -0,0 +1,112 @@ +#pragma once + +#include "ggml-backend.h" +#include "ggml.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// +// Profiler +// + +// Profile event types +enum ggml_profile_event_type { + GGML_PROFILE_EVENT_OP, // single operation execution (computation kernel) + GGML_PROFILE_EVENT_COPY, // data transfer between devices +}; + +// A single profiling record representing a timed interval +typedef struct ggml_profile_record { + enum ggml_profile_event_type type; + const char * name; // operation name (e.g., "mul_mat", "copy_H2D") + int backend_id; // scheduler's backend index (0 = highest priority) + int split_id; // which graph split (0..n_splits-1) + uint64_t start_ns; // start timestamp in nanoseconds + uint64_t end_ns; // end timestamp in nanoseconds + uint64_t bytes; // bytes transferred (for copy) or tensor size (for ops) + const char * extra; // fusion name for fused ops, or NULL + int64_t ne_src0[4]; // src[0] tensor dimensions (e.g. weight matrix for MUL_MAT) + int64_t ne_src1[4]; // src[1] tensor dimensions (e.g. input matrix for MUL_MAT) + int64_t ne_src2[4]; // src[2] tensor dimensions (e.g. 
ids for MUL_MAT_ID) +} ggml_profile_record; + +// Backend profiler interface - each backend optionally implements this +// to provide fine-grained operation timing +struct ggml_backend_profiler { + void * context; // backend-specific profiler context + + // Enable or disable profiling on this backend + void (*enable)(void * context, bool enable); + + // Clear all recorded data + void (*reset)(void * context); + + // Set the current split ID (called by scheduler before graph_compute) + void (*set_split_id)(void * context, int split_id); + + // Get recorded profiling data + // Returns the number of records; sets *out to point to internal storage + // The returned pointer remains valid until the next reset or disable call + int (*get_records)(void * context, const ggml_profile_record ** out); + + // Free the profiler context + void (*free_context)(void * context); +}; + +typedef struct ggml_backend_profiler * ggml_backend_profiler_t; + +// Register a profiler on a backend (called by backend during init) +// The profiler is owned by the backend and will be freed when the backend is freed +GGML_API void ggml_backend_set_profiler(ggml_backend_t backend, ggml_backend_profiler_t profiler); + +// Get the profiler associated with a backend (returns NULL if none) +GGML_API ggml_backend_profiler_t ggml_backend_get_profiler(ggml_backend_t backend); + +// +// Scheduler profiling API +// + +// Enable or disable profiling on a scheduler +// When enabled, the scheduler will: +// - Time data copy operations between backends +// - Enable profiling on all backends that support it +// - Collect profiling records from all backends after each graph compute +GGML_API void ggml_backend_sched_set_profiling(ggml_backend_sched_t sched, bool enable); + +// Check if profiling is enabled on a scheduler +GGML_API bool ggml_backend_sched_get_profiling(ggml_backend_sched_t sched); + +// Get profiling data from the last graph compute +// Records are owned by the scheduler; valid until the next 
compute or reset +// Returns the number of records +GGML_API int ggml_backend_sched_get_profiling_records(ggml_backend_sched_t sched, const ggml_profile_record ** records); + +// Print a human-readable summary of the last profiling run to stdout +// Groups records by operation name and shows total/count/min/max/avg time +GGML_API void ggml_backend_sched_print_profiling(ggml_backend_sched_t sched); + +// Reset profiling data (clear all recorded data) +GGML_API void ggml_backend_sched_reset_profiling(ggml_backend_sched_t sched); + +// Get current time in nanoseconds (for manual profiling if needed) +GGML_API uint64_t ggml_profiler_time_ns(void); + +// Export profiling data as JSON to a file +// Returns 0 on success, -1 on error +GGML_API int ggml_backend_sched_export_profiling_json(ggml_backend_sched_t sched, const char * filepath); + +// Export profiling data as JSON to a FILE pointer +GGML_API int ggml_backend_sched_write_profiling_json(ggml_backend_sched_t sched, FILE * fp); + +// Export profiling data as plain text statistics to a file +// Returns 0 on success, -1 on error +GGML_API int ggml_backend_sched_export_profiling_text(ggml_backend_sched_t sched, const char * filepath); + +// Export profiling data as plain text statistics to a FILE pointer +GGML_API int ggml_backend_sched_write_profiling_text(ggml_backend_sched_t sched, FILE * fp); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 78853304d9..cf8f4d0788 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -195,12 +195,14 @@ add_library(ggml-base ../include/ggml-backend.h ../include/ggml-cpp.h ../include/ggml-opt.h + ../include/ggml-profiler.h ../include/gguf.h ggml.c ggml.cpp ggml-alloc.c ggml-backend.cpp ggml-opt.cpp + ggml-profiler.cpp ggml-threading.cpp ggml-threading.h ggml-quants.c diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h index 59190b7c46..80feaaf384 100644 --- a/ggml/src/ggml-backend-impl.h +++ 
b/ggml/src/ggml-backend-impl.h @@ -3,6 +3,7 @@ // ggml-backend internal header #include "ggml-backend.h" +#include "ggml-profiler.h" #ifdef __cplusplus extern "C" { @@ -124,6 +125,9 @@ extern "C" { struct ggml_backend_i iface; ggml_backend_dev_t device; void * context; + + // Optional profiler (set by backend during init, NULL if not profiling) + ggml_backend_profiler_t profiler; }; struct ggml_backend_event { diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 22c656996c..fa955b68bb 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -12,6 +12,7 @@ #include "ggml-backend-impl.h" #include "ggml-alloc.h" #include "ggml-impl.h" +#include "ggml-profiler.h" #include #include @@ -20,6 +21,7 @@ #include #include #include +#include #include #ifdef __APPLE__ @@ -231,6 +233,15 @@ void ggml_backend_free(ggml_backend_t backend) { return; } + // Clean up profiler if present (before backend frees its context) + if (backend->profiler != NULL) { + if (backend->profiler->free_context != NULL) { + backend->profiler->free_context(backend->profiler->context); + } + delete backend->profiler; + backend->profiler = NULL; + } + backend->iface.free(backend); } @@ -736,6 +747,20 @@ struct ggml_backend_sched { int debug_realloc; int debug_graph_size; int debug_prev_graph_size; + + // profiling + bool profiling_enabled; + std::string profiling_env_path; // GGML_PROFILE env var value (for auto-export on free) + std::vector<ggml_profile_record> copy_records; // copy events recorded by the scheduler + std::vector<ggml_profile_record> profiling_records; // merged records from all sources + + // Cached backend metadata for safe access during auto-export (backends may be freed first) + struct backend_meta { + std::string name; + std::string device; + int device_type; + }; + std::vector<backend_meta> profiling_backend_meta; }; #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor) @@ -1450,11 +1475,28 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
std::vector<int32_t> ids; std::vector<ggml_bitset_t> used_ids; + // Profiling: reset copy records for this compute pass + if (sched->profiling_enabled) { + sched->copy_records.clear(); + } + for (int split_id = 0; split_id < sched->n_splits; split_id++) { struct ggml_backend_sched_split * split = &splits[split_id]; int split_backend_id = split->backend_id; ggml_backend_t split_backend = sched->backends[split_backend_id]; + // Profiling: set split ID and enable backend profiling + if (sched->profiling_enabled) { + if (split_backend->profiler != NULL) { + if (split_backend->profiler->enable != NULL) { + split_backend->profiler->enable(split_backend->profiler->context, true); + } + if (split_backend->profiler->set_split_id != NULL) { + split_backend->profiler->set_split_id(split_backend->profiler->context, split_id); + } + } + } + // copy the input tensors to the split backend for (int input_id = 0; input_id < split->n_inputs; input_id++) { ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[input_id]); @@ -1468,7 +1510,26 @@ } else { ggml_backend_synchronize(split_backend); } - ggml_backend_tensor_copy(input, input_cpy); + if (sched->profiling_enabled) { + uint64_t copy_start = ggml_profiler_time_ns(); + ggml_backend_tensor_copy(input, input_cpy); + uint64_t copy_end = ggml_profiler_time_ns(); + + enum ggml_backend_dev_type src_type = ggml_backend_dev_type(input_backend->device); + enum ggml_backend_dev_type dst_type = ggml_backend_dev_type(split_backend->device); + const char * copy_dir = "copy_D2D"; + if (src_type == GGML_BACKEND_DEVICE_TYPE_CPU && dst_type != GGML_BACKEND_DEVICE_TYPE_CPU) { + copy_dir = "copy_H2D"; + } else if (src_type != GGML_BACKEND_DEVICE_TYPE_CPU && dst_type == GGML_BACKEND_DEVICE_TYPE_CPU) { + copy_dir = "copy_D2H"; + } + + sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id, split_id, + copy_start, copy_end,
ggml_nbytes(input), input->name, + {input->ne[0], input->ne[1], input->ne[2], input->ne[3]}, {0}, {0} }); + } else { + ggml_backend_tensor_copy(input, input_cpy); + } } else { // wait for the split backend to finish using the input before overwriting it if (sched->events[split_backend_id][sched->cur_copy] != NULL) { @@ -1525,12 +1586,14 @@ } // group consecutive experts and copy them together + size_t total_copied_bytes = 0; auto copy_experts = [&](int32_t first_id, int32_t last_id) { const size_t expert_offset = first_id * expert_size; const size_t expert_size_copy = (last_id - first_id + 1) * expert_size; const size_t padding = std::min<size_t>(expert_size, 512); const size_t padding_end = last_id < n_expert - 1 ? padding : 0; + total_copied_bytes += expert_size_copy + padding_end; ggml_backend_tensor_set_async(split_backend, input_cpy, (const uint8_t *)input->data + expert_offset, expert_offset, @@ -1539,6 +1602,11 @@ expert_size_copy + padding_end); }; + uint64_t moe_copy_start = 0; + if (sched->profiling_enabled) { + moe_copy_start = ggml_profiler_time_ns(); + } + int id = 0; while (!ggml_bitset_get(used_ids.data(), id)) { id++; @@ -1562,9 +1630,35 @@ last_id = id; } copy_experts(first_id, last_id); + + if (sched->profiling_enabled) { + uint64_t moe_copy_end = ggml_profiler_time_ns(); + + enum ggml_backend_dev_type src_type = ggml_backend_dev_type(input_backend->device); + enum ggml_backend_dev_type dst_type = ggml_backend_dev_type(split_backend->device); + const char * copy_dir = "copy_D2D"; + if (src_type == GGML_BACKEND_DEVICE_TYPE_CPU && dst_type != GGML_BACKEND_DEVICE_TYPE_CPU) { + copy_dir = "copy_H2D"; + } else if (src_type != GGML_BACKEND_DEVICE_TYPE_CPU && + dst_type == GGML_BACKEND_DEVICE_TYPE_CPU) { + copy_dir = 
"copy_D2H"; + } + + sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id, + split_id, moe_copy_start, moe_copy_end, + (uint64_t) total_copied_bytes, input->name, + {input->ne[0], input->ne[1], input->ne[2], input->ne[3]}, {0}, {0} }); + } } else { // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface + + // Capture timestamp before async attempt so we can record launch time + uint64_t copy_start = 0; + if (sched->profiling_enabled) { + copy_start = ggml_profiler_time_ns(); + } + if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) { ggml_backend_synchronize(input_backend); if (sched->events[split_backend_id][sched->cur_copy] != NULL) { @@ -1572,7 +1666,47 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } else { ggml_backend_synchronize(split_backend); } - ggml_backend_tensor_copy(input, input_cpy); + if (sched->profiling_enabled) { + // Re-take start after sync for accurate sync copy measurement + copy_start = ggml_profiler_time_ns(); + ggml_backend_tensor_copy(input, input_cpy); + uint64_t copy_end = ggml_profiler_time_ns(); + + enum ggml_backend_dev_type src_type = ggml_backend_dev_type(input_backend->device); + enum ggml_backend_dev_type dst_type = ggml_backend_dev_type(split_backend->device); + const char * copy_dir = "copy_D2D"; + if (src_type == GGML_BACKEND_DEVICE_TYPE_CPU && dst_type != GGML_BACKEND_DEVICE_TYPE_CPU) { + copy_dir = "copy_H2D"; + } else if (src_type != GGML_BACKEND_DEVICE_TYPE_CPU && + dst_type == GGML_BACKEND_DEVICE_TYPE_CPU) { + copy_dir = "copy_D2H"; + } + + sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id, + split_id, 
copy_start, copy_end, ggml_nbytes(input), input->name, + {input->ne[0], input->ne[1], input->ne[2], input->ne[3]}, {0}, {0} }); + } else { + ggml_backend_tensor_copy(input, input_cpy); + } + } else { + // async copy was launched — record the time spanning the async call + if (sched->profiling_enabled) { + uint64_t copy_end = ggml_profiler_time_ns(); + + enum ggml_backend_dev_type src_type = ggml_backend_dev_type(input_backend->device); + enum ggml_backend_dev_type dst_type = ggml_backend_dev_type(split_backend->device); + const char * copy_dir = "copy_D2D"; + if (src_type == GGML_BACKEND_DEVICE_TYPE_CPU && dst_type != GGML_BACKEND_DEVICE_TYPE_CPU) { + copy_dir = "copy_H2D"; + } else if (src_type != GGML_BACKEND_DEVICE_TYPE_CPU && + dst_type == GGML_BACKEND_DEVICE_TYPE_CPU) { + copy_dir = "copy_D2H"; + } + + sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id, + split_id, copy_start, copy_end, ggml_nbytes(input), input->name, + {input->ne[0], input->ne[1], input->ne[2], input->ne[3]}, {0}, {0} }); + } } } } @@ -1625,6 +1759,32 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } } + // Profiling: collect records from all backends and append to accumulated records + if (sched->profiling_enabled) { + // Collect backend operation records + for (int b = 0; b < sched->n_backends; b++) { + ggml_backend_t backend = sched->backends[b]; + if (backend->profiler != NULL && backend->profiler->get_records != NULL) { + const ggml_profile_record * backend_recs = NULL; + int count = backend->profiler->get_records(backend->profiler->context, &backend_recs); + for (int r = 0; r < count; r++) { + ggml_profile_record rec = backend_recs[r]; + rec.backend_id = b; // stamp correct scheduler backend index + sched->profiling_records.push_back(rec); + } + // Reset backend records (but keep profiling enabled for next compute) + if (backend->profiler->reset != NULL) { + backend->profiler->reset(backend->profiler->context); + } + 
} + } + + // Append copy records + for (const auto & rec : sched->copy_records) { + sched->profiling_records.push_back(rec); + } + } + return GGML_STATUS_SUCCESS; } @@ -1691,6 +1851,24 @@ ggml_backend_sched_t ggml_backend_sched_new( sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends); sched->op_offload = op_offload; + const char * profile_env = getenv("GGML_PROFILE"); + if (profile_env != NULL) { + sched->profiling_enabled = true; + sched->profiling_env_path = profile_env; + } + + // Cache backend metadata for safe access during auto-export + for (int b = 0; b < n_backends; b++) { + ggml_backend_sched::backend_meta meta; + meta.name = ggml_backend_name(backends[b]); + meta.device = "unknown"; + meta.device_type = 0; + if (backends[b]->device != NULL) { + meta.device = ggml_backend_dev_name(backends[b]->device); + meta.device_type = (int) ggml_backend_dev_type(backends[b]->device); + } + sched->profiling_backend_meta.push_back(std::move(meta)); + } ggml_backend_sched_reset(sched); @@ -1701,6 +1879,33 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { if (sched == NULL) { return; } + + // Auto-export profiling data if enabled via GGML_PROFILE env var + // GGML_PROFILE=1 or GGML_PROFILE="" → print to stdout + // GGML_PROFILE=file.json → export JSON + // GGML_PROFILE=file.txt → export text + if (!sched->profiling_records.empty() && getenv("GGML_PROFILE") != NULL) { + const std::string & path = sched->profiling_env_path; + if (path.empty() || path == "1" || path == "stdout") { + ggml_backend_sched_print_profiling(sched); + } else if (path.size() >= 4 && path.compare(path.size() - 4, 4, ".txt") == 0) { + int ret = ggml_backend_sched_export_profiling_text(sched, path.c_str()); + if (ret == 0) { + GGML_LOG_INFO("[profiler] Data exported to: %s\n", path.c_str()); + } else { + GGML_LOG_ERROR("[profiler] Failed to export data to: %s\n", path.c_str()); + } + } else { + // Default to JSON for any other path (including .json) + int ret = 
ggml_backend_sched_export_profiling_json(sched, path.c_str()); + if (ret == 0) { + GGML_LOG_INFO("[profiler] Data exported to: %s\n", path.c_str()); + } else { + GGML_LOG_ERROR("[profiler] Failed to export data to: %s\n", path.c_str()); + } + } + } + for (int b = 0; b < sched->n_backends; b++) { for (int c = 0; c < sched->n_copies; c++) { ggml_backend_event_free(sched->events[b][c]); @@ -2268,3 +2473,475 @@ ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned"); return ggml_backend_buffer_init(ggml_backend_cpu_buffer_from_ptr_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size); } + +// +// Scheduler profiling +// + +void ggml_backend_sched_set_profiling(ggml_backend_sched_t sched, bool enable) { + GGML_ASSERT(sched); + sched->profiling_enabled = enable; + + if (!enable) { + ggml_backend_sched_reset_profiling(sched); + } +} + +bool ggml_backend_sched_get_profiling(ggml_backend_sched_t sched) { + GGML_ASSERT(sched); + return sched->profiling_enabled; +} + +int ggml_backend_sched_get_profiling_records(ggml_backend_sched_t sched, const ggml_profile_record ** records) { + GGML_ASSERT(sched); + GGML_ASSERT(records != NULL); + + *records = sched->profiling_records.data(); + return (int) sched->profiling_records.size(); +} + +void ggml_backend_sched_reset_profiling(ggml_backend_sched_t sched) { + GGML_ASSERT(sched); + sched->profiling_records.clear(); + sched->copy_records.clear(); +} + +void ggml_backend_sched_print_profiling(ggml_backend_sched_t sched) { + GGML_ASSERT(sched); + + if (sched->profiling_records.empty()) { + GGML_LOG_INFO("[profiler] No profiling data available\n"); + return; + } + + GGML_LOG_INFO("\n=== Profiling Summary ===\n"); + + // Aggregate by (name, type, backend_id) + struct op_stats { + const char * name; + enum ggml_profile_event_type type; + int backend_id; + uint64_t total_ns; + uint64_t min_ns; + uint64_t max_ns; + int count; + 
uint64_t total_bytes; + int64_t representative_ne[4]; + }; + + std::vector stats; + for (const auto & rec : sched->profiling_records) { + bool found = false; + for (auto & s : stats) { + if (s.type == rec.type && s.backend_id == rec.backend_id && strcmp(s.name, rec.name) == 0) { + uint64_t dur = (rec.end_ns > rec.start_ns) ? (rec.end_ns - rec.start_ns) : 0; + s.total_ns += dur; + s.min_ns = std::min(s.min_ns, dur); + s.max_ns = std::max(s.max_ns, dur); + s.count++; + s.total_bytes += rec.bytes; + found = true; + break; + } + } + if (!found) { + uint64_t dur = (rec.end_ns > rec.start_ns) ? (rec.end_ns - rec.start_ns) : 0; + op_stats s; + s.name = rec.name; + s.type = rec.type; + s.backend_id = rec.backend_id; + s.total_ns = dur; + s.min_ns = dur; + s.max_ns = dur; + s.count = 1; + s.total_bytes = rec.bytes; + memcpy(s.representative_ne, rec.ne_src0, sizeof(s.representative_ne)); + stats.push_back(s); + } + } + + // Sort by total time descending + std::sort(stats.begin(), stats.end(), + [](const op_stats & a, const op_stats & b) { return a.total_ns > b.total_ns; }); + + uint64_t grand_total = 0; + for (const auto & s : stats) { + grand_total += s.total_ns; + } + + const char * type_str[] = { "OP ", "COPY" }; + for (const auto & s : stats) { + double pct = 100.0 * (double) s.total_ns / (double) grand_total; + double avg_us = (double) s.total_ns / (double) s.count / 1000.0; + double min_us = (double) s.min_ns / 1000.0; + double max_us = (double) s.max_ns / 1000.0; + + GGML_LOG_INFO( + " [%s] backend %d %-28s %7.1f%% count=%-6d total=%8.2f ms avg=%8.2f us min=%8.2f us max=%8.2f us", + type_str[s.type], s.backend_id, s.name, pct, s.count, (double) s.total_ns / 1e6, avg_us, min_us, max_us); + if (s.total_bytes > 0 && s.total_ns > 0) { + double bw_gbps = (double) s.total_bytes / (double) s.total_ns; + if (bw_gbps >= 1000.0) { + GGML_LOG_INFO(" %6.2f TB/s", bw_gbps / 1000.0); + } else { + GGML_LOG_INFO(" %6.2f GB/s", bw_gbps); + } + } + // Print representative tensor shape 
(first record's ne) + if (s.representative_ne[0] > 0 || s.representative_ne[1] > 0) { + GGML_LOG_INFO(" [%lld x %lld", (long long) s.representative_ne[0], (long long) s.representative_ne[1]); + if (s.representative_ne[2] > 1) { + GGML_LOG_INFO(" x %lld", (long long) s.representative_ne[2]); + } + if (s.representative_ne[3] > 1) { + GGML_LOG_INFO(" x %lld", (long long) s.representative_ne[3]); + } + GGML_LOG_INFO("]"); + } + GGML_LOG_INFO("\n"); + } + + GGML_LOG_INFO(" ---\n"); + GGML_LOG_INFO(" Total: %.2f ms (%d records, %d unique ops)\n\n", (double) grand_total / 1e6, + (int) sched->profiling_records.size(), (int) stats.size()); +} + +int ggml_backend_sched_write_profiling_json(ggml_backend_sched_t sched, FILE * fp) { + GGML_ASSERT(sched); + GGML_ASSERT(fp != NULL); + + uint64_t total_ns = 0; + for (const auto & rec : sched->profiling_records) { + total_ns += (rec.end_ns > rec.start_ns) ? (rec.end_ns - rec.start_ns) : 0; + } + + fprintf(fp, "{\n"); + fprintf(fp, " \"version\": 2,\n"); + fprintf(fp, " \"profiler\": \"ggml\",\n"); + fprintf(fp, " \"total_records\": %d,\n", (int) sched->profiling_records.size()); + fprintf(fp, " \"total_ns\": %llu,\n", (unsigned long long) total_ns); + + // Backend metadata (use cached data if available, fall back to live pointers) + fprintf(fp, " \"backends\": [\n"); + for (int b = 0; b < sched->n_backends; b++) { + const char * name = "unknown"; + const char * dev_name = "unknown"; + int dev_type = 0; + if (b < (int) sched->profiling_backend_meta.size()) { + name = sched->profiling_backend_meta[b].name.c_str(); + dev_name = sched->profiling_backend_meta[b].device.c_str(); + dev_type = sched->profiling_backend_meta[b].device_type; + } else if (sched->backends[b] != NULL) { + name = ggml_backend_name(sched->backends[b]); + if (sched->backends[b]->device != NULL) { + dev_name = ggml_backend_dev_name(sched->backends[b]->device); + dev_type = (int) ggml_backend_dev_type(sched->backends[b]->device); + } + } + fprintf(fp, " {\"id\": %d, 
\"name\": \"%s\", \"device\": \"%s\", \"device_type\": %d}%s\n", b, name, + dev_name, dev_type, (b < sched->n_backends - 1) ? "," : ""); + } + fprintf(fp, " ],\n"); + + // Records + fprintf(fp, " \"records\": [\n"); + + for (int i = 0; i < (int) sched->profiling_records.size(); i++) { + const auto & rec = sched->profiling_records[i]; + uint64_t duration_ns = (rec.end_ns > rec.start_ns) ? (rec.end_ns - rec.start_ns) : 0; + + fprintf(fp, + " {\"type\": %d, \"name\": \"%s\", \"backend_id\": %d, \"split_id\": %d, " + "\"start_ns\": %llu, \"duration_ns\": %llu, \"bytes\": %llu, \"extra\": ", + (int) rec.type, rec.name ? rec.name : "unknown", rec.backend_id, rec.split_id, + (unsigned long long) rec.start_ns, (unsigned long long) duration_ns, (unsigned long long) rec.bytes); + + if (rec.extra != NULL) { + fprintf(fp, "\"%s\"", rec.extra); + } else { + fprintf(fp, "null"); + } + + // Tensor dimensions (all source tensors) + fprintf(fp, ", \"ne_src0\": [%lld, %lld, %lld, %lld]", (long long) rec.ne_src0[0], (long long) rec.ne_src0[1], + (long long) rec.ne_src0[2], (long long) rec.ne_src0[3]); + fprintf(fp, ", \"ne_src1\": [%lld, %lld, %lld, %lld]", (long long) rec.ne_src1[0], (long long) rec.ne_src1[1], + (long long) rec.ne_src1[2], (long long) rec.ne_src1[3]); + fprintf(fp, ", \"ne_src2\": [%lld, %lld, %lld, %lld]", (long long) rec.ne_src2[0], (long long) rec.ne_src2[1], + (long long) rec.ne_src2[2], (long long) rec.ne_src2[3]); + + fprintf(fp, "}%s\n", (i < (int) sched->profiling_records.size() - 1) ? 
"," : ""); + } + + fprintf(fp, " ]\n"); + fprintf(fp, "}\n"); + + return 0; +} + +int ggml_backend_sched_export_profiling_json(ggml_backend_sched_t sched, const char * filepath) { + GGML_ASSERT(sched); + GGML_ASSERT(filepath != NULL); + + FILE * fp = fopen(filepath, "w"); + if (fp == NULL) { + GGML_LOG_ERROR("%s: failed to open %s for writing\n", __func__, filepath); + return -1; + } + + int ret = ggml_backend_sched_write_profiling_json(sched, fp); + fclose(fp); + + return ret; +} + +// Helper: format ne dimensions as string, e.g. "[4096, 4096, 1]" +static void fmt_ne(char * buf, size_t bufsize, const int64_t ne[4]) { + if (ne[0] == 0 && ne[1] == 0 && ne[2] == 0 && ne[3] == 0) { + buf[0] = '\0'; + return; + } + int ndims = 4; + while (ndims > 1 && ne[ndims - 1] <= 1) { + ndims--; + } + int pos = snprintf(buf, bufsize, "["); + for (int i = 0; i < ndims && pos < (int) bufsize - 1; i++) { + pos += snprintf(buf + pos, bufsize - pos, "%s%lld", i > 0 ? ", " : "", (long long) ne[i]); + } + snprintf(buf + pos, bufsize - pos, "]"); +} + +// Helper: format bandwidth as string +static void fmt_bandwidth(char * buf, size_t bufsize, uint64_t bytes, uint64_t ns) { + if (ns == 0 || bytes == 0) { + buf[0] = '\0'; + return; + } + double bw_gbps = (double) bytes / (double) ns; + if (bw_gbps >= 1000.0) { + snprintf(buf, bufsize, "%.2f TB/s", bw_gbps / 1000.0); + } else { + snprintf(buf, bufsize, "%.2f GB/s", bw_gbps); + } +} + +int ggml_backend_sched_write_profiling_text(ggml_backend_sched_t sched, FILE * fp) { + GGML_ASSERT(sched); + GGML_ASSERT(fp != NULL); + + if (sched->profiling_records.empty()) { + fprintf(fp, "No profiling data available.\n"); + return 0; + } + + // Aggregate by (name, type, backend_id) + struct op_stats { + const char * name; + enum ggml_profile_event_type type; + int backend_id; + uint64_t total_ns; + uint64_t min_ns; + uint64_t max_ns; + int count; + uint64_t total_bytes; + int64_t representative_ne_src0[4]; + int64_t representative_ne_src1[4]; + int64_t 
representative_ne_src2[4]; + }; + + std::vector stats; + for (const auto & rec : sched->profiling_records) { + uint64_t dur = (rec.end_ns > rec.start_ns) ? (rec.end_ns - rec.start_ns) : 0; + bool found = false; + for (auto & s : stats) { + if (s.type == rec.type && s.backend_id == rec.backend_id && strcmp(s.name, rec.name) == 0) { + s.total_ns += dur; + s.min_ns = std::min(s.min_ns, dur); + s.max_ns = std::max(s.max_ns, dur); + s.count++; + s.total_bytes += rec.bytes; + found = true; + break; + } + } + if (!found) { + op_stats s = {}; + s.name = rec.name; + s.type = rec.type; + s.backend_id = rec.backend_id; + s.total_ns = dur; + s.min_ns = dur; + s.max_ns = dur; + s.count = 1; + s.total_bytes = rec.bytes; + memcpy(s.representative_ne_src0, rec.ne_src0, sizeof(s.representative_ne_src0)); + memcpy(s.representative_ne_src1, rec.ne_src1, sizeof(s.representative_ne_src1)); + memcpy(s.representative_ne_src2, rec.ne_src2, sizeof(s.representative_ne_src2)); + stats.push_back(s); + } + } + + std::sort(stats.begin(), stats.end(), + [](const op_stats & a, const op_stats & b) { return a.total_ns > b.total_ns; }); + + uint64_t grand_total = 0; + for (const auto & s : stats) { + grand_total += s.total_ns; + } + + // --- Section 1: Overall summary --- + fprintf(fp, "=== Profiling Summary ===\n"); + fprintf(fp, "Total time: %.2f ms\n", (double) grand_total / 1e6); + fprintf(fp, "Total records: %d\n", (int) sched->profiling_records.size()); + fprintf(fp, "Unique ops: %d\n\n", (int) stats.size()); + + // --- Section 2: Per-backend breakdown --- + fprintf(fp, "=== Per-Backend Summary ===\n"); + { + struct backend_stats { + int backend_id; + int op_count; + int copy_count; + uint64_t op_ns; + uint64_t copy_ns; + uint64_t op_bytes; + uint64_t copy_bytes; + }; + std::vector bstats; + for (const auto & s : stats) { + bool found = false; + for (auto & bs : bstats) { + if (bs.backend_id == s.backend_id) { + if (s.type == GGML_PROFILE_EVENT_OP) { + bs.op_count += s.count; + bs.op_ns += 
s.total_ns; + bs.op_bytes += s.total_bytes; + } else { + bs.copy_count += s.count; + bs.copy_ns += s.total_ns; + bs.copy_bytes += s.total_bytes; + } + found = true; + break; + } + } + if (!found) { + backend_stats bs = {}; + bs.backend_id = s.backend_id; + if (s.type == GGML_PROFILE_EVENT_OP) { + bs.op_count = s.count; + bs.op_ns = s.total_ns; + bs.op_bytes = s.total_bytes; + } else { + bs.copy_count = s.count; + bs.copy_ns = s.total_ns; + bs.copy_bytes = s.total_bytes; + } + bstats.push_back(bs); + } + } + std::sort(bstats.begin(), bstats.end(), + [](const backend_stats & a, const backend_stats & b) { + return (a.op_ns + a.copy_ns) > (b.op_ns + b.copy_ns); + }); + + for (const auto & bs : bstats) { + uint64_t total = bs.op_ns + bs.copy_ns; + double pct = grand_total > 0 ? 100.0 * (double) total / (double) grand_total : 0; + + const char * bname = "unknown"; + if (bs.backend_id >= 0 && bs.backend_id < (int) sched->profiling_backend_meta.size()) { + bname = sched->profiling_backend_meta[bs.backend_id].name.c_str(); + } else if (bs.backend_id >= 0 && bs.backend_id < sched->n_backends && sched->backends[bs.backend_id] != NULL) { + bname = ggml_backend_name(sched->backends[bs.backend_id]); + } + + fprintf(fp, " Backend %d (%s): %.2f ms (%.1f%%)\n", bs.backend_id, bname, (double) total / 1e6, pct); + if (bs.op_count > 0) { + char bw_buf[32]; + fmt_bandwidth(bw_buf, sizeof(bw_buf), bs.op_bytes, bs.op_ns); + fprintf(fp, " OPs: %d calls, %.2f ms", bs.op_count, (double) bs.op_ns / 1e6); + if (bw_buf[0]) { + fprintf(fp, ", %s", bw_buf); + } + fprintf(fp, "\n"); + } + if (bs.copy_count > 0) { + char bw_buf[32]; + fmt_bandwidth(bw_buf, sizeof(bw_buf), bs.copy_bytes, bs.copy_ns); + fprintf(fp, " COPYs: %d calls, %.2f ms", bs.copy_count, (double) bs.copy_ns / 1e6); + if (bw_buf[0]) { + fprintf(fp, ", %s", bw_buf); + } + fprintf(fp, "\n"); + } + } + } + fprintf(fp, "\n"); + + // --- Section 3: Detailed operation table --- + fprintf(fp, "=== Operations (sorted by total time) 
===\n"); + fprintf(fp, "%-5s %4s %-28s %7s %6s %10s %10s %10s %10s %12s %s\n", + "TYPE", "BKND", "Operation", "%Time", "Count", "Total(ms)", "Avg(us)", "Min(us)", "Max(us)", "Bandwidth", "Tensors"); + fprintf(fp, "%-5s %4s %-28s %7s %6s %10s %10s %10s %10s %12s %s\n", + "-----", "----", "----------------------------", "-------", "------", + "----------", "----------", "----------", "----------", "------------", "-------"); + + const char * type_str[] = { "OP", "COPY" }; + for (const auto & s : stats) { + double pct = grand_total > 0 ? 100.0 * (double) s.total_ns / (double) grand_total : 0; + double avg_us = (double) s.total_ns / (double) s.count / 1000.0; + double min_us = (double) s.min_ns / 1000.0; + double max_us = (double) s.max_ns / 1000.0; + + char bw_buf[32] = ""; + fmt_bandwidth(bw_buf, sizeof(bw_buf), s.total_bytes, s.total_ns); + + char ne0_buf[64]; + char ne1_buf[64]; + char ne2_buf[64]; + fmt_ne(ne0_buf, sizeof(ne0_buf), s.representative_ne_src0); + fmt_ne(ne1_buf, sizeof(ne1_buf), s.representative_ne_src1); + fmt_ne(ne2_buf, sizeof(ne2_buf), s.representative_ne_src2); + + // Build tensor shapes string + char tensors_buf[256] = ""; + int tpos = 0; + if (ne0_buf[0]) { + tpos += snprintf(tensors_buf + tpos, sizeof(tensors_buf) - tpos, "%s", ne0_buf); + } + if (ne1_buf[0]) { + tpos += snprintf(tensors_buf + tpos, sizeof(tensors_buf) - tpos, " x %s", ne1_buf); + } + if (ne2_buf[0]) { + tpos += snprintf(tensors_buf + tpos, sizeof(tensors_buf) - tpos, " x %s", ne2_buf); + } + + fprintf(fp, "%-5s %4d %-28s %6.1f%% %6d %10.2f %10.2f %10.2f %10.2f %12s %s\n", + type_str[s.type], s.backend_id, s.name, pct, s.count, + (double) s.total_ns / 1e6, avg_us, min_us, max_us, + bw_buf, tensors_buf); + } + + fprintf(fp, "\nTotal: %.2f ms (%d records, %d unique ops)\n", (double) grand_total / 1e6, + (int) sched->profiling_records.size(), (int) stats.size()); + + return 0; +} + +int ggml_backend_sched_export_profiling_text(ggml_backend_sched_t sched, const char * filepath) { 
+ GGML_ASSERT(sched); + GGML_ASSERT(filepath != NULL); + + FILE * fp = fopen(filepath, "w"); + if (fp == NULL) { + GGML_LOG_ERROR("%s: failed to open %s for writing\n", __func__, filepath); + return -1; + } + + int ret = ggml_backend_sched_write_profiling_text(sched, fp); + fclose(fp); + + return ret; +} diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp index e7a1763b54..b68f45452e 100644 --- a/ggml/src/ggml-blas/ggml-blas.cpp +++ b/ggml/src/ggml-blas/ggml-blas.cpp @@ -1,6 +1,7 @@ #include "ggml-impl.h" #include "ggml-blas.h" #include "ggml-backend-impl.h" +#include "ggml-profiler.h" #include #include @@ -25,6 +26,11 @@ struct ggml_backend_blas_context { #ifndef GGML_USE_OPENMP std::vector> tasks; #endif + + // Profiling state + bool profiling_enabled = false; + int profiling_split_id = -1; + std::vector profiling_records; }; static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { @@ -232,6 +238,18 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, continue; } + // Skip view/identity ops + if (node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_VIEW || + node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE) { + continue; + } + + // Profiling: time this operation + uint64_t t_start = 0; + if (ctx->profiling_enabled) { + t_start = ggml_profiler_time_ns(); + } + switch (node->op) { case GGML_OP_MUL_MAT: ggml_backend_blas_mul_mat(ctx, node); @@ -241,16 +259,26 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, ggml_backend_blas_out_prod(ctx, node); break; - case GGML_OP_NONE: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - break; - default: GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node)); } + + if (ctx->profiling_enabled) { + uint64_t t_end = ggml_profiler_time_ns(); + ggml_profile_record rec; + rec.type = GGML_PROFILE_EVENT_OP; + 
rec.name = ggml_op_name(node->op); + rec.backend_id = 0; + rec.split_id = ctx->profiling_split_id; + rec.start_ns = t_start; + rec.end_ns = t_end; + rec.bytes = ggml_nbytes(node); + rec.extra = NULL; + if (node->src[0]) { memcpy(rec.ne_src0, node->src[0]->ne, sizeof(rec.ne_src0)); } else { memset(rec.ne_src0, 0, sizeof(rec.ne_src0)); } + if (node->src[1]) { memcpy(rec.ne_src1, node->src[1]->ne, sizeof(rec.ne_src1)); } else { memset(rec.ne_src1, 0, sizeof(rec.ne_src1)); } + if (node->op == GGML_OP_MUL_MAT_ID && node->src[2]) { memcpy(rec.ne_src2, node->src[2]->ne, sizeof(rec.ne_src2)); } else { memset(rec.ne_src2, 0, sizeof(rec.ne_src2)); } + ctx->profiling_records.push_back(rec); + } } return GGML_STATUS_SUCCESS; @@ -284,10 +312,11 @@ ggml_backend_t ggml_backend_blas_init(void) { ggml_backend_blas_context * ctx = new ggml_backend_blas_context; ggml_backend_t backend = new ggml_backend { - /* .guid = */ ggml_backend_blas_guid(), - /* .iface = */ blas_backend_i, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0), - /* .context = */ ctx, + /* .guid = */ ggml_backend_blas_guid(), + /* .iface = */ blas_backend_i, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0), + /* .context = */ ctx, + /* .profiler = */ nullptr, }; #if defined(GGML_BLAS_USE_OPENBLAS) && defined(GGML_USE_OPENMP) @@ -300,6 +329,44 @@ ggml_backend_t ggml_backend_blas_init(void) { GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__); #endif + // Register profiler + ggml_backend_blas_context * blas_ctx = ctx; // ctx is already defined above + + static auto blas_prof_enable = [](void * ctx, bool enable) { + auto * bctx = (ggml_backend_blas_context *) ctx; + bctx->profiling_enabled = enable; + if (!enable) { + bctx->profiling_records.clear(); + } + }; + static auto blas_prof_reset = [](void * ctx) { + auto * bctx = (ggml_backend_blas_context *) ctx; + bctx->profiling_records.clear(); + bctx->profiling_split_id 
= -1; + }; + static auto blas_prof_set_split_id = [](void * ctx, int split_id) { + auto * bctx = (ggml_backend_blas_context *) ctx; + bctx->profiling_split_id = split_id; + }; + static auto blas_prof_get_records = [](void * ctx, const ggml_profile_record ** out) -> int { + auto * bctx = (ggml_backend_blas_context *) ctx; + *out = bctx->profiling_records.data(); + return (int) bctx->profiling_records.size(); + }; + static auto blas_prof_free = [](void * ctx) { + (void) ctx; + }; + + auto * profiler = new ggml_backend_profiler{ + /* .context = */ blas_ctx, + /* .enable = */ blas_prof_enable, + /* .reset = */ blas_prof_reset, + /* .set_split_id = */ blas_prof_set_split_id, + /* .get_records = */ blas_prof_get_records, + /* .free_context = */ blas_prof_free, + }; + ggml_backend_set_profiler(backend, profiler); + return backend; } diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index 40fe3d82ec..002e3bb196 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -2985,7 +2985,8 @@ ggml_backend_t ggml_backend_cann_init(int32_t device) { new ggml_backend{ /* .guid = */ ggml_backend_cann_guid(), /* .interface = */ ggml_backend_cann_interface, /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device), - /* .context = */ ctx }; + /* .context = */ ctx, + /* .profiler = */ nullptr }; return cann_backend; } diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 7486acc2b5..d0fc4228ff 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -6,6 +6,7 @@ #include "traits.h" #include "ggml-cpu-impl.h" #include "ggml-impl.h" +#include "ggml-profiler.h" #include "quants.h" #include "ggml-threading.h" #include "unary-ops.h" @@ -1159,8 +1160,8 @@ static void ggml_compute_forward_mul_mat_one_chunk( const bool src1_cont = ggml_is_contiguous(src1); - ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot; - enum ggml_type const vec_dot_type = 
type_traits_cpu[type].vec_dot_type; + const ggml_vec_dot_t vec_dot = type_traits_cpu[type].vec_dot; + const enum ggml_type vec_dot_type = type_traits_cpu[type].vec_dot_type; // broadcast factors const int64_t r2 = ne12 / ne02; @@ -1244,9 +1245,9 @@ void ggml_compute_forward_mul_mat( const int ith = params->ith; const int nth = params->nth; - enum ggml_type const vec_dot_type = type_traits_cpu[src0->type].vec_dot_type; - ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float; - int64_t const vec_dot_num_rows = type_traits_cpu[src0->type].nrows; + const enum ggml_type vec_dot_type = type_traits_cpu[src0->type].vec_dot_type; + const ggml_from_float_t from_float = type_traits_cpu[vec_dot_type].from_float; + const int64_t vec_dot_num_rows = type_traits_cpu[src0->type].nrows; GGML_ASSERT(ne0 == ne01); GGML_ASSERT(ne1 == ne11); @@ -1455,8 +1456,8 @@ static void ggml_compute_forward_mul_mat_id_one_chunk( const enum ggml_type type = src0->type; - ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot; - enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; + const ggml_vec_dot_t vec_dot = type_traits_cpu[type].vec_dot; + const enum ggml_type vec_dot_type = type_traits_cpu[type].vec_dot_type; const int64_t blck_0 = 16; const int64_t blck_1 = 16; @@ -1523,8 +1524,8 @@ static void ggml_compute_forward_mul_mat_id( const bool src1_cont = ggml_is_contiguous(src1); - enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; - ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float; + const enum ggml_type vec_dot_type = type_traits_cpu[type].vec_dot_type; + const ggml_from_float_t from_float = type_traits_cpu[vec_dot_type].from_float; // we don't support permuted src0 or src1 GGML_ASSERT(nb00 == ggml_type_size(type)); @@ -2977,28 +2978,73 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { GGML_PRINT_DEBUG("thread #%d compute-start cplan %p last-graph %d\n", state->ith, (const void 
*)cplan, state->last_graph); #endif - for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) { - struct ggml_tensor * node = cgraph->nodes[node_n]; + // Profiling state + if (cplan->profiling_context != NULL && cplan->profiling_record_fn != NULL) { + for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) { + struct ggml_tensor * node = cgraph->nodes[node_n]; - if (ggml_op_is_empty(node->op)) { - // skip NOPs - continue; + if (ggml_op_is_empty(node->op)) { + continue; + } + + if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) { + continue; + } + + // Only thread 0 records timing (after barrier = total node time) + uint64_t t_start = 0; + if (state->ith == 0) { + t_start = ggml_profiler_time_ns(); + } + + ggml_compute_forward(¶ms, node); + + if (node_n + 1 < cgraph->n_nodes) { + ggml_barrier(state->threadpool); + } + + if (state->ith == 0) { + uint64_t t_end = ggml_profiler_time_ns(); + { + static const int64_t zero_ne[4] = {0, 0, 0, 0}; + const int64_t * src0_ne = node->src[0] ? node->src[0]->ne : zero_ne; + const int64_t * src1_ne = node->src[1] ? node->src[1]->ne : zero_ne; + const int64_t * src2_ne = (node->op == GGML_OP_MUL_MAT_ID && node->src[2]) ? 
node->src[2]->ne : zero_ne; + cplan->profiling_record_fn(cplan->profiling_context, 0 /* GGML_PROFILE_EVENT_OP */, + ggml_op_name(node->op), -1, t_start, t_end, ggml_nbytes(node), NULL, + src0_ne, src1_ne, src2_ne); + } + } + + if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed); + tp->ec = GGML_STATUS_ABORTED; + } } + } else { + for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) { + struct ggml_tensor * node = cgraph->nodes[node_n]; - if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) { - continue; - } + if (ggml_op_is_empty(node->op)) { + // skip NOPs + continue; + } - ggml_compute_forward(¶ms, node); + if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) { + continue; + } - if (state->ith == 0 && cplan->abort_callback && - cplan->abort_callback(cplan->abort_callback_data)) { - atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed); - tp->ec = GGML_STATUS_ABORTED; - } + ggml_compute_forward(¶ms, node); - if (node_n + 1 < cgraph->n_nodes) { - ggml_barrier(state->threadpool); + if (state->ith == 0 && cplan->abort_callback && + cplan->abort_callback(cplan->abort_callback_data)) { + atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed); + tp->ec = GGML_STATUS_ABORTED; + } + + if (node_n + 1 < cgraph->n_nodes) { + ggml_barrier(state->threadpool); + } } } diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index ddf1737a31..6fb9e1987c 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -1,6 +1,7 @@ #include "ggml-backend.h" #include "ggml-backend-impl.h" #include "ggml-cpu.h" +#include "ggml-profiler.h" #include "repack.h" #include "traits.h" #include "ggml-impl.h" @@ -107,6 +108,11 @@ struct ggml_backend_cpu_context { void * abort_callback_data; bool use_ref; // use reference implementation + + // 
Profiling state + bool profiling_enabled; + int profiling_split_id; + std::vector profiling_records; }; static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) { @@ -167,6 +173,46 @@ static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backe GGML_UNUSED(backend); } +// Callback function for recording CPU profiling events from C code (ggml-cpu.c) +static void ggml_cpu_profiler_record_callback(void * context, + int type, + const char * name, + int split_id, + uint64_t start_ns, + uint64_t end_ns, + uint64_t bytes, + const char * extra, + const int64_t ne_src0[4], + const int64_t ne_src1[4], + const int64_t ne_src2[4]) { + auto * cpu_ctx = (ggml_backend_cpu_context *) context; + ggml_profile_record rec; + rec.type = (enum ggml_profile_event_type) type; + rec.name = name; + rec.backend_id = 0; // will be overwritten by scheduler + rec.split_id = split_id != -1 ? split_id : cpu_ctx->profiling_split_id; + rec.start_ns = start_ns; + rec.end_ns = end_ns; + rec.bytes = bytes; + rec.extra = extra; + if (ne_src0) { + memcpy(rec.ne_src0, ne_src0, sizeof(rec.ne_src0)); + } else { + memset(rec.ne_src0, 0, sizeof(rec.ne_src0)); + } + if (ne_src1) { + memcpy(rec.ne_src1, ne_src1, sizeof(rec.ne_src1)); + } else { + memset(rec.ne_src1, 0, sizeof(rec.ne_src1)); + } + if (ne_src2) { + memcpy(rec.ne_src2, ne_src2, sizeof(rec.ne_src2)); + } else { + memset(rec.ne_src2, 0, sizeof(rec.ne_src2)); + } + cpu_ctx->profiling_records.push_back(rec); +} + static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; @@ -187,6 +233,9 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s cplan.abort_callback_data = cpu_ctx->abort_callback_data; cplan.use_ref = cpu_ctx->use_ref; + cplan.profiling_context = cpu_ctx->profiling_enabled ? 
cpu_ctx : NULL; + cplan.profiling_record_fn = cpu_ctx->profiling_enabled ? ggml_cpu_profiler_record_callback : NULL; + return ggml_graph_compute(cgraph, &cplan); } @@ -228,12 +277,15 @@ ggml_backend_t ggml_backend_cpu_init(void) { ctx->abort_callback = NULL; ctx->abort_callback_data = NULL; ctx->use_ref = false; + ctx->profiling_enabled = false; + ctx->profiling_split_id = -1; ggml_backend_t cpu_backend = new ggml_backend { - /* .guid = */ ggml_backend_cpu_guid(), - /* .iface = */ ggml_backend_cpu_i, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), - /* .context = */ ctx, + /* .guid = */ ggml_backend_cpu_guid(), + /* .iface = */ ggml_backend_cpu_i, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), + /* .context = */ ctx, + /* .profiler = */ nullptr, }; if (cpu_backend == NULL) { @@ -241,6 +293,43 @@ ggml_backend_t ggml_backend_cpu_init(void) { return NULL; } + // Register profiler + static auto cpu_prof_enable = [](void * ctx, bool enable) { + auto * cpu_ctx = (ggml_backend_cpu_context *) ctx; + cpu_ctx->profiling_enabled = enable; + if (!enable) { + cpu_ctx->profiling_records.clear(); + } + }; + static auto cpu_prof_reset = [](void * ctx) { + auto * cpu_ctx = (ggml_backend_cpu_context *) ctx; + cpu_ctx->profiling_records.clear(); + cpu_ctx->profiling_split_id = -1; + }; + static auto cpu_prof_set_split_id = [](void * ctx, int split_id) { + auto * cpu_ctx = (ggml_backend_cpu_context *) ctx; + cpu_ctx->profiling_split_id = split_id; + }; + static auto cpu_prof_get_records = [](void * ctx, const ggml_profile_record ** out) -> int { + auto * cpu_ctx = (ggml_backend_cpu_context *) ctx; + *out = cpu_ctx->profiling_records.data(); + return (int) cpu_ctx->profiling_records.size(); + }; + static auto cpu_prof_free = [](void * ctx) { + // Nothing to free - records are in the CPU context's vector + (void) ctx; + }; + + auto * profiler = new ggml_backend_profiler{ + /* .context = */ ctx, + /* .enable = */ cpu_prof_enable, + /* .reset 
= */ cpu_prof_reset, + /* .set_split_id = */ cpu_prof_set_split_id, + /* .get_records = */ cpu_prof_get_records, + /* .free_context = */ cpu_prof_free, + }; + ggml_backend_set_profiler(cpu_backend, profiler); + return cpu_backend; } diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 9affe02340..5f9db83cdc 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -1352,6 +1352,9 @@ struct ggml_cuda_stream_context { } }; +// Forward declaration for profiler state (defined in ggml-cuda.cu) +struct ggml_cuda_profiler_state; + struct ggml_backend_cuda_context { int device; std::string name; @@ -1447,6 +1450,9 @@ struct ggml_backend_cuda_context { ggml_cuda_pool & pool() { return pool(device); } + + // Profiling + ggml_cuda_profiler_state * profiler_state = nullptr; }; struct ggml_cuda_mm_fusion_args_host { diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 75b62129ad..85d1941dca 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -1,6 +1,7 @@ #include "ggml-cuda.h" #include "ggml-impl.h" #include "ggml-backend-impl.h" +#include "ggml-profiler.h" #include "ggml-cuda/common.cuh" #include "ggml-cuda/acc.cuh" @@ -86,6 +87,94 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); +// CUDA profiler state +struct ggml_cuda_profiler_state { + bool enabled = false; + int split_id = -1; + cudaStream_t stream = nullptr; + + static constexpr int MAX_PENDING_EVENTS = 4096; + std::vector start_events; + std::vector end_events; + std::vector cpu_timestamps; // CPU-side timestamps for global ordering + int event_count = 0; + + std::vector records; + std::vector record_event_indices; + + void init(cudaStream_t stream) { + this->stream = stream; + start_events.reserve(MAX_PENDING_EVENTS); + end_events.reserve(MAX_PENDING_EVENTS); + cpu_timestamps.reserve(MAX_PENDING_EVENTS); + } + + void reset() { + for (auto & ev : start_events) { + (void) 
cudaEventDestroy(ev); + } + for (auto & ev : end_events) { + (void) cudaEventDestroy(ev); + } + start_events.clear(); + end_events.clear(); + cpu_timestamps.clear(); + event_count = 0; + records.clear(); + record_event_indices.clear(); + } + + ~ggml_cuda_profiler_state() { + reset(); + } + + void record_start() { + cudaEvent_t ev; + (void) cudaEventCreate(&ev); + (void) cudaEventRecord(ev, stream); + start_events.push_back(ev); + cpu_timestamps.push_back(ggml_profiler_time_ns()); + event_count++; + } + + void record_end(const char * name, int backend_id, int split_id, uint64_t bytes, const char * extra, + const int64_t ne_src0[4], const int64_t ne_src1[4], const int64_t ne_src2[4]) { + cudaEvent_t ev; + (void) cudaEventCreate(&ev); + (void) cudaEventRecord(ev, stream); + end_events.push_back(ev); + record_event_indices.push_back(records.size()); + + ggml_profile_record rec; + rec.type = GGML_PROFILE_EVENT_OP; + rec.name = name; + rec.backend_id = backend_id; + rec.split_id = split_id; + rec.start_ns = 0; + rec.end_ns = 0; + rec.bytes = bytes; + rec.extra = extra; + if (ne_src0) { memcpy(rec.ne_src0, ne_src0, sizeof(rec.ne_src0)); } else { memset(rec.ne_src0, 0, sizeof(rec.ne_src0)); } + if (ne_src1) { memcpy(rec.ne_src1, ne_src1, sizeof(rec.ne_src1)); } else { memset(rec.ne_src1, 0, sizeof(rec.ne_src1)); } + if (ne_src2) { memcpy(rec.ne_src2, ne_src2, sizeof(rec.ne_src2)); } else { memset(rec.ne_src2, 0, sizeof(rec.ne_src2)); } + records.push_back(rec); + } + + void finalize() { + (void) cudaStreamSynchronize(stream); + + for (int i = 0; i < (int)record_event_indices.size(); i++) { + float ms = 0.0f; + (void) cudaEventElapsedTime(&ms, start_events[i], end_events[i]); + uint64_t duration_ns = (uint64_t)(ms * 1e6f); + int rec_idx = record_event_indices[i]; + // Use CPU-side timestamp for global ordering, GPU-measured duration for accuracy + records[rec_idx].start_ns = cpu_timestamps[i]; + records[rec_idx].end_ns = cpu_timestamps[i] + duration_ns; + } + } +}; + 
[[noreturn]] void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) { int id = -1; // in case cudaGetDevice fails @@ -4035,8 +4124,25 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud #else GGML_UNUSED(integrated); #endif // NDEBUG + if (cuda_ctx->profiler_state != nullptr && cuda_ctx->profiler_state->enabled) { + cuda_ctx->profiler_state->record_start(); + } bool ok = ggml_cuda_compute_forward(*cuda_ctx, node); + + if (cuda_ctx->profiler_state != nullptr && cuda_ctx->profiler_state->enabled) { + cuda_ctx->profiler_state->record_end( + ggml_op_name(node->op), + -1, + cuda_ctx->profiler_state->split_id, + ggml_nbytes(node), + nullptr, + node->src[0] ? node->src[0]->ne : nullptr, + node->src[1] ? node->src[1]->ne : nullptr, + (node->op == GGML_OP_MUL_MAT_ID && node->src[2]) ? node->src[2]->ne : nullptr + ); + } + if (!ok) { GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); } @@ -4107,6 +4213,19 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cuda_set_device(cuda_ctx->device); + // Disable CUDA graphs when profiling (we need per-node timing) + bool was_graph_enabled = false; + if (cuda_ctx->profiler_state != nullptr && cuda_ctx->profiler_state->enabled) { +#ifdef USE_CUDA_GRAPH + const void * graph_key = ggml_cuda_graph_get_key(cgraph); + ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key); + was_graph_enabled = graph->is_enabled(); + if (was_graph_enabled) { + graph->disable_due_to_gpu_arch = true; + } +#endif + } + bool use_cuda_graph = false; bool cuda_graph_update_required = false; const void * graph_key = nullptr; @@ -4158,6 +4277,15 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cuda_graph_evaluate_and_capture(cuda_ctx, cgraph, use_cuda_graph, cuda_graph_update_required, graph_key); + // Restore CUDA graph enabled state after profiling + if 
(was_graph_enabled) { +#ifdef USE_CUDA_GRAPH + const void * graph_key_prof = ggml_cuda_graph_get_key(cgraph); + ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key_prof); + graph->disable_due_to_gpu_arch = false; +#endif + } + return GGML_STATUS_SUCCESS; } @@ -5304,12 +5432,68 @@ ggml_backend_t ggml_backend_cuda_init(int device) { } ggml_backend_t cuda_backend = new ggml_backend { - /* .guid = */ ggml_backend_cuda_guid(), - /* .iface = */ ggml_backend_cuda_interface, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device), - /* .context = */ ctx, + /* .guid = */ ggml_backend_cuda_guid(), + /* .iface = */ ggml_backend_cuda_interface, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device), + /* .context = */ ctx, + /* .profiler = */ nullptr, }; + // Register profiler + auto * prof_state = new ggml_cuda_profiler_state(); + prof_state->init(ctx->stream()); + ctx->profiler_state = prof_state; + + static auto cuda_prof_enable = [](void * ctx, bool enable) { + auto * cuda_ctx = (ggml_backend_cuda_context *)ctx; + if (cuda_ctx->profiler_state != nullptr) { + cuda_ctx->profiler_state->enabled = enable; + if (!enable) { + cuda_ctx->profiler_state->reset(); + } + } + }; + static auto cuda_prof_reset = [](void * ctx) { + auto * cuda_ctx = (ggml_backend_cuda_context *)ctx; + if (cuda_ctx->profiler_state != nullptr) { + cuda_ctx->profiler_state->reset(); + cuda_ctx->profiler_state->split_id = -1; + } + }; + static auto cuda_prof_set_split_id = [](void * ctx, int split_id) { + auto * cuda_ctx = (ggml_backend_cuda_context *)ctx; + if (cuda_ctx->profiler_state != nullptr) { + cuda_ctx->profiler_state->split_id = split_id; + } + }; + static auto cuda_prof_get_records = [](void * ctx, const ggml_profile_record ** out) -> int { + auto * cuda_ctx = (ggml_backend_cuda_context *)ctx; + if (cuda_ctx->profiler_state != nullptr) { + cuda_ctx->profiler_state->finalize(); + *out = cuda_ctx->profiler_state->records.data(); + return 
(int)cuda_ctx->profiler_state->records.size(); + } + *out = nullptr; + return 0; + }; + static auto cuda_prof_free = [](void * ctx) { + auto * cuda_ctx = (ggml_backend_cuda_context *)ctx; + if (cuda_ctx->profiler_state != nullptr) { + delete cuda_ctx->profiler_state; + cuda_ctx->profiler_state = nullptr; + } + }; + + auto * profiler = new ggml_backend_profiler { + /* .context = */ ctx, + /* .enable = */ cuda_prof_enable, + /* .reset = */ cuda_prof_reset, + /* .set_split_id = */ cuda_prof_set_split_id, + /* .get_records = */ cuda_prof_get_records, + /* .free_context = */ cuda_prof_free, + }; + ggml_backend_set_profiler(cuda_backend, profiler); + return cuda_backend; } diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h index 9d9ba1ee21..3a07b281d6 100644 --- a/ggml/src/ggml-cuda/vendors/hip.h +++ b/ggml/src/ggml-cuda/vendors/hip.h @@ -55,8 +55,10 @@ #define cudaError_t hipError_t #define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled #define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled +#define cudaEventCreate hipEventCreate #define cudaEventCreateWithFlags hipEventCreateWithFlags #define cudaEventDisableTiming hipEventDisableTiming +#define cudaEventElapsedTime hipEventElapsedTime #define cudaEventRecord hipEventRecord #define cudaEventSynchronize hipEventSynchronize #define cudaEvent_t hipEvent_t diff --git a/ggml/src/ggml-cuda/vendors/musa.h b/ggml/src/ggml-cuda/vendors/musa.h index 1abb8acfd4..d4ff57b3d6 100644 --- a/ggml/src/ggml-cuda/vendors/musa.h +++ b/ggml/src/ggml-cuda/vendors/musa.h @@ -44,8 +44,10 @@ #define cudaError_t musaError_t #define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled #define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled +#define cudaEventCreate musaEventCreate #define cudaEventCreateWithFlags musaEventCreateWithFlags #define cudaEventDisableTiming musaEventDisableTiming +#define cudaEventElapsedTime musaEventElapsedTime #define 
cudaEventRecord musaEventRecord #define cudaEventSynchronize musaEventSynchronize #define cudaEvent_t musaEvent_t diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index dd604db433..4dd352e7a0 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -3004,6 +3004,7 @@ static ggml_backend_t ggml_backend_hexagon_device_init(ggml_backend_dev_t dev, c /* .interface = */ hexagon_backend_i, /* .device = */ dev, /* .context = */ sess, + /* .profiler = */ nullptr, }; GGML_UNUSED(params); diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp index 9382ce53b3..853edd7c75 100644 --- a/ggml/src/ggml-metal/ggml-metal.cpp +++ b/ggml/src/ggml-metal/ggml-metal.cpp @@ -597,6 +597,7 @@ ggml_backend_t ggml_backend_metal_init(void) { /* .interface = */ ggml_backend_metal_i, /* .device = */ dev, /* .context = */ ctx, + /* .profiler = */ NULL, }; ggml_backend_metal_set_n_cb(backend, 1); @@ -691,6 +692,7 @@ static ggml_backend_t ggml_backend_metal_device_init_backend(ggml_backend_dev_t /* .interface = */ ggml_backend_metal_i, /* .device = */ dev, /* .context = */ ctx, + /* .profiler = */ NULL, }; ggml_backend_metal_set_n_cb(backend, 1); diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 0f6628c377..48dd0c9e4a 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -4082,7 +4082,8 @@ ggml_backend_t ggml_backend_opencl_init(void) { /* .guid = */ ggml_backend_opencl_guid(), /* .iface = */ ggml_backend_opencl_i, /* .device = */ dev, - /* .context = */ backend_ctx + /* .context = */ backend_ctx, + /* .profiler = */ nullptr, }; return backend; @@ -5897,6 +5898,7 @@ static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, co /* .interface = */ ggml_backend_opencl_i, /* .device = */ dev, /* .context = */ backend_ctx, + /* .profiler = */ nullptr, }; return backend; diff --git 
a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index b3058b4af7..7c36b3c5e1 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -673,6 +673,7 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device) { /* .interface = */ ggml_backend_openvino_interface, /* .device = */ ggml_backend_reg_dev_get(ggml_backend_openvino_reg(), device), /* .context = */ ctx, + /* .profiler = */ nullptr, }; return openvino_backend; diff --git a/ggml/src/ggml-profiler.cpp b/ggml/src/ggml-profiler.cpp new file mode 100644 index 0000000000..3dc60595ff --- /dev/null +++ b/ggml/src/ggml-profiler.cpp @@ -0,0 +1,67 @@ +#include "ggml-profiler.h" + +#include "ggml-backend-impl.h" +#include "ggml-impl.h" + +#include +#include + +#ifdef _WIN32 +# define WIN32_LEAN_AND_MEAN +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include +#else +# include +# include +#endif + +// +// Time utilities +// + +uint64_t ggml_profiler_time_ns(void) { +#ifdef _WIN32 + LARGE_INTEGER freq, count; + QueryPerformanceFrequency(&freq); + QueryPerformanceCounter(&count); + return (uint64_t) (count.QuadPart * 1000000000ULL / freq.QuadPart); +#elif defined(CLOCK_MONOTONIC_RAW) + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC_RAW, &ts); + return (uint64_t) ts.tv_sec * 1000000000ULL + (uint64_t) ts.tv_nsec; +#else + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (uint64_t) ts.tv_sec * 1000000000ULL + (uint64_t) ts.tv_nsec; +#endif +} + +// +// Backend profiler registration +// + +void ggml_backend_set_profiler(ggml_backend_t backend, ggml_backend_profiler_t profiler) { + if (backend == NULL) { + return; + } + + // Free any existing profiler + if (backend->profiler != NULL) { + if (backend->profiler->free_context != NULL) { + backend->profiler->free_context(backend->profiler->context); + } + delete backend->profiler; + backend->profiler = NULL; + } + + backend->profiler = profiler; +} + 
+ggml_backend_profiler_t ggml_backend_get_profiler(ggml_backend_t backend) { + if (backend == NULL) { + return NULL; + } + return backend->profiler; +} diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp index 1378ba9f5b..24dca4503d 100644 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp @@ -952,7 +952,8 @@ ggml_backend_t ggml_backend_rpc_init(const char * endpoint, uint32_t device) { /* .guid = */ ggml_backend_rpc_guid(), /* .iface = */ ggml_backend_rpc_interface, /* .device = */ ggml_backend_reg_dev_get(reg, device), - /* .context = */ ctx + /* .context = */ ctx, + /* .profiler = */ nullptr, }; return backend; } diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 456b1699fa..22060441be 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -5101,7 +5101,8 @@ ggml_backend_t ggml_backend_sycl_init(int device) { /* .guid = */ ggml_backend_sycl_guid(), /* .iface = */ ggml_backend_sycl_interface, /* .device = */ ggml_backend_reg_dev_get(ggml_backend_sycl_reg(), device), - /* .context = */ ctx + /* .context = */ ctx, + /* .profiler = */ nullptr, }; return sycl_backend; diff --git a/ggml/src/ggml-virtgpu/ggml-backend.cpp b/ggml/src/ggml-virtgpu/ggml-backend.cpp index a63ee2b9d2..912aa2ff4b 100644 --- a/ggml/src/ggml-virtgpu/ggml-backend.cpp +++ b/ggml/src/ggml-virtgpu/ggml-backend.cpp @@ -63,6 +63,7 @@ ggml_backend_t ggml_backend_remoting_device_init(ggml_backend_dev_t dev, const c /* .interface = */ ggml_backend_remoting_interface, /* .device = */ ggml_backend_reg_dev_get(ggml_backend_virtgpu_reg(), ctx->device), /* .context = */ ctx, + /* .profiler = */ nullptr, }; return remoting_backend; diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 15ed5b2a79..499e384b9d 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -1,4 +1,5 @@ #include "ggml-vulkan.h" +#include 
"ggml-profiler.h" #include #if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_CHECK_RESULTS) #include @@ -1700,8 +1701,8 @@ private: std::mutex vk_memory_logger::log_mutex; -static bool vk_perf_logger_enabled = false; -static bool vk_perf_logger_concurrent = false; +static bool vk_perf_logger_enabled = false; // deprecated: use --profile instead +static bool vk_perf_logger_concurrent = false; // GGML_VK_PERF_LOGGER_CONCURRENT: use concurrent timestamp mode static bool vk_enable_sync_logger = false; // number of calls between perf logger prints static uint32_t vk_perf_logger_frequency = 1; @@ -1873,6 +1874,21 @@ class vk_perf_logger { uint32_t print_count {}; }; +// Profiler state for the new ggml_backend_profiler interface +struct ggml_vk_profiler_state { + bool enabled = false; + int split_id = -1; + + std::vector records; + std::vector cpu_timestamps; // CPU-side timestamps for global ordering + + void reset() { + records.clear(); + cpu_timestamps.clear(); + split_id = -1; + } +}; + struct ggml_backend_vk_context { std::string name; @@ -1930,8 +1946,9 @@ struct ggml_backend_vk_context { topk_moe_mode fused_topk_moe_mode {}; bool fused_topk_moe_scale {}; - // for GGML_VK_PERF_LOGGER - std::unique_ptr perf_logger; + // Profiling + std::unique_ptr perf_logger; // legacy env-var profiler + ggml_vk_profiler_state * profiler_state = nullptr; vk::QueryPool query_pool; std::vector query_fusion_names; std::vector query_fusion_node_count; @@ -12859,9 +12876,13 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr ctx->unsynced_nodes_read.clear(); ggml_vk_sync_buffers(ctx, compute_ctx); - if (vk_perf_logger_enabled && vk_perf_logger_concurrent) { + if ((vk_perf_logger_enabled || (ctx->profiler_state != nullptr && ctx->profiler_state->enabled)) + && vk_perf_logger_concurrent) { ctx->query_node_idx[ctx->query_idx] = node_idx; compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, 
ctx->query_idx++); + if (ctx->profiler_state != nullptr && ctx->profiler_state->enabled) { + ctx->profiler_state->cpu_timestamps.push_back(ggml_profiler_time_ns()); + } } } // Add all fused nodes to the unsynchronized lists. @@ -13384,7 +13405,7 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) { ctx->transfer_cmd_pool.destroy(ctx->device->device); } - if (vk_perf_logger_enabled) { + if (ctx->perf_logger) { ctx->perf_logger->print_timings(true); } } @@ -14323,7 +14344,9 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg ggml_vk_submit_transfer_ctx(ctx); vk_context compute_ctx; - if (vk_perf_logger_enabled) { + bool profiling = vk_perf_logger_enabled || + (ctx->profiler_state != nullptr && ctx->profiler_state->enabled); + if (profiling) { // allocate/resize the query pool if (ctx->num_queries < cgraph->n_nodes + 1) { if (ctx->query_pool) { @@ -14350,6 +14373,10 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg compute_ctx = ggml_vk_get_compute_ctx(ctx); ctx->query_idx = 0; compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++); + if (ctx->profiler_state != nullptr && ctx->profiler_state->enabled) { + ctx->profiler_state->cpu_timestamps.clear(); + ctx->profiler_state->cpu_timestamps.push_back(ggml_profiler_time_ns()); + } } ctx->prealloc_y_last_pipeline_used = nullptr; @@ -14579,13 +14606,16 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg bool enqueued = ggml_vk_build_graph(ctx, cgraph, i, cgraph->nodes[submit_node_idx], submit_node_idx, i + ctx->num_additional_fused_ops >= last_node, almost_ready, submit); - if (vk_perf_logger_enabled && enqueued) { + if (profiling && enqueued) { compute_ctx = ggml_vk_get_compute_ctx(ctx); if (!vk_perf_logger_concurrent) { // track a single node/fusion for the current query ctx->query_nodes[ctx->query_idx] = cgraph->nodes[i]; 
ctx->query_fusion_names[ctx->query_idx] = fusion_string; compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++); + if (ctx->profiler_state != nullptr && ctx->profiler_state->enabled) { + ctx->profiler_state->cpu_timestamps.push_back(ggml_profiler_time_ns()); + } } else { // track a fusion string and number of fused ops for the current node_idx ctx->query_fusion_names[i] = fusion_string; @@ -14619,7 +14649,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg ctx->last_total_mul_mat_bytes = total_mul_mat_bytes; - if (vk_perf_logger_enabled) { + if (profiling) { // End the command buffer and submit/wait GGML_ASSERT(!ctx->compute_ctx.expired()); compute_ctx = ctx->compute_ctx.lock(); @@ -14633,15 +14663,46 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg // Get the results and pass them to the logger std::vector timestamps(cgraph->n_nodes + 1); VK_CHECK(ctx->device->device.getQueryPoolResults(ctx->query_pool, 0, ctx->query_idx, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), "get timestamp results"); + + const double ts_period = ctx->device->properties.limits.timestampPeriod; + const bool has_profiler = ctx->profiler_state != nullptr && ctx->profiler_state->enabled; + if (!vk_perf_logger_concurrent) { // Log each op separately for (int i = 1; i < ctx->query_idx; i++) { auto node = ctx->query_nodes[i]; auto name = ctx->query_fusion_names[i]; - ctx->perf_logger->log_timing(node, name, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod)); + uint64_t duration_ns = uint64_t((timestamps[i] - timestamps[i-1]) * ts_period); + + if (ctx->perf_logger) { + ctx->perf_logger->log_timing(node, name, duration_ns); + } + + if (has_profiler && node != nullptr) { + static const int64_t zero_ne[4] = {0, 0, 0, 0}; + const 
int64_t * src0_ne = node->src[0] ? node->src[0]->ne : zero_ne; + const int64_t * src1_ne = node->src[1] ? node->src[1]->ne : zero_ne; + const int64_t * src2_ne = (node->op == GGML_OP_MUL_MAT_ID && node->src[2]) ? node->src[2]->ne : zero_ne; + uint64_t cpu_ts = (i < (int)ctx->profiler_state->cpu_timestamps.size()) + ? ctx->profiler_state->cpu_timestamps[i] : 0; + + ggml_profile_record rec; + rec.type = GGML_PROFILE_EVENT_OP; + rec.name = ggml_op_name(node->op); + rec.backend_id = -1; + rec.split_id = ctx->profiler_state->split_id; + rec.start_ns = cpu_ts; + rec.end_ns = cpu_ts + duration_ns; + rec.bytes = ggml_nbytes(node); + rec.extra = name; // fusion name or NULL + memcpy(rec.ne_src0, src0_ne, sizeof(rec.ne_src0)); + memcpy(rec.ne_src1, src1_ne, sizeof(rec.ne_src1)); + memcpy(rec.ne_src2, src2_ne, sizeof(rec.ne_src2)); + ctx->profiler_state->records.push_back(rec); + } } } else { - // Log each group of nodes + // Log each group of nodes (concurrent mode) int prev_node_idx = 0; for (int i = 1; i < ctx->query_idx; i++) { auto cur_node_idx = ctx->query_node_idx[i]; @@ -14656,10 +14717,46 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg node_idx += ctx->query_fusion_node_count[node_idx]; } prev_node_idx = cur_node_idx; - ctx->perf_logger->log_timing(nodes, names, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod)); + uint64_t duration_ns = uint64_t((timestamps[i] - timestamps[i-1]) * ts_period); + + if (ctx->perf_logger) { + ctx->perf_logger->log_timing(nodes, names, duration_ns); + } + + if (has_profiler && !nodes.empty()) { + uint64_t cpu_ts = (i < (int)ctx->profiler_state->cpu_timestamps.size()) + ? ctx->profiler_state->cpu_timestamps[i] : 0; + // In concurrent mode, report the group as a single combined operation + auto * node = nodes[0]; + static const int64_t zero_ne[4] = {0, 0, 0, 0}; + const int64_t * src0_ne = node->src[0] ? 
node->src[0]->ne : zero_ne; + const int64_t * src1_ne = node->src[1] ? node->src[1]->ne : zero_ne; + const int64_t * src2_ne = (node->op == GGML_OP_MUL_MAT_ID && node->src[2]) ? node->src[2]->ne : zero_ne; + + uint64_t total_bytes = 0; + for (size_t j = 0; j < nodes.size(); j++) { + total_bytes += ggml_nbytes(nodes[j]); + } + + ggml_profile_record rec; + rec.type = GGML_PROFILE_EVENT_OP; + rec.name = ggml_op_name(node->op); + rec.backend_id = -1; + rec.split_id = ctx->profiler_state->split_id; + rec.start_ns = cpu_ts; + rec.end_ns = cpu_ts + duration_ns; + rec.bytes = total_bytes; + rec.extra = names[0]; // fusion name of first op, or NULL + memcpy(rec.ne_src0, src0_ne, sizeof(rec.ne_src0)); + memcpy(rec.ne_src1, src1_ne, sizeof(rec.ne_src1)); + memcpy(rec.ne_src2, src2_ne, sizeof(rec.ne_src2)); + ctx->profiler_state->records.push_back(rec); + } } } - ctx->perf_logger->print_timings(); + if (ctx->perf_logger) { + ctx->perf_logger->print_timings(); + } } if (!ctx->device->support_async) { @@ -14994,13 +15091,66 @@ ggml_backend_t ggml_backend_vk_init(size_t dev_num) { /* .guid = */ ggml_backend_vk_guid(), /* .iface = */ ggml_backend_vk_interface, /* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), dev_num), - /* .context = */ ctx, + /* .context = */ ctx, + /* .profiler = */ nullptr, }; if (!ctx->device->support_async) { vk_backend->iface.get_tensor_async = nullptr; } + // Register profiler + auto * prof_state = new ggml_vk_profiler_state(); + ctx->profiler_state = prof_state; + + static auto vk_prof_enable = [](void * context, bool enable) { + auto * vk_ctx = (ggml_backend_vk_context *)context; + if (vk_ctx->profiler_state != nullptr) { + vk_ctx->profiler_state->enabled = enable; + if (!enable) { + vk_ctx->profiler_state->reset(); + } + } + }; + static auto vk_prof_reset = [](void * context) { + auto * vk_ctx = (ggml_backend_vk_context *)context; + if (vk_ctx->profiler_state != nullptr) { + vk_ctx->profiler_state->reset(); + } + }; + static auto 
vk_prof_set_split_id = [](void * context, int split_id) { + auto * vk_ctx = (ggml_backend_vk_context *)context; + if (vk_ctx->profiler_state != nullptr) { + vk_ctx->profiler_state->split_id = split_id; + } + }; + static auto vk_prof_get_records = [](void * context, const ggml_profile_record ** out) -> int { + auto * vk_ctx = (ggml_backend_vk_context *)context; + if (vk_ctx->profiler_state != nullptr) { + *out = vk_ctx->profiler_state->records.data(); + return (int)vk_ctx->profiler_state->records.size(); + } + *out = nullptr; + return 0; + }; + static auto vk_prof_free = [](void * context) { + auto * vk_ctx = (ggml_backend_vk_context *)context; + if (vk_ctx->profiler_state != nullptr) { + delete vk_ctx->profiler_state; + vk_ctx->profiler_state = nullptr; + } + }; + + auto * profiler = new ggml_backend_profiler { + /* .context = */ ctx, + /* .enable = */ vk_prof_enable, + /* .reset = */ vk_prof_reset, + /* .set_split_id = */ vk_prof_set_split_id, + /* .get_records = */ vk_prof_get_records, + /* .free_context = */ vk_prof_free, + }; + ggml_backend_set_profiler(vk_backend, profiler); + return vk_backend; } diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 1aa15b0507..15c33b65c5 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -3102,6 +3102,7 @@ static ggml_backend_t ggml_backend_webgpu_backend_init(ggml_backend_dev_t dev, c /* .interface = */ ggml_backend_webgpu_i, /* .device = */ dev, /* .context = */ backend_ctx, + /* .profiler = */ nullptr, }; return backend; } diff --git a/ggml/src/ggml-zdnn/ggml-zdnn.cpp b/ggml/src/ggml-zdnn/ggml-zdnn.cpp index 9b6938abf7..37050a9a09 100644 --- a/ggml/src/ggml-zdnn/ggml-zdnn.cpp +++ b/ggml/src/ggml-zdnn/ggml-zdnn.cpp @@ -499,7 +499,8 @@ static ggml_backend_t ggml_backend_zdnn_device_init(ggml_backend_dev_t dev, cons /* .guid = */ ggml_backend_zdnn_guid(), /* .iface = */ ggml_backend_zdnn_i, /* .device = */ dev, - /* .context = */ ctx + /* 
.context = */ ctx, + /* .profiler = */ NULL, }; return backend; diff --git a/ggml/src/ggml-zendnn/ggml-zendnn.cpp b/ggml/src/ggml-zendnn/ggml-zendnn.cpp index c876030400..a78ada8f2f 100644 --- a/ggml/src/ggml-zendnn/ggml-zendnn.cpp +++ b/ggml/src/ggml-zendnn/ggml-zendnn.cpp @@ -264,7 +264,8 @@ ggml_backend_t ggml_backend_zendnn_init(void) { /* .guid = */ ggml_backend_zendnn_guid(), /* .iface = */ ggml_backend_zendnn_i, /* .device = */ ggml_backend_reg_dev_get(ggml_backend_zendnn_reg(), 0), - /* .context = */ ctx, + /* .context = */ ctx, + /* .profiler = */ nullptr, }; return backend; diff --git a/include/llama.h b/include/llama.h index a940f9d648..4186f236d6 100644 --- a/include/llama.h +++ b/include/llama.h @@ -561,6 +561,8 @@ extern "C" { LLAMA_API llama_memory_t llama_get_memory (const struct llama_context * ctx); LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type + LLAMA_API struct ggml_backend_sched * llama_context_get_sched(const struct llama_context * ctx); + LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model); LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model); diff --git a/src/llama-context.cpp b/src/llama-context.cpp index a808e3e454..5e3030c8f6 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1,5 +1,6 @@ #include "llama-context.h" +#include "ggml-profiler.h" #include "llama-arch.h" #include "llama-impl.h" #include "llama-batch.h" @@ -2189,6 +2190,11 @@ ggml_status llama_context::graph_compute( LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status); } + // If profiling is enabled, synchronize to ensure records are complete + if (ggml_backend_sched_get_profiling(sched.get())) { + ggml_backend_sched_synchronize(sched.get()); + } + // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(sched)); return status; @@ -3025,6 +3031,10 
@@ enum llama_pooling_type llama_pooling_type(const llama_context * ctx) { return ctx->pooling_type(); } +ggml_backend_sched_t llama_context_get_sched(const llama_context * ctx) { + return ctx->get_sched(); +} + void llama_attach_threadpool( llama_context * ctx, ggml_threadpool_t threadpool, diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index b57d27762c..224e9c2514 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -644,6 +644,29 @@ int main(int argc, char ** argv) { ctx_cli.ctx_server.terminate(); inference_thread.join(); + // Export profiling data if profiling was enabled + if (params.profiling) { + ggml_backend_sched_t sched = llama_context_get_sched(ctx_cli.ctx_server.get_llama_context()); + if (sched != nullptr) { + if (params.profiling_output.empty()) { + ggml_backend_sched_print_profiling(sched); + } else { + const std::string & path = params.profiling_output; + int ret; + if (path.size() >= 4 && path.compare(path.size() - 4, 4, ".txt") == 0) { + ret = ggml_backend_sched_export_profiling_text(sched, path.c_str()); + } else { + ret = ggml_backend_sched_export_profiling_json(sched, path.c_str()); + } + if (ret == 0) { + console::log("\nProfiling data exported to: %s\n", path.c_str()); + } else { + console::error("\nFailed to export profiling data to: %s\n", path.c_str()); + } + } + } + } + // bump the log level to display timings common_log_set_verbosity_thold(LOG_LEVEL_INFO); llama_memory_breakdown_print(ctx_cli.ctx_server.get_llama_context()); diff --git a/tools/completion/completion.cpp b/tools/completion/completion.cpp index 1dc5df1afa..caba5255cf 100644 --- a/tools/completion/completion.cpp +++ b/tools/completion/completion.cpp @@ -991,6 +991,29 @@ int main(int argc, char ** argv) { llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); } + // Export profiling data if profiling was enabled + if (params.profiling) { + ggml_backend_sched_t sched = llama_context_get_sched(ctx); + if (sched != nullptr) { + 
if (params.profiling_output.empty()) { + ggml_backend_sched_print_profiling(sched); + } else { + const std::string & path = params.profiling_output; + int ret; + if (path.size() >= 4 && path.compare(path.size() - 4, 4, ".txt") == 0) { + ret = ggml_backend_sched_export_profiling_text(sched, path.c_str()); + } else { + ret = ggml_backend_sched_export_profiling_json(sched, path.c_str()); + } + if (ret == 0) { + LOG("\nProfiling data exported to: %s\n", path.c_str()); + } else { + LOG_ERR("\nFailed to export profiling data to: %s\n", path.c_str()); + } + } + } + } + LOG("\n\n"); common_perf_print(ctx, smpl); diff --git a/tools/profiler/__init__.py b/tools/profiler/__init__.py new file mode 100644 index 0000000000..cdd8dd543e --- /dev/null +++ b/tools/profiler/__init__.py @@ -0,0 +1 @@ +# llama.cpp profiler analysis tools diff --git a/tools/profiler/profiler.py b/tools/profiler/profiler.py new file mode 100644 index 0000000000..a739a830f4 --- /dev/null +++ b/tools/profiler/profiler.py @@ -0,0 +1,1003 @@ +#!/usr/bin/env python3 +"""llama.cpp cross-backend profiler analysis tool. 
+ +Usage: + python -m tools.profiler.profiler profile.json + python -m tools.profiler.profiler profile.json --chrome-trace trace.json +""" + +from __future__ import annotations + +import json +import sys +from dataclasses import dataclass, field +from pathlib import Path +from typing import Optional + + +OP_EVENT = 0 +COPY_EVENT = 1 + +TYPE_NAMES = {0: "OP", 1: "COPY"} + + +@dataclass +class ProfileRecord: + type: int + name: str + backend_id: int + split_id: int + start_ns: int + duration_ns: int + bytes: int + extra: Optional[str] + ne_src0: list[int] = field(default_factory=lambda: [0, 0, 0, 0]) + ne_src1: list[int] = field(default_factory=lambda: [0, 0, 0, 0]) + ne_src2: list[int] = field(default_factory=lambda: [0, 0, 0, 0]) + + @property + def type_name(self) -> str: + return TYPE_NAMES.get(self.type, f"UNKNOWN({self.type})") + + @property + def duration_us(self) -> float: + return self.duration_ns / 1000.0 + + @property + def duration_ms(self) -> float: + return self.duration_ns / 1_000_000.0 + + @property + def bandwidth_gbps(self) -> float: + """Bandwidth in GB/s.""" + if self.duration_ns == 0 or self.bytes == 0: + return 0.0 + return self.bytes / self.duration_ns + + @staticmethod + def _fmt_ne(ne: list[int]) -> str: + dims = [n for n in ne if n > 0] + if not dims: + return "" + return "[" + ", ".join(str(d) for d in dims) + "]" + + @property + def shape_str(self) -> str: + """Human-readable tensor shapes, e.g. 
'[4096, 4096] x [4096, 1] x [8, 1]'.""" + s0 = self._fmt_ne(self.ne_src0) + s1 = self._fmt_ne(self.ne_src1) + s2 = self._fmt_ne(self.ne_src2) + parts = [s for s in (s0, s1, s2) if s] + return " x ".join(parts) + + def to_dict(self) -> dict: + return { + "type": self.type, + "name": self.name, + "backend_id": self.backend_id, + "split_id": self.split_id, + "start_ns": self.start_ns, + "duration_ns": self.duration_ns, + "bytes": self.bytes, + "extra": self.extra, + "ne_src0": self.ne_src0, + "ne_src1": self.ne_src1, + "ne_src2": self.ne_src2, + } + + +@dataclass +class OpStats: + name: str + event_type: int + backend_id: int + count: int = 0 + total_ns: int = 0 + min_ns: int = 0 + max_ns: int = 0 + total_bytes: int = 0 + representative_ne: list[int] = field(default_factory=lambda: [0, 0, 0, 0]) + + @property + def avg_ns(self) -> float: + return self.total_ns / self.count if self.count > 0 else 0 + + @property + def avg_us(self) -> float: + return self.avg_ns / 1000.0 + + @property + def total_ms(self) -> float: + return self.total_ns / 1_000_000.0 + + @property + def min_us(self) -> float: + return self.min_ns / 1000.0 + + @property + def max_us(self) -> float: + return self.max_ns / 1000.0 + + @property + def bandwidth_gbps(self) -> float: + if self.total_ns == 0 or self.total_bytes == 0: + return 0.0 + return self.total_bytes / self.total_ns + + @property + def time_per_byte_ns(self) -> float: + """Time per byte (lower = more efficient).""" + if self.total_bytes == 0: + return float("inf") + return self.total_ns / self.total_bytes + + @property + def type_name(self) -> str: + return TYPE_NAMES.get(self.event_type, f"UNKNOWN({self.event_type})") + + +class ProfileData: + def __init__(self, records: list[ProfileRecord], metadata: dict): + self.records = records + self.metadata = metadata + + @classmethod + def load(cls, filepath: str | Path) -> ProfileData: + """Load a profiler JSON file.""" + with open(filepath, "r") as f: + data = json.load(f) + + if 
data.get("profiler") != "ggml": + print(f"Warning: file may not be a ggml profiler output (profiler={data.get('profiler')})") + + records = [] + def _pad_ne(v): + if isinstance(v, list) and len(v) < 4: + return v + [0] * (4 - len(v)) + if not isinstance(v, list): + return [0, 0, 0, 0] + return v + + for r in data.get("records", []): + # Support both old "ne" format and new "ne_src0"/"ne_src1" format + ne_src0 = _pad_ne(r.get("ne_src0", r.get("ne", [0, 0, 0, 0]))) + ne_src1 = _pad_ne(r.get("ne_src1", [0, 0, 0, 0])) + ne_src2 = _pad_ne(r.get("ne_src2", [0, 0, 0, 0])) + records.append(ProfileRecord( + type=r.get("type", 0), + name=r.get("name", "unknown"), + backend_id=r.get("backend_id", 0), + split_id=r.get("split_id", 0), + start_ns=r.get("start_ns", 0), + duration_ns=r.get("duration_ns", 0), + bytes=r.get("bytes", 0), + extra=r.get("extra"), + ne_src0=ne_src0, + ne_src1=ne_src1, + ne_src2=ne_src2, + )) + + backends_raw = data.get("backends", []) + backends = [] + for b in backends_raw: + backends.append({ + "id": b.get("id", 0), + "name": b.get("name", "unknown"), + "device": b.get("device", "unknown"), + "device_type": b.get("device_type", 0), + }) + + metadata = { + "version": data.get("version", 0), + "total_records": data.get("total_records", len(records)), + "total_ns": data.get("total_ns", sum(r.duration_ns for r in records)), + "backends": backends, + } + + return cls(records, metadata) + + @property + def total_ns(self) -> int: + return sum(r.duration_ns for r in self.records) + + @property + def total_ms(self) -> float: + return self.total_ns / 1_000_000.0 + + def stats(self) -> list[OpStats]: + """Aggregate stats grouped by (name, type, backend_id).""" + groups: dict[tuple, OpStats] = {} + for rec in self.records: + key = (rec.name, rec.type, rec.backend_id) + if key not in groups: + groups[key] = OpStats( + name=rec.name, + event_type=rec.type, + backend_id=rec.backend_id, + min_ns=rec.duration_ns, + max_ns=rec.duration_ns, + 
representative_ne=list(rec.ne_src0), + ) + s = groups[key] + s.count += 1 + s.total_ns += rec.duration_ns + s.min_ns = min(s.min_ns, rec.duration_ns) + s.max_ns = max(s.max_ns, rec.duration_ns) + s.total_bytes += rec.bytes + + # Track the ne from the longest individual call + if rec.duration_ns >= s.max_ns: + s.representative_ne = list(rec.ne_src0) + + return sorted(groups.values(), key=lambda s: s.total_ns, reverse=True) + + def top_operations(self, n: int = 10) -> list[OpStats]: + """Return the N most time-consuming operations (aggregated).""" + return self.stats()[:n] + + def top_kernels(self, n: int = 10) -> list[ProfileRecord]: + """Return the N longest individual kernel executions.""" + return sorted(self.records, key=lambda r: r.duration_ns, reverse=True)[:n] + + def by_backend(self) -> dict[int, list[ProfileRecord]]: + """Group records by backend ID.""" + groups: dict[int, list[ProfileRecord]] = {} + for rec in self.records: + groups.setdefault(rec.backend_id, []).append(rec) + return dict(sorted(groups.items())) + + def timeline(self) -> list[ProfileRecord]: + """Return records sorted by start_ns for timeline visualization.""" + return sorted(self.records, key=lambda r: r.start_ns) + + def inefficiency_ranking(self, n: int = 10) -> list[OpStats]: + """Rank operations by time per byte (inefficiency). 
Lower is better.""" + all_stats = [s for s in self.stats() if s.total_bytes > 0 and s.event_type == OP_EVENT] + return sorted(all_stats, key=lambda s: s.time_per_byte_ns, reverse=True)[:n] + + def summary(self) -> None: + """Print a formatted summary table to stdout.""" + print(f"\n{'='*80}") + print(f" ggml Profiler Summary") + print(f"{'='*80}") + print(f" Total records: {len(self.records)}") + print(f" Total time: {self.total_ms:.2f} ms") + print(f" Unique ops: {len(set((r.name, r.type, r.backend_id) for r in self.records))}") + print(f"{'='*80}\n") + + stats = self.stats() + if not stats: + print(" No profiling data.\n") + return + + print(f" {'TYPE':<5} {'BKND':>4} {'Operation':<28} {'%Time':>7} {'Count':>6} " + f"{'Total':>10} {'Avg':>10} {'Min':>10} {'Max':>10} {'Bandwidth':>12}") + print(f" {'':->5} {'':->4} {'':->28} {'':->7} {'':->6} " + f"{'(ms)':>10} {'(us)':>10} {'(us)':>10} {'(us)':>10} {'':->12}") + + for s in stats: + pct = 100.0 * s.total_ns / self.total_ns if self.total_ns > 0 else 0 + + line = (f" {s.type_name:<5} {s.backend_id:>4} {s.name:<28} {pct:>6.1f}% " + f"{s.count:>6} {s.total_ms:>10.2f} {s.avg_us:>10.2f} " + f"{s.min_us:>10.2f} {s.max_us:>10.2f}") + + if s.total_bytes > 0 and s.total_ns > 0: + bw = s.bandwidth_gbps + if bw >= 1000.0: + line += f" {bw / 1000.0:>9.2f} TB/s" + else: + line += f" {bw:>9.2f} GB/s" + else: + line += f" {'':>12}" + + # Tensor shape from longest call + shape_dims = [n for n in s.representative_ne if n > 0] + if shape_dims: + line += f" [{', '.join(str(d) for d in shape_dims)}]" + + print(line) + + backend_groups = self.by_backend() + if len(backend_groups) > 1: + print(f"\n --- By Backend ---") + for bid, recs in sorted(backend_groups.items()): + bk_total = sum(r.duration_ns for r in recs) + bk_pct = 100.0 * bk_total / self.total_ns if self.total_ns > 0 else 0 + print(f" Backend {bid}: {bk_total / 1e6:.2f} ms ({bk_pct:.1f}%) — {len(recs)} records") + + inef = self.inefficiency_ranking(5) + if inef: + print(f"\n 
--- Top 5 Inefficient Operations (time/byte) ---") + for s in inef: + print(f" {s.name:<28} {s.time_per_byte_ns / 1000:.2f} us/byte " + f"({s.count} calls, {s.total_bytes / 1e6:.1f} MB)") + + top_k = self.top_kernels(5) + print(f"\n --- Top 5 Longest Kernels ---") + for rec in top_k: + shape = f" {rec.shape_str}" if rec.shape_str else "" + print(f" {rec.type_name:<5} {rec.name:<28} {rec.duration_us:>10.2f} us{shape} " + f"(split={rec.split_id}, backend={rec.backend_id})") + + print() + + def export_chrome_trace(self, filepath: str | Path) -> None: + """Export as Chrome Trace Event format for chrome://tracing.""" + events = [] + + # Build backend name mapping and remap to non-negative PIDs + # (Chrome cannot handle negative PIDs) + backend_ids = sorted(set(rec.backend_id for rec in self.records)) + backend_names: dict[int, str] = {} + pid_map: dict[int, int] = {} + + # Use metadata from JSON if available + metadata_backends = self.metadata.get("backends", []) + backend_by_id: dict[int, dict] = {b["id"]: b for b in metadata_backends} + + device_type_names = {0: "CPU", 1: "GPU", 2: "ACCEL"} + for idx, bid in enumerate(backend_ids): + pid_map[bid] = idx + if bid in backend_by_id: + binfo = backend_by_id[bid] + dev_type = binfo.get("device_type", 0) + dev_name = binfo.get("device", "") + type_name = device_type_names.get(dev_type, "Device") + if dev_name and dev_name != "unknown": + backend_names[bid] = f"{type_name}: {dev_name}" + else: + backend_names[bid] = f"{type_name}: {binfo.get('name', f'Backend {bid}')}" + else: + backend_names[bid] = f"Backend {bid}" + + # Process metadata events + for bid in backend_ids: + pid = pid_map[bid] + events.append({ + "ph": "M", # metadata + "pid": pid, + "name": "process_name", + "args": {"name": backend_names[bid]}, + }) + + # Use real timestamps, but prevent overlaps within each track. + # GPU kernels are launched rapidly (small start_ns gaps) but have long + # durations, so naive real timestamps overlap. 
Sweep-line per track: + # sort by start_ns, then place each event at max(start, prev_end). + from collections import defaultdict + tracks: dict[tuple, list[ProfileRecord]] = defaultdict(list) + for rec in self.records: + tracks[(rec.backend_id, rec.split_id)].append(rec) + + for key in tracks: + tracks[key].sort(key=lambda r: r.start_ns) + + for key, recs in tracks.items(): + pid = pid_map[key[0]] + tid = f"split_{key[1]}" + cursor = 0.0 + for rec in recs: + ts = max(rec.start_ns / 1000.0, cursor) + dur = rec.duration_ns / 1000.0 + cat = "copy" if rec.type == COPY_EVENT else "compute" + events.append({ + "ph": "X", # complete event + "pid": pid, + "tid": tid, + "name": rec.name, + "ts": ts, + "dur": dur, + "cat": cat, + "args": { + "bytes": rec.bytes, + "duration_us": dur, + "shape": rec.shape_str, + }, + }) + cursor = ts + dur + + trace = {"traceEvents": events} + with open(filepath, "w") as f: + json.dump(trace, f, indent=2) + + print(f"Chrome trace exported to: {filepath}") + print(f"Open chrome://tracing in Chrome/Edge and load this file.") + + def export_html_viewer(self, filepath: str | Path, max_records: int = 0) -> None: + """Export a self-contained interactive HTML timeline viewer using Canvas.""" + import json as json_mod + + metadata_backends = self.metadata.get("backends", []) + backend_by_id: dict[int, dict] = {b["id"]: b for b in metadata_backends} + + backend_names: dict[int, str] = {} + for bid in sorted(set(rec.backend_id for rec in self.records)): + binfo = backend_by_id.get(bid, {}) + name = binfo.get("name", f"Backend {bid}") + device = binfo.get("device", "") + backend_names[bid] = device if device and device != "unknown" else name + + events: list[dict] = [] + cum_us = 0.0 + for rec in self.records: + dur_us = rec.duration_ns / 1000.0 + events.append({ + "n": rec.name, + "d": dur_us, + "s": rec.shape_str, + "b": rec.bytes, + "t": rec.type, + "bid": rec.backend_id, + "start": cum_us, + }) + cum_us += dur_us + total_us = cum_us + + if 
max_records > 0 and len(events) > max_records: + stride = len(events) // max_records + events = events[::stride][:max_records] + + if total_us == 0: + print("No profiling data to export.") + return + + header_stats = str(len(events)) + ' events | ' + f'{total_us/1000:.1f}' + ' ms' + + # Build backend name map with string keys for JSON + bn_str = {str(k): v for k, v in backend_names.items()} + + # --- HTML --- + html = ( + '\n' + 'ggml Profiler\n\n' + '

ggml Profiler Timeline

' + '' + header_stats + '
\n' + '
' + '' + '' + '' + '' + '' + '' + '
\n' + '
\n' + '
\n' + '
\n' + '
\n' + '
\n' + '
\n' + '' + + with open(filepath, "w") as f: + f.write(html) + + print(f"HTML viewer exported to: {filepath}") + print(f"Open in browser: file://{Path(filepath).resolve()}") + + +def load(filepath: str | Path) -> ProfileData: + """Load a profiler JSON file.""" + return ProfileData.load(filepath) + + +def main() -> None: + import argparse + + parser = argparse.ArgumentParser( + description="llama.cpp profiler analysis tool", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + python -m tools.profiler.profiler profile.json + python -m tools.profiler.profiler profile.json --chrome-trace trace.json + python -m tools.profiler.profiler profile.json --top-ops 20 + """, + ) + parser.add_argument("profile", help="Path to profiler JSON file") + parser.add_argument("--chrome-trace", metavar="FILE", + help="Export as Chrome Trace Event format") + parser.add_argument("--html-viewer", metavar="FILE", + help="Export as interactive HTML timeline viewer") + parser.add_argument("--html-max-records", type=int, default=0, + help="Max records in HTML viewer (0=unlimited, set to downsample for huge traces)") + parser.add_argument("--top-ops", type=int, default=0, + help="Show top N operations (0 = show summary)") + parser.add_argument("--top-kernels", type=int, default=0, + help="Show top N longest kernels") + parser.add_argument("--inefficiency", action="store_true", + help="Show inefficiency ranking") + + args = parser.parse_args() + + data = load(args.profile) + + if args.chrome_trace: + data.export_chrome_trace(args.chrome_trace) + + if args.html_viewer: + data.export_html_viewer(args.html_viewer, max_records=args.html_max_records) + + if args.top_ops > 0: + print(f"\nTop {args.top_ops} operations by total time:\n") + for s in data.top_operations(args.top_ops): + pct = 100.0 * s.total_ns / data.total_ns if data.total_ns > 0 else 0 + print(f" {s.type_name:<5} {s.backend_id:>4} {s.name:<28} {pct:>6.1f}% " + f"{s.count:>6}x {s.total_ms:>10.2f} ms 
avg={s.avg_us:.2f} us") + print() + + if args.top_kernels > 0: + print(f"\nTop {args.top_kernels} longest kernels:\n") + for rec in data.top_kernels(args.top_kernels): + print(f" {rec.type_name:<5} {rec.backend_id:>4} {rec.name:<28} " + f"{rec.duration_us:>10.2f} us split={rec.split_id}") + print() + + if args.inefficiency: + print("\nInefficiency ranking (time/byte for operations with data):\n") + for s in data.inefficiency_ranking(10): + print(f" {s.name:<28} {s.time_per_byte_ns / 1000:>10.2f} us/byte " + f"{s.count:>6} calls {s.total_bytes / 1e6:.1f} MB") + print() + + if args.top_ops == 0 and args.top_kernels == 0 and not args.inefficiency and not args.chrome_trace and not args.html_viewer: + data.summary() + + +if __name__ == "__main__": + main()