From 532a8ebdde704f95541e670e0506f3dc0521a6d3 Mon Sep 17 00:00:00 2001 From: Piotr Wilkin Date: Sun, 29 Mar 2026 23:35:38 +0200 Subject: [PATCH] docs, pass copy details --- docs/cross-profiler.md | 225 ++++++++++++++++++++++++++++++++++++++ ggml/src/ggml-backend.cpp | 12 +- 2 files changed, 233 insertions(+), 4 deletions(-) create mode 100644 docs/cross-profiler.md diff --git a/docs/cross-profiler.md b/docs/cross-profiler.md new file mode 100644 index 0000000000..ad90b9e20a --- /dev/null +++ b/docs/cross-profiler.md @@ -0,0 +1,225 @@ +# Cross-Backend Profiler + +llama.cpp includes a built-in cross-backend profiler that captures per-operation timing, data transfer costs, and tensor shapes across all compute backends. It works with any application built on the ggml scheduler — no source changes needed. + +## Supported Backends + +| Backend | Status | Timing method | +|---------|--------|---------------| +| CPU | Supported | Wall-clock (`CLOCK_MONOTONIC_RAW`) | +| CUDA | Supported | `cudaEvent` GPU timestamps | +| Vulkan | Supported | GPU timestamp queries | +| BLAS | Supported | Wall-clock | +| Metal | Not yet supported | — | +| OpenCL | Not yet supported | — | + +The scheduler also profiles **data copies** (H2D, D2H, D2D) between backends regardless of which backends have native profiler support. + +## Enabling the Profiler + +There are two independent ways to enable profiling. They can be used separately or together. + +### CLI flags (`--profile`, `--profile-output`) + +Available in `llama-cli`, `llama-completion`, `llama-server`, and `debug`: + +```bash +# Print summary to stdout +llama-completion -m model.gguf --profile -p "Hello world" + +# Export to JSON +llama-completion -m model.gguf --profile --profile-output profile.json -p "Hello world" + +# Export to plain text +llama-completion -m model.gguf --profile --profile-output profile.txt -p "Hello world" +``` + +The output format is chosen by file extension: `.json` for JSON, `.txt` for plain text. 
Any other extension defaults to JSON. + +### Environment variable (`GGML_PROFILE`) + +The `GGML_PROFILE` environment variable enables profiling at the ggml scheduler level. This works with **any** application that uses the scheduler — including third-party tools like `sd.cpp` — without CLI flag support. + +```bash +# Print summary to stdout +GGML_PROFILE=1 llama-completion -m model.gguf -p "Hello world" + +# Export JSON +GGML_PROFILE=profile.json llama-completion -m model.gguf -p "Hello world" + +# Export plain text +GGML_PROFILE=profile.txt llama-completion -m model.gguf -p "Hello world" + +# Works with any ggml-based application +GGML_PROFILE=1 sd -m model.gguf -p "a cat" +``` + +| Value | Behavior | +|-------|----------| +| `1`, `stdout`, or empty | Print summary to stdout | +| `path.json` | Export JSON to file | +| `path.txt` | Export plain text to file | +| Any other path | Export JSON to file | + +The export happens automatically when the scheduler is freed (typically at program exit). + +## Output Formats + +### Console summary (stdout) + +The default when `--profile` is used without `--profile-output`, or `GGML_PROFILE=1`: + +``` +=== Profiling Summary === + [OP ] backend 0 MUL_MAT 45.2% count=1200 total= 120.50 ms avg= 100.42 us ... 12.30 GB/s [4096 x 4096] + [OP ] backend 1 MUL_MAT_ID 30.1% count= 600 total= 80.20 ms avg= 133.67 us ... 0.08 GB/s [2688 x 1856 x 128] + [COPY] backend 0 copy_H2D 5.3% count= 200 total= 14.10 ms avg= 70.50 us ... 2.50 GB/s + ... +``` + +Each line shows: event type (OP or COPY), backend index, operation name, percentage of total time, call count, timing stats, bandwidth, and representative tensor shape. + +### Plain text (`.txt`) + +A more detailed report with three sections: + +1. **Profiling Summary** — total time, record count, unique ops +2. **Per-Backend Summary** — ops and copies per backend with aggregate bandwidth +3. 
**Operations table** — full breakdown with bandwidth and tensor shapes for all source tensors + +### JSON (`.json`) + +Machine-readable format suitable for the Python analysis tool. Contains: + +- `version`: Format version (currently `2`) +- `backends[]`: Backend metadata (name, device, device type) +- `records[]`: Every profiling event with: + - `type`: `0` = OP, `1` = COPY + - `name`: Operation name (e.g. `"MUL_MAT"`, `"copy_H2D"`) + - `backend_id`, `split_id`: Scheduler indices + - `start_ns`, `duration_ns`: Timing in nanoseconds + - `bytes`: Output tensor size (OPs) or transfer size (COPYs) + - `extra`: Fusion name for fused ops, or `null` + - `ne_src0`, `ne_src1`, `ne_src2`: Source tensor dimensions (4-element arrays) + +`ne_src2` is populated only for `MUL_MAT_ID` (expert selection indices); it is `[0,0,0,0]` for all other ops. + +## Python Analysis Tool + +The `tools/profiler/profiler.py` script reads JSON exports and produces analysis reports and visualizations. + +### Basic usage + +```bash +# Print summary +python -m tools.profiler.profiler profile.json + +# Show top 10 operations by time +python -m tools.profiler.profiler profile.json --top-ops 10 + +# Show top 10 longest individual kernels +python -m tools.profiler.profiler profile.json --top-kernels 10 + +# Show inefficiency ranking (highest time-per-byte) +python -m tools.profiler.profiler profile.json --inefficiency +``` + +### Export visualizations + +```bash +# Interactive HTML timeline (self-contained, no dependencies) +python -m tools.profiler.profiler profile.json --html-viewer timeline.html + +# Chrome Trace format (open in chrome://tracing or Perfetto) +python -m tools.profiler.profiler profile.json --chrome-trace trace.json + +# Downsample large traces for the HTML viewer +python -m tools.profiler.profiler profile.json --html-viewer timeline.html --html-max-records 50000 +``` + +Multiple exports can be combined in a single invocation: + +```bash +python -m tools.profiler.profiler profile.json \
--html-viewer timeline.html --chrome-trace trace.json --top-ops 20 +``` + +### CLI reference + +| Argument | Description | +|----------|-------------| +| `profile` (positional) | Path to profiler JSON file | +| `--chrome-trace FILE` | Export Chrome Trace Event format | +| `--html-viewer FILE` | Export interactive HTML timeline | +| `--html-max-records N` | Limit records in HTML output (0 = unlimited) | +| `--top-ops N` | Show top N operations by total time | +| `--top-kernels N` | Show top N longest individual kernels | +| `--inefficiency` | Rank operations by time per byte (higher = worse) | + +### HTML viewer features + +The HTML viewer is a self-contained file with no external dependencies: + +- **Canvas timeline** with per-backend lanes and color-coded operations +- **Zoom controls** (1s / 100ms / 1ms / 100us) and mouse drag navigation +- **Minimap** showing the full trace with a viewport indicator +- **Hover tooltips** with operation name, duration, shape, and bytes +- **Stats table** with collapsible tree: Operation → Backend → Tensor shape, showing % time, count, avg/min/max, and bandwidth +- **Legend** showing the most frequent operation types + +## What Gets Measured + +### OP events + +Every tensor operation (MUL_MAT, ADD, UNARY, FLASH_ATTN_EXT, etc.) is recorded with: + +- **Timing**: Start/end timestamps (nanosecond precision) +- **Bytes**: Output tensor size (`ggml_nbytes(node)`) +- **Tensor shapes**: Dimensions of `src[0]`, `src[1]`, and `src[2]` (when applicable) +- **Bandwidth**: Computed as `bytes / duration` — useful for identifying memory-bound vs compute-bound operations + +### COPY events + +Data transfers between backends: + +- **Direction**: `copy_H2D` (host→device), `copy_D2H` (device→host), `copy_D2D` (device→device) +- **Bytes**: Exact transfer size +- **Bandwidth**: Transfer throughput + +### MoE weight copies + +When `--cpu-moe` is used, the scheduler selectively copies only the active experts. 
These partial copies are recorded as individual COPY events with the actual bytes transferred. + +## Programmatic API + +For custom applications, the profiler can be controlled through the C API defined in `ggml/include/ggml-profiler.h`: + +```c +// Enable profiling on a scheduler +ggml_backend_sched_set_profiling(sched, true); + +// ... run inference ... + +// Get raw records +const ggml_profile_record * records; +int n = ggml_backend_sched_get_profiling_records(sched, &records); + +// Or export directly +ggml_backend_sched_print_profiling(sched); // stdout +ggml_backend_sched_export_profiling_json(sched, "profile.json"); // JSON file +ggml_backend_sched_export_profiling_text(sched, "profile.txt"); // text file +ggml_backend_sched_write_profiling_json(sched, fp); // JSON to FILE* +ggml_backend_sched_write_profiling_text(sched, fp); // text to FILE* + +// Reset for next measurement window +ggml_backend_sched_reset_profiling(sched); +``` + +Records accumulate across multiple `graph_compute` calls until explicitly reset or the scheduler is freed. + +## Tips + +- **Prompt eval vs generation**: The profiler captures all graph computes. During prompt evaluation you'll see larger batch sizes in tensor shapes; during generation, batch size is typically 1-2. +- **Vulkan concurrent mode**: When Vulkan dispatches multiple operations concurrently, they are reported as a single combined record spanning the full GPU time interval. +- **Bandwidth interpretation**: For compute ops, bandwidth = `output_bytes / duration`. This is not memory bandwidth — it's a proxy for throughput. MUL_MAT with low bandwidth typically indicates compute-bound behavior; high bandwidth indicates memory-bound. +- **Large traces**: For long inference runs, the JSON can be large. Use `--html-max-records` to downsample the HTML viewer, or use Chrome Trace format which handles large files well. 
+- **Multiple backends**: Backend IDs in the output correspond to the scheduler's priority order (0 = highest priority, typically GPU; last = CPU). diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 73961cd52d..fa955b68bb 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1525,7 +1525,8 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id, split_id, - copy_start, copy_end, ggml_nbytes(input), NULL, {0}, {0}, {0} }); + copy_start, copy_end, ggml_nbytes(input), input->name, + {input->ne[0], input->ne[1], input->ne[2], input->ne[3]}, {0}, {0} }); } else { ggml_backend_tensor_copy(input, input_cpy); } @@ -1645,7 +1646,8 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id, split_id, moe_copy_start, moe_copy_end, - (uint64_t) total_copied_bytes, NULL, {0}, {0}, {0} }); + (uint64_t) total_copied_bytes, input->name, + {input->ne[0], input->ne[1], input->ne[2], input->ne[3]}, {0}, {0} }); } } else { // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events @@ -1681,7 +1683,8 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id, - split_id, copy_start, copy_end, ggml_nbytes(input), NULL, {0}, {0}, {0} }); + split_id, copy_start, copy_end, ggml_nbytes(input), input->name, + {input->ne[0], input->ne[1], input->ne[2], input->ne[3]}, {0}, {0} }); } else { ggml_backend_tensor_copy(input, input_cpy); } @@ -1701,7 +1704,8 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, 
split_backend_id, - split_id, copy_start, copy_end, ggml_nbytes(input), NULL, {0}, {0}, {0} }); + split_id, copy_start, copy_end, ggml_nbytes(input), input->name, + {input->ne[0], input->ne[1], input->ne[2], input->ne[3]}, {0}, {0} }); } } }