diff --git a/common/arg.cpp b/common/arg.cpp index 538d2a4b0a..632c89edce 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1073,6 +1073,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.completion = true; } )); + add_opt(common_arg( + {"--profile"}, + "enable cross-backend profiling (CPU, BLAS, CUDA)", + [](common_params & params) { + params.profiling = true; + } + ).set_examples({LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_DEBUG})); + add_opt(common_arg( + {"--profile-output"}, "FNAME", + "write profiling JSON output to FNAME (default: stdout)", + [](common_params & params, const std::string & value) { + params.profiling = true; + params.profiling_output = value; + } + ).set_examples({LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_DEBUG})); add_opt(common_arg( {"--verbose-prompt"}, string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? 
"true" : "false"), diff --git a/common/common.cpp b/common/common.cpp index 60396af1f8..17e22cec64 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2,6 +2,7 @@ #include "gguf.h" #include "common.h" +#include "ggml-profiler.h" #include "log.h" #include "llama.h" #include "sampling.h" @@ -1236,6 +1237,14 @@ common_init_result::common_init_result(common_params & params) : return; } + if (params.profiling) { + ggml_backend_sched_t sched = llama_context_get_sched(lctx); + if (sched != nullptr) { + ggml_backend_sched_set_profiling(sched, true); + LOG_INF("%s: profiling enabled\n", __func__); + } + } + pimpl->context.reset(lctx); } diff --git a/common/common.h b/common/common.h index 17dc3fb232..72f6320348 100644 --- a/common/common.h +++ b/common/common.h @@ -3,6 +3,7 @@ #pragma once #include "ggml-opt.h" +#include "ggml-profiler.h" #include "ggml.h" #include "llama-cpp.h" @@ -669,6 +670,10 @@ struct common_params { bool spm_infill = false; // suffix/prefix/middle pattern for infill + // profiling + bool profiling = false; // enable cross-backend profiling + std::string profiling_output; // path to write profiling JSON output (empty = stdout) + // batched-bench params bool batched_bench_output_jsonl = false; diff --git a/docs/cross-profiler.md b/docs/cross-profiler.md new file mode 100644 index 0000000000..ad90b9e20a --- /dev/null +++ b/docs/cross-profiler.md @@ -0,0 +1,225 @@ +# Cross-Backend Profiler + +llama.cpp includes a built-in cross-backend profiler that captures per-operation timing, data transfer costs, and tensor shapes across all compute backends. It works with any application built on the ggml scheduler — no source changes needed. 
+ +## Supported Backends + +| Backend | Status | Timing method | +|---------|--------|---------------| +| CPU | Supported | Wall-clock (`CLOCK_MONOTONIC_RAW`) | +| CUDA | Supported | `cudaEvent` GPU timestamps | +| Vulkan | Supported | GPU timestamp queries | +| BLAS | Supported | Wall-clock | +| Metal | Not yet supported | — | +| OpenCL | Not yet supported | — | + +The scheduler also profiles **data copies** (H2D, D2H, D2D) between backends regardless of which backends have native profiler support. + +## Enabling the Profiler + +There are two independent ways to enable profiling. They can be used separately or together. + +### CLI flags (`--profile`, `--profile-output`) + +Available in `llama-cli`, `llama-completion`, `llama-server`, and `debug`: + +```bash +# Print summary to stdout +llama-completion -m model.gguf --profile -p "Hello world" + +# Export to JSON +llama-completion -m model.gguf --profile --profile-output profile.json -p "Hello world" + +# Export to plain text +llama-completion -m model.gguf --profile --profile-output profile.txt -p "Hello world" +``` + +The output format is chosen by file extension: `.json` for JSON, `.txt` for plain text. Any other extension defaults to JSON. + +### Environment variable (`GGML_PROFILE`) + +The `GGML_PROFILE` environment variable enables profiling at the ggml scheduler level. This works with **any** application that uses the scheduler — including third-party tools like `sd.cpp` — without CLI flag support. 
+ +```bash +# Print summary to stdout +GGML_PROFILE=1 llama-completion -m model.gguf -p "Hello world" + +# Export JSON +GGML_PROFILE=profile.json llama-completion -m model.gguf -p "Hello world" + +# Export plain text +GGML_PROFILE=profile.txt llama-completion -m model.gguf -p "Hello world" + +# Works with any ggml-based application +GGML_PROFILE=1 sd -m model.gguf -p "a cat" +``` + +| Value | Behavior | +|-------|----------| +| `1`, `stdout`, or empty | Print summary to stdout | +| `path.json` | Export JSON to file | +| `path.txt` | Export plain text to file | +| Any other path | Export JSON to file | + +The export happens automatically when the scheduler is freed (typically at program exit). + +## Output Formats + +### Console summary (stdout) + +The default when `--profile` is used without `--profile-output`, or `GGML_PROFILE=1`: + +``` +=== Profiling Summary === + [OP ] backend 0 MUL_MAT 45.2% count=1200 total= 120.50 ms avg= 100.42 us ... 12.30 GB/s [4096 x 4096] + [OP ] backend 1 MUL_MAT_ID 30.1% count= 600 total= 80.20 ms avg= 133.67 us ... 0.08 GB/s [2688 x 1856 x 128] + [COPY] backend 0 copy_H2D 5.3% count= 200 total= 14.10 ms avg= 70.50 us ... 2.50 GB/s + ... +``` + +Each line shows: event type (OP or COPY), backend index, operation name, percentage of total time, call count, timing stats, bandwidth, and representative tensor shape. + +### Plain text (`.txt`) + +A more detailed report with three sections: + +1. **Profiling Summary** — total time, record count, unique ops +2. **Per-Backend Summary** — ops and copies per backend with aggregate bandwidth +3. **Operations table** — full breakdown with bandwidth and tensor shapes for all source tensors + +### JSON (`.json`) + +Machine-readable format suitable for the Python analysis tool. 
Contains: + +- `version`: Format version (currently `2`) +- `backends[]`: Backend metadata (name, device, device type) +- `records[]`: Every profiling event with: + - `type`: `0` = OP, `1` = COPY + - `name`: Operation name (e.g. `"MUL_MAT"`, `"copy_H2D"`) + - `backend_id`, `split_id`: Scheduler indices + - `start_ns`, `duration_ns`: Timing in nanoseconds + - `bytes`: Output tensor size (OPs) or transfer size (COPYs) + - `extra`: Fusion name for fused ops, or `null` + - `ne_src0`, `ne_src1`, `ne_src2`: Source tensor dimensions (4-element arrays) + +`ne_src2` is populated only for `MUL_MAT_ID` (expert selection indices); it is `[0,0,0,0]` for all other ops. + +## Python Analysis Tool + +The `tools/profiler/profiler.py` script reads JSON exports and produces analysis reports and visualizations. + +### Basic usage + +```bash +# Print summary +python -m tools.profiler.profiler profile.json + +# Show top 10 operations by time +python -m tools.profiler.profiler profile.json --top-ops 10 + +# Show top 10 longest individual kernels +python -m tools.profiler.profiler profile.json --top-kernels 10 + +# Show inefficiency ranking (highest time-per-byte) +python -m tools.profiler.profiler profile.json --inefficiency +``` + +### Export visualizations + +```bash +# Interactive HTML timeline (self-contained, no dependencies) +python -m tools.profiler.profiler profile.json --html-viewer timeline.html + +# Chrome Trace format (open in chrome://tracing or Perfetto) +python -m tools.profiler.profiler profile.json --chrome-trace trace.json + +# Downsample large traces for the HTML viewer +python -m tools.profiler.profiler profile.json --html-viewer timeline.html --html-max-records 50000 +``` + +Multiple exports can be combined in a single invocation: + +```bash +python -m tools.profiler.profiler profile.json --html-viewer timeline.html --chrome-trace trace.json --top-ops 20 +``` + +### CLI reference + +| Argument | Description | +|----------|-------------| +| `profile` (positional) | 
Path to profiler JSON file | +| `--chrome-trace FILE` | Export Chrome Trace Event format | +| `--html-viewer FILE` | Export interactive HTML timeline | +| `--html-max-records N` | Limit records in HTML output (0 = unlimited) | +| `--top-ops N` | Show top N operations by total time | +| `--top-kernels N` | Show top N longest individual kernels | +| `--inefficiency` | Rank operations by time per byte (higher = worse) | + +### HTML viewer features + +The HTML viewer is a self-contained file with no external dependencies: + +- **Canvas timeline** with per-backend lanes and color-coded operations +- **Zoom controls** (1s / 100ms / 1ms / 100us) and mouse drag navigation +- **Minimap** showing the full trace with a viewport indicator +- **Hover tooltips** with operation name, duration, shape, and bytes +- **Stats table** with collapsible tree: Operation → Backend → Tensor shape, showing % time, count, avg/min/max, and bandwidth +- **Legend** showing the most frequent operation types + +## What Gets Measured + +### OP events + +Every tensor operation (MUL_MAT, ADD, UNARY, FLASH_ATTN_EXT, etc.) is recorded with: + +- **Timing**: Start/end timestamps (nanosecond precision) +- **Bytes**: Output tensor size (`ggml_nbytes(node)`) +- **Tensor shapes**: Dimensions of `src[0]`, `src[1]`, and `src[2]` (when applicable) +- **Bandwidth**: Computed as `bytes / duration` — useful for identifying memory-bound vs compute-bound operations + +### COPY events + +Data transfers between backends: + +- **Direction**: `copy_H2D` (host→device), `copy_D2H` (device→host), `copy_D2D` (device→device) +- **Bytes**: Exact transfer size +- **Bandwidth**: Transfer throughput + +### MoE weight copies + +When `--cpu-moe` is used, the scheduler selectively copies only the active experts. These partial copies are recorded as individual COPY events with the actual bytes transferred. 
+ +## Programmatic API + +For custom applications, the profiler can be controlled through the C API defined in `ggml/include/ggml-profiler.h`: + +```c +// Enable profiling on a scheduler +ggml_backend_sched_set_profiling(sched, true); + +// ... run inference ... + +// Get raw records +const ggml_profile_record * records; +int n = ggml_backend_sched_get_profiling_records(sched, &records); + +// Or export directly +ggml_backend_sched_print_profiling(sched); // stdout +ggml_backend_sched_export_profiling_json(sched, "profile.json"); // JSON file +ggml_backend_sched_export_profiling_text(sched, "profile.txt"); // text file +ggml_backend_sched_write_profiling_json(sched, fp); // JSON to FILE* +ggml_backend_sched_write_profiling_text(sched, fp); // text to FILE* + +// Reset for next measurement window +ggml_backend_sched_reset_profiling(sched); +``` + +Records accumulate across multiple `graph_compute` calls until explicitly reset or the scheduler is freed. + +## Tips + +- **Prompt eval vs generation**: The profiler captures all graph computes. During prompt evaluation you'll see larger batch sizes in tensor shapes; during generation, batch size is typically 1-2. +- **Vulkan concurrent mode**: When Vulkan dispatches multiple operations concurrently, they are reported as a single combined record spanning the full GPU time interval. +- **Bandwidth interpretation**: For compute ops, bandwidth = `output_bytes / duration`. This is not memory bandwidth — it's a proxy for throughput. MUL_MAT with low bandwidth typically indicates compute-bound behavior; high bandwidth indicates memory-bound. +- **Large traces**: For long inference runs, the JSON can be large. Use `--html-max-records` to downsample the HTML viewer, or use Chrome Trace format which handles large files well. +- **Multiple backends**: Backend IDs in the output correspond to the scheduler's priority order (0 = highest priority, typically GPU; last = CPU). 
diff --git a/examples/debug/debug.cpp b/examples/debug/debug.cpp index ec80be19ba..fe2b883c12 100644 --- a/examples/debug/debug.cpp +++ b/examples/debug/debug.cpp @@ -244,6 +244,29 @@ int main(int argc, char ** argv) { return 1; } + // Export profiling data if profiling was enabled + if (params.profiling) { + ggml_backend_sched_t sched = llama_context_get_sched(ctx); + if (sched != nullptr) { + if (params.profiling_output.empty()) { + ggml_backend_sched_print_profiling(sched); + } else { + const std::string & path = params.profiling_output; + int ret; + if (path.size() >= 4 && path.compare(path.size() - 4, 4, ".txt") == 0) { + ret = ggml_backend_sched_export_profiling_text(sched, path.c_str()); + } else { + ret = ggml_backend_sched_export_profiling_json(sched, path.c_str()); + } + if (ret == 0) { + LOG("\nProfiling data exported to: %s\n", path.c_str()); + } else { + LOG_ERR("\nFailed to export profiling data to: %s\n", path.c_str()); + } + } + } + } + LOG("\n"); llama_perf_context_print(ctx); diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h index e3e067c916..cbbac0b663 100644 --- a/ggml/include/ggml-cpu.h +++ b/ggml/include/ggml-cpu.h @@ -22,6 +22,24 @@ extern "C" { // use only reference implementations bool use_ref; + + // profiler context (set by backend when profiling is enabled, NULL otherwise) + // when non-NULL, the compute loop will record per-node timing + void * profiling_context; + + // callback for recording a profile record from C code (set by backend when profiling) + // params: context, type, name, split_id, start_ns, end_ns, bytes, extra, ne_src0[4], ne_src1[4], ne_src2[4] + void (*profiling_record_fn)(void * context, + int type, + const char * name, + int split_id, + uint64_t start_ns, + uint64_t end_ns, + uint64_t bytes, + const char * extra, + const int64_t ne_src0[4], + const int64_t ne_src1[4], + const int64_t ne_src2[4]); }; // numa strategies diff --git a/ggml/include/ggml-profiler.h b/ggml/include/ggml-profiler.h new file mode 
100644 index 0000000000..2328f6b49f --- /dev/null +++ b/ggml/include/ggml-profiler.h @@ -0,0 +1,112 @@ +#pragma once + +#include "ggml-backend.h" +#include "ggml.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// +// Profiler +// + +// Profile event types +enum ggml_profile_event_type { + GGML_PROFILE_EVENT_OP, // single operation execution (computation kernel) + GGML_PROFILE_EVENT_COPY, // data transfer between devices +}; + +// A single profiling record representing a timed interval +typedef struct ggml_profile_record { + enum ggml_profile_event_type type; + const char * name; // operation name (e.g., "mul_mat", "copy_H2D") + int backend_id; // scheduler's backend index (0 = highest priority) + int split_id; // which graph split (0..n_splits-1) + uint64_t start_ns; // start timestamp in nanoseconds + uint64_t end_ns; // end timestamp in nanoseconds + uint64_t bytes; // bytes transferred (for copy) or tensor size (for ops) + const char * extra; // fusion name for fused ops, or NULL + int64_t ne_src0[4]; // src[0] tensor dimensions (e.g. weight matrix for MUL_MAT) + int64_t ne_src1[4]; // src[1] tensor dimensions (e.g. input matrix for MUL_MAT) + int64_t ne_src2[4]; // src[2] tensor dimensions (e.g. 
ids for MUL_MAT_ID) +} ggml_profile_record; + +// Backend profiler interface - each backend optionally implements this +// to provide fine-grained operation timing +struct ggml_backend_profiler { + void * context; // backend-specific profiler context + + // Enable or disable profiling on this backend + void (*enable)(void * context, bool enable); + + // Clear all recorded data + void (*reset)(void * context); + + // Set the current split ID (called by scheduler before graph_compute) + void (*set_split_id)(void * context, int split_id); + + // Get recorded profiling data + // Returns the number of records; sets *out to point to internal storage + // The returned pointer remains valid until the next reset or disable call + int (*get_records)(void * context, const ggml_profile_record ** out); + + // Free the profiler context + void (*free_context)(void * context); +}; + +typedef struct ggml_backend_profiler * ggml_backend_profiler_t; + +// Register a profiler on a backend (called by backend during init) +// The profiler is owned by the backend and will be freed when the backend is freed +GGML_API void ggml_backend_set_profiler(ggml_backend_t backend, ggml_backend_profiler_t profiler); + +// Get the profiler associated with a backend (returns NULL if none) +GGML_API ggml_backend_profiler_t ggml_backend_get_profiler(ggml_backend_t backend); + +// +// Scheduler profiling API +// + +// Enable or disable profiling on a scheduler +// When enabled, the scheduler will: +// - Time data copy operations between backends +// - Enable profiling on all backends that support it +// - Collect profiling records from all backends after each graph compute +GGML_API void ggml_backend_sched_set_profiling(ggml_backend_sched_t sched, bool enable); + +// Check if profiling is enabled on a scheduler +GGML_API bool ggml_backend_sched_get_profiling(ggml_backend_sched_t sched); + +// Get profiling data from the last graph compute +// Records are owned by the scheduler; valid until the next 
compute or reset +// Returns the number of records +GGML_API int ggml_backend_sched_get_profiling_records(ggml_backend_sched_t sched, const ggml_profile_record ** records); + +// Print a human-readable summary of the last profiling run to stdout +// Groups records by operation name and shows total/count/min/max/avg time +GGML_API void ggml_backend_sched_print_profiling(ggml_backend_sched_t sched); + +// Reset profiling data (clear all recorded data) +GGML_API void ggml_backend_sched_reset_profiling(ggml_backend_sched_t sched); + +// Get current time in nanoseconds (for manual profiling if needed) +GGML_API uint64_t ggml_profiler_time_ns(void); + +// Export profiling data as JSON to a file +// Returns 0 on success, -1 on error +GGML_API int ggml_backend_sched_export_profiling_json(ggml_backend_sched_t sched, const char * filepath); + +// Export profiling data as JSON to a FILE pointer +GGML_API int ggml_backend_sched_write_profiling_json(ggml_backend_sched_t sched, FILE * fp); + +// Export profiling data as plain text statistics to a file +// Returns 0 on success, -1 on error +GGML_API int ggml_backend_sched_export_profiling_text(ggml_backend_sched_t sched, const char * filepath); + +// Export profiling data as plain text statistics to a FILE pointer +GGML_API int ggml_backend_sched_write_profiling_text(ggml_backend_sched_t sched, FILE * fp); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 78853304d9..cf8f4d0788 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -195,12 +195,14 @@ add_library(ggml-base ../include/ggml-backend.h ../include/ggml-cpp.h ../include/ggml-opt.h + ../include/ggml-profiler.h ../include/gguf.h ggml.c ggml.cpp ggml-alloc.c ggml-backend.cpp ggml-opt.cpp + ggml-profiler.cpp ggml-threading.cpp ggml-threading.h ggml-quants.c diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h index 59190b7c46..80feaaf384 100644 --- a/ggml/src/ggml-backend-impl.h +++ 
b/ggml/src/ggml-backend-impl.h @@ -3,6 +3,7 @@ // ggml-backend internal header #include "ggml-backend.h" +#include "ggml-profiler.h" #ifdef __cplusplus extern "C" { @@ -124,6 +125,9 @@ extern "C" { struct ggml_backend_i iface; ggml_backend_dev_t device; void * context; + + // Optional profiler (set by backend during init, NULL if not profiling) + ggml_backend_profiler_t profiler; }; struct ggml_backend_event { diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 22c656996c..fa955b68bb 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -12,6 +12,7 @@ #include "ggml-backend-impl.h" #include "ggml-alloc.h" #include "ggml-impl.h" +#include "ggml-profiler.h" #include #include @@ -20,6 +21,7 @@ #include #include #include +#include #include #ifdef __APPLE__ @@ -231,6 +233,15 @@ void ggml_backend_free(ggml_backend_t backend) { return; } + // Clean up profiler if present (before backend frees its context) + if (backend->profiler != NULL) { + if (backend->profiler->free_context != NULL) { + backend->profiler->free_context(backend->profiler->context); + } + delete backend->profiler; + backend->profiler = NULL; + } + backend->iface.free(backend); } @@ -736,6 +747,20 @@ struct ggml_backend_sched { int debug_realloc; int debug_graph_size; int debug_prev_graph_size; + + // profiling + bool profiling_enabled; + std::string profiling_env_path; // GGML_PROFILE env var value (for auto-export on free) + std::vector<ggml_profile_record> copy_records; // copy events recorded by the scheduler + std::vector<ggml_profile_record> profiling_records; // merged records from all sources + + // Cached backend metadata for safe access during auto-export (backends may be freed first) + struct backend_meta { + std::string name; + std::string device; + int device_type; + }; + std::vector<backend_meta> profiling_backend_meta; }; #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor) @@ -1450,11 +1475,28 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
std::vector<int32_t> ids; std::vector<ggml_bitset_t> used_ids; + // Profiling: reset copy records for this compute pass + if (sched->profiling_enabled) { + sched->copy_records.clear(); + } + for (int split_id = 0; split_id < sched->n_splits; split_id++) { struct ggml_backend_sched_split * split = &splits[split_id]; int split_backend_id = split->backend_id; ggml_backend_t split_backend = sched->backends[split_backend_id]; + // Profiling: set split ID and enable backend profiling + if (sched->profiling_enabled) { + if (split_backend->profiler != NULL) { + if (split_backend->profiler->enable != NULL) { + split_backend->profiler->enable(split_backend->profiler->context, true); + } + if (split_backend->profiler->set_split_id != NULL) { + split_backend->profiler->set_split_id(split_backend->profiler->context, split_id); + } + } + } + // copy the input tensors to the split backend for (int input_id = 0; input_id < split->n_inputs; input_id++) { ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[input_id]); @@ -1468,7 +1510,26 @@ } else { ggml_backend_synchronize(split_backend); } - ggml_backend_tensor_copy(input, input_cpy); + if (sched->profiling_enabled) { + uint64_t copy_start = ggml_profiler_time_ns(); + ggml_backend_tensor_copy(input, input_cpy); + uint64_t copy_end = ggml_profiler_time_ns(); + + enum ggml_backend_dev_type src_type = ggml_backend_dev_type(input_backend->device); + enum ggml_backend_dev_type dst_type = ggml_backend_dev_type(split_backend->device); + const char * copy_dir = "copy_D2D"; + if (src_type == GGML_BACKEND_DEVICE_TYPE_CPU && dst_type != GGML_BACKEND_DEVICE_TYPE_CPU) { + copy_dir = "copy_H2D"; + } else if (src_type != GGML_BACKEND_DEVICE_TYPE_CPU && dst_type == GGML_BACKEND_DEVICE_TYPE_CPU) { + copy_dir = "copy_D2H"; + } + + sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id, split_id, + copy_start, copy_end,
ggml_nbytes(input), input->name, + {input->ne[0], input->ne[1], input->ne[2], input->ne[3]}, {0}, {0} }); + } else { + ggml_backend_tensor_copy(input, input_cpy); + } } else { // wait for the split backend to finish using the input before overwriting it if (sched->events[split_backend_id][sched->cur_copy] != NULL) { @@ -1525,12 +1586,14 @@ } // group consecutive experts and copy them together + size_t total_copied_bytes = 0; auto copy_experts = [&](int32_t first_id, int32_t last_id) { const size_t expert_offset = first_id * expert_size; const size_t expert_size_copy = (last_id - first_id + 1) * expert_size; const size_t padding = std::min<size_t>(expert_size, 512); const size_t padding_end = last_id < n_expert - 1 ? padding : 0; + total_copied_bytes += expert_size_copy + padding_end; ggml_backend_tensor_set_async(split_backend, input_cpy, (const uint8_t *)input->data + expert_offset, expert_offset, @@ -1539,6 +1602,11 @@ expert_size_copy + padding_end); }; + uint64_t moe_copy_start = 0; + if (sched->profiling_enabled) { + moe_copy_start = ggml_profiler_time_ns(); + } + int id = 0; while (!ggml_bitset_get(used_ids.data(), id)) { id++; @@ -1562,9 +1630,35 @@ last_id = id; } copy_experts(first_id, last_id); + + if (sched->profiling_enabled) { + uint64_t moe_copy_end = ggml_profiler_time_ns(); + + enum ggml_backend_dev_type src_type = ggml_backend_dev_type(input_backend->device); + enum ggml_backend_dev_type dst_type = ggml_backend_dev_type(split_backend->device); + const char * copy_dir = "copy_D2D"; + if (src_type == GGML_BACKEND_DEVICE_TYPE_CPU && dst_type != GGML_BACKEND_DEVICE_TYPE_CPU) { + copy_dir = "copy_H2D"; + } else if (src_type != GGML_BACKEND_DEVICE_TYPE_CPU && + dst_type == GGML_BACKEND_DEVICE_TYPE_CPU) { + copy_dir = 
"copy_D2H"; + } + + sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id, + split_id, moe_copy_start, moe_copy_end, + (uint64_t) total_copied_bytes, input->name, + {input->ne[0], input->ne[1], input->ne[2], input->ne[3]}, {0}, {0} }); + } } else { // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface + + // Capture timestamp before async attempt so we can record launch time + uint64_t copy_start = 0; + if (sched->profiling_enabled) { + copy_start = ggml_profiler_time_ns(); + } + if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) { ggml_backend_synchronize(input_backend); if (sched->events[split_backend_id][sched->cur_copy] != NULL) { @@ -1572,7 +1666,47 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } else { ggml_backend_synchronize(split_backend); } - ggml_backend_tensor_copy(input, input_cpy); + if (sched->profiling_enabled) { + // Re-take start after sync for accurate sync copy measurement + copy_start = ggml_profiler_time_ns(); + ggml_backend_tensor_copy(input, input_cpy); + uint64_t copy_end = ggml_profiler_time_ns(); + + enum ggml_backend_dev_type src_type = ggml_backend_dev_type(input_backend->device); + enum ggml_backend_dev_type dst_type = ggml_backend_dev_type(split_backend->device); + const char * copy_dir = "copy_D2D"; + if (src_type == GGML_BACKEND_DEVICE_TYPE_CPU && dst_type != GGML_BACKEND_DEVICE_TYPE_CPU) { + copy_dir = "copy_H2D"; + } else if (src_type != GGML_BACKEND_DEVICE_TYPE_CPU && + dst_type == GGML_BACKEND_DEVICE_TYPE_CPU) { + copy_dir = "copy_D2H"; + } + + sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id, + split_id, 
copy_start, copy_end, ggml_nbytes(input), input->name, + {input->ne[0], input->ne[1], input->ne[2], input->ne[3]}, {0}, {0} }); + } else { + ggml_backend_tensor_copy(input, input_cpy); + } + } else { + // async copy was launched — record the time spanning the async call + if (sched->profiling_enabled) { + uint64_t copy_end = ggml_profiler_time_ns(); + + enum ggml_backend_dev_type src_type = ggml_backend_dev_type(input_backend->device); + enum ggml_backend_dev_type dst_type = ggml_backend_dev_type(split_backend->device); + const char * copy_dir = "copy_D2D"; + if (src_type == GGML_BACKEND_DEVICE_TYPE_CPU && dst_type != GGML_BACKEND_DEVICE_TYPE_CPU) { + copy_dir = "copy_H2D"; + } else if (src_type != GGML_BACKEND_DEVICE_TYPE_CPU && + dst_type == GGML_BACKEND_DEVICE_TYPE_CPU) { + copy_dir = "copy_D2H"; + } + + sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id, + split_id, copy_start, copy_end, ggml_nbytes(input), input->name, + {input->ne[0], input->ne[1], input->ne[2], input->ne[3]}, {0}, {0} }); + } } } } @@ -1625,6 +1759,32 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } } + // Profiling: collect records from all backends and append to accumulated records + if (sched->profiling_enabled) { + // Collect backend operation records + for (int b = 0; b < sched->n_backends; b++) { + ggml_backend_t backend = sched->backends[b]; + if (backend->profiler != NULL && backend->profiler->get_records != NULL) { + const ggml_profile_record * backend_recs = NULL; + int count = backend->profiler->get_records(backend->profiler->context, &backend_recs); + for (int r = 0; r < count; r++) { + ggml_profile_record rec = backend_recs[r]; + rec.backend_id = b; // stamp correct scheduler backend index + sched->profiling_records.push_back(rec); + } + // Reset backend records (but keep profiling enabled for next compute) + if (backend->profiler->reset != NULL) { + backend->profiler->reset(backend->profiler->context); + } + 
} + } + + // Append copy records + for (const auto & rec : sched->copy_records) { + sched->profiling_records.push_back(rec); + } + } + return GGML_STATUS_SUCCESS; } @@ -1691,6 +1851,24 @@ ggml_backend_sched_t ggml_backend_sched_new( sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends); sched->op_offload = op_offload; + const char * profile_env = getenv("GGML_PROFILE"); + if (profile_env != NULL) { + sched->profiling_enabled = true; + sched->profiling_env_path = profile_env; + } + + // Cache backend metadata for safe access during auto-export + for (int b = 0; b < n_backends; b++) { + ggml_backend_sched::backend_meta meta; + meta.name = ggml_backend_name(backends[b]); + meta.device = "unknown"; + meta.device_type = 0; + if (backends[b]->device != NULL) { + meta.device = ggml_backend_dev_name(backends[b]->device); + meta.device_type = (int) ggml_backend_dev_type(backends[b]->device); + } + sched->profiling_backend_meta.push_back(std::move(meta)); + } ggml_backend_sched_reset(sched); @@ -1701,6 +1879,33 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { if (sched == NULL) { return; } + + // Auto-export profiling data if enabled via GGML_PROFILE env var + // GGML_PROFILE=1 or GGML_PROFILE="" → print to stdout + // GGML_PROFILE=file.json → export JSON + // GGML_PROFILE=file.txt → export text + if (!sched->profiling_records.empty() && getenv("GGML_PROFILE") != NULL) { + const std::string & path = sched->profiling_env_path; + if (path.empty() || path == "1" || path == "stdout") { + ggml_backend_sched_print_profiling(sched); + } else if (path.size() >= 4 && path.compare(path.size() - 4, 4, ".txt") == 0) { + int ret = ggml_backend_sched_export_profiling_text(sched, path.c_str()); + if (ret == 0) { + GGML_LOG_INFO("[profiler] Data exported to: %s\n", path.c_str()); + } else { + GGML_LOG_ERROR("[profiler] Failed to export data to: %s\n", path.c_str()); + } + } else { + // Default to JSON for any other path (including .json) + int ret = 
ggml_backend_sched_export_profiling_json(sched, path.c_str()); + if (ret == 0) { + GGML_LOG_INFO("[profiler] Data exported to: %s\n", path.c_str()); + } else { + GGML_LOG_ERROR("[profiler] Failed to export data to: %s\n", path.c_str()); + } + } + } + for (int b = 0; b < sched->n_backends; b++) { for (int c = 0; c < sched->n_copies; c++) { ggml_backend_event_free(sched->events[b][c]); @@ -2268,3 +2473,475 @@ ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned"); return ggml_backend_buffer_init(ggml_backend_cpu_buffer_from_ptr_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size); } + +// +// Scheduler profiling +// + +void ggml_backend_sched_set_profiling(ggml_backend_sched_t sched, bool enable) { + GGML_ASSERT(sched); + sched->profiling_enabled = enable; + + if (!enable) { + ggml_backend_sched_reset_profiling(sched); + } +} + +bool ggml_backend_sched_get_profiling(ggml_backend_sched_t sched) { + GGML_ASSERT(sched); + return sched->profiling_enabled; +} + +int ggml_backend_sched_get_profiling_records(ggml_backend_sched_t sched, const ggml_profile_record ** records) { + GGML_ASSERT(sched); + GGML_ASSERT(records != NULL); + + *records = sched->profiling_records.data(); + return (int) sched->profiling_records.size(); +} + +void ggml_backend_sched_reset_profiling(ggml_backend_sched_t sched) { + GGML_ASSERT(sched); + sched->profiling_records.clear(); + sched->copy_records.clear(); +} + +void ggml_backend_sched_print_profiling(ggml_backend_sched_t sched) { + GGML_ASSERT(sched); + + if (sched->profiling_records.empty()) { + GGML_LOG_INFO("[profiler] No profiling data available\n"); + return; + } + + GGML_LOG_INFO("\n=== Profiling Summary ===\n"); + + // Aggregate by (name, type, backend_id) + struct op_stats { + const char * name; + enum ggml_profile_event_type type; + int backend_id; + uint64_t total_ns; + uint64_t min_ns; + uint64_t max_ns; + int count; + 
uint64_t total_bytes; + int64_t representative_ne[4]; + }; + + std::vector stats; + for (const auto & rec : sched->profiling_records) { + bool found = false; + for (auto & s : stats) { + if (s.type == rec.type && s.backend_id == rec.backend_id && strcmp(s.name, rec.name) == 0) { + uint64_t dur = (rec.end_ns > rec.start_ns) ? (rec.end_ns - rec.start_ns) : 0; + s.total_ns += dur; + s.min_ns = std::min(s.min_ns, dur); + s.max_ns = std::max(s.max_ns, dur); + s.count++; + s.total_bytes += rec.bytes; + found = true; + break; + } + } + if (!found) { + uint64_t dur = (rec.end_ns > rec.start_ns) ? (rec.end_ns - rec.start_ns) : 0; + op_stats s; + s.name = rec.name; + s.type = rec.type; + s.backend_id = rec.backend_id; + s.total_ns = dur; + s.min_ns = dur; + s.max_ns = dur; + s.count = 1; + s.total_bytes = rec.bytes; + memcpy(s.representative_ne, rec.ne_src0, sizeof(s.representative_ne)); + stats.push_back(s); + } + } + + // Sort by total time descending + std::sort(stats.begin(), stats.end(), + [](const op_stats & a, const op_stats & b) { return a.total_ns > b.total_ns; }); + + uint64_t grand_total = 0; + for (const auto & s : stats) { + grand_total += s.total_ns; + } + + const char * type_str[] = { "OP ", "COPY" }; + for (const auto & s : stats) { + double pct = 100.0 * (double) s.total_ns / (double) grand_total; + double avg_us = (double) s.total_ns / (double) s.count / 1000.0; + double min_us = (double) s.min_ns / 1000.0; + double max_us = (double) s.max_ns / 1000.0; + + GGML_LOG_INFO( + " [%s] backend %d %-28s %7.1f%% count=%-6d total=%8.2f ms avg=%8.2f us min=%8.2f us max=%8.2f us", + type_str[s.type], s.backend_id, s.name, pct, s.count, (double) s.total_ns / 1e6, avg_us, min_us, max_us); + if (s.total_bytes > 0 && s.total_ns > 0) { + double bw_gbps = (double) s.total_bytes / (double) s.total_ns; + if (bw_gbps >= 1000.0) { + GGML_LOG_INFO(" %6.2f TB/s", bw_gbps / 1000.0); + } else { + GGML_LOG_INFO(" %6.2f GB/s", bw_gbps); + } + } + // Print representative tensor shape 
(first record's ne) + if (s.representative_ne[0] > 0 || s.representative_ne[1] > 0) { + GGML_LOG_INFO(" [%lld x %lld", (long long) s.representative_ne[0], (long long) s.representative_ne[1]); + if (s.representative_ne[2] > 1) { + GGML_LOG_INFO(" x %lld", (long long) s.representative_ne[2]); + } + if (s.representative_ne[3] > 1) { + GGML_LOG_INFO(" x %lld", (long long) s.representative_ne[3]); + } + GGML_LOG_INFO("]"); + } + GGML_LOG_INFO("\n"); + } + + GGML_LOG_INFO(" ---\n"); + GGML_LOG_INFO(" Total: %.2f ms (%d records, %d unique ops)\n\n", (double) grand_total / 1e6, + (int) sched->profiling_records.size(), (int) stats.size()); +} + +int ggml_backend_sched_write_profiling_json(ggml_backend_sched_t sched, FILE * fp) { + GGML_ASSERT(sched); + GGML_ASSERT(fp != NULL); + + uint64_t total_ns = 0; + for (const auto & rec : sched->profiling_records) { + total_ns += (rec.end_ns > rec.start_ns) ? (rec.end_ns - rec.start_ns) : 0; + } + + fprintf(fp, "{\n"); + fprintf(fp, " \"version\": 2,\n"); + fprintf(fp, " \"profiler\": \"ggml\",\n"); + fprintf(fp, " \"total_records\": %d,\n", (int) sched->profiling_records.size()); + fprintf(fp, " \"total_ns\": %llu,\n", (unsigned long long) total_ns); + + // Backend metadata (use cached data if available, fall back to live pointers) + fprintf(fp, " \"backends\": [\n"); + for (int b = 0; b < sched->n_backends; b++) { + const char * name = "unknown"; + const char * dev_name = "unknown"; + int dev_type = 0; + if (b < (int) sched->profiling_backend_meta.size()) { + name = sched->profiling_backend_meta[b].name.c_str(); + dev_name = sched->profiling_backend_meta[b].device.c_str(); + dev_type = sched->profiling_backend_meta[b].device_type; + } else if (sched->backends[b] != NULL) { + name = ggml_backend_name(sched->backends[b]); + if (sched->backends[b]->device != NULL) { + dev_name = ggml_backend_dev_name(sched->backends[b]->device); + dev_type = (int) ggml_backend_dev_type(sched->backends[b]->device); + } + } + fprintf(fp, " {\"id\": %d, 
\"name\": \"%s\", \"device\": \"%s\", \"device_type\": %d}%s\n", b, name, + dev_name, dev_type, (b < sched->n_backends - 1) ? "," : ""); + } + fprintf(fp, " ],\n"); + + // Records + fprintf(fp, " \"records\": [\n"); + + for (int i = 0; i < (int) sched->profiling_records.size(); i++) { + const auto & rec = sched->profiling_records[i]; + uint64_t duration_ns = (rec.end_ns > rec.start_ns) ? (rec.end_ns - rec.start_ns) : 0; + + fprintf(fp, + " {\"type\": %d, \"name\": \"%s\", \"backend_id\": %d, \"split_id\": %d, " + "\"start_ns\": %llu, \"duration_ns\": %llu, \"bytes\": %llu, \"extra\": ", + (int) rec.type, rec.name ? rec.name : "unknown", rec.backend_id, rec.split_id, + (unsigned long long) rec.start_ns, (unsigned long long) duration_ns, (unsigned long long) rec.bytes); + + if (rec.extra != NULL) { + fprintf(fp, "\"%s\"", rec.extra); + } else { + fprintf(fp, "null"); + } + + // Tensor dimensions (all source tensors) + fprintf(fp, ", \"ne_src0\": [%lld, %lld, %lld, %lld]", (long long) rec.ne_src0[0], (long long) rec.ne_src0[1], + (long long) rec.ne_src0[2], (long long) rec.ne_src0[3]); + fprintf(fp, ", \"ne_src1\": [%lld, %lld, %lld, %lld]", (long long) rec.ne_src1[0], (long long) rec.ne_src1[1], + (long long) rec.ne_src1[2], (long long) rec.ne_src1[3]); + fprintf(fp, ", \"ne_src2\": [%lld, %lld, %lld, %lld]", (long long) rec.ne_src2[0], (long long) rec.ne_src2[1], + (long long) rec.ne_src2[2], (long long) rec.ne_src2[3]); + + fprintf(fp, "}%s\n", (i < (int) sched->profiling_records.size() - 1) ? 
"," : ""); + } + + fprintf(fp, " ]\n"); + fprintf(fp, "}\n"); + + return 0; +} + +int ggml_backend_sched_export_profiling_json(ggml_backend_sched_t sched, const char * filepath) { + GGML_ASSERT(sched); + GGML_ASSERT(filepath != NULL); + + FILE * fp = fopen(filepath, "w"); + if (fp == NULL) { + GGML_LOG_ERROR("%s: failed to open %s for writing\n", __func__, filepath); + return -1; + } + + int ret = ggml_backend_sched_write_profiling_json(sched, fp); + fclose(fp); + + return ret; +} + +// Helper: format ne dimensions as string, e.g. "[4096, 4096, 1]" +static void fmt_ne(char * buf, size_t bufsize, const int64_t ne[4]) { + if (ne[0] == 0 && ne[1] == 0 && ne[2] == 0 && ne[3] == 0) { + buf[0] = '\0'; + return; + } + int ndims = 4; + while (ndims > 1 && ne[ndims - 1] <= 1) { + ndims--; + } + int pos = snprintf(buf, bufsize, "["); + for (int i = 0; i < ndims && pos < (int) bufsize - 1; i++) { + pos += snprintf(buf + pos, bufsize - pos, "%s%lld", i > 0 ? ", " : "", (long long) ne[i]); + } + snprintf(buf + pos, bufsize - pos, "]"); +} + +// Helper: format bandwidth as string +static void fmt_bandwidth(char * buf, size_t bufsize, uint64_t bytes, uint64_t ns) { + if (ns == 0 || bytes == 0) { + buf[0] = '\0'; + return; + } + double bw_gbps = (double) bytes / (double) ns; + if (bw_gbps >= 1000.0) { + snprintf(buf, bufsize, "%.2f TB/s", bw_gbps / 1000.0); + } else { + snprintf(buf, bufsize, "%.2f GB/s", bw_gbps); + } +} + +int ggml_backend_sched_write_profiling_text(ggml_backend_sched_t sched, FILE * fp) { + GGML_ASSERT(sched); + GGML_ASSERT(fp != NULL); + + if (sched->profiling_records.empty()) { + fprintf(fp, "No profiling data available.\n"); + return 0; + } + + // Aggregate by (name, type, backend_id) + struct op_stats { + const char * name; + enum ggml_profile_event_type type; + int backend_id; + uint64_t total_ns; + uint64_t min_ns; + uint64_t max_ns; + int count; + uint64_t total_bytes; + int64_t representative_ne_src0[4]; + int64_t representative_ne_src1[4]; + int64_t 
representative_ne_src2[4]; + }; + + std::vector stats; + for (const auto & rec : sched->profiling_records) { + uint64_t dur = (rec.end_ns > rec.start_ns) ? (rec.end_ns - rec.start_ns) : 0; + bool found = false; + for (auto & s : stats) { + if (s.type == rec.type && s.backend_id == rec.backend_id && strcmp(s.name, rec.name) == 0) { + s.total_ns += dur; + s.min_ns = std::min(s.min_ns, dur); + s.max_ns = std::max(s.max_ns, dur); + s.count++; + s.total_bytes += rec.bytes; + found = true; + break; + } + } + if (!found) { + op_stats s = {}; + s.name = rec.name; + s.type = rec.type; + s.backend_id = rec.backend_id; + s.total_ns = dur; + s.min_ns = dur; + s.max_ns = dur; + s.count = 1; + s.total_bytes = rec.bytes; + memcpy(s.representative_ne_src0, rec.ne_src0, sizeof(s.representative_ne_src0)); + memcpy(s.representative_ne_src1, rec.ne_src1, sizeof(s.representative_ne_src1)); + memcpy(s.representative_ne_src2, rec.ne_src2, sizeof(s.representative_ne_src2)); + stats.push_back(s); + } + } + + std::sort(stats.begin(), stats.end(), + [](const op_stats & a, const op_stats & b) { return a.total_ns > b.total_ns; }); + + uint64_t grand_total = 0; + for (const auto & s : stats) { + grand_total += s.total_ns; + } + + // --- Section 1: Overall summary --- + fprintf(fp, "=== Profiling Summary ===\n"); + fprintf(fp, "Total time: %.2f ms\n", (double) grand_total / 1e6); + fprintf(fp, "Total records: %d\n", (int) sched->profiling_records.size()); + fprintf(fp, "Unique ops: %d\n\n", (int) stats.size()); + + // --- Section 2: Per-backend breakdown --- + fprintf(fp, "=== Per-Backend Summary ===\n"); + { + struct backend_stats { + int backend_id; + int op_count; + int copy_count; + uint64_t op_ns; + uint64_t copy_ns; + uint64_t op_bytes; + uint64_t copy_bytes; + }; + std::vector bstats; + for (const auto & s : stats) { + bool found = false; + for (auto & bs : bstats) { + if (bs.backend_id == s.backend_id) { + if (s.type == GGML_PROFILE_EVENT_OP) { + bs.op_count += s.count; + bs.op_ns += 
s.total_ns; + bs.op_bytes += s.total_bytes; + } else { + bs.copy_count += s.count; + bs.copy_ns += s.total_ns; + bs.copy_bytes += s.total_bytes; + } + found = true; + break; + } + } + if (!found) { + backend_stats bs = {}; + bs.backend_id = s.backend_id; + if (s.type == GGML_PROFILE_EVENT_OP) { + bs.op_count = s.count; + bs.op_ns = s.total_ns; + bs.op_bytes = s.total_bytes; + } else { + bs.copy_count = s.count; + bs.copy_ns = s.total_ns; + bs.copy_bytes = s.total_bytes; + } + bstats.push_back(bs); + } + } + std::sort(bstats.begin(), bstats.end(), + [](const backend_stats & a, const backend_stats & b) { + return (a.op_ns + a.copy_ns) > (b.op_ns + b.copy_ns); + }); + + for (const auto & bs : bstats) { + uint64_t total = bs.op_ns + bs.copy_ns; + double pct = grand_total > 0 ? 100.0 * (double) total / (double) grand_total : 0; + + const char * bname = "unknown"; + if (bs.backend_id >= 0 && bs.backend_id < (int) sched->profiling_backend_meta.size()) { + bname = sched->profiling_backend_meta[bs.backend_id].name.c_str(); + } else if (bs.backend_id >= 0 && bs.backend_id < sched->n_backends && sched->backends[bs.backend_id] != NULL) { + bname = ggml_backend_name(sched->backends[bs.backend_id]); + } + + fprintf(fp, " Backend %d (%s): %.2f ms (%.1f%%)\n", bs.backend_id, bname, (double) total / 1e6, pct); + if (bs.op_count > 0) { + char bw_buf[32]; + fmt_bandwidth(bw_buf, sizeof(bw_buf), bs.op_bytes, bs.op_ns); + fprintf(fp, " OPs: %d calls, %.2f ms", bs.op_count, (double) bs.op_ns / 1e6); + if (bw_buf[0]) { + fprintf(fp, ", %s", bw_buf); + } + fprintf(fp, "\n"); + } + if (bs.copy_count > 0) { + char bw_buf[32]; + fmt_bandwidth(bw_buf, sizeof(bw_buf), bs.copy_bytes, bs.copy_ns); + fprintf(fp, " COPYs: %d calls, %.2f ms", bs.copy_count, (double) bs.copy_ns / 1e6); + if (bw_buf[0]) { + fprintf(fp, ", %s", bw_buf); + } + fprintf(fp, "\n"); + } + } + } + fprintf(fp, "\n"); + + // --- Section 3: Detailed operation table --- + fprintf(fp, "=== Operations (sorted by total time) 
===\n"); + fprintf(fp, "%-5s %4s %-28s %7s %6s %10s %10s %10s %10s %12s %s\n", + "TYPE", "BKND", "Operation", "%Time", "Count", "Total(ms)", "Avg(us)", "Min(us)", "Max(us)", "Bandwidth", "Tensors"); + fprintf(fp, "%-5s %4s %-28s %7s %6s %10s %10s %10s %10s %12s %s\n", + "-----", "----", "----------------------------", "-------", "------", + "----------", "----------", "----------", "----------", "------------", "-------"); + + const char * type_str[] = { "OP", "COPY" }; + for (const auto & s : stats) { + double pct = grand_total > 0 ? 100.0 * (double) s.total_ns / (double) grand_total : 0; + double avg_us = (double) s.total_ns / (double) s.count / 1000.0; + double min_us = (double) s.min_ns / 1000.0; + double max_us = (double) s.max_ns / 1000.0; + + char bw_buf[32] = ""; + fmt_bandwidth(bw_buf, sizeof(bw_buf), s.total_bytes, s.total_ns); + + char ne0_buf[64]; + char ne1_buf[64]; + char ne2_buf[64]; + fmt_ne(ne0_buf, sizeof(ne0_buf), s.representative_ne_src0); + fmt_ne(ne1_buf, sizeof(ne1_buf), s.representative_ne_src1); + fmt_ne(ne2_buf, sizeof(ne2_buf), s.representative_ne_src2); + + // Build tensor shapes string + char tensors_buf[256] = ""; + int tpos = 0; + if (ne0_buf[0]) { + tpos += snprintf(tensors_buf + tpos, sizeof(tensors_buf) - tpos, "%s", ne0_buf); + } + if (ne1_buf[0]) { + tpos += snprintf(tensors_buf + tpos, sizeof(tensors_buf) - tpos, " x %s", ne1_buf); + } + if (ne2_buf[0]) { + tpos += snprintf(tensors_buf + tpos, sizeof(tensors_buf) - tpos, " x %s", ne2_buf); + } + + fprintf(fp, "%-5s %4d %-28s %6.1f%% %6d %10.2f %10.2f %10.2f %10.2f %12s %s\n", + type_str[s.type], s.backend_id, s.name, pct, s.count, + (double) s.total_ns / 1e6, avg_us, min_us, max_us, + bw_buf, tensors_buf); + } + + fprintf(fp, "\nTotal: %.2f ms (%d records, %d unique ops)\n", (double) grand_total / 1e6, + (int) sched->profiling_records.size(), (int) stats.size()); + + return 0; +} + +int ggml_backend_sched_export_profiling_text(ggml_backend_sched_t sched, const char * filepath) { 
+ GGML_ASSERT(sched); + GGML_ASSERT(filepath != NULL); + + FILE * fp = fopen(filepath, "w"); + if (fp == NULL) { + GGML_LOG_ERROR("%s: failed to open %s for writing\n", __func__, filepath); + return -1; + } + + int ret = ggml_backend_sched_write_profiling_text(sched, fp); + fclose(fp); + + return ret; +} diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp index e7a1763b54..b68f45452e 100644 --- a/ggml/src/ggml-blas/ggml-blas.cpp +++ b/ggml/src/ggml-blas/ggml-blas.cpp @@ -1,6 +1,7 @@ #include "ggml-impl.h" #include "ggml-blas.h" #include "ggml-backend-impl.h" +#include "ggml-profiler.h" #include #include @@ -25,6 +26,11 @@ struct ggml_backend_blas_context { #ifndef GGML_USE_OPENMP std::vector> tasks; #endif + + // Profiling state + bool profiling_enabled = false; + int profiling_split_id = -1; + std::vector profiling_records; }; static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { @@ -232,6 +238,18 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, continue; } + // Skip view/identity ops + if (node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_VIEW || + node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE) { + continue; + } + + // Profiling: time this operation + uint64_t t_start = 0; + if (ctx->profiling_enabled) { + t_start = ggml_profiler_time_ns(); + } + switch (node->op) { case GGML_OP_MUL_MAT: ggml_backend_blas_mul_mat(ctx, node); @@ -241,16 +259,26 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, ggml_backend_blas_out_prod(ctx, node); break; - case GGML_OP_NONE: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - break; - default: GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node)); } + + if (ctx->profiling_enabled) { + uint64_t t_end = ggml_profiler_time_ns(); + ggml_profile_record rec; + rec.type = GGML_PROFILE_EVENT_OP; + 
rec.name = ggml_op_name(node->op); + rec.backend_id = 0; + rec.split_id = ctx->profiling_split_id; + rec.start_ns = t_start; + rec.end_ns = t_end; + rec.bytes = ggml_nbytes(node); + rec.extra = NULL; + if (node->src[0]) { memcpy(rec.ne_src0, node->src[0]->ne, sizeof(rec.ne_src0)); } else { memset(rec.ne_src0, 0, sizeof(rec.ne_src0)); } + if (node->src[1]) { memcpy(rec.ne_src1, node->src[1]->ne, sizeof(rec.ne_src1)); } else { memset(rec.ne_src1, 0, sizeof(rec.ne_src1)); } + if (node->op == GGML_OP_MUL_MAT_ID && node->src[2]) { memcpy(rec.ne_src2, node->src[2]->ne, sizeof(rec.ne_src2)); } else { memset(rec.ne_src2, 0, sizeof(rec.ne_src2)); } + ctx->profiling_records.push_back(rec); + } } return GGML_STATUS_SUCCESS; @@ -284,10 +312,11 @@ ggml_backend_t ggml_backend_blas_init(void) { ggml_backend_blas_context * ctx = new ggml_backend_blas_context; ggml_backend_t backend = new ggml_backend { - /* .guid = */ ggml_backend_blas_guid(), - /* .iface = */ blas_backend_i, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0), - /* .context = */ ctx, + /* .guid = */ ggml_backend_blas_guid(), + /* .iface = */ blas_backend_i, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0), + /* .context = */ ctx, + /* .profiler = */ nullptr, }; #if defined(GGML_BLAS_USE_OPENBLAS) && defined(GGML_USE_OPENMP) @@ -300,6 +329,44 @@ ggml_backend_t ggml_backend_blas_init(void) { GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__); #endif + // Register profiler + ggml_backend_blas_context * blas_ctx = ctx; // ctx is already defined above + + static auto blas_prof_enable = [](void * ctx, bool enable) { + auto * bctx = (ggml_backend_blas_context *) ctx; + bctx->profiling_enabled = enable; + if (!enable) { + bctx->profiling_records.clear(); + } + }; + static auto blas_prof_reset = [](void * ctx) { + auto * bctx = (ggml_backend_blas_context *) ctx; + bctx->profiling_records.clear(); + bctx->profiling_split_id 
= -1; + }; + static auto blas_prof_set_split_id = [](void * ctx, int split_id) { + auto * bctx = (ggml_backend_blas_context *) ctx; + bctx->profiling_split_id = split_id; + }; + static auto blas_prof_get_records = [](void * ctx, const ggml_profile_record ** out) -> int { + auto * bctx = (ggml_backend_blas_context *) ctx; + *out = bctx->profiling_records.data(); + return (int) bctx->profiling_records.size(); + }; + static auto blas_prof_free = [](void * ctx) { + (void) ctx; + }; + + auto * profiler = new ggml_backend_profiler{ + /* .context = */ blas_ctx, + /* .enable = */ blas_prof_enable, + /* .reset = */ blas_prof_reset, + /* .set_split_id = */ blas_prof_set_split_id, + /* .get_records = */ blas_prof_get_records, + /* .free_context = */ blas_prof_free, + }; + ggml_backend_set_profiler(backend, profiler); + return backend; } diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index 40fe3d82ec..002e3bb196 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -2985,7 +2985,8 @@ ggml_backend_t ggml_backend_cann_init(int32_t device) { new ggml_backend{ /* .guid = */ ggml_backend_cann_guid(), /* .interface = */ ggml_backend_cann_interface, /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device), - /* .context = */ ctx }; + /* .context = */ ctx, + /* .profiler = */ nullptr }; return cann_backend; } diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 7486acc2b5..d0fc4228ff 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -6,6 +6,7 @@ #include "traits.h" #include "ggml-cpu-impl.h" #include "ggml-impl.h" +#include "ggml-profiler.h" #include "quants.h" #include "ggml-threading.h" #include "unary-ops.h" @@ -1159,8 +1160,8 @@ static void ggml_compute_forward_mul_mat_one_chunk( const bool src1_cont = ggml_is_contiguous(src1); - ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot; - enum ggml_type const vec_dot_type = 
type_traits_cpu[type].vec_dot_type; + const ggml_vec_dot_t vec_dot = type_traits_cpu[type].vec_dot; + const enum ggml_type vec_dot_type = type_traits_cpu[type].vec_dot_type; // broadcast factors const int64_t r2 = ne12 / ne02; @@ -1244,9 +1245,9 @@ void ggml_compute_forward_mul_mat( const int ith = params->ith; const int nth = params->nth; - enum ggml_type const vec_dot_type = type_traits_cpu[src0->type].vec_dot_type; - ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float; - int64_t const vec_dot_num_rows = type_traits_cpu[src0->type].nrows; + const enum ggml_type vec_dot_type = type_traits_cpu[src0->type].vec_dot_type; + const ggml_from_float_t from_float = type_traits_cpu[vec_dot_type].from_float; + const int64_t vec_dot_num_rows = type_traits_cpu[src0->type].nrows; GGML_ASSERT(ne0 == ne01); GGML_ASSERT(ne1 == ne11); @@ -1455,8 +1456,8 @@ static void ggml_compute_forward_mul_mat_id_one_chunk( const enum ggml_type type = src0->type; - ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot; - enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; + const ggml_vec_dot_t vec_dot = type_traits_cpu[type].vec_dot; + const enum ggml_type vec_dot_type = type_traits_cpu[type].vec_dot_type; const int64_t blck_0 = 16; const int64_t blck_1 = 16; @@ -1523,8 +1524,8 @@ static void ggml_compute_forward_mul_mat_id( const bool src1_cont = ggml_is_contiguous(src1); - enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; - ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float; + const enum ggml_type vec_dot_type = type_traits_cpu[type].vec_dot_type; + const ggml_from_float_t from_float = type_traits_cpu[vec_dot_type].from_float; // we don't support permuted src0 or src1 GGML_ASSERT(nb00 == ggml_type_size(type)); @@ -2977,28 +2978,73 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { GGML_PRINT_DEBUG("thread #%d compute-start cplan %p last-graph %d\n", state->ith, (const void 
*)cplan, state->last_graph); #endif - for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) { - struct ggml_tensor * node = cgraph->nodes[node_n]; + // Profiling state + if (cplan->profiling_context != NULL && cplan->profiling_record_fn != NULL) { + for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) { + struct ggml_tensor * node = cgraph->nodes[node_n]; - if (ggml_op_is_empty(node->op)) { - // skip NOPs - continue; + if (ggml_op_is_empty(node->op)) { + continue; + } + + if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) { + continue; + } + + // Only thread 0 records timing (after barrier = total node time) + uint64_t t_start = 0; + if (state->ith == 0) { + t_start = ggml_profiler_time_ns(); + } + + ggml_compute_forward(¶ms, node); + + if (node_n + 1 < cgraph->n_nodes) { + ggml_barrier(state->threadpool); + } + + if (state->ith == 0) { + uint64_t t_end = ggml_profiler_time_ns(); + { + static const int64_t zero_ne[4] = {0, 0, 0, 0}; + const int64_t * src0_ne = node->src[0] ? node->src[0]->ne : zero_ne; + const int64_t * src1_ne = node->src[1] ? node->src[1]->ne : zero_ne; + const int64_t * src2_ne = (node->op == GGML_OP_MUL_MAT_ID && node->src[2]) ? 
node->src[2]->ne : zero_ne; + cplan->profiling_record_fn(cplan->profiling_context, 0 /* GGML_PROFILE_EVENT_OP */, + ggml_op_name(node->op), -1, t_start, t_end, ggml_nbytes(node), NULL, + src0_ne, src1_ne, src2_ne); + } + } + + if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed); + tp->ec = GGML_STATUS_ABORTED; + } } + } else { + for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) { + struct ggml_tensor * node = cgraph->nodes[node_n]; - if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) { - continue; - } + if (ggml_op_is_empty(node->op)) { + // skip NOPs + continue; + } - ggml_compute_forward(¶ms, node); + if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) { + continue; + } - if (state->ith == 0 && cplan->abort_callback && - cplan->abort_callback(cplan->abort_callback_data)) { - atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed); - tp->ec = GGML_STATUS_ABORTED; - } + ggml_compute_forward(¶ms, node); - if (node_n + 1 < cgraph->n_nodes) { - ggml_barrier(state->threadpool); + if (state->ith == 0 && cplan->abort_callback && + cplan->abort_callback(cplan->abort_callback_data)) { + atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed); + tp->ec = GGML_STATUS_ABORTED; + } + + if (node_n + 1 < cgraph->n_nodes) { + ggml_barrier(state->threadpool); + } } } diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index ddf1737a31..6fb9e1987c 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -1,6 +1,7 @@ #include "ggml-backend.h" #include "ggml-backend-impl.h" #include "ggml-cpu.h" +#include "ggml-profiler.h" #include "repack.h" #include "traits.h" #include "ggml-impl.h" @@ -107,6 +108,11 @@ struct ggml_backend_cpu_context { void * abort_callback_data; bool use_ref; // use reference implementation + + // 
Profiling state + bool profiling_enabled; + int profiling_split_id; + std::vector profiling_records; }; static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) { @@ -167,6 +173,46 @@ static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backe GGML_UNUSED(backend); } +// Callback function for recording CPU profiling events from C code (ggml-cpu.c) +static void ggml_cpu_profiler_record_callback(void * context, + int type, + const char * name, + int split_id, + uint64_t start_ns, + uint64_t end_ns, + uint64_t bytes, + const char * extra, + const int64_t ne_src0[4], + const int64_t ne_src1[4], + const int64_t ne_src2[4]) { + auto * cpu_ctx = (ggml_backend_cpu_context *) context; + ggml_profile_record rec; + rec.type = (enum ggml_profile_event_type) type; + rec.name = name; + rec.backend_id = 0; // will be overwritten by scheduler + rec.split_id = split_id != -1 ? split_id : cpu_ctx->profiling_split_id; + rec.start_ns = start_ns; + rec.end_ns = end_ns; + rec.bytes = bytes; + rec.extra = extra; + if (ne_src0) { + memcpy(rec.ne_src0, ne_src0, sizeof(rec.ne_src0)); + } else { + memset(rec.ne_src0, 0, sizeof(rec.ne_src0)); + } + if (ne_src1) { + memcpy(rec.ne_src1, ne_src1, sizeof(rec.ne_src1)); + } else { + memset(rec.ne_src1, 0, sizeof(rec.ne_src1)); + } + if (ne_src2) { + memcpy(rec.ne_src2, ne_src2, sizeof(rec.ne_src2)); + } else { + memset(rec.ne_src2, 0, sizeof(rec.ne_src2)); + } + cpu_ctx->profiling_records.push_back(rec); +} + static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; @@ -187,6 +233,9 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s cplan.abort_callback_data = cpu_ctx->abort_callback_data; cplan.use_ref = cpu_ctx->use_ref; + cplan.profiling_context = cpu_ctx->profiling_enabled ? 
cpu_ctx : NULL; + cplan.profiling_record_fn = cpu_ctx->profiling_enabled ? ggml_cpu_profiler_record_callback : NULL; + return ggml_graph_compute(cgraph, &cplan); } @@ -228,12 +277,15 @@ ggml_backend_t ggml_backend_cpu_init(void) { ctx->abort_callback = NULL; ctx->abort_callback_data = NULL; ctx->use_ref = false; + ctx->profiling_enabled = false; + ctx->profiling_split_id = -1; ggml_backend_t cpu_backend = new ggml_backend { - /* .guid = */ ggml_backend_cpu_guid(), - /* .iface = */ ggml_backend_cpu_i, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), - /* .context = */ ctx, + /* .guid = */ ggml_backend_cpu_guid(), + /* .iface = */ ggml_backend_cpu_i, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), + /* .context = */ ctx, + /* .profiler = */ nullptr, }; if (cpu_backend == NULL) { @@ -241,6 +293,43 @@ ggml_backend_t ggml_backend_cpu_init(void) { return NULL; } + // Register profiler + static auto cpu_prof_enable = [](void * ctx, bool enable) { + auto * cpu_ctx = (ggml_backend_cpu_context *) ctx; + cpu_ctx->profiling_enabled = enable; + if (!enable) { + cpu_ctx->profiling_records.clear(); + } + }; + static auto cpu_prof_reset = [](void * ctx) { + auto * cpu_ctx = (ggml_backend_cpu_context *) ctx; + cpu_ctx->profiling_records.clear(); + cpu_ctx->profiling_split_id = -1; + }; + static auto cpu_prof_set_split_id = [](void * ctx, int split_id) { + auto * cpu_ctx = (ggml_backend_cpu_context *) ctx; + cpu_ctx->profiling_split_id = split_id; + }; + static auto cpu_prof_get_records = [](void * ctx, const ggml_profile_record ** out) -> int { + auto * cpu_ctx = (ggml_backend_cpu_context *) ctx; + *out = cpu_ctx->profiling_records.data(); + return (int) cpu_ctx->profiling_records.size(); + }; + static auto cpu_prof_free = [](void * ctx) { + // Nothing to free - records are in the CPU context's vector + (void) ctx; + }; + + auto * profiler = new ggml_backend_profiler{ + /* .context = */ ctx, + /* .enable = */ cpu_prof_enable, + /* .reset 
= */ cpu_prof_reset, + /* .set_split_id = */ cpu_prof_set_split_id, + /* .get_records = */ cpu_prof_get_records, + /* .free_context = */ cpu_prof_free, + }; + ggml_backend_set_profiler(cpu_backend, profiler); + return cpu_backend; } diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 9affe02340..5f9db83cdc 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -1352,6 +1352,9 @@ struct ggml_cuda_stream_context { } }; +// Forward declaration for profiler state (defined in ggml-cuda.cu) +struct ggml_cuda_profiler_state; + struct ggml_backend_cuda_context { int device; std::string name; @@ -1447,6 +1450,9 @@ struct ggml_backend_cuda_context { ggml_cuda_pool & pool() { return pool(device); } + + // Profiling + ggml_cuda_profiler_state * profiler_state = nullptr; }; struct ggml_cuda_mm_fusion_args_host { diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 75b62129ad..85d1941dca 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -1,6 +1,7 @@ #include "ggml-cuda.h" #include "ggml-impl.h" #include "ggml-backend-impl.h" +#include "ggml-profiler.h" #include "ggml-cuda/common.cuh" #include "ggml-cuda/acc.cuh" @@ -86,6 +87,94 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); +// CUDA profiler state +struct ggml_cuda_profiler_state { + bool enabled = false; + int split_id = -1; + cudaStream_t stream = nullptr; + + static constexpr int MAX_PENDING_EVENTS = 4096; + std::vector start_events; + std::vector end_events; + std::vector cpu_timestamps; // CPU-side timestamps for global ordering + int event_count = 0; + + std::vector records; + std::vector record_event_indices; + + void init(cudaStream_t stream) { + this->stream = stream; + start_events.reserve(MAX_PENDING_EVENTS); + end_events.reserve(MAX_PENDING_EVENTS); + cpu_timestamps.reserve(MAX_PENDING_EVENTS); + } + + void reset() { + for (auto & ev : start_events) { + (void) 
cudaEventDestroy(ev); + } + for (auto & ev : end_events) { + (void) cudaEventDestroy(ev); + } + start_events.clear(); + end_events.clear(); + cpu_timestamps.clear(); + event_count = 0; + records.clear(); + record_event_indices.clear(); + } + + ~ggml_cuda_profiler_state() { + reset(); + } + + void record_start() { + cudaEvent_t ev; + (void) cudaEventCreate(&ev); + (void) cudaEventRecord(ev, stream); + start_events.push_back(ev); + cpu_timestamps.push_back(ggml_profiler_time_ns()); + event_count++; + } + + void record_end(const char * name, int backend_id, int split_id, uint64_t bytes, const char * extra, + const int64_t ne_src0[4], const int64_t ne_src1[4], const int64_t ne_src2[4]) { + cudaEvent_t ev; + (void) cudaEventCreate(&ev); + (void) cudaEventRecord(ev, stream); + end_events.push_back(ev); + record_event_indices.push_back(records.size()); + + ggml_profile_record rec; + rec.type = GGML_PROFILE_EVENT_OP; + rec.name = name; + rec.backend_id = backend_id; + rec.split_id = split_id; + rec.start_ns = 0; + rec.end_ns = 0; + rec.bytes = bytes; + rec.extra = extra; + if (ne_src0) { memcpy(rec.ne_src0, ne_src0, sizeof(rec.ne_src0)); } else { memset(rec.ne_src0, 0, sizeof(rec.ne_src0)); } + if (ne_src1) { memcpy(rec.ne_src1, ne_src1, sizeof(rec.ne_src1)); } else { memset(rec.ne_src1, 0, sizeof(rec.ne_src1)); } + if (ne_src2) { memcpy(rec.ne_src2, ne_src2, sizeof(rec.ne_src2)); } else { memset(rec.ne_src2, 0, sizeof(rec.ne_src2)); } + records.push_back(rec); + } + + void finalize() { + (void) cudaStreamSynchronize(stream); + + for (int i = 0; i < (int)record_event_indices.size(); i++) { + float ms = 0.0f; + (void) cudaEventElapsedTime(&ms, start_events[i], end_events[i]); + uint64_t duration_ns = (uint64_t)(ms * 1e6f); + int rec_idx = record_event_indices[i]; + // Use CPU-side timestamp for global ordering, GPU-measured duration for accuracy + records[rec_idx].start_ns = cpu_timestamps[i]; + records[rec_idx].end_ns = cpu_timestamps[i] + duration_ns; + } + } +}; + 
[[noreturn]] void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) { int id = -1; // in case cudaGetDevice fails @@ -4035,8 +4124,25 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud #else GGML_UNUSED(integrated); #endif // NDEBUG + if (cuda_ctx->profiler_state != nullptr && cuda_ctx->profiler_state->enabled) { + cuda_ctx->profiler_state->record_start(); + } bool ok = ggml_cuda_compute_forward(*cuda_ctx, node); + + if (cuda_ctx->profiler_state != nullptr && cuda_ctx->profiler_state->enabled) { + cuda_ctx->profiler_state->record_end( + ggml_op_name(node->op), + -1, + cuda_ctx->profiler_state->split_id, + ggml_nbytes(node), + nullptr, + node->src[0] ? node->src[0]->ne : nullptr, + node->src[1] ? node->src[1]->ne : nullptr, + (node->op == GGML_OP_MUL_MAT_ID && node->src[2]) ? node->src[2]->ne : nullptr + ); + } + if (!ok) { GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); } @@ -4107,6 +4213,19 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cuda_set_device(cuda_ctx->device); + // Disable CUDA graphs when profiling (we need per-node timing) + bool was_graph_enabled = false; + if (cuda_ctx->profiler_state != nullptr && cuda_ctx->profiler_state->enabled) { +#ifdef USE_CUDA_GRAPH + const void * graph_key = ggml_cuda_graph_get_key(cgraph); + ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key); + was_graph_enabled = graph->is_enabled(); + if (was_graph_enabled) { + graph->disable_due_to_gpu_arch = true; + } +#endif + } + bool use_cuda_graph = false; bool cuda_graph_update_required = false; const void * graph_key = nullptr; @@ -4158,6 +4277,15 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cuda_graph_evaluate_and_capture(cuda_ctx, cgraph, use_cuda_graph, cuda_graph_update_required, graph_key); + // Restore CUDA graph enabled state after profiling + if 
(was_graph_enabled) { +#ifdef USE_CUDA_GRAPH + const void * graph_key_prof = ggml_cuda_graph_get_key(cgraph); + ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key_prof); + graph->disable_due_to_gpu_arch = false; +#endif + } + return GGML_STATUS_SUCCESS; } @@ -5304,12 +5432,68 @@ ggml_backend_t ggml_backend_cuda_init(int device) { } ggml_backend_t cuda_backend = new ggml_backend { - /* .guid = */ ggml_backend_cuda_guid(), - /* .iface = */ ggml_backend_cuda_interface, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device), - /* .context = */ ctx, + /* .guid = */ ggml_backend_cuda_guid(), + /* .iface = */ ggml_backend_cuda_interface, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device), + /* .context = */ ctx, + /* .profiler = */ nullptr, }; + // Register profiler + auto * prof_state = new ggml_cuda_profiler_state(); + prof_state->init(ctx->stream()); + ctx->profiler_state = prof_state; + + static auto cuda_prof_enable = [](void * ctx, bool enable) { + auto * cuda_ctx = (ggml_backend_cuda_context *)ctx; + if (cuda_ctx->profiler_state != nullptr) { + cuda_ctx->profiler_state->enabled = enable; + if (!enable) { + cuda_ctx->profiler_state->reset(); + } + } + }; + static auto cuda_prof_reset = [](void * ctx) { + auto * cuda_ctx = (ggml_backend_cuda_context *)ctx; + if (cuda_ctx->profiler_state != nullptr) { + cuda_ctx->profiler_state->reset(); + cuda_ctx->profiler_state->split_id = -1; + } + }; + static auto cuda_prof_set_split_id = [](void * ctx, int split_id) { + auto * cuda_ctx = (ggml_backend_cuda_context *)ctx; + if (cuda_ctx->profiler_state != nullptr) { + cuda_ctx->profiler_state->split_id = split_id; + } + }; + static auto cuda_prof_get_records = [](void * ctx, const ggml_profile_record ** out) -> int { + auto * cuda_ctx = (ggml_backend_cuda_context *)ctx; + if (cuda_ctx->profiler_state != nullptr) { + cuda_ctx->profiler_state->finalize(); + *out = cuda_ctx->profiler_state->records.data(); + return 
(int)cuda_ctx->profiler_state->records.size(); + } + *out = nullptr; + return 0; + }; + static auto cuda_prof_free = [](void * ctx) { + auto * cuda_ctx = (ggml_backend_cuda_context *)ctx; + if (cuda_ctx->profiler_state != nullptr) { + delete cuda_ctx->profiler_state; + cuda_ctx->profiler_state = nullptr; + } + }; + + auto * profiler = new ggml_backend_profiler { + /* .context = */ ctx, + /* .enable = */ cuda_prof_enable, + /* .reset = */ cuda_prof_reset, + /* .set_split_id = */ cuda_prof_set_split_id, + /* .get_records = */ cuda_prof_get_records, + /* .free_context = */ cuda_prof_free, + }; + ggml_backend_set_profiler(cuda_backend, profiler); + return cuda_backend; } diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h index 9d9ba1ee21..3a07b281d6 100644 --- a/ggml/src/ggml-cuda/vendors/hip.h +++ b/ggml/src/ggml-cuda/vendors/hip.h @@ -55,8 +55,10 @@ #define cudaError_t hipError_t #define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled #define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled +#define cudaEventCreate hipEventCreate #define cudaEventCreateWithFlags hipEventCreateWithFlags #define cudaEventDisableTiming hipEventDisableTiming +#define cudaEventElapsedTime hipEventElapsedTime #define cudaEventRecord hipEventRecord #define cudaEventSynchronize hipEventSynchronize #define cudaEvent_t hipEvent_t diff --git a/ggml/src/ggml-cuda/vendors/musa.h b/ggml/src/ggml-cuda/vendors/musa.h index 1abb8acfd4..d4ff57b3d6 100644 --- a/ggml/src/ggml-cuda/vendors/musa.h +++ b/ggml/src/ggml-cuda/vendors/musa.h @@ -44,8 +44,10 @@ #define cudaError_t musaError_t #define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled #define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled +#define cudaEventCreate musaEventCreate #define cudaEventCreateWithFlags musaEventCreateWithFlags #define cudaEventDisableTiming musaEventDisableTiming +#define cudaEventElapsedTime musaEventElapsedTime #define 
cudaEventRecord musaEventRecord #define cudaEventSynchronize musaEventSynchronize #define cudaEvent_t musaEvent_t diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index dd604db433..4dd352e7a0 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -3004,6 +3004,7 @@ static ggml_backend_t ggml_backend_hexagon_device_init(ggml_backend_dev_t dev, c /* .interface = */ hexagon_backend_i, /* .device = */ dev, /* .context = */ sess, + /* .profiler = */ nullptr, }; GGML_UNUSED(params); diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp index 9382ce53b3..853edd7c75 100644 --- a/ggml/src/ggml-metal/ggml-metal.cpp +++ b/ggml/src/ggml-metal/ggml-metal.cpp @@ -597,6 +597,7 @@ ggml_backend_t ggml_backend_metal_init(void) { /* .interface = */ ggml_backend_metal_i, /* .device = */ dev, /* .context = */ ctx, + /* .profiler = */ NULL, }; ggml_backend_metal_set_n_cb(backend, 1); @@ -691,6 +692,7 @@ static ggml_backend_t ggml_backend_metal_device_init_backend(ggml_backend_dev_t /* .interface = */ ggml_backend_metal_i, /* .device = */ dev, /* .context = */ ctx, + /* .profiler = */ NULL, }; ggml_backend_metal_set_n_cb(backend, 1); diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 0f6628c377..48dd0c9e4a 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -4082,7 +4082,8 @@ ggml_backend_t ggml_backend_opencl_init(void) { /* .guid = */ ggml_backend_opencl_guid(), /* .iface = */ ggml_backend_opencl_i, /* .device = */ dev, - /* .context = */ backend_ctx + /* .context = */ backend_ctx, + /* .profiler = */ nullptr, }; return backend; @@ -5897,6 +5898,7 @@ static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, co /* .interface = */ ggml_backend_opencl_i, /* .device = */ dev, /* .context = */ backend_ctx, + /* .profiler = */ nullptr, }; return backend; diff --git 
a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index b3058b4af7..7c36b3c5e1 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -673,6 +673,7 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device) { /* .interface = */ ggml_backend_openvino_interface, /* .device = */ ggml_backend_reg_dev_get(ggml_backend_openvino_reg(), device), /* .context = */ ctx, + /* .profiler = */ nullptr, }; return openvino_backend; diff --git a/ggml/src/ggml-profiler.cpp b/ggml/src/ggml-profiler.cpp new file mode 100644 index 0000000000..3dc60595ff --- /dev/null +++ b/ggml/src/ggml-profiler.cpp @@ -0,0 +1,67 @@ +#include "ggml-profiler.h" + +#include "ggml-backend-impl.h" +#include "ggml-impl.h" + +#include +#include + +#ifdef _WIN32 +# define WIN32_LEAN_AND_MEAN +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include +#else +# include +# include +#endif + +// +// Time utilities +// + +uint64_t ggml_profiler_time_ns(void) { +#ifdef _WIN32 + LARGE_INTEGER freq, count; + QueryPerformanceFrequency(&freq); + QueryPerformanceCounter(&count); + return (uint64_t) (count.QuadPart * 1000000000ULL / freq.QuadPart); +#elif defined(CLOCK_MONOTONIC_RAW) + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC_RAW, &ts); + return (uint64_t) ts.tv_sec * 1000000000ULL + (uint64_t) ts.tv_nsec; +#else + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (uint64_t) ts.tv_sec * 1000000000ULL + (uint64_t) ts.tv_nsec; +#endif +} + +// +// Backend profiler registration +// + +void ggml_backend_set_profiler(ggml_backend_t backend, ggml_backend_profiler_t profiler) { + if (backend == NULL) { + return; + } + + // Free any existing profiler + if (backend->profiler != NULL) { + if (backend->profiler->free_context != NULL) { + backend->profiler->free_context(backend->profiler->context); + } + delete backend->profiler; + backend->profiler = NULL; + } + + backend->profiler = profiler; +} + 
+ggml_backend_profiler_t ggml_backend_get_profiler(ggml_backend_t backend) { + if (backend == NULL) { + return NULL; + } + return backend->profiler; +} diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp index 1378ba9f5b..24dca4503d 100644 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp @@ -952,7 +952,8 @@ ggml_backend_t ggml_backend_rpc_init(const char * endpoint, uint32_t device) { /* .guid = */ ggml_backend_rpc_guid(), /* .iface = */ ggml_backend_rpc_interface, /* .device = */ ggml_backend_reg_dev_get(reg, device), - /* .context = */ ctx + /* .context = */ ctx, + /* .profiler = */ nullptr, }; return backend; } diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 456b1699fa..22060441be 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -5101,7 +5101,8 @@ ggml_backend_t ggml_backend_sycl_init(int device) { /* .guid = */ ggml_backend_sycl_guid(), /* .iface = */ ggml_backend_sycl_interface, /* .device = */ ggml_backend_reg_dev_get(ggml_backend_sycl_reg(), device), - /* .context = */ ctx + /* .context = */ ctx, + /* .profiler = */ nullptr, }; return sycl_backend; diff --git a/ggml/src/ggml-virtgpu/ggml-backend.cpp b/ggml/src/ggml-virtgpu/ggml-backend.cpp index a63ee2b9d2..912aa2ff4b 100644 --- a/ggml/src/ggml-virtgpu/ggml-backend.cpp +++ b/ggml/src/ggml-virtgpu/ggml-backend.cpp @@ -63,6 +63,7 @@ ggml_backend_t ggml_backend_remoting_device_init(ggml_backend_dev_t dev, const c /* .interface = */ ggml_backend_remoting_interface, /* .device = */ ggml_backend_reg_dev_get(ggml_backend_virtgpu_reg(), ctx->device), /* .context = */ ctx, + /* .profiler = */ nullptr, }; return remoting_backend; diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 15ed5b2a79..499e384b9d 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -1,4 +1,5 @@ #include "ggml-vulkan.h" +#include 
"ggml-profiler.h" #include #if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_CHECK_RESULTS) #include @@ -1700,8 +1701,8 @@ private: std::mutex vk_memory_logger::log_mutex; -static bool vk_perf_logger_enabled = false; -static bool vk_perf_logger_concurrent = false; +static bool vk_perf_logger_enabled = false; // deprecated: use --profile instead +static bool vk_perf_logger_concurrent = false; // GGML_VK_PERF_LOGGER_CONCURRENT: use concurrent timestamp mode static bool vk_enable_sync_logger = false; // number of calls between perf logger prints static uint32_t vk_perf_logger_frequency = 1; @@ -1873,6 +1874,21 @@ class vk_perf_logger { uint32_t print_count {}; }; +// Profiler state for the new ggml_backend_profiler interface +struct ggml_vk_profiler_state { + bool enabled = false; + int split_id = -1; + + std::vector records; + std::vector cpu_timestamps; // CPU-side timestamps for global ordering + + void reset() { + records.clear(); + cpu_timestamps.clear(); + split_id = -1; + } +}; + struct ggml_backend_vk_context { std::string name; @@ -1930,8 +1946,9 @@ struct ggml_backend_vk_context { topk_moe_mode fused_topk_moe_mode {}; bool fused_topk_moe_scale {}; - // for GGML_VK_PERF_LOGGER - std::unique_ptr perf_logger; + // Profiling + std::unique_ptr perf_logger; // legacy env-var profiler + ggml_vk_profiler_state * profiler_state = nullptr; vk::QueryPool query_pool; std::vector query_fusion_names; std::vector query_fusion_node_count; @@ -12859,9 +12876,13 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr ctx->unsynced_nodes_read.clear(); ggml_vk_sync_buffers(ctx, compute_ctx); - if (vk_perf_logger_enabled && vk_perf_logger_concurrent) { + if ((vk_perf_logger_enabled || (ctx->profiler_state != nullptr && ctx->profiler_state->enabled)) + && vk_perf_logger_concurrent) { ctx->query_node_idx[ctx->query_idx] = node_idx; compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, 
ctx->query_idx++); + if (ctx->profiler_state != nullptr && ctx->profiler_state->enabled) { + ctx->profiler_state->cpu_timestamps.push_back(ggml_profiler_time_ns()); + } } } // Add all fused nodes to the unsynchronized lists. @@ -13384,7 +13405,7 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) { ctx->transfer_cmd_pool.destroy(ctx->device->device); } - if (vk_perf_logger_enabled) { + if (ctx->perf_logger) { ctx->perf_logger->print_timings(true); } } @@ -14323,7 +14344,9 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg ggml_vk_submit_transfer_ctx(ctx); vk_context compute_ctx; - if (vk_perf_logger_enabled) { + bool profiling = vk_perf_logger_enabled || + (ctx->profiler_state != nullptr && ctx->profiler_state->enabled); + if (profiling) { // allocate/resize the query pool if (ctx->num_queries < cgraph->n_nodes + 1) { if (ctx->query_pool) { @@ -14350,6 +14373,10 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg compute_ctx = ggml_vk_get_compute_ctx(ctx); ctx->query_idx = 0; compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++); + if (ctx->profiler_state != nullptr && ctx->profiler_state->enabled) { + ctx->profiler_state->cpu_timestamps.clear(); + ctx->profiler_state->cpu_timestamps.push_back(ggml_profiler_time_ns()); + } } ctx->prealloc_y_last_pipeline_used = nullptr; @@ -14579,13 +14606,16 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg bool enqueued = ggml_vk_build_graph(ctx, cgraph, i, cgraph->nodes[submit_node_idx], submit_node_idx, i + ctx->num_additional_fused_ops >= last_node, almost_ready, submit); - if (vk_perf_logger_enabled && enqueued) { + if (profiling && enqueued) { compute_ctx = ggml_vk_get_compute_ctx(ctx); if (!vk_perf_logger_concurrent) { // track a single node/fusion for the current query ctx->query_nodes[ctx->query_idx] = cgraph->nodes[i]; 
ctx->query_fusion_names[ctx->query_idx] = fusion_string; compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++); + if (ctx->profiler_state != nullptr && ctx->profiler_state->enabled) { + ctx->profiler_state->cpu_timestamps.push_back(ggml_profiler_time_ns()); + } } else { // track a fusion string and number of fused ops for the current node_idx ctx->query_fusion_names[i] = fusion_string; @@ -14619,7 +14649,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg ctx->last_total_mul_mat_bytes = total_mul_mat_bytes; - if (vk_perf_logger_enabled) { + if (profiling) { // End the command buffer and submit/wait GGML_ASSERT(!ctx->compute_ctx.expired()); compute_ctx = ctx->compute_ctx.lock(); @@ -14633,15 +14663,46 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg // Get the results and pass them to the logger std::vector timestamps(cgraph->n_nodes + 1); VK_CHECK(ctx->device->device.getQueryPoolResults(ctx->query_pool, 0, ctx->query_idx, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), "get timestamp results"); + + const double ts_period = ctx->device->properties.limits.timestampPeriod; + const bool has_profiler = ctx->profiler_state != nullptr && ctx->profiler_state->enabled; + if (!vk_perf_logger_concurrent) { // Log each op separately for (int i = 1; i < ctx->query_idx; i++) { auto node = ctx->query_nodes[i]; auto name = ctx->query_fusion_names[i]; - ctx->perf_logger->log_timing(node, name, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod)); + uint64_t duration_ns = uint64_t((timestamps[i] - timestamps[i-1]) * ts_period); + + if (ctx->perf_logger) { + ctx->perf_logger->log_timing(node, name, duration_ns); + } + + if (has_profiler && node != nullptr) { + static const int64_t zero_ne[4] = {0, 0, 0, 0}; + const 
int64_t * src0_ne = node->src[0] ? node->src[0]->ne : zero_ne; + const int64_t * src1_ne = node->src[1] ? node->src[1]->ne : zero_ne; + const int64_t * src2_ne = (node->op == GGML_OP_MUL_MAT_ID && node->src[2]) ? node->src[2]->ne : zero_ne; + uint64_t cpu_ts = (i < (int)ctx->profiler_state->cpu_timestamps.size()) + ? ctx->profiler_state->cpu_timestamps[i] : 0; + + ggml_profile_record rec; + rec.type = GGML_PROFILE_EVENT_OP; + rec.name = ggml_op_name(node->op); + rec.backend_id = -1; + rec.split_id = ctx->profiler_state->split_id; + rec.start_ns = cpu_ts; + rec.end_ns = cpu_ts + duration_ns; + rec.bytes = ggml_nbytes(node); + rec.extra = name; // fusion name or NULL + memcpy(rec.ne_src0, src0_ne, sizeof(rec.ne_src0)); + memcpy(rec.ne_src1, src1_ne, sizeof(rec.ne_src1)); + memcpy(rec.ne_src2, src2_ne, sizeof(rec.ne_src2)); + ctx->profiler_state->records.push_back(rec); + } } } else { - // Log each group of nodes + // Log each group of nodes (concurrent mode) int prev_node_idx = 0; for (int i = 1; i < ctx->query_idx; i++) { auto cur_node_idx = ctx->query_node_idx[i]; @@ -14656,10 +14717,46 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg node_idx += ctx->query_fusion_node_count[node_idx]; } prev_node_idx = cur_node_idx; - ctx->perf_logger->log_timing(nodes, names, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod)); + uint64_t duration_ns = uint64_t((timestamps[i] - timestamps[i-1]) * ts_period); + + if (ctx->perf_logger) { + ctx->perf_logger->log_timing(nodes, names, duration_ns); + } + + if (has_profiler && !nodes.empty()) { + uint64_t cpu_ts = (i < (int)ctx->profiler_state->cpu_timestamps.size()) + ? ctx->profiler_state->cpu_timestamps[i] : 0; + // In concurrent mode, report the group as a single combined operation + auto * node = nodes[0]; + static const int64_t zero_ne[4] = {0, 0, 0, 0}; + const int64_t * src0_ne = node->src[0] ? 
node->src[0]->ne : zero_ne; + const int64_t * src1_ne = node->src[1] ? node->src[1]->ne : zero_ne; + const int64_t * src2_ne = (node->op == GGML_OP_MUL_MAT_ID && node->src[2]) ? node->src[2]->ne : zero_ne; + + uint64_t total_bytes = 0; + for (size_t j = 0; j < nodes.size(); j++) { + total_bytes += ggml_nbytes(nodes[j]); + } + + ggml_profile_record rec; + rec.type = GGML_PROFILE_EVENT_OP; + rec.name = ggml_op_name(node->op); + rec.backend_id = -1; + rec.split_id = ctx->profiler_state->split_id; + rec.start_ns = cpu_ts; + rec.end_ns = cpu_ts + duration_ns; + rec.bytes = total_bytes; + rec.extra = names[0]; // fusion name of first op, or NULL + memcpy(rec.ne_src0, src0_ne, sizeof(rec.ne_src0)); + memcpy(rec.ne_src1, src1_ne, sizeof(rec.ne_src1)); + memcpy(rec.ne_src2, src2_ne, sizeof(rec.ne_src2)); + ctx->profiler_state->records.push_back(rec); + } } } - ctx->perf_logger->print_timings(); + if (ctx->perf_logger) { + ctx->perf_logger->print_timings(); + } } if (!ctx->device->support_async) { @@ -14994,13 +15091,66 @@ ggml_backend_t ggml_backend_vk_init(size_t dev_num) { /* .guid = */ ggml_backend_vk_guid(), /* .iface = */ ggml_backend_vk_interface, /* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), dev_num), - /* .context = */ ctx, + /* .context = */ ctx, + /* .profiler = */ nullptr, }; if (!ctx->device->support_async) { vk_backend->iface.get_tensor_async = nullptr; } + // Register profiler + auto * prof_state = new ggml_vk_profiler_state(); + ctx->profiler_state = prof_state; + + static auto vk_prof_enable = [](void * context, bool enable) { + auto * vk_ctx = (ggml_backend_vk_context *)context; + if (vk_ctx->profiler_state != nullptr) { + vk_ctx->profiler_state->enabled = enable; + if (!enable) { + vk_ctx->profiler_state->reset(); + } + } + }; + static auto vk_prof_reset = [](void * context) { + auto * vk_ctx = (ggml_backend_vk_context *)context; + if (vk_ctx->profiler_state != nullptr) { + vk_ctx->profiler_state->reset(); + } + }; + static auto 
vk_prof_set_split_id = [](void * context, int split_id) { + auto * vk_ctx = (ggml_backend_vk_context *)context; + if (vk_ctx->profiler_state != nullptr) { + vk_ctx->profiler_state->split_id = split_id; + } + }; + static auto vk_prof_get_records = [](void * context, const ggml_profile_record ** out) -> int { + auto * vk_ctx = (ggml_backend_vk_context *)context; + if (vk_ctx->profiler_state != nullptr) { + *out = vk_ctx->profiler_state->records.data(); + return (int)vk_ctx->profiler_state->records.size(); + } + *out = nullptr; + return 0; + }; + static auto vk_prof_free = [](void * context) { + auto * vk_ctx = (ggml_backend_vk_context *)context; + if (vk_ctx->profiler_state != nullptr) { + delete vk_ctx->profiler_state; + vk_ctx->profiler_state = nullptr; + } + }; + + auto * profiler = new ggml_backend_profiler { + /* .context = */ ctx, + /* .enable = */ vk_prof_enable, + /* .reset = */ vk_prof_reset, + /* .set_split_id = */ vk_prof_set_split_id, + /* .get_records = */ vk_prof_get_records, + /* .free_context = */ vk_prof_free, + }; + ggml_backend_set_profiler(vk_backend, profiler); + return vk_backend; } diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 1aa15b0507..15c33b65c5 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -3102,6 +3102,7 @@ static ggml_backend_t ggml_backend_webgpu_backend_init(ggml_backend_dev_t dev, c /* .interface = */ ggml_backend_webgpu_i, /* .device = */ dev, /* .context = */ backend_ctx, + /* .profiler = */ nullptr, }; return backend; } diff --git a/ggml/src/ggml-zdnn/ggml-zdnn.cpp b/ggml/src/ggml-zdnn/ggml-zdnn.cpp index 9b6938abf7..37050a9a09 100644 --- a/ggml/src/ggml-zdnn/ggml-zdnn.cpp +++ b/ggml/src/ggml-zdnn/ggml-zdnn.cpp @@ -499,7 +499,8 @@ static ggml_backend_t ggml_backend_zdnn_device_init(ggml_backend_dev_t dev, cons /* .guid = */ ggml_backend_zdnn_guid(), /* .iface = */ ggml_backend_zdnn_i, /* .device = */ dev, - /* .context = */ ctx + /* 
.context = */ ctx, + /* .profiler = */ NULL, }; return backend; diff --git a/ggml/src/ggml-zendnn/ggml-zendnn.cpp b/ggml/src/ggml-zendnn/ggml-zendnn.cpp index c876030400..a78ada8f2f 100644 --- a/ggml/src/ggml-zendnn/ggml-zendnn.cpp +++ b/ggml/src/ggml-zendnn/ggml-zendnn.cpp @@ -264,7 +264,8 @@ ggml_backend_t ggml_backend_zendnn_init(void) { /* .guid = */ ggml_backend_zendnn_guid(), /* .iface = */ ggml_backend_zendnn_i, /* .device = */ ggml_backend_reg_dev_get(ggml_backend_zendnn_reg(), 0), - /* .context = */ ctx, + /* .context = */ ctx, + /* .profiler = */ nullptr, }; return backend; diff --git a/include/llama.h b/include/llama.h index a940f9d648..4186f236d6 100644 --- a/include/llama.h +++ b/include/llama.h @@ -561,6 +561,8 @@ extern "C" { LLAMA_API llama_memory_t llama_get_memory (const struct llama_context * ctx); LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type + LLAMA_API struct ggml_backend_sched * llama_context_get_sched(const struct llama_context * ctx); + LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model); LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model); diff --git a/src/llama-context.cpp b/src/llama-context.cpp index a808e3e454..5e3030c8f6 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1,5 +1,6 @@ #include "llama-context.h" +#include "ggml-profiler.h" #include "llama-arch.h" #include "llama-impl.h" #include "llama-batch.h" @@ -2189,6 +2190,11 @@ ggml_status llama_context::graph_compute( LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status); } + // If profiling is enabled, synchronize to ensure records are complete + if (ggml_backend_sched_get_profiling(sched.get())) { + ggml_backend_sched_synchronize(sched.get()); + } + // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(sched)); return status; @@ -3025,6 +3031,10 
@@ enum llama_pooling_type llama_pooling_type(const llama_context * ctx) { return ctx->pooling_type(); } +ggml_backend_sched_t llama_context_get_sched(const llama_context * ctx) { + return ctx->get_sched(); +} + void llama_attach_threadpool( llama_context * ctx, ggml_threadpool_t threadpool, diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index b57d27762c..224e9c2514 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -644,6 +644,29 @@ int main(int argc, char ** argv) { ctx_cli.ctx_server.terminate(); inference_thread.join(); + // Export profiling data if profiling was enabled + if (params.profiling) { + ggml_backend_sched_t sched = llama_context_get_sched(ctx_cli.ctx_server.get_llama_context()); + if (sched != nullptr) { + if (params.profiling_output.empty()) { + ggml_backend_sched_print_profiling(sched); + } else { + const std::string & path = params.profiling_output; + int ret; + if (path.size() >= 4 && path.compare(path.size() - 4, 4, ".txt") == 0) { + ret = ggml_backend_sched_export_profiling_text(sched, path.c_str()); + } else { + ret = ggml_backend_sched_export_profiling_json(sched, path.c_str()); + } + if (ret == 0) { + console::log("\nProfiling data exported to: %s\n", path.c_str()); + } else { + console::error("\nFailed to export profiling data to: %s\n", path.c_str()); + } + } + } + } + // bump the log level to display timings common_log_set_verbosity_thold(LOG_LEVEL_INFO); llama_memory_breakdown_print(ctx_cli.ctx_server.get_llama_context()); diff --git a/tools/completion/completion.cpp b/tools/completion/completion.cpp index 1dc5df1afa..caba5255cf 100644 --- a/tools/completion/completion.cpp +++ b/tools/completion/completion.cpp @@ -991,6 +991,29 @@ int main(int argc, char ** argv) { llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); } + // Export profiling data if profiling was enabled + if (params.profiling) { + ggml_backend_sched_t sched = llama_context_get_sched(ctx); + if (sched != nullptr) { + 
if (params.profiling_output.empty()) { + ggml_backend_sched_print_profiling(sched); + } else { + const std::string & path = params.profiling_output; + int ret; + if (path.size() >= 4 && path.compare(path.size() - 4, 4, ".txt") == 0) { + ret = ggml_backend_sched_export_profiling_text(sched, path.c_str()); + } else { + ret = ggml_backend_sched_export_profiling_json(sched, path.c_str()); + } + if (ret == 0) { + LOG("\nProfiling data exported to: %s\n", path.c_str()); + } else { + LOG_ERR("\nFailed to export profiling data to: %s\n", path.c_str()); + } + } + } + } + LOG("\n\n"); common_perf_print(ctx, smpl); diff --git a/tools/profiler/__init__.py b/tools/profiler/__init__.py new file mode 100644 index 0000000000..cdd8dd543e --- /dev/null +++ b/tools/profiler/__init__.py @@ -0,0 +1 @@ +# llama.cpp profiler analysis tools diff --git a/tools/profiler/profiler.py b/tools/profiler/profiler.py new file mode 100644 index 0000000000..a739a830f4 --- /dev/null +++ b/tools/profiler/profiler.py @@ -0,0 +1,1003 @@ +#!/usr/bin/env python3 +"""llama.cpp cross-backend profiler analysis tool. 
+ +Usage: + python -m tools.profiler.profiler profile.json + python -m tools.profiler.profiler profile.json --chrome-trace trace.json +""" + +from __future__ import annotations + +import json +import sys +from dataclasses import dataclass, field +from pathlib import Path +from typing import Optional + + +OP_EVENT = 0 +COPY_EVENT = 1 + +TYPE_NAMES = {0: "OP", 1: "COPY"} + + +@dataclass +class ProfileRecord: + type: int + name: str + backend_id: int + split_id: int + start_ns: int + duration_ns: int + bytes: int + extra: Optional[str] + ne_src0: list[int] = field(default_factory=lambda: [0, 0, 0, 0]) + ne_src1: list[int] = field(default_factory=lambda: [0, 0, 0, 0]) + ne_src2: list[int] = field(default_factory=lambda: [0, 0, 0, 0]) + + @property + def type_name(self) -> str: + return TYPE_NAMES.get(self.type, f"UNKNOWN({self.type})") + + @property + def duration_us(self) -> float: + return self.duration_ns / 1000.0 + + @property + def duration_ms(self) -> float: + return self.duration_ns / 1_000_000.0 + + @property + def bandwidth_gbps(self) -> float: + """Bandwidth in GB/s.""" + if self.duration_ns == 0 or self.bytes == 0: + return 0.0 + return self.bytes / self.duration_ns + + @staticmethod + def _fmt_ne(ne: list[int]) -> str: + dims = [n for n in ne if n > 0] + if not dims: + return "" + return "[" + ", ".join(str(d) for d in dims) + "]" + + @property + def shape_str(self) -> str: + """Human-readable tensor shapes, e.g. 
'[4096, 4096] x [4096, 1] x [8, 1]'.""" + s0 = self._fmt_ne(self.ne_src0) + s1 = self._fmt_ne(self.ne_src1) + s2 = self._fmt_ne(self.ne_src2) + parts = [s for s in (s0, s1, s2) if s] + return " x ".join(parts) + + def to_dict(self) -> dict: + return { + "type": self.type, + "name": self.name, + "backend_id": self.backend_id, + "split_id": self.split_id, + "start_ns": self.start_ns, + "duration_ns": self.duration_ns, + "bytes": self.bytes, + "extra": self.extra, + "ne_src0": self.ne_src0, + "ne_src1": self.ne_src1, + "ne_src2": self.ne_src2, + } + + +@dataclass +class OpStats: + name: str + event_type: int + backend_id: int + count: int = 0 + total_ns: int = 0 + min_ns: int = 0 + max_ns: int = 0 + total_bytes: int = 0 + representative_ne: list[int] = field(default_factory=lambda: [0, 0, 0, 0]) + + @property + def avg_ns(self) -> float: + return self.total_ns / self.count if self.count > 0 else 0 + + @property + def avg_us(self) -> float: + return self.avg_ns / 1000.0 + + @property + def total_ms(self) -> float: + return self.total_ns / 1_000_000.0 + + @property + def min_us(self) -> float: + return self.min_ns / 1000.0 + + @property + def max_us(self) -> float: + return self.max_ns / 1000.0 + + @property + def bandwidth_gbps(self) -> float: + if self.total_ns == 0 or self.total_bytes == 0: + return 0.0 + return self.total_bytes / self.total_ns + + @property + def time_per_byte_ns(self) -> float: + """Time per byte (lower = more efficient).""" + if self.total_bytes == 0: + return float("inf") + return self.total_ns / self.total_bytes + + @property + def type_name(self) -> str: + return TYPE_NAMES.get(self.event_type, f"UNKNOWN({self.event_type})") + + +class ProfileData: + def __init__(self, records: list[ProfileRecord], metadata: dict): + self.records = records + self.metadata = metadata + + @classmethod + def load(cls, filepath: str | Path) -> ProfileData: + """Load a profiler JSON file.""" + with open(filepath, "r") as f: + data = json.load(f) + + if 
data.get("profiler") != "ggml": + print(f"Warning: file may not be a ggml profiler output (profiler={data.get('profiler')})") + + records = [] + def _pad_ne(v): + if isinstance(v, list) and len(v) < 4: + return v + [0] * (4 - len(v)) + if not isinstance(v, list): + return [0, 0, 0, 0] + return v + + for r in data.get("records", []): + # Support both old "ne" format and new "ne_src0"/"ne_src1" format + ne_src0 = _pad_ne(r.get("ne_src0", r.get("ne", [0, 0, 0, 0]))) + ne_src1 = _pad_ne(r.get("ne_src1", [0, 0, 0, 0])) + ne_src2 = _pad_ne(r.get("ne_src2", [0, 0, 0, 0])) + records.append(ProfileRecord( + type=r.get("type", 0), + name=r.get("name", "unknown"), + backend_id=r.get("backend_id", 0), + split_id=r.get("split_id", 0), + start_ns=r.get("start_ns", 0), + duration_ns=r.get("duration_ns", 0), + bytes=r.get("bytes", 0), + extra=r.get("extra"), + ne_src0=ne_src0, + ne_src1=ne_src1, + ne_src2=ne_src2, + )) + + backends_raw = data.get("backends", []) + backends = [] + for b in backends_raw: + backends.append({ + "id": b.get("id", 0), + "name": b.get("name", "unknown"), + "device": b.get("device", "unknown"), + "device_type": b.get("device_type", 0), + }) + + metadata = { + "version": data.get("version", 0), + "total_records": data.get("total_records", len(records)), + "total_ns": data.get("total_ns", sum(r.duration_ns for r in records)), + "backends": backends, + } + + return cls(records, metadata) + + @property + def total_ns(self) -> int: + return sum(r.duration_ns for r in self.records) + + @property + def total_ms(self) -> float: + return self.total_ns / 1_000_000.0 + + def stats(self) -> list[OpStats]: + """Aggregate stats grouped by (name, type, backend_id).""" + groups: dict[tuple, OpStats] = {} + for rec in self.records: + key = (rec.name, rec.type, rec.backend_id) + if key not in groups: + groups[key] = OpStats( + name=rec.name, + event_type=rec.type, + backend_id=rec.backend_id, + min_ns=rec.duration_ns, + max_ns=rec.duration_ns, + 
representative_ne=list(rec.ne_src0), + ) + s = groups[key] + s.count += 1 + s.total_ns += rec.duration_ns + s.min_ns = min(s.min_ns, rec.duration_ns) + s.max_ns = max(s.max_ns, rec.duration_ns) + s.total_bytes += rec.bytes + + # Track the ne from the longest individual call + if rec.duration_ns >= s.max_ns: + s.representative_ne = list(rec.ne_src0) + + return sorted(groups.values(), key=lambda s: s.total_ns, reverse=True) + + def top_operations(self, n: int = 10) -> list[OpStats]: + """Return the N most time-consuming operations (aggregated).""" + return self.stats()[:n] + + def top_kernels(self, n: int = 10) -> list[ProfileRecord]: + """Return the N longest individual kernel executions.""" + return sorted(self.records, key=lambda r: r.duration_ns, reverse=True)[:n] + + def by_backend(self) -> dict[int, list[ProfileRecord]]: + """Group records by backend ID.""" + groups: dict[int, list[ProfileRecord]] = {} + for rec in self.records: + groups.setdefault(rec.backend_id, []).append(rec) + return dict(sorted(groups.items())) + + def timeline(self) -> list[ProfileRecord]: + """Return records sorted by start_ns for timeline visualization.""" + return sorted(self.records, key=lambda r: r.start_ns) + + def inefficiency_ranking(self, n: int = 10) -> list[OpStats]: + """Rank operations by time per byte (inefficiency). 
Lower is better.""" + all_stats = [s for s in self.stats() if s.total_bytes > 0 and s.event_type == OP_EVENT] + return sorted(all_stats, key=lambda s: s.time_per_byte_ns, reverse=True)[:n] + + def summary(self) -> None: + """Print a formatted summary table to stdout.""" + print(f"\n{'='*80}") + print(f" ggml Profiler Summary") + print(f"{'='*80}") + print(f" Total records: {len(self.records)}") + print(f" Total time: {self.total_ms:.2f} ms") + print(f" Unique ops: {len(set((r.name, r.type, r.backend_id) for r in self.records))}") + print(f"{'='*80}\n") + + stats = self.stats() + if not stats: + print(" No profiling data.\n") + return + + print(f" {'TYPE':<5} {'BKND':>4} {'Operation':<28} {'%Time':>7} {'Count':>6} " + f"{'Total':>10} {'Avg':>10} {'Min':>10} {'Max':>10} {'Bandwidth':>12}") + print(f" {'':->5} {'':->4} {'':->28} {'':->7} {'':->6} " + f"{'(ms)':>10} {'(us)':>10} {'(us)':>10} {'(us)':>10} {'':->12}") + + for s in stats: + pct = 100.0 * s.total_ns / self.total_ns if self.total_ns > 0 else 0 + + line = (f" {s.type_name:<5} {s.backend_id:>4} {s.name:<28} {pct:>6.1f}% " + f"{s.count:>6} {s.total_ms:>10.2f} {s.avg_us:>10.2f} " + f"{s.min_us:>10.2f} {s.max_us:>10.2f}") + + if s.total_bytes > 0 and s.total_ns > 0: + bw = s.bandwidth_gbps + if bw >= 1000.0: + line += f" {bw / 1000.0:>9.2f} TB/s" + else: + line += f" {bw:>9.2f} GB/s" + else: + line += f" {'':>12}" + + # Tensor shape from longest call + shape_dims = [n for n in s.representative_ne if n > 0] + if shape_dims: + line += f" [{', '.join(str(d) for d in shape_dims)}]" + + print(line) + + backend_groups = self.by_backend() + if len(backend_groups) > 1: + print(f"\n --- By Backend ---") + for bid, recs in sorted(backend_groups.items()): + bk_total = sum(r.duration_ns for r in recs) + bk_pct = 100.0 * bk_total / self.total_ns if self.total_ns > 0 else 0 + print(f" Backend {bid}: {bk_total / 1e6:.2f} ms ({bk_pct:.1f}%) — {len(recs)} records") + + inef = self.inefficiency_ranking(5) + if inef: + print(f"\n 
--- Top 5 Inefficient Operations (time/byte) ---") + for s in inef: + print(f" {s.name:<28} {s.time_per_byte_ns / 1000:.2f} us/byte " + f"({s.count} calls, {s.total_bytes / 1e6:.1f} MB)") + + top_k = self.top_kernels(5) + print(f"\n --- Top 5 Longest Kernels ---") + for rec in top_k: + shape = f" {rec.shape_str}" if rec.shape_str else "" + print(f" {rec.type_name:<5} {rec.name:<28} {rec.duration_us:>10.2f} us{shape} " + f"(split={rec.split_id}, backend={rec.backend_id})") + + print() + + def export_chrome_trace(self, filepath: str | Path) -> None: + """Export as Chrome Trace Event format for chrome://tracing.""" + events = [] + + # Build backend name mapping and remap to non-negative PIDs + # (Chrome cannot handle negative PIDs) + backend_ids = sorted(set(rec.backend_id for rec in self.records)) + backend_names: dict[int, str] = {} + pid_map: dict[int, int] = {} + + # Use metadata from JSON if available + metadata_backends = self.metadata.get("backends", []) + backend_by_id: dict[int, dict] = {b["id"]: b for b in metadata_backends} + + device_type_names = {0: "CPU", 1: "GPU", 2: "ACCEL"} + for idx, bid in enumerate(backend_ids): + pid_map[bid] = idx + if bid in backend_by_id: + binfo = backend_by_id[bid] + dev_type = binfo.get("device_type", 0) + dev_name = binfo.get("device", "") + type_name = device_type_names.get(dev_type, "Device") + if dev_name and dev_name != "unknown": + backend_names[bid] = f"{type_name}: {dev_name}" + else: + backend_names[bid] = f"{type_name}: {binfo.get('name', f'Backend {bid}')}" + else: + backend_names[bid] = f"Backend {bid}" + + # Process metadata events + for bid in backend_ids: + pid = pid_map[bid] + events.append({ + "ph": "M", # metadata + "pid": pid, + "name": "process_name", + "args": {"name": backend_names[bid]}, + }) + + # Use real timestamps, but prevent overlaps within each track. + # GPU kernels are launched rapidly (small start_ns gaps) but have long + # durations, so naive real timestamps overlap. 
Sweep-line per track: + # sort by start_ns, then place each event at max(start, prev_end). + from collections import defaultdict + tracks: dict[tuple, list[ProfileRecord]] = defaultdict(list) + for rec in self.records: + tracks[(rec.backend_id, rec.split_id)].append(rec) + + for key in tracks: + tracks[key].sort(key=lambda r: r.start_ns) + + for key, recs in tracks.items(): + pid = pid_map[key[0]] + tid = f"split_{key[1]}" + cursor = 0.0 + for rec in recs: + ts = max(rec.start_ns / 1000.0, cursor) + dur = rec.duration_ns / 1000.0 + cat = "copy" if rec.type == COPY_EVENT else "compute" + events.append({ + "ph": "X", # complete event + "pid": pid, + "tid": tid, + "name": rec.name, + "ts": ts, + "dur": dur, + "cat": cat, + "args": { + "bytes": rec.bytes, + "duration_us": dur, + "shape": rec.shape_str, + }, + }) + cursor = ts + dur + + trace = {"traceEvents": events} + with open(filepath, "w") as f: + json.dump(trace, f, indent=2) + + print(f"Chrome trace exported to: {filepath}") + print(f"Open chrome://tracing in Chrome/Edge and load this file.") + + def export_html_viewer(self, filepath: str | Path, max_records: int = 0) -> None: + """Export a self-contained interactive HTML timeline viewer using Canvas.""" + import json as json_mod + + metadata_backends = self.metadata.get("backends", []) + backend_by_id: dict[int, dict] = {b["id"]: b for b in metadata_backends} + + backend_names: dict[int, str] = {} + for bid in sorted(set(rec.backend_id for rec in self.records)): + binfo = backend_by_id.get(bid, {}) + name = binfo.get("name", f"Backend {bid}") + device = binfo.get("device", "") + backend_names[bid] = device if device and device != "unknown" else name + + events: list[dict] = [] + cum_us = 0.0 + for rec in self.records: + dur_us = rec.duration_ns / 1000.0 + events.append({ + "n": rec.name, + "d": dur_us, + "s": rec.shape_str, + "b": rec.bytes, + "t": rec.type, + "bid": rec.backend_id, + "start": cum_us, + }) + cum_us += dur_us + total_us = cum_us + + if 
max_records > 0 and len(events) > max_records: + stride = len(events) // max_records + events = events[::stride][:max_records] + + if total_us == 0: + print("No profiling data to export.") + return + + header_stats = str(len(events)) + ' events | ' + f'{total_us/1000:.1f}' + ' ms' + + # Build backend name map with string keys for JSON + bn_str = {str(k): v for k, v in backend_names.items()} + + # --- HTML --- + html = ( + '\n' + 'ggml Profiler\n\n' + '

ggml Profiler Timeline

' + '' + header_stats + '
\n' + '
' + '' + '' + '' + '' + '' + '' + '
\n' + '
\n' + '
\n' + '
\n' + '
\n' + '
\n' + '
\n' + '' + + with open(filepath, "w") as f: + f.write(html) + + print(f"HTML viewer exported to: {filepath}") + print(f"Open in browser: file://{Path(filepath).resolve()}") + + +def load(filepath: str | Path) -> ProfileData: + """Load a profiler JSON file.""" + return ProfileData.load(filepath) + + +def main() -> None: + import argparse + + parser = argparse.ArgumentParser( + description="llama.cpp profiler analysis tool", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + python -m tools.profiler.profiler profile.json + python -m tools.profiler.profiler profile.json --chrome-trace trace.json + python -m tools.profiler.profiler profile.json --top-ops 20 + """, + ) + parser.add_argument("profile", help="Path to profiler JSON file") + parser.add_argument("--chrome-trace", metavar="FILE", + help="Export as Chrome Trace Event format") + parser.add_argument("--html-viewer", metavar="FILE", + help="Export as interactive HTML timeline viewer") + parser.add_argument("--html-max-records", type=int, default=0, + help="Max records in HTML viewer (0=unlimited, set to downsample for huge traces)") + parser.add_argument("--top-ops", type=int, default=0, + help="Show top N operations (0 = show summary)") + parser.add_argument("--top-kernels", type=int, default=0, + help="Show top N longest kernels") + parser.add_argument("--inefficiency", action="store_true", + help="Show inefficiency ranking") + + args = parser.parse_args() + + data = load(args.profile) + + if args.chrome_trace: + data.export_chrome_trace(args.chrome_trace) + + if args.html_viewer: + data.export_html_viewer(args.html_viewer, max_records=args.html_max_records) + + if args.top_ops > 0: + print(f"\nTop {args.top_ops} operations by total time:\n") + for s in data.top_operations(args.top_ops): + pct = 100.0 * s.total_ns / data.total_ns if data.total_ns > 0 else 0 + print(f" {s.type_name:<5} {s.backend_id:>4} {s.name:<28} {pct:>6.1f}% " + f"{s.count:>6}x {s.total_ms:>10.2f} ms 
avg={s.avg_us:.2f} us") + print() + + if args.top_kernels > 0: + print(f"\nTop {args.top_kernels} longest kernels:\n") + for rec in data.top_kernels(args.top_kernels): + print(f" {rec.type_name:<5} {rec.backend_id:>4} {rec.name:<28} " + f"{rec.duration_us:>10.2f} us split={rec.split_id}") + print() + + if args.inefficiency: + print("\nInefficiency ranking (time/byte for operations with data):\n") + for s in data.inefficiency_ranking(10): + print(f" {s.name:<28} {s.time_per_byte_ns / 1000:>10.2f} us/byte " + f"{s.count:>6} calls {s.total_bytes / 1e6:.1f} MB") + print() + + if args.top_ops == 0 and args.top_kernels == 0 and not args.inefficiency and not args.chrome_trace and not args.html_viewer: + data.summary() + + +if __name__ == "__main__": + main()