From 64e438465f73704615c449fa8871bdbf01be1023 Mon Sep 17 00:00:00 2001
From: Piotr Wilkin <piotr.wilkin@syndatis.com>
Date: Sun, 29 Mar 2026 16:52:50 +0200
Subject: [PATCH] fix builds, integrate vulkan profiler, fix copy events, fix
 export

---
 ggml/src/ggml-backend.cpp            |  43 +++++--
 ggml/src/ggml-cuda/ggml-cuda.cu      |  27 +++--
 ggml/src/ggml-cuda/vendors/hip.h     |   2 +
 ggml/src/ggml-cuda/vendors/musa.h    |   2 +
 ggml/src/ggml-metal/ggml-metal.cpp   |   2 +
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 169 ++++++++++++++++++++++++---
 ggml/src/ggml-webgpu/ggml-webgpu.cpp |   1 +
 ggml/src/ggml-zdnn/ggml-zdnn.cpp     |   3 +-
 tools/profiler/profiler.py           |  44 ++++---
 9 files changed, 238 insertions(+), 55 deletions(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index b8b204fc60..577cdeb3b5 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1575,12 +1575,14 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                     }
 
                     // group consecutive experts and copy them together
+                    size_t total_copied_bytes = 0;
                     auto copy_experts = [&](int32_t first_id, int32_t last_id) {
                         const size_t expert_offset = first_id * expert_size;
                         const size_t expert_size_copy =  (last_id - first_id + 1) * expert_size;
                         const size_t padding = std::min<size_t>(expert_size, 512);
                         const size_t padding_end = last_id < n_expert - 1 ? padding : 0;
 
+                        total_copied_bytes += expert_size_copy + padding_end;
                         ggml_backend_tensor_set_async(split_backend,
                             input_cpy,
                             (const uint8_t *)input->data + expert_offset, expert_offset,
@@ -1589,6 +1591,11 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                             expert_size_copy + padding_end);
                     };
 
+                    uint64_t moe_copy_start = 0;
+                    if (sched->profiling_enabled) {
+                        moe_copy_start = ggml_profiler_time_ns();
+                    }
+
                     int id = 0;
                     while (!ggml_bitset_get(used_ids.data(), id)) {
                         id++;
@@ -1612,9 +1619,34 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                         last_id = id;
                     }
                     copy_experts(first_id, last_id);
+
+                    if (sched->profiling_enabled) {
+                        uint64_t moe_copy_end = ggml_profiler_time_ns();
+
+                        enum ggml_backend_dev_type src_type = ggml_backend_dev_type(input_backend->device);
+                        enum ggml_backend_dev_type dst_type = ggml_backend_dev_type(split_backend->device);
+                        const char *               copy_dir = "copy_D2D";
+                        if (src_type == GGML_BACKEND_DEVICE_TYPE_CPU && dst_type != GGML_BACKEND_DEVICE_TYPE_CPU) {
+                            copy_dir = "copy_H2D";
+                        } else if (src_type != GGML_BACKEND_DEVICE_TYPE_CPU &&
+                                   dst_type == GGML_BACKEND_DEVICE_TYPE_CPU) {
+                            copy_dir = "copy_D2H";
+                        }
+
+                        sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id,
+                                                        split_id, moe_copy_start, moe_copy_end,
+                                                        (uint64_t) total_copied_bytes, NULL, {0}, {0} });
+                    }
                 } else {
                     // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
                     // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
+
+                    // Capture timestamp before async attempt so we can record launch time
+                    uint64_t copy_start = 0;
+                    if (sched->profiling_enabled) {
+                        copy_start = ggml_profiler_time_ns();
+                    }
+
                     if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
                         ggml_backend_synchronize(input_backend);
                         if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
@@ -1623,7 +1655,8 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                             ggml_backend_synchronize(split_backend);
                         }
                         if (sched->profiling_enabled) {
-                            uint64_t copy_start = ggml_profiler_time_ns();
+                            // Re-take start after sync for accurate sync copy measurement
+                            copy_start = ggml_profiler_time_ns();
                             ggml_backend_tensor_copy(input, input_cpy);
                             uint64_t copy_end = ggml_profiler_time_ns();
 
@@ -1643,11 +1676,9 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                             ggml_backend_tensor_copy(input, input_cpy);
                         }
                     } else {
-                        // async copy completed - record it with available timing
+                        // async copy was launched — record the time spanning the async call
                         if (sched->profiling_enabled) {
-                            uint64_t copy_start = ggml_profiler_time_ns();
-                            // The async copy was already initiated; we just record the launch time
-                            uint64_t copy_end   = ggml_profiler_time_ns();
+                            uint64_t copy_end = ggml_profiler_time_ns();
 
                             enum ggml_backend_dev_type src_type = ggml_backend_dev_type(input_backend->device);
                             enum ggml_backend_dev_type dst_type = ggml_backend_dev_type(split_backend->device);
@@ -1716,8 +1747,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
 
     // Profiling: collect records from all backends and append to accumulated records
     if (sched->profiling_enabled) {
-        sched->copy_records.clear();
-
         // Collect backend operation records
         for (int b = 0; b < sched->n_backends; b++) {
             ggml_backend_t backend = sched->backends[b];
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 1a92a1f9c8..c0c7749162 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -96,6 +96,7 @@ struct ggml_cuda_profiler_state {
     static constexpr int MAX_PENDING_EVENTS = 4096;
     std::vector<cudaEvent_t> start_events;
     std::vector<cudaEvent_t> end_events;
+    std::vector<uint64_t>    cpu_timestamps;  // CPU-side timestamps for global ordering
     int event_count = 0;
 
     std::vector<ggml_profile_record> records;
@@ -105,17 +106,19 @@ struct ggml_cuda_profiler_state {
         this->stream = stream;
         start_events.reserve(MAX_PENDING_EVENTS);
         end_events.reserve(MAX_PENDING_EVENTS);
+        cpu_timestamps.reserve(MAX_PENDING_EVENTS);
     }
 
     void reset() {
         for (auto & ev : start_events) {
-            cudaEventDestroy(ev);
+            (void) cudaEventDestroy(ev);
         }
         for (auto & ev : end_events) {
-            cudaEventDestroy(ev);
+            (void) cudaEventDestroy(ev);
         }
         start_events.clear();
         end_events.clear();
+        cpu_timestamps.clear();
         event_count = 0;
         records.clear();
         record_event_indices.clear();
@@ -127,17 +130,18 @@ struct ggml_cuda_profiler_state {
 
     void record_start() {
         cudaEvent_t ev;
-        cudaEventCreate(&ev);
-        cudaEventRecord(ev, stream);
+        (void) cudaEventCreate(&ev);
+        (void) cudaEventRecord(ev, stream);
         start_events.push_back(ev);
+        cpu_timestamps.push_back(ggml_profiler_time_ns());
         event_count++;
     }
 
     void record_end(const char * name, int backend_id, int split_id, uint64_t bytes, const char * extra,
                     const int64_t ne_src0[4], const int64_t ne_src1[4]) {
         cudaEvent_t ev;
-        cudaEventCreate(&ev);
-        cudaEventRecord(ev, stream);
+        (void) cudaEventCreate(&ev);
+        (void) cudaEventRecord(ev, stream);
         end_events.push_back(ev);
         record_event_indices.push_back(records.size());
 
@@ -156,15 +160,16 @@ struct ggml_cuda_profiler_state {
     }
 
     void finalize() {
-        cudaStreamSynchronize(stream);
+        (void) cudaStreamSynchronize(stream);
 
         for (int i = 0; i < (int)record_event_indices.size(); i++) {
             float ms = 0.0f;
-            cudaEventElapsedTime(&ms, start_events[i], end_events[i]);
-            uint64_t ns = (uint64_t)(ms * 1e6f);
+            (void) cudaEventElapsedTime(&ms, start_events[i], end_events[i]);
+            uint64_t duration_ns = (uint64_t)(ms * 1e6f);
             int rec_idx = record_event_indices[i];
-            records[rec_idx].start_ns = 0;
-            records[rec_idx].end_ns = ns;
+            // Use CPU-side timestamp for global ordering, GPU-measured duration for accuracy
+            records[rec_idx].start_ns = cpu_timestamps[i];
+            records[rec_idx].end_ns   = cpu_timestamps[i] + duration_ns;
         }
     }
 };
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index 9d9ba1ee21..3a07b281d6 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -55,8 +55,10 @@
 #define cudaError_t hipError_t
 #define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
 #define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
+#define cudaEventCreate hipEventCreate
 #define cudaEventCreateWithFlags hipEventCreateWithFlags
 #define cudaEventDisableTiming hipEventDisableTiming
+#define cudaEventElapsedTime hipEventElapsedTime
 #define cudaEventRecord hipEventRecord
 #define cudaEventSynchronize hipEventSynchronize
 #define cudaEvent_t hipEvent_t
diff --git a/ggml/src/ggml-cuda/vendors/musa.h b/ggml/src/ggml-cuda/vendors/musa.h
index 1abb8acfd4..d4ff57b3d6 100644
--- a/ggml/src/ggml-cuda/vendors/musa.h
+++ b/ggml/src/ggml-cuda/vendors/musa.h
@@ -44,8 +44,10 @@
 #define cudaError_t musaError_t
 #define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
 #define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
+#define cudaEventCreate musaEventCreate
 #define cudaEventCreateWithFlags musaEventCreateWithFlags
 #define cudaEventDisableTiming musaEventDisableTiming
+#define cudaEventElapsedTime musaEventElapsedTime
 #define cudaEventRecord musaEventRecord
 #define cudaEventSynchronize musaEventSynchronize
 #define cudaEvent_t musaEvent_t
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
index 9382ce53b3..853edd7c75 100644
--- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -597,6 +597,7 @@ ggml_backend_t ggml_backend_metal_init(void) {
         /* .interface = */ ggml_backend_metal_i,
         /* .device    = */ dev,
         /* .context   = */ ctx,
+        /* .profiler  = */ NULL,
     };
 
     ggml_backend_metal_set_n_cb(backend, 1);
@@ -691,6 +692,7 @@ static ggml_backend_t ggml_backend_metal_device_init_backend(ggml_backend_dev_t
         /* .interface = */ ggml_backend_metal_i,
         /* .device    = */ dev,
         /* .context   = */ ctx,
+        /* .profiler  = */ NULL,
     };
 
     ggml_backend_metal_set_n_cb(backend, 1);
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 4ecd3d6708..bde4d3b8e2 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -1,4 +1,5 @@
 #include "ggml-vulkan.h"
+#include "ggml-profiler.h"
 #include <vulkan/vulkan_core.h>
 #if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_CHECK_RESULTS)
 #include <chrono>
@@ -1700,8 +1701,8 @@ private:
 
 std::mutex vk_memory_logger::log_mutex;
 
-static bool vk_perf_logger_enabled = false;
-static bool vk_perf_logger_concurrent = false;
+static bool vk_perf_logger_enabled = false;       // deprecated: use --profile instead
+static bool vk_perf_logger_concurrent = false;     // GGML_VK_PERF_LOGGER_CONCURRENT: use concurrent timestamp mode
 static bool vk_enable_sync_logger = false;
 // number of calls between perf logger prints
 static uint32_t vk_perf_logger_frequency = 1;
@@ -1873,6 +1874,21 @@ class vk_perf_logger {
     uint32_t print_count {};
 };
 
+// Profiler state for the new ggml_backend_profiler interface
+struct ggml_vk_profiler_state {
+    bool enabled = false;
+    int  split_id = -1;
+
+    std::vector<ggml_profile_record> records;
+    std::vector<uint64_t>            cpu_timestamps;  // CPU-side timestamps for global ordering
+
+    void reset() {
+        records.clear();
+        cpu_timestamps.clear();
+        split_id = -1;
+    }
+};
+
 struct ggml_backend_vk_context {
     std::string name;
 
@@ -1930,8 +1946,9 @@ struct ggml_backend_vk_context {
     topk_moe_mode fused_topk_moe_mode {};
     bool fused_topk_moe_scale {};
 
-    // for GGML_VK_PERF_LOGGER
-    std::unique_ptr<vk_perf_logger> perf_logger;
+    // Profiling
+    std::unique_ptr<vk_perf_logger> perf_logger;   // legacy env-var profiler
+    ggml_vk_profiler_state * profiler_state = nullptr;
     vk::QueryPool query_pool;
     std::vector<const char *> query_fusion_names;
     std::vector<int> query_fusion_node_count;
@@ -12859,9 +12876,13 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
             ctx->unsynced_nodes_read.clear();
             ggml_vk_sync_buffers(ctx, compute_ctx);
 
-            if (vk_perf_logger_enabled && vk_perf_logger_concurrent) {
+            if ((vk_perf_logger_enabled || (ctx->profiler_state != nullptr && ctx->profiler_state->enabled))
+                    && vk_perf_logger_concurrent) {
                 ctx->query_node_idx[ctx->query_idx] = node_idx;
                 compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
+                if (ctx->profiler_state != nullptr && ctx->profiler_state->enabled) {
+                    ctx->profiler_state->cpu_timestamps.push_back(ggml_profiler_time_ns());
+                }
             }
         }
         // Add all fused nodes to the unsynchronized lists.
@@ -13384,7 +13405,7 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
 
         ctx->transfer_cmd_pool.destroy(ctx->device->device);
     }
-    if (vk_perf_logger_enabled) {
+    if (ctx->perf_logger) {
         ctx->perf_logger->print_timings(true);
     }
 }
@@ -14323,7 +14344,9 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
     ggml_vk_submit_transfer_ctx(ctx);
 
     vk_context compute_ctx;
-    if (vk_perf_logger_enabled) {
+    bool profiling = vk_perf_logger_enabled ||
+                     (ctx->profiler_state != nullptr && ctx->profiler_state->enabled);
+    if (profiling) {
         // allocate/resize the query pool
         if (ctx->num_queries < cgraph->n_nodes + 1) {
             if (ctx->query_pool) {
@@ -14350,6 +14373,10 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
         compute_ctx = ggml_vk_get_compute_ctx(ctx);
         ctx->query_idx = 0;
         compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
+        if (ctx->profiler_state != nullptr && ctx->profiler_state->enabled) {
+            ctx->profiler_state->cpu_timestamps.clear();
+            ctx->profiler_state->cpu_timestamps.push_back(ggml_profiler_time_ns());
+        }
     }
 
     ctx->prealloc_y_last_pipeline_used = nullptr;
@@ -14579,13 +14606,16 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
 
         bool enqueued = ggml_vk_build_graph(ctx, cgraph, i, cgraph->nodes[submit_node_idx], submit_node_idx, i + ctx->num_additional_fused_ops >= last_node, almost_ready, submit);
 
-        if (vk_perf_logger_enabled && enqueued) {
+        if (profiling && enqueued) {
             compute_ctx = ggml_vk_get_compute_ctx(ctx);
             if (!vk_perf_logger_concurrent) {
                 // track a single node/fusion for the current query
                 ctx->query_nodes[ctx->query_idx] = cgraph->nodes[i];
                 ctx->query_fusion_names[ctx->query_idx] = fusion_string;
                 compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
+                if (ctx->profiler_state != nullptr && ctx->profiler_state->enabled) {
+                    ctx->profiler_state->cpu_timestamps.push_back(ggml_profiler_time_ns());
+                }
             } else {
                 // track a fusion string and number of fused ops for the current node_idx
                 ctx->query_fusion_names[i] = fusion_string;
@@ -14619,7 +14649,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
 
     ctx->last_total_mul_mat_bytes = total_mul_mat_bytes;
 
-    if (vk_perf_logger_enabled) {
+    if (profiling) {
         // End the command buffer and submit/wait
         GGML_ASSERT(!ctx->compute_ctx.expired());
         compute_ctx = ctx->compute_ctx.lock();
@@ -14633,15 +14663,44 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
         // Get the results and pass them to the logger
         std::vector<uint64_t> timestamps(cgraph->n_nodes + 1);
         VK_CHECK(ctx->device->device.getQueryPoolResults(ctx->query_pool, 0, ctx->query_idx, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), "get timestamp results");
+
+        const double ts_period = ctx->device->properties.limits.timestampPeriod;
+        const bool has_profiler = ctx->profiler_state != nullptr && ctx->profiler_state->enabled;
+
         if (!vk_perf_logger_concurrent) {
             // Log each op separately
             for (int i = 1; i < ctx->query_idx; i++) {
                 auto node = ctx->query_nodes[i];
                 auto name = ctx->query_fusion_names[i];
-                ctx->perf_logger->log_timing(node, name, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod));
+                uint64_t duration_ns = uint64_t((timestamps[i] - timestamps[i-1]) * ts_period);
+
+                if (ctx->perf_logger) {
+                    ctx->perf_logger->log_timing(node, name, duration_ns);
+                }
+
+                if (has_profiler && node != nullptr) {
+                    static const int64_t zero_ne[4] = {0, 0, 0, 0};
+                    const int64_t * src0_ne = node->src[0] ? node->src[0]->ne : zero_ne;
+                    const int64_t * src1_ne = node->src[1] ? node->src[1]->ne : zero_ne;
+                    uint64_t cpu_ts = (i < (int)ctx->profiler_state->cpu_timestamps.size())
+                                    ? ctx->profiler_state->cpu_timestamps[i] : 0;
+
+                    ggml_profile_record rec;
+                    rec.type       = GGML_PROFILE_EVENT_OP;
+                    rec.name       = ggml_op_name(node->op);
+                    rec.backend_id = -1;
+                    rec.split_id   = ctx->profiler_state->split_id;
+                    rec.start_ns   = cpu_ts;
+                    rec.end_ns     = cpu_ts + duration_ns;
+                    rec.bytes      = ggml_nbytes(node);
+                    rec.extra      = name;  // fusion name or NULL
+                    memcpy(rec.ne_src0, src0_ne, sizeof(rec.ne_src0));
+                    memcpy(rec.ne_src1, src1_ne, sizeof(rec.ne_src1));
+                    ctx->profiler_state->records.push_back(rec);
+                }
             }
         } else {
-            // Log each group of nodes
+            // Log each group of nodes (concurrent mode)
             int prev_node_idx = 0;
             for (int i = 1; i < ctx->query_idx; i++) {
                 auto cur_node_idx = ctx->query_node_idx[i];
@@ -14656,10 +14715,42 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
                     node_idx += ctx->query_fusion_node_count[node_idx];
                 }
                 prev_node_idx = cur_node_idx;
-                ctx->perf_logger->log_timing(nodes, names, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod));
+                uint64_t duration_ns = uint64_t((timestamps[i] - timestamps[i-1]) * ts_period);
+
+                if (ctx->perf_logger) {
+                    ctx->perf_logger->log_timing(nodes, names, duration_ns);
+                }
+
+                if (has_profiler && !nodes.empty()) {
+                    uint64_t cpu_ts = (i < (int)ctx->profiler_state->cpu_timestamps.size())
+                                    ? ctx->profiler_state->cpu_timestamps[i] : 0;
+                    // In concurrent mode, distribute duration evenly across ops in group
+                    uint64_t per_op_ns = duration_ns / nodes.size();
+                    for (size_t j = 0; j < nodes.size(); j++) {
+                        auto * node = nodes[j];
+                        static const int64_t zero_ne[4] = {0, 0, 0, 0};
+                        const int64_t * src0_ne = node->src[0] ? node->src[0]->ne : zero_ne;
+                        const int64_t * src1_ne = node->src[1] ? node->src[1]->ne : zero_ne;
+
+                        ggml_profile_record rec;
+                        rec.type       = GGML_PROFILE_EVENT_OP;
+                        rec.name       = ggml_op_name(node->op);
+                        rec.backend_id = -1;
+                        rec.split_id   = ctx->profiler_state->split_id;
+                        rec.start_ns   = cpu_ts + j * per_op_ns;
+                        rec.end_ns     = cpu_ts + (j + 1) * per_op_ns;
+                        rec.bytes      = ggml_nbytes(node);
+                        rec.extra      = names[j];
+                        memcpy(rec.ne_src0, src0_ne, sizeof(rec.ne_src0));
+                        memcpy(rec.ne_src1, src1_ne, sizeof(rec.ne_src1));
+                        ctx->profiler_state->records.push_back(rec);
+                    }
+                }
             }
         }
-        ctx->perf_logger->print_timings();
+        if (ctx->perf_logger) {
+            ctx->perf_logger->print_timings();
+        }
     }
 
     if (!ctx->device->support_async) {
@@ -15002,6 +15093,58 @@ ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
         vk_backend->iface.get_tensor_async = nullptr;
     }
 
+    // Register profiler
+    auto * prof_state = new ggml_vk_profiler_state();
+    ctx->profiler_state = prof_state;
+
+    static auto vk_prof_enable = [](void * context, bool enable) {
+        auto * vk_ctx = (ggml_backend_vk_context *)context;
+        if (vk_ctx->profiler_state != nullptr) {
+            vk_ctx->profiler_state->enabled = enable;
+            if (!enable) {
+                vk_ctx->profiler_state->reset();
+            }
+        }
+    };
+    static auto vk_prof_reset = [](void * context) {
+        auto * vk_ctx = (ggml_backend_vk_context *)context;
+        if (vk_ctx->profiler_state != nullptr) {
+            vk_ctx->profiler_state->reset();
+        }
+    };
+    static auto vk_prof_set_split_id = [](void * context, int split_id) {
+        auto * vk_ctx = (ggml_backend_vk_context *)context;
+        if (vk_ctx->profiler_state != nullptr) {
+            vk_ctx->profiler_state->split_id = split_id;
+        }
+    };
+    static auto vk_prof_get_records = [](void * context, const ggml_profile_record ** out) -> int {
+        auto * vk_ctx = (ggml_backend_vk_context *)context;
+        if (vk_ctx->profiler_state != nullptr) {
+            *out = vk_ctx->profiler_state->records.data();
+            return (int)vk_ctx->profiler_state->records.size();
+        }
+        *out = nullptr;
+        return 0;
+    };
+    static auto vk_prof_free = [](void * context) {
+        auto * vk_ctx = (ggml_backend_vk_context *)context;
+        if (vk_ctx->profiler_state != nullptr) {
+            delete vk_ctx->profiler_state;
+            vk_ctx->profiler_state = nullptr;
+        }
+    };
+
+    auto * profiler = new ggml_backend_profiler {
+        /* .context      = */ ctx,
+        /* .enable       = */ vk_prof_enable,
+        /* .reset        = */ vk_prof_reset,
+        /* .set_split_id = */ vk_prof_set_split_id,
+        /* .get_records  = */ vk_prof_get_records,
+        /* .free_context = */ vk_prof_free,
+    };
+    ggml_backend_set_profiler(vk_backend, profiler);
+
     return vk_backend;
 }
 
diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
index 1aa15b0507..15c33b65c5 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -3102,6 +3102,7 @@ static ggml_backend_t ggml_backend_webgpu_backend_init(ggml_backend_dev_t dev, c
         /* .interface = */ ggml_backend_webgpu_i,
         /* .device    = */ dev,
         /* .context   = */ backend_ctx,
+        /* .profiler  = */ nullptr,
     };
     return backend;
 }
diff --git a/ggml/src/ggml-zdnn/ggml-zdnn.cpp b/ggml/src/ggml-zdnn/ggml-zdnn.cpp
index 9b6938abf7..37050a9a09 100644
--- a/ggml/src/ggml-zdnn/ggml-zdnn.cpp
+++ b/ggml/src/ggml-zdnn/ggml-zdnn.cpp
@@ -499,7 +499,8 @@ static ggml_backend_t ggml_backend_zdnn_device_init(ggml_backend_dev_t dev, cons
         /* .guid       = */ ggml_backend_zdnn_guid(),
         /* .iface      = */ ggml_backend_zdnn_i,
         /* .device     = */ dev,
-        /* .context    = */ ctx
+        /* .context    = */ ctx,
+        /* .profiler   = */ NULL,
     };
 
     return backend;
diff --git a/tools/profiler/profiler.py b/tools/profiler/profiler.py
index b81343c8fb..fd26f35f4e 100644
--- a/tools/profiler/profiler.py
+++ b/tools/profiler/profiler.py
@@ -358,43 +358,41 @@ class ProfileData:
                 "args": {"name": backend_names[bid]},
             })
 
-        # Group records by (backend_id, split_id) and lay them out sequentially
-        # since we don't have reliable global timestamps across backends.
-        # Within each group, events are cumulative.
-        from collections import OrderedDict
-        groups: OrderedDict[tuple, list[ProfileRecord]] = OrderedDict()
+        # Use real timestamps, but prevent overlaps within each track.
+        # GPU kernels are launched rapidly (small start_ns gaps) but have long
+        # durations, so naive real timestamps overlap.  Sweep-line per track:
+        # sort by start_ns, then place each event at max(start, prev_end).
+        from collections import defaultdict
+        tracks: dict[tuple, list[ProfileRecord]] = defaultdict(list)
         for rec in self.records:
-            key = (rec.backend_id, rec.split_id)
-            groups.setdefault(key, []).append(rec)
+            tracks[(rec.backend_id, rec.split_id)].append(rec)
 
-        # Assign timestamps: each group starts after the previous one,
-        # and events within a group are sequential (cumulative duration).
-        global_ts = 0.0  # microseconds
-        for key, recs in groups.items():
-            backend_id, split_id = key
-            pid = pid_map[backend_id]
-            tid = f"split_{split_id}"
+        for key in tracks:
+            tracks[key].sort(key=lambda r: r.start_ns)
 
+        for key, recs in tracks.items():
+            pid = pid_map[key[0]]
+            tid = f"split_{key[1]}"
+            cursor = 0.0
             for rec in recs:
+                ts = max(rec.start_ns / 1000.0, cursor)
+                dur = rec.duration_ns / 1000.0
                 cat = "copy" if rec.type == COPY_EVENT else "compute"
                 events.append({
                     "ph": "X",  # complete event
                     "pid": pid,
                     "tid": tid,
                     "name": rec.name,
-                    "ts": global_ts,
-                    "dur": rec.duration_ns / 1000.0,  # us
+                    "ts": ts,
+                    "dur": dur,
                     "cat": cat,
                     "args": {
                         "bytes": rec.bytes,
-                        "duration_us": rec.duration_ns / 1000.0,
+                        "duration_us": dur,
                         "shape": rec.shape_str,
                     },
                 })
-                global_ts += rec.duration_ns / 1000.0
-
-            # Add a small gap between groups for visual separation
-            global_ts += 1.0
+                cursor = ts + dur
 
         trace = {"traceEvents": events}
         with open(filepath, "w") as f:
@@ -944,8 +942,8 @@ Examples:
                         help="Export as Chrome Trace Event format")
     parser.add_argument("--html-viewer", metavar="FILE",
                         help="Export as interactive HTML timeline viewer")
-    parser.add_argument("--html-max-records", type=int, default=5000,
-                        help="Max records per backend in HTML viewer (0=unlimited, downsample to reduce file size)")
+    parser.add_argument("--html-max-records", type=int, default=0,
+                        help="Max records in HTML viewer (0=unlimited, set to downsample for huge traces)")
     parser.add_argument("--top-ops", type=int, default=0,
                         help="Show top N operations (0 = show summary)")
     parser.add_argument("--top-kernels", type=int, default=0,