fix builds, integrate vulkan profiler, fix copy events, fix export

2026-03-29 16:52:50 +02:00 · 2026-03-29 16:52:50 +02:00 · 64e438465f
parent a4352bd4a3
commit 64e438465f
9 changed files with 238 additions and 55 deletions
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@ -1575,12 +1575,14 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                    }

                    // group consecutive experts and copy them together
+                    size_t total_copied_bytes = 0;
                    auto copy_experts = [&](int32_t first_id, int32_t last_id) {
                        const size_t expert_offset = first_id * expert_size;
                        const size_t expert_size_copy =  (last_id - first_id + 1) * expert_size;
                        const size_t padding = std::min<size_t>(expert_size, 512);
                        const size_t padding_end = last_id < n_expert - 1 ? padding : 0;

+                        total_copied_bytes += expert_size_copy + padding_end;
                        ggml_backend_tensor_set_async(split_backend,
                            input_cpy,
                            (const uint8_t *)input->data + expert_offset, expert_offset,
@ -1589,6 +1591,11 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                            expert_size_copy + padding_end);
                    };

+                    uint64_t moe_copy_start = 0;
+                    if (sched->profiling_enabled) {
+                        moe_copy_start = ggml_profiler_time_ns();
+                    }
+
                    int id = 0;
                    while (!ggml_bitset_get(used_ids.data(), id)) {
                        id++;
@ -1612,9 +1619,34 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                        last_id = id;
                    }
                    copy_experts(first_id, last_id);
+
+                    if (sched->profiling_enabled) {
+                        uint64_t moe_copy_end = ggml_profiler_time_ns();
+
+                        enum ggml_backend_dev_type src_type = ggml_backend_dev_type(input_backend->device);
+                        enum ggml_backend_dev_type dst_type = ggml_backend_dev_type(split_backend->device);
+                        const char *               copy_dir = "copy_D2D";
+                        if (src_type == GGML_BACKEND_DEVICE_TYPE_CPU && dst_type != GGML_BACKEND_DEVICE_TYPE_CPU) {
+                            copy_dir = "copy_H2D";
+                        } else if (src_type != GGML_BACKEND_DEVICE_TYPE_CPU &&
+                                   dst_type == GGML_BACKEND_DEVICE_TYPE_CPU) {
+                            copy_dir = "copy_D2H";
+                        }
+
+                        sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id,
+                                                        split_id, moe_copy_start, moe_copy_end,
+                                                        (uint64_t) total_copied_bytes, NULL, {0}, {0} });
+                    }
                } else {
                    // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
                    // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
+
+                    // Capture timestamp before async attempt so we can record launch time
+                    uint64_t copy_start = 0;
+                    if (sched->profiling_enabled) {
+                        copy_start = ggml_profiler_time_ns();
+                    }
+
                    if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
                        ggml_backend_synchronize(input_backend);
                        if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
@ -1623,7 +1655,8 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                            ggml_backend_synchronize(split_backend);
                        }
                        if (sched->profiling_enabled) {
-                            uint64_t copy_start = ggml_profiler_time_ns();
+                            // Re-take start after sync for accurate sync copy measurement
+                            copy_start = ggml_profiler_time_ns();
                            ggml_backend_tensor_copy(input, input_cpy);
                            uint64_t copy_end = ggml_profiler_time_ns();

@ -1643,11 +1676,9 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                            ggml_backend_tensor_copy(input, input_cpy);
                        }
                    } else {
-                        // async copy completed - record it with available timing
+                        // async copy was launched — record the time spanning the async call
                        if (sched->profiling_enabled) {
-                            uint64_t copy_start = ggml_profiler_time_ns();
-                            // The async copy was already initiated; we just record the launch time
-                            uint64_t copy_end   = ggml_profiler_time_ns();
+                            uint64_t copy_end = ggml_profiler_time_ns();

                            enum ggml_backend_dev_type src_type = ggml_backend_dev_type(input_backend->device);
                            enum ggml_backend_dev_type dst_type = ggml_backend_dev_type(split_backend->device);
@ -1716,8 +1747,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s

    // Profiling: collect records from all backends and append to accumulated records
    if (sched->profiling_enabled) {
-        sched->copy_records.clear();
-
        // Collect backend operation records
        for (int b = 0; b < sched->n_backends; b++) {
            ggml_backend_t backend = sched->backends[b];
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@ -96,6 +96,7 @@ struct ggml_cuda_profiler_state {
    static constexpr int MAX_PENDING_EVENTS = 4096;
    std::vector<cudaEvent_t> start_events;
    std::vector<cudaEvent_t> end_events;
+    std::vector<uint64_t>    cpu_timestamps;  // CPU-side timestamps for global ordering
    int event_count = 0;

    std::vector<ggml_profile_record> records;
@ -105,17 +106,19 @@ struct ggml_cuda_profiler_state {
        this->stream = stream;
        start_events.reserve(MAX_PENDING_EVENTS);
        end_events.reserve(MAX_PENDING_EVENTS);
+        cpu_timestamps.reserve(MAX_PENDING_EVENTS);
    }

    void reset() {
        for (auto & ev : start_events) {
-            cudaEventDestroy(ev);
+            (void) cudaEventDestroy(ev);
        }
        for (auto & ev : end_events) {
-            cudaEventDestroy(ev);
+            (void) cudaEventDestroy(ev);
        }
        start_events.clear();
        end_events.clear();
+        cpu_timestamps.clear();
        event_count = 0;
        records.clear();
        record_event_indices.clear();
@ -127,17 +130,18 @@ struct ggml_cuda_profiler_state {

    void record_start() {
        cudaEvent_t ev;
-        cudaEventCreate(&ev);
-        cudaEventRecord(ev, stream);
+        (void) cudaEventCreate(&ev);
+        (void) cudaEventRecord(ev, stream);
        start_events.push_back(ev);
+        cpu_timestamps.push_back(ggml_profiler_time_ns());
        event_count++;
    }

    void record_end(const char * name, int backend_id, int split_id, uint64_t bytes, const char * extra,
                    const int64_t ne_src0[4], const int64_t ne_src1[4]) {
        cudaEvent_t ev;
-        cudaEventCreate(&ev);
-        cudaEventRecord(ev, stream);
+        (void) cudaEventCreate(&ev);
+        (void) cudaEventRecord(ev, stream);
        end_events.push_back(ev);
        record_event_indices.push_back(records.size());

@ -156,15 +160,16 @@ struct ggml_cuda_profiler_state {
    }

    void finalize() {
-        cudaStreamSynchronize(stream);
+        (void) cudaStreamSynchronize(stream);

        for (int i = 0; i < (int)record_event_indices.size(); i++) {
            float ms = 0.0f;
-            cudaEventElapsedTime(&ms, start_events[i], end_events[i]);
-            uint64_t ns = (uint64_t)(ms * 1e6f);
+            (void) cudaEventElapsedTime(&ms, start_events[i], end_events[i]);
+            uint64_t duration_ns = (uint64_t)(ms * 1e6f);
            int rec_idx = record_event_indices[i];
-            records[rec_idx].start_ns = 0;
-            records[rec_idx].end_ns = ns;
+            // Use CPU-side timestamp for global ordering, GPU-measured duration for accuracy
+            records[rec_idx].start_ns = cpu_timestamps[i];
+            records[rec_idx].end_ns   = cpu_timestamps[i] + duration_ns;
        }
    }
 };
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@ -55,8 +55,10 @@
 #define cudaError_t hipError_t
 #define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
 #define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
+#define cudaEventCreate hipEventCreate
 #define cudaEventCreateWithFlags hipEventCreateWithFlags
 #define cudaEventDisableTiming hipEventDisableTiming
+#define cudaEventElapsedTime hipEventElapsedTime
 #define cudaEventRecord hipEventRecord
 #define cudaEventSynchronize hipEventSynchronize
 #define cudaEvent_t hipEvent_t
--- a/ggml/src/ggml-cuda/vendors/musa.h
+++ b/ggml/src/ggml-cuda/vendors/musa.h
@ -44,8 +44,10 @@
 #define cudaError_t musaError_t
 #define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
 #define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
+#define cudaEventCreate musaEventCreate
 #define cudaEventCreateWithFlags musaEventCreateWithFlags
 #define cudaEventDisableTiming musaEventDisableTiming
+#define cudaEventElapsedTime musaEventElapsedTime
 #define cudaEventRecord musaEventRecord
 #define cudaEventSynchronize musaEventSynchronize
 #define cudaEvent_t musaEvent_t
--- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
@ -597,6 +597,7 @@ ggml_backend_t ggml_backend_metal_init(void) {
        /* .interface = */ ggml_backend_metal_i,
        /* .device    = */ dev,
        /* .context   = */ ctx,
+        /* .profiler  = */ NULL,
    };

    ggml_backend_metal_set_n_cb(backend, 1);
@ -691,6 +692,7 @@ static ggml_backend_t ggml_backend_metal_device_init_backend(ggml_backend_dev_t
        /* .interface = */ ggml_backend_metal_i,
        /* .device    = */ dev,
        /* .context   = */ ctx,
+        /* .profiler  = */ NULL,
    };

    ggml_backend_metal_set_n_cb(backend, 1);
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@ -1,4 +1,5 @@
 #include "ggml-vulkan.h"
+#include "ggml-profiler.h"
 #include <vulkan/vulkan_core.h>
 #if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_CHECK_RESULTS)
 #include <chrono>
@ -1700,8 +1701,8 @@ private:

 std::mutex vk_memory_logger::log_mutex;

-static bool vk_perf_logger_enabled = false;
-static bool vk_perf_logger_concurrent = false;
+static bool vk_perf_logger_enabled = false;       // deprecated: use --profile instead
+static bool vk_perf_logger_concurrent = false;     // GGML_VK_PERF_LOGGER_CONCURRENT: use concurrent timestamp mode
 static bool vk_enable_sync_logger = false;
 // number of calls between perf logger prints
 static uint32_t vk_perf_logger_frequency = 1;
@ -1873,6 +1874,21 @@ class vk_perf_logger {
    uint32_t print_count {};
 };

+// Profiler state for the new ggml_backend_profiler interface
+struct ggml_vk_profiler_state {
+    bool enabled = false;
+    int  split_id = -1;
+
+    std::vector<ggml_profile_record> records;
+    std::vector<uint64_t>            cpu_timestamps;  // CPU-side timestamps for global ordering
+
+    void reset() {
+        records.clear();
+        cpu_timestamps.clear();
+        split_id = -1;
+    }
+};
+
 struct ggml_backend_vk_context {
    std::string name;

@ -1930,8 +1946,9 @@ struct ggml_backend_vk_context {
    topk_moe_mode fused_topk_moe_mode {};
    bool fused_topk_moe_scale {};

-    // for GGML_VK_PERF_LOGGER
-    std::unique_ptr<vk_perf_logger> perf_logger;
+    // Profiling
+    std::unique_ptr<vk_perf_logger> perf_logger;   // legacy env-var profiler
+    ggml_vk_profiler_state * profiler_state = nullptr;
    vk::QueryPool query_pool;
    std::vector<const char *> query_fusion_names;
    std::vector<int> query_fusion_node_count;
@ -12859,9 +12876,13 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
            ctx->unsynced_nodes_read.clear();
            ggml_vk_sync_buffers(ctx, compute_ctx);

-            if (vk_perf_logger_enabled && vk_perf_logger_concurrent) {
+            if ((vk_perf_logger_enabled || (ctx->profiler_state != nullptr && ctx->profiler_state->enabled))
+                    && vk_perf_logger_concurrent) {
                ctx->query_node_idx[ctx->query_idx] = node_idx;
                compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
+                if (ctx->profiler_state != nullptr && ctx->profiler_state->enabled) {
+                    ctx->profiler_state->cpu_timestamps.push_back(ggml_profiler_time_ns());
+                }
            }
        }
        // Add all fused nodes to the unsynchronized lists.
@ -13384,7 +13405,7 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {

        ctx->transfer_cmd_pool.destroy(ctx->device->device);
    }
-    if (vk_perf_logger_enabled) {
+    if (ctx->perf_logger) {
        ctx->perf_logger->print_timings(true);
    }
 }
@ -14323,7 +14344,9 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
    ggml_vk_submit_transfer_ctx(ctx);

    vk_context compute_ctx;
-    if (vk_perf_logger_enabled) {
+    bool profiling = vk_perf_logger_enabled ||
+                     (ctx->profiler_state != nullptr && ctx->profiler_state->enabled);
+    if (profiling) {
        // allocate/resize the query pool
        if (ctx->num_queries < cgraph->n_nodes + 1) {
            if (ctx->query_pool) {
@ -14350,6 +14373,10 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
        compute_ctx = ggml_vk_get_compute_ctx(ctx);
        ctx->query_idx = 0;
        compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
+        if (ctx->profiler_state != nullptr && ctx->profiler_state->enabled) {
+            ctx->profiler_state->cpu_timestamps.clear();
+            ctx->profiler_state->cpu_timestamps.push_back(ggml_profiler_time_ns());
+        }
    }

    ctx->prealloc_y_last_pipeline_used = nullptr;
@ -14579,13 +14606,16 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg

        bool enqueued = ggml_vk_build_graph(ctx, cgraph, i, cgraph->nodes[submit_node_idx], submit_node_idx, i + ctx->num_additional_fused_ops >= last_node, almost_ready, submit);

-        if (vk_perf_logger_enabled && enqueued) {
+        if (profiling && enqueued) {
            compute_ctx = ggml_vk_get_compute_ctx(ctx);
            if (!vk_perf_logger_concurrent) {
                // track a single node/fusion for the current query
                ctx->query_nodes[ctx->query_idx] = cgraph->nodes[i];
                ctx->query_fusion_names[ctx->query_idx] = fusion_string;
                compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
+                if (ctx->profiler_state != nullptr && ctx->profiler_state->enabled) {
+                    ctx->profiler_state->cpu_timestamps.push_back(ggml_profiler_time_ns());
+                }
            } else {
                // track a fusion string and number of fused ops for the current node_idx
                ctx->query_fusion_names[i] = fusion_string;
@ -14619,7 +14649,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg

    ctx->last_total_mul_mat_bytes = total_mul_mat_bytes;

-    if (vk_perf_logger_enabled) {
+    if (profiling) {
        // End the command buffer and submit/wait
        GGML_ASSERT(!ctx->compute_ctx.expired());
        compute_ctx = ctx->compute_ctx.lock();
@ -14633,15 +14663,44 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
        // Get the results and pass them to the logger
        std::vector<uint64_t> timestamps(cgraph->n_nodes + 1);
        VK_CHECK(ctx->device->device.getQueryPoolResults(ctx->query_pool, 0, ctx->query_idx, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), "get timestamp results");
+
+        const double ts_period = ctx->device->properties.limits.timestampPeriod;
+        const bool has_profiler = ctx->profiler_state != nullptr && ctx->profiler_state->enabled;
+
        if (!vk_perf_logger_concurrent) {
            // Log each op separately
            for (int i = 1; i < ctx->query_idx; i++) {
                auto node = ctx->query_nodes[i];
                auto name = ctx->query_fusion_names[i];
-                ctx->perf_logger->log_timing(node, name, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod));
+                uint64_t duration_ns = uint64_t((timestamps[i] - timestamps[i-1]) * ts_period);
+
+                if (ctx->perf_logger) {
+                    ctx->perf_logger->log_timing(node, name, duration_ns);
+                }
+
+                if (has_profiler && node != nullptr) {
+                    static const int64_t zero_ne[4] = {0, 0, 0, 0};
+                    const int64_t * src0_ne = node->src[0] ? node->src[0]->ne : zero_ne;
+                    const int64_t * src1_ne = node->src[1] ? node->src[1]->ne : zero_ne;
+                    uint64_t cpu_ts = (i < (int)ctx->profiler_state->cpu_timestamps.size())
+                                    ? ctx->profiler_state->cpu_timestamps[i] : 0;
+
+                    ggml_profile_record rec;
+                    rec.type       = GGML_PROFILE_EVENT_OP;
+                    rec.name       = ggml_op_name(node->op);
+                    rec.backend_id = -1;
+                    rec.split_id   = ctx->profiler_state->split_id;
+                    rec.start_ns   = cpu_ts;
+                    rec.end_ns     = cpu_ts + duration_ns;
+                    rec.bytes      = ggml_nbytes(node);
+                    rec.extra      = name;  // fusion name or NULL
+                    memcpy(rec.ne_src0, src0_ne, sizeof(rec.ne_src0));
+                    memcpy(rec.ne_src1, src1_ne, sizeof(rec.ne_src1));
+                    ctx->profiler_state->records.push_back(rec);
+                }
            }
        } else {
-            // Log each group of nodes
+            // Log each group of nodes (concurrent mode)
            int prev_node_idx = 0;
            for (int i = 1; i < ctx->query_idx; i++) {
                auto cur_node_idx = ctx->query_node_idx[i];
@ -14656,10 +14715,42 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
                    node_idx += ctx->query_fusion_node_count[node_idx];
                }
                prev_node_idx = cur_node_idx;
-                ctx->perf_logger->log_timing(nodes, names, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod));
+                uint64_t duration_ns = uint64_t((timestamps[i] - timestamps[i-1]) * ts_period);
+
+                if (ctx->perf_logger) {
+                    ctx->perf_logger->log_timing(nodes, names, duration_ns);
+                }
+
+                if (has_profiler && !nodes.empty()) {
+                    uint64_t cpu_ts = (i < (int)ctx->profiler_state->cpu_timestamps.size())
+                                    ? ctx->profiler_state->cpu_timestamps[i] : 0;
+                    // In concurrent mode, distribute duration evenly across ops in group
+                    uint64_t per_op_ns = duration_ns / nodes.size();
+                    for (size_t j = 0; j < nodes.size(); j++) {
+                        auto * node = nodes[j];
+                        static const int64_t zero_ne[4] = {0, 0, 0, 0};
+                        const int64_t * src0_ne = node->src[0] ? node->src[0]->ne : zero_ne;
+                        const int64_t * src1_ne = node->src[1] ? node->src[1]->ne : zero_ne;
+
+                        ggml_profile_record rec;
+                        rec.type       = GGML_PROFILE_EVENT_OP;
+                        rec.name       = ggml_op_name(node->op);
+                        rec.backend_id = -1;
+                        rec.split_id   = ctx->profiler_state->split_id;
+                        rec.start_ns   = cpu_ts + j * per_op_ns;
+                        rec.end_ns     = cpu_ts + (j + 1) * per_op_ns;
+                        rec.bytes      = ggml_nbytes(node);
+                        rec.extra      = names[j];
+                        memcpy(rec.ne_src0, src0_ne, sizeof(rec.ne_src0));
+                        memcpy(rec.ne_src1, src1_ne, sizeof(rec.ne_src1));
+                        ctx->profiler_state->records.push_back(rec);
+                    }
+                }
            }
        }
-        ctx->perf_logger->print_timings();
+        if (ctx->perf_logger) {
+            ctx->perf_logger->print_timings();
+        }
    }

    if (!ctx->device->support_async) {
@ -15002,6 +15093,58 @@ ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
        vk_backend->iface.get_tensor_async = nullptr;
    }

+    // Register profiler
+    auto * prof_state = new ggml_vk_profiler_state();
+    ctx->profiler_state = prof_state;
+
+    static auto vk_prof_enable = [](void * context, bool enable) {
+        auto * vk_ctx = (ggml_backend_vk_context *)context;
+        if (vk_ctx->profiler_state != nullptr) {
+            vk_ctx->profiler_state->enabled = enable;
+            if (!enable) {
+                vk_ctx->profiler_state->reset();
+            }
+        }
+    };
+    static auto vk_prof_reset = [](void * context) {
+        auto * vk_ctx = (ggml_backend_vk_context *)context;
+        if (vk_ctx->profiler_state != nullptr) {
+            vk_ctx->profiler_state->reset();
+        }
+    };
+    static auto vk_prof_set_split_id = [](void * context, int split_id) {
+        auto * vk_ctx = (ggml_backend_vk_context *)context;
+        if (vk_ctx->profiler_state != nullptr) {
+            vk_ctx->profiler_state->split_id = split_id;
+        }
+    };
+    static auto vk_prof_get_records = [](void * context, const ggml_profile_record ** out) -> int {
+        auto * vk_ctx = (ggml_backend_vk_context *)context;
+        if (vk_ctx->profiler_state != nullptr) {
+            *out = vk_ctx->profiler_state->records.data();
+            return (int)vk_ctx->profiler_state->records.size();
+        }
+        *out = nullptr;
+        return 0;
+    };
+    static auto vk_prof_free = [](void * context) {
+        auto * vk_ctx = (ggml_backend_vk_context *)context;
+        if (vk_ctx->profiler_state != nullptr) {
+            delete vk_ctx->profiler_state;
+            vk_ctx->profiler_state = nullptr;
+        }
+    };
+
+    auto * profiler = new ggml_backend_profiler {
+        /* .context      = */ ctx,
+        /* .enable       = */ vk_prof_enable,
+        /* .reset        = */ vk_prof_reset,
+        /* .set_split_id = */ vk_prof_set_split_id,
+        /* .get_records  = */ vk_prof_get_records,
+        /* .free_context = */ vk_prof_free,
+    };
+    ggml_backend_set_profiler(vk_backend, profiler);
+
    return vk_backend;
 }

--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@ -3102,6 +3102,7 @@ static ggml_backend_t ggml_backend_webgpu_backend_init(ggml_backend_dev_t dev, c
        /* .interface = */ ggml_backend_webgpu_i,
        /* .device    = */ dev,
        /* .context   = */ backend_ctx,
+        /* .profiler  = */ nullptr,
    };
    return backend;
 }
--- a/ggml/src/ggml-zdnn/ggml-zdnn.cpp
+++ b/ggml/src/ggml-zdnn/ggml-zdnn.cpp
@ -499,7 +499,8 @@ static ggml_backend_t ggml_backend_zdnn_device_init(ggml_backend_dev_t dev, cons
        /* .guid       = */ ggml_backend_zdnn_guid(),
        /* .iface      = */ ggml_backend_zdnn_i,
        /* .device     = */ dev,
-        /* .context    = */ ctx
+        /* .context    = */ ctx,
+        /* .profiler   = */ NULL,
    };

    return backend;
--- a/tools/profiler/profiler.py
+++ b/tools/profiler/profiler.py
@ -358,43 +358,41 @@ class ProfileData:
                "args": {"name": backend_names[bid]},
            })

-        # Group records by (backend_id, split_id) and lay them out sequentially
-        # since we don't have reliable global timestamps across backends.
-        # Within each group, events are cumulative.
-        from collections import OrderedDict
-        groups: OrderedDict[tuple, list[ProfileRecord]] = OrderedDict()
+        # Use real timestamps, but prevent overlaps within each track.
+        # GPU kernels are launched rapidly (small start_ns gaps) but have long
+        # durations, so naive real timestamps overlap.  Sweep-line per track:
+        # sort by start_ns, then place each event at max(start, prev_end).
+        from collections import defaultdict
+        tracks: dict[tuple, list[ProfileRecord]] = defaultdict(list)
        for rec in self.records:
-            key = (rec.backend_id, rec.split_id)
-            groups.setdefault(key, []).append(rec)
+            tracks[(rec.backend_id, rec.split_id)].append(rec)

-        # Assign timestamps: each group starts after the previous one,
-        # and events within a group are sequential (cumulative duration).
-        global_ts = 0.0  # microseconds
-        for key, recs in groups.items():
-            backend_id, split_id = key
-            pid = pid_map[backend_id]
-            tid = f"split_{split_id}"
+        for key in tracks:
+            tracks[key].sort(key=lambda r: r.start_ns)

+        for key, recs in tracks.items():
+            pid = pid_map[key[0]]
+            tid = f"split_{key[1]}"
+            cursor = 0.0
            for rec in recs:
+                ts = max(rec.start_ns / 1000.0, cursor)
+                dur = rec.duration_ns / 1000.0
                cat = "copy" if rec.type == COPY_EVENT else "compute"
                events.append({
                    "ph": "X",  # complete event
                    "pid": pid,
                    "tid": tid,
                    "name": rec.name,
-                    "ts": global_ts,
-                    "dur": rec.duration_ns / 1000.0,  # us
+                    "ts": ts,
+                    "dur": dur,
                    "cat": cat,
                    "args": {
                        "bytes": rec.bytes,
-                        "duration_us": rec.duration_ns / 1000.0,
+                        "duration_us": dur,
                        "shape": rec.shape_str,
                    },
                })
-                global_ts += rec.duration_ns / 1000.0
-
-            # Add a small gap between groups for visual separation
-            global_ts += 1.0
+                cursor = ts + dur

        trace = {"traceEvents": events}
        with open(filepath, "w") as f:
@ -944,8 +942,8 @@ Examples:
                        help="Export as Chrome Trace Event format")
    parser.add_argument("--html-viewer", metavar="FILE",
                        help="Export as interactive HTML timeline viewer")
-    parser.add_argument("--html-max-records", type=int, default=5000,
-                        help="Max records per backend in HTML viewer (0=unlimited, downsample to reduce file size)")
+    parser.add_argument("--html-max-records", type=int, default=0,
+                        help="Max records in HTML viewer (0=unlimited, set to downsample for huge traces)")
    parser.add_argument("--top-ops", type=int, default=0,
                        help="Show top N operations (0 = show summary)")
    parser.add_argument("--top-kernels", type=int, default=0,