From 64e438465f73704615c449fa8871bdbf01be1023 Mon Sep 17 00:00:00 2001 From: Piotr Wilkin Date: Sun, 29 Mar 2026 16:52:50 +0200 Subject: [PATCH] fix builds, integrate vulkan profiler, fix copy events, fix export --- ggml/src/ggml-backend.cpp | 43 +++++-- ggml/src/ggml-cuda/ggml-cuda.cu | 27 +++-- ggml/src/ggml-cuda/vendors/hip.h | 2 + ggml/src/ggml-cuda/vendors/musa.h | 2 + ggml/src/ggml-metal/ggml-metal.cpp | 2 + ggml/src/ggml-vulkan/ggml-vulkan.cpp | 169 ++++++++++++++++++++++++--- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 1 + ggml/src/ggml-zdnn/ggml-zdnn.cpp | 3 +- tools/profiler/profiler.py | 44 ++++--- 9 files changed, 238 insertions(+), 55 deletions(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index b8b204fc60..577cdeb3b5 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1575,12 +1575,14 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } // group consecutive experts and copy them together + size_t total_copied_bytes = 0; auto copy_experts = [&](int32_t first_id, int32_t last_id) { const size_t expert_offset = first_id * expert_size; const size_t expert_size_copy = (last_id - first_id + 1) * expert_size; const size_t padding = std::min(expert_size, 512); const size_t padding_end = last_id < n_expert - 1 ? padding : 0; + total_copied_bytes += expert_size_copy + padding_end; ggml_backend_tensor_set_async(split_backend, input_cpy, (const uint8_t *)input->data + expert_offset, expert_offset, @@ -1589,6 +1591,11 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s expert_size_copy + padding_end); }; + uint64_t moe_copy_start = 0; + if (sched->profiling_enabled) { + moe_copy_start = ggml_profiler_time_ns(); + } + int id = 0; while (!ggml_bitset_get(used_ids.data(), id)) { id++; @@ -1612,9 +1619,34 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s last_id = id; } copy_experts(first_id, last_id); + + if (sched->profiling_enabled) { + uint64_t moe_copy_end = ggml_profiler_time_ns(); + + enum ggml_backend_dev_type src_type = ggml_backend_dev_type(input_backend->device); + enum ggml_backend_dev_type dst_type = ggml_backend_dev_type(split_backend->device); + const char * copy_dir = "copy_D2D"; + if (src_type == GGML_BACKEND_DEVICE_TYPE_CPU && dst_type != GGML_BACKEND_DEVICE_TYPE_CPU) { + copy_dir = "copy_H2D"; + } else if (src_type != GGML_BACKEND_DEVICE_TYPE_CPU && + dst_type == GGML_BACKEND_DEVICE_TYPE_CPU) { + copy_dir = "copy_D2H"; + } + + sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id, + split_id, moe_copy_start, moe_copy_end, + (uint64_t) total_copied_bytes, NULL, {0}, {0} }); + } } else { // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface + + // Capture timestamp before async attempt so we can record launch time + uint64_t copy_start = 0; + if (sched->profiling_enabled) { + copy_start = ggml_profiler_time_ns(); + } + if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) { ggml_backend_synchronize(input_backend); if (sched->events[split_backend_id][sched->cur_copy] != NULL) { @@ -1623,7 +1655,8 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s ggml_backend_synchronize(split_backend); } if (sched->profiling_enabled) { - uint64_t copy_start = ggml_profiler_time_ns(); + // Re-take start after sync for accurate sync copy measurement + copy_start = ggml_profiler_time_ns(); ggml_backend_tensor_copy(input, input_cpy); uint64_t copy_end = ggml_profiler_time_ns(); @@ -1643,11 +1676,9 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s ggml_backend_tensor_copy(input, input_cpy); } } else { - // async copy completed - record it with available timing + // async copy was launched — record the time spanning the async call if (sched->profiling_enabled) { - uint64_t copy_start = ggml_profiler_time_ns(); - // The async copy was already initiated; we just record the launch time - uint64_t copy_end = ggml_profiler_time_ns(); + uint64_t copy_end = ggml_profiler_time_ns(); enum ggml_backend_dev_type src_type = ggml_backend_dev_type(input_backend->device); enum ggml_backend_dev_type dst_type = ggml_backend_dev_type(split_backend->device); @@ -1716,8 +1747,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s // Profiling: collect records from all backends and append to accumulated records if (sched->profiling_enabled) { - sched->copy_records.clear(); - // Collect backend operation records for (int b = 0; b < sched->n_backends; b++) { ggml_backend_t backend = sched->backends[b]; diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 1a92a1f9c8..c0c7749162 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -96,6 +96,7 @@ struct ggml_cuda_profiler_state { static constexpr int MAX_PENDING_EVENTS = 4096; std::vector start_events; std::vector end_events; + std::vector cpu_timestamps; // CPU-side timestamps for global ordering int event_count = 0; std::vector records; @@ -105,17 +106,19 @@ struct ggml_cuda_profiler_state { this->stream = stream; start_events.reserve(MAX_PENDING_EVENTS); end_events.reserve(MAX_PENDING_EVENTS); + cpu_timestamps.reserve(MAX_PENDING_EVENTS); } void reset() { for (auto & ev : start_events) { - cudaEventDestroy(ev); + (void) cudaEventDestroy(ev); } for (auto & ev : end_events) { - cudaEventDestroy(ev); + (void) cudaEventDestroy(ev); } start_events.clear(); end_events.clear(); + cpu_timestamps.clear(); event_count = 0; records.clear(); record_event_indices.clear(); @@ -127,17 +130,18 @@ struct ggml_cuda_profiler_state { void record_start() { cudaEvent_t ev; - cudaEventCreate(&ev); - cudaEventRecord(ev, stream); + (void) cudaEventCreate(&ev); + (void) cudaEventRecord(ev, stream); start_events.push_back(ev); + cpu_timestamps.push_back(ggml_profiler_time_ns()); event_count++; } void record_end(const char * name, int backend_id, int split_id, uint64_t bytes, const char * extra, const int64_t ne_src0[4], const int64_t ne_src1[4]) { cudaEvent_t ev; - cudaEventCreate(&ev); - cudaEventRecord(ev, stream); + (void) cudaEventCreate(&ev); + (void) cudaEventRecord(ev, stream); end_events.push_back(ev); record_event_indices.push_back(records.size()); @@ -156,15 +160,16 @@ struct ggml_cuda_profiler_state { } void finalize() { - cudaStreamSynchronize(stream); + (void) cudaStreamSynchronize(stream); for (int i = 0; i < (int)record_event_indices.size(); i++) { float ms = 0.0f; - cudaEventElapsedTime(&ms, start_events[i], end_events[i]); - uint64_t ns = (uint64_t)(ms * 1e6f); + (void) cudaEventElapsedTime(&ms, start_events[i], end_events[i]); + uint64_t duration_ns = (uint64_t)(ms * 1e6f); int rec_idx = record_event_indices[i]; - records[rec_idx].start_ns = 0; - records[rec_idx].end_ns = ns; + // Use CPU-side timestamp for global ordering, GPU-measured duration for accuracy + records[rec_idx].start_ns = cpu_timestamps[i]; + records[rec_idx].end_ns = cpu_timestamps[i] + duration_ns; } } }; diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h index 9d9ba1ee21..3a07b281d6 100644 --- a/ggml/src/ggml-cuda/vendors/hip.h +++ b/ggml/src/ggml-cuda/vendors/hip.h @@ -55,8 +55,10 @@ #define cudaError_t hipError_t #define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled #define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled +#define cudaEventCreate hipEventCreate #define cudaEventCreateWithFlags hipEventCreateWithFlags #define cudaEventDisableTiming hipEventDisableTiming +#define cudaEventElapsedTime hipEventElapsedTime #define cudaEventRecord hipEventRecord #define cudaEventSynchronize hipEventSynchronize #define cudaEvent_t hipEvent_t diff --git a/ggml/src/ggml-cuda/vendors/musa.h b/ggml/src/ggml-cuda/vendors/musa.h index 1abb8acfd4..d4ff57b3d6 100644 --- a/ggml/src/ggml-cuda/vendors/musa.h +++ b/ggml/src/ggml-cuda/vendors/musa.h @@ -44,8 +44,10 @@ #define cudaError_t musaError_t #define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled #define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled +#define cudaEventCreate musaEventCreate #define cudaEventCreateWithFlags musaEventCreateWithFlags #define cudaEventDisableTiming musaEventDisableTiming +#define cudaEventElapsedTime musaEventElapsedTime #define cudaEventRecord musaEventRecord #define cudaEventSynchronize musaEventSynchronize #define cudaEvent_t musaEvent_t diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp index 9382ce53b3..853edd7c75 100644 --- a/ggml/src/ggml-metal/ggml-metal.cpp +++ b/ggml/src/ggml-metal/ggml-metal.cpp @@ -597,6 +597,7 @@ ggml_backend_t ggml_backend_metal_init(void) { /* .interface = */ ggml_backend_metal_i, /* .device = */ dev, /* .context = */ ctx, + /* .profiler = */ NULL, }; ggml_backend_metal_set_n_cb(backend, 1); @@ -691,6 +692,7 @@ static ggml_backend_t ggml_backend_metal_device_init_backend(ggml_backend_dev_t /* .interface = */ ggml_backend_metal_i, /* .device = */ dev, /* .context = */ ctx, + /* .profiler = */ NULL, }; ggml_backend_metal_set_n_cb(backend, 1); diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 4ecd3d6708..bde4d3b8e2 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -1,4 +1,5 @@ #include "ggml-vulkan.h" +#include "ggml-profiler.h" #include #if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_CHECK_RESULTS) #include @@ -1700,8 +1701,8 @@ private: std::mutex vk_memory_logger::log_mutex; -static bool vk_perf_logger_enabled = false; -static bool vk_perf_logger_concurrent = false; +static bool vk_perf_logger_enabled = false; // deprecated: use --profile instead +static bool vk_perf_logger_concurrent = false; // GGML_VK_PERF_LOGGER_CONCURRENT: use concurrent timestamp mode static bool vk_enable_sync_logger = false; // number of calls between perf logger prints static uint32_t vk_perf_logger_frequency = 1; @@ -1873,6 +1874,21 @@ class vk_perf_logger { uint32_t print_count {}; }; +// Profiler state for the new ggml_backend_profiler interface +struct ggml_vk_profiler_state { + bool enabled = false; + int split_id = -1; + + std::vector records; + std::vector cpu_timestamps; // CPU-side timestamps for global ordering + + void reset() { + records.clear(); + cpu_timestamps.clear(); + split_id = -1; + } +}; + struct ggml_backend_vk_context { std::string name; @@ -1930,8 +1946,9 @@ struct ggml_backend_vk_context { topk_moe_mode fused_topk_moe_mode {}; bool fused_topk_moe_scale {}; - // for GGML_VK_PERF_LOGGER - std::unique_ptr perf_logger; + // Profiling + std::unique_ptr perf_logger; // legacy env-var profiler + ggml_vk_profiler_state * profiler_state = nullptr; vk::QueryPool query_pool; std::vector query_fusion_names; std::vector query_fusion_node_count; @@ -12859,9 +12876,13 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr ctx->unsynced_nodes_read.clear(); ggml_vk_sync_buffers(ctx, compute_ctx); - if (vk_perf_logger_enabled && vk_perf_logger_concurrent) { + if ((vk_perf_logger_enabled || (ctx->profiler_state != nullptr && ctx->profiler_state->enabled)) + && vk_perf_logger_concurrent) { ctx->query_node_idx[ctx->query_idx] = node_idx; compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++); + if (ctx->profiler_state != nullptr && ctx->profiler_state->enabled) { + ctx->profiler_state->cpu_timestamps.push_back(ggml_profiler_time_ns()); + } } } // Add all fused nodes to the unsynchronized lists. @@ -13384,7 +13405,7 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) { ctx->transfer_cmd_pool.destroy(ctx->device->device); } - if (vk_perf_logger_enabled) { + if (ctx->perf_logger) { ctx->perf_logger->print_timings(true); } } @@ -14323,7 +14344,9 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg ggml_vk_submit_transfer_ctx(ctx); vk_context compute_ctx; - if (vk_perf_logger_enabled) { + bool profiling = vk_perf_logger_enabled || + (ctx->profiler_state != nullptr && ctx->profiler_state->enabled); + if (profiling) { // allocate/resize the query pool if (ctx->num_queries < cgraph->n_nodes + 1) { if (ctx->query_pool) { @@ -14350,6 +14373,10 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg compute_ctx = ggml_vk_get_compute_ctx(ctx); ctx->query_idx = 0; compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++); + if (ctx->profiler_state != nullptr && ctx->profiler_state->enabled) { + ctx->profiler_state->cpu_timestamps.clear(); + ctx->profiler_state->cpu_timestamps.push_back(ggml_profiler_time_ns()); + } } ctx->prealloc_y_last_pipeline_used = nullptr; @@ -14579,13 +14606,16 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg bool enqueued = ggml_vk_build_graph(ctx, cgraph, i, cgraph->nodes[submit_node_idx], submit_node_idx, i + ctx->num_additional_fused_ops >= last_node, almost_ready, submit); - if (vk_perf_logger_enabled && enqueued) { + if (profiling && enqueued) { compute_ctx = ggml_vk_get_compute_ctx(ctx); if (!vk_perf_logger_concurrent) { // track a single node/fusion for the current query ctx->query_nodes[ctx->query_idx] = cgraph->nodes[i]; ctx->query_fusion_names[ctx->query_idx] = fusion_string; compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++); + if (ctx->profiler_state != nullptr && ctx->profiler_state->enabled) { + ctx->profiler_state->cpu_timestamps.push_back(ggml_profiler_time_ns()); + } } else { // track a fusion string and number of fused ops for the current node_idx ctx->query_fusion_names[i] = fusion_string; @@ -14619,7 +14649,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg ctx->last_total_mul_mat_bytes = total_mul_mat_bytes; - if (vk_perf_logger_enabled) { + if (profiling) { // End the command buffer and submit/wait GGML_ASSERT(!ctx->compute_ctx.expired()); compute_ctx = ctx->compute_ctx.lock(); @@ -14633,15 +14663,44 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg // Get the results and pass them to the logger std::vector timestamps(cgraph->n_nodes + 1); VK_CHECK(ctx->device->device.getQueryPoolResults(ctx->query_pool, 0, ctx->query_idx, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), "get timestamp results"); + + const double ts_period = ctx->device->properties.limits.timestampPeriod; + const bool has_profiler = ctx->profiler_state != nullptr && ctx->profiler_state->enabled; + if (!vk_perf_logger_concurrent) { // Log each op separately for (int i = 1; i < ctx->query_idx; i++) { auto node = ctx->query_nodes[i]; auto name = ctx->query_fusion_names[i]; - ctx->perf_logger->log_timing(node, name, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod)); + uint64_t duration_ns = uint64_t((timestamps[i] - timestamps[i-1]) * ts_period); + + if (ctx->perf_logger) { + ctx->perf_logger->log_timing(node, name, duration_ns); + } + + if (has_profiler && node != nullptr) { + static const int64_t zero_ne[4] = {0, 0, 0, 0}; + const int64_t * src0_ne = node->src[0] ? node->src[0]->ne : zero_ne; + const int64_t * src1_ne = node->src[1] ? node->src[1]->ne : zero_ne; + uint64_t cpu_ts = (i < (int)ctx->profiler_state->cpu_timestamps.size()) + ? ctx->profiler_state->cpu_timestamps[i] : 0; + + ggml_profile_record rec; + rec.type = GGML_PROFILE_EVENT_OP; + rec.name = ggml_op_name(node->op); + rec.backend_id = -1; + rec.split_id = ctx->profiler_state->split_id; + rec.start_ns = cpu_ts; + rec.end_ns = cpu_ts + duration_ns; + rec.bytes = ggml_nbytes(node); + rec.extra = name; // fusion name or NULL + memcpy(rec.ne_src0, src0_ne, sizeof(rec.ne_src0)); + memcpy(rec.ne_src1, src1_ne, sizeof(rec.ne_src1)); + ctx->profiler_state->records.push_back(rec); + } } } else { - // Log each group of nodes + // Log each group of nodes (concurrent mode) int prev_node_idx = 0; for (int i = 1; i < ctx->query_idx; i++) { auto cur_node_idx = ctx->query_node_idx[i]; @@ -14656,10 +14715,42 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg node_idx += ctx->query_fusion_node_count[node_idx]; } prev_node_idx = cur_node_idx; - ctx->perf_logger->log_timing(nodes, names, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod)); + uint64_t duration_ns = uint64_t((timestamps[i] - timestamps[i-1]) * ts_period); + + if (ctx->perf_logger) { + ctx->perf_logger->log_timing(nodes, names, duration_ns); + } + + if (has_profiler && !nodes.empty()) { + uint64_t cpu_ts = (i < (int)ctx->profiler_state->cpu_timestamps.size()) + ? ctx->profiler_state->cpu_timestamps[i] : 0; + // In concurrent mode, distribute duration evenly across ops in group + uint64_t per_op_ns = duration_ns / nodes.size(); + for (size_t j = 0; j < nodes.size(); j++) { + auto * node = nodes[j]; + static const int64_t zero_ne[4] = {0, 0, 0, 0}; + const int64_t * src0_ne = node->src[0] ? node->src[0]->ne : zero_ne; + const int64_t * src1_ne = node->src[1] ? node->src[1]->ne : zero_ne; + + ggml_profile_record rec; + rec.type = GGML_PROFILE_EVENT_OP; + rec.name = ggml_op_name(node->op); + rec.backend_id = -1; + rec.split_id = ctx->profiler_state->split_id; + rec.start_ns = cpu_ts + j * per_op_ns; + rec.end_ns = cpu_ts + (j + 1) * per_op_ns; + rec.bytes = ggml_nbytes(node); + rec.extra = names[j]; + memcpy(rec.ne_src0, src0_ne, sizeof(rec.ne_src0)); + memcpy(rec.ne_src1, src1_ne, sizeof(rec.ne_src1)); + ctx->profiler_state->records.push_back(rec); + } + } } } - ctx->perf_logger->print_timings(); + if (ctx->perf_logger) { + ctx->perf_logger->print_timings(); + } } if (!ctx->device->support_async) { @@ -15002,6 +15093,58 @@ ggml_backend_t ggml_backend_vk_init(size_t dev_num) { vk_backend->iface.get_tensor_async = nullptr; } + // Register profiler + auto * prof_state = new ggml_vk_profiler_state(); + ctx->profiler_state = prof_state; + + static auto vk_prof_enable = [](void * context, bool enable) { + auto * vk_ctx = (ggml_backend_vk_context *)context; + if (vk_ctx->profiler_state != nullptr) { + vk_ctx->profiler_state->enabled = enable; + if (!enable) { + vk_ctx->profiler_state->reset(); + } + } + }; + static auto vk_prof_reset = [](void * context) { + auto * vk_ctx = (ggml_backend_vk_context *)context; + if (vk_ctx->profiler_state != nullptr) { + vk_ctx->profiler_state->reset(); + } + }; + static auto vk_prof_set_split_id = [](void * context, int split_id) { + auto * vk_ctx = (ggml_backend_vk_context *)context; + if (vk_ctx->profiler_state != nullptr) { + vk_ctx->profiler_state->split_id = split_id; + } + }; + static auto vk_prof_get_records = [](void * context, const ggml_profile_record ** out) -> int { + auto * vk_ctx = (ggml_backend_vk_context *)context; + if (vk_ctx->profiler_state != nullptr) { + *out = vk_ctx->profiler_state->records.data(); + return (int)vk_ctx->profiler_state->records.size(); + } + *out = nullptr; + return 0; + }; + static auto vk_prof_free = [](void * context) { + auto * vk_ctx = (ggml_backend_vk_context *)context; + if (vk_ctx->profiler_state != nullptr) { + delete vk_ctx->profiler_state; + vk_ctx->profiler_state = nullptr; + } + }; + + auto * profiler = new ggml_backend_profiler { + /* .context = */ ctx, + /* .enable = */ vk_prof_enable, + /* .reset = */ vk_prof_reset, + /* .set_split_id = */ vk_prof_set_split_id, + /* .get_records = */ vk_prof_get_records, + /* .free_context = */ vk_prof_free, + }; + ggml_backend_set_profiler(vk_backend, profiler); + return vk_backend; } diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 1aa15b0507..15c33b65c5 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -3102,6 +3102,7 @@ static ggml_backend_t ggml_backend_webgpu_backend_init(ggml_backend_dev_t dev, c /* .interface = */ ggml_backend_webgpu_i, /* .device = */ dev, /* .context = */ backend_ctx, + /* .profiler = */ nullptr, }; return backend; } diff --git a/ggml/src/ggml-zdnn/ggml-zdnn.cpp b/ggml/src/ggml-zdnn/ggml-zdnn.cpp index 9b6938abf7..37050a9a09 100644 --- a/ggml/src/ggml-zdnn/ggml-zdnn.cpp +++ b/ggml/src/ggml-zdnn/ggml-zdnn.cpp @@ -499,7 +499,8 @@ static ggml_backend_t ggml_backend_zdnn_device_init(ggml_backend_dev_t dev, cons /* .guid = */ ggml_backend_zdnn_guid(), /* .iface = */ ggml_backend_zdnn_i, /* .device = */ dev, - /* .context = */ ctx + /* .context = */ ctx, + /* .profiler = */ NULL, }; return backend; diff --git a/tools/profiler/profiler.py b/tools/profiler/profiler.py index b81343c8fb..fd26f35f4e 100644 --- a/tools/profiler/profiler.py +++ b/tools/profiler/profiler.py @@ -358,43 +358,41 @@ class ProfileData: "args": {"name": backend_names[bid]}, }) - # Group records by (backend_id, split_id) and lay them out sequentially - # since we don't have reliable global timestamps across backends. - # Within each group, events are cumulative. - from collections import OrderedDict - groups: OrderedDict[tuple, list[ProfileRecord]] = OrderedDict() + # Use real timestamps, but prevent overlaps within each track. + # GPU kernels are launched rapidly (small start_ns gaps) but have long + # durations, so naive real timestamps overlap. Sweep-line per track: + # sort by start_ns, then place each event at max(start, prev_end). + from collections import defaultdict + tracks: dict[tuple, list[ProfileRecord]] = defaultdict(list) for rec in self.records: - key = (rec.backend_id, rec.split_id) - groups.setdefault(key, []).append(rec) + tracks[(rec.backend_id, rec.split_id)].append(rec) - # Assign timestamps: each group starts after the previous one, - # and events within a group are sequential (cumulative duration). - global_ts = 0.0 # microseconds - for key, recs in groups.items(): - backend_id, split_id = key - pid = pid_map[backend_id] - tid = f"split_{split_id}" + for key in tracks: + tracks[key].sort(key=lambda r: r.start_ns) + for key, recs in tracks.items(): + pid = pid_map[key[0]] + tid = f"split_{key[1]}" + cursor = 0.0 for rec in recs: + ts = max(rec.start_ns / 1000.0, cursor) + dur = rec.duration_ns / 1000.0 cat = "copy" if rec.type == COPY_EVENT else "compute" events.append({ "ph": "X", # complete event "pid": pid, "tid": tid, "name": rec.name, - "ts": global_ts, - "dur": rec.duration_ns / 1000.0, # us + "ts": ts, + "dur": dur, "cat": cat, "args": { "bytes": rec.bytes, - "duration_us": rec.duration_ns / 1000.0, + "duration_us": dur, "shape": rec.shape_str, }, }) - global_ts += rec.duration_ns / 1000.0 - - # Add a small gap between groups for visual separation - global_ts += 1.0 + cursor = ts + dur trace = {"traceEvents": events} with open(filepath, "w") as f: @@ -944,8 +942,8 @@ Examples: help="Export as Chrome Trace Event format") parser.add_argument("--html-viewer", metavar="FILE", help="Export as interactive HTML timeline viewer") - parser.add_argument("--html-max-records", type=int, default=5000, - help="Max records per backend in HTML viewer (0=unlimited, downsample to reduce file size)") + parser.add_argument("--html-max-records", type=int, default=0, + help="Max records in HTML viewer (0=unlimited, set to downsample for huge traces)") parser.add_argument("--top-ops", type=int, default=0, help="Show top N operations (0 = show summary)") parser.add_argument("--top-kernels", type=int, default=0,