fix builds, integrate vulkan profiler, fix copy events, fix export

This commit is contained in:
Piotr Wilkin 2026-03-29 16:52:50 +02:00
parent a4352bd4a3
commit 64e438465f
9 changed files with 238 additions and 55 deletions

View File

@ -1575,12 +1575,14 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
}
// group consecutive experts and copy them together
size_t total_copied_bytes = 0;
auto copy_experts = [&](int32_t first_id, int32_t last_id) {
const size_t expert_offset = first_id * expert_size;
const size_t expert_size_copy = (last_id - first_id + 1) * expert_size;
const size_t padding = std::min<size_t>(expert_size, 512);
const size_t padding_end = last_id < n_expert - 1 ? padding : 0;
total_copied_bytes += expert_size_copy + padding_end;
ggml_backend_tensor_set_async(split_backend,
input_cpy,
(const uint8_t *)input->data + expert_offset, expert_offset,
@ -1589,6 +1591,11 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
expert_size_copy + padding_end);
};
uint64_t moe_copy_start = 0;
if (sched->profiling_enabled) {
moe_copy_start = ggml_profiler_time_ns();
}
int id = 0;
while (!ggml_bitset_get(used_ids.data(), id)) {
id++;
@ -1612,9 +1619,34 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
last_id = id;
}
copy_experts(first_id, last_id);
if (sched->profiling_enabled) {
uint64_t moe_copy_end = ggml_profiler_time_ns();
enum ggml_backend_dev_type src_type = ggml_backend_dev_type(input_backend->device);
enum ggml_backend_dev_type dst_type = ggml_backend_dev_type(split_backend->device);
const char * copy_dir = "copy_D2D";
if (src_type == GGML_BACKEND_DEVICE_TYPE_CPU && dst_type != GGML_BACKEND_DEVICE_TYPE_CPU) {
copy_dir = "copy_H2D";
} else if (src_type != GGML_BACKEND_DEVICE_TYPE_CPU &&
dst_type == GGML_BACKEND_DEVICE_TYPE_CPU) {
copy_dir = "copy_D2H";
}
sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id,
split_id, moe_copy_start, moe_copy_end,
(uint64_t) total_copied_bytes, NULL, {0}, {0} });
}
} else {
// try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
// TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
// Capture timestamp before async attempt so we can record launch time
uint64_t copy_start = 0;
if (sched->profiling_enabled) {
copy_start = ggml_profiler_time_ns();
}
if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
ggml_backend_synchronize(input_backend);
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
@ -1623,7 +1655,8 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
ggml_backend_synchronize(split_backend);
}
if (sched->profiling_enabled) {
uint64_t copy_start = ggml_profiler_time_ns();
// Re-take start after sync for accurate sync copy measurement
copy_start = ggml_profiler_time_ns();
ggml_backend_tensor_copy(input, input_cpy);
uint64_t copy_end = ggml_profiler_time_ns();
@ -1643,11 +1676,9 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
ggml_backend_tensor_copy(input, input_cpy);
}
} else {
// async copy completed - record it with available timing
// async copy was launched — record the time spanning the async call
if (sched->profiling_enabled) {
uint64_t copy_start = ggml_profiler_time_ns();
// The async copy was already initiated; we just record the launch time
uint64_t copy_end = ggml_profiler_time_ns();
uint64_t copy_end = ggml_profiler_time_ns();
enum ggml_backend_dev_type src_type = ggml_backend_dev_type(input_backend->device);
enum ggml_backend_dev_type dst_type = ggml_backend_dev_type(split_backend->device);
@ -1716,8 +1747,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
// Profiling: collect records from all backends and append to accumulated records
if (sched->profiling_enabled) {
sched->copy_records.clear();
// Collect backend operation records
for (int b = 0; b < sched->n_backends; b++) {
ggml_backend_t backend = sched->backends[b];

View File

@ -96,6 +96,7 @@ struct ggml_cuda_profiler_state {
static constexpr int MAX_PENDING_EVENTS = 4096;
std::vector<cudaEvent_t> start_events;
std::vector<cudaEvent_t> end_events;
std::vector<uint64_t> cpu_timestamps; // CPU-side timestamps for global ordering
int event_count = 0;
std::vector<ggml_profile_record> records;
@ -105,17 +106,19 @@ struct ggml_cuda_profiler_state {
this->stream = stream;
start_events.reserve(MAX_PENDING_EVENTS);
end_events.reserve(MAX_PENDING_EVENTS);
cpu_timestamps.reserve(MAX_PENDING_EVENTS);
}
void reset() {
for (auto & ev : start_events) {
cudaEventDestroy(ev);
(void) cudaEventDestroy(ev);
}
for (auto & ev : end_events) {
cudaEventDestroy(ev);
(void) cudaEventDestroy(ev);
}
start_events.clear();
end_events.clear();
cpu_timestamps.clear();
event_count = 0;
records.clear();
record_event_indices.clear();
@ -127,17 +130,18 @@ struct ggml_cuda_profiler_state {
void record_start() {
cudaEvent_t ev;
cudaEventCreate(&ev);
cudaEventRecord(ev, stream);
(void) cudaEventCreate(&ev);
(void) cudaEventRecord(ev, stream);
start_events.push_back(ev);
cpu_timestamps.push_back(ggml_profiler_time_ns());
event_count++;
}
void record_end(const char * name, int backend_id, int split_id, uint64_t bytes, const char * extra,
const int64_t ne_src0[4], const int64_t ne_src1[4]) {
cudaEvent_t ev;
cudaEventCreate(&ev);
cudaEventRecord(ev, stream);
(void) cudaEventCreate(&ev);
(void) cudaEventRecord(ev, stream);
end_events.push_back(ev);
record_event_indices.push_back(records.size());
@ -156,15 +160,16 @@ struct ggml_cuda_profiler_state {
}
void finalize() {
cudaStreamSynchronize(stream);
(void) cudaStreamSynchronize(stream);
for (int i = 0; i < (int)record_event_indices.size(); i++) {
float ms = 0.0f;
cudaEventElapsedTime(&ms, start_events[i], end_events[i]);
uint64_t ns = (uint64_t)(ms * 1e6f);
(void) cudaEventElapsedTime(&ms, start_events[i], end_events[i]);
uint64_t duration_ns = (uint64_t)(ms * 1e6f);
int rec_idx = record_event_indices[i];
records[rec_idx].start_ns = 0;
records[rec_idx].end_ns = ns;
// Use CPU-side timestamp for global ordering, GPU-measured duration for accuracy
records[rec_idx].start_ns = cpu_timestamps[i];
records[rec_idx].end_ns = cpu_timestamps[i] + duration_ns;
}
}
};

View File

@ -55,8 +55,10 @@
#define cudaError_t hipError_t
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
#define cudaEventCreate hipEventCreate
#define cudaEventCreateWithFlags hipEventCreateWithFlags
#define cudaEventDisableTiming hipEventDisableTiming
#define cudaEventElapsedTime hipEventElapsedTime
#define cudaEventRecord hipEventRecord
#define cudaEventSynchronize hipEventSynchronize
#define cudaEvent_t hipEvent_t

View File

@ -44,8 +44,10 @@
#define cudaError_t musaError_t
#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
#define cudaEventCreate musaEventCreate
#define cudaEventCreateWithFlags musaEventCreateWithFlags
#define cudaEventDisableTiming musaEventDisableTiming
#define cudaEventElapsedTime musaEventElapsedTime
#define cudaEventRecord musaEventRecord
#define cudaEventSynchronize musaEventSynchronize
#define cudaEvent_t musaEvent_t

View File

@ -597,6 +597,7 @@ ggml_backend_t ggml_backend_metal_init(void) {
/* .interface = */ ggml_backend_metal_i,
/* .device = */ dev,
/* .context = */ ctx,
/* .profiler = */ NULL,
};
ggml_backend_metal_set_n_cb(backend, 1);
@ -691,6 +692,7 @@ static ggml_backend_t ggml_backend_metal_device_init_backend(ggml_backend_dev_t
/* .interface = */ ggml_backend_metal_i,
/* .device = */ dev,
/* .context = */ ctx,
/* .profiler = */ NULL,
};
ggml_backend_metal_set_n_cb(backend, 1);

View File

@ -1,4 +1,5 @@
#include "ggml-vulkan.h"
#include "ggml-profiler.h"
#include <vulkan/vulkan_core.h>
#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_CHECK_RESULTS)
#include <chrono>
@ -1700,8 +1701,8 @@ private:
std::mutex vk_memory_logger::log_mutex;
static bool vk_perf_logger_enabled = false;
static bool vk_perf_logger_concurrent = false;
static bool vk_perf_logger_enabled = false; // deprecated: use --profile instead
static bool vk_perf_logger_concurrent = false; // GGML_VK_PERF_LOGGER_CONCURRENT: use concurrent timestamp mode
static bool vk_enable_sync_logger = false;
// number of calls between perf logger prints
static uint32_t vk_perf_logger_frequency = 1;
@ -1873,6 +1874,21 @@ class vk_perf_logger {
uint32_t print_count {};
};
// Profiler state for the new ggml_backend_profiler interface
// Per-backend state backing the generic ggml_backend_profiler interface
// for Vulkan: collected op records for the current run, plus host-clock
// timestamps used to order events globally across heterogeneous backends.
struct ggml_vk_profiler_state {
    bool enabled { false };   // set via the profiler enable() callback
    int split_id { -1 };      // current scheduler split, -1 when unset
    std::vector<ggml_profile_record> records;
    std::vector<uint64_t> cpu_timestamps; // CPU-side timestamps for global ordering
    // Drop all collected data and return to the "no split" state;
    // does not touch `enabled`, which is managed by the owner.
    void reset() {
        split_id = -1;
        cpu_timestamps.clear();
        records.clear();
    }
};
struct ggml_backend_vk_context {
std::string name;
@ -1930,8 +1946,9 @@ struct ggml_backend_vk_context {
topk_moe_mode fused_topk_moe_mode {};
bool fused_topk_moe_scale {};
// for GGML_VK_PERF_LOGGER
std::unique_ptr<vk_perf_logger> perf_logger;
// Profiling
std::unique_ptr<vk_perf_logger> perf_logger; // legacy env-var profiler
ggml_vk_profiler_state * profiler_state = nullptr;
vk::QueryPool query_pool;
std::vector<const char *> query_fusion_names;
std::vector<int> query_fusion_node_count;
@ -12859,9 +12876,13 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
ctx->unsynced_nodes_read.clear();
ggml_vk_sync_buffers(ctx, compute_ctx);
if (vk_perf_logger_enabled && vk_perf_logger_concurrent) {
if ((vk_perf_logger_enabled || (ctx->profiler_state != nullptr && ctx->profiler_state->enabled))
&& vk_perf_logger_concurrent) {
ctx->query_node_idx[ctx->query_idx] = node_idx;
compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
if (ctx->profiler_state != nullptr && ctx->profiler_state->enabled) {
ctx->profiler_state->cpu_timestamps.push_back(ggml_profiler_time_ns());
}
}
}
// Add all fused nodes to the unsynchronized lists.
@ -13384,7 +13405,7 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
ctx->transfer_cmd_pool.destroy(ctx->device->device);
}
if (vk_perf_logger_enabled) {
if (ctx->perf_logger) {
ctx->perf_logger->print_timings(true);
}
}
@ -14323,7 +14344,9 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
ggml_vk_submit_transfer_ctx(ctx);
vk_context compute_ctx;
if (vk_perf_logger_enabled) {
bool profiling = vk_perf_logger_enabled ||
(ctx->profiler_state != nullptr && ctx->profiler_state->enabled);
if (profiling) {
// allocate/resize the query pool
if (ctx->num_queries < cgraph->n_nodes + 1) {
if (ctx->query_pool) {
@ -14350,6 +14373,10 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
compute_ctx = ggml_vk_get_compute_ctx(ctx);
ctx->query_idx = 0;
compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
if (ctx->profiler_state != nullptr && ctx->profiler_state->enabled) {
ctx->profiler_state->cpu_timestamps.clear();
ctx->profiler_state->cpu_timestamps.push_back(ggml_profiler_time_ns());
}
}
ctx->prealloc_y_last_pipeline_used = nullptr;
@ -14579,13 +14606,16 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
bool enqueued = ggml_vk_build_graph(ctx, cgraph, i, cgraph->nodes[submit_node_idx], submit_node_idx, i + ctx->num_additional_fused_ops >= last_node, almost_ready, submit);
if (vk_perf_logger_enabled && enqueued) {
if (profiling && enqueued) {
compute_ctx = ggml_vk_get_compute_ctx(ctx);
if (!vk_perf_logger_concurrent) {
// track a single node/fusion for the current query
ctx->query_nodes[ctx->query_idx] = cgraph->nodes[i];
ctx->query_fusion_names[ctx->query_idx] = fusion_string;
compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
if (ctx->profiler_state != nullptr && ctx->profiler_state->enabled) {
ctx->profiler_state->cpu_timestamps.push_back(ggml_profiler_time_ns());
}
} else {
// track a fusion string and number of fused ops for the current node_idx
ctx->query_fusion_names[i] = fusion_string;
@ -14619,7 +14649,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
ctx->last_total_mul_mat_bytes = total_mul_mat_bytes;
if (vk_perf_logger_enabled) {
if (profiling) {
// End the command buffer and submit/wait
GGML_ASSERT(!ctx->compute_ctx.expired());
compute_ctx = ctx->compute_ctx.lock();
@ -14633,15 +14663,44 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
// Get the results and pass them to the logger
std::vector<uint64_t> timestamps(cgraph->n_nodes + 1);
VK_CHECK(ctx->device->device.getQueryPoolResults(ctx->query_pool, 0, ctx->query_idx, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), "get timestamp results");
const double ts_period = ctx->device->properties.limits.timestampPeriod;
const bool has_profiler = ctx->profiler_state != nullptr && ctx->profiler_state->enabled;
if (!vk_perf_logger_concurrent) {
// Log each op separately
for (int i = 1; i < ctx->query_idx; i++) {
auto node = ctx->query_nodes[i];
auto name = ctx->query_fusion_names[i];
ctx->perf_logger->log_timing(node, name, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod));
uint64_t duration_ns = uint64_t((timestamps[i] - timestamps[i-1]) * ts_period);
if (ctx->perf_logger) {
ctx->perf_logger->log_timing(node, name, duration_ns);
}
if (has_profiler && node != nullptr) {
static const int64_t zero_ne[4] = {0, 0, 0, 0};
const int64_t * src0_ne = node->src[0] ? node->src[0]->ne : zero_ne;
const int64_t * src1_ne = node->src[1] ? node->src[1]->ne : zero_ne;
uint64_t cpu_ts = (i < (int)ctx->profiler_state->cpu_timestamps.size())
? ctx->profiler_state->cpu_timestamps[i] : 0;
ggml_profile_record rec;
rec.type = GGML_PROFILE_EVENT_OP;
rec.name = ggml_op_name(node->op);
rec.backend_id = -1;
rec.split_id = ctx->profiler_state->split_id;
rec.start_ns = cpu_ts;
rec.end_ns = cpu_ts + duration_ns;
rec.bytes = ggml_nbytes(node);
rec.extra = name; // fusion name or NULL
memcpy(rec.ne_src0, src0_ne, sizeof(rec.ne_src0));
memcpy(rec.ne_src1, src1_ne, sizeof(rec.ne_src1));
ctx->profiler_state->records.push_back(rec);
}
}
} else {
// Log each group of nodes
// Log each group of nodes (concurrent mode)
int prev_node_idx = 0;
for (int i = 1; i < ctx->query_idx; i++) {
auto cur_node_idx = ctx->query_node_idx[i];
@ -14656,10 +14715,42 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
node_idx += ctx->query_fusion_node_count[node_idx];
}
prev_node_idx = cur_node_idx;
ctx->perf_logger->log_timing(nodes, names, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod));
uint64_t duration_ns = uint64_t((timestamps[i] - timestamps[i-1]) * ts_period);
if (ctx->perf_logger) {
ctx->perf_logger->log_timing(nodes, names, duration_ns);
}
if (has_profiler && !nodes.empty()) {
uint64_t cpu_ts = (i < (int)ctx->profiler_state->cpu_timestamps.size())
? ctx->profiler_state->cpu_timestamps[i] : 0;
// In concurrent mode, distribute duration evenly across ops in group
uint64_t per_op_ns = duration_ns / nodes.size();
for (size_t j = 0; j < nodes.size(); j++) {
auto * node = nodes[j];
static const int64_t zero_ne[4] = {0, 0, 0, 0};
const int64_t * src0_ne = node->src[0] ? node->src[0]->ne : zero_ne;
const int64_t * src1_ne = node->src[1] ? node->src[1]->ne : zero_ne;
ggml_profile_record rec;
rec.type = GGML_PROFILE_EVENT_OP;
rec.name = ggml_op_name(node->op);
rec.backend_id = -1;
rec.split_id = ctx->profiler_state->split_id;
rec.start_ns = cpu_ts + j * per_op_ns;
rec.end_ns = cpu_ts + (j + 1) * per_op_ns;
rec.bytes = ggml_nbytes(node);
rec.extra = names[j];
memcpy(rec.ne_src0, src0_ne, sizeof(rec.ne_src0));
memcpy(rec.ne_src1, src1_ne, sizeof(rec.ne_src1));
ctx->profiler_state->records.push_back(rec);
}
}
}
}
ctx->perf_logger->print_timings();
if (ctx->perf_logger) {
ctx->perf_logger->print_timings();
}
}
if (!ctx->device->support_async) {
@ -15002,6 +15093,58 @@ ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
vk_backend->iface.get_tensor_async = nullptr;
}
// Register profiler
auto * prof_state = new ggml_vk_profiler_state();
ctx->profiler_state = prof_state;
static auto vk_prof_enable = [](void * context, bool enable) {
auto * vk_ctx = (ggml_backend_vk_context *)context;
if (vk_ctx->profiler_state != nullptr) {
vk_ctx->profiler_state->enabled = enable;
if (!enable) {
vk_ctx->profiler_state->reset();
}
}
};
static auto vk_prof_reset = [](void * context) {
auto * vk_ctx = (ggml_backend_vk_context *)context;
if (vk_ctx->profiler_state != nullptr) {
vk_ctx->profiler_state->reset();
}
};
static auto vk_prof_set_split_id = [](void * context, int split_id) {
auto * vk_ctx = (ggml_backend_vk_context *)context;
if (vk_ctx->profiler_state != nullptr) {
vk_ctx->profiler_state->split_id = split_id;
}
};
static auto vk_prof_get_records = [](void * context, const ggml_profile_record ** out) -> int {
auto * vk_ctx = (ggml_backend_vk_context *)context;
if (vk_ctx->profiler_state != nullptr) {
*out = vk_ctx->profiler_state->records.data();
return (int)vk_ctx->profiler_state->records.size();
}
*out = nullptr;
return 0;
};
static auto vk_prof_free = [](void * context) {
auto * vk_ctx = (ggml_backend_vk_context *)context;
if (vk_ctx->profiler_state != nullptr) {
delete vk_ctx->profiler_state;
vk_ctx->profiler_state = nullptr;
}
};
auto * profiler = new ggml_backend_profiler {
/* .context = */ ctx,
/* .enable = */ vk_prof_enable,
/* .reset = */ vk_prof_reset,
/* .set_split_id = */ vk_prof_set_split_id,
/* .get_records = */ vk_prof_get_records,
/* .free_context = */ vk_prof_free,
};
ggml_backend_set_profiler(vk_backend, profiler);
return vk_backend;
}

View File

@ -3102,6 +3102,7 @@ static ggml_backend_t ggml_backend_webgpu_backend_init(ggml_backend_dev_t dev, c
/* .interface = */ ggml_backend_webgpu_i,
/* .device = */ dev,
/* .context = */ backend_ctx,
/* .profiler = */ nullptr,
};
return backend;
}

View File

@ -499,7 +499,8 @@ static ggml_backend_t ggml_backend_zdnn_device_init(ggml_backend_dev_t dev, cons
/* .guid = */ ggml_backend_zdnn_guid(),
/* .iface = */ ggml_backend_zdnn_i,
/* .device = */ dev,
/* .context = */ ctx
/* .context = */ ctx,
/* .profiler = */ NULL,
};
return backend;

View File

@ -358,43 +358,41 @@ class ProfileData:
"args": {"name": backend_names[bid]},
})
# Group records by (backend_id, split_id) and lay them out sequentially
# since we don't have reliable global timestamps across backends.
# Within each group, events are cumulative.
from collections import OrderedDict
groups: OrderedDict[tuple, list[ProfileRecord]] = OrderedDict()
# Use real timestamps, but prevent overlaps within each track.
# GPU kernels are launched rapidly (small start_ns gaps) but have long
# durations, so naive real timestamps overlap. Sweep-line per track:
# sort by start_ns, then place each event at max(start, prev_end).
from collections import defaultdict
tracks: dict[tuple, list[ProfileRecord]] = defaultdict(list)
for rec in self.records:
key = (rec.backend_id, rec.split_id)
groups.setdefault(key, []).append(rec)
tracks[(rec.backend_id, rec.split_id)].append(rec)
# Assign timestamps: each group starts after the previous one,
# and events within a group are sequential (cumulative duration).
global_ts = 0.0 # microseconds
for key, recs in groups.items():
backend_id, split_id = key
pid = pid_map[backend_id]
tid = f"split_{split_id}"
for key in tracks:
tracks[key].sort(key=lambda r: r.start_ns)
for key, recs in tracks.items():
pid = pid_map[key[0]]
tid = f"split_{key[1]}"
cursor = 0.0
for rec in recs:
ts = max(rec.start_ns / 1000.0, cursor)
dur = rec.duration_ns / 1000.0
cat = "copy" if rec.type == COPY_EVENT else "compute"
events.append({
"ph": "X", # complete event
"pid": pid,
"tid": tid,
"name": rec.name,
"ts": global_ts,
"dur": rec.duration_ns / 1000.0, # us
"ts": ts,
"dur": dur,
"cat": cat,
"args": {
"bytes": rec.bytes,
"duration_us": rec.duration_ns / 1000.0,
"duration_us": dur,
"shape": rec.shape_str,
},
})
global_ts += rec.duration_ns / 1000.0
# Add a small gap between groups for visual separation
global_ts += 1.0
cursor = ts + dur
trace = {"traceEvents": events}
with open(filepath, "w") as f:
@ -944,8 +942,8 @@ Examples:
help="Export as Chrome Trace Event format")
parser.add_argument("--html-viewer", metavar="FILE",
help="Export as interactive HTML timeline viewer")
parser.add_argument("--html-max-records", type=int, default=5000,
help="Max records per backend in HTML viewer (0=unlimited, downsample to reduce file size)")
parser.add_argument("--html-max-records", type=int, default=0,
help="Max records in HTML viewer (0=unlimited, set to downsample for huge traces)")
parser.add_argument("--top-ops", type=int, default=0,
help="Show top N operations (0 = show summary)")
parser.add_argument("--top-kernels", type=int, default=0,