fix builds, integrate vulkan profiler, fix copy events, fix export
This commit is contained in:
parent
a4352bd4a3
commit
64e438465f
|
|
@ -1575,12 +1575,14 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
|||
}
|
||||
|
||||
// group consecutive experts and copy them together
|
||||
size_t total_copied_bytes = 0;
|
||||
auto copy_experts = [&](int32_t first_id, int32_t last_id) {
|
||||
const size_t expert_offset = first_id * expert_size;
|
||||
const size_t expert_size_copy = (last_id - first_id + 1) * expert_size;
|
||||
const size_t padding = std::min<size_t>(expert_size, 512);
|
||||
const size_t padding_end = last_id < n_expert - 1 ? padding : 0;
|
||||
|
||||
total_copied_bytes += expert_size_copy + padding_end;
|
||||
ggml_backend_tensor_set_async(split_backend,
|
||||
input_cpy,
|
||||
(const uint8_t *)input->data + expert_offset, expert_offset,
|
||||
|
|
@ -1589,6 +1591,11 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
|||
expert_size_copy + padding_end);
|
||||
};
|
||||
|
||||
uint64_t moe_copy_start = 0;
|
||||
if (sched->profiling_enabled) {
|
||||
moe_copy_start = ggml_profiler_time_ns();
|
||||
}
|
||||
|
||||
int id = 0;
|
||||
while (!ggml_bitset_get(used_ids.data(), id)) {
|
||||
id++;
|
||||
|
|
@ -1612,9 +1619,34 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
|||
last_id = id;
|
||||
}
|
||||
copy_experts(first_id, last_id);
|
||||
|
||||
if (sched->profiling_enabled) {
|
||||
uint64_t moe_copy_end = ggml_profiler_time_ns();
|
||||
|
||||
enum ggml_backend_dev_type src_type = ggml_backend_dev_type(input_backend->device);
|
||||
enum ggml_backend_dev_type dst_type = ggml_backend_dev_type(split_backend->device);
|
||||
const char * copy_dir = "copy_D2D";
|
||||
if (src_type == GGML_BACKEND_DEVICE_TYPE_CPU && dst_type != GGML_BACKEND_DEVICE_TYPE_CPU) {
|
||||
copy_dir = "copy_H2D";
|
||||
} else if (src_type != GGML_BACKEND_DEVICE_TYPE_CPU &&
|
||||
dst_type == GGML_BACKEND_DEVICE_TYPE_CPU) {
|
||||
copy_dir = "copy_D2H";
|
||||
}
|
||||
|
||||
sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id,
|
||||
split_id, moe_copy_start, moe_copy_end,
|
||||
(uint64_t) total_copied_bytes, NULL, {0}, {0} });
|
||||
}
|
||||
} else {
|
||||
// try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
|
||||
// TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
|
||||
|
||||
// Capture timestamp before async attempt so we can record launch time
|
||||
uint64_t copy_start = 0;
|
||||
if (sched->profiling_enabled) {
|
||||
copy_start = ggml_profiler_time_ns();
|
||||
}
|
||||
|
||||
if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
|
||||
ggml_backend_synchronize(input_backend);
|
||||
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
||||
|
|
@ -1623,7 +1655,8 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
|||
ggml_backend_synchronize(split_backend);
|
||||
}
|
||||
if (sched->profiling_enabled) {
|
||||
uint64_t copy_start = ggml_profiler_time_ns();
|
||||
// Re-take start after sync for accurate sync copy measurement
|
||||
copy_start = ggml_profiler_time_ns();
|
||||
ggml_backend_tensor_copy(input, input_cpy);
|
||||
uint64_t copy_end = ggml_profiler_time_ns();
|
||||
|
||||
|
|
@ -1643,11 +1676,9 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
|||
ggml_backend_tensor_copy(input, input_cpy);
|
||||
}
|
||||
} else {
|
||||
// async copy completed - record it with available timing
|
||||
// async copy was launched — record the time spanning the async call
|
||||
if (sched->profiling_enabled) {
|
||||
uint64_t copy_start = ggml_profiler_time_ns();
|
||||
// The async copy was already initiated; we just record the launch time
|
||||
uint64_t copy_end = ggml_profiler_time_ns();
|
||||
uint64_t copy_end = ggml_profiler_time_ns();
|
||||
|
||||
enum ggml_backend_dev_type src_type = ggml_backend_dev_type(input_backend->device);
|
||||
enum ggml_backend_dev_type dst_type = ggml_backend_dev_type(split_backend->device);
|
||||
|
|
@ -1716,8 +1747,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
|||
|
||||
// Profiling: collect records from all backends and append to accumulated records
|
||||
if (sched->profiling_enabled) {
|
||||
sched->copy_records.clear();
|
||||
|
||||
// Collect backend operation records
|
||||
for (int b = 0; b < sched->n_backends; b++) {
|
||||
ggml_backend_t backend = sched->backends[b];
|
||||
|
|
|
|||
|
|
@ -96,6 +96,7 @@ struct ggml_cuda_profiler_state {
|
|||
static constexpr int MAX_PENDING_EVENTS = 4096;
|
||||
std::vector<cudaEvent_t> start_events;
|
||||
std::vector<cudaEvent_t> end_events;
|
||||
std::vector<uint64_t> cpu_timestamps; // CPU-side timestamps for global ordering
|
||||
int event_count = 0;
|
||||
|
||||
std::vector<ggml_profile_record> records;
|
||||
|
|
@ -105,17 +106,19 @@ struct ggml_cuda_profiler_state {
|
|||
this->stream = stream;
|
||||
start_events.reserve(MAX_PENDING_EVENTS);
|
||||
end_events.reserve(MAX_PENDING_EVENTS);
|
||||
cpu_timestamps.reserve(MAX_PENDING_EVENTS);
|
||||
}
|
||||
|
||||
void reset() {
|
||||
for (auto & ev : start_events) {
|
||||
cudaEventDestroy(ev);
|
||||
(void) cudaEventDestroy(ev);
|
||||
}
|
||||
for (auto & ev : end_events) {
|
||||
cudaEventDestroy(ev);
|
||||
(void) cudaEventDestroy(ev);
|
||||
}
|
||||
start_events.clear();
|
||||
end_events.clear();
|
||||
cpu_timestamps.clear();
|
||||
event_count = 0;
|
||||
records.clear();
|
||||
record_event_indices.clear();
|
||||
|
|
@ -127,17 +130,18 @@ struct ggml_cuda_profiler_state {
|
|||
|
||||
void record_start() {
|
||||
cudaEvent_t ev;
|
||||
cudaEventCreate(&ev);
|
||||
cudaEventRecord(ev, stream);
|
||||
(void) cudaEventCreate(&ev);
|
||||
(void) cudaEventRecord(ev, stream);
|
||||
start_events.push_back(ev);
|
||||
cpu_timestamps.push_back(ggml_profiler_time_ns());
|
||||
event_count++;
|
||||
}
|
||||
|
||||
void record_end(const char * name, int backend_id, int split_id, uint64_t bytes, const char * extra,
|
||||
const int64_t ne_src0[4], const int64_t ne_src1[4]) {
|
||||
cudaEvent_t ev;
|
||||
cudaEventCreate(&ev);
|
||||
cudaEventRecord(ev, stream);
|
||||
(void) cudaEventCreate(&ev);
|
||||
(void) cudaEventRecord(ev, stream);
|
||||
end_events.push_back(ev);
|
||||
record_event_indices.push_back(records.size());
|
||||
|
||||
|
|
@ -156,15 +160,16 @@ struct ggml_cuda_profiler_state {
|
|||
}
|
||||
|
||||
void finalize() {
|
||||
cudaStreamSynchronize(stream);
|
||||
(void) cudaStreamSynchronize(stream);
|
||||
|
||||
for (int i = 0; i < (int)record_event_indices.size(); i++) {
|
||||
float ms = 0.0f;
|
||||
cudaEventElapsedTime(&ms, start_events[i], end_events[i]);
|
||||
uint64_t ns = (uint64_t)(ms * 1e6f);
|
||||
(void) cudaEventElapsedTime(&ms, start_events[i], end_events[i]);
|
||||
uint64_t duration_ns = (uint64_t)(ms * 1e6f);
|
||||
int rec_idx = record_event_indices[i];
|
||||
records[rec_idx].start_ns = 0;
|
||||
records[rec_idx].end_ns = ns;
|
||||
// Use CPU-side timestamp for global ordering, GPU-measured duration for accuracy
|
||||
records[rec_idx].start_ns = cpu_timestamps[i];
|
||||
records[rec_idx].end_ns = cpu_timestamps[i] + duration_ns;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
|
|
|||
|
|
@ -55,8 +55,10 @@
|
|||
#define cudaError_t hipError_t
|
||||
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
|
||||
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
|
||||
#define cudaEventCreate hipEventCreate
|
||||
#define cudaEventCreateWithFlags hipEventCreateWithFlags
|
||||
#define cudaEventDisableTiming hipEventDisableTiming
|
||||
#define cudaEventElapsedTime hipEventElapsedTime
|
||||
#define cudaEventRecord hipEventRecord
|
||||
#define cudaEventSynchronize hipEventSynchronize
|
||||
#define cudaEvent_t hipEvent_t
|
||||
|
|
|
|||
|
|
@ -44,8 +44,10 @@
|
|||
#define cudaError_t musaError_t
|
||||
#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
|
||||
#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
|
||||
#define cudaEventCreate musaEventCreate
|
||||
#define cudaEventCreateWithFlags musaEventCreateWithFlags
|
||||
#define cudaEventDisableTiming musaEventDisableTiming
|
||||
#define cudaEventElapsedTime musaEventElapsedTime
|
||||
#define cudaEventRecord musaEventRecord
|
||||
#define cudaEventSynchronize musaEventSynchronize
|
||||
#define cudaEvent_t musaEvent_t
|
||||
|
|
|
|||
|
|
@ -597,6 +597,7 @@ ggml_backend_t ggml_backend_metal_init(void) {
|
|||
/* .interface = */ ggml_backend_metal_i,
|
||||
/* .device = */ dev,
|
||||
/* .context = */ ctx,
|
||||
/* .profiler = */ NULL,
|
||||
};
|
||||
|
||||
ggml_backend_metal_set_n_cb(backend, 1);
|
||||
|
|
@ -691,6 +692,7 @@ static ggml_backend_t ggml_backend_metal_device_init_backend(ggml_backend_dev_t
|
|||
/* .interface = */ ggml_backend_metal_i,
|
||||
/* .device = */ dev,
|
||||
/* .context = */ ctx,
|
||||
/* .profiler = */ NULL,
|
||||
};
|
||||
|
||||
ggml_backend_metal_set_n_cb(backend, 1);
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
#include "ggml-vulkan.h"
|
||||
#include "ggml-profiler.h"
|
||||
#include <vulkan/vulkan_core.h>
|
||||
#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_CHECK_RESULTS)
|
||||
#include <chrono>
|
||||
|
|
@ -1700,8 +1701,8 @@ private:
|
|||
|
||||
std::mutex vk_memory_logger::log_mutex;
|
||||
|
||||
static bool vk_perf_logger_enabled = false;
|
||||
static bool vk_perf_logger_concurrent = false;
|
||||
static bool vk_perf_logger_enabled = false; // deprecated: use --profile instead
|
||||
static bool vk_perf_logger_concurrent = false; // GGML_VK_PERF_LOGGER_CONCURRENT: use concurrent timestamp mode
|
||||
static bool vk_enable_sync_logger = false;
|
||||
// number of calls between perf logger prints
|
||||
static uint32_t vk_perf_logger_frequency = 1;
|
||||
|
|
@ -1873,6 +1874,21 @@ class vk_perf_logger {
|
|||
uint32_t print_count {};
|
||||
};
|
||||
|
||||
// Profiler state for the new ggml_backend_profiler interface
|
||||
struct ggml_vk_profiler_state {
|
||||
bool enabled = false;
|
||||
int split_id = -1;
|
||||
|
||||
std::vector<ggml_profile_record> records;
|
||||
std::vector<uint64_t> cpu_timestamps; // CPU-side timestamps for global ordering
|
||||
|
||||
void reset() {
|
||||
records.clear();
|
||||
cpu_timestamps.clear();
|
||||
split_id = -1;
|
||||
}
|
||||
};
|
||||
|
||||
struct ggml_backend_vk_context {
|
||||
std::string name;
|
||||
|
||||
|
|
@ -1930,8 +1946,9 @@ struct ggml_backend_vk_context {
|
|||
topk_moe_mode fused_topk_moe_mode {};
|
||||
bool fused_topk_moe_scale {};
|
||||
|
||||
// for GGML_VK_PERF_LOGGER
|
||||
std::unique_ptr<vk_perf_logger> perf_logger;
|
||||
// Profiling
|
||||
std::unique_ptr<vk_perf_logger> perf_logger; // legacy env-var profiler
|
||||
ggml_vk_profiler_state * profiler_state = nullptr;
|
||||
vk::QueryPool query_pool;
|
||||
std::vector<const char *> query_fusion_names;
|
||||
std::vector<int> query_fusion_node_count;
|
||||
|
|
@ -12859,9 +12876,13 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
|||
ctx->unsynced_nodes_read.clear();
|
||||
ggml_vk_sync_buffers(ctx, compute_ctx);
|
||||
|
||||
if (vk_perf_logger_enabled && vk_perf_logger_concurrent) {
|
||||
if ((vk_perf_logger_enabled || (ctx->profiler_state != nullptr && ctx->profiler_state->enabled))
|
||||
&& vk_perf_logger_concurrent) {
|
||||
ctx->query_node_idx[ctx->query_idx] = node_idx;
|
||||
compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
|
||||
if (ctx->profiler_state != nullptr && ctx->profiler_state->enabled) {
|
||||
ctx->profiler_state->cpu_timestamps.push_back(ggml_profiler_time_ns());
|
||||
}
|
||||
}
|
||||
}
|
||||
// Add all fused nodes to the unsynchronized lists.
|
||||
|
|
@ -13384,7 +13405,7 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
|
|||
|
||||
ctx->transfer_cmd_pool.destroy(ctx->device->device);
|
||||
}
|
||||
if (vk_perf_logger_enabled) {
|
||||
if (ctx->perf_logger) {
|
||||
ctx->perf_logger->print_timings(true);
|
||||
}
|
||||
}
|
||||
|
|
@ -14323,7 +14344,9 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
|||
ggml_vk_submit_transfer_ctx(ctx);
|
||||
|
||||
vk_context compute_ctx;
|
||||
if (vk_perf_logger_enabled) {
|
||||
bool profiling = vk_perf_logger_enabled ||
|
||||
(ctx->profiler_state != nullptr && ctx->profiler_state->enabled);
|
||||
if (profiling) {
|
||||
// allocate/resize the query pool
|
||||
if (ctx->num_queries < cgraph->n_nodes + 1) {
|
||||
if (ctx->query_pool) {
|
||||
|
|
@ -14350,6 +14373,10 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
|||
compute_ctx = ggml_vk_get_compute_ctx(ctx);
|
||||
ctx->query_idx = 0;
|
||||
compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
|
||||
if (ctx->profiler_state != nullptr && ctx->profiler_state->enabled) {
|
||||
ctx->profiler_state->cpu_timestamps.clear();
|
||||
ctx->profiler_state->cpu_timestamps.push_back(ggml_profiler_time_ns());
|
||||
}
|
||||
}
|
||||
|
||||
ctx->prealloc_y_last_pipeline_used = nullptr;
|
||||
|
|
@ -14579,13 +14606,16 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
|||
|
||||
bool enqueued = ggml_vk_build_graph(ctx, cgraph, i, cgraph->nodes[submit_node_idx], submit_node_idx, i + ctx->num_additional_fused_ops >= last_node, almost_ready, submit);
|
||||
|
||||
if (vk_perf_logger_enabled && enqueued) {
|
||||
if (profiling && enqueued) {
|
||||
compute_ctx = ggml_vk_get_compute_ctx(ctx);
|
||||
if (!vk_perf_logger_concurrent) {
|
||||
// track a single node/fusion for the current query
|
||||
ctx->query_nodes[ctx->query_idx] = cgraph->nodes[i];
|
||||
ctx->query_fusion_names[ctx->query_idx] = fusion_string;
|
||||
compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
|
||||
if (ctx->profiler_state != nullptr && ctx->profiler_state->enabled) {
|
||||
ctx->profiler_state->cpu_timestamps.push_back(ggml_profiler_time_ns());
|
||||
}
|
||||
} else {
|
||||
// track a fusion string and number of fused ops for the current node_idx
|
||||
ctx->query_fusion_names[i] = fusion_string;
|
||||
|
|
@ -14619,7 +14649,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
|||
|
||||
ctx->last_total_mul_mat_bytes = total_mul_mat_bytes;
|
||||
|
||||
if (vk_perf_logger_enabled) {
|
||||
if (profiling) {
|
||||
// End the command buffer and submit/wait
|
||||
GGML_ASSERT(!ctx->compute_ctx.expired());
|
||||
compute_ctx = ctx->compute_ctx.lock();
|
||||
|
|
@ -14633,15 +14663,44 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
|||
// Get the results and pass them to the logger
|
||||
std::vector<uint64_t> timestamps(cgraph->n_nodes + 1);
|
||||
VK_CHECK(ctx->device->device.getQueryPoolResults(ctx->query_pool, 0, ctx->query_idx, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), "get timestamp results");
|
||||
|
||||
const double ts_period = ctx->device->properties.limits.timestampPeriod;
|
||||
const bool has_profiler = ctx->profiler_state != nullptr && ctx->profiler_state->enabled;
|
||||
|
||||
if (!vk_perf_logger_concurrent) {
|
||||
// Log each op separately
|
||||
for (int i = 1; i < ctx->query_idx; i++) {
|
||||
auto node = ctx->query_nodes[i];
|
||||
auto name = ctx->query_fusion_names[i];
|
||||
ctx->perf_logger->log_timing(node, name, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod));
|
||||
uint64_t duration_ns = uint64_t((timestamps[i] - timestamps[i-1]) * ts_period);
|
||||
|
||||
if (ctx->perf_logger) {
|
||||
ctx->perf_logger->log_timing(node, name, duration_ns);
|
||||
}
|
||||
|
||||
if (has_profiler && node != nullptr) {
|
||||
static const int64_t zero_ne[4] = {0, 0, 0, 0};
|
||||
const int64_t * src0_ne = node->src[0] ? node->src[0]->ne : zero_ne;
|
||||
const int64_t * src1_ne = node->src[1] ? node->src[1]->ne : zero_ne;
|
||||
uint64_t cpu_ts = (i < (int)ctx->profiler_state->cpu_timestamps.size())
|
||||
? ctx->profiler_state->cpu_timestamps[i] : 0;
|
||||
|
||||
ggml_profile_record rec;
|
||||
rec.type = GGML_PROFILE_EVENT_OP;
|
||||
rec.name = ggml_op_name(node->op);
|
||||
rec.backend_id = -1;
|
||||
rec.split_id = ctx->profiler_state->split_id;
|
||||
rec.start_ns = cpu_ts;
|
||||
rec.end_ns = cpu_ts + duration_ns;
|
||||
rec.bytes = ggml_nbytes(node);
|
||||
rec.extra = name; // fusion name or NULL
|
||||
memcpy(rec.ne_src0, src0_ne, sizeof(rec.ne_src0));
|
||||
memcpy(rec.ne_src1, src1_ne, sizeof(rec.ne_src1));
|
||||
ctx->profiler_state->records.push_back(rec);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Log each group of nodes
|
||||
// Log each group of nodes (concurrent mode)
|
||||
int prev_node_idx = 0;
|
||||
for (int i = 1; i < ctx->query_idx; i++) {
|
||||
auto cur_node_idx = ctx->query_node_idx[i];
|
||||
|
|
@ -14656,10 +14715,42 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
|||
node_idx += ctx->query_fusion_node_count[node_idx];
|
||||
}
|
||||
prev_node_idx = cur_node_idx;
|
||||
ctx->perf_logger->log_timing(nodes, names, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod));
|
||||
uint64_t duration_ns = uint64_t((timestamps[i] - timestamps[i-1]) * ts_period);
|
||||
|
||||
if (ctx->perf_logger) {
|
||||
ctx->perf_logger->log_timing(nodes, names, duration_ns);
|
||||
}
|
||||
|
||||
if (has_profiler && !nodes.empty()) {
|
||||
uint64_t cpu_ts = (i < (int)ctx->profiler_state->cpu_timestamps.size())
|
||||
? ctx->profiler_state->cpu_timestamps[i] : 0;
|
||||
// In concurrent mode, distribute duration evenly across ops in group
|
||||
uint64_t per_op_ns = duration_ns / nodes.size();
|
||||
for (size_t j = 0; j < nodes.size(); j++) {
|
||||
auto * node = nodes[j];
|
||||
static const int64_t zero_ne[4] = {0, 0, 0, 0};
|
||||
const int64_t * src0_ne = node->src[0] ? node->src[0]->ne : zero_ne;
|
||||
const int64_t * src1_ne = node->src[1] ? node->src[1]->ne : zero_ne;
|
||||
|
||||
ggml_profile_record rec;
|
||||
rec.type = GGML_PROFILE_EVENT_OP;
|
||||
rec.name = ggml_op_name(node->op);
|
||||
rec.backend_id = -1;
|
||||
rec.split_id = ctx->profiler_state->split_id;
|
||||
rec.start_ns = cpu_ts + j * per_op_ns;
|
||||
rec.end_ns = cpu_ts + (j + 1) * per_op_ns;
|
||||
rec.bytes = ggml_nbytes(node);
|
||||
rec.extra = names[j];
|
||||
memcpy(rec.ne_src0, src0_ne, sizeof(rec.ne_src0));
|
||||
memcpy(rec.ne_src1, src1_ne, sizeof(rec.ne_src1));
|
||||
ctx->profiler_state->records.push_back(rec);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
ctx->perf_logger->print_timings();
|
||||
if (ctx->perf_logger) {
|
||||
ctx->perf_logger->print_timings();
|
||||
}
|
||||
}
|
||||
|
||||
if (!ctx->device->support_async) {
|
||||
|
|
@ -15002,6 +15093,58 @@ ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
|
|||
vk_backend->iface.get_tensor_async = nullptr;
|
||||
}
|
||||
|
||||
// Register profiler
|
||||
auto * prof_state = new ggml_vk_profiler_state();
|
||||
ctx->profiler_state = prof_state;
|
||||
|
||||
static auto vk_prof_enable = [](void * context, bool enable) {
|
||||
auto * vk_ctx = (ggml_backend_vk_context *)context;
|
||||
if (vk_ctx->profiler_state != nullptr) {
|
||||
vk_ctx->profiler_state->enabled = enable;
|
||||
if (!enable) {
|
||||
vk_ctx->profiler_state->reset();
|
||||
}
|
||||
}
|
||||
};
|
||||
static auto vk_prof_reset = [](void * context) {
|
||||
auto * vk_ctx = (ggml_backend_vk_context *)context;
|
||||
if (vk_ctx->profiler_state != nullptr) {
|
||||
vk_ctx->profiler_state->reset();
|
||||
}
|
||||
};
|
||||
static auto vk_prof_set_split_id = [](void * context, int split_id) {
|
||||
auto * vk_ctx = (ggml_backend_vk_context *)context;
|
||||
if (vk_ctx->profiler_state != nullptr) {
|
||||
vk_ctx->profiler_state->split_id = split_id;
|
||||
}
|
||||
};
|
||||
static auto vk_prof_get_records = [](void * context, const ggml_profile_record ** out) -> int {
|
||||
auto * vk_ctx = (ggml_backend_vk_context *)context;
|
||||
if (vk_ctx->profiler_state != nullptr) {
|
||||
*out = vk_ctx->profiler_state->records.data();
|
||||
return (int)vk_ctx->profiler_state->records.size();
|
||||
}
|
||||
*out = nullptr;
|
||||
return 0;
|
||||
};
|
||||
static auto vk_prof_free = [](void * context) {
|
||||
auto * vk_ctx = (ggml_backend_vk_context *)context;
|
||||
if (vk_ctx->profiler_state != nullptr) {
|
||||
delete vk_ctx->profiler_state;
|
||||
vk_ctx->profiler_state = nullptr;
|
||||
}
|
||||
};
|
||||
|
||||
auto * profiler = new ggml_backend_profiler {
|
||||
/* .context = */ ctx,
|
||||
/* .enable = */ vk_prof_enable,
|
||||
/* .reset = */ vk_prof_reset,
|
||||
/* .set_split_id = */ vk_prof_set_split_id,
|
||||
/* .get_records = */ vk_prof_get_records,
|
||||
/* .free_context = */ vk_prof_free,
|
||||
};
|
||||
ggml_backend_set_profiler(vk_backend, profiler);
|
||||
|
||||
return vk_backend;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -3102,6 +3102,7 @@ static ggml_backend_t ggml_backend_webgpu_backend_init(ggml_backend_dev_t dev, c
|
|||
/* .interface = */ ggml_backend_webgpu_i,
|
||||
/* .device = */ dev,
|
||||
/* .context = */ backend_ctx,
|
||||
/* .profiler = */ nullptr,
|
||||
};
|
||||
return backend;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -499,7 +499,8 @@ static ggml_backend_t ggml_backend_zdnn_device_init(ggml_backend_dev_t dev, cons
|
|||
/* .guid = */ ggml_backend_zdnn_guid(),
|
||||
/* .iface = */ ggml_backend_zdnn_i,
|
||||
/* .device = */ dev,
|
||||
/* .context = */ ctx
|
||||
/* .context = */ ctx,
|
||||
/* .profiler = */ NULL,
|
||||
};
|
||||
|
||||
return backend;
|
||||
|
|
|
|||
|
|
@ -358,43 +358,41 @@ class ProfileData:
|
|||
"args": {"name": backend_names[bid]},
|
||||
})
|
||||
|
||||
# Group records by (backend_id, split_id) and lay them out sequentially
|
||||
# since we don't have reliable global timestamps across backends.
|
||||
# Within each group, events are cumulative.
|
||||
from collections import OrderedDict
|
||||
groups: OrderedDict[tuple, list[ProfileRecord]] = OrderedDict()
|
||||
# Use real timestamps, but prevent overlaps within each track.
|
||||
# GPU kernels are launched rapidly (small start_ns gaps) but have long
|
||||
# durations, so naive real timestamps overlap. Sweep-line per track:
|
||||
# sort by start_ns, then place each event at max(start, prev_end).
|
||||
from collections import defaultdict
|
||||
tracks: dict[tuple, list[ProfileRecord]] = defaultdict(list)
|
||||
for rec in self.records:
|
||||
key = (rec.backend_id, rec.split_id)
|
||||
groups.setdefault(key, []).append(rec)
|
||||
tracks[(rec.backend_id, rec.split_id)].append(rec)
|
||||
|
||||
# Assign timestamps: each group starts after the previous one,
|
||||
# and events within a group are sequential (cumulative duration).
|
||||
global_ts = 0.0 # microseconds
|
||||
for key, recs in groups.items():
|
||||
backend_id, split_id = key
|
||||
pid = pid_map[backend_id]
|
||||
tid = f"split_{split_id}"
|
||||
for key in tracks:
|
||||
tracks[key].sort(key=lambda r: r.start_ns)
|
||||
|
||||
for key, recs in tracks.items():
|
||||
pid = pid_map[key[0]]
|
||||
tid = f"split_{key[1]}"
|
||||
cursor = 0.0
|
||||
for rec in recs:
|
||||
ts = max(rec.start_ns / 1000.0, cursor)
|
||||
dur = rec.duration_ns / 1000.0
|
||||
cat = "copy" if rec.type == COPY_EVENT else "compute"
|
||||
events.append({
|
||||
"ph": "X", # complete event
|
||||
"pid": pid,
|
||||
"tid": tid,
|
||||
"name": rec.name,
|
||||
"ts": global_ts,
|
||||
"dur": rec.duration_ns / 1000.0, # us
|
||||
"ts": ts,
|
||||
"dur": dur,
|
||||
"cat": cat,
|
||||
"args": {
|
||||
"bytes": rec.bytes,
|
||||
"duration_us": rec.duration_ns / 1000.0,
|
||||
"duration_us": dur,
|
||||
"shape": rec.shape_str,
|
||||
},
|
||||
})
|
||||
global_ts += rec.duration_ns / 1000.0
|
||||
|
||||
# Add a small gap between groups for visual separation
|
||||
global_ts += 1.0
|
||||
cursor = ts + dur
|
||||
|
||||
trace = {"traceEvents": events}
|
||||
with open(filepath, "w") as f:
|
||||
|
|
@ -944,8 +942,8 @@ Examples:
|
|||
help="Export as Chrome Trace Event format")
|
||||
parser.add_argument("--html-viewer", metavar="FILE",
|
||||
help="Export as interactive HTML timeline viewer")
|
||||
parser.add_argument("--html-max-records", type=int, default=5000,
|
||||
help="Max records per backend in HTML viewer (0=unlimited, downsample to reduce file size)")
|
||||
parser.add_argument("--html-max-records", type=int, default=0,
|
||||
help="Max records in HTML viewer (0=unlimited, set to downsample for huge traces)")
|
||||
parser.add_argument("--top-ops", type=int, default=0,
|
||||
help="Show top N operations (0 = show summary)")
|
||||
parser.add_argument("--top-kernels", type=int, default=0,
|
||||
|
|
|
|||
Loading…
Reference in New Issue