feat: cool profiler thingy
This commit is contained in:
parent
7c203670f8
commit
c08e5f73dc
|
|
@ -1073,6 +1073,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
params.completion = true;
|
||||
}
|
||||
));
|
||||
add_opt(common_arg(
|
||||
{"--profile"},
|
||||
"enable cross-backend profiling (CPU, BLAS, CUDA)",
|
||||
[](common_params & params) {
|
||||
params.profiling = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_DEBUG}));
|
||||
add_opt(common_arg(
|
||||
{"--profile-output"}, "FNAME",
|
||||
"write profiling JSON output to FNAME (default: stdout)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.profiling = true;
|
||||
params.profiling_output = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_DEBUG}));
|
||||
add_opt(common_arg(
|
||||
{"--verbose-prompt"},
|
||||
string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@
|
|||
#include "gguf.h"
|
||||
|
||||
#include "common.h"
|
||||
#include "ggml-profiler.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
#include "sampling.h"
|
||||
|
|
@ -1231,6 +1232,14 @@ common_init_result::common_init_result(common_params & params) :
|
|||
return;
|
||||
}
|
||||
|
||||
if (params.profiling) {
|
||||
ggml_backend_sched_t sched = llama_context_get_sched(lctx);
|
||||
if (sched != nullptr) {
|
||||
ggml_backend_sched_set_profiling(sched, true);
|
||||
LOG_INF("%s: profiling enabled\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
pimpl->context.reset(lctx);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@
|
|||
#pragma once
|
||||
|
||||
#include "ggml-opt.h"
|
||||
#include "ggml-profiler.h"
|
||||
#include "ggml.h"
|
||||
#include "llama-cpp.h"
|
||||
|
||||
|
|
@ -669,6 +670,10 @@ struct common_params {
|
|||
|
||||
bool spm_infill = false; // suffix/prefix/middle pattern for infill
|
||||
|
||||
// profiling
|
||||
bool profiling = false; // enable cross-backend profiling
|
||||
std::string profiling_output; // path to write profiling JSON output (empty = stdout)
|
||||
|
||||
// batched-bench params
|
||||
bool batched_bench_output_jsonl = false;
|
||||
|
||||
|
|
|
|||
|
|
@ -244,6 +244,23 @@ int main(int argc, char ** argv) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
// Export profiling data if profiling was enabled
|
||||
if (params.profiling) {
|
||||
ggml_backend_sched_t sched = llama_context_get_sched(ctx);
|
||||
if (sched != nullptr) {
|
||||
if (params.profiling_output.empty()) {
|
||||
ggml_backend_sched_print_profiling(sched);
|
||||
} else {
|
||||
int ret = ggml_backend_sched_export_profiling_json(sched, params.profiling_output.c_str());
|
||||
if (ret == 0) {
|
||||
LOG("\nProfiling data exported to: %s\n", params.profiling_output.c_str());
|
||||
} else {
|
||||
LOG_ERR("\nFailed to export profiling data to: %s\n", params.profiling_output.c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
LOG("\n");
|
||||
llama_perf_context_print(ctx);
|
||||
|
||||
|
|
|
|||
|
|
@ -22,6 +22,22 @@ extern "C" {
|
|||
|
||||
// use only reference implementations
|
||||
bool use_ref;
|
||||
|
||||
// profiler context (set by backend when profiling is enabled, NULL otherwise)
|
||||
// when non-NULL, the compute loop will record per-node timing
|
||||
void * profiling_context;
|
||||
|
||||
// callback for recording a profile record from C code (set by backend when profiling)
|
||||
// params: context, type (0=OP, 1=COPY), name, split_id, start_ns, end_ns, bytes, extra, ne[4]
|
||||
void (*profiling_record_fn)(void * context,
|
||||
int type,
|
||||
const char * name,
|
||||
int split_id,
|
||||
uint64_t start_ns,
|
||||
uint64_t end_ns,
|
||||
uint64_t bytes,
|
||||
const char * extra,
|
||||
const int64_t ne[4]);
|
||||
};
|
||||
|
||||
// numa strategies
|
||||
|
|
|
|||
|
|
@ -0,0 +1,103 @@
|
|||
#pragma once
|
||||
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//
|
||||
// Profiler
|
||||
//
|
||||
|
||||
// Profile event types
|
||||
enum ggml_profile_event_type {
|
||||
GGML_PROFILE_EVENT_OP, // single operation execution (computation kernel)
|
||||
GGML_PROFILE_EVENT_COPY, // data transfer between devices
|
||||
};
|
||||
|
||||
// A single profiling record representing a timed interval
|
||||
typedef struct ggml_profile_record {
|
||||
enum ggml_profile_event_type type;
|
||||
const char * name; // operation name (e.g., "mul_mat", "copy_H2D")
|
||||
int backend_id; // scheduler's backend index (0 = highest priority)
|
||||
int split_id; // which graph split (0..n_splits-1)
|
||||
uint64_t start_ns; // start timestamp in nanoseconds
|
||||
uint64_t end_ns; // end timestamp in nanoseconds
|
||||
uint64_t bytes; // bytes transferred (for copy) or tensor size (for ops)
|
||||
const char * extra; // fusion name for fused ops, or NULL
|
||||
int64_t ne[4]; // output tensor dimensions [ne0, ne1, ne2, ne3]
|
||||
} ggml_profile_record;
|
||||
|
||||
// Backend profiler interface - each backend optionally implements this
|
||||
// to provide fine-grained operation timing
|
||||
struct ggml_backend_profiler {
|
||||
void * context; // backend-specific profiler context
|
||||
|
||||
// Enable or disable profiling on this backend
|
||||
void (*enable)(void * context, bool enable);
|
||||
|
||||
// Clear all recorded data
|
||||
void (*reset)(void * context);
|
||||
|
||||
// Set the current split ID (called by scheduler before graph_compute)
|
||||
void (*set_split_id)(void * context, int split_id);
|
||||
|
||||
// Get recorded profiling data
|
||||
// Returns the number of records; sets *out to point to internal storage
|
||||
// The returned pointer remains valid until the next reset or disable call
|
||||
int (*get_records)(void * context, const ggml_profile_record ** out);
|
||||
|
||||
// Free the profiler context
|
||||
void (*free_context)(void * context);
|
||||
};
|
||||
|
||||
typedef struct ggml_backend_profiler * ggml_backend_profiler_t;
|
||||
|
||||
// Register a profiler on a backend (called by backend during init)
|
||||
// The profiler is owned by the backend and will be freed when the backend is freed
|
||||
GGML_API void ggml_backend_set_profiler(ggml_backend_t backend, ggml_backend_profiler_t profiler);
|
||||
|
||||
// Get the profiler associated with a backend (returns NULL if none)
|
||||
GGML_API ggml_backend_profiler_t ggml_backend_get_profiler(ggml_backend_t backend);
|
||||
|
||||
//
|
||||
// Scheduler profiling API
|
||||
//
|
||||
|
||||
// Enable or disable profiling on a scheduler
|
||||
// When enabled, the scheduler will:
|
||||
// - Time data copy operations between backends
|
||||
// - Enable profiling on all backends that support it
|
||||
// - Collect profiling records from all backends after each graph compute
|
||||
GGML_API void ggml_backend_sched_set_profiling(ggml_backend_sched_t sched, bool enable);
|
||||
|
||||
// Check if profiling is enabled on a scheduler
|
||||
GGML_API bool ggml_backend_sched_get_profiling(ggml_backend_sched_t sched);
|
||||
|
||||
// Get profiling data from the last graph compute
|
||||
// Records are owned by the scheduler; valid until the next compute or reset
|
||||
// Returns the number of records
|
||||
GGML_API int ggml_backend_sched_get_profiling_records(ggml_backend_sched_t sched, const ggml_profile_record ** records);
|
||||
|
||||
// Print a human-readable summary of the last profiling run to stdout
|
||||
// Groups records by operation name and shows total/count/min/max/avg time
|
||||
GGML_API void ggml_backend_sched_print_profiling(ggml_backend_sched_t sched);
|
||||
|
||||
// Reset profiling data (clear all recorded data)
|
||||
GGML_API void ggml_backend_sched_reset_profiling(ggml_backend_sched_t sched);
|
||||
|
||||
// Get current time in nanoseconds (for manual profiling if needed)
|
||||
GGML_API uint64_t ggml_profiler_time_ns(void);
|
||||
|
||||
// Export profiling data as JSON to a file
|
||||
// Returns 0 on success, -1 on error
|
||||
GGML_API int ggml_backend_sched_export_profiling_json(ggml_backend_sched_t sched, const char * filepath);
|
||||
|
||||
// Export profiling data as JSON to a FILE pointer
|
||||
GGML_API int ggml_backend_sched_write_profiling_json(ggml_backend_sched_t sched, FILE * fp);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
@ -195,12 +195,14 @@ add_library(ggml-base
|
|||
../include/ggml-backend.h
|
||||
../include/ggml-cpp.h
|
||||
../include/ggml-opt.h
|
||||
../include/ggml-profiler.h
|
||||
../include/gguf.h
|
||||
ggml.c
|
||||
ggml.cpp
|
||||
ggml-alloc.c
|
||||
ggml-backend.cpp
|
||||
ggml-opt.cpp
|
||||
ggml-profiler.cpp
|
||||
ggml-threading.cpp
|
||||
ggml-threading.h
|
||||
ggml-quants.c
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@
|
|||
// ggml-backend internal header
|
||||
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-profiler.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
|
|
@ -124,6 +125,9 @@ extern "C" {
|
|||
struct ggml_backend_i iface;
|
||||
ggml_backend_dev_t device;
|
||||
void * context;
|
||||
|
||||
// Optional profiler (set by backend during init, NULL if not profiling)
|
||||
ggml_backend_profiler_t profiler;
|
||||
};
|
||||
|
||||
struct ggml_backend_event {
|
||||
|
|
|
|||
|
|
@ -12,6 +12,7 @@
|
|||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-alloc.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-profiler.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <limits.h>
|
||||
|
|
@ -231,6 +232,15 @@ void ggml_backend_free(ggml_backend_t backend) {
|
|||
return;
|
||||
}
|
||||
|
||||
// Clean up profiler if present (before backend frees its context)
|
||||
if (backend->profiler != NULL) {
|
||||
if (backend->profiler->free_context != NULL) {
|
||||
backend->profiler->free_context(backend->profiler->context);
|
||||
}
|
||||
delete backend->profiler;
|
||||
backend->profiler = NULL;
|
||||
}
|
||||
|
||||
backend->iface.free(backend);
|
||||
}
|
||||
|
||||
|
|
@ -736,6 +746,11 @@ struct ggml_backend_sched {
|
|||
int debug_realloc;
|
||||
int debug_graph_size;
|
||||
int debug_prev_graph_size;
|
||||
|
||||
// profiling
|
||||
bool profiling_enabled;
|
||||
std::vector<ggml_profile_record> copy_records; // copy events recorded by the scheduler
|
||||
std::vector<ggml_profile_record> profiling_records; // merged records from all sources
|
||||
};
|
||||
|
||||
#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
|
||||
|
|
@ -1450,11 +1465,28 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
|||
std::vector<int32_t> ids;
|
||||
std::vector<ggml_bitset_t> used_ids;
|
||||
|
||||
// Profiling: reset copy records for this compute pass
|
||||
if (sched->profiling_enabled) {
|
||||
sched->copy_records.clear();
|
||||
}
|
||||
|
||||
for (int split_id = 0; split_id < sched->n_splits; split_id++) {
|
||||
struct ggml_backend_sched_split * split = &splits[split_id];
|
||||
int split_backend_id = split->backend_id;
|
||||
ggml_backend_t split_backend = sched->backends[split_backend_id];
|
||||
|
||||
// Profiling: set split ID and enable backend profiling
|
||||
if (sched->profiling_enabled) {
|
||||
if (split_backend->profiler != NULL) {
|
||||
if (split_backend->profiler->enable != NULL) {
|
||||
split_backend->profiler->enable(split_backend->profiler->context, true);
|
||||
}
|
||||
if (split_backend->profiler->set_split_id != NULL) {
|
||||
split_backend->profiler->set_split_id(split_backend->profiler->context, split_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// copy the input tensors to the split backend
|
||||
for (int input_id = 0; input_id < split->n_inputs; input_id++) {
|
||||
ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[input_id]);
|
||||
|
|
@ -1468,7 +1500,25 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
|||
} else {
|
||||
ggml_backend_synchronize(split_backend);
|
||||
}
|
||||
ggml_backend_tensor_copy(input, input_cpy);
|
||||
if (sched->profiling_enabled) {
|
||||
uint64_t copy_start = ggml_profiler_time_ns();
|
||||
ggml_backend_tensor_copy(input, input_cpy);
|
||||
uint64_t copy_end = ggml_profiler_time_ns();
|
||||
|
||||
enum ggml_backend_dev_type src_type = ggml_backend_dev_type(input_backend->device);
|
||||
enum ggml_backend_dev_type dst_type = ggml_backend_dev_type(split_backend->device);
|
||||
const char * copy_dir = "copy_D2D";
|
||||
if (src_type == GGML_BACKEND_DEVICE_TYPE_CPU && dst_type != GGML_BACKEND_DEVICE_TYPE_CPU) {
|
||||
copy_dir = "copy_H2D";
|
||||
} else if (src_type != GGML_BACKEND_DEVICE_TYPE_CPU && dst_type == GGML_BACKEND_DEVICE_TYPE_CPU) {
|
||||
copy_dir = "copy_D2H";
|
||||
}
|
||||
|
||||
sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id, split_id,
|
||||
copy_start, copy_end, ggml_nbytes(input), NULL, {0} });
|
||||
} else {
|
||||
ggml_backend_tensor_copy(input, input_cpy);
|
||||
}
|
||||
} else {
|
||||
// wait for the split backend to finish using the input before overwriting it
|
||||
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
||||
|
|
@ -1572,7 +1622,46 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
|||
} else {
|
||||
ggml_backend_synchronize(split_backend);
|
||||
}
|
||||
ggml_backend_tensor_copy(input, input_cpy);
|
||||
if (sched->profiling_enabled) {
|
||||
uint64_t copy_start = ggml_profiler_time_ns();
|
||||
ggml_backend_tensor_copy(input, input_cpy);
|
||||
uint64_t copy_end = ggml_profiler_time_ns();
|
||||
|
||||
enum ggml_backend_dev_type src_type = ggml_backend_dev_type(input_backend->device);
|
||||
enum ggml_backend_dev_type dst_type = ggml_backend_dev_type(split_backend->device);
|
||||
const char * copy_dir = "copy_D2D";
|
||||
if (src_type == GGML_BACKEND_DEVICE_TYPE_CPU && dst_type != GGML_BACKEND_DEVICE_TYPE_CPU) {
|
||||
copy_dir = "copy_H2D";
|
||||
} else if (src_type != GGML_BACKEND_DEVICE_TYPE_CPU &&
|
||||
dst_type == GGML_BACKEND_DEVICE_TYPE_CPU) {
|
||||
copy_dir = "copy_D2H";
|
||||
}
|
||||
|
||||
sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id,
|
||||
split_id, copy_start, copy_end, ggml_nbytes(input), NULL, {0} });
|
||||
} else {
|
||||
ggml_backend_tensor_copy(input, input_cpy);
|
||||
}
|
||||
} else {
|
||||
// async copy completed - record it with available timing
|
||||
if (sched->profiling_enabled) {
|
||||
uint64_t copy_start = ggml_profiler_time_ns();
|
||||
// The async copy was already initiated; we just record the launch time
|
||||
uint64_t copy_end = ggml_profiler_time_ns();
|
||||
|
||||
enum ggml_backend_dev_type src_type = ggml_backend_dev_type(input_backend->device);
|
||||
enum ggml_backend_dev_type dst_type = ggml_backend_dev_type(split_backend->device);
|
||||
const char * copy_dir = "copy_D2D";
|
||||
if (src_type == GGML_BACKEND_DEVICE_TYPE_CPU && dst_type != GGML_BACKEND_DEVICE_TYPE_CPU) {
|
||||
copy_dir = "copy_H2D";
|
||||
} else if (src_type != GGML_BACKEND_DEVICE_TYPE_CPU &&
|
||||
dst_type == GGML_BACKEND_DEVICE_TYPE_CPU) {
|
||||
copy_dir = "copy_D2H";
|
||||
}
|
||||
|
||||
sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id,
|
||||
split_id, copy_start, copy_end, ggml_nbytes(input), NULL, {0} });
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1625,6 +1714,34 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
|||
}
|
||||
}
|
||||
|
||||
// Profiling: collect records from all backends and append to accumulated records
|
||||
if (sched->profiling_enabled) {
|
||||
sched->copy_records.clear();
|
||||
|
||||
// Collect backend operation records
|
||||
for (int b = 0; b < sched->n_backends; b++) {
|
||||
ggml_backend_t backend = sched->backends[b];
|
||||
if (backend->profiler != NULL && backend->profiler->get_records != NULL) {
|
||||
const ggml_profile_record * backend_recs = NULL;
|
||||
int count = backend->profiler->get_records(backend->profiler->context, &backend_recs);
|
||||
for (int r = 0; r < count; r++) {
|
||||
ggml_profile_record rec = backend_recs[r];
|
||||
rec.backend_id = b; // stamp correct scheduler backend index
|
||||
sched->profiling_records.push_back(rec);
|
||||
}
|
||||
// Reset backend records (but keep profiling enabled for next compute)
|
||||
if (backend->profiler->reset != NULL) {
|
||||
backend->profiler->reset(backend->profiler->context);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Append copy records
|
||||
for (const auto & rec : sched->copy_records) {
|
||||
sched->profiling_records.push_back(rec);
|
||||
}
|
||||
}
|
||||
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
|
|
@ -1691,6 +1808,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
|||
|
||||
sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
|
||||
sched->op_offload = op_offload;
|
||||
sched->profiling_enabled = (getenv("GGML_PROFILE") != NULL);
|
||||
|
||||
ggml_backend_sched_reset(sched);
|
||||
|
||||
|
|
@ -2268,3 +2386,216 @@ ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size)
|
|||
GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
|
||||
return ggml_backend_buffer_init(ggml_backend_cpu_buffer_from_ptr_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
|
||||
}
|
||||
|
||||
//
|
||||
// Scheduler profiling
|
||||
//
|
||||
|
||||
void ggml_backend_sched_set_profiling(ggml_backend_sched_t sched, bool enable) {
|
||||
GGML_ASSERT(sched);
|
||||
sched->profiling_enabled = enable;
|
||||
|
||||
if (!enable) {
|
||||
ggml_backend_sched_reset_profiling(sched);
|
||||
}
|
||||
}
|
||||
|
||||
bool ggml_backend_sched_get_profiling(ggml_backend_sched_t sched) {
|
||||
GGML_ASSERT(sched);
|
||||
return sched->profiling_enabled;
|
||||
}
|
||||
|
||||
int ggml_backend_sched_get_profiling_records(ggml_backend_sched_t sched, const ggml_profile_record ** records) {
|
||||
GGML_ASSERT(sched);
|
||||
GGML_ASSERT(records != NULL);
|
||||
|
||||
*records = sched->profiling_records.data();
|
||||
return (int) sched->profiling_records.size();
|
||||
}
|
||||
|
||||
void ggml_backend_sched_reset_profiling(ggml_backend_sched_t sched) {
|
||||
GGML_ASSERT(sched);
|
||||
sched->profiling_records.clear();
|
||||
sched->copy_records.clear();
|
||||
}
|
||||
|
||||
void ggml_backend_sched_print_profiling(ggml_backend_sched_t sched) {
|
||||
GGML_ASSERT(sched);
|
||||
|
||||
if (sched->profiling_records.empty()) {
|
||||
GGML_LOG_INFO("[profiler] No profiling data available\n");
|
||||
return;
|
||||
}
|
||||
|
||||
GGML_LOG_INFO("\n=== Profiling Summary ===\n");
|
||||
|
||||
// Aggregate by (name, type, backend_id)
|
||||
struct op_stats {
|
||||
const char * name;
|
||||
enum ggml_profile_event_type type;
|
||||
int backend_id;
|
||||
uint64_t total_ns;
|
||||
uint64_t min_ns;
|
||||
uint64_t max_ns;
|
||||
int count;
|
||||
uint64_t total_bytes;
|
||||
int64_t representative_ne[4];
|
||||
};
|
||||
|
||||
std::vector<op_stats> stats;
|
||||
for (const auto & rec : sched->profiling_records) {
|
||||
bool found = false;
|
||||
for (auto & s : stats) {
|
||||
if (s.type == rec.type && s.backend_id == rec.backend_id && strcmp(s.name, rec.name) == 0) {
|
||||
uint64_t dur = (rec.end_ns > rec.start_ns) ? (rec.end_ns - rec.start_ns) : 0;
|
||||
s.total_ns += dur;
|
||||
s.min_ns = std::min(s.min_ns, dur);
|
||||
s.max_ns = std::max(s.max_ns, dur);
|
||||
s.count++;
|
||||
s.total_bytes += rec.bytes;
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found) {
|
||||
uint64_t dur = (rec.end_ns > rec.start_ns) ? (rec.end_ns - rec.start_ns) : 0;
|
||||
op_stats s;
|
||||
s.name = rec.name;
|
||||
s.type = rec.type;
|
||||
s.backend_id = rec.backend_id;
|
||||
s.total_ns = dur;
|
||||
s.min_ns = dur;
|
||||
s.max_ns = dur;
|
||||
s.count = 1;
|
||||
s.total_bytes = rec.bytes;
|
||||
memcpy(s.representative_ne, rec.ne, sizeof(s.representative_ne));
|
||||
stats.push_back(s);
|
||||
}
|
||||
}
|
||||
|
||||
// Sort by total time descending
|
||||
std::sort(stats.begin(), stats.end(),
|
||||
[](const op_stats & a, const op_stats & b) { return a.total_ns > b.total_ns; });
|
||||
|
||||
uint64_t grand_total = 0;
|
||||
for (const auto & s : stats) {
|
||||
grand_total += s.total_ns;
|
||||
}
|
||||
|
||||
const char * type_str[] = { "OP ", "COPY" };
|
||||
for (const auto & s : stats) {
|
||||
double pct = 100.0 * (double) s.total_ns / (double) grand_total;
|
||||
double avg_us = (double) s.total_ns / (double) s.count / 1000.0;
|
||||
double min_us = (double) s.min_ns / 1000.0;
|
||||
double max_us = (double) s.max_ns / 1000.0;
|
||||
|
||||
if (s.type == GGML_PROFILE_EVENT_COPY) {
|
||||
double bw_gbps = (double) s.total_bytes / (double) s.total_ns;
|
||||
GGML_LOG_INFO(
|
||||
" [%s] backend %d %-28s %7.1f%% count=%-6d total=%8.2f ms avg=%8.2f us min=%8.2f us max=%8.2f us "
|
||||
" %8.2f GB/s",
|
||||
type_str[s.type], s.backend_id, s.name, pct, s.count, (double) s.total_ns / 1e6, avg_us, min_us, max_us,
|
||||
bw_gbps);
|
||||
} else {
|
||||
GGML_LOG_INFO(
|
||||
" [%s] backend %d %-28s %7.1f%% count=%-6d total=%8.2f ms avg=%8.2f us min=%8.2f us max=%8.2f us",
|
||||
type_str[s.type], s.backend_id, s.name, pct, s.count, (double) s.total_ns / 1e6, avg_us, min_us,
|
||||
max_us);
|
||||
}
|
||||
// Print representative tensor shape (first record's ne)
|
||||
if (s.representative_ne[0] > 0 || s.representative_ne[1] > 0) {
|
||||
GGML_LOG_INFO(" [%lld x %lld", (long long) s.representative_ne[0], (long long) s.representative_ne[1]);
|
||||
if (s.representative_ne[2] > 1) {
|
||||
GGML_LOG_INFO(" x %lld", (long long) s.representative_ne[2]);
|
||||
}
|
||||
if (s.representative_ne[3] > 1) {
|
||||
GGML_LOG_INFO(" x %lld", (long long) s.representative_ne[3]);
|
||||
}
|
||||
GGML_LOG_INFO("]");
|
||||
}
|
||||
GGML_LOG_INFO("\n");
|
||||
}
|
||||
|
||||
GGML_LOG_INFO(" ---\n");
|
||||
GGML_LOG_INFO(" Total: %.2f ms (%d records, %d unique ops)\n\n", (double) grand_total / 1e6,
|
||||
(int) sched->profiling_records.size(), (int) stats.size());
|
||||
}
|
||||
|
||||
int ggml_backend_sched_write_profiling_json(ggml_backend_sched_t sched, FILE * fp) {
|
||||
GGML_ASSERT(sched);
|
||||
GGML_ASSERT(fp != NULL);
|
||||
|
||||
uint64_t total_ns = 0;
|
||||
for (const auto & rec : sched->profiling_records) {
|
||||
total_ns += (rec.end_ns > rec.start_ns) ? (rec.end_ns - rec.start_ns) : 0;
|
||||
}
|
||||
|
||||
fprintf(fp, "{\n");
|
||||
fprintf(fp, " \"version\": 2,\n");
|
||||
fprintf(fp, " \"profiler\": \"ggml\",\n");
|
||||
fprintf(fp, " \"total_records\": %d,\n", (int) sched->profiling_records.size());
|
||||
fprintf(fp, " \"total_ns\": %llu,\n", (unsigned long long) total_ns);
|
||||
|
||||
// Backend metadata
|
||||
fprintf(fp, " \"backends\": [\n");
|
||||
for (int b = 0; b < sched->n_backends; b++) {
|
||||
ggml_backend_t backend = sched->backends[b];
|
||||
const char * name = ggml_backend_name(backend);
|
||||
const char * dev_name = "unknown";
|
||||
int dev_type = 0;
|
||||
if (backend->device != NULL) {
|
||||
dev_name = ggml_backend_dev_name(backend->device);
|
||||
dev_type = (int) ggml_backend_dev_type(backend->device);
|
||||
}
|
||||
fprintf(fp, " {\"id\": %d, \"name\": \"%s\", \"device\": \"%s\", \"device_type\": %d}%s\n", b, name,
|
||||
dev_name, dev_type, (b < sched->n_backends - 1) ? "," : "");
|
||||
}
|
||||
fprintf(fp, " ],\n");
|
||||
|
||||
// Records
|
||||
fprintf(fp, " \"records\": [\n");
|
||||
|
||||
for (int i = 0; i < (int) sched->profiling_records.size(); i++) {
|
||||
const auto & rec = sched->profiling_records[i];
|
||||
uint64_t duration_ns = (rec.end_ns > rec.start_ns) ? (rec.end_ns - rec.start_ns) : 0;
|
||||
|
||||
fprintf(fp,
|
||||
" {\"type\": %d, \"name\": \"%s\", \"backend_id\": %d, \"split_id\": %d, "
|
||||
"\"start_ns\": %llu, \"duration_ns\": %llu, \"bytes\": %llu, \"extra\": ",
|
||||
(int) rec.type, rec.name ? rec.name : "unknown", rec.backend_id, rec.split_id,
|
||||
(unsigned long long) rec.start_ns, (unsigned long long) duration_ns, (unsigned long long) rec.bytes);
|
||||
|
||||
if (rec.extra != NULL) {
|
||||
fprintf(fp, "\"%s\"", rec.extra);
|
||||
} else {
|
||||
fprintf(fp, "null");
|
||||
}
|
||||
|
||||
// Tensor dimensions
|
||||
fprintf(fp, ", \"ne\": [%lld, %lld, %lld, %lld]", (long long) rec.ne[0], (long long) rec.ne[1],
|
||||
(long long) rec.ne[2], (long long) rec.ne[3]);
|
||||
|
||||
fprintf(fp, "}%s\n", (i < (int) sched->profiling_records.size() - 1) ? "," : "");
|
||||
}
|
||||
|
||||
fprintf(fp, " ]\n");
|
||||
fprintf(fp, "}\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int ggml_backend_sched_export_profiling_json(ggml_backend_sched_t sched, const char * filepath) {
|
||||
GGML_ASSERT(sched);
|
||||
GGML_ASSERT(filepath != NULL);
|
||||
|
||||
FILE * fp = fopen(filepath, "w");
|
||||
if (fp == NULL) {
|
||||
GGML_LOG_ERROR("%s: failed to open %s for writing\n", __func__, filepath);
|
||||
return -1;
|
||||
}
|
||||
|
||||
int ret = ggml_backend_sched_write_profiling_json(sched, fp);
|
||||
fclose(fp);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
#include "ggml-impl.h"
|
||||
#include "ggml-blas.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-profiler.h"
|
||||
|
||||
#include <future>
|
||||
#include <vector>
|
||||
|
|
@ -25,6 +26,11 @@ struct ggml_backend_blas_context {
|
|||
#ifndef GGML_USE_OPENMP
|
||||
std::vector<std::future<void>> tasks;
|
||||
#endif
|
||||
|
||||
// Profiling state
|
||||
bool profiling_enabled = false;
|
||||
int profiling_split_id = -1;
|
||||
std::vector<ggml_profile_record> profiling_records;
|
||||
};
|
||||
|
||||
static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) {
|
||||
|
|
@ -232,6 +238,18 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend,
|
|||
continue;
|
||||
}
|
||||
|
||||
// Skip view/identity ops
|
||||
if (node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_VIEW ||
|
||||
node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Profiling: time this operation
|
||||
uint64_t t_start = 0;
|
||||
if (ctx->profiling_enabled) {
|
||||
t_start = ggml_profiler_time_ns();
|
||||
}
|
||||
|
||||
switch (node->op) {
|
||||
case GGML_OP_MUL_MAT:
|
||||
ggml_backend_blas_mul_mat(ctx, node);
|
||||
|
|
@ -241,16 +259,24 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend,
|
|||
ggml_backend_blas_out_prod(ctx, node);
|
||||
break;
|
||||
|
||||
case GGML_OP_NONE:
|
||||
case GGML_OP_RESHAPE:
|
||||
case GGML_OP_VIEW:
|
||||
case GGML_OP_PERMUTE:
|
||||
case GGML_OP_TRANSPOSE:
|
||||
break;
|
||||
|
||||
default:
|
||||
GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node));
|
||||
}
|
||||
|
||||
if (ctx->profiling_enabled) {
|
||||
uint64_t t_end = ggml_profiler_time_ns();
|
||||
ggml_profile_record rec;
|
||||
rec.type = GGML_PROFILE_EVENT_OP;
|
||||
rec.name = ggml_op_name(node->op);
|
||||
rec.backend_id = 0;
|
||||
rec.split_id = ctx->profiling_split_id;
|
||||
rec.start_ns = t_start;
|
||||
rec.end_ns = t_end;
|
||||
rec.bytes = ggml_nbytes(node);
|
||||
rec.extra = NULL;
|
||||
memcpy(rec.ne, node->ne, sizeof(rec.ne));
|
||||
ctx->profiling_records.push_back(rec);
|
||||
}
|
||||
}
|
||||
|
||||
return GGML_STATUS_SUCCESS;
|
||||
|
|
@ -284,10 +310,11 @@ ggml_backend_t ggml_backend_blas_init(void) {
|
|||
ggml_backend_blas_context * ctx = new ggml_backend_blas_context;
|
||||
|
||||
ggml_backend_t backend = new ggml_backend {
|
||||
/* .guid = */ ggml_backend_blas_guid(),
|
||||
/* .iface = */ blas_backend_i,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0),
|
||||
/* .context = */ ctx,
|
||||
/* .guid = */ ggml_backend_blas_guid(),
|
||||
/* .iface = */ blas_backend_i,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0),
|
||||
/* .context = */ ctx,
|
||||
/* .profiler = */ nullptr,
|
||||
};
|
||||
|
||||
#if defined(GGML_BLAS_USE_OPENBLAS) && defined(GGML_USE_OPENMP)
|
||||
|
|
@ -300,6 +327,44 @@ ggml_backend_t ggml_backend_blas_init(void) {
|
|||
GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__);
|
||||
#endif
|
||||
|
||||
// Register profiler
|
||||
ggml_backend_blas_context * blas_ctx = ctx; // ctx is already defined above
|
||||
|
||||
static auto blas_prof_enable = [](void * ctx, bool enable) {
|
||||
auto * bctx = (ggml_backend_blas_context *) ctx;
|
||||
bctx->profiling_enabled = enable;
|
||||
if (!enable) {
|
||||
bctx->profiling_records.clear();
|
||||
}
|
||||
};
|
||||
static auto blas_prof_reset = [](void * ctx) {
|
||||
auto * bctx = (ggml_backend_blas_context *) ctx;
|
||||
bctx->profiling_records.clear();
|
||||
bctx->profiling_split_id = -1;
|
||||
};
|
||||
static auto blas_prof_set_split_id = [](void * ctx, int split_id) {
|
||||
auto * bctx = (ggml_backend_blas_context *) ctx;
|
||||
bctx->profiling_split_id = split_id;
|
||||
};
|
||||
static auto blas_prof_get_records = [](void * ctx, const ggml_profile_record ** out) -> int {
|
||||
auto * bctx = (ggml_backend_blas_context *) ctx;
|
||||
*out = bctx->profiling_records.data();
|
||||
return (int) bctx->profiling_records.size();
|
||||
};
|
||||
static auto blas_prof_free = [](void * ctx) {
|
||||
(void) ctx;
|
||||
};
|
||||
|
||||
auto * profiler = new ggml_backend_profiler{
|
||||
/* .context = */ blas_ctx,
|
||||
/* .enable = */ blas_prof_enable,
|
||||
/* .reset = */ blas_prof_reset,
|
||||
/* .set_split_id = */ blas_prof_set_split_id,
|
||||
/* .get_records = */ blas_prof_get_records,
|
||||
/* .free_context = */ blas_prof_free,
|
||||
};
|
||||
ggml_backend_set_profiler(backend, profiler);
|
||||
|
||||
return backend;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@
|
|||
#include "traits.h"
|
||||
#include "ggml-cpu-impl.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-profiler.h"
|
||||
#include "quants.h"
|
||||
#include "ggml-threading.h"
|
||||
#include "unary-ops.h"
|
||||
|
|
@ -1159,8 +1160,8 @@ static void ggml_compute_forward_mul_mat_one_chunk(
|
|||
|
||||
const bool src1_cont = ggml_is_contiguous(src1);
|
||||
|
||||
ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
|
||||
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
|
||||
const ggml_vec_dot_t vec_dot = type_traits_cpu[type].vec_dot;
|
||||
const enum ggml_type vec_dot_type = type_traits_cpu[type].vec_dot_type;
|
||||
|
||||
// broadcast factors
|
||||
const int64_t r2 = ne12 / ne02;
|
||||
|
|
@ -1244,9 +1245,9 @@ void ggml_compute_forward_mul_mat(
|
|||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
enum ggml_type const vec_dot_type = type_traits_cpu[src0->type].vec_dot_type;
|
||||
ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float;
|
||||
int64_t const vec_dot_num_rows = type_traits_cpu[src0->type].nrows;
|
||||
const enum ggml_type vec_dot_type = type_traits_cpu[src0->type].vec_dot_type;
|
||||
const ggml_from_float_t from_float = type_traits_cpu[vec_dot_type].from_float;
|
||||
const int64_t vec_dot_num_rows = type_traits_cpu[src0->type].nrows;
|
||||
|
||||
GGML_ASSERT(ne0 == ne01);
|
||||
GGML_ASSERT(ne1 == ne11);
|
||||
|
|
@ -1455,8 +1456,8 @@ static void ggml_compute_forward_mul_mat_id_one_chunk(
|
|||
|
||||
const enum ggml_type type = src0->type;
|
||||
|
||||
ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
|
||||
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
|
||||
const ggml_vec_dot_t vec_dot = type_traits_cpu[type].vec_dot;
|
||||
const enum ggml_type vec_dot_type = type_traits_cpu[type].vec_dot_type;
|
||||
|
||||
const int64_t blck_0 = 16;
|
||||
const int64_t blck_1 = 16;
|
||||
|
|
@ -1523,8 +1524,8 @@ static void ggml_compute_forward_mul_mat_id(
|
|||
|
||||
const bool src1_cont = ggml_is_contiguous(src1);
|
||||
|
||||
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
|
||||
ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float;
|
||||
const enum ggml_type vec_dot_type = type_traits_cpu[type].vec_dot_type;
|
||||
const ggml_from_float_t from_float = type_traits_cpu[vec_dot_type].from_float;
|
||||
|
||||
// we don't support permuted src0 or src1
|
||||
GGML_ASSERT(nb00 == ggml_type_size(type));
|
||||
|
|
@ -2973,28 +2974,67 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|||
GGML_PRINT_DEBUG("thread #%d compute-start cplan %p last-graph %d\n", state->ith, (const void *)cplan, state->last_graph);
|
||||
#endif
|
||||
|
||||
for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
|
||||
struct ggml_tensor * node = cgraph->nodes[node_n];
|
||||
// Profiling state
|
||||
if (cplan->profiling_context != NULL && cplan->profiling_record_fn != NULL) {
|
||||
for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
|
||||
struct ggml_tensor * node = cgraph->nodes[node_n];
|
||||
|
||||
if (ggml_op_is_empty(node->op)) {
|
||||
// skip NOPs
|
||||
continue;
|
||||
if (ggml_op_is_empty(node->op)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Only thread 0 records timing (after barrier = total node time)
|
||||
uint64_t t_start = 0;
|
||||
if (state->ith == 0) {
|
||||
t_start = ggml_profiler_time_ns();
|
||||
}
|
||||
|
||||
ggml_compute_forward(¶ms, node);
|
||||
|
||||
if (node_n + 1 < cgraph->n_nodes) {
|
||||
ggml_barrier(state->threadpool);
|
||||
}
|
||||
|
||||
if (state->ith == 0) {
|
||||
uint64_t t_end = ggml_profiler_time_ns();
|
||||
cplan->profiling_record_fn(cplan->profiling_context, 0 /* GGML_PROFILE_EVENT_OP */,
|
||||
ggml_op_name(node->op), -1, t_start, t_end, ggml_nbytes(node), NULL,
|
||||
node->ne);
|
||||
}
|
||||
|
||||
if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
|
||||
atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed);
|
||||
tp->ec = GGML_STATUS_ABORTED;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
|
||||
struct ggml_tensor * node = cgraph->nodes[node_n];
|
||||
|
||||
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
|
||||
continue;
|
||||
}
|
||||
if (ggml_op_is_empty(node->op)) {
|
||||
// skip NOPs
|
||||
continue;
|
||||
}
|
||||
|
||||
ggml_compute_forward(¶ms, node);
|
||||
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (state->ith == 0 && cplan->abort_callback &&
|
||||
cplan->abort_callback(cplan->abort_callback_data)) {
|
||||
atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed);
|
||||
tp->ec = GGML_STATUS_ABORTED;
|
||||
}
|
||||
ggml_compute_forward(¶ms, node);
|
||||
|
||||
if (node_n + 1 < cgraph->n_nodes) {
|
||||
ggml_barrier(state->threadpool);
|
||||
if (state->ith == 0 && cplan->abort_callback &&
|
||||
cplan->abort_callback(cplan->abort_callback_data)) {
|
||||
atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed);
|
||||
tp->ec = GGML_STATUS_ABORTED;
|
||||
}
|
||||
|
||||
if (node_n + 1 < cgraph->n_nodes) {
|
||||
ggml_barrier(state->threadpool);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
#include "ggml-backend.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-cpu.h"
|
||||
#include "ggml-profiler.h"
|
||||
#include "repack.h"
|
||||
#include "traits.h"
|
||||
#include "ggml-impl.h"
|
||||
|
|
@ -107,6 +108,11 @@ struct ggml_backend_cpu_context {
|
|||
void * abort_callback_data;
|
||||
|
||||
bool use_ref; // use reference implementation
|
||||
|
||||
// Profiling state
|
||||
bool profiling_enabled;
|
||||
int profiling_split_id;
|
||||
std::vector<ggml_profile_record> profiling_records;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) {
|
||||
|
|
@ -167,6 +173,34 @@ static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backe
|
|||
GGML_UNUSED(backend);
|
||||
}
|
||||
|
||||
// Callback function for recording CPU profiling events from C code (ggml-cpu.c)
|
||||
static void ggml_cpu_profiler_record_callback(void * context,
|
||||
int type,
|
||||
const char * name,
|
||||
int split_id,
|
||||
uint64_t start_ns,
|
||||
uint64_t end_ns,
|
||||
uint64_t bytes,
|
||||
const char * extra,
|
||||
const int64_t ne[4]) {
|
||||
auto * cpu_ctx = (ggml_backend_cpu_context *) context;
|
||||
ggml_profile_record rec;
|
||||
rec.type = (enum ggml_profile_event_type) type;
|
||||
rec.name = name;
|
||||
rec.backend_id = 0; // will be overwritten by scheduler
|
||||
rec.split_id = split_id != -1 ? split_id : cpu_ctx->profiling_split_id;
|
||||
rec.start_ns = start_ns;
|
||||
rec.end_ns = end_ns;
|
||||
rec.bytes = bytes;
|
||||
rec.extra = extra;
|
||||
if (ne) {
|
||||
memcpy(rec.ne, ne, sizeof(rec.ne));
|
||||
} else {
|
||||
memset(rec.ne, 0, sizeof(rec.ne));
|
||||
}
|
||||
cpu_ctx->profiling_records.push_back(rec);
|
||||
}
|
||||
|
||||
static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
||||
|
||||
|
|
@ -187,6 +221,9 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s
|
|||
cplan.abort_callback_data = cpu_ctx->abort_callback_data;
|
||||
cplan.use_ref = cpu_ctx->use_ref;
|
||||
|
||||
cplan.profiling_context = cpu_ctx->profiling_enabled ? cpu_ctx : NULL;
|
||||
cplan.profiling_record_fn = cpu_ctx->profiling_enabled ? ggml_cpu_profiler_record_callback : NULL;
|
||||
|
||||
return ggml_graph_compute(cgraph, &cplan);
|
||||
}
|
||||
|
||||
|
|
@ -228,12 +265,15 @@ ggml_backend_t ggml_backend_cpu_init(void) {
|
|||
ctx->abort_callback = NULL;
|
||||
ctx->abort_callback_data = NULL;
|
||||
ctx->use_ref = false;
|
||||
ctx->profiling_enabled = false;
|
||||
ctx->profiling_split_id = -1;
|
||||
|
||||
ggml_backend_t cpu_backend = new ggml_backend {
|
||||
/* .guid = */ ggml_backend_cpu_guid(),
|
||||
/* .iface = */ ggml_backend_cpu_i,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
|
||||
/* .context = */ ctx,
|
||||
/* .guid = */ ggml_backend_cpu_guid(),
|
||||
/* .iface = */ ggml_backend_cpu_i,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
|
||||
/* .context = */ ctx,
|
||||
/* .profiler = */ nullptr,
|
||||
};
|
||||
|
||||
if (cpu_backend == NULL) {
|
||||
|
|
@ -241,6 +281,43 @@ ggml_backend_t ggml_backend_cpu_init(void) {
|
|||
return NULL;
|
||||
}
|
||||
|
||||
// Register profiler
|
||||
static auto cpu_prof_enable = [](void * ctx, bool enable) {
|
||||
auto * cpu_ctx = (ggml_backend_cpu_context *) ctx;
|
||||
cpu_ctx->profiling_enabled = enable;
|
||||
if (!enable) {
|
||||
cpu_ctx->profiling_records.clear();
|
||||
}
|
||||
};
|
||||
static auto cpu_prof_reset = [](void * ctx) {
|
||||
auto * cpu_ctx = (ggml_backend_cpu_context *) ctx;
|
||||
cpu_ctx->profiling_records.clear();
|
||||
cpu_ctx->profiling_split_id = -1;
|
||||
};
|
||||
static auto cpu_prof_set_split_id = [](void * ctx, int split_id) {
|
||||
auto * cpu_ctx = (ggml_backend_cpu_context *) ctx;
|
||||
cpu_ctx->profiling_split_id = split_id;
|
||||
};
|
||||
static auto cpu_prof_get_records = [](void * ctx, const ggml_profile_record ** out) -> int {
|
||||
auto * cpu_ctx = (ggml_backend_cpu_context *) ctx;
|
||||
*out = cpu_ctx->profiling_records.data();
|
||||
return (int) cpu_ctx->profiling_records.size();
|
||||
};
|
||||
static auto cpu_prof_free = [](void * ctx) {
|
||||
// Nothing to free - records are in the CPU context's vector
|
||||
(void) ctx;
|
||||
};
|
||||
|
||||
auto * profiler = new ggml_backend_profiler{
|
||||
/* .context = */ ctx,
|
||||
/* .enable = */ cpu_prof_enable,
|
||||
/* .reset = */ cpu_prof_reset,
|
||||
/* .set_split_id = */ cpu_prof_set_split_id,
|
||||
/* .get_records = */ cpu_prof_get_records,
|
||||
/* .free_context = */ cpu_prof_free,
|
||||
};
|
||||
ggml_backend_set_profiler(cpu_backend, profiler);
|
||||
|
||||
return cpu_backend;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1339,6 +1339,9 @@ struct ggml_cuda_stream_context {
|
|||
}
|
||||
};
|
||||
|
||||
// Forward declaration for profiler state (defined in ggml-cuda.cu)
|
||||
struct ggml_cuda_profiler_state;
|
||||
|
||||
struct ggml_backend_cuda_context {
|
||||
int device;
|
||||
std::string name;
|
||||
|
|
@ -1434,6 +1437,9 @@ struct ggml_backend_cuda_context {
|
|||
ggml_cuda_pool & pool() {
|
||||
return pool(device);
|
||||
}
|
||||
|
||||
// Profiling
|
||||
ggml_cuda_profiler_state * profiler_state = nullptr;
|
||||
};
|
||||
|
||||
struct ggml_cuda_mm_fusion_args_host {
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
#include "ggml-cuda.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-profiler.h"
|
||||
|
||||
#include "ggml-cuda/common.cuh"
|
||||
#include "ggml-cuda/acc.cuh"
|
||||
|
|
@ -86,6 +87,90 @@
|
|||
|
||||
static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
||||
|
||||
// CUDA profiler state
|
||||
struct ggml_cuda_profiler_state {
|
||||
bool enabled = false;
|
||||
int split_id = -1;
|
||||
cudaStream_t stream = nullptr;
|
||||
|
||||
static constexpr int MAX_PENDING_EVENTS = 4096;
|
||||
std::vector<cudaEvent_t> start_events;
|
||||
std::vector<cudaEvent_t> end_events;
|
||||
int event_count = 0;
|
||||
|
||||
std::vector<ggml_profile_record> records;
|
||||
std::vector<int> record_event_indices;
|
||||
|
||||
void init(cudaStream_t stream) {
|
||||
this->stream = stream;
|
||||
start_events.reserve(MAX_PENDING_EVENTS);
|
||||
end_events.reserve(MAX_PENDING_EVENTS);
|
||||
}
|
||||
|
||||
void reset() {
|
||||
for (auto & ev : start_events) {
|
||||
cudaEventDestroy(ev);
|
||||
}
|
||||
for (auto & ev : end_events) {
|
||||
cudaEventDestroy(ev);
|
||||
}
|
||||
start_events.clear();
|
||||
end_events.clear();
|
||||
event_count = 0;
|
||||
records.clear();
|
||||
record_event_indices.clear();
|
||||
}
|
||||
|
||||
~ggml_cuda_profiler_state() {
|
||||
reset();
|
||||
}
|
||||
|
||||
void record_start() {
|
||||
cudaEvent_t ev;
|
||||
cudaEventCreate(&ev);
|
||||
cudaEventRecord(ev, stream);
|
||||
start_events.push_back(ev);
|
||||
event_count++;
|
||||
}
|
||||
|
||||
void record_end(const char * name, int backend_id, int split_id, uint64_t bytes, const char * extra, const int64_t ne[4]) {
|
||||
cudaEvent_t ev;
|
||||
cudaEventCreate(&ev);
|
||||
cudaEventRecord(ev, stream);
|
||||
end_events.push_back(ev);
|
||||
record_event_indices.push_back(records.size());
|
||||
|
||||
ggml_profile_record rec;
|
||||
rec.type = GGML_PROFILE_EVENT_OP;
|
||||
rec.name = name;
|
||||
rec.backend_id = backend_id;
|
||||
rec.split_id = split_id;
|
||||
rec.start_ns = 0;
|
||||
rec.end_ns = 0;
|
||||
rec.bytes = bytes;
|
||||
rec.extra = extra;
|
||||
if (ne) {
|
||||
memcpy(rec.ne, ne, sizeof(rec.ne));
|
||||
} else {
|
||||
memset(rec.ne, 0, sizeof(rec.ne));
|
||||
}
|
||||
records.push_back(rec);
|
||||
}
|
||||
|
||||
void finalize() {
|
||||
cudaStreamSynchronize(stream);
|
||||
|
||||
for (int i = 0; i < (int)record_event_indices.size(); i++) {
|
||||
float ms = 0.0f;
|
||||
cudaEventElapsedTime(&ms, start_events[i], end_events[i]);
|
||||
uint64_t ns = (uint64_t)(ms * 1e6f);
|
||||
int rec_idx = record_event_indices[i];
|
||||
records[rec_idx].start_ns = 0;
|
||||
records[rec_idx].end_ns = ns;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
[[noreturn]]
|
||||
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
|
||||
int id = -1; // in case cudaGetDevice fails
|
||||
|
|
@ -4035,8 +4120,23 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
|
|||
#else
|
||||
GGML_UNUSED(integrated);
|
||||
#endif // NDEBUG
|
||||
if (cuda_ctx->profiler_state != nullptr && cuda_ctx->profiler_state->enabled) {
|
||||
cuda_ctx->profiler_state->record_start();
|
||||
}
|
||||
|
||||
bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
|
||||
|
||||
if (cuda_ctx->profiler_state != nullptr && cuda_ctx->profiler_state->enabled) {
|
||||
cuda_ctx->profiler_state->record_end(
|
||||
ggml_op_name(node->op),
|
||||
-1,
|
||||
cuda_ctx->profiler_state->split_id,
|
||||
ggml_nbytes(node),
|
||||
nullptr,
|
||||
node->ne
|
||||
);
|
||||
}
|
||||
|
||||
if (!ok) {
|
||||
GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
|
||||
}
|
||||
|
|
@ -4107,6 +4207,19 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
|
|||
|
||||
ggml_cuda_set_device(cuda_ctx->device);
|
||||
|
||||
// Disable CUDA graphs when profiling (we need per-node timing)
|
||||
bool was_graph_enabled = false;
|
||||
if (cuda_ctx->profiler_state != nullptr && cuda_ctx->profiler_state->enabled) {
|
||||
#ifdef USE_CUDA_GRAPH
|
||||
const void * graph_key = ggml_cuda_graph_get_key(cgraph);
|
||||
ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key);
|
||||
was_graph_enabled = graph->is_enabled();
|
||||
if (was_graph_enabled) {
|
||||
graph->disable_due_to_gpu_arch = true;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
bool use_cuda_graph = false;
|
||||
bool cuda_graph_update_required = false;
|
||||
const void * graph_key = nullptr;
|
||||
|
|
@ -4158,6 +4271,15 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
|
|||
|
||||
ggml_cuda_graph_evaluate_and_capture(cuda_ctx, cgraph, use_cuda_graph, cuda_graph_update_required, graph_key);
|
||||
|
||||
// Restore CUDA graph enabled state after profiling
|
||||
if (was_graph_enabled) {
|
||||
#ifdef USE_CUDA_GRAPH
|
||||
const void * graph_key_prof = ggml_cuda_graph_get_key(cgraph);
|
||||
ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key_prof);
|
||||
graph->disable_due_to_gpu_arch = false;
|
||||
#endif
|
||||
}
|
||||
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
|
|
@ -5306,12 +5428,68 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
|
|||
}
|
||||
|
||||
ggml_backend_t cuda_backend = new ggml_backend {
|
||||
/* .guid = */ ggml_backend_cuda_guid(),
|
||||
/* .iface = */ ggml_backend_cuda_interface,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device),
|
||||
/* .context = */ ctx,
|
||||
/* .guid = */ ggml_backend_cuda_guid(),
|
||||
/* .iface = */ ggml_backend_cuda_interface,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device),
|
||||
/* .context = */ ctx,
|
||||
/* .profiler = */ nullptr,
|
||||
};
|
||||
|
||||
// Register profiler
|
||||
auto * prof_state = new ggml_cuda_profiler_state();
|
||||
prof_state->init(ctx->stream());
|
||||
ctx->profiler_state = prof_state;
|
||||
|
||||
static auto cuda_prof_enable = [](void * ctx, bool enable) {
|
||||
auto * cuda_ctx = (ggml_backend_cuda_context *)ctx;
|
||||
if (cuda_ctx->profiler_state != nullptr) {
|
||||
cuda_ctx->profiler_state->enabled = enable;
|
||||
if (!enable) {
|
||||
cuda_ctx->profiler_state->reset();
|
||||
}
|
||||
}
|
||||
};
|
||||
static auto cuda_prof_reset = [](void * ctx) {
|
||||
auto * cuda_ctx = (ggml_backend_cuda_context *)ctx;
|
||||
if (cuda_ctx->profiler_state != nullptr) {
|
||||
cuda_ctx->profiler_state->reset();
|
||||
cuda_ctx->profiler_state->split_id = -1;
|
||||
}
|
||||
};
|
||||
static auto cuda_prof_set_split_id = [](void * ctx, int split_id) {
|
||||
auto * cuda_ctx = (ggml_backend_cuda_context *)ctx;
|
||||
if (cuda_ctx->profiler_state != nullptr) {
|
||||
cuda_ctx->profiler_state->split_id = split_id;
|
||||
}
|
||||
};
|
||||
static auto cuda_prof_get_records = [](void * ctx, const ggml_profile_record ** out) -> int {
|
||||
auto * cuda_ctx = (ggml_backend_cuda_context *)ctx;
|
||||
if (cuda_ctx->profiler_state != nullptr) {
|
||||
cuda_ctx->profiler_state->finalize();
|
||||
*out = cuda_ctx->profiler_state->records.data();
|
||||
return (int)cuda_ctx->profiler_state->records.size();
|
||||
}
|
||||
*out = nullptr;
|
||||
return 0;
|
||||
};
|
||||
static auto cuda_prof_free = [](void * ctx) {
|
||||
auto * cuda_ctx = (ggml_backend_cuda_context *)ctx;
|
||||
if (cuda_ctx->profiler_state != nullptr) {
|
||||
delete cuda_ctx->profiler_state;
|
||||
cuda_ctx->profiler_state = nullptr;
|
||||
}
|
||||
};
|
||||
|
||||
auto * profiler = new ggml_backend_profiler {
|
||||
/* .context = */ ctx,
|
||||
/* .enable = */ cuda_prof_enable,
|
||||
/* .reset = */ cuda_prof_reset,
|
||||
/* .set_split_id = */ cuda_prof_set_split_id,
|
||||
/* .get_records = */ cuda_prof_get_records,
|
||||
/* .free_context = */ cuda_prof_free,
|
||||
};
|
||||
ggml_backend_set_profiler(cuda_backend, profiler);
|
||||
|
||||
return cuda_backend;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,74 @@
|
|||
#include "ggml-profiler.h"
|
||||
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-impl.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifdef _WIN32
|
||||
# define WIN32_LEAN_AND_MEAN
|
||||
# ifndef NOMINMAX
|
||||
# define NOMINMAX
|
||||
# endif
|
||||
# include <windows.h>
|
||||
#else
|
||||
# include <time.h>
|
||||
# include <unistd.h>
|
||||
#endif
|
||||
|
||||
//
|
||||
// Time utilities
|
||||
//
|
||||
|
||||
uint64_t ggml_profiler_time_ns(void) {
|
||||
#ifdef _WIN32
|
||||
LARGE_INTEGER freq, count;
|
||||
QueryPerformanceFrequency(&freq);
|
||||
QueryPerformanceCounter(&count);
|
||||
return (uint64_t) (count.QuadPart * 1000000000ULL / freq.QuadPart);
|
||||
#elif defined(__APPLE__)
|
||||
clock_serv_t cclock;
|
||||
mach_timespec_t mts;
|
||||
host_get_clock_service(mach_host_self(), SYSTEM_CLOCK, &cclock);
|
||||
clock_get_time(cclock, &mts);
|
||||
mach_port_deallocate(mach_task_self(), cclock);
|
||||
return (uint64_t) mts.tv_sec * 1000000000ULL + (uint64_t) mts.tv_nsec;
|
||||
#elif defined(CLOCK_MONOTONIC_RAW)
|
||||
struct timespec ts;
|
||||
clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
|
||||
return (uint64_t) ts.tv_sec * 1000000000ULL + (uint64_t) ts.tv_nsec;
|
||||
#else
|
||||
struct timespec ts;
|
||||
clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||
return (uint64_t) ts.tv_sec * 1000000000ULL + (uint64_t) ts.tv_nsec;
|
||||
#endif
|
||||
}
|
||||
|
||||
//
|
||||
// Backend profiler registration
|
||||
//
|
||||
|
||||
void ggml_backend_set_profiler(ggml_backend_t backend, ggml_backend_profiler_t profiler) {
|
||||
if (backend == NULL) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Free any existing profiler
|
||||
if (backend->profiler != NULL) {
|
||||
if (backend->profiler->free_context != NULL) {
|
||||
backend->profiler->free_context(backend->profiler->context);
|
||||
}
|
||||
delete backend->profiler;
|
||||
backend->profiler = NULL;
|
||||
}
|
||||
|
||||
backend->profiler = profiler;
|
||||
}
|
||||
|
||||
ggml_backend_profiler_t ggml_backend_get_profiler(ggml_backend_t backend) {
|
||||
if (backend == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
return backend->profiler;
|
||||
}
|
||||
|
|
@ -550,6 +550,8 @@ extern "C" {
|
|||
LLAMA_API llama_memory_t llama_get_memory (const struct llama_context * ctx);
|
||||
LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type
|
||||
|
||||
LLAMA_API struct ggml_backend_sched * llama_context_get_sched(const struct llama_context * ctx);
|
||||
|
||||
LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
|
||||
LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model);
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
#include "llama-context.h"
|
||||
|
||||
#include "ggml-profiler.h"
|
||||
#include "llama-arch.h"
|
||||
#include "llama-impl.h"
|
||||
#include "llama-batch.h"
|
||||
|
|
@ -2189,6 +2190,11 @@ ggml_status llama_context::graph_compute(
|
|||
LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status);
|
||||
}
|
||||
|
||||
// If profiling is enabled, synchronize to ensure records are complete
|
||||
if (ggml_backend_sched_get_profiling(sched.get())) {
|
||||
ggml_backend_sched_synchronize(sched.get());
|
||||
}
|
||||
|
||||
// fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(sched));
|
||||
|
||||
return status;
|
||||
|
|
@ -3025,6 +3031,10 @@ enum llama_pooling_type llama_pooling_type(const llama_context * ctx) {
|
|||
return ctx->pooling_type();
|
||||
}
|
||||
|
||||
ggml_backend_sched_t llama_context_get_sched(const llama_context * ctx) {
|
||||
return ctx->get_sched();
|
||||
}
|
||||
|
||||
void llama_attach_threadpool(
|
||||
llama_context * ctx,
|
||||
ggml_threadpool_t threadpool,
|
||||
|
|
|
|||
|
|
@ -644,6 +644,23 @@ int main(int argc, char ** argv) {
|
|||
ctx_cli.ctx_server.terminate();
|
||||
inference_thread.join();
|
||||
|
||||
// Export profiling data if profiling was enabled
|
||||
if (params.profiling) {
|
||||
ggml_backend_sched_t sched = llama_context_get_sched(ctx_cli.ctx_server.get_llama_context());
|
||||
if (sched != nullptr) {
|
||||
if (params.profiling_output.empty()) {
|
||||
ggml_backend_sched_print_profiling(sched);
|
||||
} else {
|
||||
int ret = ggml_backend_sched_export_profiling_json(sched, params.profiling_output.c_str());
|
||||
if (ret == 0) {
|
||||
console::log("\nProfiling data exported to: %s\n", params.profiling_output.c_str());
|
||||
} else {
|
||||
console::error("\nFailed to export profiling data to: %s\n", params.profiling_output.c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// bump the log level to display timings
|
||||
common_log_set_verbosity_thold(LOG_LEVEL_INFO);
|
||||
llama_memory_breakdown_print(ctx_cli.ctx_server.get_llama_context());
|
||||
|
|
|
|||
|
|
@ -997,6 +997,23 @@ int main(int argc, char ** argv) {
|
|||
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
|
||||
}
|
||||
|
||||
// Export profiling data if profiling was enabled
|
||||
if (params.profiling) {
|
||||
ggml_backend_sched_t sched = llama_context_get_sched(ctx);
|
||||
if (sched != nullptr) {
|
||||
if (params.profiling_output.empty()) {
|
||||
ggml_backend_sched_print_profiling(sched);
|
||||
} else {
|
||||
int ret = ggml_backend_sched_export_profiling_json(sched, params.profiling_output.c_str());
|
||||
if (ret == 0) {
|
||||
LOG("\nProfiling data exported to: %s\n", params.profiling_output.c_str());
|
||||
} else {
|
||||
LOG_ERR("\nFailed to export profiling data to: %s\n", params.profiling_output.c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
LOG("\n\n");
|
||||
common_perf_print(ctx, smpl);
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1 @@
|
|||
# llama.cpp profiler analysis tools
|
||||
|
|
@ -0,0 +1,986 @@
|
|||
#!/usr/bin/env python3
|
||||
"""llama.cpp cross-backend profiler analysis tool.
|
||||
|
||||
Usage:
|
||||
python -m tools.profiler.profiler profile.json
|
||||
python -m tools.profiler.profiler profile.json --chrome-trace trace.json
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import sys
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
|
||||
OP_EVENT = 0
|
||||
COPY_EVENT = 1
|
||||
|
||||
TYPE_NAMES = {0: "OP", 1: "COPY"}
|
||||
|
||||
|
||||
@dataclass
|
||||
class ProfileRecord:
|
||||
type: int
|
||||
name: str
|
||||
backend_id: int
|
||||
split_id: int
|
||||
start_ns: int
|
||||
duration_ns: int
|
||||
bytes: int
|
||||
extra: Optional[str]
|
||||
ne: list[int] = field(default_factory=lambda: [0, 0, 0, 0])
|
||||
|
||||
@property
|
||||
def type_name(self) -> str:
|
||||
return TYPE_NAMES.get(self.type, f"UNKNOWN({self.type})")
|
||||
|
||||
@property
|
||||
def duration_us(self) -> float:
|
||||
return self.duration_ns / 1000.0
|
||||
|
||||
@property
|
||||
def duration_ms(self) -> float:
|
||||
return self.duration_ns / 1_000_000.0
|
||||
|
||||
@property
|
||||
def bandwidth_gbps(self) -> float:
|
||||
"""Bandwidth in GB/s (only meaningful for copy events)."""
|
||||
if self.duration_ns == 0 or self.bytes == 0:
|
||||
return 0.0
|
||||
return self.bytes / self.duration_ns
|
||||
|
||||
@property
|
||||
def shape_str(self) -> str:
|
||||
"""Human-readable tensor shape string, e.g. '[4096, 4096]'."""
|
||||
dims = [n for n in self.ne if n > 0]
|
||||
if not dims:
|
||||
return ""
|
||||
return "[" + ", ".join(str(d) for d in dims) + "]"
|
||||
|
||||
@property
|
||||
def ne_elements(self) -> int:
|
||||
"""Total number of elements."""
|
||||
result = 1
|
||||
for n in self.ne:
|
||||
if n > 0:
|
||||
result *= n
|
||||
return result
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
"type": self.type,
|
||||
"name": self.name,
|
||||
"backend_id": self.backend_id,
|
||||
"split_id": self.split_id,
|
||||
"start_ns": self.start_ns,
|
||||
"duration_ns": self.duration_ns,
|
||||
"bytes": self.bytes,
|
||||
"extra": self.extra,
|
||||
"ne": self.ne,
|
||||
}
|
||||
|
||||
|
||||
@dataclass
|
||||
class OpStats:
|
||||
name: str
|
||||
event_type: int
|
||||
backend_id: int
|
||||
count: int = 0
|
||||
total_ns: int = 0
|
||||
min_ns: int = 0
|
||||
max_ns: int = 0
|
||||
total_bytes: int = 0
|
||||
representative_ne: list[int] = field(default_factory=lambda: [0, 0, 0, 0])
|
||||
|
||||
@property
|
||||
def avg_ns(self) -> float:
|
||||
return self.total_ns / self.count if self.count > 0 else 0
|
||||
|
||||
@property
|
||||
def avg_us(self) -> float:
|
||||
return self.avg_ns / 1000.0
|
||||
|
||||
@property
|
||||
def total_ms(self) -> float:
|
||||
return self.total_ns / 1_000_000.0
|
||||
|
||||
@property
|
||||
def min_us(self) -> float:
|
||||
return self.min_ns / 1000.0
|
||||
|
||||
@property
|
||||
def max_us(self) -> float:
|
||||
return self.max_ns / 1000.0
|
||||
|
||||
@property
|
||||
def bandwidth_gbps(self) -> float:
|
||||
if self.total_ns == 0 or self.total_bytes == 0:
|
||||
return 0.0
|
||||
return self.total_bytes / self.total_ns
|
||||
|
||||
@property
|
||||
def time_per_byte_ns(self) -> float:
|
||||
"""Time per byte (lower = more efficient)."""
|
||||
if self.total_bytes == 0:
|
||||
return float("inf")
|
||||
return self.total_ns / self.total_bytes
|
||||
|
||||
@property
|
||||
def type_name(self) -> str:
|
||||
return TYPE_NAMES.get(self.event_type, f"UNKNOWN({self.event_type})")
|
||||
|
||||
|
||||
class ProfileData:
|
||||
def __init__(self, records: list[ProfileRecord], metadata: dict):
|
||||
self.records = records
|
||||
self.metadata = metadata
|
||||
|
||||
@classmethod
|
||||
def load(cls, filepath: str | Path) -> ProfileData:
|
||||
"""Load a profiler JSON file."""
|
||||
with open(filepath, "r") as f:
|
||||
data = json.load(f)
|
||||
|
||||
if data.get("profiler") != "ggml":
|
||||
print(f"Warning: file may not be a ggml profiler output (profiler={data.get('profiler')})")
|
||||
|
||||
records = []
|
||||
for r in data.get("records", []):
|
||||
ne = r.get("ne", [0, 0, 0, 0])
|
||||
if isinstance(ne, list) and len(ne) < 4:
|
||||
ne = ne + [0] * (4 - len(ne))
|
||||
elif not isinstance(ne, list):
|
||||
ne = [0, 0, 0, 0]
|
||||
records.append(ProfileRecord(
|
||||
type=r.get("type", 0),
|
||||
name=r.get("name", "unknown"),
|
||||
backend_id=r.get("backend_id", 0),
|
||||
split_id=r.get("split_id", 0),
|
||||
start_ns=r.get("start_ns", 0),
|
||||
duration_ns=r.get("duration_ns", 0),
|
||||
bytes=r.get("bytes", 0),
|
||||
extra=r.get("extra"),
|
||||
ne=ne,
|
||||
))
|
||||
|
||||
backends_raw = data.get("backends", [])
|
||||
backends = []
|
||||
for b in backends_raw:
|
||||
backends.append({
|
||||
"id": b.get("id", 0),
|
||||
"name": b.get("name", "unknown"),
|
||||
"device": b.get("device", "unknown"),
|
||||
"device_type": b.get("device_type", 0),
|
||||
})
|
||||
|
||||
metadata = {
|
||||
"version": data.get("version", 0),
|
||||
"total_records": data.get("total_records", len(records)),
|
||||
"total_ns": data.get("total_ns", sum(r.duration_ns for r in records)),
|
||||
"backends": backends,
|
||||
}
|
||||
|
||||
return cls(records, metadata)
|
||||
|
||||
@property
|
||||
def total_ns(self) -> int:
|
||||
return sum(r.duration_ns for r in self.records)
|
||||
|
||||
@property
|
||||
def total_ms(self) -> float:
|
||||
return self.total_ns / 1_000_000.0
|
||||
|
||||
def stats(self) -> list[OpStats]:
|
||||
"""Aggregate stats grouped by (name, type, backend_id)."""
|
||||
groups: dict[tuple, OpStats] = {}
|
||||
for rec in self.records:
|
||||
key = (rec.name, rec.type, rec.backend_id)
|
||||
if key not in groups:
|
||||
groups[key] = OpStats(
|
||||
name=rec.name,
|
||||
event_type=rec.type,
|
||||
backend_id=rec.backend_id,
|
||||
min_ns=rec.duration_ns,
|
||||
max_ns=rec.duration_ns,
|
||||
representative_ne=list(rec.ne),
|
||||
)
|
||||
s = groups[key]
|
||||
s.count += 1
|
||||
s.total_ns += rec.duration_ns
|
||||
s.min_ns = min(s.min_ns, rec.duration_ns)
|
||||
s.max_ns = max(s.max_ns, rec.duration_ns)
|
||||
s.total_bytes += rec.bytes
|
||||
|
||||
# Track the ne from the longest individual call
|
||||
if rec.duration_ns >= s.max_ns:
|
||||
s.representative_ne = list(rec.ne)
|
||||
|
||||
return sorted(groups.values(), key=lambda s: s.total_ns, reverse=True)
|
||||
|
||||
def top_operations(self, n: int = 10) -> list[OpStats]:
|
||||
"""Return the N most time-consuming operations (aggregated)."""
|
||||
return self.stats()[:n]
|
||||
|
||||
def top_kernels(self, n: int = 10) -> list[ProfileRecord]:
|
||||
"""Return the N longest individual kernel executions."""
|
||||
return sorted(self.records, key=lambda r: r.duration_ns, reverse=True)[:n]
|
||||
|
||||
def by_backend(self) -> dict[int, list[ProfileRecord]]:
|
||||
"""Group records by backend ID."""
|
||||
groups: dict[int, list[ProfileRecord]] = {}
|
||||
for rec in self.records:
|
||||
groups.setdefault(rec.backend_id, []).append(rec)
|
||||
return dict(sorted(groups.items()))
|
||||
|
||||
def timeline(self) -> list[ProfileRecord]:
|
||||
"""Return records sorted by start_ns for timeline visualization."""
|
||||
return sorted(self.records, key=lambda r: r.start_ns)
|
||||
|
||||
def inefficiency_ranking(self, n: int = 10) -> list[OpStats]:
|
||||
"""Rank operations by time per byte (inefficiency). Lower is better."""
|
||||
all_stats = [s for s in self.stats() if s.total_bytes > 0 and s.event_type == OP_EVENT]
|
||||
return sorted(all_stats, key=lambda s: s.time_per_byte_ns, reverse=True)[:n]
|
||||
|
||||
    def summary(self) -> None:
        """Print a formatted summary table to stdout.

        Sections, in order:
          1. Header with record count, total time, and unique-op count.
          2. Per-operation table (aggregated stats from self.stats()).
          3. Per-backend totals (only when more than one backend is present).
          4. Top-5 inefficiency ranking (time per byte, worst first).
          5. Top-5 longest individual kernel executions.
        """
        # --- Header block ---
        print(f"\n{'='*80}")
        print(f" ggml Profiler Summary")
        print(f"{'='*80}")
        print(f" Total records: {len(self.records)}")
        print(f" Total time: {self.total_ms:.2f} ms")
        # Uniqueness key matches the aggregation key used by stats().
        print(f" Unique ops: {len(set((r.name, r.type, r.backend_id) for r in self.records))}")
        print(f"{'='*80}\n")

        stats = self.stats()
        if not stats:
            # Nothing was recorded; bail out after the header.
            print(" No profiling data.\n")
            return

        # --- Per-operation table header (two rows: names, then units) ---
        print(f" {'TYPE':<5} {'BKND':>4} {'Operation':<28} {'%Time':>7} {'Count':>6} "
              f"{'Total':>10} {'Avg':>10} {'Min':>10} {'Max':>10} {'Bytes':>10}")
        print(f" {'':->5} {'':->4} {'':->28} {'':->7} {'':->6} "
              f"{'(ms)':>10} {'(us)':>10} {'(us)':>10} {'(us)':>10} {'':->10}")

        for s in stats:
            # Guard against division by zero when no time was recorded.
            pct = 100.0 * s.total_ns / self.total_ns if self.total_ns > 0 else 0

            line = (f" {s.type_name:<5} {s.backend_id:>4} {s.name:<28} {pct:>6.1f}% "
                    f"{s.count:>6} {s.total_ms:>10.2f} {s.avg_us:>10.2f} "
                    f"{s.min_us:>10.2f} {s.max_us:>10.2f}")

            # Last column: bandwidth for copies, raw byte volume for compute,
            # blank padding when no byte count is available.
            if s.total_bytes > 0:
                bw = s.bandwidth_gbps
                bytes_str = f"{s.total_bytes / 1e6:.1f} MB"
                if s.event_type == COPY_EVENT:
                    line += f" {bw:>8.2f} GB/s"
                else:
                    line += f" {bytes_str:>10}"
            else:
                line += f" {'':>10}"

            # Tensor shape from longest call
            shape_dims = [n for n in s.representative_ne if n > 0]
            if shape_dims:
                line += f" [{', '.join(str(d) for d in shape_dims)}]"

            print(line)

        # --- Per-backend breakdown (skipped for single-backend runs) ---
        backend_groups = self.by_backend()
        if len(backend_groups) > 1:
            print(f"\n --- By Backend ---")
            for bid, recs in sorted(backend_groups.items()):
                bk_total = sum(r.duration_ns for r in recs)
                bk_pct = 100.0 * bk_total / self.total_ns if self.total_ns > 0 else 0
                print(f" Backend {bid}: {bk_total / 1e6:.2f} ms ({bk_pct:.1f}%) — {len(recs)} records")

        # --- Worst time-per-byte operations ---
        inef = self.inefficiency_ranking(5)
        if inef:
            print(f"\n --- Top 5 Inefficient Operations (time/byte) ---")
            for s in inef:
                print(f" {s.name:<28} {s.time_per_byte_ns / 1000:.2f} us/byte "
                      f"({s.count} calls, {s.total_bytes / 1e6:.1f} MB)")

        # --- Longest single kernel executions ---
        top_k = self.top_kernels(5)
        print(f"\n --- Top 5 Longest Kernels ---")
        for rec in top_k:
            shape = f" {rec.shape_str}" if rec.shape_str else ""
            print(f" {rec.type_name:<5} {rec.name:<28} {rec.duration_us:>10.2f} us{shape} "
                  f"(split={rec.split_id}, backend={rec.backend_id})")

        print()
|
||||
|
||||
    def export_chrome_trace(self, filepath: str | Path) -> None:
        """Export as Chrome Trace Event format for chrome://tracing.

        Emits one "M" (metadata) event per backend to name the process
        rows, then one "X" (complete) event per profile record. Because
        per-backend clocks are not globally comparable, events are laid
        out sequentially per (backend, split) group with synthetic
        timestamps rather than the recorded start times.
        """
        events = []

        # Build backend name mapping and remap to non-negative PIDs
        # (Chrome cannot handle negative PIDs)
        backend_ids = sorted(set(rec.backend_id for rec in self.records))
        backend_names: dict[int, str] = {}
        pid_map: dict[int, int] = {}

        # Use metadata from JSON if available
        metadata_backends = self.metadata.get("backends", [])
        backend_by_id: dict[int, dict] = {b["id"]: b for b in metadata_backends}

        device_type_names = {0: "CPU", 1: "GPU", 2: "ACCEL"}
        for idx, bid in enumerate(backend_ids):
            pid_map[bid] = idx
            if bid in backend_by_id:
                # Prefer a human-readable "Type: device" label when the
                # JSON metadata carries a usable device name.
                binfo = backend_by_id[bid]
                dev_type = binfo.get("device_type", 0)
                dev_name = binfo.get("device", "")
                type_name = device_type_names.get(dev_type, "Device")
                if dev_name and dev_name != "unknown":
                    backend_names[bid] = f"{type_name}: {dev_name}"
                else:
                    backend_names[bid] = f"{type_name}: {binfo.get('name', f'Backend {bid}')}"
            else:
                # No metadata for this backend — fall back to a generic label.
                backend_names[bid] = f"Backend {bid}"

        # Process metadata events
        for bid in backend_ids:
            pid = pid_map[bid]
            events.append({
                "ph": "M",  # metadata
                "pid": pid,
                "name": "process_name",
                "args": {"name": backend_names[bid]},
            })

        # Group records by (backend_id, split_id) and lay them out sequentially
        # since we don't have reliable global timestamps across backends.
        # Within each group, events are cumulative.
        from collections import OrderedDict
        groups: OrderedDict[tuple, list[ProfileRecord]] = OrderedDict()
        for rec in self.records:
            key = (rec.backend_id, rec.split_id)
            groups.setdefault(key, []).append(rec)

        # Assign timestamps: each group starts after the previous one,
        # and events within a group are sequential (cumulative duration).
        global_ts = 0.0  # microseconds
        for key, recs in groups.items():
            backend_id, split_id = key
            pid = pid_map[backend_id]
            tid = f"split_{split_id}"

            for rec in recs:
                cat = "copy" if rec.type == COPY_EVENT else "compute"
                events.append({
                    "ph": "X",  # complete event
                    "pid": pid,
                    "tid": tid,
                    "name": rec.name,
                    "ts": global_ts,
                    "dur": rec.duration_ns / 1000.0,  # us
                    "cat": cat,
                    "args": {
                        "bytes": rec.bytes,
                        "duration_us": rec.duration_ns / 1000.0,
                        "shape": rec.shape_str,
                    },
                })
                global_ts += rec.duration_ns / 1000.0

            # Add a small gap between groups for visual separation
            global_ts += 1.0

        # Chrome's trace viewer accepts a bare {"traceEvents": [...]} object.
        trace = {"traceEvents": events}
        with open(filepath, "w") as f:
            json.dump(trace, f, indent=2)

        print(f"Chrome trace exported to: {filepath}")
        print(f"Open chrome://tracing in Chrome/Edge and load this file.")
|
||||
|
||||
    def export_html_viewer(self, filepath: str | Path, max_records: int = 0) -> None:
        """Export a self-contained interactive HTML timeline viewer using Canvas.

        The output is a single HTML file with inline CSS and JavaScript:
        a zoomable/pannable canvas timeline (one lane per backend, plus a
        minimap), a hover tooltip, a legend, and a collapsible
        op -> backend -> shape stats table.

        Args:
            filepath: destination path for the HTML file.
            max_records: if > 0, downsample the event list by striding so at
                most this many events are embedded (keeps file size bounded).
        """
        import json as json_mod

        # Backend metadata from the profile JSON, if present.
        metadata_backends = self.metadata.get("backends", [])
        backend_by_id: dict[int, dict] = {b["id"]: b for b in metadata_backends}

        # Prefer the device string over the backend name when it is usable.
        backend_names: dict[int, str] = {}
        for bid in sorted(set(rec.backend_id for rec in self.records)):
            binfo = backend_by_id.get(bid, {})
            name = binfo.get("name", f"Backend {bid}")
            device = binfo.get("device", "")
            backend_names[bid] = device if device and device != "unknown" else name

        # Flatten records into compact dicts (short keys keep the embedded
        # JSON small). "start" is a synthetic cumulative timestamp in us,
        # since recorded timestamps are not globally comparable.
        events: list[dict] = []
        cum_us = 0.0
        for rec in self.records:
            dur_us = rec.duration_ns / 1000.0
            events.append({
                "n": rec.name,
                "d": dur_us,
                "s": rec.shape_str,
                "b": rec.bytes,
                "t": rec.type,
                "bid": rec.backend_id,
                "start": cum_us,
            })
            cum_us += dur_us
        total_us = cum_us

        # Downsample AFTER total_us is computed, so the header still shows
        # the true total time even when events are strided out.
        if max_records > 0 and len(events) > max_records:
            stride = len(events) // max_records
            events = events[::stride][:max_records]

        if total_us == 0:
            print("No profiling data to export.")
            return

        header_stats = str(len(events)) + ' events | ' + f'{total_us/1000:.1f}' + ' ms'

        # Build backend name map with string keys for JSON
        bn_str = {str(k): v for k, v in backend_names.items()}

        # --- HTML ---
        html = (
            '<!DOCTYPE html>\n<html><head><meta charset="utf-8">'
            '<title>ggml Profiler</title>\n<style>\n'
            '*{margin:0;padding:0;box-sizing:border-box}\n'
            'body{font-family:system-ui,sans-serif;background:#1a1a2e;color:#eee;'
            'display:flex;flex-direction:column;height:100vh;overflow:hidden}\n'
            '#hd{background:#16213e;padding:8px 16px;display:flex;align-items:center;'
            'gap:16px;border-bottom:1px solid #0f3460;flex-shrink:0}\n'
            '#hd h1{font-size:15px;color:#e94560}\n'
            '#hd .st{font-size:11px;color:#888}\n'
            '#tb{background:#16213e;padding:6px 16px;border-bottom:1px solid #0f3460;'
            'display:flex;align-items:center;gap:6px;flex-shrink:0}\n'
            '#tb button{background:#0f3460;color:#eee;border:none;padding:5px 12px;'
            'cursor:pointer;border-radius:3px;font-size:11px}\n'
            '#tb button:hover{background:#e94560}\n'
            '#vi{font-size:10px;color:#888;margin-left:auto}\n'
            '#main{flex:1;display:flex;flex-direction:column;overflow:hidden}\n'
            '#cw{flex-shrink:0;overflow:hidden;position:relative}\n'
            '#c{display:block}\n'
            '#stats{flex:1;overflow-y:auto;background:#1a1a2e;border-top:1px solid #0f3460}\n'
            '#stats table{width:100%;border-collapse:collapse;font-size:11px}\n'
            '#stats thead{position:sticky;top:0;z-index:1}\n'
            '#stats th{text-align:left;padding:6px 10px;color:#888;background:#16213e;'
            'border-bottom:1px solid #0f3460;font-weight:normal;font-size:10px;'
            'text-transform:uppercase;letter-spacing:0.5px}\n'
            '#stats th.r{text-align:right}\n'
            '#stats td{padding:4px 10px;border-bottom:1px solid rgba(15,52,96,0.4)}\n'
            '#stats td.r{text-align:right;font-variant-numeric:tabular-nums;font-family:monospace,system-ui}\n'
            '#stats .l0 td{background:rgba(30,30,54,0.6)}\n'
            '#stats .l0:hover td{background:rgba(40,40,70,0.8)}\n'
            '#stats .l1:hover td,.l2:hover td{background:rgba(35,35,60,0.5)}\n'
            '#stats .tog{cursor:pointer;user-select:none;color:#666;'
            'width:16px;display:inline-block;text-align:center;font-size:9px}\n'
            '#stats .tog:hover{color:#e94560}\n'
            '#stats .pct-cell{position:relative}\n'
            '#stats .pct-bg{position:absolute;left:0;top:1px;bottom:1px;border-radius:2px;pointer-events:none}\n'
            '#stats .pct-tx{position:relative}\n'
            '#tt{position:fixed;background:#16213e;border:1px solid #e94560;'
            'padding:10px;border-radius:5px;font-size:11px;display:none;'
            'z-index:100;pointer-events:none;max-width:280px;line-height:1.6}\n'
            '#lg{background:#16213e;padding:6px 16px;border-top:1px solid #0f3460;'
            'font-size:10px;flex-shrink:0}\n'
            '</style></head><body>\n'
            '<div id="hd"><h1>ggml Profiler Timeline</h1>'
            '<span class="st">' + header_stats + '</span></div>\n'
            '<div id="tb">'
            '<button onclick="fitAll()">Fit</button>'
            '<button onclick="zoomTo(1000000)">1s</button>'
            '<button onclick="zoomTo(100000)">100ms</button>'
            '<button onclick="zoomTo(10000)">10ms</button>'
            '<button onclick="zoomTo(1000)">1ms</button>'
            '<button onclick="zoomTo(100)">100\u03bcs</button>'
            '<span id="vi"></span></div>\n'
            '<div id="main">\n'
            '<div id="cw"><canvas id="c"></canvas></div>\n'
            '<div id="stats"></div>\n'
            '</div>\n'
            '<div id="tt"></div>\n'
            '<div id="lg"></div>\n'
            '<script>\n'
        )

        # --- Inject data ---
        # repr(total_us) preserves full float precision in the JS source.
        html += 'var EVENTS=' + json_mod.dumps(events, separators=(',', ':')) + ';\n'
        html += 'var BACKENDS=' + json_mod.dumps(bn_str, separators=(',', ':')) + ';\n'
        html += 'var TOTAL_US=' + repr(total_us) + ';\n'

        # --- JavaScript (plain string, no f-strings) ---
        js = r"""
// Pre-process: group events by lane
var LANE_IDS=[],seen={};
for(var i=0;i<EVENTS.length;i++){var b=EVENTS[i].bid;if(!(b in seen)){LANE_IDS.push(b);seen[b]=true;}}
LANE_IDS.sort(function(a,b){return a-b;});
var LANE_EVENTS={};
for(var i=0;i<LANE_IDS.length;i++)LANE_EVENTS[LANE_IDS[i]]=[];
for(var i=0;i<EVENTS.length;i++)LANE_EVENTS[EVENTS[i].bid].push(EVENTS[i]);

// Constants
var LANE_H=32,LABEL_W=150,MINIMAP_H=28,AXIS_H=18;
var TOP_PAD=MINIMAP_H+AXIS_H;
var BAR_PAD=3,COPY_PAD=8;

// Colors
var OP_COL={'MUL_MAT':'#4285f4','FLASH_ATTN_EXT':'#e879a0','ADD':'#81c784',
'ROPE':'#ce93d8','GET_ROWS':'#ffab91','CPY':'#b0bec5','CONCAT':'#90caf9',
'SCALE':'#80deea','MUL':'#a5d6a7','SOFT_MAX':'#fff176','RMS_NORM':'#ffcc80',
'SILU':'#ef9a9a','CONT':'#80cbc4','RESHAPE':'#9fa8da','VIEW':'#a1887f',
'PERMUTE':'#90a4ae','TRANSPOSE':'#c5e1a5','UNARY':'#f48fb1'};
function hash(s){var h=0;for(var i=0;i<s.length;i++)h=((h<<5)-h)+s.charCodeAt(i);return Math.abs(h);}
function col(n){return OP_COL[n]||('hsl('+hash(n)%360+',60%,55%)');}
function fmtT(us){if(us>=1e6)return(us/1e6).toFixed(2)+'s';if(us>=1e3)return(us/1e3).toFixed(2)+'ms';return us.toFixed(1)+'\u03bcs';}
function fmtB(b){if(!b)return'';if(b>=1e9)return(b/1e9).toFixed(1)+'GB';if(b>=1e6)return(b/1e6).toFixed(1)+'MB';if(b>=1e3)return(b/1e3).toFixed(1)+'KB';return b+'B';}
function fmtSh(s){if(!s)return'';return s.replace(/[\[\],]/g,function(m){return'<span style="color:#e8a040">'+m+'</span>';});}

// Canvas state
var canvas=document.getElementById('c');
var ctx,canvasW,canvasH,viewW;
var scale=1,offsetUs=0;
var hoveredEv=null,isDragging=false,dragStartX,dragStartOff;

function setup(){
var dpr=window.devicePixelRatio||1;
canvasW=canvas.parentElement.clientWidth;
canvasH=Math.max(200,LANE_IDS.length*LANE_H+TOP_PAD+4);
canvas.width=Math.round(canvasW*dpr);
canvas.height=Math.round(canvasH*dpr);
canvas.style.width=canvasW+'px';
canvas.style.height=canvasH+'px';
ctx=canvas.getContext('2d');
ctx.scale(dpr,dpr);
viewW=canvasW-LABEL_W;
document.getElementById('cw').style.height=canvasH+'px';
}

// Binary search: first event where start+d >= t
function bsFirst(evts,t){
var lo=0,hi=evts.length;
while(lo<hi){var m=(lo+hi)>>1;if(evts[m].start+evts[m].d<t)lo=m+1;else hi=m;}
return lo;
}
// Binary search: find event containing time t
function bsHit(evts,t){
var lo=0,hi=evts.length-1;
while(lo<=hi){
var m=(lo+hi)>>1;var ev=evts[m];
if(t<ev.start)hi=m-1;
else if(t>ev.start+ev.d)lo=m+1;
else return ev;
}
return null;
}

// Pre-render minimap to offscreen canvas
var mmCanvas;
function buildMinimap(){
var dpr=window.devicePixelRatio||1;
mmCanvas=document.createElement('canvas');
mmCanvas.width=Math.round(canvasW*dpr);
mmCanvas.height=Math.round(MINIMAP_H*dpr);
var mc=mmCanvas.getContext('2d');
mc.scale(dpr,dpr);
mc.fillStyle='#0d1117';
mc.fillRect(0,0,canvasW,MINIMAP_H);
var mmScale=canvasW/TOTAL_US;
for(var li=0;li<LANE_IDS.length;li++){
var evts=LANE_EVENTS[LANE_IDS[li]];
var step=Math.max(1,Math.floor(evts.length/(canvasW*2)));
mc.globalAlpha=0.6;
for(var i=0;i<evts.length;i+=step){
var ev=evts[i];
mc.fillStyle=col(ev.n);
mc.fillRect(ev.start*mmScale,2,Math.max(0.5,ev.d*mmScale),MINIMAP_H-4);
}
}
mc.globalAlpha=1;
}

function clampOffset(){
var maxOff=TOTAL_US-viewW/scale;
if(maxOff<0)maxOff=0;
if(offsetUs<0)offsetUs=0;
if(offsetUs>maxOff)offsetUs=maxOff;
}

function render(){
ctx.clearRect(0,0,canvasW,canvasH);
var visStart=offsetUs,visEnd=offsetUs+viewW/scale;

// Minimap
ctx.drawImage(mmCanvas,0,0,canvasW,MINIMAP_H);
var vpX=offsetUs/TOTAL_US*canvasW,vpW=viewW/scale/TOTAL_US*canvasW;
ctx.strokeStyle='#e94560';ctx.lineWidth=2;
ctx.strokeRect(vpX,1,Math.max(2,vpW),MINIMAP_H-2);
ctx.fillStyle='rgba(233,69,96,0.15)';
ctx.fillRect(vpX,1,Math.max(2,vpW),MINIMAP_H-2);

// Time axis background
ctx.fillStyle='#12122a';
ctx.fillRect(LABEL_W,MINIMAP_H,viewW,AXIS_H);

// Time axis ticks
var rangeUs=visEnd-visStart;
if(rangeUs>0){
var raw=rangeUs/8;
var mag=Math.pow(10,Math.floor(Math.log10(raw)));
var iv;if(raw/mag<2)iv=2*mag;else if(raw/mag<5)iv=5*mag;else iv=10*mag;
var firstTick=Math.ceil(visStart/iv)*iv;
ctx.fillStyle='#555';ctx.font='9px monospace';
ctx.strokeStyle='rgba(255,255,255,0.06)';ctx.lineWidth=1;
for(var t=firstTick;t<=visEnd;t+=iv){
var tx=LABEL_W+(t-offsetUs)*scale;
ctx.beginPath();ctx.moveTo(tx,TOP_PAD);ctx.lineTo(tx,canvasH);ctx.stroke();
ctx.fillText(fmtT(t),tx+3,MINIMAP_H+AXIS_H-4);
}
}

// Lanes
for(var li=0;li<LANE_IDS.length;li++){
var bid=LANE_IDS[li];
var y=TOP_PAD+li*LANE_H;

// Background
ctx.fillStyle=li%2===0?'#1a1a2e':'#1c1c34';
ctx.fillRect(LABEL_W,y,viewW,LANE_H);

// Events (clipped to event area)
ctx.save();
ctx.beginPath();ctx.rect(LABEL_W,y,viewW,LANE_H);ctx.clip();
var evts=LANE_EVENTS[bid];
if(evts&&evts.length>0){
var si=bsFirst(evts,visStart);
for(var i=si;i<evts.length;i++){
var ev=evts[i];
if(ev.start>visEnd)break;
var x=LABEL_W+(ev.start-offsetUs)*scale;
var w=ev.d*scale;
ctx.fillStyle=col(ev.n);
if(ev.t===1){
ctx.globalAlpha=0.7;
ctx.fillRect(x,y+COPY_PAD,Math.max(0.5,w),LANE_H-2*COPY_PAD);
ctx.globalAlpha=1;
}else{
ctx.fillRect(x,y+BAR_PAD,Math.max(0.5,w),LANE_H-2*BAR_PAD);
}
if(w>50){
ctx.fillStyle='#fff';ctx.font='10px system-ui';
ctx.fillText(ev.n,x+3,y+LANE_H/2+3,w-6);
}
}
}
ctx.restore();

// Hover highlight
if(hoveredEv&&hoveredEv.bid===bid){
var hx=LABEL_W+(hoveredEv.start-offsetUs)*scale;
var hw=hoveredEv.d*scale;
ctx.save();
ctx.beginPath();ctx.rect(LABEL_W,y,viewW,LANE_H);ctx.clip();
ctx.strokeStyle='#fff';ctx.lineWidth=2;
ctx.strokeRect(hx-1,y+2,Math.max(3,hw+2),LANE_H-4);
ctx.restore();
}

// Lane separator
ctx.strokeStyle='#0f3460';ctx.lineWidth=0.5;
ctx.beginPath();ctx.moveTo(0,y+LANE_H-0.5);ctx.lineTo(canvasW,y+LANE_H-0.5);ctx.stroke();

// Label background + text
ctx.fillStyle='#16213e';ctx.fillRect(0,y,LABEL_W,LANE_H);
ctx.fillStyle='#ccc';ctx.font='11px system-ui';
ctx.fillText(BACKENDS[bid]||('B'+bid),8,y+LANE_H/2+4);
}

// Axis label area background (covers labels column in axis row)
ctx.fillStyle='#16213e';ctx.fillRect(0,MINIMAP_H,LABEL_W,AXIS_H);
ctx.fillStyle='#666';ctx.font='9px monospace';ctx.fillText('Time',8,MINIMAP_H+AXIS_H-4);

// View info
document.getElementById('vi').textContent=fmtT(visStart)+' \u2014 '+fmtT(visEnd)+' ('+fmtT(rangeUs)+' visible)';
}

// --- Zoom / Pan ---
function fitAll(){scale=viewW/TOTAL_US;offsetUs=0;render();}
function zoomTo(us){scale=viewW/us;render();}

canvas.addEventListener('wheel',function(e){
e.preventDefault();
var r=canvas.getBoundingClientRect();
var mx=e.clientX-r.left-LABEL_W;
if(mx<0)return;
var mu=offsetUs+mx/scale;
scale*=(e.deltaY>0?0.8:1.25);
var minScale=viewW/TOTAL_US*0.5;
if(scale<minScale)scale=minScale;
offsetUs=mu-mx/scale;
clampOffset();render();
},{passive:false});

canvas.addEventListener('mousedown',function(e){
var r=canvas.getBoundingClientRect();
var my=e.clientY-r.top;
if(my<MINIMAP_H){
var frac=(e.clientX-r.left)/canvasW;
offsetUs=frac*TOTAL_US-viewW/scale/2;
clampOffset();render();return;
}
isDragging=true;dragStartX=e.clientX;dragStartOff=offsetUs;
canvas.style.cursor='grabbing';
});
document.addEventListener('mousemove',function(e){
if(!isDragging)return;
offsetUs=dragStartOff-(e.clientX-dragStartX)/scale;
clampOffset();render();
});
document.addEventListener('mouseup',function(){
if(isDragging){isDragging=false;canvas.style.cursor='default';}
});

// --- Tooltip ---
var tip=document.getElementById('tt');
canvas.addEventListener('mousemove',function(e){
if(isDragging)return;
var r=canvas.getBoundingClientRect();
var mx=e.clientX-r.left,my=e.clientY-r.top;
var li=Math.floor((my-TOP_PAD)/LANE_H);
if(li<0||li>=LANE_IDS.length||mx<LABEL_W){
if(hoveredEv){hoveredEv=null;render();}
tip.style.display='none';return;
}
var bid=LANE_IDS[li];
var mu=offsetUs+(mx-LABEL_W)/scale;
var ev=bsHit(LANE_EVENTS[bid],mu);
if(ev){
if(hoveredEv!==ev){hoveredEv=ev;render();}
var h='<b style="color:#e94560">'+ev.n+'</b><br>'+fmtT(ev.d)+' | '+(BACKENDS[ev.bid]||'B'+ev.bid);
if(ev.s)h+='<br>Shape: '+fmtSh(ev.s);
if(ev.b)h+='<br>Bytes: '+fmtB(ev.b);
tip.innerHTML=h;tip.style.display='block';
tip.style.left=Math.min(e.clientX+15,window.innerWidth-280)+'px';
tip.style.top=Math.min(e.clientY+15,window.innerHeight-100)+'px';
}else{
if(hoveredEv){hoveredEv=null;render();}
tip.style.display='none';
}
});
canvas.addEventListener('mouseleave',function(){
if(hoveredEv){hoveredEv=null;render();}
tip.style.display='none';
});

// --- Keyboard ---
document.addEventListener('keydown',function(e){
var step=viewW/scale*0.2;
if(e.key==='ArrowLeft'){offsetUs-=step;clampOffset();render();}
else if(e.key==='ArrowRight'){offsetUs+=step;clampOffset();render();}
else if(e.key==='+'||e.key==='='){scale*=1.5;render();}
else if(e.key==='-'){scale/=1.5;var mn=viewW/TOTAL_US*0.5;if(scale<mn)scale=mn;render();}
else if(e.key==='Home'){fitAll();}
});

// --- Resize ---
window.addEventListener('resize',function(){setup();buildMinimap();render();});

// --- Legend ---
function buildLegend(){
var counts={};
for(var i=0;i<EVENTS.length;i++){var n=EVENTS[i].n;counts[n]=(counts[n]||0)+1;}
var entries=[];for(var n in counts)entries.push([n,counts[n]]);
entries.sort(function(a,b){return b[1]-a[1];});
var top=entries.slice(0,12);
var h='';
for(var i=0;i<top.length;i++){
h+='<span style="display:inline-block;margin:0 8px"><span style="display:inline-block;width:10px;height:10px;border-radius:2px;background:'+col(top[i][0])+';margin-right:4px;vertical-align:middle"></span>'+top[i][0]+'</span>';
}
document.getElementById('lg').innerHTML=h;
}

// --- Stats tree-table ---
function buildStats(){
var ops={};
for(var i=0;i<EVENTS.length;i++){
var ev=EVENTS[i];
if(!ops[ev.n])ops[ev.n]={name:ev.n,d:0,count:0,min:Infinity,max:0,backends:{}};
var op=ops[ev.n];
op.d+=ev.d;op.count++;
if(ev.d<op.min)op.min=ev.d;if(ev.d>op.max)op.max=ev.d;
var bk=String(ev.bid);
if(!op.backends[bk])op.backends[bk]={bid:ev.bid,d:0,count:0,min:Infinity,max:0,shapes:{}};
var b=op.backends[bk];
b.d+=ev.d;b.count++;
if(ev.d<b.min)b.min=ev.d;if(ev.d>b.max)b.max=ev.d;
var sh=ev.s||'\u2014';
if(!b.shapes[sh])b.shapes[sh]={d:0,count:0,min:Infinity,max:0};
var s=b.shapes[sh];
s.d+=ev.d;s.count++;
if(ev.d<s.min)s.min=ev.d;if(ev.d>s.max)s.max=ev.d;
}
var sorted=[];for(var k in ops)sorted.push(ops[k]);
sorted.sort(function(a,b){return b.d-a.d;});

// Build flat row list
var rows=[],rid=0;
for(var oi=0;oi<sorted.length;oi++){
var op=sorted[oi];
var opId=rid++;
var bkeys=[];for(var bk in op.backends)bkeys.push(bk);
bkeys.sort(function(a,b){return op.backends[b].d-op.backends[a].d;});
rows.push({id:opId,p:-1,lv:0,name:op.name,d:op.d,count:op.count,
min:op.min,max:op.max,pct:op.d/TOTAL_US*100,ch:bkeys.length>0});
for(var bi=0;bi<bkeys.length;bi++){
var bdata=op.backends[bkeys[bi]];
var bId=rid++;
var bname=BACKENDS[bdata.bid]||('B'+bdata.bid);
var skeys=[];for(var sk in bdata.shapes)skeys.push(sk);
skeys.sort(function(a,b){return bdata.shapes[b].d-bdata.shapes[a].d;});
rows.push({id:bId,p:opId,lv:1,name:bname,d:bdata.d,count:bdata.count,
min:bdata.min,max:bdata.max,pct:bdata.d/TOTAL_US*100,ch:skeys.length>0});
for(var si=0;si<skeys.length;si++){
var sdata=bdata.shapes[skeys[si]];
var sId=rid++;
rows.push({id:sId,p:bId,lv:2,name:skeys[si],d:sdata.d,count:sdata.count,
min:sdata.min,max:sdata.max,pct:sdata.d/TOTAL_US*100,ch:false});
}
}
}

// Render
var h='<table><thead><tr><th style="width:30%">Operation</th>'
+'<th class="r" style="width:12%">% Time</th>'
+'<th class="r" style="width:12%">Total</th>'
+'<th class="r" style="width:10%">Count</th>'
+'<th class="r" style="width:12%">Avg</th>'
+'<th class="r" style="width:12%">Min</th>'
+'<th class="r" style="width:12%">Max</th>'
+'</tr></thead><tbody>';

for(var ri=0;ri<rows.length;ri++){
var r=rows[ri];
var indent=8+r.lv*20;
var vis=r.lv===0?'':'display:none';
var tog=r.ch?'<span class="tog" onclick="togRow('+r.id+',this)">\u25b6</span>'
:'<span style="width:16px;display:inline-block"></span>';
var nc;
if(r.lv===0)nc='color:'+col(r.name)+';font-weight:bold';
else if(r.lv===1)nc='color:#ccc';
else nc='color:#888';
var barC=r.lv===0?col(r.name):'rgba(100,140,200,0.3)';
var barO=r.lv===0?'0.25':'0.2';

h+='<tr class="l'+r.lv+'" data-id="'+r.id+'" data-p="'+r.p+'" style="'+vis+'">';
var dn=r.lv===2?fmtSh(r.name):r.name;
h+='<td style="padding-left:'+indent+'px">'+tog+'<span style="'+nc+'">'+dn+'</span></td>';
h+='<td class="r pct-cell"><div class="pct-bg" style="width:'+Math.max(0.5,r.pct)+'%;background:'+barC+';opacity:'+barO+'"></div><span class="pct-tx">'+r.pct.toFixed(1)+'%</span></td>';
h+='<td class="r">'+fmtT(r.d)+'</td>';
h+='<td class="r">'+r.count.toLocaleString()+'</td>';
h+='<td class="r">'+fmtT(r.d/r.count)+'</td>';
h+='<td class="r">'+fmtT(r.min)+'</td>';
h+='<td class="r">'+fmtT(r.max)+'</td>';
h+='</tr>';
}
h+='</tbody></table>';
document.getElementById('stats').innerHTML=h;
}

function togRow(pid,el){
var exp=el.textContent==='\u25bc';
el.textContent=exp?'\u25b6':'\u25bc';
var children=document.querySelectorAll('#stats tr[data-p="'+pid+'"]');
for(var i=0;i<children.length;i++){
children[i].style.display=exp?'none':'';
if(exp){
// Collapse grandchildren too
var cid=children[i].getAttribute('data-id');
var ctog=children[i].querySelector('.tog');
if(ctog)ctog.textContent='\u25b6';
var gc=document.querySelectorAll('#stats tr[data-p="'+cid+'"]');
for(var j=0;j<gc.length;j++)gc[j].style.display='none';
}
}
}

// --- Init ---
setup();buildMinimap();buildLegend();buildStats();fitAll();
"""

        html += js + '\n</script></body></html>'

        with open(filepath, "w") as f:
            f.write(html)

        print(f"HTML viewer exported to: {filepath}")
        print(f"Open in browser: file://{Path(filepath).resolve()}")
|
||||
|
||||
|
||||
def load(filepath: str | Path) -> ProfileData:
    """Convenience wrapper: parse a profiler JSON dump into a ProfileData."""
    data = ProfileData.load(filepath)
    return data
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: load a profiler JSON dump and run the requested reports.

    Exports (Chrome trace / HTML viewer) and text reports can be combined;
    when no specific report or export is requested, the full summary table
    is printed instead.
    """
    import argparse

    arg_parser = argparse.ArgumentParser(
        description="llama.cpp profiler analysis tool",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python -m tools.profiler.profiler profile.json
  python -m tools.profiler.profiler profile.json --chrome-trace trace.json
  python -m tools.profiler.profiler profile.json --top-ops 20
""",
    )
    arg_parser.add_argument("profile", help="Path to profiler JSON file")
    arg_parser.add_argument("--chrome-trace", metavar="FILE",
                            help="Export as Chrome Trace Event format")
    arg_parser.add_argument("--html-viewer", metavar="FILE",
                            help="Export as interactive HTML timeline viewer")
    arg_parser.add_argument("--html-max-records", type=int, default=5000,
                            help="Max records per backend in HTML viewer (0=unlimited, downsample to reduce file size)")
    arg_parser.add_argument("--top-ops", type=int, default=0,
                            help="Show top N operations (0 = show summary)")
    arg_parser.add_argument("--top-kernels", type=int, default=0,
                            help="Show top N longest kernels")
    arg_parser.add_argument("--inefficiency", action="store_true",
                            help="Show inefficiency ranking")

    opts = arg_parser.parse_args()

    profile_data = load(opts.profile)

    if opts.chrome_trace:
        profile_data.export_chrome_trace(opts.chrome_trace)

    if opts.html_viewer:
        profile_data.export_html_viewer(opts.html_viewer, max_records=opts.html_max_records)

    if opts.top_ops > 0:
        print(f"\nTop {opts.top_ops} operations by total time:\n")
        for op in profile_data.top_operations(opts.top_ops):
            # Guard against a zero total to avoid division by zero.
            share = 100.0 * op.total_ns / profile_data.total_ns if profile_data.total_ns > 0 else 0
            print(f" {op.type_name:<5} {op.backend_id:>4} {op.name:<28} {share:>6.1f}% "
                  f"{op.count:>6}x {op.total_ms:>10.2f} ms avg={op.avg_us:.2f} us")
        print()

    if opts.top_kernels > 0:
        print(f"\nTop {opts.top_kernels} longest kernels:\n")
        for kernel in profile_data.top_kernels(opts.top_kernels):
            print(f" {kernel.type_name:<5} {kernel.backend_id:>4} {kernel.name:<28} "
                  f"{kernel.duration_us:>10.2f} us split={kernel.split_id}")
        print()

    if opts.inefficiency:
        print("\nInefficiency ranking (time/byte for operations with data):\n")
        for op in profile_data.inefficiency_ranking(10):
            print(f" {op.name:<28} {op.time_per_byte_ns / 1000:>10.2f} us/byte "
                  f"{op.count:>6} calls {op.total_bytes / 1e6:.1f} MB")
        print()

    # Default action: full summary when nothing else was requested.
    if opts.top_ops == 0 and opts.top_kernels == 0 and not opts.inefficiency and not opts.chrome_trace and not opts.html_viewer:
        profile_data.summary()
|
||||
|
||||
|
||||
# Allow running this module directly as a command-line tool.
if __name__ == "__main__":
    main()
|
||||
Loading…
Reference in New Issue