report shapes of both source tensors (ne_src0, ne_src1) instead of the output tensor shape, fix Mac build, add missing .profiler initializer to all backends

This commit is contained in:
Piotr Wilkin 2026-03-29 01:49:52 +01:00
parent 26459c7ede
commit dee7edea92
14 changed files with 79 additions and 58 deletions

View File

@ -28,7 +28,7 @@ extern "C" {
void * profiling_context; void * profiling_context;
// callback for recording a profile record from C code (set by backend when profiling) // callback for recording a profile record from C code (set by backend when profiling)
// params: context, type (0=OP, 1=COPY), name, split_id, start_ns, end_ns, bytes, extra, ne[4] // params: context, type, name, split_id, start_ns, end_ns, bytes, extra, ne_src0[4], ne_src1[4]
void (*profiling_record_fn)(void * context, void (*profiling_record_fn)(void * context,
int type, int type,
const char * name, const char * name,
@ -37,7 +37,8 @@ extern "C" {
uint64_t end_ns, uint64_t end_ns,
uint64_t bytes, uint64_t bytes,
const char * extra, const char * extra,
const int64_t ne[4]); const int64_t ne_src0[4],
const int64_t ne_src1[4]);
}; };
// numa strategies // numa strategies

View File

@ -27,7 +27,8 @@ typedef struct ggml_profile_record {
uint64_t end_ns; // end timestamp in nanoseconds uint64_t end_ns; // end timestamp in nanoseconds
uint64_t bytes; // bytes transferred (for copy) or tensor size (for ops) uint64_t bytes; // bytes transferred (for copy) or tensor size (for ops)
const char * extra; // fusion name for fused ops, or NULL const char * extra; // fusion name for fused ops, or NULL
int64_t ne[4]; // output tensor dimensions [ne0, ne1, ne2, ne3] int64_t ne_src0[4]; // src[0] tensor dimensions (e.g. weight matrix for MUL_MAT)
int64_t ne_src1[4]; // src[1] tensor dimensions (e.g. input matrix for MUL_MAT)
} ggml_profile_record; } ggml_profile_record;
// Backend profiler interface - each backend optionally implements this // Backend profiler interface - each backend optionally implements this

View File

@ -1515,7 +1515,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
} }
sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id, split_id, sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id, split_id,
copy_start, copy_end, ggml_nbytes(input), NULL, {0} }); copy_start, copy_end, ggml_nbytes(input), NULL, {0}, {0} });
} else { } else {
ggml_backend_tensor_copy(input, input_cpy); ggml_backend_tensor_copy(input, input_cpy);
} }
@ -1638,7 +1638,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
} }
sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id, sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id,
split_id, copy_start, copy_end, ggml_nbytes(input), NULL, {0} }); split_id, copy_start, copy_end, ggml_nbytes(input), NULL, {0}, {0} });
} else { } else {
ggml_backend_tensor_copy(input, input_cpy); ggml_backend_tensor_copy(input, input_cpy);
} }
@ -1660,7 +1660,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
} }
sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id, sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id,
split_id, copy_start, copy_end, ggml_nbytes(input), NULL, {0} }); split_id, copy_start, copy_end, ggml_nbytes(input), NULL, {0}, {0} });
} }
} }
} }
@ -2468,7 +2468,7 @@ void ggml_backend_sched_print_profiling(ggml_backend_sched_t sched) {
s.max_ns = dur; s.max_ns = dur;
s.count = 1; s.count = 1;
s.total_bytes = rec.bytes; s.total_bytes = rec.bytes;
memcpy(s.representative_ne, rec.ne, sizeof(s.representative_ne)); memcpy(s.representative_ne, rec.ne_src0, sizeof(s.representative_ne));
stats.push_back(s); stats.push_back(s);
} }
} }
@ -2571,9 +2571,11 @@ int ggml_backend_sched_write_profiling_json(ggml_backend_sched_t sched, FILE * f
fprintf(fp, "null"); fprintf(fp, "null");
} }
// Tensor dimensions // Tensor dimensions (both source tensors)
fprintf(fp, ", \"ne\": [%lld, %lld, %lld, %lld]", (long long) rec.ne[0], (long long) rec.ne[1], fprintf(fp, ", \"ne_src0\": [%lld, %lld, %lld, %lld]", (long long) rec.ne_src0[0], (long long) rec.ne_src0[1],
(long long) rec.ne[2], (long long) rec.ne[3]); (long long) rec.ne_src0[2], (long long) rec.ne_src0[3]);
fprintf(fp, ", \"ne_src1\": [%lld, %lld, %lld, %lld]", (long long) rec.ne_src1[0], (long long) rec.ne_src1[1],
(long long) rec.ne_src1[2], (long long) rec.ne_src1[3]);
fprintf(fp, "}%s\n", (i < (int) sched->profiling_records.size() - 1) ? "," : ""); fprintf(fp, "}%s\n", (i < (int) sched->profiling_records.size() - 1) ? "," : "");
} }

View File

@ -274,7 +274,8 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend,
rec.end_ns = t_end; rec.end_ns = t_end;
rec.bytes = ggml_nbytes(node); rec.bytes = ggml_nbytes(node);
rec.extra = NULL; rec.extra = NULL;
memcpy(rec.ne, node->ne, sizeof(rec.ne)); if (node->src[0]) { memcpy(rec.ne_src0, node->src[0]->ne, sizeof(rec.ne_src0)); } else { memset(rec.ne_src0, 0, sizeof(rec.ne_src0)); }
if (node->src[1]) { memcpy(rec.ne_src1, node->src[1]->ne, sizeof(rec.ne_src1)); } else { memset(rec.ne_src1, 0, sizeof(rec.ne_src1)); }
ctx->profiling_records.push_back(rec); ctx->profiling_records.push_back(rec);
} }
} }

View File

@ -3011,9 +3011,14 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
if (state->ith == 0) { if (state->ith == 0) {
uint64_t t_end = ggml_profiler_time_ns(); uint64_t t_end = ggml_profiler_time_ns();
cplan->profiling_record_fn(cplan->profiling_context, 0 /* GGML_PROFILE_EVENT_OP */, {
ggml_op_name(node->op), -1, t_start, t_end, ggml_nbytes(node), NULL, static const int64_t zero_ne[4] = {0, 0, 0, 0};
node->ne); const int64_t * src0_ne = node->src[0] ? node->src[0]->ne : zero_ne;
const int64_t * src1_ne = node->src[1] ? node->src[1]->ne : zero_ne;
cplan->profiling_record_fn(cplan->profiling_context, 0 /* GGML_PROFILE_EVENT_OP */,
ggml_op_name(node->op), -1, t_start, t_end, ggml_nbytes(node), NULL,
src0_ne, src1_ne);
}
} }
if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {

View File

@ -182,7 +182,8 @@ static void ggml_cpu_profiler_record_callback(void * context,
uint64_t end_ns, uint64_t end_ns,
uint64_t bytes, uint64_t bytes,
const char * extra, const char * extra,
const int64_t ne[4]) { const int64_t ne_src0[4],
const int64_t ne_src1[4]) {
auto * cpu_ctx = (ggml_backend_cpu_context *) context; auto * cpu_ctx = (ggml_backend_cpu_context *) context;
ggml_profile_record rec; ggml_profile_record rec;
rec.type = (enum ggml_profile_event_type) type; rec.type = (enum ggml_profile_event_type) type;
@ -193,10 +194,15 @@ static void ggml_cpu_profiler_record_callback(void * context,
rec.end_ns = end_ns; rec.end_ns = end_ns;
rec.bytes = bytes; rec.bytes = bytes;
rec.extra = extra; rec.extra = extra;
if (ne) { if (ne_src0) {
memcpy(rec.ne, ne, sizeof(rec.ne)); memcpy(rec.ne_src0, ne_src0, sizeof(rec.ne_src0));
} else { } else {
memset(rec.ne, 0, sizeof(rec.ne)); memset(rec.ne_src0, 0, sizeof(rec.ne_src0));
}
if (ne_src1) {
memcpy(rec.ne_src1, ne_src1, sizeof(rec.ne_src1));
} else {
memset(rec.ne_src1, 0, sizeof(rec.ne_src1));
} }
cpu_ctx->profiling_records.push_back(rec); cpu_ctx->profiling_records.push_back(rec);
} }

View File

@ -132,7 +132,8 @@ struct ggml_cuda_profiler_state {
event_count++; event_count++;
} }
void record_end(const char * name, int backend_id, int split_id, uint64_t bytes, const char * extra, const int64_t ne[4]) { void record_end(const char * name, int backend_id, int split_id, uint64_t bytes, const char * extra,
const int64_t ne_src0[4], const int64_t ne_src1[4]) {
cudaEvent_t ev; cudaEvent_t ev;
cudaEventCreate(&ev); cudaEventCreate(&ev);
cudaEventRecord(ev, stream); cudaEventRecord(ev, stream);
@ -148,11 +149,8 @@ struct ggml_cuda_profiler_state {
rec.end_ns = 0; rec.end_ns = 0;
rec.bytes = bytes; rec.bytes = bytes;
rec.extra = extra; rec.extra = extra;
if (ne) { if (ne_src0) { memcpy(rec.ne_src0, ne_src0, sizeof(rec.ne_src0)); } else { memset(rec.ne_src0, 0, sizeof(rec.ne_src0)); }
memcpy(rec.ne, ne, sizeof(rec.ne)); if (ne_src1) { memcpy(rec.ne_src1, ne_src1, sizeof(rec.ne_src1)); } else { memset(rec.ne_src1, 0, sizeof(rec.ne_src1)); }
} else {
memset(rec.ne, 0, sizeof(rec.ne));
}
records.push_back(rec); records.push_back(rec);
} }
@ -4041,7 +4039,8 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
cuda_ctx->profiler_state->split_id, cuda_ctx->profiler_state->split_id,
ggml_nbytes(node), ggml_nbytes(node),
nullptr, nullptr,
node->ne node->src[0] ? node->src[0]->ne : nullptr,
node->src[1] ? node->src[1]->ne : nullptr
); );
} }

View File

@ -4082,7 +4082,8 @@ ggml_backend_t ggml_backend_opencl_init(void) {
/* .guid = */ ggml_backend_opencl_guid(), /* .guid = */ ggml_backend_opencl_guid(),
/* .iface = */ ggml_backend_opencl_i, /* .iface = */ ggml_backend_opencl_i,
/* .device = */ dev, /* .device = */ dev,
/* .context = */ backend_ctx /* .context = */ backend_ctx,
/* .profiler = */ nullptr,
}; };
return backend; return backend;
@ -5897,6 +5898,7 @@ static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, co
/* .interface = */ ggml_backend_opencl_i, /* .interface = */ ggml_backend_opencl_i,
/* .device = */ dev, /* .device = */ dev,
/* .context = */ backend_ctx, /* .context = */ backend_ctx,
/* .profiler = */ nullptr,
}; };
return backend; return backend;

View File

@ -27,13 +27,6 @@ uint64_t ggml_profiler_time_ns(void) {
QueryPerformanceFrequency(&freq); QueryPerformanceFrequency(&freq);
QueryPerformanceCounter(&count); QueryPerformanceCounter(&count);
return (uint64_t) (count.QuadPart * 1000000000ULL / freq.QuadPart); return (uint64_t) (count.QuadPart * 1000000000ULL / freq.QuadPart);
#elif defined(__APPLE__)
clock_serv_t cclock;
mach_timespec_t mts;
host_get_clock_service(mach_host_self(), SYSTEM_CLOCK, &cclock);
clock_get_time(cclock, &mts);
mach_port_deallocate(mach_task_self(), cclock);
return (uint64_t) mts.tv_sec * 1000000000ULL + (uint64_t) mts.tv_nsec;
#elif defined(CLOCK_MONOTONIC_RAW) #elif defined(CLOCK_MONOTONIC_RAW)
struct timespec ts; struct timespec ts;
clock_gettime(CLOCK_MONOTONIC_RAW, &ts); clock_gettime(CLOCK_MONOTONIC_RAW, &ts);

View File

@ -952,7 +952,8 @@ ggml_backend_t ggml_backend_rpc_init(const char * endpoint, uint32_t device) {
/* .guid = */ ggml_backend_rpc_guid(), /* .guid = */ ggml_backend_rpc_guid(),
/* .iface = */ ggml_backend_rpc_interface, /* .iface = */ ggml_backend_rpc_interface,
/* .device = */ ggml_backend_reg_dev_get(reg, device), /* .device = */ ggml_backend_reg_dev_get(reg, device),
/* .context = */ ctx /* .context = */ ctx,
/* .profiler = */ nullptr,
}; };
return backend; return backend;
} }

View File

@ -5147,7 +5147,8 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
/* .guid = */ ggml_backend_sycl_guid(), /* .guid = */ ggml_backend_sycl_guid(),
/* .iface = */ ggml_backend_sycl_interface, /* .iface = */ ggml_backend_sycl_interface,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_sycl_reg(), device), /* .device = */ ggml_backend_reg_dev_get(ggml_backend_sycl_reg(), device),
/* .context = */ ctx /* .context = */ ctx,
/* .profiler = */ nullptr,
}; };
return sycl_backend; return sycl_backend;

View File

@ -15006,7 +15006,8 @@ ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
/* .guid = */ ggml_backend_vk_guid(), /* .guid = */ ggml_backend_vk_guid(),
/* .iface = */ ggml_backend_vk_interface, /* .iface = */ ggml_backend_vk_interface,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), dev_num), /* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), dev_num),
/* .context = */ ctx, /* .context = */ ctx,
/* .profiler = */ nullptr,
}; };
if (!ctx->device->support_async) { if (!ctx->device->support_async) {

View File

@ -431,7 +431,8 @@ ggml_backend_t ggml_backend_zendnn_init(void) {
/* .guid = */ ggml_backend_zendnn_guid(), /* .guid = */ ggml_backend_zendnn_guid(),
/* .iface = */ ggml_backend_zendnn_i, /* .iface = */ ggml_backend_zendnn_i,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_zendnn_reg(), 0), /* .device = */ ggml_backend_reg_dev_get(ggml_backend_zendnn_reg(), 0),
/* .context = */ ctx, /* .context = */ ctx,
/* .profiler = */ nullptr,
}; };
return backend; return backend;

View File

@ -31,7 +31,8 @@ class ProfileRecord:
duration_ns: int duration_ns: int
bytes: int bytes: int
extra: Optional[str] extra: Optional[str]
ne: list[int] = field(default_factory=lambda: [0, 0, 0, 0]) ne_src0: list[int] = field(default_factory=lambda: [0, 0, 0, 0])
ne_src1: list[int] = field(default_factory=lambda: [0, 0, 0, 0])
@property @property
def type_name(self) -> str: def type_name(self) -> str:
@ -52,22 +53,21 @@ class ProfileRecord:
return 0.0 return 0.0
return self.bytes / self.duration_ns return self.bytes / self.duration_ns
@property @staticmethod
def shape_str(self) -> str: def _fmt_ne(ne: list[int]) -> str:
"""Human-readable tensor shape string, e.g. '[4096, 4096]'.""" dims = [n for n in ne if n > 0]
dims = [n for n in self.ne if n > 0]
if not dims: if not dims:
return "" return ""
return "[" + ", ".join(str(d) for d in dims) + "]" return "[" + ", ".join(str(d) for d in dims) + "]"
@property @property
def ne_elements(self) -> int: def shape_str(self) -> str:
"""Total number of elements.""" """Human-readable tensor shapes, e.g. '[4096, 4096] x [4096, 1]'."""
result = 1 s0 = self._fmt_ne(self.ne_src0)
for n in self.ne: s1 = self._fmt_ne(self.ne_src1)
if n > 0: if s0 and s1:
result *= n return s0 + " x " + s1
return result return s0 or s1
def to_dict(self) -> dict: def to_dict(self) -> dict:
return { return {
@ -79,7 +79,8 @@ class ProfileRecord:
"duration_ns": self.duration_ns, "duration_ns": self.duration_ns,
"bytes": self.bytes, "bytes": self.bytes,
"extra": self.extra, "extra": self.extra,
"ne": self.ne, "ne_src0": self.ne_src0,
"ne_src1": self.ne_src1,
} }
@ -148,12 +149,17 @@ class ProfileData:
print(f"Warning: file may not be a ggml profiler output (profiler={data.get('profiler')})") print(f"Warning: file may not be a ggml profiler output (profiler={data.get('profiler')})")
records = [] records = []
def _pad_ne(v):
if isinstance(v, list) and len(v) < 4:
return v + [0] * (4 - len(v))
if not isinstance(v, list):
return [0, 0, 0, 0]
return v
for r in data.get("records", []): for r in data.get("records", []):
ne = r.get("ne", [0, 0, 0, 0]) # Support both old "ne" format and new "ne_src0"/"ne_src1" format
if isinstance(ne, list) and len(ne) < 4: ne_src0 = _pad_ne(r.get("ne_src0", r.get("ne", [0, 0, 0, 0])))
ne = ne + [0] * (4 - len(ne)) ne_src1 = _pad_ne(r.get("ne_src1", [0, 0, 0, 0]))
elif not isinstance(ne, list):
ne = [0, 0, 0, 0]
records.append(ProfileRecord( records.append(ProfileRecord(
type=r.get("type", 0), type=r.get("type", 0),
name=r.get("name", "unknown"), name=r.get("name", "unknown"),
@ -163,7 +169,8 @@ class ProfileData:
duration_ns=r.get("duration_ns", 0), duration_ns=r.get("duration_ns", 0),
bytes=r.get("bytes", 0), bytes=r.get("bytes", 0),
extra=r.get("extra"), extra=r.get("extra"),
ne=ne, ne_src0=ne_src0,
ne_src1=ne_src1,
)) ))
backends_raw = data.get("backends", []) backends_raw = data.get("backends", [])
@ -205,7 +212,7 @@ class ProfileData:
backend_id=rec.backend_id, backend_id=rec.backend_id,
min_ns=rec.duration_ns, min_ns=rec.duration_ns,
max_ns=rec.duration_ns, max_ns=rec.duration_ns,
representative_ne=list(rec.ne), representative_ne=list(rec.ne_src0),
) )
s = groups[key] s = groups[key]
s.count += 1 s.count += 1
@ -532,7 +539,7 @@ function hash(s){var h=0;for(var i=0;i<s.length;i++)h=((h<<5)-h)+s.charCodeAt(i)
function col(n){return OP_COL[n]||('hsl('+hash(n)%360+',60%,55%)');} function col(n){return OP_COL[n]||('hsl('+hash(n)%360+',60%,55%)');}
function fmtT(us){if(us>=1e6)return(us/1e6).toFixed(2)+'s';if(us>=1e3)return(us/1e3).toFixed(2)+'ms';return us.toFixed(1)+'\u03bcs';} function fmtT(us){if(us>=1e6)return(us/1e6).toFixed(2)+'s';if(us>=1e3)return(us/1e3).toFixed(2)+'ms';return us.toFixed(1)+'\u03bcs';}
function fmtB(b){if(!b)return'';if(b>=1e9)return(b/1e9).toFixed(1)+'GB';if(b>=1e6)return(b/1e6).toFixed(1)+'MB';if(b>=1e3)return(b/1e3).toFixed(1)+'KB';return b+'B';} function fmtB(b){if(!b)return'';if(b>=1e9)return(b/1e9).toFixed(1)+'GB';if(b>=1e6)return(b/1e6).toFixed(1)+'MB';if(b>=1e3)return(b/1e3).toFixed(1)+'KB';return b+'B';}
function fmtSh(s){if(!s)return'';return s.replace(/[\[\],]/g,function(m){return'<span style="color:#e8a040">'+m+'</span>';});} function fmtSh(s){if(!s)return'';return s.replace(/[\[\],]| x /g,function(m){return'<span style="color:#e8a040">'+m+'</span>';});}
// Canvas state // Canvas state
var canvas=document.getElementById('c'); var canvas=document.getElementById('c');