diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h index 76305d71de..43885350cf 100644 --- a/ggml/include/ggml-cpu.h +++ b/ggml/include/ggml-cpu.h @@ -28,7 +28,7 @@ extern "C" { void * profiling_context; // callback for recording a profile record from C code (set by backend when profiling) - // params: context, type (0=OP, 1=COPY), name, split_id, start_ns, end_ns, bytes, extra, ne[4] + // params: context, type, name, split_id, start_ns, end_ns, bytes, extra, ne_src0[4], ne_src1[4] void (*profiling_record_fn)(void * context, int type, const char * name, @@ -37,7 +37,8 @@ extern "C" { uint64_t end_ns, uint64_t bytes, const char * extra, - const int64_t ne[4]); + const int64_t ne_src0[4], + const int64_t ne_src1[4]); }; // numa strategies diff --git a/ggml/include/ggml-profiler.h b/ggml/include/ggml-profiler.h index 773b0635f1..5026d45001 100644 --- a/ggml/include/ggml-profiler.h +++ b/ggml/include/ggml-profiler.h @@ -27,7 +27,8 @@ typedef struct ggml_profile_record { uint64_t end_ns; // end timestamp in nanoseconds uint64_t bytes; // bytes transferred (for copy) or tensor size (for ops) const char * extra; // fusion name for fused ops, or NULL - int64_t ne[4]; // output tensor dimensions [ne0, ne1, ne2, ne3] + int64_t ne_src0[4]; // src[0] tensor dimensions (e.g. weight matrix for MUL_MAT) + int64_t ne_src1[4]; // src[1] tensor dimensions (e.g. input matrix for MUL_MAT) } ggml_profile_record; // Backend profiler interface - each backend optionally implements this diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index d341b245af..b8b204fc60 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1515,7 +1515,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id, split_id, - copy_start, copy_end, ggml_nbytes(input), NULL, {0} }); + copy_start, copy_end, ggml_nbytes(input), NULL, {0}, {0} }); } else { ggml_backend_tensor_copy(input, input_cpy); } @@ -1638,7 +1638,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id, - split_id, copy_start, copy_end, ggml_nbytes(input), NULL, {0} }); + split_id, copy_start, copy_end, ggml_nbytes(input), NULL, {0}, {0} }); } else { ggml_backend_tensor_copy(input, input_cpy); } @@ -1660,7 +1660,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id, - split_id, copy_start, copy_end, ggml_nbytes(input), NULL, {0} }); + split_id, copy_start, copy_end, ggml_nbytes(input), NULL, {0}, {0} }); } } } @@ -2468,7 +2468,7 @@ void ggml_backend_sched_print_profiling(ggml_backend_sched_t sched) { s.max_ns = dur; s.count = 1; s.total_bytes = rec.bytes; - memcpy(s.representative_ne, rec.ne, sizeof(s.representative_ne)); + memcpy(s.representative_ne, rec.ne_src0, sizeof(s.representative_ne)); stats.push_back(s); } } @@ -2571,9 +2571,11 @@ int ggml_backend_sched_write_profiling_json(ggml_backend_sched_t sched, FILE * f fprintf(fp, "null"); } - // Tensor dimensions - fprintf(fp, ", \"ne\": [%lld, %lld, %lld, %lld]", (long long) rec.ne[0], (long long) rec.ne[1], - (long long) rec.ne[2], (long long) rec.ne[3]); + // Tensor dimensions (both source tensors) + fprintf(fp, ", \"ne_src0\": [%lld, %lld, %lld, %lld]", (long long) rec.ne_src0[0], (long long) rec.ne_src0[1], + (long long) rec.ne_src0[2], (long long) rec.ne_src0[3]); + fprintf(fp, ", \"ne_src1\": [%lld, %lld, %lld, %lld]", (long long) rec.ne_src1[0], (long long) rec.ne_src1[1], + (long long) rec.ne_src1[2], (long long) rec.ne_src1[3]); fprintf(fp, "}%s\n", (i < (int) sched->profiling_records.size() - 1) ? "," : ""); } diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp index b52f7f90f7..7d2f7b7682 100644 --- a/ggml/src/ggml-blas/ggml-blas.cpp +++ b/ggml/src/ggml-blas/ggml-blas.cpp @@ -274,7 +274,8 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, rec.end_ns = t_end; rec.bytes = ggml_nbytes(node); rec.extra = NULL; - memcpy(rec.ne, node->ne, sizeof(rec.ne)); + if (node->src[0]) { memcpy(rec.ne_src0, node->src[0]->ne, sizeof(rec.ne_src0)); } else { memset(rec.ne_src0, 0, sizeof(rec.ne_src0)); } + if (node->src[1]) { memcpy(rec.ne_src1, node->src[1]->ne, sizeof(rec.ne_src1)); } else { memset(rec.ne_src1, 0, sizeof(rec.ne_src1)); } ctx->profiling_records.push_back(rec); } } diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 20bea11945..c101e5ece7 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -3005,9 +3005,14 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { if (state->ith == 0) { uint64_t t_end = ggml_profiler_time_ns(); - cplan->profiling_record_fn(cplan->profiling_context, 0 /* GGML_PROFILE_EVENT_OP */, - ggml_op_name(node->op), -1, t_start, t_end, ggml_nbytes(node), NULL, - node->ne); + { + static const int64_t zero_ne[4] = {0, 0, 0, 0}; + const int64_t * src0_ne = node->src[0] ? node->src[0]->ne : zero_ne; + const int64_t * src1_ne = node->src[1] ? node->src[1]->ne : zero_ne; + cplan->profiling_record_fn(cplan->profiling_context, 0 /* GGML_PROFILE_EVENT_OP */, + ggml_op_name(node->op), -1, t_start, t_end, ggml_nbytes(node), NULL, + src0_ne, src1_ne); + } } if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index 2ee638fe02..f5fe903a0f 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -182,7 +182,8 @@ static void ggml_cpu_profiler_record_callback(void * context, uint64_t end_ns, uint64_t bytes, const char * extra, - const int64_t ne[4]) { + const int64_t ne_src0[4], + const int64_t ne_src1[4]) { auto * cpu_ctx = (ggml_backend_cpu_context *) context; ggml_profile_record rec; rec.type = (enum ggml_profile_event_type) type; @@ -193,10 +194,15 @@ static void ggml_cpu_profiler_record_callback(void * context, rec.end_ns = end_ns; rec.bytes = bytes; rec.extra = extra; - if (ne) { - memcpy(rec.ne, ne, sizeof(rec.ne)); + if (ne_src0) { + memcpy(rec.ne_src0, ne_src0, sizeof(rec.ne_src0)); } else { - memset(rec.ne, 0, sizeof(rec.ne)); + memset(rec.ne_src0, 0, sizeof(rec.ne_src0)); + } + if (ne_src1) { + memcpy(rec.ne_src1, ne_src1, sizeof(rec.ne_src1)); + } else { + memset(rec.ne_src1, 0, sizeof(rec.ne_src1)); } cpu_ctx->profiling_records.push_back(rec); } diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 76a0be4576..1a92a1f9c8 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -133,7 +133,8 @@ struct ggml_cuda_profiler_state { event_count++; } - void record_end(const char * name, int backend_id, int split_id, uint64_t bytes, const char * extra, const int64_t ne[4]) { + void record_end(const char * name, int backend_id, int split_id, uint64_t bytes, const char * extra, + const int64_t ne_src0[4], const int64_t ne_src1[4]) { cudaEvent_t ev; cudaEventCreate(&ev); cudaEventRecord(ev, stream); @@ -149,11 +150,8 @@ struct ggml_cuda_profiler_state { rec.end_ns = 0; rec.bytes = bytes; rec.extra = extra; - if (ne) { - memcpy(rec.ne, ne, sizeof(rec.ne)); - } else { - memset(rec.ne, 0, sizeof(rec.ne)); - } + if (ne_src0) { memcpy(rec.ne_src0, ne_src0, sizeof(rec.ne_src0)); } else { memset(rec.ne_src0, 0, sizeof(rec.ne_src0)); } + if (ne_src1) { memcpy(rec.ne_src1, ne_src1, sizeof(rec.ne_src1)); } else { memset(rec.ne_src1, 0, sizeof(rec.ne_src1)); } records.push_back(rec); } @@ -4133,7 +4131,8 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud cuda_ctx->profiler_state->split_id, ggml_nbytes(node), nullptr, - node->ne + node->src[0] ? node->src[0]->ne : nullptr, + node->src[1] ? node->src[1]->ne : nullptr ); } diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 6f3fc5886d..d05499807d 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -4082,7 +4082,8 @@ ggml_backend_t ggml_backend_opencl_init(void) { /* .guid = */ ggml_backend_opencl_guid(), /* .iface = */ ggml_backend_opencl_i, /* .device = */ dev, - /* .context = */ backend_ctx + /* .context = */ backend_ctx, + /* .profiler = */ nullptr, }; return backend; @@ -5897,6 +5898,7 @@ static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, co /* .interface = */ ggml_backend_opencl_i, /* .device = */ dev, /* .context = */ backend_ctx, + /* .profiler = */ nullptr, }; return backend; diff --git a/ggml/src/ggml-profiler.cpp b/ggml/src/ggml-profiler.cpp index 7d5d4c2ca1..3dc60595ff 100644 --- a/ggml/src/ggml-profiler.cpp +++ b/ggml/src/ggml-profiler.cpp @@ -27,13 +27,6 @@ uint64_t ggml_profiler_time_ns(void) { QueryPerformanceFrequency(&freq); QueryPerformanceCounter(&count); return (uint64_t) (count.QuadPart * 1000000000ULL / freq.QuadPart); -#elif defined(__APPLE__) - clock_serv_t cclock; - mach_timespec_t mts; - host_get_clock_service(mach_host_self(), SYSTEM_CLOCK, &cclock); - clock_get_time(cclock, &mts); - mach_port_deallocate(mach_task_self(), cclock); - return (uint64_t) mts.tv_sec * 1000000000ULL + (uint64_t) mts.tv_nsec; #elif defined(CLOCK_MONOTONIC_RAW) struct timespec ts; clock_gettime(CLOCK_MONOTONIC_RAW, &ts); diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp index 1378ba9f5b..24dca4503d 100644 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp @@ -952,7 +952,8 @@ ggml_backend_t ggml_backend_rpc_init(const char * endpoint, uint32_t device) { /* .guid = */ ggml_backend_rpc_guid(), /* .iface = */ ggml_backend_rpc_interface, /* .device = */ ggml_backend_reg_dev_get(reg, device), - /* .context = */ ctx + /* .context = */ ctx, + /* .profiler = */ nullptr, }; return backend; } diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 456b1699fa..22060441be 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -5101,7 +5101,8 @@ ggml_backend_t ggml_backend_sycl_init(int device) { /* .guid = */ ggml_backend_sycl_guid(), /* .iface = */ ggml_backend_sycl_interface, /* .device = */ ggml_backend_reg_dev_get(ggml_backend_sycl_reg(), device), - /* .context = */ ctx + /* .context = */ ctx, + /* .profiler = */ nullptr, }; return sycl_backend; diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 15ed5b2a79..4ecd3d6708 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -14994,7 +14994,8 @@ ggml_backend_t ggml_backend_vk_init(size_t dev_num) { /* .guid = */ ggml_backend_vk_guid(), /* .iface = */ ggml_backend_vk_interface, /* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), dev_num), - /* .context = */ ctx, + /* .context = */ ctx, + /* .profiler = */ nullptr, }; if (!ctx->device->support_async) { diff --git a/ggml/src/ggml-zendnn/ggml-zendnn.cpp b/ggml/src/ggml-zendnn/ggml-zendnn.cpp index c876030400..a78ada8f2f 100644 --- a/ggml/src/ggml-zendnn/ggml-zendnn.cpp +++ b/ggml/src/ggml-zendnn/ggml-zendnn.cpp @@ -264,7 +264,8 @@ ggml_backend_t ggml_backend_zendnn_init(void) { /* .guid = */ ggml_backend_zendnn_guid(), /* .iface = */ ggml_backend_zendnn_i, /* .device = */ ggml_backend_reg_dev_get(ggml_backend_zendnn_reg(), 0), - /* .context = */ ctx, + /* .context = */ ctx, + /* .profiler = */ nullptr, }; return backend; diff --git a/tools/profiler/profiler.py b/tools/profiler/profiler.py index a958a278ee..3dd5f6b1da 100644 --- a/tools/profiler/profiler.py +++ b/tools/profiler/profiler.py @@ -31,7 +31,8 @@ class ProfileRecord: duration_ns: int bytes: int extra: Optional[str] - ne: list[int] = field(default_factory=lambda: [0, 0, 0, 0]) + ne_src0: list[int] = field(default_factory=lambda: [0, 0, 0, 0]) + ne_src1: list[int] = field(default_factory=lambda: [0, 0, 0, 0]) @property def type_name(self) -> str: @@ -52,22 +53,21 @@ class ProfileRecord: return 0.0 return self.bytes / self.duration_ns - @property - def shape_str(self) -> str: - """Human-readable tensor shape string, e.g. '[4096, 4096]'.""" - dims = [n for n in self.ne if n > 0] + @staticmethod + def _fmt_ne(ne: list[int]) -> str: + dims = [n for n in ne if n > 0] if not dims: return "" return "[" + ", ".join(str(d) for d in dims) + "]" @property - def ne_elements(self) -> int: - """Total number of elements.""" - result = 1 - for n in self.ne: - if n > 0: - result *= n - return result + def shape_str(self) -> str: + """Human-readable tensor shapes, e.g. '[4096, 4096] x [4096, 1]'.""" + s0 = self._fmt_ne(self.ne_src0) + s1 = self._fmt_ne(self.ne_src1) + if s0 and s1: + return s0 + " x " + s1 + return s0 or s1 def to_dict(self) -> dict: return { @@ -79,7 +79,8 @@ class ProfileRecord: "duration_ns": self.duration_ns, "bytes": self.bytes, "extra": self.extra, - "ne": self.ne, + "ne_src0": self.ne_src0, + "ne_src1": self.ne_src1, } @@ -148,12 +149,17 @@ class ProfileData: print(f"Warning: file may not be a ggml profiler output (profiler={data.get('profiler')})") records = [] + def _pad_ne(v): + if isinstance(v, list) and len(v) < 4: + return v + [0] * (4 - len(v)) + if not isinstance(v, list): + return [0, 0, 0, 0] + return v + for r in data.get("records", []): - ne = r.get("ne", [0, 0, 0, 0]) - if isinstance(ne, list) and len(ne) < 4: - ne = ne + [0] * (4 - len(ne)) - elif not isinstance(ne, list): - ne = [0, 0, 0, 0] + # Support both old "ne" format and new "ne_src0"/"ne_src1" format + ne_src0 = _pad_ne(r.get("ne_src0", r.get("ne", [0, 0, 0, 0]))) + ne_src1 = _pad_ne(r.get("ne_src1", [0, 0, 0, 0])) records.append(ProfileRecord( type=r.get("type", 0), name=r.get("name", "unknown"), @@ -163,7 +169,8 @@ class ProfileData: duration_ns=r.get("duration_ns", 0), bytes=r.get("bytes", 0), extra=r.get("extra"), - ne=ne, + ne_src0=ne_src0, + ne_src1=ne_src1, )) backends_raw = data.get("backends", []) @@ -205,7 +212,7 @@ class ProfileData: backend_id=rec.backend_id, min_ns=rec.duration_ns, max_ns=rec.duration_ns, - representative_ne=list(rec.ne), + representative_ne=list(rec.ne_src0), ) s = groups[key] s.count += 1 @@ -532,7 +539,7 @@ function hash(s){var h=0;for(var i=0;i=1e6)return(us/1e6).toFixed(2)+'s';if(us>=1e3)return(us/1e3).toFixed(2)+'ms';return us.toFixed(1)+'\u03bcs';} function fmtB(b){if(!b)return'';if(b>=1e9)return(b/1e9).toFixed(1)+'GB';if(b>=1e6)return(b/1e6).toFixed(1)+'MB';if(b>=1e3)return(b/1e3).toFixed(1)+'KB';return b+'B';} -function fmtSh(s){if(!s)return'';return s.replace(/[\[\],]/g,function(m){return''+m+'';});} +function fmtSh(s){if(!s)return'';return s.replace(/[\[\],]| x /g,function(m){return''+m+'';});} // Canvas state var canvas=document.getElementById('c');