diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h
index 76305d71de..43885350cf 100644
--- a/ggml/include/ggml-cpu.h
+++ b/ggml/include/ggml-cpu.h
@@ -28,7 +28,7 @@ extern "C" {
         void * profiling_context;
 
         // callback for recording a profile record from C code (set by backend when profiling)
-        // params: context, type (0=OP, 1=COPY), name, split_id, start_ns, end_ns, bytes, extra, ne[4]
+        // params: context, type (enum ggml_profile_event_type as int, e.g. 0=OP), name, split_id, start_ns, end_ns, bytes, extra, ne_src0[4], ne_src1[4]
         void (*profiling_record_fn)(void *        context,
                                     int           type,
                                     const char *  name,
@@ -37,7 +37,8 @@ extern "C" {
                                     uint64_t      end_ns,
                                     uint64_t      bytes,
                                     const char *  extra,
-                                    const int64_t ne[4]);
+                                    const int64_t ne_src0[4],
+                                    const int64_t ne_src1[4]);
     };
 
     // numa strategies
diff --git a/ggml/include/ggml-profiler.h b/ggml/include/ggml-profiler.h
index 773b0635f1..5026d45001 100644
--- a/ggml/include/ggml-profiler.h
+++ b/ggml/include/ggml-profiler.h
@@ -27,7 +27,8 @@ typedef struct ggml_profile_record {
     uint64_t                     end_ns;      // end timestamp in nanoseconds
     uint64_t                     bytes;       // bytes transferred (for copy) or tensor size (for ops)
     const char *                 extra;       // fusion name for fused ops, or NULL
-    int64_t                      ne[4];       // output tensor dimensions [ne0, ne1, ne2, ne3]
+    int64_t                      ne_src0[4];  // src[0] tensor dimensions (e.g. weight matrix for MUL_MAT); all zeros when there is no src[0]
+    int64_t                      ne_src1[4];  // src[1] tensor dimensions (e.g. input matrix for MUL_MAT); all zeros when there is no src[1]
 } ggml_profile_record;
 
 // Backend profiler interface - each backend optionally implements this
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index d341b245af..b8b204fc60 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1515,7 +1515,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                     }
 
                     sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id, split_id,
-                                                    copy_start, copy_end, ggml_nbytes(input), NULL, {0} });
+                                                    copy_start, copy_end, ggml_nbytes(input), NULL, {0}, {0} });
                 } else {
                     ggml_backend_tensor_copy(input, input_cpy);
                 }
@@ -1638,7 +1638,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                             }
 
                             sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id,
-                                                            split_id, copy_start, copy_end, ggml_nbytes(input), NULL, {0} });
+                                                            split_id, copy_start, copy_end, ggml_nbytes(input), NULL, {0}, {0} });
                         } else {
                             ggml_backend_tensor_copy(input, input_cpy);
                         }
@@ -1660,7 +1660,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                             }
 
                             sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id,
-                                                            split_id, copy_start, copy_end, ggml_nbytes(input), NULL, {0} });
+                                                            split_id, copy_start, copy_end, ggml_nbytes(input), NULL, {0}, {0} });
                         }
                     }
                 }
@@ -2468,7 +2468,7 @@ void ggml_backend_sched_print_profiling(ggml_backend_sched_t sched) {
             s.max_ns      = dur;
             s.count       = 1;
             s.total_bytes = rec.bytes;
-            memcpy(s.representative_ne, rec.ne, sizeof(s.representative_ne));
+            memcpy(s.representative_ne, rec.ne_src0, sizeof(s.representative_ne));
             stats.push_back(s);
         }
     }
@@ -2571,9 +2571,11 @@ int ggml_backend_sched_write_profiling_json(ggml_backend_sched_t sched, FILE * f
             fprintf(fp, "null");
         }
 
-        // Tensor dimensions
-        fprintf(fp, ", \"ne\": [%lld, %lld, %lld, %lld]", (long long) rec.ne[0], (long long) rec.ne[1],
-                (long long) rec.ne[2], (long long) rec.ne[3]);
+        // Tensor dimensions (both source tensors)
+        fprintf(fp, ", \"ne_src0\": [%lld, %lld, %lld, %lld]", (long long) rec.ne_src0[0], (long long) rec.ne_src0[1],
+                (long long) rec.ne_src0[2], (long long) rec.ne_src0[3]);
+        fprintf(fp, ", \"ne_src1\": [%lld, %lld, %lld, %lld]", (long long) rec.ne_src1[0], (long long) rec.ne_src1[1],
+                (long long) rec.ne_src1[2], (long long) rec.ne_src1[3]);
 
         fprintf(fp, "}%s\n", (i < (int) sched->profiling_records.size() - 1) ? "," : "");
     }
diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp
index b52f7f90f7..7d2f7b7682 100644
--- a/ggml/src/ggml-blas/ggml-blas.cpp
+++ b/ggml/src/ggml-blas/ggml-blas.cpp
@@ -274,7 +274,8 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend,
             rec.end_ns     = t_end;
             rec.bytes      = ggml_nbytes(node);
             rec.extra      = NULL;
-            memcpy(rec.ne, node->ne, sizeof(rec.ne));
+            if (node->src[0]) { memcpy(rec.ne_src0, node->src[0]->ne, sizeof(rec.ne_src0)); } else { memset(rec.ne_src0, 0, sizeof(rec.ne_src0)); }
+            if (node->src[1]) { memcpy(rec.ne_src1, node->src[1]->ne, sizeof(rec.ne_src1)); } else { memset(rec.ne_src1, 0, sizeof(rec.ne_src1)); }
             ctx->profiling_records.push_back(rec);
         }
     }
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 20bea11945..c101e5ece7 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -3005,9 +3005,14 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
             if (state->ith == 0) {
                 uint64_t t_end = ggml_profiler_time_ns();
-                cplan->profiling_record_fn(cplan->profiling_context, 0 /* GGML_PROFILE_EVENT_OP */,
-                                           ggml_op_name(node->op), -1, t_start, t_end, ggml_nbytes(node), NULL,
-                                           node->ne);
+                {
+                    static const int64_t zero_ne[4] = {0, 0, 0, 0};
+                    const int64_t * src0_ne = node->src[0] ? node->src[0]->ne : zero_ne;
+                    const int64_t * src1_ne = node->src[1] ? node->src[1]->ne : zero_ne;
+                    cplan->profiling_record_fn(cplan->profiling_context, 0 /* GGML_PROFILE_EVENT_OP */,
+                                               ggml_op_name(node->op), -1, t_start, t_end, ggml_nbytes(node), NULL,
+                                               src0_ne, src1_ne);
+                }
             }
 
             if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
index 2ee638fe02..f5fe903a0f 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -182,7 +182,8 @@ static void ggml_cpu_profiler_record_callback(void *        context,
                                               uint64_t      end_ns,
                                               uint64_t      bytes,
                                               const char *  extra,
-                                              const int64_t ne[4]) {
+                                              const int64_t ne_src0[4],
+                                              const int64_t ne_src1[4]) {
     auto *              cpu_ctx = (ggml_backend_cpu_context *) context;
     ggml_profile_record rec;
     rec.type       = (enum ggml_profile_event_type) type;
@@ -193,10 +194,15 @@ static void ggml_cpu_profiler_record_callback(void *        context,
     rec.end_ns     = end_ns;
     rec.bytes      = bytes;
     rec.extra      = extra;
-    if (ne) {
-        memcpy(rec.ne, ne, sizeof(rec.ne));
+    if (ne_src0) {
+        memcpy(rec.ne_src0, ne_src0, sizeof(rec.ne_src0));
     } else {
-        memset(rec.ne, 0, sizeof(rec.ne));
+        memset(rec.ne_src0, 0, sizeof(rec.ne_src0));
+    }
+    if (ne_src1) {
+        memcpy(rec.ne_src1, ne_src1, sizeof(rec.ne_src1));
+    } else {
+        memset(rec.ne_src1, 0, sizeof(rec.ne_src1));
     }
     cpu_ctx->profiling_records.push_back(rec);
 }
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 76a0be4576..1a92a1f9c8 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -133,7 +133,8 @@ struct ggml_cuda_profiler_state {
         event_count++;
     }
 
-    void record_end(const char * name, int backend_id, int split_id, uint64_t bytes, const char * extra, const int64_t ne[4]) {
+    void record_end(const char * name, int backend_id, int split_id, uint64_t bytes, const char * extra,
+                    const int64_t ne_src0[4], const int64_t ne_src1[4]) {
         cudaEvent_t ev;
         cudaEventCreate(&ev);
         cudaEventRecord(ev, stream);
@@ -149,11 +150,8 @@ struct ggml_cuda_profiler_state {
         rec.end_ns = 0;
         rec.bytes = bytes;
         rec.extra = extra;
-        if (ne) {
-            memcpy(rec.ne, ne, sizeof(rec.ne));
-        } else {
-            memset(rec.ne, 0, sizeof(rec.ne));
-        }
+        if (ne_src0) { memcpy(rec.ne_src0, ne_src0, sizeof(rec.ne_src0)); } else { memset(rec.ne_src0, 0, sizeof(rec.ne_src0)); }
+        if (ne_src1) { memcpy(rec.ne_src1, ne_src1, sizeof(rec.ne_src1)); } else { memset(rec.ne_src1, 0, sizeof(rec.ne_src1)); }
         records.push_back(rec);
     }
 
@@ -4133,7 +4131,8 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
                         cuda_ctx->profiler_state->split_id,
                         ggml_nbytes(node),
                         nullptr,
-                        node->ne
+                        node->src[0] ? node->src[0]->ne : nullptr,
+                        node->src[1] ? node->src[1]->ne : nullptr
                     );
                 }
 
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index 6f3fc5886d..d05499807d 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -4082,7 +4082,8 @@ ggml_backend_t ggml_backend_opencl_init(void) {
         /* .guid    = */ ggml_backend_opencl_guid(),
         /* .iface   = */ ggml_backend_opencl_i,
         /* .device  = */ dev,
-        /* .context = */ backend_ctx
+        /* .context  = */ backend_ctx,
+        /* .profiler = */ nullptr,
     };
 
     return backend;
@@ -5897,6 +5898,7 @@ static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, co
         /* .interface = */ ggml_backend_opencl_i,
         /* .device    = */ dev,
         /* .context   = */ backend_ctx,
+        /* .profiler  = */ nullptr,
     };
 
     return backend;
diff --git a/ggml/src/ggml-profiler.cpp b/ggml/src/ggml-profiler.cpp
index 7d5d4c2ca1..3dc60595ff 100644
--- a/ggml/src/ggml-profiler.cpp
+++ b/ggml/src/ggml-profiler.cpp
@@ -27,13 +27,6 @@ uint64_t ggml_profiler_time_ns(void) {
     QueryPerformanceFrequency(&freq);
     QueryPerformanceCounter(&count);
     return (uint64_t) (count.QuadPart * 1000000000ULL / freq.QuadPart);
-#elif defined(__APPLE__)
-    clock_serv_t    cclock;
-    mach_timespec_t mts;
-    host_get_clock_service(mach_host_self(), SYSTEM_CLOCK, &cclock);
-    clock_get_time(cclock, &mts);
-    mach_port_deallocate(mach_task_self(), cclock);
-    return (uint64_t) mts.tv_sec * 1000000000ULL + (uint64_t) mts.tv_nsec;
 #elif defined(CLOCK_MONOTONIC_RAW)
     struct timespec ts;
     clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
index 1378ba9f5b..24dca4503d 100644
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -952,7 +952,8 @@ ggml_backend_t ggml_backend_rpc_init(const char * endpoint, uint32_t device) {
         /* .guid    = */ ggml_backend_rpc_guid(),
         /* .iface   = */ ggml_backend_rpc_interface,
         /* .device  = */ ggml_backend_reg_dev_get(reg, device),
-        /* .context = */ ctx
+        /* .context  = */ ctx,
+        /* .profiler = */ nullptr,
     };
     return backend;
 }
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 456b1699fa..22060441be 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -5101,7 +5101,8 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
         /* .guid    = */ ggml_backend_sycl_guid(),
         /* .iface   = */ ggml_backend_sycl_interface,
         /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_sycl_reg(), device),
-        /* .context = */ ctx
+        /* .context  = */ ctx,
+        /* .profiler = */ nullptr,
     };
 
     return sycl_backend;
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 15ed5b2a79..4ecd3d6708 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -14994,7 +14994,8 @@ ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
         /* .guid    = */ ggml_backend_vk_guid(),
         /* .iface   = */ ggml_backend_vk_interface,
         /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), dev_num),
-        /* .context = */ ctx,
+        /* .context  = */ ctx,
+        /* .profiler = */ nullptr,
     };
 
     if (!ctx->device->support_async) {
diff --git a/ggml/src/ggml-zendnn/ggml-zendnn.cpp b/ggml/src/ggml-zendnn/ggml-zendnn.cpp
index c876030400..a78ada8f2f 100644
--- a/ggml/src/ggml-zendnn/ggml-zendnn.cpp
+++ b/ggml/src/ggml-zendnn/ggml-zendnn.cpp
@@ -264,7 +264,8 @@ ggml_backend_t ggml_backend_zendnn_init(void) {
         /* .guid    = */ ggml_backend_zendnn_guid(),
         /* .iface   = */ ggml_backend_zendnn_i,
         /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_zendnn_reg(), 0),
-        /* .context = */ ctx,
+        /* .context  = */ ctx,
+        /* .profiler = */ nullptr,
     };
 
     return backend;
diff --git a/tools/profiler/profiler.py b/tools/profiler/profiler.py
index a958a278ee..3dd5f6b1da 100644
--- a/tools/profiler/profiler.py
+++ b/tools/profiler/profiler.py
@@ -31,7 +31,8 @@ class ProfileRecord:
     duration_ns: int
     bytes: int
     extra: Optional[str]
-    ne: list[int] = field(default_factory=lambda: [0, 0, 0, 0])
+    ne_src0: list[int] = field(default_factory=lambda: [0, 0, 0, 0])
+    ne_src1: list[int] = field(default_factory=lambda: [0, 0, 0, 0])
 
     @property
     def type_name(self) -> str:
@@ -52,22 +53,21 @@ class ProfileRecord:
             return 0.0
         return self.bytes / self.duration_ns
 
-    @property
-    def shape_str(self) -> str:
-        """Human-readable tensor shape string, e.g. '[4096, 4096]'."""
-        dims = [n for n in self.ne if n > 0]
+    @staticmethod
+    def _fmt_ne(ne: list[int]) -> str:
+        dims = [n for n in ne if n > 0]
         if not dims:
             return ""
         return "[" + ", ".join(str(d) for d in dims) + "]"
 
     @property
-    def ne_elements(self) -> int:
-        """Total number of elements."""
-        result = 1
-        for n in self.ne:
-            if n > 0:
-                result *= n
-        return result
+    def shape_str(self) -> str:
+        """Human-readable tensor shapes, e.g. '[4096, 4096] x [4096, 1]'."""
+        s0 = self._fmt_ne(self.ne_src0)
+        s1 = self._fmt_ne(self.ne_src1)
+        if s0 and s1:
+            return s0 + " x " + s1
+        return s0 or s1
 
     def to_dict(self) -> dict:
         return {
@@ -79,7 +79,8 @@ class ProfileRecord:
             "duration_ns": self.duration_ns,
             "bytes": self.bytes,
             "extra": self.extra,
-            "ne": self.ne,
+            "ne_src0": self.ne_src0,
+            "ne_src1": self.ne_src1,
         }
 
 
@@ -148,12 +149,17 @@ class ProfileData:
             print(f"Warning: file may not be a ggml profiler output (profiler={data.get('profiler')})")
 
         records = []
+        def _pad_ne(v):
+            if isinstance(v, list) and len(v) < 4:
+                return v + [0] * (4 - len(v))
+            if not isinstance(v, list):
+                return [0, 0, 0, 0]
+            return v
+
         for r in data.get("records", []):
-            ne = r.get("ne", [0, 0, 0, 0])
-            if isinstance(ne, list) and len(ne) < 4:
-                ne = ne + [0] * (4 - len(ne))
-            elif not isinstance(ne, list):
-                ne = [0, 0, 0, 0]
+            # Accept the new "ne_src0"/"ne_src1" keys; a legacy "ne" key (output tensor dims) is loaded as ne_src0
+            ne_src0 = _pad_ne(r.get("ne_src0", r.get("ne", [0, 0, 0, 0])))
+            ne_src1 = _pad_ne(r.get("ne_src1", [0, 0, 0, 0]))
             records.append(ProfileRecord(
                 type=r.get("type", 0),
                 name=r.get("name", "unknown"),
@@ -163,7 +169,8 @@ class ProfileData:
                 duration_ns=r.get("duration_ns", 0),
                 bytes=r.get("bytes", 0),
                 extra=r.get("extra"),
-                ne=ne,
+                ne_src0=ne_src0,
+                ne_src1=ne_src1,
             ))
 
         backends_raw = data.get("backends", [])
@@ -205,7 +212,7 @@ class ProfileData:
                     backend_id=rec.backend_id,
                     min_ns=rec.duration_ns,
                     max_ns=rec.duration_ns,
-                    representative_ne=list(rec.ne),
+                    representative_ne=list(rec.ne_src0),
                 )
             s = groups[key]
             s.count += 1
@@ -532,7 +539,7 @@ function hash(s){var h=0;for(var i=0;i<s.length;i++)h=((h<<5)-h)+s.charCodeAt(i)
 function col(n){return OP_COL[n]||('hsl('+hash(n)%360+',60%,55%)');}
 function fmtT(us){if(us>=1e6)return(us/1e6).toFixed(2)+'s';if(us>=1e3)return(us/1e3).toFixed(2)+'ms';return us.toFixed(1)+'\u03bcs';}
 function fmtB(b){if(!b)return'';if(b>=1e9)return(b/1e9).toFixed(1)+'GB';if(b>=1e6)return(b/1e6).toFixed(1)+'MB';if(b>=1e3)return(b/1e3).toFixed(1)+'KB';return b+'B';}
-function fmtSh(s){if(!s)return'';return s.replace(/[\[\],]/g,function(m){return'<span style="color:#e8a040">'+m+'</span>';});}
+function fmtSh(s){if(!s)return'';return s.replace(/[\[\],]| x /g,function(m){return'<span style="color:#e8a040">'+m+'</span>';});}
 
 // Canvas state
 var canvas=document.getElementById('c');