Fix mul_mat_id stats, add throughput stat, add env-var trigger, fix concurrent-mode reporting

This commit is contained in:
Piotr Wilkin 2026-03-29 22:52:33 +02:00
parent 64e438465f
commit 9da7e4e74f
12 changed files with 459 additions and 93 deletions

View File

@ -251,11 +251,17 @@ int main(int argc, char ** argv) {
if (params.profiling_output.empty()) {
ggml_backend_sched_print_profiling(sched);
} else {
int ret = ggml_backend_sched_export_profiling_json(sched, params.profiling_output.c_str());
if (ret == 0) {
LOG("\nProfiling data exported to: %s\n", params.profiling_output.c_str());
const std::string & path = params.profiling_output;
int ret;
if (path.size() >= 4 && path.compare(path.size() - 4, 4, ".txt") == 0) {
ret = ggml_backend_sched_export_profiling_text(sched, path.c_str());
} else {
LOG_ERR("\nFailed to export profiling data to: %s\n", params.profiling_output.c_str());
ret = ggml_backend_sched_export_profiling_json(sched, path.c_str());
}
if (ret == 0) {
LOG("\nProfiling data exported to: %s\n", path.c_str());
} else {
LOG_ERR("\nFailed to export profiling data to: %s\n", path.c_str());
}
}
}

View File

@ -28,7 +28,7 @@ extern "C" {
void * profiling_context;
// callback for recording a profile record from C code (set by backend when profiling)
// params: context, type, name, split_id, start_ns, end_ns, bytes, extra, ne_src0[4], ne_src1[4]
// params: context, type, name, split_id, start_ns, end_ns, bytes, extra, ne_src0[4], ne_src1[4], ne_src2[4]
void (*profiling_record_fn)(void * context,
int type,
const char * name,
@ -38,7 +38,8 @@ extern "C" {
uint64_t bytes,
const char * extra,
const int64_t ne_src0[4],
const int64_t ne_src1[4]);
const int64_t ne_src1[4],
const int64_t ne_src2[4]);
};
// numa strategies

View File

@ -29,6 +29,7 @@ typedef struct ggml_profile_record {
const char * extra; // fusion name for fused ops, or NULL
int64_t ne_src0[4]; // src[0] tensor dimensions (e.g. weight matrix for MUL_MAT)
int64_t ne_src1[4]; // src[1] tensor dimensions (e.g. input matrix for MUL_MAT)
int64_t ne_src2[4]; // src[2] tensor dimensions (e.g. ids for MUL_MAT_ID)
} ggml_profile_record;
// Backend profiler interface - each backend optionally implements this
@ -99,6 +100,13 @@ GGML_API int ggml_backend_sched_export_profiling_json(ggml_backend_sched_t sched
// Export profiling data as JSON to a FILE pointer
GGML_API int ggml_backend_sched_write_profiling_json(ggml_backend_sched_t sched, FILE * fp);
// Export profiling data as plain text statistics to a file
// Returns 0 on success, -1 on error
GGML_API int ggml_backend_sched_export_profiling_text(ggml_backend_sched_t sched, const char * filepath);
// Export profiling data as plain text statistics to a FILE pointer
GGML_API int ggml_backend_sched_write_profiling_text(ggml_backend_sched_t sched, FILE * fp);
#ifdef __cplusplus
}
#endif

View File

@ -21,6 +21,7 @@
#include <stdlib.h>
#include <string.h>
#include <algorithm>
#include <string>
#include <vector>
#ifdef __APPLE__
@ -749,8 +750,17 @@ struct ggml_backend_sched {
// profiling
bool profiling_enabled;
std::vector<ggml_profile_record> copy_records; // copy events recorded by the scheduler
std::vector<ggml_profile_record> profiling_records; // merged records from all sources
std::string profiling_env_path; // GGML_PROFILE env var value (for auto-export on free)
std::vector<ggml_profile_record> copy_records; // copy events recorded by the scheduler
std::vector<ggml_profile_record> profiling_records; // merged records from all sources
// Cached backend metadata for safe access during auto-export (backends may be freed first)
struct backend_meta {
std::string name;
std::string device;
int device_type;
};
std::vector<backend_meta> profiling_backend_meta;
};
#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
@ -1515,7 +1525,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
}
sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id, split_id,
copy_start, copy_end, ggml_nbytes(input), NULL, {0}, {0} });
copy_start, copy_end, ggml_nbytes(input), NULL, {0}, {0}, {0} });
} else {
ggml_backend_tensor_copy(input, input_cpy);
}
@ -1635,7 +1645,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id,
split_id, moe_copy_start, moe_copy_end,
(uint64_t) total_copied_bytes, NULL, {0}, {0} });
(uint64_t) total_copied_bytes, NULL, {0}, {0}, {0} });
}
} else {
// try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
@ -1671,7 +1681,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
}
sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id,
split_id, copy_start, copy_end, ggml_nbytes(input), NULL, {0}, {0} });
split_id, copy_start, copy_end, ggml_nbytes(input), NULL, {0}, {0}, {0} });
} else {
ggml_backend_tensor_copy(input, input_cpy);
}
@ -1691,7 +1701,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
}
sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id,
split_id, copy_start, copy_end, ggml_nbytes(input), NULL, {0}, {0} });
split_id, copy_start, copy_end, ggml_nbytes(input), NULL, {0}, {0}, {0} });
}
}
}
@ -1837,7 +1847,24 @@ ggml_backend_sched_t ggml_backend_sched_new(
sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
sched->op_offload = op_offload;
sched->profiling_enabled = (getenv("GGML_PROFILE") != NULL);
const char * profile_env = getenv("GGML_PROFILE");
if (profile_env != NULL) {
sched->profiling_enabled = true;
sched->profiling_env_path = profile_env;
}
// Cache backend metadata for safe access during auto-export
for (int b = 0; b < n_backends; b++) {
ggml_backend_sched::backend_meta meta;
meta.name = ggml_backend_name(backends[b]);
meta.device = "unknown";
meta.device_type = 0;
if (backends[b]->device != NULL) {
meta.device = ggml_backend_dev_name(backends[b]->device);
meta.device_type = (int) ggml_backend_dev_type(backends[b]->device);
}
sched->profiling_backend_meta.push_back(std::move(meta));
}
ggml_backend_sched_reset(sched);
@ -1848,6 +1875,33 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
if (sched == NULL) {
return;
}
// Auto-export profiling data if enabled via GGML_PROFILE env var
// GGML_PROFILE=1 or GGML_PROFILE="" → print to stdout
// GGML_PROFILE=file.json → export JSON
// GGML_PROFILE=file.txt → export text
if (!sched->profiling_records.empty() && getenv("GGML_PROFILE") != NULL) {
const std::string & path = sched->profiling_env_path;
if (path.empty() || path == "1" || path == "stdout") {
ggml_backend_sched_print_profiling(sched);
} else if (path.size() >= 4 && path.compare(path.size() - 4, 4, ".txt") == 0) {
int ret = ggml_backend_sched_export_profiling_text(sched, path.c_str());
if (ret == 0) {
GGML_LOG_INFO("[profiler] Data exported to: %s\n", path.c_str());
} else {
GGML_LOG_ERROR("[profiler] Failed to export data to: %s\n", path.c_str());
}
} else {
// Default to JSON for any other path (including .json)
int ret = ggml_backend_sched_export_profiling_json(sched, path.c_str());
if (ret == 0) {
GGML_LOG_INFO("[profiler] Data exported to: %s\n", path.c_str());
} else {
GGML_LOG_ERROR("[profiler] Failed to export data to: %s\n", path.c_str());
}
}
}
for (int b = 0; b < sched->n_backends; b++) {
for (int c = 0; c < sched->n_copies; c++) {
ggml_backend_event_free(sched->events[b][c]);
@ -2518,18 +2572,16 @@ void ggml_backend_sched_print_profiling(ggml_backend_sched_t sched) {
double min_us = (double) s.min_ns / 1000.0;
double max_us = (double) s.max_ns / 1000.0;
if (s.type == GGML_PROFILE_EVENT_COPY) {
GGML_LOG_INFO(
" [%s] backend %d %-28s %7.1f%% count=%-6d total=%8.2f ms avg=%8.2f us min=%8.2f us max=%8.2f us",
type_str[s.type], s.backend_id, s.name, pct, s.count, (double) s.total_ns / 1e6, avg_us, min_us, max_us);
if (s.total_bytes > 0 && s.total_ns > 0) {
double bw_gbps = (double) s.total_bytes / (double) s.total_ns;
GGML_LOG_INFO(
" [%s] backend %d %-28s %7.1f%% count=%-6d total=%8.2f ms avg=%8.2f us min=%8.2f us max=%8.2f us "
" %8.2f GB/s",
type_str[s.type], s.backend_id, s.name, pct, s.count, (double) s.total_ns / 1e6, avg_us, min_us, max_us,
bw_gbps);
} else {
GGML_LOG_INFO(
" [%s] backend %d %-28s %7.1f%% count=%-6d total=%8.2f ms avg=%8.2f us min=%8.2f us max=%8.2f us",
type_str[s.type], s.backend_id, s.name, pct, s.count, (double) s.total_ns / 1e6, avg_us, min_us,
max_us);
if (bw_gbps >= 1000.0) {
GGML_LOG_INFO(" %6.2f TB/s", bw_gbps / 1000.0);
} else {
GGML_LOG_INFO(" %6.2f GB/s", bw_gbps);
}
}
// Print representative tensor shape (first record's ne)
if (s.representative_ne[0] > 0 || s.representative_ne[1] > 0) {
@ -2565,16 +2617,22 @@ int ggml_backend_sched_write_profiling_json(ggml_backend_sched_t sched, FILE * f
fprintf(fp, " \"total_records\": %d,\n", (int) sched->profiling_records.size());
fprintf(fp, " \"total_ns\": %llu,\n", (unsigned long long) total_ns);
// Backend metadata
// Backend metadata (use cached data if available, fall back to live pointers)
fprintf(fp, " \"backends\": [\n");
for (int b = 0; b < sched->n_backends; b++) {
ggml_backend_t backend = sched->backends[b];
const char * name = ggml_backend_name(backend);
const char * dev_name = "unknown";
int dev_type = 0;
if (backend->device != NULL) {
dev_name = ggml_backend_dev_name(backend->device);
dev_type = (int) ggml_backend_dev_type(backend->device);
const char * name = "unknown";
const char * dev_name = "unknown";
int dev_type = 0;
if (b < (int) sched->profiling_backend_meta.size()) {
name = sched->profiling_backend_meta[b].name.c_str();
dev_name = sched->profiling_backend_meta[b].device.c_str();
dev_type = sched->profiling_backend_meta[b].device_type;
} else if (sched->backends[b] != NULL) {
name = ggml_backend_name(sched->backends[b]);
if (sched->backends[b]->device != NULL) {
dev_name = ggml_backend_dev_name(sched->backends[b]->device);
dev_type = (int) ggml_backend_dev_type(sched->backends[b]->device);
}
}
fprintf(fp, " {\"id\": %d, \"name\": \"%s\", \"device\": \"%s\", \"device_type\": %d}%s\n", b, name,
dev_name, dev_type, (b < sched->n_backends - 1) ? "," : "");
@ -2600,11 +2658,13 @@ int ggml_backend_sched_write_profiling_json(ggml_backend_sched_t sched, FILE * f
fprintf(fp, "null");
}
// Tensor dimensions (both source tensors)
// Tensor dimensions (all source tensors)
fprintf(fp, ", \"ne_src0\": [%lld, %lld, %lld, %lld]", (long long) rec.ne_src0[0], (long long) rec.ne_src0[1],
(long long) rec.ne_src0[2], (long long) rec.ne_src0[3]);
fprintf(fp, ", \"ne_src1\": [%lld, %lld, %lld, %lld]", (long long) rec.ne_src1[0], (long long) rec.ne_src1[1],
(long long) rec.ne_src1[2], (long long) rec.ne_src1[3]);
fprintf(fp, ", \"ne_src2\": [%lld, %lld, %lld, %lld]", (long long) rec.ne_src2[0], (long long) rec.ne_src2[1],
(long long) rec.ne_src2[2], (long long) rec.ne_src2[3]);
fprintf(fp, "}%s\n", (i < (int) sched->profiling_records.size() - 1) ? "," : "");
}
@ -2630,3 +2690,254 @@ int ggml_backend_sched_export_profiling_json(ggml_backend_sched_t sched, const c
return ret;
}
// Helper: format ne dimensions as string, e.g. "[4096, 4096, 1]"
// Writes an empty string for an all-zero ne (no shape recorded). Trailing
// dimensions of size <= 1 are dropped, but at least one dimension is printed.
// Output is truncated safely when it does not fit in bufsize bytes: the
// original code could advance pos past bufsize on truncation, making
// `bufsize - pos` wrap around as size_t and write out of bounds.
static void fmt_ne(char * buf, size_t bufsize, const int64_t ne[4]) {
    if (ne[0] == 0 && ne[1] == 0 && ne[2] == 0 && ne[3] == 0) {
        buf[0] = '\0';
        return;
    }
    // drop trailing singleton/empty dims, always keep ne[0]
    int ndims = 4;
    while (ndims > 1 && ne[ndims - 1] <= 1) {
        ndims--;
    }
    int pos = snprintf(buf, bufsize, "[");
    for (int i = 0; i < ndims; i++) {
        // snprintf returns the would-be length, so pos may reach or pass the
        // buffer end on truncation; stop before indexing out of bounds
        if (pos < 0 || pos >= (int) bufsize - 1) {
            return; // truncated: buffer is already NUL-terminated by snprintf
        }
        int n = snprintf(buf + pos, bufsize - pos, "%s%lld", i > 0 ? ", " : "", (long long) ne[i]);
        if (n < 0) {
            return; // encoding error: keep whatever was written so far
        }
        pos += n;
    }
    if (pos >= 0 && pos < (int) bufsize) {
        snprintf(buf + pos, bufsize - pos, "]");
    }
}
// Helper: format bandwidth as string
// Leaves buf empty when there is nothing meaningful to report (zero bytes or
// zero elapsed time); callers test buf[0] to decide whether to print it.
static void fmt_bandwidth(char * buf, size_t bufsize, uint64_t bytes, uint64_t ns) {
    if (bytes == 0 || ns == 0) {
        buf[0] = '\0';
        return;
    }
    // bytes per nanosecond is numerically equal to GB/s
    const double gbps    = (double) bytes / (double) ns;
    const bool   use_tb  = gbps >= 1000.0;
    snprintf(buf, bufsize, "%.2f %s", use_tb ? gbps / 1000.0 : gbps, use_tb ? "TB/s" : "GB/s");
}
// Write aggregated profiling statistics as human-readable plain text to fp.
// Emits three sections: an overall summary, a per-backend breakdown, and a
// per-operation table sorted by total time. Returns 0 on success, including
// the no-data case (a placeholder line is printed instead).
// NOTE(review): fp is assumed to be open for writing; fprintf errors are not
// checked here.
int ggml_backend_sched_write_profiling_text(ggml_backend_sched_t sched, FILE * fp) {
GGML_ASSERT(sched);
GGML_ASSERT(fp != NULL);
if (sched->profiling_records.empty()) {
fprintf(fp, "No profiling data available.\n");
return 0;
}
// Aggregate by (name, type, backend_id)
struct op_stats {
const char * name;                     // op name; borrowed pointer into the record, not owned
enum ggml_profile_event_type type;     // OP vs COPY event
int backend_id;
uint64_t total_ns;
uint64_t min_ns;
uint64_t max_ns;
int count;
uint64_t total_bytes;                  // summed bytes, used for throughput/bandwidth
int64_t representative_ne_src0[4];     // shapes from the first record seen for this key
int64_t representative_ne_src1[4];
int64_t representative_ne_src2[4];
};
// Linear-scan aggregation: O(records * unique keys). The number of unique
// (name, type, backend) keys is small in practice, so no map is needed.
std::vector<op_stats> stats;
for (const auto & rec : sched->profiling_records) {
// guard against clock skew producing end < start
uint64_t dur = (rec.end_ns > rec.start_ns) ? (rec.end_ns - rec.start_ns) : 0;
bool found = false;
for (auto & s : stats) {
if (s.type == rec.type && s.backend_id == rec.backend_id && strcmp(s.name, rec.name) == 0) {
s.total_ns += dur;
s.min_ns = std::min(s.min_ns, dur);
s.max_ns = std::max(s.max_ns, dur);
s.count++;
s.total_bytes += rec.bytes;
found = true;
break;
}
}
if (!found) {
// first record for this key: seed all stats from it
op_stats s = {};
s.name = rec.name;
s.type = rec.type;
s.backend_id = rec.backend_id;
s.total_ns = dur;
s.min_ns = dur;
s.max_ns = dur;
s.count = 1;
s.total_bytes = rec.bytes;
memcpy(s.representative_ne_src0, rec.ne_src0, sizeof(s.representative_ne_src0));
memcpy(s.representative_ne_src1, rec.ne_src1, sizeof(s.representative_ne_src1));
memcpy(s.representative_ne_src2, rec.ne_src2, sizeof(s.representative_ne_src2));
stats.push_back(s);
}
}
// slowest ops first
std::sort(stats.begin(), stats.end(),
[](const op_stats & a, const op_stats & b) { return a.total_ns > b.total_ns; });
uint64_t grand_total = 0;
for (const auto & s : stats) {
grand_total += s.total_ns;
}
// --- Section 1: Overall summary ---
fprintf(fp, "=== Profiling Summary ===\n");
fprintf(fp, "Total time: %.2f ms\n", (double) grand_total / 1e6);
fprintf(fp, "Total records: %d\n", (int) sched->profiling_records.size());
fprintf(fp, "Unique ops: %d\n\n", (int) stats.size());
// --- Section 2: Per-backend breakdown ---
fprintf(fp, "=== Per-Backend Summary ===\n");
{
// second-level aggregation: fold the per-op stats down to one row per
// backend, split into compute (OP) and transfer (COPY) totals
struct backend_stats {
int backend_id;
int op_count;
int copy_count;
uint64_t op_ns;
uint64_t copy_ns;
uint64_t op_bytes;
uint64_t copy_bytes;
};
std::vector<backend_stats> bstats;
for (const auto & s : stats) {
bool found = false;
for (auto & bs : bstats) {
if (bs.backend_id == s.backend_id) {
// any non-OP event is counted as a copy here
if (s.type == GGML_PROFILE_EVENT_OP) {
bs.op_count += s.count;
bs.op_ns += s.total_ns;
bs.op_bytes += s.total_bytes;
} else {
bs.copy_count += s.count;
bs.copy_ns += s.total_ns;
bs.copy_bytes += s.total_bytes;
}
found = true;
break;
}
}
if (!found) {
backend_stats bs = {};
bs.backend_id = s.backend_id;
if (s.type == GGML_PROFILE_EVENT_OP) {
bs.op_count = s.count;
bs.op_ns = s.total_ns;
bs.op_bytes = s.total_bytes;
} else {
bs.copy_count = s.count;
bs.copy_ns = s.total_ns;
bs.copy_bytes = s.total_bytes;
}
bstats.push_back(bs);
}
}
// busiest backends first (compute + copy time combined)
std::sort(bstats.begin(), bstats.end(),
[](const backend_stats & a, const backend_stats & b) {
return (a.op_ns + a.copy_ns) > (b.op_ns + b.copy_ns);
});
for (const auto & bs : bstats) {
uint64_t total = bs.op_ns + bs.copy_ns;
double pct = grand_total > 0 ? 100.0 * (double) total / (double) grand_total : 0;
// prefer cached metadata: live backend pointers may already be freed
// when this runs during auto-export at sched free time
const char * bname = "unknown";
if (bs.backend_id >= 0 && bs.backend_id < (int) sched->profiling_backend_meta.size()) {
bname = sched->profiling_backend_meta[bs.backend_id].name.c_str();
} else if (bs.backend_id >= 0 && bs.backend_id < sched->n_backends && sched->backends[bs.backend_id] != NULL) {
bname = ggml_backend_name(sched->backends[bs.backend_id]);
}
fprintf(fp, " Backend %d (%s): %.2f ms (%.1f%%)\n", bs.backend_id, bname, (double) total / 1e6, pct);
if (bs.op_count > 0) {
char bw_buf[32];
fmt_bandwidth(bw_buf, sizeof(bw_buf), bs.op_bytes, bs.op_ns);
fprintf(fp, " OPs: %d calls, %.2f ms", bs.op_count, (double) bs.op_ns / 1e6);
if (bw_buf[0]) {
fprintf(fp, ", %s", bw_buf);
}
fprintf(fp, "\n");
}
if (bs.copy_count > 0) {
char bw_buf[32];
fmt_bandwidth(bw_buf, sizeof(bw_buf), bs.copy_bytes, bs.copy_ns);
fprintf(fp, " COPYs: %d calls, %.2f ms", bs.copy_count, (double) bs.copy_ns / 1e6);
if (bw_buf[0]) {
fprintf(fp, ", %s", bw_buf);
}
fprintf(fp, "\n");
}
}
}
fprintf(fp, "\n");
// --- Section 3: Detailed operation table ---
fprintf(fp, "=== Operations (sorted by total time) ===\n");
fprintf(fp, "%-5s %4s %-28s %7s %6s %10s %10s %10s %10s %12s %s\n",
"TYPE", "BKND", "Operation", "%Time", "Count", "Total(ms)", "Avg(us)", "Min(us)", "Max(us)", "Bandwidth", "Tensors");
fprintf(fp, "%-5s %4s %-28s %7s %6s %10s %10s %10s %10s %12s %s\n",
"-----", "----", "----------------------------", "-------", "------",
"----------", "----------", "----------", "----------", "------------", "-------");
// indexed by ggml_profile_event_type; assumes only OP=0 and COPY=1 occur
const char * type_str[] = { "OP", "COPY" };
for (const auto & s : stats) {
double pct = grand_total > 0 ? 100.0 * (double) s.total_ns / (double) grand_total : 0;
double avg_us = (double) s.total_ns / (double) s.count / 1000.0;
double min_us = (double) s.min_ns / 1000.0;
double max_us = (double) s.max_ns / 1000.0;
char bw_buf[32] = "";
fmt_bandwidth(bw_buf, sizeof(bw_buf), s.total_bytes, s.total_ns);
char ne0_buf[64];
char ne1_buf[64];
char ne2_buf[64];
fmt_ne(ne0_buf, sizeof(ne0_buf), s.representative_ne_src0);
fmt_ne(ne1_buf, sizeof(ne1_buf), s.representative_ne_src1);
fmt_ne(ne2_buf, sizeof(ne2_buf), s.representative_ne_src2);
// Build tensor shapes string, e.g. "[4096, 4096] x [4096, 1] x [8, 1]"
char tensors_buf[256] = "";
int tpos = 0;
if (ne0_buf[0]) {
tpos += snprintf(tensors_buf + tpos, sizeof(tensors_buf) - tpos, "%s", ne0_buf);
}
if (ne1_buf[0]) {
tpos += snprintf(tensors_buf + tpos, sizeof(tensors_buf) - tpos, " x %s", ne1_buf);
}
if (ne2_buf[0]) {
tpos += snprintf(tensors_buf + tpos, sizeof(tensors_buf) - tpos, " x %s", ne2_buf);
}
fprintf(fp, "%-5s %4d %-28s %6.1f%% %6d %10.2f %10.2f %10.2f %10.2f %12s %s\n",
type_str[s.type], s.backend_id, s.name, pct, s.count,
(double) s.total_ns / 1e6, avg_us, min_us, max_us,
bw_buf, tensors_buf);
}
fprintf(fp, "\nTotal: %.2f ms (%d records, %d unique ops)\n", (double) grand_total / 1e6,
(int) sched->profiling_records.size(), (int) stats.size());
return 0;
}
// Export plain-text profiling statistics to the given file path.
// Returns 0 on success, -1 if the file could not be opened or the
// underlying writer reported an error.
int ggml_backend_sched_export_profiling_text(ggml_backend_sched_t sched, const char * filepath) {
    GGML_ASSERT(sched);
    GGML_ASSERT(filepath != NULL);
    FILE * out = fopen(filepath, "w");
    if (!out) {
        GGML_LOG_ERROR("%s: failed to open %s for writing\n", __func__, filepath);
        return -1;
    }
    const int result = ggml_backend_sched_write_profiling_text(sched, out);
    fclose(out);
    return result;
}

View File

@ -276,6 +276,7 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend,
rec.extra = NULL;
if (node->src[0]) { memcpy(rec.ne_src0, node->src[0]->ne, sizeof(rec.ne_src0)); } else { memset(rec.ne_src0, 0, sizeof(rec.ne_src0)); }
if (node->src[1]) { memcpy(rec.ne_src1, node->src[1]->ne, sizeof(rec.ne_src1)); } else { memset(rec.ne_src1, 0, sizeof(rec.ne_src1)); }
if (node->op == GGML_OP_MUL_MAT_ID && node->src[2]) { memcpy(rec.ne_src2, node->src[2]->ne, sizeof(rec.ne_src2)); } else { memset(rec.ne_src2, 0, sizeof(rec.ne_src2)); }
ctx->profiling_records.push_back(rec);
}
}

View File

@ -3009,9 +3009,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
static const int64_t zero_ne[4] = {0, 0, 0, 0};
const int64_t * src0_ne = node->src[0] ? node->src[0]->ne : zero_ne;
const int64_t * src1_ne = node->src[1] ? node->src[1]->ne : zero_ne;
const int64_t * src2_ne = (node->op == GGML_OP_MUL_MAT_ID && node->src[2]) ? node->src[2]->ne : zero_ne;
cplan->profiling_record_fn(cplan->profiling_context, 0 /* GGML_PROFILE_EVENT_OP */,
ggml_op_name(node->op), -1, t_start, t_end, ggml_nbytes(node), NULL,
src0_ne, src1_ne);
src0_ne, src1_ne, src2_ne);
}
}

View File

@ -183,7 +183,8 @@ static void ggml_cpu_profiler_record_callback(void * context,
uint64_t bytes,
const char * extra,
const int64_t ne_src0[4],
const int64_t ne_src1[4]) {
const int64_t ne_src1[4],
const int64_t ne_src2[4]) {
auto * cpu_ctx = (ggml_backend_cpu_context *) context;
ggml_profile_record rec;
rec.type = (enum ggml_profile_event_type) type;
@ -204,6 +205,11 @@ static void ggml_cpu_profiler_record_callback(void * context,
} else {
memset(rec.ne_src1, 0, sizeof(rec.ne_src1));
}
if (ne_src2) {
memcpy(rec.ne_src2, ne_src2, sizeof(rec.ne_src2));
} else {
memset(rec.ne_src2, 0, sizeof(rec.ne_src2));
}
cpu_ctx->profiling_records.push_back(rec);
}

View File

@ -138,7 +138,7 @@ struct ggml_cuda_profiler_state {
}
void record_end(const char * name, int backend_id, int split_id, uint64_t bytes, const char * extra,
const int64_t ne_src0[4], const int64_t ne_src1[4]) {
const int64_t ne_src0[4], const int64_t ne_src1[4], const int64_t ne_src2[4]) {
cudaEvent_t ev;
(void) cudaEventCreate(&ev);
(void) cudaEventRecord(ev, stream);
@ -156,6 +156,7 @@ struct ggml_cuda_profiler_state {
rec.extra = extra;
if (ne_src0) { memcpy(rec.ne_src0, ne_src0, sizeof(rec.ne_src0)); } else { memset(rec.ne_src0, 0, sizeof(rec.ne_src0)); }
if (ne_src1) { memcpy(rec.ne_src1, ne_src1, sizeof(rec.ne_src1)); } else { memset(rec.ne_src1, 0, sizeof(rec.ne_src1)); }
if (ne_src2) { memcpy(rec.ne_src2, ne_src2, sizeof(rec.ne_src2)); } else { memset(rec.ne_src2, 0, sizeof(rec.ne_src2)); }
records.push_back(rec);
}
@ -4137,7 +4138,8 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
ggml_nbytes(node),
nullptr,
node->src[0] ? node->src[0]->ne : nullptr,
node->src[1] ? node->src[1]->ne : nullptr
node->src[1] ? node->src[1]->ne : nullptr,
(node->op == GGML_OP_MUL_MAT_ID && node->src[2]) ? node->src[2]->ne : nullptr
);
}

View File

@ -14682,6 +14682,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
static const int64_t zero_ne[4] = {0, 0, 0, 0};
const int64_t * src0_ne = node->src[0] ? node->src[0]->ne : zero_ne;
const int64_t * src1_ne = node->src[1] ? node->src[1]->ne : zero_ne;
const int64_t * src2_ne = (node->op == GGML_OP_MUL_MAT_ID && node->src[2]) ? node->src[2]->ne : zero_ne;
uint64_t cpu_ts = (i < (int)ctx->profiler_state->cpu_timestamps.size())
? ctx->profiler_state->cpu_timestamps[i] : 0;
@ -14696,6 +14697,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
rec.extra = name; // fusion name or NULL
memcpy(rec.ne_src0, src0_ne, sizeof(rec.ne_src0));
memcpy(rec.ne_src1, src1_ne, sizeof(rec.ne_src1));
memcpy(rec.ne_src2, src2_ne, sizeof(rec.ne_src2));
ctx->profiler_state->records.push_back(rec);
}
}
@ -14724,27 +14726,31 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
if (has_profiler && !nodes.empty()) {
uint64_t cpu_ts = (i < (int)ctx->profiler_state->cpu_timestamps.size())
? ctx->profiler_state->cpu_timestamps[i] : 0;
// In concurrent mode, distribute duration evenly across ops in group
uint64_t per_op_ns = duration_ns / nodes.size();
for (size_t j = 0; j < nodes.size(); j++) {
auto * node = nodes[j];
static const int64_t zero_ne[4] = {0, 0, 0, 0};
const int64_t * src0_ne = node->src[0] ? node->src[0]->ne : zero_ne;
const int64_t * src1_ne = node->src[1] ? node->src[1]->ne : zero_ne;
// In concurrent mode, report the group as a single combined operation
auto * node = nodes[0];
static const int64_t zero_ne[4] = {0, 0, 0, 0};
const int64_t * src0_ne = node->src[0] ? node->src[0]->ne : zero_ne;
const int64_t * src1_ne = node->src[1] ? node->src[1]->ne : zero_ne;
const int64_t * src2_ne = (node->op == GGML_OP_MUL_MAT_ID && node->src[2]) ? node->src[2]->ne : zero_ne;
ggml_profile_record rec;
rec.type = GGML_PROFILE_EVENT_OP;
rec.name = ggml_op_name(node->op);
rec.backend_id = -1;
rec.split_id = ctx->profiler_state->split_id;
rec.start_ns = cpu_ts + j * per_op_ns;
rec.end_ns = cpu_ts + (j + 1) * per_op_ns;
rec.bytes = ggml_nbytes(node);
rec.extra = names[j];
memcpy(rec.ne_src0, src0_ne, sizeof(rec.ne_src0));
memcpy(rec.ne_src1, src1_ne, sizeof(rec.ne_src1));
ctx->profiler_state->records.push_back(rec);
uint64_t total_bytes = 0;
for (size_t j = 0; j < nodes.size(); j++) {
total_bytes += ggml_nbytes(nodes[j]);
}
ggml_profile_record rec;
rec.type = GGML_PROFILE_EVENT_OP;
rec.name = ggml_op_name(node->op);
rec.backend_id = -1;
rec.split_id = ctx->profiler_state->split_id;
rec.start_ns = cpu_ts;
rec.end_ns = cpu_ts + duration_ns;
rec.bytes = total_bytes;
rec.extra = names[0]; // fusion name of first op, or NULL
memcpy(rec.ne_src0, src0_ne, sizeof(rec.ne_src0));
memcpy(rec.ne_src1, src1_ne, sizeof(rec.ne_src1));
memcpy(rec.ne_src2, src2_ne, sizeof(rec.ne_src2));
ctx->profiler_state->records.push_back(rec);
}
}
}

View File

@ -651,11 +651,17 @@ int main(int argc, char ** argv) {
if (params.profiling_output.empty()) {
ggml_backend_sched_print_profiling(sched);
} else {
int ret = ggml_backend_sched_export_profiling_json(sched, params.profiling_output.c_str());
if (ret == 0) {
console::log("\nProfiling data exported to: %s\n", params.profiling_output.c_str());
const std::string & path = params.profiling_output;
int ret;
if (path.size() >= 4 && path.compare(path.size() - 4, 4, ".txt") == 0) {
ret = ggml_backend_sched_export_profiling_text(sched, path.c_str());
} else {
console::error("\nFailed to export profiling data to: %s\n", params.profiling_output.c_str());
ret = ggml_backend_sched_export_profiling_json(sched, path.c_str());
}
if (ret == 0) {
console::log("\nProfiling data exported to: %s\n", path.c_str());
} else {
console::error("\nFailed to export profiling data to: %s\n", path.c_str());
}
}
}

View File

@ -998,11 +998,17 @@ int main(int argc, char ** argv) {
if (params.profiling_output.empty()) {
ggml_backend_sched_print_profiling(sched);
} else {
int ret = ggml_backend_sched_export_profiling_json(sched, params.profiling_output.c_str());
if (ret == 0) {
LOG("\nProfiling data exported to: %s\n", params.profiling_output.c_str());
const std::string & path = params.profiling_output;
int ret;
if (path.size() >= 4 && path.compare(path.size() - 4, 4, ".txt") == 0) {
ret = ggml_backend_sched_export_profiling_text(sched, path.c_str());
} else {
LOG_ERR("\nFailed to export profiling data to: %s\n", params.profiling_output.c_str());
ret = ggml_backend_sched_export_profiling_json(sched, path.c_str());
}
if (ret == 0) {
LOG("\nProfiling data exported to: %s\n", path.c_str());
} else {
LOG_ERR("\nFailed to export profiling data to: %s\n", path.c_str());
}
}
}

View File

@ -33,6 +33,7 @@ class ProfileRecord:
extra: Optional[str]
ne_src0: list[int] = field(default_factory=lambda: [0, 0, 0, 0])
ne_src1: list[int] = field(default_factory=lambda: [0, 0, 0, 0])
ne_src2: list[int] = field(default_factory=lambda: [0, 0, 0, 0])
@property
def type_name(self) -> str:
@ -48,7 +49,7 @@ class ProfileRecord:
@property
def bandwidth_gbps(self) -> float:
"""Bandwidth in GB/s (only meaningful for copy events)."""
"""Bandwidth in GB/s."""
if self.duration_ns == 0 or self.bytes == 0:
return 0.0
return self.bytes / self.duration_ns
@ -62,12 +63,12 @@ class ProfileRecord:
@property
def shape_str(self) -> str:
"""Human-readable tensor shapes, e.g. '[4096, 4096] x [4096, 1]'."""
"""Human-readable tensor shapes, e.g. '[4096, 4096] x [4096, 1] x [8, 1]'."""
s0 = self._fmt_ne(self.ne_src0)
s1 = self._fmt_ne(self.ne_src1)
if s0 and s1:
return s0 + " x " + s1
return s0 or s1
s2 = self._fmt_ne(self.ne_src2)
parts = [s for s in (s0, s1, s2) if s]
return " x ".join(parts)
def to_dict(self) -> dict:
return {
@ -81,6 +82,7 @@ class ProfileRecord:
"extra": self.extra,
"ne_src0": self.ne_src0,
"ne_src1": self.ne_src1,
"ne_src2": self.ne_src2,
}
@ -160,6 +162,7 @@ class ProfileData:
# Support both old "ne" format and new "ne_src0"/"ne_src1" format
ne_src0 = _pad_ne(r.get("ne_src0", r.get("ne", [0, 0, 0, 0])))
ne_src1 = _pad_ne(r.get("ne_src1", [0, 0, 0, 0]))
ne_src2 = _pad_ne(r.get("ne_src2", [0, 0, 0, 0]))
records.append(ProfileRecord(
type=r.get("type", 0),
name=r.get("name", "unknown"),
@ -171,6 +174,7 @@ class ProfileData:
extra=r.get("extra"),
ne_src0=ne_src0,
ne_src1=ne_src1,
ne_src2=ne_src2,
))
backends_raw = data.get("backends", [])
@ -267,9 +271,9 @@ class ProfileData:
return
print(f" {'TYPE':<5} {'BKND':>4} {'Operation':<28} {'%Time':>7} {'Count':>6} "
f"{'Total':>10} {'Avg':>10} {'Min':>10} {'Max':>10} {'Bytes':>10}")
f"{'Total':>10} {'Avg':>10} {'Min':>10} {'Max':>10} {'Bandwidth':>12}")
print(f" {'':->5} {'':->4} {'':->28} {'':->7} {'':->6} "
f"{'(ms)':>10} {'(us)':>10} {'(us)':>10} {'(us)':>10} {'':->10}")
f"{'(ms)':>10} {'(us)':>10} {'(us)':>10} {'(us)':>10} {'':->12}")
for s in stats:
pct = 100.0 * s.total_ns / self.total_ns if self.total_ns > 0 else 0
@ -278,15 +282,14 @@ class ProfileData:
f"{s.count:>6} {s.total_ms:>10.2f} {s.avg_us:>10.2f} "
f"{s.min_us:>10.2f} {s.max_us:>10.2f}")
if s.total_bytes > 0:
if s.total_bytes > 0 and s.total_ns > 0:
bw = s.bandwidth_gbps
bytes_str = f"{s.total_bytes / 1e6:.1f} MB"
if s.event_type == COPY_EVENT:
line += f" {bw:>8.2f} GB/s"
if bw >= 1000.0:
line += f" {bw / 1000.0:>9.2f} TB/s"
else:
line += f" {bytes_str:>10}"
line += f" {bw:>9.2f} GB/s"
else:
line += f" {'':>10}"
line += f" {'':>12}"
# Tensor shape from longest call
shape_dims = [n for n in s.representative_ne if n > 0]
@ -807,19 +810,19 @@ function buildStats(){
var ops={};
for(var i=0;i<EVENTS.length;i++){
var ev=EVENTS[i];
if(!ops[ev.n])ops[ev.n]={name:ev.n,d:0,count:0,min:Infinity,max:0,backends:{}};
if(!ops[ev.n])ops[ev.n]={name:ev.n,d:0,count:0,min:Infinity,max:0,bytes:0,backends:{}};
var op=ops[ev.n];
op.d+=ev.d;op.count++;
op.d+=ev.d;op.count++;op.bytes+=(ev.b||0);
if(ev.d<op.min)op.min=ev.d;if(ev.d>op.max)op.max=ev.d;
var bk=String(ev.bid);
if(!op.backends[bk])op.backends[bk]={bid:ev.bid,d:0,count:0,min:Infinity,max:0,shapes:{}};
if(!op.backends[bk])op.backends[bk]={bid:ev.bid,d:0,count:0,min:Infinity,max:0,bytes:0,shapes:{}};
var b=op.backends[bk];
b.d+=ev.d;b.count++;
b.d+=ev.d;b.count++;b.bytes+=(ev.b||0);
if(ev.d<b.min)b.min=ev.d;if(ev.d>b.max)b.max=ev.d;
var sh=ev.s||'\u2014';
if(!b.shapes[sh])b.shapes[sh]={d:0,count:0,min:Infinity,max:0};
if(!b.shapes[sh])b.shapes[sh]={d:0,count:0,min:Infinity,max:0,bytes:0};
var s=b.shapes[sh];
s.d+=ev.d;s.count++;
s.d+=ev.d;s.count++;s.bytes+=(ev.b||0);
if(ev.d<s.min)s.min=ev.d;if(ev.d>s.max)s.max=ev.d;
}
var sorted=[];for(var k in ops)sorted.push(ops[k]);
@ -832,7 +835,7 @@ function buildStats(){
var opId=rid++;
var bkeys=[];for(var bk in op.backends)bkeys.push(bk);
bkeys.sort(function(a,b){return op.backends[b].d-op.backends[a].d;});
rows.push({id:opId,p:-1,lv:0,name:op.name,d:op.d,count:op.count,
rows.push({id:opId,p:-1,lv:0,name:op.name,d:op.d,count:op.count,bytes:op.bytes,
min:op.min,max:op.max,pct:op.d/TOTAL_US*100,ch:bkeys.length>0});
for(var bi=0;bi<bkeys.length;bi++){
var bdata=op.backends[bkeys[bi]];
@ -840,25 +843,33 @@ function buildStats(){
var bname=BACKENDS[bdata.bid]||('B'+bdata.bid);
var skeys=[];for(var sk in bdata.shapes)skeys.push(sk);
skeys.sort(function(a,b){return bdata.shapes[b].d-bdata.shapes[a].d;});
rows.push({id:bId,p:opId,lv:1,name:bname,d:bdata.d,count:bdata.count,
rows.push({id:bId,p:opId,lv:1,name:bname,d:bdata.d,count:bdata.count,bytes:bdata.bytes,
min:bdata.min,max:bdata.max,pct:bdata.d/TOTAL_US*100,ch:skeys.length>0});
for(var si=0;si<skeys.length;si++){
var sdata=bdata.shapes[skeys[si]];
var sId=rid++;
rows.push({id:sId,p:bId,lv:2,name:skeys[si],d:sdata.d,count:sdata.count,
rows.push({id:sId,p:bId,lv:2,name:skeys[si],d:sdata.d,count:sdata.count,bytes:sdata.bytes,
min:sdata.min,max:sdata.max,pct:sdata.d/TOTAL_US*100,ch:false});
}
}
}
// Render
var h='<table><thead><tr><th style="width:30%">Operation</th>'
+'<th class="r" style="width:12%">% Time</th>'
+'<th class="r" style="width:12%">Total</th>'
+'<th class="r" style="width:10%">Count</th>'
+'<th class="r" style="width:12%">Avg</th>'
+'<th class="r" style="width:12%">Min</th>'
+'<th class="r" style="width:12%">Max</th>'
function fmtBW(bytes,d_us){
  // No bytes or no elapsed time: show an em-dash placeholder.
  if(!bytes||!d_us){return '\u2014';}
  // bytes / (microseconds * 1000) yields GB/s; pick a readable unit.
  var gb=bytes/(d_us*1000);
  if(gb>=1000){return (gb/1000).toFixed(2)+' TB/s';}
  return gb>=1 ? gb.toFixed(2)+' GB/s' : (gb*1000).toFixed(1)+' MB/s';
}
var h='<table><thead><tr><th style="width:26%">Operation</th>'
+'<th class="r" style="width:10%">% Time</th>'
+'<th class="r" style="width:10%">Total</th>'
+'<th class="r" style="width:8%">Count</th>'
+'<th class="r" style="width:10%">Avg</th>'
+'<th class="r" style="width:10%">Min</th>'
+'<th class="r" style="width:10%">Max</th>'
+'<th class="r" style="width:10%">Bandwidth</th>'
+'</tr></thead><tbody>';
for(var ri=0;ri<rows.length;ri++){
@ -883,6 +894,7 @@ function buildStats(){
h+='<td class="r">'+fmtT(r.d/r.count)+'</td>';
h+='<td class="r">'+fmtT(r.min)+'</td>';
h+='<td class="r">'+fmtT(r.max)+'</td>';
h+='<td class="r">'+fmtBW(r.bytes,r.d)+'</td>';
h+='</tr>';
}
h+='</tbody></table>';