Merge 532a8ebdde into 6422036fcb
This commit is contained in:
commit
b2698fcea6
|
|
@ -1073,6 +1073,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
params.completion = true;
|
||||
}
|
||||
));
|
||||
add_opt(common_arg(
|
||||
{"--profile"},
|
||||
"enable cross-backend profiling (CPU, BLAS, CUDA)",
|
||||
[](common_params & params) {
|
||||
params.profiling = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_DEBUG}));
|
||||
add_opt(common_arg(
|
||||
{"--profile-output"}, "FNAME",
|
||||
"write profiling JSON output to FNAME (default: stdout)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.profiling = true;
|
||||
params.profiling_output = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_DEBUG}));
|
||||
add_opt(common_arg(
|
||||
{"--verbose-prompt"},
|
||||
string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@
|
|||
#include "gguf.h"
|
||||
|
||||
#include "common.h"
|
||||
#include "ggml-profiler.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
#include "sampling.h"
|
||||
|
|
@ -1236,6 +1237,14 @@ common_init_result::common_init_result(common_params & params) :
|
|||
return;
|
||||
}
|
||||
|
||||
if (params.profiling) {
|
||||
ggml_backend_sched_t sched = llama_context_get_sched(lctx);
|
||||
if (sched != nullptr) {
|
||||
ggml_backend_sched_set_profiling(sched, true);
|
||||
LOG_INF("%s: profiling enabled\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
pimpl->context.reset(lctx);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@
|
|||
#pragma once
|
||||
|
||||
#include "ggml-opt.h"
|
||||
#include "ggml-profiler.h"
|
||||
#include "ggml.h"
|
||||
#include "llama-cpp.h"
|
||||
|
||||
|
|
@ -669,6 +670,10 @@ struct common_params {
|
|||
|
||||
bool spm_infill = false; // suffix/prefix/middle pattern for infill
|
||||
|
||||
// profiling
|
||||
bool profiling = false; // enable cross-backend profiling
|
||||
std::string profiling_output; // path to write profiling JSON output (empty = stdout)
|
||||
|
||||
// batched-bench params
|
||||
bool batched_bench_output_jsonl = false;
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,225 @@
|
|||
# Cross-Backend Profiler
|
||||
|
||||
llama.cpp includes a built-in cross-backend profiler that captures per-operation timing, data transfer costs, and tensor shapes across all compute backends. It works with any application built on the ggml scheduler — no source changes needed.
|
||||
|
||||
## Supported Backends
|
||||
|
||||
| Backend | Status | Timing method |
|
||||
|---------|--------|---------------|
|
||||
| CPU | Supported | Wall-clock (`CLOCK_MONOTONIC_RAW`) |
|
||||
| CUDA | Supported | `cudaEvent` GPU timestamps |
|
||||
| Vulkan | Supported | GPU timestamp queries |
|
||||
| BLAS | Supported | Wall-clock |
|
||||
| Metal | Not yet supported | — |
|
||||
| OpenCL | Not yet supported | — |
|
||||
|
||||
The scheduler also profiles **data copies** (H2D, D2H, D2D) between backends regardless of which backends have native profiler support.
|
||||
|
||||
## Enabling the Profiler
|
||||
|
||||
There are two independent ways to enable profiling. They can be used separately or together.
|
||||
|
||||
### CLI flags (`--profile`, `--profile-output`)
|
||||
|
||||
Available in `llama-cli`, `llama-completion`, `llama-server`, and `debug`:
|
||||
|
||||
```bash
|
||||
# Print summary to stdout
|
||||
llama-completion -m model.gguf --profile -p "Hello world"
|
||||
|
||||
# Export to JSON
|
||||
llama-completion -m model.gguf --profile --profile-output profile.json -p "Hello world"
|
||||
|
||||
# Export to plain text
|
||||
llama-completion -m model.gguf --profile --profile-output profile.txt -p "Hello world"
|
||||
```
|
||||
|
||||
The output format is chosen by file extension: `.json` for JSON, `.txt` for plain text. Any other extension defaults to JSON.
|
||||
|
||||
### Environment variable (`GGML_PROFILE`)
|
||||
|
||||
The `GGML_PROFILE` environment variable enables profiling at the ggml scheduler level. This works with **any** application that uses the scheduler — including third-party tools like `sd.cpp` — without CLI flag support.
|
||||
|
||||
```bash
|
||||
# Print summary to stdout
|
||||
GGML_PROFILE=1 llama-completion -m model.gguf -p "Hello world"
|
||||
|
||||
# Export JSON
|
||||
GGML_PROFILE=profile.json llama-completion -m model.gguf -p "Hello world"
|
||||
|
||||
# Export plain text
|
||||
GGML_PROFILE=profile.txt llama-completion -m model.gguf -p "Hello world"
|
||||
|
||||
# Works with any ggml-based application
|
||||
GGML_PROFILE=1 sd -m model.gguf -p "a cat"
|
||||
```
|
||||
|
||||
| Value | Behavior |
|
||||
|-------|----------|
|
||||
| `1`, `stdout`, or empty | Print summary to stdout |
|
||||
| `path.json` | Export JSON to file |
|
||||
| `path.txt` | Export plain text to file |
|
||||
| Any other path | Export JSON to file |
|
||||
|
||||
The export happens automatically when the scheduler is freed (typically at program exit).
|
||||
|
||||
## Output Formats
|
||||
|
||||
### Console summary (stdout)
|
||||
|
||||
The default when `--profile` is used without `--profile-output`, or `GGML_PROFILE=1`:
|
||||
|
||||
```
|
||||
=== Profiling Summary ===
|
||||
[OP ] backend 0 MUL_MAT 45.2% count=1200 total= 120.50 ms avg= 100.42 us ... 12.30 GB/s [4096 x 4096]
|
||||
[OP ] backend 1 MUL_MAT_ID 30.1% count= 600 total= 80.20 ms avg= 133.67 us ... 0.08 GB/s [2688 x 1856 x 128]
|
||||
[COPY] backend 0 copy_H2D 5.3% count= 200 total= 14.10 ms avg= 70.50 us ... 2.50 GB/s
|
||||
...
|
||||
```
|
||||
|
||||
Each line shows: event type (OP or COPY), backend index, operation name, percentage of total time, call count, timing stats, bandwidth, and representative tensor shape.
|
||||
|
||||
### Plain text (`.txt`)
|
||||
|
||||
A more detailed report with three sections:
|
||||
|
||||
1. **Profiling Summary** — total time, record count, unique ops
|
||||
2. **Per-Backend Summary** — ops and copies per backend with aggregate bandwidth
|
||||
3. **Operations table** — full breakdown with bandwidth and tensor shapes for all source tensors
|
||||
|
||||
### JSON (`.json`)
|
||||
|
||||
Machine-readable format suitable for the Python analysis tool. Contains:
|
||||
|
||||
- `version`: Format version (currently `2`)
|
||||
- `backends[]`: Backend metadata (name, device, device type)
|
||||
- `records[]`: Every profiling event with:
|
||||
- `type`: `0` = OP, `1` = COPY
|
||||
- `name`: Operation name (e.g. `"MUL_MAT"`, `"copy_H2D"`)
|
||||
- `backend_id`, `split_id`: Scheduler indices
|
||||
- `start_ns`, `duration_ns`: Timing in nanoseconds
|
||||
- `bytes`: Output tensor size (OPs) or transfer size (COPYs)
|
||||
- `extra`: Fusion name for fused ops, or `null`
|
||||
- `ne_src0`, `ne_src1`, `ne_src2`: Source tensor dimensions (4-element arrays)
|
||||
|
||||
`ne_src2` is populated only for `MUL_MAT_ID` (expert selection indices); it is `[0,0,0,0]` for all other ops.
|
||||
|
||||
## Python Analysis Tool
|
||||
|
||||
The `tools/profiler/profiler.py` script reads JSON exports and produces analysis reports and visualizations.
|
||||
|
||||
### Basic usage
|
||||
|
||||
```bash
|
||||
# Print summary
|
||||
python -m tools.profiler.profiler profile.json
|
||||
|
||||
# Show top 10 operations by time
|
||||
python -m tools.profiler.profiler profile.json --top-ops 10
|
||||
|
||||
# Show top 10 longest individual kernels
|
||||
python -m tools.profiler.profiler profile.json --top-kernels 10
|
||||
|
||||
# Show inefficiency ranking (highest time-per-byte)
|
||||
python -m tools.profiler.profiler profile.json --inefficiency
|
||||
```
|
||||
|
||||
### Export visualizations
|
||||
|
||||
```bash
|
||||
# Interactive HTML timeline (self-contained, no dependencies)
|
||||
python -m tools.profiler.profiler profile.json --html-viewer timeline.html
|
||||
|
||||
# Chrome Trace format (open in chrome://tracing or Perfetto)
|
||||
python -m tools.profiler.profiler profile.json --chrome-trace trace.json
|
||||
|
||||
# Downsample large traces for the HTML viewer
|
||||
python -m tools.profiler.profiler profile.json --html-viewer timeline.html --html-max-records 50000
|
||||
```
|
||||
|
||||
Multiple exports can be combined in a single invocation:
|
||||
|
||||
```bash
|
||||
python -m tools.profiler.profiler profile.json --html-viewer timeline.html --chrome-trace trace.json --top-ops 20
|
||||
```
|
||||
|
||||
### CLI reference
|
||||
|
||||
| Argument | Description |
|
||||
|----------|-------------|
|
||||
| `profile` (positional) | Path to profiler JSON file |
|
||||
| `--chrome-trace FILE` | Export Chrome Trace Event format |
|
||||
| `--html-viewer FILE` | Export interactive HTML timeline |
|
||||
| `--html-max-records N` | Limit records in HTML output (0 = unlimited) |
|
||||
| `--top-ops N` | Show top N operations by total time |
|
||||
| `--top-kernels N` | Show top N longest individual kernels |
|
||||
| `--inefficiency` | Rank operations by time per byte (higher = worse) |
|
||||
|
||||
### HTML viewer features
|
||||
|
||||
The HTML viewer is a self-contained file with no external dependencies:
|
||||
|
||||
- **Canvas timeline** with per-backend lanes and color-coded operations
|
||||
- **Zoom controls** (1s / 100ms / 1ms / 100us) and mouse drag navigation
|
||||
- **Minimap** showing the full trace with a viewport indicator
|
||||
- **Hover tooltips** with operation name, duration, shape, and bytes
|
||||
- **Stats table** with collapsible tree: Operation → Backend → Tensor shape, showing % time, count, avg/min/max, and bandwidth
|
||||
- **Legend** showing the most frequent operation types
|
||||
|
||||
## What Gets Measured
|
||||
|
||||
### OP events
|
||||
|
||||
Every tensor operation (MUL_MAT, ADD, UNARY, FLASH_ATTN_EXT, etc.) is recorded with:
|
||||
|
||||
- **Timing**: Start/end timestamps (nanosecond precision)
|
||||
- **Bytes**: Output tensor size (`ggml_nbytes(node)`)
|
||||
- **Tensor shapes**: Dimensions of `src[0]`, `src[1]`, and `src[2]` (when applicable)
|
||||
- **Bandwidth**: Computed as `bytes / duration` — useful for identifying memory-bound vs compute-bound operations
|
||||
|
||||
### COPY events
|
||||
|
||||
Data transfers between backends:
|
||||
|
||||
- **Direction**: `copy_H2D` (host→device), `copy_D2H` (device→host), `copy_D2D` (device→device)
|
||||
- **Bytes**: Exact transfer size
|
||||
- **Bandwidth**: Transfer throughput
|
||||
|
||||
### MoE weight copies
|
||||
|
||||
When `--cpu-moe` is used, the scheduler selectively copies only the active experts. These partial copies are recorded as individual COPY events with the actual bytes transferred.
|
||||
|
||||
## Programmatic API
|
||||
|
||||
For custom applications, the profiler can be controlled through the C API defined in `ggml/include/ggml-profiler.h`:
|
||||
|
||||
```c
|
||||
// Enable profiling on a scheduler
|
||||
ggml_backend_sched_set_profiling(sched, true);
|
||||
|
||||
// ... run inference ...
|
||||
|
||||
// Get raw records
|
||||
const ggml_profile_record * records;
|
||||
int n = ggml_backend_sched_get_profiling_records(sched, &records);
|
||||
|
||||
// Or export directly
|
||||
ggml_backend_sched_print_profiling(sched); // stdout
|
||||
ggml_backend_sched_export_profiling_json(sched, "profile.json"); // JSON file
|
||||
ggml_backend_sched_export_profiling_text(sched, "profile.txt"); // text file
|
||||
ggml_backend_sched_write_profiling_json(sched, fp); // JSON to FILE*
|
||||
ggml_backend_sched_write_profiling_text(sched, fp); // text to FILE*
|
||||
|
||||
// Reset for next measurement window
|
||||
ggml_backend_sched_reset_profiling(sched);
|
||||
```
|
||||
|
||||
Records accumulate across multiple `graph_compute` calls until explicitly reset or the scheduler is freed.
|
||||
|
||||
## Tips
|
||||
|
||||
- **Prompt eval vs generation**: The profiler captures all graph computes. During prompt evaluation you'll see larger batch sizes in tensor shapes; during generation, batch size is typically 1-2.
|
||||
- **Vulkan concurrent mode**: When Vulkan dispatches multiple operations concurrently, they are reported as a single combined record spanning the full GPU time interval.
|
||||
- **Bandwidth interpretation**: For compute ops, bandwidth = `output_bytes / duration`. This is not memory bandwidth — it's a proxy for throughput. MUL_MAT with low bandwidth typically indicates compute-bound behavior; high bandwidth indicates memory-bound.
|
||||
- **Large traces**: For long inference runs, the JSON can be large. Use `--html-max-records` to downsample the HTML viewer, or use Chrome Trace format which handles large files well.
|
||||
- **Multiple backends**: Backend IDs in the output correspond to the scheduler's priority order (0 = highest priority, typically GPU; last = CPU).
|
||||
|
|
@ -244,6 +244,29 @@ int main(int argc, char ** argv) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
// Export profiling data if profiling was enabled
|
||||
if (params.profiling) {
|
||||
ggml_backend_sched_t sched = llama_context_get_sched(ctx);
|
||||
if (sched != nullptr) {
|
||||
if (params.profiling_output.empty()) {
|
||||
ggml_backend_sched_print_profiling(sched);
|
||||
} else {
|
||||
const std::string & path = params.profiling_output;
|
||||
int ret;
|
||||
if (path.size() >= 4 && path.compare(path.size() - 4, 4, ".txt") == 0) {
|
||||
ret = ggml_backend_sched_export_profiling_text(sched, path.c_str());
|
||||
} else {
|
||||
ret = ggml_backend_sched_export_profiling_json(sched, path.c_str());
|
||||
}
|
||||
if (ret == 0) {
|
||||
LOG("\nProfiling data exported to: %s\n", path.c_str());
|
||||
} else {
|
||||
LOG_ERR("\nFailed to export profiling data to: %s\n", path.c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
LOG("\n");
|
||||
llama_perf_context_print(ctx);
|
||||
|
||||
|
|
|
|||
|
|
@ -22,6 +22,24 @@ extern "C" {
|
|||
|
||||
// use only reference implementations
|
||||
bool use_ref;
|
||||
|
||||
// profiler context (set by backend when profiling is enabled, NULL otherwise)
|
||||
// when non-NULL, the compute loop will record per-node timing
|
||||
void * profiling_context;
|
||||
|
||||
// callback for recording a profile record from C code (set by backend when profiling)
|
||||
// params: context, type, name, split_id, start_ns, end_ns, bytes, extra, ne_src0[4], ne_src1[4], ne_src2[4]
|
||||
void (*profiling_record_fn)(void * context,
|
||||
int type,
|
||||
const char * name,
|
||||
int split_id,
|
||||
uint64_t start_ns,
|
||||
uint64_t end_ns,
|
||||
uint64_t bytes,
|
||||
const char * extra,
|
||||
const int64_t ne_src0[4],
|
||||
const int64_t ne_src1[4],
|
||||
const int64_t ne_src2[4]);
|
||||
};
|
||||
|
||||
// numa strategies
|
||||
|
|
|
|||
|
|
@ -0,0 +1,112 @@
|
|||
#pragma once
|
||||
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//
|
||||
// Profiler
|
||||
//
|
||||
|
||||
// Profile event types
|
||||
enum ggml_profile_event_type {
|
||||
GGML_PROFILE_EVENT_OP, // single operation execution (computation kernel)
|
||||
GGML_PROFILE_EVENT_COPY, // data transfer between devices
|
||||
};
|
||||
|
||||
// A single profiling record representing a timed interval
|
||||
typedef struct ggml_profile_record {
|
||||
enum ggml_profile_event_type type;
|
||||
const char * name; // operation name (e.g., "mul_mat", "copy_H2D")
|
||||
int backend_id; // scheduler's backend index (0 = highest priority)
|
||||
int split_id; // which graph split (0..n_splits-1)
|
||||
uint64_t start_ns; // start timestamp in nanoseconds
|
||||
uint64_t end_ns; // end timestamp in nanoseconds
|
||||
uint64_t bytes; // bytes transferred (for copy) or tensor size (for ops)
|
||||
const char * extra; // fusion name for fused ops, or NULL
|
||||
int64_t ne_src0[4]; // src[0] tensor dimensions (e.g. weight matrix for MUL_MAT)
|
||||
int64_t ne_src1[4]; // src[1] tensor dimensions (e.g. input matrix for MUL_MAT)
|
||||
int64_t ne_src2[4]; // src[2] tensor dimensions (e.g. ids for MUL_MAT_ID)
|
||||
} ggml_profile_record;
|
||||
|
||||
// Backend profiler interface - each backend optionally implements this
|
||||
// to provide fine-grained operation timing
|
||||
struct ggml_backend_profiler {
|
||||
void * context; // backend-specific profiler context
|
||||
|
||||
// Enable or disable profiling on this backend
|
||||
void (*enable)(void * context, bool enable);
|
||||
|
||||
// Clear all recorded data
|
||||
void (*reset)(void * context);
|
||||
|
||||
// Set the current split ID (called by scheduler before graph_compute)
|
||||
void (*set_split_id)(void * context, int split_id);
|
||||
|
||||
// Get recorded profiling data
|
||||
// Returns the number of records; sets *out to point to internal storage
|
||||
// The returned pointer remains valid until the next reset or disable call
|
||||
int (*get_records)(void * context, const ggml_profile_record ** out);
|
||||
|
||||
// Free the profiler context
|
||||
void (*free_context)(void * context);
|
||||
};
|
||||
|
||||
typedef struct ggml_backend_profiler * ggml_backend_profiler_t;
|
||||
|
||||
// Register a profiler on a backend (called by backend during init)
|
||||
// The profiler is owned by the backend and will be freed when the backend is freed
|
||||
GGML_API void ggml_backend_set_profiler(ggml_backend_t backend, ggml_backend_profiler_t profiler);
|
||||
|
||||
// Get the profiler associated with a backend (returns NULL if none)
|
||||
GGML_API ggml_backend_profiler_t ggml_backend_get_profiler(ggml_backend_t backend);
|
||||
|
||||
//
|
||||
// Scheduler profiling API
|
||||
//
|
||||
|
||||
// Enable or disable profiling on a scheduler
|
||||
// When enabled, the scheduler will:
|
||||
// - Time data copy operations between backends
|
||||
// - Enable profiling on all backends that support it
|
||||
// - Collect profiling records from all backends after each graph compute
|
||||
GGML_API void ggml_backend_sched_set_profiling(ggml_backend_sched_t sched, bool enable);
|
||||
|
||||
// Check if profiling is enabled on a scheduler
|
||||
GGML_API bool ggml_backend_sched_get_profiling(ggml_backend_sched_t sched);
|
||||
|
||||
// Get profiling data from the last graph compute
|
||||
// Records are owned by the scheduler; valid until the next compute or reset
|
||||
// Returns the number of records
|
||||
GGML_API int ggml_backend_sched_get_profiling_records(ggml_backend_sched_t sched, const ggml_profile_record ** records);
|
||||
|
||||
// Print a human-readable summary of the last profiling run to stdout
|
||||
// Groups records by operation name and shows total/count/min/max/avg time
|
||||
GGML_API void ggml_backend_sched_print_profiling(ggml_backend_sched_t sched);
|
||||
|
||||
// Reset profiling data (clear all recorded data)
|
||||
GGML_API void ggml_backend_sched_reset_profiling(ggml_backend_sched_t sched);
|
||||
|
||||
// Get current time in nanoseconds (for manual profiling if needed)
|
||||
GGML_API uint64_t ggml_profiler_time_ns(void);
|
||||
|
||||
// Export profiling data as JSON to a file
|
||||
// Returns 0 on success, -1 on error
|
||||
GGML_API int ggml_backend_sched_export_profiling_json(ggml_backend_sched_t sched, const char * filepath);
|
||||
|
||||
// Export profiling data as JSON to a FILE pointer
|
||||
GGML_API int ggml_backend_sched_write_profiling_json(ggml_backend_sched_t sched, FILE * fp);
|
||||
|
||||
// Export profiling data as plain text statistics to a file
|
||||
// Returns 0 on success, -1 on error
|
||||
GGML_API int ggml_backend_sched_export_profiling_text(ggml_backend_sched_t sched, const char * filepath);
|
||||
|
||||
// Export profiling data as plain text statistics to a FILE pointer
|
||||
GGML_API int ggml_backend_sched_write_profiling_text(ggml_backend_sched_t sched, FILE * fp);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
@ -195,12 +195,14 @@ add_library(ggml-base
|
|||
../include/ggml-backend.h
|
||||
../include/ggml-cpp.h
|
||||
../include/ggml-opt.h
|
||||
../include/ggml-profiler.h
|
||||
../include/gguf.h
|
||||
ggml.c
|
||||
ggml.cpp
|
||||
ggml-alloc.c
|
||||
ggml-backend.cpp
|
||||
ggml-opt.cpp
|
||||
ggml-profiler.cpp
|
||||
ggml-threading.cpp
|
||||
ggml-threading.h
|
||||
ggml-quants.c
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@
|
|||
// ggml-backend internal header
|
||||
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-profiler.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
|
|
@ -124,6 +125,9 @@ extern "C" {
|
|||
struct ggml_backend_i iface;
|
||||
ggml_backend_dev_t device;
|
||||
void * context;
|
||||
|
||||
// Optional profiler (set by backend during init, NULL if not profiling)
|
||||
ggml_backend_profiler_t profiler;
|
||||
};
|
||||
|
||||
struct ggml_backend_event {
|
||||
|
|
|
|||
|
|
@ -12,6 +12,7 @@
|
|||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-alloc.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-profiler.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <limits.h>
|
||||
|
|
@ -20,6 +21,7 @@
|
|||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#ifdef __APPLE__
|
||||
|
|
@ -231,6 +233,15 @@ void ggml_backend_free(ggml_backend_t backend) {
|
|||
return;
|
||||
}
|
||||
|
||||
// Clean up profiler if present (before backend frees its context)
|
||||
if (backend->profiler != NULL) {
|
||||
if (backend->profiler->free_context != NULL) {
|
||||
backend->profiler->free_context(backend->profiler->context);
|
||||
}
|
||||
delete backend->profiler;
|
||||
backend->profiler = NULL;
|
||||
}
|
||||
|
||||
backend->iface.free(backend);
|
||||
}
|
||||
|
||||
|
|
@ -736,6 +747,20 @@ struct ggml_backend_sched {
|
|||
int debug_realloc;
|
||||
int debug_graph_size;
|
||||
int debug_prev_graph_size;
|
||||
|
||||
// profiling
|
||||
bool profiling_enabled;
|
||||
std::string profiling_env_path; // GGML_PROFILE env var value (for auto-export on free)
|
||||
std::vector<ggml_profile_record> copy_records; // copy events recorded by the scheduler
|
||||
std::vector<ggml_profile_record> profiling_records; // merged records from all sources
|
||||
|
||||
// Cached backend metadata for safe access during auto-export (backends may be freed first)
|
||||
struct backend_meta {
|
||||
std::string name;
|
||||
std::string device;
|
||||
int device_type;
|
||||
};
|
||||
std::vector<backend_meta> profiling_backend_meta;
|
||||
};
|
||||
|
||||
#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
|
||||
|
|
@ -1450,11 +1475,28 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
|||
std::vector<int32_t> ids;
|
||||
std::vector<ggml_bitset_t> used_ids;
|
||||
|
||||
// Profiling: reset copy records for this compute pass
|
||||
if (sched->profiling_enabled) {
|
||||
sched->copy_records.clear();
|
||||
}
|
||||
|
||||
for (int split_id = 0; split_id < sched->n_splits; split_id++) {
|
||||
struct ggml_backend_sched_split * split = &splits[split_id];
|
||||
int split_backend_id = split->backend_id;
|
||||
ggml_backend_t split_backend = sched->backends[split_backend_id];
|
||||
|
||||
// Profiling: set split ID and enable backend profiling
|
||||
if (sched->profiling_enabled) {
|
||||
if (split_backend->profiler != NULL) {
|
||||
if (split_backend->profiler->enable != NULL) {
|
||||
split_backend->profiler->enable(split_backend->profiler->context, true);
|
||||
}
|
||||
if (split_backend->profiler->set_split_id != NULL) {
|
||||
split_backend->profiler->set_split_id(split_backend->profiler->context, split_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// copy the input tensors to the split backend
|
||||
for (int input_id = 0; input_id < split->n_inputs; input_id++) {
|
||||
ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[input_id]);
|
||||
|
|
@ -1468,7 +1510,26 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
|||
} else {
|
||||
ggml_backend_synchronize(split_backend);
|
||||
}
|
||||
ggml_backend_tensor_copy(input, input_cpy);
|
||||
if (sched->profiling_enabled) {
|
||||
uint64_t copy_start = ggml_profiler_time_ns();
|
||||
ggml_backend_tensor_copy(input, input_cpy);
|
||||
uint64_t copy_end = ggml_profiler_time_ns();
|
||||
|
||||
enum ggml_backend_dev_type src_type = ggml_backend_dev_type(input_backend->device);
|
||||
enum ggml_backend_dev_type dst_type = ggml_backend_dev_type(split_backend->device);
|
||||
const char * copy_dir = "copy_D2D";
|
||||
if (src_type == GGML_BACKEND_DEVICE_TYPE_CPU && dst_type != GGML_BACKEND_DEVICE_TYPE_CPU) {
|
||||
copy_dir = "copy_H2D";
|
||||
} else if (src_type != GGML_BACKEND_DEVICE_TYPE_CPU && dst_type == GGML_BACKEND_DEVICE_TYPE_CPU) {
|
||||
copy_dir = "copy_D2H";
|
||||
}
|
||||
|
||||
sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id, split_id,
|
||||
copy_start, copy_end, ggml_nbytes(input), input->name,
|
||||
{input->ne[0], input->ne[1], input->ne[2], input->ne[3]}, {0}, {0} });
|
||||
} else {
|
||||
ggml_backend_tensor_copy(input, input_cpy);
|
||||
}
|
||||
} else {
|
||||
// wait for the split backend to finish using the input before overwriting it
|
||||
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
||||
|
|
@ -1525,12 +1586,14 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
|||
}
|
||||
|
||||
// group consecutive experts and copy them together
|
||||
size_t total_copied_bytes = 0;
|
||||
auto copy_experts = [&](int32_t first_id, int32_t last_id) {
|
||||
const size_t expert_offset = first_id * expert_size;
|
||||
const size_t expert_size_copy = (last_id - first_id + 1) * expert_size;
|
||||
const size_t padding = std::min<size_t>(expert_size, 512);
|
||||
const size_t padding_end = last_id < n_expert - 1 ? padding : 0;
|
||||
|
||||
total_copied_bytes += expert_size_copy + padding_end;
|
||||
ggml_backend_tensor_set_async(split_backend,
|
||||
input_cpy,
|
||||
(const uint8_t *)input->data + expert_offset, expert_offset,
|
||||
|
|
@ -1539,6 +1602,11 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
|||
expert_size_copy + padding_end);
|
||||
};
|
||||
|
||||
uint64_t moe_copy_start = 0;
|
||||
if (sched->profiling_enabled) {
|
||||
moe_copy_start = ggml_profiler_time_ns();
|
||||
}
|
||||
|
||||
int id = 0;
|
||||
while (!ggml_bitset_get(used_ids.data(), id)) {
|
||||
id++;
|
||||
|
|
@ -1562,9 +1630,35 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
|||
last_id = id;
|
||||
}
|
||||
copy_experts(first_id, last_id);
|
||||
|
||||
if (sched->profiling_enabled) {
|
||||
uint64_t moe_copy_end = ggml_profiler_time_ns();
|
||||
|
||||
enum ggml_backend_dev_type src_type = ggml_backend_dev_type(input_backend->device);
|
||||
enum ggml_backend_dev_type dst_type = ggml_backend_dev_type(split_backend->device);
|
||||
const char * copy_dir = "copy_D2D";
|
||||
if (src_type == GGML_BACKEND_DEVICE_TYPE_CPU && dst_type != GGML_BACKEND_DEVICE_TYPE_CPU) {
|
||||
copy_dir = "copy_H2D";
|
||||
} else if (src_type != GGML_BACKEND_DEVICE_TYPE_CPU &&
|
||||
dst_type == GGML_BACKEND_DEVICE_TYPE_CPU) {
|
||||
copy_dir = "copy_D2H";
|
||||
}
|
||||
|
||||
sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id,
|
||||
split_id, moe_copy_start, moe_copy_end,
|
||||
(uint64_t) total_copied_bytes, input->name,
|
||||
{input->ne[0], input->ne[1], input->ne[2], input->ne[3]}, {0}, {0} });
|
||||
}
|
||||
} else {
|
||||
// try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
|
||||
// TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
|
||||
|
||||
// Capture timestamp before async attempt so we can record launch time
|
||||
uint64_t copy_start = 0;
|
||||
if (sched->profiling_enabled) {
|
||||
copy_start = ggml_profiler_time_ns();
|
||||
}
|
||||
|
||||
if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
|
||||
ggml_backend_synchronize(input_backend);
|
||||
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
||||
|
|
@ -1572,7 +1666,47 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
|||
} else {
|
||||
ggml_backend_synchronize(split_backend);
|
||||
}
|
||||
ggml_backend_tensor_copy(input, input_cpy);
|
||||
if (sched->profiling_enabled) {
|
||||
// Re-take start after sync for accurate sync copy measurement
|
||||
copy_start = ggml_profiler_time_ns();
|
||||
ggml_backend_tensor_copy(input, input_cpy);
|
||||
uint64_t copy_end = ggml_profiler_time_ns();
|
||||
|
||||
enum ggml_backend_dev_type src_type = ggml_backend_dev_type(input_backend->device);
|
||||
enum ggml_backend_dev_type dst_type = ggml_backend_dev_type(split_backend->device);
|
||||
const char * copy_dir = "copy_D2D";
|
||||
if (src_type == GGML_BACKEND_DEVICE_TYPE_CPU && dst_type != GGML_BACKEND_DEVICE_TYPE_CPU) {
|
||||
copy_dir = "copy_H2D";
|
||||
} else if (src_type != GGML_BACKEND_DEVICE_TYPE_CPU &&
|
||||
dst_type == GGML_BACKEND_DEVICE_TYPE_CPU) {
|
||||
copy_dir = "copy_D2H";
|
||||
}
|
||||
|
||||
sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id,
|
||||
split_id, copy_start, copy_end, ggml_nbytes(input), input->name,
|
||||
{input->ne[0], input->ne[1], input->ne[2], input->ne[3]}, {0}, {0} });
|
||||
} else {
|
||||
ggml_backend_tensor_copy(input, input_cpy);
|
||||
}
|
||||
} else {
|
||||
// async copy was launched — record the time spanning the async call
|
||||
if (sched->profiling_enabled) {
|
||||
uint64_t copy_end = ggml_profiler_time_ns();
|
||||
|
||||
enum ggml_backend_dev_type src_type = ggml_backend_dev_type(input_backend->device);
|
||||
enum ggml_backend_dev_type dst_type = ggml_backend_dev_type(split_backend->device);
|
||||
const char * copy_dir = "copy_D2D";
|
||||
if (src_type == GGML_BACKEND_DEVICE_TYPE_CPU && dst_type != GGML_BACKEND_DEVICE_TYPE_CPU) {
|
||||
copy_dir = "copy_H2D";
|
||||
} else if (src_type != GGML_BACKEND_DEVICE_TYPE_CPU &&
|
||||
dst_type == GGML_BACKEND_DEVICE_TYPE_CPU) {
|
||||
copy_dir = "copy_D2H";
|
||||
}
|
||||
|
||||
sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id,
|
||||
split_id, copy_start, copy_end, ggml_nbytes(input), input->name,
|
||||
{input->ne[0], input->ne[1], input->ne[2], input->ne[3]}, {0}, {0} });
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1625,6 +1759,32 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
|||
}
|
||||
}
|
||||
|
||||
// Profiling: collect records from all backends and append to accumulated records
|
||||
if (sched->profiling_enabled) {
|
||||
// Collect backend operation records
|
||||
for (int b = 0; b < sched->n_backends; b++) {
|
||||
ggml_backend_t backend = sched->backends[b];
|
||||
if (backend->profiler != NULL && backend->profiler->get_records != NULL) {
|
||||
const ggml_profile_record * backend_recs = NULL;
|
||||
int count = backend->profiler->get_records(backend->profiler->context, &backend_recs);
|
||||
for (int r = 0; r < count; r++) {
|
||||
ggml_profile_record rec = backend_recs[r];
|
||||
rec.backend_id = b; // stamp correct scheduler backend index
|
||||
sched->profiling_records.push_back(rec);
|
||||
}
|
||||
// Reset backend records (but keep profiling enabled for next compute)
|
||||
if (backend->profiler->reset != NULL) {
|
||||
backend->profiler->reset(backend->profiler->context);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Append copy records
|
||||
for (const auto & rec : sched->copy_records) {
|
||||
sched->profiling_records.push_back(rec);
|
||||
}
|
||||
}
|
||||
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
|
|
@ -1691,6 +1851,24 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
|||
|
||||
sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
|
||||
sched->op_offload = op_offload;
|
||||
const char * profile_env = getenv("GGML_PROFILE");
|
||||
if (profile_env != NULL) {
|
||||
sched->profiling_enabled = true;
|
||||
sched->profiling_env_path = profile_env;
|
||||
}
|
||||
|
||||
// Cache backend metadata for safe access during auto-export
|
||||
for (int b = 0; b < n_backends; b++) {
|
||||
ggml_backend_sched::backend_meta meta;
|
||||
meta.name = ggml_backend_name(backends[b]);
|
||||
meta.device = "unknown";
|
||||
meta.device_type = 0;
|
||||
if (backends[b]->device != NULL) {
|
||||
meta.device = ggml_backend_dev_name(backends[b]->device);
|
||||
meta.device_type = (int) ggml_backend_dev_type(backends[b]->device);
|
||||
}
|
||||
sched->profiling_backend_meta.push_back(std::move(meta));
|
||||
}
|
||||
|
||||
ggml_backend_sched_reset(sched);
|
||||
|
||||
|
|
@ -1701,6 +1879,33 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
|
|||
if (sched == NULL) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Auto-export profiling data if enabled via GGML_PROFILE env var
|
||||
// GGML_PROFILE=1 or GGML_PROFILE="" → print to stdout
|
||||
// GGML_PROFILE=file.json → export JSON
|
||||
// GGML_PROFILE=file.txt → export text
|
||||
if (!sched->profiling_records.empty() && getenv("GGML_PROFILE") != NULL) {
|
||||
const std::string & path = sched->profiling_env_path;
|
||||
if (path.empty() || path == "1" || path == "stdout") {
|
||||
ggml_backend_sched_print_profiling(sched);
|
||||
} else if (path.size() >= 4 && path.compare(path.size() - 4, 4, ".txt") == 0) {
|
||||
int ret = ggml_backend_sched_export_profiling_text(sched, path.c_str());
|
||||
if (ret == 0) {
|
||||
GGML_LOG_INFO("[profiler] Data exported to: %s\n", path.c_str());
|
||||
} else {
|
||||
GGML_LOG_ERROR("[profiler] Failed to export data to: %s\n", path.c_str());
|
||||
}
|
||||
} else {
|
||||
// Default to JSON for any other path (including .json)
|
||||
int ret = ggml_backend_sched_export_profiling_json(sched, path.c_str());
|
||||
if (ret == 0) {
|
||||
GGML_LOG_INFO("[profiler] Data exported to: %s\n", path.c_str());
|
||||
} else {
|
||||
GGML_LOG_ERROR("[profiler] Failed to export data to: %s\n", path.c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int b = 0; b < sched->n_backends; b++) {
|
||||
for (int c = 0; c < sched->n_copies; c++) {
|
||||
ggml_backend_event_free(sched->events[b][c]);
|
||||
|
|
@ -2268,3 +2473,475 @@ ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size)
|
|||
GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
|
||||
return ggml_backend_buffer_init(ggml_backend_cpu_buffer_from_ptr_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
|
||||
}
|
||||
|
||||
//
|
||||
// Scheduler profiling
|
||||
//
|
||||
|
||||
// Enable or disable profiling on the scheduler.
// Disabling also discards any records accumulated so far.
void ggml_backend_sched_set_profiling(ggml_backend_sched_t sched, bool enable) {
    GGML_ASSERT(sched);

    sched->profiling_enabled = enable;
    if (enable) {
        return;
    }

    // turning profiling off drops previously collected data
    ggml_backend_sched_reset_profiling(sched);
}
|
||||
|
||||
// Report whether profiling is currently enabled on the scheduler.
bool ggml_backend_sched_get_profiling(ggml_backend_sched_t sched) {
    GGML_ASSERT(sched);

    const bool enabled = sched->profiling_enabled;
    return enabled;
}
|
||||
|
||||
// Expose the accumulated profiling records.
// *records points at scheduler-owned storage (valid until the next compute or
// reset); the return value is the number of records.
int ggml_backend_sched_get_profiling_records(ggml_backend_sched_t sched, const ggml_profile_record ** records) {
    GGML_ASSERT(sched);
    GGML_ASSERT(records != NULL);

    const auto & recs = sched->profiling_records;

    *records = recs.data();
    return (int) recs.size();
}
|
||||
|
||||
// Discard all accumulated profiling data (both the merged per-op records and
// the scheduler-side tensor-copy records). Does not change the enabled flag.
void ggml_backend_sched_reset_profiling(ggml_backend_sched_t sched) {
    GGML_ASSERT(sched);

    sched->copy_records.clear();
    sched->profiling_records.clear();
}
|
||||
|
||||
// Print an aggregated profiling summary to the log.
// Records are grouped by (op name, event type, backend id); groups are printed
// sorted by total time descending, with count/avg/min/max, an optional
// bandwidth figure derived from bytes/duration, and a representative src0
// shape taken from the first record seen for the group.
void ggml_backend_sched_print_profiling(ggml_backend_sched_t sched) {
    GGML_ASSERT(sched);

    if (sched->profiling_records.empty()) {
        GGML_LOG_INFO("[profiler] No profiling data available\n");
        return;
    }

    GGML_LOG_INFO("\n=== Profiling Summary ===\n");

    // Aggregate by (name, type, backend_id)
    struct op_stats {
        const char * name;
        enum ggml_profile_event_type type;
        int backend_id;
        uint64_t total_ns;
        uint64_t min_ns;
        uint64_t max_ns;
        int count;
        uint64_t total_bytes;
        int64_t representative_ne[4]; // src0 shape of the first record in the group
    };

    std::vector<op_stats> stats;
    for (const auto & rec : sched->profiling_records) {
        // clamp inverted timestamps (clock anomalies) to zero duration
        const uint64_t dur = (rec.end_ns > rec.start_ns) ? (rec.end_ns - rec.start_ns) : 0;
        bool found = false;
        for (auto & s : stats) {
            if (s.type == rec.type && s.backend_id == rec.backend_id && strcmp(s.name, rec.name) == 0) {
                s.total_ns += dur;
                s.min_ns = std::min(s.min_ns, dur);
                s.max_ns = std::max(s.max_ns, dur);
                s.count++;
                s.total_bytes += rec.bytes;
                found = true;
                break;
            }
        }
        if (!found) {
            op_stats s;
            s.name = rec.name;
            s.type = rec.type;
            s.backend_id = rec.backend_id;
            s.total_ns = dur;
            s.min_ns = dur;
            s.max_ns = dur;
            s.count = 1;
            s.total_bytes = rec.bytes;
            memcpy(s.representative_ne, rec.ne_src0, sizeof(s.representative_ne));
            stats.push_back(s);
        }
    }

    // Sort by total time descending
    std::sort(stats.begin(), stats.end(),
              [](const op_stats & a, const op_stats & b) { return a.total_ns > b.total_ns; });

    uint64_t grand_total = 0;
    for (const auto & s : stats) {
        grand_total += s.total_ns;
    }

    const char * type_str[] = { "OP ", "COPY" };
    for (const auto & s : stats) {
        // guard grand_total == 0 (all durations clamped to zero) to avoid
        // printing inf/nan percentages — matches the text writer's behavior
        double pct = grand_total > 0 ? 100.0 * (double) s.total_ns / (double) grand_total : 0.0;
        double avg_us = (double) s.total_ns / (double) s.count / 1000.0;
        double min_us = (double) s.min_ns / 1000.0;
        double max_us = (double) s.max_ns / 1000.0;

        GGML_LOG_INFO(
            "  [%s] backend %d %-28s %7.1f%%  count=%-6d  total=%8.2f ms  avg=%8.2f us  min=%8.2f us  max=%8.2f us",
            type_str[s.type], s.backend_id, s.name, pct, s.count, (double) s.total_ns / 1e6, avg_us, min_us, max_us);
        if (s.total_bytes > 0 && s.total_ns > 0) {
            // bytes per nanosecond == GB/s
            double bw_gbps = (double) s.total_bytes / (double) s.total_ns;
            if (bw_gbps >= 1000.0) {
                GGML_LOG_INFO("  %6.2f TB/s", bw_gbps / 1000.0);
            } else {
                GGML_LOG_INFO("  %6.2f GB/s", bw_gbps);
            }
        }
        // Print representative tensor shape (first record's ne)
        if (s.representative_ne[0] > 0 || s.representative_ne[1] > 0) {
            GGML_LOG_INFO("  [%lld x %lld", (long long) s.representative_ne[0], (long long) s.representative_ne[1]);
            if (s.representative_ne[2] > 1) {
                GGML_LOG_INFO(" x %lld", (long long) s.representative_ne[2]);
            }
            if (s.representative_ne[3] > 1) {
                GGML_LOG_INFO(" x %lld", (long long) s.representative_ne[3]);
            }
            GGML_LOG_INFO("]");
        }
        GGML_LOG_INFO("\n");
    }

    GGML_LOG_INFO("  ---\n");
    GGML_LOG_INFO("  Total: %.2f ms (%d records, %d unique ops)\n\n", (double) grand_total / 1e6,
                  (int) sched->profiling_records.size(), (int) stats.size());
}
|
||||
|
||||
// Write all profiling data as a JSON document to fp.
// Layout: { version, profiler, total_records, total_ns, backends: [...],
// records: [...] }. String values (op/tensor names, "extra", backend/device
// names) are escaped per RFC 8259 so that quotes, backslashes or control
// characters in a name cannot produce an invalid document. Returns 0.
int ggml_backend_sched_write_profiling_json(ggml_backend_sched_t sched, FILE * fp) {
    GGML_ASSERT(sched);
    GGML_ASSERT(fp != NULL);

    // emit the contents of a JSON string with mandatory escapes applied
    auto write_escaped = [fp](const char * s) {
        for (const char * p = s; *p != '\0'; p++) {
            const unsigned char c = (unsigned char) *p;
            switch (c) {
                case '"':  fputs("\\\"", fp); break;
                case '\\': fputs("\\\\", fp); break;
                case '\n': fputs("\\n", fp);  break;
                case '\r': fputs("\\r", fp);  break;
                case '\t': fputs("\\t", fp);  break;
                default:
                    if (c < 0x20) {
                        fprintf(fp, "\\u%04x", c); // other control chars
                    } else {
                        fputc(c, fp);
                    }
                    break;
            }
        }
    };

    uint64_t total_ns = 0;
    for (const auto & rec : sched->profiling_records) {
        total_ns += (rec.end_ns > rec.start_ns) ? (rec.end_ns - rec.start_ns) : 0;
    }

    fprintf(fp, "{\n");
    fprintf(fp, "  \"version\": 2,\n");
    fprintf(fp, "  \"profiler\": \"ggml\",\n");
    fprintf(fp, "  \"total_records\": %d,\n", (int) sched->profiling_records.size());
    fprintf(fp, "  \"total_ns\": %llu,\n", (unsigned long long) total_ns);

    // Backend metadata (use cached data if available, fall back to live pointers)
    fprintf(fp, "  \"backends\": [\n");
    for (int b = 0; b < sched->n_backends; b++) {
        const char * name = "unknown";
        const char * dev_name = "unknown";
        int dev_type = 0;
        if (b < (int) sched->profiling_backend_meta.size()) {
            name     = sched->profiling_backend_meta[b].name.c_str();
            dev_name = sched->profiling_backend_meta[b].device.c_str();
            dev_type = sched->profiling_backend_meta[b].device_type;
        } else if (sched->backends[b] != NULL) {
            name = ggml_backend_name(sched->backends[b]);
            if (sched->backends[b]->device != NULL) {
                dev_name = ggml_backend_dev_name(sched->backends[b]->device);
                dev_type = (int) ggml_backend_dev_type(sched->backends[b]->device);
            }
        }
        fprintf(fp, "    {\"id\": %d, \"name\": \"", b);
        write_escaped(name);
        fprintf(fp, "\", \"device\": \"");
        write_escaped(dev_name);
        fprintf(fp, "\", \"device_type\": %d}%s\n", dev_type, (b < sched->n_backends - 1) ? "," : "");
    }
    fprintf(fp, "  ],\n");

    // Records
    fprintf(fp, "  \"records\": [\n");

    for (int i = 0; i < (int) sched->profiling_records.size(); i++) {
        const auto & rec = sched->profiling_records[i];
        const uint64_t duration_ns = (rec.end_ns > rec.start_ns) ? (rec.end_ns - rec.start_ns) : 0;

        fprintf(fp, "    {\"type\": %d, \"name\": \"", (int) rec.type);
        write_escaped(rec.name ? rec.name : "unknown");
        fprintf(fp,
                "\", \"backend_id\": %d, \"split_id\": %d, "
                "\"start_ns\": %llu, \"duration_ns\": %llu, \"bytes\": %llu, \"extra\": ",
                rec.backend_id, rec.split_id,
                (unsigned long long) rec.start_ns, (unsigned long long) duration_ns, (unsigned long long) rec.bytes);

        if (rec.extra != NULL) {
            fputc('"', fp);
            write_escaped(rec.extra);
            fputc('"', fp);
        } else {
            fprintf(fp, "null");
        }

        // Tensor dimensions (all source tensors)
        fprintf(fp, ", \"ne_src0\": [%lld, %lld, %lld, %lld]", (long long) rec.ne_src0[0], (long long) rec.ne_src0[1],
                (long long) rec.ne_src0[2], (long long) rec.ne_src0[3]);
        fprintf(fp, ", \"ne_src1\": [%lld, %lld, %lld, %lld]", (long long) rec.ne_src1[0], (long long) rec.ne_src1[1],
                (long long) rec.ne_src1[2], (long long) rec.ne_src1[3]);
        fprintf(fp, ", \"ne_src2\": [%lld, %lld, %lld, %lld]", (long long) rec.ne_src2[0], (long long) rec.ne_src2[1],
                (long long) rec.ne_src2[2], (long long) rec.ne_src2[3]);

        fprintf(fp, "}%s\n", (i < (int) sched->profiling_records.size() - 1) ? "," : "");
    }

    fprintf(fp, "  ]\n");
    fprintf(fp, "}\n");

    return 0;
}
|
||||
|
||||
// Export the profiling data as JSON to the given file path.
// Returns 0 on success, -1 if the file could not be opened for writing.
int ggml_backend_sched_export_profiling_json(ggml_backend_sched_t sched, const char * filepath) {
    GGML_ASSERT(sched);
    GGML_ASSERT(filepath != NULL);

    FILE * fp = fopen(filepath, "w");
    if (!fp) {
        GGML_LOG_ERROR("%s: failed to open %s for writing\n", __func__, filepath);
        return -1;
    }

    const int ret = ggml_backend_sched_write_profiling_json(sched, fp);
    fclose(fp);
    return ret;
}
|
||||
|
||||
// Helper: format ne dimensions as string, e.g. "[4096, 4096, 1]"
// Trailing dimensions <= 1 are dropped (at least one dimension is kept); an
// all-zero shape yields the empty string. Output is truncated to fit bufsize
// and is always NUL-terminated (for bufsize >= 1).
//
// Fix: snprintf returns the would-be length, so `pos` could exceed bufsize on
// truncation (four large int64 dims need ~90 chars); the final snprintf then
// received an out-of-range pointer and a wrapped-around size_t, overflowing
// the buffer. `pos` is now clamped to bufsize-1 after every append.
static void fmt_ne(char * buf, size_t bufsize, const int64_t ne[4]) {
    if (bufsize == 0) {
        return;
    }
    if (ne[0] == 0 && ne[1] == 0 && ne[2] == 0 && ne[3] == 0) {
        buf[0] = '\0';
        return;
    }
    int ndims = 4;
    while (ndims > 1 && ne[ndims - 1] <= 1) {
        ndims--;
    }
    const int cap = (int) bufsize - 1; // max valid write offset
    int pos = snprintf(buf, bufsize, "[");
    if (pos > cap) {
        pos = cap;
    }
    for (int i = 0; i < ndims && pos < cap; i++) {
        pos += snprintf(buf + pos, bufsize - pos, "%s%lld", i > 0 ? ", " : "", (long long) ne[i]);
        if (pos > cap) {
            pos = cap; // snprintf reported truncation — clamp before next use
        }
    }
    snprintf(buf + pos, bufsize - pos, "]");
}
|
||||
|
||||
// Helper: render a bytes/nanoseconds ratio as a bandwidth string.
// bytes-per-ns equals GB/s; rates of 1000 GB/s and above are shown as TB/s.
// Writes the empty string when either input is zero (no meaningful rate).
static void fmt_bandwidth(char * buf, size_t bufsize, uint64_t bytes, uint64_t ns) {
    if (bytes == 0 || ns == 0) {
        buf[0] = '\0';
        return;
    }
    const double gb_per_s = (double) bytes / (double) ns;
    const bool   use_tb   = gb_per_s >= 1000.0;
    snprintf(buf, bufsize, "%.2f %s", use_tb ? gb_per_s / 1000.0 : gb_per_s, use_tb ? "TB/s" : "GB/s");
}
|
||||
|
||||
// Write a human-readable profiling report to fp in three sections:
//   1. overall summary (total time, record count, unique ops)
//   2. per-backend breakdown (OP vs COPY time, calls, bandwidth)
//   3. per-operation table sorted by total time descending
// Returns 0 on success.
//
// Fix: the tensors_buf assembly accumulated raw snprintf return values; on
// truncation `sizeof(tensors_buf) - tpos` would underflow to a huge size_t.
// tpos is now clamped after every append (same hardening as fmt_ne).
int ggml_backend_sched_write_profiling_text(ggml_backend_sched_t sched, FILE * fp) {
    GGML_ASSERT(sched);
    GGML_ASSERT(fp != NULL);

    if (sched->profiling_records.empty()) {
        fprintf(fp, "No profiling data available.\n");
        return 0;
    }

    // Aggregate by (name, type, backend_id)
    struct op_stats {
        const char * name;
        enum ggml_profile_event_type type;
        int backend_id;
        uint64_t total_ns;
        uint64_t min_ns;
        uint64_t max_ns;
        int count;
        uint64_t total_bytes;
        // shapes from the first record seen for the group
        int64_t representative_ne_src0[4];
        int64_t representative_ne_src1[4];
        int64_t representative_ne_src2[4];
    };

    std::vector<op_stats> stats;
    for (const auto & rec : sched->profiling_records) {
        // clamp inverted timestamps to zero duration
        uint64_t dur = (rec.end_ns > rec.start_ns) ? (rec.end_ns - rec.start_ns) : 0;
        bool found = false;
        for (auto & s : stats) {
            if (s.type == rec.type && s.backend_id == rec.backend_id && strcmp(s.name, rec.name) == 0) {
                s.total_ns += dur;
                s.min_ns = std::min(s.min_ns, dur);
                s.max_ns = std::max(s.max_ns, dur);
                s.count++;
                s.total_bytes += rec.bytes;
                found = true;
                break;
            }
        }
        if (!found) {
            op_stats s = {};
            s.name = rec.name;
            s.type = rec.type;
            s.backend_id = rec.backend_id;
            s.total_ns = dur;
            s.min_ns = dur;
            s.max_ns = dur;
            s.count = 1;
            s.total_bytes = rec.bytes;
            memcpy(s.representative_ne_src0, rec.ne_src0, sizeof(s.representative_ne_src0));
            memcpy(s.representative_ne_src1, rec.ne_src1, sizeof(s.representative_ne_src1));
            memcpy(s.representative_ne_src2, rec.ne_src2, sizeof(s.representative_ne_src2));
            stats.push_back(s);
        }
    }

    std::sort(stats.begin(), stats.end(),
              [](const op_stats & a, const op_stats & b) { return a.total_ns > b.total_ns; });

    uint64_t grand_total = 0;
    for (const auto & s : stats) {
        grand_total += s.total_ns;
    }

    // --- Section 1: Overall summary ---
    fprintf(fp, "=== Profiling Summary ===\n");
    fprintf(fp, "Total time: %.2f ms\n", (double) grand_total / 1e6);
    fprintf(fp, "Total records: %d\n", (int) sched->profiling_records.size());
    fprintf(fp, "Unique ops: %d\n\n", (int) stats.size());

    // --- Section 2: Per-backend breakdown ---
    fprintf(fp, "=== Per-Backend Summary ===\n");
    {
        struct backend_stats {
            int backend_id;
            int op_count;
            int copy_count;
            uint64_t op_ns;
            uint64_t copy_ns;
            uint64_t op_bytes;
            uint64_t copy_bytes;
        };
        std::vector<backend_stats> bstats;
        for (const auto & s : stats) {
            bool found = false;
            for (auto & bs : bstats) {
                if (bs.backend_id == s.backend_id) {
                    if (s.type == GGML_PROFILE_EVENT_OP) {
                        bs.op_count += s.count;
                        bs.op_ns += s.total_ns;
                        bs.op_bytes += s.total_bytes;
                    } else {
                        bs.copy_count += s.count;
                        bs.copy_ns += s.total_ns;
                        bs.copy_bytes += s.total_bytes;
                    }
                    found = true;
                    break;
                }
            }
            if (!found) {
                backend_stats bs = {};
                bs.backend_id = s.backend_id;
                if (s.type == GGML_PROFILE_EVENT_OP) {
                    bs.op_count = s.count;
                    bs.op_ns = s.total_ns;
                    bs.op_bytes = s.total_bytes;
                } else {
                    bs.copy_count = s.count;
                    bs.copy_ns = s.total_ns;
                    bs.copy_bytes = s.total_bytes;
                }
                bstats.push_back(bs);
            }
        }
        std::sort(bstats.begin(), bstats.end(),
                  [](const backend_stats & a, const backend_stats & b) {
                      return (a.op_ns + a.copy_ns) > (b.op_ns + b.copy_ns);
                  });

        for (const auto & bs : bstats) {
            uint64_t total = bs.op_ns + bs.copy_ns;
            double pct = grand_total > 0 ? 100.0 * (double) total / (double) grand_total : 0;

            // prefer cached backend metadata; fall back to live backend pointers
            const char * bname = "unknown";
            if (bs.backend_id >= 0 && bs.backend_id < (int) sched->profiling_backend_meta.size()) {
                bname = sched->profiling_backend_meta[bs.backend_id].name.c_str();
            } else if (bs.backend_id >= 0 && bs.backend_id < sched->n_backends && sched->backends[bs.backend_id] != NULL) {
                bname = ggml_backend_name(sched->backends[bs.backend_id]);
            }

            fprintf(fp, "  Backend %d (%s): %.2f ms (%.1f%%)\n", bs.backend_id, bname, (double) total / 1e6, pct);
            if (bs.op_count > 0) {
                char bw_buf[32];
                fmt_bandwidth(bw_buf, sizeof(bw_buf), bs.op_bytes, bs.op_ns);
                fprintf(fp, "    OPs:   %d calls, %.2f ms", bs.op_count, (double) bs.op_ns / 1e6);
                if (bw_buf[0]) {
                    fprintf(fp, ", %s", bw_buf);
                }
                fprintf(fp, "\n");
            }
            if (bs.copy_count > 0) {
                char bw_buf[32];
                fmt_bandwidth(bw_buf, sizeof(bw_buf), bs.copy_bytes, bs.copy_ns);
                fprintf(fp, "    COPYs: %d calls, %.2f ms", bs.copy_count, (double) bs.copy_ns / 1e6);
                if (bw_buf[0]) {
                    fprintf(fp, ", %s", bw_buf);
                }
                fprintf(fp, "\n");
            }
        }
    }
    fprintf(fp, "\n");

    // --- Section 3: Detailed operation table ---
    fprintf(fp, "=== Operations (sorted by total time) ===\n");
    fprintf(fp, "%-5s %4s %-28s %7s %6s %10s %10s %10s %10s %12s %s\n",
            "TYPE", "BKND", "Operation", "%Time", "Count", "Total(ms)", "Avg(us)", "Min(us)", "Max(us)", "Bandwidth", "Tensors");
    fprintf(fp, "%-5s %4s %-28s %7s %6s %10s %10s %10s %10s %12s %s\n",
            "-----", "----", "----------------------------", "-------", "------",
            "----------", "----------", "----------", "----------", "------------", "-------");

    const char * type_str[] = { "OP", "COPY" };
    for (const auto & s : stats) {
        double pct = grand_total > 0 ? 100.0 * (double) s.total_ns / (double) grand_total : 0;
        double avg_us = (double) s.total_ns / (double) s.count / 1000.0;
        double min_us = (double) s.min_ns / 1000.0;
        double max_us = (double) s.max_ns / 1000.0;

        char bw_buf[32] = "";
        fmt_bandwidth(bw_buf, sizeof(bw_buf), s.total_bytes, s.total_ns);

        char ne0_buf[64];
        char ne1_buf[64];
        char ne2_buf[64];
        fmt_ne(ne0_buf, sizeof(ne0_buf), s.representative_ne_src0);
        fmt_ne(ne1_buf, sizeof(ne1_buf), s.representative_ne_src1);
        fmt_ne(ne2_buf, sizeof(ne2_buf), s.representative_ne_src2);

        // Build tensor shapes string; clamp tpos after every append so the
        // remaining-size argument can never underflow on truncation
        char tensors_buf[256] = "";
        int tpos = 0;
        const int tcap = (int) sizeof(tensors_buf) - 1;
        if (ne0_buf[0] && tpos < tcap) {
            tpos += snprintf(tensors_buf + tpos, sizeof(tensors_buf) - tpos, "%s", ne0_buf);
            tpos = std::min(tpos, tcap);
        }
        if (ne1_buf[0] && tpos < tcap) {
            tpos += snprintf(tensors_buf + tpos, sizeof(tensors_buf) - tpos, " x %s", ne1_buf);
            tpos = std::min(tpos, tcap);
        }
        if (ne2_buf[0] && tpos < tcap) {
            tpos += snprintf(tensors_buf + tpos, sizeof(tensors_buf) - tpos, " x %s", ne2_buf);
            tpos = std::min(tpos, tcap);
        }

        fprintf(fp, "%-5s %4d %-28s %6.1f%% %6d %10.2f %10.2f %10.2f %10.2f %12s %s\n",
                type_str[s.type], s.backend_id, s.name, pct, s.count,
                (double) s.total_ns / 1e6, avg_us, min_us, max_us,
                bw_buf, tensors_buf);
    }

    fprintf(fp, "\nTotal: %.2f ms (%d records, %d unique ops)\n", (double) grand_total / 1e6,
            (int) sched->profiling_records.size(), (int) stats.size());

    return 0;
}
|
||||
|
||||
// Export the profiling data as a plain-text report to the given file path.
// Returns 0 on success, -1 if the file could not be opened for writing.
int ggml_backend_sched_export_profiling_text(ggml_backend_sched_t sched, const char * filepath) {
    GGML_ASSERT(sched);
    GGML_ASSERT(filepath != NULL);

    FILE * fp = fopen(filepath, "w");
    if (!fp) {
        GGML_LOG_ERROR("%s: failed to open %s for writing\n", __func__, filepath);
        return -1;
    }

    const int ret = ggml_backend_sched_write_profiling_text(sched, fp);
    fclose(fp);
    return ret;
}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
#include "ggml-impl.h"
|
||||
#include "ggml-blas.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-profiler.h"
|
||||
|
||||
#include <future>
|
||||
#include <vector>
|
||||
|
|
@ -25,6 +26,11 @@ struct ggml_backend_blas_context {
|
|||
#ifndef GGML_USE_OPENMP
|
||||
std::vector<std::future<void>> tasks;
|
||||
#endif
|
||||
|
||||
// Profiling state
|
||||
bool profiling_enabled = false;
|
||||
int profiling_split_id = -1;
|
||||
std::vector<ggml_profile_record> profiling_records;
|
||||
};
|
||||
|
||||
static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) {
|
||||
|
|
@ -232,6 +238,18 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend,
|
|||
continue;
|
||||
}
|
||||
|
||||
// Skip view/identity ops
|
||||
if (node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_VIEW ||
|
||||
node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Profiling: time this operation
|
||||
uint64_t t_start = 0;
|
||||
if (ctx->profiling_enabled) {
|
||||
t_start = ggml_profiler_time_ns();
|
||||
}
|
||||
|
||||
switch (node->op) {
|
||||
case GGML_OP_MUL_MAT:
|
||||
ggml_backend_blas_mul_mat(ctx, node);
|
||||
|
|
@ -241,16 +259,26 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend,
|
|||
ggml_backend_blas_out_prod(ctx, node);
|
||||
break;
|
||||
|
||||
case GGML_OP_NONE:
|
||||
case GGML_OP_RESHAPE:
|
||||
case GGML_OP_VIEW:
|
||||
case GGML_OP_PERMUTE:
|
||||
case GGML_OP_TRANSPOSE:
|
||||
break;
|
||||
|
||||
default:
|
||||
GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node));
|
||||
}
|
||||
|
||||
if (ctx->profiling_enabled) {
|
||||
uint64_t t_end = ggml_profiler_time_ns();
|
||||
ggml_profile_record rec;
|
||||
rec.type = GGML_PROFILE_EVENT_OP;
|
||||
rec.name = ggml_op_name(node->op);
|
||||
rec.backend_id = 0;
|
||||
rec.split_id = ctx->profiling_split_id;
|
||||
rec.start_ns = t_start;
|
||||
rec.end_ns = t_end;
|
||||
rec.bytes = ggml_nbytes(node);
|
||||
rec.extra = NULL;
|
||||
if (node->src[0]) { memcpy(rec.ne_src0, node->src[0]->ne, sizeof(rec.ne_src0)); } else { memset(rec.ne_src0, 0, sizeof(rec.ne_src0)); }
|
||||
if (node->src[1]) { memcpy(rec.ne_src1, node->src[1]->ne, sizeof(rec.ne_src1)); } else { memset(rec.ne_src1, 0, sizeof(rec.ne_src1)); }
|
||||
if (node->op == GGML_OP_MUL_MAT_ID && node->src[2]) { memcpy(rec.ne_src2, node->src[2]->ne, sizeof(rec.ne_src2)); } else { memset(rec.ne_src2, 0, sizeof(rec.ne_src2)); }
|
||||
ctx->profiling_records.push_back(rec);
|
||||
}
|
||||
}
|
||||
|
||||
return GGML_STATUS_SUCCESS;
|
||||
|
|
@ -284,10 +312,11 @@ ggml_backend_t ggml_backend_blas_init(void) {
|
|||
ggml_backend_blas_context * ctx = new ggml_backend_blas_context;
|
||||
|
||||
ggml_backend_t backend = new ggml_backend {
|
||||
/* .guid = */ ggml_backend_blas_guid(),
|
||||
/* .iface = */ blas_backend_i,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0),
|
||||
/* .context = */ ctx,
|
||||
/* .guid = */ ggml_backend_blas_guid(),
|
||||
/* .iface = */ blas_backend_i,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0),
|
||||
/* .context = */ ctx,
|
||||
/* .profiler = */ nullptr,
|
||||
};
|
||||
|
||||
#if defined(GGML_BLAS_USE_OPENBLAS) && defined(GGML_USE_OPENMP)
|
||||
|
|
@ -300,6 +329,44 @@ ggml_backend_t ggml_backend_blas_init(void) {
|
|||
GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__);
|
||||
#endif
|
||||
|
||||
// Register profiler
|
||||
ggml_backend_blas_context * blas_ctx = ctx; // ctx is already defined above
|
||||
|
||||
static auto blas_prof_enable = [](void * ctx, bool enable) {
|
||||
auto * bctx = (ggml_backend_blas_context *) ctx;
|
||||
bctx->profiling_enabled = enable;
|
||||
if (!enable) {
|
||||
bctx->profiling_records.clear();
|
||||
}
|
||||
};
|
||||
static auto blas_prof_reset = [](void * ctx) {
|
||||
auto * bctx = (ggml_backend_blas_context *) ctx;
|
||||
bctx->profiling_records.clear();
|
||||
bctx->profiling_split_id = -1;
|
||||
};
|
||||
static auto blas_prof_set_split_id = [](void * ctx, int split_id) {
|
||||
auto * bctx = (ggml_backend_blas_context *) ctx;
|
||||
bctx->profiling_split_id = split_id;
|
||||
};
|
||||
static auto blas_prof_get_records = [](void * ctx, const ggml_profile_record ** out) -> int {
|
||||
auto * bctx = (ggml_backend_blas_context *) ctx;
|
||||
*out = bctx->profiling_records.data();
|
||||
return (int) bctx->profiling_records.size();
|
||||
};
|
||||
static auto blas_prof_free = [](void * ctx) {
|
||||
(void) ctx;
|
||||
};
|
||||
|
||||
auto * profiler = new ggml_backend_profiler{
|
||||
/* .context = */ blas_ctx,
|
||||
/* .enable = */ blas_prof_enable,
|
||||
/* .reset = */ blas_prof_reset,
|
||||
/* .set_split_id = */ blas_prof_set_split_id,
|
||||
/* .get_records = */ blas_prof_get_records,
|
||||
/* .free_context = */ blas_prof_free,
|
||||
};
|
||||
ggml_backend_set_profiler(backend, profiler);
|
||||
|
||||
return backend;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -2985,7 +2985,8 @@ ggml_backend_t ggml_backend_cann_init(int32_t device) {
|
|||
new ggml_backend{ /* .guid = */ ggml_backend_cann_guid(),
|
||||
/* .interface = */ ggml_backend_cann_interface,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device),
|
||||
/* .context = */ ctx };
|
||||
/* .context = */ ctx,
|
||||
/* .profiler = */ nullptr };
|
||||
|
||||
return cann_backend;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@
|
|||
#include "traits.h"
|
||||
#include "ggml-cpu-impl.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-profiler.h"
|
||||
#include "quants.h"
|
||||
#include "ggml-threading.h"
|
||||
#include "unary-ops.h"
|
||||
|
|
@ -1159,8 +1160,8 @@ static void ggml_compute_forward_mul_mat_one_chunk(
|
|||
|
||||
const bool src1_cont = ggml_is_contiguous(src1);
|
||||
|
||||
ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
|
||||
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
|
||||
const ggml_vec_dot_t vec_dot = type_traits_cpu[type].vec_dot;
|
||||
const enum ggml_type vec_dot_type = type_traits_cpu[type].vec_dot_type;
|
||||
|
||||
// broadcast factors
|
||||
const int64_t r2 = ne12 / ne02;
|
||||
|
|
@ -1244,9 +1245,9 @@ void ggml_compute_forward_mul_mat(
|
|||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
enum ggml_type const vec_dot_type = type_traits_cpu[src0->type].vec_dot_type;
|
||||
ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float;
|
||||
int64_t const vec_dot_num_rows = type_traits_cpu[src0->type].nrows;
|
||||
const enum ggml_type vec_dot_type = type_traits_cpu[src0->type].vec_dot_type;
|
||||
const ggml_from_float_t from_float = type_traits_cpu[vec_dot_type].from_float;
|
||||
const int64_t vec_dot_num_rows = type_traits_cpu[src0->type].nrows;
|
||||
|
||||
GGML_ASSERT(ne0 == ne01);
|
||||
GGML_ASSERT(ne1 == ne11);
|
||||
|
|
@ -1455,8 +1456,8 @@ static void ggml_compute_forward_mul_mat_id_one_chunk(
|
|||
|
||||
const enum ggml_type type = src0->type;
|
||||
|
||||
ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
|
||||
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
|
||||
const ggml_vec_dot_t vec_dot = type_traits_cpu[type].vec_dot;
|
||||
const enum ggml_type vec_dot_type = type_traits_cpu[type].vec_dot_type;
|
||||
|
||||
const int64_t blck_0 = 16;
|
||||
const int64_t blck_1 = 16;
|
||||
|
|
@ -1523,8 +1524,8 @@ static void ggml_compute_forward_mul_mat_id(
|
|||
|
||||
const bool src1_cont = ggml_is_contiguous(src1);
|
||||
|
||||
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
|
||||
ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float;
|
||||
const enum ggml_type vec_dot_type = type_traits_cpu[type].vec_dot_type;
|
||||
const ggml_from_float_t from_float = type_traits_cpu[vec_dot_type].from_float;
|
||||
|
||||
// we don't support permuted src0 or src1
|
||||
GGML_ASSERT(nb00 == ggml_type_size(type));
|
||||
|
|
@ -2977,28 +2978,73 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|||
GGML_PRINT_DEBUG("thread #%d compute-start cplan %p last-graph %d\n", state->ith, (const void *)cplan, state->last_graph);
|
||||
#endif
|
||||
|
||||
for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
|
||||
struct ggml_tensor * node = cgraph->nodes[node_n];
|
||||
// Profiling state
|
||||
if (cplan->profiling_context != NULL && cplan->profiling_record_fn != NULL) {
|
||||
for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
|
||||
struct ggml_tensor * node = cgraph->nodes[node_n];
|
||||
|
||||
if (ggml_op_is_empty(node->op)) {
|
||||
// skip NOPs
|
||||
continue;
|
||||
if (ggml_op_is_empty(node->op)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Only thread 0 records timing (after barrier = total node time)
|
||||
uint64_t t_start = 0;
|
||||
if (state->ith == 0) {
|
||||
t_start = ggml_profiler_time_ns();
|
||||
}
|
||||
|
||||
ggml_compute_forward(¶ms, node);
|
||||
|
||||
if (node_n + 1 < cgraph->n_nodes) {
|
||||
ggml_barrier(state->threadpool);
|
||||
}
|
||||
|
||||
if (state->ith == 0) {
|
||||
uint64_t t_end = ggml_profiler_time_ns();
|
||||
{
|
||||
static const int64_t zero_ne[4] = {0, 0, 0, 0};
|
||||
const int64_t * src0_ne = node->src[0] ? node->src[0]->ne : zero_ne;
|
||||
const int64_t * src1_ne = node->src[1] ? node->src[1]->ne : zero_ne;
|
||||
const int64_t * src2_ne = (node->op == GGML_OP_MUL_MAT_ID && node->src[2]) ? node->src[2]->ne : zero_ne;
|
||||
cplan->profiling_record_fn(cplan->profiling_context, 0 /* GGML_PROFILE_EVENT_OP */,
|
||||
ggml_op_name(node->op), -1, t_start, t_end, ggml_nbytes(node), NULL,
|
||||
src0_ne, src1_ne, src2_ne);
|
||||
}
|
||||
}
|
||||
|
||||
if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
|
||||
atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed);
|
||||
tp->ec = GGML_STATUS_ABORTED;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
|
||||
struct ggml_tensor * node = cgraph->nodes[node_n];
|
||||
|
||||
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
|
||||
continue;
|
||||
}
|
||||
if (ggml_op_is_empty(node->op)) {
|
||||
// skip NOPs
|
||||
continue;
|
||||
}
|
||||
|
||||
ggml_compute_forward(¶ms, node);
|
||||
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (state->ith == 0 && cplan->abort_callback &&
|
||||
cplan->abort_callback(cplan->abort_callback_data)) {
|
||||
atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed);
|
||||
tp->ec = GGML_STATUS_ABORTED;
|
||||
}
|
||||
ggml_compute_forward(¶ms, node);
|
||||
|
||||
if (node_n + 1 < cgraph->n_nodes) {
|
||||
ggml_barrier(state->threadpool);
|
||||
if (state->ith == 0 && cplan->abort_callback &&
|
||||
cplan->abort_callback(cplan->abort_callback_data)) {
|
||||
atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed);
|
||||
tp->ec = GGML_STATUS_ABORTED;
|
||||
}
|
||||
|
||||
if (node_n + 1 < cgraph->n_nodes) {
|
||||
ggml_barrier(state->threadpool);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
#include "ggml-backend.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-cpu.h"
|
||||
#include "ggml-profiler.h"
|
||||
#include "repack.h"
|
||||
#include "traits.h"
|
||||
#include "ggml-impl.h"
|
||||
|
|
@ -107,6 +108,11 @@ struct ggml_backend_cpu_context {
|
|||
void * abort_callback_data;
|
||||
|
||||
bool use_ref; // use reference implementation
|
||||
|
||||
// Profiling state
|
||||
bool profiling_enabled;
|
||||
int profiling_split_id;
|
||||
std::vector<ggml_profile_record> profiling_records;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) {
|
||||
|
|
@ -167,6 +173,46 @@ static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backe
|
|||
GGML_UNUSED(backend);
|
||||
}
|
||||
|
||||
// Callback function for recording CPU profiling events from C code (ggml-cpu.c)
|
||||
static void ggml_cpu_profiler_record_callback(void * context,
|
||||
int type,
|
||||
const char * name,
|
||||
int split_id,
|
||||
uint64_t start_ns,
|
||||
uint64_t end_ns,
|
||||
uint64_t bytes,
|
||||
const char * extra,
|
||||
const int64_t ne_src0[4],
|
||||
const int64_t ne_src1[4],
|
||||
const int64_t ne_src2[4]) {
|
||||
auto * cpu_ctx = (ggml_backend_cpu_context *) context;
|
||||
ggml_profile_record rec;
|
||||
rec.type = (enum ggml_profile_event_type) type;
|
||||
rec.name = name;
|
||||
rec.backend_id = 0; // will be overwritten by scheduler
|
||||
rec.split_id = split_id != -1 ? split_id : cpu_ctx->profiling_split_id;
|
||||
rec.start_ns = start_ns;
|
||||
rec.end_ns = end_ns;
|
||||
rec.bytes = bytes;
|
||||
rec.extra = extra;
|
||||
if (ne_src0) {
|
||||
memcpy(rec.ne_src0, ne_src0, sizeof(rec.ne_src0));
|
||||
} else {
|
||||
memset(rec.ne_src0, 0, sizeof(rec.ne_src0));
|
||||
}
|
||||
if (ne_src1) {
|
||||
memcpy(rec.ne_src1, ne_src1, sizeof(rec.ne_src1));
|
||||
} else {
|
||||
memset(rec.ne_src1, 0, sizeof(rec.ne_src1));
|
||||
}
|
||||
if (ne_src2) {
|
||||
memcpy(rec.ne_src2, ne_src2, sizeof(rec.ne_src2));
|
||||
} else {
|
||||
memset(rec.ne_src2, 0, sizeof(rec.ne_src2));
|
||||
}
|
||||
cpu_ctx->profiling_records.push_back(rec);
|
||||
}
|
||||
|
||||
static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
||||
|
||||
|
|
@ -187,6 +233,9 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s
|
|||
cplan.abort_callback_data = cpu_ctx->abort_callback_data;
|
||||
cplan.use_ref = cpu_ctx->use_ref;
|
||||
|
||||
cplan.profiling_context = cpu_ctx->profiling_enabled ? cpu_ctx : NULL;
|
||||
cplan.profiling_record_fn = cpu_ctx->profiling_enabled ? ggml_cpu_profiler_record_callback : NULL;
|
||||
|
||||
return ggml_graph_compute(cgraph, &cplan);
|
||||
}
|
||||
|
||||
|
|
@ -228,12 +277,15 @@ ggml_backend_t ggml_backend_cpu_init(void) {
|
|||
ctx->abort_callback = NULL;
|
||||
ctx->abort_callback_data = NULL;
|
||||
ctx->use_ref = false;
|
||||
ctx->profiling_enabled = false;
|
||||
ctx->profiling_split_id = -1;
|
||||
|
||||
ggml_backend_t cpu_backend = new ggml_backend {
|
||||
/* .guid = */ ggml_backend_cpu_guid(),
|
||||
/* .iface = */ ggml_backend_cpu_i,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
|
||||
/* .context = */ ctx,
|
||||
/* .guid = */ ggml_backend_cpu_guid(),
|
||||
/* .iface = */ ggml_backend_cpu_i,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
|
||||
/* .context = */ ctx,
|
||||
/* .profiler = */ nullptr,
|
||||
};
|
||||
|
||||
if (cpu_backend == NULL) {
|
||||
|
|
@ -241,6 +293,43 @@ ggml_backend_t ggml_backend_cpu_init(void) {
|
|||
return NULL;
|
||||
}
|
||||
|
||||
// Register profiler
|
||||
static auto cpu_prof_enable = [](void * ctx, bool enable) {
|
||||
auto * cpu_ctx = (ggml_backend_cpu_context *) ctx;
|
||||
cpu_ctx->profiling_enabled = enable;
|
||||
if (!enable) {
|
||||
cpu_ctx->profiling_records.clear();
|
||||
}
|
||||
};
|
||||
static auto cpu_prof_reset = [](void * ctx) {
|
||||
auto * cpu_ctx = (ggml_backend_cpu_context *) ctx;
|
||||
cpu_ctx->profiling_records.clear();
|
||||
cpu_ctx->profiling_split_id = -1;
|
||||
};
|
||||
static auto cpu_prof_set_split_id = [](void * ctx, int split_id) {
|
||||
auto * cpu_ctx = (ggml_backend_cpu_context *) ctx;
|
||||
cpu_ctx->profiling_split_id = split_id;
|
||||
};
|
||||
static auto cpu_prof_get_records = [](void * ctx, const ggml_profile_record ** out) -> int {
|
||||
auto * cpu_ctx = (ggml_backend_cpu_context *) ctx;
|
||||
*out = cpu_ctx->profiling_records.data();
|
||||
return (int) cpu_ctx->profiling_records.size();
|
||||
};
|
||||
static auto cpu_prof_free = [](void * ctx) {
|
||||
// Nothing to free - records are in the CPU context's vector
|
||||
(void) ctx;
|
||||
};
|
||||
|
||||
auto * profiler = new ggml_backend_profiler{
|
||||
/* .context = */ ctx,
|
||||
/* .enable = */ cpu_prof_enable,
|
||||
/* .reset = */ cpu_prof_reset,
|
||||
/* .set_split_id = */ cpu_prof_set_split_id,
|
||||
/* .get_records = */ cpu_prof_get_records,
|
||||
/* .free_context = */ cpu_prof_free,
|
||||
};
|
||||
ggml_backend_set_profiler(cpu_backend, profiler);
|
||||
|
||||
return cpu_backend;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1352,6 +1352,9 @@ struct ggml_cuda_stream_context {
|
|||
}
|
||||
};
|
||||
|
||||
// Forward declaration for profiler state (defined in ggml-cuda.cu)
|
||||
struct ggml_cuda_profiler_state;
|
||||
|
||||
struct ggml_backend_cuda_context {
|
||||
int device;
|
||||
std::string name;
|
||||
|
|
@ -1447,6 +1450,9 @@ struct ggml_backend_cuda_context {
|
|||
ggml_cuda_pool & pool() {
|
||||
return pool(device);
|
||||
}
|
||||
|
||||
// Profiling
|
||||
ggml_cuda_profiler_state * profiler_state = nullptr;
|
||||
};
|
||||
|
||||
struct ggml_cuda_mm_fusion_args_host {
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
#include "ggml-cuda.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-profiler.h"
|
||||
|
||||
#include "ggml-cuda/common.cuh"
|
||||
#include "ggml-cuda/acc.cuh"
|
||||
|
|
@ -86,6 +87,94 @@
|
|||
|
||||
static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
||||
|
||||
// CUDA profiler state
|
||||
struct ggml_cuda_profiler_state {
|
||||
bool enabled = false;
|
||||
int split_id = -1;
|
||||
cudaStream_t stream = nullptr;
|
||||
|
||||
static constexpr int MAX_PENDING_EVENTS = 4096;
|
||||
std::vector<cudaEvent_t> start_events;
|
||||
std::vector<cudaEvent_t> end_events;
|
||||
std::vector<uint64_t> cpu_timestamps; // CPU-side timestamps for global ordering
|
||||
int event_count = 0;
|
||||
|
||||
std::vector<ggml_profile_record> records;
|
||||
std::vector<int> record_event_indices;
|
||||
|
||||
void init(cudaStream_t stream) {
|
||||
this->stream = stream;
|
||||
start_events.reserve(MAX_PENDING_EVENTS);
|
||||
end_events.reserve(MAX_PENDING_EVENTS);
|
||||
cpu_timestamps.reserve(MAX_PENDING_EVENTS);
|
||||
}
|
||||
|
||||
void reset() {
|
||||
for (auto & ev : start_events) {
|
||||
(void) cudaEventDestroy(ev);
|
||||
}
|
||||
for (auto & ev : end_events) {
|
||||
(void) cudaEventDestroy(ev);
|
||||
}
|
||||
start_events.clear();
|
||||
end_events.clear();
|
||||
cpu_timestamps.clear();
|
||||
event_count = 0;
|
||||
records.clear();
|
||||
record_event_indices.clear();
|
||||
}
|
||||
|
||||
~ggml_cuda_profiler_state() {
|
||||
reset();
|
||||
}
|
||||
|
||||
void record_start() {
|
||||
cudaEvent_t ev;
|
||||
(void) cudaEventCreate(&ev);
|
||||
(void) cudaEventRecord(ev, stream);
|
||||
start_events.push_back(ev);
|
||||
cpu_timestamps.push_back(ggml_profiler_time_ns());
|
||||
event_count++;
|
||||
}
|
||||
|
||||
void record_end(const char * name, int backend_id, int split_id, uint64_t bytes, const char * extra,
|
||||
const int64_t ne_src0[4], const int64_t ne_src1[4], const int64_t ne_src2[4]) {
|
||||
cudaEvent_t ev;
|
||||
(void) cudaEventCreate(&ev);
|
||||
(void) cudaEventRecord(ev, stream);
|
||||
end_events.push_back(ev);
|
||||
record_event_indices.push_back(records.size());
|
||||
|
||||
ggml_profile_record rec;
|
||||
rec.type = GGML_PROFILE_EVENT_OP;
|
||||
rec.name = name;
|
||||
rec.backend_id = backend_id;
|
||||
rec.split_id = split_id;
|
||||
rec.start_ns = 0;
|
||||
rec.end_ns = 0;
|
||||
rec.bytes = bytes;
|
||||
rec.extra = extra;
|
||||
if (ne_src0) { memcpy(rec.ne_src0, ne_src0, sizeof(rec.ne_src0)); } else { memset(rec.ne_src0, 0, sizeof(rec.ne_src0)); }
|
||||
if (ne_src1) { memcpy(rec.ne_src1, ne_src1, sizeof(rec.ne_src1)); } else { memset(rec.ne_src1, 0, sizeof(rec.ne_src1)); }
|
||||
if (ne_src2) { memcpy(rec.ne_src2, ne_src2, sizeof(rec.ne_src2)); } else { memset(rec.ne_src2, 0, sizeof(rec.ne_src2)); }
|
||||
records.push_back(rec);
|
||||
}
|
||||
|
||||
void finalize() {
|
||||
(void) cudaStreamSynchronize(stream);
|
||||
|
||||
for (int i = 0; i < (int)record_event_indices.size(); i++) {
|
||||
float ms = 0.0f;
|
||||
(void) cudaEventElapsedTime(&ms, start_events[i], end_events[i]);
|
||||
uint64_t duration_ns = (uint64_t)(ms * 1e6f);
|
||||
int rec_idx = record_event_indices[i];
|
||||
// Use CPU-side timestamp for global ordering, GPU-measured duration for accuracy
|
||||
records[rec_idx].start_ns = cpu_timestamps[i];
|
||||
records[rec_idx].end_ns = cpu_timestamps[i] + duration_ns;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
[[noreturn]]
|
||||
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
|
||||
int id = -1; // in case cudaGetDevice fails
|
||||
|
|
@ -4035,8 +4124,25 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
|
|||
#else
|
||||
GGML_UNUSED(integrated);
|
||||
#endif // NDEBUG
|
||||
if (cuda_ctx->profiler_state != nullptr && cuda_ctx->profiler_state->enabled) {
|
||||
cuda_ctx->profiler_state->record_start();
|
||||
}
|
||||
|
||||
bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
|
||||
|
||||
if (cuda_ctx->profiler_state != nullptr && cuda_ctx->profiler_state->enabled) {
|
||||
cuda_ctx->profiler_state->record_end(
|
||||
ggml_op_name(node->op),
|
||||
-1,
|
||||
cuda_ctx->profiler_state->split_id,
|
||||
ggml_nbytes(node),
|
||||
nullptr,
|
||||
node->src[0] ? node->src[0]->ne : nullptr,
|
||||
node->src[1] ? node->src[1]->ne : nullptr,
|
||||
(node->op == GGML_OP_MUL_MAT_ID && node->src[2]) ? node->src[2]->ne : nullptr
|
||||
);
|
||||
}
|
||||
|
||||
if (!ok) {
|
||||
GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
|
||||
}
|
||||
|
|
@ -4107,6 +4213,19 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
|
|||
|
||||
ggml_cuda_set_device(cuda_ctx->device);
|
||||
|
||||
// Disable CUDA graphs when profiling (we need per-node timing)
|
||||
bool was_graph_enabled = false;
|
||||
if (cuda_ctx->profiler_state != nullptr && cuda_ctx->profiler_state->enabled) {
|
||||
#ifdef USE_CUDA_GRAPH
|
||||
const void * graph_key = ggml_cuda_graph_get_key(cgraph);
|
||||
ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key);
|
||||
was_graph_enabled = graph->is_enabled();
|
||||
if (was_graph_enabled) {
|
||||
graph->disable_due_to_gpu_arch = true;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
bool use_cuda_graph = false;
|
||||
bool cuda_graph_update_required = false;
|
||||
const void * graph_key = nullptr;
|
||||
|
|
@ -4158,6 +4277,15 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
|
|||
|
||||
ggml_cuda_graph_evaluate_and_capture(cuda_ctx, cgraph, use_cuda_graph, cuda_graph_update_required, graph_key);
|
||||
|
||||
// Restore CUDA graph enabled state after profiling
|
||||
if (was_graph_enabled) {
|
||||
#ifdef USE_CUDA_GRAPH
|
||||
const void * graph_key_prof = ggml_cuda_graph_get_key(cgraph);
|
||||
ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key_prof);
|
||||
graph->disable_due_to_gpu_arch = false;
|
||||
#endif
|
||||
}
|
||||
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
|
|
@ -5304,12 +5432,68 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
|
|||
}
|
||||
|
||||
ggml_backend_t cuda_backend = new ggml_backend {
|
||||
/* .guid = */ ggml_backend_cuda_guid(),
|
||||
/* .iface = */ ggml_backend_cuda_interface,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device),
|
||||
/* .context = */ ctx,
|
||||
/* .guid = */ ggml_backend_cuda_guid(),
|
||||
/* .iface = */ ggml_backend_cuda_interface,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device),
|
||||
/* .context = */ ctx,
|
||||
/* .profiler = */ nullptr,
|
||||
};
|
||||
|
||||
// Register profiler
|
||||
auto * prof_state = new ggml_cuda_profiler_state();
|
||||
prof_state->init(ctx->stream());
|
||||
ctx->profiler_state = prof_state;
|
||||
|
||||
static auto cuda_prof_enable = [](void * ctx, bool enable) {
|
||||
auto * cuda_ctx = (ggml_backend_cuda_context *)ctx;
|
||||
if (cuda_ctx->profiler_state != nullptr) {
|
||||
cuda_ctx->profiler_state->enabled = enable;
|
||||
if (!enable) {
|
||||
cuda_ctx->profiler_state->reset();
|
||||
}
|
||||
}
|
||||
};
|
||||
static auto cuda_prof_reset = [](void * ctx) {
|
||||
auto * cuda_ctx = (ggml_backend_cuda_context *)ctx;
|
||||
if (cuda_ctx->profiler_state != nullptr) {
|
||||
cuda_ctx->profiler_state->reset();
|
||||
cuda_ctx->profiler_state->split_id = -1;
|
||||
}
|
||||
};
|
||||
static auto cuda_prof_set_split_id = [](void * ctx, int split_id) {
|
||||
auto * cuda_ctx = (ggml_backend_cuda_context *)ctx;
|
||||
if (cuda_ctx->profiler_state != nullptr) {
|
||||
cuda_ctx->profiler_state->split_id = split_id;
|
||||
}
|
||||
};
|
||||
static auto cuda_prof_get_records = [](void * ctx, const ggml_profile_record ** out) -> int {
|
||||
auto * cuda_ctx = (ggml_backend_cuda_context *)ctx;
|
||||
if (cuda_ctx->profiler_state != nullptr) {
|
||||
cuda_ctx->profiler_state->finalize();
|
||||
*out = cuda_ctx->profiler_state->records.data();
|
||||
return (int)cuda_ctx->profiler_state->records.size();
|
||||
}
|
||||
*out = nullptr;
|
||||
return 0;
|
||||
};
|
||||
static auto cuda_prof_free = [](void * ctx) {
|
||||
auto * cuda_ctx = (ggml_backend_cuda_context *)ctx;
|
||||
if (cuda_ctx->profiler_state != nullptr) {
|
||||
delete cuda_ctx->profiler_state;
|
||||
cuda_ctx->profiler_state = nullptr;
|
||||
}
|
||||
};
|
||||
|
||||
auto * profiler = new ggml_backend_profiler {
|
||||
/* .context = */ ctx,
|
||||
/* .enable = */ cuda_prof_enable,
|
||||
/* .reset = */ cuda_prof_reset,
|
||||
/* .set_split_id = */ cuda_prof_set_split_id,
|
||||
/* .get_records = */ cuda_prof_get_records,
|
||||
/* .free_context = */ cuda_prof_free,
|
||||
};
|
||||
ggml_backend_set_profiler(cuda_backend, profiler);
|
||||
|
||||
return cuda_backend;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -55,8 +55,10 @@
|
|||
#define cudaError_t hipError_t
|
||||
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
|
||||
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
|
||||
#define cudaEventCreate hipEventCreate
|
||||
#define cudaEventCreateWithFlags hipEventCreateWithFlags
|
||||
#define cudaEventDisableTiming hipEventDisableTiming
|
||||
#define cudaEventElapsedTime hipEventElapsedTime
|
||||
#define cudaEventRecord hipEventRecord
|
||||
#define cudaEventSynchronize hipEventSynchronize
|
||||
#define cudaEvent_t hipEvent_t
|
||||
|
|
|
|||
|
|
@ -44,8 +44,10 @@
|
|||
#define cudaError_t musaError_t
|
||||
#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
|
||||
#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
|
||||
#define cudaEventCreate musaEventCreate
|
||||
#define cudaEventCreateWithFlags musaEventCreateWithFlags
|
||||
#define cudaEventDisableTiming musaEventDisableTiming
|
||||
#define cudaEventElapsedTime musaEventElapsedTime
|
||||
#define cudaEventRecord musaEventRecord
|
||||
#define cudaEventSynchronize musaEventSynchronize
|
||||
#define cudaEvent_t musaEvent_t
|
||||
|
|
|
|||
|
|
@ -3004,6 +3004,7 @@ static ggml_backend_t ggml_backend_hexagon_device_init(ggml_backend_dev_t dev, c
|
|||
/* .interface = */ hexagon_backend_i,
|
||||
/* .device = */ dev,
|
||||
/* .context = */ sess,
|
||||
/* .profiler = */ nullptr,
|
||||
};
|
||||
|
||||
GGML_UNUSED(params);
|
||||
|
|
|
|||
|
|
@ -597,6 +597,7 @@ ggml_backend_t ggml_backend_metal_init(void) {
|
|||
/* .interface = */ ggml_backend_metal_i,
|
||||
/* .device = */ dev,
|
||||
/* .context = */ ctx,
|
||||
/* .profiler = */ NULL,
|
||||
};
|
||||
|
||||
ggml_backend_metal_set_n_cb(backend, 1);
|
||||
|
|
@ -691,6 +692,7 @@ static ggml_backend_t ggml_backend_metal_device_init_backend(ggml_backend_dev_t
|
|||
/* .interface = */ ggml_backend_metal_i,
|
||||
/* .device = */ dev,
|
||||
/* .context = */ ctx,
|
||||
/* .profiler = */ NULL,
|
||||
};
|
||||
|
||||
ggml_backend_metal_set_n_cb(backend, 1);
|
||||
|
|
|
|||
|
|
@ -4082,7 +4082,8 @@ ggml_backend_t ggml_backend_opencl_init(void) {
|
|||
/* .guid = */ ggml_backend_opencl_guid(),
|
||||
/* .iface = */ ggml_backend_opencl_i,
|
||||
/* .device = */ dev,
|
||||
/* .context = */ backend_ctx
|
||||
/* .context = */ backend_ctx,
|
||||
/* .profiler = */ nullptr,
|
||||
};
|
||||
|
||||
return backend;
|
||||
|
|
@ -5897,6 +5898,7 @@ static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, co
|
|||
/* .interface = */ ggml_backend_opencl_i,
|
||||
/* .device = */ dev,
|
||||
/* .context = */ backend_ctx,
|
||||
/* .profiler = */ nullptr,
|
||||
};
|
||||
|
||||
return backend;
|
||||
|
|
|
|||
|
|
@ -673,6 +673,7 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device) {
|
|||
/* .interface = */ ggml_backend_openvino_interface,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_openvino_reg(), device),
|
||||
/* .context = */ ctx,
|
||||
/* .profiler = */ nullptr,
|
||||
};
|
||||
|
||||
return openvino_backend;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,67 @@
|
|||
#include "ggml-profiler.h"
|
||||
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-impl.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifdef _WIN32
|
||||
# define WIN32_LEAN_AND_MEAN
|
||||
# ifndef NOMINMAX
|
||||
# define NOMINMAX
|
||||
# endif
|
||||
# include <windows.h>
|
||||
#else
|
||||
# include <time.h>
|
||||
# include <unistd.h>
|
||||
#endif
|
||||
|
||||
//
|
||||
// Time utilities
|
||||
//
|
||||
|
||||
uint64_t ggml_profiler_time_ns(void) {
|
||||
#ifdef _WIN32
|
||||
LARGE_INTEGER freq, count;
|
||||
QueryPerformanceFrequency(&freq);
|
||||
QueryPerformanceCounter(&count);
|
||||
return (uint64_t) (count.QuadPart * 1000000000ULL / freq.QuadPart);
|
||||
#elif defined(CLOCK_MONOTONIC_RAW)
|
||||
struct timespec ts;
|
||||
clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
|
||||
return (uint64_t) ts.tv_sec * 1000000000ULL + (uint64_t) ts.tv_nsec;
|
||||
#else
|
||||
struct timespec ts;
|
||||
clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||
return (uint64_t) ts.tv_sec * 1000000000ULL + (uint64_t) ts.tv_nsec;
|
||||
#endif
|
||||
}
|
||||
|
||||
//
|
||||
// Backend profiler registration
|
||||
//
|
||||
|
||||
void ggml_backend_set_profiler(ggml_backend_t backend, ggml_backend_profiler_t profiler) {
|
||||
if (backend == NULL) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Free any existing profiler
|
||||
if (backend->profiler != NULL) {
|
||||
if (backend->profiler->free_context != NULL) {
|
||||
backend->profiler->free_context(backend->profiler->context);
|
||||
}
|
||||
delete backend->profiler;
|
||||
backend->profiler = NULL;
|
||||
}
|
||||
|
||||
backend->profiler = profiler;
|
||||
}
|
||||
|
||||
ggml_backend_profiler_t ggml_backend_get_profiler(ggml_backend_t backend) {
|
||||
if (backend == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
return backend->profiler;
|
||||
}
|
||||
|
|
@ -952,7 +952,8 @@ ggml_backend_t ggml_backend_rpc_init(const char * endpoint, uint32_t device) {
|
|||
/* .guid = */ ggml_backend_rpc_guid(),
|
||||
/* .iface = */ ggml_backend_rpc_interface,
|
||||
/* .device = */ ggml_backend_reg_dev_get(reg, device),
|
||||
/* .context = */ ctx
|
||||
/* .context = */ ctx,
|
||||
/* .profiler = */ nullptr,
|
||||
};
|
||||
return backend;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -5101,7 +5101,8 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
|
|||
/* .guid = */ ggml_backend_sycl_guid(),
|
||||
/* .iface = */ ggml_backend_sycl_interface,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_sycl_reg(), device),
|
||||
/* .context = */ ctx
|
||||
/* .context = */ ctx,
|
||||
/* .profiler = */ nullptr,
|
||||
};
|
||||
|
||||
return sycl_backend;
|
||||
|
|
|
|||
|
|
@ -63,6 +63,7 @@ ggml_backend_t ggml_backend_remoting_device_init(ggml_backend_dev_t dev, const c
|
|||
/* .interface = */ ggml_backend_remoting_interface,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_virtgpu_reg(), ctx->device),
|
||||
/* .context = */ ctx,
|
||||
/* .profiler = */ nullptr,
|
||||
};
|
||||
|
||||
return remoting_backend;
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
#include "ggml-vulkan.h"
|
||||
#include "ggml-profiler.h"
|
||||
#include <vulkan/vulkan_core.h>
|
||||
#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_CHECK_RESULTS)
|
||||
#include <chrono>
|
||||
|
|
@ -1700,8 +1701,8 @@ private:
|
|||
|
||||
std::mutex vk_memory_logger::log_mutex;
|
||||
|
||||
static bool vk_perf_logger_enabled = false;
|
||||
static bool vk_perf_logger_concurrent = false;
|
||||
static bool vk_perf_logger_enabled = false; // deprecated: use --profile instead
|
||||
static bool vk_perf_logger_concurrent = false; // GGML_VK_PERF_LOGGER_CONCURRENT: use concurrent timestamp mode
|
||||
static bool vk_enable_sync_logger = false;
|
||||
// number of calls between perf logger prints
|
||||
static uint32_t vk_perf_logger_frequency = 1;
|
||||
|
|
@ -1873,6 +1874,21 @@ class vk_perf_logger {
|
|||
uint32_t print_count {};
|
||||
};
|
||||
|
||||
// Profiler state for the new ggml_backend_profiler interface
|
||||
struct ggml_vk_profiler_state {
|
||||
bool enabled = false;
|
||||
int split_id = -1;
|
||||
|
||||
std::vector<ggml_profile_record> records;
|
||||
std::vector<uint64_t> cpu_timestamps; // CPU-side timestamps for global ordering
|
||||
|
||||
void reset() {
|
||||
records.clear();
|
||||
cpu_timestamps.clear();
|
||||
split_id = -1;
|
||||
}
|
||||
};
|
||||
|
||||
struct ggml_backend_vk_context {
|
||||
std::string name;
|
||||
|
||||
|
|
@ -1930,8 +1946,9 @@ struct ggml_backend_vk_context {
|
|||
topk_moe_mode fused_topk_moe_mode {};
|
||||
bool fused_topk_moe_scale {};
|
||||
|
||||
// for GGML_VK_PERF_LOGGER
|
||||
std::unique_ptr<vk_perf_logger> perf_logger;
|
||||
// Profiling
|
||||
std::unique_ptr<vk_perf_logger> perf_logger; // legacy env-var profiler
|
||||
ggml_vk_profiler_state * profiler_state = nullptr;
|
||||
vk::QueryPool query_pool;
|
||||
std::vector<const char *> query_fusion_names;
|
||||
std::vector<int> query_fusion_node_count;
|
||||
|
|
@ -12859,9 +12876,13 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
|||
ctx->unsynced_nodes_read.clear();
|
||||
ggml_vk_sync_buffers(ctx, compute_ctx);
|
||||
|
||||
if (vk_perf_logger_enabled && vk_perf_logger_concurrent) {
|
||||
if ((vk_perf_logger_enabled || (ctx->profiler_state != nullptr && ctx->profiler_state->enabled))
|
||||
&& vk_perf_logger_concurrent) {
|
||||
ctx->query_node_idx[ctx->query_idx] = node_idx;
|
||||
compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
|
||||
if (ctx->profiler_state != nullptr && ctx->profiler_state->enabled) {
|
||||
ctx->profiler_state->cpu_timestamps.push_back(ggml_profiler_time_ns());
|
||||
}
|
||||
}
|
||||
}
|
||||
// Add all fused nodes to the unsynchronized lists.
|
||||
|
|
@ -13384,7 +13405,7 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
|
|||
|
||||
ctx->transfer_cmd_pool.destroy(ctx->device->device);
|
||||
}
|
||||
if (vk_perf_logger_enabled) {
|
||||
if (ctx->perf_logger) {
|
||||
ctx->perf_logger->print_timings(true);
|
||||
}
|
||||
}
|
||||
|
|
@ -14323,7 +14344,9 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
|||
ggml_vk_submit_transfer_ctx(ctx);
|
||||
|
||||
vk_context compute_ctx;
|
||||
if (vk_perf_logger_enabled) {
|
||||
bool profiling = vk_perf_logger_enabled ||
|
||||
(ctx->profiler_state != nullptr && ctx->profiler_state->enabled);
|
||||
if (profiling) {
|
||||
// allocate/resize the query pool
|
||||
if (ctx->num_queries < cgraph->n_nodes + 1) {
|
||||
if (ctx->query_pool) {
|
||||
|
|
@ -14350,6 +14373,10 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
|||
compute_ctx = ggml_vk_get_compute_ctx(ctx);
|
||||
ctx->query_idx = 0;
|
||||
compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
|
||||
if (ctx->profiler_state != nullptr && ctx->profiler_state->enabled) {
|
||||
ctx->profiler_state->cpu_timestamps.clear();
|
||||
ctx->profiler_state->cpu_timestamps.push_back(ggml_profiler_time_ns());
|
||||
}
|
||||
}
|
||||
|
||||
ctx->prealloc_y_last_pipeline_used = nullptr;
|
||||
|
|
@ -14579,13 +14606,16 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
|||
|
||||
bool enqueued = ggml_vk_build_graph(ctx, cgraph, i, cgraph->nodes[submit_node_idx], submit_node_idx, i + ctx->num_additional_fused_ops >= last_node, almost_ready, submit);
|
||||
|
||||
if (vk_perf_logger_enabled && enqueued) {
|
||||
if (profiling && enqueued) {
|
||||
compute_ctx = ggml_vk_get_compute_ctx(ctx);
|
||||
if (!vk_perf_logger_concurrent) {
|
||||
// track a single node/fusion for the current query
|
||||
ctx->query_nodes[ctx->query_idx] = cgraph->nodes[i];
|
||||
ctx->query_fusion_names[ctx->query_idx] = fusion_string;
|
||||
compute_ctx->s->buffer->buf.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
|
||||
if (ctx->profiler_state != nullptr && ctx->profiler_state->enabled) {
|
||||
ctx->profiler_state->cpu_timestamps.push_back(ggml_profiler_time_ns());
|
||||
}
|
||||
} else {
|
||||
// track a fusion string and number of fused ops for the current node_idx
|
||||
ctx->query_fusion_names[i] = fusion_string;
|
||||
|
|
@ -14619,7 +14649,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
|||
|
||||
ctx->last_total_mul_mat_bytes = total_mul_mat_bytes;
|
||||
|
||||
if (vk_perf_logger_enabled) {
|
||||
if (profiling) {
|
||||
// End the command buffer and submit/wait
|
||||
GGML_ASSERT(!ctx->compute_ctx.expired());
|
||||
compute_ctx = ctx->compute_ctx.lock();
|
||||
|
|
@ -14633,15 +14663,46 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
|||
// Get the results and pass them to the logger
|
||||
std::vector<uint64_t> timestamps(cgraph->n_nodes + 1);
|
||||
VK_CHECK(ctx->device->device.getQueryPoolResults(ctx->query_pool, 0, ctx->query_idx, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), "get timestamp results");
|
||||
|
||||
const double ts_period = ctx->device->properties.limits.timestampPeriod;
|
||||
const bool has_profiler = ctx->profiler_state != nullptr && ctx->profiler_state->enabled;
|
||||
|
||||
if (!vk_perf_logger_concurrent) {
|
||||
// Log each op separately
|
||||
for (int i = 1; i < ctx->query_idx; i++) {
|
||||
auto node = ctx->query_nodes[i];
|
||||
auto name = ctx->query_fusion_names[i];
|
||||
ctx->perf_logger->log_timing(node, name, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod));
|
||||
uint64_t duration_ns = uint64_t((timestamps[i] - timestamps[i-1]) * ts_period);
|
||||
|
||||
if (ctx->perf_logger) {
|
||||
ctx->perf_logger->log_timing(node, name, duration_ns);
|
||||
}
|
||||
|
||||
if (has_profiler && node != nullptr) {
|
||||
static const int64_t zero_ne[4] = {0, 0, 0, 0};
|
||||
const int64_t * src0_ne = node->src[0] ? node->src[0]->ne : zero_ne;
|
||||
const int64_t * src1_ne = node->src[1] ? node->src[1]->ne : zero_ne;
|
||||
const int64_t * src2_ne = (node->op == GGML_OP_MUL_MAT_ID && node->src[2]) ? node->src[2]->ne : zero_ne;
|
||||
uint64_t cpu_ts = (i < (int)ctx->profiler_state->cpu_timestamps.size())
|
||||
? ctx->profiler_state->cpu_timestamps[i] : 0;
|
||||
|
||||
ggml_profile_record rec;
|
||||
rec.type = GGML_PROFILE_EVENT_OP;
|
||||
rec.name = ggml_op_name(node->op);
|
||||
rec.backend_id = -1;
|
||||
rec.split_id = ctx->profiler_state->split_id;
|
||||
rec.start_ns = cpu_ts;
|
||||
rec.end_ns = cpu_ts + duration_ns;
|
||||
rec.bytes = ggml_nbytes(node);
|
||||
rec.extra = name; // fusion name or NULL
|
||||
memcpy(rec.ne_src0, src0_ne, sizeof(rec.ne_src0));
|
||||
memcpy(rec.ne_src1, src1_ne, sizeof(rec.ne_src1));
|
||||
memcpy(rec.ne_src2, src2_ne, sizeof(rec.ne_src2));
|
||||
ctx->profiler_state->records.push_back(rec);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Log each group of nodes
|
||||
// Log each group of nodes (concurrent mode)
|
||||
int prev_node_idx = 0;
|
||||
for (int i = 1; i < ctx->query_idx; i++) {
|
||||
auto cur_node_idx = ctx->query_node_idx[i];
|
||||
|
|
@ -14656,10 +14717,46 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
|||
node_idx += ctx->query_fusion_node_count[node_idx];
|
||||
}
|
||||
prev_node_idx = cur_node_idx;
|
||||
ctx->perf_logger->log_timing(nodes, names, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod));
|
||||
uint64_t duration_ns = uint64_t((timestamps[i] - timestamps[i-1]) * ts_period);
|
||||
|
||||
if (ctx->perf_logger) {
|
||||
ctx->perf_logger->log_timing(nodes, names, duration_ns);
|
||||
}
|
||||
|
||||
if (has_profiler && !nodes.empty()) {
|
||||
uint64_t cpu_ts = (i < (int)ctx->profiler_state->cpu_timestamps.size())
|
||||
? ctx->profiler_state->cpu_timestamps[i] : 0;
|
||||
// In concurrent mode, report the group as a single combined operation
|
||||
auto * node = nodes[0];
|
||||
static const int64_t zero_ne[4] = {0, 0, 0, 0};
|
||||
const int64_t * src0_ne = node->src[0] ? node->src[0]->ne : zero_ne;
|
||||
const int64_t * src1_ne = node->src[1] ? node->src[1]->ne : zero_ne;
|
||||
const int64_t * src2_ne = (node->op == GGML_OP_MUL_MAT_ID && node->src[2]) ? node->src[2]->ne : zero_ne;
|
||||
|
||||
uint64_t total_bytes = 0;
|
||||
for (size_t j = 0; j < nodes.size(); j++) {
|
||||
total_bytes += ggml_nbytes(nodes[j]);
|
||||
}
|
||||
|
||||
ggml_profile_record rec;
|
||||
rec.type = GGML_PROFILE_EVENT_OP;
|
||||
rec.name = ggml_op_name(node->op);
|
||||
rec.backend_id = -1;
|
||||
rec.split_id = ctx->profiler_state->split_id;
|
||||
rec.start_ns = cpu_ts;
|
||||
rec.end_ns = cpu_ts + duration_ns;
|
||||
rec.bytes = total_bytes;
|
||||
rec.extra = names[0]; // fusion name of first op, or NULL
|
||||
memcpy(rec.ne_src0, src0_ne, sizeof(rec.ne_src0));
|
||||
memcpy(rec.ne_src1, src1_ne, sizeof(rec.ne_src1));
|
||||
memcpy(rec.ne_src2, src2_ne, sizeof(rec.ne_src2));
|
||||
ctx->profiler_state->records.push_back(rec);
|
||||
}
|
||||
}
|
||||
}
|
||||
ctx->perf_logger->print_timings();
|
||||
if (ctx->perf_logger) {
|
||||
ctx->perf_logger->print_timings();
|
||||
}
|
||||
}
|
||||
|
||||
if (!ctx->device->support_async) {
|
||||
|
|
@ -14994,13 +15091,66 @@ ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
|
|||
/* .guid = */ ggml_backend_vk_guid(),
|
||||
/* .iface = */ ggml_backend_vk_interface,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), dev_num),
|
||||
/* .context = */ ctx,
|
||||
/* .context = */ ctx,
|
||||
/* .profiler = */ nullptr,
|
||||
};
|
||||
|
||||
if (!ctx->device->support_async) {
|
||||
vk_backend->iface.get_tensor_async = nullptr;
|
||||
}
|
||||
|
||||
// Register profiler
|
||||
auto * prof_state = new ggml_vk_profiler_state();
|
||||
ctx->profiler_state = prof_state;
|
||||
|
||||
static auto vk_prof_enable = [](void * context, bool enable) {
|
||||
auto * vk_ctx = (ggml_backend_vk_context *)context;
|
||||
if (vk_ctx->profiler_state != nullptr) {
|
||||
vk_ctx->profiler_state->enabled = enable;
|
||||
if (!enable) {
|
||||
vk_ctx->profiler_state->reset();
|
||||
}
|
||||
}
|
||||
};
|
||||
static auto vk_prof_reset = [](void * context) {
|
||||
auto * vk_ctx = (ggml_backend_vk_context *)context;
|
||||
if (vk_ctx->profiler_state != nullptr) {
|
||||
vk_ctx->profiler_state->reset();
|
||||
}
|
||||
};
|
||||
static auto vk_prof_set_split_id = [](void * context, int split_id) {
|
||||
auto * vk_ctx = (ggml_backend_vk_context *)context;
|
||||
if (vk_ctx->profiler_state != nullptr) {
|
||||
vk_ctx->profiler_state->split_id = split_id;
|
||||
}
|
||||
};
|
||||
static auto vk_prof_get_records = [](void * context, const ggml_profile_record ** out) -> int {
|
||||
auto * vk_ctx = (ggml_backend_vk_context *)context;
|
||||
if (vk_ctx->profiler_state != nullptr) {
|
||||
*out = vk_ctx->profiler_state->records.data();
|
||||
return (int)vk_ctx->profiler_state->records.size();
|
||||
}
|
||||
*out = nullptr;
|
||||
return 0;
|
||||
};
|
||||
static auto vk_prof_free = [](void * context) {
|
||||
auto * vk_ctx = (ggml_backend_vk_context *)context;
|
||||
if (vk_ctx->profiler_state != nullptr) {
|
||||
delete vk_ctx->profiler_state;
|
||||
vk_ctx->profiler_state = nullptr;
|
||||
}
|
||||
};
|
||||
|
||||
auto * profiler = new ggml_backend_profiler {
|
||||
/* .context = */ ctx,
|
||||
/* .enable = */ vk_prof_enable,
|
||||
/* .reset = */ vk_prof_reset,
|
||||
/* .set_split_id = */ vk_prof_set_split_id,
|
||||
/* .get_records = */ vk_prof_get_records,
|
||||
/* .free_context = */ vk_prof_free,
|
||||
};
|
||||
ggml_backend_set_profiler(vk_backend, profiler);
|
||||
|
||||
return vk_backend;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -3102,6 +3102,7 @@ static ggml_backend_t ggml_backend_webgpu_backend_init(ggml_backend_dev_t dev, c
|
|||
/* .interface = */ ggml_backend_webgpu_i,
|
||||
/* .device = */ dev,
|
||||
/* .context = */ backend_ctx,
|
||||
/* .profiler = */ nullptr,
|
||||
};
|
||||
return backend;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -499,7 +499,8 @@ static ggml_backend_t ggml_backend_zdnn_device_init(ggml_backend_dev_t dev, cons
|
|||
/* .guid = */ ggml_backend_zdnn_guid(),
|
||||
/* .iface = */ ggml_backend_zdnn_i,
|
||||
/* .device = */ dev,
|
||||
/* .context = */ ctx
|
||||
/* .context = */ ctx,
|
||||
/* .profiler = */ NULL,
|
||||
};
|
||||
|
||||
return backend;
|
||||
|
|
|
|||
|
|
@ -264,7 +264,8 @@ ggml_backend_t ggml_backend_zendnn_init(void) {
|
|||
/* .guid = */ ggml_backend_zendnn_guid(),
|
||||
/* .iface = */ ggml_backend_zendnn_i,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_zendnn_reg(), 0),
|
||||
/* .context = */ ctx,
|
||||
/* .context = */ ctx,
|
||||
/* .profiler = */ nullptr,
|
||||
};
|
||||
|
||||
return backend;
|
||||
|
|
|
|||
|
|
@ -561,6 +561,8 @@ extern "C" {
|
|||
LLAMA_API llama_memory_t llama_get_memory (const struct llama_context * ctx);
|
||||
LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type
|
||||
|
||||
LLAMA_API struct ggml_backend_sched * llama_context_get_sched(const struct llama_context * ctx);
|
||||
|
||||
LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
|
||||
LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model);
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
#include "llama-context.h"
|
||||
|
||||
#include "ggml-profiler.h"
|
||||
#include "llama-arch.h"
|
||||
#include "llama-impl.h"
|
||||
#include "llama-batch.h"
|
||||
|
|
@ -2189,6 +2190,11 @@ ggml_status llama_context::graph_compute(
|
|||
LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status);
|
||||
}
|
||||
|
||||
// If profiling is enabled, synchronize to ensure records are complete
|
||||
if (ggml_backend_sched_get_profiling(sched.get())) {
|
||||
ggml_backend_sched_synchronize(sched.get());
|
||||
}
|
||||
|
||||
// fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(sched));
|
||||
|
||||
return status;
|
||||
|
|
@ -3025,6 +3031,10 @@ enum llama_pooling_type llama_pooling_type(const llama_context * ctx) {
|
|||
return ctx->pooling_type();
|
||||
}
|
||||
|
||||
ggml_backend_sched_t llama_context_get_sched(const llama_context * ctx) {
|
||||
return ctx->get_sched();
|
||||
}
|
||||
|
||||
void llama_attach_threadpool(
|
||||
llama_context * ctx,
|
||||
ggml_threadpool_t threadpool,
|
||||
|
|
|
|||
|
|
@ -644,6 +644,29 @@ int main(int argc, char ** argv) {
|
|||
ctx_cli.ctx_server.terminate();
|
||||
inference_thread.join();
|
||||
|
||||
// Export profiling data if profiling was enabled
|
||||
if (params.profiling) {
|
||||
ggml_backend_sched_t sched = llama_context_get_sched(ctx_cli.ctx_server.get_llama_context());
|
||||
if (sched != nullptr) {
|
||||
if (params.profiling_output.empty()) {
|
||||
ggml_backend_sched_print_profiling(sched);
|
||||
} else {
|
||||
const std::string & path = params.profiling_output;
|
||||
int ret;
|
||||
if (path.size() >= 4 && path.compare(path.size() - 4, 4, ".txt") == 0) {
|
||||
ret = ggml_backend_sched_export_profiling_text(sched, path.c_str());
|
||||
} else {
|
||||
ret = ggml_backend_sched_export_profiling_json(sched, path.c_str());
|
||||
}
|
||||
if (ret == 0) {
|
||||
console::log("\nProfiling data exported to: %s\n", path.c_str());
|
||||
} else {
|
||||
console::error("\nFailed to export profiling data to: %s\n", path.c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// bump the log level to display timings
|
||||
common_log_set_verbosity_thold(LOG_LEVEL_INFO);
|
||||
llama_memory_breakdown_print(ctx_cli.ctx_server.get_llama_context());
|
||||
|
|
|
|||
|
|
@ -991,6 +991,29 @@ int main(int argc, char ** argv) {
|
|||
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
|
||||
}
|
||||
|
||||
// Export profiling data if profiling was enabled
|
||||
if (params.profiling) {
|
||||
ggml_backend_sched_t sched = llama_context_get_sched(ctx);
|
||||
if (sched != nullptr) {
|
||||
if (params.profiling_output.empty()) {
|
||||
ggml_backend_sched_print_profiling(sched);
|
||||
} else {
|
||||
const std::string & path = params.profiling_output;
|
||||
int ret;
|
||||
if (path.size() >= 4 && path.compare(path.size() - 4, 4, ".txt") == 0) {
|
||||
ret = ggml_backend_sched_export_profiling_text(sched, path.c_str());
|
||||
} else {
|
||||
ret = ggml_backend_sched_export_profiling_json(sched, path.c_str());
|
||||
}
|
||||
if (ret == 0) {
|
||||
LOG("\nProfiling data exported to: %s\n", path.c_str());
|
||||
} else {
|
||||
LOG_ERR("\nFailed to export profiling data to: %s\n", path.c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
LOG("\n\n");
|
||||
common_perf_print(ctx, smpl);
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1 @@
|
|||
# llama.cpp profiler analysis tools
|
||||
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue