Converge implementation with export-graph-ops

This commit is contained in:
Piotr Wilkin 2026-04-07 22:01:00 +02:00
parent 65ab03a3a4
commit bd6f8008a9
6 changed files with 268 additions and 550 deletions

View File

@ -2693,6 +2693,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE,
LLAMA_EXAMPLE_RESULTS, LLAMA_EXAMPLE_EXPORT_GRAPH_OPS}));
add_opt(common_arg(
{"--with-backends"},
"export graph ops with backend assignments (default: CPU only)",
[](common_params & params) {
params.with_backends = true;
}
).set_examples({LLAMA_EXAMPLE_EXPORT_GRAPH_OPS}));
add_opt(common_arg(
{"-ofreq", "--output-frequency"}, "N",
string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),

View File

@ -438,6 +438,7 @@ struct common_params {
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
bool fit_params = true; // whether to fit unset model/context parameters to free device memory
bool with_backends = false; // export graph ops with backend assignments
int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
// margin per device in bytes for fitting parameters to free memory:

View File

@ -1526,7 +1526,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id, split_id,
copy_start, copy_end, ggml_nbytes(input), input->name,
{input->ne[0], input->ne[1], input->ne[2], input->ne[3]}, {0}, {0} });
{input->ne[0], input->ne[1], input->ne[2], input->ne[3]}, {0}, {0}, -1, -1, -1, -1 });
} else {
ggml_backend_tensor_copy(input, input_cpy);
}
@ -1647,7 +1647,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id,
split_id, moe_copy_start, moe_copy_end,
(uint64_t) total_copied_bytes, input->name,
{input->ne[0], input->ne[1], input->ne[2], input->ne[3]}, {0}, {0} });
{input->ne[0], input->ne[1], input->ne[2], input->ne[3]}, {0}, {0}, -1, -1, -1, -1 });
}
} else {
// try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
@ -1684,7 +1684,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id,
split_id, copy_start, copy_end, ggml_nbytes(input), input->name,
{input->ne[0], input->ne[1], input->ne[2], input->ne[3]}, {0}, {0} });
{input->ne[0], input->ne[1], input->ne[2], input->ne[3]}, {0}, {0}, -1, -1, -1, -1 });
} else {
ggml_backend_tensor_copy(input, input_cpy);
}
@ -1705,7 +1705,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
sched->copy_records.push_back({ GGML_PROFILE_EVENT_COPY, copy_dir, split_backend_id,
split_id, copy_start, copy_end, ggml_nbytes(input), input->name,
{input->ne[0], input->ne[1], input->ne[2], input->ne[3]}, {0}, {0} });
{input->ne[0], input->ne[1], input->ne[2], input->ne[3]}, {0}, {0}, -1, -1, -1, -1 });
}
}
}

View File

@ -5,7 +5,6 @@
#include "../src/llama-ext.h"
#include "ggml.h"
#include "gguf-model-data.h"
#include "gguf.h"
#include "ggml-backend.h"
#include "download.h"
@ -14,7 +13,6 @@
#include <set>
#include <fstream>
#include <iostream>
#include <random>
// Noop because weights are not needed
static void set_tensor_data(struct ggml_tensor * tensor, void * userdata) {
@ -55,6 +53,7 @@ struct test_object {
std::vector<int32_t> op_params;
std::vector<input_tensor> sources;
std::string name;
std::string backend_name;
void serialize(std::ostream& out) const {
out << op << ' ' << type << ' ';
@ -78,16 +77,21 @@ struct test_object {
out << '-';
}
if (!backend_name.empty()) {
out << ' ' << backend_name;
}
out << '\n';
}
bool operator<(const test_object &b) const {
return std::tie(op, type, ne, op_params, sources) <
std::tie(b.op, b.type, b.ne, b.op_params, b.sources);
return std::tie(op, type, ne, op_params, sources, backend_name) <
std::tie(b.op, b.type, b.ne, b.op_params, b.sources, b.backend_name);
}
};
static void extract_graph_ops(ggml_cgraph * cgraph, const char * label, std::set<test_object> & tests) {
static void extract_graph_ops(ggml_cgraph * cgraph, const char * label, std::set<test_object> & tests,
ggml_backend_sched_t sched = nullptr) {
int n_nodes = ggml_graph_n_nodes(cgraph);
int n_skipped = 0;
int n_before = (int) tests.size();
@ -117,6 +121,14 @@ static void extract_graph_ops(ggml_cgraph * cgraph, const char * label, std::set
}
test.name = node->name;
if (sched) {
ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, node);
if (backend) {
test.backend_name = ggml_backend_name(backend);
}
}
tests.insert(test);
}
@ -135,11 +147,12 @@ int main(int argc, char ** argv) {
return 1;
}
// Load CPU-only
ggml_backend_dev_t cpu_device = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
params.devices = { cpu_device, nullptr };
params.fit_params = false;
params.n_gpu_layers = 0;
if (!params.with_backends) {
ggml_backend_dev_t cpu_device = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
params.devices = { cpu_device, nullptr };
params.fit_params = false;
params.n_gpu_layers = 0;
}
params.warmup = false;
@ -195,19 +208,21 @@ int main(int argc, char ** argv) {
std::set<test_object> tests;
ggml_backend_sched_t sched = params.with_backends ? llama_context_get_sched(ctx) : nullptr;
auto * gf_pp = llama_graph_reserve(ctx, n_tokens, n_seqs, n_tokens);
if (!gf_pp) {
LOG_ERR("failed to reserve prompt processing graph\n");
return 1;
}
extract_graph_ops(gf_pp, "pp", tests);
extract_graph_ops(gf_pp, "pp", tests, sched);
auto * gf_tg = llama_graph_reserve(ctx, n_seqs, n_seqs, n_seqs);
if (!gf_tg) {
LOG_ERR("failed to reserve token generation graph\n");
return 1;
}
extract_graph_ops(gf_tg, "tg", tests);
extract_graph_ops(gf_tg, "tg", tests, sched);
LOG_INF("%d unique ops total\n", (int) tests.size());

View File

@ -20,8 +20,6 @@
#include <ggml-backend.h>
#include <ggml-cpp.h>
#include <nlohmann/json.hpp>
#include <algorithm>
#include <array>
#include <cfloat>
@ -8962,7 +8960,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
return test_cases;
}
static std::vector<std::unique_ptr<test_case>> make_test_cases_from_file(const char * path) {
static std::vector<std::unique_ptr<test_case>> make_test_cases_from_file(const char * path, const char * backend_name = nullptr) {
std::ifstream f(path);
if (!f.is_open()) {
@ -9020,20 +9018,25 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_from_file(const c
name = "";
}
std::string file_backend;
if (iss >> file_backend) {
if (file_backend.length() == 1 && file_backend[0] == '-') {
file_backend = "";
}
}
if (backend_name != nullptr && !file_backend.empty() && file_backend != backend_name) {
continue;
}
test_cases.emplace_back(new test_generic_op(op, type, ne, op_params, sources, std::move(name)));
}
return test_cases;
}
struct profile_test_plan;
static profile_test_plan make_test_plan_from_profile(
const char * profile_path, int top_n);
static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_names_filter, const char * params_filter,
printer * output_printer, const char * test_file_path,
std::vector<std::unique_ptr<test_case>> profile_test_cases = {}) {
printer * output_printer, const char * test_file_path) {
auto filter_test_cases = [](std::vector<std::unique_ptr<test_case>> & test_cases, const char * params_filter) {
if (params_filter == nullptr) {
return;
@ -9054,22 +9057,18 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
std::vector<std::unique_ptr<test_case>> test_cases;
if (test_file_path == nullptr) {
if (!profile_test_cases.empty()) {
test_cases = std::move(profile_test_cases);
} else {
switch (mode) {
case MODE_TEST:
case MODE_GRAD:
case MODE_SUPPORT:
test_cases = make_test_cases_eval();
break;
case MODE_PERF:
test_cases = make_test_cases_perf();
break;
}
switch (mode) {
case MODE_TEST:
case MODE_GRAD:
case MODE_SUPPORT:
test_cases = make_test_cases_eval();
break;
case MODE_PERF:
test_cases = make_test_cases_perf();
break;
}
} else {
test_cases = make_test_cases_from_file(test_file_path);
test_cases = make_test_cases_from_file(test_file_path, ggml_backend_name(backend));
}
filter_test_cases(test_cases, params_filter);
@ -9252,7 +9251,7 @@ static void show_test_coverage() {
static void usage(char ** argv) {
printf("Usage: %s [mode] [-o <op,..>] [-b <backend>] [-p <params regex>] [--output <console|sql|csv>] [--list-ops]", argv[0]);
printf(" [--show-coverage] [--test-file <path>] [--from-profile <path>] [--top-n <N>]\n");
printf(" [--show-coverage] [--test-file <path>]\n");
printf(" valid modes:\n");
printf(" - test (default, compare with CPU backend for correctness)\n");
printf(" - grad (compare gradients from backpropagation with method of finite differences)\n");
@ -9263,436 +9262,31 @@ static void usage(char ** argv) {
printf(" --output specifies output format (default: console, options: console, sql, csv)\n");
printf(" --list-ops lists all available GGML operations\n");
printf(" --show-coverage shows test coverage\n");
printf(" --test-file reads test operators from a test file generated by llama-export-graph-ops\n");
}
// ##############################
// ## Profiler-based perf ##
// ##############################
// Resolve a profiler op label (e.g. "MUL_MAT") to its ggml_op enum value.
// Returns GGML_OP_COUNT for any name outside the supported subset.
static ggml_op profile_name_to_op(const std::string & name) {
    static const std::unordered_map<std::string, ggml_op> name_to_op = {
        {"ADD", GGML_OP_ADD},
        {"ADD1", GGML_OP_ADD1},
        {"ARGSORT", GGML_OP_ARGSORT},
        {"CLAMP", GGML_OP_CLAMP},
        {"CONCAT", GGML_OP_CONCAT},
        {"CONT", GGML_OP_CONT},
        {"CPY", GGML_OP_CPY},
        {"DIV", GGML_OP_DIV},
        {"FLASH_ATTN_EXT", GGML_OP_FLASH_ATTN_EXT},
        {"GET_ROWS", GGML_OP_GET_ROWS},
        {"GET_ROWS_BACK", GGML_OP_GET_ROWS_BACK},
        {"GLU", GGML_OP_GLU},
        {"IM2COL_BACK", GGML_OP_IM2COL_BACK},
        {"MUL", GGML_OP_MUL},
        {"MUL_MAT", GGML_OP_MUL_MAT},
        {"MUL_MAT_ID", GGML_OP_MUL_MAT_ID},
        {"OUT_PROD", GGML_OP_OUT_PROD},
        {"POOL_2D", GGML_OP_POOL_2D},
        {"RMS_NORM", GGML_OP_RMS_NORM},
        {"SCALE", GGML_OP_SCALE},
        {"SET_ROWS", GGML_OP_SET_ROWS},
        {"SQR", GGML_OP_SQR},
        {"SSM_CONV", GGML_OP_SSM_CONV},
        {"SSM_SCAN", GGML_OP_SSM_SCAN},
        {"SUM_ROWS", GGML_OP_SUM_ROWS},
        {"UNARY", GGML_OP_UNARY},
        {"SOFT_MAX", GGML_OP_SOFT_MAX},
    };
    const auto found = name_to_op.find(name);
    return found == name_to_op.end() ? GGML_OP_COUNT : found->second;
}
// Infer the output shape a ggml op would produce from the shapes of its
// sources (ne0/ne1/ne2), mirroring ggml's own shape rules for the subset
// of ops supported by the profile importer.
// Returns false (with ne_out zeroed) when the shape cannot be derived here.
static bool compute_output_ne(ggml_op op,
        const int64_t ne0[4], const int64_t ne1[4], const int64_t ne2[4],
        int64_t ne_out[4]) {
    std::fill(ne_out, ne_out + 4, 0);
    switch (op) {
        // output mirrors src0's shape exactly
        case GGML_OP_ADD1:
        case GGML_OP_SQR:
        case GGML_OP_UNARY:
        case GGML_OP_SSM_SCAN:
        case GGML_OP_SOFT_MAX:
        case GGML_OP_RMS_NORM:
        case GGML_OP_SET_ROWS:
        case GGML_OP_ARGSORT:
        case GGML_OP_CLAMP:
        case GGML_OP_CPY:
        case GGML_OP_POOL_2D:
        case GGML_OP_SSM_CONV:
        case GGML_OP_IM2COL_BACK:
            std::copy(ne0, ne0 + 4, ne_out);
            return true;
        // element-wise with broadcasting: per-dimension max of both sources
        case GGML_OP_ADD:
        case GGML_OP_MUL:
        case GGML_OP_DIV:
        case GGML_OP_SCALE:
            for (int d = 0; d < 4; d++) {
                ne_out[d] = std::max(ne0[d], ne1[d]);
            }
            return true;
        case GGML_OP_MUL_MAT:
            ne_out[0] = ne0[1];
            ne_out[1] = ne1[1];
            ne_out[2] = std::max(ne0[2], ne1[2]);
            ne_out[3] = std::max(ne0[3], ne1[3]);
            return true;
        case GGML_OP_MUL_MAT_ID:
            // src2 carries the selected expert rows
            ne_out[0] = ne0[1];
            ne_out[1] = ne2[0];
            ne_out[2] = ne1[2];
            ne_out[3] = 1;
            return true;
        case GGML_OP_FLASH_ATTN_EXT:
            ne_out[0] = ne1[1];
            ne_out[1] = ne1[1];
            ne_out[2] = ne0[2];
            ne_out[3] = ne0[3];
            return true;
        // row gather (and its backward): row width from src0, counts from src1
        case GGML_OP_GET_ROWS:
        case GGML_OP_GET_ROWS_BACK:
            ne_out[0] = ne0[0];
            ne_out[1] = ne1[1];
            ne_out[2] = ne1[2];
            ne_out[3] = ne1[3];
            return true;
        case GGML_OP_OUT_PROD:
            ne_out[0] = ne0[0];
            ne_out[1] = ne1[0];
            ne_out[2] = std::max(ne0[2], ne1[2]);
            ne_out[3] = std::max(ne0[3], ne1[3]);
            return true;
        case GGML_OP_CONCAT:
            // concatenation along dim 0; remaining dims broadcast
            ne_out[0] = ne0[0] + ne1[0];
            ne_out[1] = std::max(ne0[1], ne1[1]);
            ne_out[2] = std::max(ne0[2], ne1[2]);
            ne_out[3] = std::max(ne0[3], ne1[3]);
            return true;
        default:
            return false;
    }
}
// Read up to four dimension extents from a JSON array into a 4-element
// vector; missing entries (or a non-array value) are left as zero.
static std::vector<int64_t> json_get_ne(const nlohmann::json & arr) {
    std::vector<int64_t> dims(4, 0);
    if (arr.is_array()) {
        const size_t n = std::min(arr.size(), (size_t) 4);
        for (size_t i = 0; i < n; i++) {
            dims[i] = arr[i].get<int64_t>();
        }
    }
    return dims;
}
// True iff every extent in `ne` is zero, i.e. the tensor slot is unused.
static bool ne_is_zero(const std::vector<int64_t> & ne) {
    return std::all_of(ne.begin(), ne.end(), [](int64_t v) { return v == 0; });
}
// Aggregation key identifying one unique combination of op name, backend,
// source types and source shapes observed in a profile.
struct profile_op_key {
    std::string name;              // profiler op label, e.g. "MUL_MAT"
    int backend_id;                // backend id as recorded in the profile
    int type_src0;                 // ggml type ids of the sources, -1 if absent
    int type_src1;
    int type_src2;
    int sub_op;                    // unary/glu sub-operation id, -1 if n/a
    std::vector<int64_t> ne_src0;  // source shapes (4 extents each)
    std::vector<int64_t> ne_src1;
    std::vector<int64_t> ne_src2;

    bool operator==(const profile_op_key & other) const {
        if (name != other.name || backend_id != other.backend_id) {
            return false;
        }
        if (type_src0 != other.type_src0 || type_src1 != other.type_src1 || type_src2 != other.type_src2) {
            return false;
        }
        if (sub_op != other.sub_op) {
            return false;
        }
        return ne_src0 == other.ne_src0 && ne_src1 == other.ne_src1 && ne_src2 == other.ne_src2;
    }
};
// Hash functor for profile_op_key: boost-style hash_combine folding every
// field (and every shape extent) into a single seed.
struct profile_op_key_hash {
    size_t operator()(const profile_op_key & k) const {
        size_t seed = std::hash<std::string>{}(k.name);
        auto mix = [&seed](size_t v) {
            seed ^= v + 0x9e3779b9 + (seed << 6) + (seed >> 2);
        };
        mix(std::hash<int>{}(k.backend_id));
        mix(std::hash<int>{}(k.type_src0));
        mix(std::hash<int>{}(k.type_src1));
        mix(std::hash<int>{}(k.type_src2));
        mix(std::hash<int>{}(k.sub_op));
        for (auto d : k.ne_src0) { mix(std::hash<int64_t>{}(d)); }
        for (auto d : k.ne_src1) { mix(std::hash<int64_t>{}(d)); }
        for (auto d : k.ne_src2) { mix(std::hash<int64_t>{}(d)); }
        return seed;
    }
};
// Aggregated timing for one unique op key. The accumulators are
// default-initialized to zero so an instance is safe to use even when it is
// not value-initialized by a container (the map's operator[] previously
// zeroed these only by accident of value-initialization).
struct profile_op_agg {
    profile_op_key key;      // identity of the aggregated op
    uint64_t total_ns = 0;   // summed duration across all occurrences
    int64_t  count    = 0;   // number of occurrences
    double   max_ns   = 0.0; // longest single occurrence
};
// Per-backend grouping of perf test cases generated from a profile, so each
// backend recorded in the profile runs exactly the ops observed on it.
struct profile_test_plan {
    struct backend_plan {
        int backend_id;                                     // backend id as recorded in the profile
        std::string backend_name;                           // profile backend name, matched against device names
        std::vector<std::unique_ptr<test_case>> test_cases; // generated test cases for this backend
        std::vector<profile_op_agg> aggs;                   // aggregated timings backing each test case
    };
    std::vector<backend_plan> backends;
};
static profile_test_plan make_test_plan_from_profile(
const char * profile_path, int top_n) {
using json = nlohmann::json;
profile_test_plan plan;
std::ifstream f(profile_path);
if (!f.is_open()) {
fprintf(stderr, "Error: cannot open profile file: %s\n", profile_path);
return plan;
}
json root;
try {
root = json::parse(f);
} catch (const json::parse_error & e) {
fprintf(stderr, "Error: failed to parse profile JSON: %s\n", e.what());
return plan;
}
if (!root.contains("records") || !root["records"].is_array()) {
fprintf(stderr, "Error: no 'records' array found in profile\n");
return plan;
}
std::unordered_map<int, std::string> backend_names;
if (root.contains("backends") && root["backends"].is_array()) {
for (const auto & be : root["backends"]) {
int id = be.value("id", -1);
std::string name = be.value("name", "");
if (id >= 0 && !name.empty()) {
backend_names[id] = name;
}
}
}
const auto & records = root["records"];
std::unordered_map<profile_op_key, profile_op_agg, profile_op_key_hash> aggs;
for (const auto & rec : records) {
int rec_type = rec.value("type", -1);
if (rec_type != 0) continue;
std::string name = rec.value("name", "");
ggml_op op = profile_name_to_op(name);
if (op == GGML_OP_COUNT) continue;
profile_op_key key;
key.name = name;
key.backend_id = rec.value("backend_id", 0);
key.type_src0 = rec.value("type_src0", -1);
key.type_src1 = rec.value("type_src1", -1);
key.type_src2 = rec.value("type_src2", -1);
key.sub_op = rec.value("sub_op", -1);
key.ne_src0 = json_get_ne(rec.value("ne_src0", json::array()));
key.ne_src1 = json_get_ne(rec.value("ne_src1", json::array()));
key.ne_src2 = json_get_ne(rec.value("ne_src2", json::array()));
uint64_t dur = rec.value("duration_ns", (uint64_t)0);
auto & agg = aggs[key];
agg.key = key;
agg.total_ns += dur;
agg.count++;
agg.max_ns = std::max(agg.max_ns, (double)dur);
}
std::vector<profile_op_agg> sorted;
sorted.reserve(aggs.size());
for (auto & [_, agg] : aggs) {
sorted.push_back(std::move(agg));
}
std::sort(sorted.begin(), sorted.end(),
[](const profile_op_agg & a, const profile_op_agg & b) {
return a.total_ns > b.total_ns;
});
uint64_t global_max_ns = sorted.empty() ? 1 : sorted[0].total_ns;
int n = std::min(top_n, (int)sorted.size());
if (n == 0) {
fprintf(stderr, "Warning: no matching OP records found in profile\n");
return plan;
}
auto make_src = [](int type_id, const std::vector<int64_t> & ne) -> input_tensor {
input_tensor src;
src.type = (type_id >= 0) ? (ggml_type)type_id : GGML_TYPE_F32;
for (int d = 0; d < 4; d++) {
src.ne[d] = d < (int)ne.size() ? ne[d] : 0;
src.nb[d] = 0;
}
return src;
};
auto make_test_from_agg = [&](const profile_op_agg & agg) -> std::unique_ptr<test_case> {
ggml_op op = profile_name_to_op(agg.key.name);
std::vector<input_tensor> sources;
if (!ne_is_zero(agg.key.ne_src0)) {
sources.push_back(make_src(agg.key.type_src0, agg.key.ne_src0));
}
if (!ne_is_zero(agg.key.ne_src1)) {
sources.push_back(make_src(agg.key.type_src1, agg.key.ne_src1));
}
if (op == GGML_OP_MUL_MAT_ID) {
if (!ne_is_zero(agg.key.ne_src2)) {
sources.push_back(make_src(agg.key.type_src2, agg.key.ne_src2));
} else if (sources.size() >= 2) {
input_tensor src;
src.type = GGML_TYPE_I32;
src.ne[0] = sources[1].ne[1];
src.ne[1] = 1;
src.ne[2] = 1;
src.ne[3] = 1;
src.nb[0] = src.nb[1] = src.nb[2] = src.nb[3] = 0;
sources.push_back(src);
}
}
int64_t ne0[4] = {0}, ne1[4] = {0}, ne2[4] = {0};
if (sources.size() > 0) { for (int d = 0; d < 4; d++) ne0[d] = sources[0].ne[d]; }
if (sources.size() > 1) { for (int d = 0; d < 4; d++) ne1[d] = sources[1].ne[d]; }
if (sources.size() > 2) { for (int d = 0; d < 4; d++) ne2[d] = sources[2].ne[d]; }
int64_t ne_out[4] = {0, 0, 0, 0};
if (!compute_output_ne(op, ne0, ne1, ne2, ne_out)) {
return nullptr;
}
ggml_type out_type = GGML_TYPE_F32;
std::array<int32_t, GGML_MAX_OP_PARAMS / sizeof(int32_t)> op_params{};
op_params.fill(0);
if (op == GGML_OP_MUL_MAT_ID && sources.size() >= 2) {
op_params[0] = (int32_t)sources[1].ne[1];
} else if ((op == GGML_OP_UNARY || op == GGML_OP_GLU) && agg.key.sub_op >= 0) {
op_params[0] = (int32_t)agg.key.sub_op;
}
std::array<int64_t, 4> out_ne;
for (int d = 0; d < 4; d++) out_ne[d] = ne_out[d];
return std::unique_ptr<test_case>(new test_generic_op(op, out_type, out_ne, op_params, sources,
agg.key.name + " [from profile]"));
};
printf(" Loaded %d profiler ops, running top %d:\n", (int)sorted.size(), n);
for (int i = 0; i < n; i++) {
const auto & agg = sorted[i];
double pct = 100.0 * agg.total_ns / global_max_ns;
int bid = agg.key.backend_id;
std::string bname = backend_names.count(bid) ? backend_names[bid] : std::to_string(bid);
printf(" #%d: %s @ %s %ldx %.3fms total (%.1f%% of top)\n",
i + 1, agg.key.name.c_str(), bname.c_str(), agg.count,
agg.total_ns / 1e6, pct);
if (!ne_is_zero(agg.key.ne_src0)) {
const char * tn = agg.key.type_src0 >= 0 ? ggml_type_name((ggml_type)agg.key.type_src0) : "?";
printf(" src0: [%lld, %lld, %lld, %lld] (%s)\n",
(long long)agg.key.ne_src0[0], (long long)agg.key.ne_src0[1],
(long long)agg.key.ne_src0[2], (long long)agg.key.ne_src0[3], tn);
}
if (!ne_is_zero(agg.key.ne_src1)) {
const char * tn = agg.key.type_src1 >= 0 ? ggml_type_name((ggml_type)agg.key.type_src1) : "?";
printf(" src1: [%lld, %lld, %lld, %lld] (%s)\n",
(long long)agg.key.ne_src1[0], (long long)agg.key.ne_src1[1],
(long long)agg.key.ne_src1[2], (long long)agg.key.ne_src1[3], tn);
}
if (!ne_is_zero(agg.key.ne_src2)) {
const char * tn = agg.key.type_src2 >= 0 ? ggml_type_name((ggml_type)agg.key.type_src2) : "?";
printf(" src2: [%lld, %lld, %lld, %lld] (%s)\n",
(long long)agg.key.ne_src2[0], (long long)agg.key.ne_src2[1],
(long long)agg.key.ne_src2[2], (long long)agg.key.ne_src2[3], tn);
}
auto tc = make_test_from_agg(agg);
if (!tc) continue;
int bid2 = agg.key.backend_id;
std::string bname2 = backend_names.count(bid2) ? backend_names[bid2] : std::to_string(bid2);
profile_test_plan::backend_plan * bp = nullptr;
for (auto & b : plan.backends) {
if (b.backend_id == bid2 && b.backend_name == bname2) {
bp = &b;
break;
}
}
if (!bp) {
plan.backends.push_back({bid2, bname2, {}, {}});
bp = &plan.backends.back();
}
bp->test_cases.push_back(std::move(tc));
bp->aggs.push_back(agg);
}
return plan;
printf(" --test-file reads test operators from a test file generated by llama-export-graph-ops or the profiler\n");
}
int main(int argc, char ** argv) {
test_mode mode = MODE_TEST;
bool mode_explicit = false;
output_formats output_format = CONSOLE;
const char * op_names_filter = nullptr;
const char * backend_filter = nullptr;
const char * params_filter = nullptr;
const char * test_file_path = nullptr;
const char * profile_path = nullptr;
int profile_top_n = 10;
for (int i = 1; i < argc; i++) {
if (strcmp(argv[i], "test") == 0) {
mode = MODE_TEST;
mode_explicit = true;
} else if (strcmp(argv[i], "perf") == 0) {
mode = MODE_PERF;
mode_explicit = true;
} else if (strcmp(argv[i], "grad") == 0) {
mode = MODE_GRAD;
mode_explicit = true;
} else if (strcmp(argv[i], "support") == 0) {
mode = MODE_SUPPORT;
mode_explicit = true;
} else if (strcmp(argv[i], "-o") == 0) {
if (i + 1 < argc) {
op_names_filter = argv[++i];
@ -9737,34 +9331,19 @@ int main(int argc, char ** argv) {
usage(argv);
return 1;
}
} else if (strcmp(argv[i], "--from-profile") == 0) {
if (i + 1 < argc) {
profile_path = argv[++i];
} else {
usage(argv);
return 1;
}
} else if (strcmp(argv[i], "--top-n") == 0) {
if (i + 1 < argc) {
profile_top_n = atoi(argv[++i]);
if (profile_top_n <= 0) profile_top_n = 10;
} else {
usage(argv);
return 1;
}
} else {
usage(argv);
return 1;
}
}
// load and enumerate backends
ggml_backend_load_all();
if (profile_path != nullptr) {
if (test_file_path != nullptr && !mode_explicit) {
mode = MODE_PERF;
}
// load and enumerate backends
ggml_backend_load_all();
// Create printer for output format
std::unique_ptr<printer> output_printer = create_printer(output_format);
if (output_printer) {
@ -9773,83 +9352,6 @@ int main(int argc, char ** argv) {
output_printer->print_testing_start(testing_start_info(ggml_backend_dev_count()));
if (profile_path != nullptr) {
profile_test_plan plan = make_test_plan_from_profile(profile_path, profile_top_n);
size_t n_ok = 0;
size_t total = plan.backends.size();
for (size_t bi = 0; bi < plan.backends.size(); bi++) {
auto & bp = plan.backends[bi];
ggml_backend_dev_t dev = nullptr;
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
ggml_backend_dev_t d = ggml_backend_dev_get(i);
if (strcmp(ggml_backend_dev_name(d), bp.backend_name.c_str()) == 0) {
dev = d;
break;
}
}
if (dev == nullptr) {
fprintf(stderr, "Warning: backend '%s' from profile not found, skipping\n", bp.backend_name.c_str());
n_ok++;
output_printer->print_backend_init(
backend_init_info(bi, total, bp.backend_name.c_str(), true, "Not found"));
continue;
}
if (backend_filter != NULL && strcmp(backend_filter, bp.backend_name.c_str()) != 0) {
output_printer->print_backend_init(
backend_init_info(bi, total, bp.backend_name.c_str(), true, "Skipping"));
n_ok++;
continue;
}
ggml_backend_t backend = ggml_backend_dev_init(dev, NULL);
GGML_ASSERT(backend != NULL);
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
if (ggml_backend_set_n_threads_fn) {
ggml_backend_set_n_threads_fn(backend, N_THREADS);
}
size_t free, total_mem;
ggml_backend_dev_memory(dev, &free, &total_mem);
output_printer->print_backend_init(backend_init_info(bi, plan.backends.size(), bp.backend_name.c_str(),
false, "", ggml_backend_dev_description(dev),
total_mem / 1024 / 1024, free / 1024 / 1024, true));
std::vector<std::unique_ptr<test_case>> cases;
for (auto & tc : bp.test_cases) {
cases.push_back(std::move(tc));
}
bool ok = test_backend(backend, MODE_PERF, op_names_filter, params_filter,
output_printer.get(), nullptr, std::move(cases));
if (ok) {
n_ok++;
}
output_printer->print_backend_status(
backend_status_info(ggml_backend_name(backend), ok ? test_status_t::OK : test_status_t::FAIL));
ggml_backend_free(backend);
}
ggml_quantize_free();
if (output_printer) {
output_printer->print_footer();
}
output_printer->print_overall_summary(
overall_summary_info(n_ok, total, n_ok == total));
return n_ok != total;
}
size_t n_ok = 0;
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
@ -9862,7 +9364,7 @@ int main(int argc, char ** argv) {
continue;
}
if (backend_filter == NULL &&
if (backend_filter == NULL && test_file_path == NULL &&
ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU && mode != MODE_GRAD) {
output_printer->print_backend_init(backend_init_info(
i, ggml_backend_dev_count(), ggml_backend_dev_name(dev), true, "Skipping CPU backend"));

View File

@ -49,6 +49,107 @@ GGML_GLU_OP_NAMES = {
4: "GEGLU_QUICK", 5: "SWIGLU_OAI",
}
# ggml op ids -> op names, mirroring the `ggml_op` enum order in ggml.h.
# NOTE(review): these ids must stay in sync with the ggml version the profile
# was produced by — verify when ggml adds/removes ops.
GGML_OP_NAMES = {
    0: "NONE", 1: "DUP", 2: "ADD", 3: "ADD_ID", 4: "ADD1",
    5: "ACC", 6: "SUB", 7: "MUL", 8: "DIV", 9: "SQR",
    10: "SQRT", 11: "LOG", 12: "SIN", 13: "COS", 14: "SUM",
    15: "SUM_ROWS", 16: "CUMSUM", 17: "MEAN", 18: "ARGMAX",
    19: "COUNT_EQUAL", 20: "REPEAT", 21: "REPEAT_BACK", 22: "CONCAT",
    23: "SILU_BACK", 24: "NORM", 25: "RMS_NORM", 26: "RMS_NORM_BACK",
    27: "GROUP_NORM", 28: "L2_NORM", 29: "MUL_MAT", 30: "MUL_MAT_ID",
    31: "OUT_PROD", 32: "SCALE", 33: "SET", 34: "CPY", 35: "CONT",
    36: "RESHAPE", 37: "VIEW", 38: "PERMUTE", 39: "TRANSPOSE",
    40: "GET_ROWS", 41: "GET_ROWS_BACK", 42: "SET_ROWS", 43: "DIAG",
    44: "DIAG_MASK_INF", 45: "DIAG_MASK_ZERO", 46: "SOFT_MAX",
    47: "SOFT_MAX_BACK", 48: "ROPE", 49: "ROPE_BACK", 50: "CLAMP",
    51: "CONV_TRANSPOSE_1D", 52: "IM2COL", 53: "IM2COL_BACK", 54: "IM2COL_3D",
    55: "CONV_2D", 56: "CONV_3D", 57: "CONV_2D_DW", 58: "CONV_TRANSPOSE_2D",
    59: "POOL_1D", 60: "POOL_2D", 61: "POOL_2D_BACK", 62: "UPSCALE",
    63: "PAD", 64: "PAD_REFLECT_1D", 65: "ROLL", 66: "ARANGE",
    67: "TIMESTEP_EMBEDDING", 68: "ARGSORT", 69: "TOP_K", 70: "LEAKY_RELU",
    71: "TRI", 72: "FILL", 73: "FLASH_ATTN_EXT", 74: "FLASH_ATTN_BACK",
    75: "SSM_CONV", 76: "SSM_SCAN", 77: "WIN_PART", 78: "WIN_UNPART",
    79: "GET_REL_POS", 80: "ADD_REL_POS", 81: "RWKV_WKV6",
    82: "GATED_LINEAR_ATTN", 83: "RWKV_WKV7", 84: "SOLVE_TRI",
    85: "GATED_DELTA_NET", 86: "UNARY", 87: "MAP_CUSTOM1",
    88: "MAP_CUSTOM2", 89: "MAP_CUSTOM3", 90: "CUSTOM",
    91: "CROSS_ENTROPY_LOSS", 92: "CROSS_ENTROPY_LOSS_BACK",
    93: "OPT_STEP_ADAMW", 94: "OPT_STEP_SGD", 95: "GLU",
    96: "COUNT",  # sentinel, not a real op
}

# Reverse lookups: name -> id, used when matching profiler record names.
GGML_TYPE_NAMES_TO_ID = {v: k for k, v in GGML_TYPE_NAMES.items()}
GGML_OP_NAMES_TO_ID = {v: k for k, v in GGML_OP_NAMES.items()}
# Op ids excluded from export_graph_ops output: mostly view/layout ops,
# backward/training ops, and ops whose test cases cannot be reconstructed
# from profiler records alone.
_EXPORT_SKIP_OPS = frozenset({
    33,  # SET
    34,  # CPY
    35,  # CONT
    36,  # RESHAPE
    37,  # VIEW
    38,  # PERMUTE
    39,  # TRANSPOSE
    41,  # GET_ROWS_BACK
    42,  # SET_ROWS
    43,  # DIAG
    44,  # DIAG_MASK_INF
    45,  # DIAG_MASK_ZERO
    47,  # SOFT_MAX_BACK
    49,  # ROPE_BACK
    51,  # CONV_TRANSPOSE_1D
    52,  # IM2COL
    53,  # IM2COL_BACK
    54,  # IM2COL_3D
    58,  # CONV_TRANSPOSE_2D
    61,  # POOL_2D_BACK
    63,  # PAD
    64,  # PAD_REFLECT_1D
    65,  # ROLL
    66,  # ARANGE
    70,  # LEAKY_RELU (covered by UNARY)
    71,  # TRI
    72,  # FILL
    77,  # WIN_PART
    78,  # WIN_UNPART
    92,  # CROSS_ENTROPY_LOSS_BACK
    93,  # OPT_STEP_ADAMW
    94,  # OPT_STEP_SGD
    96,  # COUNT (sentinel)
})
def _compute_output_ne(op_id: int, ne0: list, ne1: list, ne2: list) -> list | None:
if op_id == 29: # MUL_MAT
return [ne0[1], ne1[1], max(ne0[2], ne1[2]), max(ne0[3], ne1[3])]
if op_id == 30: # MUL_MAT_ID
return [ne0[1], ne2[0], ne1[2], 1]
if op_id in (2, 7, 8, 32): # ADD, MUL, DIV, SCALE
return [max(ne0[i], ne1[i]) for i in range(4)]
if op_id == 4: # ADD1
return list(ne0)
if op_id in (9, 86): # SQR, UNARY
return list(ne0)
if op_id in (46, 25): # SOFT_MAX, RMS_NORM
return list(ne0)
if op_id == 73: # FLASH_ATTN_EXT
return [ne1[1], ne1[1], ne0[2], ne0[3]]
if op_id == 40: # GET_ROWS
return [ne0[0], ne1[1], ne1[2], ne1[3]]
if op_id == 41: # GET_ROWS_BACK
return [ne0[0], ne1[1], ne1[2], ne1[3]]
if op_id == 42: # SET_ROWS
return list(ne0)
if op_id == 31: # OUT_PROD
return [ne0[0], ne1[0], max(ne0[2], ne1[2]), max(ne0[3], ne1[3])]
if op_id == 22: # CONCAT
return [ne0[0] + ne1[0], max(ne0[1], ne1[1]),
max(ne0[2], ne1[2]), max(ne0[3], ne1[3])]
if op_id in (34, 35, 50, 60, 53, 68): # CPY, CONT, CLAMP, POOL_2D, IM2COL_BACK, ARGSORT
return list(ne0)
return None
@dataclass
class ProfileRecord:
@ -464,6 +565,92 @@ class ProfileData:
print(f"Chrome trace exported to: {filepath}")
print(f"Open chrome://tracing in Chrome/Edge and load this file.")
def export_graph_ops(self, filepath: str | Path) -> None:
    """Export operations in export-graph-ops format for test-backend-ops --test-file.

    Walks all OP_EVENT records, reconstructs each op's source tensors and
    output shape, de-duplicates identical (op, shapes, params, backend)
    combinations, and writes one whitespace-separated line per unique op.
    """
    # de-duplication keys of ops already emitted
    seen: set[tuple] = set()
    lines: list[str] = []
    # backend id -> backend metadata dict from the profile header
    backend_by_id: dict[int, dict] = {}
    for b in self.metadata.get("backends", []):
        backend_by_id[b["id"]] = b
    for rec in self.records:
        if rec.type != OP_EVENT:
            continue
        # resolve the op name to its ggml op id; skip unknown and excluded ops
        op_id = GGML_OP_NAMES_TO_ID.get(rec.name, -1)
        if op_id < 0:
            continue
        if op_id in _EXPORT_SKIP_OPS:
            continue
        ne0 = rec.ne_src0
        ne1 = rec.ne_src1
        ne2 = rec.ne_src2
        # absent source types are recorded as -1; normalize to 0 (type id 0)
        type_src0 = rec.type_src0 if rec.type_src0 >= 0 else 0
        type_src1 = rec.type_src1 if rec.type_src1 >= 0 else 0
        type_src2 = rec.type_src2 if rec.type_src2 >= 0 else 0
        # collect present sources as (type, ne, nb); an all-zero ne means "absent"
        sources: list[tuple[int, list, list]] = []
        if any(v != 0 for v in ne0):
            sources.append((type_src0, ne0, [0, 0, 0, 0]))
        if any(v != 0 for v in ne1):
            sources.append((type_src1, ne1, [0, 0, 0, 0]))
        if op_id == 30:  # MUL_MAT_ID: ensure rows tensor (src2) is present
            if len(sources) < 3 and any(v != 0 for v in ne2):
                sources.append((type_src2, ne2, [0, 0, 0, 0]))
            elif len(sources) < 3 and len(sources) >= 2:
                # synthesize the expert-row index tensor when the profile lacks it
                # NOTE(review): type id 24 is labelled I32 here, but upstream ggml
                # has GGML_TYPE_I8 == 24 and GGML_TYPE_I32 == 26 — verify against
                # this file's GGML_TYPE_NAMES table
                sources.append((24, [sources[1][1][1], 1, 1, 1], [0, 0, 0, 0]))  # I32
        elif any(v != 0 for v in ne2):
            sources.append((type_src2, ne2, [0, 0, 0, 0]))
        src_ne0 = sources[0][1] if len(sources) > 0 else [0, 0, 0, 0]
        src_ne1 = sources[1][1] if len(sources) > 1 else [0, 0, 0, 0]
        src_ne2 = sources[2][1] if len(sources) > 2 else [0, 0, 0, 0]
        # derive the output shape; ops we cannot infer are skipped
        ne_out = _compute_output_ne(op_id, src_ne0, src_ne1, src_ne2)
        if ne_out is None:
            continue
        op_params: list[int] = []
        if op_id == 30 and len(sources) >= 2:  # MUL_MAT_ID
            op_params.append(sources[1][1][1])
        elif op_id in (86, 95) and rec.sub_op >= 0:  # UNARY, GLU
            op_params.append(rec.sub_op)
        # prefer the device name; fall back to the backend name when the device
        # field is missing or "unknown"
        bname = ""
        if rec.backend_id in backend_by_id:
            bname = backend_by_id[rec.backend_id].get("device", "")
            if not bname or bname == "unknown":
                bname = backend_by_id[rec.backend_id].get("name", "")
        key = (op_id, tuple(ne_out), tuple(op_params), tuple((s[0], tuple(s[1])) for s in sources), bname)
        if key in seen:
            continue
        seen.add(key)
        # line format (must match the test-backend-ops parser):
        # op type ne[4] n_params params... n_sources (type ne[4] nb[4])... name [backend]
        line = f"{op_id} 0 {ne_out[0]} {ne_out[1]} {ne_out[2]} {ne_out[3]} "
        line += f"{len(op_params)}"
        for p in op_params:
            line += f" {p}"
        line += f" {len(sources)}"
        for src_type, src_ne, src_nb in sources:
            line += f" {src_type} {src_ne[0]} {src_ne[1]} {src_ne[2]} {src_ne[3]} {src_nb[0]} {src_nb[1]} {src_nb[2]} {src_nb[3]}"
        name = rec.name if rec.name else "-"
        line += f" {name}"
        if bname:
            line += f" {bname}"
        line += "\n"
        lines.append(line)
    with open(filepath, "w") as f:
        f.writelines(lines)
    print(f"Exported {len(lines)} unique ops to: {filepath}")
def export_html_viewer(self, filepath: str | Path, max_records: int = 0) -> None:
"""Export a self-contained interactive HTML timeline viewer using Canvas."""
import json as json_mod
@ -1007,6 +1194,7 @@ Examples:
python -m tools.profiler.profiler profile.json
python -m tools.profiler.profiler profile.json --chrome-trace trace.json
python -m tools.profiler.profiler profile.json --top-ops 20
python -m tools.profiler.profiler profile.json --export-ops ops.txt
""",
)
parser.add_argument("profile", help="Path to profiler JSON file")
@ -1014,6 +1202,8 @@ Examples:
help="Export as Chrome Trace Event format")
parser.add_argument("--html-viewer", metavar="FILE",
help="Export as interactive HTML timeline viewer")
parser.add_argument("--export-ops", metavar="FILE",
help="Export ops in export-graph-ops format (for test-backend-ops --test-file)")
parser.add_argument("--html-max-records", type=int, default=0,
help="Max records in HTML viewer (0=unlimited, set to downsample for huge traces)")
parser.add_argument("--top-ops", type=int, default=0,
@ -1033,6 +1223,9 @@ Examples:
if args.html_viewer:
data.export_html_viewer(args.html_viewer, max_records=args.html_max_records)
if args.export_ops:
data.export_graph_ops(args.export_ops)
if args.top_ops > 0:
print(f"\nTop {args.top_ops} operations by total time:\n")
for s in data.top_operations(args.top_ops):
@ -1055,7 +1248,7 @@ Examples:
f"{s.count:>6} calls {s.total_bytes / 1e6:.1f} MB")
print()
if args.top_ops == 0 and args.top_kernels == 0 and not args.inefficiency and not args.chrome_trace and not args.html_viewer:
if args.top_ops == 0 and args.top_kernels == 0 and not args.inefficiency and not args.chrome_trace and not args.html_viewer and not args.export_ops:
data.summary()