Merge 12d11ba583 into ef83fb8601
This commit is contained in:
commit
ac9d29615b
|
|
@ -1527,6 +1527,8 @@ private:
|
||||||
#endif // GGML_VULKAN_MEMORY_DEBUG
|
#endif // GGML_VULKAN_MEMORY_DEBUG
|
||||||
|
|
||||||
static bool vk_perf_logger_enabled = false;
|
static bool vk_perf_logger_enabled = false;
|
||||||
|
static bool vk_perf_logger_concurrent = false;
|
||||||
|
static bool vk_enable_sync_logger = false;
|
||||||
// number of calls between perf logger prints
|
// number of calls between perf logger prints
|
||||||
static uint32_t vk_perf_logger_frequency = 1;
|
static uint32_t vk_perf_logger_frequency = 1;
|
||||||
|
|
||||||
|
|
@ -1577,14 +1579,14 @@ class vk_perf_logger {
|
||||||
flops.clear();
|
flops.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
void log_timing(const ggml_tensor * node, const char *fusion_name, uint64_t time) {
|
std::string get_node_fusion_name(const ggml_tensor * node, const char *fusion_name, uint64_t *n_flops) {
|
||||||
|
*n_flops = 0;
|
||||||
std::string fusion_str;
|
std::string fusion_str;
|
||||||
if (fusion_name) {
|
if (fusion_name) {
|
||||||
fusion_str = fusion_name + std::string(" ");
|
fusion_str = fusion_name + std::string(" ");
|
||||||
}
|
}
|
||||||
if (node->op == GGML_OP_UNARY) {
|
if (node->op == GGML_OP_UNARY) {
|
||||||
timings[fusion_str + ggml_unary_op_name(ggml_get_unary_op(node))].push_back(time);
|
return fusion_str + ggml_unary_op_name(ggml_get_unary_op(node));
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) {
|
if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) {
|
||||||
const uint64_t m = node->ne[0];
|
const uint64_t m = node->ne[0];
|
||||||
|
|
@ -1606,9 +1608,8 @@ class vk_perf_logger {
|
||||||
name += " batch=" + std::to_string(batch);
|
name += " batch=" + std::to_string(batch);
|
||||||
}
|
}
|
||||||
name = fusion_str + name;
|
name = fusion_str + name;
|
||||||
timings[name].push_back(time);
|
*n_flops = m * n * (k + (k - 1)) * batch;
|
||||||
flops[name].push_back(m * n * (k + (k - 1)) * batch);
|
return name;
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
if (node->op == GGML_OP_CONV_2D || node->op == GGML_OP_CONV_TRANSPOSE_2D) {
|
if (node->op == GGML_OP_CONV_2D || node->op == GGML_OP_CONV_TRANSPOSE_2D) {
|
||||||
std::string name = ggml_op_name(node->op);
|
std::string name = ggml_op_name(node->op);
|
||||||
|
|
@ -1624,20 +1625,17 @@ class vk_perf_logger {
|
||||||
uint64_t size_M = Cout;
|
uint64_t size_M = Cout;
|
||||||
uint64_t size_K = Cin * KW * KH;
|
uint64_t size_K = Cin * KW * KH;
|
||||||
uint64_t size_N = N * OW * OH;
|
uint64_t size_N = N * OW * OH;
|
||||||
uint64_t n_flops = size_M * size_N * (size_K + (size_K - 1));
|
*n_flops = size_M * size_N * (size_K + (size_K - 1));
|
||||||
name += " M=Cout=" + std::to_string(size_M) + ", K=Cin*KW*KH=" + std::to_string(size_K) +
|
name += " M=Cout=" + std::to_string(size_M) + ", K=Cin*KW*KH=" + std::to_string(size_K) +
|
||||||
", N=N*OW*OH=" + std::to_string(size_N);
|
", N=N*OW*OH=" + std::to_string(size_N);
|
||||||
name = fusion_str + name;
|
name = fusion_str + name;
|
||||||
flops[name].push_back(n_flops);
|
return name;
|
||||||
timings[name].push_back(time);
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
if (node->op == GGML_OP_RMS_NORM) {
|
if (node->op == GGML_OP_RMS_NORM) {
|
||||||
std::string name = ggml_op_name(node->op);
|
std::string name = ggml_op_name(node->op);
|
||||||
name += "(" + std::to_string(node->ne[0]) + "," + std::to_string(node->ne[1]) + "," + std::to_string(node->ne[2]) + "," + std::to_string(node->ne[3]) + ")";
|
name += "(" + std::to_string(node->ne[0]) + "," + std::to_string(node->ne[1]) + "," + std::to_string(node->ne[2]) + "," + std::to_string(node->ne[3]) + ")";
|
||||||
name = fusion_str + name;
|
name = fusion_str + name;
|
||||||
timings[name].push_back(time);
|
return name;
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
if (node->op == GGML_OP_FLASH_ATTN_EXT) {
|
if (node->op == GGML_OP_FLASH_ATTN_EXT) {
|
||||||
const ggml_tensor * dst = node;
|
const ggml_tensor * dst = node;
|
||||||
|
|
@ -1653,8 +1651,7 @@ class vk_perf_logger {
|
||||||
" k(" << k->ne[0] << "," << k->ne[1] << "," << k->ne[2] << "," << k->ne[3] << "), " <<
|
" k(" << k->ne[0] << "," << k->ne[1] << "," << k->ne[2] << "," << k->ne[3] << "), " <<
|
||||||
" v(" << v->ne[0] << "," << v->ne[1] << "," << v->ne[2] << "," << v->ne[3] << "), " <<
|
" v(" << v->ne[0] << "," << v->ne[1] << "," << v->ne[2] << "," << v->ne[3] << "), " <<
|
||||||
" m(" << (m?m->ne[0]:0) << "," << (m?m->ne[1]:0) << "," << (m?m->ne[2]:0) << "," << (m?m->ne[3]:0) << ")";
|
" m(" << (m?m->ne[0]:0) << "," << (m?m->ne[1]:0) << "," << (m?m->ne[2]:0) << "," << (m?m->ne[3]:0) << ")";
|
||||||
timings[name.str()].push_back(time);
|
return name.str();
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
if (node->op == GGML_OP_TOP_K) {
|
if (node->op == GGML_OP_TOP_K) {
|
||||||
std::stringstream name;
|
std::stringstream name;
|
||||||
|
|
@ -1662,11 +1659,38 @@ class vk_perf_logger {
|
||||||
name << ggml_op_name(node->op) <<
|
name << ggml_op_name(node->op) <<
|
||||||
" K=" << node->ne[0] <<
|
" K=" << node->ne[0] <<
|
||||||
" (" << node->src[0]->ne[0] << "," << node->src[0]->ne[1] << "," << node->src[0]->ne[2] << "," << node->src[0]->ne[3] << ")";
|
" (" << node->src[0]->ne[0] << "," << node->src[0]->ne[1] << "," << node->src[0]->ne[2] << "," << node->src[0]->ne[3] << ")";
|
||||||
timings[name.str()].push_back(time);
|
return name.str();
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
timings[fusion_str + ggml_op_name(node->op)].push_back(time);
|
return fusion_str + ggml_op_name(node->op);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void log_timing(const ggml_tensor * node, const char *fusion_name, uint64_t time) {
|
||||||
|
uint64_t n_flops;
|
||||||
|
std::string name = get_node_fusion_name(node, fusion_name, &n_flops);
|
||||||
|
if (n_flops) {
|
||||||
|
flops[name].push_back(n_flops);
|
||||||
|
}
|
||||||
|
timings[name].push_back(time);
|
||||||
|
}
|
||||||
|
|
||||||
|
void log_timing(const std::vector<ggml_tensor *> &nodes, const std::vector<const char *> &names, uint64_t time) {
|
||||||
|
uint64_t total_flops = 0;
|
||||||
|
std::string name;
|
||||||
|
for (size_t n = 0; n < nodes.size(); ++n) {
|
||||||
|
uint64_t n_flops = 0;
|
||||||
|
name += get_node_fusion_name(nodes[n], names[n], &n_flops);
|
||||||
|
total_flops += n_flops;
|
||||||
|
|
||||||
|
if (n != nodes.size() - 1) {
|
||||||
|
name += ", ";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (total_flops) {
|
||||||
|
flops[name].push_back(total_flops);
|
||||||
|
}
|
||||||
|
timings[name].push_back(time);
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
std::map<std::string, std::vector<uint64_t>> timings;
|
std::map<std::string, std::vector<uint64_t>> timings;
|
||||||
std::map<std::string, std::vector<uint64_t>> flops;
|
std::map<std::string, std::vector<uint64_t>> flops;
|
||||||
|
|
@ -1729,7 +1753,9 @@ struct ggml_backend_vk_context {
|
||||||
std::unique_ptr<vk_perf_logger> perf_logger;
|
std::unique_ptr<vk_perf_logger> perf_logger;
|
||||||
vk::QueryPool query_pool;
|
vk::QueryPool query_pool;
|
||||||
std::vector<const char *> query_fusion_names;
|
std::vector<const char *> query_fusion_names;
|
||||||
|
std::vector<int> query_fusion_node_count;
|
||||||
std::vector<ggml_tensor *> query_nodes;
|
std::vector<ggml_tensor *> query_nodes;
|
||||||
|
std::vector<int> query_node_idx;
|
||||||
int32_t num_queries {};
|
int32_t num_queries {};
|
||||||
int32_t query_idx {};
|
int32_t query_idx {};
|
||||||
};
|
};
|
||||||
|
|
@ -5194,6 +5220,8 @@ static void ggml_vk_instance_init() {
|
||||||
}
|
}
|
||||||
|
|
||||||
vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr;
|
vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr;
|
||||||
|
vk_perf_logger_concurrent = getenv("GGML_VK_PERF_LOGGER_CONCURRENT") != nullptr;
|
||||||
|
vk_enable_sync_logger = getenv("GGML_VK_SYNC_LOGGER") != nullptr;
|
||||||
const char* GGML_VK_PERF_LOGGER_FREQUENCY = getenv("GGML_VK_PERF_LOGGER_FREQUENCY");
|
const char* GGML_VK_PERF_LOGGER_FREQUENCY = getenv("GGML_VK_PERF_LOGGER_FREQUENCY");
|
||||||
|
|
||||||
if (GGML_VK_PERF_LOGGER_FREQUENCY != nullptr) {
|
if (GGML_VK_PERF_LOGGER_FREQUENCY != nullptr) {
|
||||||
|
|
@ -11820,15 +11848,18 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#define ENABLE_SYNC_LOGGING 0
|
|
||||||
|
|
||||||
if (need_sync) {
|
if (need_sync) {
|
||||||
#if ENABLE_SYNC_LOGGING
|
if (vk_enable_sync_logger) {
|
||||||
std::cerr << "sync" << std::endl;
|
std::cerr << "sync" << std::endl;
|
||||||
#endif
|
}
|
||||||
ctx->unsynced_nodes_written.clear();
|
ctx->unsynced_nodes_written.clear();
|
||||||
ctx->unsynced_nodes_read.clear();
|
ctx->unsynced_nodes_read.clear();
|
||||||
ggml_vk_sync_buffers(ctx, compute_ctx);
|
ggml_vk_sync_buffers(ctx, compute_ctx);
|
||||||
|
|
||||||
|
if (vk_perf_logger_enabled && vk_perf_logger_concurrent) {
|
||||||
|
ctx->query_node_idx[ctx->query_idx] = node_idx;
|
||||||
|
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
// Add all fused nodes to the unsynchronized lists.
|
// Add all fused nodes to the unsynchronized lists.
|
||||||
for (int32_t i = 0; i < ctx->num_additional_fused_ops + 1; ++i) {
|
for (int32_t i = 0; i < ctx->num_additional_fused_ops + 1; ++i) {
|
||||||
|
|
@ -11845,7 +11876,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#if ENABLE_SYNC_LOGGING
|
if (vk_enable_sync_logger) {
|
||||||
for (int i = 0; i < ctx->num_additional_fused_ops + 1; ++i) {
|
for (int i = 0; i < ctx->num_additional_fused_ops + 1; ++i) {
|
||||||
auto *n = cgraph->nodes[node_idx + i];
|
auto *n = cgraph->nodes[node_idx + i];
|
||||||
std::cerr << node_idx + i << " " << ggml_op_name(n->op) << " " << n->name;
|
std::cerr << node_idx + i << " " << ggml_op_name(n->op) << " " << n->name;
|
||||||
|
|
@ -11858,7 +11889,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
||||||
}
|
}
|
||||||
std::cerr << std::endl;
|
std::cerr << std::endl;
|
||||||
}
|
}
|
||||||
#endif
|
}
|
||||||
|
|
||||||
switch (node->op) {
|
switch (node->op) {
|
||||||
case GGML_OP_REPEAT:
|
case GGML_OP_REPEAT:
|
||||||
|
|
@ -13138,12 +13169,16 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||||
ctx->query_pool = ctx->device->device.createQueryPool(query_create_info);
|
ctx->query_pool = ctx->device->device.createQueryPool(query_create_info);
|
||||||
ctx->num_queries = query_create_info.queryCount;
|
ctx->num_queries = query_create_info.queryCount;
|
||||||
ctx->query_fusion_names.resize(ctx->num_queries);
|
ctx->query_fusion_names.resize(ctx->num_queries);
|
||||||
|
ctx->query_fusion_node_count.resize(ctx->num_queries);
|
||||||
ctx->query_nodes.resize(ctx->num_queries);
|
ctx->query_nodes.resize(ctx->num_queries);
|
||||||
|
ctx->query_node_idx.resize(ctx->num_queries);
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx->device->device.resetQueryPool(ctx->query_pool, 0, cgraph->n_nodes+1);
|
ctx->device->device.resetQueryPool(ctx->query_pool, 0, cgraph->n_nodes+1);
|
||||||
std::fill(ctx->query_fusion_names.begin(), ctx->query_fusion_names.end(), nullptr);
|
std::fill(ctx->query_fusion_names.begin(), ctx->query_fusion_names.end(), nullptr);
|
||||||
|
std::fill(ctx->query_fusion_node_count.begin(), ctx->query_fusion_node_count.end(), 0);
|
||||||
std::fill(ctx->query_nodes.begin(), ctx->query_nodes.end(), nullptr);
|
std::fill(ctx->query_nodes.begin(), ctx->query_nodes.end(), nullptr);
|
||||||
|
std::fill(ctx->query_node_idx.begin(), ctx->query_node_idx.end(), 0);
|
||||||
|
|
||||||
GGML_ASSERT(ctx->compute_ctx.expired());
|
GGML_ASSERT(ctx->compute_ctx.expired());
|
||||||
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
||||||
|
|
@ -13272,9 +13307,16 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||||
} else {
|
} else {
|
||||||
compute_ctx = ctx->compute_ctx.lock();
|
compute_ctx = ctx->compute_ctx.lock();
|
||||||
}
|
}
|
||||||
|
if (!vk_perf_logger_concurrent) {
|
||||||
|
// track a single node/fusion for the current query
|
||||||
ctx->query_nodes[ctx->query_idx] = cgraph->nodes[i];
|
ctx->query_nodes[ctx->query_idx] = cgraph->nodes[i];
|
||||||
ctx->query_fusion_names[ctx->query_idx] = fusion_string;
|
ctx->query_fusion_names[ctx->query_idx] = fusion_string;
|
||||||
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
|
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
|
||||||
|
} else {
|
||||||
|
// track a fusion string and number of fused ops for the current node_idx
|
||||||
|
ctx->query_fusion_names[i] = fusion_string;
|
||||||
|
ctx->query_fusion_node_count[i] = ctx->num_additional_fused_ops;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (enqueued) {
|
if (enqueued) {
|
||||||
|
|
@ -13316,12 +13358,32 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||||
// Get the results and pass them to the logger
|
// Get the results and pass them to the logger
|
||||||
std::vector<uint64_t> timestamps(cgraph->n_nodes + 1);
|
std::vector<uint64_t> timestamps(cgraph->n_nodes + 1);
|
||||||
VK_CHECK(ctx->device->device.getQueryPoolResults(ctx->query_pool, 0, ctx->query_idx, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), "get timestamp results");
|
VK_CHECK(ctx->device->device.getQueryPoolResults(ctx->query_pool, 0, ctx->query_idx, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), "get timestamp results");
|
||||||
|
if (!vk_perf_logger_concurrent) {
|
||||||
|
// Log each op separately
|
||||||
for (int i = 1; i < ctx->query_idx; i++) {
|
for (int i = 1; i < ctx->query_idx; i++) {
|
||||||
auto node = ctx->query_nodes[i];
|
auto node = ctx->query_nodes[i];
|
||||||
auto name = ctx->query_fusion_names[i];
|
auto name = ctx->query_fusion_names[i];
|
||||||
ctx->perf_logger->log_timing(node, name, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod));
|
ctx->perf_logger->log_timing(node, name, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod));
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
// Log each group of nodes
|
||||||
|
int prev_node_idx = 0;
|
||||||
|
for (int i = 1; i < ctx->query_idx; i++) {
|
||||||
|
auto cur_node_idx = ctx->query_node_idx[i];
|
||||||
|
std::vector<ggml_tensor *> nodes;
|
||||||
|
std::vector<const char *> names;
|
||||||
|
for (int node_idx = prev_node_idx; node_idx < cur_node_idx; ++node_idx) {
|
||||||
|
if (ggml_op_is_empty(cgraph->nodes[node_idx]->op)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
nodes.push_back(cgraph->nodes[node_idx]);
|
||||||
|
names.push_back(ctx->query_fusion_names[node_idx]);
|
||||||
|
node_idx += ctx->query_fusion_node_count[node_idx];
|
||||||
|
}
|
||||||
|
prev_node_idx = cur_node_idx;
|
||||||
|
ctx->perf_logger->log_timing(nodes, names, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod));
|
||||||
|
}
|
||||||
|
}
|
||||||
ctx->perf_logger->print_timings();
|
ctx->perf_logger->print_timings();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue