diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 34ec09d403..ce9469936b 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -1527,6 +1527,8 @@ private: #endif // GGML_VULKAN_MEMORY_DEBUG static bool vk_perf_logger_enabled = false; +static bool vk_perf_logger_concurrent = false; +static bool vk_enable_sync_logger = false; // number of calls between perf logger prints static uint32_t vk_perf_logger_frequency = 1; @@ -1577,14 +1579,14 @@ class vk_perf_logger { flops.clear(); } - void log_timing(const ggml_tensor * node, const char *fusion_name, uint64_t time) { + std::string get_node_fusion_name(const ggml_tensor * node, const char *fusion_name, uint64_t *n_flops) { + *n_flops = 0; std::string fusion_str; if (fusion_name) { fusion_str = fusion_name + std::string(" "); } if (node->op == GGML_OP_UNARY) { - timings[fusion_str + ggml_unary_op_name(ggml_get_unary_op(node))].push_back(time); - return; + return fusion_str + ggml_unary_op_name(ggml_get_unary_op(node)); } if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) { const uint64_t m = node->ne[0]; @@ -1606,9 +1608,8 @@ class vk_perf_logger { name += " batch=" + std::to_string(batch); } name = fusion_str + name; - timings[name].push_back(time); - flops[name].push_back(m * n * (k + (k - 1)) * batch); - return; + *n_flops = m * n * (k + (k - 1)) * batch; + return name; } if (node->op == GGML_OP_CONV_2D || node->op == GGML_OP_CONV_TRANSPOSE_2D) { std::string name = ggml_op_name(node->op); @@ -1624,20 +1625,17 @@ class vk_perf_logger { uint64_t size_M = Cout; uint64_t size_K = Cin * KW * KH; uint64_t size_N = N * OW * OH; - uint64_t n_flops = size_M * size_N * (size_K + (size_K - 1)); + *n_flops = size_M * size_N * (size_K + (size_K - 1)); name += " M=Cout=" + std::to_string(size_M) + ", K=Cin*KW*KH=" + std::to_string(size_K) + ", N=N*OW*OH=" + std::to_string(size_N); name = fusion_str + name; - flops[name].push_back(n_flops); - timings[name].push_back(time); - return; + return name; } if (node->op == GGML_OP_RMS_NORM) { std::string name = ggml_op_name(node->op); name += "(" + std::to_string(node->ne[0]) + "," + std::to_string(node->ne[1]) + "," + std::to_string(node->ne[2]) + "," + std::to_string(node->ne[3]) + ")"; name = fusion_str + name; - timings[name].push_back(time); - return; + return name; } if (node->op == GGML_OP_FLASH_ATTN_EXT) { const ggml_tensor * dst = node; @@ -1653,8 +1651,7 @@ class vk_perf_logger { " k(" << k->ne[0] << "," << k->ne[1] << "," << k->ne[2] << "," << k->ne[3] << "), " << " v(" << v->ne[0] << "," << v->ne[1] << "," << v->ne[2] << "," << v->ne[3] << "), " << " m(" << (m?m->ne[0]:0) << "," << (m?m->ne[1]:0) << "," << (m?m->ne[2]:0) << "," << (m?m->ne[3]:0) << ")"; - timings[name.str()].push_back(time); - return; + return name.str(); } if (node->op == GGML_OP_TOP_K) { std::stringstream name; @@ -1662,11 +1659,38 @@ class vk_perf_logger { name << ggml_op_name(node->op) << " K=" << node->ne[0] << " (" << node->src[0]->ne[0] << "," << node->src[0]->ne[1] << "," << node->src[0]->ne[2] << "," << node->src[0]->ne[3] << ")"; - timings[name.str()].push_back(time); - return; + return name.str(); } - timings[fusion_str + ggml_op_name(node->op)].push_back(time); + return fusion_str + ggml_op_name(node->op); } + + void log_timing(const ggml_tensor * node, const char *fusion_name, uint64_t time) { + uint64_t n_flops; + std::string name = get_node_fusion_name(node, fusion_name, &n_flops); + if (n_flops) { + flops[name].push_back(n_flops); + } + timings[name].push_back(time); + } + + void log_timing(const std::vector &nodes, const std::vector &names, uint64_t time) { + uint64_t total_flops = 0; + std::string name; + for (size_t n = 0; n < nodes.size(); ++n) { + uint64_t n_flops = 0; + name += get_node_fusion_name(nodes[n], names[n], &n_flops); + total_flops += n_flops; + + if (n != nodes.size() - 1) { + name += ", "; + } + } + if (total_flops) { + flops[name].push_back(total_flops); + } + timings[name].push_back(time); + } + private: std::map> timings; std::map> flops; @@ -1729,7 +1753,9 @@ struct ggml_backend_vk_context { std::unique_ptr perf_logger; vk::QueryPool query_pool; std::vector query_fusion_names; + std::vector query_fusion_node_count; std::vector query_nodes; + std::vector query_node_idx; int32_t num_queries {}; int32_t query_idx {}; }; @@ -5194,6 +5220,8 @@ static void ggml_vk_instance_init() { } vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr; + vk_perf_logger_concurrent = getenv("GGML_VK_PERF_LOGGER_CONCURRENT") != nullptr; + vk_enable_sync_logger = getenv("GGML_VK_SYNC_LOGGER") != nullptr; const char* GGML_VK_PERF_LOGGER_FREQUENCY = getenv("GGML_VK_PERF_LOGGER_FREQUENCY"); if (GGML_VK_PERF_LOGGER_FREQUENCY != nullptr) { @@ -11820,15 +11848,18 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr } } -#define ENABLE_SYNC_LOGGING 0 - if (need_sync) { -#if ENABLE_SYNC_LOGGING - std::cerr << "sync" << std::endl; -#endif + if (vk_enable_sync_logger) { + std::cerr << "sync" << std::endl; + } ctx->unsynced_nodes_written.clear(); ctx->unsynced_nodes_read.clear(); ggml_vk_sync_buffers(ctx, compute_ctx); + + if (vk_perf_logger_enabled && vk_perf_logger_concurrent) { + ctx->query_node_idx[ctx->query_idx] = node_idx; + compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++); + } } // Add all fused nodes to the unsynchronized lists. for (int32_t i = 0; i < ctx->num_additional_fused_ops + 1; ++i) { @@ -11845,20 +11876,20 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr } } } -#if ENABLE_SYNC_LOGGING - for (int i = 0; i < ctx->num_additional_fused_ops + 1; ++i) { - auto *n = cgraph->nodes[node_idx + i]; - std::cerr << node_idx + i << " " << ggml_op_name(n->op) << " " << n->name; - if (n->op == GGML_OP_GLU) { - std::cerr << " " << ggml_glu_op_name(ggml_get_glu_op(n)) << " " << (n->src[1] ? "split" : "single") << " "; + if (vk_enable_sync_logger) { + for (int i = 0; i < ctx->num_additional_fused_ops + 1; ++i) { + auto *n = cgraph->nodes[node_idx + i]; + std::cerr << node_idx + i << " " << ggml_op_name(n->op) << " " << n->name; + if (n->op == GGML_OP_GLU) { + std::cerr << " " << ggml_glu_op_name(ggml_get_glu_op(n)) << " " << (n->src[1] ? "split" : "single") << " "; + } + if (n->op == GGML_OP_ROPE) { + const int mode = ((const int32_t *) n->op_params)[2]; + std::cerr << " rope mode: " << mode; + } + std::cerr << std::endl; } - if (n->op == GGML_OP_ROPE) { - const int mode = ((const int32_t *) n->op_params)[2]; - std::cerr << " rope mode: " << mode; - } - std::cerr << std::endl; } -#endif switch (node->op) { case GGML_OP_REPEAT: @@ -13138,12 +13169,16 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg ctx->query_pool = ctx->device->device.createQueryPool(query_create_info); ctx->num_queries = query_create_info.queryCount; ctx->query_fusion_names.resize(ctx->num_queries); + ctx->query_fusion_node_count.resize(ctx->num_queries); ctx->query_nodes.resize(ctx->num_queries); + ctx->query_node_idx.resize(ctx->num_queries); } ctx->device->device.resetQueryPool(ctx->query_pool, 0, cgraph->n_nodes+1); std::fill(ctx->query_fusion_names.begin(), ctx->query_fusion_names.end(), nullptr); + std::fill(ctx->query_fusion_node_count.begin(), ctx->query_fusion_node_count.end(), 0); std::fill(ctx->query_nodes.begin(), ctx->query_nodes.end(), nullptr); + std::fill(ctx->query_node_idx.begin(), ctx->query_node_idx.end(), 0); GGML_ASSERT(ctx->compute_ctx.expired()); compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); @@ -13272,9 +13307,16 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg } else { compute_ctx = ctx->compute_ctx.lock(); } - ctx->query_nodes[ctx->query_idx] = cgraph->nodes[i]; - ctx->query_fusion_names[ctx->query_idx] = fusion_string; - compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++); + if (!vk_perf_logger_concurrent) { + // track a single node/fusion for the current query + ctx->query_nodes[ctx->query_idx] = cgraph->nodes[i]; + ctx->query_fusion_names[ctx->query_idx] = fusion_string; + compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++); + } else { + // track a fusion string and number of fused ops for the current node_idx + ctx->query_fusion_names[i] = fusion_string; + ctx->query_fusion_node_count[i] = ctx->num_additional_fused_ops; + } } if (enqueued) { @@ -13316,12 +13358,32 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg // Get the results and pass them to the logger std::vector timestamps(cgraph->n_nodes + 1); VK_CHECK(ctx->device->device.getQueryPoolResults(ctx->query_pool, 0, ctx->query_idx, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), "get timestamp results"); - for (int i = 1; i < ctx->query_idx; i++) { - auto node = ctx->query_nodes[i]; - auto name = ctx->query_fusion_names[i]; - ctx->perf_logger->log_timing(node, name, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod)); + if (!vk_perf_logger_concurrent) { + // Log each op separately + for (int i = 1; i < ctx->query_idx; i++) { + auto node = ctx->query_nodes[i]; + auto name = ctx->query_fusion_names[i]; + ctx->perf_logger->log_timing(node, name, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod)); + } + } else { + // Log each group of nodes + int prev_node_idx = 0; + for (int i = 1; i < ctx->query_idx; i++) { + auto cur_node_idx = ctx->query_node_idx[i]; + std::vector nodes; + std::vector names; + for (int node_idx = prev_node_idx; node_idx < cur_node_idx; ++node_idx) { + if (ggml_op_is_empty(cgraph->nodes[node_idx]->op)) { + continue; + } + nodes.push_back(cgraph->nodes[node_idx]); + names.push_back(ctx->query_fusion_names[node_idx]); + node_idx += ctx->query_fusion_node_count[node_idx]; + } + prev_node_idx = cur_node_idx; + ctx->perf_logger->log_timing(nodes, names, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod)); + } } - ctx->perf_logger->print_timings(); }