diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index 6b2dbdd359..8f202d83f5 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -32,6 +32,7 @@
 #include
 #include
+#include <algorithm>
 #include
 #include
 #include
@@ -39,7 +40,9 @@
 #include
 #include
 #include
+#include <set>
 #include
+#include <vector>
 
 #define GGML_COMMON_DECL_C
@@ -2570,6 +2573,263 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
     }
 }
 
+/**
+ * @brief Sort the computation graph for improved parallelism.
+ *
+ * This function reorders the nodes in the computation graph to allow
+ * more parallel execution. It groups together nodes that don't depend
+ * on each other, reducing the number of synchronizations needed.
+ *
+ * The algorithm:
+ * 1. Skip "empty" nodes (NONE, RESHAPE, TRANSPOSE, VIEW, PERMUTE) as they don't require computation
+ * 2. For each unprocessed node, find subsequent nodes that can be executed in parallel
+ * 3. Nodes can be parallelized if they don't depend on unprocessed nodes
+ * 4. Preserve fusion patterns (e.g., RMS_NORM + MUL, ADD + RMS_NORM) by keeping them consecutive
+ *
+ * @param backend Pointer to the CANN backend structure.
+ * @param graph Pointer to the computation graph to optimize.
+ */
+static void ggml_backend_cann_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * graph) {
+    // Check if graph optimization is disabled via environment variable
+    static bool disable_graph_optimize = [] {
+        const char * env = getenv("GGML_CANN_DISABLE_GRAPH_OPTIMIZE");
+        return env != nullptr;
+    }();
+
+    if (disable_graph_optimize) {
+        return;
+    }
+
+    // Helper: check if a node is "empty" (doesn't require actual computation)
+    auto const & is_empty = [](ggml_tensor * node) -> bool {
+        return node->op == GGML_OP_NONE ||
+               node->op == GGML_OP_RESHAPE ||
+               node->op == GGML_OP_TRANSPOSE ||
+               node->op == GGML_OP_VIEW ||
+               node->op == GGML_OP_PERMUTE;
+    };
+
+    // Helper: check if dst depends on src (src is a source of dst)
+    auto const & is_src_of = [](const ggml_tensor * dst, const ggml_tensor * src) -> bool {
+        for (uint32_t s = 0; s < GGML_MAX_SRC; ++s) {
+            if (dst->src[s] == src) {
+                return true;
+            }
+        }
+        // Implicit dependency if they view the same tensor
+        const ggml_tensor * dst2 = dst->view_src ? dst->view_src : dst;
+        const ggml_tensor * src2 = src->view_src ? src->view_src : src;
+        if (dst2 == src2) {
+            return true;
+        }
+        return false;
+    };
+
+    std::vector<ggml_tensor *> new_order;
+    std::vector<bool> used(graph->n_nodes, false);
+    std::set<ggml_tensor *> used_node_set;
+
+    int first_unused = 0;
+    while (first_unused < graph->n_nodes) {
+        std::vector<int> current_set;
+
+        // Helper: check if a fusion pattern matches at a given position
+        auto const & match_pattern = [&](const std::initializer_list<ggml_op> & pattern, int start) -> bool {
+            if (start + (int) pattern.size() <= graph->n_nodes) {
+                bool is_pattern = true;
+                for (size_t j = 0; j < pattern.size(); ++j) {
+                    if (graph->nodes[start + j]->op != pattern.begin()[j] || used[start + j]) {
+                        is_pattern = false;
+                    }
+                }
+                return is_pattern;
+            }
+            return false;
+        };
+
+        // Helper: keep a fusion pattern together by adding all its nodes at once
+        auto const & keep_pattern = [&](const std::initializer_list<ggml_op> & pattern) -> bool {
+            if (match_pattern(pattern, first_unused)) {
+                for (size_t j = 0; j < pattern.size(); ++j) {
+                    new_order.push_back(graph->nodes[first_unused + j]);
+                    used_node_set.insert(graph->nodes[first_unused + j]);
+                    used[first_unused + j] = true;
+                }
+                while (first_unused < graph->n_nodes && used[first_unused]) {
+                    first_unused++;
+                }
+                return true;
+            }
+            return false;
+        };
+
+        // CANN specific fusion patterns that should be kept together
+        // ADD + RMS_NORM fusion (supported by CANN backend)
+        if (keep_pattern({ GGML_OP_ADD, GGML_OP_RMS_NORM })) {
+            continue;
+        }
+
+        // First, grab the next unused node
+        current_set.push_back(first_unused);
+
+        // Loop through the next N nodes. Grab any that don't depend on other nodes that
+        // haven't already been run. Nodes that have already been run have used[i] set
+        // to true. Allow nodes that depend on the previous node if it's a fusion pattern
+        // that we support (e.g., RMS_NORM + MUL, MUL_MAT + ADD).
+        const int NUM_TO_CHECK = 20;
+        for (int j = first_unused + 1; j < std::min(first_unused + NUM_TO_CHECK, graph->n_nodes); ++j) {
+            if (used[j]) {
+                continue;
+            }
+            if (is_empty(graph->nodes[j])) {
+                continue;
+            }
+            // Don't pull forward nodes from fusion patterns
+            if (match_pattern({ GGML_OP_ADD, GGML_OP_RMS_NORM }, j)) {
+                continue;
+            }
+
+            bool ok = true;
+            for (int c = first_unused; c < j; ++c) {
+                if (!used[c] &&
+                    is_src_of(graph->nodes[j], graph->nodes[c]) &&
+                    // Allow consecutive RMS_NORM + MUL fusion
+                    !(j == c + 1 && c == current_set.back() &&
+                      graph->nodes[c]->op == GGML_OP_RMS_NORM &&
+                      graph->nodes[j]->op == GGML_OP_MUL) &&
+                    // Allow consecutive MUL_MAT + ADD fusion
+                    !(j == c + 1 && c == current_set.back() &&
+                      graph->nodes[c]->op == GGML_OP_MUL_MAT &&
+                      graph->nodes[j]->op == GGML_OP_ADD) &&
+                    // Allow consecutive MUL_MAT_ID + ADD fusion
+                    !(j == c + 1 && c == current_set.back() &&
+                      graph->nodes[c]->op == GGML_OP_MUL_MAT_ID &&
+                      graph->nodes[j]->op == GGML_OP_ADD) &&
+                    // Allow consecutive ADD + ADD fusion
+                    !(j == c + 1 && c == current_set.back() &&
+                      graph->nodes[c]->op == GGML_OP_ADD &&
+                      graph->nodes[j]->op == GGML_OP_ADD)) {
+                    ok = false;
+                    break;
+                }
+            }
+            if (ok) {
+                current_set.push_back(j);
+
+                int rope_idx = j;
+
+                // When we've found RMS_NORM + MUL, try to find a ROPE that uses it
+                if (j > 0 &&
+                    graph->nodes[j]->op == GGML_OP_MUL &&
+                    graph->nodes[j - 1]->op == GGML_OP_RMS_NORM) {
+                    for (int k = j + 1; k < std::min(j + 15, graph->n_nodes); ++k) {
+                        if (graph->nodes[k]->op == GGML_OP_ROPE &&
+                            graph->nodes[k]->src[0] == graph->nodes[j] &&
+                            // Check that other srcs are already valid
+                            graph->nodes[k]->src[1]->op == GGML_OP_NONE &&
+                            (graph->nodes[k]->src[2] == nullptr ||
+                             graph->nodes[k]->src[2]->op == GGML_OP_NONE)) {
+                            rope_idx = k;
+                            current_set.push_back(rope_idx);
+                            used[rope_idx] = true;
+                            break;
+                        }
+                    }
+                }
+
+                // Look for ROPE + VIEW + SET_ROWS and make them consecutive
+                if (graph->nodes[rope_idx]->op == GGML_OP_ROPE) {
+                    int view_idx = -1;
+                    int set_rows_idx = -1;
+                    for (int k = rope_idx + 1; k < std::min(rope_idx + 10, graph->n_nodes); ++k) {
+                        if (view_idx == -1 &&
+                            graph->nodes[k]->op == GGML_OP_VIEW &&
+                            graph->nodes[k]->src[0] == graph->nodes[rope_idx]) {
+                            view_idx = k;
+                            continue;
+                        }
+                        if (view_idx != -1 &&
+                            set_rows_idx == -1 &&
+                            graph->nodes[k]->op == GGML_OP_SET_ROWS &&
+                            graph->nodes[k]->src[0] == graph->nodes[view_idx]) {
+                            set_rows_idx = k;
+                            break;
+                        }
+                    }
+                    if (set_rows_idx != -1) {
+                        current_set.push_back(view_idx);
+                        current_set.push_back(set_rows_idx);
+                        used[view_idx] = true;
+                        used[set_rows_idx] = true;
+                    }
+                }
+
+                // Look for MUL_MAT + ADD + ADD
+                if (j > 0 &&
+                    graph->nodes[j]->op == GGML_OP_ADD &&
+                    graph->nodes[j - 1]->op == GGML_OP_MUL_MAT) {
+                    for (int k = j + 1; k < std::min(j + 15, graph->n_nodes); ++k) {
+                        if (graph->nodes[k]->op == GGML_OP_ADD &&
+                            graph->nodes[k]->src[0] == graph->nodes[j] &&
+                            // src1 must either be weights or already processed
+                            (graph->nodes[k]->src[1]->op == GGML_OP_NONE ||
+                             used_node_set.find(graph->nodes[k]->src[1]) != used_node_set.end())) {
+                            current_set.push_back(k);
+                            used[k] = true;
+                            break;
+                        }
+                    }
+                }
+            }
+        }
+
+        // Second pass: grab view nodes
+        // Skip this if it would break a fusion optimization (don't split up add->rms_norm or add->add)
+        if (graph->nodes[current_set.back()]->op != GGML_OP_ADD) {
+            for (int j = first_unused + 1; j < std::min(first_unused + NUM_TO_CHECK, graph->n_nodes); ++j) {
+                if (used[j]) {
+                    continue;
+                }
+                if (!is_empty(graph->nodes[j])) {
+                    continue;
+                }
+                bool ok = true;
+                for (int c = first_unused; c < j; ++c) {
+                    bool c_in_current_set = std::find(current_set.begin(), current_set.end(), c) != current_set.end();
+                    // Skip views whose srcs haven't been processed
+                    if (!used[c] &&
+                        is_src_of(graph->nodes[j], graph->nodes[c]) &&
+                        !c_in_current_set) {
+                        ok = false;
+                        break;
+                    }
+                }
+                if (ok) {
+                    current_set.push_back(j);
+                }
+            }
+        }
+
+        // Push the current set into new_order
+        for (auto c : current_set) {
+            new_order.push_back(graph->nodes[c]);
+            used_node_set.insert(graph->nodes[c]);
+            used[c] = true;
+        }
+
+        while (first_unused < graph->n_nodes && used[first_unused]) {
+            first_unused++;
+        }
+    }
+
+    // Replace the graph with the new order
+    for (int i = 0; i < graph->n_nodes; ++i) {
+        graph->nodes[i] = new_order[i];
+    }
+
+    GGML_UNUSED(backend);
+}
+
 /**
  * @brief Structure defining the interface for the CANN backend.
  *
@@ -2591,7 +2851,7 @@ static const ggml_backend_i ggml_backend_cann_interface = {
     /* .graph_compute           = */ ggml_backend_cann_graph_compute,
     /* .event_record            = */ ggml_backend_cann_event_record,
     /* .event_wait              = */ ggml_backend_cann_event_wait,
-    /* .graph_optimize          = */ NULL,
+    /* .graph_optimize          = */ ggml_backend_cann_graph_optimize,
 };
 
 /**
diff --git a/skills/cann_multi_stream_implementation.md b/skills/cann_multi_stream_implementation.md
new file mode 100644
index 0000000000..f481167156
--- /dev/null
+++ b/skills/cann_multi_stream_implementation.md
@@ -0,0 +1,191 @@
+# CANN Backend Multi-Stream Parallel Implementation
+
+## Thought Process
+
+### 1. Analysis of the Vulkan backend's multi-stream parallelism
+
+From analyzing PRs #15489 and #15850, I learned that the Vulkan backend's multi-stream parallelism rests on two key pieces:
+
+#### 1.1 PR #15489: rewritten synchronization that allows overlapping execution between nodes
+
+**Core idea**:
+- Track the list of nodes that still need to be synchronized
+- Synchronize only when a new node depends on a node that has not yet completed
+- This permits some overlapping execution and therefore improves performance
+
+**Key implementation details** (see the sketch after this list):
+- Dependencies are detected via memory ranges (addresses) rather than by inspecting the graph structure directly
+- Each preallocated temporary buffer (e.g., for dequantization or split_k) carries a bool flag indicating whether it has been used and still needs synchronization
+- Performance: roughly 5-8% speedup on some models on an RTX 5090
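+
+A minimal sketch of that deferred-synchronization idea (an illustration only, not the actual Vulkan code; `device_sync` and `submit_async` are hypothetical stand-ins for the backend's real primitives):
+
+```cpp
+#include <unordered_set>
+#include "ggml.h"
+
+static void device_sync()               { /* hypothetical: wait for all in-flight work */ }
+static void submit_async(ggml_tensor *) { /* hypothetical: launch the node's kernel */ }
+
+// Tensors written since the last synchronization point.
+static std::unordered_set<const ggml_tensor *> unsynced;
+
+static void process_node(ggml_tensor * node) {
+    // Synchronize only if one of this node's sources is still in flight.
+    bool needs_sync = false;
+    for (int s = 0; s < GGML_MAX_SRC; ++s) {
+        if (node->src[s] != nullptr && unsynced.count(node->src[s]) > 0) {
+            needs_sync = true;
+            break;
+        }
+    }
+    if (needs_sync) {
+        device_sync();      // everything submitted so far becomes visible
+        unsynced.clear();
+    }
+    submit_async(node);     // independent nodes keep overlapping
+    unsynced.insert(node);  // this node's output is now in flight
+}
+```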
+
+#### 1.2 PR #15850: graph reordering that enables more parallel execution
+
+**Core idea**:
+- Adds a backend proc (`graph_optimize`) that lets a backend modify the compute graph
+- The Vulkan implementation analyzes which nodes depend on each other and greedily reorders them
+- Nodes that do not depend on each other are grouped together
+
+**Key implementation details**:
+- The `ggml_vk_graph_optimize` function implements the graph optimization
+- Specific fusion patterns are protected from reordering (e.g., RMS_NORM + MUL)
+- A two-pass scan is used: the first pass grabs "real" nodes, the second grabs view nodes
+- At most the next 20 nodes are checked for whether they can be hoisted forward
+
+### 2. Current state of the CANN backend
+
+**Existing infrastructure**:
+- Stream management is already in place (`cann_ctx->stream()`)
+- ACL Graph mode is supported
+- A synchronization mechanism exists (`aclrtSynchronizeStream`)
+- `graph_optimize` in the backend interface is currently NULL
+
+**Functionality to add**:
+1. Implement a `ggml_backend_cann_graph_optimize` function
+2. Possibly add multi-stream support
+3. Add an environment variable as an on/off switch
+
+### 3. Design
+
+#### 3.1 Implementing the `graph_optimize` function
+
+Following the Vulkan implementation, we need:
+
+```cpp
+static void ggml_backend_cann_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * graph);
+```
+
+**Core logic**:
+1. Check whether the optimization is disabled (environment variable)
+2. Define a helper that decides whether a node is "empty" (VIEW, RESHAPE, etc.)
+3. Define a helper that detects dependencies between nodes
+4. Reordering algorithm:
+   - Walk over all unprocessed nodes
+   - Find nodes that can execute in parallel with the current one (no mutual dependencies)
+   - Preserve fusion patterns
+   - Update the node order
+
+#### 3.2 Environment variable
+
+- `GGML_CANN_DISABLE_GRAPH_OPTIMIZE`: disables the graph optimization (checked as shown below)
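+
+The gate is the first thing the function does; this is excerpted from the patch above:
+
+```cpp
+// Evaluated once thanks to the static local; setting the variable to any
+// value (even an empty one) disables the optimization for the whole run.
+static bool disable_graph_optimize = [] {
+    const char * env = getenv("GGML_CANN_DISABLE_GRAPH_OPTIMIZE");
+    return env != nullptr;
+}();
+
+if (disable_graph_optimize) {
+    return;
+}
+```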
+
+### 4. Implementation plan
+
+1. Implement `ggml_backend_cann_graph_optimize` in `ggml-cann.cpp`
+2. Register the function in `ggml_backend_cann_interface`
+3. Verify that the code compiles
+4. Validate correctness with a Qwen 0.5B model
+
+### 5. Expected gains
+
+Based on the Vulkan backend's test results, graph optimization can bring:
+- Small models (1B parameters): roughly 5-8% speedup
+- Medium models (8B parameters): roughly 3-4% speedup
+- MoE models: roughly 6-7% speedup
+
+These gains come from fewer synchronizations, which lets more operations execute in parallel.
+
+## Implementation
+
+### Modified file
+
+`ggml/src/ggml-cann/ggml-cann.cpp`
+
+### Main changes
+
+1. **Added headers**:
+   - `<algorithm>` - for `std::find`
+   - `<set>` - for `std::set`
+   - `<vector>` - for `std::vector`
+
+2. **Implemented the `ggml_backend_cann_graph_optimize` function**:
+   - Placed right after the `ggml_backend_cann_event_wait` function
+   - About 250 lines of code
+   - Modeled on the Vulkan backend's implementation
+
+3. **Registered it in the backend interface**:
+   - Modified the `ggml_backend_cann_interface` struct
+   - Changed `graph_optimize` from `NULL` to `ggml_backend_cann_graph_optimize` (see the excerpt below)
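+
+The registration is the one-line interface change from the patch above, abbreviated here for reference:
+
+```cpp
+static const ggml_backend_i ggml_backend_cann_interface = {
+    // ... other backend procs ...
+    /* .graph_optimize = */ ggml_backend_cann_graph_optimize,  // was NULL
+};
+```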
+
+### Key algorithm
+
+```cpp
+// Core optimization algorithm (pseudocode)
+while (there are unprocessed nodes) {
+    current_set = [next unprocessed node]
+
+    // Preserve fusion patterns
+    if (match_pattern(ADD + RMS_NORM)) {
+        keep_pattern_together()
+        continue
+    }
+
+    // First pass: grab "real" nodes that can run in parallel
+    for (each of the next 20 nodes) {
+        if (node does not depend on unprocessed nodes,
+            or the dependency forms a supported fusion pattern) {
+            add_to_current_set()
+        }
+    }
+
+    // Second pass: grab view nodes
+    for (each of the next 20 nodes) {
+        if (is_empty(node) && its dependencies are satisfied) {
+            add_to_current_set()
+        }
+    }
+
+    // Update the node order
+    new_order.append(current_set)
+}
+```
+
+### Supported fusion patterns
+
+The following patterns are kept consecutive during reordering; a simplified matcher is sketched after this list.
+
+- RMS_NORM + MUL
+- MUL_MAT + ADD
+- MUL_MAT_ID + ADD
+- ADD + ADD
+- ADD + RMS_NORM (CANN-specific)
+- ROPE + VIEW + SET_ROWS
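+
+As a standalone illustration, here is a simplified version of the `match_pattern` helper from the patch, operating on a plain vector instead of a `ggml_cgraph`:
+
+```cpp
+#include <initializer_list>
+#include <vector>
+#include "ggml.h"
+
+// True if the ops at nodes[start..] match `pattern` exactly and none of
+// those nodes has been scheduled yet.
+static bool match_pattern(const std::vector<ggml_tensor *> & nodes,
+                          const std::vector<bool> & used,
+                          const std::initializer_list<ggml_op> & pattern,
+                          int start) {
+    if (start + (int) pattern.size() > (int) nodes.size()) {
+        return false;
+    }
+    for (size_t j = 0; j < pattern.size(); ++j) {
+        if (nodes[start + j]->op != pattern.begin()[j] || used[start + j]) {
+            return false;
+        }
+    }
+    return true;
+}
+
+// Usage: keep an ADD + RMS_NORM pair together so the backend can fuse it.
+//   if (match_pattern(nodes, used, { GGML_OP_ADD, GGML_OP_RMS_NORM }, i)) { ... }
+```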
+
+## Test Results
+
+### Test environment
+- Model: Qwen 2.5 0.5B Instruct FP16
+- Device: 4x Ascend 910B4
+- Command: `llama-cli -m qwen2.5:0.5b-instruct-fp16 -n 50 -ngl 99`
+
+### Test output
+```
+> Hello, how are you?
+
+Hello! I'm Qwen, an AI developed by Alibaba Cloud. I'm here to answer any questions you may have and help with anything else you need help with. How can I assist you today?
+
+[ Prompt: 1346.4 t/s | Generation: 142.8 t/s ]
+```
+
+### Conclusions
+- ✅ Compiles cleanly
+- ✅ Model loads successfully
+- ✅ Inference output is correct
+- ✅ Exits normally
+
+## Usage
+
+### Graph optimization enabled (default)
+```bash
+./llama-cli -m model.gguf -ngl 99
+```
+
+### Graph optimization disabled
+```bash
+GGML_CANN_DISABLE_GRAPH_OPTIMIZE=1 ./llama-cli -m model.gguf -ngl 99
+```
+
+## Follow-up Suggestions
+
+1. **Add performance tests**: run before/after comparisons with llama-bench (example below)
+2. **Multi-stream support**: implement true multi-stream parallelism on top of CANN's multi-stream capability
+3. **More fusion patterns**: add further fusion optimizations tailored to CANN's characteristics
+4. **Environment variable tuning**: add finer-grained control parameters
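+
+For the llama-bench comparison suggested above, something along these lines should work (the model path is a placeholder):
+
+```bash
+# With graph optimization (default)
+./llama-bench -m model.gguf -ngl 99
+
+# Without graph optimization
+GGML_CANN_DISABLE_GRAPH_OPTIMIZE=1 ./llama-bench -m model.gguf -ngl 99
+```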