cann: optimize multi-stream execution

- Use parse_bool() for GGML_CANN_MULTI_STREAM environment variable parsing, consistent with other env var handling - Only synchronize dependent streams instead of all streams when a node has multiple dependencies, reducing sync overhead - Performance improvement: ~9% faster prompt processing on 0.5B model (1838 t/s vs 1688 t/s with ACL graph disabled)
2026-02-03 07:37:18 +00:00 · 2026-02-03 07:37:18 +00:00 · 4951a4ff7a
parent 906bfed0ca
commit 4951a4ff7a
1 changed files with 9 additions and 15 deletions
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@ -2229,10 +2229,7 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx
    static bool opt_fusion = parse_bool(get_env_as_lowercase("GGML_CANN_OPERATOR_FUSION").value_or(""));

    // Check if multi-stream execution is enabled
-    static bool multi_stream_enabled = [] {
-        const char * env = getenv("GGML_CANN_MULTI_STREAM");
-        return env != nullptr && (strcmp(env, "1") == 0 || strcmp(env, "true") == 0);
-    }();
+    static bool multi_stream_enabled = parse_bool(get_env_as_lowercase("GGML_CANN_MULTI_STREAM").value_or(""));

    if (!use_cann_graph || cann_graph_capture_required) {
        if (multi_stream_enabled) {
@ -2336,15 +2333,15 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx
                    // Single dependency - execute on the same stream to avoid sync overhead
                    exec_stream = *dependent_streams.begin();
                } else {
-                    // Multiple dependencies - sync all to stream 0 and execute there
-                    sync_all_to_stream(0);
-                    exec_stream = 0;
-                    current_stream = 1;
+                    // Multiple dependencies - pick the first dependent stream and wait for others
+                    exec_stream = *dependent_streams.begin();
                }

-                // If we depend on a different stream, wait for it
-                if (dependent_streams.size() == 1 && *dependent_streams.begin() != exec_stream) {
-                    wait_for_stream(*dependent_streams.begin(), exec_stream);
+                // Wait for all dependent streams (except the exec_stream itself)
+                for (int dep_stream : dependent_streams) {
+                    if (dep_stream != exec_stream) {
+                        wait_for_stream(dep_stream, exec_stream);
+                    }
                }

                // Execute the node on the chosen stream
@ -2442,10 +2439,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend,
    bool use_cann_graph = true;

    // Check if multi-stream execution is enabled (must check before using use_cann_graph)
-    static bool multi_stream_enabled = [] {
-        const char * env = getenv("GGML_CANN_MULTI_STREAM");
-        return env != nullptr && (strcmp(env, "1") == 0 || strcmp(env, "true") == 0);
-    }();
+    static bool multi_stream_enabled = parse_bool(get_env_as_lowercase("GGML_CANN_MULTI_STREAM").value_or(""));

    // Multi-stream mode is incompatible with ACL graph capture/execution
    if (multi_stream_enabled) {