From 8fa720606ad3a8939dcf1fd271d8c7e84df7bb5f Mon Sep 17 00:00:00 2001
From: Herman Semenoff
Date: Tue, 16 Dec 2025 12:01:39 +0300
Subject: [PATCH] ggml: migrate work_data to stack allocation

---
 ggml/include/ggml-cpu.h        |  4 ++++
 ggml/src/ggml-cpu/ggml-cpu.c   | 25 +++++++++++++++++++++++--
 ggml/src/ggml-cpu/ggml-cpu.cpp | 34 +++++++++++++++++++++-------------
 ggml/src/ggml-impl.h           |  2 ++
 tests/test-barrier.cpp         | 28 ++++++++++++++++++++--------
 tests/test-rope.cpp            |  2 +-
 6 files changed, 71 insertions(+), 24 deletions(-)

diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h
index 4f3b99c8d0..7a68ca706c 100644
--- a/ggml/include/ggml-cpu.h
+++ b/ggml/include/ggml-cpu.h
@@ -7,6 +7,8 @@
 extern "C" {
 #endif

+#define GGML_CPLAN_INLINE_SIZE 256 * 1024
+
     // the compute plan that needs to be prepared for ggml_graph_compute()
     // since https://github.com/ggml-org/ggml/issues/287
     struct ggml_cplan {
@@ -19,6 +21,8 @@ extern "C" {
         // abort ggml_graph_compute when true
         ggml_abort_callback abort_callback;
         void * abort_callback_data;
+
+        uint8_t work_data_inline[GGML_CPLAN_INLINE_SIZE];
     };

     // numa strategies
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index a59b518938..8ed855b70a 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -2911,7 +2911,16 @@ struct ggml_cplan ggml_graph_plan(
     cplan.threadpool = threadpool;
     cplan.n_threads  = MIN(max_tasks, n_threads);
     cplan.work_size  = work_size;
-    cplan.work_data  = NULL;
+
+    if (work_size > 0) {
+        if (work_size <= GGML_CPLAN_INLINE_SIZE) {
+            cplan.work_data = cplan.work_data_inline;
+        } else {
+            cplan.work_data = NULL;
+        }
+    } else {
+        cplan.work_data = NULL;
+    }

     return cplan;
 }
@@ -3258,7 +3267,19 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
 enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
     struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads, NULL);

-    cplan.work_data = (uint8_t *)ggml_new_buffer(ctx, cplan.work_size);
+    if (cplan.work_size > 0) {
+        if (cplan.work_data == NULL) {
+            // work_data is not inlined, so we need to allocate it
+            if (cplan.work_size > GGML_MAX_STACK_SIZE) {
+                cplan.work_data = (uint8_t *)ggml_new_buffer(ctx, cplan.work_size);
+                if (cplan.work_data == NULL) {
+                    return GGML_STATUS_ALLOC_FAILED;
+                }
+            } else {
+                cplan.work_data = (uint8_t *)alloca(cplan.work_size);
+            }
+        }
+    }

     return ggml_graph_compute(cgraph, &cplan);
 }
diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
index f4713a4218..edbf015488 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -133,7 +133,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
     cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
     cpu_plan->cgraph = *cgraph; // FIXME: deep copy

-    if (cpu_plan->cplan.work_size > 0) {
+    if (cpu_plan->cplan.work_size > 0 && cpu_plan->cplan.work_data == NULL) {
         cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
         if (cpu_plan->cplan.work_data == NULL) {
             delete cpu_plan;
@@ -150,7 +150,9 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
 static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;

-    delete[] cpu_plan->cplan.work_data;
+    if (cpu_plan->cplan.work_data != cpu_plan->cplan.work_data_inline) {
+        delete[] cpu_plan->cplan.work_data;
+    }
     delete cpu_plan;

     GGML_UNUSED(backend);
@@ -169,20 +171,26 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s

     struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);

-    if (cpu_ctx->work_size < cplan.work_size) {
-        delete[] cpu_ctx->work_data;
-        cpu_ctx->work_data = new uint8_t[cplan.work_size];
-        if (cpu_ctx->work_data == NULL) {
-            cpu_ctx->work_size = 0;
-            return GGML_STATUS_ALLOC_FAILED;
-        }
-        cpu_ctx->work_size = cplan.work_size;
-    }
-    cplan.work_data = (uint8_t *)cpu_ctx->work_data;
-
     cplan.abort_callback      = cpu_ctx->abort_callback;
     cplan.abort_callback_data = cpu_ctx->abort_callback_data;

+    if (cplan.work_size > 0 && cplan.work_data == NULL) {
+        if (cplan.work_size > GGML_MAX_STACK_SIZE) {
+            if (cpu_ctx->work_size < cplan.work_size) {
+                delete[] cpu_ctx->work_data;
+                cpu_ctx->work_data = new uint8_t[cplan.work_size];
+                if (cpu_ctx->work_data == NULL) {
+                    cpu_ctx->work_size = 0;
+                    return GGML_STATUS_ALLOC_FAILED;
+                }
+                cpu_ctx->work_size = cplan.work_size;
+            }
+            cplan.work_data = (uint8_t *)cpu_ctx->work_data;
+        } else {
+            cplan.work_data = (uint8_t *)alloca(cplan.work_size);
+        }
+    }
+
     return ggml_graph_compute(cgraph, &cplan);
 }

diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index fe57d4c582..cf694025a7 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -42,6 +42,8 @@ void ggml_print_backtrace(void);
 #    define MAX(a, b) ((a) > (b) ? (a) : (b))
 #endif

+#define GGML_MAX_STACK_SIZE 2*1024*1024
+
 // required for mmap as gguf only guarantees 32-byte alignment
 #define TENSOR_ALIGNMENT 32

diff --git a/tests/test-barrier.cpp b/tests/test-barrier.cpp
index 61f73adfd2..44ae1502ec 100644
--- a/tests/test-barrier.cpp
+++ b/tests/test-barrier.cpp
@@ -47,8 +47,11 @@ static void test_barrier(int n_threads, int n_rounds) {

     // The test runs with constant number of threads
     struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads, threadpool);
-    std::vector<uint8_t> work_data(cplan.work_size);
-    cplan.work_data = work_data.data();
+    std::vector<uint8_t> work_data;
+    if (cplan.work_size > 0 && cplan.work_data == NULL) {
+        work_data.resize(cplan.work_size);
+        cplan.work_data = work_data.data();
+    }

     std::cerr << "graph-compute with"
               << "\n n_threads: " << n_threads
@@ -125,8 +128,11 @@ static void test_active(int n_threads, int n_rounds) {
     for (int i=0; i < n_rounds; i++) {
         struct ggml_cplan cplan = ggml_graph_plan(gf, (i % 4) == 0 ? 1 : n_threads, threadpool);

-        std::vector<uint8_t> work_data(cplan.work_size);
-        cplan.work_data = work_data.data();
+        std::vector<uint8_t> work_data;
+        if (cplan.work_size > 0 && cplan.work_data == NULL) {
+            work_data.resize(cplan.work_size);
+            cplan.work_data = work_data.data();
+        }

         ggml_graph_compute(gf, &cplan);
     }
@@ -197,12 +203,18 @@ static void test_multi_graph(int n_threads, int n_rounds) {
     for (int i=0; i < n_rounds; i++) {
         struct ggml_cplan cplan0 = ggml_graph_plan(gf0, (i % 4) == 0 ? 1 : n_threads, threadpool);

-        std::vector<uint8_t> work_data0(cplan0.work_size);
-        cplan0.work_data = work_data0.data();
+        std::vector<uint8_t> work_data0;
+        if (cplan0.work_size > 0 && cplan0.work_data == NULL) {
+            work_data0.resize(cplan0.work_size);
+            cplan0.work_data = work_data0.data();
+        }

         struct ggml_cplan cplan1 = ggml_graph_plan(gf1, (i % 4) == 0 ? 1 : n_threads, threadpool);
-        std::vector<uint8_t> work_data1(cplan1.work_size);
-        cplan1.work_data = work_data1.data();
+        std::vector<uint8_t> work_data1;
+        if (cplan1.work_size > 0 && cplan1.work_data == NULL) {
+            work_data1.resize(cplan1.work_size);
+            cplan1.work_data = work_data1.data();
+        }

         ggml_graph_compute(gf0, &cplan0);
         ggml_graph_compute(gf1, &cplan1);
diff --git a/tests/test-rope.cpp b/tests/test-rope.cpp
index 801e4cd827..55b0133a5e 100644
--- a/tests/test-rope.cpp
+++ b/tests/test-rope.cpp
@@ -116,7 +116,7 @@ static struct ggml_tensor * get_random_tensor_f32(
 static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
     struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);

-    if (plan.work_size > 0) {
+    if (plan.work_size > 0 && plan.work_data == NULL) {
         buf.resize(plan.work_size);
         plan.work_data = buf.data();
     }
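
For reference, a minimal caller-side sketch of the allocation contract this patch establishes, mirroring the pattern used in the updated tests: the caller allocates work_data only when ggml_graph_plan() leaves it NULL, i.e. when work_size exceeds GGML_CPLAN_INLINE_SIZE. Note that work_data_inline makes struct ggml_cplan itself roughly 256 KiB. The graph, tensor sizes and mem_size below are arbitrary placeholders, not taken from the patch.

// sketch: small plans use cplan.work_data_inline, large plans get a caller-provided buffer
#include <stdint.h>
#include <stdlib.h>

#include "ggml.h"
#include "ggml-cpu.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // placeholder graph: c = a + b
    struct ggml_tensor * a  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
    struct ggml_tensor * b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, ggml_add(ctx, a, b));

    struct ggml_cplan cplan = ggml_graph_plan(gf, /*n_threads =*/ 4, /*threadpool =*/ NULL);

    // with this patch, work_data already points at cplan.work_data_inline when
    // work_size <= GGML_CPLAN_INLINE_SIZE; only the large case needs a buffer
    uint8_t * heap_buf = NULL;
    if (cplan.work_size > 0 && cplan.work_data == NULL) {
        heap_buf = (uint8_t *) malloc(cplan.work_size);
        cplan.work_data = heap_buf;
    }

    ggml_graph_compute(gf, &cplan);

    free(heap_buf); // no-op when the inline buffer (or no buffer) was used
    ggml_free(ctx);
    return 0;
}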