From 8fa720606ad3a8939dcf1fd271d8c7e84df7bb5f Mon Sep 17 00:00:00 2001
From: Herman Semenoff
Date: Tue, 16 Dec 2025 12:01:39 +0300
Subject: [PATCH] ggml: migrate work_data to stack allocation

---
 ggml/include/ggml-cpu.h        |  4 ++++
 ggml/src/ggml-cpu/ggml-cpu.c   | 25 +++++++++++++++++++++++--
 ggml/src/ggml-cpu/ggml-cpu.cpp | 34 +++++++++++++++++++++-------------
 ggml/src/ggml-impl.h           |  2 ++
 tests/test-barrier.cpp         | 28 ++++++++++++++++++++--------
 tests/test-rope.cpp            |  2 +-
 6 files changed, 71 insertions(+), 24 deletions(-)

diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h
index 4f3b99c8d0..7a68ca706c 100644
--- a/ggml/include/ggml-cpu.h
+++ b/ggml/include/ggml-cpu.h
@@ -7,6 +7,8 @@
 extern "C" {
 #endif

+#define GGML_CPLAN_INLINE_SIZE 256 * 1024
+
     // the compute plan that needs to be prepared for ggml_graph_compute()
     // since https://github.com/ggml-org/ggml/issues/287
     struct ggml_cplan {
@@ -19,6 +21,8 @@ extern "C" {
         // abort ggml_graph_compute when true
         ggml_abort_callback abort_callback;
         void * abort_callback_data;
+
+        uint8_t work_data_inline[GGML_CPLAN_INLINE_SIZE];
     };

     // numa strategies
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index a59b518938..8ed855b70a 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -2911,7 +2911,16 @@ struct ggml_cplan ggml_graph_plan(
     cplan.threadpool = threadpool;
     cplan.n_threads  = MIN(max_tasks, n_threads);
     cplan.work_size  = work_size;
-    cplan.work_data  = NULL;
+
+    if (work_size > 0) {
+        if (work_size <= GGML_CPLAN_INLINE_SIZE) {
+            cplan.work_data = cplan.work_data_inline;
+        } else {
+            cplan.work_data = NULL;
+        }
+    } else {
+        cplan.work_data = NULL;
+    }

     return cplan;
 }
@@ -3258,7 +3267,19 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
 enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
     struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads, NULL);

-    cplan.work_data = (uint8_t *)ggml_new_buffer(ctx, cplan.work_size);
+    if (cplan.work_size > 0) {
+        if (cplan.work_data == NULL) {
+            // work_data is not inlined, so we need to allocate it
+            if (cplan.work_size > GGML_MAX_STACK_SIZE) {
+                cplan.work_data = (uint8_t *)ggml_new_buffer(ctx, cplan.work_size);
+                if (cplan.work_data == NULL) {
+                    return GGML_STATUS_ALLOC_FAILED;
+                }
+            } else {
+                cplan.work_data = (uint8_t *)alloca(cplan.work_size);
+            }
+        }
+    }

     return ggml_graph_compute(cgraph, &cplan);
 }
diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
index f4713a4218..edbf015488 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -133,7 +133,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
     cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
     cpu_plan->cgraph = *cgraph; // FIXME: deep copy

-    if (cpu_plan->cplan.work_size > 0) {
+    if (cpu_plan->cplan.work_size > 0 && cpu_plan->cplan.work_data == NULL) {
         cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
         if (cpu_plan->cplan.work_data == NULL) {
             delete cpu_plan;
@@ -150,7 +150,9 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
 static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;

-    delete[] cpu_plan->cplan.work_data;
+    if (cpu_plan->cplan.work_data != cpu_plan->cplan.work_data_inline) {
+        delete[] cpu_plan->cplan.work_data;
+    }
     delete cpu_plan;

     GGML_UNUSED(backend);
@@ -169,20 +171,26 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s

     struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);

-    if (cpu_ctx->work_size < cplan.work_size) {
-        delete[] cpu_ctx->work_data;
-        cpu_ctx->work_data = new uint8_t[cplan.work_size];
-        if (cpu_ctx->work_data == NULL) {
-            cpu_ctx->work_size = 0;
-            return GGML_STATUS_ALLOC_FAILED;
-        }
-        cpu_ctx->work_size = cplan.work_size;
-    }
-    cplan.work_data = (uint8_t *)cpu_ctx->work_data;
-
     cplan.abort_callback      = cpu_ctx->abort_callback;
     cplan.abort_callback_data = cpu_ctx->abort_callback_data;

+    if (cplan.work_size > 0 && cplan.work_data == NULL) {
+        if (cplan.work_size > GGML_MAX_STACK_SIZE) {
+            if (cpu_ctx->work_size < cplan.work_size) {
+                delete[] cpu_ctx->work_data;
+                cpu_ctx->work_data = new uint8_t[cplan.work_size];
+                if (cpu_ctx->work_data == NULL) {
+                    cpu_ctx->work_size = 0;
+                    return GGML_STATUS_ALLOC_FAILED;
+                }
+                cpu_ctx->work_size = cplan.work_size;
+            }
+            cplan.work_data = (uint8_t *)cpu_ctx->work_data;
+        } else {
+            cplan.work_data = (uint8_t *)alloca(cplan.work_size);
+        }
+    }
+
     return ggml_graph_compute(cgraph, &cplan);
 }

diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index fe57d4c582..cf694025a7 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -42,6 +42,8 @@ void ggml_print_backtrace(void);
 #    define MAX(a, b) ((a) > (b) ? (a) : (b))
 #endif

+#define GGML_MAX_STACK_SIZE 2*1024*1024
+
 // required for mmap as gguf only guarantees 32-byte alignment
 #define TENSOR_ALIGNMENT 32

diff --git a/tests/test-barrier.cpp b/tests/test-barrier.cpp
index 61f73adfd2..44ae1502ec 100644
--- a/tests/test-barrier.cpp
+++ b/tests/test-barrier.cpp
@@ -47,8 +47,11 @@ static void test_barrier(int n_threads, int n_rounds) {

     // The test runs with constant number of threads
     struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads, threadpool);
-    std::vector<uint8_t> work_data(cplan.work_size);
-    cplan.work_data = work_data.data();
+    std::vector<uint8_t> work_data;
+    if (cplan.work_size > 0 && cplan.work_data == NULL) {
+        work_data.resize(cplan.work_size);
+        cplan.work_data = work_data.data();
+    }

     std::cerr << "graph-compute with"
               << "\n n_threads: " << n_threads
@@ -125,8 +128,11 @@ static void test_active(int n_threads, int n_rounds) {
     for (int i=0; i < n_rounds; i++) {
         struct ggml_cplan cplan = ggml_graph_plan(gf, (i % 4) == 0 ? 1 : n_threads, threadpool);

-        std::vector<uint8_t> work_data(cplan.work_size);
-        cplan.work_data = work_data.data();
+        std::vector<uint8_t> work_data;
+        if (cplan.work_size > 0 && cplan.work_data == NULL) {
+            work_data.resize(cplan.work_size);
+            cplan.work_data = work_data.data();
+        }

         ggml_graph_compute(gf, &cplan);
     }
@@ -197,12 +203,18 @@ static void test_multi_graph(int n_threads, int n_rounds) {
     for (int i=0; i < n_rounds; i++) {
         struct ggml_cplan cplan0 = ggml_graph_plan(gf0, (i % 4) == 0 ? 1 : n_threads, threadpool);

-        std::vector<uint8_t> work_data0(cplan0.work_size);
-        cplan0.work_data = work_data0.data();
+        std::vector<uint8_t> work_data0;
+        if (cplan0.work_size > 0 && cplan0.work_data == NULL) {
+            work_data0.resize(cplan0.work_size);
+            cplan0.work_data = work_data0.data();
+        }

         struct ggml_cplan cplan1 = ggml_graph_plan(gf1, (i % 4) == 0 ? 1 : n_threads, threadpool);
-        std::vector<uint8_t> work_data1(cplan1.work_size);
-        cplan1.work_data = work_data1.data();
+        std::vector<uint8_t> work_data1;
+        if (cplan1.work_size > 0 && cplan1.work_data == NULL) {
+            work_data1.resize(cplan1.work_size);
+            cplan1.work_data = work_data1.data();
+        }

         ggml_graph_compute(gf0, &cplan0);
         ggml_graph_compute(gf1, &cplan1);
diff --git a/tests/test-rope.cpp b/tests/test-rope.cpp
index 801e4cd827..55b0133a5e 100644
--- a/tests/test-rope.cpp
+++ b/tests/test-rope.cpp
@@ -116,7 +116,7 @@ static struct ggml_tensor * get_random_tensor_f32(
 static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
     struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);

-    if (plan.work_size > 0) {
+    if (plan.work_size > 0 && plan.work_data == NULL) {
         buf.resize(plan.work_size);
         plan.work_data = buf.data();
     }
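
For reference, a minimal caller-side sketch of the allocation contract this patch establishes, mirroring the pattern used in the updated tests: the caller allocates work_data only when ggml_graph_plan() leaves it NULL, i.e. when work_size exceeds GGML_CPLAN_INLINE_SIZE. Note that work_data_inline makes struct ggml_cplan itself roughly 256 KiB. The graph, tensor sizes and mem_size below are arbitrary placeholders, not taken from the patch.

// sketch: small plans use cplan.work_data_inline, large plans get a caller-provided buffer
#include <stdint.h>
#include <stdlib.h>

#include "ggml.h"
#include "ggml-cpu.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // placeholder graph: c = a + b
    struct ggml_tensor * a  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
    struct ggml_tensor * b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, ggml_add(ctx, a, b));

    struct ggml_cplan cplan = ggml_graph_plan(gf, /*n_threads =*/ 4, /*threadpool =*/ NULL);

    // with this patch, work_data already points at cplan.work_data_inline when
    // work_size <= GGML_CPLAN_INLINE_SIZE; only the large case needs a buffer
    uint8_t * heap_buf = NULL;
    if (cplan.work_size > 0 && cplan.work_data == NULL) {
        heap_buf = (uint8_t *) malloc(cplan.work_size);
        cplan.work_data = heap_buf;
    }

    ggml_graph_compute(gf, &cplan);

    free(heap_buf); // no-op when the inline buffer (or no buffer) was used
    ggml_free(ctx);
    return 0;
}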