ggml: migrate work_data to stack allocation

This commit is contained in:
Herman Semenoff 2025-12-16 12:01:39 +03:00
parent d6742125c3
commit 8fa720606a
No known key found for this signature in database
GPG Key ID: 1D2DC7BDC7225EF7
6 changed files with 71 additions and 24 deletions

View File

@ -7,6 +7,8 @@
extern "C" { extern "C" {
#endif #endif
// Size (in bytes) of the scratch buffer embedded directly in ggml_cplan;
// plans whose work_size fits here avoid a separate heap allocation.
// Parenthesized so the macro expands safely inside larger expressions
// (e.g. `x % GGML_CPLAN_INLINE_SIZE` would mis-associate without them).
#define GGML_CPLAN_INLINE_SIZE (256 * 1024)
// the compute plan that needs to be prepared for ggml_graph_compute() // the compute plan that needs to be prepared for ggml_graph_compute()
// since https://github.com/ggml-org/ggml/issues/287 // since https://github.com/ggml-org/ggml/issues/287
struct ggml_cplan { struct ggml_cplan {
@ -19,6 +21,8 @@ extern "C" {
// abort ggml_graph_compute when true // abort ggml_graph_compute when true
ggml_abort_callback abort_callback; ggml_abort_callback abort_callback;
void * abort_callback_data; void * abort_callback_data;
uint8_t work_data_inline[GGML_CPLAN_INLINE_SIZE];
}; };
// numa strategies // numa strategies

View File

@ -2911,7 +2911,16 @@ struct ggml_cplan ggml_graph_plan(
cplan.threadpool = threadpool; cplan.threadpool = threadpool;
cplan.n_threads = MIN(max_tasks, n_threads); cplan.n_threads = MIN(max_tasks, n_threads);
cplan.work_size = work_size; cplan.work_size = work_size;
cplan.work_data = NULL;
if (work_size > 0) {
if (work_size <= GGML_CPLAN_INLINE_SIZE) {
cplan.work_data = cplan.work_data_inline;
} else {
cplan.work_data = NULL;
}
} else {
cplan.work_data = NULL;
}
return cplan; return cplan;
} }
@ -3258,7 +3267,19 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) { enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads, NULL); struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads, NULL);
cplan.work_data = (uint8_t *)ggml_new_buffer(ctx, cplan.work_size); if (cplan.work_size > 0) {
if (cplan.work_data == NULL) {
// work_data is not inlined, so we need to allocate it
if (cplan.work_size > GGML_MAX_STACK_SIZE) {
cplan.work_data = (uint8_t *)ggml_new_buffer(ctx, cplan.work_size);
if (cplan.work_data == NULL) {
return GGML_STATUS_ALLOC_FAILED;
}
} else {
cplan.work_data = (uint8_t *)alloca(cplan.work_size);
}
}
}
return ggml_graph_compute(cgraph, &cplan); return ggml_graph_compute(cgraph, &cplan);
} }

View File

@ -133,7 +133,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
cpu_plan->cgraph = *cgraph; // FIXME: deep copy cpu_plan->cgraph = *cgraph; // FIXME: deep copy
if (cpu_plan->cplan.work_size > 0) { if (cpu_plan->cplan.work_size > 0 && cpu_plan->cplan.work_data == NULL) {
cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size]; cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
if (cpu_plan->cplan.work_data == NULL) { if (cpu_plan->cplan.work_data == NULL) {
delete cpu_plan; delete cpu_plan;
@ -150,7 +150,9 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan; struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
delete[] cpu_plan->cplan.work_data; if (cpu_plan->cplan.work_data != cpu_plan->cplan.work_data_inline) {
delete[] cpu_plan->cplan.work_data;
}
delete cpu_plan; delete cpu_plan;
GGML_UNUSED(backend); GGML_UNUSED(backend);
@ -169,20 +171,26 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
if (cpu_ctx->work_size < cplan.work_size) {
delete[] cpu_ctx->work_data;
cpu_ctx->work_data = new uint8_t[cplan.work_size];
if (cpu_ctx->work_data == NULL) {
cpu_ctx->work_size = 0;
return GGML_STATUS_ALLOC_FAILED;
}
cpu_ctx->work_size = cplan.work_size;
}
cplan.work_data = (uint8_t *)cpu_ctx->work_data;
cplan.abort_callback = cpu_ctx->abort_callback; cplan.abort_callback = cpu_ctx->abort_callback;
cplan.abort_callback_data = cpu_ctx->abort_callback_data; cplan.abort_callback_data = cpu_ctx->abort_callback_data;
if (cplan.work_size > 0 && cplan.work_data == NULL) {
if (cplan.work_size > GGML_MAX_STACK_SIZE) {
if (cpu_ctx->work_size < cplan.work_size) {
delete[] cpu_ctx->work_data;
cpu_ctx->work_data = new uint8_t[cplan.work_size];
if (cpu_ctx->work_data == NULL) {
cpu_ctx->work_size = 0;
return GGML_STATUS_ALLOC_FAILED;
}
cpu_ctx->work_size = cplan.work_size;
}
cplan.work_data = (uint8_t *)cpu_ctx->work_data;
} else {
cplan.work_data = (uint8_t *)alloca(cplan.work_size);
}
}
return ggml_graph_compute(cgraph, &cplan); return ggml_graph_compute(cgraph, &cplan);
} }

View File

@ -42,6 +42,8 @@ void ggml_print_backtrace(void);
# define MAX(a, b) ((a) > (b) ? (a) : (b)) # define MAX(a, b) ((a) > (b) ? (a) : (b))
#endif #endif
// Largest work buffer (in bytes) that callers will place on the stack via
// alloca(); anything bigger is heap-allocated instead.
// Parenthesized so the macro expands safely inside larger expressions
// (e.g. `x % GGML_MAX_STACK_SIZE` would mis-associate without them).
#define GGML_MAX_STACK_SIZE (2*1024*1024)
// required for mmap as gguf only guarantees 32-byte alignment // required for mmap as gguf only guarantees 32-byte alignment
#define TENSOR_ALIGNMENT 32 #define TENSOR_ALIGNMENT 32

View File

@ -47,8 +47,11 @@ static void test_barrier(int n_threads, int n_rounds) {
// The test runs with constant number of threads // The test runs with constant number of threads
struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads, threadpool); struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads, threadpool);
std::vector<uint8_t> work_data(cplan.work_size); std::vector<uint8_t> work_data;
cplan.work_data = work_data.data(); if (cplan.work_size > 0 && cplan.work_data == NULL) {
work_data.resize(cplan.work_size);
cplan.work_data = work_data.data();
}
std::cerr << "graph-compute with" std::cerr << "graph-compute with"
<< "\n n_threads: " << n_threads << "\n n_threads: " << n_threads
@ -125,8 +128,11 @@ static void test_active(int n_threads, int n_rounds) {
for (int i=0; i < n_rounds; i++) { for (int i=0; i < n_rounds; i++) {
struct ggml_cplan cplan = ggml_graph_plan(gf, (i % 4) == 0 ? 1 : n_threads, threadpool); struct ggml_cplan cplan = ggml_graph_plan(gf, (i % 4) == 0 ? 1 : n_threads, threadpool);
std::vector<uint8_t> work_data(cplan.work_size); std::vector<uint8_t> work_data;
cplan.work_data = work_data.data(); if (cplan.work_size > 0 && cplan.work_data == NULL) {
work_data.resize(cplan.work_size);
cplan.work_data = work_data.data();
}
ggml_graph_compute(gf, &cplan); ggml_graph_compute(gf, &cplan);
} }
@ -197,12 +203,18 @@ static void test_multi_graph(int n_threads, int n_rounds) {
for (int i=0; i < n_rounds; i++) { for (int i=0; i < n_rounds; i++) {
struct ggml_cplan cplan0 = ggml_graph_plan(gf0, (i % 4) == 0 ? 1 : n_threads, threadpool); struct ggml_cplan cplan0 = ggml_graph_plan(gf0, (i % 4) == 0 ? 1 : n_threads, threadpool);
std::vector<uint8_t> work_data0(cplan0.work_size); std::vector<uint8_t> work_data0;
cplan0.work_data = work_data0.data(); if (cplan0.work_size > 0 && cplan0.work_data == NULL) {
work_data0.resize(cplan0.work_size);
cplan0.work_data = work_data0.data();
}
struct ggml_cplan cplan1 = ggml_graph_plan(gf1, (i % 4) == 0 ? 1 : n_threads, threadpool); struct ggml_cplan cplan1 = ggml_graph_plan(gf1, (i % 4) == 0 ? 1 : n_threads, threadpool);
std::vector<uint8_t> work_data1(cplan1.work_size); std::vector<uint8_t> work_data1;
cplan1.work_data = work_data1.data(); if (cplan1.work_size > 0 && cplan1.work_data == NULL) {
work_data1.resize(cplan1.work_size);
cplan1.work_data = work_data1.data();
}
ggml_graph_compute(gf0, &cplan0); ggml_graph_compute(gf0, &cplan0);
ggml_graph_compute(gf1, &cplan1); ggml_graph_compute(gf1, &cplan1);

View File

@ -116,7 +116,7 @@ static struct ggml_tensor * get_random_tensor_f32(
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) { static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr); struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
if (plan.work_size > 0) { if (plan.work_size > 0 && plan.work_data == NULL) {
buf.resize(plan.work_size); buf.resize(plan.work_size);
plan.work_data = buf.data(); plan.work_data = buf.data();
} }