ggml: migrate work_data to stack allocation

This commit is contained in:
Herman Semenoff 2025-12-16 12:01:39 +03:00
parent d6742125c3
commit 8fa720606a
No known key found for this signature in database
GPG Key ID: 1D2DC7BDC7225EF7
6 changed files with 71 additions and 24 deletions

View File

@ -7,6 +7,8 @@
extern "C" { extern "C" {
#endif #endif
// Size (in bytes) of the scratch buffer embedded directly in ggml_cplan;
// plans whose work_size fits here avoid a separate heap allocation.
// Parenthesized so the macro expands safely inside larger expressions
// (e.g. `x % GGML_CPLAN_INLINE_SIZE` would mis-associate without them).
#define GGML_CPLAN_INLINE_SIZE (256 * 1024)
// the compute plan that needs to be prepared for ggml_graph_compute() // the compute plan that needs to be prepared for ggml_graph_compute()
// since https://github.com/ggml-org/ggml/issues/287 // since https://github.com/ggml-org/ggml/issues/287
struct ggml_cplan { struct ggml_cplan {
@ -19,6 +21,8 @@ extern "C" {
// abort ggml_graph_compute when true // abort ggml_graph_compute when true
ggml_abort_callback abort_callback; ggml_abort_callback abort_callback;
void * abort_callback_data; void * abort_callback_data;
uint8_t work_data_inline[GGML_CPLAN_INLINE_SIZE];
}; };
// numa strategies // numa strategies

View File

@ -2911,7 +2911,16 @@ struct ggml_cplan ggml_graph_plan(
cplan.threadpool = threadpool; cplan.threadpool = threadpool;
cplan.n_threads = MIN(max_tasks, n_threads); cplan.n_threads = MIN(max_tasks, n_threads);
cplan.work_size = work_size; cplan.work_size = work_size;
cplan.work_data = NULL;
if (work_size > 0) {
if (work_size <= GGML_CPLAN_INLINE_SIZE) {
cplan.work_data = cplan.work_data_inline;
} else {
cplan.work_data = NULL;
}
} else {
cplan.work_data = NULL;
}
return cplan; return cplan;
} }
@ -3258,7 +3267,19 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) { enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads, NULL); struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads, NULL);
cplan.work_data = (uint8_t *)ggml_new_buffer(ctx, cplan.work_size); if (cplan.work_size > 0) {
if (cplan.work_data == NULL) {
// work_data is not inlined, so we need to allocate it
if (cplan.work_size > GGML_MAX_STACK_SIZE) {
cplan.work_data = (uint8_t *)ggml_new_buffer(ctx, cplan.work_size);
if (cplan.work_data == NULL) {
return GGML_STATUS_ALLOC_FAILED;
}
} else {
cplan.work_data = (uint8_t *)alloca(cplan.work_size);
}
}
}
return ggml_graph_compute(cgraph, &cplan); return ggml_graph_compute(cgraph, &cplan);
} }

View File

@ -133,7 +133,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
cpu_plan->cgraph = *cgraph; // FIXME: deep copy cpu_plan->cgraph = *cgraph; // FIXME: deep copy
if (cpu_plan->cplan.work_size > 0) { if (cpu_plan->cplan.work_size > 0 && cpu_plan->cplan.work_data == NULL) {
cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size]; cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
if (cpu_plan->cplan.work_data == NULL) { if (cpu_plan->cplan.work_data == NULL) {
delete cpu_plan; delete cpu_plan;
@ -150,7 +150,9 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan; struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
delete[] cpu_plan->cplan.work_data; if (cpu_plan->cplan.work_data != cpu_plan->cplan.work_data_inline) {
delete[] cpu_plan->cplan.work_data;
}
delete cpu_plan; delete cpu_plan;
GGML_UNUSED(backend); GGML_UNUSED(backend);
@ -169,20 +171,26 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
if (cpu_ctx->work_size < cplan.work_size) {
delete[] cpu_ctx->work_data;
cpu_ctx->work_data = new uint8_t[cplan.work_size];
if (cpu_ctx->work_data == NULL) {
cpu_ctx->work_size = 0;
return GGML_STATUS_ALLOC_FAILED;
}
cpu_ctx->work_size = cplan.work_size;
}
cplan.work_data = (uint8_t *)cpu_ctx->work_data;
cplan.abort_callback = cpu_ctx->abort_callback; cplan.abort_callback = cpu_ctx->abort_callback;
cplan.abort_callback_data = cpu_ctx->abort_callback_data; cplan.abort_callback_data = cpu_ctx->abort_callback_data;
if (cplan.work_size > 0 && cplan.work_data == NULL) {
if (cplan.work_size > GGML_MAX_STACK_SIZE) {
if (cpu_ctx->work_size < cplan.work_size) {
delete[] cpu_ctx->work_data;
cpu_ctx->work_data = new uint8_t[cplan.work_size];
if (cpu_ctx->work_data == NULL) {
cpu_ctx->work_size = 0;
return GGML_STATUS_ALLOC_FAILED;
}
cpu_ctx->work_size = cplan.work_size;
}
cplan.work_data = (uint8_t *)cpu_ctx->work_data;
} else {
cplan.work_data = (uint8_t *)alloca(cplan.work_size);
}
}
return ggml_graph_compute(cgraph, &cplan); return ggml_graph_compute(cgraph, &cplan);
} }

View File

@ -42,6 +42,8 @@ void ggml_print_backtrace(void);
# define MAX(a, b) ((a) > (b) ? (a) : (b)) # define MAX(a, b) ((a) > (b) ? (a) : (b))
#endif #endif
// Largest work buffer (in bytes) that callers will place on the stack via
// alloca(); anything bigger is heap-allocated instead.
// Parenthesized so the macro expands safely inside larger expressions
// (e.g. `x % GGML_MAX_STACK_SIZE` would mis-associate without them).
#define GGML_MAX_STACK_SIZE (2*1024*1024)
// required for mmap as gguf only guarantees 32-byte alignment // required for mmap as gguf only guarantees 32-byte alignment
#define TENSOR_ALIGNMENT 32 #define TENSOR_ALIGNMENT 32

View File

@ -47,8 +47,11 @@ static void test_barrier(int n_threads, int n_rounds) {
// The test runs with constant number of threads // The test runs with constant number of threads
struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads, threadpool); struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads, threadpool);
std::vector<uint8_t> work_data(cplan.work_size); std::vector<uint8_t> work_data;
cplan.work_data = work_data.data(); if (cplan.work_size > 0 && cplan.work_data == NULL) {
work_data.resize(cplan.work_size);
cplan.work_data = work_data.data();
}
std::cerr << "graph-compute with" std::cerr << "graph-compute with"
<< "\n n_threads: " << n_threads << "\n n_threads: " << n_threads
@ -125,8 +128,11 @@ static void test_active(int n_threads, int n_rounds) {
for (int i=0; i < n_rounds; i++) { for (int i=0; i < n_rounds; i++) {
struct ggml_cplan cplan = ggml_graph_plan(gf, (i % 4) == 0 ? 1 : n_threads, threadpool); struct ggml_cplan cplan = ggml_graph_plan(gf, (i % 4) == 0 ? 1 : n_threads, threadpool);
std::vector<uint8_t> work_data(cplan.work_size); std::vector<uint8_t> work_data;
cplan.work_data = work_data.data(); if (cplan.work_size > 0 && cplan.work_data == NULL) {
work_data.resize(cplan.work_size);
cplan.work_data = work_data.data();
}
ggml_graph_compute(gf, &cplan); ggml_graph_compute(gf, &cplan);
} }
@ -197,12 +203,18 @@ static void test_multi_graph(int n_threads, int n_rounds) {
for (int i=0; i < n_rounds; i++) { for (int i=0; i < n_rounds; i++) {
struct ggml_cplan cplan0 = ggml_graph_plan(gf0, (i % 4) == 0 ? 1 : n_threads, threadpool); struct ggml_cplan cplan0 = ggml_graph_plan(gf0, (i % 4) == 0 ? 1 : n_threads, threadpool);
std::vector<uint8_t> work_data0(cplan0.work_size); std::vector<uint8_t> work_data0;
cplan0.work_data = work_data0.data(); if (cplan0.work_size > 0 && cplan0.work_data == NULL) {
work_data0.resize(cplan0.work_size);
cplan0.work_data = work_data0.data();
}
struct ggml_cplan cplan1 = ggml_graph_plan(gf1, (i % 4) == 0 ? 1 : n_threads, threadpool); struct ggml_cplan cplan1 = ggml_graph_plan(gf1, (i % 4) == 0 ? 1 : n_threads, threadpool);
std::vector<uint8_t> work_data1(cplan1.work_size); std::vector<uint8_t> work_data1;
cplan1.work_data = work_data1.data(); if (cplan1.work_size > 0 && cplan1.work_data == NULL) {
work_data1.resize(cplan1.work_size);
cplan1.work_data = work_data1.data();
}
ggml_graph_compute(gf0, &cplan0); ggml_graph_compute(gf0, &cplan0);
ggml_graph_compute(gf1, &cplan1); ggml_graph_compute(gf1, &cplan1);

View File

@ -116,7 +116,7 @@ static struct ggml_tensor * get_random_tensor_f32(
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) { static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr); struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
if (plan.work_size > 0) { if (plan.work_size > 0 && plan.work_data == NULL) {
buf.resize(plan.work_size); buf.resize(plan.work_size);
plan.work_data = buf.data(); plan.work_data = buf.data();
} }