ggml: migrate work_data to stack allocation
This commit is contained in:
parent d6742125c3
commit 8fa720606a

@@ -7,6 +7,8 @@
 extern "C" {
 #endif
 
+#define GGML_CPLAN_INLINE_SIZE 256 * 1024
+
     // the compute plan that needs to be prepared for ggml_graph_compute()
     // since https://github.com/ggml-org/ggml/issues/287
     struct ggml_cplan {

@@ -19,6 +21,8 @@ extern "C" {
         // abort ggml_graph_compute when true
         ggml_abort_callback abort_callback;
         void * abort_callback_data;
 
+        uint8_t work_data_inline[GGML_CPLAN_INLINE_SIZE];
+
     };
 
     // numa strategies

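Taken together, the two hunks above give the compute plan an embedded scratch buffer. The following sketch shows the resulting struct: the field set is taken from the hunks and from how ggml_graph_plan() fills the plan, but the ordering, the comments, and the stand-in typedefs are approximations rather than the authoritative header.

#include <cstddef>
#include <cstdint>

// Stand-ins for the real ggml declarations (assumed, for a self-contained sketch).
typedef bool (*ggml_abort_callback)(void * data);
struct ggml_threadpool;

#define GGML_CPLAN_INLINE_SIZE 256 * 1024

struct ggml_cplan {
    size_t    work_size; // size of the work buffer, computed by ggml_graph_plan()
    uint8_t * work_data; // points at work_data_inline, a caller-provided buffer, or NULL

    int n_threads;
    struct ggml_threadpool * threadpool;

    // abort ggml_graph_compute when true
    ggml_abort_callback abort_callback;
    void * abort_callback_data;

    // new: plans whose work buffer fits here need no allocation at all
    uint8_t work_data_inline[GGML_CPLAN_INLINE_SIZE];
};

Because ggml_graph_plan() returns the cplan by value, the inline buffer lives in the caller's frame, which is what the commit title means by stack allocation; the trade-off is that every ggml_cplan is now at least GGML_CPLAN_INLINE_SIZE bytes.
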
@@ -2911,7 +2911,16 @@ struct ggml_cplan ggml_graph_plan(
     cplan.threadpool = threadpool;
     cplan.n_threads = MIN(max_tasks, n_threads);
     cplan.work_size = work_size;
-    cplan.work_data = NULL;
+    if (work_size > 0) {
+        if (work_size <= GGML_CPLAN_INLINE_SIZE) {
+            cplan.work_data = cplan.work_data_inline;
+        } else {
+            cplan.work_data = NULL;
+        }
+    } else {
+        cplan.work_data = NULL;
+    }
 
     return cplan;
 }

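For callers, the practical consequence of the hunk above is that work_data may already be set when ggml_graph_plan() returns. A minimal usage sketch (assuming the ggml declarations touched by this commit are in scope; the function name is illustrative) that mirrors the pattern the test changes further down switch to: only allocate when the plan reports a non-zero work_size and has not already bound the inline buffer.

#include <cstdint>
#include <vector>

// Assumes struct ggml_cplan, ggml_graph_plan() and ggml_graph_compute() are visible
// via the ggml headers; compute_graph is a hypothetical caller, not part of the patch.
static enum ggml_status compute_graph(struct ggml_cgraph * gf, int n_threads) {
    struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads, /*threadpool=*/NULL);

    // Backing storage is only needed when the plan could not use work_data_inline.
    std::vector<uint8_t> work_data;
    if (cplan.work_size > 0 && cplan.work_data == NULL) {
        work_data.resize(cplan.work_size);
        cplan.work_data = work_data.data();
    }

    return ggml_graph_compute(gf, &cplan);
}
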
@@ -3258,7 +3267,19 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
 enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
     struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads, NULL);
 
-    cplan.work_data = (uint8_t *)ggml_new_buffer(ctx, cplan.work_size);
+    if (cplan.work_size > 0) {
+        if (cplan.work_data == NULL) {
+            // work_data is not inlined, so we need to allocate it
+            if (cplan.work_size > GGML_MAX_STACK_SIZE) {
+                cplan.work_data = (uint8_t *)ggml_new_buffer(ctx, cplan.work_size);
+                if (cplan.work_data == NULL) {
+                    return GGML_STATUS_ALLOC_FAILED;
+                }
+            } else {
+                cplan.work_data = (uint8_t *)alloca(cplan.work_size);
+            }
+        }
+    }
 
     return ggml_graph_compute(cgraph, &cplan);
 }

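The hunk above encodes a three-tier strategy: buffers up to GGML_CPLAN_INLINE_SIZE live inside the cplan, buffers up to GGML_MAX_STACK_SIZE are placed on the stack with alloca(), and anything larger comes from the ggml context. Since an alloca() buffer is only valid until the allocating function returns, the call has to sit in the same frame that invokes ggml_graph_compute(). A condensed restatement of that dispatch follows; the function name is hypothetical and the <alloca.h> include assumes a POSIX toolchain.

#include <alloca.h>
#include <cstdint>

// Hypothetical restatement of the dispatch above, assuming the ggml headers are in scope.
static enum ggml_status compute_with_ctx_sketch(struct ggml_context * ctx, struct ggml_cgraph * gf, int n_threads) {
    struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads, NULL);

    if (cplan.work_size > 0 && cplan.work_data == NULL) {   // not covered by work_data_inline
        if (cplan.work_size > GGML_MAX_STACK_SIZE) {
            // too large for the stack: fall back to the context allocator
            cplan.work_data = (uint8_t *) ggml_new_buffer(ctx, cplan.work_size);
            if (cplan.work_data == NULL) {
                return GGML_STATUS_ALLOC_FAILED;
            }
        } else {
            // stack allocation: must stay live in this frame until ggml_graph_compute() returns
            cplan.work_data = (uint8_t *) alloca(cplan.work_size);
        }
    }

    return ggml_graph_compute(gf, &cplan);
}
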
@@ -133,7 +133,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
     cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
     cpu_plan->cgraph = *cgraph; // FIXME: deep copy
 
-    if (cpu_plan->cplan.work_size > 0) {
+    if (cpu_plan->cplan.work_size > 0 && cpu_plan->cplan.work_data == NULL) {
         cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
         if (cpu_plan->cplan.work_data == NULL) {
             delete cpu_plan;

@@ -150,7 +150,9 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
 static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
 
-    delete[] cpu_plan->cplan.work_data;
+    if (cpu_plan->cplan.work_data != cpu_plan->cplan.work_data_inline) {
+        delete[] cpu_plan->cplan.work_data;
+    }
     delete cpu_plan;
 
     GGML_UNUSED(backend);

@@ -169,20 +171,26 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s
 
     struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
 
-    if (cpu_ctx->work_size < cplan.work_size) {
-        delete[] cpu_ctx->work_data;
-        cpu_ctx->work_data = new uint8_t[cplan.work_size];
-        if (cpu_ctx->work_data == NULL) {
-            cpu_ctx->work_size = 0;
-            return GGML_STATUS_ALLOC_FAILED;
-        }
-        cpu_ctx->work_size = cplan.work_size;
-    }
-    cplan.work_data = (uint8_t *)cpu_ctx->work_data;
-
     cplan.abort_callback = cpu_ctx->abort_callback;
     cplan.abort_callback_data = cpu_ctx->abort_callback_data;
 
+    if (cplan.work_size > 0 && cplan.work_data == NULL) {
+        if (cplan.work_size > GGML_MAX_STACK_SIZE) {
+            if (cpu_ctx->work_size < cplan.work_size) {
+                delete[] cpu_ctx->work_data;
+                cpu_ctx->work_data = new uint8_t[cplan.work_size];
+                if (cpu_ctx->work_data == NULL) {
+                    cpu_ctx->work_size = 0;
+                    return GGML_STATUS_ALLOC_FAILED;
+                }
+                cpu_ctx->work_size = cplan.work_size;
+            }
+            cplan.work_data = (uint8_t *)cpu_ctx->work_data;
+        } else {
+            cplan.work_data = (uint8_t *)alloca(cplan.work_size);
+        }
+    }
 
     return ggml_graph_compute(cgraph, &cplan);
 }

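With the inline buffer in play, the CPU backend now has to distinguish three possible values of work_data when tearing a plan down: the inline buffer (owned by the cplan itself), a heap buffer it new[]'d in ggml_backend_cpu_graph_plan_create(), or NULL. A small sketch of the ownership rule the free and compute hunks above rely on; the helper name and factoring are hypothetical, not part of the patch.

#include <cstdint>

// Hypothetical helper: only buffers the backend heap-allocated may be delete[]'d.
static void cpu_plan_release_work_data(struct ggml_cplan * cplan) {
    if (cplan->work_data != NULL && cplan->work_data != cplan->work_data_inline) {
        delete[] cplan->work_data;   // allocated with new uint8_t[work_size] at plan creation
    }
    cplan->work_data = NULL;
}
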
@@ -42,6 +42,8 @@ void ggml_print_backtrace(void);
 # define MAX(a, b) ((a) > (b) ? (a) : (b))
 #endif
 
+#define GGML_MAX_STACK_SIZE 2*1024*1024
+
 // required for mmap as gguf only guarantees 32-byte alignment
 #define TENSOR_ALIGNMENT 32

@@ -47,8 +47,11 @@ static void test_barrier(int n_threads, int n_rounds) {
     // The test runs with constant number of threads
     struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads, threadpool);
 
-    std::vector<uint8_t> work_data(cplan.work_size);
-    cplan.work_data = work_data.data();
+    std::vector<uint8_t> work_data;
+    if (cplan.work_size > 0 && cplan.work_data == NULL) {
+        work_data.resize(cplan.work_size);
+        cplan.work_data = work_data.data();
+    }
 
     std::cerr << "graph-compute with"
               << "\n n_threads: " << n_threads

@@ -125,8 +128,11 @@ static void test_active(int n_threads, int n_rounds) {
     for (int i=0; i < n_rounds; i++) {
         struct ggml_cplan cplan = ggml_graph_plan(gf, (i % 4) == 0 ? 1 : n_threads, threadpool);
 
-        std::vector<uint8_t> work_data(cplan.work_size);
-        cplan.work_data = work_data.data();
+        std::vector<uint8_t> work_data;
+        if (cplan.work_size > 0 && cplan.work_data == NULL) {
+            work_data.resize(cplan.work_size);
+            cplan.work_data = work_data.data();
+        }
 
         ggml_graph_compute(gf, &cplan);
     }

@@ -197,12 +203,18 @@ static void test_multi_graph(int n_threads, int n_rounds) {
 
     for (int i=0; i < n_rounds; i++) {
         struct ggml_cplan cplan0 = ggml_graph_plan(gf0, (i % 4) == 0 ? 1 : n_threads, threadpool);
-        std::vector<uint8_t> work_data0(cplan0.work_size);
-        cplan0.work_data = work_data0.data();
+        std::vector<uint8_t> work_data0;
+        if (cplan0.work_size > 0 && cplan0.work_data == NULL) {
+            work_data0.resize(cplan0.work_size);
+            cplan0.work_data = work_data0.data();
+        }
 
         struct ggml_cplan cplan1 = ggml_graph_plan(gf1, (i % 4) == 0 ? 1 : n_threads, threadpool);
-        std::vector<uint8_t> work_data1(cplan1.work_size);
-        cplan1.work_data = work_data1.data();
+        std::vector<uint8_t> work_data1;
+        if (cplan1.work_size > 0 && cplan1.work_data == NULL) {
+            work_data1.resize(cplan1.work_size);
+            cplan1.work_data = work_data1.data();
+        }
 
         ggml_graph_compute(gf0, &cplan0);
         ggml_graph_compute(gf1, &cplan1);

@@ -116,7 +116,7 @@ static struct ggml_tensor * get_random_tensor_f32(
 static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
     struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
 
-    if (plan.work_size > 0) {
+    if (plan.work_size > 0 && plan.work_data == NULL) {
         buf.resize(plan.work_size);
         plan.work_data = buf.data();
     }