From f6823746134ae4f89aecff968150a2986693cd0a Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Thu, 11 Dec 2025 20:51:02 +0800 Subject: [PATCH 01/15] ggml-blas: initial mmid impl Signed-off-by: Aaron Teo --- ggml/src/ggml-blas/ggml-blas.cpp | 154 +++++++++++++++++++++++++++++++ 1 file changed, 154 insertions(+) diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp index 5b888cdd8c..1685e91c3c 100644 --- a/ggml/src/ggml-blas/ggml-blas.cpp +++ b/ggml/src/ggml-blas/ggml-blas.cpp @@ -149,6 +149,143 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg } } +static void ggml_backend_blas_mul_mat_id(ggml_backend_blas_context * ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; // weights + const ggml_tensor * src1 = dst->src[1]; // inputs + const ggml_tensor * src2 = dst->src[2]; // ids + + GGML_TENSOR_TERNARY_OP_LOCALS + + const ggml_type type = src0->type; + + GGML_ASSERT(ne10 == ne00); + GGML_ASSERT(ne21 == ne12); + GGML_ASSERT(ne22 == 1 || ne22 == ne13); + GGML_ASSERT(src2->type == GGML_TYPE_I32); + + GGML_ASSERT(nb00 == ggml_type_size(type)); + GGML_ASSERT(nb10 == ggml_type_size(src1->type)); + + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1 && nb1 <= nb2 && nb2 <= nb3); + + const int64_t n_used = (int64_t)ne20; + GGML_ASSERT(n_used <= ne02); + + const int64_t ne_plane = ne01 * ne00; + const size_t desired_wsize = (type == GGML_TYPE_F32) ? 0 : ne03 * ne02 * ne_plane * sizeof(float); + if (ctx->work_size < desired_wsize) { + ctx->work_data.reset(new char[desired_wsize]); + ctx->work_size = desired_wsize; + } + void * wdata = ctx->work_data.get(); + + if (type != GGML_TYPE_F32) { + const auto * type_traits = ggml_get_type_traits(type); + ggml_to_float_t to_float = type_traits->to_float; + + for (int64_t i03 = 0; i03 < ne03; ++i03) { + for (int64_t i02 = 0; i02 < ne02; ++i02) { + const void * x = (char *)src0->data + i02*nb02 + i03*nb03; + float * wplane = (float *)wdata + i02*ne_plane + i03*ne02*ne_plane; + + const int min_cols_per_thread = 4096; + const int min_rows_per_thread = std::max((int)(min_cols_per_thread / ne00), 1); + const int n_threads = std::max(std::min(ctx->n_threads, (int)(ne01 / min_rows_per_thread)), 1); + +#ifdef GGML_USE_OPENMP + #pragma omp parallel for num_threads(n_threads) + for (int64_t i01 = 0; i01 < ne01; ++i01) { + to_float((const char *)x + i01*nb01, wplane + i01*ne00, ne00); + } +#else + for (int i = 1; i < n_threads; i++) { + const int64_t start = i * ne01/n_threads; + const int64_t end = (i + 1) * ne01/n_threads; + if (start < end) { + ctx->tasks.push_back(std::async(std::launch::async, [=]() { + for (int64_t i01 = start; i01 < end; ++i01) { + to_float((const char *)x + i01*nb01, wplane + i01*ne00, ne00); + } + })); + } + } + { + const int64_t start = 0; + const int64_t end = ne01/n_threads; + for (int64_t i01 = start; i01 < end; i01++) { + to_float((const char *)x + i01*nb01, wplane + i01*ne00, ne00); + } + } +#endif + } + } + +#ifndef GGML_USE_OPENMP + for (auto & task: ctx->tasks) { + task.get(); + } + ctx->tasks.clear(); +#endif + } + +#ifdef OPENBLAS_VERSION + openblas_set_num_threads(ctx->n_threads); +#endif + +#ifdef GGML_BLAS_USE_BLIS + bli_thread_set_num_threads(ctx->n_threads); +#endif + +#ifdef GGML_BLAS_USE_NVPL + nvpl_blas_set_num_threads(ctx->n_threads); +#endif + + for (int64_t i13 = 0; i13 < ne13; ++i13) { + for (int64_t j = 0; j < ne12; ++j) { + const int64_t ids_batch_index = (ne22 > 1 ? 
i13 : 0); + const int32_t * ids_row = (const int32_t *)((char *)src2->data + ids_batch_index*nb22 + j*nb21); + float * out_ptr = (float *)((char *)dst->data + i13*nb3 + j*nb2); + + for (int iE = 0; iE < n_used; ++iE) { + const int expert_id = ids_row[iE]; + GGML_ASSERT(expert_id < ne02); + + const float * wmat; + if (type == GGML_TYPE_F32) { + wmat = (const float *)((char *)src0->data + expert_id*nb02); + } else { + wmat = (const float *)((char *)wdata + expert_id * ne_plane * sizeof(float)); + } + + if (ne03 > 1) { + int64_t w_batch_index = (ne03 == ne13 ? i13 : 0); + wmat = (const float *)((char *)wdata + (w_batch_index * ne02 + expert_id) * ne_plane * sizeof(float)); + } + + const float * inp = (const float *)((char *)src1->data + + ((ne11 == 1 ? 0 : iE) * nb11) + + j * nb12 + i13 * nb13); + + if (iE == 0) { + cblas_sgemv(CblasRowMajor, CblasNoTrans, (int)ne01, (int)ne00, + 1.0f, wmat, (int)ne00, + inp, 1, + 0.0f, + out_ptr, 1); + } else { + cblas_sgemv(CblasRowMajor, CblasNoTrans, (int)ne01, (int)ne00, + 1.0f, wmat, (int)ne00, + inp, 1, + 1.0f, + out_ptr, 1); + } + } + } + } +} + static void ggml_backend_blas_out_prod(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; @@ -235,6 +372,10 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, ggml_backend_blas_mul_mat(ctx, node); break; + case GGML_OP_MUL_MAT_ID: + ggml_backend_blas_mul_mat_id(ctx, node); + break; + case GGML_OP_OUT_PROD: ggml_backend_blas_out_prod(ctx, node); break; @@ -418,6 +559,19 @@ static bool ggml_backend_blas_device_supports_op(ggml_backend_dev_t dev, const s (src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL); } + case GGML_OP_MUL_MAT_ID: + { + const struct ggml_tensor * src0 = op->src[0]; + const struct ggml_tensor * src1 = op->src[1]; + const struct ggml_tensor * src2 = op->src[2]; + + // GGML_LOG_INFO("%s: op=GGML_OP_MUL_MAT_ID src0_type=%s src1_type=%s src2_type=%s ne0=%lld ne1=%lld ne2=%lld ne3=%lld\n", + // __func__, ggml_type_name(src0->type), ggml_type_name(src1->type), ggml_type_name(src2->type), + // op->ne[0], op->ne[1], op->ne[2], op->ne[3]); + + return src2->type == GGML_TYPE_I32; + } + case GGML_OP_OUT_PROD: return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && From 19c8ec99642792de67e2e59d09bf7dd72d0d6348 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Thu, 11 Dec 2025 21:10:25 +0800 Subject: [PATCH 02/15] ggml-blas: fully working mmid Signed-off-by: Aaron Teo --- ggml/src/ggml-blas/ggml-blas.cpp | 134 +++++++++++++++---------------- 1 file changed, 64 insertions(+), 70 deletions(-) diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp index 1685e91c3c..3565facf73 100644 --- a/ggml/src/ggml-blas/ggml-blas.cpp +++ b/ggml/src/ggml-blas/ggml-blas.cpp @@ -150,72 +150,82 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg } static void ggml_backend_blas_mul_mat_id(ggml_backend_blas_context * ctx, ggml_tensor * dst) { - const ggml_tensor * src0 = dst->src[0]; // weights - const ggml_tensor * src1 = dst->src[1]; // inputs - const ggml_tensor * src2 = dst->src[2]; // ids + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + const struct ggml_tensor * ids = dst->src[2]; - GGML_TENSOR_TERNARY_OP_LOCALS + GGML_TENSOR_BINARY_OP_LOCALS - const ggml_type type = src0->type; - - GGML_ASSERT(ne10 == ne00); - GGML_ASSERT(ne21 
== ne12); - GGML_ASSERT(ne22 == 1 || ne22 == ne13); - GGML_ASSERT(src2->type == GGML_TYPE_I32); + const enum ggml_type type = src0->type; GGML_ASSERT(nb00 == ggml_type_size(type)); GGML_ASSERT(nb10 == ggml_type_size(src1->type)); - GGML_ASSERT(dst->type == GGML_TYPE_F32); GGML_ASSERT(nb0 == sizeof(float)); - GGML_ASSERT(nb0 <= nb1 && nb1 <= nb2 && nb2 <= nb3); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); - const int64_t n_used = (int64_t)ne20; - GGML_ASSERT(n_used <= ne02); + GGML_ASSERT(ne03 == 1); + GGML_ASSERT(ne13 == 1); + GGML_ASSERT(ne3 == 1); + + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(ids->type == GGML_TYPE_I32); + + // broadcast factors + const int64_t r2 = ne12/ne02; + const int64_t r3 = ne13/ne03; + + GGML_UNUSED(r2); + GGML_UNUSED(r3); + + const int64_t ne_plane = ne01*ne00; + const size_t desired_wsize = type == GGML_TYPE_F32 ? 0 : ne03*ne02*ne_plane*sizeof(float); - const int64_t ne_plane = ne01 * ne00; - const size_t desired_wsize = (type == GGML_TYPE_F32) ? 0 : ne03 * ne02 * ne_plane * sizeof(float); if (ctx->work_size < desired_wsize) { ctx->work_data.reset(new char[desired_wsize]); ctx->work_size = desired_wsize; } void * wdata = ctx->work_data.get(); + // convert src0 to float if (type != GGML_TYPE_F32) { const auto * type_traits = ggml_get_type_traits(type); - ggml_to_float_t to_float = type_traits->to_float; + ggml_to_float_t const to_float = type_traits->to_float; - for (int64_t i03 = 0; i03 < ne03; ++i03) { - for (int64_t i02 = 0; i02 < ne02; ++i02) { - const void * x = (char *)src0->data + i02*nb02 + i03*nb03; - float * wplane = (float *)wdata + i02*ne_plane + i03*ne02*ne_plane; + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + const void * x = (char *) src0->data + i02*nb02 + i03*nb03; + float * const wplane = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane; const int min_cols_per_thread = 4096; - const int min_rows_per_thread = std::max((int)(min_cols_per_thread / ne00), 1); - const int n_threads = std::max(std::min(ctx->n_threads, (int)(ne01 / min_rows_per_thread)), 1); + const int min_rows_per_thread = std::max((int)(min_cols_per_thread/ne00), 1); + const int n_threads = std::max(std::min(ctx->n_threads, (int)(ne01/min_rows_per_thread)), 1); #ifdef GGML_USE_OPENMP #pragma omp parallel for num_threads(n_threads) - for (int64_t i01 = 0; i01 < ne01; ++i01) { - to_float((const char *)x + i01*nb01, wplane + i01*ne00, ne00); + for (int64_t i01 = 0; i01 < ne01; i01++) { + to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); } #else for (int i = 1; i < n_threads; i++) { - const int64_t start = i * ne01/n_threads; - const int64_t end = (i + 1) * ne01/n_threads; + const int64_t start = i*ne01/n_threads; + const int64_t end = (i + 1)*ne01/n_threads; if (start < end) { ctx->tasks.push_back(std::async(std::launch::async, [=]() { - for (int64_t i01 = start; i01 < end; ++i01) { - to_float((const char *)x + i01*nb01, wplane + i01*ne00, ne00); + for (int64_t i01 = start; i01 < end; i01++) { + to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); } })); } } { + // reuse the current thread for the first task const int64_t start = 0; - const int64_t end = ne01/n_threads; + const int64_t end = ne01/n_threads; for (int64_t i01 = start; i01 < end; i01++) { - to_float((const char *)x + i01*nb01, wplane + i01*ne00, ne00); + to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); } } #endif @@ -223,65 +233,49 @@ static void 
ggml_backend_blas_mul_mat_id(ggml_backend_blas_context * ctx, ggml_t } #ifndef GGML_USE_OPENMP - for (auto & task: ctx->tasks) { + // wait for all tasks to finish + for (auto & task : ctx->tasks) { task.get(); } ctx->tasks.clear(); #endif } -#ifdef OPENBLAS_VERSION +#if defined(OPENBLAS_VERSION) openblas_set_num_threads(ctx->n_threads); #endif -#ifdef GGML_BLAS_USE_BLIS +#if defined(GGML_BLAS_USE_BLIS) bli_thread_set_num_threads(ctx->n_threads); #endif -#ifdef GGML_BLAS_USE_NVPL +#if defined(GGML_BLAS_USE_NVPL) nvpl_blas_set_num_threads(ctx->n_threads); #endif - for (int64_t i13 = 0; i13 < ne13; ++i13) { - for (int64_t j = 0; j < ne12; ++j) { - const int64_t ids_batch_index = (ne22 > 1 ? i13 : 0); - const int32_t * ids_row = (const int32_t *)((char *)src2->data + ids_batch_index*nb22 + j*nb21); - float * out_ptr = (float *)((char *)dst->data + i13*nb3 + j*nb2); + const int n_ids = ids->ne[0]; + const int n_tokens = ids->ne[1]; - for (int iE = 0; iE < n_used; ++iE) { - const int expert_id = ids_row[iE]; - GGML_ASSERT(expert_id < ne02); + for (int t = 0; t < n_tokens; ++t) { + for (int e = 0; e < n_ids; ++e) { + const int32_t expert = *(const int32_t *) ((const char *) ids->data + e*ids->nb[0] + t*ids->nb[1]); + GGML_ASSERT(expert >= 0 && expert < ne02); - const float * wmat; - if (type == GGML_TYPE_F32) { - wmat = (const float *)((char *)src0->data + expert_id*nb02); - } else { - wmat = (const float *)((char *)wdata + expert_id * ne_plane * sizeof(float)); - } + const int e_src1 = e % ne11; - if (ne03 > 1) { - int64_t w_batch_index = (ne03 == ne13 ? i13 : 0); - wmat = (const float *)((char *)wdata + (w_batch_index * ne02 + expert_id) * ne_plane * sizeof(float)); - } + const float * a = (float *) ((char *) src0->data + expert*nb02); + const float * b = (float *) ((char *) src1->data + e_src1*nb11 + t*nb12); + float * d = (float *) ((char *) dst->data + e*nb1 + t*nb2); - const float * inp = (const float *)((char *)src1->data - + ((ne11 == 1 ? 
0 : iE) * nb11) - + j * nb12 + i13 * nb13); - - if (iE == 0) { - cblas_sgemv(CblasRowMajor, CblasNoTrans, (int)ne01, (int)ne00, - 1.0f, wmat, (int)ne00, - inp, 1, - 0.0f, - out_ptr, 1); - } else { - cblas_sgemv(CblasRowMajor, CblasNoTrans, (int)ne01, (int)ne00, - 1.0f, wmat, (int)ne00, - inp, 1, - 1.0f, - out_ptr, 1); - } + if (type != GGML_TYPE_F32) { + a = (float *) wdata + expert*ne_plane; } + + cblas_sgemv(CblasRowMajor, CblasNoTrans, + ne01, ne00, + 1.0f, a, ne00, + b, 1, + 0.0f, d, 1); } } } From 1926e07e1a02f718f2100d05c7be9bca0d333cfe Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Thu, 11 Dec 2025 21:27:13 +0800 Subject: [PATCH 03/15] ggml-blas: code clean up Signed-off-by: Aaron Teo --- ggml/src/ggml-blas/ggml-blas.cpp | 63 ++++++++++++++++---------------- 1 file changed, 31 insertions(+), 32 deletions(-) diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp index 3565facf73..fecf5fc702 100644 --- a/ggml/src/ggml-blas/ggml-blas.cpp +++ b/ggml/src/ggml-blas/ggml-blas.cpp @@ -27,13 +27,16 @@ struct ggml_backend_blas_context { #endif }; -static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; +static void ggml_backend_blas_mul_mat( + ggml_backend_blas_context * ctx, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; GGML_TENSOR_BINARY_OP_LOCALS - const enum ggml_type type = src0->type; + const ggml_type type = src0->type; GGML_ASSERT(ne0 == ne01); GGML_ASSERT(ne1 == ne11); @@ -70,8 +73,8 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { - const void * x = (char *) src0->data + i02*nb02 + i03*nb03; - float * const wplane = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane; + const void * x = (char *) src0->data + i02*nb02 + i03*nb03; + float * const wplane = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane; const int min_cols_per_thread = 4096; const int min_rows_per_thread = std::max((int)(min_cols_per_thread/ne00), 1); @@ -84,8 +87,8 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg } #else for (int i = 1; i < n_threads; i++) { - const int64_t start = i*ne01/n_threads; - const int64_t end = (i + 1)*ne01/n_threads; + const int64_t start = (i + 0) * ne01/n_threads; + const int64_t end = (i + 1) * ne01/n_threads; if (start < end) { ctx->tasks.push_back(std::async(std::launch::async, [=]() { for (int64_t i01 = start; i01 < end; i01++) { @@ -149,14 +152,17 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg } } -static void ggml_backend_blas_mul_mat_id(ggml_backend_blas_context * ctx, ggml_tensor * dst) { - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - const struct ggml_tensor * ids = dst->src[2]; +static void ggml_backend_blas_mul_mat_id( + ggml_backend_blas_context * ctx, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + const ggml_tensor * ids = dst->src[2]; GGML_TENSOR_BINARY_OP_LOCALS - const enum ggml_type type = src0->type; + const ggml_type type = src0->type; GGML_ASSERT(nb00 == ggml_type_size(type)); GGML_ASSERT(nb10 == ggml_type_size(src1->type)); @@ -173,15 +179,10 @@ static void ggml_backend_blas_mul_mat_id(ggml_backend_blas_context * ctx, ggml_t 
GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT(ids->type == GGML_TYPE_I32); - // broadcast factors - const int64_t r2 = ne12/ne02; - const int64_t r3 = ne13/ne03; - - GGML_UNUSED(r2); - GGML_UNUSED(r3); - const int64_t ne_plane = ne01*ne00; - const size_t desired_wsize = type == GGML_TYPE_F32 ? 0 : ne03*ne02*ne_plane*sizeof(float); + const size_t desired_wsize = type == GGML_TYPE_F32 + ? 0 + : ne03*ne02*ne_plane*sizeof(float); if (ctx->work_size < desired_wsize) { ctx->work_data.reset(new char[desired_wsize]); @@ -196,8 +197,8 @@ static void ggml_backend_blas_mul_mat_id(ggml_backend_blas_context * ctx, ggml_t for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { - const void * x = (char *) src0->data + i02*nb02 + i03*nb03; - float * const wplane = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane; + const void * x = (char *) src0->data + i02*nb02 + i03*nb03; + float * const wplane = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane; const int min_cols_per_thread = 4096; const int min_rows_per_thread = std::max((int)(min_cols_per_thread/ne00), 1); @@ -210,7 +211,7 @@ static void ggml_backend_blas_mul_mat_id(ggml_backend_blas_context * ctx, ggml_t } #else for (int i = 1; i < n_threads; i++) { - const int64_t start = i*ne01/n_threads; + const int64_t start = (i + 0)*ne01/n_threads; const int64_t end = (i + 1)*ne01/n_threads; if (start < end) { ctx->tasks.push_back(std::async(std::launch::async, [=]() { @@ -555,15 +556,13 @@ static bool ggml_backend_blas_device_supports_op(ggml_backend_dev_t dev, const s case GGML_OP_MUL_MAT_ID: { - const struct ggml_tensor * src0 = op->src[0]; - const struct ggml_tensor * src1 = op->src[1]; - const struct ggml_tensor * src2 = op->src[2]; + const ggml_tensor * src0 = op->src[0]; + const ggml_tensor * src1 = op->src[1]; - // GGML_LOG_INFO("%s: op=GGML_OP_MUL_MAT_ID src0_type=%s src1_type=%s src2_type=%s ne0=%lld ne1=%lld ne2=%lld ne3=%lld\n", - // __func__, ggml_type_name(src0->type), ggml_type_name(src1->type), ggml_type_name(src2->type), - // op->ne[0], op->ne[1], op->ne[2], op->ne[3]); - - return src2->type == GGML_TYPE_I32; + return ggml_is_contiguous(src0) && + ggml_is_contiguous(src1) && + src1->type == GGML_TYPE_F32 && + (src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL); } case GGML_OP_OUT_PROD: From 61ee32dec38e58066d84957ed2cb79532876115d Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Sun, 14 Dec 2025 18:05:51 +0800 Subject: [PATCH 04/15] tests: set tensor usage as weight for weight tensors only for mul_mat and mul_mat_id ops Signed-off-by: Aaron Teo --- tests/test-backend-ops.cpp | 125 ++++++++++++++++++++++++++++++++++--- 1 file changed, 116 insertions(+), 9 deletions(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 7be1f66038..f876a54b70 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1160,6 +1160,9 @@ struct test_case { std::vector sentinels; + // Track weight tensors for separate buffer allocation with GGML_BACKEND_BUFFER_USAGE_WEIGHTS + std::vector weight_tensors; + std::string current_op_name; void add_sentinel(ggml_context * ctx) { @@ -1238,6 +1241,8 @@ struct test_case { const char * op_names_filter, printer * output_printer) { mode = MODE_TEST; + weight_tensors.clear(); + sentinels.clear(); ggml_init_params params = { /* .mem_size = */ ggml_tensor_overhead()*128 + ggml_graph_overhead(), @@ -1288,10 +1293,35 @@ struct test_case { // post-graph sentinel add_sentinel(ctx); - // allocate + // allocate weight tensors 
in a separate buffer with GGML_BACKEND_BUFFER_USAGE_WEIGHTS + ggml_backend_buffer_t weights_buf = nullptr; + if (!weight_tensors.empty()) { + // Calculate total size needed for weight tensors + size_t weight_size = 0; + for (ggml_tensor * wt : weight_tensors) { + weight_size += ggml_backend_buft_get_alloc_size(ggml_backend_get_default_buffer_type(backend1), wt); + } + weight_size = GGML_PAD(weight_size, ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend1))); + + weights_buf = ggml_backend_buft_alloc_buffer(ggml_backend_get_default_buffer_type(backend1), weight_size); + if (weights_buf == NULL) { + printf("failed to allocate weight tensors [%s] ", ggml_backend_name(backend1)); + ggml_free(ctx); + return test_status_t::FAIL; + } + ggml_backend_buffer_set_usage(weights_buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + + // Allocate each weight tensor in the weights buffer + ggml_tallocr weights_talloc = ggml_tallocr_new(weights_buf); + for (ggml_tensor * wt : weight_tensors) { + ggml_tallocr_alloc(&weights_talloc, wt); + } + } + + // allocate remaining tensors ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend1); - if (buf == NULL) { + if (buf == NULL && weights_buf == NULL) { printf("failed to allocate tensors [%s] ", ggml_backend_name(backend1)); ggml_free(ctx); return test_status_t::FAIL; @@ -1385,6 +1415,9 @@ struct test_case { const bool cmp_ok = ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud, run_whole_graph() ? out : nullptr); + if (weights_buf) { + ggml_backend_buffer_free(weights_buf); + } ggml_backend_buffer_free(buf); ggml_free(ctx); @@ -1404,6 +1437,7 @@ struct test_case { bool eval_perf(ggml_backend_t backend, const char * op_names_filter, printer * output_printer) { mode = MODE_PERF; + weight_tensors.clear(); static const size_t graph_nodes = 8192; @@ -1432,10 +1466,34 @@ struct test_case { return true; } - // allocate + // allocate weight tensors in a separate buffer with GGML_BACKEND_BUFFER_USAGE_WEIGHTS + ggml_backend_buffer_ptr weights_buf(nullptr); // smart ptr + if (!weight_tensors.empty()) { + // Calculate total size needed for weight tensors + size_t weight_size = 0; + for (ggml_tensor * wt : weight_tensors) { + weight_size += ggml_backend_buft_get_alloc_size(ggml_backend_get_default_buffer_type(backend), wt); + } + weight_size = GGML_PAD(weight_size, ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend))); + + weights_buf.reset(ggml_backend_buft_alloc_buffer(ggml_backend_get_default_buffer_type(backend), weight_size)); + if (weights_buf == NULL) { + printf("failed to allocate weight tensors\n"); + return false; + } + ggml_backend_buffer_set_usage(weights_buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + + // Allocate each weight tensor in the weights buffer + ggml_tallocr weights_talloc = ggml_tallocr_new(weights_buf.get()); + for (ggml_tensor * wt : weight_tensors) { + ggml_tallocr_alloc(&weights_talloc, wt); + } + } + + // allocate remaining tensors ggml_backend_buffer_ptr buf(ggml_backend_alloc_ctx_tensors(ctx.get(), backend)); // smart ptr - if (buf == NULL) { + if (buf == NULL && weights_buf == NULL) { printf("failed to allocate tensors\n"); return false; } @@ -1534,6 +1592,7 @@ struct test_case { bool eval_support(ggml_backend_t backend, const char * op_names_filter, printer * output_printer) { mode = MODE_SUPPORT; + weight_tensors.clear(); static const size_t graph_nodes = 8192; @@ -1569,6 +1628,7 @@ struct test_case { bool eval_grad(ggml_backend_t backend, const char * 
op_names_filter, printer * output_printer) { mode = MODE_GRAD; + weight_tensors.clear(); const std::vector expect = grad_expect(); ggml_init_params params = { @@ -1679,9 +1739,35 @@ struct test_case { return true; } - // allocate + // allocate weight tensors in a separate buffer with GGML_BACKEND_BUFFER_USAGE_WEIGHTS + ggml_backend_buffer_ptr weights_buf(nullptr); // smart ptr + if (!weight_tensors.empty()) { + // Calculate total size needed for weight tensors + size_t weight_size = 0; + for (ggml_tensor * wt : weight_tensors) { + weight_size += ggml_backend_buft_get_alloc_size(ggml_backend_get_default_buffer_type(backend), wt); + } + weight_size = GGML_PAD(weight_size, ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend))); + + weights_buf.reset(ggml_backend_buft_alloc_buffer(ggml_backend_get_default_buffer_type(backend), weight_size)); + if (weights_buf == NULL) { + test_operation_info info(op_desc(out), vars(), ggml_backend_name(backend)); + info.set_error("weight allocation", ""); + output_printer->print_operation(info); + return false; + } + ggml_backend_buffer_set_usage(weights_buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + + // Allocate each weight tensor in the weights buffer + ggml_tallocr weights_talloc = ggml_tallocr_new(weights_buf.get()); + for (ggml_tensor * wt : weight_tensors) { + ggml_tallocr_alloc(&weights_talloc, wt); + } + } + + // allocate remaining tensors ggml_backend_buffer_ptr buf(ggml_backend_alloc_ctx_tensors(ctx.get(), backend)); // smart ptr - if (buf == NULL) { + if (buf == NULL && weights_buf == NULL) { test_operation_info info(op_desc(out), vars(), ggml_backend_name(backend)); info.set_error("allocation", ""); output_printer->print_operation(info); @@ -3606,6 +3692,7 @@ struct test_mul_mat : public test_case { a = ggml_new_tensor_4d(ctx, type_a, ne_a[per[0]], ne_a[per[1]], ne_a[per[2]], ne_a[per[3]]); b = ggml_new_tensor_4d(ctx, type_b, ne_b[per[0]], ne_b[per[1]], ne_b[per[2]], ne_b[per[3]]); + weight_tensors.push_back(a); // Track weight tensor for GGML_BACKEND_BUFFER_USAGE_WEIGHTS if (!ggml_is_quantized(type_a)) { if (bs[1] == 1 && nr[1] == 1) { ggml_set_param(a); @@ -3623,6 +3710,7 @@ struct test_mul_mat : public test_case { const int64_t k_physical = k_v == 0 ? 
k : k_v; a = ggml_new_tensor_4d(ctx, type_a, k_physical, m, bs[0], bs[1]); b = ggml_new_tensor_4d(ctx, type_b, k_physical, n, bs[0]*nr[0], bs[1]*nr[1]); + weight_tensors.push_back(a); // Track weight tensor for GGML_BACKEND_BUFFER_USAGE_WEIGHTS if (!ggml_is_quantized(type_a)) { if (bs[1] == 1 && nr[1] == 1) { @@ -3716,6 +3804,7 @@ struct test_mul_mat_id : public test_case { // C^T = A * B^T: (k, m) * (k, n) => (m, n) ggml_tensor * as = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats); ggml_set_name(as, "as"); + weight_tensors.push_back(as); // Track weight tensor for GGML_BACKEND_BUFFER_USAGE_WEIGHTS ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n); ggml_set_name(ids, "ids"); @@ -3776,6 +3865,7 @@ struct test_mul_mat_id_fusion : public test_case { // C^T = A * B^T: (k, m) * (k, n) => (m, n) ggml_tensor * as = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats); ggml_set_name(as, "as"); + weight_tensors.push_back(as); // Track weight tensor for GGML_BACKEND_BUFFER_USAGE_WEIGHTS ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n); ggml_set_name(ids, "ids"); @@ -3792,6 +3882,7 @@ struct test_mul_mat_id_fusion : public test_case { for (uint32_t i = 1; i < o; ++i) { ggml_tensor * a2 = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats); + weight_tensors.push_back(a2); // Track weight tensor for GGML_BACKEND_BUFFER_USAGE_WEIGHTS ggml_tensor * out2 = ggml_mul_mat_id(ctx, a2, b, ids); ggml_set_name(out2, "out2"); out = ggml_add(ctx, out, out2); @@ -7861,9 +7952,24 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 30, 30, 7, 1 }, { 8, 30, 7, 1 })); test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 42, 42, 5, 2 }, { 10, 42, 5, 2 })); test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 2, 2 }, { 10, 64, 2, 2 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 2, 2 }, { 64, 64, 2, 2 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 79, 79, 5, 3 }, { 417, 79, 5, 3 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 2 }, { 32, 128, 4, 2 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 80, 80, 2, 8 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 79, 80, 2, 8 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 81, 80, 2, 8 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 80, 80, 8, 8 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 79, 80, 8, 8 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 81, 80, 8, 8 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 84, 84, 4, 4 }, { 32, 84, 4, 4 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 95, 95, 8, 8 }, { 40, 95, 8, 8 })); test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 100, 100, 4, 4 }, { 41, 100, 4, 4 })); test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 4 }, { 31, 128, 4, 4 })); - test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 300, 64, 4, 4 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 4 }, { 32, 128, 4, 4 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 3, 4 }, { 32, 128, 3, 4 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 1 }, { 32, 128, 4, 1 })); + test_cases.emplace_back(new 
test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 200, 64, 4, 4 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 384, 64, 4, 4 })); for (bool v : {false, true}) { for (bool circular : {false, true}) { @@ -8064,12 +8170,13 @@ static std::vector> make_test_cases_perf() { test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 16416, 1, 128, {8, 1}, {4, 1}, {0, 2, 1, 3})); test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 1, 16416, {8, 1}, {4, 1}, {0, 1, 2, 3}, 2*16416)); - test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 2 }, { 6, 64, 4, 2 })); - test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 1 }, { 8, 128, 4, 1 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 32, 64, 4, 4 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 2 }, { 32, 128, 4, 2 })); // qwen3next with CHUNK_SIZE 64 test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 8, 32 }, { 64, 64, 8, 32 })); // qwen3next with CHUNK_SIZE 128 test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 32 }, { 128, 128, 4, 32 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 256, 256, 4, 2 }, { 128, 256, 4, 2 })); test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_LOWER, GGML_TYPE_F32, { 256, 256, 4, 4 })); test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_UPPER_DIAG, GGML_TYPE_F32, { 1024, 1024, 8, 4 })); From 9a14a094ac1cd514045dbd0bc4a35b04bd443775 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Sun, 14 Dec 2025 18:06:31 +0800 Subject: [PATCH 05/15] ggml: rewrite ggml-blas Signed-off-by: Aaron Teo --- ggml/src/ggml-blas/ggml-blas.cpp | 677 ++++++++++++++----------------- 1 file changed, 310 insertions(+), 367 deletions(-) diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp index fecf5fc702..4280156edd 100644 --- a/ggml/src/ggml-blas/ggml-blas.cpp +++ b/ggml/src/ggml-blas/ggml-blas.cpp @@ -2,9 +2,13 @@ #include "ggml-blas.h" #include "ggml-backend-impl.h" +#include "ggml.h" +#include "ggml-backend.h" + #include #include #include +#include #if defined(GGML_BLAS_USE_ACCELERATE) # include @@ -18,15 +22,234 @@ # include #endif -struct ggml_backend_blas_context { - int n_threads = GGML_DEFAULT_N_THREADS; - std::unique_ptr work_data; - size_t work_size = 0; -#ifndef GGML_USE_OPENMP - std::vector> tasks; -#endif +struct ggml_backend_blas_buffer { + void * data; // dequantized data + size_t size; }; +// BLAS backend - buffer + +static void ggml_backend_blas_buffer_free_buffer(ggml_backend_buffer_t buffer) { + GGML_ASSERT(buffer); + + ggml_backend_blas_buffer * buf_ctx = (ggml_backend_blas_buffer *)buffer->context; + ggml_aligned_free(buf_ctx->data, buf_ctx->size); + ggml_aligned_free(buffer->context, buffer->size); +} + +static void * ggml_backend_blas_buffer_get_base(ggml_backend_buffer_t buffer) { + GGML_ASSERT(buffer); + uintptr_t data = (uintptr_t)buffer->context; + + // align the buffer + if (data % TENSOR_ALIGNMENT != 0) { + data = GGML_PAD(data, TENSOR_ALIGNMENT); + } + + return (void *)data; +} + +static enum ggml_status ggml_backend_blas_buffer_init_tensor( + ggml_backend_buffer_t buffer, + ggml_tensor * tensor) { + + if (tensor->view_src != NULL) { + assert(tensor->view_src->buffer->buft == buffer->buft); + return GGML_STATUS_SUCCESS; + } + + void * ctx = buffer->context; + + if (tensor->type != GGML_TYPE_F32) { + ggml_backend_blas_buffer * extra = new ggml_backend_blas_buffer; 
+ extra->data = ggml_aligned_malloc(ggml_nelements(tensor) * sizeof(float)); // sizeof(float) because dequantized + extra->size = ggml_nelements(tensor) * sizeof(float); + + tensor->extra = extra; + } + + return GGML_STATUS_SUCCESS; + GGML_UNUSED(ctx); +} + +static void ggml_backend_blas_buffer_memset_tensor( + ggml_backend_buffer_t buffer, + ggml_tensor * tensor, + uint8_t value, + size_t offset, + size_t size) { + + GGML_ASSERT(tensor); + memset((char *)tensor->data + offset, value, size); + + GGML_UNUSED(buffer); +} + +static void ggml_backend_blas_buffer_set_tensor( + ggml_backend_buffer_t buffer, + ggml_tensor * tensor, + const void * data, + size_t offset, + size_t size) { + + GGML_ASSERT(tensor); + memcpy((char *)tensor->data + offset, data, size); + + // ggml_backend_blas_buffer_context * buf_ctx = (ggml_backend_blas_buffer_context *)buffer->buft->context; + ggml_backend_blas_buffer * extra = (ggml_backend_blas_buffer *)tensor->extra; + + const int64_t ne00 = tensor->ne[0]; + const int64_t ne01 = tensor->ne[1]; + const int64_t ne02 = tensor->ne[2]; + const int64_t ne03 = tensor->ne[3]; + + const int64_t nb00 = tensor->nb[0]; + const int64_t nb01 = tensor->nb[1]; + const int64_t nb02 = tensor->nb[2]; + const int64_t nb03 = tensor->nb[3]; + + const int64_t ne_plane = ne01*ne00; + + if (tensor->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS + && tensor->type != GGML_TYPE_F32 + && ggml_get_type_traits(tensor->type)->to_float != NULL) { + + const auto * type_traits = ggml_get_type_traits(tensor->type); + ggml_to_float_t const to_float = type_traits->to_float; + GGML_ASSERT(to_float != nullptr); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + const void * x = (char *)tensor->data + i02*nb02 + i03*nb03; + float * const wplane = (float *)extra->data + i02*ne_plane + i03*ne02*ne_plane; + + const int min_cols_per_thread = 4096; + const int min_rows_per_thread = std::max((int)(min_cols_per_thread / ne00), 1); + const int n_threads = std::max(std::min(8, (int)(ne01 / min_rows_per_thread)), 1); + + #pragma omp parallel for num_threads(n_threads) + for (int64_t i01 = 0; i01 < ne01; i01++) { + to_float((const char *)x + i01*nb01, wplane + i01*ne00, ne00); + } + } + } + } + + GGML_UNUSED(buffer); +} + +static void ggml_backend_blas_buffer_get_tensor( + ggml_backend_buffer_t buffer, + const ggml_tensor * tensor, + void * data, + size_t offset, + size_t size) { + + GGML_ASSERT(tensor); + memcpy(data, (const char *)tensor->data + offset, size); + + GGML_UNUSED(buffer); +} + +static void ggml_backend_blas_buffer_clear( + ggml_backend_buffer_t buffer, + uint8_t value) { + + GGML_ASSERT(buffer); + memset(buffer->context, value, buffer->size); +} + +static const ggml_backend_buffer_i ggml_backend_blas_buffer_i = { + /* .free_buffer = */ ggml_backend_blas_buffer_free_buffer, + /* .get_base = */ ggml_backend_blas_buffer_get_base, + /* .init_tensor = */ ggml_backend_blas_buffer_init_tensor, + /* .memset_tensor = */ ggml_backend_blas_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_blas_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_blas_buffer_get_tensor, + /* .cpy_tensor = */ NULL, + /* .clear = */ ggml_backend_blas_buffer_clear, + /* .reset = */ NULL, +}; + +// BLAS backend buffer type + +static const char * ggml_backend_blas_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + return "BLAS"; + GGML_UNUSED(buft); +} + +static ggml_backend_buffer_t ggml_backend_blas_buffer_type_alloc_buffer( + ggml_backend_buffer_type_t buft, + size_t 
size) { + + // TODO: contains dequantized data + void * data = ggml_aligned_malloc(size); + if (data == nullptr) { + GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size); + return NULL; + } + + return ggml_backend_buffer_init(buft, ggml_backend_blas_buffer_i, data, size); +} + +static size_t ggml_backend_blas_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + + return TENSOR_ALIGNMENT; + GGML_UNUSED(buft); +} + +static bool ggml_backend_blas_buffer_type_is_host(ggml_backend_buffer_type_t buft) { + + return true; + GGML_UNUSED(buft); +} + +static ggml_backend_buffer_type_t ggml_backend_blas_buffer_type(void) { + static ggml_backend_buffer_type ggml_backend_blas_buffer_type = { + /* .iface = */ { + /* .get_name = */ ggml_backend_blas_buffer_type_get_name, + /* .alloc_buffer = */ ggml_backend_blas_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_blas_buffer_type_get_alignment, + /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .is_host = */ ggml_backend_blas_buffer_type_is_host, + }, + /* .device = */ NULL, + /* .context = */ NULL, + }; + + return &ggml_backend_blas_buffer_type; +} + +struct ggml_backend_blas_context { + int device; + + int n_threads; + ggml_threadpool_t threadpool; + + uint8_t * work_data; + size_t work_size; + + ggml_abort_callback abort_callback; + void * abort_callback_data; + + // std::unique_ptr work_data; + // size_t work_size = 0; +// #ifndef GGML_USE_OPENMP +// std::vector> tasks; +// #endif +// ggml_cgraph * gf; +}; + +// struct ggml_backend_blas_context { +// int n_threads = GGML_DEFAULT_N_THREADS; +// std::unique_ptr work_data; +// size_t work_size = 0; +// #ifndef GGML_USE_OPENMP +// std::vector> tasks; +// #endif +// }; + static void ggml_backend_blas_mul_mat( ggml_backend_blas_context * ctx, ggml_tensor * dst) { @@ -60,63 +283,7 @@ static void ggml_backend_blas_mul_mat( const int64_t ne_plane = ne01*ne00; const size_t desired_wsize = type == GGML_TYPE_F32 ? 
0 : ne03*ne02*ne_plane*sizeof(float); - if (ctx->work_size < desired_wsize) { - ctx->work_data.reset(new char[desired_wsize]); - ctx->work_size = desired_wsize; - } - void * wdata = ctx->work_data.get(); - - // convert src0 to float - if (type != GGML_TYPE_F32) { - const auto * type_traits = ggml_get_type_traits(type); - ggml_to_float_t const to_float = type_traits->to_float; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - const void * x = (char *) src0->data + i02*nb02 + i03*nb03; - float * const wplane = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane; - - const int min_cols_per_thread = 4096; - const int min_rows_per_thread = std::max((int)(min_cols_per_thread/ne00), 1); - const int n_threads = std::max(std::min(ctx->n_threads, (int)(ne01/min_rows_per_thread)), 1); - -#ifdef GGML_USE_OPENMP - #pragma omp parallel for num_threads(n_threads) - for (int64_t i01 = 0; i01 < ne01; i01++) { - to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); - } -#else - for (int i = 1; i < n_threads; i++) { - const int64_t start = (i + 0) * ne01/n_threads; - const int64_t end = (i + 1) * ne01/n_threads; - if (start < end) { - ctx->tasks.push_back(std::async(std::launch::async, [=]() { - for (int64_t i01 = start; i01 < end; i01++) { - to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); - } - })); - } - } - { - // reuse the current thread for the first task - const int64_t start = 0; - const int64_t end = ne01/n_threads; - for (int64_t i01 = start; i01 < end; i01++) { - to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); - } - } -#endif - } - } - -#ifndef GGML_USE_OPENMP - // wait for all tasks to finish - for (auto & task : ctx->tasks) { - task.get(); - } - ctx->tasks.clear(); -#endif - } + const ggml_backend_blas_buffer * extra = (ggml_backend_blas_buffer *)src0->extra; #if defined(OPENBLAS_VERSION) openblas_set_num_threads(ctx->n_threads); @@ -139,210 +306,20 @@ static void ggml_backend_blas_mul_mat( const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13); float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); + // switch to dequantized F32 data if (type != GGML_TYPE_F32) { - x = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane; + x = (float *)extra->data + i02*ne_plane + i03*ne02*ne_plane; } cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, ne1, ne01, ne10, 1.0f, y, ne10, x, ne00, - 0.0f, d, ne01); + 0.0f, d, nb1/nb0); } } } -static void ggml_backend_blas_mul_mat_id( - ggml_backend_blas_context * ctx, - ggml_tensor * dst) { - - const ggml_tensor * src0 = dst->src[0]; - const ggml_tensor * src1 = dst->src[1]; - const ggml_tensor * ids = dst->src[2]; - - GGML_TENSOR_BINARY_OP_LOCALS - - const ggml_type type = src0->type; - - GGML_ASSERT(nb00 == ggml_type_size(type)); - GGML_ASSERT(nb10 == ggml_type_size(src1->type)); - - GGML_ASSERT(nb0 == sizeof(float)); - GGML_ASSERT(nb0 <= nb1); - GGML_ASSERT(nb1 <= nb2); - GGML_ASSERT(nb2 <= nb3); - - GGML_ASSERT(ne03 == 1); - GGML_ASSERT(ne13 == 1); - GGML_ASSERT(ne3 == 1); - - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT(ids->type == GGML_TYPE_I32); - - const int64_t ne_plane = ne01*ne00; - const size_t desired_wsize = type == GGML_TYPE_F32 - ? 
0 - : ne03*ne02*ne_plane*sizeof(float); - - if (ctx->work_size < desired_wsize) { - ctx->work_data.reset(new char[desired_wsize]); - ctx->work_size = desired_wsize; - } - void * wdata = ctx->work_data.get(); - - // convert src0 to float - if (type != GGML_TYPE_F32) { - const auto * type_traits = ggml_get_type_traits(type); - ggml_to_float_t const to_float = type_traits->to_float; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - const void * x = (char *) src0->data + i02*nb02 + i03*nb03; - float * const wplane = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane; - - const int min_cols_per_thread = 4096; - const int min_rows_per_thread = std::max((int)(min_cols_per_thread/ne00), 1); - const int n_threads = std::max(std::min(ctx->n_threads, (int)(ne01/min_rows_per_thread)), 1); - -#ifdef GGML_USE_OPENMP - #pragma omp parallel for num_threads(n_threads) - for (int64_t i01 = 0; i01 < ne01; i01++) { - to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); - } -#else - for (int i = 1; i < n_threads; i++) { - const int64_t start = (i + 0)*ne01/n_threads; - const int64_t end = (i + 1)*ne01/n_threads; - if (start < end) { - ctx->tasks.push_back(std::async(std::launch::async, [=]() { - for (int64_t i01 = start; i01 < end; i01++) { - to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); - } - })); - } - } - { - // reuse the current thread for the first task - const int64_t start = 0; - const int64_t end = ne01/n_threads; - for (int64_t i01 = start; i01 < end; i01++) { - to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); - } - } -#endif - } - } - -#ifndef GGML_USE_OPENMP - // wait for all tasks to finish - for (auto & task : ctx->tasks) { - task.get(); - } - ctx->tasks.clear(); -#endif - } - -#if defined(OPENBLAS_VERSION) - openblas_set_num_threads(ctx->n_threads); -#endif - -#if defined(GGML_BLAS_USE_BLIS) - bli_thread_set_num_threads(ctx->n_threads); -#endif - -#if defined(GGML_BLAS_USE_NVPL) - nvpl_blas_set_num_threads(ctx->n_threads); -#endif - - const int n_ids = ids->ne[0]; - const int n_tokens = ids->ne[1]; - - for (int t = 0; t < n_tokens; ++t) { - for (int e = 0; e < n_ids; ++e) { - const int32_t expert = *(const int32_t *) ((const char *) ids->data + e*ids->nb[0] + t*ids->nb[1]); - GGML_ASSERT(expert >= 0 && expert < ne02); - - const int e_src1 = e % ne11; - - const float * a = (float *) ((char *) src0->data + expert*nb02); - const float * b = (float *) ((char *) src1->data + e_src1*nb11 + t*nb12); - float * d = (float *) ((char *) dst->data + e*nb1 + t*nb2); - - if (type != GGML_TYPE_F32) { - a = (float *) wdata + expert*ne_plane; - } - - cblas_sgemv(CblasRowMajor, CblasNoTrans, - ne01, ne00, - 1.0f, a, ne00, - b, 1, - 0.0f, d, 1); - } - } -} - -static void ggml_backend_blas_out_prod(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_TENSOR_BINARY_OP_LOCALS - - GGML_ASSERT(ne0 == ne00); - GGML_ASSERT(ne1 == ne10); - GGML_ASSERT(ne2 == ne02); - GGML_ASSERT(ne02 == ne12); - GGML_ASSERT(ne3 == ne13); - GGML_ASSERT(ne03 == ne13); - - // we don't support permuted src0 or src1 - GGML_ASSERT(nb00 == sizeof(float)); - - // dst cannot be transposed or permuted - GGML_ASSERT(nb0 == sizeof(float)); - // GGML_ASSERT(nb0 <= nb1); - // GGML_ASSERT(nb1 <= nb2); - // GGML_ASSERT(nb2 <= nb3); - - // Arguments to ggml_compute_forward_out_prod (expressed as major,minor) - // src0: (k,n) - // src1: (k,m) - // dst: (m,n) 
- // - // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f) - // Also expressed as (major,minor) - // a: (m,k): so src1 transposed - // b: (k,n): so src0 - // c: (m,n) - // - // However, if ggml_is_transposed(src1) is true, then - // src1->data already contains a transposed version, so sgemm mustn't - // transpose it further. - - int n = src0->ne[0]; - int k = src0->ne[1]; - int m = src1->ne[0]; - - CBLAS_TRANSPOSE transposeA; - int lda; - - if (!ggml_is_transposed(src1)) { - transposeA = CblasTrans; - lda = m; - } else { - transposeA = CblasNoTrans; - lda = k; - } - - float * a = (float *) ((char *) src1->data); - float * b = (float *) ((char *) src0->data); - float * c = (float *) ((char *) dst->data); - - cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n); - - GGML_UNUSED(ctx); -} - -// backend interface static const char * ggml_backend_blas_get_name(ggml_backend_t backend) { return "BLAS"; @@ -352,35 +329,30 @@ static const char * ggml_backend_blas_get_name(ggml_backend_t backend) { static void ggml_backend_blas_free(ggml_backend_t backend) { ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; + + delete[] ctx->work_data; delete ctx; delete backend; } -static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +static ggml_status ggml_backend_blas_graph_compute( + ggml_backend_t backend, + ggml_cgraph * cgraph) { + ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; for (int i = 0; i < cgraph->n_nodes; i++) { - struct ggml_tensor * node = cgraph->nodes[i]; + ggml_tensor * node = cgraph->nodes[i]; + + if (ggml_op_is_empty(node->op)) { + continue; + } switch (node->op) { case GGML_OP_MUL_MAT: - ggml_backend_blas_mul_mat(ctx, node); - break; - - case GGML_OP_MUL_MAT_ID: - ggml_backend_blas_mul_mat_id(ctx, node); - break; - - case GGML_OP_OUT_PROD: - ggml_backend_blas_out_prod(ctx, node); - break; - - case GGML_OP_NONE: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - break; + { + ggml_backend_blas_mul_mat(ctx, node); + } break; default: GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node)); @@ -392,21 +364,21 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, GGML_UNUSED(backend); } -static struct ggml_backend_i blas_backend_i = { - /* .get_name = */ ggml_backend_blas_get_name, - /* .free = */ ggml_backend_blas_free, - /* .set_tensor_async = */ NULL, - /* .get_tensor_async = */ NULL, - /* .cpy_tensor_async = */ NULL, - /* .synchronize = */ NULL, - /* .graph_plan_create = */ NULL, - /* .graph_plan_free = */ NULL, - /* .graph_plan_update = */ NULL, - /* .graph_plan_compute = */ NULL, - /* .graph_compute = */ ggml_backend_blas_graph_compute, - /* .event_record = */ NULL, - /* .event_wait = */ NULL, - /* .graph_optimize = */ NULL, +static const ggml_backend_i ggml_backend_blas_i = { + /* .get_name = */ ggml_backend_blas_get_name, + /* .free = */ ggml_backend_blas_free, + /* .set_tensor_async = */ NULL, + /* .get_tensor_async = */ NULL, + /* .cpy_tensor_async = */ NULL, + /* .synchronize = */ NULL, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_blas_graph_compute, + /* .event_record = */ NULL, + /* .event_wait = */ NULL, + /* .graph_optimize = */ NULL, }; static ggml_guid_t 
ggml_backend_blas_guid(void) { @@ -416,39 +388,49 @@ static ggml_guid_t ggml_backend_blas_guid(void) { ggml_backend_t ggml_backend_blas_init(void) { ggml_backend_blas_context * ctx = new ggml_backend_blas_context; + if (ctx == NULL) { + return NULL; + } - ggml_backend_t backend = new ggml_backend { + ctx->n_threads = GGML_DEFAULT_N_THREADS; + ctx->threadpool = NULL; + ctx->work_data = nullptr; + ctx->work_size = 0; + ctx->abort_callback = NULL; + ctx->abort_callback_data = nullptr; + + ggml_backend_t blas_backend = new ggml_backend { /* .guid = */ ggml_backend_blas_guid(), - /* .iface = */ blas_backend_i, + /* .iface = */ ggml_backend_blas_i, /* .device = */ ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0), /* .context = */ ctx, }; -#if defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP) - if (openblas_get_parallel() != OPENBLAS_OPENMP) { - GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__); + if (blas_backend == NULL) { + delete ctx; + return NULL; } -#endif -#if defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP) - GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__); -#endif - - return backend; + return blas_backend; } bool ggml_backend_is_blas(ggml_backend_t backend) { return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_blas_guid()); } -void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads) { - GGML_ASSERT(ggml_backend_is_blas(backend_blas)); +void ggml_backend_blas_set_n_threads(ggml_backend_t backend, int n_threads) { + // TODO: IMPL + GGML_ASSERT(ggml_backend_is_blas(backend)); - ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend_blas->context; + ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; ctx->n_threads = n_threads; } -// device interface +// TODO: maybe implement description? 
+struct ggml_backend_blas_device_context { + int blas_device; + int blas_device_ref_count; +}; static const char * ggml_backend_blas_device_get_name(ggml_backend_dev_t dev) { return "BLAS"; @@ -475,7 +457,6 @@ static const char * ggml_backend_blas_device_get_description(ggml_backend_dev_t } static void ggml_backend_blas_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { - // TODO *free = 0; *total = 0; @@ -488,7 +469,7 @@ static enum ggml_backend_dev_type ggml_backend_blas_device_get_type(ggml_backend GGML_UNUSED(dev); } -static void ggml_backend_blas_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { +static void ggml_backend_blas_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { props->name = ggml_backend_blas_device_get_name(dev); props->description = ggml_backend_blas_device_get_description(dev); props->type = ggml_backend_blas_device_get_type(dev); @@ -496,7 +477,7 @@ static void ggml_backend_blas_device_get_props(ggml_backend_dev_t dev, struct gg props->caps = { /* .async = */ false, /* .host_buffer = */ false, - /* .buffer_from_host_ptr = */ true, + /* .buffer_from_host_ptr = */ false, /* .events = */ false, }; } @@ -509,40 +490,25 @@ static ggml_backend_t ggml_backend_blas_device_init_backend(ggml_backend_dev_t d } static ggml_backend_buffer_type_t ggml_backend_blas_device_get_buffer_type(ggml_backend_dev_t dev) { - return ggml_backend_cpu_buffer_type(); + return ggml_backend_blas_buffer_type(); GGML_UNUSED(dev); } -static ggml_backend_buffer_t ggml_backend_blas_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { - return ggml_backend_cpu_buffer_from_ptr(ptr, size); +static bool ggml_backend_blas_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; - GGML_UNUSED(dev); - GGML_UNUSED(max_tensor_size); -} - -static bool ggml_backend_blas_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { - const struct ggml_tensor * src0 = op->src[0]; - const struct ggml_tensor * src1 = op->src[1]; - - switch (op->op) { - case GGML_OP_NONE: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - return true; + if (ggml_op_is_empty(dst->op)) { + return true; + } + switch (dst->op) { case GGML_OP_MUL_MAT: { - // BLAS usually is only faster for large matrices - const struct ggml_tensor * src0 = op->src[0]; - const struct ggml_tensor * src1 = op->src[1]; - const int64_t ne10 = src1->ne[0]; - - const int64_t ne0 = op->ne[0]; - const int64_t ne1 = op->ne[1]; + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; // TODO: find the optimal value const int64_t min_batch = 32; @@ -554,29 +520,8 @@ static bool ggml_backend_blas_device_supports_op(ggml_backend_dev_t dev, const s (src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL); } - case GGML_OP_MUL_MAT_ID: - { - const ggml_tensor * src0 = op->src[0]; - const ggml_tensor * src1 = op->src[1]; - - return ggml_is_contiguous(src0) && - ggml_is_contiguous(src1) && - src1->type == GGML_TYPE_F32 && - (src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL); - } - - case GGML_OP_OUT_PROD: - return op->src[0]->type == GGML_TYPE_F32 && - op->src[1]->type == GGML_TYPE_F32 && - ggml_is_matrix(src0) && - ggml_is_matrix(src1) && - ggml_is_contiguous(src0) && - (ggml_is_contiguous(src1) || 
ggml_is_transposed(src1)) && - (src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL); - default: return false; - } GGML_UNUSED(dev); @@ -588,7 +533,7 @@ static bool ggml_backend_blas_device_supports_buft(ggml_backend_dev_t dev, ggml_ GGML_UNUSED(dev); } -static const struct ggml_backend_device_i ggml_backend_blas_device_i = { +static const ggml_backend_device_i ggml_backend_blas_device_i = { /* .get_name = */ ggml_backend_blas_device_get_name, /* .get_description = */ ggml_backend_blas_device_get_description, /* .get_memory = */ ggml_backend_blas_device_get_memory, @@ -597,7 +542,7 @@ static const struct ggml_backend_device_i ggml_backend_blas_device_i = { /* .init_backend = */ ggml_backend_blas_device_init_backend, /* .get_buffer_type = */ ggml_backend_blas_device_get_buffer_type, /* .get_host_buffer_type = */ NULL, - /* .buffer_from_host_ptr = */ ggml_backend_blas_device_buffer_from_host_ptr, + /* .buffer_from_host_ptr = */ NULL, /* .supports_op = */ ggml_backend_blas_device_supports_op, /* .supports_buft = */ ggml_backend_blas_device_supports_buft, /* .offload_op = */ NULL, @@ -606,7 +551,7 @@ static const struct ggml_backend_device_i ggml_backend_blas_device_i = { /* .event_synchronize = */ NULL, }; -// backend reg interface +// BLAS backend - backend (reg) static const char * ggml_backend_blas_reg_get_name(ggml_backend_reg_t reg) { return "BLAS"; @@ -623,29 +568,27 @@ static size_t ggml_backend_blas_reg_get_device_count(ggml_backend_reg_t reg) { static ggml_backend_dev_t ggml_backend_blas_reg_get_device(ggml_backend_reg_t reg, size_t index) { GGML_ASSERT(index == 0); + static ggml_backend_blas_device_context ctx; static ggml_backend_device ggml_backend_blas_device = { /* .iface = */ ggml_backend_blas_device_i, /* .reg = */ reg, - /* .context = */ nullptr, + /* .context = */ &ctx, }; return &ggml_backend_blas_device; - - GGML_UNUSED(reg); - GGML_UNUSED(index); } static void * ggml_backend_blas_get_proc_address(ggml_backend_reg_t reg, const char * name) { - if (std::strcmp(name, "ggml_backend_set_n_threads") == 0) { + if (strcmp(name, "ggml_backend_set_n_threads") == 0) { return (void *)ggml_backend_blas_set_n_threads; } - return NULL; + + return nullptr; GGML_UNUSED(reg); - GGML_UNUSED(name); } -static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = { +static const ggml_backend_reg_i ggml_backend_blas_reg_i = { /* .get_name = */ ggml_backend_blas_reg_get_name, /* .get_device_count = */ ggml_backend_blas_reg_get_device_count, /* .get_device = */ ggml_backend_blas_reg_get_device, @@ -653,7 +596,7 @@ static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = { }; ggml_backend_reg_t ggml_backend_blas_reg(void) { - static struct ggml_backend_reg ggml_backend_blas_reg = { + static ggml_backend_reg ggml_backend_blas_reg = { /* .api_version = */ GGML_BACKEND_API_VERSION, /* .iface = */ ggml_backend_blas_reg_i, /* .context = */ NULL, From aae6d1e9b0adfd73db507fc3e29bd5acf542e9d7 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Sun, 14 Dec 2025 19:15:34 +0800 Subject: [PATCH 06/15] ggml-blas: fix invalid data access Signed-off-by: Aaron Teo --- ggml/src/ggml-blas/ggml-blas-rework.cpp | 685 ++++++++++++++++++++++++ ggml/src/ggml-blas/ggml-blas.cpp | 2 + 2 files changed, 687 insertions(+) create mode 100644 ggml/src/ggml-blas/ggml-blas-rework.cpp diff --git a/ggml/src/ggml-blas/ggml-blas-rework.cpp b/ggml/src/ggml-blas/ggml-blas-rework.cpp new file mode 100644 index 0000000000..9c8be1470c --- /dev/null +++ b/ggml/src/ggml-blas/ggml-blas-rework.cpp @@ 
-0,0 +1,685 @@
+#include "ggml-impl.h"
+#include "ggml-blas.h"
+#include "ggml-backend-impl.h"
+
+#include <future>
+#include <vector>
+#include <cstring>
+
+#if defined(GGML_BLAS_USE_ACCELERATE)
+#   include <Accelerate/Accelerate.h>
+#elif defined(GGML_BLAS_USE_MKL)
+#   include <mkl.h>
+#elif defined(GGML_BLAS_USE_BLIS)
+#   include <blis.h>
+#elif defined(GGML_BLAS_USE_NVPL)
+#   include <nvpl_blas.h>
+#else
+#   include <cblas.h>
+#endif
+
+struct ggml_backend_blas_context {
+    int n_threads = GGML_DEFAULT_N_THREADS;
+    std::unique_ptr<char[]> work_data;
+    size_t work_size = 0;
+#ifndef GGML_USE_OPENMP
+    std::vector<std::future<void>> tasks;
+#endif
+};
+
+static void ggml_backend_blas_mul_mat(
+        ggml_backend_blas_context * ctx,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const ggml_type type = src0->type;
+
+    GGML_ASSERT(ne0 == ne01);
+    GGML_ASSERT(ne1 == ne11);
+    GGML_ASSERT(ne2 == ne12);
+    GGML_ASSERT(ne3 == ne13);
+
+    // we don't support permuted src0 or src1
+    GGML_ASSERT(nb00 == ggml_type_size(type));
+    GGML_ASSERT(nb10 == ggml_type_size(src1->type));
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+    // broadcast factors
+    const int64_t r2 = ne12/ne02;
+    const int64_t r3 = ne13/ne03;
+
+    const int64_t ne_plane = ne01*ne00;
+    const size_t desired_wsize = type == GGML_TYPE_F32 ? 0 : ne03*ne02*ne_plane*sizeof(float);
+
+    if (ctx->work_size < desired_wsize) {
+        ctx->work_data.reset(new char[desired_wsize]);
+        ctx->work_size = desired_wsize;
+    }
+    void * wdata = ctx->work_data.get();
+
+    // convert src0 to float
+    if (type != GGML_TYPE_F32) {
+        const auto * type_traits = ggml_get_type_traits(type);
+        ggml_to_float_t const to_float = type_traits->to_float;
+
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
+                float * const wplane = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane;
+
+                const int min_cols_per_thread = 4096;
+                const int min_rows_per_thread = std::max((int)(min_cols_per_thread/ne00), 1);
+                const int n_threads = std::max(std::min(ctx->n_threads, (int)(ne01/min_rows_per_thread)), 1);
+
+#ifdef GGML_USE_OPENMP
+                #pragma omp parallel for num_threads(n_threads)
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
+                }
+#else
+                for (int i = 1; i < n_threads; i++) {
+                    const int64_t start = (i + 0) * ne01/n_threads;
+                    const int64_t end   = (i + 1) * ne01/n_threads;
+                    if (start < end) {
+                        ctx->tasks.push_back(std::async(std::launch::async, [=]() {
+                            for (int64_t i01 = start; i01 < end; i01++) {
+                                to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
+                            }
+                        }));
+                    }
+                }
+                {
+                    // reuse the current thread for the first task
+                    const int64_t start = 0;
+                    const int64_t end   = ne01/n_threads;
+                    for (int64_t i01 = start; i01 < end; i01++) {
+                        to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
+                    }
+                }
+#endif
+            }
+        }
+
+#ifndef GGML_USE_OPENMP
+        // wait for all tasks to finish
+        for (auto & task : ctx->tasks) {
+            task.get();
+        }
+        ctx->tasks.clear();
+#endif
+    }
+
+#if defined(OPENBLAS_VERSION)
+    openblas_set_num_threads(ctx->n_threads);
+#endif
+
+#if defined(GGML_BLAS_USE_BLIS)
+    bli_thread_set_num_threads(ctx->n_threads);
+#endif
+
+#if defined(GGML_BLAS_USE_NVPL)
+    nvpl_blas_set_num_threads(ctx->n_threads);
+#endif
+
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            const int64_t i03 
= i13/r3; + const int64_t i02 = i12/r2; + + const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03); + const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13); + float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); + + if (type != GGML_TYPE_F32) { + x = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane; + } + + GGML_LOG_INFO("%s: name='%s' type=%s dequantized first 5 elements: [%.6f, %.6f, %.6f, %.6f, %.6f]\n", + __func__, + src0->name, + ggml_type_name(src0->type), + ((float *)x)[0], + ((float *)x)[1], + ((float *)x)[2], + ((float *)x)[3], + ((float *)x)[4]); + + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, + ne1, ne01, ne10, + 1.0f, y, ne10, + x, ne00, + 0.0f, d, ne01); + } + } +} + +static void ggml_backend_blas_mul_mat_id( + ggml_backend_blas_context * ctx, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + const ggml_tensor * ids = dst->src[2]; + + GGML_TENSOR_BINARY_OP_LOCALS + + const ggml_type type = src0->type; + + GGML_ASSERT(nb00 == ggml_type_size(type)); + GGML_ASSERT(nb10 == ggml_type_size(src1->type)); + + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + GGML_ASSERT(ne03 == 1); + GGML_ASSERT(ne13 == 1); + GGML_ASSERT(ne3 == 1); + + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(ids->type == GGML_TYPE_I32); + + const int64_t ne_plane = ne01*ne00; + const size_t desired_wsize = type == GGML_TYPE_F32 + ? 0 + : ne03*ne02*ne_plane*sizeof(float); + + if (ctx->work_size < desired_wsize) { + ctx->work_data.reset(new char[desired_wsize]); + ctx->work_size = desired_wsize; + } + void * wdata = ctx->work_data.get(); + + // convert src0 to float + if (type != GGML_TYPE_F32) { + const auto * type_traits = ggml_get_type_traits(type); + ggml_to_float_t const to_float = type_traits->to_float; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + const void * x = (char *) src0->data + i02*nb02 + i03*nb03; + float * const wplane = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane; + + const int min_cols_per_thread = 4096; + const int min_rows_per_thread = std::max((int)(min_cols_per_thread/ne00), 1); + const int n_threads = std::max(std::min(ctx->n_threads, (int)(ne01/min_rows_per_thread)), 1); + +#ifdef GGML_USE_OPENMP + #pragma omp parallel for num_threads(n_threads) + for (int64_t i01 = 0; i01 < ne01; i01++) { + to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); + } +#else + for (int i = 1; i < n_threads; i++) { + const int64_t start = (i + 0)*ne01/n_threads; + const int64_t end = (i + 1)*ne01/n_threads; + if (start < end) { + ctx->tasks.push_back(std::async(std::launch::async, [=]() { + for (int64_t i01 = start; i01 < end; i01++) { + to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); + } + })); + } + } + { + // reuse the current thread for the first task + const int64_t start = 0; + const int64_t end = ne01/n_threads; + for (int64_t i01 = start; i01 < end; i01++) { + to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); + } + } +#endif + } + } + + GGML_LOG_INFO("%s: name='%s' type=%s dequantized first 5 elements: [%.6f, %.6f, %.6f, %.6f, %.6f]\n", + __func__, + src0->name, + ggml_type_name(src0->type), + ((float *)wdata)[0], + ((float *)wdata)[1], + ((float *)wdata)[2], + ((float *)wdata)[3], + ((float *)wdata)[4]); + +#ifndef GGML_USE_OPENMP + // wait for all tasks to finish + for (auto & task : ctx->tasks) { + task.get(); + 
} + ctx->tasks.clear(); +#endif + } + +#if defined(OPENBLAS_VERSION) + openblas_set_num_threads(ctx->n_threads); +#endif + +#if defined(GGML_BLAS_USE_BLIS) + bli_thread_set_num_threads(ctx->n_threads); +#endif + +#if defined(GGML_BLAS_USE_NVPL) + nvpl_blas_set_num_threads(ctx->n_threads); +#endif + + const int n_ids = ids->ne[0]; + const int n_tokens = ids->ne[1]; + + for (int t = 0; t < n_tokens; ++t) { + for (int e = 0; e < n_ids; ++e) { + const int32_t expert = *(const int32_t *) ((const char *) ids->data + e*ids->nb[0] + t*ids->nb[1]); + GGML_ASSERT(expert >= 0 && expert < ne02); + + const int e_src1 = e % ne11; + + const float * a = (float *) ((char *) src0->data + expert*nb02); + const float * b = (float *) ((char *) src1->data + e_src1*nb11 + t*nb12); + float * d = (float *) ((char *) dst->data + e*nb1 + t*nb2); + + if (type != GGML_TYPE_F32) { + a = (float *) wdata + expert*ne_plane; + } + + cblas_sgemv(CblasRowMajor, CblasNoTrans, + ne01, ne00, + 1.0f, a, ne00, + b, 1, + 0.0f, d, 1); + } + } +} + +static void ggml_backend_blas_out_prod(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT(ne0 == ne00); + GGML_ASSERT(ne1 == ne10); + GGML_ASSERT(ne2 == ne02); + GGML_ASSERT(ne02 == ne12); + GGML_ASSERT(ne3 == ne13); + GGML_ASSERT(ne03 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == sizeof(float)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + // GGML_ASSERT(nb0 <= nb1); + // GGML_ASSERT(nb1 <= nb2); + // GGML_ASSERT(nb2 <= nb3); + + // Arguments to ggml_compute_forward_out_prod (expressed as major,minor) + // src0: (k,n) + // src1: (k,m) + // dst: (m,n) + // + // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f) + // Also expressed as (major,minor) + // a: (m,k): so src1 transposed + // b: (k,n): so src0 + // c: (m,n) + // + // However, if ggml_is_transposed(src1) is true, then + // src1->data already contains a transposed version, so sgemm mustn't + // transpose it further. 
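+    //
+    // Worked example (hypothetical shapes, for illustration only, assuming src1 is
+    // not transposed): src0->ne = {64, 32} gives n = 64, k = 32; src1->ne = {16, 32}
+    // gives m = 16; dst->ne = {64, 16}. sgemm below is then called with m=16, n=64,
+    // k=32, a = src1 (stored 32x16, read as 16x32 via CblasTrans, lda = 16),
+    // b = src0 (32x64), c = dst (16x64).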
+ + int n = src0->ne[0]; + int k = src0->ne[1]; + int m = src1->ne[0]; + + CBLAS_TRANSPOSE transposeA; + int lda; + + if (!ggml_is_transposed(src1)) { + transposeA = CblasTrans; + lda = m; + } else { + transposeA = CblasNoTrans; + lda = k; + } + + float * a = (float *) ((char *) src1->data); + float * b = (float *) ((char *) src0->data); + float * c = (float *) ((char *) dst->data); + + cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n); + + GGML_UNUSED(ctx); +} + +// backend interface + +static const char * ggml_backend_blas_get_name(ggml_backend_t backend) { + return "BLAS"; + + GGML_UNUSED(backend); +} + +static void ggml_backend_blas_free(ggml_backend_t backend) { + ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; + delete ctx; + delete backend; +} + +static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; + + for (int i = 0; i < cgraph->n_nodes; i++) { + struct ggml_tensor * node = cgraph->nodes[i]; + + switch (node->op) { + case GGML_OP_MUL_MAT: + ggml_backend_blas_mul_mat(ctx, node); + break; + + case GGML_OP_MUL_MAT_ID: + ggml_backend_blas_mul_mat_id(ctx, node); + break; + + case GGML_OP_OUT_PROD: + ggml_backend_blas_out_prod(ctx, node); + break; + + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + break; + + default: + GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node)); + } + } + + return GGML_STATUS_SUCCESS; + + GGML_UNUSED(backend); +} + +static struct ggml_backend_i blas_backend_i = { + /* .get_name = */ ggml_backend_blas_get_name, + /* .free = */ ggml_backend_blas_free, + /* .set_tensor_async = */ NULL, + /* .get_tensor_async = */ NULL, + /* .cpy_tensor_async = */ NULL, + /* .synchronize = */ NULL, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_blas_graph_compute, + /* .event_record = */ NULL, + /* .event_wait = */ NULL, + /* .graph_optimize = */ NULL, +}; + +static ggml_guid_t ggml_backend_blas_guid(void) { + static ggml_guid guid = { 0x12, 0xa8, 0xae, 0xf4, 0xc0, 0x1e, 0x61, 0x97, 0x8f, 0xeb, 0x33, 0x04, 0xa1, 0x33, 0x51, 0x2d }; + return &guid; +} + +ggml_backend_t ggml_backend_blas_init(void) { + ggml_backend_blas_context * ctx = new ggml_backend_blas_context; + + ggml_backend_t backend = new ggml_backend { + /* .guid = */ ggml_backend_blas_guid(), + /* .iface = */ blas_backend_i, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0), + /* .context = */ ctx, + }; + +#if defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP) + if (openblas_get_parallel() != OPENBLAS_OPENMP) { + GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__); + } +#endif + +#if defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP) + GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__); +#endif + + return backend; +} + +bool ggml_backend_is_blas(ggml_backend_t backend) { + return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_blas_guid()); +} + +void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads) { + GGML_ASSERT(ggml_backend_is_blas(backend_blas)); + + 
ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend_blas->context; + ctx->n_threads = n_threads; +} + +// device interface + +static const char * ggml_backend_blas_device_get_name(ggml_backend_dev_t dev) { + return "BLAS"; + + GGML_UNUSED(dev); +} + +static const char * ggml_backend_blas_device_get_description(ggml_backend_dev_t dev) { + #if defined(GGML_BLAS_USE_ACCELERATE) + return "Accelerate"; + #elif defined(GGML_BLAS_USE_MKL) + return "MKL"; + #elif defined(GGML_BLAS_USE_BLIS) + return "BLIS"; + #elif defined(GGML_BLAS_USE_NVPL) + return "NVPL"; + #elif defined(OPENBLAS_VERSION) + return "OpenBLAS"; + #else + return "BLAS"; + #endif + + GGML_UNUSED(dev); +} + +static void ggml_backend_blas_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { + // TODO + *free = 0; + *total = 0; + + GGML_UNUSED(dev); +} + +static enum ggml_backend_dev_type ggml_backend_blas_device_get_type(ggml_backend_dev_t dev) { + return GGML_BACKEND_DEVICE_TYPE_ACCEL; + + GGML_UNUSED(dev); +} + +static void ggml_backend_blas_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { + props->name = ggml_backend_blas_device_get_name(dev); + props->description = ggml_backend_blas_device_get_description(dev); + props->type = ggml_backend_blas_device_get_type(dev); + ggml_backend_blas_device_get_memory(dev, &props->memory_free, &props->memory_total); + props->caps = { + /* .async = */ false, + /* .host_buffer = */ false, + /* .buffer_from_host_ptr = */ true, + /* .events = */ false, + }; +} + +static ggml_backend_t ggml_backend_blas_device_init_backend(ggml_backend_dev_t dev, const char * params) { + return ggml_backend_blas_init(); + + GGML_UNUSED(dev); + GGML_UNUSED(params); +} + +static ggml_backend_buffer_type_t ggml_backend_blas_device_get_buffer_type(ggml_backend_dev_t dev) { + return ggml_backend_cpu_buffer_type(); + + GGML_UNUSED(dev); +} + +static ggml_backend_buffer_t ggml_backend_blas_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { + return ggml_backend_cpu_buffer_from_ptr(ptr, size); + + GGML_UNUSED(dev); + GGML_UNUSED(max_tensor_size); +} + +static bool ggml_backend_blas_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { + const struct ggml_tensor * src0 = op->src[0]; + const struct ggml_tensor * src1 = op->src[1]; + + switch (op->op) { + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + return true; + + case GGML_OP_MUL_MAT: + { + // BLAS usually is only faster for large matrices + const struct ggml_tensor * src0 = op->src[0]; + const struct ggml_tensor * src1 = op->src[1]; + + const int64_t ne10 = src1->ne[0]; + + const int64_t ne0 = op->ne[0]; + const int64_t ne1 = op->ne[1]; + + // TODO: find the optimal value + const int64_t min_batch = 32; + + return ggml_is_contiguous(src0) && + ggml_is_contiguous(src1) && + src1->type == GGML_TYPE_F32 && + (ne0 >= min_batch && ne1 >= min_batch && ne10 >= min_batch) && + (src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL); + } + + case GGML_OP_MUL_MAT_ID: + { + const ggml_tensor * src0 = op->src[0]; + const ggml_tensor * src1 = op->src[1]; + + return ggml_is_contiguous(src0) && + ggml_is_contiguous(src1) && + src1->type == GGML_TYPE_F32 && + (src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL); + } + + case GGML_OP_OUT_PROD: + return op->src[0]->type == GGML_TYPE_F32 && + op->src[1]->type == 
GGML_TYPE_F32 && + ggml_is_matrix(src0) && + ggml_is_matrix(src1) && + ggml_is_contiguous(src0) && + (ggml_is_contiguous(src1) || ggml_is_transposed(src1)) && + (src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL); + + default: + return false; + + } + + GGML_UNUSED(dev); +} + +static bool ggml_backend_blas_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { + return ggml_backend_buft_is_host(buft); + + GGML_UNUSED(dev); +} + +static const struct ggml_backend_device_i ggml_backend_blas_device_i = { + /* .get_name = */ ggml_backend_blas_device_get_name, + /* .get_description = */ ggml_backend_blas_device_get_description, + /* .get_memory = */ ggml_backend_blas_device_get_memory, + /* .get_type = */ ggml_backend_blas_device_get_type, + /* .get_props = */ ggml_backend_blas_device_get_props, + /* .init_backend = */ ggml_backend_blas_device_init_backend, + /* .get_buffer_type = */ ggml_backend_blas_device_get_buffer_type, + /* .get_host_buffer_type = */ NULL, + /* .buffer_from_host_ptr = */ ggml_backend_blas_device_buffer_from_host_ptr, + /* .supports_op = */ ggml_backend_blas_device_supports_op, + /* .supports_buft = */ ggml_backend_blas_device_supports_buft, + /* .offload_op = */ NULL, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_synchronize = */ NULL, +}; + +// backend reg interface + +static const char * ggml_backend_blas_reg_get_name(ggml_backend_reg_t reg) { + return "BLAS"; + + GGML_UNUSED(reg); +} + +static size_t ggml_backend_blas_reg_get_device_count(ggml_backend_reg_t reg) { + return 1; + + GGML_UNUSED(reg); +} + +static ggml_backend_dev_t ggml_backend_blas_reg_get_device(ggml_backend_reg_t reg, size_t index) { + GGML_ASSERT(index == 0); + + static ggml_backend_device ggml_backend_blas_device = { + /* .iface = */ ggml_backend_blas_device_i, + /* .reg = */ reg, + /* .context = */ nullptr, + }; + + return &ggml_backend_blas_device; + + GGML_UNUSED(reg); + GGML_UNUSED(index); +} + +static void * ggml_backend_blas_get_proc_address(ggml_backend_reg_t reg, const char * name) { + if (std::strcmp(name, "ggml_backend_set_n_threads") == 0) { + return (void *)ggml_backend_blas_set_n_threads; + } + return NULL; + + GGML_UNUSED(reg); + GGML_UNUSED(name); +} + +static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = { + /* .get_name = */ ggml_backend_blas_reg_get_name, + /* .get_device_count = */ ggml_backend_blas_reg_get_device_count, + /* .get_device = */ ggml_backend_blas_reg_get_device, + /* .get_proc_address = */ ggml_backend_blas_get_proc_address, +}; + +ggml_backend_reg_t ggml_backend_blas_reg(void) { + static struct ggml_backend_reg ggml_backend_blas_reg = { + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_blas_reg_i, + /* .context = */ NULL, + }; + + return &ggml_backend_blas_reg; +} + +GGML_BACKEND_DL_IMPL(ggml_backend_blas_reg) diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp index 4280156edd..bb33717453 100644 --- a/ggml/src/ggml-blas/ggml-blas.cpp +++ b/ggml/src/ggml-blas/ggml-blas.cpp @@ -515,6 +515,8 @@ static bool ggml_backend_blas_device_supports_op(ggml_backend_dev_t dev, const g return ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && + src0->view_src == nullptr && + src1->view_src == nullptr && src1->type == GGML_TYPE_F32 && (ne0 >= min_batch && ne1 >= min_batch && ne10 >= min_batch) && (src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL); From 717531b1a7e616400de3cb130a489f67b60ae649 Mon Sep 
17 00:00:00 2001 From: Aaron Teo Date: Sun, 14 Dec 2025 19:22:14 +0800 Subject: [PATCH 07/15] ggml-blas: add note Signed-off-by: Aaron Teo --- ggml/src/ggml-blas/ggml-blas.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp index bb33717453..65cd2c1c40 100644 --- a/ggml/src/ggml-blas/ggml-blas.cpp +++ b/ggml/src/ggml-blas/ggml-blas.cpp @@ -513,13 +513,14 @@ static bool ggml_backend_blas_device_supports_op(ggml_backend_dev_t dev, const g // TODO: find the optimal value const int64_t min_batch = 32; - return ggml_is_contiguous(src0) && - ggml_is_contiguous(src1) && - src0->view_src == nullptr && - src1->view_src == nullptr && - src1->type == GGML_TYPE_F32 && - (ne0 >= min_batch && ne1 >= min_batch && ne10 >= min_batch) && - (src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL); + return ggml_is_contiguous(src0) + && ggml_is_contiguous(src1) + && src1->type == GGML_TYPE_F32 + // NOTE: llama-bench creates views that somehow does not go through init_tensor + // this prevents the uninitialized views from being used in BLAS + && src0->view_src == nullptr && src1->view_src == nullptr + && (ne0 >= min_batch && ne1 >= min_batch && ne10 >= min_batch) + && (src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL); } default: From 447057973c009b1d09a194228dee3a6535b04fe5 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Sun, 14 Dec 2025 19:27:42 +0800 Subject: [PATCH 08/15] ggml-blas: fix ne Signed-off-by: Aaron Teo --- ggml/src/ggml-blas/ggml-blas.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp index 65cd2c1c40..ebd7ce8bb2 100644 --- a/ggml/src/ggml-blas/ggml-blas.cpp +++ b/ggml/src/ggml-blas/ggml-blas.cpp @@ -315,7 +315,7 @@ static void ggml_backend_blas_mul_mat( ne1, ne01, ne10, 1.0f, y, ne10, x, ne00, - 0.0f, d, nb1/nb0); + 0.0f, d, ne01); } } } From 6dff031caa10dd0afbefdb4bbaf54779ca15fd05 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Sun, 14 Dec 2025 21:57:09 +0800 Subject: [PATCH 09/15] ggml-blas: force dequant routine to use max logical cores Signed-off-by: Aaron Teo --- ggml/src/ggml-blas/ggml-blas.cpp | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp index ebd7ce8bb2..62887d0190 100644 --- a/ggml/src/ggml-blas/ggml-blas.cpp +++ b/ggml/src/ggml-blas/ggml-blas.cpp @@ -6,6 +6,7 @@ #include "ggml-backend.h" #include +#include #include #include #include @@ -27,6 +28,10 @@ struct ggml_backend_blas_buffer { size_t size; }; +struct ggml_backend_blas_buffer_type_context { + int n_threads; +}; + // BLAS backend - buffer static void ggml_backend_blas_buffer_free_buffer(ggml_backend_buffer_t buffer) { @@ -95,7 +100,7 @@ static void ggml_backend_blas_buffer_set_tensor( GGML_ASSERT(tensor); memcpy((char *)tensor->data + offset, data, size); - // ggml_backend_blas_buffer_context * buf_ctx = (ggml_backend_blas_buffer_context *)buffer->buft->context; + ggml_backend_blas_buffer_type_context * buft_ctx = (ggml_backend_blas_buffer_type_context *)buffer->buft->context; ggml_backend_blas_buffer * extra = (ggml_backend_blas_buffer *)tensor->extra; const int64_t ne00 = tensor->ne[0]; @@ -125,7 +130,7 @@ static void ggml_backend_blas_buffer_set_tensor( const int min_cols_per_thread = 4096; const int min_rows_per_thread = std::max((int)(min_cols_per_thread / ne00), 1); - 
const int n_threads = std::max(std::min(8, (int)(ne01 / min_rows_per_thread)), 1); + const int n_threads = std::max(std::min(buft_ctx->n_threads, (int)(ne01 / min_rows_per_thread)), 1); #pragma omp parallel for num_threads(n_threads) for (int64_t i01 = 0; i01 < ne01; i01++) { @@ -134,8 +139,6 @@ static void ggml_backend_blas_buffer_set_tensor( } } } - - GGML_UNUSED(buffer); } static void ggml_backend_blas_buffer_get_tensor( @@ -205,6 +208,10 @@ static bool ggml_backend_blas_buffer_type_is_host(ggml_backend_buffer_type_t buf } static ggml_backend_buffer_type_t ggml_backend_blas_buffer_type(void) { + static ggml_backend_blas_buffer_type_context buft_ctx = { + /* .n_threads = */ (int)std::thread::hardware_concurrency(), + }; + static ggml_backend_buffer_type ggml_backend_blas_buffer_type = { /* .iface = */ { /* .get_name = */ ggml_backend_blas_buffer_type_get_name, @@ -215,7 +222,7 @@ static ggml_backend_buffer_type_t ggml_backend_blas_buffer_type(void) { /* .is_host = */ ggml_backend_blas_buffer_type_is_host, }, /* .device = */ NULL, - /* .context = */ NULL, + /* .context = */ &buft_ctx, }; return &ggml_backend_blas_buffer_type; @@ -419,7 +426,6 @@ bool ggml_backend_is_blas(ggml_backend_t backend) { } void ggml_backend_blas_set_n_threads(ggml_backend_t backend, int n_threads) { - // TODO: IMPL GGML_ASSERT(ggml_backend_is_blas(backend)); ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; From e481be6da6a2f28910f85fb58f077eb8c530ba18 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Sun, 14 Dec 2025 22:19:12 +0800 Subject: [PATCH 10/15] ggml-blas: move global blas n threads to set_n_threads Signed-off-by: Aaron Teo --- ggml/src/ggml-blas/ggml-blas.cpp | 42 ++++++++++++++++---------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp index 62887d0190..ca896c2541 100644 --- a/ggml/src/ggml-blas/ggml-blas.cpp +++ b/ggml/src/ggml-blas/ggml-blas.cpp @@ -91,11 +91,11 @@ static void ggml_backend_blas_buffer_memset_tensor( } static void ggml_backend_blas_buffer_set_tensor( - ggml_backend_buffer_t buffer, - ggml_tensor * tensor, - const void * data, - size_t offset, - size_t size) { + ggml_backend_buffer_t buffer, + ggml_tensor * tensor, + const void * data, + size_t offset, + size_t size) { GGML_ASSERT(tensor); memcpy((char *)tensor->data + offset, data, size); @@ -143,10 +143,10 @@ static void ggml_backend_blas_buffer_set_tensor( static void ggml_backend_blas_buffer_get_tensor( ggml_backend_buffer_t buffer, - const ggml_tensor * tensor, - void * data, - size_t offset, - size_t size) { + const ggml_tensor * tensor, + void * data, + size_t offset, + size_t size) { GGML_ASSERT(tensor); memcpy(data, (const char *)tensor->data + offset, size); @@ -292,18 +292,6 @@ static void ggml_backend_blas_mul_mat( const ggml_backend_blas_buffer * extra = (ggml_backend_blas_buffer *)src0->extra; -#if defined(OPENBLAS_VERSION) - openblas_set_num_threads(ctx->n_threads); -#endif - -#if defined(GGML_BLAS_USE_BLIS) - bli_thread_set_num_threads(ctx->n_threads); -#endif - -#if defined(GGML_BLAS_USE_NVPL) - nvpl_blas_set_num_threads(ctx->n_threads); -#endif - for (int64_t i13 = 0; i13 < ne13; i13++) { for (int64_t i12 = 0; i12 < ne12; i12++) { const int64_t i03 = i13/r3; @@ -430,6 +418,18 @@ void ggml_backend_blas_set_n_threads(ggml_backend_t backend, int n_threads) { ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; ctx->n_threads = n_threads; + +#if defined(OPENBLAS_VERSION) + 
openblas_set_num_threads(ctx->n_threads); +#endif + +#if defined(GGML_BLAS_USE_BLIS) + bli_thread_set_num_threads(ctx->n_threads); +#endif + +#if defined(GGML_BLAS_USE_NVPL) + nvpl_blas_set_num_threads(ctx->n_threads); +#endif } // TODO: maybe implement description? From 7998d08b29771ceb704decd37dcbdc15d55555d2 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Sun, 14 Dec 2025 23:07:54 +0800 Subject: [PATCH 11/15] ggml-blas: bring back openmp Signed-off-by: Aaron Teo --- ggml/src/ggml-blas/ggml-blas.cpp | 41 +++++++++++++++++++++++++++++--- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp index ca896c2541..8b416719db 100644 --- a/ggml/src/ggml-blas/ggml-blas.cpp +++ b/ggml/src/ggml-blas/ggml-blas.cpp @@ -25,11 +25,15 @@ struct ggml_backend_blas_buffer { void * data; // dequantized data - size_t size; + size_t size; // ggml_nelements * sizeof(float) }; struct ggml_backend_blas_buffer_type_context { int n_threads; + +#ifndef GGML_USE_OPENMP + std::vector> tasks; +#endif }; // BLAS backend - buffer @@ -132,12 +136,42 @@ static void ggml_backend_blas_buffer_set_tensor( const int min_rows_per_thread = std::max((int)(min_cols_per_thread / ne00), 1); const int n_threads = std::max(std::min(buft_ctx->n_threads, (int)(ne01 / min_rows_per_thread)), 1); +#ifdef GGML_USE_OPENMP #pragma omp parallel for num_threads(n_threads) for (int64_t i01 = 0; i01 < ne01; i01++) { to_float((const char *)x + i01*nb01, wplane + i01*ne00, ne00); } +#else + for (int i = 1; i < n_threads; i++) { + const int64_t start = (i + 0) * ne01/n_threads; + const int64_t end = (i + 1) * ne01/n_threads; + if (start < end) { + buft_ctx->tasks.push_back(std::async(std::launch::async, [=]() { + for (int64_t i01 = start; i01 < end; i01++) { + to_float((const char *)x + i01*nb01, wplane + i01*ne00, ne00); + } + })); + } + } + { + // reuse the current thread for the first task + const int64_t start = 0; + const int64_t end = ne01/n_threads; + for (int64_t i01 = start; i01 < end; i01++) { + to_float((const char *)x + i01*nb01, wplane + i01*ne00, ne00); + } + } +#endif } } + +#ifndef GGML_USE_OPENMP + // wait for all tasks to finish + for (auto & task : buft_ctx->tasks) { + task.get(); + } + buft_ctx->tasks.clear(); +#endif } } @@ -185,7 +219,6 @@ static ggml_backend_buffer_t ggml_backend_blas_buffer_type_alloc_buffer( ggml_backend_buffer_type_t buft, size_t size) { - // TODO: contains dequantized data void * data = ggml_aligned_malloc(size); if (data == nullptr) { GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size); @@ -210,6 +243,9 @@ static bool ggml_backend_blas_buffer_type_is_host(ggml_backend_buffer_type_t buf static ggml_backend_buffer_type_t ggml_backend_blas_buffer_type(void) { static ggml_backend_blas_buffer_type_context buft_ctx = { /* .n_threads = */ (int)std::thread::hardware_concurrency(), +#ifndef GGML_USE_OPENMP + /* .tasks = */ std::vector>(), +#endif }; static ggml_backend_buffer_type ggml_backend_blas_buffer_type = { @@ -432,7 +468,6 @@ void ggml_backend_blas_set_n_threads(ggml_backend_t backend, int n_threads) { #endif } -// TODO: maybe implement description? 
struct ggml_backend_blas_device_context { int blas_device; int blas_device_ref_count; From 75e506ff22a258bb16c732471ac8d84b2d0d960f Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Sun, 14 Dec 2025 23:19:22 +0800 Subject: [PATCH 12/15] ggml-blas: clean up code Signed-off-by: Aaron Teo --- ggml/src/ggml-blas/ggml-blas.cpp | 49 ++++---------------------------- 1 file changed, 6 insertions(+), 43 deletions(-) diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp index 8b416719db..aadaa93546 100644 --- a/ggml/src/ggml-blas/ggml-blas.cpp +++ b/ggml/src/ggml-blas/ggml-blas.cpp @@ -67,8 +67,6 @@ static enum ggml_status ggml_backend_blas_buffer_init_tensor( return GGML_STATUS_SUCCESS; } - void * ctx = buffer->context; - if (tensor->type != GGML_TYPE_F32) { ggml_backend_blas_buffer * extra = new ggml_backend_blas_buffer; extra->data = ggml_aligned_malloc(ggml_nelements(tensor) * sizeof(float)); // sizeof(float) because dequantized @@ -78,7 +76,6 @@ static enum ggml_status ggml_backend_blas_buffer_init_tensor( } return GGML_STATUS_SUCCESS; - GGML_UNUSED(ctx); } static void ggml_backend_blas_buffer_memset_tensor( @@ -265,34 +262,9 @@ static ggml_backend_buffer_type_t ggml_backend_blas_buffer_type(void) { } struct ggml_backend_blas_context { - int device; - - int n_threads; - ggml_threadpool_t threadpool; - - uint8_t * work_data; - size_t work_size; - - ggml_abort_callback abort_callback; - void * abort_callback_data; - - // std::unique_ptr work_data; - // size_t work_size = 0; -// #ifndef GGML_USE_OPENMP -// std::vector> tasks; -// #endif -// ggml_cgraph * gf; + int n_threads; }; -// struct ggml_backend_blas_context { -// int n_threads = GGML_DEFAULT_N_THREADS; -// std::unique_ptr work_data; -// size_t work_size = 0; -// #ifndef GGML_USE_OPENMP -// std::vector> tasks; -// #endif -// }; - static void ggml_backend_blas_mul_mat( ggml_backend_blas_context * ctx, ggml_tensor * dst) { @@ -322,9 +294,7 @@ static void ggml_backend_blas_mul_mat( // broadcast factors const int64_t r2 = ne12/ne02; const int64_t r3 = ne13/ne03; - - const int64_t ne_plane = ne01*ne00; - const size_t desired_wsize = type == GGML_TYPE_F32 ? 
0 : ne03*ne02*ne_plane*sizeof(float); + const int64_t ne_plane = ne01*ne00; const ggml_backend_blas_buffer * extra = (ggml_backend_blas_buffer *)src0->extra; @@ -349,6 +319,8 @@ static void ggml_backend_blas_mul_mat( 0.0f, d, ne01); } } + + GGML_UNUSED(ctx); } @@ -361,7 +333,6 @@ static const char * ggml_backend_blas_get_name(ggml_backend_t backend) { static void ggml_backend_blas_free(ggml_backend_t backend) { ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; - delete[] ctx->work_data; delete ctx; delete backend; } @@ -423,12 +394,7 @@ ggml_backend_t ggml_backend_blas_init(void) { return NULL; } - ctx->n_threads = GGML_DEFAULT_N_THREADS; - ctx->threadpool = NULL; - ctx->work_data = nullptr; - ctx->work_size = 0; - ctx->abort_callback = NULL; - ctx->abort_callback_data = nullptr; + ctx->n_threads = GGML_DEFAULT_N_THREADS; ggml_backend_t blas_backend = new ggml_backend { /* .guid = */ ggml_backend_blas_guid(), @@ -468,10 +434,7 @@ void ggml_backend_blas_set_n_threads(ggml_backend_t backend, int n_threads) { #endif } -struct ggml_backend_blas_device_context { - int blas_device; - int blas_device_ref_count; -}; +struct ggml_backend_blas_device_context {}; static const char * ggml_backend_blas_device_get_name(ggml_backend_dev_t dev) { return "BLAS"; From 10ce5e056dceffd5b7ccb70accf7fe99a1f08a3a Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Sun, 14 Dec 2025 23:20:28 +0800 Subject: [PATCH 13/15] ggml-blas: more code formatting Signed-off-by: Aaron Teo --- ggml/src/ggml-blas/ggml-blas.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp index aadaa93546..a46391b93b 100644 --- a/ggml/src/ggml-blas/ggml-blas.cpp +++ b/ggml/src/ggml-blas/ggml-blas.cpp @@ -80,10 +80,10 @@ static enum ggml_status ggml_backend_blas_buffer_init_tensor( static void ggml_backend_blas_buffer_memset_tensor( ggml_backend_buffer_t buffer, - ggml_tensor * tensor, - uint8_t value, - size_t offset, - size_t size) { + ggml_tensor * tensor, + uint8_t value, + size_t offset, + size_t size) { GGML_ASSERT(tensor); memset((char *)tensor->data + offset, value, size); From 46dea5da74a295409cf3c768a7e34bb134135799 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Sun, 14 Dec 2025 23:23:08 +0800 Subject: [PATCH 14/15] CODEOWNERS: add @taronaeo to blas backend [no ci] Signed-off-by: Aaron Teo --- CODEOWNERS | 1 + 1 file changed, 1 insertion(+) diff --git a/CODEOWNERS b/CODEOWNERS index 8e62a36e81..a15f023c34 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -47,6 +47,7 @@ /ggml/cmake/ @ggerganov /ggml/include/ @ggerganov /ggml/src/ggml-common.h @ggerganov +/ggml/src/ggml-blas/ @taronaeo /ggml/src/ggml-cpu/ @ggerganov /ggml/src/ggml-cpu/spacemit/ @alex-spacemit /ggml/src/ggml-cuda/fattn* @JohannesGaessler From 04ed19bbc00f04d5acc7473f9e2e5399540edba7 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Sun, 14 Dec 2025 23:37:56 +0800 Subject: [PATCH 15/15] ggml-blas: further cleanup Signed-off-by: Aaron Teo --- ggml/src/ggml-blas/ggml-blas.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp index a46391b93b..a4e3898f13 100644 --- a/ggml/src/ggml-blas/ggml-blas.cpp +++ b/ggml/src/ggml-blas/ggml-blas.cpp @@ -76,6 +76,8 @@ static enum ggml_status ggml_backend_blas_buffer_init_tensor( } return GGML_STATUS_SUCCESS; + + GGML_UNUSED(buffer); } static void ggml_backend_blas_buffer_memset_tensor( @@ -170,6 +172,8 @@ static void ggml_backend_blas_buffer_set_tensor( 
buft_ctx->tasks.clear(); #endif } + + GGML_UNUSED(nb00); } static void ggml_backend_blas_buffer_get_tensor(