From 7998d08b29771ceb704decd37dcbdc15d55555d2 Mon Sep 17 00:00:00 2001
From: Aaron Teo
Date: Sun, 14 Dec 2025 23:07:54 +0800
Subject: [PATCH] ggml-blas: bring back openmp

Signed-off-by: Aaron Teo
---
 ggml/src/ggml-blas/ggml-blas.cpp | 41 +++++++++++++++++++++++++++++---
 1 file changed, 38 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp
index ca896c2541..8b416719db 100644
--- a/ggml/src/ggml-blas/ggml-blas.cpp
+++ b/ggml/src/ggml-blas/ggml-blas.cpp
@@ -25,11 +25,15 @@
 
 struct ggml_backend_blas_buffer {
     void * data; // dequantized data
-    size_t size;
+    size_t size; // ggml_nelements * sizeof(float)
 };
 
 struct ggml_backend_blas_buffer_type_context {
     int n_threads;
+
+#ifndef GGML_USE_OPENMP
+    std::vector<std::future<void>> tasks;
+#endif
 };
 
 // BLAS backend - buffer
@@ -132,12 +136,42 @@ static void ggml_backend_blas_buffer_set_tensor(
                 const int min_rows_per_thread = std::max((int)(min_cols_per_thread / ne00), 1);
                 const int n_threads = std::max(std::min(buft_ctx->n_threads, (int)(ne01 / min_rows_per_thread)), 1);
 
+#ifdef GGML_USE_OPENMP
 #pragma omp parallel for num_threads(n_threads)
                 for (int64_t i01 = 0; i01 < ne01; i01++) {
                     to_float((const char *)x + i01*nb01, wplane + i01*ne00, ne00);
                 }
+#else
+                for (int i = 1; i < n_threads; i++) {
+                    const int64_t start = (i + 0) * ne01/n_threads;
+                    const int64_t end   = (i + 1) * ne01/n_threads;
+                    if (start < end) {
+                        buft_ctx->tasks.push_back(std::async(std::launch::async, [=]() {
+                            for (int64_t i01 = start; i01 < end; i01++) {
+                                to_float((const char *)x + i01*nb01, wplane + i01*ne00, ne00);
+                            }
+                        }));
+                    }
+                }
+                {
+                    // reuse the current thread for the first task
+                    const int64_t start = 0;
+                    const int64_t end   = ne01/n_threads;
+                    for (int64_t i01 = start; i01 < end; i01++) {
+                        to_float((const char *)x + i01*nb01, wplane + i01*ne00, ne00);
+                    }
+                }
+#endif
             }
         }
+
+#ifndef GGML_USE_OPENMP
+        // wait for all tasks to finish
+        for (auto & task : buft_ctx->tasks) {
+            task.get();
+        }
+        buft_ctx->tasks.clear();
+#endif
     }
 }
 
@@ -185,7 +219,6 @@
 static ggml_backend_buffer_t ggml_backend_blas_buffer_type_alloc_buffer(
         ggml_backend_buffer_type_t buft,
         size_t size) {
-    // TODO: contains dequantized data
     void * data = ggml_aligned_malloc(size);
     if (data == nullptr) {
         GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
@@ -210,6 +243,9 @@ static bool ggml_backend_blas_buffer_type_is_host(ggml_backend_buffer_type_t buf
 static ggml_backend_buffer_type_t ggml_backend_blas_buffer_type(void) {
     static ggml_backend_blas_buffer_type_context buft_ctx = {
         /* .n_threads = */ (int)std::thread::hardware_concurrency(),
+#ifndef GGML_USE_OPENMP
+        /* .tasks     = */ std::vector<std::future<void>>(),
+#endif
     };
 
     static ggml_backend_buffer_type ggml_backend_blas_buffer_type = {
@@ -432,7 +468,6 @@ void ggml_backend_blas_set_n_threads(ggml_backend_t backend, int n_threads) {
 #endif
 }
 
-// TODO: maybe implement description?
 struct ggml_backend_blas_device_context {
     int blas_device;
     int blas_device_ref_count;
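
Note (reviewer aside, not part of the patch): the #else branch above splits the ne01 rows of each plane across n_threads std::async tasks, keeps the first chunk on the calling thread, and later drains buft_ctx->tasks so every future completes before the dequantized buffer is used. Below is a minimal standalone C++ sketch of that split-and-join pattern; convert_row() and the demo sizes are illustrative stand-ins for to_float() and the real tensor dimensions, not part of ggml.

// sketch of the std::async row-split used in the non-OpenMP path
#include <cstdint>
#include <cstdio>
#include <future>
#include <vector>

// stand-in for to_float(): converts one row of ne00 elements
static void convert_row(const float * src, float * dst, int64_t ne00) {
    for (int64_t j = 0; j < ne00; j++) {
        dst[j] = src[j] * 2.0f; // placeholder conversion
    }
}

int main() {
    const int64_t ne00      = 8;   // columns per row
    const int64_t ne01      = 10;  // rows
    const int     n_threads = 4;

    std::vector<float> src(ne00 * ne01, 1.0f);
    std::vector<float> dst(ne00 * ne01, 0.0f);

    std::vector<std::future<void>> tasks;

    // launch rows [i*ne01/n_threads, (i+1)*ne01/n_threads) on helper threads
    for (int i = 1; i < n_threads; i++) {
        const int64_t start = (i + 0) * ne01 / n_threads;
        const int64_t end   = (i + 1) * ne01 / n_threads;
        if (start < end) {
            tasks.push_back(std::async(std::launch::async, [&, start, end]() {
                for (int64_t i01 = start; i01 < end; i01++) {
                    convert_row(src.data() + i01*ne00, dst.data() + i01*ne00, ne00);
                }
            }));
        }
    }

    // reuse the current thread for the first chunk
    for (int64_t i01 = 0; i01 < ne01 / n_threads; i01++) {
        convert_row(src.data() + i01*ne00, dst.data() + i01*ne00, ne00);
    }

    // wait for all tasks to finish before the converted data is used
    for (auto & task : tasks) {
        task.get();
    }
    tasks.clear();

    printf("dst[0] = %.1f, dst[last] = %.1f\n", dst[0], dst[ne00*ne01 - 1]);
    return 0;
}

The same integer partitioning as in the patch covers all rows: the calling thread handles rows [0, ne01/n_threads) and the last task ends at (n_threads)*ne01/n_threads == ne01, so no row is skipped when ne01 does not divide evenly.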