diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h
index 7597377cc2..548dcf979d 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-impl.h
+++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h
@@ -517,6 +517,7 @@ static __m256 __lasx_xvreplfr2vr_s(const float val) {
 
 // TODO: move to ggml-threading
 void ggml_barrier(struct ggml_threadpool * tp);
+void ggml_cpu_set_numa_thread_affinity(int thread_n);
 
 void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value);
 int  ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value);
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index a59b518938..a1ebf4955d 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -2087,7 +2087,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
 
 // Android's libc implementation "bionic" does not support setting affinity
 #if defined(__gnu_linux__)
-static void set_numa_thread_affinity(int thread_n) {
+void ggml_cpu_set_numa_thread_affinity(int thread_n) {
     if (!ggml_is_numa()) {
         return;
     }
@@ -2155,7 +2155,7 @@ static void clear_numa_thread_affinity(void) {
 #else
 // TODO: Windows etc.
 // (the linux implementation may also work on BSD, someone should test)
-static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
+void ggml_cpu_set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
 static void clear_numa_thread_affinity(void) {}
 #endif
 
@@ -2923,7 +2923,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     const struct ggml_cgraph  * cgraph = tp->cgraph;
     const struct ggml_cplan   * cplan  = tp->cplan;
 
-    set_numa_thread_affinity(state->ith);
+    ggml_cpu_set_numa_thread_affinity(state->ith);
 
     struct ggml_compute_params params = {
         /*.ith   =*/ state->ith,
diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp
index b9727e6c9c..c2de7b9adf 100644
--- a/ggml/src/ggml-cpu/repack.cpp
+++ b/ggml/src/ggml-cpu/repack.cpp
@@ -16,6 +16,10 @@
 #include <cassert>
 #include <cstdio>  // for GGML_ASSERT
 
+#if defined(GGML_USE_OPENMP)
+#include <omp.h>
+#endif
+
 #include "repack.h"
 
 #if defined(__GNUC__)
@@ -1429,7 +1433,7 @@ static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block
     const int n_row_groups = nrow / nrows_interleaved;
 
 #ifdef GGML_USE_OPENMP
-    #pragma omp parallel for
+#pragma omp for schedule(static)
 #endif
     for (int bg = 0; bg < n_row_groups; bg++) {
         const int b = bg * nrows_interleaved;
@@ -1468,7 +1472,7 @@ static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block
     const int n_row_groups = nrow / nrows_interleaved;
 
 #ifdef GGML_USE_OPENMP
-    #pragma omp parallel for
+#pragma omp for schedule(static)
 #endif
     for (int bg = 0; bg < n_row_groups; bg++) {
         const int b = bg * nrows_interleaved;
@@ -1507,7 +1511,7 @@ static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block
     const int n_row_groups = nrow / nrows_interleaved;
 
 #ifdef GGML_USE_OPENMP
-    #pragma omp parallel for
+#pragma omp for schedule(static)
 #endif
     for (int bg = 0; bg < n_row_groups; bg++) {
         const int b = bg * nrows_interleaved;
@@ -1546,7 +1550,7 @@ static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block
     const int n_row_groups = nrow / nrows_interleaved;
 
 #ifdef GGML_USE_OPENMP
-    #pragma omp parallel for
+#pragma omp for schedule(static)
 #endif
     for (int bg = 0; bg < n_row_groups; bg++) {
         const int b = bg * nrows_interleaved;
@@ -1621,7 +1625,7 @@ static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_b
     const int n_row_groups = nrow / nrows_interleaved;
 
 #ifdef GGML_USE_OPENMP
-    #pragma omp parallel for
+#pragma omp for schedule(static)
 #endif
     for (int bg = 0; bg < n_row_groups; bg++) {
         const int b = bg * nrows_interleaved;
@@ -1685,7 +1689,7 @@ static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_b
     const int n_row_groups = nrow / nrows_interleaved;
 
 #ifdef GGML_USE_OPENMP
-    #pragma omp parallel for
+#pragma omp for schedule(static)
 #endif
     for (int bg = 0; bg < n_row_groups; bg++) {
         const int b = bg * nrows_interleaved;
@@ -2188,9 +2192,21 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PA
     int repack(struct ggml_tensor * t, const void * data, size_t data_size) override {
         GGML_LOG_DEBUG("%s: repack tensor %s with %s_%dx%d\n", __func__, t->name, ggml_type_name(t->type),
                        (int) NB_COLS, (int) INTER_SIZE);
-        return ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(t, data, data_size);
+        int ret = 0;
+#ifdef GGML_USE_OPENMP
+        #pragma omp parallel
+        {
+            ggml_cpu_set_numa_thread_affinity(omp_get_thread_num());
+            int r = ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(t, data, data_size);
+            #pragma omp master
+            ret = r;
+        }
+#else
+        ret = ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(t, data, data_size);
+#endif
+        return ret;
     }
 };
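
Outside the patch itself, a minimal self-contained sketch of the threading pattern the repack() change relies on: each thread of an enclosing "#pragma omp parallel" region first pins itself (via ggml_cpu_set_numa_thread_affinity(omp_get_thread_num()) in ggml, stubbed as set_affinity() below), and the per-type repack routines then use an orphaned "#pragma omp for schedule(static)" worksharing loop that binds to that region. The repack_rows() helper and the toy buffer are invented for illustration; only the affinity call and the pragma placement mirror the diff.

// Minimal sketch (not part of the patch): set_affinity() is a stand-in for
// ggml_cpu_set_numa_thread_affinity(), which on Linux pins the calling thread
// to the CPUs of its NUMA node; repack_rows() stands in for the
// repack_*_to_*_bl() routines, whose "#pragma omp for" has no "parallel"
// clause and therefore binds to whatever parallel region encloses the call.
#include <cstdio>
#include <vector>
#if defined(_OPENMP)
#include <omp.h>
#endif

static void set_affinity(int thread_n) {
    (void) thread_n; // placeholder for the NUMA affinity call
}

static void repack_rows(std::vector<int> & rows) {
#if defined(_OPENMP)
#pragma omp for schedule(static)
#endif
    for (int i = 0; i < (int) rows.size(); i++) {
        rows[i] *= 2; // pretend per-row-group interleave work
    }
}

int main() {
    std::vector<int> rows(1024, 1);
#if defined(_OPENMP)
    #pragma omp parallel
    {
        set_affinity(omp_get_thread_num()); // pin each thread before the work
        repack_rows(rows);                  // threads share the omp for loop
    }
#else
    repack_rows(rows);
#endif
    std::printf("rows[0] = %d\n", rows[0]); // prints 2 either way
    return 0;
}

Presumably the reason for hoisting a single parallel region into repack() rather than keeping per-loop "parallel for" is that the NUMA affinity set once per thread then stays in effect for every worksharing loop that follows.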