diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h
index 7597377cc2..548dcf979d 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-impl.h
+++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h
@@ -517,6 +517,7 @@ static __m256 __lasx_xvreplfr2vr_s(const float val) {
 
 // TODO: move to ggml-threading
 void ggml_barrier(struct ggml_threadpool * tp);
+void ggml_cpu_set_numa_thread_affinity(int thread_n);
 
 void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value);
 int  ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value);
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index a59b518938..a1ebf4955d 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -2087,7 +2087,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
 
 // Android's libc implementation "bionic" does not support setting affinity
 #if defined(__gnu_linux__)
-static void set_numa_thread_affinity(int thread_n) {
+void ggml_cpu_set_numa_thread_affinity(int thread_n) {
     if (!ggml_is_numa()) {
         return;
     }
@@ -2155,7 +2155,7 @@ static void clear_numa_thread_affinity(void) {
 #else
 // TODO: Windows etc.
 // (the linux implementation may also work on BSD, someone should test)
-static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
+void ggml_cpu_set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
 static void clear_numa_thread_affinity(void) {}
 #endif
 
@@ -2923,7 +2923,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     const struct ggml_cgraph  * cgraph = tp->cgraph;
     const struct ggml_cplan   * cplan  = tp->cplan;
 
-    set_numa_thread_affinity(state->ith);
+    ggml_cpu_set_numa_thread_affinity(state->ith);
 
     struct ggml_compute_params params = {
         /*.ith   =*/ state->ith,
diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp
index b9727e6c9c..c2de7b9adf 100644
--- a/ggml/src/ggml-cpu/repack.cpp
+++ b/ggml/src/ggml-cpu/repack.cpp
@@ -16,6 +16,10 @@
 #include <cassert>
 #include <cstdio>  // for GGML_ASSERT
 
+#if defined(GGML_USE_OPENMP)
+#include <omp.h>
+#endif
+
 #include "repack.h"
 
 #if defined(__GNUC__)
@@ -1429,7 +1433,7 @@ static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block
     const int n_row_groups = nrow / nrows_interleaved;
 
 #ifdef GGML_USE_OPENMP
-    #pragma omp parallel for
+#pragma omp for schedule(static)
 #endif
     for (int bg = 0; bg < n_row_groups; bg++) {
         const int b = bg * nrows_interleaved;
@@ -1468,7 +1472,7 @@ static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block
     const int n_row_groups = nrow / nrows_interleaved;
 
 #ifdef GGML_USE_OPENMP
-    #pragma omp parallel for
+#pragma omp for schedule(static)
 #endif
     for (int bg = 0; bg < n_row_groups; bg++) {
         const int b = bg * nrows_interleaved;
@@ -1507,7 +1511,7 @@ static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block
     const int n_row_groups = nrow / nrows_interleaved;
 
 #ifdef GGML_USE_OPENMP
-    #pragma omp parallel for
+#pragma omp for schedule(static)
 #endif
     for (int bg = 0; bg < n_row_groups; bg++) {
         const int b = bg * nrows_interleaved;
@@ -1546,7 +1550,7 @@ static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block
     const int n_row_groups = nrow / nrows_interleaved;
 
 #ifdef GGML_USE_OPENMP
-    #pragma omp parallel for
+#pragma omp for schedule(static)
 #endif
     for (int bg = 0; bg < n_row_groups; bg++) {
         const int b = bg * nrows_interleaved;
@@ -1621,7 +1625,7 @@ static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_b
     const int n_row_groups = nrow / nrows_interleaved;
 
 #ifdef GGML_USE_OPENMP
-    #pragma omp parallel for
+#pragma omp for schedule(static)
 #endif
     for (int bg = 0; bg < n_row_groups; bg++) {
         const int b = bg * nrows_interleaved;
@@ -1685,7 +1689,7 @@ static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_b
     const int n_row_groups = nrow / nrows_interleaved;
 
 #ifdef GGML_USE_OPENMP
-    #pragma omp parallel for
+#pragma omp for schedule(static)
 #endif
     for (int bg = 0; bg < n_row_groups; bg++) {
         const int b = bg * nrows_interleaved;
@@ -2188,9 +2192,21 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PA
     int repack(struct ggml_tensor * t, const void * data, size_t data_size) override {
         GGML_LOG_DEBUG("%s: repack tensor %s with %s_%dx%d\n", __func__, t->name, ggml_type_name(t->type),
                        (int) NB_COLS, (int) INTER_SIZE);
-        return ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(t, data, data_size);
+        int ret = 0;
+#ifdef GGML_USE_OPENMP
+        #pragma omp parallel
+        {
+            ggml_cpu_set_numa_thread_affinity(omp_get_thread_num());
+            int r = ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(t, data, data_size);
+            #pragma omp master
+            ret = r;
+        }
+#else
+        ret = ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(t, data, data_size);
+#endif
+        return ret;
     }
 };
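
Outside the patch itself, a minimal self-contained sketch of the threading pattern the repack() change relies on: each thread of an enclosing "#pragma omp parallel" region first pins itself (via ggml_cpu_set_numa_thread_affinity(omp_get_thread_num()) in ggml, stubbed as set_affinity() below), and the per-type repack routines then use an orphaned "#pragma omp for schedule(static)" worksharing loop that binds to that region. The repack_rows() helper and the toy buffer are invented for illustration; only the affinity call and the pragma placement mirror the diff.

// Minimal sketch (not part of the patch): set_affinity() is a stand-in for
// ggml_cpu_set_numa_thread_affinity(), which on Linux pins the calling thread
// to the CPUs of its NUMA node; repack_rows() stands in for the
// repack_*_to_*_bl() routines, whose "#pragma omp for" has no "parallel"
// clause and therefore binds to whatever parallel region encloses the call.
#include <cstdio>
#include <vector>
#if defined(_OPENMP)
#include <omp.h>
#endif

static void set_affinity(int thread_n) {
    (void) thread_n; // placeholder for the NUMA affinity call
}

static void repack_rows(std::vector<int> & rows) {
#if defined(_OPENMP)
#pragma omp for schedule(static)
#endif
    for (int i = 0; i < (int) rows.size(); i++) {
        rows[i] *= 2; // pretend per-row-group interleave work
    }
}

int main() {
    std::vector<int> rows(1024, 1);
#if defined(_OPENMP)
    #pragma omp parallel
    {
        set_affinity(omp_get_thread_num()); // pin each thread before the work
        repack_rows(rows);                  // threads share the omp for loop
    }
#else
    repack_rows(rows);
#endif
    std::printf("rows[0] = %d\n", rows[0]); // prints 2 either way
    return 0;
}

Presumably the reason for hoisting a single parallel region into repack() rather than keeping per-loop "parallel for" is that the NUMA affinity set once per thread then stays in effect for every worksharing loop that follows.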