From 21b530eab33b9b6ac8937503b920abd9f5361fff Mon Sep 17 00:00:00 2001
From: Ragesh Hajela
Date: Fri, 31 Oct 2025 09:36:42 +0530
Subject: [PATCH 1/2] Disable NUMA-specific chunking for high-core-count HPC systems

---
 ggml/src/ggml-cpu/ggml-cpu.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index b5466dd703..6e7accb7e2 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -1360,7 +1360,14 @@ UseGgmlGemm2:;
     // If the chunking is poor for the number of threads on this setup, scrap the whole plan. Re-chunk it by thread.
     // Also, chunking by thread was measured to have perform better on NUMA systems. See https://github.com/ggml-org/llama.cpp/pull/6915
     // In theory, chunking should be just as useful on NUMA and non NUMA systems, but testing disagreed with that.
-    if (nchunk0 * nchunk1 < nth * 4 || ggml_is_numa()) {
+    // If the current chunking plan is inefficient for the available threads, re-chunk it by thread.
+    // - Original observation: For low-core NUMA machines, re-chunking improves performance
+    //   when there are too few chunks per thread (see https://github.com/ggml-org/llama.cpp/pull/6915).
+    // - Our observation on AWS Graviton4 (high-core, high-memory bandwidth) shows that
+    //   disabling this re-chunking for nth >= 128 can actually improve performance.
+    // - Therefore, we only apply re-chunking when nth <= 128 and the chunking is poor
+    //   or on NUMA machines.
+    if (nth <= 128 && (nchunk0 * nchunk1 < nth * 4 || ggml_is_numa())) {
         // distribute the thread work across the inner or outer loop based on which one is larger
         nchunk0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
         nchunk1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows

From 8972883d17fb3782355a0c0c8a236c08120ad1ae Mon Sep 17 00:00:00 2001
From: Ragesh Hajela
Date: Mon, 10 Nov 2025 12:48:48 +0530
Subject: [PATCH 2/2] remove whitespace in ggml-cpu.c

---
 ggml/src/ggml-cpu/ggml-cpu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 6e7accb7e2..807d7a3a1d 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -1361,7 +1361,7 @@ UseGgmlGemm2:;
     // Also, chunking by thread was measured to have perform better on NUMA systems. See https://github.com/ggml-org/llama.cpp/pull/6915
     // In theory, chunking should be just as useful on NUMA and non NUMA systems, but testing disagreed with that.
     // If the current chunking plan is inefficient for the available threads, re-chunk it by thread.
-    // - Original observation: For low-core NUMA machines, re-chunking improves performance 
+    // - Original observation: For low-core NUMA machines, re-chunking improves performance
     //   when there are too few chunks per thread (see https://github.com/ggml-org/llama.cpp/pull/6915).
     // - Our observation on AWS Graviton4 (high-core, high-memory bandwidth) shows that
     //   disabling this re-chunking for nth >= 128 can actually improve performance.
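
For reference, a minimal standalone sketch (not part of the patch) of the re-chunking rule that PATCH 1/2 introduces. The nr0/nr1/nth values, the numa flag, and the initial chunk counts are illustrative assumptions; in ggml-cpu.c the NUMA flag comes from ggml_is_numa() and nth from the compute params/threadpool.

/* Illustrative sketch of the re-chunking rule from PATCH 1/2.
 * All concrete values below are assumed examples, not taken from ggml-cpu.c. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    const int64_t nr0  = 4096;  // rows of src0 (example)
    const int64_t nr1  = 32;    // rows of src1 (example)
    const int     nth  = 192;   // thread count of a high-core machine (example)
    const bool    numa = false; // stand-in for ggml_is_numa()

    // Default plan: some fixed number of chunks per dimension (example values).
    int64_t nchunk0 = 16;
    int64_t nchunk1 = 1;

    // Patched rule: re-chunk by thread only when nth <= 128 and either the plan
    // is too coarse or the machine is NUMA; with nth = 192 the default plan is kept.
    if (nth <= 128 && (nchunk0 * nchunk1 < nth * 4 || numa)) {
        nchunk0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
        nchunk1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
    }

    printf("nchunk0 = %lld, nchunk1 = %lld\n", (long long) nchunk0, (long long) nchunk1);
    return 0;
}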