From c6980ff29ddc8e59c9c002dcaeec14182d893ed7 Mon Sep 17 00:00:00 2001 From: shalinib-ibm Date: Fri, 6 Mar 2026 20:52:39 +0530 Subject: [PATCH] ggml-cpu: Fix gcc 15 ICE on ppc64le (#20083) (#20130) This patch addresses an Internal Compiler Error (Segmentation fault) observed with gcc 15 by replacing the intrinsic + cast by doing a cat on the data first and then calling the intrinsic. This bypasses the buggy compiler path while maintaining identical instruction selection. Performance Verification: Assembly analysis on RHEL 9 (GCC 15.1.1) confirms that both the original code and this fix generate the identical Power10 prefixed load instruction: `plxv 40, 2(14)` This ensures zero performance regression while unblocking builds on newer toolchains. Reproduced on: - Alpine Linux + GCC 15.2.0-r2 - RHEL 9 + GCC 15.1.1 (gcc-toolset-15) Signed-off-by: Shalini Salomi Bodapati --- ggml/src/ggml-cpu/llamafile/sgemm.cpp | 32 +++++++++++++-------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp index 5fd452a03d..c89e5076f2 100644 --- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp +++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp @@ -2497,7 +2497,7 @@ class tinyBLAS_Q0_PPC { for (int r = 0; r < 8; r++) { const block_q4_0 * current_blk = rows_base[r] + blk; vector float v_scale = vec_extract_fp32_from_shorth(vec_splats(current_blk->d)); - vector signed char v_qs = reinterpret_cast(vec_xl(0, current_blk->qs)); + vector signed char v_qs = vec_xl(0, (const vector signed char *)current_blk->qs); vector signed char c1, c2; unpack_q4_to_q8(v_qs, c1, c2); convert_and_scale_q8(c1, v_scale, hp_res[r][0], hp_res[r][1]); @@ -2611,14 +2611,14 @@ class tinyBLAS_Q0_PPC { i = (cols >> 2); if (i > 0) { do { - c1[1] = reinterpret_cast(vec_xl(0, aoffset1->qs)); - c2[1] = reinterpret_cast(vec_xl(0, aoffset2->qs)); - c3[1] = reinterpret_cast(vec_xl(0, aoffset3->qs)); - c4[1] = reinterpret_cast(vec_xl(0, aoffset4->qs)); - c5[1] = reinterpret_cast(vec_xl(0, aoffset5->qs)); - c6[1] = reinterpret_cast(vec_xl(0, aoffset6->qs)); - c7[1] = reinterpret_cast(vec_xl(0, aoffset7->qs)); - c8[1] = reinterpret_cast(vec_xl(0, aoffset8->qs)); + c1[1] = vec_xl(0, (const vector signed char *)aoffset1->qs); + c2[1] = vec_xl(0, (const vector signed char *)aoffset2->qs); + c3[1] = vec_xl(0, (const vector signed char *)aoffset3->qs); + c4[1] = vec_xl(0, (const vector signed char *)aoffset4->qs); + c5[1] = vec_xl(0, (const vector signed char *)aoffset5->qs); + c6[1] = vec_xl(0, (const vector signed char *)aoffset6->qs); + c7[1] = vec_xl(0, (const vector signed char *)aoffset7->qs); + c8[1] = vec_xl(0, (const vector signed char *)aoffset8->qs); process_q4_elements(c1, & comparray[0]); process_q4_elements(c2, & comparray[1]); @@ -2657,10 +2657,10 @@ class tinyBLAS_Q0_PPC { i = (cols >> 2); if (i > 0) { do { - c1[1] = reinterpret_cast(vec_xl(0, aoffset1->qs)); - c2[1] = reinterpret_cast(vec_xl(0, aoffset2->qs)); - c3[1] = reinterpret_cast(vec_xl(0, aoffset3->qs)); - c4[1] = reinterpret_cast(vec_xl(0, aoffset4->qs)); + c1[1] = vec_xl(0, (const vector signed char *)aoffset1->qs); + c2[1] = vec_xl(0, (const vector signed char *)aoffset2->qs); + c3[1] = vec_xl(0, (const vector signed char *)aoffset3->qs); + c4[1] = vec_xl(0, (const vector signed char *)aoffset4->qs); process_q4_elements(c1, & comparray[0]); process_q4_elements(c2, & comparray[1]); @@ -2686,9 +2686,9 @@ class tinyBLAS_Q0_PPC { if (i > 0) { do { switch(rows) { - case 3: c3[1] = reinterpret_cast(vec_xl(0, aoffset3->qs)); - case 2: c2[1] = reinterpret_cast(vec_xl(0, aoffset2->qs)); - case 1: c1[1] = reinterpret_cast(vec_xl(0, aoffset1->qs)); + case 3: c3[1] = vec_xl(0, (const vector signed char *)aoffset3->qs); + case 2: c2[1] = vec_xl(0, (const vector signed char *)aoffset2->qs); + case 1: c1[1] = vec_xl(0, (const vector signed char *)aoffset1->qs); break; } process_q4_elements(c1, & comparray[0]);