ggml-cpu: fix x86 SIMD detection — validate OS YMM/ZMM/AMX state via XCR0

The ggml_backend_cpu_x86_score() function contained a FIXME comment
admitting it did not check for OS support. All AVX/AVX2/AVX-512/AMX
feature checks were raw CPUID bit reads with no OSXSAVE or XGETBV
validation.

On CPUs where CPUID reports AVX2 support but the OS has not enabled
YMM register save/restore (e.g. certain hypervisors, containers with
restricted XSAVE, or Windows builds with disabled AVX context), ggml
would select an AVX2/AVX-512 backend that immediately faults with
SIGILL on the first vector instruction.

Fix: add xgetbv() (MSVC _xgetbv / GCC-Clang inline asm), then three
predicates that are now called by the affected feature methods:

  os_saves_ymm() — CPUID.1:ECX[27] (OSXSAVE) + XCR0[2:1] == 0b11
    gates: AVX, AVX2, FMA, F16C, AVX_VNNI

  os_saves_zmm() — os_saves_ymm() + XCR0[7:5] == 0b111
    gates: AVX512F/DQ/PF/ER/CD/BW/VL, AVX512_VBMI/VNNI/FP16/BF16

  os_saves_amx() — os_saves_zmm() + XCR0[18:17] == 0b11
    gates: AMX_TILE, AMX_INT8, AMX_FP16, AMX_BF16

No interface changes — all call sites continue to use is.AVX2() etc.
The fix is self-contained within cpuid_x86.

Resolves the FIXME in ggml_backend_cpu_x86_score().

Designed and implemented by Matthew Busel.
This commit is contained in:
Mattbusel 2026-03-11 04:34:42 -04:00
parent 5f91b1d5d5
commit 8197d9756e
1 changed file with 51 additions and 22 deletions

View File

@ -18,7 +18,8 @@ struct cpuid_x86 {
bool PCLMULQDQ(void) { return f_1_ecx[1]; }
bool MONITOR(void) { return f_1_ecx[3]; }
bool SSSE3(void) { return f_1_ecx[9]; }
bool FMA(void) { return f_1_ecx[12]; }
// FMA, F16C, AVX, AVX2, AVX_VNNI use YMM registers — require OS YMM save.
bool FMA(void) { return f_1_ecx[12] && os_saves_ymm(); }
bool CMPXCHG16B(void) { return f_1_ecx[13]; }
bool SSE41(void) { return f_1_ecx[19]; }
bool SSE42(void) { return f_1_ecx[20]; }
@ -27,8 +28,8 @@ struct cpuid_x86 {
bool AES(void) { return f_1_ecx[25]; }
bool XSAVE(void) { return f_1_ecx[26]; }
bool OSXSAVE(void) { return f_1_ecx[27]; }
bool AVX(void) { return f_1_ecx[28]; }
bool F16C(void) { return f_1_ecx[29]; }
bool AVX(void) { return f_1_ecx[28] && os_saves_ymm(); }
bool F16C(void) { return f_1_ecx[29] && os_saves_ymm(); }
bool RDRAND(void) { return f_1_ecx[30]; }
bool MSR(void) { return f_1_edx[5]; }
@ -44,20 +45,21 @@ struct cpuid_x86 {
bool FSGSBASE(void) { return f_7_ebx[0]; }
bool BMI1(void) { return f_7_ebx[3]; }
bool HLE(void) { return is_intel && f_7_ebx[4]; }
bool AVX2(void) { return f_7_ebx[5]; }
bool AVX2(void) { return f_7_ebx[5] && os_saves_ymm(); }
bool BMI2(void) { return f_7_ebx[8]; }
bool ERMS(void) { return f_7_ebx[9]; }
bool INVPCID(void) { return f_7_ebx[10]; }
bool RTM(void) { return is_intel && f_7_ebx[11]; }
bool AVX512F(void) { return f_7_ebx[16]; }
bool AVX512DQ(void) { return f_7_ebx[17]; }
// All AVX-512 variants use ZMM registers — require OS ZMM save.
bool AVX512F(void) { return f_7_ebx[16] && os_saves_zmm(); }
bool AVX512DQ(void) { return f_7_ebx[17] && os_saves_zmm(); }
bool RDSEED(void) { return f_7_ebx[18]; }
bool ADX(void) { return f_7_ebx[19]; }
bool AVX512PF(void) { return f_7_ebx[26]; }
bool AVX512ER(void) { return f_7_ebx[27]; }
bool AVX512CD(void) { return f_7_ebx[28]; }
bool AVX512BW(void) { return f_7_ebx[30]; }
bool AVX512VL(void) { return f_7_ebx[31]; }
bool AVX512PF(void) { return f_7_ebx[26] && os_saves_zmm(); }
bool AVX512ER(void) { return f_7_ebx[27] && os_saves_zmm(); }
bool AVX512CD(void) { return f_7_ebx[28] && os_saves_zmm(); }
bool AVX512BW(void) { return f_7_ebx[30] && os_saves_zmm(); }
bool AVX512VL(void) { return f_7_ebx[31] && os_saves_zmm(); }
bool SHA(void) { return f_7_ebx[29]; }
@ -76,16 +78,18 @@ struct cpuid_x86 {
bool _3DNOWEXT(void) { return is_amd && f_81_edx[30]; }
bool _3DNOW(void) { return is_amd && f_81_edx[31]; }
bool AVX512_VBMI(void) { return f_7_ecx[1]; }
bool AVX512_VNNI(void) { return f_7_ecx[11]; }
bool AVX512_FP16(void) { return f_7_edx[23]; }
bool AVX512_BF16(void) { return f_7_1_eax[5]; }
bool AVX_VNNI(void) { return f_7_1_eax[4]; }
bool AVX512_VBMI(void) { return f_7_ecx[1] && os_saves_zmm(); }
bool AVX512_VNNI(void) { return f_7_ecx[11] && os_saves_zmm(); }
bool AVX512_FP16(void) { return f_7_edx[23] && os_saves_zmm(); }
bool AVX512_BF16(void) { return f_7_1_eax[5] && os_saves_zmm(); }
// AVX_VNNI uses VEX-encoded YMM instructions — require OS YMM save.
bool AVX_VNNI(void) { return f_7_1_eax[4] && os_saves_ymm(); }
bool AMX_TILE(void) { return f_7_edx[24]; }
bool AMX_INT8(void) { return f_7_edx[25]; }
bool AMX_FP16(void) { return f_7_1_eax[21]; }
bool AMX_BF16(void) { return f_7_edx[22]; }
// AMX requires OS AMX tile state save (XCR0 bits 17 and 18).
bool AMX_TILE(void) { return f_7_edx[24] && os_saves_amx(); }
bool AMX_INT8(void) { return f_7_edx[25] && os_saves_amx(); }
bool AMX_FP16(void) { return f_7_1_eax[21] && os_saves_amx(); }
bool AMX_BF16(void) { return f_7_edx[22] && os_saves_amx(); }
#ifdef _MSC_VER
static void cpuid(int cpu_info[4], int eax) {
@ -94,6 +98,7 @@ struct cpuid_x86 {
// CPUID with an explicit subleaf in ECX — MSVC intrinsic path.
static void cpuidex(int cpu_info[4], int eax, int ecx) {
    __cpuidex(cpu_info, eax, ecx);
}
// Read extended control register `xcr` — MSVC intrinsic path.
// Only legal to call when CPUID.1:ECX[27] (OSXSAVE) is set.
static uint64_t xgetbv(uint32_t xcr) {
    return _xgetbv(xcr);
}
#else
static void cpuid(int cpu_info[4], int eax) {
__asm__ __volatile__(
@ -107,8 +112,34 @@ struct cpuid_x86 {
: "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
: "a"(eax), "c"(ecx));
}
// Read extended control register `xcr` via the XGETBV instruction
// (GCC/Clang path). XGETBV raises #UD unless OSXSAVE is enabled, so
// callers must check CPUID.1:ECX[27] first.
static uint64_t xgetbv(uint32_t xcr) {
    uint32_t eax_out;
    uint32_t edx_out;
    __asm__ __volatile__("xgetbv" : "=a"(eax_out), "=d"(edx_out) : "c"(xcr));
    const uint64_t high = static_cast<uint64_t>(edx_out) << 32u;
    return high | eax_out;
}
#endif
// Returns true when the OS saves YMM registers (required for AVX/AVX2/FMA/F16C/AVX_VNNI).
// Checks CPUID.1:ECX[27] (OSXSAVE) then XCR0[2:1] == 0b11 (SSE + YMM state).
// True iff the OS has enabled XSAVE (CPUID.1:ECX[27], OSXSAVE) and saves
// both SSE and YMM register state across context switches (XCR0 bits 1-2).
// Gates every YMM-using feature: AVX, AVX2, FMA, F16C, AVX_VNNI.
bool os_saves_ymm(void) {
    const bool osxsave = f_1_ecx[27];
    if (!osxsave) {
        // Without OSXSAVE, executing XGETBV would itself fault (#UD).
        return false;
    }
    const uint64_t ymm_state = 0x6u; // XCR0 bit 1 (SSE) | bit 2 (YMM)
    return (xgetbv(0u) & ymm_state) == ymm_state;
}
// Returns true when the OS saves ZMM registers (required for all AVX-512 variants).
// Checks os_saves_ymm() then XCR0[7:5] == 0b111 (opmask + ZMM hi256 + ZMM hi16).
// True iff YMM state is OS-saved AND the three AVX-512 state components are
// enabled in XCR0: opmask (bit 5), ZMM_Hi256 (bit 6), Hi16_ZMM (bit 7).
// Gates all AVX-512 feature variants.
bool os_saves_zmm(void) {
    const uint64_t zmm_state = 0xE0u; // XCR0 bits 5, 6, 7
    // os_saves_ymm() is evaluated first, so XGETBV is only reached when legal.
    return os_saves_ymm() && (xgetbv(0u) & zmm_state) == zmm_state;
}
// Returns true when the OS saves AMX tile state (required for AMX-* instructions).
// Checks os_saves_zmm() then XCR0[18:17] == 0b11 (XTILECFG + XTILEDATA).
// True iff ZMM state is OS-saved AND the AMX tile components are enabled in
// XCR0: XTILECFG (bit 17) and XTILEDATA (bit 18). Gates AMX_TILE/INT8/FP16/BF16.
// NOTE(review): requiring ZMM state before AMX is conservative — it matches
// current hardware, where AMX parts always ship with AVX-512.
bool os_saves_amx(void) {
    const uint64_t amx_state = 0x60000u; // XCR0 bits 17 and 18
    return os_saves_zmm() && (xgetbv(0u) & amx_state) == amx_state;
}
cpuid_x86() {
std::array<int, 4> cpui;
std::vector<std::array<int, 4>> data;
@ -261,8 +292,6 @@ void test_x86_is() {
#endif
static int ggml_backend_cpu_x86_score() {
// FIXME: this does not check for OS support
int score = 1;
cpuid_x86 is;