ggml-cpu: fix x86 SIMD detection — validate OS YMM/ZMM/AMX state via XCR0
The ggml_backend_cpu_x86_score() function contained a FIXME comment
admitting it did not check for OS support. All AVX/AVX2/AVX-512/AMX
feature checks were raw CPUID bit reads with no OSXSAVE or XGETBV
validation.
On CPUs where CPUID reports AVX2 support but the OS has not enabled
YMM register save/restore (e.g. certain hypervisors, containers with
restricted XSAVE, or Windows builds with disabled AVX context), ggml
would select an AVX2/AVX-512 backend that immediately faults with
SIGILL on the first vector instruction.
Fix: add an xgetbv() helper (the _xgetbv intrinsic on MSVC; inline asm on
GCC/Clang), plus three predicates now called by the affected feature methods:
os_saves_ymm() — CPUID.1:ECX[27] (OSXSAVE) + XCR0[2:1] == 0b11
gates: AVX, AVX2, FMA, F16C, AVX_VNNI
os_saves_zmm() — os_saves_ymm() + XCR0[7:5] == 0b111
gates: AVX512F/DQ/PF/ER/CD/BW/VL, AVX512_VBMI/VNNI/FP16/BF16
os_saves_amx() — os_saves_zmm() + XCR0[18:17] == 0b11
gates: AMX_TILE, AMX_INT8, AMX_FP16, AMX_BF16
No interface changes — all call sites continue to use is.AVX2() etc.
The fix is self-contained within cpuid_x86.
Resolves the FIXME in ggml_backend_cpu_x86_score().
Designed and implemented by Matthew Busel.
This commit is contained in:
parent
5f91b1d5d5
commit
8197d9756e
|
|
@ -18,7 +18,8 @@ struct cpuid_x86 {
|
|||
bool PCLMULQDQ(void) { return f_1_ecx[1]; }
|
||||
bool MONITOR(void) { return f_1_ecx[3]; }
|
||||
bool SSSE3(void) { return f_1_ecx[9]; }
|
||||
bool FMA(void) { return f_1_ecx[12]; }
|
||||
// FMA, F16C, AVX, AVX2, AVX_VNNI use YMM registers — require OS YMM save.
|
||||
bool FMA(void) { return f_1_ecx[12] && os_saves_ymm(); }
|
||||
bool CMPXCHG16B(void) { return f_1_ecx[13]; }
|
||||
bool SSE41(void) { return f_1_ecx[19]; }
|
||||
bool SSE42(void) { return f_1_ecx[20]; }
|
||||
|
|
@ -27,8 +28,8 @@ struct cpuid_x86 {
|
|||
bool AES(void) { return f_1_ecx[25]; }
|
||||
bool XSAVE(void) { return f_1_ecx[26]; }
|
||||
bool OSXSAVE(void) { return f_1_ecx[27]; }
|
||||
bool AVX(void) { return f_1_ecx[28]; }
|
||||
bool F16C(void) { return f_1_ecx[29]; }
|
||||
bool AVX(void) { return f_1_ecx[28] && os_saves_ymm(); }
|
||||
bool F16C(void) { return f_1_ecx[29] && os_saves_ymm(); }
|
||||
bool RDRAND(void) { return f_1_ecx[30]; }
|
||||
|
||||
bool MSR(void) { return f_1_edx[5]; }
|
||||
|
|
@ -44,20 +45,21 @@ struct cpuid_x86 {
|
|||
bool FSGSBASE(void) { return f_7_ebx[0]; }
|
||||
bool BMI1(void) { return f_7_ebx[3]; }
|
||||
bool HLE(void) { return is_intel && f_7_ebx[4]; }
|
||||
bool AVX2(void) { return f_7_ebx[5]; }
|
||||
bool AVX2(void) { return f_7_ebx[5] && os_saves_ymm(); }
|
||||
bool BMI2(void) { return f_7_ebx[8]; }
|
||||
bool ERMS(void) { return f_7_ebx[9]; }
|
||||
bool INVPCID(void) { return f_7_ebx[10]; }
|
||||
bool RTM(void) { return is_intel && f_7_ebx[11]; }
|
||||
bool AVX512F(void) { return f_7_ebx[16]; }
|
||||
bool AVX512DQ(void) { return f_7_ebx[17]; }
|
||||
// All AVX-512 variants use ZMM registers — require OS ZMM save.
|
||||
bool AVX512F(void) { return f_7_ebx[16] && os_saves_zmm(); }
|
||||
bool AVX512DQ(void) { return f_7_ebx[17] && os_saves_zmm(); }
|
||||
bool RDSEED(void) { return f_7_ebx[18]; }
|
||||
bool ADX(void) { return f_7_ebx[19]; }
|
||||
bool AVX512PF(void) { return f_7_ebx[26]; }
|
||||
bool AVX512ER(void) { return f_7_ebx[27]; }
|
||||
bool AVX512CD(void) { return f_7_ebx[28]; }
|
||||
bool AVX512BW(void) { return f_7_ebx[30]; }
|
||||
bool AVX512VL(void) { return f_7_ebx[31]; }
|
||||
bool AVX512PF(void) { return f_7_ebx[26] && os_saves_zmm(); }
|
||||
bool AVX512ER(void) { return f_7_ebx[27] && os_saves_zmm(); }
|
||||
bool AVX512CD(void) { return f_7_ebx[28] && os_saves_zmm(); }
|
||||
bool AVX512BW(void) { return f_7_ebx[30] && os_saves_zmm(); }
|
||||
bool AVX512VL(void) { return f_7_ebx[31] && os_saves_zmm(); }
|
||||
|
||||
bool SHA(void) { return f_7_ebx[29]; }
|
||||
|
||||
|
|
@ -76,16 +78,18 @@ struct cpuid_x86 {
|
|||
bool _3DNOWEXT(void) { return is_amd && f_81_edx[30]; }
|
||||
bool _3DNOW(void) { return is_amd && f_81_edx[31]; }
|
||||
|
||||
bool AVX512_VBMI(void) { return f_7_ecx[1]; }
|
||||
bool AVX512_VNNI(void) { return f_7_ecx[11]; }
|
||||
bool AVX512_FP16(void) { return f_7_edx[23]; }
|
||||
bool AVX512_BF16(void) { return f_7_1_eax[5]; }
|
||||
bool AVX_VNNI(void) { return f_7_1_eax[4]; }
|
||||
bool AVX512_VBMI(void) { return f_7_ecx[1] && os_saves_zmm(); }
|
||||
bool AVX512_VNNI(void) { return f_7_ecx[11] && os_saves_zmm(); }
|
||||
bool AVX512_FP16(void) { return f_7_edx[23] && os_saves_zmm(); }
|
||||
bool AVX512_BF16(void) { return f_7_1_eax[5] && os_saves_zmm(); }
|
||||
// AVX_VNNI uses VEX-encoded YMM instructions — require OS YMM save.
|
||||
bool AVX_VNNI(void) { return f_7_1_eax[4] && os_saves_ymm(); }
|
||||
|
||||
bool AMX_TILE(void) { return f_7_edx[24]; }
|
||||
bool AMX_INT8(void) { return f_7_edx[25]; }
|
||||
bool AMX_FP16(void) { return f_7_1_eax[21]; }
|
||||
bool AMX_BF16(void) { return f_7_edx[22]; }
|
||||
// AMX requires OS AMX tile state save (XCR0 bits 17 and 18).
|
||||
bool AMX_TILE(void) { return f_7_edx[24] && os_saves_amx(); }
|
||||
bool AMX_INT8(void) { return f_7_edx[25] && os_saves_amx(); }
|
||||
bool AMX_FP16(void) { return f_7_1_eax[21] && os_saves_amx(); }
|
||||
bool AMX_BF16(void) { return f_7_edx[22] && os_saves_amx(); }
|
||||
|
||||
#ifdef _MSC_VER
|
||||
static void cpuid(int cpu_info[4], int eax) {
|
||||
|
|
@ -94,6 +98,7 @@ struct cpuid_x86 {
|
|||
static void cpuidex(int cpu_info[4], int eax, int ecx) {
|
||||
__cpuidex(cpu_info, eax, ecx);
|
||||
}
|
||||
static uint64_t xgetbv(uint32_t xcr) { return _xgetbv(xcr); }
|
||||
#else
|
||||
static void cpuid(int cpu_info[4], int eax) {
|
||||
__asm__ __volatile__(
|
||||
|
|
@ -107,8 +112,34 @@ struct cpuid_x86 {
|
|||
: "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
|
||||
: "a"(eax), "c"(ecx));
|
||||
}
|
||||
static uint64_t xgetbv(uint32_t xcr) {
|
||||
uint32_t lo, hi;
|
||||
__asm__ __volatile__("xgetbv" : "=a"(lo), "=d"(hi) : "c"(xcr));
|
||||
return (static_cast<uint64_t>(hi) << 32u) | lo;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Returns true when the OS saves YMM registers (required for AVX/AVX2/FMA/F16C/AVX_VNNI).
|
||||
// Checks CPUID.1:ECX[27] (OSXSAVE) then XCR0[2:1] == 0b11 (SSE + YMM state).
|
||||
bool os_saves_ymm(void) {
|
||||
if (!f_1_ecx[27]) { return false; } // OSXSAVE bit not set
|
||||
return (xgetbv(0u) & 0x6u) == 0x6u; // XCR0 bits 1 (SSE) and 2 (YMM) both set
|
||||
}
|
||||
|
||||
// Returns true when the OS saves ZMM registers (required for all AVX-512 variants).
|
||||
// Checks os_saves_ymm() then XCR0[7:5] == 0b111 (opmask + ZMM hi256 + ZMM hi16).
|
||||
bool os_saves_zmm(void) {
|
||||
if (!os_saves_ymm()) { return false; }
|
||||
return (xgetbv(0u) & 0xE0u) == 0xE0u; // XCR0 bits 5, 6, 7
|
||||
}
|
||||
|
||||
// Returns true when the OS saves AMX tile state (required for AMX-* instructions).
|
||||
// Checks os_saves_zmm() then XCR0[18:17] == 0b11 (XTILECFG + XTILEDATA).
|
||||
bool os_saves_amx(void) {
|
||||
if (!os_saves_zmm()) { return false; }
|
||||
return (xgetbv(0u) & 0x60000u) == 0x60000u; // XCR0 bits 17 and 18
|
||||
}
|
||||
|
||||
cpuid_x86() {
|
||||
std::array<int, 4> cpui;
|
||||
std::vector<std::array<int, 4>> data;
|
||||
|
|
@ -261,8 +292,6 @@ void test_x86_is() {
|
|||
#endif
|
||||
|
||||
static int ggml_backend_cpu_x86_score() {
|
||||
// FIXME: this does not check for OS support
|
||||
|
||||
int score = 1;
|
||||
cpuid_x86 is;
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue