diff --git a/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp b/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp index d775a03638..5ffeddf608 100644 --- a/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp +++ b/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp @@ -1,327 +1,377 @@ -#include "ggml-backend-impl.h" - -#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) - -#ifdef _MSC_VER -#include -#endif - -#include -#include -#include -#include -#include - -// ref: https://cdrdv2-public.intel.com/782156/325383-sdm-vol-2abcd.pdf -struct cpuid_x86 { - bool SSE3(void) { return f_1_ecx[0]; } - bool PCLMULQDQ(void) { return f_1_ecx[1]; } - bool MONITOR(void) { return f_1_ecx[3]; } - bool SSSE3(void) { return f_1_ecx[9]; } - bool FMA(void) { return f_1_ecx[12]; } - bool CMPXCHG16B(void) { return f_1_ecx[13]; } - bool SSE41(void) { return f_1_ecx[19]; } - bool SSE42(void) { return f_1_ecx[20]; } - bool MOVBE(void) { return f_1_ecx[22]; } - bool POPCNT(void) { return f_1_ecx[23]; } - bool AES(void) { return f_1_ecx[25]; } - bool XSAVE(void) { return f_1_ecx[26]; } - bool OSXSAVE(void) { return f_1_ecx[27]; } - bool AVX(void) { return f_1_ecx[28]; } - bool F16C(void) { return f_1_ecx[29]; } - bool RDRAND(void) { return f_1_ecx[30]; } - - bool MSR(void) { return f_1_edx[5]; } - bool CX8(void) { return f_1_edx[8]; } - bool SEP(void) { return f_1_edx[11]; } - bool CMOV(void) { return f_1_edx[15]; } - bool CLFSH(void) { return f_1_edx[19]; } - bool MMX(void) { return f_1_edx[23]; } - bool FXSR(void) { return f_1_edx[24]; } - bool SSE(void) { return f_1_edx[25]; } - bool SSE2(void) { return f_1_edx[26]; } - - bool FSGSBASE(void) { return f_7_ebx[0]; } - bool BMI1(void) { return f_7_ebx[3]; } - bool HLE(void) { return is_intel && f_7_ebx[4]; } - bool AVX2(void) { return f_7_ebx[5]; } - bool BMI2(void) { return f_7_ebx[8]; } - bool ERMS(void) { return f_7_ebx[9]; } - bool INVPCID(void) { return f_7_ebx[10]; } - bool RTM(void) { return is_intel && f_7_ebx[11]; } - bool AVX512F(void) { return f_7_ebx[16]; } - bool AVX512DQ(void) { return f_7_ebx[17]; } - bool RDSEED(void) { return f_7_ebx[18]; } - bool ADX(void) { return f_7_ebx[19]; } - bool AVX512PF(void) { return f_7_ebx[26]; } - bool AVX512ER(void) { return f_7_ebx[27]; } - bool AVX512CD(void) { return f_7_ebx[28]; } - bool AVX512BW(void) { return f_7_ebx[30]; } - bool AVX512VL(void) { return f_7_ebx[31]; } - - bool SHA(void) { return f_7_ebx[29]; } - - bool PREFETCHWT1(void) { return f_7_ecx[0]; } - - bool LAHF(void) { return f_81_ecx[0]; } - bool LZCNT(void) { return is_intel && f_81_ecx[5]; } - bool ABM(void) { return is_amd && f_81_ecx[5]; } - bool SSE4a(void) { return is_amd && f_81_ecx[6]; } - bool XOP(void) { return is_amd && f_81_ecx[11]; } - bool TBM(void) { return is_amd && f_81_ecx[21]; } - - bool SYSCALL(void) { return is_intel && f_81_edx[11]; } - bool MMXEXT(void) { return is_amd && f_81_edx[22]; } - bool RDTSCP(void) { return is_intel && f_81_edx[27]; } - bool _3DNOWEXT(void) { return is_amd && f_81_edx[30]; } - bool _3DNOW(void) { return is_amd && f_81_edx[31]; } - - bool AVX512_VBMI(void) { return f_7_ecx[1]; } - bool AVX512_VNNI(void) { return f_7_ecx[11]; } - bool AVX512_FP16(void) { return f_7_edx[23]; } - bool AVX512_BF16(void) { return f_7_1_eax[5]; } - bool AVX_VNNI(void) { return f_7_1_eax[4]; } - - bool AMX_TILE(void) { return f_7_edx[24]; } - bool AMX_INT8(void) { return f_7_edx[25]; } - bool AMX_FP16(void) { return f_7_1_eax[21]; } - bool AMX_BF16(void) { return f_7_edx[22]; } - -#ifdef _MSC_VER - static void cpuid(int cpu_info[4], int eax) { - __cpuid(cpu_info, eax); - } - static void cpuidex(int cpu_info[4], int eax, int ecx) { - __cpuidex(cpu_info, eax, ecx); - } -#else - static void cpuid(int cpu_info[4], int eax) { - __asm__ __volatile__( - "cpuid" - : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) - : "a"(eax), "c"(0)); - } - static void cpuidex(int cpu_info[4], int eax, int ecx) { - __asm__ __volatile__( - "cpuid" - : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) - : "a"(eax), "c"(ecx)); - } -#endif - - cpuid_x86() { - std::array cpui; - std::vector> data; - - // calling __cpuid with 0x0 as the function_id argument - // gets the number of the highest valid function ID. - cpuid(cpui.data(), 0); - int n_ids = cpui[0]; - - for (int i = 0; i <= n_ids; ++i) { - cpuidex(cpui.data(), i, 0); - data.push_back(cpui); - } - - // capture vendor string - char vendor[0x20] = {}; - *reinterpret_cast(vendor) = data[0][1]; - *reinterpret_cast(vendor + 4) = data[0][3]; - *reinterpret_cast(vendor + 8) = data[0][2]; - this->vendor = vendor; - if (this->vendor == "GenuineIntel") { - is_intel = true; - } else if (this->vendor == "AuthenticAMD") { - is_amd = true; - } - - // load bitset with flags for function 0x00000001 - if (n_ids >= 1) { - f_1_ecx = data[1][2]; - f_1_edx = data[1][3]; - } - - // load bitset with flags for function 0x00000007 - if (n_ids >= 7) { - f_7_ebx = data[7][1]; - f_7_ecx = data[7][2]; - f_7_edx = data[7][3]; - cpuidex(cpui.data(), 7, 1); - f_7_1_eax = cpui[0]; - } - - // calling __cpuid with 0x80000000 as the function_id argument - // gets the number of the highest valid extended ID. - cpuid(cpui.data(), 0x80000000); - unsigned int n_ex_ids = cpui[0]; - - std::vector> ext_data; - for (unsigned int i = 0x80000000; i <= n_ex_ids; ++i) { - cpuidex(cpui.data(), i, 0); - ext_data.push_back(cpui); - } - - // load bitset with flags for function 0x80000001 - if (n_ex_ids >= 0x80000001) { - f_81_ecx = ext_data[1][2]; - f_81_edx = ext_data[1][3]; - } - - // interpret CPU brand string if reported - char brand[0x40] = {}; - if (n_ex_ids >= 0x80000004) { - std::memcpy(brand, ext_data[2].data(), sizeof(cpui)); - std::memcpy(brand + 16, ext_data[3].data(), sizeof(cpui)); - std::memcpy(brand + 32, ext_data[4].data(), sizeof(cpui)); - this->brand = brand; - } - } - - bool is_intel = false; - bool is_amd = false; - std::string vendor; - std::string brand; - std::bitset<32> f_1_ecx; - std::bitset<32> f_1_edx; - std::bitset<32> f_7_ebx; - std::bitset<32> f_7_ecx; - std::bitset<32> f_7_edx; - std::bitset<32> f_7_1_eax; - std::bitset<32> f_81_ecx; - std::bitset<32> f_81_edx; -}; - -#if 0 -void test_x86_is() { - cpuid_x86 is; - printf("CPU Vendor: %s\n", is.vendor.c_str()); - printf("Brand: %s\n", is.brand.c_str()); - printf("is_intel: %d\n", is.is_intel); - printf("is_amd: %d\n", is.is_amd); - printf("sse3: %d\n", is.SSE3()); - printf("pclmulqdq: %d\n", is.PCLMULQDQ()); - printf("ssse3: %d\n", is.SSSE3()); - printf("fma: %d\n", is.FMA()); - printf("cmpxchg16b: %d\n", is.CMPXCHG16B()); - printf("sse41: %d\n", is.SSE41()); - printf("sse42: %d\n", is.SSE42()); - printf("movbe: %d\n", is.MOVBE()); - printf("popcnt: %d\n", is.POPCNT()); - printf("aes: %d\n", is.AES()); - printf("xsave: %d\n", is.XSAVE()); - printf("osxsave: %d\n", is.OSXSAVE()); - printf("avx: %d\n", is.AVX()); - printf("f16c: %d\n", is.F16C()); - printf("rdrand: %d\n", is.RDRAND()); - printf("msr: %d\n", is.MSR()); - printf("cx8: %d\n", is.CX8()); - printf("sep: %d\n", is.SEP()); - printf("cmov: %d\n", is.CMOV()); - printf("clflush: %d\n", is.CLFSH()); - printf("mmx: %d\n", is.MMX()); - printf("fxsr: %d\n", is.FXSR()); - printf("sse: %d\n", is.SSE()); - printf("sse2: %d\n", is.SSE2()); - printf("fsgsbase: %d\n", is.FSGSBASE()); - printf("bmi1: %d\n", is.BMI1()); - printf("hle: %d\n", is.HLE()); - printf("avx2: %d\n", is.AVX2()); - printf("bmi2: %d\n", is.BMI2()); - printf("erms: %d\n", is.ERMS()); - printf("invpcid: %d\n", is.INVPCID()); - printf("rtm: %d\n", is.RTM()); - printf("avx512f: %d\n", is.AVX512F()); - printf("rdseed: %d\n", is.RDSEED()); - printf("adx: %d\n", is.ADX()); - printf("avx512pf: %d\n", is.AVX512PF()); - printf("avx512er: %d\n", is.AVX512ER()); - printf("avx512cd: %d\n", is.AVX512CD()); - printf("sha: %d\n", is.SHA()); - printf("prefetchwt1: %d\n", is.PREFETCHWT1()); - printf("lahf: %d\n", is.LAHF()); - printf("lzcnt: %d\n", is.LZCNT()); - printf("abm: %d\n", is.ABM()); - printf("sse4a: %d\n", is.SSE4a()); - printf("xop: %d\n", is.XOP()); - printf("tbm: %d\n", is.TBM()); - printf("syscall: %d\n", is.SYSCALL()); - printf("mmxext: %d\n", is.MMXEXT()); - printf("rdtscp: %d\n", is.RDTSCP()); - printf("3dnowext: %d\n", is._3DNOWEXT()); - printf("3dnow: %d\n", is._3DNOW()); - printf("avx512_vbmi: %d\n", is.AVX512_VBMI()); - printf("avx512_vnni: %d\n", is.AVX512_VNNI()); - printf("avx512_fp16: %d\n", is.AVX512_FP16()); - printf("avx512_bf16: %d\n", is.AVX512_BF16()); - printf("amx_tile: %d\n", is.AMX_TILE()); - printf("amx_int8: %d\n", is.AMX_INT8()); - printf("amx_fp16: %d\n", is.AMX_FP16()); - printf("amx_bf16: %d\n", is.AMX_BF16()); -} -#endif - -static int ggml_backend_cpu_x86_score() { - // FIXME: this does not check for OS support - - int score = 1; - cpuid_x86 is; - -#ifdef GGML_FMA - if (!is.FMA()) { return 0; } - score += 1; -#endif -#ifdef GGML_F16C - if (!is.F16C()) { return 0; } - score += 1<<1; -#endif -#ifdef GGML_SSE42 - if (!is.SSE42()) { return 0; } - score += 1<<2; -#endif -#ifdef GGML_BMI2 - if (!is.BMI2()) { return 0; } - score += 1<<3; -#endif -#ifdef GGML_AVX - if (!is.AVX()) { return 0; } - score += 1<<4; -#endif -#ifdef GGML_AVX2 - if (!is.AVX2()) { return 0; } - score += 1<<5; -#endif -#ifdef GGML_AVX_VNNI - if (!is.AVX_VNNI()) { return 0; } - score += 1<<6; -#endif -#ifdef GGML_AVX512 - if (!is.AVX512F()) { return 0; } - if (!is.AVX512CD()) { return 0; } - if (!is.AVX512VL()) { return 0; } - if (!is.AVX512DQ()) { return 0; } - if (!is.AVX512BW()) { return 0; } - score += 1<<7; -#endif -#ifdef GGML_AVX512_VBMI - if (!is.AVX512_VBMI()) { return 0; } - score += 1<<8; -#endif -#ifdef GGML_AVX512_BF16 - if (!is.AVX512_BF16()) { return 0; } - score += 1<<9; -#endif -#ifdef GGML_AVX512_VNNI - if (!is.AVX512_VNNI()) { return 0; } - score += 1<<10; -#endif -#ifdef GGML_AMX_INT8 - if (!is.AMX_INT8()) { return 0; } - score += 1<<11; -#endif - - return score; -} - -GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_x86_score) - -#endif // defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) +#include "ggml-backend-impl.h" + +#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) + +#ifdef _MSC_VER +#include +#endif + +// On Darwin/macOS, AVX-512 context save is lazy: XCR0 bits 5-7 are not set +// until the process first executes an AVX-512 instruction, even on capable +// hardware. We query the OS via sysctl instead of reading XCR0 directly. +#if defined(__APPLE__) +#include +#endif + +#include +#include +#include +#include +#include + +// ref: https://cdrdv2-public.intel.com/782156/325383-sdm-vol-2abcd.pdf +struct cpuid_x86 { + bool SSE3(void) { return f_1_ecx[0]; } + bool PCLMULQDQ(void) { return f_1_ecx[1]; } + bool MONITOR(void) { return f_1_ecx[3]; } + bool SSSE3(void) { return f_1_ecx[9]; } + // FMA, F16C, AVX, AVX2, AVX_VNNI use YMM registers — require OS YMM save. + bool FMA(void) { return f_1_ecx[12] && os_saves_ymm(); } + bool CMPXCHG16B(void) { return f_1_ecx[13]; } + bool SSE41(void) { return f_1_ecx[19]; } + bool SSE42(void) { return f_1_ecx[20]; } + bool MOVBE(void) { return f_1_ecx[22]; } + bool POPCNT(void) { return f_1_ecx[23]; } + bool AES(void) { return f_1_ecx[25]; } + bool XSAVE(void) { return f_1_ecx[26]; } + bool OSXSAVE(void) { return f_1_ecx[27]; } + bool AVX(void) { return f_1_ecx[28] && os_saves_ymm(); } + bool F16C(void) { return f_1_ecx[29] && os_saves_ymm(); } + bool RDRAND(void) { return f_1_ecx[30]; } + + bool MSR(void) { return f_1_edx[5]; } + bool CX8(void) { return f_1_edx[8]; } + bool SEP(void) { return f_1_edx[11]; } + bool CMOV(void) { return f_1_edx[15]; } + bool CLFSH(void) { return f_1_edx[19]; } + bool MMX(void) { return f_1_edx[23]; } + bool FXSR(void) { return f_1_edx[24]; } + bool SSE(void) { return f_1_edx[25]; } + bool SSE2(void) { return f_1_edx[26]; } + + bool FSGSBASE(void) { return f_7_ebx[0]; } + bool BMI1(void) { return f_7_ebx[3]; } + bool HLE(void) { return is_intel && f_7_ebx[4]; } + bool AVX2(void) { return f_7_ebx[5] && os_saves_ymm(); } + bool BMI2(void) { return f_7_ebx[8]; } + bool ERMS(void) { return f_7_ebx[9]; } + bool INVPCID(void) { return f_7_ebx[10]; } + bool RTM(void) { return is_intel && f_7_ebx[11]; } + // All AVX-512 variants use ZMM registers — require OS ZMM save. + bool AVX512F(void) { return f_7_ebx[16] && os_saves_zmm(); } + bool AVX512DQ(void) { return f_7_ebx[17] && os_saves_zmm(); } + bool RDSEED(void) { return f_7_ebx[18]; } + bool ADX(void) { return f_7_ebx[19]; } + bool AVX512PF(void) { return f_7_ebx[26] && os_saves_zmm(); } + bool AVX512ER(void) { return f_7_ebx[27] && os_saves_zmm(); } + bool AVX512CD(void) { return f_7_ebx[28] && os_saves_zmm(); } + bool AVX512BW(void) { return f_7_ebx[30] && os_saves_zmm(); } + bool AVX512VL(void) { return f_7_ebx[31] && os_saves_zmm(); } + + bool SHA(void) { return f_7_ebx[29]; } + + bool PREFETCHWT1(void) { return f_7_ecx[0]; } + + bool LAHF(void) { return f_81_ecx[0]; } + bool LZCNT(void) { return is_intel && f_81_ecx[5]; } + bool ABM(void) { return is_amd && f_81_ecx[5]; } + bool SSE4a(void) { return is_amd && f_81_ecx[6]; } + bool XOP(void) { return is_amd && f_81_ecx[11]; } + bool TBM(void) { return is_amd && f_81_ecx[21]; } + + bool SYSCALL(void) { return is_intel && f_81_edx[11]; } + bool MMXEXT(void) { return is_amd && f_81_edx[22]; } + bool RDTSCP(void) { return is_intel && f_81_edx[27]; } + bool _3DNOWEXT(void) { return is_amd && f_81_edx[30]; } + bool _3DNOW(void) { return is_amd && f_81_edx[31]; } + + bool AVX512_VBMI(void) { return f_7_ecx[1] && os_saves_zmm(); } + bool AVX512_VNNI(void) { return f_7_ecx[11] && os_saves_zmm(); } + bool AVX512_FP16(void) { return f_7_edx[23] && os_saves_zmm(); } + bool AVX512_BF16(void) { return f_7_1_eax[5] && os_saves_zmm(); } + // AVX_VNNI uses VEX-encoded YMM instructions — require OS YMM save. + bool AVX_VNNI(void) { return f_7_1_eax[4] && os_saves_ymm(); } + + // AMX requires OS AMX tile state save (XCR0 bits 17 and 18). + bool AMX_TILE(void) { return f_7_edx[24] && os_saves_amx(); } + bool AMX_INT8(void) { return f_7_edx[25] && os_saves_amx(); } + bool AMX_FP16(void) { return f_7_1_eax[21] && os_saves_amx(); } + bool AMX_BF16(void) { return f_7_edx[22] && os_saves_amx(); } + +#ifdef _MSC_VER + static void cpuid(int cpu_info[4], int eax) { + __cpuid(cpu_info, eax); + } + static void cpuidex(int cpu_info[4], int eax, int ecx) { + __cpuidex(cpu_info, eax, ecx); + } + static uint64_t xgetbv(uint32_t xcr) { return _xgetbv(xcr); } +#else + static void cpuid(int cpu_info[4], int eax) { + __asm__ __volatile__( + "cpuid" + : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) + : "a"(eax), "c"(0)); + } + static void cpuidex(int cpu_info[4], int eax, int ecx) { + __asm__ __volatile__( + "cpuid" + : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) + : "a"(eax), "c"(ecx)); + } + static uint64_t xgetbv(uint32_t xcr) { + uint32_t lo, hi; + __asm__ __volatile__("xgetbv" : "=a"(lo), "=d"(hi) : "c"(xcr)); + return (static_cast(hi) << 32u) | lo; + } +#endif + + // Returns true when the OS saves YMM registers (required for AVX/AVX2/FMA/F16C/AVX_VNNI). + // Checks CPUID.1:ECX[27] (OSXSAVE) then XCR0[2:1] == 0b11 (SSE + YMM state). + bool os_saves_ymm(void) { + if (!f_1_ecx[27]) { return false; } // OSXSAVE bit not set + return (xgetbv(0u) & 0x6u) == 0x6u; // XCR0 bits 1 (SSE) and 2 (YMM) both set + } + + // Returns true when the OS saves ZMM registers (required for all AVX-512 variants). + // + // On Darwin/macOS, AVX-512 context save is lazy: XCR0 bits 5-7 remain clear + // until the process first executes an AVX-512 instruction, even on fully + // capable hardware. Reading XCR0 therefore gives a false negative before + // the first AVX-512 use. We query hw.optional.avx512f via sysctl instead, + // which reflects true hardware capability regardless of lazy-enable state. + // See: https://github.com/google/cpu_features/blob/main/src/impl_x86_macos.c + bool os_saves_zmm(void) { + if (!os_saves_ymm()) { return false; } +#if defined(__APPLE__) + int val = 0; + size_t len = sizeof(val); + return sysctlbyname("hw.optional.avx512f", &val, &len, nullptr, 0) == 0 && val != 0; +#else + return (xgetbv(0u) & 0xE0u) == 0xE0u; // XCR0 bits 5 (opmask), 6 (ZMM hi256), 7 (ZMM hi16) +#endif + } + + // Returns true when the OS saves AMX tile state (required for AMX-* instructions). + // Checks os_saves_zmm() then XCR0[18:17] == 0b11 (XTILECFG + XTILEDATA). + // Note: Intel AMX is not available on macOS hardware; os_saves_zmm() will + // return false on macOS, making this check safe on all platforms. + bool os_saves_amx(void) { + if (!os_saves_zmm()) { return false; } + return (xgetbv(0u) & 0x60000u) == 0x60000u; // XCR0 bits 17 and 18 + } + + cpuid_x86() { + std::array cpui; + std::vector> data; + + // calling __cpuid with 0x0 as the function_id argument + // gets the number of the highest valid function ID. + cpuid(cpui.data(), 0); + int n_ids = cpui[0]; + + for (int i = 0; i <= n_ids; ++i) { + cpuidex(cpui.data(), i, 0); + data.push_back(cpui); + } + + // capture vendor string + char vendor[0x20] = {}; + *reinterpret_cast(vendor) = data[0][1]; + *reinterpret_cast(vendor + 4) = data[0][3]; + *reinterpret_cast(vendor + 8) = data[0][2]; + this->vendor = vendor; + if (this->vendor == "GenuineIntel") { + is_intel = true; + } else if (this->vendor == "AuthenticAMD") { + is_amd = true; + } + + // load bitset with flags for function 0x00000001 + if (n_ids >= 1) { + f_1_ecx = data[1][2]; + f_1_edx = data[1][3]; + } + + // load bitset with flags for function 0x00000007 + if (n_ids >= 7) { + f_7_ebx = data[7][1]; + f_7_ecx = data[7][2]; + f_7_edx = data[7][3]; + cpuidex(cpui.data(), 7, 1); + f_7_1_eax = cpui[0]; + } + + // calling __cpuid with 0x80000000 as the function_id argument + // gets the number of the highest valid extended ID. + cpuid(cpui.data(), 0x80000000); + unsigned int n_ex_ids = cpui[0]; + + std::vector> ext_data; + for (unsigned int i = 0x80000000; i <= n_ex_ids; ++i) { + cpuidex(cpui.data(), i, 0); + ext_data.push_back(cpui); + } + + // load bitset with flags for function 0x80000001 + if (n_ex_ids >= 0x80000001) { + f_81_ecx = ext_data[1][2]; + f_81_edx = ext_data[1][3]; + } + + // interpret CPU brand string if reported + char brand[0x40] = {}; + if (n_ex_ids >= 0x80000004) { + std::memcpy(brand, ext_data[2].data(), sizeof(cpui)); + std::memcpy(brand + 16, ext_data[3].data(), sizeof(cpui)); + std::memcpy(brand + 32, ext_data[4].data(), sizeof(cpui)); + this->brand = brand; + } + } + + bool is_intel = false; + bool is_amd = false; + std::string vendor; + std::string brand; + std::bitset<32> f_1_ecx; + std::bitset<32> f_1_edx; + std::bitset<32> f_7_ebx; + std::bitset<32> f_7_ecx; + std::bitset<32> f_7_edx; + std::bitset<32> f_7_1_eax; + std::bitset<32> f_81_ecx; + std::bitset<32> f_81_edx; +}; + +#if 0 +void test_x86_is() { + cpuid_x86 is; + printf("CPU Vendor: %s\n", is.vendor.c_str()); + printf("Brand: %s\n", is.brand.c_str()); + printf("is_intel: %d\n", is.is_intel); + printf("is_amd: %d\n", is.is_amd); + printf("sse3: %d\n", is.SSE3()); + printf("pclmulqdq: %d\n", is.PCLMULQDQ()); + printf("ssse3: %d\n", is.SSSE3()); + printf("fma: %d\n", is.FMA()); + printf("cmpxchg16b: %d\n", is.CMPXCHG16B()); + printf("sse41: %d\n", is.SSE41()); + printf("sse42: %d\n", is.SSE42()); + printf("movbe: %d\n", is.MOVBE()); + printf("popcnt: %d\n", is.POPCNT()); + printf("aes: %d\n", is.AES()); + printf("xsave: %d\n", is.XSAVE()); + printf("osxsave: %d\n", is.OSXSAVE()); + printf("avx: %d\n", is.AVX()); + printf("f16c: %d\n", is.F16C()); + printf("rdrand: %d\n", is.RDRAND()); + printf("msr: %d\n", is.MSR()); + printf("cx8: %d\n", is.CX8()); + printf("sep: %d\n", is.SEP()); + printf("cmov: %d\n", is.CMOV()); + printf("clflush: %d\n", is.CLFSH()); + printf("mmx: %d\n", is.MMX()); + printf("fxsr: %d\n", is.FXSR()); + printf("sse: %d\n", is.SSE()); + printf("sse2: %d\n", is.SSE2()); + printf("fsgsbase: %d\n", is.FSGSBASE()); + printf("bmi1: %d\n", is.BMI1()); + printf("hle: %d\n", is.HLE()); + printf("avx2: %d\n", is.AVX2()); + printf("bmi2: %d\n", is.BMI2()); + printf("erms: %d\n", is.ERMS()); + printf("invpcid: %d\n", is.INVPCID()); + printf("rtm: %d\n", is.RTM()); + printf("avx512f: %d\n", is.AVX512F()); + printf("rdseed: %d\n", is.RDSEED()); + printf("adx: %d\n", is.ADX()); + printf("avx512pf: %d\n", is.AVX512PF()); + printf("avx512er: %d\n", is.AVX512ER()); + printf("avx512cd: %d\n", is.AVX512CD()); + printf("sha: %d\n", is.SHA()); + printf("prefetchwt1: %d\n", is.PREFETCHWT1()); + printf("lahf: %d\n", is.LAHF()); + printf("lzcnt: %d\n", is.LZCNT()); + printf("abm: %d\n", is.ABM()); + printf("sse4a: %d\n", is.SSE4a()); + printf("xop: %d\n", is.XOP()); + printf("tbm: %d\n", is.TBM()); + printf("syscall: %d\n", is.SYSCALL()); + printf("mmxext: %d\n", is.MMXEXT()); + printf("rdtscp: %d\n", is.RDTSCP()); + printf("3dnowext: %d\n", is._3DNOWEXT()); + printf("3dnow: %d\n", is._3DNOW()); + printf("avx512_vbmi: %d\n", is.AVX512_VBMI()); + printf("avx512_vnni: %d\n", is.AVX512_VNNI()); + printf("avx512_fp16: %d\n", is.AVX512_FP16()); + printf("avx512_bf16: %d\n", is.AVX512_BF16()); + printf("amx_tile: %d\n", is.AMX_TILE()); + printf("amx_int8: %d\n", is.AMX_INT8()); + printf("amx_fp16: %d\n", is.AMX_FP16()); + printf("amx_bf16: %d\n", is.AMX_BF16()); +} +#endif + +static int ggml_backend_cpu_x86_score() { + int score = 1; + cpuid_x86 is; + +#ifdef GGML_FMA + if (!is.FMA()) { return 0; } + score += 1; +#endif +#ifdef GGML_F16C + if (!is.F16C()) { return 0; } + score += 1<<1; +#endif +#ifdef GGML_SSE42 + if (!is.SSE42()) { return 0; } + score += 1<<2; +#endif +#ifdef GGML_BMI2 + if (!is.BMI2()) { return 0; } + score += 1<<3; +#endif +#ifdef GGML_AVX + if (!is.AVX()) { return 0; } + score += 1<<4; +#endif +#ifdef GGML_AVX2 + if (!is.AVX2()) { return 0; } + score += 1<<5; +#endif +#ifdef GGML_AVX_VNNI + if (!is.AVX_VNNI()) { return 0; } + score += 1<<6; +#endif +#ifdef GGML_AVX512 + if (!is.AVX512F()) { return 0; } + if (!is.AVX512CD()) { return 0; } + if (!is.AVX512VL()) { return 0; } + if (!is.AVX512DQ()) { return 0; } + if (!is.AVX512BW()) { return 0; } + score += 1<<7; +#endif +#ifdef GGML_AVX512_VBMI + if (!is.AVX512_VBMI()) { return 0; } + score += 1<<8; +#endif +#ifdef GGML_AVX512_BF16 + if (!is.AVX512_BF16()) { return 0; } + score += 1<<9; +#endif +#ifdef GGML_AVX512_VNNI + if (!is.AVX512_VNNI()) { return 0; } + score += 1<<10; +#endif +#ifdef GGML_AMX_INT8 + if (!is.AMX_INT8()) { return 0; } + score += 1<<11; +#endif + + return score; +} + +GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_x86_score) + +#endif // defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))