diff --git a/include/xsimd/config/xsimd_config.hpp b/include/xsimd/config/xsimd_config.hpp index e81dd8053..49af1b179 100644 --- a/include/xsimd/config/xsimd_config.hpp +++ b/include/xsimd/config/xsimd_config.hpp @@ -33,6 +33,17 @@ * @defgroup xsimd_config_macro Instruction Set Detection */ +/** + * @ingroup xsimd_config_macro + * + * Set to 1 if the target is the x86 architecture family. + */ +#if defined(__x86_64__) || defined(__i386__) || defined(_M_AMD64) || defined(_M_IX86) +#define XSIMD_TARGET_X86 1 +#else +#define XSIMD_TARGET_X86 0 +#endif + /** * @ingroup xsimd_config_macro * diff --git a/include/xsimd/config/xsimd_cpu_features_x86.hpp b/include/xsimd/config/xsimd_cpu_features_x86.hpp new file mode 100644 index 000000000..76564d5f7 --- /dev/null +++ b/include/xsimd/config/xsimd_cpu_features_x86.hpp @@ -0,0 +1,584 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_CPU_FEATURES_X86_HPP +#define XSIMD_CPU_FEATURES_X86_HPP + +#include +#include +#include +#include + +#include "../utils/bits.hpp" +#include "./xsimd_config.hpp" + +#if XSIMD_TARGET_X86 && defined(_MSC_VER) +#include // Contains the definition of __cpuidex +#endif + +namespace xsimd +{ + namespace detail + { + using x86_reg32_t = std::uint32_t; + + using cpuid_reg_t = std::array; + + /** + * CPU Identification (CPUID) instruction results. + * + * The CPUID instruction provides detailed information about the processor, + * including supported instruction set extensions (SSE, AVX, AVX-512, etc.). + * This function is well defined on all architectures but will return all zeros + * on all non-x86 architectures. + * + * @param leaf The value inputted to the EAX register. + * @param subleaf The value inputted to the ECX register. + * + * @see https://en.wikipedia.org/wiki/CPUID + */ + inline cpuid_reg_t x86_cpuid(int leaf, int subleaf = 0) noexcept; + + inline x86_reg32_t x86_xcr0_low() noexcept; + + template + using x86_reg32_bitset = utils::uint_bitset; + + template + class x86_cpuid_regs : private x86_reg32_bitset, x86_reg32_bitset, x86_reg32_bitset, x86_reg32_bitset + { + private: + using eax_bitset = x86_reg32_bitset; + using ebx_bitset = x86_reg32_bitset; + using ecx_bitset = x86_reg32_bitset; + using edx_bitset = x86_reg32_bitset; + + /* Parse CPUINFO register value into individual bit components.*/ + constexpr explicit x86_cpuid_regs(const cpuid_reg_t& regs) noexcept + : eax_bitset(regs[0]) + , ebx_bitset(regs[1]) + , ecx_bitset(regs[2]) + , edx_bitset(regs[3]) + { + } + + public: + using eax = A; + using ebx = B; + using ecx = C; + using edx = D; + + inline static x86_cpuid_regs read() + { + return x86_cpuid_regs(detail::x86_cpuid(leaf, subleaf)); + } + + constexpr x86_cpuid_regs() noexcept = default; + + using eax_bitset::all_bits_set; + using ebx_bitset::all_bits_set; + using ecx_bitset::all_bits_set; + using edx_bitset::all_bits_set; + }; + + template + using make_x86_cpuid_regs = x86_cpuid_regs; + } + + struct x86_cpuid_leaf1_traits + { + static constexpr detail::x86_reg32_t leaf = 1; + static constexpr detail::x86_reg32_t subleaf = 0; + + enum class eax + { + }; + enum class ebx + { + }; + enum class ecx + { + /* Streaming SIMD Extensions 3. */ + sse3 = 0, + /* Supplemental Streaming SIMD Extensions 3. */ + ssse3 = 9, + /* Fused multiply-add with 3 operands (FMA3). */ + fma3 = 12, + /* Streaming SIMD Extensions 4.1. */ + sse4_1 = 19, + /* Streaming SIMD Extensions 4.2. */ + sse4_2 = 20, + /* OS has enabled XSAVE/XRSTOR for extended processor state management. */ + osxsave = 27, + /* Advanced Vector Extensions (256-bit SIMD). */ + avx = 28, + }; + enum class edx + { + /* Streaming SIMD Extensions 2. */ + sse2 = 26, + }; + }; + + /** + * Processor Info and Feature Bits. + * + * Utility class that can read and parse the registers for the first leaf level + * of the CPUID instruction. + * This is well defined on all architectures but will return all false on all + * non-x86 architectures. + * + * @see https://en.wikipedia.org/wiki/CPUID + */ + using x86_cpuid_leaf1 = detail::make_x86_cpuid_regs; + + struct x86_cpuid_leaf7_traits + { + static constexpr detail::x86_reg32_t leaf = 7; + static constexpr detail::x86_reg32_t subleaf = 0; + + enum class eax + { + }; + enum class ebx + { + /* Advanced Vector Extensions 2 (integer 256-bit SIMD). */ + avx2 = 5, + /* AVX-512 Foundation instructions. */ + avx512f = 16, + /* AVX-512 Doubleword and Quadword instructions. */ + avx512dq = 17, + /* AVX-512 Integer Fused Multiply-Add instructions. */ + avx512ifma = 21, + /* AVX-512 Prefetch instructions. */ + avx512pf = 26, + /* AVX-512 Exponential and Reciprocal instructions. */ + avx512er = 27, + /* AVX-512 Conflict Detection instructions. */ + avx512cd = 28, + /* AVX-512 Byte and Word instructions. */ + avx512bw = 30, + }; + enum class ecx + { + /* AVX-512 Vector Bit Manipulation instructions. */ + avx512vbmi = 1, + /* AVX-512 Vector Bit Manipulation instructions 2. */ + avx512vbmi2 = 6, + /* AVX-512 Vector Neural Network instructions. */ + avx512vnni_bw = 11, + }; + enum class edx + { + }; + }; + + /** + * Extended Feature Bits (EAX=7, ECX=0). + * + * Utility class that can read and parse the registers for the extended + * feature bits leaf of the CPUID instruction. + * This is well defined on all architectures but will return all false on all + * non-x86 architectures. + * + * @see https://en.wikipedia.org/wiki/CPUID + */ + using x86_cpuid_leaf7 = detail::make_x86_cpuid_regs; + + struct x86_cpuid_leaf7sub1_traits + { + static constexpr detail::x86_reg32_t leaf = 7; + static constexpr detail::x86_reg32_t subleaf = 1; + + enum class eax + { + /* AVX (VEX-encoded) Vector Neural Network instructions. */ + avxvnni = 4, + }; + enum class ebx + { + }; + enum class ecx + { + }; + enum class edx + { + }; + }; + + /** + * Extended Feature Bits (EAX=7, ECX=1). + * + * Utility class that can read and parse the registers for the extended + * feature bits, subleaf 1, of the CPUID instruction. + * This is well defined on all architectures but will return all false on all + * non-x86 architectures. + * + * @see https://en.wikipedia.org/wiki/CPUID + */ + using x86_cpuid_leaf7sub1 = detail::make_x86_cpuid_regs; + + struct x86_cpuid_leaf80000001_traits + { + static constexpr detail::x86_reg32_t leaf = 0x80000001; + static constexpr detail::x86_reg32_t subleaf = 0; + + enum class eax + { + }; + enum class ebx + { + }; + enum class ecx + { + /* AMD Fused multiply-add with 4 operands (FMA4). */ + fma4 = 16, + }; + enum class edx + { + }; + }; + + /** + * Extended Processor Info and Feature Bits. + * + * Utility class that can read and parse the registers for the extended + * processor info leaf of the CPUID instruction. + * This is well defined on all architectures but will return all false on all + * non-x86 architectures. + * + * @see https://en.wikipedia.org/wiki/CPUID + */ + using x86_cpuid_leaf80000001 = detail::make_x86_cpuid_regs; + + /* + * Extended Control Register 0 (XCR0). + * + * Operating systems can explicitly disable the usage of instruction set (such + * as SSE or AVX extensions) by setting an appropriate flag in XCR0 register. + * This utility parses such bit values. + * + * @see https://docs.kernel.org/admin-guide/hw-vuln/gather_data_sampling.html + */ + class x86_xcr0 + { + public: + enum class xcr0 + { + /** x87 FPU/MMX support (must be 1). */ + x87 = 0, + /** XSAVE support for MXCSR and XMM registers. */ + sse = 1, + /** AVX enabled and XSAVE support for upper halves of YMM registers. */ + avx = 2, + /** MPX enabled and XSAVE support for BND0-BND3 registers. */ + bndreg = 3, + /** MPX enabled and XSAVE support for BNDCFGU and BNDSTATUS registers. */ + bndcsr = 4, + /** AVX-512 enabled and XSAVE support for opmask registers k0-k7. */ + opmask = 5, + /** AVX-512 enabled and XSAVE support for upper halves of lower ZMM registers. */ + zmm_hi256 = 6, + /** AVX-512 enabled and XSAVE support for upper ZMM registers. */ + hi16_zmm = 7, + /** Saving/restoring Intel Processor Trace state via XSAVE enabled.*/ + processor_trace = 8, + /** XSAVE support for PKRU register. */ + pkru = 9, + }; + + /** + * Create a default value with only SSE enabled. + * + * AVX and AVX512 strictly require OSXSAVE to be enabled by the OS. + * If OSXSAVE is disabled (e.g., via bcdedit /set xsavedisable 1), AVX state won't + * be preserved across context switches, so AVX cannot be used. + * SSE is therefore the only value safe to assume. + */ + constexpr static x86_xcr0 safe_default() noexcept + { + x86_reg32_t low = {}; + low = utils::make_bit_mask(static_cast(xcr0::sse)); + return x86_xcr0(low); + } + + /** + * Read the XCR0 register from the CPU if on the correct architecture. + * + * This is only safe to call if bit 18 of CR4.OSXSAVE has been set. + * + * @see cpu_id::osxsave + */ + inline static x86_xcr0 read() + { + assert(x86_cpuid_leaf1::read().all_bits_set()); + return x86_xcr0(detail::x86_xcr0_low()); + } + + template + constexpr bool all_bits_set() const noexcept + { + return m_low.all_bits_set(); + } + + /** Create a value which return false to everything. */ + constexpr x86_xcr0() noexcept = default; + + private: + using x86_reg32_t = detail::x86_reg32_t; + + using xcr0_reg_t = detail::x86_reg32_bitset; + + /** Parse a XCR0 value into individual components. */ + constexpr explicit x86_xcr0(x86_reg32_t low) noexcept + : m_low(low) + { + } + + xcr0_reg_t m_low {}; + }; + + /** + * An opiniated CPU feature detection utility for x86. + * + * These are high level features that combine multiple registers reads in sequence. + * Instead of looking directly at raw CPUID results, this utility also checks that + * permissions (e.g. OSXSAVE) are enabled, and otherwise return conservative defaults. + * + * This is well defined on all architectures. It will always return false on + * non-x86 architectures. + */ + class x86_cpu_features + { + public: + x86_cpu_features() noexcept = default; + + inline bool sse_enabled() const noexcept + { + return xcr0().all_bits_set(); + } + + inline bool avx_enabled() const noexcept + { + // Check both SSE and AVX bits even though AVX must imply SSE + return xcr0().all_bits_set(); + } + + inline bool avx512_enabled() const noexcept + { + // Check all SSE, AVX, and AVX512 bits even though AVX512 must imply AVX and SSE + return xcr0().all_bits_set(); + } + + /** + * Indicates whether the OS has enabled extended state management. + * + * When true, the OS has set bit 18 (OSXSAVE) in the CR4 control register, + * enabling the XGETBV/XSETBV instructions to access XCR0 and support + * processor extended state management using XSAVE/XRSTOR. + * + * This value is read from CPUID leaf 0x1, ECX bit 27, which reflects + * the state of CR4.OSXSAVE. + */ + inline bool osxsave() const noexcept { return leaf1().all_bits_set(); } + + inline bool sse2() const noexcept { return sse_enabled() && leaf1().all_bits_set(); } + + inline bool sse3() const noexcept { return sse_enabled() && leaf1().all_bits_set(); } + + inline bool ssse3() const noexcept { return sse_enabled() && leaf1().all_bits_set(); } + + inline bool sse4_1() const noexcept { return sse_enabled() && leaf1().all_bits_set(); } + + inline bool sse4_2() const noexcept { return sse_enabled() && leaf1().all_bits_set(); } + + inline bool fma3() const noexcept { return sse_enabled() && leaf1().all_bits_set(); } + + inline bool avx() const noexcept { return avx_enabled() && leaf1().all_bits_set(); } + + inline bool avx2() const noexcept { return avx_enabled() && leaf7().all_bits_set(); } + + inline bool avx512f() const noexcept { return avx512_enabled() && leaf7().all_bits_set(); } + + inline bool avx512dq() const noexcept { return avx512_enabled() && leaf7().all_bits_set(); } + + inline bool avx512ifma() const noexcept { return avx512_enabled() && leaf7().all_bits_set(); } + + inline bool avx512pf() const noexcept { return avx512_enabled() && leaf7().all_bits_set(); } + + inline bool avx512er() const noexcept { return avx512_enabled() && leaf7().all_bits_set(); } + + inline bool avx512cd() const noexcept { return avx512_enabled() && leaf7().all_bits_set(); } + + inline bool avx512bw() const noexcept { return avx512_enabled() && leaf7().all_bits_set(); } + + inline bool avx512vbmi() const noexcept { return avx512_enabled() && leaf7().all_bits_set(); } + + inline bool avx512vbmi2() const noexcept { return avx512_enabled() && leaf7().all_bits_set(); } + + inline bool avx512vnni_bw() const noexcept { return avx512_enabled() && leaf7().all_bits_set(); } + + inline bool avxvnni() const noexcept { return avx_enabled() && leaf7sub1().all_bits_set(); } + + inline bool fma4() const noexcept { return avx_enabled() && leaf80000001().all_bits_set(); } + + private: + enum class status + { + leaf1_valid = 0, + leaf7_valid = 1, + leaf7sub1_valid = 2, + leaf80000001_valid = 3, + xcr0_valid = 4, + }; + + using status_bitset = utils::uint_bitset; + + mutable x86_xcr0 m_xcr0 {}; + mutable x86_cpuid_leaf1 m_leaf1 {}; + mutable x86_cpuid_leaf7 m_leaf7 {}; + mutable x86_cpuid_leaf7sub1 m_leaf7sub1 {}; + mutable x86_cpuid_leaf80000001 m_leaf80000001 {}; + mutable status_bitset m_status {}; + + inline x86_xcr0 const& xcr0() const noexcept + { + if (!m_status.bit_is_set()) + { + m_xcr0 = osxsave() ? x86_xcr0::read() : x86_xcr0::safe_default(); + m_status.set_bit(); + } + return m_xcr0; + } + + inline x86_cpuid_leaf1 const& leaf1() const + { + if (!m_status.bit_is_set()) + { + m_leaf1 = x86_cpuid_leaf1::read(); + m_status.set_bit(); + } + return m_leaf1; + } + + inline x86_cpuid_leaf7 const& leaf7() const + { + if (!m_status.bit_is_set()) + { + m_leaf7 = x86_cpuid_leaf7::read(); + m_status.set_bit(); + } + return m_leaf7; + } + + inline x86_cpuid_leaf7sub1 const& leaf7sub1() const + { + if (!m_status.bit_is_set()) + { + m_leaf7sub1 = x86_cpuid_leaf7sub1::read(); + m_status.set_bit(); + } + return m_leaf7sub1; + } + + inline x86_cpuid_leaf80000001 const& leaf80000001() const + { + if (!m_status.bit_is_set()) + { + m_leaf80000001 = x86_cpuid_leaf80000001::read(); + m_status.set_bit(); + } + return m_leaf80000001; + } + }; + + namespace detail + { +#if XSIMD_TARGET_X86 + + inline cpuid_reg_t x86_cpuid(int leaf, int subleaf) noexcept + { + cpuid_reg_t reg = {}; +#if defined(_MSC_VER) + int buf[4]; + __cpuidex(buf, leaf, subleaf); + std::memcpy(reg.data(), buf, sizeof(buf)); + +#elif defined(__INTEL_COMPILER) + int buf[4]; + __cpuid(buf, leaf); + std::memcpy(reg.data(), buf, sizeof(buf)); + +#elif defined(__GNUC__) || defined(__clang__) + +#if defined(__i386__) && defined(__PIC__) + // %ebx may be the PIC register + __asm__("xchg{l}\t{%%}ebx, %1\n\t" + "cpuid\n\t" + "xchg{l}\t{%%}ebx, %1\n\t" + : "=a"(reg[0]), "=r"(reg[1]), "=c"(reg[2]), "=d"(reg[3]) + : "0"(leaf), "2"(subleaf)); + +#else + __asm__("cpuid\n\t" + : "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]), "=d"(reg[3]) + : "0"(leaf), "2"(subleaf)); +#endif +#endif + return reg; + } + + inline x86_reg32_t x86_xcr0_low() noexcept + { +#if defined(_MSC_VER) +#if _MSC_VER >= 1400 + return static_cast(_xgetbv(0)); +#else +#error "_MSC_VER < 1400 is not supported" +#endif + +#elif defined(__GNUC__) + x86_reg32_t xcr0 = {}; + __asm__( + "xorl %%ecx, %%ecx\n" + "xgetbv\n" + : "=a"(xcr0) + : +#if defined(__i386__) + : "ecx", "edx" +#else + : "rcx", "rdx" +#endif + ); + return xcr0; +#endif + } + +#else // XSIMD_TARGET_X86 + + inline cpuid_reg_t x86_cpuid(int /* leaf */, int /* subleaf */) noexcept + { + return {}; // All bits to zero + } + + inline x86_reg32_t x86_xcr0_low() noexcept + { + return {}; // All bits to zero + } + +#endif // XSIMD_TARGET_X86 + } +} +#endif diff --git a/include/xsimd/config/xsimd_cpuid.hpp b/include/xsimd/config/xsimd_cpuid.hpp index 1be4f018a..6871637e3 100644 --- a/include/xsimd/config/xsimd_cpuid.hpp +++ b/include/xsimd/config/xsimd_cpuid.hpp @@ -12,8 +12,9 @@ #ifndef XSIMD_CPUID_HPP #define XSIMD_CPUID_HPP -#include -#include +#include "../types/xsimd_all_registers.hpp" +#include "./xsimd_cpu_features_x86.hpp" +#include "xsimd_inline.hpp" #if defined(__linux__) && (defined(__ARM_NEON) || defined(_M_ARM) || defined(__riscv_vector)) #include @@ -25,13 +26,6 @@ #endif -#if defined(_MSC_VER) -// Contains the definition of __cpuidex -#include -#endif - -#include "../types/xsimd_all_registers.hpp" - namespace xsimd { namespace detail @@ -40,7 +34,7 @@ namespace xsimd { #define ARCH_FIELD_EX(arch, field_name) \ - unsigned field_name; \ + unsigned field_name = 0; \ XSIMD_INLINE bool has(::xsimd::arch) const { return this->field_name; } #define ARCH_FIELD_EX_REUSE(arch, field_name) \ @@ -90,8 +84,6 @@ namespace xsimd XSIMD_INLINE supported_arch() noexcept { - memset(this, 0, sizeof(supported_arch)); - #if XSIMD_WITH_WASM wasm = 1; #endif @@ -122,138 +114,38 @@ namespace xsimd #endif rvv = bool(getauxval(AT_HWCAP) & HWCAP_V); #endif - -#elif defined(__x86_64__) || defined(__i386__) || defined(_M_AMD64) || defined(_M_IX86) - - auto get_xcr0_low = []() noexcept - { - uint32_t xcr0; - -#if defined(_MSC_VER) && _MSC_VER >= 1400 - - xcr0 = (uint32_t)_xgetbv(0); - -#elif defined(__GNUC__) - - __asm__( - "xorl %%ecx, %%ecx\n" - "xgetbv\n" - : "=a"(xcr0) - : -#if defined(__i386__) - : "ecx", "edx" -#else - : "rcx", "rdx" #endif - ); - -#else /* _MSC_VER < 1400 */ -#error "_MSC_VER < 1400 is not supported" -#endif /* _MSC_VER && _MSC_VER >= 1400 */ - return xcr0; - }; - - auto get_cpuid = [](int reg[4], int level, int count = 0) noexcept - { + // Safe on all platforms, it will be all false if non x86. + const auto x86_cpu = xsimd::x86_cpu_features(); -#if defined(_MSC_VER) - __cpuidex(reg, level, count); + sse2 = x86_cpu.sse2(); + sse3 = x86_cpu.sse3(); + ssse3 = x86_cpu.ssse3(); + sse4_1 = x86_cpu.sse4_1(); + sse4_2 = x86_cpu.sse4_2(); + fma3_sse42 = x86_cpu.fma3(); -#elif defined(__INTEL_COMPILER) - __cpuid(reg, level); - -#elif defined(__GNUC__) || defined(__clang__) - -#if defined(__i386__) && defined(__PIC__) - // %ebx may be the PIC register - __asm__("xchg{l}\t{%%}ebx, %1\n\t" - "cpuid\n\t" - "xchg{l}\t{%%}ebx, %1\n\t" - : "=a"(reg[0]), "=r"(reg[1]), "=c"(reg[2]), "=d"(reg[3]) - : "0"(level), "2"(count)); - -#else - __asm__("cpuid\n\t" - : "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]), "=d"(reg[3]) - : "0"(level), "2"(count)); -#endif + // sse4a not implemented in cpu_id yet + // xop not implemented in cpu_id yet -#else -#error "Unsupported configuration" -#endif - }; - - int regs1[4]; - - get_cpuid(regs1, 0x1); - - // OS can explicitly disable the usage of SSE/AVX extensions - // by setting an appropriate flag in CR0 register - // - // https://docs.kernel.org/admin-guide/hw-vuln/gather_data_sampling.html - - unsigned sse_state_os_enabled = 1; - // AVX and AVX512 strictly require OSXSAVE to be enabled by the OS. - // If OSXSAVE is disabled (e.g., via bcdedit /set xsavedisable 1), - // AVX state won't be preserved across context switches, so AVX cannot be used. - unsigned avx_state_os_enabled = 0; - unsigned avx512_state_os_enabled = 0; - - // OSXSAVE: A value of 1 indicates that the OS has set CR4.OSXSAVE[bit - // 18] to enable XSETBV/XGETBV instructions to access XCR0 and - // to support processor extended state management using - // XSAVE/XRSTOR. - bool osxsave = regs1[2] >> 27 & 1; - if (osxsave) - { - - uint32_t xcr0 = get_xcr0_low(); - - sse_state_os_enabled = xcr0 >> 1 & 1; - avx_state_os_enabled = xcr0 >> 2 & sse_state_os_enabled; - avx512_state_os_enabled = xcr0 >> 6 & avx_state_os_enabled; - } - - sse2 = regs1[3] >> 26 & sse_state_os_enabled; - sse3 = regs1[2] >> 0 & sse_state_os_enabled; - ssse3 = regs1[2] >> 9 & sse_state_os_enabled; - sse4_1 = regs1[2] >> 19 & sse_state_os_enabled; - sse4_2 = regs1[2] >> 20 & sse_state_os_enabled; - fma3_sse42 = regs1[2] >> 12 & sse_state_os_enabled; - - avx = regs1[2] >> 28 & avx_state_os_enabled; + avx = x86_cpu.avx(); fma3_avx = avx && fma3_sse42; - - int regs8[4]; - get_cpuid(regs8, 0x80000001); - fma4 = regs8[2] >> 16 & avx_state_os_enabled; - - // sse4a = regs[2] >> 6 & 1; - - // xop = regs[2] >> 11 & 1; - - int regs7[4]; - get_cpuid(regs7, 0x7); - avx2 = regs7[1] >> 5 & avx_state_os_enabled; - - int regs7a[4]; - get_cpuid(regs7a, 0x7, 0x1); - avxvnni = regs7a[0] >> 4 & avx_state_os_enabled; - + fma4 = x86_cpu.fma4(); + avx2 = x86_cpu.avx2(); + avxvnni = x86_cpu.avxvnni(); fma3_avx2 = avx2 && fma3_sse42; - avx512f = regs7[1] >> 16 & avx512_state_os_enabled; - avx512cd = regs7[1] >> 28 & avx512_state_os_enabled; - avx512dq = regs7[1] >> 17 & avx512_state_os_enabled; - avx512bw = regs7[1] >> 30 & avx512_state_os_enabled; - avx512er = regs7[1] >> 27 & avx512_state_os_enabled; - avx512pf = regs7[1] >> 26 & avx512_state_os_enabled; - avx512ifma = regs7[1] >> 21 & avx512_state_os_enabled; - avx512vbmi = regs7[2] >> 1 & avx512_state_os_enabled; - avx512vbmi2 = regs7[2] >> 6 & avx512_state_os_enabled; - avx512vnni_bw = regs7[2] >> 11 & avx512_state_os_enabled; + avx512f = x86_cpu.avx512f(); + avx512cd = x86_cpu.avx512cd(); + avx512dq = x86_cpu.avx512dq(); + avx512bw = x86_cpu.avx512bw(); + avx512er = x86_cpu.avx512er(); + avx512pf = x86_cpu.avx512pf(); + avx512ifma = x86_cpu.avx512ifma(); + avx512vbmi = x86_cpu.avx512vbmi(); + avx512vbmi2 = x86_cpu.avx512vbmi2(); + avx512vnni_bw = x86_cpu.avx512vnni_bw(); avx512vnni_vbmi2 = avx512vbmi2 && avx512vnni_bw; -#endif } }; } // namespace detail diff --git a/include/xsimd/types/xsimd_common_arch.hpp b/include/xsimd/types/xsimd_common_arch.hpp index 28491aeda..a33c868ea 100644 --- a/include/xsimd/types/xsimd_common_arch.hpp +++ b/include/xsimd/types/xsimd_common_arch.hpp @@ -12,6 +12,8 @@ #ifndef XSIMD_COMMON_ARCH_HPP #define XSIMD_COMMON_ARCH_HPP +#include + #include "../config/xsimd_config.hpp" /** diff --git a/include/xsimd/types/xsimd_register.hpp b/include/xsimd/types/xsimd_register.hpp index 018418af6..b14962e5b 100644 --- a/include/xsimd/types/xsimd_register.hpp +++ b/include/xsimd/types/xsimd_register.hpp @@ -14,6 +14,8 @@ #include +#include "../config/xsimd_inline.hpp" + namespace xsimd { namespace types diff --git a/include/xsimd/utils/bits.hpp b/include/xsimd/utils/bits.hpp new file mode 100644 index 000000000..ffa09f8e7 --- /dev/null +++ b/include/xsimd/utils/bits.hpp @@ -0,0 +1,91 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ***************************************************************************/ + +#ifndef XSIMD_CPUID_UTILS_HPP +#define XSIMD_CPUID_UTILS_HPP + +#include +namespace xsimd +{ + namespace utils + { + template + constexpr I make_bit_mask(I bit) + { + assert(bit >= 0); + assert(bit < static_cast(8 * sizeof(I))); + return static_cast(I { 1 } << bit); + } + + template + constexpr I make_bit_mask(I bit, Args... bits) + { + // TODO(C++17): Use fold expression + return make_bit_mask(bit) | make_bit_mask(static_cast(bits)...); + } + + template + constexpr bool all_bits_set(I value) + { + constexpr I mask = make_bit_mask(static_cast(Bits)...); + return (value & mask) == mask; + } + + template + constexpr I set_bit(I value) + { + constexpr I mask = make_bit_mask(static_cast(Bit)); + return value | mask; + } + + /* A bitset over an unsigned integer type, indexed by an enum key type. */ + template + struct uint_bitset + { + /* The underlying unsigned integer type storing the bits. */ + using storage_type = U; + /* The enum type whose values name individual bits. */ + using key_type = K; + + /* Construct from a raw bit pattern. */ + constexpr explicit uint_bitset(storage_type bitset = {}) noexcept + : m_bitset(bitset) + { + } + + /* Return true if every bit named by the template arguments is set. */ + template + constexpr bool all_bits_set() const noexcept + { + return utils::all_bits_set(bits)...>(m_bitset); + } + + /* Return true if the bit is set. */ + template + constexpr bool bit_is_set() const noexcept + { + return all_bits_set(); + } + + /* Set the corresponding bit to true in the bitfield. */ + template + constexpr void set_bit() noexcept + { + m_bitset = utils::set_bit(bit)>(m_bitset); + } + + private: + storage_type m_bitset = { 0 }; + }; + } +} + +#endif diff --git a/test/check_inline_specifier.sh b/test/check_inline_specifier.sh index 1ccdda130..2337b3707 100755 --- a/test/check_inline_specifier.sh +++ b/test/check_inline_specifier.sh @@ -3,7 +3,7 @@ # Usage: $0 top_srcdir # # This script walks all headers in $top_srcdir/include and makes sure that all -# functions declared tehre are marked as inline or constexpr (which implies +# functions declared there are marked as inline or constexpr (which implies # inline). This makes sure the xsimd headers does not define symbol with global # linkage, and somehow convey our itnent to have all functions in xsimd being # inlined by the compiler.