Files
UnrealEngine/Engine/Source/Runtime/RadAudioCodec/SDK/Src/RadAudio/cpux86.cpp
2025-05-18 13:04:45 +08:00

206 lines
6.3 KiB
C++

// Copyright Epic Games Tools, LLC. All Rights Reserved.
// This source file is licensed solely to users who have
// accepted a valid Unreal Engine license agreement
// (see e.g., https://www.unrealengine.com/eula), and use
// of this source file is governed by such agreement.
#include "cpux86.h"
#ifdef __RADX86__
#ifdef _MSC_VER
#include <intrin.h>
#if _MSC_VER >= 1500 // VC++2008 or later
#define HAVE_CPUIDEX
#endif
// Reads the extended control register selected by xcr (0 = XCR0).
//
// The _xgetbv intrinsic only exists from VC++2010 SP1 onwards, so it gets its own,
// stricter version check. On older compilers we report 0 ("no extended state enabled
// by the OS"), which makes the detection code treat AVX and later extensions as
// unavailable instead of failing to compile.
#if defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219 // VC++2010 SP1 or later
static inline U64 xgetbv(U32 xcr)
{
	return _xgetbv(xcr);
}
#else
static inline U64 xgetbv(U32 xcr)
{
	(void)xcr; // no way to query it with this compiler
	return 0;
}
#endif
#else
// GCC/Clang
#ifdef __RADX64__
// 64-bit: GCC/Clang won't let us use "=b" constraint on Mac64, and we need to preserve RBX
// (PIC/PIE base)
// Executes CPUID for the given leaf (EAX) and sub-leaf (ECX) and stores the resulting
// EAX, EBX, ECX, EDX into out[0..3], mirroring the MSVC intrinsic of the same name.
// RBX is swapped with a scratch register around the instruction so that its value is
// the same before and after the asm statement.
// The expansion deliberately has no trailing semicolon: the caller supplies it, which
// keeps the macro usable as the single statement of an unbraced if/else.
#define __cpuidex(out, leaf_id, subleaf_id)\
	__asm__("xchgq %%rbx,%q1\n" \
		"cpuid\n" \
		"xchgq %%rbx,%q1\n" \
		: "=a" ((out)[0]), "=&r" ((out)[1]), "=c" ((out)[2]), "=d" ((out)[3]) \
		: "0" (leaf_id), "2" (subleaf_id))
#else
#error "64 bit only supported"
#endif // __RADX64__
// Sub-leaf CPUID queries (__cpuidex) are available with this compiler.
#define HAVE_CPUIDEX
// Plain CPUID query: identical to __cpuidex with sub-leaf 0.
#define __cpuid(out, leaf_id) __cpuidex(out, leaf_id, 0)
// Reads the extended control register selected by xcr (0 = XCR0) using the XGETBV
// instruction and returns its full 64-bit value.
static inline U64 xgetbv(U32 xcr)
{
	U32 eax_half, edx_half;
	__asm__ __volatile__("xgetbv" : "=a"(eax_half), "=d"(edx_half) : "c"(xcr));

	// XGETBV delivers the value split as EDX:EAX; stitch the halves back together.
	U64 value = (U64)edx_half;
	value <<= 32;
	value |= eax_half;
	return value;
}
#endif // _MSC_VER or not
#ifdef RRX86_CPU_DYNAMIC_DETECT
// Cached CPU feature bits (RRX86_CPU_* flags), written by rrCPUx86_detect().
// The RRX86_CPU_INITIALIZED bit is set once detection has completed.
//
// note : g_rrCPUx86_feature_flags is a global shared variable that is meant to be
// accessed atomically, but we play lazy & loose with the thread safety here
// (atomics are in ext); most likely it's fine
extern "C" U32 g_rrCPUx86_feature_flags = 0;
// Detects the x86 CPU features we care about and publishes them, together with
// RRX86_CPU_INITIALIZED, in g_rrCPUx86_feature_flags.
//
// Returns immediately if the INITIALIZED bit is already set. The global is written
// exactly once, at the very end, so readers never observe a partially built mask.
// No synchronization is performed around the load and the store of the global.
//
// Reporting rules beyond the raw CPUID bits:
//  - AVX and F16C need OS support for saving YMM state (XCR0) on top of the CPU bit.
//  - BMI1/BMI2/AVX2 are only reported when AVX (CPU and OS) and POPCNT are present;
//    AVX2 additionally requires both BMI1 and BMI2.
//  - AVX512 needs OS ZMM/opmask state support and the F/DQ/BW/VL feature set;
//    PREFER512 is only ever reported together with AVX512.
//  - AMD_ZEN is dropped again if the feature set expected from a real Zen is incomplete.
extern "C" void rrCPUx86_detect()
{
	int cpuid_info[4]; // EAX, EBX, ECX, EDX of the most recent CPUID query
	U32 features = 0;
	U32 max_leaf;
	bool is_amd = false;

	// if we already detected, we're good!
	features = g_rrCPUx86_feature_flags; // atomic or volatile load?
	if (features & RRX86_CPU_INITIALIZED)
		return;

	// Basic CPUID information
	__cpuid(cpuid_info, 0);
	max_leaf = cpuid_info[0];

	// Is it AMD? (vendor string is returned in EBX, EDX, ECX order)
	if (cpuid_info[1] == 0x68747541 /* "Auth" */ && cpuid_info[3] == 0x69746e65 /* "enti" */ &&
		cpuid_info[2] == 0x444d4163 /* "cAMD" */)
	{
		is_amd = true;
	}

	// Basic feature flags
	__cpuid(cpuid_info, 1);
	if (cpuid_info[3] & (1u<<26)) features |= RRX86_CPU_SSE2;
	if (cpuid_info[2] & (1u<< 9)) features |= RRX86_CPU_SSSE3;
	if (cpuid_info[2] & (1u<<19)) features |= RRX86_CPU_SSE41;
	if (cpuid_info[2] & (1u<<20)) features |= RRX86_CPU_SSE42;

	// Used to compute other feature flags
	bool has_popcnt = (cpuid_info[2] & (1u<<23)) != 0;
	bool has_osxsave = (cpuid_info[2] & (1u<<27)) != 0;
	bool has_cpu_avx = (cpuid_info[2] & (1u<<28)) != 0;
	bool has_cpu_f16c = (cpuid_info[2] & (1u<<29)) != 0;

	if (has_popcnt) features |= RRX86_CPU_POPCNT;

	if (is_amd)
	{
		// cpuid_info still holds leaf 1 here; EAX carries the family information.
		U32 family = (cpuid_info[0] >> 8) & 0xf;
		U32 ext_family = (cpuid_info[0] >> 20) & 0xff;

		// Zen aka AMD 17h has family=0xf, ext_family=0x08 (Zen and Zen2 both)
		// Zen3 aka AMD 19h has family=0xf, ext_family=0x0a
		// so just testing for this:
		if (family == 0xf && ext_family >= 0x08)
			features |= RRX86_CPU_AMD_ZEN;
	}

	// Get XCR0, if available, and determine context save bits
	U64 xcr0 = 0;
	if (has_osxsave)
	{
		xcr0 = xgetbv(0);
	}

	// YMM register saving and ZMM/opmask register saving support
	bool has_os_avx_support = (xcr0 & 6) == 6;
	bool has_os_avx512_support = (xcr0 & 0xe6) == 0xe6;

	// AVX support requires both CPU and OS support, and gates some other extensions
	if (has_os_avx_support)
	{
		if (has_cpu_avx) features |= RRX86_CPU_AVX;
		if (has_cpu_f16c) features |= RRX86_CPU_F16C;
	}

#ifdef HAVE_CPUIDEX
	if (max_leaf >= 7)
	{
		// "Structured extended feature flags enumeration"
		__cpuidex(cpuid_info, 7, 0);

		// Some (Celeron) Skylakes erroneously report BMI1/BMI2 even though they don't have it.
		// These Celerons also don't have AVX.
		//
		// All CPUs that actually have BMI1/BMI2 (as of this writing, 2016-05-11) have AVX.
		// (The ones we care about, anyway.) So only report BMI1/BMI2 if AVX is present.
		// Also only report AVX or the BMIs if POPCNT is present; all processors I know of
		// have either both or neither, and it's convenient for us to be able to assume
		// that either BMI1/BMI2 or AVX2 implies POPCNT.
		if (has_cpu_avx && has_os_avx_support && has_popcnt)
		{
			if (cpuid_info[1] & (1u<<3)) features |= RRX86_CPU_BMI1;
			if (cpuid_info[1] & (1u<<8)) features |= RRX86_CPU_BMI2;

			// In addition to the above, only report AVX2 if BMI1 (and thus LZCNT/TZCNT)
			// are also reported present; finally VC++ with /arch:AVX2 will emit BMI2
			// instructions for things like variable shifts so we require BMI2 for AVX2
			// as well.
			//
			// In practice this is not a limitation, AVX2 and BMI2 are a package deal on
			// all uArchs I'm aware of.
			const U32 avx2_bits = (1u<<3) /* BMI1 */ | (1u<<5) /* AVX2 */ | (1u<<8) /* BMI2 */;
			if ((cpuid_info[1] & avx2_bits) == avx2_bits)
				features |= RRX86_CPU_AVX2;

			if (has_os_avx512_support)
			{
				// For us to report AVX512, we want the Skylake feature set
				const U32 avx512_bits = (1u << 31) /* AVX512VL */ | (1u << 30) /* AVX512BW */ | (1u << 17) /* AVX512DQ */ | (1u << 16) /* AVX512F */;
				if ((cpuid_info[1] & avx512_bits) == avx512_bits)
				{
					features |= RRX86_CPU_AVX512;

					// Use the VBMI2 bit (set on ICL+) to set the PREFER512 flag. This is available
					// on a generation of cores where AVX-512 has no major clock penalty anymore so
					// whether to use AVX-512 or not is a much more straightforward calculation,
					// and not so dependent on what else is running at the same time.
					//
					// Only done when AVX512 itself is reported, so that odd or masked CPUID
					// results can never yield PREFER512 without AVX512.
					if (cpuid_info[2] & (1u << 6))
						features |= RRX86_CPU_PREFER512;
				}
			}
		}
	}
#endif

	// Super-paranoia: we use the AMD_ZEN flag to indicate we are free to use Zen-optimized
	// kernels without further CPUID checks. In case some joker monkeys around with CPUID
	// flags in the future, turn it off again if we don't have the CPUID bits we should have
	// on a real Zen.
	if (features & RRX86_CPU_AMD_ZEN)
	{
		const U32 zen_features = RRX86_CPU_SSE2 | RRX86_CPU_SSSE3 | RRX86_CPU_SSE41 | RRX86_CPU_SSE42 | RRX86_CPU_F16C |
			RRX86_CPU_AVX | RRX86_CPU_AVX2 |
			RRX86_CPU_BMI1 | RRX86_CPU_BMI2;
		if ((features & zen_features) != zen_features)
			features &= ~RRX86_CPU_AMD_ZEN;
	}

	// write detected features
	// only write value once at end of the function!
	features |= RRX86_CPU_INITIALIZED;
	g_rrCPUx86_feature_flags = features; // atomic or volatile store
}
#endif // RRX86_CPU_DYNAMIC_DETECT
#endif // __RADX86__