// Copyright Epic Games, Inc. All Rights Reserved.

/*=============================================================================
	LaneVectorization.ush: Vectorize arbitrary number of processing per lane.
=============================================================================*/

#pragma once

#include "Platform.ush"
#include "WaveBroadcastIntrinsics.ush"


//------------------------------------------------------- DEFINE

// Register layout struct template used by the tensor types; overridable by the including shader.
#ifndef TENSOR_REGISTER_LAYOUT
	#define TENSOR_REGISTER_LAYOUT FRegisters_AoS_VectorArray
#endif

// When non-zero, TightenRegisters() runs PackRegistersTightly() on every register row.
#ifndef TENSOR_REGISTER_TIGHTENING
	#define TENSOR_REGISTER_TIGHTENING 0
#endif


//------------------------------------------------------- GLOBAL VARIABLE

// Base index added to every DespilledData address by the LDS spill helpers of this file.
// NOTE(review): presumably the shader must set it to the thread's index in the group - confirm.
static uint GGroupThreadIndex = 0;


//------------------------------------------------------- STANDARD SCALAR OPERATIONS

/** Logical negation as a function, applied per component. */
bool  not(bool  x) { return !x; }
bool2 not(bool2 x) { return !x; }
bool3 not(bool3 x) { return !x; }
bool4 not(bool4 x) { return !x; }

/** Bitwise complement as a function, applied per component. */
uint  bit_not(uint  x) { return ~x; }
uint2 bit_not(uint2 x) { return ~x; }

#if PLATFORM_SUPPORTS_REAL_TYPES
uint16_t  bit_not(uint16_t  x) { return ~x; }
uint16_t2 bit_not(uint16_t2 x) { return ~x; }
#endif

/** Builds a 2 component vector out of two scalars; these overloads do nothing more than that. */
bool2  v_pack_b32_b16(bool  a, bool  b) { return bool2(a, b); }
float2 v_pack_b32_b16(float a, float b) { return float2(a, b); }
uint2  v_pack_b32_b16(uint  a, uint  b) { return uint2(a, b); }
int2   v_pack_b32_b16(int   a, int   b) { return int2(a, b); }

/**
 * Approximated sign(): scales x by the largest finite float (0x7f7fffff) and clamps to [-1; 1].
 * Zero stays zero; inputs so small that the product stays below 1 in magnitude return that product.
 */
float fast_sign(float x)
{
	const float LargestFinite = asfloat(0x7f7fffff);
	return clamp(x * LargestFinite, float(-1.0), float(1.0));
}

float2 fast_sign(float2 x)
{
	const float LargestFinite = asfloat(0x7f7fffff);
	return clamp(x * LargestFinite, float(-1.0), float(1.0));
}

#if PLATFORM_SUPPORTS_REAL_TYPES
/** Same as the float versions, scaling by the largest finite half (0x7bff) instead. */
half fast_sign(half x)
{
	return clamp(x * asfloat16(uint16_t(0x7bff)), half(-1.0), half(1.0));
}

half2 fast_sign(half2 x)
{
	return clamp(x * asfloat16(uint16_t(0x7bff)), half(-1.0), half(1.0));
}
#endif

/** Bitwise and / or as functions, applied per component. */
uint  bit_and(uint  a, uint  b) { return a & b; }
uint2 bit_and(uint2 a, uint2 b) { return a & b; }

uint  bit_or(uint  a, uint  b) { return a | b; }
uint2 bit_or(uint2 a, uint2 b) { return a | b; }

#if PLATFORM_SUPPORTS_REAL_TYPES
uint16_t  bit_and(uint16_t  a, uint16_t  b) { return a & b; }
uint16_t2 bit_and(uint16_t2 a, uint16_t2 b) { return a & b; }

uint16_t  bit_or(uint16_t  a, uint16_t  b) { return a | b; }
uint16_t2 bit_or(uint16_t2 a, uint16_t2 b) { return a | b; }
#endif

/** Bit shifts as functions: a is shifted by b bits, per component. */
uint  bit_shift_left(uint  a, uint  b) { return a << b; }
uint2 bit_shift_left(uint2 a, uint2 b) { return a << b; }

uint  bit_shift_right(uint  a, uint  b) { return a >> b; }
uint2 bit_shift_right(uint2 a, uint2 b) { return a >> b; }

#if PLATFORM_SUPPORTS_REAL_TYPES
uint16_t  bit_shift_left(uint16_t  a, uint16_t  b) { return a << b; }
uint16_t2 bit_shift_left(uint16_t2 a, uint16_t2 b) { return a << b; }

uint16_t  bit_shift_right(uint16_t  a, uint16_t  b) { return a >> b; }
uint16_t2 bit_shift_right(uint16_t2 a, uint16_t2 b) { return a >> b; }
#endif


//------------------------------------------------------- TIGHT REGISTERS

// The bool and 32bit overloads return their input unchanged.
bool  PackRegistersTightly(bool  v) { return v; }
bool2 PackRegistersTightly(bool2 v) { return v; }
bool3 PackRegistersTightly(bool3 v) { return v; }
bool4 PackRegistersTightly(bool4 v) { return v; }

float  PackRegistersTightly(float  v) { return v; }
float2 PackRegistersTightly(float2 v) { return v; }
float3 PackRegistersTightly(float3 v) { return v; }
float4 PackRegistersTightly(float4 v) { return v; }

uint  PackRegistersTightly(uint  v) { return v; }
uint2 PackRegistersTightly(uint2 v) { return v; }
uint3 PackRegistersTightly(uint3 v) { return v; }
uint4 PackRegistersTightly(uint4 v) { return v; }

int  PackRegistersTightly(int  v) { return v; }
int2 PackRegistersTightly(int2 v) { return v; }
int3 PackRegistersTightly(int3 v) { return v; }
int4 PackRegistersTightly(int4 v) { return v; }

#if PLATFORM_SUPPORTS_REAL_TYPES

// The 16bit overloads rebuild the vector through v_pack_b32_b16() on each consecutive (x,y) and (z,w) pair;
// a lone scalar or a trailing third component is passed through as is.
half PackRegistersTightly(half v)
{
	return v;
}

half2 PackRegistersTightly(half2 v)
{
	return v_pack_b32_b16(v.x, v.y);
}

half3 PackRegistersTightly(half3 v)
{
	return half3(v_pack_b32_b16(v.x, v.y), v.z);
}

half4 PackRegistersTightly(half4 v)
{
	return half4(v_pack_b32_b16(v.x, v.y), v_pack_b32_b16(v.z, v.w));
}

uint16_t PackRegistersTightly(uint16_t v)
{
	return v;
}

uint16_t2 PackRegistersTightly(uint16_t2 v)
{
	return v_pack_b32_b16(v.x, v.y);
}

uint16_t3 PackRegistersTightly(uint16_t3 v)
{
	return uint16_t3(v_pack_b32_b16(v.x, v.y), v.z);
}

uint16_t4 PackRegistersTightly(uint16_t4 v)
{
	return uint16_t4(v_pack_b32_b16(v.x, v.y), v_pack_b32_b16(v.z, v.w));
}

int16_t PackRegistersTightly(int16_t v)
{
	return v;
}

int16_t2 PackRegistersTightly(int16_t2 v)
{
	return v_pack_b32_b16(v.x, v.y);
}

int16_t3 PackRegistersTightly(int16_t3 v)
{
	return int16_t3(v_pack_b32_b16(v.x, v.y), v.z);
}

int16_t4 PackRegistersTightly(int16_t4 v)
{
	return int16_t4(v_pack_b32_b16(v.x, v.y), v_pack_b32_b16(v.z, v.w));
}

#endif // PLATFORM_SUPPORTS_REAL_TYPES


//------------------------------------------------------- DWORD PACKING/UNPACKING

/** Returns the byte size of a vector of VectorSize components of ScalarType. */
#define GetVectorByteSize(ScalarType, VectorSize) uint(sizeof(ScalarType) * uint(VectorSize))

/** Returns the dword size of a vector of VectorSize components of ScalarType, rounded up to a whole dword. */
#define GetVectorDwordSize(ScalarType, VectorSize) (uint(GetVectorByteSize(ScalarType, VectorSize) + 3u) / 4u)

// uint: one dword per component.
void PackVectorToDwords(uint V, out uint DW[1])
{
	DW[0] = V;
}

void PackVectorToDwords(uint2 V, out uint DW[2])
{
	DW[0] = V.x;
	DW[1] = V.y;
}

void PackVectorToDwords(uint3 V, out uint DW[3])
{
	DW[0] = V.x;
	DW[1] = V.y;
	DW[2] = V.z;
}

void PackVectorToDwords(uint4 V, out uint DW[4])
{
	DW[0] = V.x;
	DW[1] = V.y;
	DW[2] = V.z;
	DW[3] = V.w;
}

void UnpackDwordsToVector(uint DW[1], out uint V)
{
	V = DW[0];
}

void UnpackDwordsToVector(uint DW[2], out uint2 V)
{
	V = uint2(DW[0], DW[1]);
}

void UnpackDwordsToVector(uint DW[3], out uint3 V)
{
	V = uint3(DW[0], DW[1], DW[2]);
}

void UnpackDwordsToVector(uint DW[4], out uint4 V)
{
	V = uint4(DW[0], DW[1], DW[2], DW[3]);
}

// float: one dword per component, holding the bit pattern of the float.
void PackVectorToDwords(float V, out uint DW[1])
{
	DW[0] = asuint(V);
}

void PackVectorToDwords(float2 V, out uint DW[2])
{
	DW[0] = asuint(V.x);
	DW[1] = asuint(V.y);
}

void PackVectorToDwords(float3 V, out uint DW[3])
{
	DW[0] = asuint(V.x);
	DW[1] = asuint(V.y);
	DW[2] = asuint(V.z);
}

void PackVectorToDwords(float4 V, out uint DW[4])
{
	DW[0] = asuint(V.x);
	DW[1] = asuint(V.y);
	DW[2] = asuint(V.z);
	DW[3] = asuint(V.w);
}

void UnpackDwordsToVector(uint DW[1], out float V)
{
	V = asfloat(DW[0]);
}

void UnpackDwordsToVector(uint DW[2], out float2 V)
{
	V = float2(asfloat(DW[0]), asfloat(DW[1]));
}

void UnpackDwordsToVector(uint DW[3], out float3 V)
{
	V = float3(asfloat(DW[0]), asfloat(DW[1]), asfloat(DW[2]));
}

void UnpackDwordsToVector(uint DW[4], out float4 V)
{
	V = float4(asfloat(DW[0]), asfloat(DW[1]), asfloat(DW[2]), asfloat(DW[3]));
}

#if PLATFORM_SUPPORTS_REAL_TYPES

// uint16_t: two components per dword, the first one in the low 16 bits.
// The scalar overloads do not depend on COMPILER_SUPPORT_UINT16_BITCAST.
void PackVectorToDwords(uint16_t V, out uint DW[1])
{
	DW[0] = uint(V);
}

void UnpackDwordsToVector(uint DW[1], out uint16_t V)
{
	V = uint16_t(DW[0]);
}

#if COMPILER_SUPPORT_UINT16_BITCAST

void PackVectorToDwords(uint16_t2 V, out uint DW[1])
{
	DW[0] = bit_cast_uint(V);
}

void PackVectorToDwords(uint16_t3 V, out uint DW[2])
{
	DW[0] = bit_cast_uint(V.xy);
	DW[1] = uint(V.z);
}

void PackVectorToDwords(uint16_t4 V, out uint DW[2])
{
	DW[0] = bit_cast_uint(V.xy);
	DW[1] = bit_cast_uint(V.zw);
}

void UnpackDwordsToVector(uint DW[1], out uint16_t2 V)
{
	V = bit_cast_uint16_t2(DW[0]);
}

void UnpackDwordsToVector(uint DW[2], out uint16_t3 V)
{
	V = uint16_t3(bit_cast_uint16_t2(DW[0]), uint16_t(DW[1]));
}

void UnpackDwordsToVector(uint DW[2], out uint16_t4 V)
{
	V = uint16_t4(bit_cast_uint16_t2(DW[0]), bit_cast_uint16_t2(DW[1]));
}

#else // !COMPILER_SUPPORT_UINT16_BITCAST

void PackVectorToDwords(uint16_t2 V, out uint DW[1])
{
	DW[0] = uint(V.x) | (uint(V.y) << 16u);
}

void PackVectorToDwords(uint16_t3 V, out uint DW[2])
{
	DW[0] = uint(V.x) | (uint(V.y) << 16u);
	DW[1] = uint(V.z);
}

void PackVectorToDwords(uint16_t4 V, out uint DW[2])
{
	DW[0] = uint(V.x) | (uint(V.y) << 16u);
	DW[1] = uint(V.z) | (uint(V.w) << 16u);
}

void UnpackDwordsToVector(uint DW[1], out uint16_t2 V)
{
	// Shifting a 32bit dword right by 16 already leaves only its high half.
	V.x = uint16_t(DW[0] & 0xFFFFu);
	V.y = uint16_t(DW[0] >> 16u);
}

void UnpackDwordsToVector(uint DW[2], out uint16_t3 V)
{
	V.x = uint16_t(DW[0] & 0xFFFFu);
	V.y = uint16_t(DW[0] >> 16u);
	V.z = uint16_t(DW[1] & 0xFFFFu);
}

void UnpackDwordsToVector(uint DW[2], out uint16_t4 V)
{
	V.x = uint16_t(DW[0] & 0xFFFFu);
	V.y = uint16_t(DW[0] >> 16u);
	V.z = uint16_t(DW[1] & 0xFFFFu);
	V.w = uint16_t(DW[1] >> 16u);
}

#endif // !COMPILER_SUPPORT_UINT16_BITCAST

// half: goes through the uint16_t overloads using the 16 bit pattern of each component.
void PackVectorToDwords(half V, out uint DW[1])
{
	PackVectorToDwords(asuint16(V), /* out */ DW);
}

void PackVectorToDwords(half2 V, out uint DW[1])
{
	const uint16_t2 Bits = uint16_t2(asuint16(V.x), asuint16(V.y));
	PackVectorToDwords(Bits, /* out */ DW);
}

void PackVectorToDwords(half3 V, out uint DW[2])
{
	const uint16_t3 Bits = uint16_t3(asuint16(V.x), asuint16(V.y), asuint16(V.z));
	PackVectorToDwords(Bits, /* out */ DW);
}

void PackVectorToDwords(half4 V, out uint DW[2])
{
	const uint16_t4 Bits = uint16_t4(asuint16(V.x), asuint16(V.y), asuint16(V.z), asuint16(V.w));
	PackVectorToDwords(Bits, /* out */ DW);
}

void UnpackDwordsToVector(uint DW[1], out half V)
{
	uint16_t Bits;
	UnpackDwordsToVector(DW, /* out */ Bits);
	V = asfloat16(Bits);
}

void UnpackDwordsToVector(uint DW[1], out half2 V)
{
	uint16_t2 Bits;
	UnpackDwordsToVector(DW, /* out */ Bits);
	V = half2(asfloat16(Bits.x), asfloat16(Bits.y));
}

void UnpackDwordsToVector(uint DW[2], out half3 V)
{
	uint16_t3 Bits;
	UnpackDwordsToVector(DW, /* out */ Bits);
	V = half3(asfloat16(Bits.x), asfloat16(Bits.y), asfloat16(Bits.z));
}

void UnpackDwordsToVector(uint DW[2], out half4 V)
{
	uint16_t4 Bits;
	UnpackDwordsToVector(DW, /* out */ Bits);
	V = half4(asfloat16(Bits.x), asfloat16(Bits.y), asfloat16(Bits.z), asfloat16(Bits.w));
}

#endif // PLATFORM_SUPPORTS_REAL_TYPES
//------------------------------------------------------- UTIL FUNCTION FOR MANUAL LOOP UNROLL

/** Returns DW[Index]; one overload per supported dword array size. */
uint GetDW(uint DW[1], const uint Index) { return DW[Index]; }
uint GetDW(uint DW[2], const uint Index) { return DW[Index]; }
uint GetDW(uint DW[3], const uint Index) { return DW[Index]; }
uint GetDW(uint DW[4], const uint Index) { return DW[Index]; }

/** Writes V to DW[Index]; one overload per supported dword array size. */
void SetDW(inout uint DW[1], const uint Index, uint V) { DW[Index] = V; }
void SetDW(inout uint DW[2], const uint Index, uint V) { DW[Index] = V; }
void SetDW(inout uint DW[3], const uint Index, uint V) { DW[Index] = V; }
void SetDW(inout uint DW[4], const uint Index, uint V) { DW[Index] = V; }


//------------------------------------------------------- GROUP SHARED COMMUNICATION

#if defined(LDS_SIZE)

// Dword number i of the vector stored at SharedIndex lives at SharedData[SharedIndex + i * LDS_SIZE].
groupshared uint SharedData[LDS_SIZE * LDS_DWORD_COMPONENT_COUNT];

/** Writes one dword to SharedData[SharedIndex]. */
void WriteDwordToLDS(const uint SharedIndex, uint V)
{
	SharedData[SharedIndex] = V;
}

/** Reads one dword from SharedData[SharedIndex]. */
uint ReadDwordFromLDS(const uint SharedIndex)
{
	return SharedData[SharedIndex];
}

/** Atomically adds V to SharedData[SharedIndex]. */
void AtomicIncrementLDSDword(const uint SharedIndex, uint V)
{
	InterlockedAdd(/* inout */ SharedData[SharedIndex], V);
}

/** Packs V into dwords and writes dword i to SharedData[SharedIndex + i * LDS_SIZE]. */
template<typename ScalarType, uint VectorSize>
void WriteVectorToLDS(const uint SharedIndex, vector<ScalarType, VectorSize> V)
{
	// manual loop unroll the most used cases to reduce compile time
	// the branch will be compiled away
	uint DW[GetVectorDwordSize(ScalarType, VectorSize)];
	PackVectorToDwords(V, /* out */ DW);

	if (GetVectorDwordSize(ScalarType, VectorSize) == 1)
	{
		SharedData[SharedIndex + 0 * LDS_SIZE] = GetDW(DW, 0);
	}
	else if (GetVectorDwordSize(ScalarType, VectorSize) == 2)
	{
		SharedData[SharedIndex + 0 * LDS_SIZE] = GetDW(DW, 0);
		SharedData[SharedIndex + 1 * LDS_SIZE] = GetDW(DW, 1);
	}
	else
	{
		UNROLL
		for (uint i = 0; i < GetVectorDwordSize(ScalarType, VectorSize); i++)
		{
			SharedData[SharedIndex + i * LDS_SIZE] = GetDW(DW, i);
		}
	}
}

/** Reads dword i from SharedData[SharedIndex + i * LDS_SIZE] and unpacks the dwords into V. */
template<typename ScalarType, uint VectorSize>
void ReadVectorFromLDS(const uint SharedIndex, out vector<ScalarType, VectorSize> V)
{
	uint DW[GetVectorDwordSize(ScalarType, VectorSize)];

	if (GetVectorDwordSize(ScalarType, VectorSize) == 1)
	{
		SetDW(DW, 0, SharedData[SharedIndex + 0 * LDS_SIZE]);
	}
	else if (GetVectorDwordSize(ScalarType, VectorSize) == 2)
	{
		SetDW(DW, 0, SharedData[SharedIndex + 0 * LDS_SIZE]);
		SetDW(DW, 1, SharedData[SharedIndex + 1 * LDS_SIZE]);
	}
	else
	{
		UNROLL
		for (uint i = 0; i < GetVectorDwordSize(ScalarType, VectorSize); i++)
		{
			SetDW(DW, i, SharedData[SharedIndex + i * LDS_SIZE]);
		}
	}

	UnpackDwordsToVector(DW, /* out */ V);
}

#endif // defined(LDS_SIZE)


//------------------------------------------------------- MANUAL LDS DESPILL

#if defined(LDS_DESPILL_DWORD_COUNT)

groupshared uint DespilledData[LDS_DESPILL_DWORD_COUNT];

#endif // defined(LDS_DESPILL_DWORD_COUNT)


//------------------------------------------------------- REGISTERS LAYOUTS

// Every layout below exposes the same members so that they are interchangeable through TENSOR_REGISTER_LAYOUT:
//  - VectorSize / ElementCount: number of components per element, and number of elements;
//  - RegisterRowSize / RegisterRowCount: the storage seen as RegisterRowCount vectors of RegisterRowSize components;
//  - Get/SetRegisterRow(), Get/SetElement(), Get/SetElementComponent() and WaveBroadcastElement().
//
// NOTE(review): the template parameter lists are (ScalarType, InVectorSize, InElementCount), as deduced from the
// names used in the bodies; confirm the order matches how TENSOR_REGISTER_LAYOUT is instantiated.

/** Stores one unique vector<>. InElementCount must be == 1 */
template<typename ScalarType, uint InVectorSize, uint InElementCount>
struct FRegisters_S_OneVector
{
	/** Size of each element. */
	static const uint VectorSize = InVectorSize;

	/** Number of elements */
	static const uint ElementCount = 1;

	/** Size and number of register rows */
	static const uint RegisterRowSize = VectorSize;
	static const uint RegisterRowCount = 1;

	vector<ScalarType, VectorSize> E;

	// ------------- register rows
	// The row / element index is ignored: there is only one vector.

	CALL_SITE_DEBUGLOC
	vector<ScalarType, RegisterRowSize> GetRegisterRow(const uint RegisterRowIndex)
	{
		return E;
	}

	CALL_SITE_DEBUGLOC
	void SetRegisterRow(const uint RegisterRowIndex, vector<ScalarType, RegisterRowSize> RegisterRow)
	{
		E = RegisterRow;
	}

	// ------------- elements

	CALL_SITE_DEBUGLOC
	vector<ScalarType, VectorSize> GetElement(const uint ElementIndex)
	{
		return E;
	}

	CALL_SITE_DEBUGLOC
	void SetElement(const uint ElementIndex, vector<ScalarType, VectorSize> Element)
	{
		E = Element;
	}

	/** Returns WaveBroadcast() of the element. */
	vector<ScalarType, VectorSize> WaveBroadcastElement(const FWaveBroadcastSettings BroadcastSettings, const uint ElementIndex)
	{
		return WaveBroadcast(BroadcastSettings, GetElement(ElementIndex));
	}

	// ------------- elements' components

	CALL_SITE_DEBUGLOC
	ScalarType GetElementComponent(const uint ElementIndex, const uint ComponentIndex)
	{
		return E[ComponentIndex];
	}

	CALL_SITE_DEBUGLOC
	void SetElementComponent(const uint ElementIndex, const uint ComponentIndex, ScalarType Component)
	{
		E[ComponentIndex] = Component;
	}
}; // struct FRegisters_S_OneVector

/** Stores an array of vector<> as an array of structure in registers. */
template<typename ScalarType, uint InVectorSize, uint InElementCount>
struct FRegisters_AoS_VectorArray
{
	/** Size of each element. */
	static const uint VectorSize = InVectorSize;

	/** Number of elements */
	static const uint ElementCount = InElementCount;

	/** Size and number of register rows */
	static const uint RegisterRowSize = VectorSize;
	static const uint RegisterRowCount = ElementCount;

	// One register row per element.
	vector<ScalarType, VectorSize> Array[ElementCount];

	// ------------- register rows

	CALL_SITE_DEBUGLOC
	vector<ScalarType, RegisterRowSize> GetRegisterRow(const uint RegisterRowIndex)
	{
		return Array[RegisterRowIndex];
	}

	CALL_SITE_DEBUGLOC
	void SetRegisterRow(const uint RegisterRowIndex, vector<ScalarType, RegisterRowSize> RegisterRow)
	{
		Array[RegisterRowIndex] = RegisterRow;
	}

	// ------------- elements

	CALL_SITE_DEBUGLOC
	vector<ScalarType, VectorSize> GetElement(const uint ElementIndex)
	{
		return Array[ElementIndex];
	}

	CALL_SITE_DEBUGLOC
	void SetElement(const uint ElementIndex, vector<ScalarType, VectorSize> Element)
	{
		Array[ElementIndex] = Element;
	}

	/** Returns WaveBroadcast() of the element. */
	vector<ScalarType, VectorSize> WaveBroadcastElement(const FWaveBroadcastSettings BroadcastSettings, const uint ElementIndex)
	{
		return WaveBroadcast(BroadcastSettings, GetElement(ElementIndex));
	}

	// ------------- elements' components

	CALL_SITE_DEBUGLOC
	ScalarType GetElementComponent(const uint ElementIndex, const uint ComponentIndex)
	{
		return Array[ElementIndex][ComponentIndex];
	}

	CALL_SITE_DEBUGLOC
	void SetElementComponent(const uint ElementIndex, const uint ComponentIndex, ScalarType Component)
	{
		Array[ElementIndex][ComponentIndex] = Component;
	}
}; // struct FRegisters_AoS_VectorArray

/** Stores an array of vector<> as an array of structure in registers using a matrix<>. */
template<typename ScalarType, uint InVectorSize, uint InElementCount>
struct FRegisters_AoS_Matrix
{
	/** Size of each element. */
	static const uint VectorSize = InVectorSize;

	/** Number of elements */
	static const uint ElementCount = InElementCount;

	/** Size and number of register rows */
	static const uint RegisterRowSize = VectorSize;
	static const uint RegisterRowCount = ElementCount;

	// One matrix row per element: M[ElementIndex][ComponentIndex].
	matrix<ScalarType, ElementCount, VectorSize> M;

	// ------------- register rows

	CALL_SITE_DEBUGLOC
	vector<ScalarType, RegisterRowSize> GetRegisterRow(const uint RegisterRowIndex)
	{
		return M[RegisterRowIndex];
	}

	CALL_SITE_DEBUGLOC
	void SetRegisterRow(const uint RegisterRowIndex, vector<ScalarType, RegisterRowSize> RegisterRow)
	{
		M[RegisterRowIndex] = RegisterRow;
	}

	// ------------- elements

	CALL_SITE_DEBUGLOC
	vector<ScalarType, VectorSize> GetElement(const uint ElementIndex)
	{
		return M[ElementIndex];
	}

	CALL_SITE_DEBUGLOC
	void SetElement(const uint ElementIndex, vector<ScalarType, VectorSize> Element)
	{
		M[ElementIndex] = Element;
	}

	/** Returns WaveBroadcast() of the element. */
	vector<ScalarType, VectorSize> WaveBroadcastElement(const FWaveBroadcastSettings BroadcastSettings, const uint ElementIndex)
	{
		return WaveBroadcast(BroadcastSettings, GetElement(ElementIndex));
	}

	// ------------- elements' components

	CALL_SITE_DEBUGLOC
	ScalarType GetElementComponent(const uint ElementIndex, const uint ComponentIndex)
	{
		return M[ElementIndex][ComponentIndex];
	}

	CALL_SITE_DEBUGLOC
	void SetElementComponent(const uint ElementIndex, const uint ComponentIndex, ScalarType Component)
	{
		M[ElementIndex][ComponentIndex] = Component;
	}
}; // struct FRegisters_AoS_Matrix

/** Stores an array of vector<> as a structure of array in registers using a matrix<>. Requires InElementCount a pair >= 2 */
template<typename ScalarType, uint InVectorSize, uint InElementCount>
struct FRegisters_SoA_Matrix
{
	/** Size of each element. */
	static const uint VectorSize = InVectorSize;

	/** Number of elements */
	static const uint ElementCount = InElementCount;

	/** Size and number of register rows */
	static const uint RegisterRowSize = ElementCount;
	static const uint RegisterRowCount = VectorSize;

	// One matrix row per component: M[ComponentIndex][ElementIndex].
	matrix<ScalarType, VectorSize, ElementCount> M;

	// ------------- register rows

	CALL_SITE_DEBUGLOC
	vector<ScalarType, RegisterRowSize> GetRegisterRow(const uint RegisterRowIndex)
	{
		return M[RegisterRowIndex];
	}

	CALL_SITE_DEBUGLOC
	void SetRegisterRow(const uint RegisterRowIndex, vector<ScalarType, RegisterRowSize> RegisterRow)
	{
		M[RegisterRowIndex] = RegisterRow;
	}

	// ------------- elements
	// VectorSize 1 and 3 are unrolled by hand, any other size goes through the UNROLL loop.

	CALL_SITE_DEBUGLOC
	vector<ScalarType, VectorSize> GetElement(const uint ElementIndex)
	{
		vector<ScalarType, VectorSize> Element;
		if (VectorSize == 1)
		{
			Element[0] = M[0][ElementIndex];
		}
		else if (VectorSize == 3)
		{
			Element[0] = M[0][ElementIndex];
			Element[1] = M[1][ElementIndex];
			Element[2] = M[2][ElementIndex];
		}
		else
		{
			UNROLL
			for (uint ComponentIndex = 0; ComponentIndex < VectorSize; ComponentIndex++)
			{
				Element[ComponentIndex] = M[ComponentIndex][ElementIndex];
			}
		}
		return Element;
	}

	CALL_SITE_DEBUGLOC
	void SetElement(const uint ElementIndex, vector<ScalarType, VectorSize> Element)
	{
		if (VectorSize == 1)
		{
			M[0][ElementIndex] = Element[0];
		}
		else if (VectorSize == 3)
		{
			M[0][ElementIndex] = Element[0];
			M[1][ElementIndex] = Element[1];
			M[2][ElementIndex] = Element[2];
		}
		else
		{
			UNROLL
			for (uint ComponentIndex = 0; ComponentIndex < VectorSize; ComponentIndex++)
			{
				M[ComponentIndex][ElementIndex] = Element[ComponentIndex];
			}
		}
	}

	/** Broadcasts each component row with WaveBroadcast() and gathers the entry of ElementIndex from each of them. */
	vector<ScalarType, VectorSize> WaveBroadcastElement(const FWaveBroadcastSettings BroadcastSettings, const uint ElementIndex)
	{
		vector<ScalarType, VectorSize> Element;
		if (VectorSize == 1)
		{
			Element[0] = WaveBroadcast(BroadcastSettings, M[0])[ElementIndex];
		}
		else if (VectorSize == 3)
		{
			Element[0] = WaveBroadcast(BroadcastSettings, M[0])[ElementIndex];
			Element[1] = WaveBroadcast(BroadcastSettings, M[1])[ElementIndex];
			Element[2] = WaveBroadcast(BroadcastSettings, M[2])[ElementIndex];
		}
		else
		{
			UNROLL
			for (uint ComponentIndex = 0; ComponentIndex < VectorSize; ComponentIndex++)
			{
				Element[ComponentIndex] = WaveBroadcast(BroadcastSettings, M[ComponentIndex])[ElementIndex];
			}
		}
		return Element;
	}

	// ------------- elements' components

	CALL_SITE_DEBUGLOC
	ScalarType GetElementComponent(const uint ElementIndex, const uint ComponentIndex)
	{
		return M[ComponentIndex][ElementIndex];
	}

	CALL_SITE_DEBUGLOC
	void SetElementComponent(const uint ElementIndex, const uint ComponentIndex, ScalarType Component)
	{
		M[ComponentIndex][ElementIndex] = Component;
	}
}; // struct FRegisters_SoA_Matrix

/**
 * Stores an array of vector<> as an array of structure in registers using an array of 2 component vectors.
 * The scalar at flat index (ComponentIndex + VectorSize * ElementIndex) lives in M[Index / 2][Index % 2].
 * ElementCount * VectorSize must be even, since RegisterRowCount is that product divided by 2.
 */
template<typename ScalarType, uint InVectorSize, uint InElementCount>
struct FRegisters_AoS_PairArray
{
	/** Size of each element. */
	static const uint VectorSize = InVectorSize;

	/** Number of elements */
	static const uint ElementCount = InElementCount;

	/** Size and number of register rows */
	static const uint RegisterRowSize = 2;
	static const uint RegisterRowCount = (ElementCount * VectorSize) / 2;

	vector<ScalarType, RegisterRowSize> M[RegisterRowCount];

	// ------------- register rows

	CALL_SITE_DEBUGLOC
	vector<ScalarType, RegisterRowSize> GetRegisterRow(const uint RegisterRowIndex)
	{
		return M[RegisterRowIndex];
	}

	CALL_SITE_DEBUGLOC
	void SetRegisterRow(const uint RegisterRowIndex, vector<ScalarType, RegisterRowSize> RegisterRow)
	{
		M[RegisterRowIndex] = RegisterRow;
	}

	// ------------- elements

	CALL_SITE_DEBUGLOC
	vector<ScalarType, VectorSize> GetElement(const uint ElementIndex)
	{
		vector<ScalarType, VectorSize> Element;
		UNROLL
		for (uint ComponentIndex = 0; ComponentIndex < VectorSize; ComponentIndex++)
		{
			Element[ComponentIndex] = GetElementComponent(ElementIndex, ComponentIndex);
		}
		return Element;
	}

	CALL_SITE_DEBUGLOC
	void SetElement(const uint ElementIndex, vector<ScalarType, VectorSize> Element)
	{
		UNROLL
		for (uint ComponentIndex = 0; ComponentIndex < VectorSize; ComponentIndex++)
		{
			SetElementComponent(ElementIndex, ComponentIndex, Element[ComponentIndex]);
		}
	}

	/** Returns the element with every component passed through WaveBroadcast(). */
	vector<ScalarType, VectorSize> WaveBroadcastElement(const FWaveBroadcastSettings BroadcastSettings, const uint ElementIndex)
	{
		vector<ScalarType, VectorSize> Element;

		if (sizeof(ScalarType) == 4)
		{
			// 4 byte scalars: broadcast each component individually.
			UNROLL
			for (uint ComponentIndex = 0; ComponentIndex < VectorSize; ComponentIndex++)
			{
				const uint ArrayIndex = ComponentIndex + VectorSize * ElementIndex;
				Element[ComponentIndex] = WaveBroadcast(BroadcastSettings, M[ArrayIndex / 2u][ArrayIndex % 2u]);
			}
		}
		else
		{
			// Other scalar sizes: broadcast the whole pairs that hold the element, then pick its components out of them.
			const uint FirstArrayIndex = VectorSize * ElementIndex;

			vector<ScalarType, 2> BroadcastedArray[(VectorSize + 1u) / 2u];

			UNROLL
			for (uint BroadcastIndex = 0; BroadcastIndex < ((VectorSize + 1u) / 2u); BroadcastIndex++)
			{
				BroadcastedArray[BroadcastIndex] = WaveBroadcast(BroadcastSettings, M[FirstArrayIndex / 2u + BroadcastIndex]);
			}

			UNROLL
			for (uint ComponentIndex = 0; ComponentIndex < VectorSize; ComponentIndex++)
			{
				// Index relative to the first broadcasted pair, which starts at the even index (FirstArrayIndex / 2) * 2.
				const uint BroadcastedArrayIndex = ComponentIndex + VectorSize * ElementIndex - (FirstArrayIndex / 2u) * 2u;
				Element[ComponentIndex] = BroadcastedArray[BroadcastedArrayIndex / 2u][BroadcastedArrayIndex % 2u];
			}
		}

		return Element;
	}

	// ------------- elements' components

	CALL_SITE_DEBUGLOC
	ScalarType GetElementComponent(const uint ElementIndex, const uint ComponentIndex)
	{
		const uint ArrayIndex = ComponentIndex + VectorSize * ElementIndex;
		return M[ArrayIndex / 2u][ArrayIndex % 2u];
	}

	CALL_SITE_DEBUGLOC
	void SetElementComponent(const uint ElementIndex, const uint ComponentIndex, ScalarType Component)
	{
		const uint ArrayIndex = ComponentIndex + VectorSize * ElementIndex;
		M[ArrayIndex / 2u][ArrayIndex % 2u] = Component;
	}
}; // struct FRegisters_AoS_PairArray

/**
 * Stores an array of vector<> as a structure of array in registers using an array of 2 component vectors.
 * The scalar at flat index (ElementIndex + ElementCount * ComponentIndex) lives in M[Index / 2][Index % 2]:
 * each component owns ElementCount consecutive slots, so every (element, component) maps to a unique slot
 * inside the ElementCount * VectorSize scalars of storage.
 * ElementCount * VectorSize must be even, since RegisterRowCount is that product divided by 2.
 */
template<typename ScalarType, uint InVectorSize, uint InElementCount>
struct FRegisters_SoA_PairArray
{
	/** Size of each element. */
	static const uint VectorSize = InVectorSize;

	/** Number of elements */
	static const uint ElementCount = InElementCount;

	/** Size and number of register rows */
	static const uint RegisterRowSize = 2;
	static const uint RegisterRowCount = (ElementCount * VectorSize) / 2;

	vector<ScalarType, RegisterRowSize> M[RegisterRowCount];

	// ------------- register rows

	CALL_SITE_DEBUGLOC
	vector<ScalarType, RegisterRowSize> GetRegisterRow(const uint RegisterRowIndex)
	{
		return M[RegisterRowIndex];
	}

	CALL_SITE_DEBUGLOC
	void SetRegisterRow(const uint RegisterRowIndex, vector<ScalarType, RegisterRowSize> RegisterRow)
	{
		M[RegisterRowIndex] = RegisterRow;
	}

	// ------------- elements

	CALL_SITE_DEBUGLOC
	vector<ScalarType, VectorSize> GetElement(const uint ElementIndex)
	{
		vector<ScalarType, VectorSize> Element;
		UNROLL
		for (uint ComponentIndex = 0; ComponentIndex < VectorSize; ComponentIndex++)
		{
			Element[ComponentIndex] = GetElementComponent(ElementIndex, ComponentIndex);
		}
		return Element;
	}

	CALL_SITE_DEBUGLOC
	void SetElement(const uint ElementIndex, vector<ScalarType, VectorSize> Element)
	{
		UNROLL
		for (uint ComponentIndex = 0; ComponentIndex < VectorSize; ComponentIndex++)
		{
			SetElementComponent(ElementIndex, ComponentIndex, Element[ComponentIndex]);
		}
	}

	/** Broadcasts with WaveBroadcast() the pair holding each component of the element, and picks the component out of it. */
	vector<ScalarType, VectorSize> WaveBroadcastElement(const FWaveBroadcastSettings BroadcastSettings, const uint ElementIndex)
	{
		vector<ScalarType, VectorSize> Element;
		UNROLL
		for (uint ComponentIndex = 0; ComponentIndex < VectorSize; ComponentIndex++)
		{
			// The stride between two components of a same element is ElementCount. Striding by VectorSize aliases
			// different (element, component) pairs and indexes out of M whenever VectorSize != ElementCount.
			const uint ArrayIndex = ElementIndex + ElementCount * ComponentIndex;
			Element[ComponentIndex] = WaveBroadcast(BroadcastSettings, M[ArrayIndex / 2u])[ArrayIndex % 2u];
		}
		return Element;
	}

	// ------------- elements' components

	CALL_SITE_DEBUGLOC
	ScalarType GetElementComponent(const uint ElementIndex, const uint ComponentIndex)
	{
		// Must stay the same mapping as in SetElementComponent() and WaveBroadcastElement().
		const uint ArrayIndex = ElementIndex + ElementCount * ComponentIndex;
		return M[ArrayIndex / 2u][ArrayIndex % 2u];
	}

	CALL_SITE_DEBUGLOC
	void SetElementComponent(const uint ElementIndex, const uint ComponentIndex, ScalarType Component)
	{
		// Must stay the same mapping as in GetElementComponent() and WaveBroadcastElement().
		const uint ArrayIndex = ElementIndex + ElementCount * ComponentIndex;
		M[ArrayIndex / 2u][ArrayIndex % 2u] = Component;
	}
}; // struct
FRegisters_SoA_PairArray //------------------------------------------------------- TENSORS /** Packs a vector<> in 2 dimension. */ template struct TLaneVector2D { /** Size of each element. */ static const uint VectorSize = InVectorSize; /** Number of elememnts */ static const uint SimdSize = InSimdSizeX * InSimdSizeY; static const uint SimdSizeX = InSimdSizeX; static const uint SimdSizeY = InSimdSizeY; /** Size and number of register rows */ static const uint RegisterRowSize = TENSOR_REGISTER_LAYOUT::RegisterRowSize; static const uint RegisterRowCount = TENSOR_REGISTER_LAYOUT::RegisterRowCount; TENSOR_REGISTER_LAYOUT Registers; // ------------- getting and setting elements and components // Access and set individual element of the vector. CALL_SITE_DEBUGLOC vector GetElement(const uint ElementIndex) { return Registers.GetElement(ElementIndex); } CALL_SITE_DEBUGLOC void SetElement(const uint ElementIndex, vector Element) { Registers.SetElement(ElementIndex, Element); } CALL_SITE_DEBUGLOC void SetAllElements(vector Element) { if(SimdSize == 2) { Registers.SetElement(0, Element); Registers.SetElement(1, Element); } else if (SimdSize == 4) { Registers.SetElement(0, Element); Registers.SetElement(1, Element); Registers.SetElement(2, Element); Registers.SetElement(3, Element); } else { UNROLL for (uint ElementIndex = 0; ElementIndex < SimdSize; ElementIndex++) { Registers.SetElement(ElementIndex, Element); } } } // Access and set a component of the vector CALL_SITE_DEBUGLOC TLaneVector2D GetComponent(const uint ComponentIndex) { TLaneVector2D R; if(SimdSize == 2) { R.SetElement(0, Registers.GetElementComponent(0, ComponentIndex)); R.SetElement(1, Registers.GetElementComponent(1, ComponentIndex)); } else if (SimdSize == 4) { R.SetElement(0, Registers.GetElementComponent(0, ComponentIndex)); R.SetElement(1, Registers.GetElementComponent(1, ComponentIndex)); R.SetElement(2, Registers.GetElementComponent(2, ComponentIndex)); R.SetElement(3, Registers.GetElementComponent(3, 
ComponentIndex)); } else { UNROLL for (uint ElementIndex = 0; ElementIndex < SimdSize; ElementIndex++) { R.SetElement(ElementIndex, Registers.GetElementComponent(ElementIndex, ComponentIndex)); } } return R; } CALL_SITE_DEBUGLOC void SetComponent(uint ComponentIndex, TLaneVector2D Comp) { if(SimdSize == 2) { Registers.SetElementComponent(0, ComponentIndex, Comp.GetElement(0)); Registers.SetElementComponent(1, ComponentIndex, Comp.GetElement(1)); } else if (SimdSize == 4) { Registers.SetElementComponent(0, ComponentIndex, Comp.GetElement(0)); Registers.SetElementComponent(1, ComponentIndex, Comp.GetElement(1)); Registers.SetElementComponent(2, ComponentIndex, Comp.GetElement(2)); Registers.SetElementComponent(3, ComponentIndex, Comp.GetElement(3)); } else { UNROLL for (uint ElementIndex = 0; ElementIndex < SimdSize; ElementIndex++) { Registers.SetElementComponent(ElementIndex, ComponentIndex, Comp.GetElement(ElementIndex)); } } } CALL_SITE_DEBUGLOC TLaneVector2D operator [](const uint ComponentIndex) { return GetComponent(ComponentIndex); } // ------------- constructors /** Casts a scalar into a vector<>. */ CALL_SITE_DEBUGLOC static TLaneVector2D Vectorize( TLaneVector2D A) { TLaneVector2D R; if(SimdSize == 2) { R.SetElement(0, A.GetElement(0)); R.SetElement(1, A.GetElement(1)); } else if (SimdSize == 4) { R.SetElement(0, A.GetElement(0)); R.SetElement(1, A.GetElement(1)); R.SetElement(2, A.GetElement(2)); R.SetElement(3, A.GetElement(3)); } else { UNROLL for (uint ElementIndex = 0; ElementIndex < SimdSize; ElementIndex++) { R.SetElement(ElementIndex, A.GetElement(ElementIndex)); } } return R; } /** Initialize all elements with a single same vector<>. 
*/ CALL_SITE_DEBUGLOC static TLaneVector2D Const( vector A) { TLaneVector2D R; if(SimdSize == 2) { R.SetElement(0, A); R.SetElement(1, A); } else if (SimdSize == 4) { R.SetElement(0, A); R.SetElement(1, A); R.SetElement(2, A); R.SetElement(3, A); } else { UNROLL for (uint ElementIndex = 0; ElementIndex < SimdSize; ElementIndex++) { R.SetElement(ElementIndex, A); } } return R; } /** Casts vector to vector. */ CALL_SITE_DEBUGLOC template static TLaneVector2D CastFrom( TLaneVector2D A) { TLaneVector2D R; if(SimdSize == 2) { R.SetElement(0, vector(A.GetElement(0))); R.SetElement(1, vector(A.GetElement(1))); } else if (SimdSize == 4) { R.SetElement(0, vector(A.GetElement(0))); R.SetElement(1, vector(A.GetElement(1))); R.SetElement(2, vector(A.GetElement(2))); R.SetElement(3, vector(A.GetElement(3))); } else { UNROLL for (uint ElementIndex = 0; ElementIndex < SimdSize; ElementIndex++) { R.SetElement(ElementIndex, vector(A.GetElement(ElementIndex))); } } return R; } /** Force tight register packing. 
*/ CALL_SITE_DEBUGLOC void TightenRegisters() #if TENSOR_REGISTER_TIGHTENING { if(RegisterRowCount == 2) { Registers.SetRegisterRow(0, PackRegistersTightly(Registers.GetRegisterRow(0))); Registers.SetRegisterRow(1, PackRegistersTightly(Registers.GetRegisterRow(1))); } else if (RegisterRowCount == 4) { Registers.SetRegisterRow(0, PackRegistersTightly(Registers.GetRegisterRow(0))); Registers.SetRegisterRow(1, PackRegistersTightly(Registers.GetRegisterRow(1))); Registers.SetRegisterRow(2, PackRegistersTightly(Registers.GetRegisterRow(2))); Registers.SetRegisterRow(3, PackRegistersTightly(Registers.GetRegisterRow(3))); } else { UNROLL for (uint RegisterRowIndex = 0; RegisterRowIndex < RegisterRowCount; RegisterRowIndex++) { Registers.SetRegisterRow(RegisterRowIndex, PackRegistersTightly(Registers.GetRegisterRow(RegisterRowIndex))); } } } #else { // NOP } #endif #if defined(LDS_DESPILL_DWORD_COUNT) /** Returns the LDS spill size in dwords */ static uint GetLDSSpillDwordSize() { return RegisterRowCount * GetVectorDwordSize(ScalarType, RegisterRowSize); } /** Store to LDS spill to save VGPR. */ CALL_SITE_DEBUGLOC void StoreLDSSpill(const uint DespillDwordOffset) { UNROLL for (uint RegisterRowIndex = 0; RegisterRowIndex < RegisterRowCount; RegisterRowIndex++) { vector ResgiterRow = Registers.GetRegisterRow(RegisterRowIndex); uint ResgiterRowDW[GetVectorDwordSize(ScalarType, RegisterRowSize)]; PackVectorToDwords(ResgiterRow, /* out */ ResgiterRowDW); UNROLL for (uint i = 0; i < GetVectorDwordSize(ScalarType, RegisterRowSize); i++) { DespilledData[GGroupThreadIndex + DespillDwordOffset + (RegisterRowIndex * GetVectorDwordSize(ScalarType, RegisterRowSize) + i) * LDS_DESPILL_THREAD_COUNT] = ResgiterRowDW[i]; } } } /** Load from LDS spill to save VGPR. 
*/
	CALL_SITE_DEBUGLOC
	static TLaneVector2D LoadLDSSpill(const uint DespillDwordOffset)
	{
		// Reads use the same DespilledData index formula as StoreLDSSpill():
		// GGroupThreadIndex + DespillDwordOffset + (row dword index) * LDS_DESPILL_THREAD_COUNT.
		TLaneVector2D Vector;
		UNROLL
		for (uint RegisterRowIndex = 0; RegisterRowIndex < RegisterRowCount; RegisterRowIndex++)
		{
			uint RegisterRowDW[GetVectorDwordSize(ScalarType, RegisterRowSize)];
			UNROLL
			for (uint i = 0; i < GetVectorDwordSize(ScalarType, RegisterRowSize); i++)
			{
				RegisterRowDW[i] = DespilledData[GGroupThreadIndex + DespillDwordOffset + (RegisterRowIndex * GetVectorDwordSize(ScalarType, RegisterRowSize) + i) * LDS_DESPILL_THREAD_COUNT];
			}

			// Rebuild the register row from its dwords and store it in the result.
			vector RegisterRow;
			UnpackDwordsToVector(RegisterRowDW, /* out */ RegisterRow);
			Vector.Registers.SetRegisterRow(RegisterRowIndex, RegisterRow);
		}
		return Vector;
	}

#endif // defined(LDS_DESPILL_DWORD_COUNT)

	// All operators below work register row by register row. Row counts of 2 and 4 are
	// written out explicitly; every other count goes through an UNROLL loop.

	// ------------- binary operator +

	/** Returns the row-wise sum of this vector and B. */
	CALL_SITE_DEBUGLOC
	TLaneVector2D operator + (TLaneVector2D B)
	{
		TLaneVector2D R;
		if (RegisterRowCount == 2)
		{
			R.Registers.SetRegisterRow(0, Registers.GetRegisterRow(0) + B.Registers.GetRegisterRow(0));
			R.Registers.SetRegisterRow(1, Registers.GetRegisterRow(1) + B.Registers.GetRegisterRow(1));
		}
		else if (RegisterRowCount == 4)
		{
			R.Registers.SetRegisterRow(0, Registers.GetRegisterRow(0) + B.Registers.GetRegisterRow(0));
			R.Registers.SetRegisterRow(1, Registers.GetRegisterRow(1) + B.Registers.GetRegisterRow(1));
			R.Registers.SetRegisterRow(2, Registers.GetRegisterRow(2) + B.Registers.GetRegisterRow(2));
			R.Registers.SetRegisterRow(3, Registers.GetRegisterRow(3) + B.Registers.GetRegisterRow(3));
		}
		else
		{
			UNROLL
			for (uint RegisterRowIndex = 0; RegisterRowIndex < RegisterRowCount; RegisterRowIndex++)
			{
				R.Registers.SetRegisterRow(RegisterRowIndex, Registers.GetRegisterRow(RegisterRowIndex) + B.Registers.GetRegisterRow(RegisterRowIndex));
			}
		}
		return R;
	}

	/** Returns this vector plus B, where B is first expanded into a lane vector with SetAllElements(). */
	CALL_SITE_DEBUGLOC
	TLaneVector2D operator + (vector B)
	{
		TLaneVector2D BV;
		BV.SetAllElements(B);

		TLaneVector2D R;
		if (RegisterRowCount == 2)
		{
			R.Registers.SetRegisterRow(0, Registers.GetRegisterRow(0) + BV.Registers.GetRegisterRow(0));
			R.Registers.SetRegisterRow(1, Registers.GetRegisterRow(1) + BV.Registers.GetRegisterRow(1));
		}
		else if (RegisterRowCount == 4)
		{
			R.Registers.SetRegisterRow(0, Registers.GetRegisterRow(0) + BV.Registers.GetRegisterRow(0));
			R.Registers.SetRegisterRow(1, Registers.GetRegisterRow(1) + BV.Registers.GetRegisterRow(1));
			R.Registers.SetRegisterRow(2, Registers.GetRegisterRow(2) + BV.Registers.GetRegisterRow(2));
			R.Registers.SetRegisterRow(3, Registers.GetRegisterRow(3) + BV.Registers.GetRegisterRow(3));
		}
		else
		{
			UNROLL
			for (uint RegisterRowIndex = 0; RegisterRowIndex < RegisterRowCount; RegisterRowIndex++)
			{
				R.Registers.SetRegisterRow(RegisterRowIndex, Registers.GetRegisterRow(RegisterRowIndex) + BV.Registers.GetRegisterRow(RegisterRowIndex));
			}
		}
		return R;
	}

	// ------------- unary operator -

	/** Returns the row-wise negation of this vector. */
	CALL_SITE_DEBUGLOC
	TLaneVector2D operator - ()
	{
		TLaneVector2D R;
		if (RegisterRowCount == 2)
		{
			R.Registers.SetRegisterRow(0, -Registers.GetRegisterRow(0));
			R.Registers.SetRegisterRow(1, -Registers.GetRegisterRow(1));
		}
		else if (RegisterRowCount == 4)
		{
			R.Registers.SetRegisterRow(0, -Registers.GetRegisterRow(0));
			R.Registers.SetRegisterRow(1, -Registers.GetRegisterRow(1));
			R.Registers.SetRegisterRow(2, -Registers.GetRegisterRow(2));
			R.Registers.SetRegisterRow(3, -Registers.GetRegisterRow(3));
		}
		else
		{
			UNROLL
			for (uint RegisterRowIndex = 0; RegisterRowIndex < RegisterRowCount; RegisterRowIndex++)
			{
				R.Registers.SetRegisterRow(RegisterRowIndex, -Registers.GetRegisterRow(RegisterRowIndex));
			}
		}
		return R;
	}

	// ------------- binary operator -

	/** Returns the row-wise difference of this vector and B. */
	CALL_SITE_DEBUGLOC
	TLaneVector2D operator - (TLaneVector2D B)
	{
		TLaneVector2D R;
		if (RegisterRowCount == 2)
		{
			R.Registers.SetRegisterRow(0, Registers.GetRegisterRow(0) - B.Registers.GetRegisterRow(0));
			R.Registers.SetRegisterRow(1, Registers.GetRegisterRow(1) - B.Registers.GetRegisterRow(1));
		}
		else if (RegisterRowCount == 4)
		{
			R.Registers.SetRegisterRow(0, Registers.GetRegisterRow(0) - B.Registers.GetRegisterRow(0));
			R.Registers.SetRegisterRow(1, Registers.GetRegisterRow(1) - B.Registers.GetRegisterRow(1));
			R.Registers.SetRegisterRow(2, Registers.GetRegisterRow(2) - B.Registers.GetRegisterRow(2));
			R.Registers.SetRegisterRow(3, Registers.GetRegisterRow(3) - B.Registers.GetRegisterRow(3));
		}
		else
		{
			UNROLL
			for (uint RegisterRowIndex = 0; RegisterRowIndex < RegisterRowCount; RegisterRowIndex++)
			{
				R.Registers.SetRegisterRow(RegisterRowIndex, Registers.GetRegisterRow(RegisterRowIndex) - B.Registers.GetRegisterRow(RegisterRowIndex));
			}
		}
		return R;
	}

	/** Returns this vector minus B, where B is first expanded into a lane vector with SetAllElements(). */
	CALL_SITE_DEBUGLOC
	TLaneVector2D operator - (vector B)
	{
		TLaneVector2D BV;
		BV.SetAllElements(B);

		TLaneVector2D R;
		if (RegisterRowCount == 2)
		{
			R.Registers.SetRegisterRow(0, Registers.GetRegisterRow(0) - BV.Registers.GetRegisterRow(0));
			R.Registers.SetRegisterRow(1, Registers.GetRegisterRow(1) - BV.Registers.GetRegisterRow(1));
		}
		else if (RegisterRowCount == 4)
		{
			R.Registers.SetRegisterRow(0, Registers.GetRegisterRow(0) - BV.Registers.GetRegisterRow(0));
			R.Registers.SetRegisterRow(1, Registers.GetRegisterRow(1) - BV.Registers.GetRegisterRow(1));
			R.Registers.SetRegisterRow(2, Registers.GetRegisterRow(2) - BV.Registers.GetRegisterRow(2));
			R.Registers.SetRegisterRow(3, Registers.GetRegisterRow(3) - BV.Registers.GetRegisterRow(3));
		}
		else
		{
			UNROLL
			for (uint RegisterRowIndex = 0; RegisterRowIndex < RegisterRowCount; RegisterRowIndex++)
			{
				R.Registers.SetRegisterRow(RegisterRowIndex, Registers.GetRegisterRow(RegisterRowIndex) - BV.Registers.GetRegisterRow(RegisterRowIndex));
			}
		}
		return R;
	}

	// ------------- binary operator *

	/** Returns the row-wise product of this vector and B. */
	CALL_SITE_DEBUGLOC
	TLaneVector2D operator * (TLaneVector2D B)
	{
		TLaneVector2D R;
		if (RegisterRowCount == 2)
		{
			R.Registers.SetRegisterRow(0, Registers.GetRegisterRow(0) * B.Registers.GetRegisterRow(0));
			R.Registers.SetRegisterRow(1, Registers.GetRegisterRow(1) * B.Registers.GetRegisterRow(1));
		}
		else if (RegisterRowCount == 4)
		{
			R.Registers.SetRegisterRow(0, Registers.GetRegisterRow(0) * B.Registers.GetRegisterRow(0));
			R.Registers.SetRegisterRow(1, Registers.GetRegisterRow(1) * B.Registers.GetRegisterRow(1));
			R.Registers.SetRegisterRow(2, Registers.GetRegisterRow(2) * B.Registers.GetRegisterRow(2));
			R.Registers.SetRegisterRow(3, Registers.GetRegisterRow(3) * B.Registers.GetRegisterRow(3));
		}
		else
		{
			UNROLL
			for (uint RegisterRowIndex = 0; RegisterRowIndex < RegisterRowCount; RegisterRowIndex++)
			{
				R.Registers.SetRegisterRow(RegisterRowIndex, Registers.GetRegisterRow(RegisterRowIndex) * B.Registers.GetRegisterRow(RegisterRowIndex));
			}
		}
		return R;
	}

	/** Returns this vector times B, where B is first expanded into a lane vector with SetAllElements(). */
	CALL_SITE_DEBUGLOC
	TLaneVector2D operator * (vector B)
	{
		TLaneVector2D BV;
		BV.SetAllElements(B);

		TLaneVector2D R;
		if (RegisterRowCount == 2)
		{
			R.Registers.SetRegisterRow(0, Registers.GetRegisterRow(0) * BV.Registers.GetRegisterRow(0));
			R.Registers.SetRegisterRow(1, Registers.GetRegisterRow(1) * BV.Registers.GetRegisterRow(1));
		}
		else if (RegisterRowCount == 4)
		{
			R.Registers.SetRegisterRow(0, Registers.GetRegisterRow(0) * BV.Registers.GetRegisterRow(0));
			R.Registers.SetRegisterRow(1, Registers.GetRegisterRow(1) * BV.Registers.GetRegisterRow(1));
			R.Registers.SetRegisterRow(2, Registers.GetRegisterRow(2) * BV.Registers.GetRegisterRow(2));
			R.Registers.SetRegisterRow(3, Registers.GetRegisterRow(3) * BV.Registers.GetRegisterRow(3));
		}
		else
		{
			UNROLL
			for (uint RegisterRowIndex = 0; RegisterRowIndex < RegisterRowCount; RegisterRowIndex++)
			{
				R.Registers.SetRegisterRow(RegisterRowIndex, Registers.GetRegisterRow(RegisterRowIndex) * BV.Registers.GetRegisterRow(RegisterRowIndex));
			}
		}
		return R;
	}

	// ------------- comparison operators

	/** Returns the row-wise result of (this > B). */
	CALL_SITE_DEBUGLOC
	TLaneVector2D operator > (TLaneVector2D B)
	{
		TLaneVector2D R;
		if (RegisterRowCount == 2)
		{
			R.Registers.SetRegisterRow(0, Registers.GetRegisterRow(0) > B.Registers.GetRegisterRow(0));
			R.Registers.SetRegisterRow(1, Registers.GetRegisterRow(1) > B.Registers.GetRegisterRow(1));
		}
		else if (RegisterRowCount == 4)
		{
			R.Registers.SetRegisterRow(0, Registers.GetRegisterRow(0) > B.Registers.GetRegisterRow(0));
			R.Registers.SetRegisterRow(1, Registers.GetRegisterRow(1) > B.Registers.GetRegisterRow(1));
			R.Registers.SetRegisterRow(2, Registers.GetRegisterRow(2) > B.Registers.GetRegisterRow(2));
			R.Registers.SetRegisterRow(3, Registers.GetRegisterRow(3) > B.Registers.GetRegisterRow(3));
		}
		else
		{
			UNROLL
			for (uint RegisterRowIndex = 0; RegisterRowIndex < RegisterRowCount; RegisterRowIndex++)
			{
				R.Registers.SetRegisterRow(RegisterRowIndex, Registers.GetRegisterRow(RegisterRowIndex) > B.Registers.GetRegisterRow(RegisterRowIndex));
			}
		}
		return R;
	}

	/** Returns the row-wise result of (this < B). */
	CALL_SITE_DEBUGLOC
	TLaneVector2D operator < (TLaneVector2D B)
	{
		TLaneVector2D R;
		if (RegisterRowCount == 2)
		{
			R.Registers.SetRegisterRow(0, Registers.GetRegisterRow(0) < B.Registers.GetRegisterRow(0));
			R.Registers.SetRegisterRow(1, Registers.GetRegisterRow(1) < B.Registers.GetRegisterRow(1));
		}
		else if (RegisterRowCount == 4)
		{
			R.Registers.SetRegisterRow(0, Registers.GetRegisterRow(0) < B.Registers.GetRegisterRow(0));
			R.Registers.SetRegisterRow(1, Registers.GetRegisterRow(1) < B.Registers.GetRegisterRow(1));
			R.Registers.SetRegisterRow(2, Registers.GetRegisterRow(2) < B.Registers.GetRegisterRow(2));
			R.Registers.SetRegisterRow(3, Registers.GetRegisterRow(3) < B.Registers.GetRegisterRow(3));
		}
		else
		{
			UNROLL
			for (uint RegisterRowIndex = 0; RegisterRowIndex < RegisterRowCount; RegisterRowIndex++)
			{
				R.Registers.SetRegisterRow(RegisterRowIndex, Registers.GetRegisterRow(RegisterRowIndex) < B.Registers.GetRegisterRow(RegisterRowIndex));
			}
		}
		return R;
	}

	/** Returns the row-wise result of (this != B). */
	CALL_SITE_DEBUGLOC
	TLaneVector2D operator != (TLaneVector2D B)
	{
		TLaneVector2D R;
		if (RegisterRowCount == 2)
		{
			R.Registers.SetRegisterRow(0, Registers.GetRegisterRow(0) != B.Registers.GetRegisterRow(0));
			R.Registers.SetRegisterRow(1, Registers.GetRegisterRow(1) != B.Registers.GetRegisterRow(1));
		}
		else if (RegisterRowCount == 4)
		{
			R.Registers.SetRegisterRow(0, Registers.GetRegisterRow(0) != B.Registers.GetRegisterRow(0));
			R.Registers.SetRegisterRow(1, Registers.GetRegisterRow(1) != B.Registers.GetRegisterRow(1));
			R.Registers.SetRegisterRow(2, Registers.GetRegisterRow(2) != B.Registers.GetRegisterRow(2));
			R.Registers.SetRegisterRow(3, Registers.GetRegisterRow(3) != B.Registers.GetRegisterRow(3));
		}
		else
		{
			UNROLL
			for (uint RegisterRowIndex = 0; RegisterRowIndex < RegisterRowCount; RegisterRowIndex++)
			{
				R.Registers.SetRegisterRow(RegisterRowIndex, Registers.GetRegisterRow(RegisterRowIndex) != B.Registers.GetRegisterRow(RegisterRowIndex));
			}
		}
		return R;
	}

	/** Returns the row-wise result of (this == B). */
	CALL_SITE_DEBUGLOC
	TLaneVector2D operator == (TLaneVector2D B)
	{
		TLaneVector2D R;
		if (RegisterRowCount == 2)
		{
			R.Registers.SetRegisterRow(0, Registers.GetRegisterRow(0) == B.Registers.GetRegisterRow(0));
			R.Registers.SetRegisterRow(1, Registers.GetRegisterRow(1) == B.Registers.GetRegisterRow(1));
		}
		else if (RegisterRowCount == 4)
		{
			R.Registers.SetRegisterRow(0, Registers.GetRegisterRow(0) == B.Registers.GetRegisterRow(0));
			R.Registers.SetRegisterRow(1, Registers.GetRegisterRow(1) == B.Registers.GetRegisterRow(1));
			R.Registers.SetRegisterRow(2, Registers.GetRegisterRow(2) == B.Registers.GetRegisterRow(2));
			R.Registers.SetRegisterRow(3, Registers.GetRegisterRow(3) == B.Registers.GetRegisterRow(3));
		}
		else
		{
			UNROLL
			for (uint RegisterRowIndex = 0; RegisterRowIndex < RegisterRowCount; RegisterRowIndex++)
			{
				R.Registers.SetRegisterRow(RegisterRowIndex, Registers.GetRegisterRow(RegisterRowIndex) == B.Registers.GetRegisterRow(RegisterRowIndex));
			}
		}
		return R;
	}
}; // TLaneVector2D


/**
 * Returns a copy of A with DestVectorSize components per element.
 * The result is first initialised with SetAllElements(ScalarType(0)), then the first
 * min(VectorSize, DestVectorSize) components of every element are copied from A.
 */
CALL_SITE_DEBUGLOC
template
TLaneVector2D ResizeChannels(
	TLaneVector2D A)
{
	TLaneVector2D R;
	R.SetAllElements(ScalarType(0));

	UNROLL
	for (uint ElementIndex = 0; ElementIndex < SimdSizeX * SimdSizeY; ElementIndex++)
	{
		UNROLL
		for (uint ComponentIndex = 0; ComponentIndex < min(VectorSize, DestVectorSize); ComponentIndex++)
		{
			R.Registers.SetElementComponent(ElementIndex, ComponentIndex, A.Registers.GetElementComponent(ElementIndex, ComponentIndex));
		}
	}
	return R;
}

/**
 * Builds, for every element, a vector made of A's VectorSizeA components followed by
 * B's VectorSizeB components.
 */
CALL_SITE_DEBUGLOC
template
TLaneVector2D Concatenate(
	TLaneVector2D A,
	TLaneVector2D B)
{
	TLaneVector2D R;

	UNROLL
	for (uint ElementIndex = 0; ElementIndex < SimdSizeX * SimdSizeY; ElementIndex++)
	{
		UNROLL
		for (uint ComponentIndex = 0; ComponentIndex < VectorSizeA; ComponentIndex++)
		{
			R.Registers.SetElementComponent(ElementIndex, ComponentIndex, A.Registers.GetElementComponent(ElementIndex, ComponentIndex));
		}

		// B's components are appended after the VectorSizeA components taken from A.
		UNROLL
		for (uint ComponentIndex = 0; ComponentIndex < VectorSizeB; ComponentIndex++)
		{
			R.Registers.SetElementComponent(ElementIndex, VectorSizeA + ComponentIndex, B.Registers.GetElementComponent(ElementIndex, ComponentIndex));
		}
	}
	return R;
}

/**
 * Three-input variant: for every element the result holds A's components, then B's,
 * then C's, at component offsets 0, VectorSizeA and VectorSizeA + VectorSizeB.
 */
CALL_SITE_DEBUGLOC
template
TLaneVector2D Concatenate(
	TLaneVector2D A,
	TLaneVector2D B,
	TLaneVector2D C)
{
	TLaneVector2D R;

	UNROLL
	for (uint ElementIndex = 0; ElementIndex < SimdSizeX * SimdSizeY; ElementIndex++)
	{
		UNROLL
		for (uint ComponentIndex = 0; ComponentIndex < VectorSizeA; ComponentIndex++)
		{
			R.Registers.SetElementComponent(ElementIndex, ComponentIndex, A.Registers.GetElementComponent(ElementIndex, ComponentIndex));
		}

		UNROLL
		for (uint ComponentIndex = 0; ComponentIndex < VectorSizeB; ComponentIndex++)
		{
			R.Registers.SetElementComponent(ElementIndex, VectorSizeA + ComponentIndex, B.Registers.GetElementComponent(ElementIndex, ComponentIndex));
		}

		UNROLL
		for (uint ComponentIndex = 0; ComponentIndex < VectorSizeC; ComponentIndex++)
		{
			R.Registers.SetElementComponent(ElementIndex, VectorSizeA + VectorSizeB + ComponentIndex, C.Registers.GetElementComponent(ElementIndex, ComponentIndex));
		}
	}
	return R;
}

/**
 * Splits M into A and B: for every element, A receives M's first VectorSizeA components
 * and B receives the VectorSizeB components that follow them.
 */
CALL_SITE_DEBUGLOC
template
void Deconcatenate(
	TLaneVector2D M,
	out TLaneVector2D A,
	out TLaneVector2D B)
{
	UNROLL
	for (uint ElementIndex = 0; ElementIndex < SimdSizeX * SimdSizeY; ElementIndex++)
	{
		UNROLL
		for (uint ComponentIndex = 0; ComponentIndex < VectorSizeA; ComponentIndex++)
		{
			A.Registers.SetElementComponent(ElementIndex, ComponentIndex, M.Registers.GetElementComponent(ElementIndex, ComponentIndex));
		}

		UNROLL
		for (uint ComponentIndex = 0; ComponentIndex < VectorSizeB; ComponentIndex++)
		{
			B.Registers.SetElementComponent(ElementIndex, ComponentIndex, M.Registers.GetElementComponent(ElementIndex, VectorSizeA + ComponentIndex));
		}
	}
}

/**
 * Three-output variant: A, B and C receive M's components starting at offsets
 * 0, VectorSizeA and VectorSizeA + VectorSizeB respectively.
 */
CALL_SITE_DEBUGLOC
template
void Deconcatenate(
	TLaneVector2D M,
	out TLaneVector2D A,
	out TLaneVector2D B,
	out TLaneVector2D C)
{
	UNROLL
	for (uint ElementIndex = 0; ElementIndex < SimdSizeX * SimdSizeY; ElementIndex++)
	{
		UNROLL
		for (uint ComponentIndex = 0; ComponentIndex < VectorSizeA; ComponentIndex++)
		{
			A.Registers.SetElementComponent(ElementIndex, ComponentIndex, M.Registers.GetElementComponent(ElementIndex, ComponentIndex));
		}

		UNROLL
		for (uint ComponentIndex = 0; ComponentIndex < VectorSizeB; ComponentIndex++)
		{
			B.Registers.SetElementComponent(ElementIndex, ComponentIndex, M.Registers.GetElementComponent(ElementIndex, VectorSizeA + ComponentIndex));
		}

		UNROLL
		for (uint ComponentIndex = 0; ComponentIndex < VectorSizeC; ComponentIndex++)
		{
			C.Registers.SetElementComponent(ElementIndex, ComponentIndex, M.Registers.GetElementComponent(ElementIndex, VectorSizeA + VectorSizeB + ComponentIndex));
		}
	}
}


// ------------- 1 parameter

// Defines a function named AliasFunctionName that applies FunctionName to every register row of A.
//   ReturnScalarType:  presumably the scalar type of the returned vector - TODO confirm against the template arguments.
//   FunctionName:      function applied to each register row.
//   AliasFunctionName: name of the generated function.
// The definition ends on its closing brace without a trailing line continuation, so it does
// not depend on the line that follows it.
#define TVECTOR_FUNCTION_1PARAMS_ALIASE_FUNCTION_NAME(ReturnScalarType, FunctionName,AliasFunctionName) \
	CALL_SITE_DEBUGLOC \
	template \
	TLaneVector2D AliasFunctionName( \
		TLaneVector2D A) \
	{ \
		TLaneVector2D R; \
		if(TLaneVector2D::RegisterRowCount == 2) \
		{ \
			R.Registers.SetRegisterRow(0, FunctionName(A.Registers.GetRegisterRow(0))); \
			R.Registers.SetRegisterRow(1, FunctionName(A.Registers.GetRegisterRow(1))); \
		} \
		else if(TLaneVector2D::RegisterRowCount == 4) \
		{ \
			R.Registers.SetRegisterRow(0, FunctionName(A.Registers.GetRegisterRow(0))); \
			R.Registers.SetRegisterRow(1, FunctionName(A.Registers.GetRegisterRow(1))); \
			R.Registers.SetRegisterRow(2, FunctionName(A.Registers.GetRegisterRow(2))); \
			R.Registers.SetRegisterRow(3, FunctionName(A.Registers.GetRegisterRow(3))); \
		} \
		else \
		{ \
			UNROLL for (uint RegisterRowIndex = 0; RegisterRowIndex < TLaneVector2D::RegisterRowCount; RegisterRowIndex++) { \
				R.Registers.SetRegisterRow(RegisterRowIndex, FunctionName(A.Registers.GetRegisterRow(RegisterRowIndex))); \
			} \
		} \
		return R; \
	}

// Same as above, with the generated function carrying the same name as the wrapped one.
#define TVECTOR_FUNCTION_1PARAMS(ReturnScalarType, FunctionName) TVECTOR_FUNCTION_1PARAMS_ALIASE_FUNCTION_NAME(ReturnScalarType, FunctionName, FunctionName)

TVECTOR_FUNCTION_1PARAMS(ScalarType, log);
TVECTOR_FUNCTION_1PARAMS(ScalarType, log2);
TVECTOR_FUNCTION_1PARAMS(ScalarType, sqrt);
TVECTOR_FUNCTION_1PARAMS(ScalarType, rsqrt);
TVECTOR_FUNCTION_1PARAMS(ScalarType, exp);
TVECTOR_FUNCTION_1PARAMS(ScalarType, rcp);
TVECTOR_FUNCTION_1PARAMS(ScalarType, saturate);
TVECTOR_FUNCTION_1PARAMS(ScalarType, abs);
TVECTOR_FUNCTION_1PARAMS(ScalarType, floor);
TVECTOR_FUNCTION_1PARAMS(ScalarType, ceil);
TVECTOR_FUNCTION_1PARAMS(ScalarType, round);
TVECTOR_FUNCTION_1PARAMS(ScalarType, fast_sign);
TVECTOR_FUNCTION_1PARAMS(bool, not);
TVECTOR_FUNCTION_1PARAMS(uint, bit_not);

#if PLATFORM_SUPPORTS_REAL_TYPES
TVECTOR_FUNCTION_1PARAMS(uint16_t, asuint16);
#endif

// Work around: the asuint and asfloat keywords cannot be used on struct, so the wrappers are named azuint and azfloat.
TVECTOR_FUNCTION_1PARAMS_ALIASE_FUNCTION_NAME(uint, asuint, azuint);
TVECTOR_FUNCTION_1PARAMS_ALIASE_FUNCTION_NAME(float,asfloat, azfloat);


// ------------- 2 parameters

// Defines FunctionName(A, B) for lane vectors: FunctionName is applied to each pair of
// matching register rows of A and B. Row counts of 2 and 4 are written out explicitly,
// any other count goes through an UNROLL loop.
// The definition ends on its closing brace without a trailing line continuation, so it does
// not depend on the line that follows it.
#define TVECTOR_FUNCTION_2PARAMS(FunctionName) \
	CALL_SITE_DEBUGLOC \
	template \
	TLaneVector2D FunctionName( \
		TLaneVector2D A, \
		TLaneVector2D B) \
	{ \
		TLaneVector2D R; \
		if (TLaneVector2D::RegisterRowCount == 2) \
		{ \
			R.Registers.SetRegisterRow(0, FunctionName(A.Registers.GetRegisterRow(0), B.Registers.GetRegisterRow(0))); \
			R.Registers.SetRegisterRow(1, FunctionName(A.Registers.GetRegisterRow(1), B.Registers.GetRegisterRow(1))); \
		} \
		else if (TLaneVector2D::RegisterRowCount == 4) \
		{ \
			R.Registers.SetRegisterRow(0, FunctionName(A.Registers.GetRegisterRow(0), B.Registers.GetRegisterRow(0))); \
			R.Registers.SetRegisterRow(1, FunctionName(A.Registers.GetRegisterRow(1), B.Registers.GetRegisterRow(1))); \
			R.Registers.SetRegisterRow(2, FunctionName(A.Registers.GetRegisterRow(2), B.Registers.GetRegisterRow(2))); \
			R.Registers.SetRegisterRow(3, FunctionName(A.Registers.GetRegisterRow(3), B.Registers.GetRegisterRow(3))); \
		} \
		else \
		{ \
			UNROLL for (uint RegisterRowIndex = 0; RegisterRowIndex < TLaneVector2D::RegisterRowCount; RegisterRowIndex++) { \
				R.Registers.SetRegisterRow(RegisterRowIndex, FunctionName(A.Registers.GetRegisterRow(RegisterRowIndex), B.Registers.GetRegisterRow(RegisterRowIndex))); \
			} \
		} \
		return R; \
	}

TVECTOR_FUNCTION_2PARAMS(pow);
TVECTOR_FUNCTION_2PARAMS(min);
TVECTOR_FUNCTION_2PARAMS(max);
TVECTOR_FUNCTION_2PARAMS(and_internal);
TVECTOR_FUNCTION_2PARAMS(or_internal);
TVECTOR_FUNCTION_2PARAMS(bit_and);
TVECTOR_FUNCTION_2PARAMS(bit_or);
TVECTOR_FUNCTION_2PARAMS(bit_shift_left);
TVECTOR_FUNCTION_2PARAMS(bit_shift_right);


// ------------- 2 parameters but different return dimension

// Defines FunctionName(A, B) for lane vectors where the result has a different vector size
// than the inputs. Unlike TVECTOR_FUNCTION_2PARAMS this works element by element
// (GetElement / SetElement) rather than register row by register row.
//   ReturnVectorSize: presumably the vector size of the returned lane vector - TODO confirm against the template arguments.
// The definition ends on its closing brace without a trailing line continuation.
#define TVECTOR_FUNCTION_2PARAMS_DIFF_RETURN(ReturnVectorSize, FunctionName) \
	CALL_SITE_DEBUGLOC \
	template \
	TLaneVector2D FunctionName( \
		TLaneVector2D A, \
		TLaneVector2D B) \
	{ \
		TLaneVector2D R; \
		if (SimdSizeX * SimdSizeY == 2) \
		{ \
			R.SetElement(0, FunctionName(A.GetElement(0), B.GetElement(0))); \
			R.SetElement(1, FunctionName(A.GetElement(1), B.GetElement(1))); \
		} \
		else if (SimdSizeX * SimdSizeY == 4) \
		{ \
			R.SetElement(0, FunctionName(A.GetElement(0), B.GetElement(0))); \
			R.SetElement(1, FunctionName(A.GetElement(1), B.GetElement(1))); \
			R.SetElement(2, FunctionName(A.GetElement(2), B.GetElement(2))); \
			R.SetElement(3, FunctionName(A.GetElement(3), B.GetElement(3))); \
		} \
		else \
		{ \
			UNROLL for (uint ElementIndex = 0; ElementIndex < SimdSizeX * SimdSizeY; ElementIndex++) { \
				R.SetElement(ElementIndex, FunctionName(A.GetElement(ElementIndex), B.GetElement(ElementIndex))); \
			} \
		} \
		return R; \
	}

TVECTOR_FUNCTION_2PARAMS_DIFF_RETURN(/* VectorSize = */ 1, dot);


// ------------- 3 parameters

// Defines FunctionName(A, B, C) for lane vectors: FunctionName is applied to each triple of
// matching register rows of A, B and C.
//   ScalarTypeA: presumably the scalar type of the first argument (bool for select_internal) - TODO confirm against the template arguments.
// The definition ends on its closing brace without a trailing line continuation.
#define TVECTOR_FUNCTION_3PARAMS(FunctionName, ScalarTypeA) \
	CALL_SITE_DEBUGLOC \
	template \
	TLaneVector2D FunctionName( \
		TLaneVector2D A, \
		TLaneVector2D B, \
		TLaneVector2D C) \
	{ \
		TLaneVector2D R; \
		if (TLaneVector2D::RegisterRowCount == 2) \
		{ \
			R.Registers.SetRegisterRow(0, FunctionName(A.Registers.GetRegisterRow(0), B.Registers.GetRegisterRow(0), C.Registers.GetRegisterRow(0))); \
			R.Registers.SetRegisterRow(1, FunctionName(A.Registers.GetRegisterRow(1), B.Registers.GetRegisterRow(1), C.Registers.GetRegisterRow(1))); \
		} \
		else if (TLaneVector2D::RegisterRowCount == 4) \
		{ \
			R.Registers.SetRegisterRow(0, FunctionName(A.Registers.GetRegisterRow(0), B.Registers.GetRegisterRow(0), C.Registers.GetRegisterRow(0))); \
			R.Registers.SetRegisterRow(1, FunctionName(A.Registers.GetRegisterRow(1), B.Registers.GetRegisterRow(1), C.Registers.GetRegisterRow(1))); \
			R.Registers.SetRegisterRow(2, FunctionName(A.Registers.GetRegisterRow(2), B.Registers.GetRegisterRow(2), C.Registers.GetRegisterRow(2))); \
			R.Registers.SetRegisterRow(3, FunctionName(A.Registers.GetRegisterRow(3), B.Registers.GetRegisterRow(3), C.Registers.GetRegisterRow(3))); \
		} \
		else \
		{ \
			UNROLL for (uint RegisterRowIndex = 0; RegisterRowIndex < TLaneVector2D::RegisterRowCount; RegisterRowIndex++) { \
				R.Registers.SetRegisterRow(RegisterRowIndex, FunctionName(A.Registers.GetRegisterRow(RegisterRowIndex), B.Registers.GetRegisterRow(RegisterRowIndex), C.Registers.GetRegisterRow(RegisterRowIndex))); \
			} \
		} \
		return R; \
	}

TVECTOR_FUNCTION_3PARAMS(select_internal, bool);
TVECTOR_FUNCTION_3PARAMS(clamp, ScalarType);
TVECTOR_FUNCTION_3PARAMS(lerp, ScalarType);
TVECTOR_FUNCTION_3PARAMS(min3, ScalarType);
TVECTOR_FUNCTION_3PARAMS(max3, ScalarType);

#if COMPILER_SUPPORTS_MED3
TVECTOR_FUNCTION_3PARAMS(med3, ScalarType);
#endif


// ------------- AnyElement & AllElement

/** For every element of A, stores any() of that element in the matching element of the result. */
CALL_SITE_DEBUGLOC
template
TLaneVector2D AnyElement(TLaneVector2D A)
{
	TLaneVector2D R;

	// Element 0 is always handled here; the branches below cover the elements from index 1 on.
	R.SetElement(0, any(A.GetElement(0)));

	if (SimdSizeX * SimdSizeY == 2)
	{
		R.SetElement(1, any(A.GetElement(1)));
	}
	else if (SimdSizeX * SimdSizeY == 4)
	{
		R.SetElement(1, any(A.GetElement(1)));
		R.SetElement(2, any(A.GetElement(2)));
		R.SetElement(3, any(A.GetElement(3)));
	}
	else
	{
		UNROLL
		for (uint SimdIndex = 1; SimdIndex < SimdSizeX * SimdSizeY; SimdIndex++)
		{
			R.SetElement(SimdIndex, any(A.GetElement(SimdIndex)));
		}
	}
	return R;
}

/** For every element of A, stores all() of that element in the matching element of the result. */
CALL_SITE_DEBUGLOC
template
TLaneVector2D AllElement(TLaneVector2D A)
{
	TLaneVector2D R;

	// Element 0 is always handled here; the branches below cover the elements from index 1 on.
	R.SetElement(0, all(A.GetElement(0)));

	if (SimdSizeX * SimdSizeY == 2)
	{
		R.SetElement(1, all(A.GetElement(1)));
	}
	else if (SimdSizeX * SimdSizeY == 4)
	{
		R.SetElement(1, all(A.GetElement(1)));
		R.SetElement(2, all(A.GetElement(2)));
		R.SetElement(3, all(A.GetElement(3)));
	}
	else
	{
		UNROLL
		for (uint SimdIndex = 1; SimdIndex < SimdSizeX * SimdSizeY; SimdIndex++)
		{
			R.SetElement(SimdIndex, all(A.GetElement(SimdIndex)));
		}
	}
	return R;
}

/** Returns true when any() is true for at least one element of A. */
CALL_SITE_DEBUGLOC
template
bool AnyComponent(TLaneVector2D A)
{
	// Start from element 0, then OR in the any() of every remaining element.
	bool R = any(A.GetElement(0));

	if (SimdSizeX * SimdSizeY == 2)
	{
		R = R || any(A.GetElement(1));
	}
	else if (SimdSizeX * SimdSizeY == 4)
	{
		R = R || any(A.GetElement(1));
		R = R || any(A.GetElement(2));
		R = R || any(A.GetElement(3));
	}
	else
	{
		UNROLL
		for (uint SimdIndex = 1; SimdIndex < SimdSizeX * SimdSizeY; SimdIndex++)
		{
			R = R || any(A.GetElement(SimdIndex));
		}
	}
	return R;
}