Files
UnrealEngine/Engine/Shaders/Public/LaneVectorization.ush
2025-05-18 13:04:45 +08:00

2269 lines
64 KiB
HLSL

// Copyright Epic Games, Inc. All Rights Reserved.
/*=============================================================================
LaneVectorization.ush: Vectorize arbitrary number of processing per lane.
=============================================================================*/
#pragma once
#include "Platform.ush"
#include "WaveBroadcastIntrinsics.ush"
//------------------------------------------------------- DEFINE
// Register layout template used by TLaneVector2D to store its elements.
// Define TENSOR_REGISTER_LAYOUT before including this file to select another FRegisters_* layout.
#ifndef TENSOR_REGISTER_LAYOUT
#define TENSOR_REGISTER_LAYOUT FRegisters_AoS_VectorArray
#endif
// When non-zero, TLaneVector2D::TightenRegisters() routes every register row through PackRegistersTightly();
// when zero (the default) TightenRegisters() is a no-op.
#ifndef TENSOR_REGISTER_TIGHTENING
#define TENSOR_REGISTER_TIGHTENING 0
#endif
//------------------------------------------------------- GLOBAL VARIABLE
// File-scope group thread index, initialized to 0. It is added to the DespilledData index in TLaneVector2D::StoreLDSSpill().
// NOTE(review): presumably assigned by the including shader's entry point before any LDS spill is used — confirm.
static uint GGroupThreadIndex = 0;
//------------------------------------------------------- STANDARD SCALAR OPERATIONS
/** Logical NOT of a scalar boolean, exposed as a named function. */
bool not(bool x)
{
	const bool Negated = !x;
	return Negated;
}

/** Component-wise logical NOT of a bool2. */
bool2 not(bool2 x)
{
	const bool2 Negated = !x;
	return Negated;
}

/** Component-wise logical NOT of a bool3. */
bool3 not(bool3 x)
{
	const bool3 Negated = !x;
	return Negated;
}

/** Component-wise logical NOT of a bool4. */
bool4 not(bool4 x)
{
	const bool4 Negated = !x;
	return Negated;
}
/** Bitwise complement of a 32-bit unsigned integer. */
uint bit_not(uint x)
{
	const uint Complement = ~x;
	return Complement;
}

/** Component-wise bitwise complement of a uint2. */
uint2 bit_not(uint2 x)
{
	const uint2 Complement = ~x;
	return Complement;
}

#if PLATFORM_SUPPORTS_REAL_TYPES

/** Bitwise complement of a 16-bit unsigned integer. */
uint16_t bit_not(uint16_t x)
{
	const uint16_t Complement = ~x;
	return Complement;
}

/** Component-wise bitwise complement of a uint16_t2. */
uint16_t2 bit_not(uint16_t2 x)
{
	const uint16_t2 Complement = ~x;
	return Complement;
}

#endif
/** Builds a bool2 whose x is a and whose y is b. */
bool2 v_pack_b32_b16(bool a, bool b)
{
	bool2 Pair;
	Pair.x = a;
	Pair.y = b;
	return Pair;
}

/** Builds a float2 whose x is a and whose y is b. */
float2 v_pack_b32_b16(float a, float b)
{
	float2 Pair;
	Pair.x = a;
	Pair.y = b;
	return Pair;
}

/** Builds a uint2 whose x is a and whose y is b. */
uint2 v_pack_b32_b16(uint a, uint b)
{
	uint2 Pair;
	Pair.x = a;
	Pair.y = b;
	return Pair;
}

/** Builds an int2 whose x is a and whose y is b. */
int2 v_pack_b32_b16(int a, int b)
{
	int2 Pair;
	Pair.x = a;
	Pair.y = b;
	return Pair;
}
/**
 * Approximates sign(x): x is scaled by the largest finite float (bit pattern 0x7f7fffff) and the
 * product is clamped to [-1, 1]. Zero stays zero; inputs whose scaled magnitude remains below 1
 * are returned as that scaled value rather than +/-1.
 */
float fast_sign(float x)
{
	const float LargestFloat = asfloat(0x7f7fffff);
	const float Scaled = x * LargestFloat;
	return clamp(Scaled, float(-1.0), float(1.0));
}

/** Component-wise float2 variant of fast_sign(). */
float2 fast_sign(float2 x)
{
	const float LargestFloat = asfloat(0x7f7fffff);
	const float2 Scaled = x * LargestFloat;
	return clamp(Scaled, float(-1.0), float(1.0));
}

#if PLATFORM_SUPPORTS_REAL_TYPES

/** Half precision variant of fast_sign(): scales by the largest finite half (bit pattern 0x7bff) before clamping to [-1, 1]. */
half fast_sign(half x)
{
	const half LargestHalf = asfloat16(uint16_t(0x7bff));
	const half Scaled = x * LargestHalf;
	return clamp(Scaled, half(-1.0), half(1.0));
}

/** Component-wise half2 variant of fast_sign(). */
half2 fast_sign(half2 x)
{
	const half LargestHalf = asfloat16(uint16_t(0x7bff));
	const half2 Scaled = x * LargestHalf;
	return clamp(Scaled, half(-1.0), half(1.0));
}

#endif
/** Bitwise AND of two 32-bit unsigned integers. */
uint bit_and(uint a, uint b)
{
	const uint Masked = a & b;
	return Masked;
}

/** Component-wise bitwise AND of two uint2. */
uint2 bit_and(uint2 a, uint2 b)
{
	const uint2 Masked = a & b;
	return Masked;
}

/** Bitwise OR of two 32-bit unsigned integers. */
uint bit_or(uint a, uint b)
{
	const uint Merged = a | b;
	return Merged;
}

/** Component-wise bitwise OR of two uint2. */
uint2 bit_or(uint2 a, uint2 b)
{
	const uint2 Merged = a | b;
	return Merged;
}

#if PLATFORM_SUPPORTS_REAL_TYPES

/** Bitwise AND of two 16-bit unsigned integers. */
uint16_t bit_and(uint16_t a, uint16_t b)
{
	const uint16_t Masked = a & b;
	return Masked;
}

/** Component-wise bitwise AND of two uint16_t2. */
uint16_t2 bit_and(uint16_t2 a, uint16_t2 b)
{
	const uint16_t2 Masked = a & b;
	return Masked;
}

/** Bitwise OR of two 16-bit unsigned integers. */
uint16_t bit_or(uint16_t a, uint16_t b)
{
	const uint16_t Merged = a | b;
	return Merged;
}

/** Component-wise bitwise OR of two uint16_t2. */
uint16_t2 bit_or(uint16_t2 a, uint16_t2 b)
{
	const uint16_t2 Merged = a | b;
	return Merged;
}

#endif
/** Shifts a left by b bits. */
uint bit_shift_left(uint a, uint b)
{
	const uint Shifted = a << b;
	return Shifted;
}

/** Component-wise left shift of a by b. */
uint2 bit_shift_left(uint2 a, uint2 b)
{
	const uint2 Shifted = a << b;
	return Shifted;
}

/** Shifts a right by b bits. */
uint bit_shift_right(uint a, uint b)
{
	const uint Shifted = a >> b;
	return Shifted;
}

/** Component-wise right shift of a by b. */
uint2 bit_shift_right(uint2 a, uint2 b)
{
	const uint2 Shifted = a >> b;
	return Shifted;
}

#if PLATFORM_SUPPORTS_REAL_TYPES

/** 16-bit variant: shifts a left by b bits. */
uint16_t bit_shift_left(uint16_t a, uint16_t b)
{
	const uint16_t Shifted = a << b;
	return Shifted;
}

/** 16-bit variant: component-wise left shift of a by b. */
uint16_t2 bit_shift_left(uint16_t2 a, uint16_t2 b)
{
	const uint16_t2 Shifted = a << b;
	return Shifted;
}

/** 16-bit variant: shifts a right by b bits. */
uint16_t bit_shift_right(uint16_t a, uint16_t b)
{
	const uint16_t Shifted = a >> b;
	return Shifted;
}

/** 16-bit variant: component-wise right shift of a by b. */
uint16_t2 bit_shift_right(uint16_t2 a, uint16_t2 b)
{
	const uint16_t2 Shifted = a >> b;
	return Shifted;
}

#endif
//------------------------------------------------------- TIGHT REGISTERS
// PackRegistersTightly() overloads for bool and 32-bit types: each one returns its argument unchanged.

// bool
bool PackRegistersTightly(bool v) { return v; }
bool2 PackRegistersTightly(bool2 v) { return v; }
bool3 PackRegistersTightly(bool3 v) { return v; }
bool4 PackRegistersTightly(bool4 v) { return v; }

// float
float PackRegistersTightly(float v) { return v; }
float2 PackRegistersTightly(float2 v) { return v; }
float3 PackRegistersTightly(float3 v) { return v; }
float4 PackRegistersTightly(float4 v) { return v; }

// uint
uint PackRegistersTightly(uint v) { return v; }
uint2 PackRegistersTightly(uint2 v) { return v; }
uint3 PackRegistersTightly(uint3 v) { return v; }
uint4 PackRegistersTightly(uint4 v) { return v; }

// int
int PackRegistersTightly(int v) { return v; }
int2 PackRegistersTightly(int2 v) { return v; }
int3 PackRegistersTightly(int3 v) { return v; }
int4 PackRegistersTightly(int4 v) { return v; }
#if PLATFORM_SUPPORTS_REAL_TYPES

// PackRegistersTightly() overloads for 16-bit types: scalars are returned unchanged, while every
// full pair of components (xy, zw) is routed through v_pack_b32_b16(). A trailing third component is passed as is.
// NOTE(review): no 16-bit overload of v_pack_b32_b16() is visible in this file — presumably one is provided
// by the platform headers; confirm which overload is selected here.

// half
half PackRegistersTightly(half v) { return v; }
half2 PackRegistersTightly(half2 v) { return v_pack_b32_b16(v.x, v.y); }
half3 PackRegistersTightly(half3 v) { return half3(v_pack_b32_b16(v.x, v.y), v.z); }
half4 PackRegistersTightly(half4 v) { return half4(v_pack_b32_b16(v.x, v.y), v_pack_b32_b16(v.z, v.w)); }

// uint16_t
uint16_t PackRegistersTightly(uint16_t v) { return v; }
uint16_t2 PackRegistersTightly(uint16_t2 v) { return v_pack_b32_b16(v.x, v.y); }
uint16_t3 PackRegistersTightly(uint16_t3 v) { return uint16_t3(v_pack_b32_b16(v.x, v.y), v.z); }
uint16_t4 PackRegistersTightly(uint16_t4 v) { return uint16_t4(v_pack_b32_b16(v.x, v.y), v_pack_b32_b16(v.z, v.w)); }

// int16_t
int16_t PackRegistersTightly(int16_t v) { return v; }
int16_t2 PackRegistersTightly(int16_t2 v) { return v_pack_b32_b16(v.x, v.y); }
int16_t3 PackRegistersTightly(int16_t3 v) { return int16_t3(v_pack_b32_b16(v.x, v.y), v.z); }
int16_t4 PackRegistersTightly(int16_t4 v) { return int16_t4(v_pack_b32_b16(v.x, v.y), v_pack_b32_b16(v.z, v.w)); }

#endif // PLATFORM_SUPPORTS_REAL_TYPES
//------------------------------------------------------- DWORD PACKING/UNPACKING
/** Returns the byte size of vector<ScalarType, VectorSize>, i.e. sizeof(ScalarType) times the component count. */
#define GetVectorByteSize(ScalarType, VectorSize) uint(sizeof(ScalarType) * uint(VectorSize))
/** Returns the dword size of vector<ScalarType, VectorSize>: its byte size rounded up to a whole number of 4-byte dwords. */
#define GetVectorDwordSize(ScalarType, VectorSize) (uint(GetVectorByteSize(ScalarType, VectorSize) + 3u) / 4u)
// uint
/** Copies a uint into a one dword array. */
void PackVectorToDwords(uint V, out uint DW[1])
{
	DW[0] = V;
}

/** Copies the components of a uint2 into consecutive dwords. */
void PackVectorToDwords(uint2 V, out uint DW[2])
{
	DW[0] = V[0];
	DW[1] = V[1];
}

/** Copies the components of a uint3 into consecutive dwords. */
void PackVectorToDwords(uint3 V, out uint DW[3])
{
	DW[0] = V[0];
	DW[1] = V[1];
	DW[2] = V[2];
}

/** Copies the components of a uint4 into consecutive dwords. */
void PackVectorToDwords(uint4 V, out uint DW[4])
{
	DW[0] = V[0];
	DW[1] = V[1];
	DW[2] = V[2];
	DW[3] = V[3];
}

/** Reads a uint back from a one dword array. */
void UnpackDwordsToVector(uint DW[1], out uint V)
{
	V = DW[0];
}

/** Rebuilds a uint2 from two consecutive dwords. */
void UnpackDwordsToVector(uint DW[2], out uint2 V)
{
	V = uint2(DW[0], DW[1]);
}

/** Rebuilds a uint3 from three consecutive dwords. */
void UnpackDwordsToVector(uint DW[3], out uint3 V)
{
	V = uint3(DW[0], DW[1], DW[2]);
}

/** Rebuilds a uint4 from four consecutive dwords. */
void UnpackDwordsToVector(uint DW[4], out uint4 V)
{
	V = uint4(DW[0], DW[1], DW[2], DW[3]);
}
// float
/** Stores the bit pattern of a float into a one dword array. */
void PackVectorToDwords(float V, out uint DW[1])
{
	const uint Bits = asuint(V);
	DW[0] = Bits;
}

/** Stores the bit patterns of a float2 into consecutive dwords. */
void PackVectorToDwords(float2 V, out uint DW[2])
{
	const uint2 Bits = asuint(V);
	DW[0] = Bits.x;
	DW[1] = Bits.y;
}

/** Stores the bit patterns of a float3 into consecutive dwords. */
void PackVectorToDwords(float3 V, out uint DW[3])
{
	const uint3 Bits = asuint(V);
	DW[0] = Bits.x;
	DW[1] = Bits.y;
	DW[2] = Bits.z;
}

/** Stores the bit patterns of a float4 into consecutive dwords. */
void PackVectorToDwords(float4 V, out uint DW[4])
{
	const uint4 Bits = asuint(V);
	DW[0] = Bits.x;
	DW[1] = Bits.y;
	DW[2] = Bits.z;
	DW[3] = Bits.w;
}

/** Reinterprets one dword as a float. */
void UnpackDwordsToVector(uint DW[1], out float V)
{
	V = asfloat(DW[0]);
}

/** Reinterprets two consecutive dwords as a float2. */
void UnpackDwordsToVector(uint DW[2], out float2 V)
{
	V = asfloat(uint2(DW[0], DW[1]));
}

/** Reinterprets three consecutive dwords as a float3. */
void UnpackDwordsToVector(uint DW[3], out float3 V)
{
	V = asfloat(uint3(DW[0], DW[1], DW[2]));
}

/** Reinterprets four consecutive dwords as a float4. */
void UnpackDwordsToVector(uint DW[4], out float4 V)
{
	V = asfloat(uint4(DW[0], DW[1], DW[2], DW[3]));
}
#if PLATFORM_SUPPORTS_REAL_TYPES
#if COMPILER_SUPPORT_UINT16_BITCAST

// uint16_t: pairs of components share one dword and are converted with the bit_cast_uint() / bit_cast_uint16_t2() helpers.

/** Widens a uint16_t into one dword. */
void PackVectorToDwords(uint16_t V, out uint DW[1])
{
	const uint Widened = uint(V);
	DW[0] = Widened;
}

/** Packs a uint16_t2 into one dword. */
void PackVectorToDwords(uint16_t2 V, out uint DW[1])
{
	DW[0] = bit_cast_uint(V);
}

/** Packs a uint16_t3 into two dwords: xy in the first, z widened into the second. */
void PackVectorToDwords(uint16_t3 V, out uint DW[2])
{
	const uint Widened = uint(V.z);
	DW[0] = bit_cast_uint(V.xy);
	DW[1] = Widened;
}

/** Packs a uint16_t4 into two dwords: xy in the first, zw in the second. */
void PackVectorToDwords(uint16_t4 V, out uint DW[2])
{
	DW[0] = bit_cast_uint(V.xy);
	DW[1] = bit_cast_uint(V.zw);
}

/** Narrows one dword back to a uint16_t. */
void UnpackDwordsToVector(uint DW[1], out uint16_t V)
{
	V = uint16_t(DW[0]);
}

/** Unpacks one dword into a uint16_t2. */
void UnpackDwordsToVector(uint DW[1], out uint16_t2 V)
{
	V = bit_cast_uint16_t2(DW[0]);
}

/** Unpacks two dwords into a uint16_t3: xy from the first, z narrowed from the second. */
void UnpackDwordsToVector(uint DW[2], out uint16_t3 V)
{
	V.xy = bit_cast_uint16_t2(DW[0]);
	V.z = uint16_t(DW[1]);
}

/** Unpacks two dwords into a uint16_t4: xy from the first, zw from the second. */
void UnpackDwordsToVector(uint DW[2], out uint16_t4 V)
{
	V.xy = bit_cast_uint16_t2(DW[0]);
	V.zw = bit_cast_uint16_t2(DW[1]);
}

#else // !COMPILER_SUPPORT_UINT16_BITCAST

// uint16_t: same dword layout, built with shifts and masks. The first component of a pair occupies
// bits 0..15 of the dword and the second one bits 16..31.

/** Widens a uint16_t into one dword. */
void PackVectorToDwords(uint16_t V, out uint DW[1])
{
	const uint Widened = uint(V);
	DW[0] = Widened;
}

/** Packs a uint16_t2 into one dword (x low, y high). */
void PackVectorToDwords(uint16_t2 V, out uint DW[1])
{
	const uint Low = uint(V.x);
	const uint High = uint(V.y) << 16u;
	DW[0] = Low | High;
}

/** Packs a uint16_t3 into two dwords: xy in the first (x low, y high), z widened into the second. */
void PackVectorToDwords(uint16_t3 V, out uint DW[2])
{
	const uint Low = uint(V.x);
	const uint High = uint(V.y) << 16u;
	DW[0] = Low | High;
	DW[1] = uint(V.z);
}

/** Packs a uint16_t4 into two dwords: xy in the first, zw in the second (first of each pair low). */
void PackVectorToDwords(uint16_t4 V, out uint DW[2])
{
	const uint LowXY = uint(V.x);
	const uint HighXY = uint(V.y) << 16u;
	const uint LowZW = uint(V.z);
	const uint HighZW = uint(V.w) << 16u;
	DW[0] = LowXY | HighXY;
	DW[1] = LowZW | HighZW;
}

/** Narrows one dword back to a uint16_t. */
void UnpackDwordsToVector(uint DW[1], out uint16_t V)
{
	V = uint16_t(DW[0]);
}

/** Unpacks one dword into a uint16_t2 (x from the low 16 bits, y from the high 16 bits). */
void UnpackDwordsToVector(uint DW[1], out uint16_t2 V)
{
	const uint Low = DW[0] & 0xFFFFu;
	const uint High = (DW[0] >> 16u) & 0xFFFFu;
	V.x = uint16_t(Low);
	V.y = uint16_t(High);
}

/** Unpacks two dwords into a uint16_t3: xy from the first dword, z from the low 16 bits of the second. */
void UnpackDwordsToVector(uint DW[2], out uint16_t3 V)
{
	const uint LowXY = DW[0] & 0xFFFFu;
	const uint HighXY = (DW[0] >> 16u) & 0xFFFFu;
	const uint LowZ = DW[1] & 0xFFFFu;
	V.x = uint16_t(LowXY);
	V.y = uint16_t(HighXY);
	V.z = uint16_t(LowZ);
}

/** Unpacks two dwords into a uint16_t4: xy from the first dword, zw from the second. */
void UnpackDwordsToVector(uint DW[2], out uint16_t4 V)
{
	const uint LowXY = DW[0] & 0xFFFFu;
	const uint HighXY = (DW[0] >> 16u) & 0xFFFFu;
	const uint LowZW = DW[1] & 0xFFFFu;
	const uint HighZW = (DW[1] >> 16u) & 0xFFFFu;
	V.x = uint16_t(LowXY);
	V.y = uint16_t(HighXY);
	V.z = uint16_t(LowZW);
	V.w = uint16_t(HighZW);
}

#endif // !COMPILER_SUPPORT_UINT16_BITCAST
// half
/** Packs a half by reinterpreting it as a uint16_t and reusing the uint16_t overload. */
void PackVectorToDwords(half V, out uint DW[1])
{
	const uint16_t Bits = asuint16(V);
	PackVectorToDwords(Bits, /* out */ DW);
}

/** Packs a half2 by reinterpreting each component as a uint16_t and reusing the uint16_t2 overload. */
void PackVectorToDwords(half2 V, out uint DW[1])
{
	const uint16_t2 Bits = uint16_t2(asuint16(V.x), asuint16(V.y));
	PackVectorToDwords(Bits, /* out */ DW);
}

/** Packs a half3 by reinterpreting each component as a uint16_t and reusing the uint16_t3 overload. */
void PackVectorToDwords(half3 V, out uint DW[2])
{
	const uint16_t3 Bits = uint16_t3(asuint16(V.x), asuint16(V.y), asuint16(V.z));
	PackVectorToDwords(Bits, /* out */ DW);
}

/** Packs a half4 by reinterpreting each component as a uint16_t and reusing the uint16_t4 overload. */
void PackVectorToDwords(half4 V, out uint DW[2])
{
	const uint16_t4 Bits = uint16_t4(asuint16(V.x), asuint16(V.y), asuint16(V.z), asuint16(V.w));
	PackVectorToDwords(Bits, /* out */ DW);
}

/** Unpacks a half: decodes a uint16_t first, then reinterprets its bits as a half. */
void UnpackDwordsToVector(uint DW[1], out half V)
{
	uint16_t Bits;
	UnpackDwordsToVector(DW, /* out */ Bits);
	V = asfloat16(Bits);
}

/** Unpacks a half2: decodes a uint16_t2 first, then reinterprets each component as a half. */
void UnpackDwordsToVector(uint DW[1], out half2 V)
{
	uint16_t2 Bits;
	UnpackDwordsToVector(DW, /* out */ Bits);
	V = half2(asfloat16(Bits.x), asfloat16(Bits.y));
}

/** Unpacks a half3: decodes a uint16_t3 first, then reinterprets each component as a half. */
void UnpackDwordsToVector(uint DW[2], out half3 V)
{
	uint16_t3 Bits;
	UnpackDwordsToVector(DW, /* out */ Bits);
	V = half3(asfloat16(Bits.x), asfloat16(Bits.y), asfloat16(Bits.z));
}

/** Unpacks a half4: decodes a uint16_t4 first, then reinterprets each component as a half. */
void UnpackDwordsToVector(uint DW[2], out half4 V)
{
	uint16_t4 Bits;
	UnpackDwordsToVector(DW, /* out */ Bits);
	V = half4(asfloat16(Bits.x), asfloat16(Bits.y), asfloat16(Bits.z), asfloat16(Bits.w));
}
#endif // PLATFORM_SUPPORTS_REAL_TYPES
//------------------------------------------------------- UTIL FUNCTION FOR MANUAL LOOP UNROLL
// Dword array accessors, one overload per supported array size (1 to 4).
// NOTE(review): presumably these exist so the manually unrolled callers can use constant indices through a
// function parameter instead of indexing a fixed-size array directly in branches that are dead for a given
// instantiation — confirm.

/** Returns DW[Index]. */
uint GetDW(uint DW[1], const uint Index) { return DW[Index]; }
uint GetDW(uint DW[2], const uint Index) { return DW[Index]; }
uint GetDW(uint DW[3], const uint Index) { return DW[Index]; }
uint GetDW(uint DW[4], const uint Index) { return DW[Index]; }

/** Writes V into DW[Index]. */
void SetDW(inout uint DW[1], const uint Index, uint V) { DW[Index] = V; }
void SetDW(inout uint DW[2], const uint Index, uint V) { DW[Index] = V; }
void SetDW(inout uint DW[3], const uint Index, uint V) { DW[Index] = V; }
void SetDW(inout uint DW[4], const uint Index, uint V) { DW[Index] = V; }
//------------------------------------------------------- GROUP SHARED COMMUNICATION
#if defined(LDS_SIZE)
// Group shared scratch buffer of LDS_SIZE * LDS_DWORD_COMPONENT_COUNT dwords.
// WriteVectorToLDS() / ReadVectorFromLDS() place dword i of the vector stored at SharedIndex at (SharedIndex + i * LDS_SIZE).
groupshared uint SharedData[LDS_SIZE * LDS_DWORD_COMPONENT_COUNT];
/** Stores V into SharedData at SharedIndex. */
void WriteDwordToLDS(const uint SharedIndex, uint V) { SharedData[SharedIndex] = V; }

/** Returns the dword stored in SharedData at SharedIndex. */
uint ReadDwordFromLDS(const uint SharedIndex) { return SharedData[SharedIndex]; }

/** Atomically adds V to the dword stored in SharedData at SharedIndex, using InterlockedAdd(). */
void AtomicIncrementLDSDword(const uint SharedIndex, uint V) { InterlockedAdd(/* inout */ SharedData[SharedIndex], V); }
/**
 * Writes a vector<> to SharedData. The vector is first packed into dwords with PackVectorToDwords();
 * dword i is then stored at (SharedIndex + i * LDS_SIZE).
 */
template<typename ScalarType, uint VectorSize>
void WriteVectorToLDS(const uint SharedIndex, vector<ScalarType, VectorSize> V)
{
	const uint DwordCount = GetVectorDwordSize(ScalarType, VectorSize);

	uint PackedDwords[GetVectorDwordSize(ScalarType, VectorSize)];
	PackVectorToDwords(V, /* out */ PackedDwords);

	// The one and two dword cases are unrolled by hand to reduce compile time; DwordCount is a
	// compile time constant, so the branches are compiled away.
	if (DwordCount == 1)
	{
		SharedData[SharedIndex + 0 * LDS_SIZE] = GetDW(PackedDwords, 0);
	}
	else if (DwordCount == 2)
	{
		SharedData[SharedIndex + 0 * LDS_SIZE] = GetDW(PackedDwords, 0);
		SharedData[SharedIndex + 1 * LDS_SIZE] = GetDW(PackedDwords, 1);
	}
	else
	{
		UNROLL
		for (uint DwordIndex = 0; DwordIndex < DwordCount; DwordIndex++)
		{
			SharedData[SharedIndex + DwordIndex * LDS_SIZE] = GetDW(PackedDwords, DwordIndex);
		}
	}
}
/**
 * Reads a vector<> back from SharedData. Dword i is fetched from (SharedIndex + i * LDS_SIZE), and the
 * gathered dwords are decoded with UnpackDwordsToVector().
 */
template<typename ScalarType, uint VectorSize>
void ReadVectorFromLDS(const uint SharedIndex, out vector<ScalarType, VectorSize> V)
{
	const uint DwordCount = GetVectorDwordSize(ScalarType, VectorSize);

	uint PackedDwords[GetVectorDwordSize(ScalarType, VectorSize)];

	// Same manual unrolling of the one and two dword cases as in WriteVectorToLDS().
	if (DwordCount == 1)
	{
		SetDW(PackedDwords, 0, SharedData[SharedIndex + 0 * LDS_SIZE]);
	}
	else if (DwordCount == 2)
	{
		SetDW(PackedDwords, 0, SharedData[SharedIndex + 0 * LDS_SIZE]);
		SetDW(PackedDwords, 1, SharedData[SharedIndex + 1 * LDS_SIZE]);
	}
	else
	{
		UNROLL
		for (uint DwordIndex = 0; DwordIndex < DwordCount; DwordIndex++)
		{
			SetDW(PackedDwords, DwordIndex, SharedData[SharedIndex + DwordIndex * LDS_SIZE]);
		}
	}

	UnpackDwordsToVector(PackedDwords, /* out */ V);
}
#endif // defined(LDS_SIZE)
//------------------------------------------------------- MANUAL LDS DESPILL
// Group shared buffer of LDS_DESPILL_DWORD_COUNT dwords, only declared when that macro is defined.
// TLaneVector2D::StoreLDSSpill() writes register rows into it.
#if defined(LDS_DESPILL_DWORD_COUNT)
groupshared uint DespilledData[LDS_DESPILL_DWORD_COUNT];
#endif // defined(LDS_DESPILL_DWORD_COUNT)
//------------------------------------------------------- REGISTERS LAYOUTS
/** Register layout holding exactly one vector<>. InElementCount is ignored by the layout and must be == 1. */
template<typename ScalarType, uint InVectorSize, uint InElementCount>
struct FRegisters_S_OneVector
{
	/** Number of components of the element. */
	static const uint VectorSize = InVectorSize;

	/** Number of elements: always one for this layout. */
	static const uint ElementCount = 1;

	/** Size and number of register rows: a single row that is the element itself. */
	static const uint RegisterRowSize = VectorSize;
	static const uint RegisterRowCount = 1;

	/** The stored element. */
	vector<ScalarType, VectorSize> E;

	// ------------- register rows

	/** Returns the only register row; RegisterRowIndex is not used. */
	CALL_SITE_DEBUGLOC
	vector<ScalarType, RegisterRowSize> GetRegisterRow(const uint RegisterRowIndex)
	{
		const vector<ScalarType, RegisterRowSize> Row = E;
		return Row;
	}

	/** Overwrites the only register row; RegisterRowIndex is not used. */
	CALL_SITE_DEBUGLOC
	void SetRegisterRow(const uint RegisterRowIndex, vector<ScalarType, RegisterRowSize> RegisterRow)
	{
		E = RegisterRow;
	}

	// ------------- elements

	/** Returns the only element; ElementIndex is not used. */
	CALL_SITE_DEBUGLOC
	vector<ScalarType, VectorSize> GetElement(const uint ElementIndex)
	{
		const vector<ScalarType, VectorSize> Stored = E;
		return Stored;
	}

	/** Overwrites the only element; ElementIndex is not used. */
	CALL_SITE_DEBUGLOC
	void SetElement(const uint ElementIndex, vector<ScalarType, VectorSize> Element)
	{
		E = Element;
	}

	/** Returns WaveBroadcast() applied to the element. */
	vector<ScalarType, VectorSize> WaveBroadcastElement(const FWaveBroadcastSettings BroadcastSettings, const uint ElementIndex)
	{
		const vector<ScalarType, VectorSize> Source = GetElement(ElementIndex);
		return WaveBroadcast(BroadcastSettings, Source);
	}

	// ------------- elements' components

	/** Returns component ComponentIndex of the element; ElementIndex is not used. */
	CALL_SITE_DEBUGLOC
	ScalarType GetElementComponent(const uint ElementIndex, const uint ComponentIndex)
	{
		const ScalarType Value = E[ComponentIndex];
		return Value;
	}

	/** Overwrites component ComponentIndex of the element; ElementIndex is not used. */
	CALL_SITE_DEBUGLOC
	void SetElementComponent(const uint ElementIndex, const uint ComponentIndex, ScalarType Component)
	{
		E[ComponentIndex] = Component;
	}
}; // struct FRegisters_S_OneVector
/** Register layout storing the elements as an array of structure: one vector<> per element, one register row per element. */
template<typename ScalarType, uint InVectorSize, uint InElementCount>
struct FRegisters_AoS_VectorArray
{
	/** Number of components of each element. */
	static const uint VectorSize = InVectorSize;

	/** Number of elements. */
	static const uint ElementCount = InElementCount;

	/** Size and number of register rows: each row is one whole element. */
	static const uint RegisterRowSize = VectorSize;
	static const uint RegisterRowCount = ElementCount;

	/** Element storage, indexed by element. */
	vector<ScalarType, VectorSize> Array[ElementCount];

	// ------------- register rows

	/** Returns register row RegisterRowIndex. */
	CALL_SITE_DEBUGLOC
	vector<ScalarType, RegisterRowSize> GetRegisterRow(const uint RegisterRowIndex)
	{
		const vector<ScalarType, RegisterRowSize> Row = Array[RegisterRowIndex];
		return Row;
	}

	/** Overwrites register row RegisterRowIndex. */
	CALL_SITE_DEBUGLOC
	void SetRegisterRow(const uint RegisterRowIndex, vector<ScalarType, RegisterRowSize> RegisterRow)
	{
		Array[RegisterRowIndex] = RegisterRow;
	}

	// ------------- elements

	/** Returns element ElementIndex. */
	CALL_SITE_DEBUGLOC
	vector<ScalarType, VectorSize> GetElement(const uint ElementIndex)
	{
		const vector<ScalarType, VectorSize> Stored = Array[ElementIndex];
		return Stored;
	}

	/** Overwrites element ElementIndex. */
	CALL_SITE_DEBUGLOC
	void SetElement(const uint ElementIndex, vector<ScalarType, VectorSize> Element)
	{
		Array[ElementIndex] = Element;
	}

	/** Returns WaveBroadcast() applied to element ElementIndex. */
	vector<ScalarType, VectorSize> WaveBroadcastElement(const FWaveBroadcastSettings BroadcastSettings, const uint ElementIndex)
	{
		const vector<ScalarType, VectorSize> Source = GetElement(ElementIndex);
		return WaveBroadcast(BroadcastSettings, Source);
	}

	// ------------- elements' components

	/** Returns component ComponentIndex of element ElementIndex. */
	CALL_SITE_DEBUGLOC
	ScalarType GetElementComponent(const uint ElementIndex, const uint ComponentIndex)
	{
		const ScalarType Value = Array[ElementIndex][ComponentIndex];
		return Value;
	}

	/** Overwrites component ComponentIndex of element ElementIndex. */
	CALL_SITE_DEBUGLOC
	void SetElementComponent(const uint ElementIndex, const uint ComponentIndex, ScalarType Component)
	{
		Array[ElementIndex][ComponentIndex] = Component;
	}
}; // struct FRegisters_AoS_VectorArray
/** Register layout storing the elements as an array of structure inside a matrix<>: matrix row i is element i. */
template<typename ScalarType, uint InVectorSize, uint InElementCount>
struct FRegisters_AoS_Matrix
{
	/** Number of components of each element. */
	static const uint VectorSize = InVectorSize;

	/** Number of elements. */
	static const uint ElementCount = InElementCount;

	/** Size and number of register rows: each row is one whole element. */
	static const uint RegisterRowSize = VectorSize;
	static const uint RegisterRowCount = ElementCount;

	/** Element storage: ElementCount rows of VectorSize components. */
	matrix<ScalarType, ElementCount, VectorSize> M;

	// ------------- register rows

	/** Returns register row RegisterRowIndex. */
	CALL_SITE_DEBUGLOC
	vector<ScalarType, RegisterRowSize> GetRegisterRow(const uint RegisterRowIndex)
	{
		const vector<ScalarType, RegisterRowSize> Row = M[RegisterRowIndex];
		return Row;
	}

	/** Overwrites register row RegisterRowIndex. */
	CALL_SITE_DEBUGLOC
	void SetRegisterRow(const uint RegisterRowIndex, vector<ScalarType, RegisterRowSize> RegisterRow)
	{
		M[RegisterRowIndex] = RegisterRow;
	}

	// ------------- elements

	/** Returns element ElementIndex, i.e. matrix row ElementIndex. */
	CALL_SITE_DEBUGLOC
	vector<ScalarType, VectorSize> GetElement(const uint ElementIndex)
	{
		const vector<ScalarType, VectorSize> Stored = M[ElementIndex];
		return Stored;
	}

	/** Overwrites element ElementIndex, i.e. matrix row ElementIndex. */
	CALL_SITE_DEBUGLOC
	void SetElement(const uint ElementIndex, vector<ScalarType, VectorSize> Element)
	{
		M[ElementIndex] = Element;
	}

	/** Returns WaveBroadcast() applied to element ElementIndex. */
	vector<ScalarType, VectorSize> WaveBroadcastElement(const FWaveBroadcastSettings BroadcastSettings, const uint ElementIndex)
	{
		const vector<ScalarType, VectorSize> Source = GetElement(ElementIndex);
		return WaveBroadcast(BroadcastSettings, Source);
	}

	// ------------- elements' components

	/** Returns component ComponentIndex of element ElementIndex. */
	CALL_SITE_DEBUGLOC
	ScalarType GetElementComponent(const uint ElementIndex, const uint ComponentIndex)
	{
		const ScalarType Value = M[ElementIndex][ComponentIndex];
		return Value;
	}

	/** Overwrites component ComponentIndex of element ElementIndex. */
	CALL_SITE_DEBUGLOC
	void SetElementComponent(const uint ElementIndex, const uint ComponentIndex, ScalarType Component)
	{
		M[ElementIndex][ComponentIndex] = Component;
	}
}; // struct FRegisters_AoS_Matrix
/**
 * Register layout storing the elements as a structure of array inside a matrix<>: matrix row c holds
 * component c of every element. Requires InElementCount to be even and >= 2.
 */
template<typename ScalarType, uint InVectorSize, uint InElementCount>
struct FRegisters_SoA_Matrix
{
	/** Number of components of each element. */
	static const uint VectorSize = InVectorSize;

	/** Number of elements. */
	static const uint ElementCount = InElementCount;

	/** Size and number of register rows: one row per component, one column per element. */
	static const uint RegisterRowSize = ElementCount;
	static const uint RegisterRowCount = VectorSize;

	/** Component-major storage: VectorSize rows of ElementCount values. */
	matrix<ScalarType, VectorSize, ElementCount> M;

	// ------------- register rows

	/** Returns register row RegisterRowIndex. */
	CALL_SITE_DEBUGLOC
	vector<ScalarType, RegisterRowSize> GetRegisterRow(const uint RegisterRowIndex)
	{
		const vector<ScalarType, RegisterRowSize> Row = M[RegisterRowIndex];
		return Row;
	}

	/** Overwrites register row RegisterRowIndex. */
	CALL_SITE_DEBUGLOC
	void SetRegisterRow(const uint RegisterRowIndex, vector<ScalarType, RegisterRowSize> RegisterRow)
	{
		M[RegisterRowIndex] = RegisterRow;
	}

	// ------------- elements

	/** Gathers element ElementIndex by reading column ElementIndex of every row. */
	CALL_SITE_DEBUGLOC
	vector<ScalarType, VectorSize> GetElement(const uint ElementIndex)
	{
		vector<ScalarType, VectorSize> Gathered;

		// Sizes 1 and 3 are unrolled by hand; the generic loop covers the other sizes.
		if (VectorSize == 1)
		{
			Gathered[0] = M[0][ElementIndex];
		}
		else if (VectorSize == 3)
		{
			Gathered[0] = M[0][ElementIndex];
			Gathered[1] = M[1][ElementIndex];
			Gathered[2] = M[2][ElementIndex];
		}
		else
		{
			UNROLL
			for (uint c = 0; c < VectorSize; c++)
			{
				Gathered[c] = M[c][ElementIndex];
			}
		}

		return Gathered;
	}

	/** Scatters Element into column ElementIndex of every row. */
	CALL_SITE_DEBUGLOC
	void SetElement(const uint ElementIndex, vector<ScalarType, VectorSize> Element)
	{
		// Sizes 1 and 3 are unrolled by hand; the generic loop covers the other sizes.
		if (VectorSize == 1)
		{
			M[0][ElementIndex] = Element[0];
		}
		else if (VectorSize == 3)
		{
			M[0][ElementIndex] = Element[0];
			M[1][ElementIndex] = Element[1];
			M[2][ElementIndex] = Element[2];
		}
		else
		{
			UNROLL
			for (uint c = 0; c < VectorSize; c++)
			{
				M[c][ElementIndex] = Element[c];
			}
		}
	}

	/** Builds the broadcast of element ElementIndex: each row goes through WaveBroadcast() and column ElementIndex of the result is kept. */
	vector<ScalarType, VectorSize> WaveBroadcastElement(const FWaveBroadcastSettings BroadcastSettings, const uint ElementIndex)
	{
		vector<ScalarType, VectorSize> Gathered;

		// Sizes 1 and 3 are unrolled by hand; the generic loop covers the other sizes.
		if (VectorSize == 1)
		{
			Gathered[0] = WaveBroadcast(BroadcastSettings, M[0])[ElementIndex];
		}
		else if (VectorSize == 3)
		{
			Gathered[0] = WaveBroadcast(BroadcastSettings, M[0])[ElementIndex];
			Gathered[1] = WaveBroadcast(BroadcastSettings, M[1])[ElementIndex];
			Gathered[2] = WaveBroadcast(BroadcastSettings, M[2])[ElementIndex];
		}
		else
		{
			UNROLL
			for (uint c = 0; c < VectorSize; c++)
			{
				Gathered[c] = WaveBroadcast(BroadcastSettings, M[c])[ElementIndex];
			}
		}

		return Gathered;
	}

	// ------------- elements' components

	/** Returns component ComponentIndex of element ElementIndex. */
	CALL_SITE_DEBUGLOC
	ScalarType GetElementComponent(const uint ElementIndex, const uint ComponentIndex)
	{
		const ScalarType Value = M[ComponentIndex][ElementIndex];
		return Value;
	}

	/** Overwrites component ComponentIndex of element ElementIndex. */
	CALL_SITE_DEBUGLOC
	void SetElementComponent(const uint ElementIndex, const uint ComponentIndex, ScalarType Component)
	{
		M[ComponentIndex][ElementIndex] = Component;
	}
}; // struct FRegisters_SoA_Matrix
/**
 * Register layout storing the elements as an array of structure flattened into an array of vector<X, 2>:
 * component c of element e lives at flat index (c + VectorSize * e), i.e. in row (index / 2), lane (index % 2).
 * Requires InElementCount to be even and >= 2.
 */
template<typename ScalarType, uint InVectorSize, uint InElementCount>
struct FRegisters_AoS_PairArray
{
	/** Number of components of each element. */
	static const uint VectorSize = InVectorSize;

	/** Number of elements. */
	static const uint ElementCount = InElementCount;

	/** Size and number of register rows: rows are pairs of scalars. */
	static const uint RegisterRowSize = 2;
	static const uint RegisterRowCount = (ElementCount * VectorSize) / 2;

	/** Flattened element-major storage. */
	vector<ScalarType, RegisterRowSize> M[RegisterRowCount];

	// ------------- register rows

	/** Returns register row RegisterRowIndex. */
	CALL_SITE_DEBUGLOC
	vector<ScalarType, RegisterRowSize> GetRegisterRow(const uint RegisterRowIndex)
	{
		const vector<ScalarType, RegisterRowSize> Row = M[RegisterRowIndex];
		return Row;
	}

	/** Overwrites register row RegisterRowIndex. */
	CALL_SITE_DEBUGLOC
	void SetRegisterRow(const uint RegisterRowIndex, vector<ScalarType, RegisterRowSize> RegisterRow)
	{
		M[RegisterRowIndex] = RegisterRow;
	}

	// ------------- elements

	/** Gathers element ElementIndex one component at a time. */
	CALL_SITE_DEBUGLOC
	vector<ScalarType, VectorSize> GetElement(const uint ElementIndex)
	{
		vector<ScalarType, VectorSize> Gathered;
		UNROLL
		for (uint c = 0; c < VectorSize; c++)
		{
			Gathered[c] = GetElementComponent(ElementIndex, c);
		}
		return Gathered;
	}

	/** Scatters Element into element ElementIndex one component at a time. */
	CALL_SITE_DEBUGLOC
	void SetElement(const uint ElementIndex, vector<ScalarType, VectorSize> Element)
	{
		UNROLL
		for (uint c = 0; c < VectorSize; c++)
		{
			SetElementComponent(ElementIndex, c, Element[c]);
		}
	}

	/**
	 * Builds the broadcast of element ElementIndex. Scalars of 4 bytes are broadcast one by one; for other
	 * scalar sizes whole register rows are broadcast and the components are then picked from them.
	 */
	vector<ScalarType, VectorSize> WaveBroadcastElement(const FWaveBroadcastSettings BroadcastSettings, const uint ElementIndex)
	{
		vector<ScalarType, VectorSize> Gathered;
		const uint FirstFlatIndex = VectorSize * ElementIndex;

		if (sizeof(ScalarType) == 4)
		{
			UNROLL
			for (uint c = 0; c < VectorSize; c++)
			{
				const uint FlatIndex = c + FirstFlatIndex;
				Gathered[c] = WaveBroadcast(BroadcastSettings, M[FlatIndex / 2u][FlatIndex % 2u]);
			}
		}
		else
		{
			// Rows touched by the element, starting with the one that contains its first component.
			const uint FirstRow = FirstFlatIndex / 2u;
			vector<ScalarType, RegisterRowSize> BroadcastedRows[(VectorSize + 1u) / 2u];

			UNROLL
			for (uint RowOffset = 0; RowOffset < ((VectorSize + 1u) / 2u); RowOffset++)
			{
				BroadcastedRows[RowOffset] = WaveBroadcast(BroadcastSettings, M[FirstRow + RowOffset]);
			}

			// The first component sits in lane (FirstFlatIndex % 2) of the first broadcasted row.
			const uint FirstLane = FirstFlatIndex % 2u;

			UNROLL
			for (uint c = 0; c < VectorSize; c++)
			{
				const uint LocalIndex = c + FirstLane;
				Gathered[c] = BroadcastedRows[LocalIndex / 2u][LocalIndex % 2u];
			}
		}

		return Gathered;
	}

	// ------------- elements' components

	/** Returns component ComponentIndex of element ElementIndex. */
	CALL_SITE_DEBUGLOC
	ScalarType GetElementComponent(const uint ElementIndex, const uint ComponentIndex)
	{
		const uint FlatIndex = ComponentIndex + VectorSize * ElementIndex;
		const uint Row = FlatIndex / 2u;
		const uint Lane = FlatIndex % 2u;
		return M[Row][Lane];
	}

	/** Overwrites component ComponentIndex of element ElementIndex. */
	CALL_SITE_DEBUGLOC
	void SetElementComponent(const uint ElementIndex, const uint ComponentIndex, ScalarType Component)
	{
		const uint FlatIndex = ComponentIndex + VectorSize * ElementIndex;
		const uint Row = FlatIndex / 2u;
		const uint Lane = FlatIndex % 2u;
		M[Row][Lane] = Component;
	}
}; // struct FRegisters_AoS_PairArray
/**
 * Stores an array of vector<> as a structure of array in registers using an array of vector<X, 2>.
 * Component c of element e lives at flat index (e + ElementCount * c), i.e. in row (index / 2), lane (index % 2),
 * so every (element, component) pair owns exactly one of the ElementCount * VectorSize scalar slots.
 * Requires InElementCount to be even and >= 2.
 */
template<typename ScalarType, uint InVectorSize, uint InElementCount>
struct FRegisters_SoA_PairArray
{
	/** Size of each element. */
	static const uint VectorSize = InVectorSize;
	/** Number of elements */
	static const uint ElementCount = InElementCount;
	/** Size and number of register rows */
	static const uint RegisterRowSize = 2;
	static const uint RegisterRowCount = (ElementCount * VectorSize) / 2;

	/** Flattened component-major storage. */
	vector<ScalarType, RegisterRowSize> M[RegisterRowCount];

	// ------------- register rows

	/** Returns register row RegisterRowIndex. */
	CALL_SITE_DEBUGLOC
	vector<ScalarType, RegisterRowSize> GetRegisterRow(const uint RegisterRowIndex)
	{
		return M[RegisterRowIndex];
	}

	/** Overwrites register row RegisterRowIndex. */
	CALL_SITE_DEBUGLOC
	void SetRegisterRow(const uint RegisterRowIndex, vector<ScalarType, RegisterRowSize> RegisterRow)
	{
		M[RegisterRowIndex] = RegisterRow;
	}

	// ------------- elements

	/** Gathers element ElementIndex one component at a time. */
	CALL_SITE_DEBUGLOC
	vector<ScalarType, VectorSize> GetElement(const uint ElementIndex)
	{
		vector<ScalarType, VectorSize> Element;
		UNROLL
		for (uint ComponentIndex = 0; ComponentIndex < VectorSize; ComponentIndex++)
		{
			Element[ComponentIndex] = GetElementComponent(ElementIndex, ComponentIndex);
		}
		return Element;
	}

	/** Scatters Element into element ElementIndex one component at a time. */
	CALL_SITE_DEBUGLOC
	void SetElement(const uint ElementIndex, vector<ScalarType, VectorSize> Element)
	{
		UNROLL
		for (uint ComponentIndex = 0; ComponentIndex < VectorSize; ComponentIndex++)
		{
			SetElementComponent(ElementIndex, ComponentIndex, Element[ComponentIndex]);
		}
	}

	/** Builds the broadcast of element ElementIndex: the row holding each component goes through WaveBroadcast() and the matching lane is kept. */
	vector<ScalarType, VectorSize> WaveBroadcastElement(const FWaveBroadcastSettings BroadcastSettings, const uint ElementIndex)
	{
		vector<ScalarType, VectorSize> Element;
		UNROLL
		for (uint ComponentIndex = 0; ComponentIndex < VectorSize; ComponentIndex++)
		{
			// Must match the addressing of GetElementComponent() / SetElementComponent().
			const uint ArrayIndex = ElementIndex + ElementCount * ComponentIndex;
			Element[ComponentIndex] = WaveBroadcast(BroadcastSettings, M[ArrayIndex / 2u])[ArrayIndex % 2u];
		}
		return Element;
	}

	// ------------- elements' components

	/** Returns component ComponentIndex of element ElementIndex. */
	CALL_SITE_DEBUGLOC
	ScalarType GetElementComponent(const uint ElementIndex, const uint ComponentIndex)
	{
		// The stride between two components of the same element is ElementCount, not VectorSize: striding by
		// VectorSize makes distinct (element, component) pairs share a slot, or index past the end of M,
		// whenever VectorSize != ElementCount.
		const uint ArrayIndex = ElementIndex + ElementCount * ComponentIndex;
		return M[ArrayIndex / 2u][ArrayIndex % 2u];
	}

	/** Overwrites component ComponentIndex of element ElementIndex. */
	CALL_SITE_DEBUGLOC
	void SetElementComponent(const uint ElementIndex, const uint ComponentIndex, ScalarType Component)
	{
		// Same component-major addressing as GetElementComponent().
		const uint ArrayIndex = ElementIndex + ElementCount * ComponentIndex;
		M[ArrayIndex / 2u][ArrayIndex % 2u] = Component;
	}
}; // struct FRegisters_SoA_PairArray
//------------------------------------------------------- TENSORS
/** Packs a vector<> in 2 dimension. */
template<typename ScalarType, uint InVectorSize, uint InSimdSizeX, uint InSimdSizeY>
struct TLaneVector2D
{
/** Size of each element. */
static const uint VectorSize = InVectorSize;
/** Number of elements */
static const uint SimdSize = InSimdSizeX * InSimdSizeY;
static const uint SimdSizeX = InSimdSizeX;
static const uint SimdSizeY = InSimdSizeY;
/** Size and number of register rows */
static const uint RegisterRowSize = TENSOR_REGISTER_LAYOUT<ScalarType, InVectorSize, InSimdSizeX * InSimdSizeY>::RegisterRowSize;
static const uint RegisterRowCount = TENSOR_REGISTER_LAYOUT<ScalarType, InVectorSize, InSimdSizeX * InSimdSizeY>::RegisterRowCount;
TENSOR_REGISTER_LAYOUT<ScalarType, InVectorSize, InSimdSizeX * InSimdSizeY> Registers;
// ------------- getting and setting elements and components
// Access and set individual element of the vector.
/** Returns element ElementIndex, as read from the register layout. */
CALL_SITE_DEBUGLOC
vector<ScalarType, VectorSize> GetElement(const uint ElementIndex)
{
	const vector<ScalarType, VectorSize> Stored = Registers.GetElement(ElementIndex);
	return Stored;
}
/** Writes Element into slot ElementIndex of the register layout. */
CALL_SITE_DEBUGLOC
void SetElement(const uint ElementIndex, vector<ScalarType, VectorSize> Element)
{
	Registers.SetElement(
		ElementIndex,
		Element);
}
/** Writes the same vector<> into every element. */
CALL_SITE_DEBUGLOC
void SetAllElements(vector<ScalarType, VectorSize> Element)
{
	// The 2 and 4 element cases are unrolled by hand; the generic loop covers every other size.
	if (SimdSize == 2)
	{
		Registers.SetElement(0, Element);
		Registers.SetElement(1, Element);
	}
	else if (SimdSize == 4)
	{
		Registers.SetElement(0, Element);
		Registers.SetElement(1, Element);
		Registers.SetElement(2, Element);
		Registers.SetElement(3, Element);
	}
	else
	{
		UNROLL
		for (uint i = 0; i < SimdSize; i++)
		{
			Registers.SetElement(i, Element);
		}
	}
}
// Access and set a component of the vector
/** Extracts component ComponentIndex of every element into a lane vector of scalars. */
CALL_SITE_DEBUGLOC
TLaneVector2D<ScalarType, 1, SimdSizeX, SimdSizeY> GetComponent(const uint ComponentIndex)
{
	TLaneVector2D<ScalarType, 1, SimdSizeX, SimdSizeY> Extracted;

	// The 2 and 4 element cases are unrolled by hand; the generic loop covers every other size.
	if (SimdSize == 2)
	{
		Extracted.SetElement(0, Registers.GetElementComponent(0, ComponentIndex));
		Extracted.SetElement(1, Registers.GetElementComponent(1, ComponentIndex));
	}
	else if (SimdSize == 4)
	{
		Extracted.SetElement(0, Registers.GetElementComponent(0, ComponentIndex));
		Extracted.SetElement(1, Registers.GetElementComponent(1, ComponentIndex));
		Extracted.SetElement(2, Registers.GetElementComponent(2, ComponentIndex));
		Extracted.SetElement(3, Registers.GetElementComponent(3, ComponentIndex));
	}
	else
	{
		UNROLL
		for (uint i = 0; i < SimdSize; i++)
		{
			Extracted.SetElement(i, Registers.GetElementComponent(i, ComponentIndex));
		}
	}

	return Extracted;
}
/** Overwrites component ComponentIndex of every element with the matching element of Comp. */
CALL_SITE_DEBUGLOC
void SetComponent(uint ComponentIndex, TLaneVector2D<ScalarType, 1, SimdSizeX, SimdSizeY> Comp)
{
	// The 2 and 4 element cases are unrolled by hand; the generic loop covers every other size.
	if (SimdSize == 2)
	{
		Registers.SetElementComponent(0, ComponentIndex, Comp.GetElement(0));
		Registers.SetElementComponent(1, ComponentIndex, Comp.GetElement(1));
	}
	else if (SimdSize == 4)
	{
		Registers.SetElementComponent(0, ComponentIndex, Comp.GetElement(0));
		Registers.SetElementComponent(1, ComponentIndex, Comp.GetElement(1));
		Registers.SetElementComponent(2, ComponentIndex, Comp.GetElement(2));
		Registers.SetElementComponent(3, ComponentIndex, Comp.GetElement(3));
	}
	else
	{
		UNROLL
		for (uint i = 0; i < SimdSize; i++)
		{
			Registers.SetElementComponent(i, ComponentIndex, Comp.GetElement(i));
		}
	}
}
/** Subscript access to a component: same result as GetComponent(ComponentIndex). */
CALL_SITE_DEBUGLOC
TLaneVector2D<ScalarType, 1, SimdSizeX, SimdSizeY> operator [](const uint ComponentIndex)
{
	const TLaneVector2D<ScalarType, 1, SimdSizeX, SimdSizeY> Extracted = GetComponent(ComponentIndex);
	return Extracted;
}
// ------------- constructors
/** Builds a lane vector of vector<> out of a lane vector of scalars: element i of the result is set from element i of A. */
CALL_SITE_DEBUGLOC
static TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> Vectorize(
	TLaneVector2D<ScalarType, 1, SimdSizeX, SimdSizeY> A)
{
	TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> Result;

	// The 2 and 4 element cases are unrolled by hand; the generic loop covers every other size.
	if (SimdSize == 2)
	{
		Result.SetElement(0, A.GetElement(0));
		Result.SetElement(1, A.GetElement(1));
	}
	else if (SimdSize == 4)
	{
		Result.SetElement(0, A.GetElement(0));
		Result.SetElement(1, A.GetElement(1));
		Result.SetElement(2, A.GetElement(2));
		Result.SetElement(3, A.GetElement(3));
	}
	else
	{
		UNROLL
		for (uint i = 0; i < SimdSize; i++)
		{
			Result.SetElement(i, A.GetElement(i));
		}
	}

	return Result;
}
/** Returns a lane vector whose elements are all set to the same vector<> A. */
CALL_SITE_DEBUGLOC
static TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> Const(
	vector<ScalarType, VectorSize> A)
{
	TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> Result;

	// The 2 and 4 element cases are unrolled by hand; the generic loop covers every other size.
	if (SimdSize == 2)
	{
		Result.SetElement(0, A);
		Result.SetElement(1, A);
	}
	else if (SimdSize == 4)
	{
		Result.SetElement(0, A);
		Result.SetElement(1, A);
		Result.SetElement(2, A);
		Result.SetElement(3, A);
	}
	else
	{
		UNROLL
		for (uint i = 0; i < SimdSize; i++)
		{
			Result.SetElement(i, A);
		}
	}

	return Result;
}
/** Converts a lane vector of SourceScalarType into one of ScalarType by casting each element to vector<ScalarType, VectorSize>. */
CALL_SITE_DEBUGLOC
template<typename SourceScalarType>
static TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> CastFrom(
	TLaneVector2D<SourceScalarType, VectorSize, SimdSizeX, SimdSizeY> A)
{
	TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> Result;

	// The 2 and 4 element cases are unrolled by hand; the generic loop covers every other size.
	if (SimdSize == 2)
	{
		Result.SetElement(0, vector<ScalarType, VectorSize>(A.GetElement(0)));
		Result.SetElement(1, vector<ScalarType, VectorSize>(A.GetElement(1)));
	}
	else if (SimdSize == 4)
	{
		Result.SetElement(0, vector<ScalarType, VectorSize>(A.GetElement(0)));
		Result.SetElement(1, vector<ScalarType, VectorSize>(A.GetElement(1)));
		Result.SetElement(2, vector<ScalarType, VectorSize>(A.GetElement(2)));
		Result.SetElement(3, vector<ScalarType, VectorSize>(A.GetElement(3)));
	}
	else
	{
		UNROLL
		for (uint i = 0; i < SimdSize; i++)
		{
			Result.SetElement(i, vector<ScalarType, VectorSize>(A.GetElement(i)));
		}
	}

	return Result;
}
/**
 * Forces tight register packing: every register row is passed through PackRegistersTightly() and written back.
 * Compiles to a no-op unless TENSOR_REGISTER_TIGHTENING is non-zero.
 */
CALL_SITE_DEBUGLOC
void TightenRegisters()
{
#if TENSOR_REGISTER_TIGHTENING
	// The 2 and 4 row cases are unrolled by hand; the generic loop covers every other row count.
	if (RegisterRowCount == 2)
	{
		Registers.SetRegisterRow(0, PackRegistersTightly(Registers.GetRegisterRow(0)));
		Registers.SetRegisterRow(1, PackRegistersTightly(Registers.GetRegisterRow(1)));
	}
	else if (RegisterRowCount == 4)
	{
		Registers.SetRegisterRow(0, PackRegistersTightly(Registers.GetRegisterRow(0)));
		Registers.SetRegisterRow(1, PackRegistersTightly(Registers.GetRegisterRow(1)));
		Registers.SetRegisterRow(2, PackRegistersTightly(Registers.GetRegisterRow(2)));
		Registers.SetRegisterRow(3, PackRegistersTightly(Registers.GetRegisterRow(3)));
	}
	else
	{
		UNROLL
		for (uint Row = 0; Row < RegisterRowCount; Row++)
		{
			Registers.SetRegisterRow(Row, PackRegistersTightly(Registers.GetRegisterRow(Row)));
		}
	}
#endif // TENSOR_REGISTER_TIGHTENING
}
#if defined(LDS_DESPILL_DWORD_COUNT)
/**
 * Returns the LDS spill size in dwords.
 *
 * This is the number of dwords StoreLDSSpill() writes per call: one packed register row
 * (GetVectorDwordSize(ScalarType, RegisterRowSize) dwords) for each of the RegisterRowCount rows.
 */
static uint GetLDSSpillDwordSize()
{
	const uint DwordsPerRegisterRow = GetVectorDwordSize(ScalarType, RegisterRowSize);
	return RegisterRowCount * DwordsPerRegisterRow;
}
/**
 * Store to LDS spill to save VGPR.
 *
 * Every register row is packed into dwords with PackVectorToDwords() and written to DespilledData.
 * Consecutive dwords are LDS_DESPILL_THREAD_COUNT entries apart, starting at
 * GGroupThreadIndex + DespillDwordOffset.
 *
 * @param DespillDwordOffset Offset added to every DespilledData index written by this call.
 */
CALL_SITE_DEBUGLOC
void StoreLDSSpill(const uint DespillDwordOffset)
{
	UNROLL
	for (uint Row = 0; Row < RegisterRowCount; Row++)
	{
		vector<ScalarType, RegisterRowSize> RegisterRow = Registers.GetRegisterRow(Row);

		uint RegisterRowDwords[GetVectorDwordSize(ScalarType, RegisterRowSize)];
		PackVectorToDwords(RegisterRow, /* out */ RegisterRowDwords);

		UNROLL
		for (uint Dword = 0; Dword < GetVectorDwordSize(ScalarType, RegisterRowSize); Dword++)
		{
			// Linear dword index within this lane vector, then spread by the thread count.
			const uint SpillSlot = Row * GetVectorDwordSize(ScalarType, RegisterRowSize) + Dword;
			DespilledData[GGroupThreadIndex + DespillDwordOffset + SpillSlot * LDS_DESPILL_THREAD_COUNT] = RegisterRowDwords[Dword];
		}
	}
}
/**
 * Load from LDS spill to save VGPR.
 *
 * Reads back the dwords from the same DespilledData indices that StoreLDSSpill() writes for the
 * given offset, and rebuilds each register row with UnpackDwordsToVector().
 *
 * @param DespillDwordOffset Offset added to every DespilledData index read by this call.
 * @return Lane vector whose register rows were rebuilt from DespilledData.
 */
CALL_SITE_DEBUGLOC
static TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> LoadLDSSpill(const uint DespillDwordOffset)
{
	TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> Result;
	UNROLL
	for (uint Row = 0; Row < RegisterRowCount; Row++)
	{
		uint RegisterRowDwords[GetVectorDwordSize(ScalarType, RegisterRowSize)];

		UNROLL
		for (uint Dword = 0; Dword < GetVectorDwordSize(ScalarType, RegisterRowSize); Dword++)
		{
			// Linear dword index within this lane vector, then spread by the thread count.
			const uint SpillSlot = Row * GetVectorDwordSize(ScalarType, RegisterRowSize) + Dword;
			RegisterRowDwords[Dword] = DespilledData[GGroupThreadIndex + DespillDwordOffset + SpillSlot * LDS_DESPILL_THREAD_COUNT];
		}

		vector<ScalarType, RegisterRowSize> RegisterRow;
		UnpackDwordsToVector(RegisterRowDwords, /* out */ RegisterRow);
		Result.Registers.SetRegisterRow(Row, RegisterRow);
	}
	return Result;
}
#endif // defined(LDS_DESPILL_DWORD_COUNT)
// ------------- binary operator +
/** Adds B to this lane vector, one register row at a time, and returns the sum. */
CALL_SITE_DEBUGLOC
TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> operator + (TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> B)
{
	TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> Sum;
	if (RegisterRowCount != 2 && RegisterRowCount != 4)
	{
		// Row counts without a hand-written path use the unrolled loop.
		UNROLL
		for (uint Row = 0; Row < RegisterRowCount; Row++)
		{
			Sum.Registers.SetRegisterRow(Row, Registers.GetRegisterRow(Row) + B.Registers.GetRegisterRow(Row));
		}
	}
	else
	{
		// Rows 0 and 1 exist for both hand-written row counts (2 and 4).
		Sum.Registers.SetRegisterRow(0, Registers.GetRegisterRow(0) + B.Registers.GetRegisterRow(0));
		Sum.Registers.SetRegisterRow(1, Registers.GetRegisterRow(1) + B.Registers.GetRegisterRow(1));
		if (RegisterRowCount == 4)
		{
			Sum.Registers.SetRegisterRow(2, Registers.GetRegisterRow(2) + B.Registers.GetRegisterRow(2));
			Sum.Registers.SetRegisterRow(3, Registers.GetRegisterRow(3) + B.Registers.GetRegisterRow(3));
		}
	}
	return Sum;
}
/**
 * Adds a single vector<> to this lane vector.
 * B is first copied into every element of a temporary lane vector with SetAllElements(),
 * then the addition is done one register row at a time.
 */
CALL_SITE_DEBUGLOC
TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> operator + (vector<ScalarType, VectorSize> B)
{
	TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> Broadcast;
	Broadcast.SetAllElements(B);

	TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> Sum;
	if (RegisterRowCount != 2 && RegisterRowCount != 4)
	{
		// Row counts without a hand-written path use the unrolled loop.
		UNROLL
		for (uint Row = 0; Row < RegisterRowCount; Row++)
		{
			Sum.Registers.SetRegisterRow(Row, Registers.GetRegisterRow(Row) + Broadcast.Registers.GetRegisterRow(Row));
		}
	}
	else
	{
		// Rows 0 and 1 exist for both hand-written row counts (2 and 4).
		Sum.Registers.SetRegisterRow(0, Registers.GetRegisterRow(0) + Broadcast.Registers.GetRegisterRow(0));
		Sum.Registers.SetRegisterRow(1, Registers.GetRegisterRow(1) + Broadcast.Registers.GetRegisterRow(1));
		if (RegisterRowCount == 4)
		{
			Sum.Registers.SetRegisterRow(2, Registers.GetRegisterRow(2) + Broadcast.Registers.GetRegisterRow(2));
			Sum.Registers.SetRegisterRow(3, Registers.GetRegisterRow(3) + Broadcast.Registers.GetRegisterRow(3));
		}
	}
	return Sum;
}
// ------------- unary operator -
/** Returns the negation of this lane vector, computed one register row at a time. */
CALL_SITE_DEBUGLOC
TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> operator - ()
{
	TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> Negated;
	if (RegisterRowCount != 2 && RegisterRowCount != 4)
	{
		// Row counts without a hand-written path use the unrolled loop.
		UNROLL
		for (uint Row = 0; Row < RegisterRowCount; Row++)
		{
			Negated.Registers.SetRegisterRow(Row, -Registers.GetRegisterRow(Row));
		}
	}
	else
	{
		// Rows 0 and 1 exist for both hand-written row counts (2 and 4).
		Negated.Registers.SetRegisterRow(0, -Registers.GetRegisterRow(0));
		Negated.Registers.SetRegisterRow(1, -Registers.GetRegisterRow(1));
		if (RegisterRowCount == 4)
		{
			Negated.Registers.SetRegisterRow(2, -Registers.GetRegisterRow(2));
			Negated.Registers.SetRegisterRow(3, -Registers.GetRegisterRow(3));
		}
	}
	return Negated;
}
// ------------- binary operator -
/** Subtracts B from this lane vector, one register row at a time, and returns the difference. */
CALL_SITE_DEBUGLOC
TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> operator - (TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> B)
{
	TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> Difference;
	if (RegisterRowCount != 2 && RegisterRowCount != 4)
	{
		// Row counts without a hand-written path use the unrolled loop.
		UNROLL
		for (uint Row = 0; Row < RegisterRowCount; Row++)
		{
			Difference.Registers.SetRegisterRow(Row, Registers.GetRegisterRow(Row) - B.Registers.GetRegisterRow(Row));
		}
	}
	else
	{
		// Rows 0 and 1 exist for both hand-written row counts (2 and 4).
		Difference.Registers.SetRegisterRow(0, Registers.GetRegisterRow(0) - B.Registers.GetRegisterRow(0));
		Difference.Registers.SetRegisterRow(1, Registers.GetRegisterRow(1) - B.Registers.GetRegisterRow(1));
		if (RegisterRowCount == 4)
		{
			Difference.Registers.SetRegisterRow(2, Registers.GetRegisterRow(2) - B.Registers.GetRegisterRow(2));
			Difference.Registers.SetRegisterRow(3, Registers.GetRegisterRow(3) - B.Registers.GetRegisterRow(3));
		}
	}
	return Difference;
}
/**
 * Subtracts a single vector<> from this lane vector.
 * B is first copied into every element of a temporary lane vector with SetAllElements(),
 * then the subtraction is done one register row at a time.
 */
CALL_SITE_DEBUGLOC
TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> operator - (vector<ScalarType, VectorSize> B)
{
	TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> Broadcast;
	Broadcast.SetAllElements(B);

	TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> Difference;
	if (RegisterRowCount != 2 && RegisterRowCount != 4)
	{
		// Row counts without a hand-written path use the unrolled loop.
		UNROLL
		for (uint Row = 0; Row < RegisterRowCount; Row++)
		{
			Difference.Registers.SetRegisterRow(Row, Registers.GetRegisterRow(Row) - Broadcast.Registers.GetRegisterRow(Row));
		}
	}
	else
	{
		// Rows 0 and 1 exist for both hand-written row counts (2 and 4).
		Difference.Registers.SetRegisterRow(0, Registers.GetRegisterRow(0) - Broadcast.Registers.GetRegisterRow(0));
		Difference.Registers.SetRegisterRow(1, Registers.GetRegisterRow(1) - Broadcast.Registers.GetRegisterRow(1));
		if (RegisterRowCount == 4)
		{
			Difference.Registers.SetRegisterRow(2, Registers.GetRegisterRow(2) - Broadcast.Registers.GetRegisterRow(2));
			Difference.Registers.SetRegisterRow(3, Registers.GetRegisterRow(3) - Broadcast.Registers.GetRegisterRow(3));
		}
	}
	return Difference;
}
// ------------- binary operator *
/** Multiplies this lane vector by B, one register row at a time, and returns the product. */
CALL_SITE_DEBUGLOC
TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> operator * (TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> B)
{
	TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> Product;
	if (RegisterRowCount != 2 && RegisterRowCount != 4)
	{
		// Row counts without a hand-written path use the unrolled loop.
		UNROLL
		for (uint Row = 0; Row < RegisterRowCount; Row++)
		{
			Product.Registers.SetRegisterRow(Row, Registers.GetRegisterRow(Row) * B.Registers.GetRegisterRow(Row));
		}
	}
	else
	{
		// Rows 0 and 1 exist for both hand-written row counts (2 and 4).
		Product.Registers.SetRegisterRow(0, Registers.GetRegisterRow(0) * B.Registers.GetRegisterRow(0));
		Product.Registers.SetRegisterRow(1, Registers.GetRegisterRow(1) * B.Registers.GetRegisterRow(1));
		if (RegisterRowCount == 4)
		{
			Product.Registers.SetRegisterRow(2, Registers.GetRegisterRow(2) * B.Registers.GetRegisterRow(2));
			Product.Registers.SetRegisterRow(3, Registers.GetRegisterRow(3) * B.Registers.GetRegisterRow(3));
		}
	}
	return Product;
}
/**
 * Multiplies this lane vector by a single vector<>.
 * B is first copied into every element of a temporary lane vector with SetAllElements(),
 * then the multiplication is done one register row at a time.
 */
CALL_SITE_DEBUGLOC
TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> operator * (vector<ScalarType, VectorSize> B)
{
	TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> Broadcast;
	Broadcast.SetAllElements(B);

	TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> Product;
	if (RegisterRowCount != 2 && RegisterRowCount != 4)
	{
		// Row counts without a hand-written path use the unrolled loop.
		UNROLL
		for (uint Row = 0; Row < RegisterRowCount; Row++)
		{
			Product.Registers.SetRegisterRow(Row, Registers.GetRegisterRow(Row) * Broadcast.Registers.GetRegisterRow(Row));
		}
	}
	else
	{
		// Rows 0 and 1 exist for both hand-written row counts (2 and 4).
		Product.Registers.SetRegisterRow(0, Registers.GetRegisterRow(0) * Broadcast.Registers.GetRegisterRow(0));
		Product.Registers.SetRegisterRow(1, Registers.GetRegisterRow(1) * Broadcast.Registers.GetRegisterRow(1));
		if (RegisterRowCount == 4)
		{
			Product.Registers.SetRegisterRow(2, Registers.GetRegisterRow(2) * Broadcast.Registers.GetRegisterRow(2));
			Product.Registers.SetRegisterRow(3, Registers.GetRegisterRow(3) * Broadcast.Registers.GetRegisterRow(3));
		}
	}
	return Product;
}
// ------------- comparison operators
/** Compares this lane vector against B with `>`, one register row at a time; the result is a bool lane vector. */
CALL_SITE_DEBUGLOC
TLaneVector2D<bool, VectorSize, SimdSizeX, SimdSizeY> operator > (TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> B)
{
	TLaneVector2D<bool, VectorSize, SimdSizeX, SimdSizeY> Mask;
	if (RegisterRowCount != 2 && RegisterRowCount != 4)
	{
		// Row counts without a hand-written path use the unrolled loop.
		UNROLL
		for (uint Row = 0; Row < RegisterRowCount; Row++)
		{
			Mask.Registers.SetRegisterRow(Row, Registers.GetRegisterRow(Row) > B.Registers.GetRegisterRow(Row));
		}
	}
	else
	{
		// Rows 0 and 1 exist for both hand-written row counts (2 and 4).
		Mask.Registers.SetRegisterRow(0, Registers.GetRegisterRow(0) > B.Registers.GetRegisterRow(0));
		Mask.Registers.SetRegisterRow(1, Registers.GetRegisterRow(1) > B.Registers.GetRegisterRow(1));
		if (RegisterRowCount == 4)
		{
			Mask.Registers.SetRegisterRow(2, Registers.GetRegisterRow(2) > B.Registers.GetRegisterRow(2));
			Mask.Registers.SetRegisterRow(3, Registers.GetRegisterRow(3) > B.Registers.GetRegisterRow(3));
		}
	}
	return Mask;
}
/** Compares this lane vector against B with `<`, one register row at a time; the result is a bool lane vector. */
CALL_SITE_DEBUGLOC
TLaneVector2D<bool, VectorSize, SimdSizeX, SimdSizeY> operator < (TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> B)
{
	TLaneVector2D<bool, VectorSize, SimdSizeX, SimdSizeY> Mask;
	if (RegisterRowCount != 2 && RegisterRowCount != 4)
	{
		// Row counts without a hand-written path use the unrolled loop.
		UNROLL
		for (uint Row = 0; Row < RegisterRowCount; Row++)
		{
			Mask.Registers.SetRegisterRow(Row, Registers.GetRegisterRow(Row) < B.Registers.GetRegisterRow(Row));
		}
	}
	else
	{
		// Rows 0 and 1 exist for both hand-written row counts (2 and 4).
		Mask.Registers.SetRegisterRow(0, Registers.GetRegisterRow(0) < B.Registers.GetRegisterRow(0));
		Mask.Registers.SetRegisterRow(1, Registers.GetRegisterRow(1) < B.Registers.GetRegisterRow(1));
		if (RegisterRowCount == 4)
		{
			Mask.Registers.SetRegisterRow(2, Registers.GetRegisterRow(2) < B.Registers.GetRegisterRow(2));
			Mask.Registers.SetRegisterRow(3, Registers.GetRegisterRow(3) < B.Registers.GetRegisterRow(3));
		}
	}
	return Mask;
}
/** Compares this lane vector against B with `!=`, one register row at a time; the result is a bool lane vector. */
CALL_SITE_DEBUGLOC
TLaneVector2D<bool, VectorSize, SimdSizeX, SimdSizeY> operator != (TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> B)
{
	TLaneVector2D<bool, VectorSize, SimdSizeX, SimdSizeY> Mask;
	if (RegisterRowCount != 2 && RegisterRowCount != 4)
	{
		// Row counts without a hand-written path use the unrolled loop.
		UNROLL
		for (uint Row = 0; Row < RegisterRowCount; Row++)
		{
			Mask.Registers.SetRegisterRow(Row, Registers.GetRegisterRow(Row) != B.Registers.GetRegisterRow(Row));
		}
	}
	else
	{
		// Rows 0 and 1 exist for both hand-written row counts (2 and 4).
		Mask.Registers.SetRegisterRow(0, Registers.GetRegisterRow(0) != B.Registers.GetRegisterRow(0));
		Mask.Registers.SetRegisterRow(1, Registers.GetRegisterRow(1) != B.Registers.GetRegisterRow(1));
		if (RegisterRowCount == 4)
		{
			Mask.Registers.SetRegisterRow(2, Registers.GetRegisterRow(2) != B.Registers.GetRegisterRow(2));
			Mask.Registers.SetRegisterRow(3, Registers.GetRegisterRow(3) != B.Registers.GetRegisterRow(3));
		}
	}
	return Mask;
}
/** Compares this lane vector against B with `==`, one register row at a time; the result is a bool lane vector. */
CALL_SITE_DEBUGLOC
TLaneVector2D<bool, VectorSize, SimdSizeX, SimdSizeY> operator == (TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> B)
{
	TLaneVector2D<bool, VectorSize, SimdSizeX, SimdSizeY> Mask;
	if (RegisterRowCount != 2 && RegisterRowCount != 4)
	{
		// Row counts without a hand-written path use the unrolled loop.
		UNROLL
		for (uint Row = 0; Row < RegisterRowCount; Row++)
		{
			Mask.Registers.SetRegisterRow(Row, Registers.GetRegisterRow(Row) == B.Registers.GetRegisterRow(Row));
		}
	}
	else
	{
		// Rows 0 and 1 exist for both hand-written row counts (2 and 4).
		Mask.Registers.SetRegisterRow(0, Registers.GetRegisterRow(0) == B.Registers.GetRegisterRow(0));
		Mask.Registers.SetRegisterRow(1, Registers.GetRegisterRow(1) == B.Registers.GetRegisterRow(1));
		if (RegisterRowCount == 4)
		{
			Mask.Registers.SetRegisterRow(2, Registers.GetRegisterRow(2) == B.Registers.GetRegisterRow(2));
			Mask.Registers.SetRegisterRow(3, Registers.GetRegisterRow(3) == B.Registers.GetRegisterRow(3));
		}
	}
	return Mask;
}
}; // TLaneVector2D
/**
 * Returns a copy of A with DestVectorSize components per element.
 *
 * The result is first filled with ScalarType(0); then the first min(VectorSize, DestVectorSize)
 * components of every element are copied from A. Components beyond that keep the zero fill.
 */
CALL_SITE_DEBUGLOC
template<uint DestVectorSize, typename ScalarType, uint VectorSize, uint SimdSizeX, uint SimdSizeY>
TLaneVector2D<ScalarType, DestVectorSize, SimdSizeX, SimdSizeY> ResizeChannels(
	TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> A)
{
	const uint ElementCount = SimdSizeX * SimdSizeY;
	const uint CopiedComponentCount = min(VectorSize, DestVectorSize);

	TLaneVector2D<ScalarType, DestVectorSize, SimdSizeX, SimdSizeY> Resized;
	Resized.SetAllElements(ScalarType(0));

	UNROLL
	for (uint Element = 0; Element < ElementCount; Element++)
	{
		UNROLL
		for (uint Component = 0; Component < CopiedComponentCount; Component++)
		{
			Resized.Registers.SetElementComponent(Element, Component, A.Registers.GetElementComponent(Element, Component));
		}
	}
	return Resized;
}
/**
 * Joins the components of A and B, element by element.
 * In each element of the result, components [0, VectorSizeA) come from A and
 * components [VectorSizeA, VectorSizeA + VectorSizeB) come from B.
 */
CALL_SITE_DEBUGLOC
template<typename ScalarType, uint VectorSizeA, uint VectorSizeB, uint SimdSizeX, uint SimdSizeY>
TLaneVector2D<ScalarType, VectorSizeA + VectorSizeB, SimdSizeX, SimdSizeY> Concatenate(
	TLaneVector2D<ScalarType, VectorSizeA, SimdSizeX, SimdSizeY> A,
	TLaneVector2D<ScalarType, VectorSizeB, SimdSizeX, SimdSizeY> B)
{
	const uint ElementCount = SimdSizeX * SimdSizeY;

	TLaneVector2D<ScalarType, VectorSizeA + VectorSizeB, SimdSizeX, SimdSizeY> Joined;
	UNROLL
	for (uint Element = 0; Element < ElementCount; Element++)
	{
		// Components taken from A.
		UNROLL
		for (uint CompA = 0; CompA < VectorSizeA; CompA++)
		{
			Joined.Registers.SetElementComponent(Element, CompA, A.Registers.GetElementComponent(Element, CompA));
		}

		// Components taken from B, stored after those of A.
		UNROLL
		for (uint CompB = 0; CompB < VectorSizeB; CompB++)
		{
			Joined.Registers.SetElementComponent(Element, VectorSizeA + CompB, B.Registers.GetElementComponent(Element, CompB));
		}
	}
	return Joined;
}
/**
 * Joins the components of A, B and C, element by element.
 * In each element of the result the components of A come first, followed by those of B
 * (starting at VectorSizeA) and those of C (starting at VectorSizeA + VectorSizeB).
 */
CALL_SITE_DEBUGLOC
template<typename ScalarType, uint VectorSizeA, uint VectorSizeB, uint VectorSizeC, uint SimdSizeX, uint SimdSizeY>
TLaneVector2D<ScalarType, VectorSizeA + VectorSizeB + VectorSizeC, SimdSizeX, SimdSizeY> Concatenate(
	TLaneVector2D<ScalarType, VectorSizeA, SimdSizeX, SimdSizeY> A,
	TLaneVector2D<ScalarType, VectorSizeB, SimdSizeX, SimdSizeY> B,
	TLaneVector2D<ScalarType, VectorSizeC, SimdSizeX, SimdSizeY> C)
{
	const uint ElementCount = SimdSizeX * SimdSizeY;

	TLaneVector2D<ScalarType, VectorSizeA + VectorSizeB + VectorSizeC, SimdSizeX, SimdSizeY> Joined;
	UNROLL
	for (uint Element = 0; Element < ElementCount; Element++)
	{
		// Components taken from A.
		UNROLL
		for (uint CompA = 0; CompA < VectorSizeA; CompA++)
		{
			Joined.Registers.SetElementComponent(Element, CompA, A.Registers.GetElementComponent(Element, CompA));
		}

		// Components taken from B, stored after those of A.
		UNROLL
		for (uint CompB = 0; CompB < VectorSizeB; CompB++)
		{
			Joined.Registers.SetElementComponent(Element, VectorSizeA + CompB, B.Registers.GetElementComponent(Element, CompB));
		}

		// Components taken from C, stored after those of A and B.
		UNROLL
		for (uint CompC = 0; CompC < VectorSizeC; CompC++)
		{
			Joined.Registers.SetElementComponent(Element, VectorSizeA + VectorSizeB + CompC, C.Registers.GetElementComponent(Element, CompC));
		}
	}
	return Joined;
}
/**
 * Splits M into two lane vectors, element by element.
 *
 * @param M Source lane vector with VectorSizeA + VectorSizeB components per element.
 * @param A Receives components [0, VectorSizeA) of every element of M.
 * @param B Receives components [VectorSizeA, VectorSizeA + VectorSizeB) of every element of M.
 */
CALL_SITE_DEBUGLOC
template<typename ScalarType, uint VectorSizeA, uint VectorSizeB, uint SimdSizeX, uint SimdSizeY>
void Deconcatenate(
	TLaneVector2D<ScalarType, VectorSizeA + VectorSizeB, SimdSizeX, SimdSizeY> M,
	out TLaneVector2D<ScalarType, VectorSizeA, SimdSizeX, SimdSizeY> A,
	out TLaneVector2D<ScalarType, VectorSizeB, SimdSizeX, SimdSizeY> B)
{
	const uint ElementCount = SimdSizeX * SimdSizeY;

	UNROLL
	for (uint Element = 0; Element < ElementCount; Element++)
	{
		// Leading components of M go to A.
		UNROLL
		for (uint CompA = 0; CompA < VectorSizeA; CompA++)
		{
			A.Registers.SetElementComponent(Element, CompA, M.Registers.GetElementComponent(Element, CompA));
		}

		// The components that follow go to B.
		UNROLL
		for (uint CompB = 0; CompB < VectorSizeB; CompB++)
		{
			B.Registers.SetElementComponent(Element, CompB, M.Registers.GetElementComponent(Element, VectorSizeA + CompB));
		}
	}
}
/**
 * Splits M into three lane vectors, element by element.
 *
 * @param M Source lane vector with VectorSizeA + VectorSizeB + VectorSizeC components per element.
 * @param A Receives components [0, VectorSizeA) of every element of M.
 * @param B Receives the VectorSizeB components of M starting at VectorSizeA.
 * @param C Receives the VectorSizeC components of M starting at VectorSizeA + VectorSizeB.
 */
CALL_SITE_DEBUGLOC
template<typename ScalarType, uint VectorSizeA, uint VectorSizeB, uint VectorSizeC, uint SimdSizeX, uint SimdSizeY>
void Deconcatenate(
	TLaneVector2D<ScalarType, VectorSizeA + VectorSizeB + VectorSizeC, SimdSizeX, SimdSizeY> M,
	out TLaneVector2D<ScalarType, VectorSizeA, SimdSizeX, SimdSizeY> A,
	out TLaneVector2D<ScalarType, VectorSizeB, SimdSizeX, SimdSizeY> B,
	out TLaneVector2D<ScalarType, VectorSizeC, SimdSizeX, SimdSizeY> C)
{
	const uint ElementCount = SimdSizeX * SimdSizeY;

	UNROLL
	for (uint Element = 0; Element < ElementCount; Element++)
	{
		// Leading components of M go to A.
		UNROLL
		for (uint CompA = 0; CompA < VectorSizeA; CompA++)
		{
			A.Registers.SetElementComponent(Element, CompA, M.Registers.GetElementComponent(Element, CompA));
		}

		// The next VectorSizeB components go to B.
		UNROLL
		for (uint CompB = 0; CompB < VectorSizeB; CompB++)
		{
			B.Registers.SetElementComponent(Element, CompB, M.Registers.GetElementComponent(Element, VectorSizeA + CompB));
		}

		// The remaining VectorSizeC components go to C.
		UNROLL
		for (uint CompC = 0; CompC < VectorSizeC; CompC++)
		{
			C.Registers.SetElementComponent(Element, CompC, M.Registers.GetElementComponent(Element, VectorSizeA + VectorSizeB + CompC));
		}
	}
}
// ------------- 1 parameter
// Generates a free function AliasFunctionName(TLaneVector2D A) that applies FunctionName to every
// register row of A and returns the rows in a TLaneVector2D<ReturnScalarType, ...>.
//   ReturnScalarType  - scalar type of the result; passing `ScalarType` names the generated
//                       function's own template parameter, i.e. the same scalar type as the input.
//   FunctionName      - function called on each register row.
//   AliasFunctionName - name of the generated lane-vector function.
// The last line of the definition deliberately carries no line continuation, so the macro ends at
// its closing brace whatever line follows it.
#define TVECTOR_FUNCTION_1PARAMS_ALIASE_FUNCTION_NAME(ReturnScalarType, FunctionName,AliasFunctionName) \
	CALL_SITE_DEBUGLOC \
	template<typename ScalarType, uint VectorSize, uint SimdSizeX, uint SimdSizeY> \
	TLaneVector2D<ReturnScalarType, VectorSize, SimdSizeX, SimdSizeY> AliasFunctionName( \
		TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> A) \
	{ \
		TLaneVector2D<ReturnScalarType, VectorSize, SimdSizeX, SimdSizeY> R; \
		if(TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY>::RegisterRowCount == 2) \
		{ \
			R.Registers.SetRegisterRow(0, FunctionName(A.Registers.GetRegisterRow(0))); \
			R.Registers.SetRegisterRow(1, FunctionName(A.Registers.GetRegisterRow(1))); \
		} \
		else if(TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY>::RegisterRowCount == 4) \
		{ \
			R.Registers.SetRegisterRow(0, FunctionName(A.Registers.GetRegisterRow(0))); \
			R.Registers.SetRegisterRow(1, FunctionName(A.Registers.GetRegisterRow(1))); \
			R.Registers.SetRegisterRow(2, FunctionName(A.Registers.GetRegisterRow(2))); \
			R.Registers.SetRegisterRow(3, FunctionName(A.Registers.GetRegisterRow(3))); \
		} \
		else \
		{ \
			UNROLL for (uint RegisterRowIndex = 0; RegisterRowIndex < TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY>::RegisterRowCount; RegisterRowIndex++) { \
				R.Registers.SetRegisterRow(RegisterRowIndex, FunctionName(A.Registers.GetRegisterRow(RegisterRowIndex))); \
			} \
		} \
		return R; \
	}

// Common case: the generated lane-vector function has the same name as the per-row function.
#define TVECTOR_FUNCTION_1PARAMS(ReturnScalarType, FunctionName) TVECTOR_FUNCTION_1PARAMS_ALIASE_FUNCTION_NAME(ReturnScalarType, FunctionName, FunctionName)

// Functions whose result keeps the scalar type of the input.
TVECTOR_FUNCTION_1PARAMS(ScalarType, log);
TVECTOR_FUNCTION_1PARAMS(ScalarType, log2);
TVECTOR_FUNCTION_1PARAMS(ScalarType, sqrt);
TVECTOR_FUNCTION_1PARAMS(ScalarType, rsqrt);
TVECTOR_FUNCTION_1PARAMS(ScalarType, exp);
TVECTOR_FUNCTION_1PARAMS(ScalarType, rcp);
TVECTOR_FUNCTION_1PARAMS(ScalarType, saturate);
TVECTOR_FUNCTION_1PARAMS(ScalarType, abs);
TVECTOR_FUNCTION_1PARAMS(ScalarType, floor);
TVECTOR_FUNCTION_1PARAMS(ScalarType, ceil);
TVECTOR_FUNCTION_1PARAMS(ScalarType, round);
TVECTOR_FUNCTION_1PARAMS(ScalarType, fast_sign);

// Functions with a fixed result scalar type.
TVECTOR_FUNCTION_1PARAMS(bool, not);
TVECTOR_FUNCTION_1PARAMS(uint, bit_not);
#if PLATFORM_SUPPORTS_REAL_TYPES
TVECTOR_FUNCTION_1PARAMS(uint16_t, asuint16);
#endif

// Workaround: the asuint and asfloat keywords cannot be used on struct, so the lane-vector
// versions are exposed under the names azuint and azfloat.
TVECTOR_FUNCTION_1PARAMS_ALIASE_FUNCTION_NAME(uint, asuint, azuint);
TVECTOR_FUNCTION_1PARAMS_ALIASE_FUNCTION_NAME(float,asfloat, azfloat);
// ------------- 2 parameters
// Generates a free function FunctionName(TLaneVector2D A, TLaneVector2D B) that applies the per-row
// FunctionName to matching register rows of A and B. Inputs and result share the same scalar type,
// vector size and SIMD dimensions.
// The last line of the definition deliberately carries no line continuation, so the macro ends at
// its closing brace and the first instantiation below is not spliced into the macro body.
#define TVECTOR_FUNCTION_2PARAMS(FunctionName) \
	CALL_SITE_DEBUGLOC \
	template<typename ScalarType, uint VectorSize, uint SimdSizeX, uint SimdSizeY> \
	TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> FunctionName( \
		TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> A, \
		TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> B) \
	{ \
		TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> R; \
		if (TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY>::RegisterRowCount == 2) \
		{ \
			R.Registers.SetRegisterRow(0, FunctionName(A.Registers.GetRegisterRow(0), B.Registers.GetRegisterRow(0))); \
			R.Registers.SetRegisterRow(1, FunctionName(A.Registers.GetRegisterRow(1), B.Registers.GetRegisterRow(1))); \
		} \
		else if (TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY>::RegisterRowCount == 4) \
		{ \
			R.Registers.SetRegisterRow(0, FunctionName(A.Registers.GetRegisterRow(0), B.Registers.GetRegisterRow(0))); \
			R.Registers.SetRegisterRow(1, FunctionName(A.Registers.GetRegisterRow(1), B.Registers.GetRegisterRow(1))); \
			R.Registers.SetRegisterRow(2, FunctionName(A.Registers.GetRegisterRow(2), B.Registers.GetRegisterRow(2))); \
			R.Registers.SetRegisterRow(3, FunctionName(A.Registers.GetRegisterRow(3), B.Registers.GetRegisterRow(3))); \
		} \
		else \
		{ \
			UNROLL for (uint RegisterRowIndex = 0; RegisterRowIndex < TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY>::RegisterRowCount; RegisterRowIndex++) { \
				R.Registers.SetRegisterRow(RegisterRowIndex, FunctionName(A.Registers.GetRegisterRow(RegisterRowIndex), B.Registers.GetRegisterRow(RegisterRowIndex))); \
			} \
		} \
		return R; \
	}

// Lane-vector overloads of the two-operand functions.
TVECTOR_FUNCTION_2PARAMS(pow);
TVECTOR_FUNCTION_2PARAMS(min);
TVECTOR_FUNCTION_2PARAMS(max);
TVECTOR_FUNCTION_2PARAMS(and_internal);
TVECTOR_FUNCTION_2PARAMS(or_internal);
TVECTOR_FUNCTION_2PARAMS(bit_and);
TVECTOR_FUNCTION_2PARAMS(bit_or);
TVECTOR_FUNCTION_2PARAMS(bit_shift_left);
TVECTOR_FUNCTION_2PARAMS(bit_shift_right);
// ------------- 2 parameters but different return dimension
// Generates a free function FunctionName(TLaneVector2D A, TLaneVector2D B) whose result has
// ReturnVectorSize components per element instead of the inputs' VectorSize. Because the input and
// output vector sizes differ, the function is applied per element (GetElement / SetElement) rather
// than per register row.
// The last line of the definition deliberately carries no line continuation, so the macro ends at
// its closing brace and the instantiation below is not spliced into the macro body.
#define TVECTOR_FUNCTION_2PARAMS_DIFF_RETURN(ReturnVectorSize, FunctionName) \
	CALL_SITE_DEBUGLOC \
	template<typename ScalarType, uint VectorSize, uint SimdSizeX, uint SimdSizeY> \
	TLaneVector2D<ScalarType, ReturnVectorSize, SimdSizeX, SimdSizeY> FunctionName( \
		TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> A, \
		TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> B) \
	{ \
		TLaneVector2D<ScalarType, ReturnVectorSize, SimdSizeX, SimdSizeY> R; \
		if (SimdSizeX * SimdSizeY == 2) \
		{ \
			R.SetElement(0, FunctionName(A.GetElement(0), B.GetElement(0))); \
			R.SetElement(1, FunctionName(A.GetElement(1), B.GetElement(1))); \
		} \
		else if (SimdSizeX * SimdSizeY == 4) \
		{ \
			R.SetElement(0, FunctionName(A.GetElement(0), B.GetElement(0))); \
			R.SetElement(1, FunctionName(A.GetElement(1), B.GetElement(1))); \
			R.SetElement(2, FunctionName(A.GetElement(2), B.GetElement(2))); \
			R.SetElement(3, FunctionName(A.GetElement(3), B.GetElement(3))); \
		} \
		else \
		{ \
			UNROLL for (uint ElementIndex = 0; ElementIndex < SimdSizeX * SimdSizeY; ElementIndex++) { \
				R.SetElement(ElementIndex, FunctionName(A.GetElement(ElementIndex), B.GetElement(ElementIndex))); \
			} \
		} \
		return R; \
	}

// dot() reduces each pair of elements to a single component.
TVECTOR_FUNCTION_2PARAMS_DIFF_RETURN(/* VectorSize = */ 1, dot);
// ------------- 3 parameters
// Generates a free function FunctionName(TLaneVector2D A, TLaneVector2D B, TLaneVector2D C) that
// applies the per-row FunctionName to matching register rows of A, B and C.
//   FunctionName - function called on each triple of register rows.
//   ScalarTypeA  - scalar type of the first operand; passing `ScalarType` names the generated
//                  function's own template parameter (same scalar type as B, C and the result).
// The last line of the definition deliberately carries no line continuation, so the macro ends at
// its closing brace and the first instantiation below is not spliced into the macro body.
#define TVECTOR_FUNCTION_3PARAMS(FunctionName, ScalarTypeA) \
	CALL_SITE_DEBUGLOC \
	template<typename ScalarType, uint VectorSize, uint SimdSizeX, uint SimdSizeY> \
	TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> FunctionName( \
		TLaneVector2D<ScalarTypeA, VectorSize, SimdSizeX, SimdSizeY> A, \
		TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> B, \
		TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> C) \
	{ \
		TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> R; \
		if (TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY>::RegisterRowCount == 2) \
		{ \
			R.Registers.SetRegisterRow(0, FunctionName(A.Registers.GetRegisterRow(0), B.Registers.GetRegisterRow(0), C.Registers.GetRegisterRow(0))); \
			R.Registers.SetRegisterRow(1, FunctionName(A.Registers.GetRegisterRow(1), B.Registers.GetRegisterRow(1), C.Registers.GetRegisterRow(1))); \
		} \
		else if (TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY>::RegisterRowCount == 4) \
		{ \
			R.Registers.SetRegisterRow(0, FunctionName(A.Registers.GetRegisterRow(0), B.Registers.GetRegisterRow(0), C.Registers.GetRegisterRow(0))); \
			R.Registers.SetRegisterRow(1, FunctionName(A.Registers.GetRegisterRow(1), B.Registers.GetRegisterRow(1), C.Registers.GetRegisterRow(1))); \
			R.Registers.SetRegisterRow(2, FunctionName(A.Registers.GetRegisterRow(2), B.Registers.GetRegisterRow(2), C.Registers.GetRegisterRow(2))); \
			R.Registers.SetRegisterRow(3, FunctionName(A.Registers.GetRegisterRow(3), B.Registers.GetRegisterRow(3), C.Registers.GetRegisterRow(3))); \
		} \
		else \
		{ \
			UNROLL for (uint RegisterRowIndex = 0; RegisterRowIndex < TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY>::RegisterRowCount; RegisterRowIndex++) { \
				R.Registers.SetRegisterRow(RegisterRowIndex, FunctionName(A.Registers.GetRegisterRow(RegisterRowIndex), B.Registers.GetRegisterRow(RegisterRowIndex), C.Registers.GetRegisterRow(RegisterRowIndex))); \
			} \
		} \
		return R; \
	}

// select_internal takes a bool lane vector as its first operand; the others use ScalarType throughout.
TVECTOR_FUNCTION_3PARAMS(select_internal, bool);
TVECTOR_FUNCTION_3PARAMS(clamp, ScalarType);
TVECTOR_FUNCTION_3PARAMS(lerp, ScalarType);
TVECTOR_FUNCTION_3PARAMS(min3, ScalarType);
TVECTOR_FUNCTION_3PARAMS(max3, ScalarType);
#if COMPILER_SUPPORTS_MED3
TVECTOR_FUNCTION_3PARAMS(med3, ScalarType);
#endif
// ------------- AnyElement & AllElement
/**
 * Reduces each element of a bool lane vector with any().
 * Element i of the result is any(A.GetElement(i)), stored as a single-component element.
 */
CALL_SITE_DEBUGLOC
template<uint VectorSize, uint SimdSizeX, uint SimdSizeY>
TLaneVector2D<bool, 1, SimdSizeX, SimdSizeY> AnyElement(TLaneVector2D<bool, VectorSize, SimdSizeX, SimdSizeY> A)
{
	const uint ElementCount = SimdSizeX * SimdSizeY;

	TLaneVector2D<bool, 1, SimdSizeX, SimdSizeY> Result;
	// Element 0 is always reduced; the branches below handle the remaining elements.
	Result.SetElement(0, any(A.GetElement(0)));

	if (ElementCount == 4)
	{
		Result.SetElement(1, any(A.GetElement(1)));
		Result.SetElement(2, any(A.GetElement(2)));
		Result.SetElement(3, any(A.GetElement(3)));
	}
	else if (ElementCount == 2)
	{
		Result.SetElement(1, any(A.GetElement(1)));
	}
	else
	{
		UNROLL
		for (uint i = 1; i < ElementCount; i++)
		{
			Result.SetElement(i, any(A.GetElement(i)));
		}
	}
	return Result;
}
/**
 * Reduces each element of a bool lane vector with all().
 * Element i of the result is all(A.GetElement(i)), stored as a single-component element.
 */
CALL_SITE_DEBUGLOC
template<uint VectorSize, uint SimdSizeX, uint SimdSizeY>
TLaneVector2D<bool, 1, SimdSizeX, SimdSizeY> AllElement(TLaneVector2D<bool, VectorSize, SimdSizeX, SimdSizeY> A)
{
	const uint ElementCount = SimdSizeX * SimdSizeY;

	TLaneVector2D<bool, 1, SimdSizeX, SimdSizeY> Result;
	// Element 0 is always reduced; the branches below handle the remaining elements.
	Result.SetElement(0, all(A.GetElement(0)));

	if (ElementCount == 4)
	{
		Result.SetElement(1, all(A.GetElement(1)));
		Result.SetElement(2, all(A.GetElement(2)));
		Result.SetElement(3, all(A.GetElement(3)));
	}
	else if (ElementCount == 2)
	{
		Result.SetElement(1, all(A.GetElement(1)));
	}
	else
	{
		UNROLL
		for (uint i = 1; i < ElementCount; i++)
		{
			Result.SetElement(i, all(A.GetElement(i)));
		}
	}
	return Result;
}
/**
 * Returns true when any component of any element of the bool lane vector A is true.
 * The per-element any() results are combined with logical OR.
 */
CALL_SITE_DEBUGLOC
template<uint VectorSize, uint SimdSizeX, uint SimdSizeY>
bool AnyComponent(TLaneVector2D<bool, VectorSize, SimdSizeX, SimdSizeY> A)
{
	const uint ElementCount = SimdSizeX * SimdSizeY;

	// Element 0 seeds the reduction; the branches below fold in the remaining elements.
	bool bAnySet = any(A.GetElement(0));

	if (ElementCount == 4)
	{
		bAnySet = bAnySet || any(A.GetElement(1)) || any(A.GetElement(2)) || any(A.GetElement(3));
	}
	else if (ElementCount == 2)
	{
		bAnySet = bAnySet || any(A.GetElement(1));
	}
	else
	{
		UNROLL
		for (uint i = 1; i < ElementCount; i++)
		{
			bAnySet = bAnySet || any(A.GetElement(i));
		}
	}
	return bAnySet;
}