// Copyright Epic Games, Inc. All Rights Reserved. #pragma once #include "TSRKernels.ush" #include "/Engine/Public/LaneVectorization.ush" #ifndef WAVE_COUNT_X #error WAVE_COUNT_X is undefined. #endif #ifndef WAVE_COUNT_Y #error WAVE_COUNT_Y is undefined. #endif #ifndef LANE_COUNT_X #error LANE_COUNT_X is undefined. #endif #ifndef LANE_COUNT_Y #error LANE_COUNT_Y is undefined. #endif /** Total number of waves. */ #define WAVE_COUNT (WAVE_COUNT_X * WAVE_COUNT_Y) /** Total number of lane per wave. */ #define LANE_COUNT (LANE_COUNT_X * LANE_COUNT_Y) /** * LANE_COUNT_X=2 * LANE_COUNT_Y=2 * LaneStride=(4, 2) * * o o o o | o o o o * | * o o o o | o o o o * --------------+-------------- * o o o o | o o o o * | * o o o o | o o o o */ //------------------------------------------------------- VECTOR MEMORY ORDER /** Returns coordinate of a pixel within a lane. */ CALL_SITE_DEBUGLOC template tsr_short2 GetSimdIndexPixelCoordinateInLane(const uint SimdIndex) { return tsr_short2(SimdIndex % LaneStrideX, SimdIndex / LaneStrideX); } CALL_SITE_DEBUGLOC template uint GetPixelCoordinateInLaneSimdIndex(const tsr_short2 PixelCoordinateInLane) { return uint(dot(PixelCoordinateInLane, tsr_short2(1, LaneStrideX))); } /** Returns coordinate of a pixel within a wave. */ CALL_SITE_DEBUGLOC template tsr_short2 GetLaneSimdPixelOffset(const uint LaneIndex, const uint SimdIndex) { return ( tsr_short2(LaneIndex % uint(LANE_COUNT_X * WAVE_COUNT_X), LaneIndex / uint(LANE_COUNT_X * WAVE_COUNT_X)) * tsr_short2(LaneStrideX, LaneStrideY) + GetSimdIndexPixelCoordinateInLane(SimdIndex) ); } /** Returns coordinate of a element within a lane. */ CALL_SITE_DEBUGLOC tsr_short2 GetElementPixelPosInLane(const uint LaneStrideX, const uint LaneStrideY, const uint ElementIndex) { return tsr_short2(ElementIndex % LaneStrideX, ElementIndex / LaneStrideX); } /** Returns coordinate of a element within a tile. */ CALL_SITE_DEBUGLOC tsr_short2 GetElementPixelOffsetInTile(const uint LaneStrideX, const uint LaneStrideY, const uint LaneIndex, const uint SimdIndex) { return ( tsr_short2(LaneIndex % uint(LANE_COUNT_X * WAVE_COUNT_X), LaneIndex / uint(LANE_COUNT_X * WAVE_COUNT_X)) * tsr_short2(LaneStrideX, LaneStrideY) + GetElementPixelPosInLane(LaneStrideX, LaneStrideY, SimdIndex) ); } CALL_SITE_DEBUGLOC tsr_short2 ComputeElementInputPixelPos( const uint2 LaneStride, const uint2 TileOverscan, const uint2 PixelViewportMin, const uint2 PixelViewportMax, const uint2 GroupId, const uint GroupThreadIndex, const uint InputElementIndex, const int2 InputDataOffset = int2(0, 0), const uint InputResDivisor = 1) { const uint2 TileSize = uint2(LaneStride.x * WAVE_COUNT_X * LANE_COUNT_X, LaneStride.y * WAVE_COUNT_Y * LANE_COUNT_Y); const uint2 TileStride = uint2(TileSize - 2 * TileOverscan); tsr_short2 InputPixelOffsetMin = tsr_short2(PixelViewportMin / InputResDivisor); tsr_short2 InputPixelOffsetMax = tsr_short2((PixelViewportMax + (InputResDivisor - 1)) / InputResDivisor - 1); tsr_short2 InputTileOffset = tsr_short2(TileStride / InputResDivisor) * tsr_short2(GroupId) - tsr_short2(TileOverscan / InputResDivisor); tsr_short2 InputPixelOffsetInTile = GetElementPixelOffsetInTile( LaneStride.x / InputResDivisor, LaneStride.y / InputResDivisor, GroupThreadIndex, InputElementIndex); return fastClamp(InputPixelOffsetMin + InputTileOffset + InputPixelOffsetInTile, InputPixelOffsetMin, InputPixelOffsetMax); } CALL_SITE_DEBUGLOC tsr_short2 ComputeElementOutputPixelPos( const uint2 LaneStride, const uint2 TileOverscan, const uint2 PixelViewportMin, const uint2 PixelViewportMax, const uint2 GroupId, const uint GroupThreadIndex, const uint OutputElementIndex, const int2 OutputDataOffset = int2(0, 0), const uint OutputResDivisor = 1, const bool bDebugDisablePadding = false) { const uint2 TileSize = uint2(LaneStride.x * WAVE_COUNT_X * LANE_COUNT_X, LaneStride.y * WAVE_COUNT_Y * LANE_COUNT_Y); const uint2 TileStride = bDebugDisablePadding ? TileSize : uint2(TileSize - 2 * TileOverscan); tsr_short2 OutputPixelOffsetMin = tsr_short2(PixelViewportMin / OutputResDivisor); tsr_short2 OutputPixelOffsetMax = tsr_short2((PixelViewportMax + (OutputResDivisor - 1)) / OutputResDivisor); const uint2 DataMinInTile = TileOverscan - uint2(OutputDataOffset); const uint2 DataMaxInTile = TileSize - uint2(OutputDataOffset) - TileOverscan; tsr_short2 OutputTileOffset = tsr_short2(TileStride / OutputResDivisor) * tsr_short2(GroupId) - tsr_short2((bDebugDisablePadding ? 0u : TileOverscan) / OutputResDivisor); tsr_short2 OutputPixelOffsetInTile = GetElementPixelOffsetInTile( LaneStride.x / OutputResDivisor, LaneStride.y / OutputResDivisor, GroupThreadIndex, OutputElementIndex); tsr_short2 OutputPixelOffsetInOutputTexture = OutputPixelOffsetMin + OutputTileOffset + OutputPixelOffsetInTile; bool2 bIsWithinOutputViewport = OutputPixelOffsetInOutputTexture < OutputPixelOffsetMax; bool2 bIsNotPadding = or(bDebugDisablePadding.xx, and( OutputPixelOffsetInTile * tsr_short(OutputResDivisor) >= tsr_short2(DataMinInTile), OutputPixelOffsetInTile * tsr_short(OutputResDivisor) < tsr_short2(DataMaxInTile))); bool2 bIsValid = and(bIsNotPadding, bIsWithinOutputViewport); OutputPixelOffsetInOutputTexture = select(bIsValid, OutputPixelOffsetInOutputTexture, ~tsr_short(0).xx); return OutputPixelOffsetInOutputTexture; } //------------------------------------------------------- PADDING CALL_SITE_DEBUGLOC template TLaneVector2D IsValidTilePixel(const uint2 TileOverscan) { const uint2 TileSize = uint2(LaneStrideX * WAVE_COUNT_X * LANE_COUNT_X, LaneStrideY * WAVE_COUNT_Y * LANE_COUNT_Y); TLaneVector2D bIsValidTilePixel; UNROLL for (uint ElementIndex = 0; ElementIndex < LaneStrideX * LaneStrideY; ElementIndex++) { tsr_short2 PixelOffsetInTile = GetElementPixelOffsetInTile( LaneStrideX, LaneStrideY, GGroupThreadIndex, ElementIndex); bool2 bIsNotPadding = and( PixelOffsetInTile >= tsr_short2(TileOverscan), PixelOffsetInTile < tsr_short2(TileSize - TileOverscan)); bIsValidTilePixel.SetElement(ElementIndex, all(bIsNotPadding)); } return bIsValidTilePixel; } //------------------------------------------------------- VOTE groupshared uint SharedAtomic; CALL_SITE_DEBUGLOC bool AnyThread(bool X) #if WAVE_COUNT_X == 1 && WAVE_COUNT_Y == 1 && COMPILER_SUPPORTS_WAVE_VOTE { return WaveActiveAnyTrue(X); } #elif COMPILER_SUPPORTS_WAVE_VOTE && PLATFORM_GPU_ARCH == PLATFORM_GPU_ARCH_AMD_RDNA_2 { const uint kLaneCount = WaveGetLaneCount(); const uint kThreadCount = LANE_COUNT_X * LANE_COUNT_Y * WAVE_COUNT_X * WAVE_COUNT_Y; const uint kWaveCount = kThreadCount / kLaneCount; bool AnyX = WaveActiveAnyTrue(X); GroupMemoryBarrierWithGroupSync(); if (WaveIsFirstLane()) { WriteDwordToLDS(GGroupThreadIndex / kLaneCount, select(AnyX, 1u, 0u)); } GroupMemoryBarrierWithGroupSync(); uint WaveX = ReadDwordFromLDS(GGroupThreadIndex % kWaveCount); return WaveActiveAnyTrue(WaveX != 0); } #elif defined(LDS_SIZE) { const uint SharedIndex = 0; GroupMemoryBarrierWithGroupSync(); if (GGroupThreadIndex == 0) { WriteDwordToLDS(SharedIndex, 0); } GroupMemoryBarrierWithGroupSync(); AtomicIncrementLDSDword(SharedIndex, select(X, 1u, 0u)); GroupMemoryBarrierWithGroupSync(); return ReadDwordFromLDS(SharedIndex) > 0; } #else { GroupMemoryBarrierWithGroupSync(); if (GGroupThreadIndex == 0) { SharedAtomic = 0; } GroupMemoryBarrierWithGroupSync(); InterlockedAdd(/* inout */ SharedAtomic, select(X, 1u, 0u)); GroupMemoryBarrierWithGroupSync(); return SharedAtomic > 0; } #endif //------------------------------------------------------- ACCESS NEIGHBOR USING WAVE OPS CALL_SITE_DEBUGLOC template uint GetNeighborElementIndex(const uint ElementIndex, const tsr_short2 Offset) { const uint2 ElementPixelCoord = uint2(GetSimdIndexPixelCoordinateInLane(ElementIndex)); const uint2 NeigborElementPixelCoord = (ElementPixelCoord + uint2(Offset)) % uint2(LaneStrideX, LaneStrideY); const uint NeigborElementIndex = dot(NeigborElementPixelCoord, uint2(1, LaneStrideX)); return NeigborElementIndex; } #if WAVE_COUNT_X == 1 || WAVE_COUNT_Y == 1 CALL_SITE_DEBUGLOC template FWaveBroadcastSettings GetWaveBroadcastSettingsForNeighbor(const uint ElementIndex, const tsr_short2 Offset) { const uint2 ElementPixelCoord = uint2(GetSimdIndexPixelCoordinateInLane(ElementIndex)); const bool2 bNeedsLaneRotation = (ElementPixelCoord + uint2(Offset)) >= uint2(LaneStrideX, LaneStrideY); const int LaneRotation = dot(select(bNeedsLaneRotation, int2(Offset), 0), int2(1, LANE_COUNT_X)); return InitWaveRotateLaneGroup(/* LaneGroupSize = */ LANE_COUNT, LaneRotation); } CALL_SITE_DEBUGLOC template vector WaveAccessNeighborElement( TLaneVector2D Center, const uint ElementIndex, const tsr_short2 Offset) { const uint NeigborElementIndex = GetNeighborElementIndex(ElementIndex, Offset); const FWaveBroadcastSettings BroadcastSettings = GetWaveBroadcastSettingsForNeighbor(ElementIndex, Offset); // Access the lement. vector ReturnSimdElement; if (BroadcastSettings.Rotate != 0) { ReturnSimdElement = Center.Registers.WaveBroadcastElement(BroadcastSettings, NeigborElementIndex); } else { ReturnSimdElement = Center.GetElement(NeigborElementIndex); } return ReturnSimdElement; } CALL_SITE_DEBUGLOC template TLaneVector2D WaveAccessNeighborTexel( TLaneVector2D Center, const tsr_short2 Offset) { TLaneVector2D Return; UNROLL for (uint ElementIndex = 0; ElementIndex < LaneStrideX * LaneStrideY; ElementIndex++) { Return.SetElement(ElementIndex, WaveAccessNeighborElement(Center, ElementIndex, Offset)); } Return.TightenRegisters(); return Return; } #endif // WAVE_COUNT_X == 1 || WAVE_COUNT_Y == 1 //------------------------------------------------------- 3x1 NEIGHBOR ACCESS CALL_SITE_DEBUGLOC template void AccessNeighborTexels3x1( TLaneVector2D CO, out TLaneVector2D CP, out TLaneVector2D CN) #if WAVE_COUNT_X > 1 { const uint TotalNumberOfLaneInGroup = LANE_COUNT_Y * WAVE_COUNT; if (LaneStrideX == 1) { GroupMemoryBarrierWithGroupSync(); UNROLL for (uint SimdIndex = 0; SimdIndex < LaneStrideY; SimdIndex++) { WriteVectorToLDS(GGroupThreadIndex + SimdIndex * TotalNumberOfLaneInGroup, CO.GetElement(SimdIndex)); } GroupMemoryBarrierWithGroupSync(); UNROLL for (uint SimdIndex = 0; SimdIndex < LaneStrideY; SimdIndex++) { vector FirstElement; vector LastElement; ReadVectorFromLDS(((GGroupThreadIndex - 1) % TotalNumberOfLaneInGroup) + SimdIndex * TotalNumberOfLaneInGroup, /* out */ FirstElement); ReadVectorFromLDS(((GGroupThreadIndex + 1) % TotalNumberOfLaneInGroup) + SimdIndex * TotalNumberOfLaneInGroup, /* out */ LastElement); CP.SetElement(SimdIndex, LastElement); CN.SetElement(SimdIndex, FirstElement); } } else { // TODO CP = CN = CO; } CP.TightenRegisters(); CN.TightenRegisters(); } #else { CP = WaveAccessNeighborTexel(CO, tsr_short2(1, 0)); CN = WaveAccessNeighborTexel(CO, tsr_short2(-1, 0)); } #endif //------------------------------------------------------- 1x3 NEIGHBOR ACCESS CALL_SITE_DEBUGLOC ISOLATE template void Write1x3CenterToLDS(TLaneVector2D CO) #if WAVE_COUNT_Y > 1 { const uint TotalNumberOfLaneInGroup = LANE_COUNT_X * WAVE_COUNT; GroupMemoryBarrierWithGroupSync(); if (LaneStrideY == 1) { UNROLL for (uint ElementIndex = 0; ElementIndex < LaneStrideX; ElementIndex++) { WriteVectorToLDS(GGroupThreadIndex + ElementIndex * TotalNumberOfLaneInGroup, CO.GetElement(ElementIndex)); } } else { const uint FirstElementIndex = 0; const uint LastElementIndex = LaneStrideX * LaneStrideY - LaneStrideX; UNROLL for (uint ElementIndex = 0; ElementIndex < LaneStrideX; ElementIndex++) { WriteVectorToLDS(GGroupThreadIndex + (0 + ElementIndex) * TotalNumberOfLaneInGroup, CO.GetElement(FirstElementIndex + ElementIndex)); WriteVectorToLDS(GGroupThreadIndex + (LaneStrideX + ElementIndex) * TotalNumberOfLaneInGroup, CO.GetElement(LastElementIndex + ElementIndex)); } } GroupMemoryBarrierWithGroupSync(); } #else { // NOP } #endif CALL_SITE_DEBUGLOC ISOLATE template void Read1x3NeighborElementsFromLDS( TLaneVector2D CO, const uint ElementIndex, out vector CP, out vector CN) #if WAVE_COUNT_Y > 1 { const uint TotalNumberOfLaneInGroup = LANE_COUNT_X * WAVE_COUNT; if (LaneStrideY == 1) { ReadVectorFromLDS(((GGroupThreadIndex - LANE_COUNT_X * WAVE_COUNT_X) % TotalNumberOfLaneInGroup) + ElementIndex * TotalNumberOfLaneInGroup, /* out */ CN); ReadVectorFromLDS(((GGroupThreadIndex + LANE_COUNT_X * WAVE_COUNT_X) % TotalNumberOfLaneInGroup) + ElementIndex * TotalNumberOfLaneInGroup, /* out */ CP); } else // if (LaneStrideY > 1) { if (ElementIndex < LaneStrideX) { ReadVectorFromLDS(((GGroupThreadIndex - LANE_COUNT_X * WAVE_COUNT_X) % TotalNumberOfLaneInGroup) + (LaneStrideX + ElementIndex) * TotalNumberOfLaneInGroup, /* out */ CN); } else { CN = CO.GetElement(ElementIndex - LaneStrideX); } const uint LastElementIndex = LaneStrideX * LaneStrideY - LaneStrideX; if (ElementIndex >= LastElementIndex) { ReadVectorFromLDS(((GGroupThreadIndex + LANE_COUNT_X * WAVE_COUNT_X) % TotalNumberOfLaneInGroup) + (0 + ElementIndex - LastElementIndex) * TotalNumberOfLaneInGroup, /* out */ CP); } else { CP = CO.GetElement(ElementIndex + LaneStrideX); } } } #else { CP = WaveAccessNeighborElement(CO, ElementIndex, tsr_short2(0, 1)); CN = WaveAccessNeighborElement(CO, ElementIndex, tsr_short2(0, -1)); } #endif CALL_SITE_DEBUGLOC ISOLATE template void Read1x3CenterFromLDS( TLaneVector2D CO, out TLaneVector2D CP, out TLaneVector2D CN) { UNROLL for (uint ElementIndex = 0; ElementIndex < LaneStrideX * LaneStrideY; ElementIndex++) { vector CPElement; vector CNElement; Read1x3NeighborElementsFromLDS(CO, ElementIndex, /* out */ CPElement, /* out */ CNElement); CP.SetElement(ElementIndex, CPElement); CN.SetElement(ElementIndex, CNElement); } CP.TightenRegisters(); CN.TightenRegisters(); } CALL_SITE_DEBUGLOC ISOLATE template void AccessNeighborTexels1x3( TLaneVector2D CO, out TLaneVector2D CP, out TLaneVector2D CN) { Write1x3CenterToLDS(CO); Read1x3CenterFromLDS(CO, /* out */ CP, /* out */ CN); } //------------------------------------------------------- CONVOLUTION WITH SINK FOR LOWER REGISTER PRESSURE CALL_SITE_DEBUGLOC template TLaneVector2D Convolve1x3Then3x1FromLDS(TLaneVector2D C, ConvolutionSinkType ConvolutionSink) #if WAVE_COUNT_X == 1 && 0 { // Only if LaneStrideX==1 and WAVE_COUNT_X==1 TLaneVector2D R; UNROLL for (uint ElementIndex = 0; ElementIndex < LaneStrideX * LaneStrideY; ElementIndex++) { vector CPElement; vector CNElement; Read1x3NeighborElementsFromLDS(C, ElementIndex, /* out */ CPElement, /* out */ CNElement); vector HCElement = ConvolutionSink.ConvolveElements1x3(CNElement, C.GetElement(ElementIndex), CPElement); const FWaveBroadcastSettings HPSettings = GetWaveBroadcastSettingsForNeighbor(ElementIndex, tsr_short2(1, 0)); const FWaveBroadcastSettings HNSettings = GetWaveBroadcastSettingsForNeighbor(ElementIndex, tsr_short2(-1, 0)); vector HPElement = WaveBroadcast(HPSettings, HCElement); vector HNElement = WaveBroadcast(HNSettings, HCElement); R.SetElement(ElementIndex, ConvolutionSink.ConvolveElements3x1(HNElement, HCElement, HPElement)); } return R; } #else { TLaneVector2D VO; { TLaneVector2D CO = C; TLaneVector2D CP; TLaneVector2D CN; Read1x3CenterFromLDS(CO, /* out */ CP, /* out */ CN); UNROLL for (uint RegisterRowIndex = 0; RegisterRowIndex < TLaneVector2D::RegisterRowCount; RegisterRowIndex++) { VO.Registers.SetRegisterRow(RegisterRowIndex, ConvolutionSink.ConvolveRegisterRows1x3(CN.Registers.GetRegisterRow(RegisterRowIndex), CO.Registers.GetRegisterRow(RegisterRowIndex), CP.Registers.GetRegisterRow(RegisterRowIndex))); } } TLaneVector2D R; { TLaneVector2D CO = VO; TLaneVector2D CP; TLaneVector2D CN; AccessNeighborTexels3x1(CO, /* out */ CP, /* out */ CN); UNROLL for (uint RegisterRowIndex = 0; RegisterRowIndex < TLaneVector2D::RegisterRowCount; RegisterRowIndex++) { R.Registers.SetRegisterRow(RegisterRowIndex, ConvolutionSink.ConvolveRegisterRows3x1(CN.Registers.GetRegisterRow(RegisterRowIndex), CO.Registers.GetRegisterRow(RegisterRowIndex), CP.Registers.GetRegisterRow(RegisterRowIndex))); } } return R; } #endif CALL_SITE_DEBUGLOC template TLaneVector2D Convolve1x3Then3x1(TLaneVector2D Center, ConvolutionSinkType ConvolutionSink) { Write1x3CenterToLDS(Center); return Convolve1x3Then3x1FromLDS(Center, ConvolutionSink); } //------------------------------------------------------- BLUR 3x3 CONVOLUTION CALL_SITE_DEBUGLOC template TLaneVector2D WeightedSum3x3(TLaneVector2D Center, float3 HorizontalWeights, float3 VerticalWeights) { TLaneVector2D CO = Center; TLaneVector2D CP; TLaneVector2D CN; AccessNeighborTexels1x3(CO, /* out */ CP, /* out */ CN); TLaneVector2D BO = ( CO * TLaneVector2D::Const(ScalarType(VerticalWeights.y)) + CP * TLaneVector2D::Const(ScalarType(VerticalWeights.z)) + CN * TLaneVector2D::Const(ScalarType(VerticalWeights.x))); TLaneVector2D BP; TLaneVector2D BN; AccessNeighborTexels3x1(BO, /* out */ BP, /* out */ BN); return ( BO * TLaneVector2D::Const(ScalarType(HorizontalWeights.y)) + BP * TLaneVector2D::Const(ScalarType(HorizontalWeights.z)) + BN * TLaneVector2D::Const(ScalarType(HorizontalWeights.x))); } CALL_SITE_DEBUGLOC template TLaneVector2D Sum3x3(TLaneVector2D Center) { return WeightedSum3x3(Center, /* HorizontalWeights = */ float(1.0).xxx, /* VerticalWeights = */ float(1.0).xxx); } CALL_SITE_DEBUGLOC template TLaneVector2D WeightedAvg3x3(TLaneVector2D Center, float3 NormalizedWeights) { return WeightedSum3x3(Center, /* HorizontalWeights = */ NormalizedWeights, /* VerticalWeights = */ NormalizedWeights); } CALL_SITE_DEBUGLOC template TLaneVector2D Blur3x3(TLaneVector2D Center) { return WeightedAvg3x3(Center, /* NormalizedWeights = */ float3(0.25, 0.5, 0.25)); } CALL_SITE_DEBUGLOC template TLaneVector2D BlurEven3x3(TLaneVector2D Center) { return WeightedAvg3x3(Center, /* NormalizedWeights = */ float3(0.3333, 0.3333, 0.3333)); } CALL_SITE_DEBUGLOC template TLaneVector2D Sharpen3x3(TLaneVector2D Center) { return WeightedAvg3x3(Center, /* NormalizedWeights = */ float3(-0.25, 1.5, -0.25)); } //------------------------------------------------------- TOTAL VARIATION 3x3 CONVOLUTION CALL_SITE_DEBUGLOC template TLaneVector2D TotalVariation3x3(TLaneVector2D Center) #if 0 { TLaneVector2D Sum = Blur3x3(Center); return TLaneVector2D::Const(1) * Center - Sum; } #else { TLaneVector2D Sum = WeightedSum3x3(Center, /* HorizontalWeights = */ float(1.0).xxx, /* VerticalWeights = */ float3(0.125, 0.125, 0.125)); return TLaneVector2D::Const(1.125) * Center - Sum; } #endif //------------------------------------------------------- MIN & MAX 3x3 CONVOLUTION template struct TConvolutionSinkMin { static const uint RegisterRowSize = TLaneVector2D::RegisterRowSize; uint Unused; vector ConvolveElements3x1(vector N, vector C, vector P) { return min3(N, C, P); } vector ConvolveElements1x3(vector N, vector C, vector P) { return min3(N, C, P); } vector ConvolveRegisterRows3x1(vector N, vector C, vector P) { return min3(N, C, P); } vector ConvolveRegisterRows1x3(vector N, vector C, vector P) { return min3(N, C, P); } }; template struct TConvolutionSinkMax { static const uint RegisterRowSize = TLaneVector2D::RegisterRowSize; uint Unused; vector ConvolveElements3x1(vector N, vector C, vector P) { return max3(N, C, P); } vector ConvolveElements1x3(vector N, vector C, vector P) { return max3(N, C, P); } vector ConvolveRegisterRows3x1(vector N, vector C, vector P) { return max3(N, C, P); } vector ConvolveRegisterRows1x3(vector N, vector C, vector P) { return max3(N, C, P); } }; CALL_SITE_DEBUGLOC template TLaneVector2D Min3x3(TLaneVector2D Center) #if 1 { TConvolutionSinkMin MinSink = {0}; return Convolve1x3Then3x1(Center, MinSink); } #else { TLaneVector2D CO = Center; TLaneVector2D CP; TLaneVector2D CN; AccessNeighborTexels1x3(CO, /* out */ CP, /* out */ CN); TLaneVector2D MinO = min3(CN, CP, CO); TLaneVector2D MinP; TLaneVector2D MinN; AccessNeighborTexels3x1(MinO, /* out */ MinP, /* out */ MinN); return min3(MinN, MinP, MinO); } #endif CALL_SITE_DEBUGLOC template TLaneVector2D Max3x3(TLaneVector2D Center) #if 1 { TConvolutionSinkMax MaxSink = {0}; return Convolve1x3Then3x1(Center, MaxSink); } #else { TLaneVector2D CO = Center; TLaneVector2D CP; TLaneVector2D CN; AccessNeighborTexels1x3(CO, /* out */ CP, /* out */ CN); TLaneVector2D MaxO = max3(CN, CP, CO); TLaneVector2D MaxP; TLaneVector2D MaxN; AccessNeighborTexels3x1(MaxO, /* out */ MaxP, /* out */ MaxN); return max3(MaxN, MaxP, MaxO); } #endif CALL_SITE_DEBUGLOC template TLaneVector2D MaxRGBMinA3x3(TLaneVector2D Center) { TLaneVector2D CO = Center; TLaneVector2D CP; TLaneVector2D CN; AccessNeighborTexels1x3(CO, /* out */ CP, /* out */ CN); TLaneVector2D MaxO = max3(CN, CP, CO); MaxO.SetComponent(3, min3(CN[3], CP[3], CO[3])); TLaneVector2D MaxP; TLaneVector2D MaxN; AccessNeighborTexels3x1(MaxO, /* out */ MaxP, /* out */ MaxN); TLaneVector2D R = max3(MaxN, MaxP, MaxO); R.SetComponent(3, min3(MaxN[3], MaxP[3], MaxO[3])); return R; } CALL_SITE_DEBUGLOC template TLaneVector2D MaxRMinG3x3(TLaneVector2D Center) { TLaneVector2D CO = Center; TLaneVector2D CP; TLaneVector2D CN; AccessNeighborTexels1x3(CO, /* out */ CP, /* out */ CN); TLaneVector2D MaxO = max3(CN, CP, CO); MaxO.SetComponent(1, min3(CN[1], CP[1], CO[1])); TLaneVector2D MaxP; TLaneVector2D MaxN; AccessNeighborTexels3x1(MaxO, /* out */ MaxP, /* out */ MaxN); TLaneVector2D R = max3(MaxN, MaxP, MaxO); R.SetComponent(1, min3(MaxN[1], MaxP[1], MaxO[1])); return R; } CALL_SITE_DEBUGLOC template void MinMax3x3( TLaneVector2D Center, out TLaneVector2D OutMin, out TLaneVector2D OutMax) #if WAVE_COUNT_X == 1 && 1 { TConvolutionSinkMin MinSink = {0}; TConvolutionSinkMax MaxSink = {0}; Write1x3CenterToLDS(Center); OutMin = Convolve1x3Then3x1FromLDS(Center, MinSink); OutMax = Convolve1x3Then3x1FromLDS(Center, MaxSink); } #elif 1 { TLaneVector2D CO = Center; TLaneVector2D CP; TLaneVector2D CN; AccessNeighborTexels1x3(CO, /* out */ CP, /* out */ CN); TLaneVector2D MinO = min3(CN, CP, CO); TLaneVector2D MaxO = max3(CN, CP, CO); TLaneVector2D MinP; TLaneVector2D MinN; AccessNeighborTexels3x1(MinO, /* out */ MinP, /* out */ MinN); OutMin = min3(MinN, MinP, MinO); TLaneVector2D MaxP; TLaneVector2D MaxN; AccessNeighborTexels3x1(MaxO, /* out */ MaxP, /* out */ MaxN); OutMax = max3(MaxN, MaxP, MaxO); } #else { OutMin = Min3x3(Center); OutMax = Max3x3(Center); } #endif CALL_SITE_DEBUGLOC template TLaneVector2D MaxMinusMin3x3( TLaneVector2D Center) { TLaneVector2D BoxMin; TLaneVector2D BoxMax; MinMax3x3(Center, /* out */ BoxMin, /* out */ BoxMax); return BoxMax - BoxMin; } //------------------------------------------------------- CLAMP 3x3 CONVOLUTIONS CALL_SITE_DEBUGLOC template TLaneVector2D Clamp3x3( TLaneVector2D ToClamp, TLaneVector2D BoundaryCenter) { TLaneVector2D BoundaryMin; TLaneVector2D BoundaryMax; MinMax3x3(BoundaryCenter, /* out */ BoundaryMin, /* out */ BoundaryMax); return fastClamp(ToClamp, BoundaryMin, BoundaryMax); } CALL_SITE_DEBUGLOC template TLaneVector2D LerpClamp3x3( TLaneVector2D ToClamp, TLaneVector2D Boundary /*Boundary inside min max of ToClamp)*/, TLaneVector2D LerpFactor) { #if CONFIG_COMPILE_FP16 TLaneVector2D BoundaryMin; TLaneVector2D BoundaryMax; MinMax3x3(Boundary, /* out */ BoundaryMin, /* out */ BoundaryMax); TLaneVector2D ToClampBoundaryMin; TLaneVector2D ToClampBoundaryMax; MinMax3x3(ToClamp, /* out */ ToClampBoundaryMin, /* out */ ToClampBoundaryMax); TLaneVector2D LerpFactorVector = TLaneVector2D::Vectorize(LerpFactor); BoundaryMin = lerp(BoundaryMin, ToClampBoundaryMin, LerpFactorVector); BoundaryMax = lerp(BoundaryMax, ToClampBoundaryMax, LerpFactorVector); #else // register pressure is high TLaneVector2D BoundaryMin; TLaneVector2D BoundaryMax; TLaneVector2D BoundaryLimit; TLaneVector2D ToClampBoundaryLimit; TLaneVector2D LerpFactorVector = TLaneVector2D::Vectorize(LerpFactor); BoundaryLimit = Min3x3(Boundary); ToClampBoundaryLimit = Min3x3(ToClamp); BoundaryMin = lerp(BoundaryLimit, ToClampBoundaryLimit, LerpFactorVector); BoundaryLimit = Max3x3(Boundary); ToClampBoundaryLimit = Max3x3(ToClamp); BoundaryMax = lerp(BoundaryLimit, ToClampBoundaryLimit, LerpFactorVector); #endif return fastClamp(ToClamp, BoundaryMin, BoundaryMax); } CALL_SITE_DEBUGLOC template TLaneVector2D AnnihilateToGuide3x3( TLaneVector2D ToClamp, TLaneVector2D Guide) { return Clamp3x3(ToClamp, Clamp3x3(Guide, ToClamp)); } CALL_SITE_DEBUGLOC template void AnnihilateMutually3x3( TLaneVector2D Input, TLaneVector2D History, out TLaneVector2D AnnihilatedInput, out TLaneVector2D AnnihilatedHistory) { AnnihilatedInput = AnnihilateToGuide3x3(Input , /* Guide = */ History); AnnihilatedHistory = AnnihilateToGuide3x3(History, /* Guide = */ Input); } CALL_SITE_DEBUGLOC template void AnnihilateMutuallySingleChannel3x3( TLaneVector2D Input, TLaneVector2D History, out TLaneVector2D AnnihilatedInput, out TLaneVector2D AnnihilatedHistory) { Deconcatenate(AnnihilateToGuide3x3(/* ToClamp = */ Concatenate(Input, History), /* Guide = */ Concatenate(History, Input)), /* out */ AnnihilatedInput, /* out */ AnnihilatedHistory); } //------------------------------------------------------- MIN & MAX & CLAMP 3x3 PLUS CONVOLUTION CALL_SITE_DEBUGLOC template void MinMaxPlus3x3( TLaneVector2D Center, out TLaneVector2D OutMin, out TLaneVector2D OutMax) { TLaneVector2D CVP, CVN; AccessNeighborTexels1x3(Center, /* out */ CVP, /* out */ CVN); OutMin = min3(CVP, CVN, Center); OutMax = max3(CVP, CVN, Center); TLaneVector2D CHP, CHN; AccessNeighborTexels3x1(Center, /* out */ CHP, /* out */ CHN); OutMin = min3(CHN, CHP, OutMin); OutMax = max3(CHN, CHP, OutMax); } CALL_SITE_DEBUGLOC template TLaneVector2D ClampPlus3x3( TLaneVector2D ToClamp, TLaneVector2D BoundaryCenter) { TLaneVector2D BoundaryMin; TLaneVector2D BoundaryMax; MinMaxPlus3x3(BoundaryCenter, /* out */ BoundaryMin, /* out */ BoundaryMax); return fastClamp(ToClamp, BoundaryMin, BoundaryMax); } //------------------------------------------------------- MEDIAN 3x3 CONVOLUTION // Operator that output, lowest, median and highest values from 3 input values. CALL_SITE_DEBUGLOC template void LMHOperator(FSampleType A, FSampleType B, FSampleType C, out FSampleType L, out FSampleType M, out FSampleType H) #if COMPILER_SUPPORTS_MED3 { L = min3(A, B, C); M = med3(A, B, C); H = max3(A, B, C); } #else { FSampleType X = min(B, C); FSampleType Y = max(B, C); L = min(A, X); FSampleType Z = max(A, X); M = min(Z, Y); H = max(Z, Y); } #endif // 3 samples median. CALL_SITE_DEBUGLOC template FSampleType Median(FSampleType A, FSampleType B, FSampleType C) #if COMPILER_SUPPORTS_MED3 { return med3(A, B, C); } #else { FSampleType L, M, H; LMHOperator(A, B, C, L, M, H); return M; } #endif CALL_SITE_DEBUGLOC template TLaneVector2D Median3x3(TLaneVector2D Center) { TLaneVector2D C0 = Center; TLaneVector2D C1; TLaneVector2D C2; AccessNeighborTexels1x3(C0, /* out */ C1, /* out */ C2); // First layer. TLaneVector2D M0L0; TLaneVector2D M0M0; TLaneVector2D M0H0; LMHOperator(C0, C1, C2, M0L0, M0M0, M0H0); TLaneVector2D M0L1; TLaneVector2D M0L2; AccessNeighborTexels3x1(M0L0, /* out */ M0L1, /* out */ M0L2); TLaneVector2D M0M1; TLaneVector2D M0M2; AccessNeighborTexels3x1(M0M0, /* out */ M0M1, /* out */ M0M2); TLaneVector2D M0H1; TLaneVector2D M0H2; AccessNeighborTexels3x1(M0H0, /* out */ M0H1, /* out */ M0H2); // Second layer. TLaneVector2D M1[3]; M1[0] = max3(M0L0, M0L1, M0L2); M1[1] = Median(M0M0, M0M1, M0M2); M1[2] = min3(M0H0, M0H1, M0H2); // Third layer. return Median(M1[0], M1[1], M1[2]); } CALL_SITE_DEBUGLOC template TLaneVector2D MedianBool3x3(TLaneVector2D Center) { TLaneVector2D Zero = TLaneVector2D::Const(tsr_half(0.0)); TLaneVector2D One = TLaneVector2D::Const(tsr_half(1.0)); TLaneVector2D CenterHalf = select( Center, One, Zero); TLaneVector2D CenterSum = Sum3x3(CenterHalf); return CenterSum > TLaneVector2D::Const(4.0); } //------------------------------------------------------- DOWNSAMPLE 2x2 CONVOLUTIONS CALL_SITE_DEBUGLOC template TLaneVector2D DownsampleMin2x2( TLaneVector2D Input) { TLaneVector2D OutMin; UNROLL for (uint OutputSimdIndex = 0; OutputSimdIndex < ((LaneStrideX * LaneStrideY) / 4); OutputSimdIndex++) { const uint2 OutputPos = GetSimdIndexPixelCoordinateInLane(OutputSimdIndex); vector Input0 = Input.GetElement(GetPixelCoordinateInLaneSimdIndex(tsr_short2(OutputPos * 2 + Offsets2x2[0]))); vector Input1 = Input.GetElement(GetPixelCoordinateInLaneSimdIndex(tsr_short2(OutputPos * 2 + Offsets2x2[1]))); vector Input2 = Input.GetElement(GetPixelCoordinateInLaneSimdIndex(tsr_short2(OutputPos * 2 + Offsets2x2[2]))); vector Input3 = Input.GetElement(GetPixelCoordinateInLaneSimdIndex(tsr_short2(OutputPos * 2 + Offsets2x2[3]))); OutMin.SetElement(OutputSimdIndex, min(min(Input0, Input1), min(Input2, Input3))); } return OutMin; } CALL_SITE_DEBUGLOC template TLaneVector2D DownsampleMax2x2( TLaneVector2D Input) { TLaneVector2D OutMax; UNROLL for (uint OutputSimdIndex = 0; OutputSimdIndex < ((LaneStrideX * LaneStrideY) / 4); OutputSimdIndex++) { const uint2 OutputPos = GetSimdIndexPixelCoordinateInLane(OutputSimdIndex); vector Input0 = Input.GetElement(GetPixelCoordinateInLaneSimdIndex(tsr_short2(OutputPos * 2 + Offsets2x2[0]))); vector Input1 = Input.GetElement(GetPixelCoordinateInLaneSimdIndex(tsr_short2(OutputPos * 2 + Offsets2x2[1]))); vector Input2 = Input.GetElement(GetPixelCoordinateInLaneSimdIndex(tsr_short2(OutputPos * 2 + Offsets2x2[2]))); vector Input3 = Input.GetElement(GetPixelCoordinateInLaneSimdIndex(tsr_short2(OutputPos * 2 + Offsets2x2[3]))); OutMax.SetElement(OutputSimdIndex, max(max(Input0, Input1), max(Input2, Input3))); } return OutMax; } CALL_SITE_DEBUGLOC template TLaneVector2D DownsampleDot2x2( TLaneVector2D Input, const ScalarType Weights[4]) { TLaneVector2D OutMax; UNROLL for (uint OutputSimdIndex = 0; OutputSimdIndex < ((LaneStrideX * LaneStrideY) / 4); OutputSimdIndex++) { const uint2 OutputPos = GetSimdIndexPixelCoordinateInLane(OutputSimdIndex); vector Input0 = Input.GetElement(GetPixelCoordinateInLaneSimdIndex(tsr_short2(OutputPos * 2 + Offsets2x2[0]))); vector Input1 = Input.GetElement(GetPixelCoordinateInLaneSimdIndex(tsr_short2(OutputPos * 2 + Offsets2x2[1]))); vector Input2 = Input.GetElement(GetPixelCoordinateInLaneSimdIndex(tsr_short2(OutputPos * 2 + Offsets2x2[2]))); vector Input3 = Input.GetElement(GetPixelCoordinateInLaneSimdIndex(tsr_short2(OutputPos * 2 + Offsets2x2[3]))); OutMax.SetElement(OutputSimdIndex, Input0 * Weights[0] + Input1 * Weights[1] + Input2 * Weights[2] + Input3 * Weights[3]); } return OutMax; } CALL_SITE_DEBUGLOC template TLaneVector2D DownsampleAvg2x2( TLaneVector2D Input) { const ScalarType Weights[4] = { ScalarType(0.25), ScalarType(0.25), ScalarType(0.25), ScalarType(0.25) }; return DownsampleDot2x2(Input, Weights); }