// Copyright Epic Games, Inc. All Rights Reserved.

#pragma once

#include "../Common.ush"
#include "../Random.ush"
#include "../TextureSampling.ush"
#include "../FastMath.ush"
#include "../MonteCarlo.ush"
#include "../ScreenPass.ush"
#include "/Engine/Public/DualPixelVectorization.ush"
#include "/Engine/Public/WaveBroadcastIntrinsics.ush"


//------------------------------------------------------- RECOMPILE HASH

#pragma message("UESHADERMETADATA_VERSION F3C5FF82-E809-4903-98CD-5DC7E2254C87")


//------------------------------------------------------- COMPILER CONFIG

// Promote vector truncation warnings (3206) to errors.
#pragma warning(error: 3206)


//------------------------------------------------------- CONFIG

// Gates the declaration of the DebugOutput resource in the PARAMETERS section.
#define DEBUG_OUTPUT 0

// Whether the tsr_* aliases below map to 16-bit types. An explicit DIM_16BIT_VALU
// definition takes precedence; otherwise this follows PLATFORM_SUPPORTS_REAL_TYPES.
#if defined(DIM_16BIT_VALU)
	#define CONFIG_COMPILE_FP16 DIM_16BIT_VALU
#elif PLATFORM_SUPPORTS_REAL_TYPES
	#define CONFIG_COMPILE_FP16 1
#else
	#define CONFIG_COMPILE_FP16 0
#endif

#define CONFIG_SCENE_COLOR_OVERFLOW 1

// Follows DIM_ALPHA_CHANNEL when it is defined, and is 0 otherwise.
// Selects the channel count of the tsr_halfC aliases below.
#if defined(DIM_ALPHA_CHANNEL)
	#define CONFIG_SCENE_COLOR_ALPHA DIM_ALPHA_CHANNEL
#else
	#define CONFIG_SCENE_COLOR_ALPHA 0
#endif

// Enabled only when CONFIG_SCENE_COLOR_ALPHA is 0.
// (The "STOCASTIC" spelling is part of the macro name and is kept as is.)
#define CONFIG_ENABLE_STOCASTIC_QUANTIZATION (!CONFIG_SCENE_COLOR_ALPHA)

// DXC is allowed to change the order between multiplies and adds:
// https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#precise-qualifier
// On 16bit, if linear color space values end up close to half(65504.0), the result can go to +inf
// if the compiler transforms a * 0.5 + b * 0.5 into (a + b) * 0.5.
#define CONFIG_FP16_PRECISE_MULTIPLY_ORDER 1


//------------------------------------------------------- CONSTANTS

/* Maximum number of samples. */
#define MAX_SAMPLE_COUNT 8

// tsr_* type aliases: 16-bit scalar/vector/matrix types and bit-cast helpers when
// CONFIG_COMPILE_FP16 is set, their 32-bit counterparts otherwise.
#if CONFIG_COMPILE_FP16
	#define tsr_half  half
	#define tsr_half2 half2
	#define tsr_half3 half3
	#define tsr_half4 half4

	#define tsr_short  int16_t
	#define tsr_short2 int16_t2
	#define tsr_short3 int16_t3
	#define tsr_short4 int16_t4

	#define tsr_ushort  uint16_t
	#define tsr_ushort2 uint16_t2
	#define tsr_ushort3 uint16_t3
	#define tsr_ushort4 uint16_t4

	#define tsr_half2x2 half2x2
	#define tsr_half3x2 half3x2
	#define tsr_half4x2 half4x2

	#define tsr_short2x2 int16_t2x2
	#define tsr_short3x2 int16_t3x2
	#define tsr_short4x2 int16_t4x2

	#define tsr_ushort2x2 uint16_t2x2
	#define tsr_ushort3x2 uint16_t3x2
	#define tsr_ushort4x2 uint16_t4x2

	#define as_tsr_short(x)  asint16(x)
	#define as_tsr_ushort(x) asuint16(x)
	#define as_tsr_half(x)   asfloat16(x)
#else
	#define tsr_half  float
	#define tsr_half2 float2
	#define tsr_half3 float3
	#define tsr_half4 float4

	#define tsr_short  int
	#define tsr_short2 int2
	#define tsr_short3 int3
	#define tsr_short4 int4

	#define tsr_ushort  uint
	#define tsr_ushort2 uint2
	#define tsr_ushort3 uint3
	#define tsr_ushort4 uint4

	#define tsr_half2x2 float2x2
	#define tsr_half3x2 float3x2
	#define tsr_half4x2 float4x2

	#define tsr_short2x2 int2x2
	#define tsr_short3x2 int3x2
	#define tsr_short4x2 int4x2

	#define tsr_ushort2x2 uint2x2
	#define tsr_ushort3x2 uint3x2
	#define tsr_ushort4x2 uint4x2

	#define as_tsr_short(x)  asint(x)
	#define as_tsr_ushort(x) asuint(x)
	#define as_tsr_half(x)   asfloat(x)
#endif

// Channel-count dependent aliases: 4 channels when CONFIG_SCENE_COLOR_ALPHA is set, 3 otherwise.
#if CONFIG_SCENE_COLOR_ALPHA
	#define tsr_halfC   tsr_half4
	#define tsr_halfCx2 tsr_half4x2
	#define CONFIG_CHANNEL_COUNT 4
#else
	#define tsr_halfC   tsr_half3
	#define tsr_halfCx2 tsr_half3x2
	#define CONFIG_CHANNEL_COUNT 3
#endif

// Largest encodable normal number in a half used on console.
static const tsr_half LargestNormalNumber = tsr_half(65504.0);


//------------------------------------------------------- PARAMETERS

// Global shader parameters describing the input buffer.
float2 InputInfo_Extent;
float2 InputInfo_ExtentInverse;
float2 InputInfo_ScreenPosToViewportScale;
float2 InputInfo_ScreenPosToViewportBias;
uint2  InputInfo_ViewportMin;
uint2  InputInfo_ViewportMax;
float2 InputInfo_ViewportSize;
float2 InputInfo_ViewportSizeInverse;
float2 InputInfo_UVViewportMin;
float2 InputInfo_UVViewportMax;
float2 InputInfo_UVViewportSize;
float2 InputInfo_UVViewportSizeInverse;
float2 InputInfo_UVViewportBilinearMin;
float2 InputInfo_UVViewportBilinearMax;
float2 InputJitter;
int2   InputPixelPosMin;
int2   InputPixelPosMax;
FScreenTransform InputPixelPosToScreenPos;
float2 ScreenVelocityToInputPixelVelocity;
float2 InputPixelVelocityToScreenVelocity;

// Global shader parameters describing the history buffer.
float2 HistoryInfo_Extent;
float2 HistoryInfo_ExtentInverse;
uint2  HistoryInfo_ViewportMin;
uint2  HistoryInfo_ViewportMax;
float2 HistoryInfo_ViewportSize;
float2 HistoryInfo_ViewportSizeInverse;
float2 HistoryInfo_UVViewportBilinearMin;
float2 HistoryInfo_UVViewportBilinearMax;

// FTSRPrevHistoryParameters
float2 PrevHistoryInfo_Extent;
float2 PrevHistoryInfo_ExtentInverse;
float2 PrevHistoryInfo_ScreenPosToViewportScale;
float2 PrevHistoryInfo_ScreenPosToViewportBias;
uint2  PrevHistoryInfo_ViewportMin;
uint2  PrevHistoryInfo_ViewportMax;
float2 PrevHistoryInfo_ViewportSize;
float2 PrevHistoryInfo_ViewportSizeInverse;
float2 PrevHistoryInfo_UVViewportMin;
float2 PrevHistoryInfo_UVViewportMax;
float2 PrevHistoryInfo_UVViewportSize;
float2 PrevHistoryInfo_UVViewportSizeInverse;
float2 PrevHistoryInfo_UVViewportBilinearMin;
float2 PrevHistoryInfo_UVViewportBilinearMax;
FScreenTransform ScreenPosToPrevHistoryBufferUV;
float HistoryPreExposureCorrection;
float ResurrectionPreExposureCorrection;

// Non-zero on a camera cut (tested with != 0 by IsOffScreenOrDisoccluded()).
uint bCameraCut;

#if DEBUG_OUTPUT
// NOTE(review): no element type is given for this resource here; confirm the
// intended declaration (a template argument may have been lost).
RWTexture2DArray DebugOutput;
#endif


//------------------------------------------------------- UTILITY

/** Returns the sign bit (0 or 1) of x, i.e. bit 31 of its 32-bit pattern. */
uint sign_bit(float x)
{
	return asuint(x) >> 31u;
}

/** Per-component sign bit of a float2. */
uint2 sign_bit(float2 x)
{
	return uint2(sign_bit(x.x), sign_bit(x.y));
}

#if PLATFORM_SUPPORTS_REAL_TYPES

/** Returns the sign bit (0 or 1) of x, i.e. bit 15 of its 16-bit pattern. */
uint16_t sign_bit(half x)
{
	return asuint16(x) >> uint16_t(15);
}

/** Per-component sign bit of a half2. */
uint16_t2 sign_bit(half2 x)
{
	return asuint16(x) >> uint16_t(15);
}

#endif


//------------------------------------------------------- FUNCTIONS

/** Applies the scale (AToB.xy) then the bias (AToB.zw) of a screen transform to a dual-pixel position. */
CALL_SITE_DEBUGLOC
float2x2 ApplyScreenTransform(float2x2 PInA, FScreenTransform AToB)
{
	return dpv_add(dpv_mul(PInA, AToB.xy), AToB.zw);
}

#if PLATFORM_SUPPORTS_REAL_TYPES

/** Same as the float2x2 overload, with the scale and bias converted to half2 first. */
CALL_SITE_DEBUGLOC
half2x2 ApplyScreenTransform(half2x2 PInA, FScreenTransform AToB)
{
	return dpv_add(dpv_mul(PInA, half2(AToB.xy)), half2(AToB.zw));
}

#endif

/** Compute the group wave index into an SGPR, to then recompute the GroupThreadIndex later.
 *
 * Returns 0 when a single wave covers the whole group (lane count >= GroupSize), or when
 * COMPILER_SUPPORTS_WAVE_ONCE is not set. Otherwise returns GroupThreadIndex / lane count,
 * read from the first active lane of the wave.
 */
CALL_SITE_DEBUGLOC
uint GetGroupWaveIndex(uint GroupThreadIndex, uint GroupSize)
#if COMPILER_SUPPORTS_WAVE_ONCE
{
	uint LaneCountPerWave = WaveGetLaneCount();

	// One wave holds the entire group: there is only wave 0.
	if (LaneCountPerWave >= GroupSize)
	{
		return 0;
	}

	return WaveReadLaneFirst(GroupThreadIndex / LaneCountPerWave);
}
#else
{
	return 0;
}
#endif

/** Force compute the group GroupThreadIndex through lane index and wave index if possible to reduce VGPR pressure.
 *
 * With COMPILER_SUPPORTS_WAVE_ONCE the result is lane count * GroupWaveIndex + lane index;
 * otherwise the GroupThreadIndex passed in is returned unchanged.
 * NOTE(review): presumably assumes lanes of a wave map to consecutive group thread indices; confirm.
 */
CALL_SITE_DEBUGLOC
uint GetGroupThreadIndex(uint GroupThreadIndex, uint GroupWaveIndex)
#if COMPILER_SUPPORTS_WAVE_ONCE
{
	// shares GroupWaveOffset to save SALU
	uint GroupWaveOffset = WaveGetLaneCount() * GroupWaveIndex;

	// Do not share
	uint ComputedGroupThreadIndex;
	ISOLATE
	{
		ComputedGroupThreadIndex = GroupWaveOffset + WaveGetLaneIndex();
	}

	return ComputedGroupThreadIndex;
}
#else
{
	return GroupThreadIndex;
}
#endif

/** Invalidates the output position of pixels that lie outside of the viewport (dual-pixel version).
 *
 * Branchless: (ViewportMax - 1) - PixelPos is negative in a row when the position exceeds
 * ViewportMax - 1 there. The sign bit of the OR of both rows is turned into an all-ones mask
 * that is OR-ed into PixelPos[0], so PixelPos[0] becomes ~0 for an out-of-viewport pixel.
 * The disabled #else path is the straightforward comparison-based form.
 * NOTE(review): presumably row 0 holds the x coordinates of both pixels; confirm against DualPixelVectorization.ush.
 */
CALL_SITE_DEBUGLOC
tsr_short2x2 InvalidateOutputPixelPos(tsr_short2x2 PixelPos, uint2 ViewportMax)
#if 1
{
	tsr_short2x2 Subtract = dpv_sub(tsr_short2(ViewportMax - 1), PixelPos);
	// Sign bit is set when either row of Subtract is negative.
	tsr_ushort2 Override = tsr_ushort2(Subtract[0] | Subtract[1]);

	// Isolate the sign bit (0 or 1), then negate it to get 0 or an all-ones mask.
	#if CONFIG_COMPILE_FP16
		PixelPos[0] |= -tsr_short2((Override & uint16_t(0x8000)) >> 15);
	#else
		PixelPos[0] |= -tsr_short2((Override & uint(0x80000000)) >> 31);
	#endif

	return PixelPos;
}
#else
{
	bool bIsValidPixel = all(PixelPos < ViewportMax);
	PixelPos.x = bIsValidPixel ? PixelPos.x : ~tsr_short(0);
	return PixelPos;
}
#endif

/** Invalidates the output position of a pixel that lies outside of the viewport (single-pixel version).
 *
 * Branchless: (ViewportMax - 1) - PixelPos is negative in a component when the position exceeds
 * ViewportMax - 1 there. The sign bit of x | y is turned into an all-ones mask that is OR-ed
 * into PixelPos.x, so PixelPos.x becomes ~0 for an out-of-viewport pixel.
 * The disabled #else path is the straightforward comparison-based form.
 */
CALL_SITE_DEBUGLOC
tsr_short2 InvalidateOutputPixelPos(tsr_short2 PixelPos, uint2 ViewportMax)
#if 1
{
	tsr_short2 Subtract = tsr_short2(ViewportMax - 1) - PixelPos;
	// Sign bit is set when either component of Subtract is negative.
	tsr_ushort Override = tsr_ushort(Subtract.x | Subtract.y);

	// Isolate the sign bit (0 or 1), then negate it to get 0 or an all-ones mask.
	#if CONFIG_COMPILE_FP16
		PixelPos.x |= -tsr_short((Override & uint16_t(0x8000)) >> 15);
	#else
		PixelPos.x |= -tsr_short((Override & uint(0x80000000)) >> 31);
	#endif

	return PixelPos;
}
#else
{
	bool bIsValidPixel = all(PixelPos < ViewportMax);
	PixelPos.x = bIsValidPixel ? PixelPos.x : ~tsr_short(0);
	return PixelPos;
}
#endif

/** Maps a group thread index to a 2D coordinate by de-interleaving its bits.
 *
 * Bit layout of GroupThreadIndex: bit 0 -> x bit 0, bit 1 -> y bit 0,
 * bits 2..3 -> x bits 1..2, bits 4..5 -> y bits 1..2.
 * Four consecutive thread indices therefore cover a 2x2 block of coordinates.
 */
CALL_SITE_DEBUGLOC
tsr_ushort2 Map8x8Tile2x2Lane(uint GroupThreadIndex)
{
	tsr_ushort2 T = tsr_ushort(GroupThreadIndex).xx;

	// Low bit of each coordinate, OR-ed with the two upper bits shifted into position.
	tsr_ushort2 GroupId = (T >> tsr_ushort2(0, 1) & tsr_ushort2(0x01, 0x01)) | ((T >> tsr_ushort2(2 - 1, 4 - 1)) & tsr_ushort2(0x03 << 1, 0x03 << 1));

	return GroupId;
}

/** Maps a group thread index to a 2D coordinate by de-interleaving its bits.
 *
 * Bit layout of GroupThreadIndex: bit 0 -> x bit 0, bit 1 -> y bit 0,
 * bits 2..4 -> x bits 1..3, bits 5..7 -> y bits 1..3.
 * Four consecutive thread indices therefore cover a 2x2 block of coordinates.
 */
CALL_SITE_DEBUGLOC
tsr_ushort2 Map16x16Tile2x2Lane(uint GroupThreadIndex)
{
	tsr_ushort T = tsr_ushort(GroupThreadIndex);

	tsr_ushort2 GroupId;
	GroupId.x = ((T >> tsr_ushort(0)) & tsr_ushort(0x01)) | ((T >> tsr_ushort(2 - 1)) & tsr_ushort(0x07 << 1));
	GroupId.y = ((T >> tsr_ushort(1)) & tsr_ushort(0x01)) | ((T >> tsr_ushort(5 - 1)) & tsr_ushort(0x07 << 1));

	return GroupId;
}

/** Reciprocal that returns 0.0 instead of dividing when x is not strictly positive. */
CALL_SITE_DEBUGLOC
float SafeRcp(float x)
{
	return x > 0.0 ? rcp(x) : 0.0;
}

/** Per-component SafeRcp() of a float2. */
CALL_SITE_DEBUGLOC
float2 SafeRcp(float2 x)
{
	return float2(SafeRcp(x.x), SafeRcp(x.y));
}

#if CONFIG_COMPILE_FP16

/** Half precision reciprocal, clamped to MaxHalfFloat, that returns 0.0 when x is not strictly positive.
 *
 * The path selected for the AMD RDNA 2+ architecture range is branchless: 0x7C00 is the
 * bit pattern of +infinity in half precision, so saturate(x * +inf) is 1 for x > 0.
 * It relies on saturate() producing 0 for x < 0 (-inf) and for x == 0 (NaN); TODO confirm on target.
 */
CALL_SITE_DEBUGLOC
half SafeRcp(half x)
#if PLATFORM_GPU_ARCH >= PLATFORM_GPU_ARCH_AMD_RDNA_2 && PLATFORM_GPU_ARCH <= PLATFORM_GPU_ARCH_AMD_LATTEST
{
	// If x=0.0, then MaxHalfFloat * 0.0 = 0.0
	return min(rcp(x), half(MaxHalfFloat)) * saturate(x * asfloat16(uint16_t(0x7C00)));
}
#else
{
	return select(x > half(0.0), min(rcp(x), half(MaxHalfFloat)), half(0.0));
}
#endif

/** Per-component half precision SafeRcp(); see the scalar half overload for the two code paths. */
CALL_SITE_DEBUGLOC
half2 SafeRcp(half2 x)
#if PLATFORM_GPU_ARCH >= PLATFORM_GPU_ARCH_AMD_RDNA_2 && PLATFORM_GPU_ARCH <= PLATFORM_GPU_ARCH_AMD_LATTEST
{
	// If x=0.0, then MaxHalfFloat * 0.0 = 0.0
	return min(rcp(x), half(MaxHalfFloat)) * saturate(x * asfloat16(uint16_t(0x7C00)));
}
#else
{
	return select(x > half(0.0), min(rcp(x), half(MaxHalfFloat)), half(0.0));
}
#endif

#endif

/** Computes the normalized factors of a weighted lerp between A and B, for two lanes at once.
 *
 * Row 0 of the result is (1 - Blend) * WeightA and row 1 is Blend * WeightB, both divided by
 * their sum. When the sum is not strictly positive SafeRcp() returns 0, so both factors are 0.
 */
CALL_SITE_DEBUGLOC
tsr_half2x2 WeightedLerpFactors(tsr_half2 WeightA, tsr_half2 WeightB, tsr_half2 Blend)
{
	tsr_half2 BlendA = (tsr_half(1.0) - Blend) * WeightA;
	tsr_half2 BlendB = Blend * WeightB;
	tsr_half2 RcpBlend = SafeRcp(BlendA + BlendB);
	BlendA *= RcpBlend;
	BlendB *= RcpBlend;
	return tsr_half2x2(BlendA, BlendB);
}
/** Scalar overload of WeightedLerpFactors().
 *
 * Forwards to the tsr_half2 overload through dpv_interleave_mono_registers() and keeps the
 * dpv_lo() part of the result: x is the factor for A, y the factor for B.
 * NOTE(review): presumably dpv_interleave_mono_registers() replicates the scalar across both
 * lanes; confirm against DualPixelVectorization.ush.
 */
CALL_SITE_DEBUGLOC
tsr_half2 WeightedLerpFactors(tsr_half WeightA, tsr_half WeightB, tsr_half Blend)
{
	return dpv_lo(WeightedLerpFactors(
		dpv_interleave_mono_registers(WeightA),
		dpv_interleave_mono_registers(WeightB),
		dpv_interleave_mono_registers(Blend)));
}

/** Selects exactly one offset out of each symmetric pair { Offset, -Offset }.
 *
 * Returns true for offsets in the upper half plane (y > 0) and on the positive x axis
 * (y == 0, x > 0); returns false for the opposite offsets and for the zero offset.
 */
CALL_SITE_DEBUGLOC
bool TakeOnlyOneSamplePair(float2 Offset)
{
	return Offset.y > 0.0 || (Offset.x > 0.0 && Offset.y == 0.0);
}

/** Returns saturate(SampleHistoryValidity * MAX_SAMPLE_COUNT - 0.2). */
tsr_half ComputePredictionCompleteness(tsr_half SampleHistoryValidity)
{
	return saturate(SampleHistoryValidity * tsr_half(MAX_SAMPLE_COUNT) - tsr_half(0.2));
}

/** Classifies two pixels at once as off screen and/or disoccluded.
 *
 * @param bCameraCut           Non-zero marks both pixels as off screen.
 * @param ScreenPos            Dual-pixel screen position; a pixel is out of bounds when the
 *                             largest absolute value across the two rows is >= 1.0.
 * @param bIsParallaxRejected  Per-pixel parallax rejection flags.
 * @param bIsOffScreen         Out: camera cut or out of bounds.
 * @param bIsDisoccluded       Out: parallax rejected and not off screen.
 */
CALL_SITE_DEBUGLOC
void IsOffScreenOrDisoccluded(
	uint bCameraCut,
	float2x2 ScreenPos,
	bool2 bIsParallaxRejected,
	out bool2 bIsOffScreen,
	out bool2 bIsDisoccluded)
{
	bool bIsCameraCut = bCameraCut != 0;
	bool2 bIsOutOfBounds = max(abs(ScreenPos[0]), abs(ScreenPos[1])) >= 1.0;

	bIsOffScreen = or(bIsCameraCut, bIsOutOfBounds);
	// Off screen takes precedence: a pixel is never reported as both.
	bIsDisoccluded = and(!bIsOffScreen, bIsParallaxRejected);
}

/** Returns (1 - BlendFactor) / BlendFactor per component.
 *
 * The literal is wrapped in tsr_half() like in the other tsr_half helpers of this file, so
 * the expression stays in tsr_half precision instead of depending on how the compiler types
 * a bare 1.0 literal.
 * NOTE(review): a BlendFactor component equal to 0 divides by zero; callers presumably
 * guarantee BlendFactor > 0. Confirm.
 */
tsr_half2 ComputePrevWeightMultiplier(tsr_half2 BlendFactor)
{
	return (tsr_half(1.0) - BlendFactor) / BlendFactor;
}

/** Returns 1 - RejectionFactor per component. */
tsr_half2 RejectionFactorToBlendFactor(tsr_half2 RejectionFactor)
{
	return tsr_half(1.0) - RejectionFactor;
}