447 lines
11 KiB
HLSL
447 lines
11 KiB
HLSL
// Copyright Epic Games, Inc. All Rights Reserved.
|
|
|
|
#pragma once
|
|
|
|
#include "../Common.ush"
|
|
#include "../Random.ush"
|
|
#include "../TextureSampling.ush"
|
|
#include "../FastMath.ush"
|
|
#include "../MonteCarlo.ush"
|
|
#include "../ScreenPass.ush"
|
|
#include "/Engine/Public/DualPixelVectorization.ush"
|
|
#include "/Engine/Public/WaveBroadcastIntrinsics.ush"
|
|
|
|
|
|
//------------------------------------------------------- RECOMPILE HASH
|
|
|
|
#pragma message("UESHADERMETADATA_VERSION F3C5FF82-E809-4903-98CD-5DC7E2254C87")
|
|
|
|
|
|
//------------------------------------------------------- COMPILER CONFIG
|
|
|
|
// Promote vector truncation warnings (3206) to errors.
|
|
#pragma warning(error: 3206)
|
|
|
|
|
|
//------------------------------------------------------- CONFIG
|
|
|
|
// When non-zero, the DebugOutput UAV is declared in the parameters section of this file.
#define DEBUG_OUTPUT 0

// 16-bit compilation: DIM_16BIT_VALU decides when it is defined, otherwise PLATFORM_SUPPORTS_REAL_TYPES does.
#ifdef DIM_16BIT_VALU
	#define CONFIG_COMPILE_FP16 DIM_16BIT_VALU
#elif PLATFORM_SUPPORTS_REAL_TYPES
	#define CONFIG_COMPILE_FP16 1
#else
	#define CONFIG_COMPILE_FP16 0
#endif

#define CONFIG_SCENE_COLOR_OVERFLOW 1

// Alpha channel support follows DIM_ALPHA_CHANNEL when it is defined, and is disabled otherwise.
#ifdef DIM_ALPHA_CHANNEL
	#define CONFIG_SCENE_COLOR_ALPHA DIM_ALPHA_CHANNEL
#else
	#define CONFIG_SCENE_COLOR_ALPHA 0
#endif

// Stochastic quantization is only enabled when the alpha channel is not.
#define CONFIG_ENABLE_STOCASTIC_QUANTIZATION (!CONFIG_SCENE_COLOR_ALPHA)

// DXC is allowed to change the order of multiplies and adds, see
// https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#precise-qualifier
// In 16 bit, when linear color values end up close to half(65504.0), the result can go to +inf
// if the compiler transforms a * 0.5 + b * 0.5 into (a + b) * 0.5.
#define CONFIG_FP16_PRECISE_MULTIPLY_ORDER 1
|
|
|
|
|
|
//------------------------------------------------------- CONSTANTS
|
|
|
|
/* Maximum number of samples. */
#define MAX_SAMPLE_COUNT 8

// Arithmetic type aliases: 16-bit types when CONFIG_COMPILE_FP16 is set, their 32-bit counterparts otherwise.
#if CONFIG_COMPILE_FP16
	// Floating point scalar and vectors.
	#define tsr_half  half
	#define tsr_half2 half2
	#define tsr_half3 half3
	#define tsr_half4 half4

	// Signed integer scalar and vectors.
	#define tsr_short  int16_t
	#define tsr_short2 int16_t2
	#define tsr_short3 int16_t3
	#define tsr_short4 int16_t4

	// Unsigned integer scalar and vectors.
	#define tsr_ushort  uint16_t
	#define tsr_ushort2 uint16_t2
	#define tsr_ushort3 uint16_t3
	#define tsr_ushort4 uint16_t4

	// Nx2 matrices.
	#define tsr_half2x2 half2x2
	#define tsr_half3x2 half3x2
	#define tsr_half4x2 half4x2

	#define tsr_short2x2 int16_t2x2
	#define tsr_short3x2 int16_t3x2
	#define tsr_short4x2 int16_t4x2

	#define tsr_ushort2x2 uint16_t2x2
	#define tsr_ushort3x2 uint16_t3x2
	#define tsr_ushort4x2 uint16_t4x2

	// Bit casts to the aliased types.
	#define as_tsr_short(x)  asint16(x)
	#define as_tsr_ushort(x) asuint16(x)
	#define as_tsr_half(x)   asfloat16(x)
#else
	// Floating point scalar and vectors.
	#define tsr_half  float
	#define tsr_half2 float2
	#define tsr_half3 float3
	#define tsr_half4 float4

	// Signed integer scalar and vectors.
	#define tsr_short  int
	#define tsr_short2 int2
	#define tsr_short3 int3
	#define tsr_short4 int4

	// Unsigned integer scalar and vectors.
	#define tsr_ushort  uint
	#define tsr_ushort2 uint2
	#define tsr_ushort3 uint3
	#define tsr_ushort4 uint4

	// Nx2 matrices.
	#define tsr_half2x2 float2x2
	#define tsr_half3x2 float3x2
	#define tsr_half4x2 float4x2

	#define tsr_short2x2 int2x2
	#define tsr_short3x2 int3x2
	#define tsr_short4x2 int4x2

	#define tsr_ushort2x2 uint2x2
	#define tsr_ushort3x2 uint3x2
	#define tsr_ushort4x2 uint4x2

	// Bit casts to the aliased types.
	#define as_tsr_short(x)  asint(x)
	#define as_tsr_ushort(x) asuint(x)
	#define as_tsr_half(x)   asfloat(x)
#endif
|
|
|
|
// Scene color type aliases: 4 channels when CONFIG_SCENE_COLOR_ALPHA is set, 3 channels otherwise.
#if CONFIG_SCENE_COLOR_ALPHA
	#define CONFIG_CHANNEL_COUNT 4
	#define tsr_halfC   tsr_half4
	#define tsr_halfCx2 tsr_half4x2
#else
	#define CONFIG_CHANNEL_COUNT 3
	#define tsr_halfC   tsr_half3
	#define tsr_halfCx2 tsr_half3x2
#endif

// Largest normal number encodable in a half, as used on console.
static const tsr_half LargestNormalNumber = tsr_half(65504.0);
|
|
|
|
|
|
//------------------------------------------------------- PARAMETERS
|
|
|
|
|
|
// InputInfo_* parameters.
float2 InputInfo_Extent;
float2 InputInfo_ExtentInverse;
float2 InputInfo_ScreenPosToViewportScale;
float2 InputInfo_ScreenPosToViewportBias;
uint2  InputInfo_ViewportMin;
uint2  InputInfo_ViewportMax;
float2 InputInfo_ViewportSize;
float2 InputInfo_ViewportSizeInverse;
float2 InputInfo_UVViewportMin;
float2 InputInfo_UVViewportMax;
float2 InputInfo_UVViewportSize;
float2 InputInfo_UVViewportSizeInverse;
float2 InputInfo_UVViewportBilinearMin;
float2 InputInfo_UVViewportBilinearMax;

// Input jitter, input pixel position bounds, and input pixel <-> screen conversions.
float2           InputJitter;
int2             InputPixelPosMin;
int2             InputPixelPosMax;
FScreenTransform InputPixelPosToScreenPos;
float2           ScreenVelocityToInputPixelVelocity;
float2           InputPixelVelocityToScreenVelocity;
|
|
|
|
// HistoryInfo_* parameters.
float2 HistoryInfo_Extent;
float2 HistoryInfo_ExtentInverse;
uint2  HistoryInfo_ViewportMin;
uint2  HistoryInfo_ViewportMax;
float2 HistoryInfo_ViewportSize;
float2 HistoryInfo_ViewportSizeInverse;
float2 HistoryInfo_UVViewportBilinearMin;
float2 HistoryInfo_UVViewportBilinearMax;
|
|
|
|
// FTSRPrevHistoryParameters
float2 PrevHistoryInfo_Extent;
float2 PrevHistoryInfo_ExtentInverse;
float2 PrevHistoryInfo_ScreenPosToViewportScale;
float2 PrevHistoryInfo_ScreenPosToViewportBias;
uint2  PrevHistoryInfo_ViewportMin;
uint2  PrevHistoryInfo_ViewportMax;
float2 PrevHistoryInfo_ViewportSize;
float2 PrevHistoryInfo_ViewportSizeInverse;
float2 PrevHistoryInfo_UVViewportMin;
float2 PrevHistoryInfo_UVViewportMax;
float2 PrevHistoryInfo_UVViewportSize;
float2 PrevHistoryInfo_UVViewportSizeInverse;
float2 PrevHistoryInfo_UVViewportBilinearMin;
float2 PrevHistoryInfo_UVViewportBilinearMax;

FScreenTransform ScreenPosToPrevHistoryBufferUV;
float            HistoryPreExposureCorrection;
float            ResurrectionPreExposureCorrection;
|
|
|
|
// Non-zero on a camera cut (compared against 0 by IsOffScreenOrDisoccluded()).
uint bCameraCut;

// Debug UAV, only declared when DEBUG_OUTPUT is enabled.
#if DEBUG_OUTPUT
	RWTexture2DArray<float4> DebugOutput;
#endif
|
|
|
|
|
|
//------------------------------------------------------- UTILITY
|
|
|
|
/** Returns the sign bit of a 32-bit float: 1 when the sign bit is set, 0 otherwise. */
uint sign_bit(float x)
{
	uint Bits = asuint(x);
	return Bits >> 31u;
}
|
|
|
|
/** Returns, per component, the sign bit of a 32-bit float: 1 when the sign bit is set, 0 otherwise. */
uint2 sign_bit(float2 x)
{
	uint2 Bits = asuint(x);
	return Bits >> 31u;
}
|
|
|
|
#if PLATFORM_SUPPORTS_REAL_TYPES
|
|
|
|
/** Returns the sign bit of a 16-bit half: 1 when the sign bit is set, 0 otherwise. */
uint16_t sign_bit(half x)
{
	uint16_t Bits = asuint16(x);
	return Bits >> uint16_t(15);
}
|
|
|
|
/** Returns, per component, the sign bit of a 16-bit half: 1 when the sign bit is set, 0 otherwise. */
uint16_t2 sign_bit(half2 x)
{
	uint16_t2 Bits = asuint16(x);
	return Bits >> uint16_t(15);
}
|
|
|
|
#endif
|
|
|
|
|
|
//------------------------------------------------------- FUNCTIONS
|
|
|
|
/** Applies a screen transform to a dual-pixel position: multiplies by AToB.xy, then adds AToB.zw. */
CALL_SITE_DEBUGLOC
float2x2 ApplyScreenTransform(float2x2 PInA, FScreenTransform AToB)
{
	float2x2 Scaled = dpv_mul(PInA, AToB.xy);
	float2x2 PInB = dpv_add(Scaled, AToB.zw);
	return PInB;
}
|
|
|
|
#if PLATFORM_SUPPORTS_REAL_TYPES
|
|
|
|
/** 16-bit overload: multiplies the dual-pixel position by AToB.xy, then adds AToB.zw, both converted to half2. */
CALL_SITE_DEBUGLOC
half2x2 ApplyScreenTransform(half2x2 PInA, FScreenTransform AToB)
{
	half2 Scale = half2(AToB.xy);
	half2 Bias = half2(AToB.zw);
	return dpv_add(dpv_mul(PInA, Scale), Bias);
}
|
|
|
|
#endif
|
|
|
|
|
|
/** Compute the group wave index into SGPR, to then recompute the GroupThreadIndex later. */
CALL_SITE_DEBUGLOC
uint GetGroupWaveIndex(uint GroupThreadIndex, uint GroupSize)
{
#if COMPILER_SUPPORTS_WAVE_ONCE
	uint WaveLaneCount = WaveGetLaneCount();

	// One wave is large enough to cover the whole group: the wave index can only be 0.
	if (WaveLaneCount >= GroupSize)
	{
		return 0;
	}

	// Take the value from the first active lane so the result is the same for the whole wave.
	uint WaveIndex = WaveReadLaneFirst(GroupThreadIndex / WaveLaneCount);
	return WaveIndex;
#else
	// No wave intrinsics available: always report wave 0.
	return 0;
#endif
}
|
|
|
|
/** Force computing the GroupThreadIndex from the lane index and the wave index when possible, to reduce VGPR pressure. */
CALL_SITE_DEBUGLOC
uint GetGroupThreadIndex(uint GroupThreadIndex, uint GroupWaveIndex)
{
#if COMPILER_SUPPORTS_WAVE_ONCE
	// Shares FirstThreadIndexOfWave to save SALU.
	uint FirstThreadIndexOfWave = GroupWaveIndex * WaveGetLaneCount();

	// Do not share.
	uint RecomputedIndex;
	ISOLATE
	{
		RecomputedIndex = FirstThreadIndexOfWave + WaveGetLaneIndex();
	}

	return RecomputedIndex;
#else
	// No wave intrinsics available: pass the provided index through.
	return GroupThreadIndex;
#endif
}
|
|
|
|
/**
 * Flags the pixels of a dual-pixel position that are outside of the viewport by setting all the bits of their X coordinate.
 *
 * As in the tsr_short2 overload (which works on .x / .y), PixelPos[0] is treated as the X row and PixelPos[1] as the Y row.
 * A pixel is flagged when (ViewportMax - 1) - PixelPos is negative for either of its coordinates.
 *
 * @param PixelPos    Dual-pixel position to test.
 * @param ViewportMax Exclusive upper bound of the valid pixel positions.
 * @return PixelPos, with PixelPos[0] overridden to ~0 for every flagged pixel.
 */
CALL_SITE_DEBUGLOC
tsr_short2x2 InvalidateOutputPixelPos(tsr_short2x2 PixelPos, uint2 ViewportMax)
#if 1
{
	// The difference is negative, and therefore has its sign bit set, when the coordinate is past ViewportMax - 1.
	tsr_short2x2 Subtract = dpv_sub(tsr_short2(ViewportMax - 1), PixelPos);
	// OR both rows together so each component carries the combined sign bit of one pixel.
	tsr_ushort2 Override = tsr_ushort2(Subtract[0] | Subtract[1]);

	// Move the sign bit to bit 0 and negate: 0 stays 0, 1 becomes all ones, which is then ORed into X.
	#if CONFIG_COMPILE_FP16
		PixelPos[0] |= -tsr_short2((Override & uint16_t(0x8000)) >> 15);
	#else
		PixelPos[0] |= -tsr_short2((Override & uint(0x80000000)) >> 31);
	#endif

	return PixelPos;
}
#else
{
	// Reference path, evaluated per pixel instead of collapsing both pixels into a single flag.
	bool2 bIsValidPixel = and(PixelPos[0] < tsr_short(ViewportMax.x), PixelPos[1] < tsr_short(ViewportMax.y));
	PixelPos[0] = select(bIsValidPixel, PixelPos[0], (~tsr_short(0)).xx);
	return PixelPos;
}
#endif
|
|
|
|
/**
 * Flags a pixel position that is outside of the viewport by setting all the bits of its X coordinate.
 *
 * The position is flagged when (ViewportMax - 1) - PixelPos is negative for either coordinate.
 */
CALL_SITE_DEBUGLOC
tsr_short2 InvalidateOutputPixelPos(tsr_short2 PixelPos, uint2 ViewportMax)
{
#if 1
	// A negative difference has its sign bit set; OR X and Y so either one can flag the pixel.
	tsr_short2 Remaining = tsr_short2(ViewportMax - 1) - PixelPos;
	tsr_ushort SignCarrier = tsr_ushort(Remaining.x | Remaining.y);

	// Move the sign bit to bit 0 and negate: 0 stays 0, 1 becomes all ones, which is then ORed into X.
	#if CONFIG_COMPILE_FP16
		PixelPos.x |= -tsr_short((SignCarrier & uint16_t(0x8000)) >> 15);
	#else
		PixelPos.x |= -tsr_short((SignCarrier & uint(0x80000000)) >> 31);
	#endif

	return PixelPos;
#else
	bool bIsValidPixel = all(PixelPos < ViewportMax);
	PixelPos.x = bIsValidPixel ? PixelPos.x : ~tsr_short(0);
	return PixelPos;
#endif
}
|
|
|
|
/**
 * Maps a group thread index to a 2D coordinate by de-interleaving its bits:
 *   x = { bit 0, bit 2, bit 3 } and y = { bit 1, bit 4, bit 5 }, lowest bit first.
 */
CALL_SITE_DEBUGLOC
tsr_ushort2 Map8x8Tile2x2Lane(uint GroupThreadIndex)
{
	tsr_ushort T = tsr_ushort(GroupThreadIndex);

	tsr_ushort2 GroupId;
	GroupId.x = ((T >> tsr_ushort(0)) & tsr_ushort(0x01)) | ((T >> tsr_ushort(2 - 1)) & tsr_ushort(0x03 << 1));
	GroupId.y = ((T >> tsr_ushort(1)) & tsr_ushort(0x01)) | ((T >> tsr_ushort(4 - 1)) & tsr_ushort(0x03 << 1));

	return GroupId;
}
|
|
|
|
/**
 * Maps a group thread index to a 2D coordinate by de-interleaving its bits:
 *   x = { bit 0, bit 2, bit 3, bit 4 } and y = { bit 1, bit 5, bit 6, bit 7 }, lowest bit first.
 */
CALL_SITE_DEBUGLOC
tsr_ushort2 Map16x16Tile2x2Lane(uint GroupThreadIndex)
{
	tsr_ushort2 T = tsr_ushort(GroupThreadIndex).xx;

	// Bit 0 of each coordinate comes from the two lowest bits of the thread index.
	tsr_ushort2 LowBit = (T >> tsr_ushort2(0, 1)) & tsr_ushort2(0x01, 0x01);
	// Bits 1..3 of each coordinate come from two 3-bit fields starting at bit 2 (x) and bit 5 (y).
	tsr_ushort2 HighBits = (T >> tsr_ushort2(2 - 1, 5 - 1)) & tsr_ushort2(0x07 << 1, 0x07 << 1);

	return LowBit | HighBits;
}
|
|
|
|
/** Returns rcp(x) when x is strictly positive, and 0 for any other input. */
CALL_SITE_DEBUGLOC
float SafeRcp(float x)
{
	if (x > 0.0)
	{
		return rcp(x);
	}
	return 0.0;
}
|
|
|
|
/** Component-wise SafeRcp(): rcp() of the strictly positive components, 0 for the others. */
CALL_SITE_DEBUGLOC
float2 SafeRcp(float2 x)
{
	float2 Result;
	Result.x = SafeRcp(x.x);
	Result.y = SafeRcp(x.y);
	return Result;
}
|
|
|
|
#if CONFIG_COMPILE_FP16
|
|
|
|
/** 16-bit SafeRcp(): rcp(x) clamped to MaxHalfFloat; the non-RDNA path returns 0 whenever x > 0 does not hold. */
CALL_SITE_DEBUGLOC
half SafeRcp(half x)
{
	// Clamp the reciprocal so that it never exceeds MaxHalfFloat.
	half ClampedRcp = min(rcp(x), half(MaxHalfFloat));

#if PLATFORM_GPU_ARCH >= PLATFORM_GPU_ARCH_AMD_RDNA_2 && PLATFORM_GPU_ARCH <= PLATFORM_GPU_ARCH_AMD_LATTEST
	// 0x7C00 is the bit pattern of +infinity in a half; the saturate() turns x * inf into a 0-or-1 mask.
	// If x=0.0, then MaxHalfFloat * 0.0 = 0.0
	half Mask = saturate(x * asfloat16(uint16_t(0x7C00)));
	return ClampedRcp * Mask;
#else
	return select(x > half(0.0), ClampedRcp, half(0.0));
#endif
}
|
|
|
|
/** Component-wise 16-bit SafeRcp(): rcp(x) clamped to MaxHalfFloat; the non-RDNA path returns 0 where x > 0 does not hold. */
CALL_SITE_DEBUGLOC
half2 SafeRcp(half2 x)
{
	// Clamp the reciprocals so that they never exceed MaxHalfFloat.
	half2 ClampedRcp = min(rcp(x), half(MaxHalfFloat));

#if PLATFORM_GPU_ARCH >= PLATFORM_GPU_ARCH_AMD_RDNA_2 && PLATFORM_GPU_ARCH <= PLATFORM_GPU_ARCH_AMD_LATTEST
	// 0x7C00 is the bit pattern of +infinity in a half; the saturate() turns x * inf into a 0-or-1 mask.
	// If x=0.0, then MaxHalfFloat * 0.0 = 0.0
	half2 Mask = saturate(x * asfloat16(uint16_t(0x7C00)));
	return ClampedRcp * Mask;
#else
	return select(x > half(0.0), ClampedRcp, half(0.0));
#endif
}
|
|
|
|
#endif
|
|
|
|
/**
 * Computes the normalized factors of a weighted lerp between A and B.
 *
 * The unnormalized factors are (1 - Blend) * WeightA and Blend * WeightB; both are divided by their sum through SafeRcp().
 *
 * @return A matrix whose row 0 is the factor for A and whose row 1 is the factor for B.
 */
CALL_SITE_DEBUGLOC
tsr_half2x2 WeightedLerpFactors(tsr_half2 WeightA, tsr_half2 WeightB, tsr_half2 Blend)
{
	tsr_half2 FactorA = (tsr_half(1.0) - Blend) * WeightA;
	tsr_half2 FactorB = Blend * WeightB;

	tsr_half2 Normalization = SafeRcp(FactorA + FactorB);

	return tsr_half2x2(FactorA * Normalization, FactorB * Normalization);
}
|
|
|
|
/** Scalar overload: runs the vector WeightedLerpFactors() on dpv_interleave_mono_registers() of each input and returns dpv_lo() of the result. */
CALL_SITE_DEBUGLOC
tsr_half2 WeightedLerpFactors(tsr_half WeightA, tsr_half WeightB, tsr_half Blend)
{
	tsr_half2x2 Factors = WeightedLerpFactors(
		dpv_interleave_mono_registers(WeightA),
		dpv_interleave_mono_registers(WeightB),
		dpv_interleave_mono_registers(Blend));

	return dpv_lo(Factors);
}
|
|
|
|
/**
 * Returns true for exactly one of Offset and -Offset when Offset is non-zero:
 * offsets with a positive Y, or with a zero Y and a positive X. A zero offset returns false.
 */
CALL_SITE_DEBUGLOC
bool TakeOnlyOneSamplePair(float2 Offset)
{
	if (Offset.y > 0.0)
	{
		return true;
	}

	return Offset.y == 0.0 && Offset.x > 0.0;
}
|
|
|
|
/** Returns saturate(SampleHistoryValidity * MAX_SAMPLE_COUNT - 0.2). */
tsr_half ComputePredictionCompleteness(tsr_half SampleHistoryValidity)
{
	tsr_half ScaledValidity = SampleHistoryValidity * tsr_half(MAX_SAMPLE_COUNT);
	return saturate(ScaledValidity - tsr_half(0.2));
}
|
|
|
|
/**
 * Classifies the two pixels of a dual-pixel screen position.
 *
 * @param bCameraCut          Non-zero on a camera cut; every pixel is then reported off screen.
 * @param ScreenPos           Dual-pixel screen position, rows [0] and [1] being its two coordinates.
 * @param bIsParallaxRejected Per pixel parallax rejection.
 * @param bIsOffScreen        Out: camera cut, or either coordinate has an absolute value >= 1.
 * @param bIsDisoccluded      Out: parallax rejected while not off screen.
 */
CALL_SITE_DEBUGLOC
void IsOffScreenOrDisoccluded(uint bCameraCut, float2x2 ScreenPos, bool2 bIsParallaxRejected, out bool2 bIsOffScreen, out bool2 bIsDisoccluded)
{
	bool bCut = (bCameraCut != 0);

	float2 MaxAbsCoord = max(abs(ScreenPos[0]), abs(ScreenPos[1]));
	bool2 bOutside = MaxAbsCoord >= 1.0;

	bIsOffScreen = or(bCut, bOutside);
	bIsDisoccluded = and(!bIsOffScreen, bIsParallaxRejected);
}
|
|
|
|
/**
 * Returns (1 - BlendFactor) / BlendFactor, per component.
 *
 * The constant is typed as tsr_half, like the rest of the tsr_half arithmetic in this file, so the expression is not
 * promoted to 32 bits by compilers that treat unsuffixed literals as float.
 * The division is not guarded: a BlendFactor component equal to 0 is divided by as is.
 */
tsr_half2 ComputePrevWeightMultiplier(tsr_half2 BlendFactor)
{
	return (tsr_half(1.0) - BlendFactor) / BlendFactor;
}
|
|
|
|
/** Returns 1 - RejectionFactor, per component. */
tsr_half2 RejectionFactorToBlendFactor(tsr_half2 RejectionFactor)
{
	tsr_half2 BlendFactor = tsr_half(1.0) - RejectionFactor;
	return BlendFactor;
}
|