Files
UnrealEngine/Engine/Shaders/Private/TemporalSuperResolution/TSRResolveHistory.usf
2025-05-18 13:04:45 +08:00

431 lines
14 KiB
HLSL

// Copyright Epic Games, Inc. All Rights Reserved.
#include "TSRColorSpace.ush"
//------------------------------------------------------- CONFIG
// Compile-time configuration. DIM_NYQUIST_WAVE_SIZE is expected to be supplied from outside this file:
// a non-zero value (16 or 32) selects the wave-based entry point, 0 selects the reference entry point.
#if DIM_NYQUIST_WAVE_SIZE != 0
// Not referenced in this file; presumably consumed by TSRConvolutionNetwork.ush -- TODO confirm.
#define WAVE_COUNT_X 1
#define WAVE_COUNT_Y 1
/** Number of input (history) pixels per output pixel along each axis. */
#define DOWNSAMPLE_FACTOR 2u
/** Size of the entire group loaded into memory including overscan. */
#define GROUP_TILE_SIZE 8u
/** Number of output pixels per axis a group actually writes; consecutive groups are TILE_SIZE output pixels apart. */
#define TILE_SIZE (GROUP_TILE_SIZE - 2u)
/** Number of output pixels held by one lane along X and Y. */
#define LANE_OUTPUT_STRIDE_X 2u
#if DIM_NYQUIST_WAVE_SIZE == 16
#define LANE_OUTPUT_STRIDE_Y 2u
#elif DIM_NYQUIST_WAVE_SIZE == 32
#define LANE_OUTPUT_STRIDE_Y 1u
#else
#error Unknown wave size
#endif
/** Number of input pixels held by one lane along X and Y. */
#define LANE_INPUT_STRIDE_X (LANE_OUTPUT_STRIDE_X * DOWNSAMPLE_FACTOR)
#define LANE_INPUT_STRIDE_Y (LANE_OUTPUT_STRIDE_Y * DOWNSAMPLE_FACTOR)
/** Number of lanes needed to cover GROUP_TILE_SIZE along X and Y: 4x4 = 16 lanes or 4x8 = 32 lanes, matching DIM_NYQUIST_WAVE_SIZE. */
#define LANE_COUNT_X (GROUP_TILE_SIZE / LANE_OUTPUT_STRIDE_X)
#define LANE_COUNT_Y (GROUP_TILE_SIZE / LANE_OUTPUT_STRIDE_Y)
/** Number of input pixels, respectively output pixels, stored in each individual lane. */
#define INPUT_SIMD_SIZE (LANE_INPUT_STRIDE_X * LANE_INPUT_STRIDE_Y)
#define OUTPUT_SIMD_SIZE (LANE_OUTPUT_STRIDE_X * LANE_OUTPUT_STRIDE_Y)
// Per-lane vector types: full-channel input block, full-channel output block and single-channel output block.
#define tsr_input_halfC TLaneVector2D<tsr_half, CONFIG_CHANNEL_COUNT, LANE_INPUT_STRIDE_X, LANE_INPUT_STRIDE_Y>
#define tsr_output_halfC TLaneVector2D<tsr_half, CONFIG_CHANNEL_COUNT, LANE_OUTPUT_STRIDE_X, LANE_OUTPUT_STRIDE_Y>
#define tsr_output_half TLaneVector2D<tsr_half, 1, LANE_OUTPUT_STRIDE_X, LANE_OUTPUT_STRIDE_Y>
#else
/** Reference path: each group covers TILE_SIZE x TILE_SIZE output pixels with one thread per pixel. */
#define TILE_SIZE 8
#endif
/** Width and height, in history pixels, of the filter footprint sampled by the reference path. */
#define KERNEL_SIZE 4
/** When non-zero, the filtered color is clamped to the min/max of the samples it was built from. */
#define CONFIG_CLAMP 1
/** When non-zero, the wave-based path applies HdrWeight4() before filtering and HdrWeightInvY() after. In this file only that path tests this switch. */
#define CONFIG_HDR_WEIGHT 1
//------------------------------------------------------- CONVOLUTIONS
#if DIM_NYQUIST_WAVE_SIZE != 0
#include "TSRConvolutionNetwork.ush"
#endif
//------------------------------------------------------- CONSTANTS
/**
 * Pixel offsets of the four diagonal neighbors of a texel, i.e. the corners of its 3x3 neighborhood.
 * Not referenced by the code visible in this file.
 */
static const int2 kOffsetsCross3x3[4] =
{
	int2(-1, -1), // x - 1, y - 1
	int2( 1, -1), // x + 1, y - 1
	int2(-1,  1), // x - 1, y + 1
	int2( 1,  1), // x + 1, y + 1
};
//------------------------------------------------------- PARAMETERS
// Neither of these two UAVs is accessed by the code visible in this file.
RWTexture2D<uint> PrevUseCountOutput;
RWTexture2D<uint> PrevClosestDepthOutput;
//------------------------------------------------------- LDS
// One color per thread of the group. Only the reference entry point (DIM_NYQUIST_WAVE_SIZE == 0) touches it,
// to sum 2x2 output pixels when generating mip 1.
groupshared tsr_halfC SharedArray0[TILE_SIZE * TILE_SIZE];
//------------------------------------------------------- FUNCTIONS
// B and C parameters passed to MitchellNetravali() by both entry points.
// B = C = 1/3 is the "Mitchell" preset of the table below.
static const float MitchellNetravaliB = rcp(3.0);
static const float MitchellNetravaliC = rcp(3.0);
// Mitchell Netravali
// Common (B, C) presets of the filter family:
// B-spline: B=1 C=0
// Mitchell: B=1/3 C=1/3
// Catmull-Rom: B=0 C=1/2
// Robidoux: B=0.3782 C=0.3109 (cylindrical)
// Robidoux Sharp: B=0.2620 C=0.3690 (cylindrical)
// Robidoux Soft: B=0.6796 C=0.1602 (cylindrical)
/**
 * Computes the polynomial coefficients of the two pieces of the Mitchell-Netravali filter.
 *
 * Coefficients are stored by increasing power of the distance d, so a piece evaluates as
 * Q[0] + Q[1] * d + Q[2] * d^2 + Q[3] * d^3.
 *
 * @param B      First filter parameter.
 * @param C      Second filter parameter.
 * @param OutQ0  Coefficients of the piece used for d < 1.
 * @param OutQ1  Coefficients of the piece used for 1 <= d < 2.
 */
void MitchellNetravaliCoefs(const float B, const float C, out float OutQ0[4], out float OutQ1[4])
{
	// Every coefficient shares the same 1/6 normalization.
	const float Denominator = 6.0;

	// Inner piece; it has no linear term.
	OutQ0[0] = (6.0 - 2.0 * B) / Denominator;
	OutQ0[1] = 0.0;
	OutQ0[2] = (-18.0 + 12.0 * B + 6.0 * C) / Denominator;
	OutQ0[3] = (12.0 - 9.0 * B - 6.0 * C) / Denominator;

	// Outer piece.
	OutQ1[0] = (8 * B + 24 * C) / Denominator;
	OutQ1[1] = (-12 * B - 48 * C) / Denominator;
	OutQ1[2] = (6 * B + 30 * C) / Denominator;
	OutQ1[3] = (-B - 6 * C) / Denominator;
}
/**
 * Evaluates the Mitchell-Netravali filter at distance d from the filter center.
 *
 * @param d  Distance to evaluate at. The polynomials are evaluated on d as given (no abs() is
 *           applied here); the call sites in this file pass non-negative values.
 * @param B  First filter parameter, forwarded to MitchellNetravaliCoefs().
 * @param C  Second filter parameter, forwarded to MitchellNetravaliCoefs().
 * @return The filter weight, or 0 when d is not below 2.
 */
float MitchellNetravali(float d, const float B, const float C)
{
	// B and C are constants at the call sites in this file, so the coefficients are expected
	// to be known at compile time.
	float InnerCoefs[4];
	float OuterCoefs[4];
	MitchellNetravaliCoefs(B, C, InnerCoefs, OuterCoefs);

	// Stays at zero outside of the filter's support.
	float Weight = 0;

	if (d < 1)
	{
		// Inner piece, evaluated with Horner's scheme.
		Weight = InnerCoefs[0] + d * (InnerCoefs[1] + d * (InnerCoefs[2] + d * InnerCoefs[3]));
	}
	else if (d < 2)
	{
		// Outer piece: reaching this branch with d < 2 means 1 <= d < 2.
		Weight = OuterCoefs[0] + d * (OuterCoefs[1] + d * (OuterCoefs[2] + d * OuterCoefs[3]));
	}

	return Weight;
}
//------------------------------------------------------- PARAMETERS
// Maps a dispatch thread id to the matching pixel position in the history (reference entry point only).
FScreenTransform DispatchThreadToHistoryPixelPos;
// Output rectangle: written pixel positions start at OutputViewRectMin; OutputViewRectMax is handed to
// InvalidateOutputPixelPos() together with each position before it is written.
uint2 OutputViewRectMin;
uint2 OutputViewRectMax;
// Non-zero when SceneColorOutputMip1 must be generated in addition to mip 0.
uint bGenerateOutputMip1;
// Not referenced by the code visible in this file.
float HistoryValidityMultiply;
// Source history color that gets downsampled.
Texture2D<tsr_halfC> UpdateHistoryOutputTexture;
// Destinations: mip 0 receives one filtered color per output pixel, mip 1 the average of 2x2 of those.
RWTexture2D<tsr_halfC> SceneColorOutputMip0;
RWTexture2D<tsr_halfC> SceneColorOutputMip1;
//------------------------------------------------------- ENTRY POINT NYQUIST DOWNSAMPLE
#if DIM_NYQUIST_WAVE_SIZE != 0
// Wave-based entry point: downsamples the history by DOWNSAMPLE_FACTOR (2x) with a 4x4 Mitchell-Netravali
// kernel, keeping all the pixels in per-lane vectors and exchanging data across lanes through
// WaveAccessNeighborTexel(). Writes SceneColorOutputMip0 and, when requested, SceneColorOutputMip1.
// LANE_COUNT is not defined in this file; presumably it comes from TSRConvolutionNetwork.ush -- TODO confirm.
#if COMPILER_SUPPORTS_WAVE_SIZE
WAVESIZE(DIM_NYQUIST_WAVE_SIZE)
#endif
[numthreads(LANE_COUNT, 1, 1)]
void MainCS(
uint2 GroupId : SV_GroupID,
uint GroupThreadIndex : SV_GroupIndex)
{
const uint LaneIndex = GroupThreadIndex;
// Load all the colors
tsr_input_halfC Colors;
ISOLATE
{
// Input position of the group's first pixel: consecutive groups are TILE_SIZE * DOWNSAMPLE_FACTOR input
// pixels apart and each one starts 1 pixel earlier on both axes (the overscan).
const tsr_short2 GroupPixelOffset = (
tsr_short2(HistoryInfo_ViewportMin) +
tsr_short2(GroupId) * tsr_short(TILE_SIZE * DOWNSAMPLE_FACTOR).xx +
tsr_short(-1).xx);
UNROLL_N(INPUT_SIMD_SIZE)
for (uint SimdIndex = 0; SimdIndex < INPUT_SIMD_SIZE; SimdIndex++)
{
tsr_short2 PixelPos = GroupPixelOffset + GetLaneSimdPixelOffset<LANE_INPUT_STRIDE_X, LANE_INPUT_STRIDE_Y>(LaneIndex, SimdIndex);
// Keep the fetch inside of the history viewport.
PixelPos = clamp(PixelPos, tsr_short2(HistoryInfo_ViewportMin), tsr_short2(HistoryInfo_ViewportMax - 1));
Colors.SetElement(SimdIndex, UpdateHistoryOutputTexture[PixelPos]);
}
}
tsr_output_halfC Output;
{
#if CONFIG_CLAMP
// Clamping box: min/max over each 2x2 input block, then merged with the blocks fetched at offsets (1, 0)
// and (0, 1). Since the second merge reads the already merged value, it should also pick up (1, 1) --
// assuming WaveAccessNeighborTexel() returns the value of the texel at the given offset; TODO confirm.
tsr_output_halfC Min = DownsampleMin2x2(Colors);
tsr_output_halfC Max = DownsampleMax2x2(Colors);
Min = min(Min, WaveAccessNeighborTexel(Min, tsr_short2(1, 0)));
Min = min(Min, WaveAccessNeighborTexel(Min, tsr_short2(0, 1)));
Max = max(Max, WaveAccessNeighborTexel(Max, tsr_short2(1, 0)));
Max = max(Max, WaveAccessNeighborTexel(Max, tsr_short2(0, 1)));
#endif
tsr_input_halfC LDRColors = Colors;
#if CONFIG_HDR_WEIGHT
{
// Scale every input color by its own HdrWeight4() before filtering; undone after the filter below.
UNROLL_N(INPUT_SIMD_SIZE)
for (uint SimdIndex = 0; SimdIndex < INPUT_SIMD_SIZE; SimdIndex++)
{
LDRColors.SetElement(SimdIndex, Colors.GetElement(SimdIndex) * HdrWeight4(Colors.GetElement(SimdIndex)));
}
}
#endif
// 1D kernel weights at distances 0.5 (a) and 1.5 (b) from the kernel center. One quadrant of the 2D
// kernel is laid out as follows, where the letters name the 2D products a = A*A, b = A*B, c = B*B:
/*
* ^
* | b c
* | a b
* o------>
*/
const float WeightA = MitchellNetravali(/* PixelOffset = */ 0.5, MitchellNetravaliB, MitchellNetravaliC);
const float WeightB = MitchellNetravali(/* PixelOffset = */ 1.5, MitchellNetravaliB, MitchellNetravaliC);
// Sum of the 16 taps of the 4x4 kernel, used to normalize the weights.
const float TotalWeight = 4.0 * (WeightA * WeightA + WeightB * WeightB + WeightA * WeightB * 2.0);
const tsr_half FinalWeightA = tsr_half(WeightA * WeightA / TotalWeight);
const tsr_half FinalWeightB = tsr_half(WeightA * WeightB / TotalWeight);
const tsr_half FinalWeightC = tsr_half(WeightB * WeightB / TotalWeight);
// Full 4x4 kernel; each Weights* array below holds the 4 weights of one of its 2x2 quadrants.
/*
* c b | b c
* b a | a b
* ------o------
* b a | a b
* c b | b c
*/
const tsr_half Weights0[4] = { FinalWeightC, FinalWeightB, FinalWeightB, FinalWeightA };
const tsr_half Weights1[4] = { FinalWeightB, FinalWeightC, FinalWeightA, FinalWeightB };
const tsr_half Weights2[4] = { FinalWeightB, FinalWeightA, FinalWeightC, FinalWeightB };
const tsr_half Weights3[4] = { FinalWeightA, FinalWeightB, FinalWeightB, FinalWeightC };
tsr_output_halfC Out0 = DownsampleDot2x2(LDRColors, Weights0);
tsr_output_halfC Out1 = DownsampleDot2x2(LDRColors, Weights1);
tsr_output_halfC Out2 = DownsampleDot2x2(LDRColors, Weights2);
tsr_output_halfC Out3 = DownsampleDot2x2(LDRColors, Weights3);
// Sum the four quadrants: this texel's own block plus the blocks fetched at (1, 0), (0, 1) and (1, 1).
Output = Out0 + WaveAccessNeighborTexel(Out1, tsr_short2(1, 0)) + WaveAccessNeighborTexel(Out2, tsr_short2(0, 1)) + WaveAccessNeighborTexel(Out3, tsr_short2(1, 1));
#if CONFIG_HDR_WEIGHT
{
// Undo the HDR weighting using the luma of the filtered color.
UNROLL_N(OUTPUT_SIMD_SIZE)
for (uint SimdIndex = 0; SimdIndex < OUTPUT_SIMD_SIZE; SimdIndex++)
{
Output.SetElement(SimdIndex, Output.GetElement(SimdIndex) * HdrWeightInvY(Luma4(Output.GetElement(SimdIndex))));
}
}
#endif
#if CONFIG_CLAMP
Output = clamp(Output, Min, Max);
#endif
}
ISOLATE
{
// Output position of the group's first pixel; consecutive groups are TILE_SIZE output pixels apart.
const tsr_short2 GroupPixelOffset = (
tsr_short2(OutputViewRectMin) +
tsr_short2(GroupId) * tsr_short(TILE_SIZE).xx);
Output = clamp(Output, tsr_output_halfC::Const(tsr_half(0.0)), tsr_output_halfC::Const(tsr_half(Max10BitsFloat)));
// Ensure that alpha values that are expected to be opaque (but are only close to opaque) are forced to be opaque.
// (0.995 chosen to accommodate handling of 254/255)
// The second line does the same towards fully transparent for values below 0.005.
#if CONFIG_SCENE_COLOR_ALPHA
{
Output.SetComponent(3, select(Output[3] > tsr_output_half::Const(0.995), tsr_output_half::Const(1.0), Output[3]));
Output.SetComponent(3, select(Output[3] < tsr_output_half::Const(0.005), tsr_output_half::Const(0.0), Output[3]));
}
#endif
// Write mip 0.
UNROLL_N(OUTPUT_SIMD_SIZE)
for (uint SimdIndex = 0; SimdIndex < OUTPUT_SIMD_SIZE; SimdIndex++)
{
tsr_short2 OutputPixelOffset = GetLaneSimdPixelOffset<LANE_OUTPUT_STRIDE_X, LANE_OUTPUT_STRIDE_Y>(LaneIndex, SimdIndex);
tsr_short2 OutputPixelPos = InvalidateOutputPixelPos(GroupPixelOffset + OutputPixelOffset, OutputViewRectMax);
// Texels at offsets >= TILE_SIZE (the overscan) get an all-ones X coordinate; presumably this makes
// the store below land out of bounds so that it is dropped -- TODO confirm.
OutputPixelPos.x = select(all(OutputPixelOffset < TILE_SIZE), OutputPixelPos.x, ~tsr_short(0));
SceneColorOutputMip0[OutputPixelPos] = Output.GetElement(SimdIndex);
}
// Write mip 1: average of 2x2 mip 0 pixels.
{
tsr_output_halfC HalfResOutput = Output * tsr_half(0.25);
// Forces the * tsr_half(0.25) to be applied before to avoid turning bright pixels to +inf in the adds below.
#if CONFIG_FP16_PRECISE_MULTIPLY_ORDER
HalfResOutput = min(HalfResOutput, tsr_output_halfC::Const(tsr_half(Max10BitsFloat * 0.25)));
#endif
// Add the value fetched at (1, 0), then the already summed value fetched at (0, 1).
HalfResOutput = HalfResOutput + WaveAccessNeighborTexel(HalfResOutput, tsr_short2(1, 0));
HalfResOutput = HalfResOutput + WaveAccessNeighborTexel(HalfResOutput, tsr_short2(0, 1));
// Only the element at SimdIndex 0 of each lane is written to mip 1.
tsr_short2 OutputPixelOffset = GetLaneSimdPixelOffset<LANE_OUTPUT_STRIDE_X, LANE_OUTPUT_STRIDE_Y>(LaneIndex, /* SimdIndex = */ 0);
tsr_short2 OutputPixelPos = InvalidateOutputPixelPos(GroupPixelOffset + OutputPixelOffset, OutputViewRectMax);
OutputPixelPos.x = select(all(OutputPixelOffset < TILE_SIZE), OutputPixelPos.x, ~tsr_short(0));
tsr_short2 HalfResOutputPixelPos = OutputPixelPos >> tsr_short(1);
// Force a coordinate to all ones when the mip 0 coordinate is odd or when mip 1 is not requested, so
// that only texels at even mip 0 positions can produce a usable mip 1 position.
HalfResOutputPixelPos[0] |= ((OutputPixelPos[0] & tsr_short(0x1)) | tsr_short(!bGenerateOutputMip1)) * tsr_short(~0);
HalfResOutputPixelPos[1] |= ((OutputPixelPos[1] & tsr_short(0x1)) | tsr_short(!bGenerateOutputMip1)) * tsr_short(~0);
SceneColorOutputMip1[HalfResOutputPixelPos] = HalfResOutput.GetElement(/* SimdIndex = */ 0);
}
}
}
#else // DIM_NYQUIST_WAVE_SIZE == 0
//------------------------------------------------------- ENTRY POINT REFERENCE
// Reference entry point: one thread per output pixel. Filters a KERNEL_SIZE x KERNEL_SIZE footprint of the
// history with Mitchell-Netravali weights, then writes SceneColorOutputMip0 and, when requested,
// SceneColorOutputMip1.
[numthreads(TILE_SIZE * TILE_SIZE, 1, 1)]
void MainCS(
uint2 GroupId : SV_GroupID,
uint GroupThreadIndex : SV_GroupIndex)
{
// Only consumed by the DEBUG_OUTPUT block at the end of the function.
float4 Debug = 0.0;
// Position of the thread inside of the TILE_SIZE x TILE_SIZE tile, derived from the thread index through
// ZOrder2D() with log2(TILE_SIZE) bits, plus the tile origin of the group.
uint2 DispatchThreadId = (
ZOrder2D(GroupThreadIndex, uint(log2(float(TILE_SIZE)))) +
GroupId * uint2(TILE_SIZE, TILE_SIZE));
tsr_halfC Color = tsr_half(0.0);
// Downsample the history
{
// Position of the output pixel in the history.
float2 OutputPixelPos = ApplyScreenTransform(float2(DispatchThreadId), DispatchThreadToHistoryPixelPos);
// Center of the top left texel of the footprint; with KERNEL_SIZE == 4 this is floor(OutputPixelPos - 1.5) + 0.5.
float2 TopLeftKernel = floor(OutputPixelPos + (0.5 - 0.5 * KERNEL_SIZE)) + 0.5;
#if CONFIG_CLAMP
// Running min/max of the samples; the max starts at 0 rather than at a negative value.
tsr_halfC MinColor = tsr_half(POSITIVE_INFINITY);
tsr_halfC MaxColor = tsr_half(0.0);
#endif
tsr_half ColorWeight = tsr_half(0.0);
UNROLL_N(KERNEL_SIZE)
for (uint x = 0; x < KERNEL_SIZE; x++)
{
UNROLL_N(KERNEL_SIZE)
for (uint y = 0; y < KERNEL_SIZE; y++)
{
float2 SamplePixelPos = TopLeftKernel + float2(x, y);
float2 SampleUV = SamplePixelPos * HistoryInfo_ExtentInverse;
// Keep the sample inside of the history viewport.
SampleUV = clamp(SampleUV, HistoryInfo_UVViewportBilinearMin, HistoryInfo_UVViewportBilinearMax);
tsr_halfC SampleColor = UpdateHistoryOutputTexture.SampleLevel(GlobalBilinearClampedSampler, SampleUV, 0);
// Separable kernel: product of the 1D weights of the per-axis distances to the output pixel.
float2 PixelOffset = abs(SamplePixelPos - OutputPixelPos);
tsr_half kernelWeight = tsr_half(MitchellNetravali(PixelOffset.x, MitchellNetravaliB, MitchellNetravaliC) * MitchellNetravali(PixelOffset.y, MitchellNetravaliB, MitchellNetravaliC));
// Applied unconditionally here: CONFIG_HDR_WEIGHT is not tested in this entry point.
tsr_half SampleHdrWeight = HdrWeight4(SampleColor);
#if CONFIG_CLAMP
MinColor = min(MinColor, SampleColor);
MaxColor = max(MaxColor, SampleColor);
#endif
Color += (kernelWeight * SampleHdrWeight) * SampleColor;
ColorWeight += (kernelWeight * SampleHdrWeight);
}
}
// Normalize by the accumulated weight, which includes the HDR weights.
Color *= SafeRcp(ColorWeight);
#if CONFIG_CLAMP
Color = clamp(Color, MinColor, MaxColor);
#endif
}
// Compute final output
tsr_halfC FinalOutputColor = Color;
FinalOutputColor = clamp(FinalOutputColor, tsr_half(0.0), tsr_half(Max10BitsFloat));
// Ensure that alpha values that are expected to be opaque (but are only close to opaque) are forced to be opaque.
// (0.995 chosen to accommodate handling of 254/255)
// The second line does the same towards fully transparent for values below 0.005.
#if CONFIG_SCENE_COLOR_ALPHA
{
FinalOutputColor[3] = select(FinalOutputColor[3] > tsr_half(0.995), tsr_half(1.0), FinalOutputColor[3]);
FinalOutputColor[3] = select(FinalOutputColor[3] < tsr_half(0.005), tsr_half(0.0), FinalOutputColor[3]);
}
#endif
// Presumably returns a position that makes the stores below no-ops for pixels outside of the output
// rectangle -- TODO confirm against InvalidateOutputPixelPos().
tsr_short2 OutputPixelPosition = InvalidateOutputPixelPos(tsr_short2(OutputViewRectMin + DispatchThreadId), OutputViewRectMax);
// Output final scene color Mip0
{
SceneColorOutputMip0[OutputPixelPosition] = FinalOutputColor;
}
// Output final scene color Mip1
{
uint LocalGroupThreadIndex = GroupThreadIndex;
tsr_halfC HalfResOutput = FinalOutputColor * tsr_half(0.25);
// Forces the * tsr_half(0.25) to be applied before to avoid turning bright pixels to +inf in the adds below.
#if CONFIG_FP16_PRECISE_MULTIPLY_ORDER
HalfResOutput = min(HalfResOutput, tsr_half(Max10BitsFloat * 0.25));
#endif
// Halve the mip 0 position, and force a coordinate to all ones when it is odd or when mip 1 is not
// requested, so that only threads at even mip 0 positions can produce a usable mip 1 position.
tsr_short2 HalfResOutputPixelPos;
HalfResOutputPixelPos[0] = (OutputPixelPosition[0] >> tsr_short(1)) | (((OutputPixelPosition[0] & tsr_short(0x1)) | tsr_short(!bGenerateOutputMip1)) * tsr_short(~0));
HalfResOutputPixelPos[1] = (OutputPixelPosition[1] >> tsr_short(1)) | (((OutputPixelPosition[1] & tsr_short(0x1)) | tsr_short(!bGenerateOutputMip1)) * tsr_short(~0));
// The wave broadcast variant is compiled out by the "&& 0" below.
#if PLATFORM_SUPPORTS_WAVE_BROADCAST && 0 // Support WaveBroadcast with halfs
BRANCH
if (bGenerateOutputMip1)
{
FWaveBroadcastSettings Horizontal = InitWaveXorButterfly(/* XorButterFly = */ 0x1);
FWaveBroadcastSettings Vertical = InitWaveXorButterfly(/* XorButterFly = */ 0x2);
HalfResOutput += WaveBroadcast(Horizontal, HalfResOutput);
HalfResOutput += WaveBroadcast(Vertical, HalfResOutput);
}
#else
BRANCH
if (bGenerateOutputMip1)
{
// Sum the values of threads i, i ^ 1, i ^ 2 and i ^ 3 through group shared memory; presumably these
// four threads form a 2x2 pixel quad thanks to the ZOrder2D() thread layout -- TODO confirm.
// NOTE(review): a barrier is only issued on Metal, and none separates the two read-modify-write steps.
// On other platforms this looks like it relies on the four threads executing in lockstep -- confirm.
SharedArray0[LocalGroupThreadIndex] = HalfResOutput;
#if COMPILER_METAL
GroupMemoryBarrierWithGroupSync();
#endif
SharedArray0[LocalGroupThreadIndex] += SharedArray0[LocalGroupThreadIndex ^ 0x1];
SharedArray0[LocalGroupThreadIndex] += SharedArray0[LocalGroupThreadIndex ^ 0x2];
HalfResOutput = SharedArray0[LocalGroupThreadIndex];
}
#endif
// Issued by every thread; it depends on the all-ones coordinates built above not hitting a valid texel.
SceneColorOutputMip1[HalfResOutputPixelPos] = HalfResOutput;
}
#if DEBUG_OUTPUT
{
DebugOutput[tsr_short3(OutputPixelPosition, 0)] = Debug;
}
#endif
}
#endif // DIM_NYQUIST_WAVE_SIZE == 0