Files
UnrealEngine/Engine/Shaders/Private/DiaphragmDOF/DOFSetup.usf
2025-05-18 13:04:45 +08:00

508 lines
14 KiB
HLSL

// Copyright Epic Games, Inc. All Rights Reserved.
/*=============================================================================
DiaphragmDOF/DOFSetup.usf: Diaphragm DOF's setup shader.
=============================================================================*/
// Defined before the include below; presumably consumed by the shared DOF headers to
// select setup-pass specific code (TODO confirm in DOFDownsample.ush).
#define CONFIG_SETUP 1
#include "DOFDownsample.ush"
//------------------------------------------------------- COMPILE TIME CONSTANTS
// Thread group dimensions used by the [numthreads] attribute of every SetupCS entry point.
#define THREADGROUP_SIZE_X (8)
#define THREADGROUP_SIZE_Y (8)
// Number of threads in one group. Sizes the groupshared quad arrays (non wave-ops path)
// and is compared against the wave lane count (wave-ops path).
#define THREADGROUP_SIZE_TOTAL (THREADGROUP_SIZE_X * THREADGROUP_SIZE_Y)
//------------------------------------------------------- PARAMETERS
// Scene color input, point sampled by SetupDOF().
Texture2D SceneColorTexture;
// Scene depth input; its red channel is read as device Z and converted to a CoC radius in SetupDOF().
Texture2D SceneDepthTexture;
//------------------------------------------------------- OUTPUTS
#if CONFIG_DOF_ALPHA
// Full res CoC radius only (SetupCS does not write the full res color in this configuration).
RWTexture2D<float> Output0;
// Half res color, all four channels.
RWTexture2D<float4> Output1;
// Half res CoC radius.
RWTexture2D<float> Output2;
#else
// Full res: rgb = scene color, a = CoC radius.
RWTexture2D<float4> Output0;
// Half res: rgb = downsampled scene color, a = downsampled CoC radius.
RWTexture2D<float4> Output1;
#endif
//------------------------------------------------------- FUNCTIONS
// Inputs of SetupDOF(). Filled in by each SetupCS entry point.
struct FDOFSetupParameters
{
// Compile-time variables
// Each SetupCS permutation assigns these two flags literal true/false values;
// SetupDOF() branches on them to pick the full+half, full only or half only code path.
bool bOutputFullRes;
bool bOutputHalfRes;
// Scene color texture sampled by SetupDOF().
Texture2D SceneColorTexture;
// Scene depth texture sampled by SetupDOF() (red channel is used as device Z).
Texture2D SceneDepthTexture;
};
// Result of SetupDOF() for one thread. SetupDOF() zero-initializes the whole struct, so
// both validity flags are false unless the matching code path sets them.
struct FDOFSetupOutput
{
// Requires bOutputFullRes
// True when the full res members below hold a pixel that should be written out.
bool bFullResIsValid;
// Destination pixel coordinate in the full res output (view rect min already added).
uint2 FullResOutputPixelPosition;
// Scene color fetched for this pixel.
float4 FullResColor;
// CoC radius derived from the scene depth of this pixel.
float FullResCocRadius;
// Requires bOutputHalfRes
// True when the half res members below hold a pixel that should be written out.
bool bHalfResIsValid;
// Destination pixel coordinate in the half res output.
uint2 HalfResOutputPixelPosition;
// Color produced by DownsampleSceneColorWithCoc() from four full res samples.
float4 HalfResColor;
// CoC radius produced by DownsampleSceneColorWithCoc() from four full res samples.
float HalfResCocRadius;
};
// Turns a linear lane index (0..63) into a 2D coordinate inside an 8x8 tile, following a
// Morton-like space-filling curve.
// From the AMD GPUOpen "Optimizing for the Radeon RDNA architecture"
//
// Index bits b5..b0 are distributed as:
//   x = (b4 b3 b0)
//   y = (b5 b2 b1)
// so that every run of 4 consecutive indices covers one 2x2 block of pixels:
//
// 00 01 08 09 16 17 24 25
// 02 03 10 11 18 19 26 27
// 04 05 12 13 20 21 28 29
// 06 07 14 15 22 23 30 31
// 32 33 40 41 48 49 56 57
// 34 35 42 43 50 51 58 59
// 36 37 44 45 52 53 60 61
// 38 39 46 47 54 55 62 63
uint2 MortonLikeDecode(uint Index)
{
	// b0 stays in place, b3 and b4 move down to bits 1 and 2.
	const uint X = (Index & 0x1) | ((Index >> 2) & 0x6);
	// b1 and b2 move down to bits 0 and 1, b5 moves down to bit 2.
	const uint Y = ((Index >> 1) & 0x3) | ((Index >> 3) & 0x4);
	return uint2(X, Y);
}
#if WAVE_OPS
// Counter used by FSoftwareQuad::Setup() to hand out a distinct index to every wave of the
// group when a single wave does not cover the whole thread group.
groupshared uint GroupNumWaves;
// Wave-intrinsic implementation of a "software quad": every 4 consecutive lanes of a wave
// form one quad, and each lane keeps its own pixel (color + CoC radius) in the members
// below. Pixels of the other lanes of the quad are read back with wave lane reads/swizzles,
// so no groupshared storage is used for the pixel data.
struct FSoftwareQuad
{
// Lane index (within the wave) of the first lane of this lane's quad.
uint WaveQuadLeaderLane;
// True for the first lane of each quad.
bool bIsLeaderLane;
// Pixel coordinate handled by this lane, relative to the view rect (group offset included).
uint2 LocalPixelPosition;
// Pixel data stored by SetLocalPixel() and read by other lanes in GetPixel().
float4 LocalColor;
float LocalCocRadius;
// Assigns a pixel to this lane and works out its position inside its quad.
// GroupThreadIndex is only used to pick the thread that resets GroupNumWaves.
void Setup(uint2 GroupId, uint GroupThreadIndex)
{
// Stays 0 when one wave is at least as large as the thread group.
uint GroupWaveIndex = 0;
if (WaveGetLaneCount() < THREADGROUP_SIZE_TOTAL)
{
// Several waves per group: reset the shared counter, then let the first lane of every
// wave grab a unique index. The index depends on the order in which the waves reach
// the atomic.
if(GroupThreadIndex == 0)
{
GroupNumWaves = 0;
}
// Makes the reset visible to the whole group before any wave increments the counter.
GroupMemoryBarrierWithGroupSync();
if(WaveIsFirstLane())
{
InterlockedAdd(GroupNumWaves, 1, GroupWaveIndex);
}
// Broadcast the index fetched by the first lane to the rest of the wave.
GroupWaveIndex = WaveReadLaneFirst(GroupWaveIndex);
}
uint2 GroupSize = uint2(THREADGROUP_SIZE_X, THREADGROUP_SIZE_Y);
// Index of lane in group
uint GroupLane = (GroupWaveIndex * WaveGetLaneCount()) + WaveGetLaneIndex();
// Map threads to pixels using morton-like curve for better texture cache hit rate
LocalPixelPosition = (GroupSize * GroupId) + MortonLikeDecode(GroupLane);
// Quads are 4 consecutive lanes: clear the two low bits to get the quad's first lane.
WaveQuadLeaderLane = WaveGetLaneIndex() & (~(4 - 1));
bIsLeaderLane = (WaveGetLaneIndex() & (4 - 1)) == 0;
}
// Returns true on the first lane of each quad.
bool IsLeader()
{
return bIsLeaderLane;
}
// Returns the pixel coordinate computed by Setup().
uint2 GetLocalPixelPosition()
{
return LocalPixelPosition;
}
// Stores this lane's pixel so that GetPixel() can read it from other lanes of the quad.
void SetLocalPixel(float4 Color, float CocRadius)
{
LocalColor = Color;
LocalCocRadius = CocRadius;
}
// Intentionally empty: the pixel data lives in per-lane members rather than in
// groupshared memory, so this implementation issues no barrier here.
void Sync()
{ }
// Fetches the pixel stored by lane PixelIndex (0..3) of this lane's quad.
// Unlike the non wave-ops implementation, the read is performed on every lane, not only
// on quad leaders.
void GetPixel(uint PixelIndex, out float4 Color, out float CocRadius)
{
// Could maybe also use QuadReadLaneAt where it is supported
#if COMPILER_SUPPORTS_WAVE_SWIZZLE_GCN
// NOTE(review): presumably the swizzle reads from lane ((Lane & AndMask) | OrMask), i.e.
// lane PixelIndex of the quad - confirm against the WaveLaneSwizzleGCN definition.
const uint AndMask = ~(4 - 1);
const uint OrMask = PixelIndex;
// Values are moved as raw 32 bit patterns, one channel at a time.
Color.r = asfloat(WaveLaneSwizzleGCN(asuint(LocalColor.r), AndMask, OrMask, 0));
Color.g = asfloat(WaveLaneSwizzleGCN(asuint(LocalColor.g), AndMask, OrMask, 0));
Color.b = asfloat(WaveLaneSwizzleGCN(asuint(LocalColor.b), AndMask, OrMask, 0));
Color.a = asfloat(WaveLaneSwizzleGCN(asuint(LocalColor.a), AndMask, OrMask, 0));
CocRadius = asfloat(WaveLaneSwizzleGCN(asuint(LocalCocRadius), AndMask, OrMask, 0));
#elif COMPILER_SUPPORTS_WAVE_SWIZZLE_RDNA
// NOTE(review): exact lane selection semantics of WaveLaneRotateSwizzleRDNA are defined
// outside this file - confirm that it yields lane PixelIndex of the quad.
Color.r = asfloat(WaveLaneRotateSwizzleRDNA(asuint(LocalColor.r), PixelIndex, 0));
Color.g = asfloat(WaveLaneRotateSwizzleRDNA(asuint(LocalColor.g), PixelIndex, 0));
Color.b = asfloat(WaveLaneRotateSwizzleRDNA(asuint(LocalColor.b), PixelIndex, 0));
Color.a = asfloat(WaveLaneRotateSwizzleRDNA(asuint(LocalColor.a), PixelIndex, 0));
CocRadius = asfloat(WaveLaneRotateSwizzleRDNA(asuint(LocalCocRadius), PixelIndex, 0));
#else
// Generic path: read directly from the absolute lane index inside the wave.
Color = WaveReadLaneAt(LocalColor, WaveQuadLeaderLane + PixelIndex);
CocRadius = WaveReadLaneAt(LocalCocRadius, WaveQuadLeaderLane + PixelIndex);
#endif // COMPILER_SUPPORTS_WAVE_SWIZZLE_RDNA
}
};
#else // !WAVE_OPS
// Per-group scratch storage: one color slot and one CoC radius slot per thread of the group.
groupshared float4 SoftQuadColors[THREADGROUP_SIZE_TOTAL];
groupshared float SoftQuadCocRadius[THREADGROUP_SIZE_TOTAL];

// Groupshared-memory implementation of a "software quad": every 4 consecutive threads of
// the group form one quad. Each thread publishes its pixel (color + CoC radius) into the
// arrays above, and after Sync() the first thread of each quad can read all 4 pixels.
struct FSoftwareQuad
{
	// Index of this thread inside the group; also its slot in the groupshared arrays.
	uint GroupLaneIndex;
	// Group thread index of the first thread of this thread's quad.
	uint QuadLeaderLane;
	// True for the first thread of each quad.
	bool bIsLeaderLane;
	// Pixel coordinate handled by this thread, relative to the view rect (group offset included).
	uint2 LocalPixelPosition;

	// Assigns a pixel to this thread and works out its position inside its quad.
	void Setup(uint2 GroupId, uint GroupThreadIndex)
	{
		// The two low bits of the thread index select the thread inside its quad.
		const uint LaneInQuadMask = 4 - 1;
		const uint2 GroupOrigin = uint2(THREADGROUP_SIZE_X, THREADGROUP_SIZE_Y) * GroupId;

		GroupLaneIndex = GroupThreadIndex;
		QuadLeaderLane = GroupThreadIndex & (~LaneInQuadMask);
		bIsLeaderLane = (GroupThreadIndex & LaneInQuadMask) == 0;

		// Threads are laid out along a morton-like curve for better texture cache hit rate.
		LocalPixelPosition = GroupOrigin + MortonLikeDecode(GroupThreadIndex);
	}

	// Returns true on the first thread of each quad.
	bool IsLeader()
	{
		return bIsLeaderLane;
	}

	// Returns the pixel coordinate computed by Setup().
	uint2 GetLocalPixelPosition()
	{
		return LocalPixelPosition;
	}

	// Publishes this thread's pixel into its groupshared slot.
	void SetLocalPixel(float4 Color, float CocRadius)
	{
		SoftQuadCocRadius[GroupLaneIndex] = CocRadius;
		SoftQuadColors[GroupLaneIndex] = Color;
	}

	// Waits until every thread of the group has published its pixel.
	void Sync()
	{
		GroupMemoryBarrierWithGroupSync();
	}

	// Fetches the pixel published by thread PixelIndex (0..3) of this thread's quad.
	// Only quad leaders perform the read; every other thread gets zeros.
	void GetPixel(uint PixelIndex, out float4 Color, out float CocRadius)
	{
		if (IsLeader())
		{
			const uint SourceLane = QuadLeaderLane + PixelIndex;
			Color = SoftQuadColors[SourceLane];
			CocRadius = SoftQuadCocRadius[SourceLane];
		}
		else
		{
			Color = 0;
			CocRadius = 0;
		}
	}
};
#endif // WAVE_OPS
// Core of the setup pass, shared by every SetupCS permutation.
//
// Reads scene color and scene depth, converts depth to a CoC radius, and fills the full res
// and/or half res members of FDOFSetupOutput depending on Parameters.bOutputFullRes and
// Parameters.bOutputHalfRes:
//  - full + half: launch with 1 thread per fullres pixel, and 8x8 groupsize. Every 4
//    consecutive threads form a 2x2 pixel quad; the quad leader also emits the half res pixel.
//  - full only:   1 thread per full res pixel, addressed with DispatchThreadId.
//  - half only:   1 thread per half res pixel, addressed with DispatchThreadId.
// When neither flag is set, or when the thread falls outside of the view, the returned
// struct is all zeros (both validity flags false).
FDOFSetupOutput SetupDOF(
	FDOFSetupParameters Parameters,
	uint2 GroupId,
	uint GroupThreadIndex,
	uint2 DispatchThreadId
	)
{
	FDOFSetupOutput Output = (FDOFSetupOutput)0;
	const uint2 GroupSize = uint2(THREADGROUP_SIZE_X, THREADGROUP_SIZE_Y);

	if (Parameters.bOutputFullRes)
	{
		if (Parameters.bOutputHalfRes)
		{
			// Skip groups that start outside of the view. The test only depends on GroupId,
			// so either the whole group leaves here or the whole group reaches Quad.Sync().
			if (any((GroupId * GroupSize) >= View.ViewRectMinAndSize.zw))
			{
				return Output;
			}

			FSoftwareQuad Quad;
			Quad.Setup(GroupId, GroupThreadIndex);

			const uint2 ViewRectOrigin = uint2(View.ViewRectMin.xy);
			const uint2 LocalPixelPosition = Quad.GetLocalPixelPosition();

			// Threads past the view edge keep running (the quad still needs 4 pixels): their
			// read position is limited here and their full res output is flagged as invalid.
			const uint2 InputPixelPosition = ViewRectOrigin + min(LocalPixelPosition, View.ViewRectMinAndSize.zw);
			Output.FullResOutputPixelPosition = ViewRectOrigin + LocalPixelPosition;
			Output.bFullResIsValid = all(LocalPixelPosition < View.ViewRectMinAndSize.zw);

			// Fetch scene color/depth at the pixel center, with the UV kept inside
			// View.BufferBilinearUVMinMax, and compute CocRadius.
			const float2 UnclampedSceneBufferUV = (InputPixelPosition + 0.5) * View.BufferSizeAndInvSize.zw;
			const float2 SceneBufferUV = clamp(UnclampedSceneBufferUV, View.BufferBilinearUVMinMax.xy, View.BufferBilinearUVMinMax.zw);

			float4 SceneColor = Parameters.SceneColorTexture.SampleLevel(GlobalPointClampedSampler, SceneBufferUV, 0);
			float DeviceZ = Parameters.SceneDepthTexture.SampleLevel(GlobalPointClampedSampler, SceneBufferUV, 0).r;
			float CocRadius = SceneDepthToCocRadius(ConvertFromDeviceZ(DeviceZ));

			// Output full resolution.
			Output.FullResColor = SceneColor;
			Output.FullResCocRadius = CocRadius;

			// Share this pixel with the rest of the quad, then gather the 4 pixels of the quad.
			Quad.SetLocalPixel(SceneColor, CocRadius);
			Quad.Sync();

			float4 QuadColors[4];
			float QuadCocRadii[4];

			UNROLL
			for (uint QuadPixel = 0; QuadPixel < 4; QuadPixel++)
			{
				Quad.GetPixel(QuadPixel, /* out */ QuadColors[QuadPixel], /* out */ QuadCocRadii[QuadPixel]);
			}

			// Output half resolution: one pixel per quad, produced by the quad leader.
			BRANCH
			if (Quad.IsLeader())
			{
				FCocDownsampleParams DownsampleParams;
				DownsampleParams.CocRadiusMultiplier = 1.0;
				DownsampleParams.FrameExposureScale = 1.0;
				DownsampleParams.bDoColorBasedWeighting = false;

				Output.bHalfResIsValid = true;
				Output.HalfResOutputPixelPosition = InputPixelPosition / 2;

				DownsampleSceneColorWithCoc(DownsampleParams, QuadColors, QuadCocRadii, Output.HalfResColor, Output.HalfResCocRadius);
			}
		}
		else
		{
			// Full resolution only: nothing is shared between threads, so threads outside
			// of the view can simply leave.
			if (any(DispatchThreadId >= View.ViewRectMinAndSize.zw))
			{
				return Output;
			}

			uint2 InputPixelPosition = View.ViewRectMin.xy + DispatchThreadId;
			float2 SceneBufferUV = (InputPixelPosition + 0.5) * View.BufferSizeAndInvSize.zw;

			Output.bFullResIsValid = true;
			Output.FullResOutputPixelPosition = InputPixelPosition;
			Output.FullResColor = Parameters.SceneColorTexture.SampleLevel(GlobalPointClampedSampler, SceneBufferUV, 0);

			float DeviceZ = Parameters.SceneDepthTexture.SampleLevel(GlobalPointClampedSampler, SceneBufferUV, 0).r;
			Output.FullResCocRadius = SceneDepthToCocRadius(ConvertFromDeviceZ(DeviceZ));
		}
	}
	else if (Parameters.bOutputHalfRes)
	{
		// Half resolution only: size of the view in half res pixels, rounded up.
		const uint2 HalfResViewSize = uint2(
			DivideAndRoundUp(View.ViewRectMinAndSize.z, 2),
			DivideAndRoundUp(View.ViewRectMinAndSize.w, 2));

		if (any(DispatchThreadId >= HalfResViewSize))
		{
			return Output;
		}

		Output.bHalfResIsValid = true;
		Output.HalfResOutputPixelPosition = (View.ViewRectMin.xy / 2) + DispatchThreadId;

		// UV of the corner shared by the 4 full res pixels covered by this half res pixel,
		// and the half texel step that leads from that corner to each pixel center.
		const float2 SceneBufferUV = (float2(View.ViewRectMin.xy) + 2.0 * (DispatchThreadId + 0.5)) * View.BufferSizeAndInvSize.zw;
		const float2 SceneBufferUVOffset = View.BufferSizeAndInvSize.zw * 0.5;

		FCocDownsampleParams DownsampleParams;
		DownsampleParams.CocRadiusMultiplier = 1.0;
		DownsampleParams.FrameExposureScale = 1.0;
		DownsampleParams.bDoColorBasedWeighting = false;

		// Groups whose footprint reaches the end of the half res view take the slower path
		// that clamps every sample UV.
		const bool bIsBorderPixel = any((GroupId + 1) * GroupSize >= HalfResViewSize);

		float4 SceneColors[4];
		float CocRadii[4];

		BRANCH
		if (bIsBorderPixel)
		{
			// Sample scene color and depth one pixel at a time, with clamped UVs.
			for (uint SampleIndex = 0; SampleIndex < 4; SampleIndex++)
			{
				const float2 SampleUV = clamp(
					SceneBufferUV + SceneBufferUVOffset * float2(kOffsetsCross3x3[SampleIndex]),
					View.BufferBilinearUVMinMax.xy,
					View.BufferBilinearUVMinMax.zw);

				SceneColors[SampleIndex] = Parameters.SceneColorTexture.SampleLevel(GlobalPointClampedSampler, SampleUV, 0);

				const float SampleDeviceZ = Parameters.SceneDepthTexture.SampleLevel(GlobalPointClampedSampler, SampleUV, 0).r;
				CocRadii[SampleIndex] = SceneDepthToCocRadius(ConvertFromDeviceZ(SampleDeviceZ));
			}
		}
		else
		{
			// Fetch Coc: a single gather returns the 4 depths. The components are read in
			// w, z, x, y order, presumably to line up with the kOffsetsCross3x3 order used
			// for the colors below (the table is defined outside of this file).
			const float4 SceneDepthGather = Parameters.SceneDepthTexture.Gather(GlobalPointClampedSampler, SceneBufferUV);
			CocRadii[0] = SceneDepthToCocRadius(ConvertFromDeviceZ(SceneDepthGather.w));
			CocRadii[1] = SceneDepthToCocRadius(ConvertFromDeviceZ(SceneDepthGather.z));
			CocRadii[2] = SceneDepthToCocRadius(ConvertFromDeviceZ(SceneDepthGather.x));
			CocRadii[3] = SceneDepthToCocRadius(ConvertFromDeviceZ(SceneDepthGather.y));

			// Fetch color.
			for (uint SampleIndex = 0; SampleIndex < 4; SampleIndex++)
			{
				const float2 SampleUV = SceneBufferUV + SceneBufferUVOffset * float2(kOffsetsCross3x3[SampleIndex]);
				SceneColors[SampleIndex] = Parameters.SceneColorTexture.SampleLevel(GlobalPointClampedSampler, SampleUV, 0);
			}
		}

		DownsampleSceneColorWithCoc(DownsampleParams, SceneColors, CocRadii, Output.HalfResColor, Output.HalfResCocRadius);
	}

	return Output;
}
#if DIM_OUTPUT_RES_DIVISOR == 0 // Output full & half resolution.
// Entry point of the permutation that writes both the full res and the half res outputs.
[numthreads(THREADGROUP_SIZE_X, THREADGROUP_SIZE_Y, 1)]
void SetupCS(
	uint2 GroupId : SV_GroupID,
	uint GroupThreadIndex : SV_GroupIndex,
	uint2 DispatchThreadId : SV_DispatchThreadID)
{
	FDOFSetupParameters SetupParameters;
	SetupParameters.SceneColorTexture = SceneColorTexture;
	SetupParameters.SceneDepthTexture = SceneDepthTexture;
	SetupParameters.bOutputFullRes = true;
	SetupParameters.bOutputHalfRes = true;

	FDOFSetupOutput SetupResult = SetupDOF(SetupParameters, GroupId, GroupThreadIndex, DispatchThreadId);

	// Full res: CoC radius alone with CONFIG_DOF_ALPHA, otherwise rgb + CoC radius in alpha.
	if (SetupResult.bFullResIsValid)
	{
		const uint2 FullResPosition = SetupResult.FullResOutputPixelPosition;
#if CONFIG_DOF_ALPHA
		Output0[FullResPosition] = SetupResult.FullResCocRadius;
#else
		Output0[FullResPosition] = float4(SetupResult.FullResColor.rgb, SetupResult.FullResCocRadius);
#endif
	}

	// Half res: color and CoC radius in separate targets with CONFIG_DOF_ALPHA, otherwise
	// rgb + CoC radius in alpha.
	if (SetupResult.bHalfResIsValid)
	{
		const uint2 HalfResPosition = SetupResult.HalfResOutputPixelPosition;
#if CONFIG_DOF_ALPHA
		Output1[HalfResPosition] = SetupResult.HalfResColor;
		Output2[HalfResPosition] = SetupResult.HalfResCocRadius;
#else
		Output1[HalfResPosition] = float4(SetupResult.HalfResColor.rgb, SetupResult.HalfResCocRadius);
#endif
	}
}
#elif DIM_OUTPUT_RES_DIVISOR == 1 // Output full resolution only.
// Entry point of the permutation that writes the full res output only.
[numthreads(THREADGROUP_SIZE_X, THREADGROUP_SIZE_Y, 1)]
void SetupCS(
	uint2 GroupId : SV_GroupID,
	uint GroupThreadIndex : SV_GroupIndex,
	uint2 DispatchThreadId : SV_DispatchThreadID)
{
	FDOFSetupParameters SetupParameters;
	SetupParameters.SceneColorTexture = SceneColorTexture;
	SetupParameters.SceneDepthTexture = SceneDepthTexture;
	SetupParameters.bOutputFullRes = true;
	SetupParameters.bOutputHalfRes = false;

	FDOFSetupOutput SetupResult = SetupDOF(SetupParameters, GroupId, GroupThreadIndex, DispatchThreadId);

	// Full res: CoC radius alone with CONFIG_DOF_ALPHA, otherwise rgb + CoC radius in alpha.
	if (SetupResult.bFullResIsValid)
	{
		const uint2 FullResPosition = SetupResult.FullResOutputPixelPosition;
#if CONFIG_DOF_ALPHA
		Output0[FullResPosition] = SetupResult.FullResCocRadius;
#else
		Output0[FullResPosition] = float4(SetupResult.FullResColor.rgb, SetupResult.FullResCocRadius);
#endif
	}
}
#elif DIM_OUTPUT_RES_DIVISOR == 2 // Output half resolution only.
// Entry point of the permutation that writes the half res output only.
[numthreads(THREADGROUP_SIZE_X, THREADGROUP_SIZE_Y, 1)]
void SetupCS(
	uint2 GroupId : SV_GroupID,
	uint GroupThreadIndex : SV_GroupIndex,
	uint2 DispatchThreadId : SV_DispatchThreadID)
{
	FDOFSetupParameters SetupParameters;
	SetupParameters.SceneColorTexture = SceneColorTexture;
	SetupParameters.SceneDepthTexture = SceneDepthTexture;
	SetupParameters.bOutputFullRes = false;
	SetupParameters.bOutputHalfRes = true;

	FDOFSetupOutput SetupResult = SetupDOF(SetupParameters, GroupId, GroupThreadIndex, DispatchThreadId);

	// Half res: color and CoC radius in separate targets with CONFIG_DOF_ALPHA, otherwise
	// rgb + CoC radius in alpha.
	if (SetupResult.bHalfResIsValid)
	{
		const uint2 HalfResPosition = SetupResult.HalfResOutputPixelPosition;
#if CONFIG_DOF_ALPHA
		Output1[HalfResPosition] = SetupResult.HalfResColor;
		Output2[HalfResPosition] = SetupResult.HalfResCocRadius;
#else
		Output1[HalfResPosition] = float4(SetupResult.HalfResColor.rgb, SetupResult.HalfResCocRadius);
#endif
	}
}
#else
// DIM_OUTPUT_RES_DIVISOR matched none of the handled values (0, 1 or 2): fail the compile.
#error Unknown resolution divisor.
#endif