313 lines
9.7 KiB
HLSL
313 lines
9.7 KiB
HLSL
// Copyright Epic Games, Inc. All Rights Reserved.
|
|
|
|
#include "Common.ush"
|
|
#include "SceneTextureParameters.ush"
|
|
#include "ReductionCommon.ush"
|
|
#include "/Engine/Public/WaveBroadcastIntrinsics.ush"
|
|
|
|
|
|
// Passed (minus one) to InitialTilePixelPositionForReduction2x2 by HZBBuildCS.
// NOTE(review): presumably the maximum number of mips written per dispatch (there are four outputs, _0.._3) - confirm.
#define MAX_MIP_BATCH_SIZE 4
|
|
// Edge length of the compute thread group (see numthreads on HZBBuildCS); its square sizes the groupshared arrays.
#define GROUP_TILE_SIZE 8
|
|
|
|
|
|
// xy = scale, zw = bias turning a dispatch thread id (HZBBuildCS) or pixel position (HZBBuildPS) into a buffer UV.
float4 DispatchThreadIdToBufferUV;
|
|
// UV delta between neighbouring source samples; the gathers offset by a quarter of it and clamp with it.
// NOTE(review): presumably 1 / (parent mip size) - confirm against the C++ that binds it.
float2 InvSize;
|
|
// Upper UV bound of the input viewport; the gathers clamp to (InputViewportMaxBound - InvSize).
float2 InputViewportMaxBound;
|
|
// xy is added as the pixel origin and zw is used as the clamp limit for source pixel coordinates.
uint4 PixelViewPortMinMax; // Max is inclusive
|
|
|
|
// Source texture whose red channel Gather4 reads.
Texture2D ParentTextureMip;
|
|
// Sampler used with ParentTextureMip, and with VisBufferTexture when uint gathers are supported.
SamplerState ParentTextureMipSampler;
|
|
|
|
#if PIXELSHADER
|
|
// Mip level handed to SampleLevel by Gather4 when DIM_USE_MIPINDEX is set; only declared for pixel shaders.
uint SourceMipIndex;
|
|
#endif
|
|
|
|
// Reads the red channel of the 2x2 footprint of Texture located at BufferUV.
// Components are returned in the order (-,+), (+,+), (+,-), (-,-) relative to the footprint,
// which is the order the manual SampleLevel path below fetches them in.
float4 Gather4(Texture2D Texture, SamplerState TextureSampler, float2 BufferUV)
{
	// The footprint is nudged back by a quarter of InvSize and then clamped to
	// (InputViewportMaxBound - InvSize): we don't want to sample outside of the viewport
	// when the view size has odd dimensions on X/Y axis.
	const float2 MaxFootprintUV = InputViewportMaxBound - InvSize;
	const float2 FootprintUV = min(BufferUV - 0.25f * InvSize, MaxFootprintUV);

#if (PIXELSHADER && DIM_USE_MIPINDEX)
	// Explicit mip selection: fetch the four corners one by one from SourceMipIndex.
	const float2 UVLo = FootprintUV;
	const float2 UVHi = FootprintUV + InvSize;

	float4 Quad;
	Quad.x = Texture.SampleLevel(TextureSampler, float2(UVLo.x, UVHi.y), SourceMipIndex).r; // (-, +)
	Quad.y = Texture.SampleLevel(TextureSampler, UVHi, SourceMipIndex).r;                   // (+, +)
	Quad.z = Texture.SampleLevel(TextureSampler, float2(UVHi.x, UVLo.y), SourceMipIndex).r; // (+, -)
	Quad.w = Texture.SampleLevel(TextureSampler, UVLo, SourceMipIndex).r;                   // (-, -)
	return Quad;
#else
	return Texture.GatherRed(TextureSampler, FootprintUV);
#endif
}
|
|
|
|
#if COMPUTESHADER
|
|
|
|
// Furthest-depth outputs, one UAV per mip written by this dispatch.
// _0 receives the per-thread minimum DeviceZ in HZBBuildCS; _1.._3 are written by OutputMipLevel for MipLevel 1..3.
// Only written when DIM_FURTHEST is set.
RWTexture2D<float> FurthestHZBOutput_0;
|
|
RWTexture2D<float> FurthestHZBOutput_1;
|
|
RWTexture2D<float> FurthestHZBOutput_2;
|
|
RWTexture2D<float> FurthestHZBOutput_3;
|
|
|
|
// Closest-depth outputs, same layout as above; every write goes through RoundUpF16 on the maximum DeviceZ.
// Only written when DIM_CLOSEST is set.
RWTexture2D<float> ClosestHZBOutput_0;
|
|
RWTexture2D<float> ClosestHZBOutput_1;
|
|
RWTexture2D<float> ClosestHZBOutput_2;
|
|
RWTexture2D<float> ClosestHZBOutput_3;
|
|
|
|
// One slot per thread of the group, used to exchange values between mip reduction steps.
// The minimum is stored as raw bits (asuint / asfloat at the access sites); this array is also the one
// named by FROXEL_SHARED_HASH_BUFFER_VAR when DIM_FROXELS is set.
groupshared uint SharedMinDeviceZ[GROUP_TILE_SIZE * GROUP_TILE_SIZE];
|
|
// Per-thread maximum DeviceZ, stored directly as float.
groupshared float SharedMaxDeviceZ[GROUP_TILE_SIZE * GROUP_TILE_SIZE];
|
|
|
|
// Returns DeviceZ converted to fp16, bumped by one in its fp16 bit pattern, and converted back to fp32.
// ClosestDeviceZ needs to be rounded up to nearest fp16 to be conservative.
float RoundUpF16(float DeviceZ)
{
	// Raw fp16 bits of the input as produced by f32tof16.
	const uint HalfBits = f32tof16(DeviceZ);

	// Incrementing the bit pattern selects the next fp16 code above it.
	// NOTE(review): this is the next larger value only for non-negative finite inputs - DeviceZ is assumed to be such.
	const uint BumpedHalfBits = HalfBits + 1;

	return f16tof32(BumpedHalfBits);
}
|
|
|
|
// Writes the reduced depths of one output pixel into the UAVs belonging to MipLevel (1..3).
//   MipLevel        - index of the mip to write; levels not compiled in (see DIM_MIP_LEVEL_COUNT) are ignored.
//   OutputPixelPos  - destination pixel in that mip.
//   FurthestDeviceZ - value stored as-is in FurthestHZBOutput_N when DIM_FURTHEST is set.
//   ClosestDeviceZ  - value stored through RoundUpF16 in ClosestHZBOutput_N when DIM_CLOSEST is set.
// Mip 0 is not handled here; the caller writes it directly.
void OutputMipLevel(uint MipLevel, uint2 OutputPixelPos, float FurthestDeviceZ, float ClosestDeviceZ)
{
#if DIM_CLOSEST
	// Same conservative rounding for every level, so do it once up front.
	const float ConservativeClosestZ = RoundUpF16(ClosestDeviceZ);
#endif

#if DIM_MIP_LEVEL_COUNT >= 2
	if (MipLevel == 1)
	{
	#if DIM_FURTHEST
		FurthestHZBOutput_1[OutputPixelPos] = FurthestDeviceZ;
	#endif
	#if DIM_CLOSEST
		ClosestHZBOutput_1[OutputPixelPos] = ConservativeClosestZ;
	#endif
		return;
	}
#endif

#if DIM_MIP_LEVEL_COUNT >= 3
	if (MipLevel == 2)
	{
	#if DIM_FURTHEST
		FurthestHZBOutput_2[OutputPixelPos] = FurthestDeviceZ;
	#endif
	#if DIM_CLOSEST
		ClosestHZBOutput_2[OutputPixelPos] = ConservativeClosestZ;
	#endif
		return;
	}
#endif

#if DIM_MIP_LEVEL_COUNT >= 4
	if (MipLevel == 3)
	{
	#if DIM_FURTHEST
		FurthestHZBOutput_3[OutputPixelPos] = FurthestDeviceZ;
	#endif
	#if DIM_CLOSEST
		ClosestHZBOutput_3[OutputPixelPos] = ConservativeClosestZ;
	#endif
		return;
	}
#endif
}
|
|
|
|
#if VIS_BUFFER_FORMAT == 1
|
|
|
|
// 64-bit VisBuffer, bound as a uint2 per pixel.
Texture2D<uint2> VisBufferTexture;

// Returns the green channel of the 2x2 VisBuffer footprint with its bits reinterpreted as floats.
//   BufferUV   - footprint position in UV space, used when the compiler supports uint gathers.
//   PixelCoord - coordinate of the footprint's (-,-) pixel, used by the per-pixel load fallback.
// Component order is (-,+), (+,+), (+,-), (-,-) in the fallback path.
float4 Gather4VisZ(float2 BufferUV, uint2 PixelCoord)
{
#if COMPILER_SUPPORTS_GATHER_UINT
	// Same quarter-InvSize offset and viewport clamp as the depth gather.
	const float2 MaxFootprintUV = InputViewportMaxBound - InvSize;
	const float2 FootprintUV = min(BufferUV - 0.25f * InvSize, MaxFootprintUV);

	const uint4 GreenBits = VisBufferTexture.GatherGreen(ParentTextureMipSampler, FootprintUV, 0);
	return asfloat(GreenBits);
#else
	// Workaround for Gather not being supported for R32G32_UINT on D3D11.
	// We can't alias the texture with R32G32_FLOAT as that isn't supported in Unreal.
	// Clamping the low corner to (max - 1) keeps the high corner (low + 1) within the inclusive max.
	const uint2 LoCoord = min(PixelCoord.xy, PixelViewPortMinMax.zw - 1);
	const uint2 HiCoord = LoCoord + 1;

	uint4 GreenBits;
	GreenBits.x = VisBufferTexture[uint2(LoCoord.x, HiCoord.y)].g; // (-, +)
	GreenBits.y = VisBufferTexture[HiCoord].g;                     // (+, +)
	GreenBits.z = VisBufferTexture[uint2(HiCoord.x, LoCoord.y)].g; // (+, -)
	GreenBits.w = VisBufferTexture[LoCoord].g;                     // (-, -)
	return asfloat(GreenBits);
#endif
}
|
|
|
|
|
|
#elif VIS_BUFFER_FORMAT == 3
|
|
|
|
// 64-bit VisBuffer, bound as a UlongType per pixel.
Texture2D<UlongType> VisBufferTexture;

// Returns the green component of the unpacked 2x2 VisBuffer footprint with its bits reinterpreted as floats.
//   BufferUV   - unused by this variant; kept so all Gather4VisZ variants share one signature.
//   PixelCoord - coordinate of the footprint's (-,-) pixel.
// Component order is (-,+), (+,+), (+,-), (-,-).
float4 Gather4VisZ(float2 BufferUV, uint2 PixelCoord)
{
	// Workaround for Gather not being supported for R32G32_UINT on D3D11.
	// We can't alias the texture with R32G32_FLOAT as that isn't supported in Unreal.
	// Clamping the low corner to (max - 1) keeps the high corner (low + 1) within the inclusive max.
	const uint2 LoCoord = min(PixelCoord.xy, PixelViewPortMinMax.zw - 1);
	const uint2 HiCoord = LoCoord + 1;

	uint4 GreenBits;
	GreenBits.x = UnpackUlongType(VisBufferTexture[uint2(LoCoord.x, HiCoord.y)]).g; // (-, +)
	GreenBits.y = UnpackUlongType(VisBufferTexture[HiCoord]).g;                     // (+, +)
	GreenBits.z = UnpackUlongType(VisBufferTexture[uint2(HiCoord.x, LoCoord.y)]).g; // (+, -)
	GreenBits.w = UnpackUlongType(VisBufferTexture[LoCoord]).g;                     // (-, -)
	return asfloat(GreenBits);
}
|
|
|
|
#else
|
|
|
|
// 32-bit, depth-only VisBuffer.
Texture2D<uint> VisBufferTexture;

// Returns the 2x2 VisBuffer footprint with its 32-bit values reinterpreted as floats.
//   BufferUV   - footprint position in UV space, used when the compiler supports uint gathers.
//   PixelCoord - coordinate of the footprint's (-,-) pixel, used by the per-pixel load fallback.
// Component order is (-,+), (+,+), (+,-), (-,-) in the fallback path.
float4 Gather4VisZ(float2 BufferUV, uint2 PixelCoord)
{
#if COMPILER_SUPPORTS_GATHER_UINT
	// Same quarter-InvSize offset and viewport clamp as the depth gather.
	const float2 MaxFootprintUV = InputViewportMaxBound - InvSize;
	const float2 FootprintUV = min(BufferUV - 0.25f * InvSize, MaxFootprintUV);

	const uint4 DepthBits = VisBufferTexture.Gather(ParentTextureMipSampler, FootprintUV, 0);
	return asfloat(DepthBits);
#else
	// Per-pixel loads; clamping the low corner to (max - 1) keeps the high corner (low + 1) within the inclusive max.
	const uint2 LoCoord = min(PixelCoord.xy, PixelViewPortMinMax.zw - 1);
	const uint2 HiCoord = LoCoord + 1;

	uint4 DepthBits;
	DepthBits.x = VisBufferTexture[uint2(LoCoord.x, HiCoord.y)].r; // (-, +)
	DepthBits.y = VisBufferTexture[HiCoord].r;                     // (+, +)
	DepthBits.z = VisBufferTexture[uint2(HiCoord.x, LoCoord.y)].r; // (+, -)
	DepthBits.w = VisBufferTexture[LoCoord].r;                     // (-, -)
	return asfloat(DepthBits);
#endif
}
|
|
|
|
#endif
|
|
|
|
// Froxel construction support, only compiled in when DIM_FROXELS is set.
// The macros below are defined right before FroxelBuild.ush is included and are not used elsewhere in this file,
// so they are presumably that header's configuration inputs - confirm against Froxel/FroxelBuild.ush.
#if DIM_FROXELS
|
|
|
|
// Both sizes equal the number of threads in one group (GROUP_TILE_SIZE squared).
#define FROXEL_HASH_BUFFER_SIZE (GROUP_TILE_SIZE * GROUP_TILE_SIZE)
|
|
#define FROXEL_HASH_THREAD_GROUP_SIZE (GROUP_TILE_SIZE * GROUP_TILE_SIZE)
|
|
|
|
// Names the groupshared array that the mip reduction in HZBBuildCS also writes to.
// NOTE(review): presumably the froxel hash reuses that LDS storage - verify there is no overlap in time with the reduction.
#define FROXEL_SHARED_HASH_BUFFER_VAR SharedMinDeviceZ
|
|
|
|
#include "Froxel/FroxelBuild.ush"
|
|
|
|
|
|
// Compile-time guard: fail the build if the thread group edge and the froxel tile size ever diverge.
#if GROUP_TILE_SIZE != FROXEL_TILE_SIZE
|
|
#error "The froxel construction algorithm assumes the group size matches the tile size, if this is changed this code needs to be updated"
|
|
#endif
|
|
|
|
#endif // DIM_FROXELS
|
|
|
|
|
|
// Compute entry point of the HZB build.
// Every thread gathers one 2x2 footprint of source depth (optionally combined with the VisBuffer depth),
// writes its min / max into mip 0 of the furthest / closest outputs, and, when DIM_MIP_LEVEL_COUNT > 1,
// the group keeps reducing 2x2 through groupshared memory to write mips 1..DIM_MIP_LEVEL_COUNT-1.
//   GroupId          - thread group coordinate; multiplied by GROUP_TILE_SIZE to get the group's output origin.
//   GroupThreadIndex - linear thread index in the group; also the thread's groupshared slot.
[numthreads(GROUP_TILE_SIZE, GROUP_TILE_SIZE, 1)]
void HZBBuildCS(
	uint2 GroupId : SV_GroupID,
	uint GroupThreadIndex : SV_GroupIndex)
{
#if DIM_MIP_LEVEL_COUNT == 1
	// Single mip: plain row-major mapping of the linear index onto the tile.
	uint2 GroupThreadId = uint2(GroupThreadIndex % GROUP_TILE_SIZE, GroupThreadIndex / GROUP_TILE_SIZE);
#else
	// NOTE(review): presumably places threads so that the four children of every reduced pixel sit at
	// GroupThreadIndex + i * ReduceBankSize, which is what the reduction loop below reads - confirm in ReductionCommon.ush.
	uint2 GroupThreadId = InitialTilePixelPositionForReduction2x2(MAX_MIP_BATCH_SIZE - 1, GroupThreadIndex);
#endif

	uint2 GroupOffset = GROUP_TILE_SIZE * GroupId;
	uint2 DispatchThreadId = GroupOffset + GroupThreadId;

	// Fetch the 2x2 source footprint for this thread's mip 0 pixel.
	float2 BufferUV = (DispatchThreadId + 0.5) * DispatchThreadIdToBufferUV.xy + DispatchThreadIdToBufferUV.zw;
	float4 DeviceZ = Gather4(ParentTextureMip, ParentTextureMipSampler, BufferUV);

	// Each output pixel covers two source pixels per axis, offset by the viewport min.
	uint2 SrcPixelCoord = (DispatchThreadId << 1) + PixelViewPortMinMax.xy;
#if (VIS_BUFFER_FORMAT == 1) || (VIS_BUFFER_FORMAT == 3)
	// Combine scene depth with VisBuffer depth, keeping the larger DeviceZ per sample.
	DeviceZ = max(DeviceZ, Gather4VisZ(BufferUV, SrcPixelCoord));
#elif VIS_BUFFER_FORMAT == 2
	// Depth comes from the VisBuffer alone.
	DeviceZ = Gather4VisZ(BufferUV, SrcPixelCoord);
#endif

	float MinDeviceZ = min(min3(DeviceZ.x, DeviceZ.y, DeviceZ.z), DeviceZ.w);
	float MaxDeviceZ = max(max3(DeviceZ.x, DeviceZ.y, DeviceZ.z), DeviceZ.w);

#if DIM_FROXELS
	{
		// whether or not any of the threads in the group is going to look at valid pixels, a lot don't in the HZB build because of POT rounding.
		// NOTE(review): PixelViewPortMinMax.zw is documented as inclusive, yet this test is strict; a group whose
		// first pixel equals the max is therefore skipped - confirm whether that is intended.
		const uint2 MinPixelCoord = (GroupOffset << 1) + PixelViewPortMinMax.xy;
		const bool bValidGroup = all(MinPixelCoord < PixelViewPortMinMax.zw);

		// 2x as each thread does 2x2 through Gather
		const uint2 LocalTileId = (2 * GroupThreadId) / (FROXEL_TILE_SIZE);
		const uint LocalLinearTileId = LocalTileId.y * 2 + LocalTileId.x;
		// 2x2 tiles per group
		const uint2 GroupTileOffset = GroupId * 2;

		if (bValidGroup)
		{
			HashBuildFroxelsFromDeviceZ(DeviceZ, GroupThreadIndex, LocalLinearTileId, GroupTileOffset);
		}
	}
#endif

	uint2 OutputPixelPos = DispatchThreadId;

	// Mip 0 is written straight from registers.
#if DIM_FURTHEST
	FurthestHZBOutput_0[OutputPixelPos] = MinDeviceZ;
#endif

#if DIM_CLOSEST
	ClosestHZBOutput_0[OutputPixelPos] = RoundUpF16(MaxDeviceZ);
#endif

#if DIM_MIP_LEVEL_COUNT != 1
	{
		// Publish this thread's values for the first reduction step.
		SharedMinDeviceZ[GroupThreadIndex] = asuint(MinDeviceZ);
		SharedMaxDeviceZ[GroupThreadIndex] = MaxDeviceZ;

	#if FEATURE_LEVEL >= FEATURE_LEVEL_SM6 || PLATFORM_SUPPORTS_SM6_0_WAVE_OPERATIONS
		const uint LaneCount = WaveGetLaneCount();
	#else
		// Actual wave size is unknown, assume the worst
		// (zero makes the barrier condition below true on every iteration).
		const uint LaneCount = 0u;
	#endif

		UNROLL
		for (uint MipLevel = 1; MipLevel < DIM_MIP_LEVEL_COUNT; ++MipLevel)
		{
			const uint TileSize = uint(GROUP_TILE_SIZE) >> MipLevel;
			// Number of threads producing a pixel of this mip; each consumes four slots of the previous step.
			const uint ReduceBankSize = TileSize * TileSize;

			// More waves than one wrote to LDS, need to sync.
			// (ReduceBankSize << 2) is the number of slots written by the previous step.
			if ((ReduceBankSize << 2u) > LaneCount)
			{
				GroupMemoryBarrierWithGroupSync();
			}

			BRANCH
			if (GroupThreadIndex < ReduceBankSize)
			{
				float4 ParentMinDeviceZ;
				float4 ParentMaxDeviceZ;
				// Slot 0 of the quad is this thread's own value, still held in registers.
				ParentMinDeviceZ[0] = MinDeviceZ;
				ParentMaxDeviceZ[0] = MaxDeviceZ;

				// The other three come from the banks above this thread's slot.
				UNROLL
				for (uint i = 1; i < 4; i++)
				{
					uint LDSIndex = GroupThreadIndex + i * ReduceBankSize;
					ParentMinDeviceZ[i] = asfloat(SharedMinDeviceZ[LDSIndex]);
					ParentMaxDeviceZ[i] = SharedMaxDeviceZ[LDSIndex];
				}

				MinDeviceZ = min(min3(ParentMinDeviceZ.x, ParentMinDeviceZ.y, ParentMinDeviceZ.z), ParentMinDeviceZ.w);
				MaxDeviceZ = max(max3(ParentMaxDeviceZ.x, ParentMaxDeviceZ.y, ParentMaxDeviceZ.z), ParentMaxDeviceZ.w);

				// Halve the coordinate once per level; only threads that stay active keep halving.
				OutputPixelPos = OutputPixelPos >> 1;
				OutputMipLevel(MipLevel, OutputPixelPos, MinDeviceZ, MaxDeviceZ);

				// Publish for the next level. Writes land below ReduceBankSize while this step's reads
				// are at or above it, so they do not touch slots still being read in this step.
				SharedMinDeviceZ[GroupThreadIndex] = asuint(MinDeviceZ);
				SharedMaxDeviceZ[GroupThreadIndex] = MaxDeviceZ;
			}
		}
	}
#endif
} // HZBBuildCS
|
|
|
|
|
|
#elif PIXELSHADER
|
|
|
|
// Pixel shader path of the HZB build: outputs the smallest DeviceZ of the 2x2 source footprint under this pixel.
//   SvPosition - pixel position; xy is mapped to a buffer UV with DispatchThreadIdToBufferUV.
//   OutColor   - receives the minimum, replicated to all four components.
void HZBBuildPS(float4 SvPosition : SV_POSITION, out float4 OutColor : SV_Target0)
{
	const float2 FootprintUV = SvPosition.xy * DispatchThreadIdToBufferUV.xy + DispatchThreadIdToBufferUV.zw;
	const float4 QuadDeviceZ = Gather4(ParentTextureMip, ParentTextureMipSampler, FootprintUV);

	// Reduce pairwise: (x, y) and (z, w) first, then the two partial results.
	const float MinXY = min(QuadDeviceZ.x, QuadDeviceZ.y);
	const float MinZW = min(QuadDeviceZ.z, QuadDeviceZ.w);
	const float FurthestDeviceZ = min(MinXY, MinZW);

	OutColor = FurthestDeviceZ.xxxx;
}
|
|
|
|
|
|
#else
|
|
#error Unknown shader frequency
|
|
|
|
#endif
|