Files
UnrealEngine/Engine/Shaders/Private/DistanceFieldObjectCulling.usf
2025-05-18 13:04:45 +08:00

416 lines
16 KiB
HLSL

// Copyright Epic Games, Inc. All Rights Reserved.
/*=============================================================================
DistanceFieldObjectCulling.usf
=============================================================================*/
#include "Common.ush"
#include "ComputeShaderUtils.ush"
#include "DeferredShadingCommon.ush"
#include "DistanceFieldLightingShared.ush"
#include "DistanceFieldAOShared.ush"
#include "DistanceField/GlobalDistanceFieldShared.ush"
// Index count of the bounding geometry; CullObjectsForViewCS copies it into slot 0 (IndexCount) of RWObjectIndirectArguments.
uint ObjectBoundingGeometryIndexCount;
// Number of objects in the current thread group that survived culling; incremented atomically by each surviving thread.
groupshared uint NumGroupObjects;
// First output slot in RWCulledObjectIndices reserved for the current thread group (previous value returned by the atomic add on RWObjectIndirectArguments[1]).
groupshared uint GroupBaseIndex;
// Group-local compacted list of surviving object indices, copied out to RWCulledObjectIndices once the base index is known.
groupshared uint GroupObjectIndices[UPDATEOBJECTS_THREADGROUP_SIZE];
/**
 * Culls scene distance field objects for the view and writes the indices of the survivors to RWCulledObjectIndices.
 * One thread processes one object. Survivors are first compacted in group-shared memory so that each thread group
 * issues a single global atomic (on RWObjectIndirectArguments[1]) to reserve its range of the output list.
 *
 * @param GroupIndex	Flattened index of the thread within its group.
 * @param GroupId		Thread group id, unwrapped together with GroupIndex into a linear thread index.
 */
[numthreads(UPDATEOBJECTS_THREADGROUP_SIZE, 1, 1)]
void CullObjectsForViewCS(
uint GroupIndex : SV_GroupIndex,
uint3 GroupId : SV_GroupID)
{
const uint ThreadIndex = GetUnWrappedDispatchThreadId(GroupId, GroupIndex, UPDATEOBJECTS_THREADGROUP_SIZE);
const uint ObjectIndex = ThreadIndex;
// Compile-time switch: 1 runs the culling path below, 0 passes every scene object straight through
#define USE_FRUSTUM_CULLING 1
#if USE_FRUSTUM_CULLING
if (ThreadIndex == 0)
{
// RWObjectIndirectArguments is zeroed by a clear before this shader, only need to set things that are non-zero (and are not read by this shader as that would be a race condition)
// IndexCount, NumInstances, StartIndex, BaseVertexIndex, FirstInstance
RWObjectIndirectArguments[0] = ObjectBoundingGeometryIndexCount;
}
// The first thread of every group resets that group's survivor counter
if (GroupIndex == 0)
{
NumGroupObjects = 0;
}
// Make the reset visible to the whole group before any thread appends
GroupMemoryBarrierWithGroupSync();
if (ObjectIndex < NumSceneObjects)
{
uint SourceIndex = ObjectIndex;
FDFObjectBounds DFObjectBounds = LoadDFObjectBounds(ObjectIndex);
const float3 TranslatedCenter = DFFastToTranslatedWorld(DFObjectBounds.Center, PrimaryView.PreViewTranslation);
float DistanceToViewSq = GetDistanceToCameraFromViewVectorSqr(PrimaryView.TranslatedWorldCameraOrigin - TranslatedCenter);
// Keep the object only if its bounding sphere is within AOMaxViewDistance of the camera
// and the sphere, grown by AOObjectMaxDistance, touches the view frustum
if (DistanceToViewSq < Square(AOMaxViewDistance + DFObjectBounds.SphereRadius)
&& ViewFrustumIntersectSphere(TranslatedCenter, DFObjectBounds.SphereRadius + AOObjectMaxDistance))
{
// The full object data is only loaded for objects that passed the cheaper bounds tests
FDFObjectData DFObjectData = LoadDFObjectData(SourceIndex);
// Per-object min (x) / max (y) draw distance, compared against the squared view distance.
// A limit below 0.0001 is treated as "no limit" on that side.
if ((DFObjectData.MinMaxDrawDistance2.x < 0.0001 || DistanceToViewSq > DFObjectData.MinMaxDrawDistance2.x)
&& (DFObjectData.MinMaxDrawDistance2.y < 0.0001 || DistanceToViewSq < DFObjectData.MinMaxDrawDistance2.y))
{
// Append to the group-local list; the atomic hands back this thread's slot
uint DestIndex;
InterlockedAdd(NumGroupObjects, 1U, DestIndex);
GroupObjectIndices[DestIndex] = SourceIndex;
}
}
}
// All appends must be finished before NumGroupObjects is read below
GroupMemoryBarrierWithGroupSync();
if (GroupIndex == 0)
{
// Reserve a contiguous range in the global list and bump the NumInstances slot of the indirect arguments
InterlockedAdd(RWObjectIndirectArguments[1], NumGroupObjects, GroupBaseIndex);
}
// Publish GroupBaseIndex to the rest of the group
GroupMemoryBarrierWithGroupSync();
// The first NumGroupObjects threads each copy one compacted entry to the global list
if (GroupIndex < NumGroupObjects)
{
uint SourceIndex = GroupObjectIndices[GroupIndex];
uint DestIndex = GroupBaseIndex + GroupIndex;
RWCulledObjectIndices[DestIndex] = SourceIndex;
}
#else
// Culling disabled: every scene object is emitted at its own index
if (ThreadIndex == 0)
{
// IndexCount, NumInstances, StartIndex, BaseVertexIndex, FirstInstance
RWObjectIndirectArguments[0] = ObjectBoundingGeometryIndexCount;
RWObjectIndirectArguments[1] = NumSceneObjects;
}
GroupMemoryBarrierWithGroupSync();
if (ObjectIndex < NumSceneObjects)
{
uint SourceIndex = ObjectIndex;
uint DestIndex = ObjectIndex;
RWCulledObjectIndices[DestIndex] = SourceIndex;
}
#endif
}
/** Min and Max depth for this tile, stored as float bit patterns (asuint) so that integer atomics can be used on them. */
groupshared uint IntegerTileMinZ;
groupshared uint IntegerTileMaxZ;
/**
 * Inner Min and Max depth for this tile, same encoding as above.
 * MinZ2 is the smallest depth at or beyond the midpoint of [MinZ, MaxZ]; MaxZ2 is the largest depth at or before it.
 */
groupshared uint IntegerTileMinZ2;
groupshared uint IntegerTileMaxZ2;
/** View rect min in xy, max in zw. */
uint4 ViewDimensions;
/** Number of tiles along x and y; used as the row stride when flattening a tile coordinate into a tile index. */
float2 NumGroups;
/** Per-tile output of BuildTileConesMain: cone axis in xyz, cosine of the cone half angle in w. */
RWStructuredBuffer<float4> RWTileConeAxisAndCos;
/** Per-tile output of BuildTileConesMain: two depth intervals measured along the cone axis, one in xy and one in zw. */
RWStructuredBuffer<float4> RWTileConeDepthRanges;
/**
 * Builds tile depth ranges and bounding cones.
 * One thread group covers one screen tile and one thread covers one downsampled depth sample. The group reduces
 * the depths into two intervals (split at the midpoint of the tile's depth range), then thread 0 derives the
 * tile's bounding cone and writes the cone and both intervals to RWTileConeAxisAndCos / RWTileConeDepthRanges.
 *
 * @param GroupId			Tile coordinate.
 * @param DispatchThreadId	Coordinate of the downsampled depth sample handled by this thread.
 * @param GroupThreadId		Position of the thread inside the tile.
 */
[numthreads(THREADGROUP_SIZEX, THREADGROUP_SIZEY, 1)]
void BuildTileConesMain(
uint3 GroupId : SV_GroupID,
uint3 DispatchThreadId : SV_DispatchThreadID,
uint3 GroupThreadId : SV_GroupThreadID)
{
// Flattened thread index within the tile
uint ThreadIndex = GroupThreadId.y * THREADGROUP_SIZEX + GroupThreadId.x;
// Sampling from the texture based off of the ViewRect size because the texture is created on a per-view basis
// NOTE(review): the UV below is scaled by View.BufferSizeAndInvSize.zw rather than a view rect size - confirm the comment above is still accurate
float2 BaseLevelScreenUV = (DispatchThreadId.xy + float2(.5f, .5f)) * DOWNSAMPLE_FACTOR * View.BufferSizeAndInvSize.zw;
float SceneDepth = GetDownsampledDepth(BaseLevelScreenUV);
// Initialize per-tile variables
// 0x7F7FFFFF is the bit pattern of the largest finite float, so any accepted depth lowers the min
if (ThreadIndex == 0)
{
IntegerTileMinZ = 0x7F7FFFFF;
IntegerTileMaxZ = 0;
IntegerTileMinZ2 = 0x7F7FFFFF;
IntegerTileMaxZ2 = 0;
}
GroupMemoryBarrierWithGroupSync();
// Use shared memory atomics to build the depth bounds for this tile
// Each thread is assigned to a pixel at this point
// Depths at or beyond AOMaxViewDistance do not contribute to the bounds.
// Comparing the raw bit patterns as uints matches float ordering only for non-negative values - assumes SceneDepth >= 0, TODO confirm
if (SceneDepth < AOMaxViewDistance)
{
InterlockedMin(IntegerTileMinZ, asuint(SceneDepth));
InterlockedMax(IntegerTileMaxZ, asuint(SceneDepth));
}
// The outer bounds must be complete before the midpoint is derived from them
GroupMemoryBarrierWithGroupSync();
float MinTileZ = asfloat(IntegerTileMinZ);
float MaxTileZ = asfloat(IntegerTileMaxZ);
float HalfZ = .5f * (MinTileZ + MaxTileZ);
// Compute a second min and max Z, clipped by HalfZ, so that we get two depth bounds per tile
// Splitting at HalfZ yields a near interval [MinTileZ, MaxTileZ2] and a far interval [MinTileZ2, MaxTileZ],
// which skips the empty gap between foreground and background and so produces fewer intersections
if (SceneDepth >= HalfZ && SceneDepth < AOMaxViewDistance)
{
InterlockedMin(IntegerTileMinZ2, asuint(SceneDepth));
}
if (SceneDepth <= HalfZ)
{
InterlockedMax(IntegerTileMaxZ2, asuint(SceneDepth));
}
GroupMemoryBarrierWithGroupSync();
float MinTileZ2 = asfloat(IntegerTileMinZ2);
float MaxTileZ2 = asfloat(IntegerTileMaxZ2);
// A single thread per tile computes the cone and writes the results
if (ThreadIndex == 0)
{
// TileConeVertex and TileConeAngleSin are computed below but never written to an output buffer
float3 TileConeVertex;
float3 TileConeAxis;
float TileConeAngleCos;
float TileConeAngleSin;
float4 ConeAxisDepthRanges;
{
// Presumably the tangents of the horizontal / vertical half field of view - TODO confirm against View.ViewToClip's layout
float2 ViewSize = float2(1 / View.ViewToClip[0][0], 1 / View.ViewToClip[1][1]);
// Normalized directions through the four tile corners on the z = 1 plane.
// x maps the tile edge from [0, NumGroups.x] to [-ViewSize.x, ViewSize.x]; y is flipped so that row 0 is at +ViewSize.y.
float3 TileCorner00 = normalize(float3((GroupId.x + 0) / NumGroups.x * ViewSize.x * 2 - ViewSize.x, ViewSize.y - (GroupId.y + 0) / NumGroups.y * ViewSize.y * 2, 1));
float3 TileCorner10 = normalize(float3((GroupId.x + 1) / NumGroups.x * ViewSize.x * 2 - ViewSize.x, ViewSize.y - (GroupId.y + 0) / NumGroups.y * ViewSize.y * 2, 1));
float3 TileCorner01 = normalize(float3((GroupId.x + 0) / NumGroups.x * ViewSize.x * 2 - ViewSize.x, ViewSize.y - (GroupId.y + 1) / NumGroups.y * ViewSize.y * 2, 1));
float3 TileCorner11 = normalize(float3((GroupId.x + 1) / NumGroups.x * ViewSize.x * 2 - ViewSize.x, ViewSize.y - (GroupId.y + 1) / NumGroups.y * ViewSize.y * 2, 1));
// Cone axis is the normalized sum of the corner directions
TileConeAxis = normalize(TileCorner00 + TileCorner10 + TileCorner01 + TileCorner11);
// NOTE(review): the half angle is measured against corner 00 only; the other corners are assumed not to lie further from the axis
TileConeAngleCos = dot(TileConeAxis, TileCorner00);
TileConeAngleSin = sqrt(1 - TileConeAngleCos * TileConeAngleCos);
float TileConeAngleTan = TileConeAngleSin / TileConeAngleCos;
// Cone expansion is disabled (0), which makes the pullback term below zero for any non-zero cone angle
float ConeExpandDistance = 0;
float VertexPullbackLength = ConeExpandDistance / TileConeAngleTan;
// Length of the segment along the cone axis from the origin to the plane z = View.NearPlane
float DistanceToNearPlane = length(TileConeAxis / TileConeAxis.z * View.NearPlane);
// 1 / cos(AngleBetweenTileCenterAndViewForward)
float InvCosTileAngle = 1.0f / TileConeAxis.z;
// Scale and bias applied to the tile depths to express them as distances along the cone axis
float ConeAxisDistanceMultiply = InvCosTileAngle;
float ConeAxisDistanceAdd = VertexPullbackLength + DistanceToNearPlane;
// xy = near interval [MinTileZ, MaxTileZ2], zw = far interval [MinTileZ2, MaxTileZ]
ConeAxisDepthRanges.x = ConeAxisDistanceMultiply * (MinTileZ - ConeExpandDistance) + ConeAxisDistanceAdd;
ConeAxisDepthRanges.y = ConeAxisDistanceMultiply * (MaxTileZ2 + ConeExpandDistance) + ConeAxisDistanceAdd;
ConeAxisDepthRanges.z = ConeAxisDistanceMultiply * (MinTileZ2 - ConeExpandDistance) + ConeAxisDistanceAdd;
ConeAxisDepthRanges.w = ConeAxisDistanceMultiply * (MaxTileZ + ConeExpandDistance) + ConeAxisDistanceAdd;
// Pull back cone vertex to contain potential samples
TileConeVertex = float3(0, 0, 0) - TileConeAxis * VertexPullbackLength;
}
// Row-major tile index; NumGroups.x is the number of tiles per row
uint TileIndex = GroupId.y * NumGroups.x + GroupId.x;
if (IntegerTileMinZ > IntegerTileMaxZ)
{
// Guard against IntegerTileMinZ never getting updated
// (no sample in the tile was closer than AOMaxViewDistance): write a zero axis, cos = 1 and empty depth ranges
RWTileConeAxisAndCos[TileIndex] = float4(0, 0, 0, 1);
RWTileConeDepthRanges[TileIndex] = 0;
}
else
{
RWTileConeAxisAndCos[TileIndex] = float4(TileConeAxis, TileConeAngleCos);
RWTileConeDepthRanges[TileIndex] = ConeAxisDepthRanges;
}
}
}
/** Per-object values handed from ObjectCullVS to ObjectCullPS. Both members are nointerpolation, so they are not interpolated across the primitive. */
struct FObjectCullVertexOutput
{
// xyz: center of the object's bounds in translated world space, w: the object's bounding sphere radius (without any expansion)
nointerpolation float4 TranslatedPositionAndRadius : TEXCOORD0;
// x: object index read from CulledObjectIndices, y: instance index (position in the culled object list)
nointerpolation uint2 ObjectIndexInstanceIndex : TEXCOORD1;
};
/** Extra scale applied to the expanded bounding radius when placing the bounding geometry in ObjectCullVS. */
float ConservativeRadiusScale;

/**
 * Vertex shader used when culling objects into screenspace tile lists.
 * Each instance stands for one entry of CulledObjectIndices. The incoming vertex is scaled by the object's
 * expanded bounding radius and moved to the object's center, and the unexpanded sphere plus both indices
 * are forwarded to the pixel shader.
 *
 * @param InPosition	Vertex of the bounding geometry, scaled by the effective radius around the object center.
 * @param InstanceIndex	Index into CulledObjectIndices.
 * @param Output		Per-object data for ObjectCullPS.
 * @param OutPosition	Clip space position of the vertex.
 */
void ObjectCullVS(
	float4 InPosition : ATTRIBUTE0,
	uint InstanceIndex : SV_InstanceID,
	out FObjectCullVertexOutput Output,
	out float4 OutPosition : SV_POSITION
	)
{
	// Map the drawn instance back to the object it represents
	const uint SceneObjectIndex = CulledObjectIndices[InstanceIndex];

	//@todo - implement ConservativelyBoundSphere
	FDFObjectBounds Bounds = LoadDFObjectBounds(SceneObjectIndex);
	const float3 SphereCenter = DFFastToTranslatedWorld(Bounds.Center, PrimaryView.PreViewTranslation);

	//@todo - expand to handle conservative rasterization
	const float ExpandedRadius = (Bounds.SphereRadius + AOObjectMaxDistance) * ConservativeRadiusScale;

	// Place the bounding geometry around the object and project it
	const float3 VertexTranslatedWorld = InPosition.xyz * ExpandedRadius + SphereCenter;
	OutPosition = mul(float4(VertexTranslatedWorld, 1), PrimaryView.TranslatedWorldToClip);

	// The pixel shader receives the unexpanded sphere and applies its own expansion
	Output.TranslatedPositionAndRadius = float4(SphereCenter, Bounds.SphereRadius);
	Output.ObjectIndexInstanceIndex = uint2(SceneObjectIndex, InstanceIndex);
}
/**
 * Used for object <-> tile culling.
 * Tests one object against the slice of a tile cone delimited by ConeDepthRange. The slice is approximated by a
 * bounding sphere, which is tested first against the object's volume box and then against the object's distance field.
 *
 * @param TileConeVertex			Apex of the tile cone in view space.
 * @param TileConeAxis				Axis of the tile cone in view space.
 * @param TileConeAngleCos			Cosine of the cone half angle.
 * @param TileConeAngleSin			Sine of the cone half angle.
 * @param ConeDepthRange			x = start, y = end of the slice, as distances along the cone axis.
 * @param ConeAxisDistanceMinMax	Extent of the object's sphere along the cone axis. Despite the name, ObjectCullPS passes the far extent in x and the near extent in y.
 * @param ObjectIndex				Object whose distance field data is loaded.
 * @return true if the object may affect the slice.
 */
bool IntersectObjectWithConeDepthRange(
	float3 TileConeVertex,
	float3 TileConeAxis,
	float TileConeAngleCos,
	float TileConeAngleSin,
	float2 ConeDepthRange,
	float2 ConeAxisDistanceMinMax,
	uint ObjectIndex)
{
	// Cheap reject: the sphere's extent along the axis has to overlap the slice
	const bool bOverlapsDepthRange = ConeAxisDistanceMinMax.x > ConeDepthRange.x && ConeAxisDistanceMinMax.y < ConeDepthRange.y;

	BRANCH
	if (!bOverlapsDepthRange)
	{
		return false;
	}

#define USE_DISTANCE_FIELD_FOR_OBJECT_CULLING 1
#if USE_DISTANCE_FIELD_FOR_OBJECT_CULLING
	FDFObjectData ObjectData = LoadDFObjectData(ObjectIndex);
	float4x4 TranslatedWorldToVolume = DFFastToTranslatedWorld(ObjectData.WorldToVolume, PrimaryView.PreViewTranslation);

	// Bounding sphere of the slice: centered halfway between the two depths on the cone axis
	const float3 SliceCenterView = TileConeVertex + TileConeAxis * (.5f * (ConeDepthRange.x + ConeDepthRange.y));
	const float3 SliceCenterTranslatedWorld = mul(float4(SliceCenterView.xyz, 1), View.ViewToTranslatedWorld).xyz;

	// The radius has to reach the rim of the cone at the far end of the slice
	const float HalfSliceLength = .5f * (ConeDepthRange.y - ConeDepthRange.x);
	const float FarRimRadius = ConeDepthRange.y * TileConeAngleSin / TileConeAngleCos;
	const float SliceSphereRadius = sqrt(HalfSliceLength * HalfSliceLength + FarRimRadius * FarRimRadius);

	// Objects only matter within AOObjectMaxDistance of the slice
	const float InfluenceRadius = SliceSphereRadius + AOObjectMaxDistance;

	// First test against the object's volume box
	const float3 SliceCenterVolume = mul(float4(SliceCenterTranslatedWorld, 1), TranslatedWorldToVolume).xyz;
	const float DistanceToBox = ComputeDistanceFromBoxToPoint(-ObjectData.VolumePositionExtent, ObjectData.VolumePositionExtent, SliceCenterVolume) * ObjectData.VolumeScale;

	BRANCH
	if (!(DistanceToBox < InfluenceRadius))
	{
		return false;
	}

	// Then sample the distance field at the closest point inside the volume and add back the distance that was clamped away
	const float3 ClampedVolumePosition = clamp(SliceCenterVolume, -ObjectData.VolumePositionExtent, ObjectData.VolumePositionExtent);
	const float ClampedAwayDistance = length(SliceCenterVolume - ClampedVolumePosition);
	const float DistanceToSurface = (DistanceToMeshSurfaceStandalone(ClampedVolumePosition, ObjectData) + ClampedAwayDistance) * ObjectData.VolumeScale;

	return DistanceToSurface < InfluenceRadius;
#else
	return true;
#endif
}
// Read-only per-tile cone data consumed by ObjectCullPS: cone axis in xyz, cosine of the half angle in w.
// Presumably the buffer written through RWTileConeAxisAndCos - binding is done outside this file.
StructuredBuffer<float4> TileConeAxisAndCos;
// Read-only per-tile depth intervals along the cone axis (one in xy, one in zw); presumably the buffer written through RWTileConeDepthRanges.
StructuredBuffer<float4> TileConeDepthRanges;
// Per culled object (indexed by instance index): number of tiles found to intersect it, incremented atomically by ObjectCullPS.
RWStructuredBuffer<uint> RWNumCulledTilesArray;
// Per culled object: first entry of the object's range in the culled tile data, written by ComputeCulledTilesStartOffsetCS.
RWStructuredBuffer<uint> RWCulledTilesStartOffsetArray;
// Culled tile entries, CULLED_TILE_DATA_STRIDE uints each: element 0 is the tile index, element 1 is the object index.
RWBuffer<uint> RWCulledTileDataArray;
/**
 * Intersects a single object with the tile and adds to the intersection list if needed.
 * The pixel position is used directly as the tile coordinate. When SCATTER_CULLING_COUNT_PASS is set only
 * the per-object tile counter is incremented; otherwise a slot is reserved through the same counter and the
 * (tile, object) pair is written to RWCulledTileDataArray.
 *
 * @param Input		Per-object data from ObjectCullVS.
 * @param SVPos		Pixel position; xy truncated to integers is the tile coordinate.
 * @param OutColor	Always written as 0.
 */
void ObjectCullPS(
	FObjectCullVertexOutput Input,
	in float4 SVPos : SV_POSITION,
	out float4 OutColor : SV_Target0)
{
	OutColor = 0;

	const uint2 TileCoord = (uint2)SVPos.xy;
	const uint TileIndex = TileCoord.y * NumGroups.x + TileCoord.x;

	const float4 AxisAndCos = TileConeAxisAndCos[TileIndex];
	const float4 AxisDepthRanges = TileConeDepthRanges[TileIndex];

	// The cone apex sits at the view space origin
	const float3 ConeVertex = 0;
	const float3 ConeAxis = AxisAndCos.xyz;
	const float ConeCos = AxisAndCos.w;
	const float ConeSin = sqrt(1 - ConeCos * ConeCos);

	// Bring the object's sphere into view space, where the cone lives
	const float3 SphereCenterView = mul(float4(Input.TranslatedPositionAndRadius.xyz, 1), View.TranslatedWorldToView).xyz;

	// A value of 1 is conservative, but has a huge impact on performance
	const float RadiusScale = .5f;
	const float4 ExpandedSphere = float4(SphereCenterView, Input.TranslatedPositionAndRadius.w + RadiusScale * AOObjectMaxDistance);

	if (!SphereIntersectCone(ExpandedSphere, ConeVertex, ConeAxis, ConeCos, ConeSin))
	{
		return;
	}

	// Extent of the sphere along the cone axis; x carries the far end and y the near end
	const float AxisDistance = dot(ExpandedSphere.xyz - ConeVertex, ConeAxis);
	const float2 AxisDistanceMinMax = float2(AxisDistance + ExpandedSphere.w, AxisDistance - ExpandedSphere.w);

	const uint ObjectIndex = Input.ObjectIndexInstanceIndex.x;
	const uint InstanceIndex = Input.ObjectIndexInstanceIndex.y;

	// Try the interval stored in xy first and only fall back to the one in zw on a miss
	bool bHitsTile = IntersectObjectWithConeDepthRange(ConeVertex, ConeAxis, ConeCos, ConeSin, AxisDepthRanges.xy, AxisDistanceMinMax, ObjectIndex);

	if (!bHitsTile)
	{
		bHitsTile = IntersectObjectWithConeDepthRange(ConeVertex, ConeAxis, ConeCos, ConeSin, AxisDepthRanges.zw, AxisDistanceMinMax, ObjectIndex);
	}

	if (!bHitsTile)
	{
		return;
	}

#if SCATTER_CULLING_COUNT_PASS
	// Counting pass: only tally how many tiles this object touches
	InterlockedAdd(RWNumCulledTilesArray[InstanceIndex], 1);
#else
	// Write pass: the counter doubles as the slot allocator inside the object's range
	uint SlotInObjectRange;
	InterlockedAdd(RWNumCulledTilesArray[InstanceIndex], 1, SlotInObjectRange);

	const uint ObjectRangeStart = CulledTilesStartOffsetArray[InstanceIndex];
	const uint EntryBase = (ObjectRangeStart + SlotInObjectRange) * CULLED_TILE_DATA_STRIDE;
	RWCulledTileDataArray[EntryBase + 0] = TileIndex;
	RWCulledTileDataArray[EntryBase + 1] = ObjectIndex;
#endif
}
// Indirect arguments filled by ComputeCulledTilesStartOffsetCS: element 0 accumulates the total number of
// cone trace thread groups over all culled objects, elements 1 and 2 are set to 1.
RWBuffer<uint> RWObjectTilesIndirectArguments;
// Read-only per culled object tile counts; presumably the buffer filled through RWNumCulledTilesArray - binding is done outside this file.
StructuredBuffer<uint> NumCulledTilesArray;
// Thread group size of ComputeCulledTilesStartOffsetCS; falls back to 1 when no value is supplied at compile time.
#ifndef COMPUTE_START_OFFSET_GROUP_SIZE
#define COMPUTE_START_OFFSET_GROUP_SIZE 1
#endif
/**
 * Converts the per-object tile counts into start offsets inside RWCulledTileDataArray.
 * One thread handles one culled object. Each object's range is rounded up to a whole number of cone trace
 * thread groups (CONE_TRACE_TILES_PER_THREADGROUP tiles each); the total group count is accumulated in
 * RWObjectTilesIndirectArguments[0] and the unused tail of every range is filled with INVALID_TILE_INDEX.
 *
 * @param GroupIndex	Flattened index of the thread within its group.
 * @param GroupId		Thread group id, unwrapped together with GroupIndex into a linear thread index.
 */
[numthreads(COMPUTE_START_OFFSET_GROUP_SIZE, 1, 1)]
void ComputeCulledTilesStartOffsetCS(
	uint GroupIndex : SV_GroupIndex,
	uint3 GroupId : SV_GroupID)
{
	const uint CulledObjectCount = GetCulledNumObjects();
	const uint ThreadIndex = GetUnWrappedDispatchThreadId(GroupId, GroupIndex, COMPUTE_START_OFFSET_GROUP_SIZE);
	const uint ObjectIndex = ThreadIndex;

	if (ObjectIndex < CulledObjectCount)
	{
		const uint TileCount = NumCulledTilesArray[ObjectIndex];

		// Round up to whole cone trace thread groups
		const uint ThreadGroupCount = (TileCount + CONE_TRACE_TILES_PER_THREADGROUP - 1) / CONE_TRACE_TILES_PER_THREADGROUP;
		const uint PaddedTileCount = ThreadGroupCount * CONE_TRACE_TILES_PER_THREADGROUP;

		// Reserve the groups; the previous total is where this object's range begins
		uint FirstThreadGroup;
		InterlockedAdd(RWObjectTilesIndirectArguments[0], ThreadGroupCount, FirstThreadGroup);

		const uint RangeStart = FirstThreadGroup * CONE_TRACE_TILES_PER_THREADGROUP;
		RWCulledTilesStartOffsetArray[ObjectIndex] = RangeStart;

		// Pad remaining entries with INVALID_TILE_INDEX so we can skip computing them in the cone tracing pass
		// NOTE(review): padding entries store the culled-list index as the object, whereas ObjectCullPS stores the
		// index read from CulledObjectIndices - presumably irrelevant because invalid tiles are skipped; confirm
		for (uint TileSlot = TileCount; TileSlot < PaddedTileCount; TileSlot++)
		{
			const uint EntryBase = (RangeStart + TileSlot) * CULLED_TILE_DATA_STRIDE;
			RWCulledTileDataArray[EntryBase + 0] = INVALID_TILE_INDEX;
			RWCulledTileDataArray[EntryBase + 1] = ObjectIndex;
		}
	}

	// The remaining two indirect argument slots are always 1
	if (ThreadIndex == 0)
	{
		RWObjectTilesIndirectArguments[1] = 1;
		RWObjectTilesIndirectArguments[2] = 1;
	}
}