// Copyright Epic Games, Inc. All Rights Reserved.

#define SPLIT_WORK_QUEUE 1 // TODO: Remove once shader rewriter has been fixed (UE-202409)
#define OCCLUDED_PATCHES_QUEUE (CULLING_PASS == 1) // TODO: Remove once shader rewriter has been fixed (UE-202409)

#include "../Common.ush"
#include "../SceneData.ush"
#include "NaniteAttributeDecode.ush"
#include "NaniteTessellation.ush"
#include "NaniteHZBCull.ush"
#include "NaniteCullingCommon.ush"
#include "NaniteVertexDeformation.ush"
#if VIRTUAL_TEXTURE_TARGET
#include "../VirtualShadowMaps/VirtualShadowMapPageOverlap.ush"
#endif

#if NANITE_TESSELLATION
#if PATCHSPLIT_PASS

// [ Nießner and Loop 2021, "Patch-Based Occlusion Culling for Hardware Tessellation" ]
// The input cone bounds the normalized displacement directions.
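// Conservatively expands [BoundsMin, BoundsMax] by a displacement in [DisplacementMin, DisplacementMax]
// applied along any unit direction inside the cone (axis ConeCenter, cosine of half-angle ConeCos).
// Per axis, the reachable component of a cone direction lies between cos(a+b) and cos(a-b), where
// a = acos(|ConeCenter|) and b = acos(ConeCos); A and B below evaluate those extremes without trig.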
void DisplaceBounds( float3 ConeCenter, float ConeCos, float DisplacementMin, float DisplacementMax, inout float3 BoundsMin, inout float3 BoundsMax )
{
	// A (-/+) B == cos( acos(ConeCenter) (+/-) acos(ConeCos) )
	float3 A = ConeCos * abs( ConeCenter );
	float3 B = sqrt( saturate( 1 - Pow2( ConeCos ) ) * saturate( 1 - Pow2( ConeCenter ) ) );

	float3 CapMin = A - B;
	float3 CapMax = A + B;

	CapMax = select( abs( ConeCenter ) >= ConeCos, 1.0, CapMax );

	float3 DisplaceMin3 = ( DisplacementMin > 0 ? CapMin : CapMax ) * DisplacementMin;
	float3 DisplaceMax3 = ( DisplacementMax > 0 ? CapMax : CapMin ) * DisplacementMax;

	BoundsMin += select( ConeCenter >= 0, DisplaceMin3, -DisplaceMax3 );
	BoundsMax += select( ConeCenter >= 0, DisplaceMax3, -DisplaceMin3 );
}

uint RegularMaterialRasterSlotCount;
StructuredBuffer<FNaniteRasterBinMeta> RasterBinMeta;

Buffer<uint> CurrentIndirectArgs;
RWBuffer<uint> NextIndirectArgs;

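// Zero-based split recursion depth for this dispatch. CurrentIndirectArgs holds this level's patch
// range in SplitWorkQueue; NextIndirectArgs receives the args for level Level + 1.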
uint Level;

#if WRITE_STATS
RWStructuredBuffer<FNaniteStats> OutStatsBuffer;
#endif

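// One work item in the recursive patch split queue: an encoded source triangle plus the barycentric
// sub-triangle it covers, and the tessellation pattern chosen for it once it has been classified.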
struct FSplitTask
{
	FTessellatedPatch TessellatedPatch;

	uint4 Encoded;
	uint QueueOffset;

	void Init( uint VisibleClusterIndex, uint TriIndex );

	template< typename QueueType >
	bool Load( QueueType WorkQueue, uint Offset );
	template< typename QueueType >
	void Store( QueueType WorkQueue, uint Offset );
	template< typename QueueType >
	void Clear( QueueType WorkQueue, uint Offset );

	uint Run();

	FSplitTask CreateChild( uint ParentLaneIndex );
	void RunChild( inout FSplitTask ParentTask, bool bActive, uint LocalItemIndex );
};

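// Encoded.x packs the visible cluster index with the 7-bit triangle index; Encoded.yzw hold the three
// corner barycentrics in fixed point, initialized here to the full triangle (1,0,0), (0,1,0), (0,0,1).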
void FSplitTask::Init( uint VisibleClusterIndex, uint TriIndex )
{
	Encoded.x = ( VisibleClusterIndex << 7 ) | TriIndex;
	Encoded.y = BarycentricMax;
	Encoded.z = BarycentricMax << 16;
	Encoded.w = 0;
}

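// A queue entry of all ~0u marks an empty/cleared slot (see Clear and ClearQueue); Load returns
// whether the slot actually holds a live patch.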
template< typename QueueType >
bool FSplitTask::Load( QueueType WorkQueue, uint Offset )
{
	checkSlow( Offset < WorkQueue.Size );
	QueueOffset = Offset;
	Encoded = WorkQueue.DataBuffer_Load4( Offset * 16 );

	return
		Encoded.x != ~0u &&
		Encoded.y != ~0u &&
		Encoded.z != ~0u &&
		Encoded.w != ~0u;
}

template< typename QueueType >
void FSplitTask::Store( QueueType WorkQueue, uint Offset )
{
	checkSlow(
		Encoded.x != ~0u &&
		Encoded.y != ~0u &&
		Encoded.z != ~0u &&
		Encoded.w != ~0u );

	WorkQueue.DataBuffer_Store4( Offset * 16, Encoded );
}

template< typename QueueType >
void FSplitTask::Clear( QueueType WorkQueue, uint Offset )
{
	// TODO: This could be done at if( bIsBoundary ) at the end of the DistributeWork loop if latency is important.
	WorkQueue.ReleaseTask();
	if( QueueOffset != ~0u )
		WorkQueue.DataBuffer_Store4( Offset * 16, ~0u );
}

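// Culls one candidate patch and classifies it: occluded patches are re-queued for later re-testing,
// visible patches that are small enough are emitted as dice-ready patches, and larger ones return
// the number of child sub-patches for DistributeWork to generate via RunChild.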
uint FSplitTask::Run()
{
#if WRITE_STATS
	WaveInterlockedAddScalar(OutStatsBuffer[0].NumCandidateSplitPatches, 1u);
#endif

	FSplitPatch Patch;
	Patch.Decode( Encoded );

	FVisibleCluster VisibleCluster = GetVisibleCluster( Patch.VisibleClusterIndex, VIRTUAL_TEXTURE_TARGET );
	FInstanceSceneData InstanceData = GetInstanceSceneDataUnchecked( VisibleCluster.InstanceId );
	FPrimitiveSceneData PrimitiveData = GetPrimitiveData( InstanceData.PrimitiveId );
	FNaniteView NaniteView = GetNaniteView( VisibleCluster.ViewId );

	FInstanceDynamicData InstanceDynamicData = CalculateInstanceDynamicData( NaniteView, InstanceData );

	FCluster Cluster = GetCluster( VisibleCluster.PageIndex, VisibleCluster.ClusterIndex );

	const bool bWPOEnabled = (VisibleCluster.Flags & NANITE_CULLING_FLAG_ENABLE_WPO) != 0;

	uint3 VertIndexes = DecodeTriangleIndices( Cluster, Patch.TriIndex );

	const uint MaxTexCoords = 0; // UVs are unused here
	FNanitePostDeformVertex Verts[3];
	FetchAndDeformLocalNaniteTriangle( PrimitiveData, InstanceData, GetInstanceViewData(VisibleCluster.InstanceId, NaniteView.SceneRendererPrimaryViewId), Cluster, VisibleCluster, VertIndexes, MaxTexCoords, Verts );

	// TODO: Handle PrimitiveData.MaxWPOExtent?
#if DISABLE_DISPLACEMENT_BOUNDS
	const float DisplacementMin = 0.0f;
	const float DisplacementMax = 0.0f;
#else
	const float DisplacementMin = PrimitiveData.MinMaterialDisplacement;
	const float DisplacementMax = PrimitiveData.MaxMaterialDisplacement;
#endif

	float3 BoundsMin = +INFINITE_FLOAT;
	float3 BoundsMax = -INFINITE_FLOAT;

	float3 AvgNormal = 0;
	float3 PointView[3];
	float3 NormalLocal[3];

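	// Visit the three patch corners: accumulate a local-space AABB and an average-normal cone for the
	// displacement bounds, and keep view-space positions for computing the tessellation factors.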
	UNROLL
	for( int i = 0; i < 3; i++ )
	{
		float3 PointPostDeform =
			Verts[0].Position * Patch.Barycentrics[i].x +
			Verts[1].Position * Patch.Barycentrics[i].y +
			Verts[2].Position * Patch.Barycentrics[i].z;

		BoundsMin = min( BoundsMin, PointPostDeform );
		BoundsMax = max( BoundsMax, PointPostDeform );

		float3 PointTranslatedWorld = mul( float4( PointPostDeform, 1 ), InstanceDynamicData.LocalToTranslatedWorld ).xyz;
		PointView[i] = mul( float4( PointTranslatedWorld, 1 ), NaniteView.TranslatedWorldToView ).xyz;

		NormalLocal[i] =
			Verts[0].TangentBasis.TangentZ * Patch.Barycentrics[i].x +
			Verts[1].TangentBasis.TangentZ * Patch.Barycentrics[i].y +
			Verts[2].TangentBasis.TangentZ * Patch.Barycentrics[i].z;
		NormalLocal[i] = normalize( NormalLocal[i] );

		AvgNormal += NormalLocal[i];
	}
	AvgNormal = normalize( AvgNormal );

	float ConeCos = min3(
		dot( AvgNormal, NormalLocal[0] ),
		dot( AvgNormal, NormalLocal[1] ),
		dot( AvgNormal, NormalLocal[2] ) );

	DisplaceBounds( AvgNormal, ConeCos, DisplacementMin, DisplacementMax, BoundsMin, BoundsMax );

#if 0
	// Back face cull
	float3x3 M = { PointView[0], PointView[1], PointView[2] };
	bool bVisible = determinant( M ) > 0;
	if( !bVisible )
		return 0;
#endif

	float3 BoundsCenter = 0.5 * ( BoundsMax + BoundsMin );
	float3 BoundsExtent = 0.5 * ( BoundsMax - BoundsMin );

	// Extend the bounds for WPO
	// NOTE: always extend the bounds if any material ignores the Enable WPO flag
	BoundsExtent += GetLocalMaxWPOExtent( PrimitiveData, InstanceData, bWPOEnabled );

	FBoxCull Cull;
	Cull.Init( NaniteView, BoundsCenter, BoundsExtent, InstanceData.NonUniformScale, InstanceDynamicData.LocalToTranslatedWorld, InstanceDynamicData.PrevLocalToTranslatedWorld );
	Cull.Distance();
	Cull.GlobalClipPlane();

#if VIRTUAL_TEXTURE_TARGET
	const bool bCacheAsStatic = (VisibleCluster.Flags & NANITE_CULLING_FLAG_CACHE_AS_STATIC) != 0u;
	// If we're rendering into the static cache, it's not safe to use the receiver mask as we may cache that (full) page
	Cull.bUseReceiverMask = Cull.bUseReceiverMask && !bCacheAsStatic;
	Cull.PageFlagMask = GetPageFlagMaskForRendering(bCacheAsStatic, InstanceData.InstanceId, NaniteView.SceneRendererPrimaryViewId);
#endif

	Cull.FrustumHZB( true );

	if( !Cull.bIsVisible )
		return 0;

	const float LowTessDistance = CalcDisplacementLowTessDistance( PrimitiveData, InstanceData, NaniteView );
	float3 TessFactors = GetTessFactors( NaniteView, PointView, LowTessDistance );

#if 0 // Avoid infinite recursion
	for( int i = 0; i < 3; i++ )
	{
		float3 Delta = Patch.Barycentrics[i] - Patch.Barycentrics[ (i + 1) % 3 ];
		bool bEdgeTooShort = min3( Delta.x, Delta.y, Delta.z ) < (2.0 / BarycentricMax);
		TessFactors[i] = bEdgeTooShort ? 1.0 : TessFactors[i];
	}
#endif

	bool bNeedsSplitting = max3( TessFactors.x, TessFactors.y, TessFactors.z ) > NANITE_TESSELLATION_TABLE_SIZE;

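	// Two-pass occlusion culling: a patch that failed the HZB test in the main pass is pushed to
	// OccludedPatches so it can be re-tested in a later pass; otherwise it is either emitted as a
	// dice-ready patch or split into child patches below.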
	if( Cull.bWasOccluded )
	{
#if CULLING_PASS == CULLING_PASS_OCCLUSION_MAIN
		uint WriteOffset = OccludedPatches.Add();
		if( WriteOffset < OccludedPatches.Size )
		{
			Store( OccludedPatches, WriteOffset );
		}
#endif
	}
	else
	{
#if WRITE_STATS
		WaveInterlockedAddScalar(OutStatsBuffer[0].NumVisibleSplitPatches, 1u);
#endif

		if( bNeedsSplitting )
			TessFactors = min( GetSplitFactors( TessFactors ), 8 );

		TessellatedPatch.Init( TessFactors, Encoded.yzw, false );

		if( !bNeedsSplitting )
		{
			uint WriteOffset;
			WaveInterlockedAddScalar_( RWVisiblePatchesArgs[3], 1, WriteOffset );
			if( WriteOffset < VisiblePatchesSize )
			{
#if NANITE_TESSELLATION_PATCH_REFS
				RWVisiblePatches.Store2( WriteOffset * 8, uint2( QueueOffset, TessellatedPatch.GetPattern() ) );
				SplitWorkQueue.DataBuffer_Store4( QueueOffset * 16, Encoded );
				QueueOffset = ~0u;
#else
				RWVisiblePatches.Store4( WriteOffset * 16, Encoded );
#endif

#if WRITE_STATS
				WaveInterlockedAddScalar(OutStatsBuffer[0].NumDicedTrianglesPatches, TessellatedPatch.GetNumTris());
#endif
			}
		}
		else
		{
			return TessellatedPatch.GetNumTris();
		}
	}

	return 0;
}

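// CreateChild/RunChild are the hooks used by DistributeWork: each visible patch that still needs
// splitting fans out GetNumTris() children across the wave; every child builds one sub-patch and
// pushes it onto the next level of the split queue.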
FSplitTask FSplitTask::CreateChild( uint ParentLaneIndex )
{
	FSplitTask ChildTask;
	ChildTask.TessellatedPatch = WaveReadLaneAt( TessellatedPatch, ParentLaneIndex );
	ChildTask.Encoded = WaveReadLaneAt( Encoded, ParentLaneIndex );
	return ChildTask;
}

void FSplitTask::RunChild( inout FSplitTask ParentTask, bool bActive, uint LocalItemIndex )
{
	if ( !bActive )
		return;

	const uint4 EncodedPatch = Encoded;

	FSplitPatch Patch;
	Patch.Decode( Encoded );

	uint3 VertIndexes = TessellatedPatch.GetIndexes( LocalItemIndex );

	for( int i = 0; i < 3; i++ )
	{
		float3 Barycentrics = TessellatedPatch.GetVert( VertIndexes[i] );
		Barycentrics = Patch.TransformBarycentrics( Barycentrics );

		Encoded[ i + 1 ] = EncodeBarycentrics( Barycentrics );
	}

	// Do not emit degenerates or the same patch again.
	if( Encoded[1] == Encoded[2] ||
		Encoded[2] == Encoded[3] ||
		Encoded[3] == Encoded[1] ||
		( Encoded[1] == EncodedPatch[1] && Encoded[2] == EncodedPatch[2] && Encoded[3] == EncodedPatch[3] ) )
		return;

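	// Reserve a slot in the global split queue, then let the first lane of the wave grow the indirect
	// dispatch args for the next split level (total patch count and the rounded-up group count).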
	uint WriteOffset = SplitWorkQueue.Add();

	const uint WaveNumPatches = WaveActiveCountBits( true );
	if( WaveIsFirstLane() )
	{
		uint TotalPatches;
		InterlockedAdd( NextIndirectArgs[ ( Level + 1 ) * NANITE_PATCH_SPLIT_ARG_COUNT + 3 ], WaveNumPatches, TotalPatches );
		TotalPatches += WaveNumPatches;
		const uint NumGroups = ( TotalPatches + THREADGROUP_SIZE - 1 ) / THREADGROUP_SIZE;
		InterlockedMax( NextIndirectArgs[ ( Level + 1 ) * NANITE_PATCH_SPLIT_ARG_COUNT + 0 ], NumGroups );
	}

	if( WriteOffset < SplitWorkQueue.Size )
	{
		Store( SplitWorkQueue, WriteOffset );
	}
}

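// One dispatch per split level: each thread loads one patch from this level's range of the split queue,
// culls and classifies it in Run(), and DistributeWork fans the resulting children out across the group.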
[numthreads(THREADGROUP_SIZE, 1, 1)]
void PatchSplit( uint GroupID : SV_GroupID, uint GroupIndex : SV_GroupIndex )
{
	const uint NumPatches = CurrentIndirectArgs[ Level * NANITE_PATCH_SPLIT_ARG_COUNT + 3 ];
	const uint LevelStartIndex = CurrentIndirectArgs[ Level * NANITE_PATCH_SPLIT_ARG_COUNT + 4 ];
	const uint LevelEndIndex = min( LevelStartIndex + NumPatches, SplitWorkQueue.Size );

	const uint StartIndex = LevelStartIndex + GroupID * THREADGROUP_SIZE;
	const uint ClampedStartIndex = min( StartIndex, LevelEndIndex );
	const uint ClampedEndIndex = min( StartIndex + THREADGROUP_SIZE, LevelEndIndex );

	BRANCH
	if( GroupID == 0 && GroupIndex == 0 )
	{
		NextIndirectArgs[ ( Level + 1 ) * NANITE_PATCH_SPLIT_ARG_COUNT + 4 ] = LevelEndIndex;
	}

	const uint BatchSize = ClampedEndIndex - ClampedStartIndex;
	BRANCH
	if( BatchSize == 0 )
	{
		return;
	}

	FSplitTask Task = (FSplitTask)0;
	uint NumChildren = 0;
	if( GroupIndex < BatchSize )
	{
		Task.Load( SplitWorkQueue, ClampedStartIndex + GroupIndex );
		NumChildren = Task.Run();
	}

	DistributeWork( Task, GroupIndex, NumChildren );
}

#else // PATCHSPLIT_PASS

RWBuffer<uint> OutClearQueueArgs;

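// Builds the indirect args for ClearQueue below: one 64-thread group per 64 written queue entries.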
[numthreads(1, 1, 1)]
void InitClearQueueArgs()
{
	const uint NumElements = min(SplitWorkQueue.GetState(0).WriteOffset, SplitWorkQueue.Size);
	OutClearQueueArgs[0] = (NumElements + 63) / 64;
	OutClearQueueArgs[1] = 1;
	OutClearQueueArgs[2] = 1;
}

[numthreads(64, 1, 1)]
void ClearQueue(uint DispatchThreadID : SV_DispatchThreadID)
{
	const uint NumElements = min(SplitWorkQueue.GetState(0).WriteOffset, SplitWorkQueue.Size);
	if (DispatchThreadID < NumElements)
	{
		SplitWorkQueue.DataBuffer_Store4(DispatchThreadID * 16, ~0u);
	}
}

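// Per-level indirect args written by InitPatchSplitArgs: Args0 seeds level 0 with the patches already
// pushed into SplitWorkQueue, while Args1 is zeroed so the first PatchSplit level has a clean buffer
// to accumulate the next level's args into.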
RWBuffer< uint > OutPatchSplitArgs0;
RWBuffer< uint > OutPatchSplitArgs1;

[numthreads(NANITE_TESSELLATION_MAX_PATCH_SPLIT_LEVELS, 1, 1)]
void InitPatchSplitArgs(uint GroupID : SV_GroupID, uint GroupIndex : SV_GroupIndex)
{
	const uint Offset = GroupIndex * 8;
	if( GroupID == 0 )
	{
		uint NumGroups = 0;
		uint NumPatches = 0;

		if( GroupIndex == 0 )
		{
			NumPatches = min( SplitWorkQueue.GetState(0).WriteOffset, SplitWorkQueue.Size );
			NumGroups = ( NumPatches + THREADGROUP_SIZE - 1 ) / THREADGROUP_SIZE;
		}

		OutPatchSplitArgs0[Offset + 0] = NumGroups;	// ThreadGroupCountX
		OutPatchSplitArgs0[Offset + 1] = 1;	// ThreadGroupCountY
		OutPatchSplitArgs0[Offset + 2] = 1;	// ThreadGroupCountZ
		OutPatchSplitArgs0[Offset + 3] = NumPatches;	// NumPatches
		OutPatchSplitArgs0[Offset + 4] = 0;	// LevelStartIndex
	}
	else
	{
		OutPatchSplitArgs1[Offset + 0] = 0;	// ThreadGroupCountX
		OutPatchSplitArgs1[Offset + 1] = 1;	// ThreadGroupCountY
		OutPatchSplitArgs1[Offset + 2] = 1;	// ThreadGroupCountZ
		OutPatchSplitArgs1[Offset + 3] = 0;	// NumPatches
		OutPatchSplitArgs1[Offset + 4] = 0;	// LevelStartIndex
	}
}

#endif // PATCHSPLIT_PASS

#endif // NANITE_TESSELLATION