// Copyright Epic Games, Inc. All Rights Reserved.

#define SPLIT_WORK_QUEUE 1 // TODO: Remove once shader rewriter has been fixed (UE-202409)
#define OCCLUDED_PATCHES_QUEUE (CULLING_PASS == 1) // TODO: Remove once shader rewriter has been fixed (UE-202409)

#include "../Common.ush"
#include "../SceneData.ush"
#include "NaniteAttributeDecode.ush"
#include "NaniteTessellation.ush"
#include "NaniteHZBCull.ush"
#include "NaniteCullingCommon.ush"
#include "NaniteVertexDeformation.ush"

#if VIRTUAL_TEXTURE_TARGET
#include "../VirtualShadowMaps/VirtualShadowMapPageOverlap.ush"
#endif

#if NANITE_TESSELLATION

#if PATCHSPLIT_PASS

// [ Nießner and Loop 2012, "Patch-Based Occlusion Culling for Hardware Tessellation" ]
// Input cone bounds normalized displacement directions
//
// Expands an axis-aligned box so that it also contains the box displaced along any direction
// inside the given cone by any amount in [DisplacementMin, DisplacementMax].
//
// ConeCenter		Axis of the cone of displacement directions (callers pass a normalized vector).
// ConeCos			Cosine of the cone half angle.
// DisplacementMin	Smallest signed displacement distance.
// DisplacementMax	Largest signed displacement distance.
// BoundsMin/Max	Box to expand, modified in place.
void DisplaceBounds(
	float3 ConeCenter,
	float ConeCos,
	float DisplacementMin,
	float DisplacementMax,
	inout float3 BoundsMin,
	inout float3 BoundsMax )
{
	// A (-/+) B == cos( acos(ConeCenter) (+/-) acos(ConeCos) )
	float3 A = ConeCos * abs( ConeCenter );
	float3 B = sqrt( saturate( 1 - Pow2( ConeCos ) ) * saturate( 1 - Pow2( ConeCenter ) ) );

	// Per-axis smallest and largest |component| any direction in the cone can have.
	float3 CapMin = A - B;
	float3 CapMax = A + B;

	// When the cone contains the axis itself the largest component is exactly 1.
	CapMax = select( abs( ConeCenter ) >= ConeCos, 1.0, CapMax );

	// Pick whichever cap gives the most conservative extent for the sign of each displacement.
	float3 DisplaceMin3 = ( DisplacementMin > 0 ? CapMin : CapMax ) * DisplacementMin;
	float3 DisplaceMax3 = ( DisplacementMax > 0 ? CapMax : CapMin ) * DisplacementMax;

	// The caps were computed on abs( ConeCenter ); mirror them for negative axis components.
	BoundsMin += select( ConeCenter >= 0, DisplaceMin3, -DisplaceMax3 );
	BoundsMax += select( ConeCenter >= 0, DisplaceMax3, -DisplaceMin3 );
}

uint RegularMaterialRasterSlotCount;

// NOTE(review): the element type was missing from this declaration; a structured buffer requires one.
// FNaniteRasterBinMeta is assumed to be the type declared by the Nanite headers - confirm.
StructuredBuffer< FNaniteRasterBinMeta > RasterBinMeta;

// Per-level indirect arguments. These are indexed as uints and NextIndirectArgs is the target of
// InterlockedAdd / InterlockedMax, so both must be uint typed buffers.
Buffer< uint >		CurrentIndirectArgs;
RWBuffer< uint >	NextIndirectArgs;

// Index of the split level processed by this dispatch.
uint Level;

#if WRITE_STATS
// NOTE(review): the element type was missing from this declaration.
// FNaniteStats is assumed to be the stats struct declared by the Nanite headers - confirm.
RWStructuredBuffer< FNaniteStats > OutStatsBuffer;
#endif

// One unit of work in the patch split queue: a sub-triangle of a cluster triangle.
struct FSplitTask
{
	// Tessellation pattern chosen by Run() and consumed by RunChild().
	FTessellatedPatch	TessellatedPatch;

	// Packed patch: x = cluster index and triangle index, yzw = one encoded barycentric per corner.
	uint4				Encoded;

	// Queue slot this task was loaded from, or ~0u once the slot has been handed over to a visible patch.
	uint				QueueOffset;

	void		Init( uint VisibleClusterIndex, uint TriIndex );

	template< typename QueueType >
	bool		Load( QueueType WorkQueue, uint Offset );

	template< typename QueueType >
	void		Store( QueueType WorkQueue, uint Offset );

	template< typename QueueType >
	void		Clear( QueueType WorkQueue, uint Offset );

	uint		Run();

	FSplitTask	CreateChild( uint ParentLaneIndex );
	void		RunChild( inout FSplitTask ParentTask, bool bActive, uint LocalItemIndex );
};

// Initializes the task to cover a whole cluster triangle.
// TriIndex is packed into the low 7 bits of Encoded.x, VisibleClusterIndex into the bits above.
void FSplitTask::Init( uint VisibleClusterIndex, uint TriIndex )
{
	Encoded.x = ( VisibleClusterIndex << 7 ) | TriIndex;

	// Presumably the three corners of the unit barycentric triangle in the packed encoding - confirm against EncodeBarycentrics.
	Encoded.y = BarycentricMax;
	Encoded.z = BarycentricMax << 16;
	Encoded.w = 0;
}

// Reads the task stored at slot Offset (16 bytes per slot) and remembers the slot in QueueOffset.
// Returns true only if none of the four loaded words is ~0u, the value written by Clear() and ClearQueue().
template< typename QueueType >
bool FSplitTask::Load( QueueType WorkQueue, uint Offset )
{
	checkSlow( Offset < WorkQueue.Size );

	QueueOffset = Offset;
	Encoded = WorkQueue.DataBuffer_Load4( Offset * 16 );

	return
		Encoded.x != ~0u &&
		Encoded.y != ~0u &&
		Encoded.z != ~0u &&
		Encoded.w != ~0u;
}

// Writes the packed patch to slot Offset (16 bytes per slot).
template< typename QueueType >
void FSplitTask::Store( QueueType WorkQueue, uint Offset )
{
	// ~0u is reserved to mark empty slots, see Load().
	checkSlow(
		Encoded.x != ~0u &&
		Encoded.y != ~0u &&
		Encoded.z != ~0u &&
		Encoded.w != ~0u );

	WorkQueue.DataBuffer_Store4( Offset * 16, Encoded );
}

// Releases the task on the queue and marks slot Offset as empty,
// unless Run() handed the slot over to a visible patch (QueueOffset == ~0u).
template< typename QueueType >
void FSplitTask::Clear( QueueType WorkQueue, uint Offset )
{
	// TODO this could be at if( bIsBoundary ) at end of DistributeWork loop if latency is important.
	WorkQueue.ReleaseTask();

	if( QueueOffset != ~0u )
		WorkQueue.DataBuffer_Store4( Offset * 16, ~0u );
}

// Culls the patch and decides whether it is diced directly or split further.
//
// Returns the number of child patches that must be created through RunChild(), or 0 when the patch was
// culled, deferred to the occluded patch queue, or appended to the visible patch list.
uint FSplitTask::Run()
{
#if WRITE_STATS
	WaveInterlockedAddScalar(OutStatsBuffer[0].NumCandidateSplitPatches, 1u);
#endif

	FSplitPatch Patch;
	Patch.Decode( Encoded );

	FVisibleCluster VisibleCluster			= GetVisibleCluster( Patch.VisibleClusterIndex, VIRTUAL_TEXTURE_TARGET );
	FInstanceSceneData InstanceData			= GetInstanceSceneDataUnchecked( VisibleCluster.InstanceId );
	FPrimitiveSceneData PrimitiveData		= GetPrimitiveData( InstanceData.PrimitiveId );
	FNaniteView NaniteView					= GetNaniteView( VisibleCluster.ViewId );
	FInstanceDynamicData InstanceDynamicData = CalculateInstanceDynamicData( NaniteView, InstanceData );
	FCluster Cluster						= GetCluster( VisibleCluster.PageIndex, VisibleCluster.ClusterIndex );

	const bool bWPOEnabled = (VisibleCluster.Flags & NANITE_CULLING_FLAG_ENABLE_WPO) != 0;

	uint3 VertIndexes = DecodeTriangleIndices( Cluster, Patch.TriIndex );

	const uint MaxTexCoords = 0; // UVs are unused here
	FNanitePostDeformVertex Verts[3];
	FetchAndDeformLocalNaniteTriangle(
		PrimitiveData,
		InstanceData,
		GetInstanceViewData(VisibleCluster.InstanceId, NaniteView.SceneRendererPrimaryViewId),
		Cluster,
		VisibleCluster,
		VertIndexes,
		MaxTexCoords,
		Verts );

	// TODO: Handle PrimitiveData.MaxWPOExtent?
#if DISABLE_DISPLACEMENT_BOUNDS
	const float DisplacementMin = 0.0f;
	const float DisplacementMax = 0.0f;
#else
	const float DisplacementMin = PrimitiveData.MinMaterialDisplacement;
	const float DisplacementMax = PrimitiveData.MaxMaterialDisplacement;
#endif

	float3 BoundsMin = +INFINITE_FLOAT;
	float3 BoundsMax = -INFINITE_FLOAT;

	float3 AvgNormal = 0;

	float3 PointView[3];
	float3 NormalLocal[3];

	// Interpolate the three patch corners from the source triangle and accumulate their local space bounds.
	UNROLL
	for( int i = 0; i < 3; i++ )
	{
		float3 PointPostDeform =
			Verts[0].Position * Patch.Barycentrics[i].x +
			Verts[1].Position * Patch.Barycentrics[i].y +
			Verts[2].Position * Patch.Barycentrics[i].z;

		BoundsMin = min( BoundsMin, PointPostDeform );
		BoundsMax = max( BoundsMax, PointPostDeform );

		float3 PointTranslatedWorld = mul( float4( PointPostDeform, 1 ), InstanceDynamicData.LocalToTranslatedWorld ).xyz;
		PointView[i] = mul( float4( PointTranslatedWorld, 1 ), NaniteView.TranslatedWorldToView ).xyz;

		NormalLocal[i] =
			Verts[0].TangentBasis.TangentZ * Patch.Barycentrics[i].x +
			Verts[1].TangentBasis.TangentZ * Patch.Barycentrics[i].y +
			Verts[2].TangentBasis.TangentZ * Patch.Barycentrics[i].z;
		NormalLocal[i] = normalize( NormalLocal[i] );

		AvgNormal += NormalLocal[i];
	}
	AvgNormal = normalize( AvgNormal );

	// Cone around the average normal that contains all three corner normals.
	float ConeCos = min3(
		dot( AvgNormal, NormalLocal[0] ),
		dot( AvgNormal, NormalLocal[1] ),
		dot( AvgNormal, NormalLocal[2] ) );

	DisplaceBounds( AvgNormal, ConeCos, DisplacementMin, DisplacementMax, BoundsMin, BoundsMax );

#if 0
	// Back face cull
	float3x3 M = { PointView[0], PointView[1], PointView[2] };
	bool bVisible = determinant( M ) > 0;
	if( !bVisible )
		return 0;
#endif

	float3 BoundsCenter = 0.5 * ( BoundsMax + BoundsMin );
	float3 BoundsExtent = 0.5 * ( BoundsMax - BoundsMin );

	// Extend the bounds for WPO
	// NOTE: always extend the bounds if any material ignores the Enable WPO flag
	BoundsExtent += GetLocalMaxWPOExtent( PrimitiveData, InstanceData, bWPOEnabled );

	FBoxCull Cull;
	Cull.Init(
		NaniteView,
		BoundsCenter,
		BoundsExtent,
		InstanceData.NonUniformScale,
		InstanceDynamicData.LocalToTranslatedWorld,
		InstanceDynamicData.PrevLocalToTranslatedWorld );

	Cull.Distance();
	Cull.GlobalClipPlane();

#if VIRTUAL_TEXTURE_TARGET
	const bool bCacheAsStatic = (VisibleCluster.Flags & NANITE_CULLING_FLAG_CACHE_AS_STATIC) != 0u;
	// If we're rendering into the static cache, it's not safe to use the receiver mask as we may cache that (full) page
	Cull.bUseReceiverMask = Cull.bUseReceiverMask && !bCacheAsStatic;
	Cull.PageFlagMask = GetPageFlagMaskForRendering(bCacheAsStatic, InstanceData.InstanceId, NaniteView.SceneRendererPrimaryViewId);
#endif

	Cull.FrustumHZB( true );

	if( !Cull.bIsVisible )
		return 0;

	const float LowTessDistance = CalcDisplacementLowTessDistance( PrimitiveData, InstanceData, NaniteView );

	float3 TessFactors = GetTessFactors( NaniteView, PointView, LowTessDistance );

#if 0
	// Avoid infinite recursion
	for( int i = 0; i < 3; i++ )
	{
		float3 Delta = Patch.Barycentrics[i] - Patch.Barycentrics[ (i + 1) % 3 ];
		bool bEdgeTooShort = min3( Delta.x, Delta.y, Delta.z ) < (2.0 / BarycentricMax);
		TessFactors[i] = bEdgeTooShort ? 1.0 : TessFactors[i];
	}
#endif

	// A patch whose largest factor does not fit the tessellation table has to be split again.
	bool bNeedsSplitting = max3( TessFactors.x, TessFactors.y, TessFactors.z ) > NANITE_TESSELLATION_TABLE_SIZE;

	if( Cull.bWasOccluded )
	{
	#if CULLING_PASS == CULLING_PASS_OCCLUSION_MAIN
		// Queue the occluded patch; the append is dropped when the queue is full.
		uint WriteOffset = OccludedPatches.Add();
		if( WriteOffset < OccludedPatches.Size )
		{
			Store( OccludedPatches, WriteOffset );
		}
	#endif
	}
	else
	{
	#if WRITE_STATS
		WaveInterlockedAddScalar(OutStatsBuffer[0].NumVisibleSplitPatches, 1u);
	#endif

		if( bNeedsSplitting )
			TessFactors = min( GetSplitFactors( TessFactors ), 8 );

		TessellatedPatch.Init( TessFactors, Encoded.yzw, false );

		if( !bNeedsSplitting )
		{
			// Small enough to dice: append to the visible patch list.
			uint WriteOffset;
			WaveInterlockedAddScalar_( RWVisiblePatchesArgs[3], 1, WriteOffset );
			if( WriteOffset < VisiblePatchesSize )
			{
			#if NANITE_TESSELLATION_PATCH_REFS
				// The visible patch only references the queue slot, so the slot must keep its data.
				// Setting QueueOffset to ~0u stops Clear() from wiping it.
				RWVisiblePatches.Store2( WriteOffset * 8, uint2( QueueOffset, TessellatedPatch.GetPattern() ) );
				SplitWorkQueue.DataBuffer_Store4( QueueOffset * 16, Encoded );
				QueueOffset = ~0u;
			#else
				RWVisiblePatches.Store4( WriteOffset * 16, Encoded );
			#endif

			#if WRITE_STATS
				WaveInterlockedAddScalar(OutStatsBuffer[0].NumDicedTrianglesPatches, TessellatedPatch.GetNumTris());
			#endif
			}
		}
		else
		{
			// One child task per triangle of the split pattern.
			return TessellatedPatch.GetNumTris();
		}
	}

	return 0;
}

// Builds a child task on the calling lane from the parent task held by lane ParentLaneIndex.
// Only TessellatedPatch and Encoded are copied; QueueOffset of the returned task is left unset.
FSplitTask FSplitTask::CreateChild( uint ParentLaneIndex )
{
	FSplitTask ChildTask;
	ChildTask.TessellatedPatch	= WaveReadLaneAt( TessellatedPatch, ParentLaneIndex );
	ChildTask.Encoded			= WaveReadLaneAt( Encoded, ParentLaneIndex );
	return ChildTask;
}

// Emits triangle LocalItemIndex of the parent's split pattern as a new patch for the next level.
//
// ParentTask		Not used by this function.
// bActive			Lanes with bActive == false do nothing.
// LocalItemIndex	Triangle index within the tessellation pattern.
void FSplitTask::RunChild( inout FSplitTask ParentTask, bool bActive, uint LocalItemIndex )
{
	if ( !bActive )
		return;

	// Keep the parent encoding to detect a child identical to its parent.
	const uint4 EncodedPatch = Encoded;

	FSplitPatch Patch;
	Patch.Decode( Encoded );

	uint3 VertIndexes = TessellatedPatch.GetIndexes( LocalItemIndex );

	for( int i = 0; i < 3; i++ )
	{
		// Pattern vertices are relative to the parent patch; transform them before encoding.
		float3 Barycentrics = TessellatedPatch.GetVert( VertIndexes[i] );
		Barycentrics = Patch.TransformBarycentrics( Barycentrics );

		Encoded[ i + 1 ] = EncodeBarycentrics( Barycentrics );
	}

	// Do not emit degenerates or the same patch again.
	if( Encoded[1] == Encoded[2] ||
		Encoded[2] == Encoded[3] ||
		Encoded[3] == Encoded[1] ||
		( Encoded[1] == EncodedPatch[1] && Encoded[2] == EncodedPatch[2] && Encoded[3] == EncodedPatch[3] ) )
		return;

	uint WriteOffset = SplitWorkQueue.Add();

	// Accumulate the patch count and dispatch size of the next level, one atomic per wave.
	const uint WaveNumPatches = WaveActiveCountBits( true );
	if( WaveIsFirstLane() )
	{
		uint TotalPatches;
		InterlockedAdd( NextIndirectArgs[ ( Level + 1 ) * NANITE_PATCH_SPLIT_ARG_COUNT + 3 ], WaveNumPatches, TotalPatches );
		TotalPatches += WaveNumPatches;

		const uint NumGroups = ( TotalPatches + THREADGROUP_SIZE - 1 ) / THREADGROUP_SIZE;
		InterlockedMax( NextIndirectArgs[ ( Level + 1 ) * NANITE_PATCH_SPLIT_ARG_COUNT + 0 ], NumGroups );
	}

	// The patch is dropped when the queue is full; PatchSplit clamps the level range to the queue size.
	if( WriteOffset < SplitWorkQueue.Size )
	{
		Store( SplitWorkQueue, WriteOffset );
	}
}

// Processes one split level: each thread runs one queued patch and the resulting children are distributed over the group.
// Argument layout per level: [0..2] = thread group counts, [3] = NumPatches, [4] = LevelStartIndex.
[numthreads(THREADGROUP_SIZE, 1, 1)]
void PatchSplit(
	uint GroupID	: SV_GroupID,
	uint GroupIndex	: SV_GroupIndex )
{
	const uint NumPatches		= CurrentIndirectArgs[ Level * NANITE_PATCH_SPLIT_ARG_COUNT + 3 ];
	const uint LevelStartIndex	= CurrentIndirectArgs[ Level * NANITE_PATCH_SPLIT_ARG_COUNT + 4 ];
	const uint LevelEndIndex	= min( LevelStartIndex + NumPatches, SplitWorkQueue.Size );

	const uint StartIndex			= LevelStartIndex + GroupID * THREADGROUP_SIZE;
	const uint ClampedStartIndex	= min( StartIndex, LevelEndIndex );
	const uint ClampedEndIndex		= min( StartIndex + THREADGROUP_SIZE, LevelEndIndex );

	// The next level starts where this one ends.
	BRANCH
	if( GroupID == 0 && GroupIndex == 0 )
	{
		NextIndirectArgs[ ( Level + 1 ) * NANITE_PATCH_SPLIT_ARG_COUNT + 4 ] = LevelEndIndex;
	}

	const uint BatchSize = ClampedEndIndex - ClampedStartIndex;

	// Uniform across the group, so exiting here cannot diverge from DistributeWork below.
	BRANCH
	if( BatchSize == 0 )
	{
		return;
	}

	FSplitTask Task = (FSplitTask)0;
	uint NumChildren = 0;

	if( GroupIndex < BatchSize )
	{
		// NOTE(review): the validity result of Load() is ignored here; slots inside the level range are assumed to be valid - confirm.
		Task.Load( SplitWorkQueue, ClampedStartIndex + GroupIndex );
		NumChildren = Task.Run();
	}

	DistributeWork( Task, GroupIndex, NumChildren );
}

#else // PATCHSPLIT_PASS

// Indirect dispatch arguments for ClearQueue.
RWBuffer< uint > OutClearQueueArgs;

// Writes the indirect dispatch arguments for ClearQueue: one 64 thread group per 64 written queue slots.
[numthreads(1, 1, 1)]
void InitClearQueueArgs()
{
	const uint NumElements = min(SplitWorkQueue.GetState(0).WriteOffset, SplitWorkQueue.Size);

	OutClearQueueArgs[0] = (NumElements + 63) / 64;
	OutClearQueueArgs[1] = 1;
	OutClearQueueArgs[2] = 1;
}

// Marks every written queue slot as empty by storing ~0u over its 16 bytes.
[numthreads(64, 1, 1)]
void ClearQueue(uint DispatchThreadID : SV_DispatchThreadID)
{
	const uint NumElements = min(SplitWorkQueue.GetState(0).WriteOffset, SplitWorkQueue.Size);

	if (DispatchThreadID < NumElements)
	{
		SplitWorkQueue.DataBuffer_Store4(DispatchThreadID * 16, ~0u);
	}
}

RWBuffer< uint > OutPatchSplitArgs0;
RWBuffer< uint > OutPatchSplitArgs1;

// Resets the two per-level argument buffers, one thread per split level.
// Group 0 fills OutPatchSplitArgs0 and seeds level 0 from the current queue contents; any other group zeroes OutPatchSplitArgs1.
[numthreads(NANITE_TESSELLATION_MAX_PATCH_SPLIT_LEVELS, 1, 1)]
void InitPatchSplitArgs(uint GroupID : SV_GroupID, uint GroupIndex : SV_GroupIndex)
{
	// NOTE(review): the stride 8 presumably has to match NANITE_PATCH_SPLIT_ARG_COUNT used by PatchSplit - confirm.
	const uint Offset = GroupIndex * 8;

	if( GroupID == 0 )
	{
		uint NumGroups = 0;
		uint NumPatches = 0;

		// Only level 0 has work initially; deeper levels are filled in by PatchSplit.
		if( GroupIndex == 0 )
		{
			NumPatches = min( SplitWorkQueue.GetState(0).WriteOffset, SplitWorkQueue.Size );
			NumGroups = ( NumPatches + THREADGROUP_SIZE - 1 ) / THREADGROUP_SIZE;
		}

		OutPatchSplitArgs0[Offset + 0] = NumGroups;		// ThreadGroupCountX
		OutPatchSplitArgs0[Offset + 1] = 1;				// ThreadGroupCountY
		OutPatchSplitArgs0[Offset + 2] = 1;				// ThreadGroupCountZ
		OutPatchSplitArgs0[Offset + 3] = NumPatches;	// NumPatches
		OutPatchSplitArgs0[Offset + 4] = 0;				// LevelStartIndex
	}
	else
	{
		OutPatchSplitArgs1[Offset + 0] = 0;				// ThreadGroupCountX
		OutPatchSplitArgs1[Offset + 1] = 1;				// ThreadGroupCountY
		OutPatchSplitArgs1[Offset + 2] = 1;				// ThreadGroupCountZ
		OutPatchSplitArgs1[Offset + 3] = 0;				// NumPatches
		OutPatchSplitArgs1[Offset + 4] = 0;				// LevelStartIndex
	}
}

#endif // PATCHSPLIT_PASS

#endif // NANITE_TESSELLATION