// Copyright Epic Games, Inc. All Rights Reserved.

#pragma once

#include "NaniteDataDecode.ush"
#include "NaniteHierarchyTraversalCommon.ush"
#include "../Common.ush"
#include "../WaveOpUtil.ush"
#include "../ComputeShaderUtils.ush"

#ifndef NANITE_HIERARCHY_TRAVERSAL_TYPE
#	error "NANITE_HIERARCHY_TRAVERSAL_TYPE must be defined."
#endif

#ifndef CHECK_AND_TRIM_CLUSTER_COUNT
#	define CHECK_AND_TRIM_CLUSTER_COUNT 0
#endif

#if GROUP_NODE_SIZE == 2

groupshared uint2 GroupNodeData[NANITE_MAX_BVH_NODES_PER_GROUP];

uint4 GetGroupNodeData(uint NodeIndex)
{
	return uint4(GroupNodeData[NodeIndex], 0, 0);
}

void SetGroupNodeData(uint NodeIndex, uint4 Data)
{
	GroupNodeData[NodeIndex] = Data.xy;
}

#elif GROUP_NODE_SIZE == 3

groupshared uint3 GroupNodeData[NANITE_MAX_BVH_NODES_PER_GROUP];

uint4 GetGroupNodeData(uint NodeIndex)
{
	return uint4(GroupNodeData[NodeIndex], 0);
}

void SetGroupNodeData(uint NodeIndex, uint4 Data)
{
	GroupNodeData[NodeIndex] = Data.xyz;
}

#else
#	error "Unexpected Group Node Size."
#endif

#if NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_PERSISTENT_NODES_AND_CLUSTERS
RWCoherentStructuredBuffer(FQueueState) QueueState;
#else
RWStructuredBuffer<FQueueState> QueueState;
#endif

#if NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_NODES
Buffer<uint> CurrentNodeIndirectArgs;
RWBuffer<uint> NextNodeIndirectArgs;
uint NodeLevel;
#endif

groupshared uint GroupNumCandidateNodes;
groupshared uint GroupCandidateNodesOffset;
groupshared int  GroupNodeCount;

groupshared uint GroupNodeBatchStartIndex;
groupshared uint GroupNodeMask;

groupshared uint GroupClusterBatchStartIndex;
groupshared uint GroupClusterBatchReadySize;

// We need to manually strip unused template functions here due to a compiler issue: https://github.com/microsoft/DirectXShaderCompiler/issues/4649
#if NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_PERSISTENT_NODES_AND_CLUSTERS || NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_NODES

template<typename FNaniteTraversalCallback>
void ProcessNodeBatch(uint BatchSize, uint GroupIndex, uint QueueStateIndex)
{
	const uint LocalNodeIndex = (GroupIndex >> NANITE_MAX_BVH_NODE_FANOUT_BITS);
	const uint ChildIndex = GroupIndex & NANITE_MAX_BVH_NODE_FANOUT_MASK;
	const uint FetchIndex = min(LocalNodeIndex, BatchSize - 1);

	FNaniteTraversalCallback TraversalCallback;
	TraversalCallback.Init(ChildIndex, LocalNodeIndex, FetchIndex);

	const FHierarchyNodeSlice HierarchyNodeSlice = GetHierarchyNodeSlice(TraversalCallback.GetHierarchyNodeOffset(), ChildIndex);

	bool bVisible = HierarchyNodeSlice.bEnabled;
	bool bLoaded = HierarchyNodeSlice.bLoaded;

	if (LocalNodeIndex >= BatchSize)
	{
		bVisible = false;
	}

	bVisible = TraversalCallback.ShouldVisitChild(HierarchyNodeSlice, bVisible);

	uint CandidateNodesOffset = 0;

	const bool bOutputChild = bVisible && bLoaded;
	BRANCH
	if (bOutputChild && !HierarchyNodeSlice.bLeaf)
	{
		WaveInterlockedAddScalar_(GroupNumCandidateNodes, 1, CandidateNodesOffset);
	}

	GroupMemoryBarrierWithGroupSync();

	if (GroupIndex == 0)
	{
		InterlockedAdd(QueueState[0].PassState[QueueStateIndex].NodeWriteOffset, GroupNumCandidateNodes, GroupCandidateNodesOffset);
#if NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_PERSISTENT_NODES_AND_CLUSTERS
		InterlockedAdd(QueueState[0].PassState[QueueStateIndex].NodeCount, (int)GroupNumCandidateNodes);	// NodeCount needs to be conservative, so child count is added before the actual children.
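		// The conservative NodeCount is what keeps the persistent workers alive: they only stop trying to
		// consume nodes once it reaches zero (see PersistentNodeAndClusterCull below), so the child count is
		// added here, before the children are written, and subtracted again only after this batch has
		// finished writing out all of its children and clusters.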
#else
		uint NumNodes = 0;
		InterlockedAdd(NextNodeIndirectArgs[(NodeLevel + 1) * NANITE_NODE_CULLING_ARG_COUNT + 3], GroupNumCandidateNodes, NumNodes);
		NumNodes += GroupNumCandidateNodes;
		const uint NumGroups = (NumNodes + NANITE_MAX_BVH_NODES_PER_GROUP - 1) / NANITE_MAX_BVH_NODES_PER_GROUP;
		InterlockedMax(NextNodeIndirectArgs[(NodeLevel + 1) * NANITE_NODE_CULLING_ARG_COUNT + 0], NumGroups);
#endif
	}

#if NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_PERSISTENT_NODES_AND_CLUSTERS
	AllMemoryBarrierWithGroupSync();
#else
	GroupMemoryBarrierWithGroupSync();
#endif

	// GPU might not be filled, so latency is important here. Kick new jobs as soon as possible.
	if (bOutputChild && !HierarchyNodeSlice.bLeaf)
	{
		CandidateNodesOffset += GroupCandidateNodesOffset;

		if (CandidateNodesOffset < MaxNodes)
		{
			TraversalCallback.StoreChildNode(CandidateNodesOffset, HierarchyNodeSlice);
		}
	}

#if NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_PERSISTENT_NODES_AND_CLUSTERS
	DeviceMemoryBarrierWithGroupSync();
#endif

	// Continue with remaining independent work
	if (bOutputChild && HierarchyNodeSlice.bLeaf)
	{
		uint NumClusters = HierarchyNodeSlice.NumChildren;

#if CHECK_AND_TRIM_CLUSTER_COUNT
		uint ClusterIndex = 0;
		WaveInterlockedAdd_(QueueState[0].TotalClusters, NumClusters, ClusterIndex);

		// Trim any clusters above MaxCandidateClusters
		const uint ClusterIndexEnd = min(ClusterIndex + NumClusters, MaxCandidateClusters);
		NumClusters = (uint)max((int)ClusterIndexEnd - (int)ClusterIndex, 0);
#endif

		uint CandidateClustersOffset = 0;
		WaveInterlockedAdd_(QueueState[0].PassState[QueueStateIndex].ClusterWriteOffset, NumClusters, CandidateClustersOffset);

		const uint BaseClusterIndex = HierarchyNodeSlice.ChildStartReference & NANITE_MAX_CLUSTERS_PER_PAGE_MASK;

		const uint StartIndex = CandidateClustersOffset;
		const uint EndIndex = min(CandidateClustersOffset + NumClusters, MaxCandidateClusters);

		for (uint Index = StartIndex; Index < EndIndex; Index++)
		{
			TraversalCallback.StoreCluster(Index, HierarchyNodeSlice, BaseClusterIndex + (Index - StartIndex));
		}

#if NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_PERSISTENT_NODES_AND_CLUSTERS
		DeviceMemoryBarrier();

		// Once the cluster indices have been committed to memory, we can update the cluster counters of the overlapping cluster batches.
		for (uint Index = StartIndex; Index < EndIndex;)
		{
			const uint BatchIndex = Index / NANITE_PERSISTENT_CLUSTER_CULLING_GROUP_SIZE;
			const uint NextIndex = (Index & ~(NANITE_PERSISTENT_CLUSTER_CULLING_GROUP_SIZE - 1u)) + NANITE_PERSISTENT_CLUSTER_CULLING_GROUP_SIZE;
			const uint MaxIndex = min(NextIndex, EndIndex);
			const uint Num = MaxIndex - Index;
			TraversalCallback.AddToClusterBatch(BatchIndex, Num);
			Index = NextIndex;
		}
#endif
	}

#if NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_PERSISTENT_NODES_AND_CLUSTERS
	DeviceMemoryBarrierWithGroupSync();

	if (GroupIndex == 0)
	{
		// Done writing clusters/nodes for current pass. Subtract us from NodeCount.
		InterlockedAdd(QueueState[0].PassState[QueueStateIndex].NodeCount, -(int)BatchSize);
	}
#endif

	TraversalCallback.OnPostNodeVisit(HierarchyNodeSlice);
}
#endif

#if NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_PERSISTENT_NODES_AND_CLUSTERS || NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_CLUSTERS

template<typename FNaniteTraversalCallback>
void ProcessClusterBatch(uint BatchStartIndex, uint BatchSize, uint GroupIndex)
{
	FNaniteTraversalCallback TraversalCallback;

	if (GroupIndex < BatchSize)
	{
		const uint CandidateIndex = BatchStartIndex * NANITE_PERSISTENT_CLUSTER_CULLING_GROUP_SIZE + GroupIndex;
		const uint4 PackedCluster = TraversalCallback.LoadPackedCluster(CandidateIndex);
		TraversalCallback.ProcessCluster(PackedCluster);
	}

#if NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_PERSISTENT_NODES_AND_CLUSTERS
	// Clear batch so the buffer is cleared for next pass.
	TraversalCallback.ClearClusterBatch(BatchStartIndex);
#endif
}
#endif

// Persistent threads culling shader
// This shader is responsible for the recursive culling and traversal of the per-mesh cluster hierarchies.
// It is also responsible for culling the triangle clusters found during this traversal and producing
// lists of visible clusters for rasterization. Clusters are binned for SW or HW rasterization based on
// screen-projected size.

// Mapping tree culling to the GPU is awkward, as the number of leaf nodes that need to be accepted
// is dynamic and can be anywhere from none to hundreds of thousands. Mapping threads 1:1 to trees can result in
// extremely long serial processing that severely underutilizes the GPU. Conversely, mapping threads 1:1 to
// leaf nodes can end up leaving most threads idle.

// What we really need is the ability to dynamically spawn threads for children as they are determined
// to be visible during the traversal. This is unfortunately not possible (yet), so instead we use
// persistent threads. We spawn just enough worker threads to fill the GPU, keep them running, and manually
// distribute work to them.

// For this we use a simple MPMC job queue. Instead of spawning new threads, new jobs are added to the queue
// to be consumed by workers. Initially the queue is populated with work generated by preceding shader passes.
// While the persistent shader is running, its workers continuously consume items from the queue, but also
// produce new ones.

// When BVH nodes are processed, they append work items to the queue for each of their visible children.
// Clusters are only consumed and never generate new work. The workers keep running until the queue is empty.
// At that point the trees have been recursively traversed and all the relevant clusters have been processed.

// A given node can't be processed before all of its ancestors have been processed. This forms a dependency chain
// of jobs that can be as long as the height of the tree. Especially for small or moderately-sized scenes, the total
// latency from these dependent jobs can end up determining the total BVH processing time, leaving the GPU underutilized.

// This issue is one of the main motivations for handling the cluster culling in the same shader.
// Workers always favor node processing to make progress on the critical path, but when no nodes are available
// they process clusters while waiting, instead of going idle.
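
// For reference, the queue bookkeeping manipulated throughout this file lives in FQueueState, which is defined
// in a shared header rather than here. The sketch below is illustrative only, reconstructed from the accesses
// in this file; the actual layout, field order, and number of passes may differ.
//
//	struct FQueuePassState	// hypothetical name; one entry per culling pass
//	{
//		uint	ClusterBatchReadOffset;	// Next cluster batch to hand out to a worker group
//		uint	ClusterWriteOffset;		// Number of candidate clusters written so far
//		uint	NodeReadOffset;			// Next candidate node slot to hand out
//		uint	NodeWriteOffset;		// Next candidate node slot to write
//		int		NodeCount;				// Conservative count of nodes still in flight; 0 ends node processing
//	};
//
//	struct FQueueState
//	{
//		uint			TotalClusters;	// Running total used by CHECK_AND_TRIM_CLUSTER_COUNT
//		FQueuePassState	PassState[...];	// Indexed by QueueStateIndex
//	};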
#if NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_PERSISTENT_NODES_AND_CLUSTERS

template<typename FNaniteTraversalCallback>
void PersistentNodeAndClusterCull(uint GroupIndex, uint QueueStateIndex)
{
	FNaniteTraversalCallback TraversalCallback;

	bool bProcessNodes = true;	// Should we still try to consume nodes?
	uint NodeBatchReadyOffset = NANITE_MAX_BVH_NODES_PER_GROUP;
	uint NodeBatchStartIndex = 0;
	uint ClusterBatchStartIndex = 0xFFFFFFFFu;

	while (true)
	{
		GroupMemoryBarrierWithGroupSync();	// Make sure we are done reading from groupshared memory.

		if (GroupIndex == 0)
		{
			GroupNumCandidateNodes = 0;
			GroupNodeMask = 0;
		}

		TraversalCallback.OnPreProcessNodeBatch(GroupIndex);

		GroupMemoryBarrierWithGroupSync();

		uint NodeReadyMask = 0;
		if (bProcessNodes)	// Try grabbing and processing nodes if they could be available.
		{
			if (NodeBatchReadyOffset == NANITE_MAX_BVH_NODES_PER_GROUP)
			{
				// No more data in current batch. Grab a new batch.
				if (GroupIndex == 0)
				{
					InterlockedAdd(QueueState[0].PassState[QueueStateIndex].NodeReadOffset, NANITE_MAX_BVH_NODES_PER_GROUP, GroupNodeBatchStartIndex);
				}
				GroupMemoryBarrierWithGroupSync();

				NodeBatchReadyOffset = 0;
				NodeBatchStartIndex = GroupNodeBatchStartIndex;
				if (NodeBatchStartIndex >= MaxNodes)
				{
					// The node range is out of bounds and so will any future range be.
					bProcessNodes = false;
					continue;
				}
			}

			// Check which nodes in the range have been completely written and are ready for processing.
			const uint NodeIndex = NodeBatchStartIndex + NodeBatchReadyOffset + GroupIndex;
			bool bNodeReady = (NodeBatchReadyOffset + GroupIndex < NANITE_MAX_BVH_NODES_PER_GROUP);
			if (bNodeReady)
			{
				bNodeReady = TraversalCallback.LoadCandidateNodeDataToGroup(NodeIndex, GroupIndex);
			}

			if (bNodeReady)
			{
				InterlockedOr(GroupNodeMask, 1u << GroupIndex);
			}
			AllMemoryBarrierWithGroupSync();
			NodeReadyMask = GroupNodeMask;

			// Process nodes if at least the first one is ready.
			if (NodeReadyMask & 1u)
			{
				uint BatchSize = firstbitlow(~NodeReadyMask);
				ProcessNodeBatch<FNaniteTraversalCallback>(BatchSize, GroupIndex, QueueStateIndex);
				if (GroupIndex < BatchSize)
				{
					// Clear processed element so we leave the buffer cleared for next pass.
					TraversalCallback.ClearCandidateNodeData(NodeIndex);
				}

				NodeBatchReadyOffset += BatchSize;
				continue;
			}
		}

		// No nodes were ready. Process clusters instead.

		// Grab a range of clusters, if we don't already have one.
		if (ClusterBatchStartIndex == 0xFFFFFFFFu)
		{
			if (GroupIndex == 0)
			{
				InterlockedAdd(QueueState[0].PassState[QueueStateIndex].ClusterBatchReadOffset, 1, GroupClusterBatchStartIndex);
			}
			GroupMemoryBarrierWithGroupSync();
			ClusterBatchStartIndex = GroupClusterBatchStartIndex;
		}

		if (!bProcessNodes && GroupClusterBatchStartIndex >= GetMaxClusterBatches())
			break;	// Has to be break instead of return to make FXC happy.

		if (GroupIndex == 0)
		{
			GroupNodeCount = QueueState[0].PassState[QueueStateIndex].NodeCount;
			GroupClusterBatchReadySize = TraversalCallback.LoadClusterBatch(ClusterBatchStartIndex);
		}
		GroupMemoryBarrierWithGroupSync();

		uint ClusterBatchReadySize = GroupClusterBatchReadySize;
		if (!bProcessNodes && ClusterBatchReadySize == 0)	// No more clusters to process and no nodes are available to produce more.
			break;	// Has to be break instead of return to make FXC happy.
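		// Consumption policy: while nodes might still append clusters, only fully filled batches are
		// processed, since a partially filled batch could still grow. Once node processing has stopped,
		// any non-empty batch is drained immediately.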
		if ((bProcessNodes && ClusterBatchReadySize == NANITE_PERSISTENT_CLUSTER_CULLING_GROUP_SIZE) || (!bProcessNodes && ClusterBatchReadySize > 0))
		{
			ProcessClusterBatch<FNaniteTraversalCallback>(ClusterBatchStartIndex, ClusterBatchReadySize, GroupIndex);
			ClusterBatchStartIndex = 0xFFFFFFFFu;
		}

		if (bProcessNodes && GroupNodeCount == 0)
		{
			bProcessNodes = false;
		}
	}
}
#endif

#if NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_NODES

groupshared uint GroupReadOffset;

template<typename FNaniteTraversalCallback>
void NodeCull(uint GroupID, uint GroupIndex, uint QueueStateIndex)
{
	const uint NumNodes = CurrentNodeIndirectArgs[NodeLevel * NANITE_NODE_CULLING_ARG_COUNT + 3];
	const uint LevelStartIndex = CurrentNodeIndirectArgs[NodeLevel * NANITE_NODE_CULLING_ARG_COUNT + 4];

	const uint LevelEndIndex = min(LevelStartIndex + NumNodes, MaxNodes);
	const uint StartIndex = LevelStartIndex + GroupID * NANITE_MAX_BVH_NODES_PER_GROUP;
	const uint ClampedStartIndex = min(StartIndex, LevelEndIndex);
	const uint ClampedEndIndex = min(StartIndex + NANITE_MAX_BVH_NODES_PER_GROUP, LevelEndIndex);
	uint BatchSize = ClampedEndIndex - ClampedStartIndex;

	if (GroupIndex == 0)
	{
		GroupNumCandidateNodes = 0;

		if (GroupID == 0)
		{
			// The first group updates the start index for the next level.
			NextNodeIndirectArgs[(NodeLevel + 1) * NANITE_NODE_CULLING_ARG_COUNT + 4] = LevelEndIndex;
		}
	}
	GroupMemoryBarrierWithGroupSync();

	const uint NodeIndex = ClampedStartIndex + GroupIndex;

	FNaniteTraversalCallback TraversalCallback;
	TraversalCallback.OnPreProcessNodeBatch(GroupIndex);

	if (GroupIndex < BatchSize)
	{
		TraversalCallback.LoadCandidateNodeDataToGroup(NodeIndex, GroupIndex, false);
	}
	GroupMemoryBarrierWithGroupSync();

	ProcessNodeBatch<FNaniteTraversalCallback>(BatchSize, GroupIndex, QueueStateIndex);
}
#endif

#if NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_CLUSTERS

template<typename FNaniteTraversalCallback>
void ClusterCull(uint GroupID, uint GroupIndex, uint QueueStateIndex)
{
	const uint ClampedNumCandidateClusters = min(QueueState[0].PassState[QueueStateIndex].ClusterWriteOffset, MaxCandidateClusters);
	const uint ClampedStartIndex = min(GroupID * NANITE_PERSISTENT_CLUSTER_CULLING_GROUP_SIZE, ClampedNumCandidateClusters);
	const uint ClampedEndIndex = min((GroupID + 1) * NANITE_PERSISTENT_CLUSTER_CULLING_GROUP_SIZE, ClampedNumCandidateClusters);
	const uint BatchSize = ClampedEndIndex - ClampedStartIndex;

	ProcessClusterBatch<FNaniteTraversalCallback>(GroupID, BatchSize, GroupIndex);
}
#endif
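
// The traversal templates in this file expect the including shader to provide an FNaniteTraversalCallback
// struct. The grouping below is inferred purely from the call sites above; exact signatures and semantics
// are defined by the concrete callbacks in the culling shaders that include this header, not here.
//
//	- Init(ChildIndex, LocalNodeIndex, FetchIndex), GetHierarchyNodeOffset(), ShouldVisitChild(...):
//	  set up per-thread state and decide whether a hierarchy child should be visited.
//	- OnPreProcessNodeBatch(GroupIndex), OnPostNodeVisit(HierarchyNodeSlice):
//	  hooks invoked around node batch processing.
//	- LoadCandidateNodeDataToGroup(...), ClearCandidateNodeData(...), StoreChildNode(...):
//	  read/write candidate nodes in the work queue and stage them in groupshared memory.
//	- StoreCluster(...), LoadClusterBatch(...), AddToClusterBatch(...), ClearClusterBatch(...):
//	  maintain the candidate cluster list and its per-batch ready counters.
//	- LoadPackedCluster(CandidateIndex), ProcessCluster(PackedCluster):
//	  fetch and cull an individual candidate cluster.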