UnrealEngine/Engine/Shaders/Private/Nanite/NaniteHierarchyTraversal.ush

// Copyright Epic Games, Inc. All Rights Reserved.
#pragma once
#include "NaniteDataDecode.ush"
#include "NaniteHierarchyTraversalCommon.ush"
#include "../Common.ush"
#include "../WaveOpUtil.ush"
#include "../ComputeShaderUtils.ush"
#ifndef NANITE_HIERARCHY_TRAVERSAL_TYPE
# error "NANITE_HIERARCHY_TRAVERSAL_TYPE must be defined."
#endif
#ifndef CHECK_AND_TRIM_CLUSTER_COUNT
# define CHECK_AND_TRIM_CLUSTER_COUNT 0
#endif
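
// Per-group staging of candidate node data. GROUP_NODE_SIZE selects how many uints are stored per node in
// group shared memory (2 or 3); GetGroupNodeData/SetGroupNodeData pad or truncate to a uint4 so the traversal
// code below can treat both layouts uniformly.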
#if GROUP_NODE_SIZE == 2
groupshared uint2 GroupNodeData[NANITE_MAX_BVH_NODES_PER_GROUP];
uint4 GetGroupNodeData(uint NodeIndex) { return uint4(GroupNodeData[NodeIndex], 0, 0); }
void SetGroupNodeData(uint NodeIndex, uint4 Data) { GroupNodeData[NodeIndex] = Data.xy; }
#elif GROUP_NODE_SIZE == 3
groupshared uint3 GroupNodeData[NANITE_MAX_BVH_NODES_PER_GROUP];
uint4 GetGroupNodeData(uint NodeIndex) { return uint4(GroupNodeData[NodeIndex], 0); }
void SetGroupNodeData(uint NodeIndex, uint4 Data) { GroupNodeData[NodeIndex] = Data.xyz; }
#else
# error "Unexpected Group Node Size."
#endif
#if NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_PERSISTENT_NODES_AND_CLUSTERS
RWCoherentStructuredBuffer(FQueueState) QueueState;
#else
RWStructuredBuffer<FQueueState> QueueState;
#endif
#if NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_NODES
Buffer<uint> CurrentNodeIndirectArgs;
RWBuffer<uint> NextNodeIndirectArgs;
uint NodeLevel;
#endif
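
// Group shared state used to coordinate a thread group's batch processing: candidate node counts and write
// offsets, the current node/cluster batch ranges, the per-node ready mask and a snapshot of the global node count.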
groupshared uint GroupNumCandidateNodes;
groupshared uint GroupCandidateNodesOffset;
groupshared int GroupNodeCount;
groupshared uint GroupNodeBatchStartIndex;
groupshared uint GroupNodeMask;
groupshared uint GroupClusterBatchStartIndex;
groupshared uint GroupClusterBatchReadySize;

// We need to manually strip unused template functions here due to a compiler issue: https://github.com/microsoft/DirectXShaderCompiler/issues/4649
#if NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_PERSISTENT_NODES_AND_CLUSTERS || NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_NODES
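// The FNaniteTraversalCallback template argument supplies the pass-specific behavior. A rough sketch of the
// interface it is expected to provide, inferred from the call sites in this file (parameter names and exact
// types are guesses, not an authoritative declaration; concrete callbacks are defined by the including shaders):
//
//    struct FNaniteTraversalCallbackExample
//    {
//        void  Init(uint ChildIndex, uint LocalNodeIndex, uint FetchIndex);
//        uint  GetHierarchyNodeOffset();
//        bool  ShouldVisitChild(FHierarchyNodeSlice HierarchyNodeSlice, bool bVisible);
//        void  OnPreProcessNodeBatch(uint GroupIndex);
//        void  OnPostNodeVisit(FHierarchyNodeSlice HierarchyNodeSlice);
//        void  StoreChildNode(uint StoreIndex, FHierarchyNodeSlice HierarchyNodeSlice);
//        void  StoreCluster(uint StoreIndex, FHierarchyNodeSlice HierarchyNodeSlice, uint ClusterIndex);
//        bool  LoadCandidateNodeDataToGroup(uint NodeIndex, uint GroupIndex, bool bCheckReady = true); // called with 2 or 3 arguments below
//        void  ClearCandidateNodeData(uint NodeIndex);
//        void  AddToClusterBatch(uint BatchIndex, uint Num);
//        void  ClearClusterBatch(uint BatchIndex);
//        uint  LoadClusterBatch(uint BatchIndex);
//        uint4 LoadPackedCluster(uint CandidateIndex);
//        void  ProcessCluster(uint4 PackedCluster);
//    };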
template<typename FNaniteTraversalCallback>
void ProcessNodeBatch(uint BatchSize, uint GroupIndex, uint QueueStateIndex)
{
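    // Each group handles up to NANITE_MAX_BVH_NODES_PER_GROUP nodes, with NANITE_MAX_BVH_NODE_FANOUT
    // consecutive threads per node (one thread per potential child). FetchIndex is clamped so threads whose
    // LocalNodeIndex falls outside BatchSize still perform an in-bounds (redundant) fetch; their results are
    // discarded by the LocalNodeIndex >= BatchSize check below.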
    const uint LocalNodeIndex = (GroupIndex >> NANITE_MAX_BVH_NODE_FANOUT_BITS);
    const uint ChildIndex = GroupIndex & NANITE_MAX_BVH_NODE_FANOUT_MASK;
    const uint FetchIndex = min(LocalNodeIndex, BatchSize - 1);

    FNaniteTraversalCallback TraversalCallback;
    TraversalCallback.Init(ChildIndex, LocalNodeIndex, FetchIndex);

    const FHierarchyNodeSlice HierarchyNodeSlice = GetHierarchyNodeSlice(TraversalCallback.GetHierarchyNodeOffset(), ChildIndex);

    bool bVisible = HierarchyNodeSlice.bEnabled;
    bool bLoaded = HierarchyNodeSlice.bLoaded;
    if (LocalNodeIndex >= BatchSize)
    {
        bVisible = false;
    }

    bVisible = TraversalCallback.ShouldVisitChild(HierarchyNodeSlice, bVisible);

    uint CandidateNodesOffset = 0;
    const bool bOutputChild = bVisible && bLoaded;

    BRANCH
    if (bOutputChild && !HierarchyNodeSlice.bLeaf)
    {
        WaveInterlockedAddScalar_(GroupNumCandidateNodes, 1, CandidateNodesOffset);
    }

    GroupMemoryBarrierWithGroupSync();
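
    // Thread 0 reserves output space for the whole group with a single atomic on the global queue state
    // (or on the next level's indirect args in the per-level node culling path), using the group-wide
    // candidate count accumulated above.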
    if (GroupIndex == 0)
    {
        InterlockedAdd(QueueState[0].PassState[QueueStateIndex].NodeWriteOffset, GroupNumCandidateNodes, GroupCandidateNodesOffset);
#if NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_PERSISTENT_NODES_AND_CLUSTERS
        InterlockedAdd(QueueState[0].PassState[QueueStateIndex].NodeCount, (int)GroupNumCandidateNodes); // NodeCount needs to be conservative, so child count is added before the actual children.
#else
        uint NumNodes = 0;
        InterlockedAdd(NextNodeIndirectArgs[(NodeLevel + 1) * NANITE_NODE_CULLING_ARG_COUNT + 3], GroupNumCandidateNodes, NumNodes);
        NumNodes += GroupNumCandidateNodes;
        const uint NumGroups = (NumNodes + NANITE_MAX_BVH_NODES_PER_GROUP - 1) / NANITE_MAX_BVH_NODES_PER_GROUP;
        InterlockedMax(NextNodeIndirectArgs[(NodeLevel + 1) * NANITE_NODE_CULLING_ARG_COUNT + 0], NumGroups);
#endif
    }

#if NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_PERSISTENT_NODES_AND_CLUSTERS
    AllMemoryBarrierWithGroupSync();
#else
    GroupMemoryBarrierWithGroupSync();
#endif

    // GPU might not be filled, so latency is important here. Kick new jobs as soon as possible.
    if (bOutputChild && !HierarchyNodeSlice.bLeaf)
    {
        CandidateNodesOffset += GroupCandidateNodesOffset;

        if (CandidateNodesOffset < MaxNodes)
        {
            TraversalCallback.StoreChildNode(CandidateNodesOffset, HierarchyNodeSlice);
        }
    }

#if NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_PERSISTENT_NODES_AND_CLUSTERS
    DeviceMemoryBarrierWithGroupSync();
#endif

    // Continue with remaining independent work
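    // Leaf children reference a run of clusters: space is reserved in the candidate cluster list and
    // one candidate entry is written per cluster (optionally trimmed against MaxCandidateClusters first).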
    if (bOutputChild && HierarchyNodeSlice.bLeaf)
    {
        uint NumClusters = HierarchyNodeSlice.NumChildren;

#if CHECK_AND_TRIM_CLUSTER_COUNT
        uint ClusterIndex = 0;
        WaveInterlockedAdd_(QueueState[0].TotalClusters, NumClusters, ClusterIndex);

        // Trim any clusters above MaxCandidateClusters
        const uint ClusterIndexEnd = min(ClusterIndex + NumClusters, MaxCandidateClusters);
        NumClusters = (uint)max((int)ClusterIndexEnd - (int)ClusterIndex, 0);
#endif

        uint CandidateClustersOffset = 0;
        WaveInterlockedAdd_(QueueState[0].PassState[QueueStateIndex].ClusterWriteOffset, NumClusters, CandidateClustersOffset);

        const uint BaseClusterIndex = HierarchyNodeSlice.ChildStartReference & NANITE_MAX_CLUSTERS_PER_PAGE_MASK;
        const uint StartIndex = CandidateClustersOffset;
        const uint EndIndex = min(CandidateClustersOffset + NumClusters, MaxCandidateClusters);

        for (uint Index = StartIndex; Index < EndIndex; Index++)
        {
            TraversalCallback.StoreCluster(Index, HierarchyNodeSlice, BaseClusterIndex + (Index - StartIndex));
        }

#if NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_PERSISTENT_NODES_AND_CLUSTERS
        DeviceMemoryBarrier();

        // Once the cluster indices have been committed to memory, we can update the cluster counters of the overlapping cluster batches.
        for (uint Index = StartIndex; Index < EndIndex;)
        {
            const uint BatchIndex = Index / NANITE_PERSISTENT_CLUSTER_CULLING_GROUP_SIZE;
            const uint NextIndex = (Index & ~(NANITE_PERSISTENT_CLUSTER_CULLING_GROUP_SIZE - 1u)) + NANITE_PERSISTENT_CLUSTER_CULLING_GROUP_SIZE;
            const uint MaxIndex = min(NextIndex, EndIndex);
            const uint Num = MaxIndex - Index;
            TraversalCallback.AddToClusterBatch(BatchIndex, Num);
            Index = NextIndex;
        }
#endif
    }

#if NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_PERSISTENT_NODES_AND_CLUSTERS
    DeviceMemoryBarrierWithGroupSync();

    if (GroupIndex == 0)
    {
        // Done writing clusters/nodes for current pass. Subtract us from NodeCount.
        InterlockedAdd(QueueState[0].PassState[QueueStateIndex].NodeCount, -(int)BatchSize);
    }
#endif

    TraversalCallback.OnPostNodeVisit(HierarchyNodeSlice);
}
#endif
#if NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_PERSISTENT_NODES_AND_CLUSTERS || NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_CLUSTERS
template<typename FNaniteTraversalCallback>
void ProcessClusterBatch(uint BatchStartIndex, uint BatchSize, uint GroupIndex)
{
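    // One thread per candidate cluster in the batch; threads beyond BatchSize do nothing.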
    FNaniteTraversalCallback TraversalCallback;

    if (GroupIndex < BatchSize)
    {
        const uint CandidateIndex = BatchStartIndex * NANITE_PERSISTENT_CLUSTER_CULLING_GROUP_SIZE + GroupIndex;
        const uint4 PackedCluster = TraversalCallback.LoadPackedCluster(CandidateIndex);
        TraversalCallback.ProcessCluster(PackedCluster);
    }

#if NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_PERSISTENT_NODES_AND_CLUSTERS
    // Clear batch so the buffer is cleared for next pass.
    TraversalCallback.ClearClusterBatch(BatchStartIndex);
#endif
}
#endif

// Persistent threads culling shader
//
// This shader is responsible for the recursive culling and traversal of the per-mesh cluster hierarchies.
// It is also responsible for culling the triangle clusters found during this traversal and producing
// lists of visible clusters for rasterization. Clusters are binned for SW or HW rasterization based on
// screen-projected size.
//
// Mapping tree-culling to the GPU is awkward, as the number of leaf nodes that need to be accepted
// is dynamic and can be anywhere from none to hundreds of thousands. Mapping threads 1:1 to trees can result in
// extremely long serial processing that severely underutilizes the GPU. Conversely, mapping threads 1:1 to
// leaf nodes can end up leaving most threads idle.
//
// What we really need is the ability to dynamically spawn threads for children as they are determined
// to be visible during the traversal. This is unfortunately not possible (yet), so instead we use
// persistent threads. We spawn just enough worker threads to fill the GPU, keep them running and manually
// distribute work to them.
//
// For this we use a simple MPMC job queue. Instead of spawning new threads, new jobs are added to the queue
// to be consumed by workers. Initially the queue is populated with work generated by preceding shader passes.
// While the persistent shader is running, its workers continuously consume items from the queue, but also
// produce new ones: when BVH nodes are processed, they append work items to the queue for each of their
// visible children. Clusters are only consumed and never generate new work. The workers keep running until
// the queue is empty. At that point the trees have been recursively traversed and all the relevant clusters
// have been processed.
//
// A given node can't be processed before all of its ancestors have been processed. This forms a dependency chain
// of jobs that can be as long as the height of the tree. Especially for small or moderately-sized scenes, the total
// latency from these dependent jobs can end up determining the total BVH processing time, leaving the GPU underutilized.
// This issue is one of the main motivations for handling the cluster culling in the same shader:
// workers always favor node processing to make progress on the critical path, but when no nodes are available
// they process clusters while waiting, instead of going idle.
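//
// The FQueueState members (and its per-pass PassState entries) referenced below act as the queue bookkeeping:
// NodeReadOffset / NodeWriteOffset are the consumer/producer cursors of the node queue, ClusterWriteOffset is
// the producer cursor of the candidate cluster list while ClusterBatchReadOffset hands out whole cluster batches
// to consumers, NodeCount conservatively tracks how many nodes are still in flight (used to detect termination),
// and TotalClusters is only touched by the optional CHECK_AND_TRIM_CLUSTER_COUNT path.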
#if NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_PERSISTENT_NODES_AND_CLUSTERS
template<typename FNaniteTraversalCallback>
void PersistentNodeAndClusterCull(uint GroupIndex, uint QueueStateIndex)
{
    FNaniteTraversalCallback TraversalCallback;

    bool bProcessNodes = true; // Should we still try to consume nodes?
    uint NodeBatchReadyOffset = NANITE_MAX_BVH_NODES_PER_GROUP;
    uint NodeBatchStartIndex = 0;
    uint ClusterBatchStartIndex = 0xFFFFFFFFu;
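
    // Persistent worker loop: each iteration first tries to grab and process a batch of ready nodes and only
    // falls back to a cluster batch when none are ready, so progress on the node dependency chain is always
    // preferred. The loop exits once node processing has been disabled and no cluster batches remain.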
    while (true)
    {
        GroupMemoryBarrierWithGroupSync(); // Make sure we are done reading from group shared memory.

        if (GroupIndex == 0)
        {
            GroupNumCandidateNodes = 0;
            GroupNodeMask = 0;
        }
        TraversalCallback.OnPreProcessNodeBatch(GroupIndex);
        GroupMemoryBarrierWithGroupSync();

        uint NodeReadyMask = 0;
        if (bProcessNodes) // Try grabbing and processing nodes if they could be available.
        {
            if (NodeBatchReadyOffset == NANITE_MAX_BVH_NODES_PER_GROUP)
            {
                // No more data in current batch. Grab a new batch.
                if (GroupIndex == 0)
                {
                    InterlockedAdd(QueueState[0].PassState[QueueStateIndex].NodeReadOffset, NANITE_MAX_BVH_NODES_PER_GROUP, GroupNodeBatchStartIndex);
                }
                GroupMemoryBarrierWithGroupSync();

                NodeBatchReadyOffset = 0;
                NodeBatchStartIndex = GroupNodeBatchStartIndex;

                if (NodeBatchStartIndex >= MaxNodes)
                {
                    // This node range is out of bounds, and so is any future range: stop looking for nodes.
                    bProcessNodes = false;
                    continue;
                }
            }

            // Check which nodes in the range have been completely written and are ready for processing.
            const uint NodeIndex = NodeBatchStartIndex + NodeBatchReadyOffset + GroupIndex;
            bool bNodeReady = (NodeBatchReadyOffset + GroupIndex < NANITE_MAX_BVH_NODES_PER_GROUP);
            if (bNodeReady)
            {
                bNodeReady = TraversalCallback.LoadCandidateNodeDataToGroup(NodeIndex, GroupIndex);
            }

            if (bNodeReady)
            {
                InterlockedOr(GroupNodeMask, 1u << GroupIndex);
            }
            AllMemoryBarrierWithGroupSync();
            NodeReadyMask = GroupNodeMask;

            // Process nodes if at least the first one is ready.
            if (NodeReadyMask & 1u)
            {
                uint BatchSize = firstbitlow(~NodeReadyMask);
                ProcessNodeBatch<FNaniteTraversalCallback>(BatchSize, GroupIndex, QueueStateIndex);

                if (GroupIndex < BatchSize)
                {
                    // Clear processed element so we leave the buffer cleared for next pass.
                    TraversalCallback.ClearCandidateNodeData(NodeIndex);
                }

                NodeBatchReadyOffset += BatchSize;
                continue;
            }
        }

        // No nodes were ready. Process clusters instead.

        // Grab a range of clusters, if we don't already have one.
        if (ClusterBatchStartIndex == 0xFFFFFFFFu)
        {
            if (GroupIndex == 0)
            {
                InterlockedAdd(QueueState[0].PassState[QueueStateIndex].ClusterBatchReadOffset, 1, GroupClusterBatchStartIndex);
            }
            GroupMemoryBarrierWithGroupSync();
            ClusterBatchStartIndex = GroupClusterBatchStartIndex;
        }

        if (!bProcessNodes && GroupClusterBatchStartIndex >= GetMaxClusterBatches())
            break; // Has to be break instead of return to make FXC happy.

        if (GroupIndex == 0)
        {
            GroupNodeCount = QueueState[0].PassState[QueueStateIndex].NodeCount;
            GroupClusterBatchReadySize = TraversalCallback.LoadClusterBatch(ClusterBatchStartIndex);
        }
        GroupMemoryBarrierWithGroupSync();

        uint ClusterBatchReadySize = GroupClusterBatchReadySize;
        if (!bProcessNodes && ClusterBatchReadySize == 0) // No more clusters to process and no nodes are left to produce more. We are done.
            break; // Has to be break instead of return to make FXC happy.

        if ((bProcessNodes && ClusterBatchReadySize == NANITE_PERSISTENT_CLUSTER_CULLING_GROUP_SIZE) || (!bProcessNodes && ClusterBatchReadySize > 0))
        {
            ProcessClusterBatch<FNaniteTraversalCallback>(ClusterBatchStartIndex, ClusterBatchReadySize, GroupIndex);
            ClusterBatchStartIndex = 0xFFFFFFFFu;
        }

        if (bProcessNodes && GroupNodeCount == 0)
        {
            bProcessNodes = false;
        }
    }
}
#endif
#if NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_NODES
groupshared uint GroupReadOffset;
template<typename FNaniteTraversalCallback>
void NodeCull(uint GroupID, uint GroupIndex, uint QueueStateIndex)
{
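    // Non-persistent variant: one dispatch per BVH level, driven by indirect args. Each group culls a
    // NANITE_MAX_BVH_NODES_PER_GROUP-sized slice of the current level's node range. As used here, each level's
    // argument block (stride NANITE_NODE_CULLING_ARG_COUNT) appears to hold the dispatch group count in slot 0,
    // the node count in slot 3 and the level's start index in slot 4; ProcessNodeBatch appends visible children
    // into the next level's block.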
    const uint NumNodes = CurrentNodeIndirectArgs[NodeLevel * NANITE_NODE_CULLING_ARG_COUNT + 3];
    const uint LevelStartIndex = CurrentNodeIndirectArgs[NodeLevel * NANITE_NODE_CULLING_ARG_COUNT + 4];
    const uint LevelEndIndex = min(LevelStartIndex + NumNodes, MaxNodes);

    const uint StartIndex = LevelStartIndex + GroupID * NANITE_MAX_BVH_NODES_PER_GROUP;
    const uint ClampedStartIndex = min(StartIndex, LevelEndIndex);
    const uint ClampedEndIndex = min(StartIndex + NANITE_MAX_BVH_NODES_PER_GROUP, LevelEndIndex);
    uint BatchSize = ClampedEndIndex - ClampedStartIndex;

    if (GroupIndex == 0)
    {
        GroupNumCandidateNodes = 0;

        if (GroupID == 0)
        {
            // First group updates start index for next level
            NextNodeIndirectArgs[(NodeLevel + 1) * NANITE_NODE_CULLING_ARG_COUNT + 4] = LevelEndIndex;
        }
    }
    GroupMemoryBarrierWithGroupSync();

    const uint NodeIndex = ClampedStartIndex + GroupIndex;

    FNaniteTraversalCallback TraversalCallback;
    TraversalCallback.OnPreProcessNodeBatch(GroupIndex);

    if (GroupIndex < BatchSize)
    {
        TraversalCallback.LoadCandidateNodeDataToGroup(NodeIndex, GroupIndex, false);
    }
    GroupMemoryBarrierWithGroupSync();

    ProcessNodeBatch<FNaniteTraversalCallback>(BatchSize, GroupIndex, QueueStateIndex);
}
#endif
#if NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_CLUSTERS
template<typename FNaniteTraversalCallback>
void ClusterCull(uint GroupID, uint GroupIndex, uint QueueStateIndex)
{
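    // Each group culls a fixed NANITE_PERSISTENT_CLUSTER_CULLING_GROUP_SIZE-sized slice of the candidate
    // cluster list produced by node culling, clamped to the number of candidates actually written.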
    const uint ClampedNumCandidateClusters = min(QueueState[0].PassState[QueueStateIndex].ClusterWriteOffset, MaxCandidateClusters);
    const uint ClampedStartIndex = min(GroupID * NANITE_PERSISTENT_CLUSTER_CULLING_GROUP_SIZE, ClampedNumCandidateClusters);
    const uint ClampedEndIndex = min((GroupID + 1) * NANITE_PERSISTENT_CLUSTER_CULLING_GROUP_SIZE, ClampedNumCandidateClusters);
    const uint BatchSize = ClampedEndIndex - ClampedStartIndex;

    ProcessClusterBatch<FNaniteTraversalCallback>(GroupID, BatchSize, GroupIndex);
}
#endif