UnrealEngine/Engine/Shaders/Private/Nanite/NaniteHierarchyTraversal.ush

// Copyright Epic Games, Inc. All Rights Reserved.
#pragma once
#include "NaniteDataDecode.ush"
#include "NaniteHierarchyTraversalCommon.ush"
#include "../Common.ush"
#include "../WaveOpUtil.ush"
#include "../ComputeShaderUtils.ush"
#ifndef NANITE_HIERARCHY_TRAVERSAL_TYPE
# error "NANITE_HIERARCHY_TRAVERSAL_TYPE must be defined."
#endif
#ifndef CHECK_AND_TRIM_CLUSTER_COUNT
# define CHECK_AND_TRIM_CLUSTER_COUNT 0
#endif
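
// Per-group staging of candidate node data. GROUP_NODE_SIZE selects how many uints are stored per node in
// group shared memory (2 or 3); GetGroupNodeData/SetGroupNodeData pad or truncate to a uint4 so the traversal
// code below can treat both layouts uniformly.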
#if GROUP_NODE_SIZE == 2
groupshared uint2 GroupNodeData[NANITE_MAX_BVH_NODES_PER_GROUP];
uint4 GetGroupNodeData(uint NodeIndex) { return uint4(GroupNodeData[NodeIndex], 0, 0); }
void SetGroupNodeData(uint NodeIndex, uint4 Data) { GroupNodeData[NodeIndex] = Data.xy; }
#elif GROUP_NODE_SIZE == 3
groupshared uint3 GroupNodeData[NANITE_MAX_BVH_NODES_PER_GROUP];
uint4 GetGroupNodeData(uint NodeIndex) { return uint4(GroupNodeData[NodeIndex], 0); }
void SetGroupNodeData(uint NodeIndex, uint4 Data) { GroupNodeData[NodeIndex] = Data.xyz; }
#else
# error "Unexpected Group Node Size."
#endif
#if NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_PERSISTENT_NODES_AND_CLUSTERS
RWCoherentStructuredBuffer(FQueueState) QueueState;
#else
RWStructuredBuffer<FQueueState> QueueState;
#endif
#if NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_NODES
Buffer<uint> CurrentNodeIndirectArgs;
RWBuffer<uint> NextNodeIndirectArgs;
uint NodeLevel;
#endif
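
// Group shared state used to coordinate a thread group's batch processing: candidate node counts and write
// offsets, the current node/cluster batch ranges, the per-node ready mask and a snapshot of the global node count.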
groupshared uint GroupNumCandidateNodes;
groupshared uint GroupCandidateNodesOffset;
groupshared int GroupNodeCount;
groupshared uint GroupNodeBatchStartIndex;
groupshared uint GroupNodeMask;
groupshared uint GroupClusterBatchStartIndex;
groupshared uint GroupClusterBatchReadySize;

// We need to manually strip unused template functions here due to a compiler issue: https://github.com/microsoft/DirectXShaderCompiler/issues/4649
#if NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_PERSISTENT_NODES_AND_CLUSTERS || NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_NODES
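// The FNaniteTraversalCallback template argument supplies the pass-specific behavior. A rough sketch of the
// interface it is expected to provide, inferred from the call sites in this file (parameter names and exact
// types are guesses, not an authoritative declaration; concrete callbacks are defined by the including shaders):
//
//    struct FNaniteTraversalCallbackExample
//    {
//        void  Init(uint ChildIndex, uint LocalNodeIndex, uint FetchIndex);
//        uint  GetHierarchyNodeOffset();
//        bool  ShouldVisitChild(FHierarchyNodeSlice HierarchyNodeSlice, bool bVisible);
//        void  OnPreProcessNodeBatch(uint GroupIndex);
//        void  OnPostNodeVisit(FHierarchyNodeSlice HierarchyNodeSlice);
//        void  StoreChildNode(uint StoreIndex, FHierarchyNodeSlice HierarchyNodeSlice);
//        void  StoreCluster(uint StoreIndex, FHierarchyNodeSlice HierarchyNodeSlice, uint ClusterIndex);
//        bool  LoadCandidateNodeDataToGroup(uint NodeIndex, uint GroupIndex, bool bCheckReady = true); // called with 2 or 3 arguments below
//        void  ClearCandidateNodeData(uint NodeIndex);
//        void  AddToClusterBatch(uint BatchIndex, uint Num);
//        void  ClearClusterBatch(uint BatchIndex);
//        uint  LoadClusterBatch(uint BatchIndex);
//        uint4 LoadPackedCluster(uint CandidateIndex);
//        void  ProcessCluster(uint4 PackedCluster);
//    };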
template<typename FNaniteTraversalCallback>
void ProcessNodeBatch(uint BatchSize, uint GroupIndex, uint QueueStateIndex)
{
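    // Each group handles up to NANITE_MAX_BVH_NODES_PER_GROUP nodes, with NANITE_MAX_BVH_NODE_FANOUT
    // consecutive threads per node (one thread per potential child). FetchIndex is clamped so threads whose
    // LocalNodeIndex falls outside BatchSize still perform an in-bounds (redundant) fetch; their results are
    // discarded by the LocalNodeIndex >= BatchSize check below.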
    const uint LocalNodeIndex = (GroupIndex >> NANITE_MAX_BVH_NODE_FANOUT_BITS);
    const uint ChildIndex = GroupIndex & NANITE_MAX_BVH_NODE_FANOUT_MASK;
    const uint FetchIndex = min(LocalNodeIndex, BatchSize - 1);

    FNaniteTraversalCallback TraversalCallback;
    TraversalCallback.Init(ChildIndex, LocalNodeIndex, FetchIndex);

    const FHierarchyNodeSlice HierarchyNodeSlice = GetHierarchyNodeSlice(TraversalCallback.GetHierarchyNodeOffset(), ChildIndex);

    bool bVisible = HierarchyNodeSlice.bEnabled;
    bool bLoaded = HierarchyNodeSlice.bLoaded;
    if (LocalNodeIndex >= BatchSize)
    {
        bVisible = false;
    }

    bVisible = TraversalCallback.ShouldVisitChild(HierarchyNodeSlice, bVisible);

    uint CandidateNodesOffset = 0;
    const bool bOutputChild = bVisible && bLoaded;

    BRANCH
    if (bOutputChild && !HierarchyNodeSlice.bLeaf)
    {
        WaveInterlockedAddScalar_(GroupNumCandidateNodes, 1, CandidateNodesOffset);
    }

    GroupMemoryBarrierWithGroupSync();
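
    // Thread 0 reserves output space for the whole group with a single atomic on the global queue state
    // (or on the next level's indirect args in the per-level node culling path), using the group-wide
    // candidate count accumulated above.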
    if (GroupIndex == 0)
    {
        InterlockedAdd(QueueState[0].PassState[QueueStateIndex].NodeWriteOffset, GroupNumCandidateNodes, GroupCandidateNodesOffset);
#if NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_PERSISTENT_NODES_AND_CLUSTERS
        InterlockedAdd(QueueState[0].PassState[QueueStateIndex].NodeCount, (int)GroupNumCandidateNodes); // NodeCount needs to be conservative, so child count is added before the actual children.
#else
        uint NumNodes = 0;
        InterlockedAdd(NextNodeIndirectArgs[(NodeLevel + 1) * NANITE_NODE_CULLING_ARG_COUNT + 3], GroupNumCandidateNodes, NumNodes);
        NumNodes += GroupNumCandidateNodes;
        const uint NumGroups = (NumNodes + NANITE_MAX_BVH_NODES_PER_GROUP - 1) / NANITE_MAX_BVH_NODES_PER_GROUP;
        InterlockedMax(NextNodeIndirectArgs[(NodeLevel + 1) * NANITE_NODE_CULLING_ARG_COUNT + 0], NumGroups);
#endif
    }

#if NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_PERSISTENT_NODES_AND_CLUSTERS
    AllMemoryBarrierWithGroupSync();
#else
    GroupMemoryBarrierWithGroupSync();
#endif

    // GPU might not be filled, so latency is important here. Kick new jobs as soon as possible.
    if (bOutputChild && !HierarchyNodeSlice.bLeaf)
    {
        CandidateNodesOffset += GroupCandidateNodesOffset;

        if (CandidateNodesOffset < MaxNodes)
        {
            TraversalCallback.StoreChildNode(CandidateNodesOffset, HierarchyNodeSlice);
        }
    }

#if NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_PERSISTENT_NODES_AND_CLUSTERS
    DeviceMemoryBarrierWithGroupSync();
#endif

    // Continue with remaining independent work
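    // Leaf children reference a run of clusters: space is reserved in the candidate cluster list and
    // one candidate entry is written per cluster (optionally trimmed against MaxCandidateClusters first).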
    if (bOutputChild && HierarchyNodeSlice.bLeaf)
    {
        uint NumClusters = HierarchyNodeSlice.NumChildren;

#if CHECK_AND_TRIM_CLUSTER_COUNT
        uint ClusterIndex = 0;
        WaveInterlockedAdd_(QueueState[0].TotalClusters, NumClusters, ClusterIndex);

        // Trim any clusters above MaxCandidateClusters
        const uint ClusterIndexEnd = min(ClusterIndex + NumClusters, MaxCandidateClusters);
        NumClusters = (uint)max((int)ClusterIndexEnd - (int)ClusterIndex, 0);
#endif

        uint CandidateClustersOffset = 0;
        WaveInterlockedAdd_(QueueState[0].PassState[QueueStateIndex].ClusterWriteOffset, NumClusters, CandidateClustersOffset);

        const uint BaseClusterIndex = HierarchyNodeSlice.ChildStartReference & NANITE_MAX_CLUSTERS_PER_PAGE_MASK;
        const uint StartIndex = CandidateClustersOffset;
        const uint EndIndex = min(CandidateClustersOffset + NumClusters, MaxCandidateClusters);

        for (uint Index = StartIndex; Index < EndIndex; Index++)
        {
            TraversalCallback.StoreCluster(Index, HierarchyNodeSlice, BaseClusterIndex + (Index - StartIndex));
        }

#if NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_PERSISTENT_NODES_AND_CLUSTERS
        DeviceMemoryBarrier();

        // Once the cluster indices have been committed to memory, we can update the cluster counters of the overlapping cluster batches.
        for (uint Index = StartIndex; Index < EndIndex;)
        {
            const uint BatchIndex = Index / NANITE_PERSISTENT_CLUSTER_CULLING_GROUP_SIZE;
            const uint NextIndex = (Index & ~(NANITE_PERSISTENT_CLUSTER_CULLING_GROUP_SIZE - 1u)) + NANITE_PERSISTENT_CLUSTER_CULLING_GROUP_SIZE;
            const uint MaxIndex = min(NextIndex, EndIndex);
            const uint Num = MaxIndex - Index;
            TraversalCallback.AddToClusterBatch(BatchIndex, Num);
            Index = NextIndex;
        }
#endif
    }

#if NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_PERSISTENT_NODES_AND_CLUSTERS
    DeviceMemoryBarrierWithGroupSync();

    if (GroupIndex == 0)
    {
        // Done writing clusters/nodes for current pass. Subtract us from NodeCount.
        InterlockedAdd(QueueState[0].PassState[QueueStateIndex].NodeCount, -(int)BatchSize);
    }
#endif

    TraversalCallback.OnPostNodeVisit(HierarchyNodeSlice);
}
#endif
#if NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_PERSISTENT_NODES_AND_CLUSTERS || NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_CLUSTERS
template<typename FNaniteTraversalCallback>
void ProcessClusterBatch(uint BatchStartIndex, uint BatchSize, uint GroupIndex)
{
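    // One thread per candidate cluster in the batch; threads beyond BatchSize do nothing.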
    FNaniteTraversalCallback TraversalCallback;

    if (GroupIndex < BatchSize)
    {
        const uint CandidateIndex = BatchStartIndex * NANITE_PERSISTENT_CLUSTER_CULLING_GROUP_SIZE + GroupIndex;
        const uint4 PackedCluster = TraversalCallback.LoadPackedCluster(CandidateIndex);
        TraversalCallback.ProcessCluster(PackedCluster);
    }

#if NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_PERSISTENT_NODES_AND_CLUSTERS
    // Clear batch so the buffer is cleared for next pass.
    TraversalCallback.ClearClusterBatch(BatchStartIndex);
#endif
}
#endif

// Persistent threads culling shader
//
// This shader is responsible for the recursive culling and traversal of the per-mesh cluster hierarchies.
// It is also responsible for culling the triangle clusters found during this traversal and producing
// lists of visible clusters for rasterization. Clusters are binned for SW or HW rasterization based on
// screen-projected size.
//
// Mapping tree-culling to the GPU is awkward, as the number of leaf nodes that need to be accepted
// is dynamic and can be anywhere from none to hundreds of thousands. Mapping threads 1:1 to trees can result in
// extremely long serial processing that severely underutilizes the GPU. Conversely, mapping threads 1:1 to
// leaf nodes can end up leaving most threads idle.
//
// What we really need is the ability to dynamically spawn threads for children as they are determined
// to be visible during the traversal. This is unfortunately not possible (yet), so instead we use
// persistent threads. We spawn just enough worker threads to fill the GPU, keep them running and manually
// distribute work to them.
//
// For this we use a simple MPMC job queue. Instead of spawning new threads, new jobs are added to the queue
// to be consumed by workers. Initially the queue is populated with work generated by preceding shader passes.
// While the persistent shader is running, its workers continuously consume items from the queue, but also
// produce new ones: when BVH nodes are processed, they append work items to the queue for each of their
// visible children. Clusters are only consumed and never generate new work. The workers keep running until
// the queue is empty. At that point the trees have been recursively traversed and all the relevant clusters
// have been processed.
//
// A given node can't be processed before all of its ancestors have been processed. This forms a dependency chain
// of jobs that can be as long as the height of the tree. Especially for small or moderately-sized scenes, the total
// latency from these dependent jobs can end up determining the total BVH processing time, leaving the GPU underutilized.
// This issue is one of the main motivations for handling the cluster culling in the same shader:
// workers always favor node processing to make progress on the critical path, but when no nodes are available
// they process clusters while waiting, instead of going idle.
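//
// The FQueueState members (and its per-pass PassState entries) referenced below act as the queue bookkeeping:
// NodeReadOffset / NodeWriteOffset are the consumer/producer cursors of the node queue, ClusterWriteOffset is
// the producer cursor of the candidate cluster list while ClusterBatchReadOffset hands out whole cluster batches
// to consumers, NodeCount conservatively tracks how many nodes are still in flight (used to detect termination),
// and TotalClusters is only touched by the optional CHECK_AND_TRIM_CLUSTER_COUNT path.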
#if NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_PERSISTENT_NODES_AND_CLUSTERS
template<typename FNaniteTraversalCallback>
void PersistentNodeAndClusterCull(uint GroupIndex, uint QueueStateIndex)
{
    FNaniteTraversalCallback TraversalCallback;

    bool bProcessNodes = true; // Should we still try to consume nodes?
    uint NodeBatchReadyOffset = NANITE_MAX_BVH_NODES_PER_GROUP;
    uint NodeBatchStartIndex = 0;
    uint ClusterBatchStartIndex = 0xFFFFFFFFu;
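
    // Persistent worker loop: each iteration first tries to grab and process a batch of ready nodes and only
    // falls back to a cluster batch when none are ready, so progress on the node dependency chain is always
    // preferred. The loop exits once node processing has been disabled and no cluster batches remain.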
    while (true)
    {
        GroupMemoryBarrierWithGroupSync(); // Make sure we are done reading from group shared memory.

        if (GroupIndex == 0)
        {
            GroupNumCandidateNodes = 0;
            GroupNodeMask = 0;
        }
        TraversalCallback.OnPreProcessNodeBatch(GroupIndex);
        GroupMemoryBarrierWithGroupSync();

        uint NodeReadyMask = 0;
        if (bProcessNodes) // Try grabbing and processing nodes if they could be available.
        {
            if (NodeBatchReadyOffset == NANITE_MAX_BVH_NODES_PER_GROUP)
            {
                // No more data in current batch. Grab a new batch.
                if (GroupIndex == 0)
                {
                    InterlockedAdd(QueueState[0].PassState[QueueStateIndex].NodeReadOffset, NANITE_MAX_BVH_NODES_PER_GROUP, GroupNodeBatchStartIndex);
                }
                GroupMemoryBarrierWithGroupSync();

                NodeBatchReadyOffset = 0;
                NodeBatchStartIndex = GroupNodeBatchStartIndex;

                if (NodeBatchStartIndex >= MaxNodes)
                {
                    // This node range is out of bounds, and so is any future range: stop looking for nodes.
                    bProcessNodes = false;
                    continue;
                }
            }

            // Check which nodes in the range have been completely written and are ready for processing.
            const uint NodeIndex = NodeBatchStartIndex + NodeBatchReadyOffset + GroupIndex;
            bool bNodeReady = (NodeBatchReadyOffset + GroupIndex < NANITE_MAX_BVH_NODES_PER_GROUP);
            if (bNodeReady)
            {
                bNodeReady = TraversalCallback.LoadCandidateNodeDataToGroup(NodeIndex, GroupIndex);
            }

            if (bNodeReady)
            {
                InterlockedOr(GroupNodeMask, 1u << GroupIndex);
            }
            AllMemoryBarrierWithGroupSync();
            NodeReadyMask = GroupNodeMask;

            // Process nodes if at least the first one is ready.
            if (NodeReadyMask & 1u)
            {
                uint BatchSize = firstbitlow(~NodeReadyMask);
                ProcessNodeBatch<FNaniteTraversalCallback>(BatchSize, GroupIndex, QueueStateIndex);

                if (GroupIndex < BatchSize)
                {
                    // Clear processed element so we leave the buffer cleared for next pass.
                    TraversalCallback.ClearCandidateNodeData(NodeIndex);
                }

                NodeBatchReadyOffset += BatchSize;
                continue;
            }
        }

        // No nodes were ready. Process clusters instead.

        // Grab a range of clusters, if we don't already have one.
        if (ClusterBatchStartIndex == 0xFFFFFFFFu)
        {
            if (GroupIndex == 0)
            {
                InterlockedAdd(QueueState[0].PassState[QueueStateIndex].ClusterBatchReadOffset, 1, GroupClusterBatchStartIndex);
            }
            GroupMemoryBarrierWithGroupSync();
            ClusterBatchStartIndex = GroupClusterBatchStartIndex;
        }

        if (!bProcessNodes && GroupClusterBatchStartIndex >= GetMaxClusterBatches())
            break; // Has to be break instead of return to make FXC happy.

        if (GroupIndex == 0)
        {
            GroupNodeCount = QueueState[0].PassState[QueueStateIndex].NodeCount;
            GroupClusterBatchReadySize = TraversalCallback.LoadClusterBatch(ClusterBatchStartIndex);
        }
        GroupMemoryBarrierWithGroupSync();

        uint ClusterBatchReadySize = GroupClusterBatchReadySize;
        if (!bProcessNodes && ClusterBatchReadySize == 0) // No more clusters to process and no nodes are left to produce more. We are done.
            break; // Has to be break instead of return to make FXC happy.

        if ((bProcessNodes && ClusterBatchReadySize == NANITE_PERSISTENT_CLUSTER_CULLING_GROUP_SIZE) || (!bProcessNodes && ClusterBatchReadySize > 0))
        {
            ProcessClusterBatch<FNaniteTraversalCallback>(ClusterBatchStartIndex, ClusterBatchReadySize, GroupIndex);
            ClusterBatchStartIndex = 0xFFFFFFFFu;
        }

        if (bProcessNodes && GroupNodeCount == 0)
        {
            bProcessNodes = false;
        }
    }
}
#endif
#if NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_NODES
groupshared uint GroupReadOffset;
template<typename FNaniteTraversalCallback>
void NodeCull(uint GroupID, uint GroupIndex, uint QueueStateIndex)
{
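    // Non-persistent variant: one dispatch per BVH level, driven by indirect args. Each group culls a
    // NANITE_MAX_BVH_NODES_PER_GROUP-sized slice of the current level's node range. As used here, each level's
    // argument block (stride NANITE_NODE_CULLING_ARG_COUNT) appears to hold the dispatch group count in slot 0,
    // the node count in slot 3 and the level's start index in slot 4; ProcessNodeBatch appends visible children
    // into the next level's block.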
    const uint NumNodes = CurrentNodeIndirectArgs[NodeLevel * NANITE_NODE_CULLING_ARG_COUNT + 3];
    const uint LevelStartIndex = CurrentNodeIndirectArgs[NodeLevel * NANITE_NODE_CULLING_ARG_COUNT + 4];
    const uint LevelEndIndex = min(LevelStartIndex + NumNodes, MaxNodes);

    const uint StartIndex = LevelStartIndex + GroupID * NANITE_MAX_BVH_NODES_PER_GROUP;
    const uint ClampedStartIndex = min(StartIndex, LevelEndIndex);
    const uint ClampedEndIndex = min(StartIndex + NANITE_MAX_BVH_NODES_PER_GROUP, LevelEndIndex);
    uint BatchSize = ClampedEndIndex - ClampedStartIndex;

    if (GroupIndex == 0)
    {
        GroupNumCandidateNodes = 0;

        if (GroupID == 0)
        {
            // First group updates start index for next level
            NextNodeIndirectArgs[(NodeLevel + 1) * NANITE_NODE_CULLING_ARG_COUNT + 4] = LevelEndIndex;
        }
    }
    GroupMemoryBarrierWithGroupSync();

    const uint NodeIndex = ClampedStartIndex + GroupIndex;

    FNaniteTraversalCallback TraversalCallback;
    TraversalCallback.OnPreProcessNodeBatch(GroupIndex);

    if (GroupIndex < BatchSize)
    {
        TraversalCallback.LoadCandidateNodeDataToGroup(NodeIndex, GroupIndex, false);
    }
    GroupMemoryBarrierWithGroupSync();

    ProcessNodeBatch<FNaniteTraversalCallback>(BatchSize, GroupIndex, QueueStateIndex);
}
#endif
#if NANITE_HIERARCHY_TRAVERSAL_TYPE == NANITE_CULLING_TYPE_CLUSTERS
template<typename FNaniteTraversalCallback>
void ClusterCull(uint GroupID, uint GroupIndex, uint QueueStateIndex)
{
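    // Each group culls a fixed NANITE_PERSISTENT_CLUSTER_CULLING_GROUP_SIZE-sized slice of the candidate
    // cluster list produced by node culling, clamped to the number of candidates actually written.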
    const uint ClampedNumCandidateClusters = min(QueueState[0].PassState[QueueStateIndex].ClusterWriteOffset, MaxCandidateClusters);
    const uint ClampedStartIndex = min(GroupID * NANITE_PERSISTENT_CLUSTER_CULLING_GROUP_SIZE, ClampedNumCandidateClusters);
    const uint ClampedEndIndex = min((GroupID + 1) * NANITE_PERSISTENT_CLUSTER_CULLING_GROUP_SIZE, ClampedNumCandidateClusters);
    const uint BatchSize = ClampedEndIndex - ClampedStartIndex;

    ProcessClusterBatch<FNaniteTraversalCallback>(GroupID, BatchSize, GroupIndex);
}
#endif