// UnrealEngine/Engine/Shaders/Private/Nanite/NaniteInstanceHierarchyCulling.usf
// Copyright Epic Games, Inc. All Rights Reserved.
// Do not use shared samplers as it requires the View uniform buffer, which is not bound for this shader.
#define USE_HZB_SHARED_SAMPLERS 0
#include "../Common.ush"
#include "../SceneData.ush"
#include "../WaveOpUtil.ush"
#include "../ComputeShaderUtils.ush"
#include "../SceneCulling/SceneCulling.ush"
#include "../WorkGroupLoadBalancer.ush"
#include "NaniteCullingCommon.ush"
#include "NaniteCulling.ush"
#include "NaniteDataDecode.ush"
#include "NaniteHZBCull.ush"
#if DEBUG_FLAGS
RWStructuredBuffer<FNaniteStats> OutStatsBuffer;
#endif
// Occluded chunks fed back to post-pass.
RWStructuredBuffer<FOccludedChunkDraw> OutOccludedChunkDraws;
RWBuffer<uint> OutOccludedChunkArgs;
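// Indirect args layout: [0] = group count (x dimension), [1] = y, [2] = z,
// [3] = occluded chunk draw count, read back by the post pass as InOccludedChunkArgs.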
void WriteOccludedChunk(uint ChunkId, uint ViewGroupId, uint OccludedViewMask)
{
    uint OccludedChunkOffset = 0;
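    // Bumps the occluded item count in slot [3] and grows the indirect group count in
    // slot [0] in units of 64 (the group size of the post-pass dispatch); the returned
    // offset is this item's slot in OutOccludedChunkDraws.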
    WaveInterlockedAddScalarInGroups(OutOccludedChunkArgs[3], OutOccludedChunkArgs[0], 64, 1, OccludedChunkOffset);
    FOccludedChunkDraw OccDraw;
    OccDraw.ViewGroupId = ViewGroupId;
    OccDraw.OccludedViewMask = OccludedViewMask;
    OccDraw.ChunkId = ChunkId;
    OutOccludedChunkDraws[OccludedChunkOffset] = OccDraw;
}
StructuredBuffer<FViewDrawGroup> InViewDrawRanges;
// Yet another format
// TODO: unify/remove paths
RWStructuredBuffer<FInstanceCullingGroupWork> OutInstanceWorkGroups;
// TODO: this used to share the args with the post-pass instances from the instance culling main pass, but due to a weird bug they are kept separate for now.
// Output indirect dispatch args for the following instance culling pass (x will contain all the groups, which matches the array count above).
RWBuffer<uint> OutInstanceWorkArgs;
uint MaxInstanceWorkGroups;
uint bAllowStaticGeometryPath;
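// Appends one work group for the following instance culling pass. Work groups are
// allocated double-ended from a single buffer: non-static groups grow from the front
// (counter in args[0]), static-geometry groups grow from the rear (counter in args[4]).
// Returns false if the buffer is full and the group had to be dropped.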
bool AppendInstanceCullingWorkGroup(uint ItemChunksOffset, uint ViewGroupId, uint ActiveViewMask, bool bIsStaticSceneGeometry)
{
    uint InstanceGroupsOutOffset = 0;
    BRANCH
    if (bIsStaticSceneGeometry && bAllowStaticGeometryPath)
    {
        WaveInterlockedAddScalar_(OutInstanceWorkArgs[4u], 1u, InstanceGroupsOutOffset);
        // Static-geometry groups are written from the end of the buffer
        InstanceGroupsOutOffset = MaxInstanceWorkGroups - InstanceGroupsOutOffset - 1u;
    }
    else
    {
        WaveInterlockedAddScalar_(OutInstanceWorkArgs[0u], 1u, InstanceGroupsOutOffset);
    }
    if (InstanceGroupsOutOffset < MaxInstanceWorkGroups)
    {
        uint PackedItemChunkDesc = InstanceHierarchyItemChunks[ItemChunksOffset];
        FInstanceCullingGroupWork InstanceCullingGroupWork;
        InstanceCullingGroupWork.ViewGroupId = ViewGroupId;
        InstanceCullingGroupWork.PackedItemChunkDesc = PackedItemChunkDesc;
        InstanceCullingGroupWork.ActiveViewMask = ActiveViewMask;
        OutInstanceWorkGroups[InstanceGroupsOutOffset] = InstanceCullingGroupWork;
        return true;
    }
    return false;
}
void UpdateChunkPreCullStat(uint NumChunks, bool bShouldUpdate)
{
#if DEBUG_FLAGS
    if (bShouldUpdate && (RenderFlags & NANITE_RENDER_FLAG_WRITE_STATS) != 0u)
    {
#if CULLING_PASS == CULLING_PASS_OCCLUSION_POST
        InterlockedAdd(OutStatsBuffer[0].NumPostHierarchyChunksPreCull, NumChunks);
#else
        InterlockedAdd(OutStatsBuffer[0].NumMainHierarchyChunksPreCull, NumChunks);
#endif
    }
#endif
}
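// Per-pass inputs: the occlusion post pass consumes the chunks the main pass found
// occluded; the main pass either consumes cell chunk draws (ParentInfoBuffer) or tests
// all allocated chunks against a list of view group IDs.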
#if CULLING_PASS == CULLING_PASS_OCCLUSION_POST
Buffer<uint> InOccludedChunkArgs;
StructuredBuffer<FOccludedChunkDraw> InOccludedChunkDraws;
#else
StructuredBuffer<FCellChunkDraw> ParentInfoBuffer;
StructuredBuffer<uint> InGroupIds;
uint NumGroupIds;
uint NumParentInfos;
#endif
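// Culls a single chunk's bounds against each view in the view group and appends
// instance culling work for the views that pass. InOccludedViewMask is only non-zero
// in the occlusion post pass, where it selects which views to re-test.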
void ProcessChunk(uint ChunkId, uint ViewGroupId, uint InOccludedViewMask)
{
    FViewDrawGroup ViewDrawGroup = InViewDrawRanges[ViewGroupId];
    uint CellId = ExplicitChunkCellIds[ChunkId];
    FSceneHiearchyCellData CellData = GetSceneHiearchyCellData(CellId);
    const float LevelCellSize = CellData.BlockData.LevelCellSize;
    FInstanceChunkAttributes ChunkAttributes = LoadInstanceChunkAttributes(ChunkId, LevelCellSize * 2.0f, LevelCellSize * 0.5f);
    FCellHeader CellHeader = GetCellHeader(CellId);
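    // Chunks within a cell are stored with the static chunks first, so a chunk is
    // static scene geometry if its index relative to the cell start is below the
    // static chunk count.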
    const bool bIsStaticSceneGeometry = (ChunkId - CellHeader.ItemChunksOffset) < CellHeader.NumStaticChunks;
#if VIRTUAL_TEXTURE_TARGET
    // Early out if none of the instances in the chunk cast shadows
    if ((ChunkAttributes.AnyFlags & PCAF_ANY_CAST_SHADOW) == 0u)
    {
        return;
    }
#endif
    FAabb CellRelativeBounds = ChunkAttributes.Aabb;
    float3 ImplicitBoundsMin = float3(CellData.LocalCellCoord) * LevelCellSize - (LevelCellSize * 0.5f).xxx;
    float3 LocalBoundsCenter = (CellRelativeBounds.Min + CellRelativeBounds.Max) * 0.5f + ImplicitBoundsMin;
    float3 LocalBoundsExtent = (CellRelativeBounds.Max - CellRelativeBounds.Min) * 0.5f;
    float2 InstanceDrawDistanceMinMaxSquared = ChunkAttributes.InstanceDrawDistanceMinMaxSquared;
    bool bHasDrawDistance = InstanceDrawDistanceMinMaxSquared.x < InstanceDrawDistanceMinMaxSquared.y;
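    // A chunk-level draw distance is only applied when the chunk encodes a non-empty
    // range (min squared < max squared); otherwise distance culling is skipped.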
#if CULLING_PASS == CULLING_PASS_OCCLUSION_MAIN
    uint OutOccludedViewMask = 0U;
#endif
    uint ActiveViewMask = 0U;
    // There is no explicit handling of mip views here: they are expected to come in a compact range (post view compaction, or from the host for non-VSM draws)
    for (uint ViewIndex = 0; ViewIndex < ViewDrawGroup.NumViews; ++ViewIndex)
    {
#if CULLING_PASS == CULLING_PASS_OCCLUSION_POST
        // TODO: loop over set bits instead!
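        // A possible shape for that (sketch, not the current implementation): visit only
        // the set bits of the mask instead of scanning the whole view range, e.g.
        //   uint Remaining = InOccludedViewMask;
        //   while (Remaining != 0u)
        //   {
        //       uint ViewIndex = firstbitlow(Remaining);
        //       Remaining &= Remaining - 1u; // clear the lowest set bit
        //       // ... cull view ViewIndex as below ...
        //   }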
        if ((InOccludedViewMask & (1U << ViewIndex)) == 0U)
        {
            continue;
        }
#endif
        uint ViewId = ViewDrawGroup.FirstView + ViewIndex;
        FNaniteView NaniteView = GetNaniteView(ViewId);
        // Depth clipping should only be disabled with orthographic projections
        const bool bIsOrtho = IsOrthoProjection(NaniteView.ViewToClip);
        const bool bNearClip = (NaniteView.Flags & NANITE_VIEW_FLAG_NEAR_CLIP) != 0u;
        const bool bViewHZB = (NaniteView.Flags & NANITE_VIEW_FLAG_HZBTEST) != 0u;
        const bool bIsViewUncached = (NaniteView.Flags & NANITE_VIEW_FLAG_UNCACHED) != 0u;
        // TODO: Move out of the loop; logically all views in a group should share the pre-view translation
        // TODO: Make the view compaction pull this data out of the view and store it with the group?
        float4x4 LocalToTranslatedWorld = MakeTranslationMatrix(DFFastToTranslatedWorld(CellData.BlockData.WorldPos, NaniteView.PreViewTranslation));
        float4x4 PrevLocalToTranslatedWorld = MakeTranslationMatrix(DFFastToTranslatedWorld(CellData.BlockData.WorldPos, NaniteView.PrevPreViewTranslation));
        FBoxCull Cull;
        Cull.Init(NaniteView, LocalBoundsCenter, LocalBoundsExtent, float4(1.0f, 1.0f, 1.0f, 1.0f), LocalToTranslatedWorld, PrevLocalToTranslatedWorld);
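        // In the post pass the chunk already passed frustum and global clip plane culling
        // in the main pass, so those tests are skipped and only the HZB test is redone.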
        if (CULLING_PASS == CULLING_PASS_OCCLUSION_POST)
        {
            Cull.bSkipCullFrustum = true;
            Cull.bSkipCullGlobalClipPlane = true;
        }
#if CULLING_PASS == CULLING_PASS_OCCLUSION_MAIN
        Cull.DistanceConservative(bHasDrawDistance, InstanceDrawDistanceMinMaxSquared.x, InstanceDrawDistanceMinMaxSquared.y);
        Cull.GlobalClipPlane();
#endif
        BRANCH
        if (Cull.bIsVisible)
        {
#if VIRTUAL_TEXTURE_TARGET
            Cull.PageFlagMask = VSM_FLAG_ANY_UNCACHED;
            // Using the static HZB and disabling any dynamic depth culling is conservative while
            // we traverse the hierarchy and we don't yet know what static/dynamic geometry is below
            Cull.bIsStaticGeometry = true;
            // If we're rendering into the static cache, it's not safe to use the receiver mask as we may cache that (full) page.
            // Since at this point in the hierarchy there could still be static instances below, it's only safe to use
            // the receiver mask if we know the whole view is uncached.
            Cull.bUseReceiverMask = Cull.bUseReceiverMask && bIsViewUncached;
#endif
            // Note: bClampToPageLevel == true means we might test rather large footprints in the HZB, leading to load imbalance
            // TODO: rebalance the work in the workgroup? Spawn more work groups / cell? Implement the top (virtual space) HZB hierarchy?
            const bool bClampToPageLevel = false;
            Cull.FrustumHZB(bClampToPageLevel);
        }
#if CULLING_PASS == CULLING_PASS_OCCLUSION_MAIN
        if (Cull.bWasOccluded)
        {
            OutOccludedViewMask |= 1U << ViewIndex;
        }
#endif
        if (Cull.bIsVisible && !Cull.bWasOccluded)
        {
            ActiveViewMask |= 1U << ViewIndex;
        }
    }
#if CULLING_PASS == CULLING_PASS_OCCLUSION_MAIN
    if (OutOccludedViewMask != 0u)
    {
        WriteOccludedChunk(ChunkId, ViewGroupId, OutOccludedViewMask);
    }
#endif
    if (ActiveViewMask != 0u)
    {
        AppendInstanceCullingWorkGroup(ChunkId, ViewGroupId, ActiveViewMask, bIsStaticSceneGeometry);
    }
}
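// Main-pass entry point fed by cell chunk draws: the work group load balancer spreads
// the chunks of each cell draw across 64-wide groups, one child index per chunk.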
#ifdef InstanceHierarchyCellChunkCull_CS
[numthreads(64, 1, 1)]
void InstanceHierarchyCellChunkCull_CS(uint3 GroupId : SV_GroupID, uint GroupThreadIndex : SV_GroupIndex)
{
    FWorkGroupSetup WorkGroupSetup = WorkGroupLoadBalancer_Setup(GroupId, GroupThreadIndex);
    if (!WorkGroupSetup.bValid)
    {
        return;
    }
    UpdateChunkPreCullStat(WorkGroupLoadBalancer_GetNumItems(), WorkGroupSetup.DispatchThreadId == 0u);
    // Decode the payloads into a cell draw index: the work group's base payload plus the item's offset
    uint CellDrawIndex = WorkGroupSetup.WorkGroupInfo.Payload + WorkGroupSetup.Item.Payload;
    FCellChunkDraw CellDraw = ParentInfoBuffer[CellDrawIndex];
    ProcessChunk(CellDraw.ItemChunksOffset + WorkGroupSetup.ChildIndex, CellDraw.ViewGroupId, 0u);
}
#endif // InstanceHierarchyCellChunkCull_CS
#ifdef InstanceHierarchyChunkCull_CS
[numthreads(64, 1, 1)]
void InstanceHierarchyChunkCull_CS(uint3 DispatchThreadID : SV_DispatchThreadID)
{
#if CULLING_PASS == CULLING_PASS_OCCLUSION_POST
    uint NumChunkDraws = InOccludedChunkArgs[3];
    if (DispatchThreadID.x >= NumChunkDraws)
    {
        return;
    }
    FOccludedChunkDraw OccDraw = InOccludedChunkDraws[DispatchThreadID.x];
    const uint ChunkId = OccDraw.ChunkId;
    const uint ViewGroupId = OccDraw.ViewGroupId;
    const uint InOccludedViewMask = OccDraw.OccludedViewMask;
    UpdateChunkPreCullStat(NumChunkDraws, all(DispatchThreadID == 0u));
#else
    // Exit if out of range
    if (DispatchThreadID.y >= NumGroupIds || DispatchThreadID.x >= NumAllocatedChunks)
    {
        return;
    }
    UpdateChunkPreCullStat(NumGroupIds * NumAllocatedChunks, all(DispatchThreadID == 0u));
    const uint ChunkId = DispatchThreadID.x;
    // Early out if the chunk is not allocated / used by any cell
    if (!IsChunkUsed(ChunkId))
    {
        return;
    }
    const uint ViewGroupId = InGroupIds[DispatchThreadID.y];
    const uint InOccludedViewMask = 0u;
#endif
    ProcessChunk(ChunkId, ViewGroupId, InOccludedViewMask);
}
#endif // InstanceHierarchyChunkCull_CS
// Store the uncullable in the regular cell draw array, at the end.
uint NumViewDrawGroups;
uint UncullableItemChunksOffset;
uint UncullableNumItemChunks;
// One thread per uncullable chunk in the X dimension * view draw groups in the Y dimension.
[numthreads(64, 1, 1)]
void AppendUncullableInstanceWork(uint3 DispatchThreadID : SV_DispatchThreadID)
{
    const uint ChunkIndex = DispatchThreadID.x;
    const uint ViewGroupId = DispatchThreadID.y;
    if (ChunkIndex < UncullableNumItemChunks)
    {
        FViewDrawGroup ViewDrawGroup = InViewDrawRanges[ViewGroupId];
        if (ViewDrawGroup.NumViews > 0u)
        {
            uint ActiveViewMask = (1u << ViewDrawGroup.NumViews) - 1u;
            AppendInstanceCullingWorkGroup(UncullableItemChunksOffset + ChunkIndex, ViewGroupId, ActiveViewMask, false);
        }
    }
}
RWBuffer< uint > OutInstanceWorkArgs0;
RWBuffer< uint > OutInstanceWorkArgs1;
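// Each args buffer holds two sets of indirect dispatch args:
//   [0] non-static group count (indirect x), [1] = 1, [2] = 1, [3] instance count
//   [4] static group count,                  [5] = 1, [6] = 1, [7] instance count
// Non-static groups fill OutInstanceWorkGroups from the front and static groups fill
// it from the rear (see AppendInstanceCullingWorkGroup).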
[numthreads(1, 1, 1)]
void InitArgs()
{
    OutInstanceWorkArgs0[0] = 0; // group count == x dimension
    OutInstanceWorkArgs0[1] = 1;
    OutInstanceWorkArgs0[2] = 1;
    OutInstanceWorkArgs0[3] = 0; // instance count
    // Second set of args for the static-geometry groups growing from the rear
    OutInstanceWorkArgs0[4] = 0;
    OutInstanceWorkArgs0[5] = 1;
    OutInstanceWorkArgs0[6] = 1;
    OutInstanceWorkArgs0[7] = 0;
#if OCCLUSION_CULLING
    // Occluded chunks
    OutOccludedChunkArgs[0] = 0; // group count == x dimension
    OutOccludedChunkArgs[1] = 1;
    OutOccludedChunkArgs[2] = 1;
    OutOccludedChunkArgs[3] = 0; // item count
    OutInstanceWorkArgs1[0] = 0; // group count == x dimension
    OutInstanceWorkArgs1[1] = 1;
    OutInstanceWorkArgs1[2] = 1;
    OutInstanceWorkArgs1[3] = 0; // instance count
    // Second set of args for the static-geometry groups growing from the rear
    OutInstanceWorkArgs1[4] = 0;
    OutInstanceWorkArgs1[5] = 1;
    OutInstanceWorkArgs1[6] = 1;
    OutInstanceWorkArgs1[7] = 0;
#endif
}
#ifdef InstanceHierarchySanitizeInstanceArgsCS
RWBuffer< uint > InOutInstanceWorkArgs;
uint GroupWorkArgsMaxCount;
[numthreads(1, 1, 1)]
void InstanceHierarchySanitizeInstanceArgsCS()
{
    InOutInstanceWorkArgs[0] = min(InOutInstanceWorkArgs[0], GroupWorkArgsMaxCount);
    // Clamp the number of static-geo work groups (thread arg)
    uint ClampedNumStatic = min(InOutInstanceWorkArgs[4], GroupWorkArgsMaxCount);
    InOutInstanceWorkArgs[4] = ClampedNumStatic;
}
#endif