// UnrealEngine/Engine/Shaders/Private/Nanite/NaniteInstanceHierarchyCulling.usf
// Copyright Epic Games, Inc. All Rights Reserved.
// Do not use shared samplers as it requires the View uniform buffer, which is not bound for this shader.
#define USE_HZB_SHARED_SAMPLERS 0
#include "../Common.ush"
#include "../SceneData.ush"
#include "../WaveOpUtil.ush"
#include "../ComputeShaderUtils.ush"
#include "../SceneCulling/SceneCulling.ush"
#include "../WorkGroupLoadBalancer.ush"
#include "NaniteCullingCommon.ush"
#include "NaniteCulling.ush"
#include "NaniteDataDecode.ush"
#include "NaniteHZBCull.ush"
#if DEBUG_FLAGS
RWStructuredBuffer<FNaniteStats> OutStatsBuffer;
#endif
// Occluded chunks fed back to post-pass.
RWStructuredBuffer<FOccludedChunkDraw> OutOccludedChunkDraws;
RWBuffer<uint> OutOccludedChunkArgs;
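// Indirect args layout: [0] = group count (x dimension), [1] = y, [2] = z,
// [3] = occluded chunk draw count, read back by the post pass as InOccludedChunkArgs.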
void WriteOccludedChunk(uint ChunkId, uint ViewGroupId, uint OccludedViewMask)
{
    uint OccludedChunkOffset = 0;
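    // Bumps the occluded item count in slot [3] and grows the indirect group count in
    // slot [0] in units of 64 (the group size of the post-pass dispatch); the returned
    // offset is this item's slot in OutOccludedChunkDraws.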
    WaveInterlockedAddScalarInGroups(OutOccludedChunkArgs[3], OutOccludedChunkArgs[0], 64, 1, OccludedChunkOffset);
    FOccludedChunkDraw OccDraw;
    OccDraw.ViewGroupId = ViewGroupId;
    OccDraw.OccludedViewMask = OccludedViewMask;
    OccDraw.ChunkId = ChunkId;
    OutOccludedChunkDraws[OccludedChunkOffset] = OccDraw;
}
StructuredBuffer<FViewDrawGroup> InViewDrawRanges;
// Yet another format
// TODO: unify/remove paths
RWStructuredBuffer<FInstanceCullingGroupWork> OutInstanceWorkGroups;
// TODO: this used to share the args with the post-pass instances from the instance culling main pass, but due to a weird bug they are kept separate for now.
// Output indirect dispatch args for the following instance culling pass (x will contain all the groups, which matches the array count above).
RWBuffer<uint> OutInstanceWorkArgs;
uint MaxInstanceWorkGroups;
uint bAllowStaticGeometryPath;
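// Appends one work group for the following instance culling pass. Work groups are
// allocated double-ended from a single buffer: non-static groups grow from the front
// (counter in args[0]), static-geometry groups grow from the rear (counter in args[4]).
// Returns false if the buffer is full and the group had to be dropped.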
bool AppendInstanceCullingWorkGroup(uint ItemChunksOffset, uint ViewGroupId, uint ActiveViewMask, bool bIsStaticSceneGeometry)
{
    uint InstanceGroupsOutOffset = 0;
    BRANCH
    if (bIsStaticSceneGeometry && bAllowStaticGeometryPath)
    {
        WaveInterlockedAddScalar_(OutInstanceWorkArgs[4u], 1u, InstanceGroupsOutOffset);
        // Static-geometry groups are written from the end of the buffer
        InstanceGroupsOutOffset = MaxInstanceWorkGroups - InstanceGroupsOutOffset - 1u;
    }
    else
    {
        WaveInterlockedAddScalar_(OutInstanceWorkArgs[0u], 1u, InstanceGroupsOutOffset);
    }
    if (InstanceGroupsOutOffset < MaxInstanceWorkGroups)
    {
        uint PackedItemChunkDesc = InstanceHierarchyItemChunks[ItemChunksOffset];
        FInstanceCullingGroupWork InstanceCullingGroupWork;
        InstanceCullingGroupWork.ViewGroupId = ViewGroupId;
        InstanceCullingGroupWork.PackedItemChunkDesc = PackedItemChunkDesc;
        InstanceCullingGroupWork.ActiveViewMask = ActiveViewMask;
        OutInstanceWorkGroups[InstanceGroupsOutOffset] = InstanceCullingGroupWork;
        return true;
    }
    return false;
}
void UpdateChunkPreCullStat(uint NumChunks, bool bShouldUpdate)
{
#if DEBUG_FLAGS
    if (bShouldUpdate && (RenderFlags & NANITE_RENDER_FLAG_WRITE_STATS) != 0u)
    {
#if CULLING_PASS == CULLING_PASS_OCCLUSION_POST
        InterlockedAdd(OutStatsBuffer[0].NumPostHierarchyChunksPreCull, NumChunks);
#else
        InterlockedAdd(OutStatsBuffer[0].NumMainHierarchyChunksPreCull, NumChunks);
#endif
    }
#endif
}
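// Per-pass inputs: the occlusion post pass consumes the chunks the main pass found
// occluded; the main pass either consumes cell chunk draws (ParentInfoBuffer) or tests
// all allocated chunks against a list of view group IDs.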
#if CULLING_PASS == CULLING_PASS_OCCLUSION_POST
Buffer<uint> InOccludedChunkArgs;
StructuredBuffer<FOccludedChunkDraw> InOccludedChunkDraws;
#else
StructuredBuffer<FCellChunkDraw> ParentInfoBuffer;
StructuredBuffer<uint> InGroupIds;
uint NumGroupIds;
uint NumParentInfos;
#endif
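// Culls a single chunk's bounds against each view in the view group and appends
// instance culling work for the views that pass. InOccludedViewMask is only non-zero
// in the occlusion post pass, where it selects which views to re-test.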
void ProcessChunk(uint ChunkId, uint ViewGroupId, uint InOccludedViewMask)
{
    FViewDrawGroup ViewDrawGroup = InViewDrawRanges[ViewGroupId];
    uint CellId = ExplicitChunkCellIds[ChunkId];
    FSceneHiearchyCellData CellData = GetSceneHiearchyCellData(CellId);
    const float LevelCellSize = CellData.BlockData.LevelCellSize;
    FInstanceChunkAttributes ChunkAttributes = LoadInstanceChunkAttributes(ChunkId, LevelCellSize * 2.0f, LevelCellSize * 0.5f);
    FCellHeader CellHeader = GetCellHeader(CellId);
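    // Chunks within a cell are stored with the static chunks first, so a chunk is
    // static scene geometry if its index relative to the cell start is below the
    // static chunk count.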
    const bool bIsStaticSceneGeometry = (ChunkId - CellHeader.ItemChunksOffset) < CellHeader.NumStaticChunks;
#if VIRTUAL_TEXTURE_TARGET
    // Early out if none of the instances in the chunk cast shadows
    if ((ChunkAttributes.AnyFlags & PCAF_ANY_CAST_SHADOW) == 0u)
    {
        return;
    }
#endif
    FAabb CellRelativeBounds = ChunkAttributes.Aabb;
    float3 ImplicitBoundsMin = float3(CellData.LocalCellCoord) * LevelCellSize - (LevelCellSize * 0.5f).xxx;
    float3 LocalBoundsCenter = (CellRelativeBounds.Min + CellRelativeBounds.Max) * 0.5f + ImplicitBoundsMin;
    float3 LocalBoundsExtent = (CellRelativeBounds.Max - CellRelativeBounds.Min) * 0.5f;
    float2 InstanceDrawDistanceMinMaxSquared = ChunkAttributes.InstanceDrawDistanceMinMaxSquared;
    bool bHasDrawDistance = InstanceDrawDistanceMinMaxSquared.x < InstanceDrawDistanceMinMaxSquared.y;
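    // A chunk-level draw distance is only applied when the chunk encodes a non-empty
    // range (min squared < max squared); otherwise distance culling is skipped.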
#if CULLING_PASS == CULLING_PASS_OCCLUSION_MAIN
    uint OutOccludedViewMask = 0U;
#endif
    uint ActiveViewMask = 0U;
    // There is no explicit handling of mip views here: they are expected to come in a compact range (post view compaction, or from the host for non-VSM draws)
    for (uint ViewIndex = 0; ViewIndex < ViewDrawGroup.NumViews; ++ViewIndex)
    {
#if CULLING_PASS == CULLING_PASS_OCCLUSION_POST
        // TODO: loop over set bits instead!
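        // A possible shape for that (sketch, not the current implementation): visit only
        // the set bits of the mask instead of scanning the whole view range, e.g.
        //   uint Remaining = InOccludedViewMask;
        //   while (Remaining != 0u)
        //   {
        //       uint ViewIndex = firstbitlow(Remaining);
        //       Remaining &= Remaining - 1u; // clear the lowest set bit
        //       // ... cull view ViewIndex as below ...
        //   }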
        if ((InOccludedViewMask & (1U << ViewIndex)) == 0U)
        {
            continue;
        }
#endif
        uint ViewId = ViewDrawGroup.FirstView + ViewIndex;
        FNaniteView NaniteView = GetNaniteView(ViewId);
        // Depth clipping should only be disabled with orthographic projections
        const bool bIsOrtho = IsOrthoProjection(NaniteView.ViewToClip);
        const bool bNearClip = (NaniteView.Flags & NANITE_VIEW_FLAG_NEAR_CLIP) != 0u;
        const bool bViewHZB = (NaniteView.Flags & NANITE_VIEW_FLAG_HZBTEST) != 0u;
        const bool bIsViewUncached = (NaniteView.Flags & NANITE_VIEW_FLAG_UNCACHED) != 0u;
        // TODO: Move out of the loop; logically all views in a group should share the pre-view translation
        // TODO: Make the view compaction pull this data out of the view and store it with the group?
        float4x4 LocalToTranslatedWorld = MakeTranslationMatrix(DFFastToTranslatedWorld(CellData.BlockData.WorldPos, NaniteView.PreViewTranslation));
        float4x4 PrevLocalToTranslatedWorld = MakeTranslationMatrix(DFFastToTranslatedWorld(CellData.BlockData.WorldPos, NaniteView.PrevPreViewTranslation));
        FBoxCull Cull;
        Cull.Init(NaniteView, LocalBoundsCenter, LocalBoundsExtent, float4(1.0f, 1.0f, 1.0f, 1.0f), LocalToTranslatedWorld, PrevLocalToTranslatedWorld);
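        // In the post pass the chunk already passed frustum and global clip plane culling
        // in the main pass, so those tests are skipped and only the HZB test is redone.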
        if (CULLING_PASS == CULLING_PASS_OCCLUSION_POST)
        {
            Cull.bSkipCullFrustum = true;
            Cull.bSkipCullGlobalClipPlane = true;
        }
#if CULLING_PASS == CULLING_PASS_OCCLUSION_MAIN
        Cull.DistanceConservative(bHasDrawDistance, InstanceDrawDistanceMinMaxSquared.x, InstanceDrawDistanceMinMaxSquared.y);
        Cull.GlobalClipPlane();
#endif
        BRANCH
        if (Cull.bIsVisible)
        {
#if VIRTUAL_TEXTURE_TARGET
            Cull.PageFlagMask = VSM_FLAG_ANY_UNCACHED;
            // Using the static HZB and disabling any dynamic depth culling is conservative while
            // we traverse the hierarchy and we don't yet know what static/dynamic geometry is below
            Cull.bIsStaticGeometry = true;
            // If we're rendering into the static cache, it's not safe to use the receiver mask as we may cache that (full) page.
            // Since at this point in the hierarchy there could still be static instances below, it's only safe to use
            // the receiver mask if we know the whole view is uncached.
            Cull.bUseReceiverMask = Cull.bUseReceiverMask && bIsViewUncached;
#endif
            // Note: bClampToPageLevel == true means we might test rather large footprints in the HZB, leading to load imbalance
            // TODO: rebalance the work in the workgroup? Spawn more work groups / cell? Implement the top (virtual space) HZB hierarchy?
            const bool bClampToPageLevel = false;
            Cull.FrustumHZB(bClampToPageLevel);
        }
#if CULLING_PASS == CULLING_PASS_OCCLUSION_MAIN
        if (Cull.bWasOccluded)
        {
            OutOccludedViewMask |= 1U << ViewIndex;
        }
#endif
        if (Cull.bIsVisible && !Cull.bWasOccluded)
        {
            ActiveViewMask |= 1U << ViewIndex;
        }
    }
#if CULLING_PASS == CULLING_PASS_OCCLUSION_MAIN
    if (OutOccludedViewMask != 0u)
    {
        WriteOccludedChunk(ChunkId, ViewGroupId, OutOccludedViewMask);
    }
#endif
    if (ActiveViewMask != 0u)
    {
        AppendInstanceCullingWorkGroup(ChunkId, ViewGroupId, ActiveViewMask, bIsStaticSceneGeometry);
    }
}
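// Main-pass entry point fed by cell chunk draws: the work group load balancer spreads
// the chunks of each cell draw across 64-wide groups, one child index per chunk.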
#ifdef InstanceHierarchyCellChunkCull_CS
[numthreads(64, 1, 1)]
void InstanceHierarchyCellChunkCull_CS(uint3 GroupId : SV_GroupID, uint GroupThreadIndex : SV_GroupIndex)
{
    FWorkGroupSetup WorkGroupSetup = WorkGroupLoadBalancer_Setup(GroupId, GroupThreadIndex);
    if (!WorkGroupSetup.bValid)
    {
        return;
    }
    UpdateChunkPreCullStat(WorkGroupLoadBalancer_GetNumItems(), WorkGroupSetup.DispatchThreadId == 0u);
    // Decode the payloads into a cell draw index: the work group's base payload plus the item's offset
    uint CellDrawIndex = WorkGroupSetup.WorkGroupInfo.Payload + WorkGroupSetup.Item.Payload;
    FCellChunkDraw CellDraw = ParentInfoBuffer[CellDrawIndex];
    ProcessChunk(CellDraw.ItemChunksOffset + WorkGroupSetup.ChildIndex, CellDraw.ViewGroupId, 0u);
}
#endif // InstanceHierarchyCellChunkCull_CS
#ifdef InstanceHierarchyChunkCull_CS
[numthreads(64, 1, 1)]
void InstanceHierarchyChunkCull_CS(uint3 DispatchThreadID : SV_DispatchThreadID)
{
#if CULLING_PASS == CULLING_PASS_OCCLUSION_POST
    uint NumChunkDraws = InOccludedChunkArgs[3];
    if (DispatchThreadID.x >= NumChunkDraws)
    {
        return;
    }
    FOccludedChunkDraw OccDraw = InOccludedChunkDraws[DispatchThreadID.x];
    const uint ChunkId = OccDraw.ChunkId;
    const uint ViewGroupId = OccDraw.ViewGroupId;
    const uint InOccludedViewMask = OccDraw.OccludedViewMask;
    UpdateChunkPreCullStat(NumChunkDraws, all(DispatchThreadID == 0u));
#else
    // Exit if out of range
    if (DispatchThreadID.y >= NumGroupIds || DispatchThreadID.x >= NumAllocatedChunks)
    {
        return;
    }
    UpdateChunkPreCullStat(NumGroupIds * NumAllocatedChunks, all(DispatchThreadID == 0u));
    const uint ChunkId = DispatchThreadID.x;
    // Early out if the chunk is not allocated / used by any cell
    if (!IsChunkUsed(ChunkId))
    {
        return;
    }
    const uint ViewGroupId = InGroupIds[DispatchThreadID.y];
    const uint InOccludedViewMask = 0u;
#endif
    ProcessChunk(ChunkId, ViewGroupId, InOccludedViewMask);
}
#endif // InstanceHierarchyChunkCull_CS
// Store the uncullable in the regular cell draw array, at the end.
uint NumViewDrawGroups;
uint UncullableItemChunksOffset;
uint UncullableNumItemChunks;
// One thread per uncullable chunk in the X dimension * view draw groups in the Y dimension.
[numthreads(64, 1, 1)]
void AppendUncullableInstanceWork(uint3 DispatchThreadID : SV_DispatchThreadID)
{
    const uint ChunkIndex = DispatchThreadID.x;
    const uint ViewGroupId = DispatchThreadID.y;
    if (ChunkIndex < UncullableNumItemChunks)
    {
        FViewDrawGroup ViewDrawGroup = InViewDrawRanges[ViewGroupId];
        if (ViewDrawGroup.NumViews > 0u)
        {
            uint ActiveViewMask = (1u << ViewDrawGroup.NumViews) - 1u;
            AppendInstanceCullingWorkGroup(UncullableItemChunksOffset + ChunkIndex, ViewGroupId, ActiveViewMask, false);
        }
    }
}
RWBuffer< uint > OutInstanceWorkArgs0;
RWBuffer< uint > OutInstanceWorkArgs1;
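// Each args buffer holds two sets of indirect dispatch args:
//   [0] non-static group count (indirect x), [1] = 1, [2] = 1, [3] instance count
//   [4] static group count,                  [5] = 1, [6] = 1, [7] instance count
// Non-static groups fill OutInstanceWorkGroups from the front and static groups fill
// it from the rear (see AppendInstanceCullingWorkGroup).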
[numthreads(1, 1, 1)]
void InitArgs()
{
    OutInstanceWorkArgs0[0] = 0; // group count == x dimension
    OutInstanceWorkArgs0[1] = 1;
    OutInstanceWorkArgs0[2] = 1;
    OutInstanceWorkArgs0[3] = 0; // instance count
    // Second set of args for the static-geometry groups growing from the rear
    OutInstanceWorkArgs0[4] = 0;
    OutInstanceWorkArgs0[5] = 1;
    OutInstanceWorkArgs0[6] = 1;
    OutInstanceWorkArgs0[7] = 0;
#if OCCLUSION_CULLING
    // Occluded chunks
    OutOccludedChunkArgs[0] = 0; // group count == x dimension
    OutOccludedChunkArgs[1] = 1;
    OutOccludedChunkArgs[2] = 1;
    OutOccludedChunkArgs[3] = 0; // item count
    OutInstanceWorkArgs1[0] = 0; // group count == x dimension
    OutInstanceWorkArgs1[1] = 1;
    OutInstanceWorkArgs1[2] = 1;
    OutInstanceWorkArgs1[3] = 0; // instance count
    // Second set of args for the static-geometry groups growing from the rear
    OutInstanceWorkArgs1[4] = 0;
    OutInstanceWorkArgs1[5] = 1;
    OutInstanceWorkArgs1[6] = 1;
    OutInstanceWorkArgs1[7] = 0;
#endif
}
#ifdef InstanceHierarchySanitizeInstanceArgsCS
RWBuffer< uint > InOutInstanceWorkArgs;
uint GroupWorkArgsMaxCount;
[numthreads(1, 1, 1)]
void InstanceHierarchySanitizeInstanceArgsCS()
{
    InOutInstanceWorkArgs[0] = min(InOutInstanceWorkArgs[0], GroupWorkArgsMaxCount);
    // Clamp the number of static-geo work groups (thread arg)
    uint ClampedNumStatic = min(InOutInstanceWorkArgs[4], GroupWorkArgsMaxCount);
    InOutInstanceWorkArgs[4] = ClampedNumStatic;
}
#endif