UnrealEngine/Engine/Shaders/Private/VirtualShadowMaps/VirtualShadowMapBuildPerPageDrawCommands.usf
// Copyright Epic Games, Inc. All Rights Reserved.
/*=============================================================================
VirtualShadowMapBuildPerPageDrawCommands.usf: Culls non-Nanite instances against
virtual shadow map pages, marks pages that require invalidation as dirty, and
builds the buffered visible-instance commands and indirect args used for per-page rendering.
=============================================================================*/
// Do not use shared samplers, as they require the View uniform buffer, which is not bound for this shader.
#define USE_HZB_SHARED_SAMPLERS 0
#include "../Common.ush"
#include "../ViewData.ush"
#include "VirtualShadowMapPageOverlap.ush"
#include "VirtualShadowMapProjectionCommon.ush"
// Turn on the logic for culling based on min screen radius (used in NaniteCullingCommon.ush)
#define NANITE_CULLING_ENABLE_MIN_RADIUS_CULL 1
// Turn on various VSM-related logic in NaniteCullingCommon
#define VIRTUAL_TEXTURE_TARGET 1
#include "../Nanite/NaniteCullingCommon.ush"
#include "../Nanite/NaniteDataDecode.ush"
#include "../InstanceCulling/InstanceCullingCommon.ush"
#include "../InstanceCulling/InstanceCullingLoadBalancer.ush"
#include "../WaveOpUtil.ush"
#include "../DynamicMeshBounds.ush"
#include "/Engine/Shared/VirtualShadowMapDefinitions.h"
#include "VirtualShadowMapStats.ush"
#include "VirtualShadowMapPageCacheCommon.ush"
#include "../ShaderPrint.ush"
RWStructuredBuffer<FVSMVisibleInstanceCmd> VisibleInstancesOut;
RWStructuredBuffer<uint> VisibleInstanceCountBufferOut;
uint TotalPrimaryViews;
uint VisibleInstancesBufferNum;
#if ENABLE_BATCH_MODE
StructuredBuffer<FVSMCullingBatchInfo> VSMCullingBatchInfos;
#else // !ENABLE_BATCH_MODE
uint FirstPrimaryView;
uint NumPrimaryViews;
uint DynamicInstanceIdOffset;
uint DynamicInstanceIdMax;
#endif // ENABLE_BATCH_MODE
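// Per-draw-command descriptors (packed WPO usage, min/max screen size, dynamic mesh bounds index)
// and the indirect args whose instance counts are accumulated by the culling pass below.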
StructuredBuffer<uint2> DrawCommandDescs;
RWBuffer<uint> DrawIndirectArgsBufferOut;
// TODO: Move to common header
// Get the area of an "inclusive" rect (which means that the max is inside the rect); also guards against negative area (where min > max)
uint GetInclusiveRectArea(uint4 Rect)
{
if (all(Rect.zw >= Rect.xy))
{
uint2 Size = Rect.zw - Rect.xy;
return (Size.x + 1) * (Size.y + 1);
}
return 0;
}
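// Append a single visible (instance, mip view) command to the buffered output.
// Returns false if the visible-instances buffer is full; the caller reports this via the overflow stats flags.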
bool WriteCmd(uint MipViewId, uint InstanceId, uint IndirectArgIndex, uint CullingFlags, bool bStaticPage)
{
FPageInfo PageInfo;
PageInfo.ViewId = MipViewId;
PageInfo.bStaticPage = bStaticPage;
FVSMVisibleInstanceCmd VisibleInstanceCmd;
VisibleInstanceCmd.PackedPageInfo = PackPageInfo(PageInfo);
VisibleInstanceCmd.InstanceIdAndFlags = (CullingFlags << INSTANCE_ID_NUM_BITS) | InstanceId;
VisibleInstanceCmd.IndirectArgIndex = IndirectArgIndex;
uint VisibleInstanceOutputOffset = 0U;
WaveInterlockedAddScalar_(VisibleInstanceCountBufferOut[0], 1U, VisibleInstanceOutputOffset);
if (VisibleInstanceOutputOffset < VisibleInstancesBufferNum)
{
VisibleInstancesOut[VisibleInstanceOutputOffset] = VisibleInstanceCmd;
return true;
}
else
{
return false;
}
}
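// Page marking is split by rect size: rects covering up to MAX_SINGLE_THREAD_MARKING_AREA pages are
// marked inline by the owning thread, while larger rects are queued as jobs in groupshared memory and
// processed cooperatively by the whole group at the end of CullPerPageDrawCommandsCs. If the job queue
// overflows, the rect falls back to inline marking.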
#define MAX_SINGLE_THREAD_MARKING_AREA (8U)
#define MARKING_JOB_QUEUE_SIZE (NUM_THREADS_PER_GROUP * 2U)
struct FSharedMarkingJob
{
uint4 RectPages;
uint VirtualShadowMapId;
uint MipLevel;
uint FlagMask;
uint MarkPageDirtyFlags;
};
groupshared uint NumSharedMarkingJobs;
groupshared uint2 SharedMarkingJobs[MARKING_JOB_QUEUE_SIZE];
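// Jobs are packed into a uint2 to keep the groupshared queue small:
//   Packed.x = RectPages.x:7 | RectPages.y:7 | RectPages.z:7 | RectPages.w:7 | FlagMask:4
//   Packed.y = VirtualShadowMapId:24 | MipLevel:4 | MarkPageDirtyFlags:4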
uint2 PackJob(FSharedMarkingJob Job)
{
uint2 Packed;
Packed.x = (Job.RectPages.x << 25U)
| (Job.RectPages.y << 18U)
| (Job.RectPages.z << 11U)
| (Job.RectPages.w << 4u)
| Job.FlagMask;
Packed.y = (Job.VirtualShadowMapId << 8U)
| (Job.MipLevel << 4U)
| (Job.MarkPageDirtyFlags & 0xF);
return Packed;
}
FSharedMarkingJob UnpackJob(uint2 Packed)
{
FSharedMarkingJob Job;
Job.RectPages.x = Packed.x >> 25u;
Job.RectPages.y = (Packed.x >> 18u) & 0x7Fu;
Job.RectPages.z = (Packed.x >> 11u) & 0x7Fu;
Job.RectPages.w = (Packed.x >> 4u) & 0x7Fu;
Job.FlagMask = Packed.x & 0xFu;
Job.VirtualShadowMapId = Packed.y >> 8U;
Job.MipLevel = (Packed.y >> 4) & 0x7;
Job.MarkPageDirtyFlags = (Packed.y & 0xF);
return Job;
}
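// Mark the page at vPage/MipLevel as dirty if its page flags intersect PageFlagMask
// (i.e., the page is mapped and relevant for this kind of geometry).
// Returns the result of VirtualShadowMapMarkPageDirty, which the caller uses to count mapped pages.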
bool MarkPageDirty(
FVirtualSMLevelOffset PageTableLevelOffset,
uint2 vPage,
uint MipLevel,
uint PageFlagMask,
uint MarkPageDirtyFlags)
{
// TODO: Do we actually even need this check?
FVSMPageOffset PageFlagOffset = CalcPageOffset(PageTableLevelOffset, MipLevel, vPage);
uint PageFlag = VirtualShadowMapGetPageFlag(PageFlagOffset);
if ((PageFlag & PageFlagMask) != 0)
{
return VirtualShadowMapMarkPageDirty(PageFlagOffset, MarkPageDirtyFlags);
}
return false;
}
#if VSM_GENERATE_STATS
uint NumPageAreaDiagnosticSlots;
uint LargeInstancePageAreaThreshold;
#endif
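// Main culling kernel: each thread takes one instance work item from the load balancer, culls it
// against every primary view and its mip views, marks any overlapped pages dirty as needed, and emits
// visible-instance commands plus per-draw instance counts consumed by the later allocation/output passes.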
[numthreads(NUM_THREADS_PER_GROUP, 1, 1)]
void CullPerPageDrawCommandsCs(uint3 GroupId : SV_GroupID, int GroupThreadIndex : SV_GroupIndex)
{
uint DispatchGroupId = GetUnWrappedDispatchGroupId(GroupId);
if (DispatchGroupId >= InstanceCullingLoadBalancer_GetNumBatches())
{
return;
}
if (GroupThreadIndex == 0)
{
NumSharedMarkingJobs = 0U;
}
GroupMemoryBarrierWithGroupSync();
#if ENABLE_BATCH_MODE
// Load the instance culling context batch info (one indirection per group)
FContextBatchInfo BatchInfo = LoadBatchInfo(DispatchGroupId);
FVSMCullingBatchInfo VSMCullingBatchInfo = VSMCullingBatchInfos[BatchInds[DispatchGroupId]];
#else // !ENABLE_BATCH_MODE
// Single instance culling context batch in this call; set up the batch from the kernel parameters
FContextBatchInfo BatchInfo = (FContextBatchInfo)0;
BatchInfo.DynamicInstanceIdOffset = DynamicInstanceIdOffset;
BatchInfo.DynamicInstanceIdMax = DynamicInstanceIdMax;
// Note: for the unbatched case, the shader permutation controls the HZB test, so we set it to true
BatchInfo.bAllowOcclusionCulling = true;
FVSMCullingBatchInfo VSMCullingBatchInfo;
VSMCullingBatchInfo.FirstPrimaryView = FirstPrimaryView;
VSMCullingBatchInfo.NumPrimaryViews = NumPrimaryViews;
#endif // ENABLE_BATCH_MODE
uint CurrentBatchProcessingMode = 0U;
FInstanceWorkSetup WorkSetup = InstanceCullingLoadBalancer_Setup(GroupId, GroupThreadIndex, GetItemDataOffset(BatchInfo, CurrentBatchProcessingMode));
if (WorkSetup.bValid)
{
const FInstanceCullingPayload Payload = LoadInstanceCullingPayload(WorkSetup.Item.Payload, BatchInfo);
uint InstanceDataOffset = WorkSetup.Item.InstanceDataOffset;
if (Payload.bDynamicInstanceDataOffset)
{
InstanceDataOffset += BatchInfo.DynamicInstanceIdOffset;
checkSlow(InstanceDataOffset + uint(WorkSetup.LocalItemIndex) < BatchInfo.DynamicInstanceIdMax);
}
uint InstanceId = InstanceDataOffset + uint(WorkSetup.LocalItemIndex);
FDrawCommandDesc DrawCommandDesc = UnpackDrawCommandDesc(DrawCommandDescs[Payload.IndirectArgIndex]);
// Load relevant instance data
FInstanceSceneData InstanceData = GetInstanceSceneData(InstanceId);
FPrimitiveSceneData PrimitiveData = GetPrimitiveData(InstanceData.PrimitiveId);
float3 LocalBoundsCenter = InstanceData.LocalBoundsCenter;
float3 LocalBoundsExtent = InstanceData.LocalBoundsExtent;
LoadDynamicMeshBounds(DrawCommandDesc.DynamicMeshBoundsIndex, LocalBoundsCenter, LocalBoundsExtent);
#if VSM_GENERATE_STATS
float4 BoundsColor = all(LocalBoundsCenter == InstanceData.LocalBoundsCenter) ? ColorRed : ColorGreen;
AddOBBWS(LocalBoundsCenter - LocalBoundsExtent, LocalBoundsCenter + LocalBoundsExtent, BoundsColor, DFHackToFloat(InstanceData.LocalToWorld));
#endif
uint ThreadTotalForAllViews = 0;
#if VSM_GENERATE_STATS
uint TotalPageArea = 0U;
float NumVisibleLevels = 0.0f;
#endif
bool bMarkingJobQueueOverflow = false;
bool bVisibleInstancesOverflow = false;
// Loop over views and output visible instances (i.e., those that overlap a valid page)
for (uint PrimaryViewId = VSMCullingBatchInfo.FirstPrimaryView; PrimaryViewId < VSMCullingBatchInfo.FirstPrimaryView + VSMCullingBatchInfo.NumPrimaryViews; ++PrimaryViewId)
{
FNaniteView NaniteView = GetNaniteView(PrimaryViewId);
uint CullingFlags = INSTANCE_CULLING_FLAGS_DEFAULT;
bool bEnableWPO = DrawCommandDesc.bMaterialUsesWorldPositionOffset;
FInstanceDynamicData DynamicData = CalculateInstanceDynamicData(NaniteView, InstanceData);
FBoxCull Cull;
Cull.Init(
NaniteView,
LocalBoundsCenter,
LocalBoundsExtent,
InstanceData.NonUniformScale,
DynamicData.LocalToTranslatedWorld,
DynamicData.PrevLocalToTranslatedWorld );
Cull.bSkipWPODisableDistance |= DrawCommandDesc.bMaterialAlwaysEvaluatesWorldPositionOffset;
Cull.Distance( PrimitiveData );
bEnableWPO = bEnableWPO && Cull.bEnableWPO;
// NOTE: Doing this for just the primary view (not mip views) is fine for current logic
const bool bAllowWPO = VirtualShadowMapIsWPOAllowed(PrimitiveData, NaniteView.TargetLayerIndex);
bEnableWPO = bEnableWPO && bAllowWPO;
bool bCacheAsStatic = ShouldCacheInstanceAsStatic(InstanceId, (NaniteView.Flags & NANITE_VIEW_FLAG_UNCACHED), bAllowWPO, NaniteView.SceneRendererPrimaryViewId);
// TODO: Set Cull.bIsStaticGeometry and dynamic depth range culling?
#if USE_HZB_OCCLUSION
bool bUseStaticOcclusion = bCacheAsStatic;
#endif
// TODO: there seems to be some code sharing that could be enabled by switching on VIRTUAL_TEXTURE_TARGET for FBoxCull
// If we're rendering into the static cache, it's not safe to use the receiver mask as we may cache that (full) page
Cull.bUseReceiverMask = Cull.bUseReceiverMask && !bCacheAsStatic;
Cull.bIsStaticGeometry = bCacheAsStatic;
if (!bEnableWPO)
{
// Disable the Evaluate WPO culling flag if WPO was disabled
CullingFlags &= ~INSTANCE_CULLING_FLAG_EVALUATE_WPO;
}
Cull.ScreenSize(DrawCommandDesc.MinScreenSize, DrawCommandDesc.MaxScreenSize);
Cull.GlobalClipPlane();
bool bInvalidatePages = ShouldMaterialInvalidateShadowCache(PrimitiveData, bEnableWPO)
|| GetInstanceViewData(InstanceId, NaniteView.SceneRendererPrimaryViewId).bIsDeforming;
FFrustumCullData FrustumCull = (FFrustumCullData)0;
BRANCH
if( Cull.bIsVisible )
{
FrustumCull = Cull.Frustum();
}
StatsBufferInterlockedAdd(VSM_STAT_NON_NANITE_INSTANCES_TOTAL, NaniteView.TargetNumMipLevels);
BRANCH
if (Cull.bIsVisible)
{
// Compute the estimated footprint in the VSM base level. Note that we don't use the actual footprint because it changes under rotation and with edge clipping, which makes it unstable when used for culling.
float PixelEstRadius = CalcClipSpaceRadiusEstimate(Cull.bIsOrtho, InstanceData, Cull.LocalToTranslatedWorld, NaniteView.ViewToClip) * float(VSM_VIRTUAL_MAX_RESOLUTION_XY);
uint FlagMask = GetPageFlagMaskForRendering(bCacheAsStatic, InstanceData.InstanceId, NaniteView.SceneRendererPrimaryViewId);
// Loop over mip levels and count number of output visible instances
for (uint MipLevel = 0U; MipLevel < uint(NaniteView.TargetNumMipLevels); ++MipLevel)
{
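// The view index formula below implies mip views are stored in blocks of TotalPrimaryViews per mip level,
// with mip 0 being the primary view itself.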
uint MipViewId = MipLevel * TotalPrimaryViews + PrimaryViewId;
FNaniteView MipView = GetNaniteView(MipViewId);
uint VirtualShadowMapId = uint(MipView.TargetLayerIndex);
FVirtualShadowMapHandle VirtualShadowMapHandle = FVirtualShadowMapHandle::MakeFromId(MipView.TargetLayerIndex);
FScreenRect Rect = GetScreenRect(MipView.ViewRect, FrustumCull, 4);
bool bDetailGeometry = IsDetailGeometry(bCacheAsStatic, false, PixelEstRadius);
// Update the estimate for the next mip level
PixelEstRadius *= 0.5f;
Rect = VirtualShadowMapGetUncachedScreenRect( Rect, VirtualShadowMapHandle, MipLevel );
uint4 RectPages = VirtualShadowMapGetPageRect( Rect );
if (OverlapsAnyValidPage(VirtualShadowMapHandle, MipLevel, Rect, FlagMask, bDetailGeometry, Cull.bUseReceiverMask))
{
#if VSM_GENERATE_STATS
TotalPageArea += float(GetInclusiveRectArea(RectPages));
NumVisibleLevels += 1.0f;
#endif
#if USE_HZB_OCCLUSION
if (Cull.bViewHZB)
{
// We are using the current-frame HZB and can use any page whether it is cached or not
// TODO: Figure out which flag(s) we actually want here. Also need to check the non-nanite one?
if (!IsVisibleMaskedHZB(VirtualShadowMapHandle, MipLevel, Rect, true, true, 0U, 0xFFFFFFFF, bUseStaticOcclusion))
{
StatsBufferInterlockedInc(VSM_STAT_NON_NANITE_INSTANCES_HZB_CULLED);
continue;
}
}
#endif // USE_HZB_OCCLUSION
uint NumMappedPages = 0U;
{
const uint MarkPageDirtyFlags = VirtualShadowMapGetMarkPageDirtyFlags(bInvalidatePages, bCacheAsStatic, Cull.bIsViewUncached, bAllowWPO);
uint2 RectPagesSize = (RectPages.zw + 1u) - RectPages.xy;
bool bIsSmallJob = RectPagesSize.x * RectPagesSize.y <= MAX_SINGLE_THREAD_MARKING_AREA;
// NOTE: Large job only necessary if we actually have flags to write
bool bDoLargeJob = !bIsSmallJob && (MarkPageDirtyFlags != 0);
uint LargeJobIndex = 0U;
if (bDoLargeJob)
{
// allocate slot to defer the work and use all threads in group
InterlockedAdd(NumSharedMarkingJobs, 1U, LargeJobIndex);
bMarkingJobQueueOverflow = bMarkingJobQueueOverflow || LargeJobIndex >= MARKING_JOB_QUEUE_SIZE;
}
if (bIsSmallJob || bMarkingJobQueueOverflow)
{
FVirtualSMLevelOffset PageTableLevelOffset = CalcPageTableLevelOffset(VirtualShadowMapHandle, MipLevel);
for (uint Y = RectPages.y; Y <= RectPages.w; ++Y)
{
for (uint X = RectPages.x; X <= RectPages.z; ++X)
{
if (MarkPageDirty(PageTableLevelOffset, uint2(X, Y), MipLevel, FlagMask, MarkPageDirtyFlags))
{
// Count pages if we're touching all of them anyway
++NumMappedPages;
}
}
}
}
else
{
if (bDoLargeJob)
{
// defer the work and use all threads in group
FSharedMarkingJob Job;
Job.RectPages = RectPages;
Job.VirtualShadowMapId = VirtualShadowMapId;
Job.MipLevel = MipLevel;
Job.FlagMask = FlagMask;
Job.MarkPageDirtyFlags = MarkPageDirtyFlags;
SharedMarkingJobs[LargeJobIndex] = PackJob(Job);
}
// Must assume we have mapped pages (or defer the cmd write also...)
NumMappedPages = 1U;
}
}
if (NumMappedPages > 0U)
{
++ThreadTotalForAllViews;
bVisibleInstancesOverflow |= !WriteCmd(MipViewId, InstanceId, Payload.IndirectArgIndex, CullingFlags, bCacheAsStatic);
}
}
else
{
StatsBufferInterlockedInc(VSM_STAT_NON_NANITE_INSTANCES_PAGE_MASK_CULLED);
}
}
}
else
{
StatsBufferInterlockedAdd(VSM_STAT_NON_NANITE_INSTANCES_FRUSTUM_CULLED, NaniteView.TargetNumMipLevels);
}
}
StatsBufferInterlockedAdd(VSM_STAT_NON_NANITE_INSTANCES_DRAWN, ThreadTotalForAllViews);
StatsBufferInterlockedEnableFlags(VSM_STAT_OVERFLOW_FLAGS, VSM_STAT_OVERFLOW_FLAG_MARKING_JOB_QUEUE, bMarkingJobQueueOverflow, true);
StatsBufferInterlockedEnableFlags(VSM_STAT_OVERFLOW_FLAGS, VSM_STAT_OVERFLOW_FLAG_VISIBLE_INSTANCES, bVisibleInstancesOverflow, true);
// Accumulate the total number of instances for each indirect argument; this count is also used to allocate space and to output the compact range of instances later
InterlockedAdd(DrawIndirectArgsBufferOut[Payload.IndirectArgIndex * INDIRECT_ARGS_NUM_WORDS + 1], ThreadTotalForAllViews);
#if VSM_GENERATE_STATS
if (NumPageAreaDiagnosticSlots > 0U && TotalPageArea > LargeInstancePageAreaThreshold)
{
for (uint Index = 0U; Index < NumPageAreaDiagnosticSlots; ++Index)
{
uint PrevArea = 0U;
// Store these after the stats slots
InterlockedMax(OutStatsBuffer[VSM_STAT_NUM + 2U * Index], TotalPageArea, PrevArea);
// If the area was greater than the previous maximum, we store the persistent Primitive ID after the area.
// This obviously has a data race, so numbers & IDs will only be approximately correct, could use 64-bit atomics instead...
if (PrevArea < TotalPageArea)
{
OutStatsBuffer[VSM_STAT_NUM + 2U * Index + 1U] = PrimitiveData.PersistentPrimitiveIndex;
break;
}
}
}
#endif
}
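// Cooperatively process the deferred (large) page-marking jobs: the barrier makes all queued jobs
// visible to the group, then the whole group strides over each job's page rect.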
GroupMemoryBarrierWithGroupSync();
uint NumMarkingJobs = NumSharedMarkingJobs;
for (uint JobIndex = 0U; JobIndex < min(MARKING_JOB_QUEUE_SIZE, NumMarkingJobs); ++JobIndex)
{
FSharedMarkingJob Job = UnpackJob(SharedMarkingJobs[JobIndex]);
FVirtualShadowMapHandle VirtualShadowMapHandle = FVirtualShadowMapHandle::MakeFromId(Job.VirtualShadowMapId);
FVirtualSMLevelOffset PageTableLevelOffset = CalcPageTableLevelOffset(VirtualShadowMapHandle, Job.MipLevel);
uint2 RectPagesSize = (Job.RectPages.zw + 1u) - Job.RectPages.xy;
for (uint Index = GroupThreadIndex; Index < RectPagesSize.x * RectPagesSize.y; Index += NUM_THREADS_PER_GROUP)
{
uint LocalPageY = uint(floor((float(Index) + 0.5f) * rcp(RectPagesSize.x)));
uint2 vPage = Job.RectPages.xy + uint2(Index - RectPagesSize.x * LocalPageY, LocalPageY);
// We don't need to pass the exact flag mask here, as VSM_FLAG_DETAIL_GEOMETRY is not looked at for invalidation processing
MarkPageDirty(PageTableLevelOffset, vPage, Job.MipLevel, Job.FlagMask, Job.MarkPageDirtyFlags);
}
}
}
Buffer<uint> DrawIndirectArgsBuffer;
RWBuffer<uint> InstanceIdOffsetBufferOut;
RWStructuredBuffer<uint> OutputOffsetBufferOut;
RWStructuredBuffer<uint> TmpInstanceIdOffsetBufferOut;
uint NumIndirectArgs;
/**
 * Separate pass to allocate output space; it needs to run once the final space requirements are known. We buffer the page/instance-draw info and reshuffle it later.
 * TODO: Possibly just re-run the culling process in the output pass; this saves storing data but may cost more and risks the passes disagreeing, e.g., due to rounding.
 */
[numthreads(NUM_THREADS_PER_GROUP, 1, 1)]
void AllocateCommandInstanceOutputSpaceCs(uint IndirectArgIndex : SV_DispatchThreadID)
{
if (IndirectArgIndex < NumIndirectArgs)
{
uint CommandInstanceCount = DrawIndirectArgsBuffer[IndirectArgIndex * INDIRECT_ARGS_NUM_WORDS + 1];
uint CommandInstanceOffset = 0U;
if (CommandInstanceCount > 0U)
{
InterlockedAdd(OutputOffsetBufferOut[0], CommandInstanceCount, CommandInstanceOffset);
}
InstanceIdOffsetBufferOut[IndirectArgIndex] = CommandInstanceOffset;
// Store a second copy for use during the output pass (the first offset buffer is needed during the actual rendering)
TmpInstanceIdOffsetBufferOut[IndirectArgIndex] = CommandInstanceOffset;
}
// Also set up indirect dispatch args for the output pass (OutputCommandInstanceLists)
//if (IndirectArgIndex == 0)
//{
// uint NumVisibleInstances = VisibleInstanceCountBuffer[0];
// // ...dispatch args to process all the visible instances
//}
}
StructuredBuffer<FVSMVisibleInstanceCmd> VisibleInstances;
StructuredBuffer<uint> VisibleInstanceCountBuffer;
//RWStructuredBuffer<uint> TmpInstanceIdOffsetBufferOut;
RWStructuredBuffer<uint> InstanceIdsBufferOut;
RWStructuredBuffer<uint> PageInfoBufferOut;
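// Output pass: scatters the buffered visible-instance commands into the per-draw instance ID and
// page info lists, using the per-draw offsets allocated by AllocateCommandInstanceOutputSpaceCs
// (via the TmpInstanceIdOffsetBufferOut copy).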
[numthreads(NUM_THREADS_PER_GROUP, 1, 1)]
void OutputCommandInstanceListsCs(uint VisibleInstanceIndex : SV_DispatchThreadID)
{
uint NumVisibleInstances = VisibleInstanceCountBuffer[0];
if (VisibleInstanceIndex < NumVisibleInstances)
{
FVSMVisibleInstanceCmd VisibleInstanceCmd = VisibleInstances[VisibleInstanceIndex];
const uint InstanceId = VisibleInstanceCmd.InstanceIdAndFlags & INSTANCE_ID_MASK;
const uint CullingFlags = VisibleInstanceCmd.InstanceIdAndFlags >> INSTANCE_ID_NUM_BITS;
// Scatter the instance ID & other data.
uint InstanceIdOutputOffset = 0;
InterlockedAdd(TmpInstanceIdOffsetBufferOut[VisibleInstanceCmd.IndirectArgIndex], 1U, InstanceIdOutputOffset);
// TODO: maybe repack as uint2, since that might be better for these (presumably scalar) loads.
InstanceIdsBufferOut[InstanceIdOutputOffset] = PackInstanceCullingOutput(InstanceId, 0u, CullingFlags);
PageInfoBufferOut[InstanceIdOutputOffset] = VisibleInstanceCmd.PackedPageInfo;
}
}