// Copyright Epic Games, Inc. All Rights Reserved.

/*=============================================================================
    VirtualShadowMapBuildPerPageDrawCommands.usf:
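    Culls non-Nanite instances against the virtual shadow map pages, marks
    overlapped pages dirty, and builds the per-page instanced draw commands
    (see the three compute entry points below).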
=============================================================================*/

// Do not use shared samplers, as they require the View uniform buffer, which is not bound for this shader.
#define USE_HZB_SHARED_SAMPLERS 0

#include "../Common.ush"
#include "../ViewData.ush"
#include "VirtualShadowMapPageOverlap.ush"
#include "VirtualShadowMapProjectionCommon.ush"

// Turn on the logic for culling based on min screen radius (used in NaniteCullingCommon.ush)
#define NANITE_CULLING_ENABLE_MIN_RADIUS_CULL 1

// Turn on various VSM-related logic in NaniteCullingCommon
#define VIRTUAL_TEXTURE_TARGET 1

#include "../Nanite/NaniteCullingCommon.ush"
#include "../Nanite/NaniteDataDecode.ush"
#include "../InstanceCulling/InstanceCullingCommon.ush"
#include "../InstanceCulling/InstanceCullingLoadBalancer.ush"
#include "../WaveOpUtil.ush"
#include "../DynamicMeshBounds.ush"
#include "/Engine/Shared/VirtualShadowMapDefinitions.h"
#include "VirtualShadowMapStats.ush"
#include "VirtualShadowMapPageCacheCommon.ush"

#include "../ShaderPrint.ush"

RWStructuredBuffer<FVSMVisibleInstanceCmd> VisibleInstancesOut;
RWStructuredBuffer<uint> VisibleInstanceCountBufferOut;
uint TotalPrimaryViews;
uint VisibleInstancesBufferNum;

#if ENABLE_BATCH_MODE
StructuredBuffer<FVSMCullingBatchInfo> VSMCullingBatchInfos;
#else // !ENABLE_BATCH_MODE
uint FirstPrimaryView;
uint NumPrimaryViews;
uint DynamicInstanceIdOffset;
uint DynamicInstanceIdMax;
#endif // ENABLE_BATCH_MODE

StructuredBuffer<uint2> DrawCommandDescs;

RWBuffer<uint> DrawIndirectArgsBufferOut;

// TODO: Move to common header
// Get the area of an "inclusive" rect (i.e., one whose max is inside the rect); also guards against inverted rects (where min > max) by returning zero.
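// E.g., Rect = (2, 3, 4, 5) spans pages X in [2,4] and Y in [3,5], so the area is 3 * 3 = 9.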
uint GetInclusiveRectArea(uint4 Rect)
{
    if (all(Rect.zw >= Rect.xy))
    {
        uint2 Size = Rect.zw - Rect.xy;
        return (Size.x + 1) * (Size.y + 1);
    }
    return 0;
}
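
// Appends one visible-instance command (packed page info + instance ID/flags + indirect arg index) to VisibleInstancesOut.
// Returns false if the buffer is full; the caller records the overflow in the stats flags.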
bool WriteCmd(uint MipViewId, uint InstanceId, uint IndirectArgIndex, uint CullingFlags, bool bStaticPage)
{
    FPageInfo PageInfo;
    PageInfo.ViewId = MipViewId;
    PageInfo.bStaticPage = bStaticPage;

    FVSMVisibleInstanceCmd VisibleInstanceCmd;
    VisibleInstanceCmd.PackedPageInfo = PackPageInfo(PageInfo);
    VisibleInstanceCmd.InstanceIdAndFlags = (CullingFlags << INSTANCE_ID_NUM_BITS) | InstanceId;
    VisibleInstanceCmd.IndirectArgIndex = IndirectArgIndex;

    uint VisibleInstanceOutputOffset = 0U;
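    // Wave-aggregated atomic append (see WaveOpUtil.ush): one interlocked add per wave, with each thread receiving its own output slot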
    WaveInterlockedAddScalar_(VisibleInstanceCountBufferOut[0], 1U, VisibleInstanceOutputOffset);
    if (VisibleInstanceOutputOffset < VisibleInstancesBufferNum)
    {
        VisibleInstancesOut[VisibleInstanceOutputOffset] = VisibleInstanceCmd;
        return true;
    }
    else
    {
        return false;
    }
}

#define MAX_SINGLE_THREAD_MARKING_AREA (8U)
#define MARKING_JOB_QUEUE_SIZE (NUM_THREADS_PER_GROUP * 2U)
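
// Page-marking strategy: a rect covering up to MAX_SINGLE_THREAD_MARKING_AREA pages is marked inline by the
// culling thread; anything larger is queued in groupshared memory and processed cooperatively by the whole
// group at the end of CullPerPageDrawCommandsCs. If the queue overflows, jobs fall back to inline marking.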

struct FSharedMarkingJob
{
    uint4 RectPages;
    uint VirtualShadowMapId;
    uint MipLevel;
    uint FlagMask;
    uint MarkPageDirtyFlags;
};

groupshared uint NumSharedMarkingJobs;
groupshared uint2 SharedMarkingJobs[MARKING_JOB_QUEUE_SIZE];
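
// Packed job layout (PackJob/UnpackJob), 7 bits per page-rect coordinate:
//   Packed.x: RectPages.x [31:25] | RectPages.y [24:18] | RectPages.z [17:11] | RectPages.w [10:4] | FlagMask [3:0]
//   Packed.y: VirtualShadowMapId [31:8] | MipLevel [7:4] | MarkPageDirtyFlags [3:0]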

uint2 PackJob(FSharedMarkingJob Job)
{
    uint2 Packed;
    Packed.x = (Job.RectPages.x << 25U)
        | (Job.RectPages.y << 18U)
        | (Job.RectPages.z << 11U)
        | (Job.RectPages.w << 4u)
        | Job.FlagMask;
    Packed.y = (Job.VirtualShadowMapId << 8U)
        | (Job.MipLevel << 4U)
        | (Job.MarkPageDirtyFlags & 0xF);
    return Packed;
}

FSharedMarkingJob UnpackJob(uint2 Packed)
{
    FSharedMarkingJob Job;
    Job.RectPages.x = Packed.x >> 25u;
    Job.RectPages.y = (Packed.x >> 18u) & 0x7Fu;
    Job.RectPages.z = (Packed.x >> 11u) & 0x7Fu;
    Job.RectPages.w = (Packed.x >> 4u) & 0x7Fu;
    Job.FlagMask = Packed.x & 0xFu;
    Job.VirtualShadowMapId = Packed.y >> 8U;
    Job.MipLevel = (Packed.y >> 4) & 0x7;
    Job.MarkPageDirtyFlags = (Packed.y & 0xF);
    return Job;
}

bool MarkPageDirty(
    FVirtualSMLevelOffset PageTableLevelOffset,
    uint2 vPage,
    uint MipLevel,
    uint PageFlagMask,
    uint MarkPageDirtyFlags)
{
    // TODO: Do we actually even need this check?
    FVSMPageOffset PageFlagOffset = CalcPageOffset(PageTableLevelOffset, MipLevel, vPage);
    uint PageFlag = VirtualShadowMapGetPageFlag(PageFlagOffset);
    if ((PageFlag & PageFlagMask) != 0)
    {
        return VirtualShadowMapMarkPageDirty(PageFlagOffset, MarkPageDirtyFlags);
    }
    return false;
}

#if VSM_GENERATE_STATS
uint NumPageAreaDiagnosticSlots;
uint LargeInstancePageAreaThreshold;
#endif
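
// Phase 1: each thread culls one instance work item against the primary views and their mip views, writing
// visible-instance commands and marking small page rects inline. Phase 2 (after the group sync at the end):
// all threads in the group cooperatively mark the pages of the deferred large jobs.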

[numthreads(NUM_THREADS_PER_GROUP, 1, 1)]
void CullPerPageDrawCommandsCs(uint3 GroupId : SV_GroupID, int GroupThreadIndex : SV_GroupIndex)
{
    uint DispatchGroupId = GetUnWrappedDispatchGroupId(GroupId);

    if (DispatchGroupId >= InstanceCullingLoadBalancer_GetNumBatches())
    {
        return;
    }

    if (GroupThreadIndex == 0)
    {
        NumSharedMarkingJobs = 0U;
    }
    GroupMemoryBarrierWithGroupSync();

#if ENABLE_BATCH_MODE
    // Load Instance culling context batch info, indirection per group
    FContextBatchInfo BatchInfo = LoadBatchInfo(DispatchGroupId);
    FVSMCullingBatchInfo VSMCullingBatchInfo = VSMCullingBatchInfos[BatchInds[DispatchGroupId]];
#else // !ENABLE_BATCH_MODE
    // Single Instance culling context batch in the call; set up the batch from the kernel parameters
    FContextBatchInfo BatchInfo = (FContextBatchInfo)0;
    BatchInfo.DynamicInstanceIdOffset = DynamicInstanceIdOffset;
    BatchInfo.DynamicInstanceIdMax = DynamicInstanceIdMax;
    // Note: for the unbatched case, the permutation controls the HZB test, so we set this to true
    BatchInfo.bAllowOcclusionCulling = true;

    FVSMCullingBatchInfo VSMCullingBatchInfo;
    VSMCullingBatchInfo.FirstPrimaryView = FirstPrimaryView;
    VSMCullingBatchInfo.NumPrimaryViews = NumPrimaryViews;

#endif // ENABLE_BATCH_MODE

    uint CurrentBatchProcessingMode = 0U;
    FInstanceWorkSetup WorkSetup = InstanceCullingLoadBalancer_Setup(GroupId, GroupThreadIndex, GetItemDataOffset(BatchInfo, CurrentBatchProcessingMode));
    if (WorkSetup.bValid)
    {
        const FInstanceCullingPayload Payload = LoadInstanceCullingPayload(WorkSetup.Item.Payload, BatchInfo);

        uint InstanceDataOffset = WorkSetup.Item.InstanceDataOffset;

        if (Payload.bDynamicInstanceDataOffset)
        {
            InstanceDataOffset += BatchInfo.DynamicInstanceIdOffset;
            checkSlow(InstanceDataOffset + uint(WorkSetup.LocalItemIndex) < BatchInfo.DynamicInstanceIdMax);
        }

        uint InstanceId = InstanceDataOffset + uint(WorkSetup.LocalItemIndex);
        FDrawCommandDesc DrawCommandDesc = UnpackDrawCommandDesc(DrawCommandDescs[Payload.IndirectArgIndex]);

        // Load relevant instance data
        FInstanceSceneData InstanceData = GetInstanceSceneData(InstanceId);
        FPrimitiveSceneData PrimitiveData = GetPrimitiveData(InstanceData.PrimitiveId);

        float3 LocalBoundsCenter = InstanceData.LocalBoundsCenter;
        float3 LocalBoundsExtent = InstanceData.LocalBoundsExtent;
        LoadDynamicMeshBounds(DrawCommandDesc.DynamicMeshBoundsIndex, LocalBoundsCenter, LocalBoundsExtent);

#if VSM_GENERATE_STATS
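        // Debug-draw the culling bounds: red if LoadDynamicMeshBounds left them unchanged, green if it replaced them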
        float4 BoundsColor = all(LocalBoundsCenter == InstanceData.LocalBoundsCenter) ? ColorRed : ColorGreen;
        AddOBBWS(LocalBoundsCenter - LocalBoundsExtent, LocalBoundsCenter + LocalBoundsExtent, BoundsColor, DFHackToFloat(InstanceData.LocalToWorld));
#endif

        uint ThreadTotalForAllViews = 0;
#if VSM_GENERATE_STATS
        uint TotalPageArea = 0U;
        float NumVisibleLevels = 0.0f;
#endif
        bool bMarkingJobQueueOverflow = false;
        bool bVisibleInstancesOverflow = false;

        // Loop over views and output visible instances (i.e., those that overlap a valid page)
        for (uint PrimaryViewId = VSMCullingBatchInfo.FirstPrimaryView; PrimaryViewId < VSMCullingBatchInfo.FirstPrimaryView + VSMCullingBatchInfo.NumPrimaryViews; ++PrimaryViewId)
        {
            FNaniteView NaniteView = GetNaniteView(PrimaryViewId);

            uint CullingFlags = INSTANCE_CULLING_FLAGS_DEFAULT;
            bool bEnableWPO = DrawCommandDesc.bMaterialUsesWorldPositionOffset;

            FInstanceDynamicData DynamicData = CalculateInstanceDynamicData(NaniteView, InstanceData);

            FBoxCull Cull;
            Cull.Init(
                NaniteView,
                LocalBoundsCenter,
                LocalBoundsExtent,
                InstanceData.NonUniformScale,
                DynamicData.LocalToTranslatedWorld,
                DynamicData.PrevLocalToTranslatedWorld);

            Cull.bSkipWPODisableDistance |= DrawCommandDesc.bMaterialAlwaysEvaluatesWorldPositionOffset;

            Cull.Distance(PrimitiveData);
            bEnableWPO = bEnableWPO && Cull.bEnableWPO;

            // NOTE: Doing this for just the primary view (not mip views) is fine for current logic
            const bool bAllowWPO = VirtualShadowMapIsWPOAllowed(PrimitiveData, NaniteView.TargetLayerIndex);
            bEnableWPO = bEnableWPO && bAllowWPO;
            bool bCacheAsStatic = ShouldCacheInstanceAsStatic(InstanceId, (NaniteView.Flags & NANITE_VIEW_FLAG_UNCACHED), bAllowWPO, NaniteView.SceneRendererPrimaryViewId);

            // TODO: Set Cull.bIsStaticGeometry and dynamic depth range culling?

#if USE_HZB_OCCLUSION
            bool bUseStaticOcclusion = bCacheAsStatic;
#endif

            // TODO: there seems to be some code sharing that could be enabled by switching on VIRTUAL_TEXTURE_TARGET for FBoxCull
            // If we're rendering into the static cache, it's not safe to use the receiver mask, as we may cache that (full) page
            Cull.bUseReceiverMask = Cull.bUseReceiverMask && !bCacheAsStatic;
            Cull.bIsStaticGeometry = bCacheAsStatic;

            if (!bEnableWPO)
            {
                // Disable the Evaluate WPO culling flag if WPO was disabled
                CullingFlags &= ~INSTANCE_CULLING_FLAG_EVALUATE_WPO;
            }

            Cull.ScreenSize(DrawCommandDesc.MinScreenSize, DrawCommandDesc.MaxScreenSize);

            Cull.GlobalClipPlane();

            bool bInvalidatePages = ShouldMaterialInvalidateShadowCache(PrimitiveData, bEnableWPO)
                || GetInstanceViewData(InstanceId, NaniteView.SceneRendererPrimaryViewId).bIsDeforming;

            FFrustumCullData FrustumCull = (FFrustumCullData)0;
            BRANCH
            if (Cull.bIsVisible)
            {
                FrustumCull = Cull.Frustum();
            }

            StatsBufferInterlockedAdd(VSM_STAT_NON_NANITE_INSTANCES_TOTAL, NaniteView.TargetNumMipLevels);

            BRANCH
            if (Cull.bIsVisible)
            {
                // Compute the estimated footprint in the VSM base level. Note that we don't use the actual footprint, because it changes under rotation and with edge clipping, which makes it unstable when used for culling.
                float PixelEstRadius = CalcClipSpaceRadiusEstimate(Cull.bIsOrtho, InstanceData, Cull.LocalToTranslatedWorld, NaniteView.ViewToClip) * float(VSM_VIRTUAL_MAX_RESOLUTION_XY);

                uint FlagMask = GetPageFlagMaskForRendering(bCacheAsStatic, InstanceData.InstanceId, NaniteView.SceneRendererPrimaryViewId);

                // Loop over mip levels and count the number of output visible instances
                for (uint MipLevel = 0U; MipLevel < uint(NaniteView.TargetNumMipLevels); ++MipLevel)
                {
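                    // Mip views are laid out mip-major after the primary views, so the flattened view index is MipLevel * TotalPrimaryViews + PrimaryViewId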
                    uint MipViewId = MipLevel * TotalPrimaryViews + PrimaryViewId;
                    FNaniteView MipView = GetNaniteView(MipViewId);
                    uint VirtualShadowMapId = uint(MipView.TargetLayerIndex);
                    FVirtualShadowMapHandle VirtualShadowMapHandle = FVirtualShadowMapHandle::MakeFromId(MipView.TargetLayerIndex);

                    FScreenRect Rect = GetScreenRect(MipView.ViewRect, FrustumCull, 4);

                    bool bDetailGeometry = IsDetailGeometry(bCacheAsStatic, false, PixelEstRadius);
                    // Update for next clip level
                    PixelEstRadius *= 0.5f;

                    Rect = VirtualShadowMapGetUncachedScreenRect(Rect, VirtualShadowMapHandle, MipLevel);
                    uint4 RectPages = VirtualShadowMapGetPageRect(Rect);

                    if (OverlapsAnyValidPage(VirtualShadowMapHandle, MipLevel, Rect, FlagMask, bDetailGeometry, Cull.bUseReceiverMask))
                    {
#if VSM_GENERATE_STATS
                        TotalPageArea += GetInclusiveRectArea(RectPages);
                        NumVisibleLevels += 1.0f;
#endif
#if USE_HZB_OCCLUSION
                        if (Cull.bViewHZB)
                        {
                            // We are using the current-frame HZB, so we can use any page, whether it is cached or not
                            // TODO: Figure out which flag(s) we actually want here. Also need to check the non-Nanite one?
                            if (!IsVisibleMaskedHZB(VirtualShadowMapHandle, MipLevel, Rect, true, true, 0U, 0xFFFFFFFF, bUseStaticOcclusion))
                            {
                                StatsBufferInterlockedInc(VSM_STAT_NON_NANITE_INSTANCES_HZB_CULLED);
                                continue;
                            }
                        }
#endif // USE_HZB_OCCLUSION

                        uint NumMappedPages = 0U;
                        {
                            const uint MarkPageDirtyFlags = VirtualShadowMapGetMarkPageDirtyFlags(bInvalidatePages, bCacheAsStatic, Cull.bIsViewUncached, bAllowWPO);

                            uint2 RectPagesSize = (RectPages.zw + 1u) - RectPages.xy;
                            bool bIsSmallJob = RectPagesSize.x * RectPagesSize.y <= MAX_SINGLE_THREAD_MARKING_AREA;

                            // NOTE: A large job is only necessary if we actually have flags to write
                            bool bDoLargeJob = !bIsSmallJob && (MarkPageDirtyFlags != 0);

                            uint LargeJobIndex = 0U;
                            if (bDoLargeJob)
                            {
                                // Allocate a slot to defer the work and use all threads in the group
                                InterlockedAdd(NumSharedMarkingJobs, 1U, LargeJobIndex);
                                bMarkingJobQueueOverflow = bMarkingJobQueueOverflow || LargeJobIndex >= MARKING_JOB_QUEUE_SIZE;
                            }

                            if (bIsSmallJob || bMarkingJobQueueOverflow)
                            {
                                FVirtualSMLevelOffset PageTableLevelOffset = CalcPageTableLevelOffset(VirtualShadowMapHandle, MipLevel);

                                for (uint Y = RectPages.y; Y <= RectPages.w; ++Y)
                                {
                                    for (uint X = RectPages.x; X <= RectPages.z; ++X)
                                    {
                                        if (MarkPageDirty(PageTableLevelOffset, uint2(X, Y), MipLevel, FlagMask, MarkPageDirtyFlags))
                                        {
                                            // Count pages if we're touching all of them anyway
                                            ++NumMappedPages;
                                        }
                                    }
                                }
                            }
                            else
                            {
                                if (bDoLargeJob)
                                {
                                    // Defer the work so that all threads in the group can participate
                                    FSharedMarkingJob Job;
                                    Job.RectPages = RectPages;
                                    Job.VirtualShadowMapId = VirtualShadowMapId;
                                    Job.MipLevel = MipLevel;
                                    Job.FlagMask = FlagMask;
                                    Job.MarkPageDirtyFlags = MarkPageDirtyFlags;
                                    SharedMarkingJobs[LargeJobIndex] = PackJob(Job);
                                }
                                // Must assume we have mapped pages (or defer the cmd write also...)
                                NumMappedPages = 1U;
                            }
                        }

                        if (NumMappedPages > 0U)
                        {
                            ++ThreadTotalForAllViews;
                            bVisibleInstancesOverflow |= !WriteCmd(MipViewId, InstanceId, Payload.IndirectArgIndex, CullingFlags, bCacheAsStatic);
                        }
                    }
                    else
                    {
                        StatsBufferInterlockedInc(VSM_STAT_NON_NANITE_INSTANCES_PAGE_MASK_CULLED);
                    }
                }
            }
            else
            {
                StatsBufferInterlockedAdd(VSM_STAT_NON_NANITE_INSTANCES_FRUSTUM_CULLED, NaniteView.TargetNumMipLevels);
            }
        }

        StatsBufferInterlockedAdd(VSM_STAT_NON_NANITE_INSTANCES_DRAWN, ThreadTotalForAllViews);
        StatsBufferInterlockedEnableFlags(VSM_STAT_OVERFLOW_FLAGS, VSM_STAT_OVERFLOW_FLAG_MARKING_JOB_QUEUE, bMarkingJobQueueOverflow, true);
        StatsBufferInterlockedEnableFlags(VSM_STAT_OVERFLOW_FLAGS, VSM_STAT_OVERFLOW_FLAG_VISIBLE_INSTANCES, bVisibleInstancesOverflow, true);

        // Accumulate the total number of instances for each indirect argument; this is also used to allocate space and to output the compact range of instances later
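        // (Word 1 of each INDIRECT_ARGS_NUM_WORDS-sized tuple is the InstanceCount field of the indirect draw arguments.)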
        InterlockedAdd(DrawIndirectArgsBufferOut[Payload.IndirectArgIndex * INDIRECT_ARGS_NUM_WORDS + 1], ThreadTotalForAllViews);

#if VSM_GENERATE_STATS
        if (NumPageAreaDiagnosticSlots > 0U && TotalPageArea > LargeInstancePageAreaThreshold)
        {
            for (uint Index = 0U; Index < NumPageAreaDiagnosticSlots; ++Index)
            {
                uint PrevArea = 0U;
                // Store these after the stats slots
                InterlockedMax(OutStatsBuffer[VSM_STAT_NUM + 2U * Index], TotalPageArea, PrevArea);
                // If the area was greater, then we store the persistent primitive ID after the area.
                // This obviously has a data race, so numbers & IDs will only be approximately correct; could use 64-bit atomics instead...
                if (PrevArea < TotalPageArea)
                {
                    OutStatsBuffer[VSM_STAT_NUM + 2U * Index + 1U] = PrimitiveData.PersistentPrimitiveIndex;
                    break;
                }
            }
        }
#endif
    }
    GroupMemoryBarrierWithGroupSync();
    uint NumMarkingJobs = NumSharedMarkingJobs;
    for (uint JobIndex = 0U; JobIndex < min(MARKING_JOB_QUEUE_SIZE, NumMarkingJobs); ++JobIndex)
    {
        FSharedMarkingJob Job = UnpackJob(SharedMarkingJobs[JobIndex]);
        FVirtualShadowMapHandle VirtualShadowMapHandle = FVirtualShadowMapHandle::MakeFromId(Job.VirtualShadowMapId);
        FVirtualSMLevelOffset PageTableLevelOffset = CalcPageTableLevelOffset(VirtualShadowMapHandle, Job.MipLevel);
        uint2 RectPagesSize = (Job.RectPages.zw + 1u) - Job.RectPages.xy;
        for (uint Index = GroupThreadIndex; Index < RectPagesSize.x * RectPagesSize.y; Index += NUM_THREADS_PER_GROUP)
        {
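            // Convert the linear Index to rect-local (X, Y) in row-major order; the integer divide is done via float rcp(), with the +0.5f bias keeping floor() exact despite rcp() rounding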
            uint LocalPageY = uint(floor((float(Index) + 0.5f) * rcp(RectPagesSize.x)));
            uint2 vPage = Job.RectPages.xy + uint2(Index - RectPagesSize.x * LocalPageY, LocalPageY);

            // We don't need to add the correct flag mask here, as we don't look at VSM_FLAG_DETAIL_GEOMETRY for invalidation processing
            MarkPageDirty(PageTableLevelOffset, vPage, Job.MipLevel, Job.FlagMask, Job.MarkPageDirtyFlags);
        }
    }
}

Buffer<uint> DrawIndirectArgsBuffer;
RWBuffer<uint> InstanceIdOffsetBufferOut;
RWStructuredBuffer<uint> OutputOffsetBufferOut;
RWStructuredBuffer<uint> TmpInstanceIdOffsetBufferOut;
uint NumIndirectArgs;

/**
 * Separate pass to allocate space; it needs to run once the final space requirements are known. We buffer the page/instance-draw info and reshuffle it later.
 * TODO: Possibly just re-run the culling process in the output pass; that saves storing state, but may cost more and risks the passes disagreeing, e.g., due to rounding.
 */
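// Note: each argument reserves its contiguous output range [Offset, Offset + Count) with one global atomic add
// rather than a prefix sum, so the relative order of the ranges is not deterministic from run to run.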
[numthreads(NUM_THREADS_PER_GROUP, 1, 1)]
void AllocateCommandInstanceOutputSpaceCs(uint IndirectArgIndex : SV_DispatchThreadID)
{
    if (IndirectArgIndex < NumIndirectArgs)
    {
        uint CommandInstanceCount = DrawIndirectArgsBuffer[IndirectArgIndex * INDIRECT_ARGS_NUM_WORDS + 1];
        uint CommandInstanceOffset = 0U;
        if (CommandInstanceCount > 0U)
        {
            InterlockedAdd(OutputOffsetBufferOut[0], CommandInstanceCount, CommandInstanceOffset);
        }
        InstanceIdOffsetBufferOut[IndirectArgIndex] = CommandInstanceOffset;
        // Store a second copy for use during the output pass (as we need the first offset buffer during the actual rendering)
        TmpInstanceIdOffsetBufferOut[IndirectArgIndex] = CommandInstanceOffset;
    }

    // Also set up indirect dispatch args for the output pass (OutputCommandInstanceLists)
    //if (IndirectArgIndex == 0)
    //{
    //    uint NumVisibleInstances = VisibleInstanceCountBuffer[0];
    //    // ...dispatch args to process all the visible instances
    //}
}

StructuredBuffer<FVSMVisibleInstanceCmd> VisibleInstances;
StructuredBuffer<uint> VisibleInstanceCountBuffer;
//RWStructuredBuffer<uint> TmpInstanceIdOffsetBufferOut;
RWStructuredBuffer<uint> InstanceIdsBufferOut;
RWStructuredBuffer<uint> PageInfoBufferOut;
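
// TmpInstanceIdOffsetBufferOut (declared with the allocation pass above) is reused here as the running write
// cursor per indirect argument; AllocateCommandInstanceOutputSpaceCs seeded it with each command's start offset.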

[numthreads(NUM_THREADS_PER_GROUP, 1, 1)]
void OutputCommandInstanceListsCs(uint VisibleInstanceIndex : SV_DispatchThreadID)
{
    uint NumVisibleInstances = VisibleInstanceCountBuffer[0];

    if (VisibleInstanceIndex < NumVisibleInstances)
    {
        FVSMVisibleInstanceCmd VisibleInstanceCmd = VisibleInstances[VisibleInstanceIndex];
        const uint InstanceId = VisibleInstanceCmd.InstanceIdAndFlags & INSTANCE_ID_MASK;
        const uint CullingFlags = VisibleInstanceCmd.InstanceIdAndFlags >> INSTANCE_ID_NUM_BITS;

        // Scatter the instance ID & other data.
        uint InstanceIdOutputOffset = 0;
        InterlockedAdd(TmpInstanceIdOffsetBufferOut[VisibleInstanceCmd.IndirectArgIndex], 1U, InstanceIdOutputOffset);
        // TODO: maybe repack as uint2, since that might be better for these kinds of presumably scalar loads.
        InstanceIdsBufferOut[InstanceIdOutputOffset] = PackInstanceCullingOutput(InstanceId, 0u, CullingFlags);
        PageInfoBufferOut[InstanceIdOutputOffset] = VisibleInstanceCmd.PackedPageInfo;
    }
}