UnrealEngine/Engine/Shaders/Private/InstanceCulling/CompactVisibleInstances.usf

// Copyright Epic Games, Inc. All Rights Reserved.
#include "../Common.ush"
#include "../ComputeShaderUtils.ush"
#include "../ThreadGroupPrefixSum.ush"
#include "InstanceCompactionCommon.ush"
#if COMPILER_SUPPORTS_WAVE_VOTE
// TODO: Need to also check that wave width == thread group size (or implement workaround for other wave sizes)
#define COMPACTION_USES_WAVE_OPS 0
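// Currently forced off (0), so the ThreadGroupPrefixSum fallback below is taken even when wave intrinsics are available.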
#endif
/** Data needed per draw command for compaction */
StructuredBuffer<FPackedDrawCommandCompactionData> DrawCommandCompactionData;
/**
* CalculateCompactBlockInstanceOffsets
*
* Compaction phase 1 - Generate output offsets of "blocks" of source instance IDs such that all valid IDs will be compacted for
the draw command. Each thread group in this dispatch aligns with a single draw command. It uses a hybrid of parallelization and
* serialization to calculate the compact output offsets in the instance ID buffer in a way that doesn't require multiple dispatches.
*/
#if CALCULATE_COMPACT_BLOCK_INSTANCE_OFFSETS
/** Number of valid instances per block in a draw command (accumulated by BuildInstanceDrawCommands.usf) */
StructuredBuffer<uint> BlockInstanceCounts;
/** Destination instance offset per block of instances in a draw command. (Relative to draw command's InstanceIdOffset) */
RWStructuredBuffer<uint> BlockDestInstanceOffsetsOut;
/** The indirect args buffer to receive the final, compacted instance count */
RWBuffer<uint> DrawIndirectArgsBufferOut;
/** Entry point */
[numthreads(NUM_THREADS_PER_GROUP, 1, 1)]
void CalculateCompactBlockInstanceOffsetsCS(uint3 GroupId : SV_GroupID, uint GroupThreadIndex : SV_GroupIndex)
{
    const uint DrawCommandIndex = GetUnWrappedDispatchGroupId(GroupId);
    const FDrawCommandCompactionData DrawCommandData = UnpackDrawCommandCompactionData(DrawCommandCompactionData[DrawCommandIndex]);
    const uint BlockOffset = DrawCommandData.BlockOffset;
    const uint NumBlocksInDrawCommand = GetCompactionBlockCount(DrawCommandData.NumInstances);
    const uint NumWorkIterations = ((NumBlocksInDrawCommand - 1U) / NUM_THREADS_PER_GROUP) + 1U;

    uint CurInstanceOffset = 0;
    uint CurDrawCommandBlockIndex = 0;
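    // Each iteration processes NUM_THREADS_PER_GROUP blocks in parallel; CurInstanceOffset and CurDrawCommandBlockIndex
    // carry the running valid-instance total and the block cursor between iterations, so the whole per-draw-command
    // prefix sum resolves within this single dispatch.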
    for (uint IterIndex = 0; IterIndex < NumWorkIterations; ++IterIndex)
    {
        const uint DrawCommandBlockIndex = CurDrawCommandBlockIndex + GroupThreadIndex;
        const uint GlobalBlockIndex = DrawCommandData.BlockOffset + DrawCommandBlockIndex;

        uint BlockInstanceDestCount = 0;
        BRANCH
        if (DrawCommandBlockIndex < NumBlocksInDrawCommand)
        {
            BlockInstanceDestCount = BlockInstanceCounts[GlobalBlockIndex];
        }

        uint PrefixSum, GroupSum;
#if COMPACTION_USES_WAVE_OPS
        PrefixSum = WavePrefixSum(BlockInstanceDestCount);
        GroupSum = WaveActiveSum(BlockInstanceDestCount);
#else
        PrefixSum = ThreadGroupPrefixSum(BlockInstanceDestCount, GroupThreadIndex, GroupSum);
#endif // COMPACTION_USES_WAVE_OPS
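        // PrefixSum = valid instances in the blocks owned by lower-indexed threads this iteration;
        // GroupSum = total valid instances across all blocks the group processed this iteration.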
        // Write out the instance offsets
        BRANCH
        if (DrawCommandBlockIndex < NumBlocksInDrawCommand)
        {
            BlockDestInstanceOffsetsOut[GlobalBlockIndex] = CurInstanceOffset + PrefixSum;
        }

        CurInstanceOffset += GroupSum;
        CurDrawCommandBlockIndex += NUM_THREADS_PER_GROUP;
    }

    if (GroupThreadIndex == 0)
    {
        // Set the final indirect arg instance count
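        // Word 1 of an indirect draw arg entry is the instance count in both the indexed and non-indexed layouts.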
        DrawIndirectArgsBufferOut[DrawCommandData.IndirectArgIndex * INDIRECT_ARGS_NUM_WORDS + 1] = CurInstanceOffset;
    }
}
#endif // CALCULATE_COMPACT_BLOCK_INSTANCE_OFFSETS
/**
* CompactVisibleInstances
*
* Compaction phase 2 - Write out the visible instances in compact order
* Each group outputs a compact block of visible instances. The max source (uncompacted) block size is equal to
* COMPACTION_BLOCK_NUM_INSTANCES * NumViews. The number of threads in the group is equal to the number of instances
* per block.
*/
#if COMPACT_VISIBLE_INSTANCES
/** Draw command index per block */
StructuredBuffer<uint> BlockDrawCommandIndices;
/** The raw, uncompacted instance ID buffer (provided by BuildInstanceDrawCommands.usf) */
StructuredBuffer<uint> InstanceIdsBufferIn;
/** Instance output offsets per block (provided by CalculateCompactBlockInstanceOffsetsCS) */
StructuredBuffer<uint> BlockDestInstanceOffsets;
/** The compacted output buffer of InstanceIds (non-mobile) */
RWStructuredBuffer<uint> InstanceIdsBufferOut;
/** The compacted output buffer of instance data consumed by vertex shaders on mobile */
RWByteAddressBuffer InstanceIdsBufferOutMobile;
/** Entry point */
[numthreads(NUM_THREADS_PER_GROUP, 1, 1)]
void CompactVisibleInstances(uint3 GroupId : SV_GroupID, uint GroupThreadIndex : SV_GroupIndex)
{
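    // Each group compacts one block: BlockDrawCommandIndices maps the block back to its owning draw command,
    // and DrawCommandBlockIndex is the block's position within that command.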
    const uint GlobalBlockIndex = GetUnWrappedDispatchGroupId(GroupId);
    const FDrawCommandCompactionData DrawCommandData = UnpackDrawCommandCompactionData(DrawCommandCompactionData[BlockDrawCommandIndices[GlobalBlockIndex]]);
    const uint DrawCommandBlockIndex = GlobalBlockIndex - DrawCommandData.BlockOffset;
    const uint MaxInstancesPerBlock = NUM_THREADS_PER_GROUP * DrawCommandData.NumViews;
    const uint BlockSrcInstanceCount = min(DrawCommandData.NumInstances * DrawCommandData.NumViews - DrawCommandBlockIndex * MaxInstancesPerBlock, MaxInstancesPerBlock);
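    // (The last block of a draw command may be only partially filled, hence the clamp to the instances remaining.)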
    const uint BlockSrcInstanceOffset = DrawCommandData.SrcInstanceIdOffset + DrawCommandBlockIndex * MaxInstancesPerBlock;
    const uint BlockDestInstanceOffset = DrawCommandData.DestInstanceIdOffset + BlockDestInstanceOffsets[GlobalBlockIndex];

    // These are the relative offsets for the whole group from the start of the source and destination offsets for the block, respectively
    uint CurGroupSrcOffset = 0;
    uint CurGroupDestOffset = 0;
    while (CurGroupSrcOffset < BlockSrcInstanceCount) // may loop as many times as NumViews
    {
        uint InstanceId = 0xFFFFFFFFU;
        const uint BlockSrcInstanceIndex = CurGroupSrcOffset + GroupThreadIndex;
        const uint GlobalSrcInstanceIndex = BlockSrcInstanceOffset + BlockSrcInstanceIndex;
        BRANCH
        if (BlockSrcInstanceIndex < BlockSrcInstanceCount)
        {
            InstanceId = InstanceIdsBufferIn[GlobalSrcInstanceIndex];
        }
        const bool bIsValidInstance = (InstanceId != 0xFFFFFFFFU);
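        // 0xFFFFFFFFU marks a slot holding no visible instance (out-of-range threads above, and presumably holes
        // left by BuildInstanceDrawCommands.usf for culled instances), so it is excluded from the counts below.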
        // now calculate the index based on the valid index in the group
        uint PrefixSum, GroupSum;
#if COMPACTION_USES_WAVE_OPS
        PrefixSum = WavePrefixCountBits(bIsValidInstance);
        GroupSum = WaveActiveCountBits(bIsValidInstance);
#else
        // NOTE: This must be called even by threads that won't output an InstanceId
        PrefixSum = ThreadGroupPrefixSum(bIsValidInstance ? 1U : 0U, GroupThreadIndex, GroupSum);
#endif // COMPACTION_USES_WAVE_OPS

        if (bIsValidInstance)
        {
            const uint BlockDestInstanceIndex = CurGroupDestOffset + PrefixSum;
            const uint GlobalDestInstanceIndex = BlockDestInstanceOffset + BlockDestInstanceIndex;
            // TODO: This currently doesn't support mobile, which outputs transform data into a vertex buffer
            InstanceIdsBufferOut[GlobalDestInstanceIndex] = InstanceId;
        }
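        // Advance the source window by one full group of threads, and the destination by the number of valid
        // instances actually written this pass.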
        CurGroupSrcOffset += NUM_THREADS_PER_GROUP;
        CurGroupDestOffset += GroupSum;
    }
}
#endif // COMPACT_VISIBLE_INSTANCES