UnrealEngine/Engine/Source/Runtime/Renderer/Private/InstanceCulling/InstanceCullingContext.cpp
// Copyright Epic Games, Inc. All Rights Reserved.
#include "InstanceCulling/InstanceCullingContext.h"
#include "CoreMinimal.h"
#include "DataDrivenShaderPlatformInfo.h"
#include "RHI.h"
#include "RendererModule.h"
#include "ShaderParameterMacros.h"
#include "RenderGraphResources.h"
#include "RenderGraphBuilder.h"
#include "RenderGraphUtils.h"
#include "SceneRendering.h"
#include "ScenePrivate.h"
#include "SystemTextures.h"
#include "InstanceCulling/InstanceCullingManager.h"
#include "InstanceCullingDefinitions.h"
#include "InstanceCullingLoadBalancer.h"
#include "InstanceCullingMergedContext.h"
#include "InstanceCullingOcclusionQuery.h"
#include "RenderCore.h"
#include "MeshDrawCommandStats.h"
#include "UnrealEngine.h"
#include "RHIBreadcrumbs.h"
#include "Materials/MaterialRenderProxy.h"
#include "ViewData.h"
#include "GPUSkinCache.h"
static TAutoConsoleVariable<int32> CVarCullInstances(
TEXT("r.CullInstances"),
1,
TEXT("CullInstances."),
ECVF_RenderThreadSafe);
static TAutoConsoleVariable<int32> CVarOcclusionCullInstances(
TEXT("r.InstanceCulling.OcclusionCull"),
0,
TEXT("Whether to do per instance occlusion culling for GPU instance culling."),
ECVF_RenderThreadSafe | ECVF_Preview);
static int32 GOcclusionForceInstanceCulling = 0;
static FAutoConsoleVariableRef CVarOcclusionForceInstanceCulling(
TEXT("r.InstanceCulling.ForceInstanceCulling"),
GOcclusionForceInstanceCulling,
TEXT("Whether to force per instance occlusion culling."),
ECVF_RenderThreadSafe);
static int32 GInstanceCullingAllowOrderPreservation = 1;
static FAutoConsoleVariableRef CVarInstanceCullingAllowOrderPreservation(
TEXT("r.InstanceCulling.AllowInstanceOrderPreservation"),
GInstanceCullingAllowOrderPreservation,
TEXT("Whether or not to allow instances to preserve instance draw order using GPU compaction."),
ECVF_RenderThreadSafe);
IMPLEMENT_STATIC_UNIFORM_BUFFER_SLOT(InstanceCullingUbSlot);
IMPLEMENT_STATIC_UNIFORM_BUFFER_STRUCT(FInstanceCullingGlobalUniforms, "InstanceCulling", InstanceCullingUbSlot);
IMPLEMENT_STATIC_UNIFORM_BUFFER_SLOT(BatchedPrimitive);
IMPLEMENT_STATIC_AND_SHADER_UNIFORM_BUFFER_STRUCT_EX(FBatchedPrimitiveParameters, "BatchedPrimitive", BatchedPrimitive, FShaderParametersMetadata::EUsageFlags::UniformView);
static const TCHAR* BatchProcessingModeStr[] =
{
TEXT("Generic"),
TEXT("UnCulled"),
};
static_assert(UE_ARRAY_COUNT(BatchProcessingModeStr) == uint32(EBatchProcessingMode::Num), "BatchProcessingModeStr length does not match EBatchProcessingMode::Num, these must be kept in sync.");
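// Note on processing modes (inferred from their usage below): EBatchProcessingMode::Generic is used for
// work items that go through the full per-instance culling path, while EBatchProcessingMode::UnCulled is
// used for single-instance draws that are assumed to already be culled and only need their instance IDs
// expanded per view.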
DECLARE_GPU_STAT(BuildRenderingCommandsDeferred);
DECLARE_GPU_STAT(BuildRenderingCommands);
static bool IsInstanceOrderPreservationAllowed(EShaderPlatform ShaderPlatform)
{
// Instance order preservation is currently not supported on mobile platforms
return GInstanceCullingAllowOrderPreservation && !IsMobilePlatform(ShaderPlatform);
}
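// Approximate bit layout of the packed descriptor produced below (X component), mirroring
// UnpackDrawCommandDesc() in shader code:
//   bit  0      : material uses World Position Offset
//   bit  1      : material always evaluates World Position Offset
//   bits 2..5   : LOD index
//   bits 6..18  : min screen size (only when MinScreenSizeCull is set)
//   bits 19..31 : max screen size (only when MaxScreenSizeCull is set)
// The Y component stores the dynamic mesh bounds index.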
static FUintVector2 PackDrawCommandDesc(bool bMaterialUsesWorldPositionOffset, bool bMaterialAlwaysEvaluatesWorldPositionOffset, FMeshDrawCommandCullingPayload CullingPayload, EMeshDrawCommandCullingPayloadFlags CullingPayloadFlags)
{
// See UnpackDrawCommandDesc() in shader code.
uint32 PackedData = bMaterialUsesWorldPositionOffset ? 1U : 0U;
PackedData |= bMaterialAlwaysEvaluatesWorldPositionOffset ? 2U : 0U;
PackedData |= CullingPayload.LodIndex << 2;
if (EnumHasAnyFlags(CullingPayloadFlags, EMeshDrawCommandCullingPayloadFlags::MinScreenSizeCull))
{
PackedData |= CullingPayload.MinScreenSize << 6;
}
if (EnumHasAnyFlags(CullingPayloadFlags, EMeshDrawCommandCullingPayloadFlags::MaxScreenSizeCull))
{
PackedData |= CullingPayload.MaxScreenSize << 19;
}
return FUintVector2(PackedData, uint32(CullingPayload.DynamicMeshBoundsIndex));
}
FMeshDrawCommandOverrideArgs GetMeshDrawCommandOverrideArgs(const FInstanceCullingDrawParams& InstanceCullingDrawParams)
{
FMeshDrawCommandOverrideArgs Result;
Result.IndirectArgsBuffer = InstanceCullingDrawParams.DrawIndirectArgsBuffer.GetBuffer() != nullptr ? InstanceCullingDrawParams.DrawIndirectArgsBuffer.GetBuffer()->GetRHI() : nullptr;
Result.InstanceDataByteOffset = InstanceCullingDrawParams.InstanceDataByteOffset;
Result.IndirectArgsByteOffset = InstanceCullingDrawParams.IndirectArgsByteOffset;
if (InstanceCullingDrawParams.BatchedPrimitive.GetUniformBuffer() != nullptr)
{
Result.InstanceBuffer = nullptr;
Result.InstanceCullingStaticUB = InstanceCullingDrawParams.BatchedPrimitive.GetUniformBuffer()->GetRHI();
}
else
{
Result.InstanceBuffer = InstanceCullingDrawParams.InstanceIdOffsetBuffer.GetBuffer() != nullptr ? InstanceCullingDrawParams.InstanceIdOffsetBuffer.GetBuffer()->GetRHI() : nullptr;
Result.InstanceCullingStaticUB = InstanceCullingDrawParams.InstanceCulling.GetUniformBuffer() != nullptr ? InstanceCullingDrawParams.InstanceCulling.GetUniformBuffer()->GetRHI() : nullptr;
}
return Result;
}
FUniformBufferStaticSlot FInstanceCullingContext::GetStaticUniformBufferSlot(EShaderPlatform ShaderPlatform)
{
const static FName InstanceCullingSlotName = "InstanceCullingUbSlot";
const static FName BatchedPrimitiveSlotName = "BatchedPrimitive";
return FUniformBufferStaticSlotRegistry::Get().FindSlotByName(PlatformGPUSceneUsesUniformBufferView(ShaderPlatform) ? BatchedPrimitiveSlotName : InstanceCullingSlotName);
}
static uint32 GetInstanceDataStrideElements(EShaderPlatform ShaderPlatform, EBatchProcessingMode Mode)
{
if (PlatformGPUSceneUsesUniformBufferView(ShaderPlatform))
{
// float4 elements; the stride depends on whether we are writing instances or primitives
return FInstanceCullingContext::UniformViewInstanceStride[static_cast<uint32>(Mode)] / 16u;
}
else
{
// one uint element per-instance
return 1u;
}
}
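// Note: on platforms where GPUScene uses uniform buffer views (typically mobile), per-instance data is
// written as float4 elements with a per-mode stride (UniformViewInstanceStride), whereas on other
// platforms the instance data buffer is simply one uint instance ID per instance.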
uint32 FInstanceCullingContext::GetInstanceIdBufferStride(EShaderPlatform ShaderPlatform)
{
if (PlatformGPUSceneUsesUniformBufferView(ShaderPlatform))
{
return UniformViewInstanceStride[1]; // UnCulled
}
else
{
return sizeof(uint32);
}
}
uint32 FInstanceCullingContext::StepInstanceDataOffsetBytes(uint32 NumStepDraws) const
{
// The UniformBufferView path uses a single instance step rate; by default the step is once per draw
if (bUsesUniformBufferView)
{
uint32 GenericStride = LoadBalancers[0]->GetTotalNumInstances() * UniformViewInstanceStride[0];
uint32 UnculledStride = LoadBalancers[1]->GetTotalNumInstances() * UniformViewInstanceStride[1];
return (GenericStride + UnculledStride) * ViewIds.Num();
}
else
{
return NumStepDraws * sizeof(uint32);
}
}
uint32 FInstanceCullingContext::GetInstanceIdNumElements() const
{
if (bUsesUniformBufferView)
{
// This data is used in CS to compute offset for writing instance data
uint32 GenericStride = LoadBalancers[0]->GetTotalNumInstances() * UniformViewInstanceStride[0] / 16u;
uint32 UnculledStride = LoadBalancers[1]->GetTotalNumInstances() * UniformViewInstanceStride[1] / 16u;
return (GenericStride + UnculledStride) * ViewIds.Num();
}
else
{
return TotalInstances * ViewIds.Num();
}
}
FInstanceCullingContext::FInstanceCullingContext(
const TCHAR* PassName,
EShaderPlatform InShaderPlatform,
FInstanceCullingManager* InInstanceCullingManager,
TArrayView<const int32> InViewIds,
const TRefCountPtr<IPooledRenderTarget>& InPrevHZB,
EInstanceCullingMode InInstanceCullingMode,
EInstanceCullingFlags InFlags,
EBatchProcessingMode InSingleInstanceProcessingMode) :
InstanceCullingManager(InInstanceCullingManager),
ShaderPlatform(InShaderPlatform),
ViewIds(InViewIds),
PrevHZB(InPrevHZB),
bIsEnabled(InInstanceCullingManager == nullptr || InInstanceCullingManager->IsEnabled()),
InstanceCullingMode(InInstanceCullingMode),
Flags(InFlags),
SingleInstanceProcessingMode(InSingleInstanceProcessingMode),
InstanceCullingStaticSlot(IsMobilePlatform(InShaderPlatform) ? GetStaticUniformBufferSlot(InShaderPlatform) : MAX_UNIFORM_BUFFER_STATIC_SLOTS),
bUsesUniformBufferView(PlatformGPUSceneUsesUniformBufferView(InShaderPlatform))
{
#if MESH_DRAW_COMMAND_STATS
if (FMeshDrawCommandStatsManager* Instance = FMeshDrawCommandStatsManager::Get())
{
if (FCString::Strcmp(PassName, TEXT("HitProxy")) != 0)
{
MeshDrawCommandPassStats = Instance->CreatePassStats(PassName);
}
}
#endif
}
bool FInstanceCullingContext::IsGPUCullingEnabled()
{
return CVarCullInstances.GetValueOnAnyThread() != 0;
}
bool FInstanceCullingContext::IsOcclusionCullingEnabled()
{
return IsGPUCullingEnabled() && CVarOcclusionCullInstances.GetValueOnAnyThread() != 0;
}
FInstanceCullingContext::~FInstanceCullingContext()
{
for (auto& LoadBalancer : LoadBalancers)
{
if (LoadBalancer != nullptr)
{
delete LoadBalancer;
}
}
}
void FInstanceCullingContext::ResetCommands(int32 MaxNumCommands)
{
IndirectArgs.Empty(MaxNumCommands);
MeshDrawCommandInfos.Empty(MaxNumCommands);
DrawCommandDescs.Empty(MaxNumCommands);
InstanceIdOffsets.Empty(MaxNumCommands);
PayloadData.Empty(MaxNumCommands);
TotalInstances = 0U;
DrawCommandCompactionData.Empty(MaxNumCommands);
CompactionBlockDataIndices.Reset();
NumCompactionInstances = 0U;
}
bool FInstanceCullingContext::IsInstanceOrderPreservationEnabled() const
{
// NOTE: Instance compaction is currently not enabled on mobile platforms
return IsInstanceOrderPreservationAllowed(ShaderPlatform) && !EnumHasAnyFlags(Flags, EInstanceCullingFlags::NoInstanceOrderPreservation);
}
uint32 FInstanceCullingContext::AllocateIndirectArgs(const FMeshDrawCommand *MeshDrawCommand)
{
const uint32 NumPrimitives = MeshDrawCommand->NumPrimitives;
if (ensure(MeshDrawCommand->PrimitiveType < PT_Num))
{
// default to PT_TriangleList & PT_RectList
uint32 NumVerticesOrIndices = NumPrimitives * 3U;
switch (MeshDrawCommand->PrimitiveType)
{
case PT_QuadList:
NumVerticesOrIndices = NumPrimitives * 4U;
break;
case PT_TriangleStrip:
NumVerticesOrIndices = NumPrimitives + 2U;
break;
case PT_LineList:
NumVerticesOrIndices = NumPrimitives * 2U;
break;
case PT_PointList:
NumVerticesOrIndices = NumPrimitives;
break;
default:
break;
}
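// The emplaced args follow the standard indexed indirect draw layout
// (IndexCountPerInstance, InstanceCount, StartIndexLocation, BaseVertexLocation, StartInstanceLocation).
// InstanceCount is left at zero here; it is incremented on the GPU by the culling pass for each
// surviving instance.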
return IndirectArgs.Emplace(FRHIDrawIndexedIndirectParameters{ NumVerticesOrIndices, 0U, MeshDrawCommand->FirstIndex, int32(MeshDrawCommand->VertexParams.BaseVertexIndex), 0U });
}
return 0U;
}
void FInstanceCullingContext::BeginAsyncSetup(SyncPrerequisitesFuncType&& InSyncPrerequisitesFunc)
{
SyncPrerequisitesFunc = MoveTemp(InSyncPrerequisitesFunc);
}
void FInstanceCullingContext::WaitForSetupTask()
{
if (SyncPrerequisitesFunc)
{
SyncPrerequisitesFunc(*this);
}
SyncPrerequisitesFunc = SyncPrerequisitesFuncType();
}
void FInstanceCullingContext::SetDynamicPrimitiveInstanceOffsets(int32 InDynamicInstanceIdOffset, int32 InDynamicInstanceIdNum)
{
DynamicInstanceIdOffset = InDynamicInstanceIdOffset;
DynamicInstanceIdNum = InDynamicInstanceIdNum;
}
// Key things to achieve:
// 1. Low-data handling of the single ID/primitive path.
// 2. No redundant allocation/upload of indirect commands when none are needed
//    (i.e., only allocate an indirect draw command if needed).
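// Payload packing (see FInstanceCullingLoadBalancerBase::PackItem and the shader side): the low bits
// carry common flags (dynamic instance data offset, preserve-instance-order); the remaining bits hold
// either the indirect args offset (common case) or, when order preservation is requested, an index into
// PayloadData where the full per-command payload is stored.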
void FInstanceCullingContext::AddInstancesToDrawCommand(uint32 IndirectArgsOffset, int32 InstanceDataOffset, uint32 RunOffset, uint32 NumInstances, EInstanceFlags InstanceFlags)
{
checkSlow(InstanceDataOffset >= 0);
const bool bDynamicInstanceDataOffset = EnumHasAnyFlags(InstanceFlags, EInstanceFlags::DynamicInstanceDataOffset);
const bool bPreserveInstanceOrder = EnumHasAnyFlags(InstanceFlags, EInstanceFlags::PreserveInstanceOrder);
const bool bForceInstanceCulling = EnumHasAnyFlags(InstanceFlags, EInstanceFlags::ForceInstanceCulling);
uint32 Payload = (bDynamicInstanceDataOffset ? INSTANCE_CULLING_DYNAMIC_INSTANCE_DATA_OFFSET_BIT_MASK : 0U);
if (bPreserveInstanceOrder)
{
checkSlow(!EnumHasAnyFlags(Flags, EInstanceCullingFlags::NoInstanceOrderPreservation)); // this should have already been handled
// We need to provide full payload data for these instances
// NOTE: The extended payload data flag is in the lowest bit instead of the highest because the payload is not a full dword, see FInstanceCullingLoadBalancerBase::PackItem
Payload |= (INSTANCE_CULLING_PRESERVE_INSTANCE_ORDER_BIT_MASK | (uint32(PayloadData.Num()) << INSTANCE_CULLING_PAYLOAD_NUM_COMMON_BITS));
PayloadData.Emplace(bDynamicInstanceDataOffset, IndirectArgsOffset, InstanceDataOffset, RunOffset, DrawCommandCompactionData.Num());
}
else
{
// Conserve space by packing the relevant payload information into the dword
Payload |= (IndirectArgsOffset << INSTANCE_CULLING_PAYLOAD_NUM_COMMON_BITS);
}
// We special-case single-instance draws (i.e., regular primitives) as they don't need to be culled again, except where explicitly specified.
// In fact this is not 100% true, because dynamic-path primitives may not have been culled.
EBatchProcessingMode Mode = (NumInstances == 1 && !bForceInstanceCulling) ? SingleInstanceProcessingMode : EBatchProcessingMode::Generic;
LoadBalancers[uint32(Mode)]->Add(uint32(InstanceDataOffset), NumInstances, Payload);
TotalInstances += NumInstances;
}
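// Batched variant: splits a run of instances into batches of at most MaxBatchSize, duplicating the
// indirect args / command desc for each extra batch. Rough example: a run of 150 instances with
// MaxBatchSize == 64 and RunOffset == 0 produces batches of 64, 64 and 22 instances, with two
// additional indirect arg entries appended for the second and third batch.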
void FInstanceCullingContext::AddInstancesToDrawCommand(uint32 IndirectArgsOffset, int32 InstanceDataOffset, uint32 RunOffset, uint32 NumInstances, EInstanceFlags InstanceFlags, uint32 MaxBatchSize)
{
// Batching is disabled, or the first run of instances fits into the batch size
if (MaxBatchSize == MAX_uint32 || (NumInstances <= MaxBatchSize && RunOffset == 0))
{
AddInstancesToDrawCommand(IndirectArgsOffset, InstanceDataOffset, RunOffset, NumInstances, InstanceFlags);
return;
}
// If we are adding more than one instance run,
// we need to keep appending instances to the last batch until it is full
if (RunOffset > 0 && NumInstances > 0)
{
uint32 NumInstancesInBatch = RunOffset % MaxBatchSize;
if (NumInstancesInBatch > 0)
{
NumInstancesInBatch = FMath::Min(MaxBatchSize - NumInstancesInBatch, NumInstances);
// append to the last batch
IndirectArgsOffset = (IndirectArgs.Num() - 1);
AddInstancesToDrawCommand(IndirectArgsOffset, InstanceDataOffset, RunOffset, NumInstancesInBatch, InstanceFlags);
InstanceDataOffset += NumInstancesInBatch;
NumInstances -= NumInstancesInBatch;
}
}
// Split rest of the instances into batches
if (NumInstances > 0)
{
uint32 NumBatches = FMath::DivideAndRoundUp(NumInstances, MaxBatchSize);
FMeshDrawCommandInfo& RESTRICT DrawCmd = MeshDrawCommandInfos.Last();
FRHIDrawIndexedIndirectParameters LastIndirectArgs = IndirectArgs.Last();
FUintVector2 LastCommandDesc = DrawCommandDescs.Last();
uint32 NumViews = ViewIds.Num();
for (uint32 BatchIdx = 0; BatchIdx < NumBatches; BatchIdx++)
{
uint32 NumInstancesInBatch = FMath::Min(MaxBatchSize, NumInstances);
if (RunOffset > 0 || BatchIdx != 0)
{
DrawCommandDescs.Add(LastCommandDesc);
IndirectArgsOffset = IndirectArgs.Add(LastIndirectArgs);
InstanceIdOffsets.Add(GetInstanceIdNumElements());
DrawCmd.NumBatches++;
}
AddInstancesToDrawCommand(IndirectArgsOffset, InstanceDataOffset, RunOffset, NumInstancesInBatch, InstanceFlags);
InstanceDataOffset += NumInstancesInBatch;
NumInstances -= NumInstancesInBatch;
}
}
}
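// Runs is expected to contain NumRuns pairs of (first index, last index inclusive), relative to
// InstanceDataOffset; each run is added with ForceInstanceCulling so it always goes through the
// Generic (culled) bucket.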
void FInstanceCullingContext::AddInstanceRunsToDrawCommand(uint32 IndirectArgsOffset, int32 InstanceDataOffset, const uint32* Runs, uint32 NumRuns, EInstanceFlags InstanceFlags, uint32 MaxBatchSize)
{
// Add items to the current Generic batch, as they are definitely instanced.
uint32 NumInstancesInRuns = 0;
for (uint32 Index = 0; Index < NumRuns; ++Index)
{
uint32 RunStart = Runs[Index * 2];
uint32 RunEndIncl = Runs[Index * 2 + 1];
uint32 NumInstances = (RunEndIncl + 1U) - RunStart;
AddInstancesToDrawCommand(IndirectArgsOffset, InstanceDataOffset + RunStart, NumInstancesInRuns, NumInstances, InstanceFlags | EInstanceFlags::ForceInstanceCulling, MaxBatchSize);
NumInstancesInRuns += NumInstances;
}
}
// Base class that provides common functionality between all compaction phases
class FCompactVisibleInstancesBaseCs : public FGlobalShader
{
public:
/** A compaction block is a group of instance IDs sized (N * NumViews). This is N. */
static constexpr int32 CompactionBlockNumInstances = 64;
FCompactVisibleInstancesBaseCs() = default;
FCompactVisibleInstancesBaseCs(const ShaderMetaType::CompiledShaderInitializerType& Initializer)
: FGlobalShader(Initializer)
{
}
static bool ShouldCompilePermutation(const FGlobalShaderPermutationParameters& Parameters)
{
// Currently compaction isn't supported on mobile
return UseGPUScene(Parameters.Platform) && GetMaxSupportedFeatureLevel(Parameters.Platform) > ERHIFeatureLevel::ES3_1;
}
static void ModifyCompilationEnvironment(const FGlobalShaderPermutationParameters& Parameters, FShaderCompilerEnvironment& OutEnvironment)
{
FGlobalShader::ModifyCompilationEnvironment(Parameters, OutEnvironment);
OutEnvironment.SetDefine(TEXT("COMPACTION_BLOCK_NUM_INSTANCES"), CompactionBlockNumInstances);
OutEnvironment.SetDefine(TEXT("INDIRECT_ARGS_NUM_WORDS"), FInstanceCullingContext::IndirectArgsNumWords);
}
};
// Compaction shader for phase one - calculate instance offsets for each instance compaction "block"
class FCalculateCompactBlockInstanceOffsetsCs final : public FCompactVisibleInstancesBaseCs
{
DECLARE_GLOBAL_SHADER(FCalculateCompactBlockInstanceOffsetsCs);
SHADER_USE_PARAMETER_STRUCT(FCalculateCompactBlockInstanceOffsetsCs, FCompactVisibleInstancesBaseCs)
public:
static constexpr int32 NumThreadsPerGroup = 512;
static void ModifyCompilationEnvironment(const FGlobalShaderPermutationParameters& Parameters, FShaderCompilerEnvironment& OutEnvironment)
{
FCompactVisibleInstancesBaseCs::ModifyCompilationEnvironment(Parameters, OutEnvironment);
OutEnvironment.SetDefine(TEXT("CALCULATE_COMPACT_BLOCK_INSTANCE_OFFSETS"), 1);
OutEnvironment.SetDefine(TEXT("NUM_THREADS_PER_GROUP"), NumThreadsPerGroup);
}
BEGIN_SHADER_PARAMETER_STRUCT(FParameters, )
SHADER_PARAMETER_RDG_BUFFER_SRV(StructuredBuffer<FInstanceCullingContext::FCompactionData>, DrawCommandCompactionData)
SHADER_PARAMETER_RDG_BUFFER_SRV(StructuredBuffer<uint32>, BlockInstanceCounts)
SHADER_PARAMETER_RDG_BUFFER_UAV(RWStructuredBuffer<uint32>, BlockDestInstanceOffsetsOut)
SHADER_PARAMETER_RDG_BUFFER_UAV(RWBuffer<uint32>, DrawIndirectArgsBufferOut)
END_SHADER_PARAMETER_STRUCT()
};
IMPLEMENT_GLOBAL_SHADER(FCalculateCompactBlockInstanceOffsetsCs, "/Engine/Private/InstanceCulling/CompactVisibleInstances.usf", "CalculateCompactBlockInstanceOffsetsCS", SF_Compute);
// Compaction shader for phase two - output visible instances, compacted and in original draw order
class FCompactVisibleInstancesCs final : public FCompactVisibleInstancesBaseCs
{
DECLARE_GLOBAL_SHADER(FCompactVisibleInstancesCs);
SHADER_USE_PARAMETER_STRUCT(FCompactVisibleInstancesCs, FCompactVisibleInstancesBaseCs)
public:
static constexpr int32 NumThreadsPerGroup = FCompactVisibleInstancesBaseCs::CompactionBlockNumInstances;
static void ModifyCompilationEnvironment(const FGlobalShaderPermutationParameters& Parameters, FShaderCompilerEnvironment& OutEnvironment)
{
FCompactVisibleInstancesBaseCs::ModifyCompilationEnvironment(Parameters, OutEnvironment);
OutEnvironment.SetDefine(TEXT("COMPACT_VISIBLE_INSTANCES"), 1);
OutEnvironment.SetDefine(TEXT("NUM_THREADS_PER_GROUP"), NumThreadsPerGroup);
}
BEGIN_SHADER_PARAMETER_STRUCT(FParameters, )
SHADER_PARAMETER_RDG_BUFFER_SRV(StructuredBuffer<FInstanceCullingContext::FCompactionData>, DrawCommandCompactionData)
SHADER_PARAMETER_RDG_BUFFER_SRV(StructuredBuffer<uint32>, BlockDrawCommandIndices)
SHADER_PARAMETER_RDG_BUFFER_SRV(StructuredBuffer<uint32>, InstanceIdsBufferIn)
SHADER_PARAMETER_RDG_BUFFER_SRV(StructuredBuffer<uint32>, BlockDestInstanceOffsets)
SHADER_PARAMETER_RDG_BUFFER_UAV(RWStructuredBuffer<uint32>, InstanceIdsBufferOut)
SHADER_PARAMETER_RDG_BUFFER_UAV(RWStructuredBuffer<float4>, InstanceIdsBufferOutMobile)
END_SHADER_PARAMETER_STRUCT()
};
IMPLEMENT_GLOBAL_SHADER(FCompactVisibleInstancesCs, "/Engine/Private/InstanceCulling/CompactVisibleInstances.usf", "CompactVisibleInstances", SF_Compute);
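// Main instance-culling / ID-build compute shader. Roughly: each load balancer work item expands into
// (instance, view) pairs; the shader tests each instance against the view data (frustum and, when
// enabled, HZB occlusion) using GPU-Scene data, then writes surviving instance IDs to the output buffer
// and bumps the InstanceCount of the corresponding indirect draw args.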
class FBuildInstanceIdBufferAndCommandsFromPrimitiveIdsCs : public FGlobalShader
{
DECLARE_GLOBAL_SHADER(FBuildInstanceIdBufferAndCommandsFromPrimitiveIdsCs);
SHADER_USE_PARAMETER_STRUCT(FBuildInstanceIdBufferAndCommandsFromPrimitiveIdsCs, FGlobalShader)
public:
static constexpr int32 NumThreadsPerGroup = FInstanceProcessingGPULoadBalancer::ThreadGroupSize;
// GPUCULL_TODO: remove once buffer is somehow unified
class FSingleInstanceModeDim : SHADER_PERMUTATION_BOOL("SINGLE_INSTANCE_MODE");
class FCullInstancesDim : SHADER_PERMUTATION_BOOL("CULL_INSTANCES");
class FAllowWPODisableDim : SHADER_PERMUTATION_BOOL("ALLOW_WPO_DISABLE");
class FOcclusionCullInstancesDim : SHADER_PERMUTATION_BOOL("OCCLUSION_CULL_INSTANCES");
class FStereoModeDim : SHADER_PERMUTATION_BOOL("STEREO_CULLING_MODE");
class FBatchedDim : SHADER_PERMUTATION_BOOL("ENABLE_BATCH_MODE");
class FInstanceCompactionDim : SHADER_PERMUTATION_BOOL("ENABLE_INSTANCE_COMPACTION");
using FPermutationDomain = TShaderPermutationDomain<FSingleInstanceModeDim, FCullInstancesDim, FAllowWPODisableDim, FOcclusionCullInstancesDim, FStereoModeDim, FBatchedDim, FInstanceCompactionDim>;
static bool ShouldCompilePermutation(const FGlobalShaderPermutationParameters& Parameters)
{
if (!UseGPUScene(Parameters.Platform))
{
return false;
}
const FPermutationDomain PermutationVector(Parameters.PermutationId);
// Currently, instance compaction is not supported on mobile platforms
if (PermutationVector.Get<FInstanceCompactionDim>() && IsMobilePlatform(Parameters.Platform))
{
return false;
}
// Current behavior is that instance culling coerces the WPO disable distance check, so don't compile permutations
// that include the former and exclude the latter
if (PermutationVector.Get<FCullInstancesDim>() && !PermutationVector.Get<FAllowWPODisableDim>())
{
return false;
}
return true;
}
static void ModifyCompilationEnvironment(const FGlobalShaderPermutationParameters& Parameters, FShaderCompilerEnvironment& OutEnvironment)
{
FGlobalShader::ModifyCompilationEnvironment(Parameters, OutEnvironment);
FInstanceProcessingGPULoadBalancer::SetShaderDefines(OutEnvironment);
// Force use of DXC for platforms compiled with hlslcc due to hlslcc's inability to handle member functions in structs
if (FDataDrivenShaderPlatformInfo::GetIsHlslcc(Parameters.Platform))
{
OutEnvironment.CompilerFlags.Add(CFLAG_ForceDXC);
}
// This shader takes a very long time to compile with FXC, so we pre-compile it with DXC first and then forward the optimized HLSL to FXC.
OutEnvironment.CompilerFlags.Add(CFLAG_PrecompileWithDXC);
OutEnvironment.SetDefine(TEXT("INDIRECT_ARGS_NUM_WORDS"), FInstanceCullingContext::IndirectArgsNumWords);
OutEnvironment.SetDefine(TEXT("USE_GLOBAL_GPU_SCENE_DATA"), 1);
OutEnvironment.SetDefine(TEXT("USE_GLOBAL_GPU_LIGHTMAP_DATA"), 1);
OutEnvironment.SetDefine(TEXT("NANITE_MULTI_VIEW"), 1);
OutEnvironment.SetDefine(TEXT("PRIM_ID_DYNAMIC_FLAG"), GPrimIDDynamicFlag);
OutEnvironment.SetDefine(TEXT("COMPACTION_BLOCK_NUM_INSTANCES"), FCompactVisibleInstancesBaseCs::CompactionBlockNumInstances);
OutEnvironment.SetDefine(TEXT("BATCH_PROCESSING_MODE_GENERIC"), uint32(EBatchProcessingMode::Generic));
OutEnvironment.SetDefine(TEXT("BATCH_PROCESSING_MODE_UNCULLED"), uint32(EBatchProcessingMode::UnCulled));
OutEnvironment.SetDefine(TEXT("BATCH_PROCESSING_MODE_NUM"), uint32(EBatchProcessingMode::Num));
const FPermutationDomain PermutationVector(Parameters.PermutationId);
EBatchProcessingMode ProcessingMode = (PermutationVector.Get<FSingleInstanceModeDim>() ? EBatchProcessingMode::UnCulled : EBatchProcessingMode::Generic);
OutEnvironment.SetDefine(TEXT("INSTANCE_DATA_STRIDE_ELEMENTS"), GetInstanceDataStrideElements(Parameters.Platform, ProcessingMode));
static const auto CVarPrimitiveHasTileOffsetData = IConsoleManager::Get().FindTConsoleVariableDataInt(TEXT("r.PrimitiveHasTileOffsetData"));
const bool bPrimitiveHasTileOffsetData = CVarPrimitiveHasTileOffsetData->GetValueOnAnyThread() != 0;
OutEnvironment.SetDefine(TEXT("PRIMITIVE_HAS_TILEOFFSET_DATA"), bPrimitiveHasTileOffsetData ? 1 : 0);
DynamicMeshBoundsModifyCompilationEnvironment(OutEnvironment);
}
BEGIN_SHADER_PARAMETER_STRUCT(FParameters, )
SHADER_PARAMETER_RDG_UNIFORM_BUFFER(FSceneUniformParameters, Scene)
SHADER_PARAMETER_STRUCT_INCLUDE(FGPUSceneResourceParameters, GPUSceneParameters)
SHADER_PARAMETER_STRUCT_INCLUDE(FInstanceProcessingGPULoadBalancer::FShaderParameters, LoadBalancerParameters)
SHADER_PARAMETER_STRUCT_INCLUDE(RendererViewData::FCullingShaderParameters, ViewDataCullingParameters)
SHADER_PARAMETER_STRUCT_INCLUDE(FDynamicMeshBoundsShaderParameters, DynamicBoundsParameters )
SHADER_PARAMETER_RDG_BUFFER_SRV(StructuredBuffer< FUintVector2 >, DrawCommandDescs)
SHADER_PARAMETER_RDG_BUFFER_SRV(StructuredBuffer< FInstanceCullingContext::FPayloadData >, InstanceCullingPayloads)
SHADER_PARAMETER_RDG_BUFFER_SRV(StructuredBuffer< uint32 >, ViewIds)
SHADER_PARAMETER_RDG_BUFFER_SRV(StructuredBuffer< FContextBatchInfo >, BatchInfos)
SHADER_PARAMETER_RDG_BUFFER_SRV(StructuredBuffer< uint32 >, BatchInds)
SHADER_PARAMETER_RDG_BUFFER_SRV(Buffer<uint>, InstanceIdOffsetBuffer)
SHADER_PARAMETER_RDG_BUFFER_SRV(Buffer<uint>, InstanceOcclusionQueryBuffer)
SHADER_PARAMETER_RDG_BUFFER_UAV(RWStructuredBuffer<uint>, InstanceIdsBufferOut)
SHADER_PARAMETER_RDG_BUFFER_UAV(RWStructuredBuffer<float4>, InstanceIdsBufferOutMobile)
SHADER_PARAMETER_RDG_BUFFER_UAV(RWBuffer<uint>, DrawIndirectArgsBufferOut)
SHADER_PARAMETER_RDG_BUFFER_SRV(StructuredBuffer<FInstanceCullingContext::FCompactionData>, DrawCommandCompactionData)
SHADER_PARAMETER_RDG_BUFFER_UAV(RWStructuredBuffer<uint>, CompactInstanceIdsBufferOut)
SHADER_PARAMETER_RDG_BUFFER_UAV(RWStructuredBuffer<uint>, CompactionBlockCounts)
SHADER_PARAMETER_RDG_TEXTURE(Texture2D, HZBTexture)
SHADER_PARAMETER_SAMPLER(SamplerState, HZBSampler)
SHADER_PARAMETER(FVector2f, HZBSize)
SHADER_PARAMETER(uint32, NumViewIds)
SHADER_PARAMETER(uint32, CurrentBatchProcessingMode)
SHADER_PARAMETER(int32, DynamicInstanceIdOffset)
SHADER_PARAMETER(int32, DynamicInstanceIdMax)
END_SHADER_PARAMETER_STRUCT()
};
IMPLEMENT_GLOBAL_SHADER(FBuildInstanceIdBufferAndCommandsFromPrimitiveIdsCs, "/Engine/Private/InstanceCulling/BuildInstanceDrawCommands.usf", "InstanceCullBuildInstanceIdBufferCS", SF_Compute);
const TRDGUniformBufferRef<FInstanceCullingGlobalUniforms> FInstanceCullingContext::CreateDummyInstanceCullingUniformBuffer(FRDGBuilder& GraphBuilder)
{
FInstanceCullingGlobalUniforms* InstanceCullingGlobalUniforms = GraphBuilder.AllocParameters<FInstanceCullingGlobalUniforms>();
FRDGBufferRef DummyBuffer = GSystemTextures.GetDefaultStructuredBuffer(GraphBuilder, 4);
InstanceCullingGlobalUniforms->InstanceIdsBuffer = GraphBuilder.CreateSRV(DummyBuffer);
InstanceCullingGlobalUniforms->PageInfoBuffer = GraphBuilder.CreateSRV(DummyBuffer);
InstanceCullingGlobalUniforms->BufferCapacity = 0;
return GraphBuilder.CreateUniformBuffer(InstanceCullingGlobalUniforms);
}
class FInstanceCullingDeferredContext : public FInstanceCullingMergedContext
{
public:
FInstanceCullingDeferredContext(EShaderPlatform InShaderPlatform, FInstanceCullingManager* InInstanceCullingManager = nullptr, int32 InNumBins=2)
: FInstanceCullingMergedContext(InShaderPlatform, false, InNumBins)
, InstanceCullingManager(InInstanceCullingManager)
{}
FInstanceCullingManager* InstanceCullingManager;
FRDGBufferRef DrawIndirectArgsBuffer = nullptr;
FRDGBufferRef InstanceDataBuffer = nullptr;
TRDGUniformBufferRef<FInstanceCullingGlobalUniforms> UniformBuffer = nullptr;
TRDGUniformBufferRef<FBatchedPrimitiveParameters> BatchedPrimitive = nullptr;
bool bProcessed = false;
void ProcessBatched(TArray<FBuildInstanceIdBufferAndCommandsFromPrimitiveIdsCs::FParameters*> PassParameters);
#if MESH_DRAW_COMMAND_STATS
FRHIGPUBufferReadback* MeshDrawCommandStatsIndirectArgsReadbackBuffer = nullptr;
#endif
};
static uint32 GetInstanceIdBufferSize(EShaderPlatform ShaderPlatform, uint32 NumInstanceElements)
{
if (PlatformGPUSceneUsesUniformBufferView(ShaderPlatform))
{
// Add an additional max range slack to a buffer size, so when binding last element we still have a full UBO range
NumInstanceElements += (PLATFORM_MAX_UNIFORM_BUFFER_RANGE / 16u);
return NumInstanceElements;
}
else
{
// Desktop uses a StructuredBuffer<uint>, so NumElements == NumInstances
return NumInstanceElements;
}
}
static FRDGBufferDesc CreateInstanceIdBufferDesc(EShaderPlatform ShaderPlatform, uint32 NumInstanceElements)
{
if (PlatformGPUSceneUsesUniformBufferView(ShaderPlatform))
{
// float4
FRDGBufferDesc Desc = FRDGBufferDesc::CreateStructuredDesc(16u, NumInstanceElements);
Desc.Usage |= EBufferUsageFlags::UniformBuffer;
return Desc;
}
else
{
return FRDGBufferDesc::CreateStructuredDesc(sizeof(uint32), NumInstanceElements);
}
}
void FInstanceCullingContext::BuildRenderingCommands(
FRDGBuilder& GraphBuilder,
const FGPUScene& GPUScene,
int32 InDynamicInstanceIdOffset,
int32 InDynamicInstanceIdNum,
FInstanceCullingResult& Results)
{
check(!SyncPrerequisitesFunc);
Results = FInstanceCullingResult();
SetDynamicPrimitiveInstanceOffsets(InDynamicInstanceIdOffset, InDynamicInstanceIdNum);
BuildRenderingCommandsInternal(GraphBuilder, GPUScene, EAsyncProcessingMode::Synchronous, &Results.Parameters);
}
void FInstanceCullingContext::BuildRenderingCommands(FRDGBuilder& GraphBuilder, const FGPUScene& GPUScene, FInstanceCullingDrawParams* InstanceCullingDrawParams)
{
BuildRenderingCommandsInternal(GraphBuilder, GPUScene, EAsyncProcessingMode::DeferredOrAsync, InstanceCullingDrawParams);
}
bool FInstanceCullingContext::HasCullingCommands() const
{
check(!SyncPrerequisitesFunc);
return TotalInstances > 0;
}
void FInstanceCullingContext::BuildRenderingCommandsInternal(
FRDGBuilder& GraphBuilder,
const FGPUScene& GPUScene,
EAsyncProcessingMode AsyncProcessingMode,
FInstanceCullingDrawParams* InstanceCullingDrawParams)
{
#if MESH_DRAW_COMMAND_STATS
if (MeshDrawCommandPassStats)
{
check(!MeshDrawCommandPassStats->bBuildRenderingCommandsCalled);
MeshDrawCommandPassStats->bBuildRenderingCommandsCalled = true;
}
#endif
check(InstanceCullingDrawParams);
FMemory::Memzero(*InstanceCullingDrawParams);
if (InstanceCullingManager)
{
InstanceCullingDrawParams->Scene = InstanceCullingManager->SceneUniforms.GetBuffer(GraphBuilder);
}
if (AsyncProcessingMode != EAsyncProcessingMode::Synchronous && InstanceCullingManager && InstanceCullingManager->IsDeferredCullingActive() && (InstanceCullingMode == EInstanceCullingMode::Normal))
{
FInstanceCullingDeferredContext *DeferredContext = InstanceCullingManager->DeferredContext;
// If this is true, then RDG Execute or Drain has been called, and no further contexts can be deferred.
if (!DeferredContext->bProcessed)
{
InstanceCullingDrawParams->DrawIndirectArgsBuffer = DeferredContext->DrawIndirectArgsBuffer;
InstanceCullingDrawParams->InstanceIdOffsetBuffer = DeferredContext->InstanceDataBuffer;
InstanceCullingDrawParams->InstanceCulling = DeferredContext->UniformBuffer;
InstanceCullingDrawParams->BatchedPrimitive = DeferredContext->BatchedPrimitive;
DeferredContext->AddBatch(GraphBuilder, this, InstanceCullingDrawParams);
}
return;
}
WaitForSetupTask();
if (!HasCullingCommands())
{
if (InstanceCullingManager)
{
InstanceCullingDrawParams->InstanceCulling = InstanceCullingManager->GetDummyInstanceCullingUniformBuffer();
}
return;
}
check(DynamicInstanceIdOffset >= 0);
check(DynamicInstanceIdNum >= 0);
ensure(InstanceCullingMode == EInstanceCullingMode::Normal || ViewIds.Num() == 2);
// If there is no manager, then there is no culling data, so set the flag to skip culling and ignore the buffers.
const bool bCullInstances = InstanceCullingManager != nullptr && CVarCullInstances.GetValueOnRenderThread() != 0;
const bool bAllowWPODisable = InstanceCullingManager != nullptr;
RDG_EVENT_SCOPE_STAT(GraphBuilder, BuildRenderingCommands, "BuildRenderingCommands(Culling=%s)", bCullInstances ? TEXT("On") : TEXT("Off"));
RDG_GPU_STAT_SCOPE(GraphBuilder, BuildRenderingCommands);
const bool bOrderPreservationEnabled = IsInstanceOrderPreservationEnabled();
const uint32 NumCompactionBlocks = uint32(CompactionBlockDataIndices.Num());
FRDGBufferRef CompactInstanceIdsBuffer = nullptr;
FRDGBufferUAVRef CompactInstanceIdsUAV = nullptr;
FRDGBufferRef CompactionBlockCountsBuffer = nullptr;
FRDGBufferUAVRef CompactionBlockCountsUAV = nullptr;
FRDGBufferSRVRef DrawCommandCompactionDataSRV = nullptr;
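// Order preservation overview: the culling pass appends visible instances with atomics, so output order
// is nondeterministic. For draw commands that must preserve instance order, visible instances are first
// written into a temporary buffer partitioned into fixed-size compaction blocks (see
// FCompactVisibleInstancesBaseCs); two follow-up passes then compute per-block destination offsets and
// scatter the instances into the final instance ID buffer in their original order.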
if (bOrderPreservationEnabled)
{
// Create buffers for compacting instances for draw commands that need it
CompactInstanceIdsBuffer = GraphBuilder.CreateBuffer(FRDGBufferDesc::CreateStructuredDesc(sizeof(uint32), FMath::Max(NumCompactionInstances, 1u)), TEXT("InstanceCulling.Compaction.TempInstanceIdsBuffer"));
CompactInstanceIdsUAV = GraphBuilder.CreateUAV(CompactInstanceIdsBuffer);
CompactionBlockCountsBuffer = GraphBuilder.CreateBuffer(FRDGBufferDesc::CreateStructuredDesc(sizeof(uint32), FMath::Max(NumCompactionBlocks, 1u)), TEXT("InstanceCulling.Compaction.BlockInstanceCounts"));
CompactionBlockCountsUAV = GraphBuilder.CreateUAV(CompactionBlockCountsBuffer);
FRDGBufferRef DrawCommandCompactionDataBuffer = nullptr;
if (DrawCommandCompactionData.Num() > 0)
{
DrawCommandCompactionDataBuffer = CreateStructuredBuffer(GraphBuilder, TEXT("InstanceCulling.DrawCommandCompactionData"), DrawCommandCompactionData);
}
else
{
DrawCommandCompactionDataBuffer = GSystemTextures.GetDefaultStructuredBuffer(GraphBuilder, sizeof(FCompactionData));
}
DrawCommandCompactionDataSRV = GraphBuilder.CreateSRV(DrawCommandCompactionDataBuffer);
if (NumCompactionBlocks > 0)
{
ensure(NumCompactionInstances > 0);
// We must clear the block counts buffer, as it will be written to using atomic increments
AddClearUAVPass(GraphBuilder, CompactionBlockCountsUAV, 0);
}
}
FGlobalShaderMap* ShaderMap = GetGlobalShaderMap(ShaderPlatform);
FRDGBufferRef ViewIdsBuffer = CreateStructuredBuffer(GraphBuilder, TEXT("InstanceCulling.ViewIds"), ViewIds);
const uint32 InstanceIdBufferSize = GetInstanceIdBufferSize(ShaderPlatform, GetInstanceIdNumElements());
FRDGBufferRef InstanceIdsBuffer = GraphBuilder.CreateBuffer(CreateInstanceIdBufferDesc(ShaderPlatform, InstanceIdBufferSize), TEXT("InstanceCulling.InstanceIdsBuffer"));
FRDGBufferUAVRef InstanceIdsBufferUAV = GraphBuilder.CreateUAV(InstanceIdsBuffer, ERDGUnorderedAccessViewFlags::SkipBarrier);
FBuildInstanceIdBufferAndCommandsFromPrimitiveIdsCs::FParameters PassParametersTmp;
PassParametersTmp.DrawCommandDescs = GraphBuilder.CreateSRV(CreateStructuredBuffer(GraphBuilder, TEXT("InstanceCulling.DrawCommandDescs"), DrawCommandDescs));
PassParametersTmp.InstanceCullingPayloads = GraphBuilder.CreateSRV(CreateStructuredBuffer(GraphBuilder, TEXT("InstanceCulling.PayloadData"), PayloadData));
// NOTE: because it is possible to not have access to the SceneUB (for convoluted reasons), we bind the scene data directly,
// but to get access to the other facilities when available, we also bind the scene UB. This works because culling is disabled when InstanceCullingManager is not available.
PassParametersTmp.Scene = InstanceCullingManager ? InstanceCullingManager->SceneUniforms.GetBuffer(GraphBuilder) : nullptr;
PassParametersTmp.GPUSceneParameters = GPUScene.GetShaderParameters(GraphBuilder);
PassParametersTmp.DynamicInstanceIdOffset = DynamicInstanceIdOffset;
PassParametersTmp.DynamicInstanceIdMax = DynamicInstanceIdOffset + DynamicInstanceIdNum;
// Compaction parameters
PassParametersTmp.DrawCommandCompactionData = DrawCommandCompactionDataSRV;
PassParametersTmp.CompactInstanceIdsBufferOut = CompactInstanceIdsUAV;
PassParametersTmp.CompactionBlockCounts = CompactionBlockCountsUAV;
// Create the buffer for indirect args and upload the draw arg data; the uploaded args also have their instance counts initialized to zero
FRDGBufferDesc IndirectArgsDesc = FRDGBufferDesc::CreateIndirectDesc(IndirectArgsNumWords * IndirectArgs.Num());
IndirectArgsDesc.Usage = EBufferUsageFlags(IndirectArgsDesc.Usage | BUF_MultiGPUGraphIgnore);
FRDGBufferRef DrawIndirectArgsRDG = GraphBuilder.CreateBuffer(IndirectArgsDesc, TEXT("InstanceCulling.DrawIndirectArgsBuffer"));
GraphBuilder.QueueBufferUpload(DrawIndirectArgsRDG, IndirectArgs.GetData(), IndirectArgs.GetTypeSize() * IndirectArgs.Num());
// Note: we redundantly clear the instance counts here as there is some issue with replays on certain consoles.
AddClearIndirectArgInstanceCountPass(GraphBuilder, ShaderMap, DrawIndirectArgsRDG);
// Not using a structured buffer as we have to access it as a vertex buffer
FRDGBufferRef InstanceIdOffsetBufferRDG = GraphBuilder.CreateBuffer(FRDGBufferDesc::CreateBufferDesc(sizeof(uint32), InstanceIdOffsets.Num()), TEXT("InstanceCulling.InstanceIdOffsetBuffer"));
GraphBuilder.QueueBufferUpload(InstanceIdOffsetBufferRDG, InstanceIdOffsets.GetData(), InstanceIdOffsets.GetTypeSize() * InstanceIdOffsets.Num());
PassParametersTmp.ViewIds = GraphBuilder.CreateSRV(ViewIdsBuffer);
PassParametersTmp.ViewDataCullingParameters.NumCullingViews = 0;
if ((bCullInstances || bAllowWPODisable) && InstanceCullingManager)
{
PassParametersTmp.ViewDataCullingParameters = InstanceCullingManager->ViewDataManager.GetCullingParameters(GraphBuilder);
#if DO_CHECK
for (int32 ViewId : ViewIds)
{
checkf(ViewId < int32(PassParametersTmp.ViewDataCullingParameters.NumCullingViews), TEXT("Attempting to process a culling context that references a view that has not been uploaded yet."));
}
#endif
}
PassParametersTmp.DynamicBoundsParameters = GetDynamicMeshBoundsShaderParameters(GraphBuilder);
PassParametersTmp.NumViewIds = ViewIds.Num();
// only one of these will be used in the shader
PassParametersTmp.InstanceIdsBufferOut = InstanceIdsBufferUAV;
PassParametersTmp.InstanceIdsBufferOutMobile = InstanceIdsBufferUAV;
PassParametersTmp.DrawIndirectArgsBufferOut = GraphBuilder.CreateUAV(DrawIndirectArgsRDG, PF_R32_UINT, ERDGUnorderedAccessViewFlags::SkipBarrier);
PassParametersTmp.InstanceIdOffsetBuffer = GraphBuilder.CreateSRV(InstanceIdOffsetBufferRDG, PF_R32_UINT);
const bool bOcclusionCullInstances = PrevHZB.IsValid() && IsOcclusionCullingEnabled();
if (bOcclusionCullInstances)
{
PassParametersTmp.HZBTexture = GraphBuilder.RegisterExternalTexture(PrevHZB);
PassParametersTmp.HZBSize = PassParametersTmp.HZBTexture->Desc.Extent;
PassParametersTmp.HZBSampler = TStaticSamplerState< SF_Point, AM_Clamp, AM_Clamp, AM_Clamp >::GetRHI();
}
if (InstanceCullingManager && InstanceCullingManager->InstanceOcclusionQueryBuffer)
{
PassParametersTmp.InstanceOcclusionQueryBuffer = GraphBuilder.CreateSRV(
InstanceCullingManager->InstanceOcclusionQueryBuffer,
InstanceCullingManager->InstanceOcclusionQueryBufferFormat);
}
else
{
FRDGBufferRef DummyBuffer = GSystemTextures.GetDefaultBuffer(GraphBuilder, 4, 0u);
PassParametersTmp.InstanceOcclusionQueryBuffer = GraphBuilder.CreateSRV(DummyBuffer, PF_R32_UINT);
}
for (uint32 Mode = 0U; Mode < uint32(EBatchProcessingMode::Num); ++Mode)
{
FInstanceProcessingGPULoadBalancer* LoadBalancer = LoadBalancers[Mode];
if (!LoadBalancer->IsEmpty())
{
FBuildInstanceIdBufferAndCommandsFromPrimitiveIdsCs::FParameters* PassParameters = GraphBuilder.AllocParameters<FBuildInstanceIdBufferAndCommandsFromPrimitiveIdsCs::FParameters>();
*PassParameters = PassParametersTmp;
// Upload data etc
auto GPUData = LoadBalancer->Upload(GraphBuilder);
GPUData.GetShaderParameters(GraphBuilder, PassParameters->LoadBalancerParameters);
PassParameters->CurrentBatchProcessingMode = Mode;
// UnCulled bucket is used for a single instance mode
check(EBatchProcessingMode(Mode) != EBatchProcessingMode::UnCulled || LoadBalancer->HasSingleInstanceItemsOnly());
FBuildInstanceIdBufferAndCommandsFromPrimitiveIdsCs::FPermutationDomain PermutationVector;
PermutationVector.Set<FBuildInstanceIdBufferAndCommandsFromPrimitiveIdsCs::FSingleInstanceModeDim>(EBatchProcessingMode(Mode) == EBatchProcessingMode::UnCulled);
PermutationVector.Set<FBuildInstanceIdBufferAndCommandsFromPrimitiveIdsCs::FCullInstancesDim>(bCullInstances && EBatchProcessingMode(Mode) != EBatchProcessingMode::UnCulled);
PermutationVector.Set<FBuildInstanceIdBufferAndCommandsFromPrimitiveIdsCs::FAllowWPODisableDim>(bAllowWPODisable);
PermutationVector.Set<FBuildInstanceIdBufferAndCommandsFromPrimitiveIdsCs::FOcclusionCullInstancesDim>(bOcclusionCullInstances);
PermutationVector.Set<FBuildInstanceIdBufferAndCommandsFromPrimitiveIdsCs::FStereoModeDim>(InstanceCullingMode == EInstanceCullingMode::Stereo);
PermutationVector.Set<FBuildInstanceIdBufferAndCommandsFromPrimitiveIdsCs::FBatchedDim>(false);
PermutationVector.Set<FBuildInstanceIdBufferAndCommandsFromPrimitiveIdsCs::FInstanceCompactionDim>(bOrderPreservationEnabled);
auto ComputeShader = ShaderMap->GetShader<FBuildInstanceIdBufferAndCommandsFromPrimitiveIdsCs>(PermutationVector);
FComputeShaderUtils::AddPass(
GraphBuilder,
RDG_EVENT_NAME("CullInstances(%s)", BatchProcessingModeStr[Mode]),
ComputeShader,
PassParameters,
LoadBalancer->GetWrappedCsGroupCount()
);
}
}
if (bOrderPreservationEnabled && NumCompactionBlocks > 0)
{
FRDGBufferRef BlockDestInstanceOffsets = GraphBuilder.CreateBuffer(FRDGBufferDesc::CreateStructuredDesc(sizeof(uint32), NumCompactionBlocks), TEXT("InstanceCulling.Compaction.BlockDestInstanceOffsets"));
// Compaction phase one - prefix sum of the compaction "blocks"
{
auto PassParameters = GraphBuilder.AllocParameters<FCalculateCompactBlockInstanceOffsetsCs::FParameters>();
PassParameters->DrawCommandCompactionData = DrawCommandCompactionDataSRV;
PassParameters->BlockInstanceCounts = GraphBuilder.CreateSRV(CompactionBlockCountsBuffer);
PassParameters->BlockDestInstanceOffsetsOut = GraphBuilder.CreateUAV(BlockDestInstanceOffsets);
PassParameters->DrawIndirectArgsBufferOut = PassParametersTmp.DrawIndirectArgsBufferOut;
auto ComputeShader = ShaderMap->GetShader<FCalculateCompactBlockInstanceOffsetsCs>();
FComputeShaderUtils::AddPass(
GraphBuilder,
RDG_EVENT_NAME("Instance Compaction Phase 1"),
ComputeShader,
PassParameters,
FComputeShaderUtils::GetGroupCountWrapped(DrawCommandCompactionData.Num())
);
}
// Compaction phase two - write instances to compact final location
{
FRDGBufferRef BlockDrawCommandIndices = CreateStructuredBuffer(GraphBuilder, TEXT("InstanceCulling.Compaction.BlockDrawCommandIndices"), CompactionBlockDataIndices);
auto PassParameters = GraphBuilder.AllocParameters<FCompactVisibleInstancesCs::FParameters>();
PassParameters->DrawCommandCompactionData = DrawCommandCompactionDataSRV;
PassParameters->BlockDrawCommandIndices = GraphBuilder.CreateSRV(BlockDrawCommandIndices);
PassParameters->InstanceIdsBufferIn = GraphBuilder.CreateSRV(CompactInstanceIdsBuffer);
PassParameters->BlockDestInstanceOffsets = GraphBuilder.CreateSRV(BlockDestInstanceOffsets);
PassParameters->InstanceIdsBufferOut = InstanceIdsBufferUAV;
PassParameters->InstanceIdsBufferOutMobile = InstanceIdsBufferUAV;
auto ComputeShader = ShaderMap->GetShader<FCompactVisibleInstancesCs>();
FComputeShaderUtils::AddPass(
GraphBuilder,
RDG_EVENT_NAME("Instance Compaction Phase 2"),
ComputeShader,
PassParameters,
FComputeShaderUtils::GetGroupCountWrapped(NumCompactionBlocks)
);
}
}
InstanceCullingDrawParams->DrawIndirectArgsBuffer = DrawIndirectArgsRDG;
InstanceCullingDrawParams->InstanceIdOffsetBuffer = InstanceIdOffsetBufferRDG;
if (PlatformGPUSceneUsesUniformBufferView(ShaderPlatform))
{
FBatchedPrimitiveParameters* BatchedPrimitiveParameters = GraphBuilder.AllocParameters<FBatchedPrimitiveParameters>();
BatchedPrimitiveParameters->Data = GraphBuilder.CreateSRV(InstanceIdsBuffer);
InstanceCullingDrawParams->BatchedPrimitive = GraphBuilder.CreateUniformBuffer(BatchedPrimitiveParameters);
}
else
{
FInstanceCullingGlobalUniforms* UniformParameters = GraphBuilder.AllocParameters<FInstanceCullingGlobalUniforms>();
UniformParameters->InstanceIdsBuffer = GraphBuilder.CreateSRV(InstanceIdsBuffer);
UniformParameters->PageInfoBuffer = GraphBuilder.CreateSRV(InstanceIdsBuffer);
UniformParameters->BufferCapacity = InstanceIdBufferSize;
InstanceCullingDrawParams->InstanceCulling = GraphBuilder.CreateUniformBuffer(UniformParameters);
}
#if MESH_DRAW_COMMAND_STATS
if (MeshDrawCommandPassStats)
{
FRHIGPUBufferReadback* GPUBufferReadback = FMeshDrawCommandStatsManager::Get()->QueueDrawRDGIndirectArgsReadback(GraphBuilder, DrawIndirectArgsRDG);
MeshDrawCommandPassStats->SetInstanceCullingGPUBufferReadback(GPUBufferReadback, 0);
}
#endif
}
void FInstanceCullingDeferredContext::ProcessBatched(TArray<FBuildInstanceIdBufferAndCommandsFromPrimitiveIdsCs::FParameters*> PassParameters)
{
if (bProcessed)
{
return;
}
MergeBatches();
#if MESH_DRAW_COMMAND_STATS
// Set up the indirect args readback buffer and the correct offset for each pass in the merged buffer
if (MeshDrawCommandStatsIndirectArgsReadbackBuffer)
{
for (int32 BatchIndex = 0; BatchIndex < Batches.Num(); ++BatchIndex)
{
const FBatchItem& BatchItem = Batches[BatchIndex];
if (BatchItem.Context->MeshDrawCommandPassStats)
{
BatchItem.Context->MeshDrawCommandPassStats->SetInstanceCullingGPUBufferReadback(MeshDrawCommandStatsIndirectArgsReadbackBuffer, BatchInfos[BatchIndex].IndirectArgsOffset);
}
}
}
#endif // MESH_DRAW_COMMAND_STATS
bProcessed = true;
// Finalize culling pass parameters
for (uint32 BinIndex = 0U; BinIndex < uint32(LoadBalancers.Num()); ++BinIndex)
{
PassParameters[BinIndex]->NumViewIds = ViewIds.Num();
PassParameters[BinIndex]->LoadBalancerParameters.NumBatches = LoadBalancers[BinIndex].GetBatches().Num();
PassParameters[BinIndex]->LoadBalancerParameters.NumItems = LoadBalancers[BinIndex].GetItems().Num();
PassParameters[BinIndex]->LoadBalancerParameters.NumGroupsPerBatch = 1;
}
}
template <typename DataType>
FORCEINLINE int32 GetArrayDataSize(const TArrayView<const DataType>& Array)
{
return Array.GetTypeSize() * Array.Num();
}
template <typename DataType, typename AllocatorType>
FORCEINLINE int32 GetArrayDataSize(const TArray<DataType, AllocatorType>& Array)
{
return Array.GetTypeSize() * Array.Num();
}
FInstanceCullingDeferredContext *FInstanceCullingContext::CreateDeferredContext(
FRDGBuilder& GraphBuilder,
const FGPUScene& GPUScene,
FInstanceCullingManager& InstanceCullingManager)
{
// Bin 0 is reserved for UnCulled batches; every other bin corresponds to one HZB, so at the very least we must have 2 bins.
// Generic batches with a null HZB go in bin 1, together with the ones associated with the first HZB.
uint32 NumBins = FMath::Max(2, InstanceCullingManager.ViewPrevHZBs.Num() + 1);
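// The helper macros below create RDG buffers whose size and contents are provided via callbacks. The
// callbacks run at RDG execution time, after all deferred contexts have been added, and each one first
// calls ProcessBatched() to make sure the batches have been merged before the merged arrays are read.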
#define INST_CULL_CALLBACK_BIN_INDEX(CustomCode) \
[PassParameters, DeferredContext, BinIndex]() \
{ \
DeferredContext->ProcessBatched(PassParameters); \
return CustomCode; \
}
#define INST_CULL_CALLBACK(CustomCode) \
[PassParameters, DeferredContext]() \
{ \
DeferredContext->ProcessBatched(PassParameters); \
return CustomCode; \
}
#define INST_CULL_CREATE_STRUCT_BUFF_ARGS(ArrayName) \
GraphBuilder, \
TEXT("InstanceCulling.") TEXT(#ArrayName), \
DeferredContext->ArrayName.GetTypeSize(), \
INST_CULL_CALLBACK(DeferredContext->ArrayName.Num()), \
INST_CULL_CALLBACK(DeferredContext->ArrayName.GetData()), \
INST_CULL_CALLBACK(DeferredContext->ArrayName.Num() * DeferredContext->ArrayName.GetTypeSize())
#define INST_CULL_CREATE_STRUCT_BUFF_ARGS_BIN_INDEX(ArrayName) \
GraphBuilder, \
TEXT("InstanceCulling.") TEXT(#ArrayName), \
DeferredContext->ArrayName[BinIndex].GetTypeSize(), \
INST_CULL_CALLBACK_BIN_INDEX(DeferredContext->ArrayName[BinIndex].Num()), \
INST_CULL_CALLBACK_BIN_INDEX(DeferredContext->ArrayName[BinIndex].GetData()), \
INST_CULL_CALLBACK_BIN_INDEX(DeferredContext->ArrayName[BinIndex].Num() * DeferredContext->ArrayName[BinIndex].GetTypeSize())
const ERHIFeatureLevel::Type FeatureLevel = GPUScene.GetFeatureLevel();
const EShaderPlatform ShaderPlatform = GPUScene.GetShaderPlatform();
FInstanceCullingDeferredContext* DeferredContext = GraphBuilder.AllocObject<FInstanceCullingDeferredContext>(ShaderPlatform, &InstanceCullingManager, NumBins);
const bool bCullInstances = CVarCullInstances.GetValueOnRenderThread() != 0;
const bool bAllowWPODisable = true;
RDG_EVENT_SCOPE_STAT(GraphBuilder, BuildRenderingCommandsDeferred, "BuildRenderingCommandsDeferred(Culling=%s)", bCullInstances ? TEXT("On") : TEXT("Off"));
RDG_GPU_STAT_SCOPE(GraphBuilder, BuildRenderingCommandsDeferred);
TArray<FBuildInstanceIdBufferAndCommandsFromPrimitiveIdsCs::FParameters*> PassParameters;
PassParameters.SetNum(NumBins);
for (uint32 BinIndex = 0U; BinIndex < NumBins; ++BinIndex)
{
PassParameters[BinIndex] = GraphBuilder.AllocParameters<FBuildInstanceIdBufferAndCommandsFromPrimitiveIdsCs::FParameters>();
}
// Create buffers for compacting instances for draw commands that need it
const bool bEnableInstanceCompaction = IsInstanceOrderPreservationAllowed(ShaderPlatform);
FRDGBufferSRVRef DrawCommandCompactionDataSRV = nullptr;
FRDGBufferRef CompactInstanceIdsBuffer = nullptr;
FRDGBufferUAVRef CompactInstanceIdsUAV = nullptr;
FRDGBufferRef CompactionBlockCountsBuffer = nullptr;
FRDGBufferUAVRef CompactionBlockCountsUAV = nullptr;
if (bEnableInstanceCompaction)
{
DrawCommandCompactionDataSRV = GraphBuilder.CreateSRV(CreateStructuredBuffer(INST_CULL_CREATE_STRUCT_BUFF_ARGS(DrawCommandCompactionData)));
CompactInstanceIdsBuffer = CreateStructuredBuffer(
GraphBuilder,
TEXT("InstanceCulling.Compaction.TempInstanceIdsBuffer"),
sizeof(uint32),
INST_CULL_CALLBACK(FMath::Max(DeferredContext->TotalCompactionInstances, 1)),
INST_CULL_CALLBACK(nullptr),
INST_CULL_CALLBACK(0));
CompactInstanceIdsUAV = GraphBuilder.CreateUAV(CompactInstanceIdsBuffer);
CompactionBlockCountsBuffer = CreateStructuredBuffer(
GraphBuilder,
TEXT("InstanceCulling.Compaction.BlockInstanceCounts"),
sizeof(uint32),
INST_CULL_CALLBACK(FMath::Max(DeferredContext->TotalCompactionBlocks, 1)),
INST_CULL_CALLBACK(nullptr),
INST_CULL_CALLBACK(0));
CompactionBlockCountsUAV = GraphBuilder.CreateUAV(CompactionBlockCountsBuffer);
// We must clear the block counts buffer, as it will be written to using atomic increments
// TODO: Come up with a clever way to cull this pass when no compaction is needed (currently can't know until the batch is complete on the RDG execution timeline).
AddClearUAVPass(GraphBuilder, CompactionBlockCountsUAV, 0);
}
FBuildInstanceIdBufferAndCommandsFromPrimitiveIdsCs::FParameters PassParametersTmp = {};
FRDGBufferRef DrawCommandDescsRDG = CreateStructuredBuffer(INST_CULL_CREATE_STRUCT_BUFF_ARGS(DrawCommandDescs));
FRDGBufferRef InstanceCullingPayloadsRDG = CreateStructuredBuffer(INST_CULL_CREATE_STRUCT_BUFF_ARGS(PayloadData));
FRDGBufferRef ViewIdsRDG = CreateStructuredBuffer(INST_CULL_CREATE_STRUCT_BUFF_ARGS(ViewIds));
FRDGBufferRef BatchInfosRDG = CreateStructuredBuffer(INST_CULL_CREATE_STRUCT_BUFF_ARGS(BatchInfos));
DeferredContext->DrawIndirectArgsBuffer = GraphBuilder.CreateBuffer(FRDGBufferDesc::CreateIndirectDesc(), TEXT("InstanceCulling.DrawIndirectArgsBuffer"), INST_CULL_CALLBACK(IndirectArgsNumWords * DeferredContext->IndirectArgs.Num()));
GraphBuilder.QueueBufferUpload(DeferredContext->DrawIndirectArgsBuffer, INST_CULL_CALLBACK(DeferredContext->IndirectArgs.GetData()), INST_CULL_CALLBACK(GetArrayDataSize(DeferredContext->IndirectArgs)));
FGlobalShaderMap* ShaderMap = GetGlobalShaderMap(FeatureLevel);
// Note: we redundantly clear the instance counts here as there is some issue with replays on certain consoles.
AddClearIndirectArgInstanceCountPass(GraphBuilder, ShaderMap, DeferredContext->DrawIndirectArgsBuffer, INST_CULL_CALLBACK(DeferredContext->IndirectArgs.Num()));
// Not using a structured buffer as we need to access it as a vertex buffer
FRDGBufferRef InstanceIdOffsetBuffer = GraphBuilder.CreateBuffer(FRDGBufferDesc::CreateBufferDesc(sizeof(uint32), 1), TEXT("InstanceCulling.InstanceIdOffsetBuffer"), INST_CULL_CALLBACK(DeferredContext->InstanceIdOffsets.Num()));
GraphBuilder.QueueBufferUpload(InstanceIdOffsetBuffer, INST_CULL_CALLBACK(DeferredContext->InstanceIdOffsets.GetData()), INST_CULL_CALLBACK(DeferredContext->InstanceIdOffsets.GetTypeSize() * DeferredContext->InstanceIdOffsets.Num()));
FRDGBufferRef InstanceIdsBuffer = GraphBuilder.CreateBuffer(
CreateInstanceIdBufferDesc(ShaderPlatform, 1),
TEXT("InstanceCulling.InstanceIdsBuffer"),
INST_CULL_CALLBACK(GetInstanceIdBufferSize(DeferredContext->ShaderPlatform, DeferredContext->InstanceIdBufferElements))
);
FRDGBufferUAVRef InstanceIdsBufferUAV = GraphBuilder.CreateUAV(InstanceIdsBuffer, ERDGUnorderedAccessViewFlags::SkipBarrier);
DeferredContext->InstanceDataBuffer = InstanceIdOffsetBuffer;
// Because the view uniforms are not set up by the time this runs
// PassParameters->View = View.ViewUniformBuffer;
// Set up global GPU-scene data instead...
// NOTE: because it is possible in the non-deferred path to not have access to the SceneUB (for convoluted reasons), we bind the scene data directly,
// but to get access to the other facilities when available, we also bind the scene UB. This works because culling is disabled when InstanceCullingManager is not available.
PassParametersTmp.Scene = InstanceCullingManager.SceneUniforms.GetBuffer(GraphBuilder);
PassParametersTmp.GPUSceneParameters = GPUScene.GetShaderParameters(GraphBuilder);
PassParametersTmp.DrawCommandDescs = GraphBuilder.CreateSRV(DrawCommandDescsRDG);
PassParametersTmp.InstanceCullingPayloads = GraphBuilder.CreateSRV(InstanceCullingPayloadsRDG);
PassParametersTmp.BatchInfos = GraphBuilder.CreateSRV(BatchInfosRDG);
PassParametersTmp.ViewIds = GraphBuilder.CreateSRV(ViewIdsRDG);
// only one of these will be used in the shader
PassParametersTmp.InstanceIdsBufferOut = InstanceIdsBufferUAV;
PassParametersTmp.InstanceIdsBufferOutMobile = InstanceIdsBufferUAV;
PassParametersTmp.DrawIndirectArgsBufferOut = GraphBuilder.CreateUAV(DeferredContext->DrawIndirectArgsBuffer, PF_R32_UINT, ERDGUnorderedAccessViewFlags::SkipBarrier);
PassParametersTmp.InstanceIdOffsetBuffer = GraphBuilder.CreateSRV(InstanceIdOffsetBuffer, PF_R32_UINT);
if (bCullInstances || bAllowWPODisable)
{
PassParametersTmp.ViewDataCullingParameters = InstanceCullingManager.ViewDataManager.GetCullingParameters(GraphBuilder);
}
PassParametersTmp.DynamicBoundsParameters = GetDynamicMeshBoundsShaderParameters(GraphBuilder);
// Compaction parameters
PassParametersTmp.DrawCommandCompactionData = DrawCommandCompactionDataSRV;
PassParametersTmp.CompactInstanceIdsBufferOut = CompactInstanceIdsUAV;
PassParametersTmp.CompactionBlockCounts = CompactionBlockCountsUAV;
if (InstanceCullingManager.InstanceOcclusionQueryBuffer)
{
PassParametersTmp.InstanceOcclusionQueryBuffer = GraphBuilder.CreateSRV(
InstanceCullingManager.InstanceOcclusionQueryBuffer,
InstanceCullingManager.InstanceOcclusionQueryBufferFormat);
}
else
{
FRDGBufferRef DummyBuffer = GSystemTextures.GetDefaultBuffer(GraphBuilder, 4, 0u);
PassParametersTmp.InstanceOcclusionQueryBuffer = GraphBuilder.CreateSRV(DummyBuffer, PF_R32_UINT);
}
// Record the number of culling views so we can check that no contexts referencing out-of-bounds views are queued up
DeferredContext->NumCullingViews = InstanceCullingManager.ViewDataManager.GetNumCullingViews();
for (uint32 BinIndex = 0U; BinIndex < NumBins; ++BinIndex)
{
*PassParameters[BinIndex] = PassParametersTmp;
FRDGBufferRef BatchIndsRDG = CreateStructuredBuffer(INST_CULL_CREATE_STRUCT_BUFF_ARGS_BIN_INDEX(BatchInds));
PassParameters[BinIndex]->BatchInds = GraphBuilder.CreateSRV(BatchIndsRDG);
// The first bin (0) is used for the EBatchProcessingMode::UnCulled batches
EBatchProcessingMode CurrentBatchProcessingMode = (BinIndex == 0) ? EBatchProcessingMode::UnCulled : EBatchProcessingMode::Generic;
FInstanceProcessingGPULoadBalancer::FGPUData Result;
FRDGBufferRef BatchBuffer = CreateStructuredBuffer(
GraphBuilder,
TEXT("InstanceCullingLoadBalancer.Batches"),
sizeof(FInstanceProcessingGPULoadBalancer::FPackedBatch),
INST_CULL_CALLBACK_BIN_INDEX(DeferredContext->LoadBalancers[BinIndex].GetBatches().Num()),
INST_CULL_CALLBACK_BIN_INDEX(DeferredContext->LoadBalancers[BinIndex].GetBatches().GetData()),
INST_CULL_CALLBACK_BIN_INDEX(GetArrayDataSize(DeferredContext->LoadBalancers[BinIndex].GetBatches())));
FRDGBufferRef ItemBuffer = CreateStructuredBuffer(
GraphBuilder,
TEXT("InstanceCullingLoadBalancer.Items"),
sizeof(FInstanceProcessingGPULoadBalancer::FPackedItem),
INST_CULL_CALLBACK_BIN_INDEX(DeferredContext->LoadBalancers[BinIndex].GetItems().Num()),
INST_CULL_CALLBACK_BIN_INDEX(DeferredContext->LoadBalancers[BinIndex].GetItems().GetData()),
INST_CULL_CALLBACK_BIN_INDEX(GetArrayDataSize(DeferredContext->LoadBalancers[BinIndex].GetItems())));
PassParameters[BinIndex]->LoadBalancerParameters.BatchBuffer = GraphBuilder.CreateSRV(BatchBuffer);
PassParameters[BinIndex]->LoadBalancerParameters.ItemBuffer = GraphBuilder.CreateSRV(ItemBuffer);
PassParameters[BinIndex]->LoadBalancerParameters.NumGroupsPerBatch = 1;
PassParameters[BinIndex]->CurrentBatchProcessingMode = static_cast<uint32>(CurrentBatchProcessingMode);
FRDGTextureRef ViewPrevHZB = nullptr;
if (BinIndex > 0 && InstanceCullingManager.ViewPrevHZBs.Num() >= (int32)BinIndex)
{
ViewPrevHZB = GraphBuilder.RegisterExternalTexture(InstanceCullingManager.ViewPrevHZBs[BinIndex - 1]);
}
bool bOcclusionCullInstances = ViewPrevHZB != nullptr && FInstanceCullingContext::IsOcclusionCullingEnabled();
if (bOcclusionCullInstances)
{
PassParameters[BinIndex]->HZBTexture = ViewPrevHZB;
PassParameters[BinIndex]->HZBSize = ViewPrevHZB->Desc.Extent;
PassParameters[BinIndex]->HZBSampler = TStaticSamplerState< SF_Point, AM_Clamp, AM_Clamp, AM_Clamp >::GetRHI();
}
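// Select the shader permutation for this bin: the UnCulled bin uses single-instance mode and skips per-instance
// culling, and occlusion culling is only enabled when an HZB is available for the bin.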
FBuildInstanceIdBufferAndCommandsFromPrimitiveIdsCs::FPermutationDomain PermutationVector;
PermutationVector.Set<FBuildInstanceIdBufferAndCommandsFromPrimitiveIdsCs::FBatchedDim>(true);
PermutationVector.Set<FBuildInstanceIdBufferAndCommandsFromPrimitiveIdsCs::FSingleInstanceModeDim>(CurrentBatchProcessingMode == EBatchProcessingMode::UnCulled);
PermutationVector.Set<FBuildInstanceIdBufferAndCommandsFromPrimitiveIdsCs::FCullInstancesDim>(bCullInstances && CurrentBatchProcessingMode != EBatchProcessingMode::UnCulled);
PermutationVector.Set<FBuildInstanceIdBufferAndCommandsFromPrimitiveIdsCs::FAllowWPODisableDim>(bAllowWPODisable);
PermutationVector.Set<FBuildInstanceIdBufferAndCommandsFromPrimitiveIdsCs::FOcclusionCullInstancesDim>(bOcclusionCullInstances);
PermutationVector.Set<FBuildInstanceIdBufferAndCommandsFromPrimitiveIdsCs::FInstanceCompactionDim>(bEnableInstanceCompaction);
auto ComputeShader = ShaderMap->GetShader<FBuildInstanceIdBufferAndCommandsFromPrimitiveIdsCs>(PermutationVector);
FComputeShaderUtils::AddPass(
GraphBuilder,
RDG_EVENT_NAME("CullInstances(%s). Bin %d", BatchProcessingModeStr[uint32(CurrentBatchProcessingMode)], BinIndex),
ComputeShader,
PassParameters[BinIndex],
INST_CULL_CALLBACK_BIN_INDEX(DeferredContext->LoadBalancers[BinIndex].GetWrappedCsGroupCount()));
}
// TODO: Come up with a way to cull these passes when no compaction is needed. A group count of (0, 0, 0) prevents the pass lambdas from executing,
// but the resource transitions currently cannot be culled.
if (bEnableInstanceCompaction)
{
FRDGBufferRef BlockDestInstanceOffsets = CreateStructuredBuffer(
GraphBuilder,
TEXT("InstanceCulling.Compaction.BlockDestInstanceOffsets"),
sizeof(uint32),
INST_CULL_CALLBACK(FMath::Max<uint32>(DeferredContext->TotalCompactionBlocks, 1U)),
INST_CULL_CALLBACK(nullptr),
INST_CULL_CALLBACK(0));
// Compaction phase one - prefix sum of the compaction "blocks"
{
auto PassParameters2 = GraphBuilder.AllocParameters<FCalculateCompactBlockInstanceOffsetsCs::FParameters>();
PassParameters2->DrawCommandCompactionData = DrawCommandCompactionDataSRV;
PassParameters2->BlockInstanceCounts = GraphBuilder.CreateSRV(CompactionBlockCountsBuffer);
PassParameters2->BlockDestInstanceOffsetsOut = GraphBuilder.CreateUAV(BlockDestInstanceOffsets);
PassParameters2->DrawIndirectArgsBufferOut = PassParametersTmp.DrawIndirectArgsBufferOut;
auto ComputeShader = ShaderMap->GetShader<FCalculateCompactBlockInstanceOffsetsCs>();
FComputeShaderUtils::AddPass(
GraphBuilder,
RDG_EVENT_NAME("Instance Compaction Phase 1"),
ComputeShader,
PassParameters2,
[DeferredContext]()
{
return FComputeShaderUtils::GetGroupCountWrapped(DeferredContext->TotalCompactionDrawCommands);
});
}
// Compaction phase two - write instances to compact final location
{
FRDGBufferRef BlockDrawCommandIndices = CreateStructuredBuffer(INST_CULL_CREATE_STRUCT_BUFF_ARGS(CompactionBlockDataIndices));
auto PassParameters2 = GraphBuilder.AllocParameters<FCompactVisibleInstancesCs::FParameters>();
PassParameters2->DrawCommandCompactionData = DrawCommandCompactionDataSRV;
PassParameters2->BlockDrawCommandIndices = GraphBuilder.CreateSRV(BlockDrawCommandIndices);
PassParameters2->InstanceIdsBufferIn = GraphBuilder.CreateSRV(CompactInstanceIdsBuffer);
PassParameters2->BlockDestInstanceOffsets = GraphBuilder.CreateSRV(BlockDestInstanceOffsets);
PassParameters2->InstanceIdsBufferOut = InstanceIdsBufferUAV;
PassParameters2->InstanceIdsBufferOutMobile = InstanceIdsBufferUAV;
auto ComputeShader = ShaderMap->GetShader<FCompactVisibleInstancesCs>();
FComputeShaderUtils::AddPass(
GraphBuilder,
RDG_EVENT_NAME("Instance Compaction Phase 2"),
ComputeShader,
PassParameters2,
[PassParameters2, DeferredContext]()
{
return FComputeShaderUtils::GetGroupCountWrapped(DeferredContext->TotalCompactionBlocks);
});
}
}
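// Publish the resulting instance ID buffer to the mesh passes: either as a BatchedPrimitive uniform-buffer view
// (platforms where GPU-Scene uses UB views) or through the global InstanceCulling uniform buffer.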
if (PlatformGPUSceneUsesUniformBufferView(ShaderPlatform))
{
FBatchedPrimitiveParameters* BatchedPrimitiveParameters = GraphBuilder.AllocParameters<FBatchedPrimitiveParameters>();
BatchedPrimitiveParameters->Data = GraphBuilder.CreateSRV(InstanceIdsBuffer);
DeferredContext->BatchedPrimitive = GraphBuilder.CreateUniformBuffer(BatchedPrimitiveParameters);
}
else
{
FInstanceCullingGlobalUniforms* UniformParameters = GraphBuilder.AllocParameters<FInstanceCullingGlobalUniforms>();
UniformParameters->InstanceIdsBuffer = GraphBuilder.CreateSRV(InstanceIdsBuffer);
UniformParameters->PageInfoBuffer = GraphBuilder.CreateSRV(InstanceIdsBuffer);
UniformParameters->BufferCapacity = 0U; // TODO: currently unused, but intended for range checks, so it would be worth populating.
DeferredContext->UniformBuffer = GraphBuilder.CreateUniformBuffer(UniformParameters);
}
#undef INST_CULL_CREATE_STRUCT_BUFF_ARGS
#undef INST_CULL_CALLBACK
#undef INST_CULL_CALLBACK_MODE
#undef INST_CULL_CREATE_STRUCT_BUFF_ARGS_MODE
#if MESH_DRAW_COMMAND_STATS
if (FMeshDrawCommandStatsManager* Instance = FMeshDrawCommandStatsManager::Get())
{
if (Instance->CollectStats())
{
DeferredContext->MeshDrawCommandStatsIndirectArgsReadbackBuffer = Instance->QueueDrawRDGIndirectArgsReadback(GraphBuilder, DeferredContext->DrawIndirectArgsBuffer);
}
}
#endif // MESH_DRAW_COMMAND_STATS
return DeferredContext;
}
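// Compute shader that resets the instance counts in the draw indirect args buffer
// (see ClearIndirectArgInstanceCountCS in BuildInstanceDrawCommands.usf).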
class FClearIndirectArgInstanceCountCs : public FGlobalShader
{
DECLARE_GLOBAL_SHADER(FClearIndirectArgInstanceCountCs);
SHADER_USE_PARAMETER_STRUCT(FClearIndirectArgInstanceCountCs, FGlobalShader)
public:
static constexpr int32 NumThreadsPerGroup = 64;
static bool ShouldCompilePermutation(const FGlobalShaderPermutationParameters& Parameters)
{
return UseGPUScene(Parameters.Platform);
}
static void ModifyCompilationEnvironment(const FGlobalShaderPermutationParameters& Parameters, FShaderCompilerEnvironment& OutEnvironment)
{
FGlobalShader::ModifyCompilationEnvironment(Parameters, OutEnvironment);
FInstanceProcessingGPULoadBalancer::SetShaderDefines(OutEnvironment);
OutEnvironment.SetDefine(TEXT("INDIRECT_ARGS_NUM_WORDS"), FInstanceCullingContext::IndirectArgsNumWords);
OutEnvironment.SetDefine(TEXT("NUM_THREADS_PER_GROUP"), NumThreadsPerGroup);
OutEnvironment.SetDefine(TEXT("COMPACTION_BLOCK_NUM_INSTANCES"), FCompactVisibleInstancesBaseCs::CompactionBlockNumInstances);
}
BEGIN_SHADER_PARAMETER_STRUCT(FParameters, )
SHADER_PARAMETER_RDG_BUFFER_UAV(RWBuffer<uint>, DrawIndirectArgsBufferOut)
SHADER_PARAMETER(uint32, NumIndirectArgs)
END_SHADER_PARAMETER_STRUCT()
};
IMPLEMENT_GLOBAL_SHADER(FClearIndirectArgInstanceCountCs, "/Engine/Private/InstanceCulling/BuildInstanceDrawCommands.usf", "ClearIndirectArgInstanceCountCS", SF_Compute);
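// Adds a pass that clears the instance counts of DrawIndirectArgsBuffer. If NumIndirectArgsCallback is provided,
// the number of indirect args is queried at pass execution time; otherwise it is derived from the buffer size.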
void FInstanceCullingContext::AddClearIndirectArgInstanceCountPass(FRDGBuilder& GraphBuilder, FGlobalShaderMap* ShaderMap, FRDGBufferRef DrawIndirectArgsBuffer, TFunction<int32()> NumIndirectArgsCallback)
{
FClearIndirectArgInstanceCountCs::FParameters* PassParameters = GraphBuilder.AllocParameters<FClearIndirectArgInstanceCountCs::FParameters>();
// Upload data etc
PassParameters->DrawIndirectArgsBufferOut = GraphBuilder.CreateUAV(DrawIndirectArgsBuffer, PF_R32_UINT);
PassParameters->NumIndirectArgs = DrawIndirectArgsBuffer->Desc.NumElements / FInstanceCullingContext::IndirectArgsNumWords;
auto ComputeShader = ShaderMap->GetShader<FClearIndirectArgInstanceCountCs>();
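// With a callback, a custom pass is recorded so NumIndirectArgs and the group count can be resolved when the
// pass executes rather than when it is set up.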
if (NumIndirectArgsCallback)
{
const FShaderParametersMetadata* ParametersMetadata = FClearIndirectArgInstanceCountCs::FParameters::FTypeInfo::GetStructMetadata();
ClearUnusedGraphResources(ComputeShader, ParametersMetadata, PassParameters);
GraphBuilder.AddPass(
RDG_EVENT_NAME("ClearIndirectArgInstanceCount"),
ParametersMetadata,
PassParameters,
ERDGPassFlags::Compute,
[ParametersMetadata, PassParameters, ComputeShader, NumIndirectArgsCallback = MoveTemp(NumIndirectArgsCallback)](FRDGAsyncTask, FRHIComputeCommandList& RHICmdList)
{
int32 NumIndirectArgs = NumIndirectArgsCallback();
PassParameters->NumIndirectArgs = NumIndirectArgs;
const FIntVector GroupCount = FComputeShaderUtils::GetGroupCountWrapped(NumIndirectArgs, FClearIndirectArgInstanceCountCs::NumThreadsPerGroup);
if (GroupCount.X > 0 && GroupCount.Y > 0 && GroupCount.Z > 0)
{
FComputeShaderUtils::ValidateGroupCount(GroupCount);
FComputeShaderUtils::Dispatch(RHICmdList, ComputeShader, ParametersMetadata, *PassParameters, GroupCount);
}
});
}
else
{
FComputeShaderUtils::AddPass(
GraphBuilder,
RDG_EVENT_NAME("ClearIndirectArgInstanceCount"),
ComputeShader,
PassParameters,
FComputeShaderUtils::GetGroupCountWrapped(PassParameters->NumIndirectArgs, FClearIndirectArgInstanceCountCs::NumThreadsPerGroup)
);
}
}
/**
 * Allocates indirect arg slots for all meshes that use instancing,
 * adds commands that populate the indirect calls and the index & ID buffers, and
 * collapses all commands that share the same state bucket ID.
* NOTE: VisibleMeshDrawCommandsInOut can only become shorter.
*/
void FInstanceCullingContext::SetupDrawCommands(
FMeshCommandOneFrameArray& VisibleMeshDrawCommandsInOut,
bool bInCompactIdenticalCommands,
const FScene *Scene,
// Stats
int32& MaxInstances,
int32& VisibleMeshDrawCommandsNum,
int32& NewPassVisibleMeshDrawCommandsNum)
{
QUICK_SCOPE_CYCLE_COUNTER(STAT_BuildMeshDrawCommandPrimitiveIdBuffer);
FVisibleMeshDrawCommand* RESTRICT PassVisibleMeshDrawCommands = VisibleMeshDrawCommandsInOut.GetData();
// TODO: maybe have VSM set this to force the processing down a single batch (to simplify).
const bool bForceGenericProcessing = false;
const bool bMultiView = ViewIds.Num() > 1 && !(ViewIds.Num() == 2 && InstanceCullingMode == EInstanceCullingMode::Stereo);
if (bMultiView || bForceGenericProcessing)
{
// multi-view defaults to culled path to make cube-maps more efficient
SingleInstanceProcessingMode = EBatchProcessingMode::Generic;
}
QUICK_SCOPE_CYCLE_COUNTER(STAT_DynamicInstancingOfVisibleMeshDrawCommands);
ResetCommands(VisibleMeshDrawCommandsInOut.Num());
for (auto& LoadBalancer : LoadBalancers)
{
if (LoadBalancer == nullptr)
{
LoadBalancer = new FInstanceProcessingGPULoadBalancer;
}
#if DO_CHECK
if (InstanceCullingMode == EInstanceCullingMode::Stereo)
{
check(ViewIds.Num() == 2);
}
else
{
check(LoadBalancer->IsEmpty());
}
#endif
}
int32 CurrentStateBucketId = -1;
EMeshDrawCommandCullingPayloadFlags CurrentCullingPayloadFlags = EMeshDrawCommandCullingPayloadFlags::Default;
MaxInstances = 1;
// Only used to supply stats
uint32 CurrentAutoInstanceCount = 1;
// Scan through and compact away all commands with consecutive identical state bucket IDs, and record primitive IDs in the GPU-Scene culling commands
const int32 NumDrawCommandsIn = VisibleMeshDrawCommandsInOut.Num();
int32 NumDrawCommandsOut = 0;
uint32 CurrentIndirectArgsOffset = 0U;
const int32 NumViews = ViewIds.Num();
const bool bAlwaysUseIndirectDraws = (SingleInstanceProcessingMode != EBatchProcessingMode::UnCulled);
const bool bOrderPreservationEnabled = IsInstanceOrderPreservationEnabled();
const uint32 MaxGenericBatchSize = bUsesUniformBufferView ? PLATFORM_MAX_UNIFORM_BUFFER_RANGE / UniformViewInstanceStride[0] : MAX_uint32;
const uint32 MaxPrimitiveBatchSize = bUsesUniformBufferView ? PLATFORM_MAX_UNIFORM_BUFFER_RANGE / UniformViewInstanceStride[1] : MAX_uint32;
// Allocate conservatively for all commands, may not use all.
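// Walk the visible draw commands in order; consecutive commands sharing a state bucket ID (and culling payload flags)
// are merged into a single draw, and each command that supports GPU-Scene instancing records its instance ranges for the GPU culling pass.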
for (int32 DrawCommandIndex = 0; DrawCommandIndex < NumDrawCommandsIn; ++DrawCommandIndex)
{
const FVisibleMeshDrawCommand& RESTRICT VisibleMeshDrawCommand = PassVisibleMeshDrawCommands[DrawCommandIndex];
const FMeshDrawCommand* RESTRICT MeshDrawCommand = VisibleMeshDrawCommand.MeshDrawCommand;
const bool bFetchInstanceCountFromScene = EnumHasAnyFlags(VisibleMeshDrawCommand.Flags, EFVisibleMeshDrawCommandFlags::FetchInstanceCountFromScene);
check(!bFetchInstanceCountFromScene || Scene != nullptr);
const bool bSupportsGPUSceneInstancing = EnumHasAnyFlags(VisibleMeshDrawCommand.Flags, EFVisibleMeshDrawCommandFlags::HasPrimitiveIdStreamIndex);
const bool bMaterialUsesWorldPositionOffset = EnumHasAnyFlags(VisibleMeshDrawCommand.Flags, EFVisibleMeshDrawCommandFlags::MaterialUsesWorldPositionOffset);
const bool bMaterialAlwaysEvaluatesWorldPositionOffset = EnumHasAnyFlags(VisibleMeshDrawCommand.Flags, EFVisibleMeshDrawCommandFlags::MaterialAlwaysEvaluatesWorldPositionOffset);
const bool bForceInstanceCulling = EnumHasAnyFlags(VisibleMeshDrawCommand.Flags, EFVisibleMeshDrawCommandFlags::ForceInstanceCulling) || (GOcclusionForceInstanceCulling != 0);
const bool bPreserveInstanceOrder = bOrderPreservationEnabled && EnumHasAnyFlags(VisibleMeshDrawCommand.Flags, EFVisibleMeshDrawCommandFlags::PreserveInstanceOrder);
const bool bUseIndirectDraw = bFetchInstanceCountFromScene || bAlwaysUseIndirectDraws || bForceInstanceCulling || (VisibleMeshDrawCommand.NumRuns > 0 || MeshDrawCommand->NumInstances > 1);
// The UniformBufferView path and instance order preservation do not currently support merging ISM draws
const bool bCompactIdenticalCommands = bInCompactIdenticalCommands && !(bUseIndirectDraw && bPreserveInstanceOrder)
&& (bUsesUniformBufferView ? (CurrentAutoInstanceCount < MaxPrimitiveBatchSize && !bUseIndirectDraw) : true);
if (bCompactIdenticalCommands && CurrentStateBucketId != -1 && VisibleMeshDrawCommand.StateBucketId == CurrentStateBucketId && VisibleMeshDrawCommand.CullingPayloadFlags == CurrentCullingPayloadFlags)
{
// Drop since previous covers for this
CurrentAutoInstanceCount++;
MaxInstances = FMath::Max<int32>(CurrentAutoInstanceCount, MaxInstances);
FMeshDrawCommandInfo& RESTRICT DrawCmd = MeshDrawCommandInfos.Last();
if (DrawCmd.bUseIndirect == 0)
{
DrawCmd.IndirectArgsOffsetOrNumInstances += 1;
}
// Nothing needs to be done when indirect rendering is used on the draw command because the current cached value CurrentIndirectArgsOffset won't change
// and these instances will be added to the same previous draw command in AddInstancesToDrawCommand below
}
else
{
// Reset auto-instance count (only needed for logging)
CurrentAutoInstanceCount = 1;
// kept 1:1 with the retained (not compacted) mesh draw commands, implicitly clears num instances
FMeshDrawCommandInfo& RESTRICT DrawCmd = MeshDrawCommandInfos.AddZeroed_GetRef();
DrawCmd.NumBatches = 1;
DrawCmd.BatchDataStride = PLATFORM_MAX_UNIFORM_BUFFER_RANGE;
// TODO: an indirect arg slot is redundantly created for every draw command (even though those that don't support GPU-Scene don't need one);
// the unsupported ones are skipped in FMeshDrawCommand::SubmitDrawBegin/End.
// In the future, pipe draw command info through to submit so that they may be skipped.
//if (bSupportsGPUSceneInstancing)
{
DrawCmd.bUseIndirect = bUseIndirectDraw;
CurrentIndirectArgsOffset = AllocateIndirectArgs(MeshDrawCommand);
DrawCommandDescs.Add(
PackDrawCommandDesc(
bMaterialUsesWorldPositionOffset,
bMaterialAlwaysEvaluatesWorldPositionOffset,
VisibleMeshDrawCommand.CullingPayload,
VisibleMeshDrawCommand.CullingPayloadFlags
)
);
if (bUseIndirectDraw)
{
DrawCmd.IndirectArgsOffsetOrNumInstances = CurrentIndirectArgsOffset * FInstanceCullingContext::IndirectArgsNumWords * sizeof(uint32);
}
else
{
DrawCmd.IndirectArgsOffsetOrNumInstances = 1;
}
const uint32 CurrentNumDraws = InstanceIdOffsets.Num();
// Draw-call-specific offset into the per-instance buffer
DrawCmd.InstanceDataByteOffset = StepInstanceDataOffsetBytes(CurrentNumDraws);
InstanceIdOffsets.Emplace(GetInstanceIdNumElements());
}
// Record the last bucket ID (may be -1)
CurrentStateBucketId = VisibleMeshDrawCommand.StateBucketId;
CurrentCullingPayloadFlags = VisibleMeshDrawCommand.CullingPayloadFlags;
// If we have dropped any we need to move up to maintain 1:1
if (DrawCommandIndex > NumDrawCommandsOut)
{
PassVisibleMeshDrawCommands[NumDrawCommandsOut] = PassVisibleMeshDrawCommands[DrawCommandIndex];
}
NumDrawCommandsOut++;
}
if (bSupportsGPUSceneInstancing)
{
EInstanceFlags InstanceFlags = EInstanceFlags::None;
if (VisibleMeshDrawCommand.PrimitiveIdInfo.bIsDynamicPrimitive)
{
EnumAddFlags(InstanceFlags, EInstanceFlags::DynamicInstanceDataOffset);
}
if (bForceInstanceCulling)
{
EnumAddFlags(InstanceFlags, EInstanceFlags::ForceInstanceCulling);
}
if (bPreserveInstanceOrder)
{
EnumAddFlags(InstanceFlags, EInstanceFlags::PreserveInstanceOrder);
}
const uint32 InstanceOffset = TotalInstances;
// Append a 'culling command' targeting the current slot.
// This causes all instances belonging to the primitive to be added to the command if they are visible etc. (GPU-Scene knows all, sees all)
if (VisibleMeshDrawCommand.RunArray)
{
AddInstanceRunsToDrawCommand(CurrentIndirectArgsOffset, VisibleMeshDrawCommand.PrimitiveIdInfo.InstanceSceneDataOffset, VisibleMeshDrawCommand.RunArray, VisibleMeshDrawCommand.NumRuns, InstanceFlags, MaxGenericBatchSize);
}
else if (bFetchInstanceCountFromScene)
{
check(Scene != nullptr);
check(!VisibleMeshDrawCommand.PrimitiveIdInfo.bIsDynamicPrimitive);
uint32 NumInstances = uint32(Scene->Primitives[VisibleMeshDrawCommand.PrimitiveIdInfo.ScenePrimitiveId]->GetNumInstanceSceneDataEntries());
if (NumInstances > 0u)
{
AddInstancesToDrawCommand(CurrentIndirectArgsOffset, VisibleMeshDrawCommand.PrimitiveIdInfo.InstanceSceneDataOffset, 0, NumInstances, InstanceFlags, MaxGenericBatchSize);
}
}
else
{
if (Scene != nullptr)
{
// Make sure the cached MDC matches what is stored in the scene
checkSlow(VisibleMeshDrawCommand.PrimitiveIdInfo.bIsDynamicPrimitive
|| !bSupportsGPUSceneInstancing
|| VisibleMeshDrawCommand.MeshDrawCommand->NumInstances == uint32(Scene->Primitives[VisibleMeshDrawCommand.PrimitiveIdInfo.ScenePrimitiveId]->GetNumInstanceSceneDataEntries()));
// This condition is used to skip re-caching MDCs and thus should not be set on anything that doesn't take the above path
checkSlow(!Scene->Primitives[VisibleMeshDrawCommand.PrimitiveIdInfo.ScenePrimitiveId]->Proxy->DoesMeshBatchesUseSceneInstanceCount());
}
AddInstancesToDrawCommand(CurrentIndirectArgsOffset, VisibleMeshDrawCommand.PrimitiveIdInfo.InstanceSceneDataOffset, 0, VisibleMeshDrawCommand.MeshDrawCommand->NumInstances, InstanceFlags, MaxGenericBatchSize);
}
const uint32 NumInstancesAdded = TotalInstances - InstanceOffset;
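// For order-preserving draws, record per-draw compaction data plus one entry per block of
// CompactionBlockNumInstances instances so the compaction passes can write visible instances back in their original order.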
if (bPreserveInstanceOrder && NumInstancesAdded > 0)
{
const uint32 CompactionDataIndex = uint32(DrawCommandCompactionData.Num());
DrawCommandCompactionData.Emplace(
NumInstancesAdded,
NumViews,
uint32(CompactionBlockDataIndices.Num()),
CurrentIndirectArgsOffset,
NumCompactionInstances,
InstanceIdOffsets.Last());
const int32 FirstBlock = CompactionBlockDataIndices.Num();
const uint32 NumCompactionBlocksThisCommand = FMath::DivideAndRoundUp(NumInstancesAdded, CompactionBlockNumInstances);
CompactionBlockDataIndices.AddUninitialized(NumCompactionBlocksThisCommand);
for (int32 Block = FirstBlock; Block < CompactionBlockDataIndices.Num(); ++Block)
{
CompactionBlockDataIndices[Block] = CompactionDataIndex;
}
NumCompactionInstances += NumInstancesAdded * NumViews;
}
}
}
check(bInCompactIdenticalCommands || NumDrawCommandsIn == NumDrawCommandsOut);
checkf(NumDrawCommandsOut == MeshDrawCommandInfos.Num(), TEXT("There must be a 1:1 mapping between MeshDrawCommandInfos and mesh draw commands, as this assumption is made in SubmitDrawCommands."));
// Set up instancing stats for logging.
VisibleMeshDrawCommandsNum = VisibleMeshDrawCommandsInOut.Num();
NewPassVisibleMeshDrawCommandsNum = NumDrawCommandsOut;
// Resize array post-compaction of dynamic instances
VisibleMeshDrawCommandsInOut.SetNum(NumDrawCommandsOut, EAllowShrinking::No);
}
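// Submits the range [StartIndex, StartIndex + NumMeshDrawCommands) of visible mesh draw commands. When GPU instance
// culling is enabled, this patches in the indirect args and instance data offsets recorded per command in
// MeshDrawCommandInfos; otherwise it falls back to SubmitMeshDrawCommandsRange.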
void FInstanceCullingContext::SubmitDrawCommands(
const FMeshCommandOneFrameArray& VisibleMeshDrawCommands,
const FGraphicsMinimalPipelineStateSet& GraphicsMinimalPipelineStateSet,
const FMeshDrawCommandOverrideArgs& OverrideArgs,
int32 StartIndex,
int32 NumMeshDrawCommands,
uint32 InInstanceFactor,
FRHICommandList& RHICmdList) const
{
if (VisibleMeshDrawCommands.Num() == 0)
{
// FIXME: looks like parallel rendering can spawn empty FDrawVisibleMeshCommandsAnyThreadTask
return;
}
if (IsEnabled())
{
check(MeshDrawCommandInfos.Num() >= (StartIndex + NumMeshDrawCommands));
FMeshDrawCommandSceneArgs SceneArgs;
SceneArgs.PrimitiveIdsBuffer = OverrideArgs.InstanceBuffer;
SceneArgs.RootConstants = OverrideArgs.RootConstants;
if (IsUniformBufferStaticSlotValid(InstanceCullingStaticSlot))
{
if (bUsesUniformBufferView)
{
SceneArgs.BatchedPrimitiveSlot = InstanceCullingStaticSlot;
}
// Only does anything when batching is disabled and each mesh pass has a unique InstanceCulling/BatchedPrimitive UB;
// otherwise static UBs are set once when the render pass starts.
RHICmdList.SetStaticUniformBuffer(InstanceCullingStaticSlot, OverrideArgs.InstanceCullingStaticUB);
}
FMeshDrawCommandStateCache StateCache;
INC_DWORD_STAT_BY(STAT_MeshDrawCalls, NumMeshDrawCommands);
for (int32 DrawCommandIndex = StartIndex; DrawCommandIndex < StartIndex + NumMeshDrawCommands; DrawCommandIndex++)
{
//SCOPED_CONDITIONAL_DRAW_EVENTF(RHICmdList, MeshEvent, GEmitMeshDrawEvent != 0, TEXT("Mesh Draw"));
const FVisibleMeshDrawCommand& VisibleMeshDrawCommand = VisibleMeshDrawCommands[DrawCommandIndex];
const FMeshDrawCommandInfo& DrawCommandInfo = MeshDrawCommandInfos[DrawCommandIndex];
uint32 InstanceFactor = InInstanceFactor;
SceneArgs.IndirectArgsByteOffset = 0u;
SceneArgs.IndirectArgsBuffer = nullptr;
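// Indirect draws read their args (including the visible instance count) from the GPU-written indirect args buffer;
// non-indirect draws bake the instance count into the instance factor instead.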
if (DrawCommandInfo.bUseIndirect)
{
SceneArgs.IndirectArgsByteOffset = OverrideArgs.IndirectArgsByteOffset + DrawCommandInfo.IndirectArgsOffsetOrNumInstances;
SceneArgs.IndirectArgsBuffer = OverrideArgs.IndirectArgsBuffer;
}
else
{
// TODO: need a better way to override number of instances
InstanceFactor = InInstanceFactor * DrawCommandInfo.IndirectArgsOffsetOrNumInstances;
}
SceneArgs.PrimitiveIdOffset = OverrideArgs.InstanceDataByteOffset + DrawCommandInfo.InstanceDataByteOffset;
#if WANTS_DRAW_MESH_EVENTS
RHI_BREADCRUMB_EVENT_CONDITIONAL_F(RHICmdList, GShowMaterialDrawEvents != 0, "MaterialDrawEvent", "%s %s (%u instances)"
, VisibleMeshDrawCommand.MeshDrawCommand->GetDebugData().MaterialRenderProxy->GetMaterialName()
, VisibleMeshDrawCommand.MeshDrawCommand->GetDebugData().ResourceName
, VisibleMeshDrawCommand.MeshDrawCommand->NumInstances * InstanceFactor
);
#endif
const bool bAllowSkipDrawCommand = true;
if (FMeshDrawCommand::SubmitDrawBegin(*VisibleMeshDrawCommand.MeshDrawCommand, GraphicsMinimalPipelineStateSet, SceneArgs, InstanceFactor, RHICmdList, StateCache, bAllowSkipDrawCommand))
{
FMeshDrawCommand::SubmitDrawEnd(*VisibleMeshDrawCommand.MeshDrawCommand, SceneArgs, InstanceFactor, RHICmdList);
// If the MDC was split into more than one batch, submit the remaining batches without changing state
for (uint32 BatchIdx = 1; BatchIdx < DrawCommandInfo.NumBatches; ++BatchIdx)
{
SceneArgs.PrimitiveIdOffset += DrawCommandInfo.BatchDataStride;
SceneArgs.IndirectArgsByteOffset += sizeof(FRHIDrawIndexedIndirectParameters);
FMeshDrawCommand::SubmitDrawEnd(*VisibleMeshDrawCommand.MeshDrawCommand, SceneArgs, InstanceFactor, RHICmdList);
}
}
}
}
else
{
FMeshDrawCommandSceneArgs SceneArgs;
SubmitMeshDrawCommandsRange(VisibleMeshDrawCommands, GraphicsMinimalPipelineStateSet, SceneArgs, 0, false, StartIndex, NumMeshDrawCommands, InInstanceFactor, RHICmdList);
}
}