// Files
// UnrealEngine/Engine/Source/Runtime/Renderer/Public/InstanceCulling/InstanceCullingContext.h
// 2025-05-18 13:04:45 +08:00
//
// 367 lines
// 14 KiB
// C++

// Copyright Epic Games, Inc. All Rights Reserved.
#pragma once
#include "CoreMinimal.h"
#include "MeshPassProcessor.h"
#include "RHI.h"
#include "RenderGraphResources.h"
struct FInstanceCullingResult;
class FGPUScene;
class FInstanceCullingManager;
class FInstanceCullingDrawParams;
class FScene;
class FGPUScenePrimitiveCollector;
class FInstanceCullingDeferredContext;
struct FMeshDrawCommandPassStats;
DECLARE_UNIFORM_BUFFER_STRUCT(FSceneUniformParameters, RENDERER_API)
// Global shader parameter struct bound as the "InstanceCulling" uniform buffer of FInstanceCullingDrawParams.
// The member order below is the struct layout seen by shaders; do not reorder.
// NOTE(review): the per-field meanings are inferred from the names only - confirm against the shader code.
BEGIN_GLOBAL_SHADER_PARAMETER_STRUCT(FInstanceCullingGlobalUniforms, RENDERER_API)
// RDG SRV over a StructuredBuffer<uint>; presumably the instance ids written by the culling passes.
SHADER_PARAMETER_RDG_BUFFER_SRV(StructuredBuffer<uint>, InstanceIdsBuffer)
// RDG SRV over a StructuredBuffer<uint>; presumably per-page bookkeeping for the instance id buffer.
SHADER_PARAMETER_RDG_BUFFER_SRV(StructuredBuffer<uint>, PageInfoBuffer)
// Capacity value passed alongside the buffers (units not visible from this header - TODO confirm).
SHADER_PARAMETER(uint32, BufferCapacity)
END_GLOBAL_SHADER_PARAMETER_STRUCT()
// Global shader parameter struct bound as the "BatchedPrimitive" uniform buffer of FInstanceCullingDrawParams.
// It carries a single float4 buffer SRV.
BEGIN_GLOBAL_SHADER_PARAMETER_STRUCT(FBatchedPrimitiveParameters,RENDERER_API)
// RDG SRV over a Buffer<float4>; presumably the packed batched primitive data - TODO confirm the layout in the shader.
SHADER_PARAMETER_RDG_BUFFER_SRV(Buffer<float4>, Data)
END_GLOBAL_SHADER_PARAMETER_STRUCT()
// Shader parameter struct holding the resources and offsets a pass needs to draw with instance culling results.
// FInstanceCullingContext::BuildRenderingCommands takes a pointer to one of these, and
// GetMeshDrawCommandOverrideArgs reads it to produce the override args used for submission.
BEGIN_SHADER_PARAMETER_STRUCT(FInstanceCullingDrawParams, )
// Buffer accessed with ERHIAccess::IndirectArgs, i.e. consumed as indirect draw arguments.
RDG_BUFFER_ACCESS(DrawIndirectArgsBuffer, ERHIAccess::IndirectArgs)
// Buffer accessed with ERHIAccess::VertexOrIndexBuffer, i.e. bound as a vertex/index stream.
RDG_BUFFER_ACCESS(InstanceIdOffsetBuffer, ERHIAccess::VertexOrIndexBuffer)
SHADER_PARAMETER(uint32, InstanceDataByteOffset) // offset into per-instance buffer
SHADER_PARAMETER(uint32, IndirectArgsByteOffset) // offset into indirect args buffer
// Uniform buffer of type FInstanceCullingGlobalUniforms.
SHADER_PARAMETER_RDG_UNIFORM_BUFFER(FInstanceCullingGlobalUniforms, InstanceCulling)
// Uniform buffer of type FSceneUniformParameters (declared via DECLARE_UNIFORM_BUFFER_STRUCT in this header).
SHADER_PARAMETER_RDG_UNIFORM_BUFFER(FSceneUniformParameters, Scene)
// Uniform buffer of type FBatchedPrimitiveParameters.
SHADER_PARAMETER_RDG_UNIFORM_BUFFER(FBatchedPrimitiveParameters, BatchedPrimitive)
END_SHADER_PARAMETER_STRUCT()
// Produces the FMeshDrawCommandOverrideArgs corresponding to the given instance culling draw parameters.
// The result has the type FInstanceCullingContext::SubmitDrawCommands expects for its OverrideArgs argument.
// Defined out of line; which fields are copied is not visible from this header.
FMeshDrawCommandOverrideArgs GetMeshDrawCommandOverrideArgs(const FInstanceCullingDrawParams& InstanceCullingDrawParams);
/**
 * Culling mode selected when constructing an FInstanceCullingContext (defaults to Normal there).
 * NOTE(review): the exact difference in view handling between the modes lives in the .cpp/shaders.
 */
enum class EInstanceCullingMode
{
	/** Default mode. */
	Normal = 0,
	/** Stereo variant - presumably used for stereo (two-eye) view setups; confirm against the implementation. */
	Stereo = 1,
};
/**
 * Bit flags passed to the FInstanceCullingContext constructor (stored in its Flags member).
 * Combine with the bitwise operators enabled by ENUM_CLASS_FLAGS below.
 */
enum class EInstanceCullingFlags : uint8
{
	/** No flags set. */
	None                        = 0x00,
	/** Bit 0. Presumably opts the context out of preserving instance order - confirm against the .cpp. */
	NoInstanceOrderPreservation = 0x01,
};
ENUM_CLASS_FLAGS(EInstanceCullingFlags)
/**
 * The specialized command processing variants.
 * The enumerators other than Num are contiguous from zero; Num is their count and is used to size
 * per-mode arrays (see FInstanceCullingContext::LoadBalancerArray).
 */
enum class EBatchProcessingMode : uint32
{
	/** Generic processing mode, handles all the features. */
	Generic = 0,

	/**
	 * General work batches that need load balancing, either instance runs or primitive id ranges
	 * (auto instanced), but with culling disabled.
	 * May have multi-view (but probably not used for that path).
	 */
	UnCulled = 1,

	/** Number of processing modes; must stay last. */
	Num,
};
class FInstanceProcessingGPULoadBalancer;
/**
*/
class FInstanceCullingContext
{
public:
enum class EInstanceFlags : uint8
{
None = 0,
DynamicInstanceDataOffset = 1 << 0,
ForceInstanceCulling = 1 << 1,
PreserveInstanceOrder = 1 << 2
};
static constexpr uint32 UniformViewInstanceStride[2] =
{ // One for each BatchProcessingMode
BATCHED_INSTANCE_DATA_STRIDE,
BATCHED_PRIMITIVE_DATA_STRIDE
};
static constexpr uint32 IndirectArgsNumWords = 5;
static constexpr uint32 CompactionBlockNumInstances = 64;
RENDERER_API static uint32 GetInstanceIdBufferStride(EShaderPlatform ShaderPlatform);
RENDERER_API static FUniformBufferStaticSlot GetStaticUniformBufferSlot(EShaderPlatform ShaderPlatform);
FInstanceCullingContext() {}
UE_DEPRECATED(5.4, "Use constructor which provides pass name as first argument")
FInstanceCullingContext(
EShaderPlatform ShaderPlatform,
FInstanceCullingManager* InInstanceCullingManager,
TArrayView<const int32> InViewIds,
const TRefCountPtr<IPooledRenderTarget>& InPrevHZB,
EInstanceCullingMode InInstanceCullingMode = EInstanceCullingMode::Normal,
EInstanceCullingFlags InFlags = EInstanceCullingFlags::None,
EBatchProcessingMode InSingleInstanceProcessingMode = EBatchProcessingMode::UnCulled) :
FInstanceCullingContext(TEXT("Unknown"), ShaderPlatform, InInstanceCullingManager, InViewIds, InPrevHZB, InInstanceCullingMode, InFlags, InSingleInstanceProcessingMode)
{
}
/**
* Create an instance culling context to process draw commands that can be culled using GPU-Scene.
* @param InPrevHZB if non-null enables HZB-occlusion culling for the context (if r.InstanceCulling.OcclusionCull is enabled),
* NOTE: only one PrevHZB target is allowed accross all passes currently, so either must be atlased or otherwise the same.
*/
RENDERER_API FInstanceCullingContext(
const TCHAR* PassName,
EShaderPlatform ShaderPlatform,
FInstanceCullingManager* InInstanceCullingManager,
TArrayView<const int32> InViewIds,
const TRefCountPtr<IPooledRenderTarget>& InPrevHZB,
EInstanceCullingMode InInstanceCullingMode = EInstanceCullingMode::Normal,
EInstanceCullingFlags InFlags = EInstanceCullingFlags::None,
EBatchProcessingMode InSingleInstanceProcessingMode = EBatchProcessingMode::UnCulled
);
RENDERER_API ~FInstanceCullingContext();
static RENDERER_API const TRDGUniformBufferRef<FInstanceCullingGlobalUniforms> CreateDummyInstanceCullingUniformBuffer(FRDGBuilder& GraphBuilder);
static bool IsGPUCullingEnabled();
static bool IsOcclusionCullingEnabled();
/**
* Call to empty out the culling commands & other culling data.
*/
void ResetCommands(int32 MaxNumCommands);
bool IsEnabled() const { return bIsEnabled; }
bool IsInstanceOrderPreservationEnabled() const;
/**
* Add command to cull a range of instances for the given mesh draw command index.
* Multiple commands may add to the same slot, ordering is not preserved.
*/
void AddInstancesToDrawCommand(uint32 IndirectArgsOffset, int32 InstanceDataOffset, uint32 RunOffset, uint32 NumInstances, EInstanceFlags InstanceFlags);
void AddInstancesToDrawCommand(uint32 IndirectArgsOffset, int32 InstanceDataOffset, uint32 RunOffset, uint32 NumInstances, EInstanceFlags InstanceFlags, uint32 MaxBatchSize);
/**
* Command that is executed in the per-view, post-cull pass to gather up the instances belonging to this primitive.
* Multiple commands may add to the same slot, ordering is not preserved.
*/
void AddInstanceRunsToDrawCommand(uint32 IndirectArgsOffset, int32 InstanceDataOffset, const uint32* Runs, uint32 NumRuns, EInstanceFlags InstanceFlags, uint32 MaxBatchSize);
/*
* Allocate space for indirect draw call argumens for a given MeshDrawCommand and initialize with draw command data.
* TODO: support cached pre-allocated commands.
*/
uint32 AllocateIndirectArgs(const FMeshDrawCommand* MeshDrawCommand);
/*
* Computes instance data byte offset for a next draw command taking into account platform specifics
*/
uint32 StepInstanceDataOffsetBytes(uint32 NumStepDraws) const;
uint32 GetInstanceIdNumElements() const;
using SyncPrerequisitesFuncType = TFunction<void (FInstanceCullingContext &InstanceCullingContext)>;
/**
* Set up the context to track an async setup process, or some deferred setup work.
* The supplied function should do two things, apart from any other processing needed.
* 1. wait for the async setup task
* 2. Call SetDynamicPrimitiveInstanceOffsets (unless that is achieved somehow else).
*/
void BeginAsyncSetup(SyncPrerequisitesFuncType&& InSyncPrerequisitesFunc);
/**
* Calls the sync function passed tp BeginAsyncSetup to ensure the setup processing is completed.
*/
void WaitForSetupTask();
/**
*/
void SetDynamicPrimitiveInstanceOffsets(int32 InDynamicInstanceIdOffset, int32 InDynamicInstanceIdNum);
/**
* This version is never deferred, nor async, calling BeginAsyncSetup before this is an error.
*/
void BuildRenderingCommands(
FRDGBuilder& GraphBuilder,
const FGPUScene& GPUScene,
int32 InDynamicInstanceIdOffset,
int32 InDynamicInstanceIdNum,
FInstanceCullingResult& Results);
/**
* This BuildRenderingCommands operation may be deferred and merged into a global pass when possible.
* Note: InstanceCullingDrawParams is captured by the deferred culling passes and must therefore have a RDG-lifetime.
* If BeginAsyncSetup has been called prior to this, the WaitForSetupTask is deferred as long as possible.
* If BeginAsyncSetup was not called, then SetDynamicPrimitiveInstanceOffsets must be called before this.
*/
void BuildRenderingCommands(FRDGBuilder& GraphBuilder, const FGPUScene& GPUScene, FInstanceCullingDrawParams* InstanceCullingDrawParams);
/**
* Returns true if there are any instances in this context needing to be rendered. Must not be called before WaitForSetupTask if BeginAsyncSetup was called.
*/
bool HasCullingCommands() const;
EInstanceCullingMode GetInstanceCullingMode() const { return InstanceCullingMode; }
/**
* Add a batched BuildRenderingCommands pass. Each batch represents a BuildRenderingCommands call from a mesh pass.
* Batches are collected as we walk through the main render setup and are executed when RDG Execute or Drain is called.
* This implicitly ends the deferred context, so if Drain is used, it should be paired with a new call to BeginDeferredCulling.
*/
static FInstanceCullingDeferredContext* CreateDeferredContext(
FRDGBuilder& GraphBuilder,
const FGPUScene& GPUScene,
FInstanceCullingManager& InstanceCullingManager);
/**
* Helper function to add a pass to zero the instance count in the indirect args.
*/
static void AddClearIndirectArgInstanceCountPass(FRDGBuilder& GraphBuilder, FGlobalShaderMap* ShaderMap, FRDGBufferRef DrawIndirectArgsBuffer, TFunction<int32()> NumIndirectArgsCallback = TFunction<int32()>());
void SetupDrawCommands(
FMeshCommandOneFrameArray& VisibleMeshDrawCommandsInOut,
bool bCompactIdenticalCommands,
const FScene *Scene,
// Stats
int32& MaxInstancesOut,
int32& VisibleMeshDrawCommandsNumOut,
int32& NewPassVisibleMeshDrawCommandsNumOut);
void SubmitDrawCommands(
const FMeshCommandOneFrameArray& VisibleMeshDrawCommands,
const FGraphicsMinimalPipelineStateSet& GraphicsMinimalPipelineStateSet,
const FMeshDrawCommandOverrideArgs& OverrideArgs,
int32 StartIndex,
int32 NumMeshDrawCommands,
uint32 InstanceFactor,
FRHICommandList& RHICmdList) const;
FInstanceCullingManager* InstanceCullingManager = nullptr;
EShaderPlatform ShaderPlatform = SP_NumPlatforms;
TArray<int32, TInlineAllocator<6, SceneRenderingAllocator>> ViewIds;
TRefCountPtr<IPooledRenderTarget> PrevHZB = nullptr;
bool bIsEnabled = false;
EInstanceCullingMode InstanceCullingMode = EInstanceCullingMode::Normal;
EInstanceCullingFlags Flags = EInstanceCullingFlags::None;
uint32 TotalInstances = 0U;
int32 DynamicInstanceIdOffset = -1;
int32 DynamicInstanceIdNum = -1;
SyncPrerequisitesFuncType SyncPrerequisitesFunc;
public:
enum class EAsyncProcessingMode
{
DeferredOrAsync,
Synchronous,
};
void BuildRenderingCommandsInternal(FRDGBuilder& GraphBuilder, const FGPUScene& GPUScene, EAsyncProcessingMode AsyncProcessingMode, FInstanceCullingDrawParams* InstanceCullingDrawParams);
// Auxiliary info for each mesh draw command that needs submitting.
struct FMeshDrawCommandInfo
{
// flag to indicate if using indirect or not.
uint32 bUseIndirect : 1U;
// stores either the offset (in bytes) to the indirect args or the number of instances
uint32 IndirectArgsOffsetOrNumInstances : 31U;
// offset into per-instance buffer
uint32 InstanceDataByteOffset;
//
uint32 NumBatches : 15u;
uint32 BatchDataStride : 17u;
};
struct FPayloadData
{
uint32 bDynamicInstanceDataOffset_IndirectArgsIndex;
uint32 InstanceDataOffset;
uint32 RunInstanceOffset;
uint32 CompactionDataIndex;
FPayloadData() = default;
FPayloadData(
bool bInDynamicInstanceDataOffset,
uint32 InIndirectArgsIndex,
uint32 InInstanceDataOffset,
uint32 InRunInstanceOffset,
uint32 InCompactionDataIndex)
: bDynamicInstanceDataOffset_IndirectArgsIndex(InIndirectArgsIndex | (bInDynamicInstanceDataOffset ? (1u << 31u) : 0u))
, InstanceDataOffset(InInstanceDataOffset)
, RunInstanceOffset(InRunInstanceOffset)
, CompactionDataIndex(InCompactionDataIndex)
{
checkSlow(InIndirectArgsIndex < (1u << 31u));
}
};
struct FCompactionData
{
static const uint32 NumViewBits = 8;
uint32 NumInstances_NumViews;
uint32 BlockOffset;
uint32 IndirectArgsIndex;
uint32 SrcInstanceIdOffset;
uint32 DestInstanceIdOffset;
FCompactionData() = default;
FCompactionData(
uint32 InNumInstances,
uint32 InNumViews,
uint32 InBlockOffset,
uint32 InIndirectArgsIndex,
uint32 InSrcInstanceIdOffset,
uint32 InDestInstanceIdOffset)
: NumInstances_NumViews(InNumViews | (InNumInstances << NumViewBits))
, BlockOffset(InBlockOffset)
, IndirectArgsIndex(InIndirectArgsIndex)
, SrcInstanceIdOffset(InSrcInstanceIdOffset)
, DestInstanceIdOffset(InDestInstanceIdOffset)
{
checkSlow(InNumViews < (1u << NumViewBits));
checkSlow(InNumInstances < (1u << (32 - NumViewBits)));
}
};
TArray<FMeshDrawCommandInfo, SceneRenderingAllocator> MeshDrawCommandInfos;
TArray<FRHIDrawIndexedIndirectParameters, SceneRenderingAllocator> IndirectArgs;
TArray<FUintVector2, SceneRenderingAllocator> DrawCommandDescs;
TArray<FPayloadData, SceneRenderingAllocator> PayloadData;
TArray<uint32, SceneRenderingAllocator> InstanceIdOffsets;
TArray<FCompactionData, SceneRenderingAllocator> DrawCommandCompactionData;
TArray<uint32, SceneRenderingAllocator> CompactionBlockDataIndices;
uint32 NumCompactionInstances = 0U;
using LoadBalancerArray = TStaticArray<FInstanceProcessingGPULoadBalancer*, static_cast<uint32>(EBatchProcessingMode::Num)>;
// Driver for collecting items using one mode of processing
LoadBalancerArray LoadBalancers = LoadBalancerArray(InPlace, nullptr);
// Set of specialized batches that collect items with different properties each context may have only a subset.
//TStaticArray<FBatchProcessor, EBatchProcessingMode::Num> Batches;
// Processing mode to use for single-instance primitives, default to skip culling, as this is already done on CPU.
EBatchProcessingMode SingleInstanceProcessingMode = EBatchProcessingMode::UnCulled;
// Static uniform buffer slot for InstanceCulling/BatchedPrimitive UB. Only valid for a mobile renderer, which needs to handle cases with a manualy merged render-passes
FUniformBufferStaticSlot InstanceCullingStaticSlot;
// Whether current platform uses 'Uniform Buffer View' path
bool bUsesUniformBufferView;
#if MESH_DRAW_COMMAND_STATS
public:
// Optional pass stats
FMeshDrawCommandPassStats* MeshDrawCommandPassStats = nullptr;
#endif
};
// Applies ENUM_CLASS_FLAGS to the nested EInstanceFlags enum so its values can be combined as bit flags.
// Placed after the class definition because it names a type nested inside FInstanceCullingContext.
ENUM_CLASS_FLAGS(FInstanceCullingContext::EInstanceFlags)