960 lines
39 KiB
C++
960 lines
39 KiB
C++
// Copyright Epic Games, Inc. All Rights Reserved.
|
|
|
|
#include "InstanceCullingOcclusionQuery.h"
|
|
|
|
#include "Containers/ArrayView.h"
|
|
#include "Containers/ResourceArray.h"
|
|
#include "GPUScene.h"
|
|
#include "GlobalShader.h"
|
|
#include "InstanceCulling/InstanceCullingManager.h"
|
|
#include "RHIAccess.h"
|
|
#include "RHIFeatureLevel.h"
|
|
#include "RHIGlobals.h"
|
|
#include "RHIResourceUtils.h"
|
|
#include "RHIShaderPlatform.h"
|
|
#include "RHIStaticStates.h"
|
|
#include "RenderGraphBuilder.h"
|
|
#include "RenderGraphUtils.h"
|
|
#include "ShaderParameterMacros.h"
|
|
#include "ShaderParameterStruct.h"
|
|
#include "SceneRendering.h"
|
|
#include "ScenePrivate.h"
|
|
#include "UnifiedBuffer.h"
|
|
#include "HAL/IConsoleManager.h"
|
|
#include "DataDrivenShaderPlatformInfo.h"
|
|
#include "ProfilingDebugging/CpuProfilerTrace.h"
|
|
|
|
// Console switch for the experimental per-instance occlusion query path.
// Defaults to 0; flagged as render-thread safe and as a preview feature.
static TAutoConsoleVariable<int32> CVarInstanceCullingOcclusionQueries(
	TEXT("r.InstanceCulling.OcclusionQueries"),
	0,
	TEXT("EXPERIMENTAL: Use per-instance software occlusion queries to perform less conservative visibility test than what's possible with HZB alone"),
	ECVF_RenderThreadSafe | ECVF_Preview);
|
|
|
|
static int32 GInstanceCullingUseLoadBalancer = 1;
|
|
static FAutoConsoleVariableRef CVarInstanceCullingUseLoadBalancer(
|
|
TEXT("r.InstanceCulling.UseLoadBalancer"),
|
|
GInstanceCullingUseLoadBalancer,
|
|
TEXT("Prefer to use UseLoadBalancer"),
|
|
ECVF_RenderThreadSafe);
|
|
|
|
struct FInstanceCullingOcclusionQueryDeferredContext;
|
|
|
|
namespace
|
|
{
|
|
|
|
|
|
|
|
|
|
static EPixelFormat GetPreferredVisibilityMaskFormat()
|
|
{
|
|
EPixelFormat PossibleFormats[] =
|
|
{
|
|
PF_R8_UINT, // may be available if typed UAV load/store is supported on current hardware
|
|
PF_R32_UINT, // guaranteed to be supported
|
|
};
|
|
|
|
for (EPixelFormat Format : PossibleFormats)
|
|
{
|
|
EPixelFormatCapabilities Capabilities = GPixelFormats[Format].Capabilities;
|
|
if (EnumHasAllFlags(Capabilities, EPixelFormatCapabilities::TypedUAVLoad | EPixelFormatCapabilities::TypedUAVStore))
|
|
{
|
|
return Format;
|
|
}
|
|
}
|
|
|
|
return PF_Unknown;
|
|
}
|
|
|
|
}
|
|
|
|
/*
|
|
* Prepares indirect draw parameters for per-instance per-pixel occlusion query rendering pass.
|
|
*/
|
|
class FInstanceCullingOcclusionQueryCS : public FGlobalShader
|
|
{
|
|
DECLARE_GLOBAL_SHADER(FInstanceCullingOcclusionQueryCS);
|
|
SHADER_USE_PARAMETER_STRUCT(FInstanceCullingOcclusionQueryCS, FGlobalShader);
|
|
|
|
public:
|
|
|
|
class FMultiView : SHADER_PERMUTATION_BOOL("DIM_MULTI_VIEW");
|
|
class FUseLoadBalancerDim : SHADER_PERMUTATION_BOOL("USE_LOAD_BALANCER");
|
|
using FPermutationDomain = TShaderPermutationDomain<FMultiView, FUseLoadBalancerDim>;
|
|
|
|
static bool ShouldCompilePermutation(const FGlobalShaderPermutationParameters& Parameters)
|
|
{
|
|
return FDataDrivenShaderPlatformInfo::GetSupportsVertexShaderSRVs(Parameters.Platform);
|
|
}
|
|
|
|
static constexpr int32 NumThreadsPerGroup = 64;
|
|
|
|
static void ModifyCompilationEnvironment(const FGlobalShaderPermutationParameters& Parameters, FShaderCompilerEnvironment& OutEnvironment)
|
|
{
|
|
FGlobalShader::ModifyCompilationEnvironment(Parameters, OutEnvironment);
|
|
const FPermutationDomain PermutationVector(Parameters.PermutationId);
|
|
|
|
// Currently, instance compaction is not supported on mobile platforms
|
|
if (PermutationVector.Get<FUseLoadBalancerDim>())
|
|
{
|
|
FInstanceProcessingGPULoadBalancer::SetShaderDefines(OutEnvironment);
|
|
}
|
|
|
|
OutEnvironment.SetDefine(TEXT("USE_GLOBAL_GPU_SCENE_DATA"), 1);
|
|
OutEnvironment.SetDefine(TEXT("VF_SUPPORTS_PRIMITIVE_SCENE_DATA"), 1);
|
|
OutEnvironment.SetDefine(TEXT("NUM_THREADS_PER_GROUP_DEFAULT"), NumThreadsPerGroup);
|
|
}
|
|
|
|
BEGIN_SHADER_PARAMETER_STRUCT(FParameters, )
|
|
SHADER_PARAMETER_STRUCT_INCLUDE(FGPUSceneResourceParameters, GPUSceneParameters)
|
|
SHADER_PARAMETER_RDG_BUFFER_UAV(RWBuffer<uint32>, OutIndirectArgsBuffer)
|
|
SHADER_PARAMETER_RDG_BUFFER_UAV(RWBuffer<uint32>, OutInstanceIdBuffer)
|
|
SHADER_PARAMETER_RDG_BUFFER_UAV(RWBuffer<uint>, RWVisibilityMask) // One uint8/32 per instance (0 if instance is culled, non-0 otherwise)
|
|
SHADER_PARAMETER_RDG_BUFFER_SRV(Buffer<uint32>, InstanceIdBuffer)
|
|
SHADER_PARAMETER_STRUCT_REF(FViewUniformShaderParameters, View)
|
|
SHADER_PARAMETER_STRUCT_INCLUDE(FInstanceProcessingGPULoadBalancer::FShaderParameters, LoadBalancerParameters)
|
|
SHADER_PARAMETER_STRUCT_INCLUDE(FHZBParameters, HZBParameters)
|
|
SHADER_PARAMETER(float, OcclusionSlop)
|
|
SHADER_PARAMETER(int32, NumInstances)
|
|
SHADER_PARAMETER(uint32, ViewMask)
|
|
END_SHADER_PARAMETER_STRUCT()
|
|
};
|
|
|
|
IMPLEMENT_GLOBAL_SHADER(FInstanceCullingOcclusionQueryCS, "/Engine/Private/InstanceCulling/InstanceCullingOcclusionQuery.usf", "MainCS", SF_Compute);
|
|
|
|
class FInstanceCullingOcclusionQueryVS : public FGlobalShader
|
|
{
|
|
DECLARE_GLOBAL_SHADER(FInstanceCullingOcclusionQueryVS);
|
|
SHADER_USE_PARAMETER_STRUCT(FInstanceCullingOcclusionQueryVS, FGlobalShader);
|
|
|
|
public:
|
|
|
|
class FMultiView : SHADER_PERMUTATION_BOOL("DIM_MULTI_VIEW");
|
|
using FPermutationDomain = TShaderPermutationDomain<FMultiView>;
|
|
|
|
static bool ShouldCompilePermutation(const FGlobalShaderPermutationParameters& Parameters)
|
|
{
|
|
return FDataDrivenShaderPlatformInfo::GetSupportsVertexShaderSRVs(Parameters.Platform);
|
|
}
|
|
|
|
static void ModifyCompilationEnvironment(const FGlobalShaderPermutationParameters& Parameters, FShaderCompilerEnvironment& OutEnvironment)
|
|
{
|
|
FGlobalShader::ModifyCompilationEnvironment(Parameters, OutEnvironment);
|
|
OutEnvironment.SetDefine(TEXT("USE_GLOBAL_GPU_SCENE_DATA"), 1);
|
|
OutEnvironment.SetDefine(TEXT("VF_SUPPORTS_PRIMITIVE_SCENE_DATA"), 1);
|
|
}
|
|
|
|
BEGIN_SHADER_PARAMETER_STRUCT(FParameters, )
|
|
SHADER_PARAMETER_STRUCT_INCLUDE(FGPUSceneResourceParameters, GPUSceneParameters)
|
|
RDG_BUFFER_ACCESS(IndirectDrawArgsBuffer, ERHIAccess::IndirectArgs)
|
|
SHADER_PARAMETER_RDG_BUFFER_SRV(Buffer<uint32>, InstanceIdBuffer)
|
|
SHADER_PARAMETER_RDG_BUFFER_UAV(RWBuffer<uint>, RWVisibilityMask) // One uint8/32 per instance (0 if instance is culled, non-0 otherwise)
|
|
SHADER_PARAMETER_STRUCT_REF(FViewUniformShaderParameters, View)
|
|
SHADER_PARAMETER_STRUCT_INCLUDE(FHZBParameters, HZBParameters)
|
|
SHADER_PARAMETER(float, OcclusionSlop)
|
|
SHADER_PARAMETER(uint32, ViewMask)
|
|
END_SHADER_PARAMETER_STRUCT()
|
|
};
|
|
|
|
class FInstanceCullingOcclusionQueryPS : public FGlobalShader
|
|
{
|
|
DECLARE_GLOBAL_SHADER(FInstanceCullingOcclusionQueryPS);
|
|
SHADER_USE_PARAMETER_STRUCT(FInstanceCullingOcclusionQueryPS, FGlobalShader);
|
|
|
|
public:
|
|
static bool ShouldCompilePermutation(const FGlobalShaderPermutationParameters& Parameters)
|
|
{
|
|
return FDataDrivenShaderPlatformInfo::GetSupportsVertexShaderSRVs(Parameters.Platform);
|
|
}
|
|
|
|
static void ModifyCompilationEnvironment(const FGlobalShaderPermutationParameters& Parameters, FShaderCompilerEnvironment& OutEnvironment)
|
|
{
|
|
}
|
|
|
|
BEGIN_SHADER_PARAMETER_STRUCT(FParameters, )
|
|
SHADER_PARAMETER_RDG_BUFFER_UAV(RWBuffer<uint>, RWVisibilityMask) // One uint8/32 per instance (0 if instance is culled, non-0 otherwise)
|
|
END_SHADER_PARAMETER_STRUCT()
|
|
};
|
|
|
|
// Graphics shaders of the occlusion query draw pass; both entry points live in the same .usf file.
IMPLEMENT_GLOBAL_SHADER(
	FInstanceCullingOcclusionQueryVS,
	"/Engine/Private/InstanceCulling/InstanceCullingOcclusionQuery.usf",
	"MainVS",
	SF_Vertex);

IMPLEMENT_GLOBAL_SHADER(
	FInstanceCullingOcclusionQueryPS,
	"/Engine/Private/InstanceCulling/InstanceCullingOcclusionQuery.usf",
	"MainPS",
	SF_Pixel);
|
|
|
|
// Pass parameters of the occlusion query raster pass: vertex shader parameters, pixel shader parameters
// and the render target binding slots (used for the depth-stencil binding).
BEGIN_SHADER_PARAMETER_STRUCT(FOcclusionInstanceCullingParameters, )
	SHADER_PARAMETER_STRUCT_INCLUDE(FInstanceCullingOcclusionQueryVS::FParameters, VS)
	SHADER_PARAMETER_STRUCT_INCLUDE(FInstanceCullingOcclusionQueryPS::FParameters, PS)
	RENDER_TARGET_BINDING_SLOTS()
END_SHADER_PARAMETER_STRUCT()
|
|
|
|
class FInstanceCullingOcclusionQueryBox : public FRenderResource
|
|
{
|
|
public:
|
|
FBufferRHIRef IndexBuffer;
|
|
FBufferRHIRef VertexBuffer;
|
|
FVertexDeclarationRHIRef VertexDeclaration;
|
|
|
|
// Destructor
|
|
virtual ~FInstanceCullingOcclusionQueryBox() {}
|
|
|
|
virtual void InitRHI(FRHICommandListBase& RHICmdList) override
|
|
{
|
|
static const uint16 BoxIndexBufferData[] =
|
|
{
|
|
// Tri list
|
|
0, 1, 2, 0, 2, 3,
|
|
4, 5, 6, 4, 6, 7,
|
|
1, 4, 7, 1, 7, 2,
|
|
5, 0, 3, 5, 3, 6,
|
|
5, 4, 1, 5, 1, 0,
|
|
3, 2, 7, 3, 7, 6,
|
|
// Line list
|
|
0, 1, 0, 3, 0, 5,
|
|
7, 2, 7, 6, 7, 4,
|
|
3, 2, 1, 2, 3, 6,
|
|
5, 6, 5, 4, 1, 4
|
|
};
|
|
|
|
static const FVector3f BoxVertexBufferData[] =
|
|
{
|
|
FVector3f(-1.0f, +1.0f, +1.0f),
|
|
FVector3f(+1.0f, +1.0f, +1.0f),
|
|
FVector3f(+1.0f, -1.0f, +1.0f),
|
|
FVector3f(-1.0f, -1.0f, +1.0f),
|
|
FVector3f(+1.0f, +1.0f, -1.0f),
|
|
FVector3f(-1.0f, +1.0f, -1.0f),
|
|
FVector3f(-1.0f, -1.0f, -1.0f),
|
|
FVector3f(+1.0f, -1.0f, -1.0f),
|
|
};
|
|
|
|
IndexBuffer = UE::RHIResourceUtils::CreateIndexBufferFromArray(RHICmdList, TEXT("FInstanceCullingOcclusionQueryBox_IndexBuffer"), MakeConstArrayView(BoxIndexBufferData));
|
|
VertexBuffer = UE::RHIResourceUtils::CreateVertexBufferFromArray(RHICmdList, TEXT("FInstanceCullingOcclusionQueryBox_VertexBuffer"), MakeConstArrayView(BoxVertexBufferData));
|
|
|
|
FVertexDeclarationElementList VertexDeclarationElements;
|
|
VertexDeclarationElements.Add(FVertexElement(0, 0, VET_Float3, 0, 12));
|
|
VertexDeclaration = PipelineStateCache::GetOrCreateVertexDeclaration(VertexDeclarationElements);
|
|
}
|
|
|
|
virtual void ReleaseRHI() override
|
|
{
|
|
IndexBuffer.SafeRelease();
|
|
VertexBuffer.SafeRelease();
|
|
VertexDeclaration.SafeRelease();
|
|
}
|
|
};
|
|
|
|
TGlobalResource<FInstanceCullingOcclusionQueryBox> GInstanceCullingOcclusionQueryBox;
|
|
|
|
/*
 * Records the occlusion query draw: binds the box geometry and the VS/PS pair, then issues one
 * indexed indirect draw whose arguments come from PassParameters->VS.IndirectDrawArgsBuffer.
 *
 * @param RHICmdList      Command list to record into.
 * @param View            View providing the shader map and the viewport rectangle.
 * @param PassParameters  Raster pass parameters (VS and PS parameter blocks).
 * @param bMultiView      Selects the DIM_MULTI_VIEW vertex shader permutation.
 */
static void RenderInstanceOcclusionCulling(
	FRHICommandList& RHICmdList,
	FViewInfo& View,
	FOcclusionInstanceCullingParameters* PassParameters,
	bool bMultiView)
{
	// Shader lookup
	FInstanceCullingOcclusionQueryVS::FPermutationDomain VertexPermutation;
	VertexPermutation.Set<FInstanceCullingOcclusionQueryVS::FMultiView>(bMultiView);
	TShaderMapRef<FInstanceCullingOcclusionQueryVS> BoxVertexShader(View.ShaderMap, VertexPermutation);
	TShaderMapRef<FInstanceCullingOcclusionQueryPS> BoxPixelShader(View.ShaderMap);

	FGraphicsPipelineStateInitializer PSOInit;
	RHICmdList.ApplyCachedRenderTargets(PSOInit);

	// Viewport covers the view rectangle with the full [0, 1] depth range
	const auto& Rect = View.ViewRect;
	RHICmdList.SetViewport(Rect.Min.X, Rect.Min.Y, 0.0f, Rect.Max.X, Rect.Max.Y, 1.0f);

	// Fixed-function state
	PSOInit.DepthStencilState = TStaticDepthStencilState<false, CF_DepthNearOrEqual>::GetRHI(); // Depth test, no write
	PSOInit.BlendState = TStaticBlendState<>::GetRHI(); // Blend state does not matter, as we are not writing to render targets
	PSOInit.RasterizerState = TStaticRasterizerState<FM_Solid, CM_CW>::GetRHI();
	PSOInit.PrimitiveType = PT_TriangleList;

	// Bound shader state
	PSOInit.BoundShaderState.VertexDeclarationRHI = GInstanceCullingOcclusionQueryBox.VertexDeclaration;
	PSOInit.BoundShaderState.VertexShaderRHI = BoxVertexShader.GetVertexShader();
	PSOInit.BoundShaderState.PixelShaderRHI = BoxPixelShader.GetPixelShader();

	SetGraphicsPipelineState(RHICmdList, PSOInit, 0);

	ClearUnusedGraphResources(BoxVertexShader, &PassParameters->VS);
	ClearUnusedGraphResources(BoxPixelShader, &PassParameters->PS);

	SetShaderParameters(RHICmdList, BoxVertexShader, BoxVertexShader.GetVertexShader(), PassParameters->VS);
	SetShaderParameters(RHICmdList, BoxPixelShader, BoxPixelShader.GetPixelShader(), PassParameters->PS);

	RHICmdList.SetStreamSource(0, GInstanceCullingOcclusionQueryBox.VertexBuffer, 0);

	// The indirect arguments buffer is consumed directly by the draw call below
	FRDGBufferRef DrawArgsBuffer = PassParameters->VS.IndirectDrawArgsBuffer;
	DrawArgsBuffer->MarkResourceAsUsed();

	RHICmdList.DrawIndexedPrimitiveIndirect(GInstanceCullingOcclusionQueryBox.IndexBuffer, DrawArgsBuffer->GetRHI(), 0);
}
|
|
|
|
|
|
/*
|
|
* Structure to compute data that's not available on the rendering thread during RDG setup.
|
|
* In particular, we want to wait for visible mesh draw commands as late as possible.
|
|
*/
|
|
struct FInstanceCullingOcclusionQueryDeferredContext
|
|
{
|
|
FInstanceCullingOcclusionQueryDeferredContext(const FViewInfo* InView, int32 InNumGPUSceneInstances, EMeshPass::Type InMeshPass, FInstanceCullingContext* InInstanceCullingContext)
|
|
: View(InView)
|
|
, NumGPUSceneInstances(InNumGPUSceneInstances)
|
|
, MeshPass(InMeshPass)
|
|
, InstanceCullingContext(InInstanceCullingContext)
|
|
{
|
|
}
|
|
|
|
static FORCEINLINE bool IsRelevantCommand(const FVisibleMeshDrawCommand& VisibleCommand)
|
|
{
|
|
// There may be multiple visible mesh draw commands that refer to the same instance when GPU-based LOD selection is used.
|
|
// This filter is designed to remove the duplicates, keeping only the "authoritative" instance.
|
|
// TODO: a less implicit mechanism would be welcome here, such as a dedicated flag.
|
|
const EMeshDrawCommandCullingPayloadFlags Flags = VisibleCommand.CullingPayloadFlags;
|
|
const bool bCompatibleFlags = Flags == EMeshDrawCommandCullingPayloadFlags::Default
|
|
|| Flags == EMeshDrawCommandCullingPayloadFlags::MinScreenSizeCull;
|
|
|
|
// Only commands with HasPrimitiveIdStreamIndex are compatible with GPU Instance Culling
|
|
const bool bSupportsGPUSceneInstancing = EnumHasAnyFlags(VisibleCommand.Flags, EFVisibleMeshDrawCommandFlags::HasPrimitiveIdStreamIndex);
|
|
|
|
// NumPrimitives is 0 if mesh draw command uses IndirectArgs
|
|
// This path is currently not implemented/supported by oclcusion query culling.
|
|
// Commands that use instance runs are currently not supported.
|
|
return bCompatibleFlags
|
|
&& bSupportsGPUSceneInstancing
|
|
&& VisibleCommand.PrimitiveIdInfo.InstanceSceneDataOffset != INDEX_NONE
|
|
&& VisibleCommand.NumRuns == 0;
|
|
};
|
|
|
|
static FORCEINLINE uint32 GetCommandNumInstances(const FVisibleMeshDrawCommand& VisibleMeshDrawCommand, const FScene *Scene)
|
|
{
|
|
const bool bFetchInstanceCountFromScene = EnumHasAnyFlags(VisibleMeshDrawCommand.Flags, EFVisibleMeshDrawCommandFlags::FetchInstanceCountFromScene);
|
|
if (bFetchInstanceCountFromScene)
|
|
{
|
|
check(Scene != nullptr);
|
|
check(!VisibleMeshDrawCommand.PrimitiveIdInfo.bIsDynamicPrimitive);
|
|
return uint32(Scene->Primitives[VisibleMeshDrawCommand.PrimitiveIdInfo.ScenePrimitiveId]->GetNumInstanceSceneDataEntries());
|
|
}
|
|
return VisibleMeshDrawCommand.MeshDrawCommand->NumInstances;
|
|
}
|
|
|
|
void Execute()
|
|
{
|
|
if (bFunctionExecuted)
|
|
{
|
|
return;
|
|
}
|
|
|
|
TRACE_CPUPROFILER_EVENT_SCOPE(FInstanceCullingOcclusionQueryDeferredContext::Execute);
|
|
|
|
bFunctionExecuted = true;
|
|
|
|
if (!View->ParallelMeshDrawCommandPasses[MeshPass])
|
|
{
|
|
return;
|
|
}
|
|
|
|
const FParallelMeshDrawCommandPass& MeshDrawCommandPass = *View->ParallelMeshDrawCommandPasses[MeshPass];
|
|
|
|
// Execute() is expected to run late enough to not stall here.
|
|
// If it does happen, then we may have to move the render pass to later point in the frame.
|
|
MeshDrawCommandPass.WaitForSetupTask();
|
|
|
|
if (InstanceCullingContext != nullptr)
|
|
{
|
|
InstanceProcessingGPULoadBalancer = InstanceCullingContext->LoadBalancers[int(EBatchProcessingMode::Generic)];
|
|
bValid = (InstanceProcessingGPULoadBalancer != nullptr);
|
|
static FInstanceProcessingGPULoadBalancer Dummy;
|
|
// Always provide a load balancer so that CreateLoadBalancerGPUDataDeferred doesn't crash. bValid=false will skip the dispatch
|
|
if (!bValid)
|
|
{
|
|
InstanceProcessingGPULoadBalancer = &Dummy;
|
|
}
|
|
|
|
// in case something goes wrong: we will skip the compute since bValid won't be true and we will fill up the data from VisibleInstanceIds
|
|
AlignedNumInstances = FInstanceCullingOcclusionQueryCS::NumThreadsPerGroup;
|
|
VisibleInstanceIds.SetNumZeroed(AlignedNumInstances);
|
|
InstanceProcessingGPULoadBalancer->FinalizeBatches();
|
|
FIntVector LoadBalancerNumThreadGroups = InstanceProcessingGPULoadBalancer->GetWrappedCsGroupCount();
|
|
// Needed to allocate the buffer holding the instance ids after the culling pass, see DeferredAlignedNumInstancesOutputCulling
|
|
AlignedNumInstances = LoadBalancerNumThreadGroups.X * LoadBalancerNumThreadGroups.Y * LoadBalancerNumThreadGroups.Z * FInstanceCullingOcclusionQueryCS::NumThreadsPerGroup;
|
|
|
|
return;
|
|
}
|
|
|
|
const FMeshCommandOneFrameArray& VisibleMeshDrawCommands = MeshDrawCommandPass.GetMeshDrawCommands();
|
|
|
|
const FScene *Scene = View->Family->Scene->GetRenderScene();
|
|
|
|
NumInstances = CountVisibleInstances(VisibleMeshDrawCommands, Scene);
|
|
|
|
NumThreadGroups = FComputeShaderUtils::GetGroupCount(NumInstances, FInstanceCullingOcclusionQueryCS::NumThreadsPerGroup);
|
|
|
|
const int32 MaxSupportedInstances = GRHIGlobals.MaxDispatchThreadGroupsPerDimension.X * FInstanceCullingOcclusionQueryCS::NumThreadsPerGroup;
|
|
if (!ensureMsgf(NumThreadGroups.X * FInstanceCullingOcclusionQueryCS::NumThreadsPerGroup <= MaxSupportedInstances,
|
|
TEXT("Number of instances (%d) is greater than currently supported by FInstanceCullingOcclusionQueryRenderer (%d). ")
|
|
TEXT("Per-instance occlusion queries will be disabled. ")
|
|
TEXT("Increase FInstanceCullingOcclusionQueryCS::NumThreadsPerGroup or implement wrapped group count support."),
|
|
NumInstances, MaxSupportedInstances))
|
|
{
|
|
return;
|
|
}
|
|
|
|
// Align buffer sizes to ensure each thread in the thread group has a valid slot to write without introducing bounds checks
|
|
AlignedNumInstances = NumThreadGroups.X * FInstanceCullingOcclusionQueryCS::NumThreadsPerGroup;
|
|
|
|
if (AlignedNumInstances == 0)
|
|
{
|
|
return;
|
|
}
|
|
|
|
const uint32 DynamicPrimitiveInstanceOffset = View->DynamicPrimitiveCollector.GetInstanceSceneDataOffset();
|
|
|
|
FillVisibleInstanceIds(VisibleMeshDrawCommands, DynamicPrimitiveInstanceOffset, Scene);
|
|
|
|
bValid = true;
|
|
}
|
|
|
|
uint32 CountVisibleInstances(const FMeshCommandOneFrameArray& VisibleMeshDrawCommands, const FScene *Scene) const
|
|
{
|
|
TRACE_CPUPROFILER_EVENT_SCOPE(FInstanceCullingOcclusionQueryDeferredContext::CountVisibleInstances);
|
|
|
|
uint32 Result = 0;
|
|
|
|
for (const FVisibleMeshDrawCommand& VisibleCommand : VisibleMeshDrawCommands)
|
|
{
|
|
if (!IsRelevantCommand(VisibleCommand))
|
|
{
|
|
continue;
|
|
}
|
|
Result += GetCommandNumInstances(VisibleCommand, Scene);
|
|
}
|
|
|
|
return Result;
|
|
}
|
|
|
|
void FillVisibleInstanceIds(const FMeshCommandOneFrameArray& VisibleMeshDrawCommands, const uint32 DynamicPrimitiveInstanceOffset, const FScene *Scene)
|
|
{
|
|
TRACE_CPUPROFILER_EVENT_SCOPE(FInstanceCullingOcclusionQueryDeferredContext::FillVisibleInstanceIds);
|
|
|
|
check(AlignedNumInstances != 0);
|
|
|
|
// Write output data directly, bypassing TArray::Add overhead (resize branch, etc.)
|
|
VisibleInstanceIds.SetNumUninitialized(AlignedNumInstances);
|
|
uint32* ResultData = VisibleInstanceIds.GetData();
|
|
uint32* ResultCursor = ResultData;
|
|
|
|
for (const FVisibleMeshDrawCommand& VisibleCommand : VisibleMeshDrawCommands)
|
|
{
|
|
if (!IsRelevantCommand(VisibleCommand))
|
|
{
|
|
continue;
|
|
}
|
|
uint32 CommandNumInstances = GetCommandNumInstances(VisibleCommand, Scene);
|
|
if (CommandNumInstances == 0u)
|
|
{
|
|
continue;
|
|
}
|
|
|
|
uint32 InstanceBaseIndex = VisibleCommand.PrimitiveIdInfo.InstanceSceneDataOffset;
|
|
if (VisibleCommand.PrimitiveIdInfo.bIsDynamicPrimitive)
|
|
{
|
|
InstanceBaseIndex += DynamicPrimitiveInstanceOffset;
|
|
}
|
|
|
|
check(InstanceBaseIndex + CommandNumInstances <= uint32(NumGPUSceneInstances));
|
|
|
|
for (uint32 i = 0; i < CommandNumInstances; ++i)
|
|
{
|
|
*ResultCursor = InstanceBaseIndex + i;
|
|
++ResultCursor;
|
|
}
|
|
}
|
|
|
|
for (int32 i = NumInstances; i < AlignedNumInstances; ++i)
|
|
{
|
|
*ResultCursor = 0;
|
|
++ResultCursor;
|
|
}
|
|
|
|
check(ResultCursor == ResultData + AlignedNumInstances);
|
|
}
|
|
|
|
FRDGBufferNumElementsCallback DeferredAlignedNumInstancesOutputCulling()
|
|
{
|
|
return [Context = this]() -> uint32
|
|
{
|
|
Context->Execute();
|
|
return Context->AlignedNumInstances;
|
|
};
|
|
}
|
|
|
|
FRDGBufferNumElementsCallback DeferredNumInstanceIdData()
|
|
{
|
|
return [Context = this]() -> uint32
|
|
{
|
|
Context->Execute();
|
|
return Context->VisibleInstanceIds.Num();
|
|
};
|
|
}
|
|
|
|
FRDGBufferInitialDataCallback DeferredInstanceIdData()
|
|
{
|
|
return [Context = this]() -> const void*
|
|
{
|
|
Context->Execute();
|
|
return Context->VisibleInstanceIds.GetData();
|
|
};
|
|
}
|
|
|
|
FRDGBufferInitialDataSizeCallback DeferredInstanceIdDataSize()
|
|
{
|
|
return [Context = this]() -> uint64
|
|
{
|
|
Context->Execute();
|
|
return Context->VisibleInstanceIds.Num() * Context->VisibleInstanceIds.GetTypeSize();
|
|
};
|
|
}
|
|
|
|
// Execute function may be called multiple times, but we only want to run computations once
|
|
bool bFunctionExecuted = false;
|
|
|
|
// If this is false, then some late validation have failed and rendering should be skipped
|
|
bool bValid = false;
|
|
|
|
const FViewInfo* View = nullptr;
|
|
int32 NumGPUSceneInstances = 0;
|
|
EMeshPass::Type MeshPass = EMeshPass::Num;
|
|
FInstanceCullingContext* InstanceCullingContext = nullptr;
|
|
FInstanceProcessingGPULoadBalancer* InstanceProcessingGPULoadBalancer = nullptr;
|
|
int32 NumInstances = 0;
|
|
int32 AlignedNumInstances = 0;
|
|
FIntVector NumThreadGroups = FIntVector::ZeroValue;
|
|
|
|
TArray<uint32> VisibleInstanceIds;
|
|
};
|
|
|
|
/*
 * Creates the structured buffers (and their SRVs) holding the load balancer batches and items.
 * The array contents are supplied through callbacks that run DeferredContext->Execute() first, then read
 * the arrays from DeferredContext->InstanceProcessingGPULoadBalancer.
 */
static void CreateLoadBalancerGPUDataDeferred(FRDGBuilder& GraphBuilder, FInstanceCullingOcclusionQueryCS::FParameters* PassParameters, FInstanceCullingOcclusionQueryDeferredContext* DeferredContext)
{
	FRDGBufferRef BatchesBuffer = CreateStructuredBuffer(
		GraphBuilder,
		TEXT("InstanceCullingLoadBalancer.Batches"),
		[DeferredContext]() -> const TArray<FInstanceCullingLoadBalancerBase::FPackedBatch>&
		{
			DeferredContext->Execute();
			return DeferredContext->InstanceProcessingGPULoadBalancer->GetBatches();
		});
	PassParameters->LoadBalancerParameters.BatchBuffer = GraphBuilder.CreateSRV(BatchesBuffer);

	FRDGBufferRef ItemsBuffer = CreateStructuredBuffer(
		GraphBuilder,
		TEXT("InstanceCullingLoadBalancer.Items"),
		[DeferredContext]() -> const TArray<FInstanceCullingLoadBalancerBase::FPackedItem>&
		{
			DeferredContext->Execute();
			return DeferredContext->InstanceProcessingGPULoadBalancer->GetItems();
		});
	PassParameters->LoadBalancerParameters.ItemBuffer = GraphBuilder.CreateSRV(ItemsBuffer);
}
|
|
|
|
/*
 * Adds the per-instance occlusion query passes for one view to the render graph:
 *  1. a compute pass that filters instances and fills the indirect draw arguments and instance id list,
 *  2. a raster pass that draws instance bounding boxes against scene depth and marks visible instances
 *     in the visibility mask buffer.
 *
 * The visibility mask buffer is created (and cleared to 0xFFFFFFFF) on first use and then reused by later calls.
 *
 * @param GraphBuilder  Render graph to add resources and passes to.
 * @param GPUScene      Source of the instance count and GPU scene shader parameters.
 * @param View          View to run occlusion queries for.
 * @return The view mask assigned to this view, or 0 when the view is incompatible or no view slot is available.
 */
uint32 FInstanceCullingOcclusionQueryRenderer::Render(
	FRDGBuilder& GraphBuilder,
	FGPUScene& GPUScene,
	FViewInfo& View)
{
	if (!IsCompatibleWithView(View))
	{
		return 0;
	}

	const uint32 ViewMask = FindOrAddViewSlot(View);
	if (ViewMask == 0)
	{
		// Silently fall back to no culling when we hit the limit of maximum supported views
		return 0;
	}

	TRACE_CPUPROFILER_EVENT_SCOPE(FInstanceCullingOcclusionQueryRenderer::Render);

	// Whether to use shader permutation that preserves visibility bits corresponding to other views (slight extra cost)
	const bool bMultiView = CurrentRenderedViewIDs.Num() > 1;

	const int32 NumGPUSceneInstances = GPUScene.GetNumInstances();

	FInstanceCullingContext* InstanceCullingContext = nullptr;
	if (auto* Pass = View.ParallelMeshDrawCommandPasses[EMeshPass::BasePass]; Pass && GInstanceCullingUseLoadBalancer > 0)
	{
		// At this point in time, we don't have the guarantee that MeshDrawCommandPass is done. Only access stable members, not batches/items/mdcs
		InstanceCullingContext = Pass->GetInstanceCullingContext();
	}

	// The load balancer path is taken exactly when an instance culling context is available
	const bool bUseGPULoadBalancer = (InstanceCullingContext != nullptr);

	FInstanceCullingOcclusionQueryDeferredContext* DeferredContext = GraphBuilder.AllocObject<FInstanceCullingOcclusionQueryDeferredContext>(
		&View, NumGPUSceneInstances, EMeshPass::BasePass, InstanceCullingContext);

	FRDGTextureRef DepthTexture = View.GetSceneTextures().Depth.Target;

	checkf(DepthTexture && IsHZBValid(View, EHZBType::FurthestHZB),
		TEXT("Occlusion query instance culling pass requires scene depth texture and HZB. See FInstanceCullingOcclusionQueryRenderer::IsCompatibleWithView()"));

	const FGPUSceneResourceParameters GPUSceneParameters = GPUScene.GetShaderParameters(GraphBuilder);

	const EPixelFormat VisibilityMaskFormat = GetPreferredVisibilityMaskFormat();
	const int32 VisibilityMaskStride = GPixelFormats[VisibilityMaskFormat].BlockBytes;

	// Create the result buffer on demand
	if (!CurrentInstanceOcclusionQueryBuffer)
	{
		// Round the element count up to a whole number of thread groups
		const int32 AlignedNumGPUSceneInstances =
			FMath::DivideAndRoundUp(NumGPUSceneInstances, FInstanceCullingOcclusionQueryCS::NumThreadsPerGroup)
			* FInstanceCullingOcclusionQueryCS::NumThreadsPerGroup;

		CurrentInstanceOcclusionQueryBuffer = GraphBuilder.CreateBuffer(
			FRDGBufferDesc::CreateBufferDesc(VisibilityMaskStride, AlignedNumGPUSceneInstances),
			TEXT("FInstanceCullingOcclusionQueryRenderer_VisibleInstanceMask"));

		InstanceOcclusionQueryBufferFormat = VisibilityMaskFormat;
		AllocatedNumInstances = NumGPUSceneInstances;

		// Create a wide-format alias for the underlying resource for a more efficient clear
		FRDGBufferUAVRef WideClearUAV = GraphBuilder.CreateUAV(CurrentInstanceOcclusionQueryBuffer, PF_R32G32B32A32_UINT);
		AddClearUAVPass(GraphBuilder, WideClearUAV, 0xFFFFFFFF);
	}

	checkf(uint32(NumGPUSceneInstances) == AllocatedNumInstances, TEXT("Number of instances in GPUScene is not expected change to during the frame"));

	FRDGBufferRef VisibleInstanceMaskBuffer = CurrentInstanceOcclusionQueryBuffer;
	FRDGBufferUAVRef VisibilityMaskUAV = GraphBuilder.CreateUAV(VisibleInstanceMaskBuffer, VisibilityMaskFormat);

	FRDGBufferRef IndirectArgsBuffer = GraphBuilder.CreateBuffer(
		FRDGBufferDesc::CreateIndirectDesc<FRHIDrawIndexedIndirectParameters>(1),
		TEXT("FInstanceCullingOcclusionQueryRenderer_IndirectArgsBuffer"));
	FRDGBufferUAVRef IndirectArgsUAV = GraphBuilder.CreateUAV(IndirectArgsBuffer, PF_R32_UINT);

	// Buffer of GPUScene instance indices to run occlusion queries for (input for setup CS)
	FRDGBufferRef SetupInstanceIdBuffer;

	// When using the GPU load balancer, the upload of the data holding instance ids happens below with InstanceProcessingGPULoadBalancer->Upload
	if (InstanceCullingContext == nullptr)
	{
		SetupInstanceIdBuffer = GraphBuilder.CreateBuffer(
			FRDGBufferDesc::CreateBufferDesc(sizeof(uint32), 1 /*real size is provided via callback later*/),
			TEXT("FInstanceCullingOcclusionQueryRenderer_SetupInstanceIdBuffer"),
			DeferredContext->DeferredNumInstanceIdData());

		GraphBuilder.QueueBufferUpload(
			SetupInstanceIdBuffer,
			DeferredContext->DeferredInstanceIdData(),
			DeferredContext->DeferredInstanceIdDataSize());
	}
	else
	{
		SetupInstanceIdBuffer = GSystemTextures.GetDefaultBuffer(GraphBuilder, 4);
	}

	FRDGBufferSRVRef SetupInstanceIdBufferSRV = GraphBuilder.CreateSRV(SetupInstanceIdBuffer, PF_R32_UINT);

	// Buffer of GPUScene instance indices that passed the filtering in the setup CS pass and should be rendered in the subsequent graphics pass
	FRDGBufferRef InstanceIdBuffer = GraphBuilder.CreateBuffer(
		FRDGBufferDesc::CreateBufferDesc(sizeof(uint32), 1 /*real size is provided via callback later*/),
		TEXT("FInstanceCullingOcclusionQueryRenderer_InstanceIdBuffer"),
		DeferredContext->DeferredAlignedNumInstancesOutputCulling());

	FRDGBufferUAVRef InstanceIdUAV = GraphBuilder.CreateUAV(InstanceIdBuffer, PF_R32_UINT);
	FRDGBufferSRVRef InstanceIdSRV = GraphBuilder.CreateSRV(InstanceIdBuffer, PF_R32_UINT);

	AddClearUAVPass(GraphBuilder, IndirectArgsUAV, 0);

	// Compute pass to perform initial per-instance filtering and prepare instance list for per-pixel occlusion tests
	{
		FInstanceCullingOcclusionQueryCS::FParameters* PassParameters = GraphBuilder.AllocParameters<FInstanceCullingOcclusionQueryCS::FParameters>();

		// FInstanceGPULoadBalancer uses the SceneRenderingAllocator which should keep data alive until the graphbuilder execution
		if (bUseGPULoadBalancer)
		{
			CreateLoadBalancerGPUDataDeferred(GraphBuilder, PassParameters, DeferredContext);
		}

		PassParameters->OutIndirectArgsBuffer = IndirectArgsUAV;
		PassParameters->OutInstanceIdBuffer = InstanceIdUAV;
		PassParameters->RWVisibilityMask = VisibilityMaskUAV;
		PassParameters->View = View.ViewUniformBuffer;
		PassParameters->HZBParameters = GetHZBParameters(GraphBuilder, View, EHZBType::FurthestHZB);
		PassParameters->OcclusionSlop = OCCLUSION_SLOP;
		PassParameters->GPUSceneParameters = GPUSceneParameters;
		PassParameters->NumInstances = 0; // filled from DeferredContext later
		PassParameters->InstanceIdBuffer = SetupInstanceIdBufferSRV;
		PassParameters->ViewMask = ViewMask;

		FInstanceCullingOcclusionQueryCS::FPermutationDomain CSPermutationVector;
		CSPermutationVector.Set<FInstanceCullingOcclusionQueryCS::FMultiView>(bMultiView);
		CSPermutationVector.Set<FInstanceCullingOcclusionQueryCS::FUseLoadBalancerDim>(bUseGPULoadBalancer);
		TShaderMapRef<FInstanceCullingOcclusionQueryCS> ComputeShader(View.ShaderMap, CSPermutationVector);

		ClearUnusedGraphResources(ComputeShader, PassParameters);

		GraphBuilder.AddPass(
			RDG_EVENT_NAME("InstanceCullingOcclusionQueryRenderer_Setup"),
			PassParameters,
			ERDGPassFlags::Compute,
			[PassParameters, DeferredContext, ComputeShader](FRDGAsyncTask, FRHIComputeCommandList& RHICmdList)
			{
				// Late validation failed (or there is nothing to process): skip the dispatch
				if (!DeferredContext->bValid)
				{
					return;
				}

				PassParameters->NumInstances = DeferredContext->NumInstances;

				// Default to the group count computed from the visible commands; the load balancer overrides it
				FIntVector CullingNumThreadGroups(DeferredContext->NumThreadGroups);
				if (FInstanceProcessingGPULoadBalancer* LoadBalancer = DeferredContext->InstanceProcessingGPULoadBalancer)
				{
					PassParameters->LoadBalancerParameters.NumBatches = LoadBalancer->GetBatches().Num();
					PassParameters->LoadBalancerParameters.NumItems = LoadBalancer->GetItems().Num();
					CullingNumThreadGroups = LoadBalancer->GetWrappedCsGroupCount();
				}

				FComputeShaderUtils::Dispatch(
					RHICmdList,
					ComputeShader,
					*PassParameters,
					CullingNumThreadGroups);
			});
	}

	// Perform per-instance per-pixel occlusion tests by drawing bounding boxes that write into VisibleInstanceMaskBuffer slots for visible instances
	{
		FOcclusionInstanceCullingParameters* PassParameters = GraphBuilder.AllocParameters<FOcclusionInstanceCullingParameters>();

		PassParameters->VS.IndirectDrawArgsBuffer = IndirectArgsBuffer;
		PassParameters->VS.View = View.ViewUniformBuffer;
		PassParameters->VS.HZBParameters = GetHZBParameters(GraphBuilder, View, EHZBType::FurthestHZB);
		PassParameters->VS.OcclusionSlop = OCCLUSION_SLOP;
		PassParameters->VS.GPUSceneParameters = GPUSceneParameters;
		PassParameters->VS.InstanceIdBuffer = InstanceIdSRV;
		PassParameters->VS.ViewMask = ViewMask;
		PassParameters->VS.RWVisibilityMask = VisibilityMaskUAV;
		PassParameters->PS.RWVisibilityMask = VisibilityMaskUAV;

		// Scene depth is bound read-only; stencil is not accessed
		PassParameters->RenderTargets.DepthStencil = FDepthStencilBinding(
			DepthTexture,
			ERenderTargetLoadAction::ELoad,
			ERenderTargetLoadAction::ENoAction,
			FExclusiveDepthStencil::DepthRead_StencilNop);

		GraphBuilder.AddPass(
			RDG_EVENT_NAME("InstanceCullingOcclusionQueryRenderer_Draw"),
			PassParameters,
			ERDGPassFlags::Raster | ERDGPassFlags::NeverCull,
			[PassParameters, DeferredContext, bMultiView, &View](FRDGAsyncTask, FRHICommandList& RHICmdList)
			{
				if (!DeferredContext->bValid)
				{
					return;
				}

				RenderInstanceOcclusionCulling(RHICmdList, View, PassParameters, bMultiView);
			});
	}

	return ViewMask;
}
|
|
|
|
/**
 * Overwrites the visibility mask entries of the given instance ranges in the buffer extracted
 * from the previous frame, so that those instances read as visible in every view.
 *
 * @param GraphBuilder	Render graph that receives the memset passes.
 * @param Ranges		Instance scene data ranges (offset + count) to mark as visible.
 */
void FInstanceCullingOcclusionQueryRenderer::MarkInstancesVisible(FRDGBuilder& GraphBuilder, TConstArrayView<FGPUSceneInstanceRange> Ranges)
{
	// Without a buffer from the previous frame there is nothing to overwrite
	if (!InstanceOcclusionQueryBuffer)
	{
		return;
	}

	const EPixelFormat MaskFormat = GetPreferredVisibilityMaskFormat();
	FRDGBufferRef MaskBuffer = GraphBuilder.RegisterExternalBuffer(InstanceOcclusionQueryBuffer);

	// A single SkipBarrier UAV is shared by all memsets below, so consecutive uses may run in parallel.
	// Allocating a unique RDG UAV here still ensures that a barrier is inserted before the first dispatch.
	FRDGBufferUAVRef MaskUAV = GraphBuilder.CreateUAV(MaskBuffer, MaskFormat, ERDGUnorderedAccessViewFlags::SkipBarrier);

	// NOTE: A specialized GPU scatter shader could make this more efficient, if many small batches show up here in practice
	for (const FGPUSceneInstanceRange& InstanceRange : Ranges)
	{
		FMemsetResourceParams FillParams;
		FillParams.DstOffset = InstanceRange.InstanceSceneDataOffset;
		FillParams.Count = InstanceRange.NumInstanceSceneDataEntries;
		FillParams.Value = 0xFFFFFFFF; // All bits set: instance is visible in all views
		MemsetResource(GraphBuilder, MaskUAV, FillParams);
	}
}
|
|
|
|
/**
 * Finishes the frame: resets the per-frame view slot list and, if a visibility buffer was
 * produced this frame, queues its extraction into InstanceOcclusionQueryBuffer.
 *
 * @param GraphBuilder	Render graph used to queue the buffer extraction.
 */
void FInstanceCullingOcclusionQueryRenderer::EndFrame(FRDGBuilder& GraphBuilder)
{
	// View slots are always reset, whether or not a buffer was produced this frame
	CurrentRenderedViewIDs.Empty();

	if (!CurrentInstanceOcclusionQueryBuffer)
	{
		return;
	}

	// Hand this frame's buffer over to the persistent member, extracted in SRV-readable state
	GraphBuilder.QueueBufferExtraction(CurrentInstanceOcclusionQueryBuffer, &InstanceOcclusionQueryBuffer, ERHIAccess::SRVMask);

	// Drop the per-frame RDG handle and allocation bookkeeping
	AllocatedNumInstances = 0;
	CurrentInstanceOcclusionQueryBuffer = {};
}
|
|
|
|
/**
 * Returns the single-bit mask of the slot assigned to View in CurrentRenderedViewIDs,
 * assigning a new slot when the view key has not been registered yet.
 *
 * A view that already owns a slot always gets its mask back, even when every slot is in use;
 * the capacity limit only applies when a new slot would have to be added.
 *
 * @param View	View whose key (View.GetViewKey()) identifies the slot.
 * @return		(1u << SlotIndex) for the view's slot, or 0 when the view key is 0 or when a new
 *				slot is required but MaxViews slots are already taken.
 */
uint32 FInstanceCullingOcclusionQueryRenderer::FindOrAddViewSlot(const FViewInfo& View)
{
	const uint32 ViewKey = View.GetViewKey();

	// A zero key never receives a slot
	if (ViewKey == 0)
	{
		return 0;
	}

	// Look for an existing slot first, so that a full slot list does not hide an already registered view
	int32 SlotIndex = CurrentRenderedViewIDs.Find(ViewKey);

	if (SlotIndex == INDEX_NONE)
	{
		if (CurrentRenderedViewIDs.Num() >= MaxViews)
		{
			// All slots are taken and this view is not among them
			return 0;
		}

		SlotIndex = CurrentRenderedViewIDs.Add(ViewKey);
	}

	check(SlotIndex >= 0 && SlotIndex < MaxViews);
	return 1u << SlotIndex;
}
|
|
|
|
bool FInstanceCullingOcclusionQueryRenderer::IsCompatibleWithView(const FViewInfo& View)
|
|
{
|
|
EPixelFormat VisibilityMaskFormat = GetPreferredVisibilityMaskFormat();
|
|
return FDataDrivenShaderPlatformInfo::GetSupportsVertexShaderSRVs(View.GetShaderPlatform())
|
|
&& View.GetViewKey() != 0
|
|
&& View.GetSceneTextures().Depth.Target
|
|
&& IsHZBValid(View, EHZBType::FurthestHZB)
|
|
&& VisibilityMaskFormat != PF_Unknown
|
|
&& CVarInstanceCullingOcclusionQueries.GetValueOnRenderThread() != 0;
|
|
}
|
|
|
|
// Debugging utilities
|
|
|
|
class FInstanceCullingOcclusionQueryDebugVS : public FGlobalShader
|
|
{
|
|
DECLARE_GLOBAL_SHADER(FInstanceCullingOcclusionQueryDebugVS);
|
|
SHADER_USE_PARAMETER_STRUCT(FInstanceCullingOcclusionQueryDebugVS, FGlobalShader);
|
|
|
|
public:
|
|
static bool ShouldCompilePermutation(const FGlobalShaderPermutationParameters& Parameters)
|
|
{
|
|
return FDataDrivenShaderPlatformInfo::GetSupportsVertexShaderSRVs(Parameters.Platform);
|
|
}
|
|
|
|
static void ModifyCompilationEnvironment(const FGlobalShaderPermutationParameters& Parameters, FShaderCompilerEnvironment& OutEnvironment)
|
|
{
|
|
FGlobalShader::ModifyCompilationEnvironment(Parameters, OutEnvironment);
|
|
OutEnvironment.SetDefine(TEXT("USE_GLOBAL_GPU_SCENE_DATA"), 1);
|
|
OutEnvironment.SetDefine(TEXT("VF_SUPPORTS_PRIMITIVE_SCENE_DATA"), 1);
|
|
}
|
|
|
|
BEGIN_SHADER_PARAMETER_STRUCT(FParameters, )
|
|
SHADER_PARAMETER_STRUCT_INCLUDE(FGPUSceneResourceParameters, GPUSceneParameters)
|
|
SHADER_PARAMETER_STRUCT_REF(FViewUniformShaderParameters, View)
|
|
SHADER_PARAMETER_STRUCT_INCLUDE(FHZBParameters, HZBParameters)
|
|
SHADER_PARAMETER_RDG_BUFFER_SRV(Buffer<uint>, InstanceOcclusionQueryBuffer)
|
|
SHADER_PARAMETER_RDG_BUFFER_UAV(RWBuffer<uint>, RWVisibilityMask) // One uint8/32 per instance (0 if instance is culled, non-0 otherwise)
|
|
SHADER_PARAMETER(float, OcclusionSlop)
|
|
SHADER_PARAMETER(uint32, ViewMask)
|
|
END_SHADER_PARAMETER_STRUCT()
|
|
};
|
|
|
|
class FInstanceCullingOcclusionQueryDebugPS : public FGlobalShader
|
|
{
|
|
DECLARE_GLOBAL_SHADER(FInstanceCullingOcclusionQueryDebugPS);
|
|
SHADER_USE_PARAMETER_STRUCT(FInstanceCullingOcclusionQueryDebugPS, FGlobalShader);
|
|
|
|
public:
|
|
static bool ShouldCompilePermutation(const FGlobalShaderPermutationParameters& Parameters)
|
|
{
|
|
return FDataDrivenShaderPlatformInfo::GetSupportsVertexShaderSRVs(Parameters.Platform);
|
|
}
|
|
|
|
static void ModifyCompilationEnvironment(const FGlobalShaderPermutationParameters& Parameters, FShaderCompilerEnvironment& OutEnvironment)
|
|
{
|
|
}
|
|
|
|
BEGIN_SHADER_PARAMETER_STRUCT(FParameters, )
|
|
END_SHADER_PARAMETER_STRUCT()
|
|
};
|
|
|
|
// Bind the debug shader classes to their entry points in InstanceCullingOcclusionQuery.usf
IMPLEMENT_GLOBAL_SHADER(
	FInstanceCullingOcclusionQueryDebugVS,
	"/Engine/Private/InstanceCulling/InstanceCullingOcclusionQuery.usf",
	"DebugMainVS",
	SF_Vertex);

IMPLEMENT_GLOBAL_SHADER(
	FInstanceCullingOcclusionQueryDebugPS,
	"/Engine/Private/InstanceCulling/InstanceCullingOcclusionQuery.usf",
	"DebugMainPS",
	SF_Pixel);
|
|
|
|
// Pass parameters of the debug draw: the parameters of both debug shaders plus the render target bindings
BEGIN_SHADER_PARAMETER_STRUCT(FOcclusionInstanceCullingDebugParameters, )
	SHADER_PARAMETER_STRUCT_INCLUDE(FInstanceCullingOcclusionQueryDebugVS::FParameters, VS)
	SHADER_PARAMETER_STRUCT_INCLUDE(FInstanceCullingOcclusionQueryDebugPS::FParameters, PS)
	RENDER_TARGET_BINDING_SLOTS()
END_SHADER_PARAMETER_STRUCT()
|
|
|
|
/**
 * Records the debug draw: an instanced line-list draw of the shared occlusion query box,
 * restricted to the view rectangle, using the debug vertex and pixel shaders.
 *
 * @param RHICmdList		Command list the draw is recorded into.
 * @param View				View providing the shader map and the viewport rectangle.
 * @param PassParameters	Pass parameters; the VS and PS sub-structs are bound to the shaders.
 * @param NumInstances		Instance count passed to the draw call.
 */
static void RenderInstanceOcclusionCullingDebug(
	FRHICommandList& RHICmdList,
	const FViewInfo& View,
	FOcclusionInstanceCullingDebugParameters* PassParameters,
	int32 NumInstances)
{
	TShaderMapRef<FInstanceCullingOcclusionQueryDebugVS> DebugVS(View.ShaderMap);
	TShaderMapRef<FInstanceCullingOcclusionQueryDebugPS> DebugPS(View.ShaderMap);

	FGraphicsPipelineStateInitializer PSOInit;
	RHICmdList.ApplyCachedRenderTargets(PSOInit);

	// Rasterize into the view rectangle only, full depth range
	const auto& Rect = View.ViewRect;
	RHICmdList.SetViewport(Rect.Min.X, Rect.Min.Y, 0.0f, Rect.Max.X, Rect.Max.Y, 1.0f);

	// Shaders and vertex layout
	PSOInit.BoundShaderState.VertexDeclarationRHI = GInstanceCullingOcclusionQueryBox.VertexDeclaration;
	PSOInit.BoundShaderState.VertexShaderRHI = DebugVS.GetVertexShader();
	PSOInit.BoundShaderState.PixelShaderRHI = DebugPS.GetPixelShader();

	// Fixed-function state
	PSOInit.PrimitiveType = PT_LineList;
	PSOInit.RasterizerState = TStaticRasterizerState<FM_Solid, CM_None>::GetRHI();
	PSOInit.DepthStencilState = TStaticDepthStencilState<false, CF_Always>::GetRHI(); // No depth test or write
	PSOInit.BlendState = TStaticBlendState<CW_RGBA, BO_Add, BF_One, BF_InverseSourceAlpha, BO_Add, BF_Zero, BF_One>::GetRHI(); // Premultiplied

	SetGraphicsPipelineState(RHICmdList, PSOInit, 0);

	// Null out graph resources the compiled shaders do not reference before binding the parameters
	ClearUnusedGraphResources(DebugVS, &PassParameters->VS);
	ClearUnusedGraphResources(DebugPS, &PassParameters->PS);

	SetShaderParameters(RHICmdList, DebugVS, DebugVS.GetVertexShader(), PassParameters->VS);
	SetShaderParameters(RHICmdList, DebugPS, DebugPS.GetPixelShader(), PassParameters->PS);

	RHICmdList.SetStreamSource(0, GInstanceCullingOcclusionQueryBox.VertexBuffer, 0);

	// 12 line primitives per instance, read from index 36 onwards of the shared box index buffer
	RHICmdList.DrawIndexedPrimitive(
		GInstanceCullingOcclusionQueryBox.IndexBuffer,
		0,		// BaseVertexIndex
		0,		// FirstInstance
		24,		// NumVertices
		36,		// StartIndex
		12,		// NumPrimitives
		NumInstances);
}
|
|
|
|
/**
 * Adds a raster pass that draws the occlusion query debug visualization for View on top of
 * its scene color, using the visibility buffer extracted from the previous frame.
 *
 * Does nothing when the view is not compatible (see IsCompatibleWithView()), when no previous
 * frame buffer exists, or when the GPU scene reports no instances.
 *
 * @param GraphBuilder		Render graph that receives the debug pass.
 * @param GPUScene			Source of the instance count and the GPU scene shader parameters.
 * @param View				View to visualize; its scene color / depth targets and HZB are used.
 * @param SceneTextures		Not read by this function; the targets are taken from View.GetSceneTextures().
 */
void FInstanceCullingOcclusionQueryRenderer::RenderDebug(FRDGBuilder& GraphBuilder, FGPUScene& GPUScene, const FViewInfo& View, FSceneTextures& SceneTextures)
{
	if (!IsCompatibleWithView(View) || !InstanceOcclusionQueryBuffer)
	{
		return;
	}

	const uint32 ViewMask = FindOrAddViewSlot(View);

	// Nothing to visualize without instances. Also avoids submitting a zero-instance draw
	// (RHI backends may clamp a zero instance count up to one - NOTE(review): verify per platform).
	const int32 NumInstances = GPUScene.GetNumInstances();
	if (NumInstances <= 0)
	{
		return;
	}

	const auto& ViewSceneTextures = View.GetSceneTextures();
	FRDGTextureRef SceneColor = ViewSceneTextures.Color.Target;
	FRDGTextureRef SceneDepth = ViewSceneTextures.Depth.Target;

	checkf(SceneDepth && IsHZBValid(View, EHZBType::FurthestHZB),
		TEXT("Occlusion query instance culling requires scene depth texture and HZB. See FInstanceCullingOcclusionQueryRenderer::IsCompatibleWithView()"));

	FRDGBufferRef InstanceOcclusionQueryBufferRDG = GraphBuilder.RegisterExternalBuffer(InstanceOcclusionQueryBuffer);
	const FGPUSceneResourceParameters GPUSceneParameters = GPUScene.GetShaderParameters(GraphBuilder);

	FOcclusionInstanceCullingDebugParameters* PassParameters = GraphBuilder.AllocParameters<FOcclusionInstanceCullingDebugParameters>();

	PassParameters->VS.OcclusionSlop = OCCLUSION_SLOP;
	PassParameters->VS.View = View.ViewUniformBuffer;
	PassParameters->VS.GPUSceneParameters = GPUSceneParameters;
	PassParameters->VS.InstanceOcclusionQueryBuffer = GraphBuilder.CreateSRV(InstanceOcclusionQueryBufferRDG, InstanceOcclusionQueryBufferFormat);
	PassParameters->VS.HZBParameters = GetHZBParameters(GraphBuilder, View, EHZBType::FurthestHZB);
	PassParameters->VS.ViewMask = ViewMask;

	// Draw over the existing scene color; depth is bound read-only
	PassParameters->RenderTargets[0] = FRenderTargetBinding(SceneColor, ERenderTargetLoadAction::ELoad);
	PassParameters->RenderTargets.DepthStencil = FDepthStencilBinding(SceneDepth,
		ERenderTargetLoadAction::ELoad, ERenderTargetLoadAction::ENoAction,
		FExclusiveDepthStencil::DepthRead_StencilNop);

	// NOTE(review): this event name is identical to the one used by the main occlusion query draw pass;
	// consider giving the debug pass a distinct name so the two can be told apart in GPU captures.
	GraphBuilder.AddPass(
		RDG_EVENT_NAME("InstanceCullingOcclusionQueryRenderer_Draw"),
		PassParameters, ERDGPassFlags::Raster | ERDGPassFlags::NeverCull,
		[PassParameters, NumInstances, &View](FRDGAsyncTask, FRHICommandList& RHICmdList)
		{
			RenderInstanceOcclusionCullingDebug(RHICmdList, View, PassParameters, NumInstances);
		});
}
|
|
|