Files
UnrealEngine/Engine/Shaders/Private/InstanceCulling/InstanceCullingOcclusionQuery.usf
2025-05-18 13:04:45 +08:00

342 lines
12 KiB
HLSL

// Copyright Epic Games, Inc. All Rights Reserved.
#include "../Common.ush"
#include "../SceneData.ush"
#include "../VertexFactoryCommon.ush" // for TransformLocalToTranslatedWorld
#include "../Nanite/NaniteHZBCull.ush"
// NUM_THREADS_PER_GROUP must be defined before including InstanceCullingSetup.ush, because it includes InstanceCullingLoadBalancer.ush, which declares groupshared uint ItemIndex[NUM_THREADS_PER_GROUP];
// Thread-group size: NUM_THREADS_PER_GROUP_DEFAULT (64 unless already defined) is only a fallback,
// used when NUM_THREADS_PER_GROUP has not been defined before this point.
#ifndef NUM_THREADS_PER_GROUP_DEFAULT
# define NUM_THREADS_PER_GROUP_DEFAULT 64
#endif // NUM_THREADS_PER_GROUP_DEFAULT
#ifndef NUM_THREADS_PER_GROUP
# define NUM_THREADS_PER_GROUP NUM_THREADS_PER_GROUP_DEFAULT
#endif // NUM_THREADS_PER_GROUP
// InstanceCullingSetup.ush pulls some variables that get set by FInstanceProcessingGPULoadBalancer::SetShaderDefines. We only call that when USE_LOAD_BALANCER == 1
#if USE_LOAD_BALANCER
#define LOAD_BALANCER_SINGLE_INSTANCE_MODE 0
#include "InstanceCullingSetup.ush"
#endif
// Footprint value passed to GetScreenRect when building the HZB test rectangle (defaults to 4 when not already defined).
#ifndef HZB_DESIRED_FOOTPRINT_PIXELS
# define HZB_DESIRED_FOOTPRINT_PIXELS 4
#endif
// Compilation is rejected outright on platforms that do not report vertex-stage SRV support.
#if !PLATFORM_SUPPORTS_VERTEX_SHADER_SRVS
#error This shader requires accessing GPUScene buffers as SRVs from vertex stage
#endif
// Multi-view permutation switch (off by default). When enabled, MainCS and MainVS read the existing
// RWVisibilityMask value and only add or remove the ViewMask bits instead of overwriting the whole entry.
#ifndef DIM_MULTI_VIEW
# define DIM_MULTI_VIEW 0
#endif
// Per-instance visibility bit mask, indexed by instance id. Written by MainCS (final or cleared value) and by MainPS.
RWBuffer<uint> RWVisibilityMask;
// Upper bound for DispatchThreadId in the non-load-balancer path of LoadInstanceId; threads at or beyond it are invalid.
uint NumInstances;
// Bit(s) of the visibility mask that belong to the view being processed.
uint ViewMask;
// Amount added to each instance's LocalBoundsExtent when building the occlusion proxy box.
float OcclusionSlop;
// Result codes returned by GetInstanceDataAndVisibility.
#define EInstanceVisibilityStatus uint // Not all platforms support native enums
#define IVS_Hidden 0 // Instance is hidden or invalid, no occlusion query needed
#define IVS_Visible 1 // Instance is definitely visible, no occlusion query needed
#define IVS_PossiblyVisible 2 // Occlusion query is needed to determine visibility
#define IVS_Incompatible 3 // Instance can't use occlusion queries
/**
 * Loads the scene data for an instance and classifies its visibility for the current ResolvedView.
 *
 * @param InstanceId       GPUScene instance index to look up.
 * @param OutInstanceData  Receives the instance scene data; assigned on every return path.
 * @return IVS_Hidden          - the instance is invalid, or its proxy box fails the HZB test.
 *         IVS_Incompatible    - the owning primitive does not allow instance culling occlusion queries.
 *         IVS_Visible         - the box crosses the near plane, is frustum-culled, or its screen rect touches the HZB view rect bounds.
 *         IVS_PossiblyVisible - the box passes the HZB test and should be refined by a per-pixel occlusion query.
 */
EInstanceVisibilityStatus GetInstanceDataAndVisibility(uint InstanceId, out FInstanceSceneData OutInstanceData)
{
	const FInstanceSceneData Instance = GetInstanceSceneData(InstanceId);
	OutInstanceData = Instance;

	// Invalid instances never need a query.
	if (!Instance.ValidInstance)
	{
		return IVS_Hidden;
	}

	// Primitives that have not opted into instance culling occlusion queries are reported as incompatible
	// (callers treat that the same way as "visible").
	const FPrimitiveSceneData Primitive = GetPrimitiveData(Instance.PrimitiveId);
	const bool bQueriesAllowed = (Primitive.Flags & PRIMITIVE_SCENE_DATA_FLAG_INSTANCE_CULLING_OCCLUSION_QUERIES) != 0u;
	if (!bQueriesAllowed)
	{
		return IVS_Incompatible;
	}

	// Occlusion proxy: the local bounds box grown by OcclusionSlop.
	float3 ProxyCenter = Instance.LocalBoundsCenter;
	float3 ProxyExtent = Instance.LocalBoundsExtent + OcclusionSlop;
	float4x4 LocalToTranslatedWorld = DFMultiplyTranslationDemote(Instance.LocalToWorld, ResolvedView.PreViewTranslation);
	const bool bOrthoView = IsOrthoProjection(ResolvedView.ViewToClip);

	FFrustumCullData FrustumResult = BoxCullFrustum(
		ProxyCenter,
		ProxyExtent,
		LocalToTranslatedWorld,
		ResolvedView.TranslatedWorldToClip,
		ResolvedView.ViewToClip,
		bOrthoView, true /* near clip */, false /* skip culling */
	);

	// Two conservative exits that both report the instance as visible:
	//  * Near plane crossing: an occlusion query can only be used when the proxy box is fully in front of the camera.
	//  * Frustum-culled this frame: the instance may become visible next frame, when the visibility buffer is consumed.
	//    Reporting it as visible lets InstanceCullBuildInstanceIdBufferCS perform a more accurate check next frame.
	// NOTE: an instance can still be occlusion-culled this frame while it should be visible.
	// A two-pass occlusion culling algorithm is the only robust solution.
	if (FrustumResult.bCrossesNearPlane || !FrustumResult.bIsVisible)
	{
		return IVS_Visible;
	}

	int4 HZBViewRect = int4(0, 0, HZBViewSize.x, HZBViewSize.y);
	FScreenRect ScreenRect = GetScreenRect(HZBViewRect, FrustumResult, HZB_DESIRED_FOOTPRINT_PIXELS);

	// Conservative heuristic for boxes whose screen rect reaches a bound of the HZB view rect:
	// such objects may pass the HZB test yet fail the per-pixel depth test this frame, then be completely
	// visible next frame and visibly pop in. Keeping the more conservative HZB-style result at the screen
	// edges makes that pop-in less likely (it remains possible).
	// NOTE: an instance can still be occlusion-culled this frame while it should be visible.
	// A two-pass occlusion culling algorithm is the only robust solution.
	if (any(ScreenRect.Pixels == HZBViewRect))
	{
		return IVS_Visible;
	}

	// Failing the conservative HZB test means the instance is definitely hidden,
	// so the per-pixel test (which has non-trivial cost) can be skipped.
	if (!IsVisibleHZB(ScreenRect, true /*bSample4x4*/))
	{
		return IVS_Hidden;
	}

	// Passed the HZB test; a more accurate per-pixel occlusion test may still reject it.
	return IVS_PossiblyVisible;
}
// TODO: could also add a wave-op variant in the future if there is evidence of significant perf overhead
// Number of IVS_PossiblyVisible instances gathered by the current thread group (reset by thread 0 in MainCS).
groupshared uint GroupNumVisibleInstances;
// Per-group compacted list of instance ids that need an occlusion query; at most one entry per thread.
groupshared uint GroupVisibleInstances[NUM_THREADS_PER_GROUP];
// Start offset of this group's entries in OutInstanceIdBuffer, as returned by the atomic add on OutIndirectArgsBuffer[1].
groupshared uint GroupOutputBaseIndex;
// Indirect draw arguments filled by MainCS; element 1 (InstanceCount) is accumulated atomically across groups.
RWBuffer<uint> OutIndirectArgsBuffer;
// Compacted list of instance ids that MainCS selected for a per-pixel occlusion query.
RWBuffer<uint> OutInstanceIdBuffer;
// Remap linear visible index to a visible GPUScene instance index
Buffer<uint> InstanceIdBuffer;
/**
 * Resolves the GPUScene instance id that the calling compute thread should process.
 *
 * @param GroupId           Thread group id (used by the load balancer path only).
 * @param GroupThreadIndex  Flattened index of the thread inside its group (load balancer path only).
 * @param DispatchThreadId  Global thread index (direct path only).
 * @param InstanceId        Receives the resolved instance id.
 * @return true when the thread has an instance to process; false for out-of-range threads and,
 *         in the load balancer path, for invalid or dynamic entries.
 */
bool LoadInstanceId(uint3 GroupId, uint GroupThreadIndex, uint DispatchThreadId, inout uint InstanceId)
{
#if USE_LOAD_BALANCER
	// Dynamic primitives are rejected below, so their final instance id never has to be resolved properly;
	// a zero offset and max are sufficient.
	uint DynamicIdOffset = 0;
	uint DynamicIdMax = 0;
	FInstanceCullingSetup Setup = LoadInstanceCullingSetup(GroupId, GroupThreadIndex, DynamicIdOffset, DynamicIdMax, 0);
	InstanceId = Setup.InstanceId;
	const bool bUsable = Setup.bValid && !Setup.bIsDynamic;
	return bUsable;
#else
	// Out-of-range threads still perform the load, but from the last valid slot, so the buffer is never
	// indexed past NumInstances - 1. Their result is flagged invalid through the return value.
	// NOTE(review): NumInstances - 1 wraps around when NumInstances is 0, which disables the clamp -
	// presumably such dispatches are never issued; confirm against the calling code.
	const uint LastValidIndex = NumInstances - 1;
	InstanceId = InstanceIdBuffer[min(DispatchThreadId, LastValidIndex)];
	return DispatchThreadId < NumInstances;
#endif
}
// Compute entry point. Each thread classifies one instance for the current view and writes its RWVisibilityMask entry.
// Instances that still need a per-pixel occlusion query (IVS_PossiblyVisible) are compacted into OutInstanceIdBuffer
// and counted in OutIndirectArgsBuffer[1], which serves as the InstanceCount of the indirect draw arguments.
[numthreads(NUM_THREADS_PER_GROUP, 1, 1)]
void MainCS(uint3 GroupId : SV_GroupID, uint GroupThreadIndex : SV_GroupIndex, uint DispatchThreadId : SV_DispatchThreadID)
{
ResolvedView = ResolveView();
// Thread 0 resets the per-group counter; the barrier further down publishes the reset before any thread increments it.
if (GroupThreadIndex == 0)
{
GroupNumVisibleInstances = 0;
}
uint InstanceId = 0;
const bool bValidInvocation = LoadInstanceId(GroupId, GroupThreadIndex, DispatchThreadId, InstanceId);
GroupMemoryBarrierWithGroupSync();
if (DispatchThreadId == 0)
{
// First thread fills the indirect draw args
// Fill FRHIDrawIndexedIndirectParameters
OutIndirectArgsBuffer[0] = 36; // IndexCountPerInstance -- 12 triangles per cube
// OutIndirectArgsBuffer[1] = 0; // InstanceCount (filled previously by ClearUAV as we're atomically adding to it)
OutIndirectArgsBuffer[2] = 0; // StartIndexLocation
OutIndirectArgsBuffer[3] = 0; // BaseVertexLocation
OutIndirectArgsBuffer[4] = 0; // StartInstanceLocation
}
FInstanceSceneData InstanceData = (FInstanceSceneData)0;
// Invalid invocations keep IVS_Hidden and write nothing, but still reach every barrier below.
EInstanceVisibilityStatus Status = IVS_Hidden;
if (bValidInvocation)
{
Status = GetInstanceDataAndVisibility(InstanceId, /*out*/ InstanceData);
// Multi-view: only the bits in ViewMask are added or removed, the rest of the existing mask is preserved.
// Single view: the whole entry is overwritten.
#if DIM_MULTI_VIEW
const uint OldVisibilityMask = RWVisibilityMask[InstanceId];
const uint MaskValueHidden = OldVisibilityMask & (~ViewMask);
const uint MaskValueVisible = OldVisibilityMask | ViewMask;
#else // DIM_MULTI_VIEW
const uint MaskValueHidden = 0;
const uint MaskValueVisible = ViewMask;
#endif // DIM_MULTI_VIEW
// Instances that cannot use occlusion queries are treated as visible.
if (Status == IVS_Visible || Status == IVS_Incompatible)
{
RWVisibilityMask[InstanceId] = MaskValueVisible; // this is the final value
}
else if (Status == IVS_Hidden)
{
RWVisibilityMask[InstanceId] = MaskValueHidden; // this is the final value
}
else // Status == IVS_PossiblyVisible
{
RWVisibilityMask[InstanceId] = MaskValueHidden; // clear the value which will be updated by pixel shader later
// Reserve a slot in the group-local list; the atomic add returns the previous counter value as the slot index.
uint LocalOutputIndex = 0;
InterlockedAdd(GroupNumVisibleInstances, 1, LocalOutputIndex);
GroupVisibleInstances[LocalOutputIndex] = InstanceId;
}
}
// Wait until every thread of the group has appended its instance before the count is read.
GroupMemoryBarrierWithGroupSync();
if (GroupThreadIndex == 0)
{
// One thread per group atomically adds to InstanceCount
InterlockedAdd(OutIndirectArgsBuffer[1], GroupNumVisibleInstances, GroupOutputBaseIndex);
}
// Publish GroupOutputBaseIndex (the value of InstanceCount before this group's addition) to the whole group.
GroupMemoryBarrierWithGroupSync();
// The first GroupNumVisibleInstances threads each copy one entry of the group-local list to its global slot.
if (GroupThreadIndex < GroupNumVisibleInstances)
{
OutInstanceIdBuffer[GroupOutputBaseIndex + GroupThreadIndex] = GroupVisibleInstances[GroupThreadIndex];
}
}
/**
 * Vertex shader for the occlusion proxy boxes. Each drawn instance is one box built from the
 * local bounds of a GPUScene instance, grown by OcclusionSlop.
 *
 * @param InPosition         Box vertex position; scaled by the box extent and offset by the box center.
 * @param InstanceId         Index of the drawn instance; remapped through InstanceIdBuffer to a GPUScene instance id.
 * @param OutInstanceId      GPUScene instance id forwarded to the pixel shader.
 * @param OutVisibilityMask  Mask value the pixel shader stores for this instance.
 * @param OutPosition        Clip-space position of the box vertex.
 */
void MainVS(
	float3 InPosition : ATTRIBUTE0,
	uint InstanceId : SV_InstanceID,
	out nointerpolation uint OutInstanceId : INSTANCE_ID,
	out nointerpolation uint OutVisibilityMask : VISIBILITY_MASK,
	out float4 OutPosition : SV_POSITION
)
{
	ResolvedView = ResolveView();

	// Remap the linear draw-instance index to the GPUScene instance id.
	InstanceId = InstanceIdBuffer[InstanceId];

	// Only the instance data is needed here. Fetch it directly instead of going through
	// GetInstanceDataAndVisibility, whose frustum / HZB classification result would be discarded.
	const FInstanceSceneData InstanceData = GetInstanceSceneData(InstanceId);

	// DeterminantSign is folded into the scale of the box vertices
	// (presumably to keep the triangle winding consistent for mirrored instances - confirm).
	float3 LocalPosition = (InPosition * (InstanceData.LocalBoundsExtent + OcclusionSlop) * InstanceData.DeterminantSign) + InstanceData.LocalBoundsCenter;
	float4 WorldPosition = TransformLocalToTranslatedWorld(LocalPosition, InstanceData.LocalToWorld);
	OutPosition = mul(WorldPosition, ResolvedView.TranslatedWorldToClip);
	OutInstanceId = InstanceId;

#if DIM_MULTI_VIEW
	// Keep the bits already stored for the instance and add this view's bits.
	const uint OldVisibilityMask = RWVisibilityMask[InstanceId];
	OutVisibilityMask = OldVisibilityMask | ViewMask;
#else // DIM_MULTI_VIEW
	OutVisibilityMask = ViewMask;
#endif // DIM_MULTI_VIEW
}
// Pixel shader for the occlusion proxy boxes: every fragment that executes stores the "visible" mask value
// for its instance. EARLYDEPTHSTENCIL is defined outside this file; presumably it forces depth/stencil
// testing before the shader runs, so that only unoccluded fragments perform the write - confirm.
EARLYDEPTHSTENCIL
void MainPS(
	in nointerpolation uint InstanceId : INSTANCE_ID,
	in nointerpolation uint VisibilityMask : VISIBILITY_MASK,
	out float4 OutColor : SV_Target0
)
{
	// Constant opaque red output.
	OutColor = float4(1, 0, 0, 1);

	// Record the visible mask value computed by the vertex shader for this instance.
	RWVisibilityMask[InstanceId] = VisibilityMask;
}
// Debug utilities
// Per-instance visibility mask read by DebugMainVS; an instance counts as "visible last frame" when all ViewMask bits are set in its entry.
Buffer<uint> InstanceOcclusionQueryBuffer;
/**
 * Debug vertex shader: draws the occlusion proxy box of an instance and tags it with a debug
 * status that combines this frame's classification with the mask stored in InstanceOcclusionQueryBuffer.
 *
 * @param InPosition          Box vertex position; scaled by the box extent and offset by the box center.
 * @param InstanceId          GPUScene instance id (used directly, without remapping).
 * @param OutDebugVisibility  Status code consumed by DebugMainPS to pick a color.
 * @param OutPosition         Clip-space position, or an all-ones bit pattern for instances that are not drawn.
 */
void DebugMainVS(
	float3 InPosition : ATTRIBUTE0,
	uint InstanceId : SV_InstanceID,
	out nointerpolation uint OutDebugVisibility : INSTANCE_DEBUG_VISIBILITY,
	out float4 OutPosition : SV_POSITION
)
{
	ResolvedView = ResolveView();

	FInstanceSceneData Instance;
	EInstanceVisibilityStatus Status = GetInstanceDataAndVisibility(InstanceId, /*out*/ Instance);

	const bool bSkipInstance = !Instance.ValidInstance || Status == IVS_Incompatible;
	if (bSkipInstance)
	{
		// Invalid instances and those that cannot use occlusion queries get an all-ones (NaN) position,
		// presumably so that the primitive is discarded - confirm.
		OutPosition = asfloat(0xFFFFFFFF);
		OutDebugVisibility = IVS_Hidden;
	}
	else
	{
		float3 LocalPosition = (InPosition * (Instance.LocalBoundsExtent + OcclusionSlop) * Instance.DeterminantSign) + Instance.LocalBoundsCenter;
		float4 WorldPosition = TransformLocalToTranslatedWorld(LocalPosition, Instance.LocalToWorld);
		OutPosition = mul(WorldPosition, ResolvedView.TranslatedWorldToClip);

		const bool bQueryCandidate = (Status == IVS_PossiblyVisible);
		const uint StoredMask = InstanceOcclusionQueryBuffer[InstanceId];
		const bool bVisibleLastFrame = (StoredMask & ViewMask) == ViewMask;

		// Status codes are reused as debug categories:
		//   query candidate, stored visible -> IVS_PossiblyVisible
		//   query candidate, stored hidden  -> IVS_Incompatible
		//   otherwise,       stored visible -> IVS_Visible
		//   otherwise,       stored hidden  -> IVS_Hidden
		if (bVisibleLastFrame)
		{
			OutDebugVisibility = bQueryCandidate ? IVS_PossiblyVisible : IVS_Visible;
		}
		else
		{
			OutDebugVisibility = bQueryCandidate ? IVS_Incompatible : IVS_Hidden;
		}
	}
}
/**
 * Debug pixel shader: converts the debug status produced by DebugMainVS into a color.
 *
 * @param DebugVisibility  Status code written by DebugMainVS.
 * @param OutColor         Color for the status; zero for any unrecognized value.
 */
void DebugMainPS(
	in nointerpolation uint DebugVisibility : INSTANCE_DEBUG_VISIBILITY,
	out float4 OutColor : SV_Target0
)
{
	if (DebugVisibility == IVS_Hidden)
	{
		// Definitely invisible due to HZB test.
		// No occlusion query performed.
		OutColor = float4(0.1, 0.0, 0.0, 0.0); // red
	}
	else if (DebugVisibility == IVS_Visible)
	{
		// Definitely visible due to near plane intersection or other heuristics.
		// No occlusion query performed.
		OutColor = float4(0.1, 0.1, 0.0, 0.0); // yellow
	}
	else if (DebugVisibility == IVS_PossiblyVisible)
	{
		// Visible due to occlusion query.
		OutColor = float4(0.0, 0.025, 0.0, 0.0); // green
	}
	else if (DebugVisibility == IVS_Incompatible)
	{
		// Invisible due to occlusion query.
		OutColor = float4(0.25, 0.0, 0.25, 0.0); // magenta
	}
	else
	{
		// Unknown status value.
		OutColor = (float4)0;
	}
}