// Copyright Epic Games, Inc. All Rights Reserved.

#include "../Common.ush"
#include "../SceneData.ush"
#include "../VertexFactoryCommon.ush" // for TransformLocalToTranslatedWorld
#include "../Nanite/NaniteHZBCull.ush"

// NUM_THREADS_PER_GROUP must be defined before including InstanceCullingSetup.ush, because that file includes
// InstanceCullingLoadBalancer.ush, which declares groupshared uint ItemIndex[NUM_THREADS_PER_GROUP];
#ifndef NUM_THREADS_PER_GROUP_DEFAULT
#	define NUM_THREADS_PER_GROUP_DEFAULT 64
#endif // NUM_THREADS_PER_GROUP_DEFAULT

#ifndef NUM_THREADS_PER_GROUP
#	define NUM_THREADS_PER_GROUP NUM_THREADS_PER_GROUP_DEFAULT
#endif // NUM_THREADS_PER_GROUP

// InstanceCullingSetup.ush pulls some variables that get set by FInstanceProcessingGPULoadBalancer::SetShaderDefines.
// We only call that when USE_LOAD_BALANCER == 1
#if USE_LOAD_BALANCER
#define LOAD_BALANCER_SINGLE_INSTANCE_MODE 0
#include "InstanceCullingSetup.ush"
#endif

#ifndef HZB_DESIRED_FOOTPRINT_PIXELS
#	define HZB_DESIRED_FOOTPRINT_PIXELS 4
#endif

#if !PLATFORM_SUPPORTS_VERTEX_SHADER_SRVS
#error This shader requires accessing GPUScene buffers as SRVs from vertex stage
#endif

#ifndef DIM_MULTI_VIEW
#	define DIM_MULTI_VIEW 0
#endif

// Per-instance visibility bit mask, indexed by GPUScene instance id. Written by MainCS and MainPS.
RWBuffer<uint> RWVisibilityMask;
// Number of entries processed by MainCS when the load balancer is not used.
uint NumInstances;
// Bit(s) in RWVisibilityMask that belong to the view being processed.
uint ViewMask;
// Amount added to the local-space bounds extent of every instance before testing / rasterizing its proxy box.
float OcclusionSlop;

#define EInstanceVisibilityStatus uint // Not all platforms support native enums
#define IVS_Hidden          0 // Instance is hidden or invalid, no occlusion query needed
#define IVS_Visible         1 // Instance is definitely visible, no occlusion query needed
#define IVS_PossiblyVisible 2 // Occlusion query is needed to determine visibility
#define IVS_Incompatible    3 // Instance can't use occlusion queries

/**
 * Loads the scene data of an instance and classifies it for the occlusion query pass.
 *
 * @param InstanceId       GPUScene instance index.
 * @param OutInstanceData  Receives the loaded instance data (always written, even for invalid instances).
 * @return One of the IVS_* values:
 *         IVS_Hidden          - instance is invalid or rejected by the HZB test,
 *         IVS_Incompatible    - the primitive does not allow instance culling occlusion queries,
 *         IVS_Visible         - conservative result (crosses near plane, frustum-culled, or touches the HZB view rect edge),
 *         IVS_PossiblyVisible - passed the HZB test and should be refined by a per-pixel occlusion query.
 */
EInstanceVisibilityStatus GetInstanceDataAndVisibility(uint InstanceId, out FInstanceSceneData OutInstanceData)
{
	const FInstanceSceneData InstanceData = GetInstanceSceneData(InstanceId);

	OutInstanceData = InstanceData;

	if (!InstanceData.ValidInstance)
	{
		return IVS_Hidden;
	}

	const FPrimitiveSceneData PrimitiveData = GetPrimitiveData(InstanceData.PrimitiveId);

	if ((PrimitiveData.Flags & PRIMITIVE_SCENE_DATA_FLAG_INSTANCE_CULLING_OCCLUSION_QUERIES) == 0u)
	{
		// Instance culling occlusion queries are not allowed for this primitive.
		// Report it as incompatible; MainCS writes incompatible instances as visible.
		return IVS_Incompatible;
	}

	float3 BoxOrigin = InstanceData.LocalBoundsCenter;
	float3 BoxExtent = InstanceData.LocalBoundsExtent + OcclusionSlop;

	const bool bIsOrtho = IsOrthoProjection(ResolvedView.ViewToClip);

	float4x4 LocalToTranslatedWorld = DFMultiplyTranslationDemote(InstanceData.LocalToWorld, ResolvedView.PreViewTranslation);

	FFrustumCullData Cull = BoxCullFrustum(
		BoxOrigin.xyz,
		BoxExtent.xyz,
		LocalToTranslatedWorld,
		ResolvedView.TranslatedWorldToClip,
		ResolvedView.ViewToClip,
		bIsOrtho,
		true /* near clip */,
		false /* skip culling */
	);

	if (Cull.bCrossesNearPlane)
	{
		// Occlusion query can only be used when the proxy box is fully in front of the camera.
		return IVS_Visible;
	}

	if (!Cull.bIsVisible)
	{
		// Conservative culling heuristic:
		// If instance is frustum-culled this frame, it may become visible next frame when instance visibility buffer is consumed.
		// Marking the instance as visible allows InstanceCullBuildInstanceIdBufferCS to perform a more accurate check next frame.
		// NOTE: There is still a possibility of instance getting occlusion-culled this frame while it should be visible.
		// Two-pass occlusion culling algorithm is the only robust solution.
		return IVS_Visible;
	}

	int4 HZBViewRect = int4(0, 0, HZBViewSize.x, HZBViewSize.y);
	FScreenRect Rect = GetScreenRect(HZBViewRect, Cull, HZB_DESIRED_FOOTPRINT_PIXELS);

	if (any(Rect.Pixels == HZBViewRect))
	{
		// Conservative culling heuristic:
		// Objects may be visible with HZB test but occluded with per-pixel depth test during this frame.
		// However in the next frame the object may be completely visible and will cause a visible pop-in.
		// Taking the more conservative HZB visibility test result for objects at the edges of the screen reduces the artifacts.
		// This pop-in is still *possible* with HZB test, it is simply slightly less likely.
		// NOTE: There is still a possibility of instance getting occlusion-culled this frame while it should be visible.
		// Two-pass occlusion culling algorithm is the only robust solution.
		return IVS_Visible;
	}

	if (IsVisibleHZB(Rect, true /*bSample4x4*/))
	{
		// Instance is visible using conservative HZB test and may benefit from a more accurate per-pixel occlusion test.
		return IVS_PossiblyVisible;
	}
	else
	{
		// Instance is definitely hidden and we can skip per-pixel test as it has non-trivial cost.
		return IVS_Hidden;
	}
}

// TODO: could also add a wave-op variant in the future if there is evidence of significant perf overhead
groupshared uint GroupNumVisibleInstances;                      // Number of IVS_PossiblyVisible instances found by this group
groupshared uint GroupVisibleInstances[NUM_THREADS_PER_GROUP];  // Their instance ids, compacted
groupshared uint GroupOutputBaseIndex;                          // First slot in OutInstanceIdBuffer reserved for this group

// Indirect draw arguments (FRHIDrawIndexedIndirectParameters layout) for the proxy box draw.
RWBuffer<uint> OutIndirectArgsBuffer;
// Compacted list of instance ids that need a per-pixel occlusion query.
RWBuffer<uint> OutInstanceIdBuffer;

// Remap linear visible index to a visible GPUScene instance index
Buffer<uint> InstanceIdBuffer;

/**
 * Resolves the GPUScene instance id handled by the calling compute thread.
 *
 * @param GroupId           Thread group id (used by the load balancer path only).
 * @param GroupThreadIndex  Index of the thread inside its group (used by the load balancer path only).
 * @param DispatchThreadId  Global thread index (used by the non-load-balancer path only).
 * @param InstanceId        Receives the instance id; written on both paths, even when the function returns false.
 * @return true when the thread maps to an instance that should be processed. On the load balancer path,
 *         dynamic instances are reported as not valid and are therefore skipped by the caller.
 */
bool LoadInstanceId(uint3 GroupId, uint GroupThreadIndex, uint DispatchThreadId, inout uint InstanceId)
{
#if USE_LOAD_BALANCER
	// No need to properly fetch the final instance id for dynamic primitives since we're going to skip them anyway
	uint DynamicInstanceIdOffset = 0;
	uint DynamicInstanceIdMax = 0;
	FInstanceCullingSetup InstanceCullingSetup = LoadInstanceCullingSetup(GroupId, GroupThreadIndex, DynamicInstanceIdOffset, DynamicInstanceIdMax, 0);
	InstanceId = InstanceCullingSetup.InstanceId;
	return InstanceCullingSetup.bValid && !InstanceCullingSetup.bIsDynamic;
#else
	const bool bValidInvocation = DispatchThreadId < NumInstances;
	// Clamp so that out-of-range threads still read a valid element.
	// NOTE(review): NumInstances - 1 wraps around when NumInstances == 0; presumably the pass is not dispatched in that case - confirm.
	uint ClampedThreadId = min(DispatchThreadId, NumInstances - 1);
	InstanceId = InstanceIdBuffer[ClampedThreadId];
	return bValidInvocation;
#endif
}

/**
 * Compute entry point: one instance per thread.
 *
 * Classifies each instance with GetInstanceDataAndVisibility, writes its final (or cleared) value into
 * RWVisibilityMask, and compacts the ids of IVS_PossiblyVisible instances into OutInstanceIdBuffer while
 * adding their count to the InstanceCount slot of OutIndirectArgsBuffer.
 */
[numthreads(NUM_THREADS_PER_GROUP, 1, 1)]
void MainCS(uint3 GroupId : SV_GroupID, uint GroupThreadIndex : SV_GroupIndex, uint DispatchThreadId : SV_DispatchThreadID)
{
	ResolvedView = ResolveView();

	if (GroupThreadIndex == 0)
	{
		GroupNumVisibleInstances = 0;
	}

	uint InstanceId = 0;
	const bool bValidInvocation = LoadInstanceId(GroupId, GroupThreadIndex, DispatchThreadId, InstanceId);

	// Make the zeroed group counter visible to all threads before any of them increments it.
	GroupMemoryBarrierWithGroupSync();

	if (DispatchThreadId == 0)
	{
		// First thread fills the indirect draw args

		// Fill FRHIDrawIndexedIndirectParameters
		OutIndirectArgsBuffer[0] = 36; // IndexCountPerInstance -- 12 triangles per cube
		// OutIndirectArgsBuffer[1] = 0; // InstanceCount (filled previously by ClearUAV as we're atomically adding to it)
		OutIndirectArgsBuffer[2] = 0; // StartIndexLocation
		OutIndirectArgsBuffer[3] = 0; // BaseVertexLocation
		OutIndirectArgsBuffer[4] = 0; // StartInstanceLocation
	}

	FInstanceSceneData InstanceData = (FInstanceSceneData)0;
	EInstanceVisibilityStatus Status = IVS_Hidden;

	if (bValidInvocation)
	{
		Status = GetInstanceDataAndVisibility(InstanceId, /*out*/ InstanceData);

#if DIM_MULTI_VIEW
		// Only touch the bits of the current view, keep the bits of the other views.
		const uint OldVisibilityMask = RWVisibilityMask[InstanceId];
		const uint MaskValueHidden = OldVisibilityMask & (~ViewMask);
		const uint MaskValueVisible = OldVisibilityMask | ViewMask;
#else // DIM_MULTI_VIEW
		const uint MaskValueHidden = 0;
		const uint MaskValueVisible = ViewMask;
#endif // DIM_MULTI_VIEW

		if (Status == IVS_Visible || Status == IVS_Incompatible)
		{
			RWVisibilityMask[InstanceId] = MaskValueVisible; // this is the final value
		}
		else if (Status == IVS_Hidden)
		{
			RWVisibilityMask[InstanceId] = MaskValueHidden; // this is the final value
		}
		else // Status == IVS_PossiblyVisible
		{
			RWVisibilityMask[InstanceId] = MaskValueHidden; // clear the value which will be updated by pixel shader later

			uint LocalOutputIndex = 0;
			InterlockedAdd(GroupNumVisibleInstances, 1, LocalOutputIndex);
			GroupVisibleInstances[LocalOutputIndex] = InstanceId;
		}
	}

	// All threads must have appended to the group list before its size is published.
	GroupMemoryBarrierWithGroupSync();

	if (GroupThreadIndex == 0)
	{
		// One thread per group atomically adds to InstanceCount
		InterlockedAdd(OutIndirectArgsBuffer[1], GroupNumVisibleInstances, GroupOutputBaseIndex);
	}

	// GroupOutputBaseIndex must be visible to all threads before they write their output slot.
	GroupMemoryBarrierWithGroupSync();

	if (GroupThreadIndex < GroupNumVisibleInstances)
	{
		OutInstanceIdBuffer[GroupOutputBaseIndex + GroupThreadIndex] = GroupVisibleInstances[GroupThreadIndex];
	}
}

/**
 * Vertex entry point for the occlusion query proxy boxes.
 *
 * Remaps SV_InstanceID through InstanceIdBuffer, scales InPosition by the instance's local bounds extent
 * (plus OcclusionSlop), and outputs the clip-space position together with the instance id and the
 * visibility mask value that MainPS will write.
 *
 * @param InPosition  Proxy box vertex position (presumably a unit cube corner - verify against the vertex buffer).
 */
void MainVS(
	float3 InPosition : ATTRIBUTE0,
	uint InstanceId : SV_InstanceID,
	out nointerpolation uint OutInstanceId : INSTANCE_ID,
	out nointerpolation uint OutVisibilityMask : VISIBILITY_MASK,
	out float4 OutPosition : SV_POSITION
)
{
	ResolvedView = ResolveView();

	FInstanceSceneData InstanceData = (FInstanceSceneData)0;

	InstanceId = InstanceIdBuffer[InstanceId];

	// Only the instance data is needed here; the returned visibility status is intentionally ignored.
	GetInstanceDataAndVisibility(InstanceId, /*out*/ InstanceData);

	float3 LocalPosition = (InPosition * (InstanceData.LocalBoundsExtent + OcclusionSlop) * InstanceData.DeterminantSign) + InstanceData.LocalBoundsCenter;
	float4 WorldPosition = TransformLocalToTranslatedWorld(LocalPosition, InstanceData.LocalToWorld);
	OutPosition = mul(WorldPosition, ResolvedView.TranslatedWorldToClip);
	OutInstanceId = InstanceId;

#if DIM_MULTI_VIEW
	const uint OldVisibilityMask = RWVisibilityMask[InstanceId];
	OutVisibilityMask = OldVisibilityMask | ViewMask;
#else // DIM_MULTI_VIEW
	OutVisibilityMask = ViewMask;
#endif // DIM_MULTI_VIEW
}

/**
 * Pixel entry point for the occlusion query proxy boxes.
 * Every pixel that is shaded writes the mask value computed by MainVS into RWVisibilityMask for its instance.
 */
EARLYDEPTHSTENCIL
void MainPS(
	in nointerpolation uint InstanceId : INSTANCE_ID,
	in nointerpolation uint VisibilityMask : VISIBILITY_MASK,
	out float4 OutColor : SV_Target0
)
{
	RWVisibilityMask[InstanceId] = VisibilityMask;
	OutColor = float4(1, 0, 0, 1);
}

// Debug utilities

// Visibility mask values read by the debug view to tell whether an instance was marked visible.
Buffer<uint> InstanceOcclusionQueryBuffer;

/**
 * Debug vertex entry point: draws the proxy box of an instance and forwards a colour code to DebugMainPS.
 *
 * Unlike MainVS, SV_InstanceID is used directly as the GPUScene instance id (no InstanceIdBuffer remap).
 * The IVS_* values are reused as colour codes in OutDebugVisibility:
 *   IVS_Hidden          - rejected without an occlusion query,
 *   IVS_Visible         - accepted without an occlusion query,
 *   IVS_PossiblyVisible - occlusion query candidate whose bit is set in InstanceOcclusionQueryBuffer,
 *   IVS_Incompatible    - occlusion query candidate whose bit is not set in InstanceOcclusionQueryBuffer.
 * Invalid or incompatible instances get a NaN position so that nothing is rasterized for them.
 */
void DebugMainVS(
	float3 InPosition : ATTRIBUTE0,
	uint InstanceId : SV_InstanceID,
	out nointerpolation uint OutDebugVisibility : INSTANCE_DEBUG_VISIBILITY,
	out float4 OutPosition : SV_POSITION
)
{
	ResolvedView = ResolveView();

	FInstanceSceneData InstanceData;
	EInstanceVisibilityStatus VisibilityStatus = GetInstanceDataAndVisibility(InstanceId, /*out*/ InstanceData);

	if (InstanceData.ValidInstance && VisibilityStatus != IVS_Incompatible)
	{
		float3 LocalPosition = (InPosition * (InstanceData.LocalBoundsExtent + OcclusionSlop) * InstanceData.DeterminantSign) + InstanceData.LocalBoundsCenter;
		float4 WorldPosition = TransformLocalToTranslatedWorld(LocalPosition, InstanceData.LocalToWorld);
		OutPosition = mul(WorldPosition, ResolvedView.TranslatedWorldToClip);

		const bool bWasVisibleLastFrame = (InstanceOcclusionQueryBuffer[InstanceId] & ViewMask) == ViewMask;

		if (VisibilityStatus == IVS_PossiblyVisible)
		{
			OutDebugVisibility = bWasVisibleLastFrame ? IVS_PossiblyVisible : IVS_Incompatible;
		}
		else
		{
			OutDebugVisibility = bWasVisibleLastFrame ? IVS_Visible : IVS_Hidden;
		}
	}
	else
	{
		OutPosition = asfloat(0xFFFFFFFF);
		OutDebugVisibility = IVS_Hidden;
	}
}

/**
 * Debug pixel entry point: converts the colour code produced by DebugMainVS into an output colour.
 */
void DebugMainPS(
	in nointerpolation uint DebugVisibility : INSTANCE_DEBUG_VISIBILITY,
	out float4 OutColor : SV_Target0
)
{
	switch (DebugVisibility)
	{
	case IVS_Hidden:
		// Definitely invisible due to HZB test.
		// No occlusion query performed.
		OutColor = float4(0.1, 0.0, 0.0, 0.0); // red
		break;
	case IVS_Visible:
		// Definitely visible due to near plane intersection or other heuristics.
		// No occlusion query performed.
		OutColor = float4(0.1, 0.1, 0.0, 0.0); // yellow
		break;
	case IVS_PossiblyVisible:
		// Visible due to occlusion query.
		OutColor = float4(0.0, 0.025, 0.0, 0.0); // green
		break;
	case IVS_Incompatible:
		// Invisible due to occlusion query.
		OutColor = float4(0.25, 0.0, 0.25, 0.0); // magenta
		break;
	default:
		OutColor = (float4)0;
		break;
	}
}