// Copyright Epic Games, Inc. All Rights Reserved.

// Nanite depth export compute shader.
// For every pixel of the view rect, the DepthExport kernel reads the Nanite visibility buffer,
// depth tests the Nanite fragment against the existing scene depth, and (when any Nanite
// fragment in the tile survives) writes depth, stencil and the tile's HTile min/max encoding.
// Depending on permutation defines it also writes velocity (VELOCITY_EXPORT) and the shading
// mask (SHADING_MASK_EXPORT).

#include "../Common.ush"
#include "../SceneData.ush"
#include "../DeferredShadingCommon.ush"
#include "../VariableRateShading/VRSShadingRateCommon.ush"
#include "../HTileEncoding.ush"
#include "../VelocityCommon.ush"
#include "NaniteVertexDeformation.ush"

#if PLATFORM_SUPPORTS_HTILE_LOOKUP

#include "NaniteDataDecode.ush"
#include "NaniteAttributeDecode.ush"
#include "NaniteVertexFetch.ush"

MAX_OCCUPANCY
DISABLE_TARGET_OCCUPANCY_WARNING

// .x = platform config
// .y = tiles wide
// .z = "decal receive" stencil value
// .w = max visible clusters
// NOTE(review): DepthExport reads .y into a local named PixelsWide, which disagrees with
// "tiles wide" above - confirm which unit ComputeTileOffset expects.
uint4 DepthExportConfig;
// .xy is added to the pixel position as an offset, .zw is used as the exclusive upper bound (see DepthExport).
uint4 ViewRect;
// Non-zero selects the custom stencil path; otherwise the decal receiver stencil value is written.
uint bWriteCustomStencil;
// Forwarded to GetMaterialShadingBin / GetMaterialRasterBin.
uint MeshPassIndex;
// Forwarded to GetMaterialRasterBin.
uint RegularMaterialRasterBinCount;

// Requirements:
// COMPILER_SUPPORTS_TO_SCALAR_MEMORY
// COMPILER_SUPPORTS_WAVE_ONCE
// COMPILER_SUPPORTS_WAVE_MINMAX
// PLATFORM_SUPPORTS_HTILE_LOOKUP

// NOTE(review): the resource declarations below carry no element type template arguments.
// That looks like an extraction artifact (e.g. RWStructuredBuffer normally requires one) -
// confirm against the original file before compiling.
Texture2D VisBuffer64;
RWStructuredBuffer SceneHTile;
RWTexture2D SceneDepth;
RWTexture2D SceneStencil;
RWTexture2D Velocity;
#if SHADING_MASK_EXPORT
RWTexture2D ShadingMask;
#endif
StructuredBuffer RasterBinMeta;

// Encodes a tile's min/max depth into an HTile value, choosing the encoding from PlatformConfig.
// When IsHiStencilTileConfig(PlatformConfig) is true, the stencil + Z base/delta encoding is used
// with the HTILE_DEFAULT_* constants; otherwise this forwards to the two-argument
// EncodeTileMinMaxDepth overload.
//   MinDepth / MaxDepth - depth range of the tile
//   PlatformConfig      - platform HTile configuration (DepthExportConfig.x at the call site)
// Returns the encoded HTile value.
uint EncodeTileMinMaxDepth(float MinDepth, float MaxDepth, uint PlatformConfig)
{
	BRANCH
	if (IsHiStencilTileConfig(PlatformConfig))
	{
		return EncodeTileStencilZBaseDelta(
			MinDepth,
			MaxDepth,
			HTILE_DEFAULT_XX,
			HTILE_DEFAULT_SMEM,
			HTILE_DEFAULT_SR0,
			HTILE_DEFAULT_SR1,
			IsZCompareGE(PlatformConfig)
		);
	}
	else
	{
		return EncodeTileMinMaxDepth(MinDepth, MaxDepth);
	}
}

// Depth export kernel: one thread per pixel, one thread group per HTile tile.
//   GroupId     - thread group coordinate, used as the tile coordinate
//   ThreadIndex - flattened index of the thread within its group
// NOTE(review): the wave intrinsics below (WaveActiveAnyTrue / WaveActiveMin / WaveActiveMax /
// WaveIsFirstLane) are used as per-tile reductions, which assumes one wave covers the whole
// thread group - confirm for the target platforms.
[numthreads(HTILE_PIXELS_PER_TILE_WIDE, HTILE_PIXELS_PER_TILE_TALL, 1)]
void DepthExport(uint2 GroupId : SV_GroupID, uint ThreadIndex : SV_GroupIndex)
{
	// Map the flat thread index to a position inside the tile, then to a pixel offset by the view rect origin.
	// NOTE(review): the << 3 hard-codes 8 pixels per tile on each axis - presumably matches
	// HTILE_PIXELS_PER_TILE_WIDE / HTILE_PIXELS_PER_TILE_TALL; confirm.
	const uint2 PixelLocalPos = SwizzleThreadIndex(ThreadIndex & 63u);
	const uint2 PixelPos = ((GroupId << 3) | PixelLocalPos) + ViewRect.xy;

	// Threads outside the view rect exit here, before any wave intrinsic is executed,
	// so they do not take part in the reductions further down.
	if (any(PixelPos.xy >= ViewRect.zw))
	{
		return;
	}

	const uint PlatformConfig = DepthExportConfig.x;
	const uint PixelsWide = DepthExportConfig.y;

	const UlongType VisPixel = VisBuffer64[PixelPos.xy];

	// Calculate the HTile tile index and scalarize.
	const uint TileIndex = ToScalarMemory(ComputeTileOffset(PixelPos.xy, PixelsWide, PlatformConfig));

	// Split the packed visibility buffer entry into depth bits, visible cluster index and triangle index.
	uint DepthInt = 0;
	uint VisibleClusterIndex = 0;
	uint TriIndex = 0;
	UnpackVisPixel(VisPixel, DepthInt, VisibleClusterIndex, TriIndex);

	// Read the tile's current HTile value and pull out the fields the decompress helpers need.
	// NOTE(review): presumably ZMask is 4 bits at offset 0 and SMem is 2 bits at offset 8 -
	// confirm the BitFieldExtractU32 argument order. SMem falls back to 0x3 without hi-stencil.
	uint HTileValue = ToScalarMemory(SceneHTile[TileIndex]);
	uint SceneZMask = BitFieldExtractU32(HTileValue, 4u, 0u);
	uint SceneSMem = IsHiStencilTileConfig(PlatformConfig) ? BitFieldExtractU32(HTileValue, 2u, 8u) : 0x3;

	// Existing scene depth / stencil for this pixel, obtained through the decompress helpers.
	float SceneDepthValue = DecompressDepthValue(SceneDepth, SceneZMask, PixelPos.xy, ThreadIndex, 0.0f, PlatformConfig);
	uint SceneStencilValue = DecompressStencilValue(SceneStencil, SceneSMem, PixelPos.xy, 0);

#if SHADING_MASK_EXPORT
	// We need to fully write the shading mask for all pixels, since we intentionally skip clearing it to avoid an FCE
	// By default we write 0, which indicates non-Nanite, but every valid Nanite pixel will pack this value accordingly.
	uint EncodedMask = 0x0u;
#endif

	bool bWriteDepthStencil = false;

	// Avoid the parallel reduction and exports if the tile has no Nanite
	// pixels. This is an optimization of course, but it also will preserve
	// existing plane equation compression for scene depth tiles rasterized
	// outside of Nanite.
	// A cluster index of 0xFFFFFFFF is treated as "no Nanite fragment at this pixel".
	if (WaveActiveAnyTrue(VisibleClusterIndex != 0xFFFFFFFF)) // Tile has Nanite coverage
	{
		bool NanitePixelVisible = false;
		if (VisibleClusterIndex != 0xFFFFFFFF)
		{
			checkSlow(IsVisibleClusterIndexImposter(VisibleClusterIndex) || VisibleClusterIndex < DepthExportConfig.w);

			// Depth test the Nanite fragment against the existing scene depth.
			// NOTE(review): passing on >= implies larger depth values are nearer (reversed Z) - confirm.
			float NaniteDepthValue = asfloat(DepthInt);
			NanitePixelVisible = NaniteDepthValue >= SceneDepthValue;

			if (NanitePixelVisible)
			{
				FVisibleCluster VisibleCluster = GetVisibleCluster(VisibleClusterIndex);
				checkSlow(VisibleCluster.InstanceId < GetSceneData().MaxAllocatedInstanceId);

			#if VELOCITY_EXPORT
			#if INSTANCED_STEREO
				checkSlow(VisibleCluster.ViewId < 2);	// see INSTANCED_VIEW_COUNT in SceneView.h.
														// Velocity export uses a per-view TemporalAAJitter anyway, so it is a fair assumption that each NaniteView has a backing FViewInfo at this point.
				FNaniteView NaniteView = GetNaniteView(VisibleCluster.ViewId);
				ResolvedView = ResolveView(VisibleCluster.ViewId);
			#else
				FNaniteView NaniteView = GetNaniteView(0);
				ResolvedView = ResolveView();
			#endif
			#endif

				const FInstanceSceneData InstanceData = GetInstanceSceneDataUnchecked(VisibleCluster);
				const FPrimitiveSceneData PrimitiveData = GetPrimitiveData(InstanceData.PrimitiveId);
				const FCluster Cluster = GetCluster(VisibleCluster.PageIndex, VisibleCluster.ClusterIndex);
				// ShadingBin and bHasDistanceField are only consumed by the SHADING_MASK_EXPORT path below.
				const uint ShadingBin = GetMaterialShadingBin(Cluster, InstanceData.PrimitiveId, MeshPassIndex, TriIndex);
				// Decal receiving additionally requires decals to be shown for the view.
				const bool bIsDecalReceiver = (PrimitiveData.Flags & PRIMITIVE_SCENE_DATA_FLAG_DECAL_RECEIVER) != 0 && View.ShowDecalsMask > 0;
				const bool bHasDistanceField = (PrimitiveData.Flags & PRIMITIVE_SCENE_DATA_FLAG_HAS_DISTANCE_FIELD_REPRESENTATION) != 0;

				// The Nanite fragment wins: it becomes this pixel's depth for the export and the tile min/max below.
				SceneDepthValue = NaniteDepthValue;

				if (bWriteCustomStencil)
				{
					// CustomStencilValueAndMask carries the write mask in bits 8-15; once shifted down
					// it is applied to the same packed word to produce the stencil value.
					uint CustomStencilMask = PrimitiveData.CustomStencilValueAndMask & 0xFF00u;
					if (CustomStencilMask == 0)
					{
						// "Default" mask value of 0 means depth pass = replace, depth fail = keep
						// TODO: Proper support for ignore-depth, bitwise-or modes (requires a separate 32-bit target)
						CustomStencilMask = 0xFF00u;
					}

					CustomStencilMask = CustomStencilMask >> 8u;
					SceneStencilValue = PrimitiveData.CustomStencilValueAndMask & CustomStencilMask;
				}
				else
				{
					// Regular path: stencil is the "decal receive" value for decal receivers, otherwise 0.
					SceneStencilValue = select(bIsDecalReceiver, DepthExportConfig.z, 0u);
				}

			#if VELOCITY_EXPORT
				// Don't output velocity here for WPO clusters, as their velocity must be calculated in the base pass
				// NOTE(review): the Velocity write below is unconditional; bWPOEnabled is only passed to
				// CalculateNaniteVelocity, which presumably returns a "no velocity" value when it is set - confirm.
				bool bWPOEnabled = (VisibleCluster.Flags & NANITE_CULLING_FLAG_ENABLE_WPO) != 0;
				const bool bFallbackRasterBin = (VisibleCluster.Flags & NANITE_CULLING_FLAG_FALLBACK_RASTER) != 0;
				if (bWPOEnabled)
				{
					// The cluster may have WPO enabled, but we want to know if WPO is enabled by the material to decide to output velocity
					const uint RasterBinIndex = GetMaterialRasterBin(Cluster, InstanceData.PrimitiveId, MeshPassIndex, TriIndex, RegularMaterialRasterBinCount, bFallbackRasterBin);
					// Only the low 16 bits of MaterialFlags_DepthBlock are unpacked as material flags.
					const FNaniteMaterialFlags MaterialFlags = UnpackNaniteMaterialFlags(RasterBinMeta[RasterBinIndex].MaterialFlags_DepthBlock & 0xFFFFu);
					bWPOEnabled &= MaterialFlags.bWorldPositionOffset;
				}

				Velocity[PixelPos.xy] = CalculateNaniteVelocity(NaniteView, InstanceData, Cluster, VisibleCluster, float4(PixelPos, asfloat(DepthInt), 1.0f), TriIndex, PrimitiveData.Flags, bWPOEnabled);
			#endif

			#if SHADING_MASK_EXPORT
				// Visible Nanite pixel: replace the default 0 mask with the packed shading data.
				EncodedMask = PackShadingMask(
					ShadingBin,
					D3D12_SHADING_RATE_1X1,
					bIsDecalReceiver,
					bHasDistanceField,
					GetPrimitive_LightingChannelMask_FromFlags(PrimitiveData.Flags)
				);
			#endif
			}
		}

		// Only write out SceneDepth and expand SceneHTile if there are Nanite fragments that survive depth testing against the scene
		bWriteDepthStencil = WaveActiveAnyTrue(NanitePixelVisible);
	}

	// Tile depth range over all active lanes. SceneDepthValue holds the Nanite depth for lanes
	// that passed the depth test and the pre-existing scene depth for every other lane.
	const float TileMinSceneDepth = WaveActiveMin(SceneDepthValue);
	const float TileMaxSceneDepth = WaveActiveMax(SceneDepthValue);

	BRANCH
	if (bWriteDepthStencil)
	{
		// Every active lane rewrites its own depth and stencil, including lanes that kept the existing values.
		SceneDepth[PixelPos.xy] = SceneDepthValue; // Write uncompressed depth value out
		SceneStencil[PixelPos.xy] = SceneStencilValue;

		// A single lane writes the re-encoded HTile value for the whole tile.
		if (WaveIsFirstLane())
		{
			SceneHTile[TileIndex] = EncodeTileMinMaxDepth(TileMinSceneDepth, TileMaxSceneDepth, PlatformConfig);
		}
	}

#if SHADING_MASK_EXPORT
	// Written for every in-bounds pixel, Nanite or not (0 when no visible Nanite fragment).
	ShadingMask[PixelPos.xy] = EncodedMask;
#endif
}

// NOTE: A possible further optimization could be to avoid doing a clear depth target clear for materials
//       and instead set an empty tile's htile encoding to zmask=0x0 to represent a clear tile, and just
//       set the min/max z values to 0.0 - In theory, the depth clear value is in a global register, so
//       the zmask=0x0 should just ignore the rest of the encoding. The behavior on GCN around this isn't
//       really documented, so we probably need to do a test where we set the target to clear, make the
//       zrange 1.0/1.0 and the depth clear value 0.0. Then render something at z=0.5 and see what happens.

#else

// Variant compiled when PLATFORM_SUPPORTS_HTILE_LOOKUP is not set: the entry point has an empty body.
[numthreads(8, 8, 1)]
void DepthExport(uint3 PixelPos : SV_DispatchThreadID)
{
}

#endif