Files
UnrealEngine/Engine/Shaders/Private/Nanite/NaniteDepthExport.usf
2025-05-18 13:04:45 +08:00

234 lines
8.3 KiB
HLSL

// Copyright Epic Games, Inc. All Rights Reserved.
#include "../Common.ush"
#include "../SceneData.ush"
#include "../DeferredShadingCommon.ush"
#include "../VariableRateShading/VRSShadingRateCommon.ush"
#include "../HTileEncoding.ush"
#include "../VelocityCommon.ush"
#include "NaniteVertexDeformation.ush"
#if PLATFORM_SUPPORTS_HTILE_LOOKUP
#include "NaniteDataDecode.ush"
#include "NaniteAttributeDecode.ush"
#include "NaniteVertexFetch.ush"
// Compiler/occupancy hint macros (defined outside this file).
MAX_OCCUPANCY
DISABLE_TARGET_OCCUPANCY_WARNING
// Packed pass configuration:
// .x = platform config
// .y = tiles wide
// .z = "decal receive" stencil value
// .w = max visible clusters
// NOTE(review): DepthExport reads .y into a local named PixelsWide before passing it to
// ComputeTileOffset - confirm whether the unit is tiles or pixels.
uint4 DepthExportConfig;
// .xy is added to the per-thread pixel position as an offset; .zw is the exclusive upper
// bound that pixel positions are tested against.
uint4 ViewRect;
// Non-zero selects the custom stencil path (value taken from the primitive's
// CustomStencilValueAndMask) instead of the decal-receiver stencil value.
uint bWriteCustomStencil;
// Forwarded to GetMaterialShadingBin / GetMaterialRasterBin.
uint MeshPassIndex;
// Forwarded to GetMaterialRasterBin (used only when VELOCITY_EXPORT is enabled).
uint RegularMaterialRasterBinCount;
// Requirements:
// COMPILER_SUPPORTS_TO_SCALAR_MEMORY
// COMPILER_SUPPORTS_WAVE_ONCE
// COMPILER_SUPPORTS_WAVE_MINMAX
// PLATFORM_SUPPORTS_HTILE_LOOKUP
// Source visibility buffer; each texel is unpacked into depth, visible cluster index and triangle index.
Texture2D<UlongType> VisBuffer64;
// Per-tile HTile words; read for every tile and rewritten for tiles with visible Nanite pixels.
RWStructuredBuffer<uint> SceneHTile;
// Scene depth; read (via DecompressDepthValue) and written for tiles with visible Nanite pixels.
RWTexture2D<float> SceneDepth;
// Scene stencil; read (via DecompressStencilValue) and written alongside SceneDepth.
RWTexture2D<uint> SceneStencil;
// Velocity output, written only when VELOCITY_EXPORT is enabled.
RWTexture2D<float4> Velocity;
#if SHADING_MASK_EXPORT
// Packed shading mask output, written for every pixel inside the view rect.
RWTexture2D<uint> ShadingMask;
#endif
// Per raster bin metadata; the low 16 bits of MaterialFlags_DepthBlock are unpacked as material flags.
StructuredBuffer<FNaniteRasterBinMeta> RasterBinMeta;
// Builds the HTile word for a tile from its depth range, choosing the encoding
// that matches the platform configuration.
//
// MinDepth / MaxDepth - depth bounds of the tile.
// PlatformConfig      - platform config word (DepthExportConfig.x at the call site).
//
// Returns the encoded HTile value.
uint EncodeTileMinMaxDepth(float MinDepth, float MaxDepth, uint PlatformConfig)
{
	const bool bHiStencilTile = IsHiStencilTileConfig(PlatformConfig);

	BRANCH
	if (!bHiStencilTile)
	{
		// Plain min/max layout: defer to the two-argument overload (declared outside this file).
		return EncodeTileMinMaxDepth(MinDepth, MaxDepth);
	}

	// Hi-stencil layout: base/delta depth encoding with the default stencil fields.
	return EncodeTileStencilZBaseDelta(
		MinDepth,
		MaxDepth,
		HTILE_DEFAULT_XX,
		HTILE_DEFAULT_SMEM,
		HTILE_DEFAULT_SR0,
		HTILE_DEFAULT_SR1,
		IsZCompareGE(PlatformConfig)
	);
}
// Exports Nanite visibility buffer results into the scene depth/stencil targets and keeps
// the HTile metadata in sync. Optionally also writes velocity (VELOCITY_EXPORT) and the
// packed shading mask (SHADING_MASK_EXPORT).
//
// One thread handles one pixel and one thread group handles one HTile tile.
//
// GroupId     - tile coordinate of the thread group.
// ThreadIndex - flattened index of the thread inside the group.
[numthreads(HTILE_PIXELS_PER_TILE_WIDE, HTILE_PIXELS_PER_TILE_TALL, 1)]
void DepthExport(uint2 GroupId : SV_GroupID, uint ThreadIndex : SV_GroupIndex)
{
// The & 63 mask and << 3 shift hard-code an 8x8 pixel footprint per group.
// NOTE(review): presumably HTILE_PIXELS_PER_TILE_WIDE/TALL are both 8 - confirm.
const uint2 PixelLocalPos = SwizzleThreadIndex(ThreadIndex & 63u);
const uint2 PixelPos = ((GroupId << 3) | PixelLocalPos) + ViewRect.xy;
// Threads outside the view rect exit here, so they take no part in the wave operations
// further down and never write any output.
if (any(PixelPos.xy >= ViewRect.zw))
{
return;
}
const uint PlatformConfig = DepthExportConfig.x;
const uint PixelsWide = DepthExportConfig.y;
const UlongType VisPixel = VisBuffer64[PixelPos.xy];
// Calculate the HTile tile index and scalarize.
const uint TileIndex = ToScalarMemory(ComputeTileOffset(PixelPos.xy, PixelsWide, PlatformConfig));
uint DepthInt = 0;
uint VisibleClusterIndex = 0;
uint TriIndex = 0;
UnpackVisPixel(VisPixel, DepthInt, VisibleClusterIndex, TriIndex);
// Fetch the tile's current HTile word and use it to read back the existing scene
// depth and stencil for this pixel.
uint HTileValue = ToScalarMemory(SceneHTile[TileIndex]);
uint SceneZMask = BitFieldExtractU32(HTileValue, 4u, 0u);
// The stencil field is only extracted from the HTile word on hi-stencil configs; otherwise 0x3 is used.
uint SceneSMem = IsHiStencilTileConfig(PlatformConfig) ? BitFieldExtractU32(HTileValue, 2u, 8u) : 0x3;
float SceneDepthValue = DecompressDepthValue(SceneDepth, SceneZMask, PixelPos.xy, ThreadIndex, 0.0f, PlatformConfig);
uint SceneStencilValue = DecompressStencilValue(SceneStencil, SceneSMem, PixelPos.xy, 0);
#if SHADING_MASK_EXPORT
// We need to fully write the shading mask for all pixels, since we intentionally skip clearing it to avoid an FCE
// By default we write 0, which indicates non-Nanite, but every valid Nanite pixel will pack this value accordingly.
uint EncodedMask = 0x0u;
#endif
// Set only from a wave-wide vote below, so every active lane ends up with the same value.
bool bWriteDepthStencil = false;
// Avoid the parallel reduction and exports if the tile has no Nanite
// pixels. This is an optimization of course, but it also will preserve
// existing plane equation compression for scene depth tiles rasterized
// outside of Nanite.
// A VisibleClusterIndex of 0xFFFFFFFF marks a pixel without Nanite coverage.
if (WaveActiveAnyTrue(VisibleClusterIndex != 0xFFFFFFFF)) // Tile has Nanite coverage
{
bool NanitePixelVisible = false;
if (VisibleClusterIndex != 0xFFFFFFFF)
{
checkSlow(IsVisibleClusterIndexImposter(VisibleClusterIndex) || VisibleClusterIndex < DepthExportConfig.w);
float NaniteDepthValue = asfloat(DepthInt);
// The Nanite pixel wins when its depth is greater than or equal to the existing scene depth.
// NOTE(review): this implies a reversed-Z depth convention - confirm.
NanitePixelVisible = NaniteDepthValue >= SceneDepthValue;
if (NanitePixelVisible)
{
FVisibleCluster VisibleCluster = GetVisibleCluster(VisibleClusterIndex);
checkSlow(VisibleCluster.InstanceId < GetSceneData().MaxAllocatedInstanceId);
#if VELOCITY_EXPORT
#if INSTANCED_STEREO
checkSlow(VisibleCluster.ViewId < 2); // see INSTANCED_VIEW_COUNT in SceneView.h.
// Velocity export uses a per-view TemporalAAJitter anyway, so it is a fair assumption that each NaniteView has a backing FViewInfo at this point.
FNaniteView NaniteView = GetNaniteView(VisibleCluster.ViewId);
ResolvedView = ResolveView(VisibleCluster.ViewId);
#else
FNaniteView NaniteView = GetNaniteView(0);
ResolvedView = ResolveView();
#endif
#endif
const FInstanceSceneData InstanceData = GetInstanceSceneDataUnchecked(VisibleCluster);
const FPrimitiveSceneData PrimitiveData = GetPrimitiveData(InstanceData.PrimitiveId);
const FCluster Cluster = GetCluster(VisibleCluster.PageIndex, VisibleCluster.ClusterIndex);
const uint ShadingBin = GetMaterialShadingBin(Cluster, InstanceData.PrimitiveId, MeshPassIndex, TriIndex);
const bool bIsDecalReceiver = (PrimitiveData.Flags & PRIMITIVE_SCENE_DATA_FLAG_DECAL_RECEIVER) != 0 && View.ShowDecalsMask > 0;
const bool bHasDistanceField = (PrimitiveData.Flags & PRIMITIVE_SCENE_DATA_FLAG_HAS_DISTANCE_FIELD_REPRESENTATION) != 0;
// From here on this pixel carries the Nanite depth, which also feeds the tile min/max reduction below.
SceneDepthValue = NaniteDepthValue;
if (bWriteCustomStencil)
{
// Bits 8..15 of CustomStencilValueAndMask are used as the mask; the masked result is the stencil value.
uint CustomStencilMask = PrimitiveData.CustomStencilValueAndMask & 0xFF00u;
if (CustomStencilMask == 0)
{
// "Default" mask value of 0 means depth pass = replace, depth fail = keep
// TODO: Proper support for ignore-depth, bitwise-or modes (requires a separate 32-bit target)
CustomStencilMask = 0xFF00u;
}
CustomStencilMask = CustomStencilMask >> 8u;
SceneStencilValue = PrimitiveData.CustomStencilValueAndMask & CustomStencilMask;
}
else
{
// No custom stencil: decal receivers get the configured "decal receive" value, everything else 0.
SceneStencilValue = select(bIsDecalReceiver, DepthExportConfig.z, 0u);
}
#if VELOCITY_EXPORT
// Don't output velocity here for WPO clusters, as their velocity must be calculated in the base pass
bool bWPOEnabled = (VisibleCluster.Flags & NANITE_CULLING_FLAG_ENABLE_WPO) != 0;
const bool bFallbackRasterBin = (VisibleCluster.Flags & NANITE_CULLING_FLAG_FALLBACK_RASTER) != 0;
if (bWPOEnabled)
{
// The cluster may have WPO enabled, but we want to know if WPO is enabled by the material to decide to output velocity
const uint RasterBinIndex = GetMaterialRasterBin(Cluster, InstanceData.PrimitiveId, MeshPassIndex, TriIndex, RegularMaterialRasterBinCount, bFallbackRasterBin);
const FNaniteMaterialFlags MaterialFlags = UnpackNaniteMaterialFlags(RasterBinMeta[RasterBinIndex].MaterialFlags_DepthBlock & 0xFFFFu);
bWPOEnabled &= MaterialFlags.bWorldPositionOffset;
}
Velocity[PixelPos.xy] = CalculateNaniteVelocity(NaniteView, InstanceData, Cluster, VisibleCluster, float4(PixelPos, asfloat(DepthInt), 1.0f), TriIndex, PrimitiveData.Flags, bWPOEnabled);
#endif
#if SHADING_MASK_EXPORT
// Visible Nanite pixel: replace the default (0) mask with the packed shading data.
EncodedMask = PackShadingMask(
ShadingBin,
D3D12_SHADING_RATE_1X1,
bIsDecalReceiver,
bHasDistanceField,
GetPrimitive_LightingChannelMask_FromFlags(PrimitiveData.Flags)
);
#endif
}
}
// Only write out SceneDepth and expand SceneHTile if there are Nanite fragments that survive depth testing against the scene
bWriteDepthStencil = WaveActiveAnyTrue(NanitePixelVisible);
}
// Depth range across the active lanes, mixing Nanite depth (where it won) with existing scene depth.
// NOTE(review): treating this as the tile's range assumes one wave covers the whole tile - confirm.
const float TileMinSceneDepth = WaveActiveMin(SceneDepthValue);
const float TileMaxSceneDepth = WaveActiveMax(SceneDepthValue);
BRANCH
if (bWriteDepthStencil)
{
// Every active lane writes its pixel, including pixels that kept their existing scene value.
SceneDepth[PixelPos.xy] = SceneDepthValue; // Write uncompressed depth value out
SceneStencil[PixelPos.xy] = SceneStencilValue;
// A single lane writes the new HTile word for the tile.
if (WaveIsFirstLane())
{
SceneHTile[TileIndex] = EncodeTileMinMaxDepth(TileMinSceneDepth, TileMaxSceneDepth, PlatformConfig);
}
}
#if SHADING_MASK_EXPORT
// Written unconditionally for every pixel inside the view rect (see the comment at EncodedMask's declaration).
ShadingMask[PixelPos.xy] = EncodedMask;
#endif
}
// NOTE: A possible further optimization could be to avoid doing a depth target clear for materials
// and instead set an empty tile's htile encoding to zmask=0x0 to represent a clear tile, and just
// set the min/max z values to 0.0. In theory, the depth clear value is in a global register, so
// zmask=0x0 should just ignore the rest of the encoding. The behavior on GCN around this isn't
// really documented, so we probably need to do a test where we set the target to clear, make the
// zrange 1.0/1.0 and the depth clear value 0.0, then render something at z=0.5 and see what happens.
#else
// Fallback entry point compiled when PLATFORM_SUPPORTS_HTILE_LOOKUP is not set.
// Intentionally empty: it performs no reads or writes and only keeps the DepthExport
// entry point defined.
[numthreads(8, 8, 1)]
void DepthExport(uint3 PixelPos : SV_DispatchThreadID)
{
}
#endif