234 lines
8.3 KiB
HLSL
234 lines
8.3 KiB
HLSL
// Copyright Epic Games, Inc. All Rights Reserved.
|
|
|
|
#include "../Common.ush"
|
|
#include "../SceneData.ush"
|
|
#include "../DeferredShadingCommon.ush"
|
|
#include "../VariableRateShading/VRSShadingRateCommon.ush"
|
|
#include "../HTileEncoding.ush"
|
|
#include "../VelocityCommon.ush"
|
|
#include "NaniteVertexDeformation.ush"
|
|
|
|
#if PLATFORM_SUPPORTS_HTILE_LOOKUP
|
|
|
|
#include "NaniteDataDecode.ush"
|
|
#include "NaniteAttributeDecode.ush"
|
|
#include "NaniteVertexFetch.ush"
|
|
|
|
MAX_OCCUPANCY
|
|
DISABLE_TARGET_OCCUPANCY_WARNING
|
|
|
|
// Packed configuration for the depth export pass.
//   .x = platform config (forwarded to the HTile helpers as PlatformConfig)
//   .y = tiles wide
//        NOTE(review): DepthExport reads this component as "PixelsWide" and passes it to
//        ComputeTileOffset together with a pixel position - confirm which unit is intended.
//   .z = "decal receive" stencil value (written for decal receivers when custom stencil is off)
//   .w = max visible clusters (upper bound used by the checkSlow on VisibleClusterIndex)
uint4 DepthExportConfig;

// .xy is added to the group-relative pixel position; pixels at or beyond .zw are skipped.
uint4 ViewRect;

// Non-zero: stencil is written from the primitive's CustomStencilValueAndMask.
// Zero:     stencil is written as the decal receive value or 0.
uint bWriteCustomStencil;

// Forwarded to GetMaterialShadingBin / GetMaterialRasterBin.
uint MeshPassIndex;

// Forwarded to GetMaterialRasterBin.
uint RegularMaterialRasterBinCount;
|
|
|
|
// Requirements:
|
|
// COMPILER_SUPPORTS_TO_SCALAR_MEMORY
|
|
// COMPILER_SUPPORTS_WAVE_ONCE
|
|
// COMPILER_SUPPORTS_WAVE_MINMAX
|
|
// PLATFORM_SUPPORTS_HTILE_LOOKUP
|
|
|
|
// 64-bit visibility buffer. Each texel is unpacked by UnpackVisPixel into a depth,
// a visible cluster index and a triangle index.
Texture2D<UlongType> VisBuffer64;

// HTile metadata: one entry is read per tile and rewritten when Nanite depth is exported.
RWStructuredBuffer<uint> SceneHTile;

// Scene depth / stencil targets: decompressed on read, rewritten for tiles that take Nanite depth.
RWTexture2D<float> SceneDepth;
RWTexture2D<uint> SceneStencil;

// Per-pixel velocity output (only written when VELOCITY_EXPORT is enabled).
RWTexture2D<float4> Velocity;

#if SHADING_MASK_EXPORT
// Packed shading mask, written for every pixel inside the view rect.
RWTexture2D<uint> ShadingMask;
#endif

// Per raster bin metadata; the material flags are read from it in the velocity path.
StructuredBuffer<FNaniteRasterBinMeta> RasterBinMeta;
|
|
|
|
// Encodes a tile's min/max depth into an HTile value, choosing the encoding from PlatformConfig.
//   MinDepth / MaxDepth : depth bounds of the tile.
//   PlatformConfig      : platform HTile configuration bits.
// Returns the encoded HTile word.
uint EncodeTileMinMaxDepth(float MinDepth, float MaxDepth, uint PlatformConfig)
{
	BRANCH
	if (!IsHiStencilTileConfig(PlatformConfig))
	{
		// No hi-stencil in this configuration: defer to the two-argument overload
		// (declared outside this file).
		return EncodeTileMinMaxDepth(MinDepth, MaxDepth);
	}

	// Hi-stencil configuration: base/delta depth encoding with the default stencil fields.
	return EncodeTileStencilZBaseDelta(
		MinDepth,
		MaxDepth,
		HTILE_DEFAULT_XX,
		HTILE_DEFAULT_SMEM,
		HTILE_DEFAULT_SR0,
		HTILE_DEFAULT_SR1,
		IsZCompareGE(PlatformConfig)
	);
}
|
|
|
|
// Depth export entry point: one thread group per 8x8 pixel tile, one thread per pixel.
//
// For every pixel inside the view rect this:
//   1. unpacks the Nanite visibility buffer texel (depth, visible cluster index, triangle index),
//   2. decompresses the existing scene depth / stencil using the tile's HTile entry,
//   3. where the Nanite fragment passes the depth comparison (Nanite depth >= scene depth), replaces
//      depth and stencil and optionally exports velocity and a packed shading mask,
//   4. if any fragment in the tile passed, writes depth/stencil back uncompressed and re-encodes the
//      tile's HTile entry from the tile-wide min/max depth.
//
// NOTE(review): the tile-wide reductions use wave intrinsics, so this presumably relies on one wave
// covering the whole 64-thread group - confirm for the target platform.
[numthreads(HTILE_PIXELS_PER_TILE_WIDE, HTILE_PIXELS_PER_TILE_TALL, 1)]
void DepthExport(uint2 GroupId : SV_GroupID, uint ThreadIndex : SV_GroupIndex)
{
	// Swizzled position inside the group's 8x8 tile, then offset by the view rect origin.
	const uint2 TileLocalPos = SwizzleThreadIndex(ThreadIndex & 63u);
	const uint2 PixelPos = ((GroupId << 3) | TileLocalPos) + ViewRect.xy;

	// Pixels outside the view rect do nothing. These threads exit before any wave intrinsic below.
	if (PixelPos.x >= ViewRect.z || PixelPos.y >= ViewRect.w)
	{
		return;
	}

	const uint PlatformConfig = DepthExportConfig.x;
	const uint PixelsWide = DepthExportConfig.y;

	// Calculate the HTile tile index and scalarize.
	const uint TileIndex = ToScalarMemory(ComputeTileOffset(PixelPos.xy, PixelsWide, PlatformConfig));

	// Unpack this pixel's visibility buffer entry.
	uint DepthInt = 0;
	uint VisibleClusterIndex = 0;
	uint TriIndex = 0;
	UnpackVisPixel(VisBuffer64[PixelPos.xy], DepthInt, VisibleClusterIndex, TriIndex);

	// Read the tile's HTile word and use its fields to decompress the current scene depth / stencil.
	uint HTileValue = ToScalarMemory(SceneHTile[TileIndex]);
	uint SceneZMask = BitFieldExtractU32(HTileValue, 4u, 0u);

	uint SceneSMem = 0x3;
	if (IsHiStencilTileConfig(PlatformConfig))
	{
		SceneSMem = BitFieldExtractU32(HTileValue, 2u, 8u);
	}

	float SceneDepthValue = DecompressDepthValue(SceneDepth, SceneZMask, PixelPos.xy, ThreadIndex, 0.0f, PlatformConfig);
	uint SceneStencilValue = DecompressStencilValue(SceneStencil, SceneSMem, PixelPos.xy, 0);

#if SHADING_MASK_EXPORT
	// The shading mask is intentionally not cleared beforehand (avoids an FCE), so every pixel must be
	// written. 0 means non-Nanite; visible Nanite pixels overwrite it with their packed value below.
	uint EncodedMask = 0x0u;
#endif

	bool bWriteDepthStencil = false;

	// 0xFFFFFFFF marks a pixel with no Nanite fragment.
	const bool bHasNaniteFragment = (VisibleClusterIndex != 0xFFFFFFFF);

	// Skip the parallel reduction and exports entirely when the tile has no Nanite pixels. Besides
	// being cheaper, this preserves the existing plane equation compression of scene depth tiles
	// that were rasterized outside of Nanite.
	if (WaveActiveAnyTrue(bHasNaniteFragment)) // Tile has Nanite coverage
	{
		bool bNanitePixelVisible = false;

		if (bHasNaniteFragment)
		{
			checkSlow(IsVisibleClusterIndexImposter(VisibleClusterIndex) || VisibleClusterIndex < DepthExportConfig.w);

			float NaniteDepthValue = asfloat(DepthInt);

			// Depth test of the Nanite fragment against the existing scene depth.
			bNanitePixelVisible = NaniteDepthValue >= SceneDepthValue;
			if (bNanitePixelVisible)
			{
				FVisibleCluster VisibleCluster = GetVisibleCluster(VisibleClusterIndex);
				checkSlow(VisibleCluster.InstanceId < GetSceneData().MaxAllocatedInstanceId);

			#if VELOCITY_EXPORT
				#if INSTANCED_STEREO
					checkSlow(VisibleCluster.ViewId < 2); // see INSTANCED_VIEW_COUNT in SceneView.h.
					// Velocity export uses a per-view TemporalAAJitter anyway, so it is a fair assumption that each NaniteView has a backing FViewInfo at this point.
					FNaniteView NaniteView = GetNaniteView(VisibleCluster.ViewId);
					ResolvedView = ResolveView(VisibleCluster.ViewId);
				#else
					FNaniteView NaniteView = GetNaniteView(0);
					ResolvedView = ResolveView();
				#endif
			#endif

				const FInstanceSceneData InstanceData = GetInstanceSceneDataUnchecked(VisibleCluster);
				const FPrimitiveSceneData PrimitiveData = GetPrimitiveData(InstanceData.PrimitiveId);
				const FCluster Cluster = GetCluster(VisibleCluster.PageIndex, VisibleCluster.ClusterIndex);

				const uint ShadingBin = GetMaterialShadingBin(Cluster, InstanceData.PrimitiveId, MeshPassIndex, TriIndex);

				const bool bIsDecalReceiver = (PrimitiveData.Flags & PRIMITIVE_SCENE_DATA_FLAG_DECAL_RECEIVER) != 0 && View.ShowDecalsMask > 0;
				const bool bHasDistanceField = (PrimitiveData.Flags & PRIMITIVE_SCENE_DATA_FLAG_HAS_DISTANCE_FIELD_REPRESENTATION) != 0;

				// The Nanite fragment wins: its depth becomes the scene depth for this pixel.
				SceneDepthValue = NaniteDepthValue;

				if (bWriteCustomStencil)
				{
					// The write mask occupies bits 8..15 of CustomStencilValueAndMask.
					uint CustomStencilMask = (PrimitiveData.CustomStencilValueAndMask & 0xFF00u) >> 8u;
					if (CustomStencilMask == 0)
					{
						// "Default" mask value of 0 means depth pass = replace, depth fail = keep
						// TODO: Proper support for ignore-depth, bitwise-or modes (requires a separate 32-bit target)
						CustomStencilMask = 0xFFu;
					}
					SceneStencilValue = PrimitiveData.CustomStencilValueAndMask & CustomStencilMask;
				}
				else
				{
					SceneStencilValue = bIsDecalReceiver ? DepthExportConfig.z : 0u;
				}

			#if VELOCITY_EXPORT
				// Don't output velocity here for WPO clusters, as their velocity must be calculated in the base pass
				bool bWPOEnabled = (VisibleCluster.Flags & NANITE_CULLING_FLAG_ENABLE_WPO) != 0;
				const bool bFallbackRasterBin = (VisibleCluster.Flags & NANITE_CULLING_FLAG_FALLBACK_RASTER) != 0;

				if (bWPOEnabled)
				{
					// The cluster may have WPO enabled, but we want to know if WPO is enabled by the material to decide to output velocity
					const uint RasterBinIndex = GetMaterialRasterBin(Cluster, InstanceData.PrimitiveId, MeshPassIndex, TriIndex, RegularMaterialRasterBinCount, bFallbackRasterBin);
					const FNaniteMaterialFlags MaterialFlags = UnpackNaniteMaterialFlags(RasterBinMeta[RasterBinIndex].MaterialFlags_DepthBlock & 0xFFFFu);
					bWPOEnabled &= MaterialFlags.bWorldPositionOffset;
				}

				Velocity[PixelPos.xy] = CalculateNaniteVelocity(NaniteView, InstanceData, Cluster, VisibleCluster, float4(PixelPos, NaniteDepthValue, 1.0f), TriIndex, PrimitiveData.Flags, bWPOEnabled);
			#endif

			#if SHADING_MASK_EXPORT
				EncodedMask = PackShadingMask(
					ShadingBin,
					D3D12_SHADING_RATE_1X1,
					bIsDecalReceiver,
					bHasDistanceField,
					GetPrimitive_LightingChannelMask_FromFlags(PrimitiveData.Flags)
				);
			#endif
			}
		}

		// Only write out SceneDepth and expand SceneHTile if there are Nanite fragments that survive depth testing against the scene
		bWriteDepthStencil = WaveActiveAnyTrue(bNanitePixelVisible);
	}

	// Tile-wide depth bounds over the (possibly updated) per-pixel depth values.
	const float TileMinSceneDepth = WaveActiveMin(SceneDepthValue);
	const float TileMaxSceneDepth = WaveActiveMax(SceneDepthValue);

	BRANCH
	if (bWriteDepthStencil)
	{
		// Write the uncompressed depth and stencil values out for every pixel of the tile.
		SceneDepth[PixelPos.xy] = SceneDepthValue;
		SceneStencil[PixelPos.xy] = SceneStencilValue;

		// A single lane re-encodes the tile's HTile entry.
		if (WaveIsFirstLane())
		{
			SceneHTile[TileIndex] = EncodeTileMinMaxDepth(TileMinSceneDepth, TileMaxSceneDepth, PlatformConfig);
		}
	}

#if SHADING_MASK_EXPORT
	ShadingMask[PixelPos.xy] = EncodedMask;
#endif
}
|
|
|
|
// NOTE: A possible further optimization could be to avoid doing a depth target clear for materials
// and instead set an empty tile's htile encoding to zmask=0x0 to represent a clear tile, and just
// set the min/max z values to 0.0. In theory, the depth clear value is in a global register, so
// the zmask=0x0 should just ignore the rest of the encoding. The behavior on GCN around this isn't
// really documented, so we probably need to do a test where we set the target to clear, make the
// zrange 1.0/1.0 and the depth clear value 0.0. Then render something at z=0.5 and see what happens.
|
|
|
|
#else
|
|
|
|
// Variant compiled when PLATFORM_SUPPORTS_HTILE_LOOKUP is not set: the entry point exists but does
// no work (presumably so the shader still compiles on platforms without HTile lookup).
[numthreads(8, 8, 1)]
void DepthExport(uint3 PixelPos : SV_DispatchThreadID)
{
	// Intentionally empty.
}
|
|
|
|
#endif |