Files
UnrealEngine/Engine/Shaders/Private/Nanite/NaniteDice.ush
2025-05-18 13:04:45 +08:00

350 lines
10 KiB
HLSL

// Copyright Epic Games, Inc. All Rights Reserved.
#pragma once
#include "NaniteRasterizationCommon.ush"
#include "NaniteTessellation.ush"
#if NANITE_TESSELLATION
// When non-zero, FDiceTask::CacheToLDS stores each lane's Vert.PointClip / Vert.NormalClip in the
// groupshared arrays below, and FDiceTask::CreateChild reads the triangle corners back from them.
#define NANITE_TESSELLATION_DICE_USE_LDS 1
// Rasterizes one already set-up diced triangle, choosing the pixel writer that matches the render target.
// For virtual texture targets that are not single-page, a writer with a page table policy is used
// (FCachedPageTable with late VSM page translation, FFetchPageTable otherwise).
// Every other case uses the default TNaniteWritePixel.
void RasterizeDicedTri(
	FRasterTri		Tri,
	FRaster			Raster,
	FMaterialShader	Shader,
	uint			PixelValue,
	uint2			VisualizeValues )
{
#if VIRTUAL_TEXTURE_TARGET
	if( !Raster.bSinglePage )
	{
	#if NANITE_LATE_VSM_PAGE_TRANSLATION
		TNaniteWritePixel< FMaterialShader, FCachedPageTable > PagedWriter = { Raster, Shader, PixelValue, VisualizeValues };
	#else
		// Page table lookups need the target mip and the matching page table level offset
		TNaniteWritePixel< FMaterialShader, FFetchPageTable > PagedWriter =
		{
			Raster,
			Shader,
			PixelValue,
			VisualizeValues,
			Shader.NaniteView.TargetMipLevel,
			CalcPageTableLevelOffset( Shader.NaniteView.TargetLayerIndex, Shader.NaniteView.TargetMipLevel )
		};
	#endif
		RasterizeTri_Rect( Tri, PagedWriter );
	}
	else
#endif
	{
		TNaniteWritePixel< FMaterialShader > DirectWriter = { Raster, Shader, PixelValue, VisualizeValues };
		RasterizeTri_Rect( Tri, DirectWriter );
	}
}
// Group-shared copy of Vert.PointClip. Written by FDiceTask::CacheToLDS (indexed by wave lane index)
// and read by FDiceTask::CreateChild (indexed by the patch's vertex indexes).
groupshared float4 GroupPointPackedClip[ THREADGROUP_SIZE ]; // TODO: Convert to PackedClip ?
// Group-shared copy of Vert.NormalClip, written and read alongside GroupPointPackedClip.
groupshared float4 GroupNormalPackedClip[ THREADGROUP_SIZE ];
// Work item for dicing a patch and rasterizing the resulting triangles.
// Init() sets up a patch, CreateChild() builds the per-triangle task from a parent lane's patch,
// and RunChild() evaluates and rasterizes a single diced triangle.
struct FDiceTask
{
	FRaster					Raster;
	FMaterialShader			Shader;
	uint					PixelValue;
	uint2					VisualizeValues;
	float4					UVDensities;
	bool					bReverseWinding;

	FNaniteTransformedVert	Vert;
	FTessellatedPatch		TessellatedPatch;
	uint4					Encoded;

	// Bits 0-7, 8-15, 16-23: patch vertex indexes x, y, z. Bits 24 and up: triangle index.
	uint					PatchData;

	// Vertex cache
	float3					CachedPackedSubpixelPosition;

	// Initializes the tessellated patch, packs the vertex and triangle indexes into PatchData
	// and clears the cached vertex position.
	void Init( float3 TessFactors, uint3 VertIndexes, uint TriIndex )
	{
		TessellatedPatch.Init( TessFactors, VertIndexes, true );

		PatchData =
			( VertIndexes.x <<  0 ) |
			( VertIndexes.y <<  8 ) |
			( VertIndexes.z << 16 ) |
			( TriIndex      << 24 );

		CachedPackedSubpixelPosition = 0.0f;
	}

	// Builds a child task. Raster, shader and shading parameters are copied from the calling lane,
	// while the patch description (TessellatedPatch, Encoded, PatchData) is read from ParentLaneIndex.
	// Vert and CachedPackedSubpixelPosition of the returned task are not assigned here.
	FDiceTask CreateChild( uint ParentLaneIndex )
	{
		// "this" is broken: https://github.com/microsoft/DirectXShaderCompiler/issues/4914
		FDiceTask Child;// = this;

		// State taken from the calling lane
		Child.Raster			= Raster;
		Child.Shader			= Shader;
		Child.PixelValue		= PixelValue;
		Child.VisualizeValues	= VisualizeValues;
		Child.UVDensities		= UVDensities;
		Child.bReverseWinding	= bReverseWinding;

		// Patch description taken from the parent lane
		Child.TessellatedPatch	= WaveReadLaneAt( TessellatedPatch, ParentLaneIndex );
		Child.Encoded			= WaveReadLaneAt( Encoded, ParentLaneIndex );
		Child.PatchData			= WaveReadLaneAt( PatchData, ParentLaneIndex );

		// Unpack the three 8 bit vertex indexes stored by Init()
		uint3 CornerIndexes = uint3( Child.PatchData >> 0, Child.PatchData >> 8, Child.PatchData >> 16 ) & 0xff;

		FNaniteTransformedTri PatchTri = MakeTransformedNaniteTriangle( Vert, CornerIndexes );

	#if NANITE_TESSELLATION_DICE_USE_LDS
		// Alleviate DS pressure by using wide LDS loads instead of single component permutes
		for( uint CornerIndex = 0; CornerIndex < 3; ++CornerIndex )
		{
			const uint LDSIndex = CornerIndexes[ CornerIndex ];
			PatchTri.Verts[ CornerIndex ].PointClip		= GroupPointPackedClip[ LDSIndex ];
			PatchTri.Verts[ CornerIndex ].NormalClip	= GroupNormalPackedClip[ LDSIndex ];
		}
	#endif

		Child.Shader.TransformedTri = PatchTri; // TODO mutable. This is weird

		return Child;
	}

	// Stores this lane's Vert.PointClip / Vert.NormalClip in the groupshared arrays at the wave lane index,
	// then issues a group memory barrier with sync. Does nothing when NANITE_TESSELLATION_DICE_USE_LDS is 0.
	void CacheToLDS()
	{
	#if NANITE_TESSELLATION_DICE_USE_LDS
		const uint Lane = WaveGetLaneIndex();

		GroupPointPackedClip[ Lane ]	= Vert.PointClip;
		GroupNormalPackedClip[ Lane ]	= Vert.NormalClip;

		GroupMemoryBarrierWithGroupSync();
	#endif
	}

	// Evaluates and rasterizes one diced triangle of the patch. Defined out of line.
	void RunChild( inout FDiceTask ParentTask, bool bActive, uint LocalItemIndex );
};
//groupshared float3 VertexCache[ THREADGROUP_SIZE ];
//#define VertCache(x) VertexCache[ QueueOffset + ( (x) & ( LaneCount - 1 ) ) ]
// Evaluates and rasterizes one diced triangle (LocalItemIndex) of this task's tessellated patch.
// In the active (#if 1) path each lane evaluates a single vertex and obtains the other two triangle
// corners from other lanes through wave reads of ParentTask.CachedPackedSubpixelPosition.
//   ParentTask     - its CachedPackedSubpixelPosition is read across lanes and then overwritten with
//                    the position this lane evaluated.
//   bActive        - when false the triangle is marked invalid and not rasterized; the vertex evaluation
//                    and wave reads above that check still execute.
//   LocalItemIndex - index passed to TessellatedPatch.GetIndexes / GetVert.
void FDiceTask::RunChild( inout FDiceTask ParentTask, bool bActive, uint LocalItemIndex )
{
// Init() stores the triangle index in bits 24 and up of PatchData.
uint PatchIndex = PatchData >> 24;
uint3 VertIndexes = TessellatedPatch.GetIndexes( LocalItemIndex );
// Swap the last two indexes to flip the triangle winding.
if( bReverseWinding )
VertIndexes.yz = VertIndexes.zy;
float4 Verts[3];
// Only this first path is compiled; the '#elif 0' and '#else' paths below are disabled.
#if 1
// Not used further in this path; only the commented-out derivative expressions below mention it.
const float3 TessFactors = TessellatedPatch.GetTessFactors();
FBarycentrics Barycentrics;
Barycentrics.Value = TessellatedPatch.GetVert( LocalItemIndex );
// Barycentric derivatives are zeroed.
Barycentrics.Value_dx = 0; // float3( -1, 1, 0 ) / TessFactors.x;
Barycentrics.Value_dy = 0; // float3( 0, -1, 1 ) / TessFactors.y;
const bool bOrtho = IsOrthoProjection( Shader.NaniteView.ViewToClip );
float3 CornerPackedSubpixel0; // sub-pixel xy, linear z
// TODO: Unify these paths by having EvaluateDomain operate directly in PackedClip space
BRANCH
if (bOrtho)
{
// Optimize out .w work and lane permutes
const float3 PointPackedClip = Shader.EvaluateDomain( UVDensities, Barycentrics ).xyz;
CornerPackedSubpixel0 = PointPackedClip.xyz;
}
else
{
// Optimize out .z work and lane permutes
const float3 PointPackedClip = Shader.EvaluateDomain( UVDensities, Barycentrics ).xyw;
// PointPackedClip.z holds the evaluated .w here: divide xy by it and keep it in the z slot.
CornerPackedSubpixel0 = float3( PointPackedClip.xy / PointPackedClip.z, PointPackedClip.z );
}
// Apply the raster viewport scale/bias, then snap xy down to whole units.
CornerPackedSubpixel0.xy = CornerPackedSubpixel0.xy * Raster.ViewportScale + Raster.ViewportBias;
CornerPackedSubpixel0.xy = floor( CornerPackedSubpixel0.xy );
const int3 RelativeVertIndexes = WaveGetLaneIndex() - ( LocalItemIndex - VertIndexes ); // Relative to thread 0 in wave
// Wrap into the wave's lane range (the mask assumes the lane count is a power of two).
const uint3 ReadLaneIndex = uint3( RelativeVertIndexes ) & ( WaveGetLaneCount() - 1u );
// First read: positions that were cached before this call overwrites them below.
float3 CornerPackedSubpixel1 = WaveReadLaneAt( ParentTask.CachedPackedSubpixelPosition, ReadLaneIndex.y );
float3 CornerPackedSubpixel2 = WaveReadLaneAt( ParentTask.CachedPackedSubpixelPosition, ReadLaneIndex.z );
// Publish the vertex this lane just evaluated.
ParentTask.CachedPackedSubpixelPosition = CornerPackedSubpixel0;
// Second read: for a non-negative relative index take the freshly published value,
// otherwise keep the value from the first read.
CornerPackedSubpixel1 = select( RelativeVertIndexes.y >= 0, WaveReadLaneAt( ParentTask.CachedPackedSubpixelPosition, ReadLaneIndex.y ), CornerPackedSubpixel1 );
CornerPackedSubpixel2 = select( RelativeVertIndexes.z >= 0, WaveReadLaneAt( ParentTask.CachedPackedSubpixelPosition, ReadLaneIndex.z ), CornerPackedSubpixel2 );
BRANCH
if (bOrtho)
{
// Orthographic: the packed xyz is used directly with w = 1.
Verts[0] = float4( CornerPackedSubpixel0, 1.0f );
Verts[1] = float4( CornerPackedSubpixel1, 1.0f );
Verts[2] = float4( CornerPackedSubpixel2, 1.0f );
}
else
{
// Perspective: z = ClipZFromLinearZ( linear z ) / linear z, w = 1 / linear z.
Verts[0] = float4( CornerPackedSubpixel0.xy, float2( ClipZFromLinearZ( Shader.NaniteView, CornerPackedSubpixel0.z ), 1.0f ) / CornerPackedSubpixel0.z );
Verts[1] = float4( CornerPackedSubpixel1.xy, float2( ClipZFromLinearZ( Shader.NaniteView, CornerPackedSubpixel1.z ), 1.0f ) / CornerPackedSubpixel1.z );
Verts[2] = float4( CornerPackedSubpixel2.xy, float2( ClipZFromLinearZ( Shader.NaniteView, CornerPackedSubpixel2.z ), 1.0f ) / CornerPackedSubpixel2.z );
}
#elif 0
// Disabled path: groupshared vertex cache variant. It refers to names (FirstVert, NumCached, PackedIndexes,
// VertCache) that are not declared in the visible code.
// Grab what's there for this triangle before updating cache. Otherwise cache would need to be double size.
bool3 VertRead = false;
UNROLL
for( uint k = 0; k < 3; k++ )
{
if( FirstVert + VertIndexes[k] < NumCached )
{
Verts[k] = VertCache( FirstVert + VertIndexes[k] );
VertRead[k] = true;
}
}
GroupMemoryBarrier();
bool bNewVertex = PackedIndexes & (1 << 30);
if( bNewVertex )
{
uint MaxVertIndex = max3( VertIndexes.x, VertIndexes.y, VertIndexes.z );
float3 UVW = TessellatedPatch.GetVert( MaxVertIndex );
UVW = TransformBarycentrics( UVW );
FBarycentrics Barycentrics;
Barycentrics.Value = UVW;
Barycentrics.Value_dx = 0;
Barycentrics.Value_dy = 0;
VertCache( FirstVert + MaxVertIndex ) = CalculateSubpixelCoordinates( Raster, Shader.EvaluateDomain( PatchIndex, Barycentrics ) );
}
GroupMemoryBarrier();
NumCached += WaveActiveCountBits( bNewVertex ); //FIXME this increments LocalTask.NumCached which goes no where. Need persistent scalar. Need references!
UNROLL
for( uint k = 0; k < 3; k++ )
{
if( !VertRead[k] )
Verts[k] = VertCache( FirstVert + VertIndexes[k] );
}
#else
// Disabled path: evaluates all three corners of the triangle on every lane, without any sharing.
float3 TessFactors = TessellatedPatch.GetTessFactors();
UNROLL
for( uint i = 0; i < 3; i++ )
{
FBarycentrics Barycentrics;
Barycentrics.Value = TessellatedPatch.GetVert( VertIndexes[i] );
Barycentrics.Value_dx = 0; // float3( -1, 1, 0 ) / TessFactors.x;
Barycentrics.Value_dy = 0; // float3( 0, -1, 1 ) / TessFactors.y;
Verts[i] = CalculateSubpixelCoordinates( Raster, Shader.EvaluateDomain( PatchIndex, Barycentrics ) );
}
#endif
FRasterTri Tri = SetupTriangle< NANITE_SUBPIXEL_SAMPLES, !NANITE_TWO_SIDED >( Raster.ScissorRect, Verts );
// Immediate dicing doesn't do near plane culling
// Instead, reject the triangle when the lane is inactive or any corner has a negative w.
if( !bActive || min3( Verts[0].w, Verts[1].w, Verts[2].w ) < 0 )
Tri.bIsValid = false;
if( Tri.bIsValid )
{
#if VISUALIZE
VisualizeValues = GetVisualizeValues(1u /*AddValue */, 0u /* SubPatch */, LocalItemIndex);
#endif
// The triangle index from PatchData is OR'ed into the pixel value that gets written.
RasterizeDicedTri(
Tri,
Raster,
Shader,
PixelValue | PatchIndex,
VisualizeValues );
}
}
// Work item that splits a patch into sub-patches and appends them to SplitWorkQueue (see RunChild).
struct FClusterSplitTask
{
FTessellatedPatch TessellatedPatch;
// x: ( VisibleClusterIndex << 7 ) | TriIndex, set by Init.
// yzw: one encoded barycentric coordinate per patch corner.
uint4 Encoded;
// Sets up Encoded and the tessellated patch for a triangle of a visible cluster.
void Init( float3 TessFactors, uint VisibleClusterIndex, uint TriIndex );
// Returns a task whose TessellatedPatch and Encoded are read from lane ParentLaneIndex.
FClusterSplitTask CreateChild( uint ParentLaneIndex );
// Encodes sub-patch LocalItemIndex and stores it in SplitWorkQueue; does nothing when bActive is false.
void RunChild( inout FClusterSplitTask ParentTask, bool bActive, uint LocalItemIndex );
};
// Builds the encoded patch record and initializes the tessellated patch from it.
// Encoded.x packs the visible cluster index above the low 7 bits, which hold the triangle index.
// Encoded.yzw are set to BarycentricMax, BarycentricMax << 16 and 0 respectively.
void FClusterSplitTask::Init( float3 TessFactors, uint VisibleClusterIndex, uint TriIndex )
{
	Encoded = uint4(
		( VisibleClusterIndex << 7 ) | TriIndex,
		BarycentricMax,
		BarycentricMax << 16,
		0 );

	TessellatedPatch.Init( TessFactors, Encoded.yzw, false );
}
// Creates the child task by reading both members from the lane that owns the parent patch.
FClusterSplitTask FClusterSplitTask::CreateChild( uint ParentLaneIndex )
{
	FClusterSplitTask Child;

	Child.TessellatedPatch	= WaveReadLaneAt( TessellatedPatch, ParentLaneIndex );
	Child.Encoded			= WaveReadLaneAt( Encoded, ParentLaneIndex );

	return Child;
}
// Produces sub-patch LocalItemIndex of the tessellated patch and appends it to SplitWorkQueue.
// Each corner of the sub-triangle is re-expressed through the barycentrics decoded from Encoded
// and written back, encoded, into Encoded.yzw. Encoded.x is left unchanged.
// Inactive lanes return immediately. Nothing is stored when the queue offset is past SplitWorkQueue.Size.
void FClusterSplitTask::RunChild( inout FClusterSplitTask ParentTask, bool bActive, uint LocalItemIndex )
{
	if( !bActive )
		return;

#if 0
	Encoded.yzw = TessellatedPatch.GetTriangleEncoded( LocalItemIndex );
#else
	FSplitPatch DecodedPatch;
	DecodedPatch.Decode( Encoded );

	uint3 CornerIndexes = TessellatedPatch.GetIndexes( LocalItemIndex );

	for( int Corner = 0; Corner < 3; Corner++ )
	{
		// Corner position within the tessellated patch
		float3 LocalUVW = TessellatedPatch.GetVert( CornerIndexes[ Corner ] );

		// Weight the decoded patch corners by the local barycentrics
		float3 CombinedUVW =
			DecodedPatch.Barycentrics[0] * LocalUVW.x +
			DecodedPatch.Barycentrics[1] * LocalUVW.y +
			DecodedPatch.Barycentrics[2] * LocalUVW.z;

		Encoded[ Corner + 1 ] = EncodeBarycentrics( CombinedUVW );
	}
#endif

	uint QueueOffset = SplitWorkQueue.Add();
	if( QueueOffset < SplitWorkQueue.Size )
	{
		checkSlow(
			Encoded.x != ~0u &&
			Encoded.y != ~0u &&
			Encoded.z != ~0u &&
			Encoded.w != ~0u );

		// Each record is one uint4, i.e. 16 bytes
		SplitWorkQueue.DataBuffer_Store4( QueueOffset * 16, Encoded );
	}
}
#endif