// Copyright Epic Games, Inc. All Rights Reserved.

#pragma once

#include "NaniteRasterizationCommon.ush"
#include "NaniteTessellation.ush"

#if NANITE_TESSELLATION

#define NANITE_TESSELLATION_DICE_USE_LDS 1

/**
 * Rasterizes a single, already set-up diced triangle with RasterizeTri_Rect.
 *
 * The pixel writer handed to the rasterizer is picked as follows:
 *  - VIRTUAL_TEXTURE_TARGET builds where Raster.bSinglePage is false use a page-table aware writer
 *    (FCachedPageTable when NANITE_LATE_VSM_PAGE_TRANSLATION is set, FFetchPageTable otherwise).
 *  - Everything else uses the plain TNaniteWritePixel< FMaterialShader > writer.
 *
 * @param Tri				Triangle setup to rasterize.
 * @param Raster			Raster state; only bSinglePage is read directly here, the rest is forwarded to the writer.
 * @param Shader			Material shader forwarded to the writer (its NaniteView supplies the mip/layer for the fetch path).
 * @param PixelValue		Value forwarded to the writer.
 * @param VisualizeValues	Visualization payload forwarded to the writer.
 */
void RasterizeDicedTri(
	FRasterTri Tri,
	FRaster Raster,
	FMaterialShader Shader,
	uint PixelValue,
	uint2 VisualizeValues )
{
#if VIRTUAL_TEXTURE_TARGET
	if( !Raster.bSinglePage )
	{
	#if NANITE_LATE_VSM_PAGE_TRANSLATION
		TNaniteWritePixel< FMaterialShader, FCachedPageTable > PagedWriter = { Raster, Shader, PixelValue, VisualizeValues };
	#else
		TNaniteWritePixel< FMaterialShader, FFetchPageTable > PagedWriter =
		{
			Raster,
			Shader,
			PixelValue,
			VisualizeValues,
			Shader.NaniteView.TargetMipLevel,
			CalcPageTableLevelOffset( Shader.NaniteView.TargetLayerIndex, Shader.NaniteView.TargetMipLevel )
		};
	#endif

		RasterizeTri_Rect( Tri, PagedWriter );
	}
	else
#endif
	{
		TNaniteWritePixel< FMaterialShader > DirectWriter = { Raster, Shader, PixelValue, VisualizeValues };
		RasterizeTri_Rect( Tri, DirectWriter );
	}
}

groupshared float4 GroupPointPackedClip[ THREADGROUP_SIZE ];	// TODO: Convert to PackedClip ?
groupshared float4 GroupNormalPackedClip[ THREADGROUP_SIZE ];

/**
 * Per-lane state for dicing a tessellated patch and rasterizing the resulting triangles.
 *
 * PatchData layout (written by Init, read back by CreateChild / RunChild):
 *   bits  0.. 7	VertIndexes.x
 *   bits  8..15	VertIndexes.y
 *   bits 16..23	VertIndexes.z
 *   bits 24..31	TriIndex
 */
struct FDiceTask
{
	FRaster			Raster;
	FMaterialShader	Shader;
	uint			PixelValue;
	uint2			VisualizeValues;
	float4			UVDensities;
	bool			bReverseWinding;

	FNaniteTransformedVert	Vert;
	FTessellatedPatch		TessellatedPatch;

	uint4	Encoded;
	uint	PatchData;

	// Vertex cache
	float3	CachedPackedSubpixelPosition;

	/** Sets up the tessellated patch, packs the vertex and triangle indices into PatchData and clears the vertex cache. */
	void Init( float3 TessFactors, uint3 VertIndexes, uint TriIndex )
	{
		TessellatedPatch.Init( TessFactors, VertIndexes, true );

		PatchData =
			( VertIndexes.x << 0 ) |
			( VertIndexes.y << 8 ) |
			( VertIndexes.z << 16 ) |
			( TriIndex << 24 );

		CachedPackedSubpixelPosition = 0.0f;
	}

	/**
	 * Builds the task a child lane runs with.
	 * Raster, Shader, PixelValue, VisualizeValues, UVDensities and bReverseWinding are copied from the calling lane;
	 * TessellatedPatch, Encoded and PatchData are read from lane ParentLaneIndex.
	 * Vert and CachedPackedSubpixelPosition of the returned task are not assigned here.
	 */
	FDiceTask CreateChild( uint ParentLaneIndex )
	{
		// "this" is broken: https://github.com/microsoft/DirectXShaderCompiler/issues/4914
		FDiceTask ChildTask;// = this;

		// Copied straight from the calling lane.
		ChildTask.Raster			= Raster;
		ChildTask.Shader			= Shader;
		ChildTask.PixelValue		= PixelValue;
		ChildTask.VisualizeValues	= VisualizeValues;
		ChildTask.UVDensities		= UVDensities;
		ChildTask.bReverseWinding	= bReverseWinding;

		// Fetched from the parent lane.
		ChildTask.TessellatedPatch	= WaveReadLaneAt( TessellatedPatch, ParentLaneIndex );
		ChildTask.Encoded			= WaveReadLaneAt( Encoded, ParentLaneIndex );
		ChildTask.PatchData			= WaveReadLaneAt( PatchData, ParentLaneIndex );

		// Unpack the three 8 bit vertex indices stored by Init.
		const uint Packed = ChildTask.PatchData;
		uint3 CornerVertIndexes = uint3( Packed >> 0, Packed >> 8, Packed >> 16 ) & 0xff;

		FNaniteTransformedTri ChildTri = MakeTransformedNaniteTriangle( Vert, CornerVertIndexes );

	#if NANITE_TESSELLATION_DICE_USE_LDS
		// Alleviate DS pressure by using wide LDS loads instead of single component permutes
		for (uint Corner = 0; Corner < 3; ++Corner)
		{
			const uint LdsSlot = CornerVertIndexes[ Corner ];
			ChildTri.Verts[ Corner ].PointClip	= GroupPointPackedClip[ LdsSlot ];
			ChildTri.Verts[ Corner ].NormalClip	= GroupNormalPackedClip[ LdsSlot ];
		}
	#endif

		ChildTask.Shader.TransformedTri = ChildTri;	// TODO mutable. This is weird

		return ChildTask;
	}

	/**
	 * When NANITE_TESSELLATION_DICE_USE_LDS is enabled, stores this lane's Vert.PointClip / Vert.NormalClip into the
	 * groupshared arrays at the slot given by WaveGetLaneIndex() and then issues a group sync. Otherwise does nothing.
	 */
	void CacheToLDS()
	{
	#if NANITE_TESSELLATION_DICE_USE_LDS
		const uint Lane = WaveGetLaneIndex();

		GroupPointPackedClip[ Lane ]	= Vert.PointClip;
		GroupNormalPackedClip[ Lane ]	= Vert.NormalClip;

		GroupMemoryBarrierWithGroupSync();
	#endif
	}

	void RunChild( inout FDiceTask ParentTask, bool bActive, uint LocalItemIndex );
};

//groupshared float3 VertexCache[ THREADGROUP_SIZE ];
//#define VertCache(x) VertexCache[ QueueOffset + ( (x) & ( LaneCount - 1 ) ) ]

/**
 * Evaluates the domain point for LocalItemIndex on this lane, gathers the other two triangle corners from other
 * lanes through ParentTask.CachedPackedSubpixelPosition, sets up the triangle and rasterizes it if it is valid.
 *
 * @param ParentTask		Task whose CachedPackedSubpixelPosition is used as the cross-lane vertex cache; it is overwritten with this lane's corner.
 * @param bActive			When false the triangle is marked invalid and nothing is rasterized. The wave reads above still execute.
 * @param LocalItemIndex	Index passed to TessellatedPatch.GetIndexes / GetVert for this lane.
 */
void FDiceTask::RunChild( inout FDiceTask ParentTask, bool bActive, uint LocalItemIndex )
{
	// The top 8 bits of PatchData hold the TriIndex passed to Init.
	uint PatchIndex = PatchData >> 24;

	uint3 VertIndexes = TessellatedPatch.GetIndexes( LocalItemIndex );

	if( bReverseWinding )
		VertIndexes.yz = VertIndexes.zy;

	float4 Verts[3];

#if 1
	// Only referenced by the disabled derivative expressions below.
	const float3 TessFactors = TessellatedPatch.GetTessFactors();

	FBarycentrics Barycentrics;
	Barycentrics.Value		= TessellatedPatch.GetVert( LocalItemIndex );
	Barycentrics.Value_dx	= 0;	// float3( -1, 1, 0 ) / TessFactors.x;
	Barycentrics.Value_dy	= 0;	// float3( 0, -1, 1 ) / TessFactors.y;

	const bool bOrtho = IsOrthoProjection( Shader.NaniteView.ViewToClip );

	float3 Corner0;	// sub-pixel xy, linear z

	// TODO: Unify these paths by having EvaluateDomain operate directly in PackedClip space
	BRANCH
	if (bOrtho)
	{
		// Optimize out .w work and lane permutes
		Corner0 = Shader.EvaluateDomain( UVDensities, Barycentrics ).xyz;
	}
	else
	{
		// Optimize out .z work and lane permutes
		const float3 ClipXYW = Shader.EvaluateDomain( UVDensities, Barycentrics ).xyw;
		Corner0 = float3( ClipXYW.xy / ClipXYW.z, ClipXYW.z );
	}

	// Viewport transform, then snap xy down to whole units.
	Corner0.xy = floor( Corner0.xy * Raster.ViewportScale + Raster.ViewportBias );

	const int3	LaneRelative	= WaveGetLaneIndex() - ( LocalItemIndex - VertIndexes );	// Relative to thread 0 in wave
	const uint3	SourceLane		= uint3( LaneRelative ) & ( WaveGetLaneCount() - 1u );

	// First read happens BEFORE the cache is overwritten: this is whatever the cache held on entry.
	float3 Corner1 = WaveReadLaneAt( ParentTask.CachedPackedSubpixelPosition, SourceLane.y );
	float3 Corner2 = WaveReadLaneAt( ParentTask.CachedPackedSubpixelPosition, SourceLane.z );

	ParentTask.CachedPackedSubpixelPosition = Corner0;

	// Second read happens AFTER the overwrite. It is taken when the relative index is non-negative,
	// otherwise the value read before the overwrite is kept.
	const float3 Fresh1 = WaveReadLaneAt( ParentTask.CachedPackedSubpixelPosition, SourceLane.y );
	const float3 Fresh2 = WaveReadLaneAt( ParentTask.CachedPackedSubpixelPosition, SourceLane.z );

	Corner1 = select( LaneRelative.y >= 0, Fresh1, Corner1 );
	Corner2 = select( LaneRelative.z >= 0, Fresh2, Corner2 );

	BRANCH
	if (bOrtho)
	{
		Verts[0] = float4( Corner0, 1.0f );
		Verts[1] = float4( Corner1, 1.0f );
		Verts[2] = float4( Corner2, 1.0f );
	}
	else
	{
		Verts[0] = float4( Corner0.xy, float2( ClipZFromLinearZ( Shader.NaniteView, Corner0.z ), 1.0f ) / Corner0.z );
		Verts[1] = float4( Corner1.xy, float2( ClipZFromLinearZ( Shader.NaniteView, Corner1.z ), 1.0f ) / Corner1.z );
		Verts[2] = float4( Corner2.xy, float2( ClipZFromLinearZ( Shader.NaniteView, Corner2.z ), 1.0f ) / Corner2.z );
	}
#elif 0
	// Grab what's there for this triangle before updating cache. Otherwise cache would need to be double size.
	bool3 VertRead = false;
	UNROLL
	for( uint k = 0; k < 3; k++ )
	{
		if( FirstVert + VertIndexes[k] < NumCached )
		{
			Verts[k] = VertCache( FirstVert + VertIndexes[k] );
			VertRead[k] = true;
		}
	}

	GroupMemoryBarrier();

	bool bNewVertex = PackedIndexes & (1 << 30);
	if( bNewVertex )
	{
		uint MaxVertIndex = max3( VertIndexes.x, VertIndexes.y, VertIndexes.z );

		float3 UVW = TessellatedPatch.GetVert( MaxVertIndex );
		UVW = TransformBarycentrics( UVW );

		FBarycentrics Barycentrics;
		Barycentrics.Value		= UVW;
		Barycentrics.Value_dx	= 0;
		Barycentrics.Value_dy	= 0;

		VertCache( FirstVert + MaxVertIndex ) = CalculateSubpixelCoordinates( Raster, Shader.EvaluateDomain( PatchIndex, Barycentrics ) );
	}

	GroupMemoryBarrier();

	NumCached += WaveActiveCountBits( bNewVertex );	//FIXME this increments LocalTask.NumCached which goes nowhere. Need persistent scalar. Need references!

	UNROLL
	for( uint k = 0; k < 3; k++ )
	{
		if( !VertRead[k] )
			Verts[k] = VertCache( FirstVert + VertIndexes[k] );
	}
#else
	float3 TessFactors = TessellatedPatch.GetTessFactors();

	UNROLL
	for( uint i = 0; i < 3; i++ )
	{
		FBarycentrics Barycentrics;
		Barycentrics.Value		= TessellatedPatch.GetVert( VertIndexes[i] );
		Barycentrics.Value_dx	= 0;	// float3( -1, 1, 0 ) / TessFactors.x;
		Barycentrics.Value_dy	= 0;	// float3( 0, -1, 1 ) / TessFactors.y;

		Verts[i] = CalculateSubpixelCoordinates( Raster, Shader.EvaluateDomain( PatchIndex, Barycentrics ) );
	}
#endif

	FRasterTri Tri = SetupTriangle< NANITE_SUBPIXEL_SAMPLES, !NANITE_TWO_SIDED >( Raster.ScissorRect, Verts );

	// Immediate dicing doesn't do near plane culling, so reject any triangle with a negative w here.
	const bool bAnyNegativeW = min3( Verts[0].w, Verts[1].w, Verts[2].w ) < 0;
	if( !bActive || bAnyNegativeW )
	{
		Tri.bIsValid = false;
	}

	if( Tri.bIsValid )
	{
	#if VISUALIZE
		VisualizeValues = GetVisualizeValues(1u /*AddValue */, 0u /* SubPatch */, LocalItemIndex);
	#endif

		RasterizeDicedTri( Tri, Raster, Shader, PixelValue | PatchIndex, VisualizeValues );
	}
}

/**
 * Per-lane state for splitting a cluster triangle into sub-patches that are pushed to SplitWorkQueue.
 *
 * Encoded.x holds ( VisibleClusterIndex << 7 ) | TriIndex; Encoded.yzw hold the encoded barycentrics of the patch corners.
 */
struct FClusterSplitTask
{
	FTessellatedPatch	TessellatedPatch;

	uint4	Encoded;

	void				Init( float3 TessFactors, uint VisibleClusterIndex, uint TriIndex );

	FClusterSplitTask	CreateChild( uint ParentLaneIndex );
	void				RunChild( inout FClusterSplitTask ParentTask, bool bActive, uint LocalItemIndex );
};

/** Fills Encoded for the given cluster triangle and initializes the tessellated patch from Encoded.yzw. */
void FClusterSplitTask::Init( float3 TessFactors, uint VisibleClusterIndex, uint TriIndex )
{
	Encoded = uint4(
		( VisibleClusterIndex << 7 ) | TriIndex,
		BarycentricMax,
		BarycentricMax << 16,
		0 );

	TessellatedPatch.Init( TessFactors, Encoded.yzw, false );
}

/** Returns a task whose TessellatedPatch and Encoded are read from lane ParentLaneIndex. */
FClusterSplitTask FClusterSplitTask::CreateChild( uint ParentLaneIndex )
{
	FClusterSplitTask ChildTask;
	ChildTask.TessellatedPatch	= WaveReadLaneAt( TessellatedPatch, ParentLaneIndex );
	ChildTask.Encoded			= WaveReadLaneAt( Encoded, ParentLaneIndex );
	return ChildTask;
}

/**
 * Re-expresses the three corners of sub-triangle LocalItemIndex in the barycentric space of the decoded patch,
 * writes them into Encoded.yzw and stores Encoded in SplitWorkQueue.
 * Inactive lanes return immediately. If the slot returned by SplitWorkQueue.Add() is not below
 * SplitWorkQueue.Size, nothing is stored.
 */
void FClusterSplitTask::RunChild( inout FClusterSplitTask ParentTask, bool bActive, uint LocalItemIndex )
{
	if( !bActive )
		return;

#if 0
	Encoded.yzw = TessellatedPatch.GetTriangleEncoded( LocalItemIndex );
#else
	FSplitPatch Patch;
	Patch.Decode( Encoded );

	uint3 VertIndexes = TessellatedPatch.GetIndexes( LocalItemIndex );

	for( int Corner = 0; Corner < 3; Corner++ )
	{
		// Barycentrics of this corner relative to the current patch...
		const float3 LocalUVW = TessellatedPatch.GetVert( VertIndexes[ Corner ] );

		// ...weighted by the barycentrics of the patch corners themselves.
		const float3 PatchUVW =
			Patch.Barycentrics[0] * LocalUVW.x +
			Patch.Barycentrics[1] * LocalUVW.y +
			Patch.Barycentrics[2] * LocalUVW.z;

		Encoded[ Corner + 1 ] = EncodeBarycentrics( PatchUVW );
	}
#endif

	uint QueueSlot = SplitWorkQueue.Add();
	if( QueueSlot < SplitWorkQueue.Size )
	{
		checkSlow(
			Encoded.x != ~0u &&
			Encoded.y != ~0u &&
			Encoded.z != ~0u &&
			Encoded.w != ~0u );

		// Each entry is one uint4, i.e. 16 bytes.
		SplitWorkQueue.DataBuffer_Store4( QueueSlot * 16, Encoded );
	}
}

#endif