// Copyright Epic Games, Inc. All Rights Reserved.

// This must be defined before including Common.ush (see GetShadowReplaceState)
#define SHADOW_DEPTH_SHADER DEPTH_ONLY

#define SPLIT_WORK_QUEUE NANITE_TESSELLATION // TODO: Remove once shader rewriter has been fixed (UE-202409)

#include "NaniteRasterizationCommon.ush"
#include "../VirtualShadowMaps/VirtualShadowMapPageAccessCommon.ush"
#include "../VirtualShadowMaps/VirtualShadowMapPageOverlap.ush"
#include "../MaterialCache/MaterialCacheCommon.ush"
#include "../ComputeShaderUtils.ush"
#include "../Random.ush"
#include "../Matrices.ush"

#if NANITE_TESSELLATION
#include "NaniteTessellation.ush"
#include "NaniteDice.ush"
#endif

#ifndef WORKGRAPH_NODE
#define WORKGRAPH_NODE 0
#endif

#if WORKGRAPH_NODE
#include "../ShaderBundleWorkGraphCommon.ush"
#endif

#define CONSTANT_DIR ( 0 && !VIRTUAL_TEXTURE_TARGET )
#define CONSTANT_DIR_RECT ( 1 && !VIRTUAL_TEXTURE_TARGET )

#define BRICK_TRACE_WORK_REDISTRIBUTION 1
#define BRICK_TRACE_TRANSPOSE 0
#define BRICK_TRACE_APPROXIMATE_DIVIDE 0 // Only good up to ~30x30px bricks

// Update this GUID to bump and recompile all Nanite rasterization material shaders
// Merge conflicts on this line should be resolved by generating a new GUID
#pragma message("UESHADERMETADATA_VERSION A6174FDD-04E8-4C49-A97C-18750449C462")

#if PIXELSHADER
ALLOW_NO_PS_EXPORT
#endif

#ifndef NANITE_MESH_SHADER
#define NANITE_MESH_SHADER 0
#endif

#ifndef NANITE_PRIM_SHADER
#define NANITE_PRIM_SHADER 0
#endif

#ifndef NANITE_VERT_REUSE_BATCH
#define NANITE_VERT_REUSE_BATCH 0
#endif

#ifndef NANITE_TWO_SIDED
#define NANITE_TWO_SIDED 0
#endif

#define NANITE_HW_RASTER_INTERPOLATE_DEPTH (DEPTH_ONLY)

#if NANITE_VERT_REUSE_BATCH || NANITE_VOXELS
#define THREADGROUP_SIZE 32
#else
#define THREADGROUP_SIZE 64
#endif

#if COMPUTESHADER && (NANITE_PIXEL_PROGRAMMABLE && !NANITE_TESSELLATION) || NANITE_VOXELS
MAX_OCCUPANCY
DISABLE_TARGET_OCCUPANCY_WARNING
#endif

#if COMPUTESHADER && (NANITE_PIXEL_PROGRAMMABLE || NANITE_TESSELLATION)
DISABLE_POTENTIALLY_UNINITIALIZED_WARNING
#endif

HOIST_DESCRIPTORS

#include "/Engine/Public/RootConstants.ush"

// The raster bin index for this dispatch is delivered via root constant 0.
uint GetRasterBin() { return GetRootConstant0(); }

// NOTE(review): the template element types of these buffer declarations appear to have been
// lost in a formatting pass (e.g. RWStructuredBuffer<...>) — confirm against the original file.
RWStructuredBuffer OutStatsBuffer;

StructuredBuffer RasterBinMeta;
StructuredBuffer RasterBinData;

// Fetches the bin entry for a software-rasterized cluster.
// PackedData.y packs the triangle range as RangeStart (high 16 bits) / RangeEnd (low 16 bits).
// .x = VisibleIndex
// .y = RangeStart
// .z = RangeEnd
// .w = MaterialFlags
uint4 FetchSWRasterBin(const uint ClusterIndex)
{
	const uint RasterBinOffset = RasterBinMeta[GetRasterBin()].ClusterOffset;
	const uint2 PackedData = RasterBinData[RasterBinOffset + ClusterIndex].xy;

	const uint VisibleIndex = PackedData.x;
	const uint RangeStart = PackedData.y >> 16u;
	const uint RangeEnd = PackedData.y & 0xFFFFu;

	return uint4(VisibleIndex, RangeStart, RangeEnd, RasterBinMeta[GetRasterBin()].MaterialFlags_DepthBlock & 0xFFFFu);
}

// Fetches the bin entry for a hardware-rasterized cluster. Same packing as FetchSWRasterBin,
// but HW entries are read from the top of the bin's capacity downwards.
// .x = VisibleIndex
// .y = RangeStart
// .z = RangeEnd
// .w = MaterialFlags
uint4 FetchHWRasterBin(const uint ClusterIndex)
{
	const uint RasterBinOffset = RasterBinMeta[GetRasterBin()].ClusterOffset;
	const uint RasterBinCapacity = RasterBinMeta[GetRasterBin()].BinSWCount + RasterBinMeta[GetRasterBin()].BinHWCount;
	const uint2 PackedData = RasterBinData[RasterBinOffset + ((RasterBinCapacity - 1) - ClusterIndex)].xy; // HW clusters are written from the top

	const uint VisibleIndex = PackedData.x;
	const uint RangeStart = PackedData.y >> 16u;
	const uint RangeEnd = PackedData.y & 0xFFFFu;

	return uint4(VisibleIndex, RangeStart, RangeEnd, RasterBinMeta[GetRasterBin()].MaterialFlags_DepthBlock & 0xFFFFu);
}

// Builds a ViewState from the global ResolveView(), then overrides the view/projection
// matrices, translations and rect with the values from the given Nanite view.
ViewState ResolveView(FNaniteView NaniteView)
{
	ViewState Ret = ResolveView();

	Ret.SVPositionToTranslatedWorld = NaniteView.SVPositionToTranslatedWorld;
	Ret.ViewToTranslatedWorld = NaniteView.ViewToTranslatedWorld;
	Ret.TranslatedWorldToView = NaniteView.TranslatedWorldToView;
	Ret.TranslatedWorldToClip = NaniteView.TranslatedWorldToClip;
	Ret.ViewToClip = NaniteView.ViewToClip;
	Ret.ClipToWorld = NaniteView.ClipToWorld;

	Ret.PrevTranslatedWorldToView = NaniteView.PrevTranslatedWorldToView;
	Ret.PrevTranslatedWorldToClip = NaniteView.PrevTranslatedWorldToClip;
	Ret.PrevViewToClip = NaniteView.PrevViewToClip;
	Ret.PrevClipToWorld = NaniteView.PrevClipToWorld;

	Ret.ViewRectMin = (float4)NaniteView.ViewRect;
	Ret.ViewSizeAndInvSize = NaniteView.ViewSizeAndInvSize;
	Ret.PreViewTranslation = NaniteView.PreViewTranslation;
	Ret.PrevPreViewTranslation = NaniteView.PrevPreViewTranslation;
	Ret.ViewForward = NaniteView.ViewForward;
	Ret.ViewOriginHigh = NaniteView.ViewOriginHigh;
	Ret.NearPlane = NaniteView.NearPlane;

	// HACK: This fixes some material nodes for shadows, as shadow views borrow some view uniforms from the closest
	// camera view, rather than exposing their own parameters.
	Ret.WorldCameraOrigin = DFFastSubtract(NaniteView.CullingViewOriginTranslatedWorld, NaniteView.PreViewTranslation);

#if VIEW_HAS_TILEOFFSET_DATA
	Ret.TileOffset.PreViewTranslation = DFToTileOffset(Ret.PreViewTranslation);
	Ret.TileOffset.PrevPreViewTranslation = DFToTileOffset(Ret.PrevPreViewTranslation);
	//Ret.TileOffset.WorldViewOrigin = DFToTileOffset(Ret.WorldViewOrigin);
	//Ret.TileOffset.PrevWorldViewOrigin = DFToTileOffset(Ret.PrevWorldViewOrigin);
	Ret.TileOffset.WorldCameraOrigin = DFToTileOffset(Ret.WorldCameraOrigin);
	//Ret.TileOffset.PrevWorldCameraOrigin = DFToTileOffset(Ret.PrevWorldCameraOrigin);
#endif

	return Ret;
}

// Default cull mode is CW. If this returns true, CCW culling is required
bool ReverseWindingOrder(FNaniteView NaniteView, FPrimitiveSceneData PrimitiveData, FInstanceSceneData InstanceData)
{
	// Negative determinant sign for non uniform scale means that an odd number of components are negative, so
	// we need to reverse the triangle winding order.
	float DeterminantSign = InstanceData.DeterminantSign;
	bool bReverseInstanceCull = (DeterminantSign < 0.0f);

#if SUPPORT_REVERSE_CULLING_IN_NANITE
	if (PrimitiveData.Flags & PRIMITIVE_SCENE_DATA_FLAG_REVERSE_CULLING)
	{
		// reverse culling if the primitive has elected to do so
		bReverseInstanceCull = !bReverseInstanceCull;
	}
#endif

	bool bViewReverseCull = (NaniteView.Flags & NANITE_VIEW_FLAG_REVERSE_CULLING);

	// Logical XOR
	return (bReverseInstanceCull != bViewReverseCull);
}

StructuredBuffer< uint2 > InTotalPrevDrawClusters;
// NOTE(review): element type appears missing here (Buffer<...>) — likely lost in formatting; confirm.
Buffer InClusterOffsetSWHW;

// A contiguous triangle range within a cluster: [Start, Start + Num).
struct FTriRange
{
	uint Start;
	uint Num;
};

// Resolves a SW bin slot: VisibleIndex is replaced in-place by the actual visible cluster
// index, and the cluster's triangle range is returned.
FTriRange GetIndexAndTriRangeSW( inout uint VisibleIndex )
{
	FTriRange Range = { 0, 0 };
	uint4 RasterBin = FetchSWRasterBin(VisibleIndex);
	VisibleIndex = RasterBin.x;
	Range.Start = RasterBin.y;
	Range.Num = RasterBin.z - RasterBin.y;
	return Range;
}

// HW counterpart of GetIndexAndTriRangeSW.
FTriRange GetIndexAndTriRangeHW( inout uint VisibleIndex )
{
	FTriRange Range = { 0, 0 };
	uint4 RasterBin = FetchHWRasterBin(VisibleIndex);
	VisibleIndex = RasterBin.x;
	Range.Start = RasterBin.y;
	Range.Num = RasterBin.z - RasterBin.y;
	return Range;
}

// Builds the per-cluster raster state: clip-to-viewport transform, scissor rect and
// (for virtual shadow map targets) the virtual-to-physical page translation.
FRaster CreateRaster( FNaniteView NaniteView, FVisibleCluster VisibleCluster )
{
	FRaster Raster;
	Raster.ScissorRect = NaniteView.ViewRect;

	// DX11 spec
	// x = (x + 1) * ViewSize.x * 0.5 + ViewRect.x;
	// y = (1 - y) * ViewSize.y * 0.5 + ViewRect.y;
	Raster.ViewportScale = float2(0.5, -0.5) * NaniteView.ViewSizeAndInvSize.xy;
	Raster.ViewportBias = 0.5 * NaniteView.ViewSizeAndInvSize.xy + NaniteView.ViewRect.xy;

#if VIRTUAL_TEXTURE_TARGET
	// Scalar
	Raster.vPage = VisibleCluster.vPage;
	Raster.pPage = 0;
	Raster.bSinglePage = all( VisibleCluster.vPage == VisibleCluster.vPageEnd );
	if (Raster.bSinglePage)
	{
		FShadowPhysicalPage PhysicalPage = ShadowGetPhysicalPage( CalcPageOffset( NaniteView.TargetLayerIndex, NaniteView.TargetMipLevel, Raster.vPage ) );
		// 0xffff presumably acts as an out-of-range sentinel when the page is not valid for rendering — TODO confirm
		Raster.pPage = PhysicalPage.bThisLODValidForRendering ? PhysicalPage.PhysicalAddress : 0xffff;
	}

	// Virtual shadow maps can scatter instances into different physical pages for caching purposes
	const bool bCacheAsStatic = (VisibleCluster.Flags & NANITE_CULLING_FLAG_CACHE_AS_STATIC) != 0u;
	Raster.ArrayIndex = bCacheAsStatic ? GetVirtualShadowMapStaticArrayIndex() : 0;

	if (!Raster.bSinglePage)
	{
#if NANITE_LATE_VSM_PAGE_TRANSLATION
		Raster.ScissorRect.xy = 0;
		Raster.ScissorRect.zw = (VisibleCluster.vPageEnd - VisibleCluster.vPage) * VSM_PAGE_SIZE + VSM_PAGE_SIZE;
#else
		Raster.vPage = 0;
		Raster.ScissorRect.xy = VisibleCluster.vPage * VSM_PAGE_SIZE;
		Raster.ScissorRect.zw = VisibleCluster.vPageEnd * VSM_PAGE_SIZE + VSM_PAGE_SIZE;
#endif
	}
	else
	{
		Raster.ScissorRect.xy = Raster.pPage * VSM_PAGE_SIZE;
		Raster.ScissorRect.zw = Raster.ScissorRect.xy + VSM_PAGE_SIZE;
	}

	// Shift from virtual page space to physical page space.
	Raster.vTranslation = ( (float2)Raster.pPage - (float2)Raster.vPage ) * VSM_PAGE_SIZE;
	Raster.ViewportBias += Raster.vTranslation;
#endif

#if !NANITE_VOXELS
	// Scale into subpixel space; the 0.5 recenters onto the subpixel grid.
	Raster.ViewportScale *= NANITE_SUBPIXEL_SAMPLES;
	Raster.ViewportBias *= NANITE_SUBPIXEL_SAMPLES;
	Raster.ViewportBias += 0.5f;
#endif

	return Raster;
}

#if PATCHES
#define VERTEX_CACHE_SIZE 120 // (MaxTessFactor+1)*(MaxTessFactor+2)/2
#else
#define VERTEX_CACHE_SIZE 256
#endif

groupshared float3 GroupVerts[VERTEX_CACHE_SIZE];

// One fully transformed vertex plus its subpixel-space clip position.
struct FCachedVertex
{
	FNaniteTransformedVert TransformedVert;
	float4 PointSubpixelClip;
};

// 64 rolling window vertex cache for pixel programmable shaders.
// The expectation is that most materials will only require PointSubpixelClip and maybe 1/2 UV sets and the rest will be DCE'd
// The cache is stored as one groupshared array per attribute (structure-of-arrays) so unused
// attributes can be dead-code-eliminated individually; indices wrap at 64 entries.
groupshared float3 VertexCache_PointLocal[64];
groupshared float3 VertexCache_PointPostDeform[64];
groupshared float3 VertexCache_PrevPointPostDeform[64];
groupshared float3 VertexCache_PointWorld[64];
groupshared float3 VertexCache_PointWorld_NoOffset[64];
groupshared float4 VertexCache_PointClip[64];
groupshared half3 VertexCache_NormalPostDeform[64];
groupshared float4 VertexCache_NormalClip[64];
groupshared half4 VertexCache_TangentXAndSignPostDeform[64];
groupshared half4 VertexCache_TangentXAndSign[64];
groupshared float3 VertexCache_TangentZ[64];
groupshared float4 VertexCache_Color[64];
groupshared float2 VertexCache_TexCoords0[64];
groupshared float2 VertexCache_TexCoords1[64];
groupshared float2 VertexCache_TexCoords2[64];
groupshared float2 VertexCache_TexCoords3[64];
groupshared float2 VertexCache_CustomizedUVs0[64];
groupshared float2 VertexCache_CustomizedUVs1[64];
groupshared float2 VertexCache_CustomizedUVs2[64];
groupshared float2 VertexCache_CustomizedUVs3[64];
groupshared float4 VertexCache_PointSubpixelClip[64];

HLSL_STATIC_ASSERT(
	sizeof( FCachedVertex ) == 220 + 8 * NUM_TEX_COORD_INTERPOLATORS,
	"Unexpected size of FCachedVertex. Update StoreVertexToLDS to reflect changes."
);

// Scatters one FCachedVertex into the per-attribute groupshared arrays.
// The cache slot is VertexIndex modulo 64 (rolling window).
void StoreVertexToLDS( uint VertexIndex, FCachedVertex Vertex )
{
	const uint CacheIndex = VertexIndex & 63u;
	VertexCache_PointLocal[CacheIndex] = Vertex.TransformedVert.PointLocal;
	VertexCache_PointPostDeform[CacheIndex] = Vertex.TransformedVert.PointPostDeform;
	VertexCache_PrevPointPostDeform[CacheIndex] = Vertex.TransformedVert.PrevPointPostDeform;
	VertexCache_PointWorld[CacheIndex] = Vertex.TransformedVert.PointWorld;
	VertexCache_PointWorld_NoOffset[CacheIndex] = Vertex.TransformedVert.PointWorld_NoOffset;
	VertexCache_PointClip[CacheIndex] = Vertex.TransformedVert.PointClip;
	VertexCache_NormalClip[CacheIndex] = Vertex.TransformedVert.NormalClip;
	VertexCache_NormalPostDeform[CacheIndex] = Vertex.TransformedVert.TangentBasis.TangentZ;
	VertexCache_TangentXAndSignPostDeform[CacheIndex] = Vertex.TransformedVert.TangentBasis.TangentXAndSign;
	VertexCache_TangentXAndSign[CacheIndex] = Vertex.TransformedVert.RawAttributeData.TangentXAndSign;
	VertexCache_TangentZ[CacheIndex] = Vertex.TransformedVert.RawAttributeData.TangentZ;
	VertexCache_Color[CacheIndex] = Vertex.TransformedVert.RawAttributeData.Color;
	VertexCache_TexCoords0[CacheIndex] = Vertex.TransformedVert.RawAttributeData.TexCoords[0];
	VertexCache_TexCoords1[CacheIndex] = Vertex.TransformedVert.RawAttributeData.TexCoords[1];
	VertexCache_TexCoords2[CacheIndex] = Vertex.TransformedVert.RawAttributeData.TexCoords[2];
	VertexCache_TexCoords3[CacheIndex] = Vertex.TransformedVert.RawAttributeData.TexCoords[3];
#if NUM_TEX_COORD_INTERPOLATORS > 0
	VertexCache_CustomizedUVs0[CacheIndex] = Vertex.TransformedVert.CustomizedUVs[0];
#endif
#if NUM_TEX_COORD_INTERPOLATORS > 1
	VertexCache_CustomizedUVs1[CacheIndex] = Vertex.TransformedVert.CustomizedUVs[1];
#endif
#if NUM_TEX_COORD_INTERPOLATORS > 2
	VertexCache_CustomizedUVs2[CacheIndex] = Vertex.TransformedVert.CustomizedUVs[2];
#endif
#if NUM_TEX_COORD_INTERPOLATORS > 3
	VertexCache_CustomizedUVs3[CacheIndex] = Vertex.TransformedVert.CustomizedUVs[3];
#endif
	VertexCache_PointSubpixelClip[CacheIndex] = Vertex.PointSubpixelClip;
}

HLSL_STATIC_ASSERT(
	sizeof( FCachedVertex ) == 220 + 8 * NUM_TEX_COORD_INTERPOLATORS,
	"Unexpected size of FCachedVertex. Update LoadVertexFromLDS to reflect changes."
);

// Gathers one FCachedVertex back out of the per-attribute groupshared arrays.
// Inverse of StoreVertexToLDS; VertIndex is re-derived from the argument, not from LDS.
FCachedVertex LoadVertexFromLDS( uint VertexIndex )
{
	const uint CacheIndex = VertexIndex & 63u;

	FCachedVertex Result;
	Result.TransformedVert.VertIndex = VertexIndex;
	Result.TransformedVert.PointLocal = VertexCache_PointLocal[CacheIndex];
	Result.TransformedVert.PointPostDeform = VertexCache_PointPostDeform[CacheIndex];
	Result.TransformedVert.PrevPointPostDeform = VertexCache_PrevPointPostDeform[CacheIndex];
	Result.TransformedVert.PointWorld = VertexCache_PointWorld[CacheIndex];
	Result.TransformedVert.PointWorld_NoOffset = VertexCache_PointWorld_NoOffset[CacheIndex];
	Result.TransformedVert.PointClip = VertexCache_PointClip[CacheIndex];
	Result.TransformedVert.NormalClip = VertexCache_NormalClip[CacheIndex];
	Result.TransformedVert.TangentBasis.TangentZ = VertexCache_NormalPostDeform[CacheIndex];
	Result.TransformedVert.TangentBasis.TangentXAndSign = VertexCache_TangentXAndSignPostDeform[CacheIndex];
	Result.TransformedVert.RawAttributeData.TangentXAndSign = VertexCache_TangentXAndSign[CacheIndex];
	Result.TransformedVert.RawAttributeData.TangentZ = VertexCache_TangentZ[CacheIndex];
	Result.TransformedVert.RawAttributeData.Color = VertexCache_Color[CacheIndex];
	Result.TransformedVert.RawAttributeData.TexCoords[0] = VertexCache_TexCoords0[CacheIndex];
	Result.TransformedVert.RawAttributeData.TexCoords[1] = VertexCache_TexCoords1[CacheIndex];
	Result.TransformedVert.RawAttributeData.TexCoords[2] = VertexCache_TexCoords2[CacheIndex];
	Result.TransformedVert.RawAttributeData.TexCoords[3] = VertexCache_TexCoords3[CacheIndex];
#if NUM_TEX_COORD_INTERPOLATORS > 0
	Result.TransformedVert.CustomizedUVs[0] = VertexCache_CustomizedUVs0[CacheIndex];
#endif
#if NUM_TEX_COORD_INTERPOLATORS > 1
	Result.TransformedVert.CustomizedUVs[1] =
		VertexCache_CustomizedUVs1[CacheIndex];
#endif
#if NUM_TEX_COORD_INTERPOLATORS > 2
	Result.TransformedVert.CustomizedUVs[2] = VertexCache_CustomizedUVs2[CacheIndex];
#endif
#if NUM_TEX_COORD_INTERPOLATORS > 3
	Result.TransformedVert.CustomizedUVs[3] = VertexCache_CustomizedUVs3[CacheIndex];
#endif
	Result.PointSubpixelClip = VertexCache_PointSubpixelClip[CacheIndex];

	return Result;
}

// Software-rasterizes the triangle range of one visible cluster.
// Three compile-time paths:
//  - NANITE_TESSELLATION: per-triangle tess factors decide between immediate dicing
//    (FDiceTask) and deferring the patch to the split work queue.
//  - NANITE_PIXEL_PROGRAMMABLE: streams vertices in batches of 32 through a rolling
//    vertex cache (LDS or wave-shuffle variant) and rasterizes with full material attributes.
//  - Fixed-function fallback: transforms positions only into GroupVerts and rasterizes.
void ClusterRasterize( uint VisibleIndex, uint GroupThreadIndex )
{
	FTriRange TriRange = GetIndexAndTriRangeSW( VisibleIndex );

	// Should be all scalar.
	FVisibleCluster VisibleCluster = GetVisibleCluster( VisibleIndex, VIRTUAL_TEXTURE_TARGET );

	FPrimitiveSceneData PrimitiveData;
	FInstanceSceneData InstanceData;
	GetNaniteMaterialSceneData(VisibleCluster, PrimitiveData, InstanceData);

	FNaniteView NaniteView = GetNaniteView( VisibleCluster.ViewId );

#if ALWAYS_EVALUATE_WORLD_POSITION_OFFSET
	const bool bEvaluateWPO = true;
#else
	const bool bEvaluateWPO = (VisibleCluster.Flags & NANITE_CULLING_FLAG_ENABLE_WPO) != 0;
#endif

	const bool bReverseWindingOrder = ReverseWindingOrder(NaniteView, PrimitiveData, InstanceData);

#if NANITE_VERTEX_PROGRAMMABLE || NANITE_PIXEL_PROGRAMMABLE
	ResolvedView = ResolveView(NaniteView);
#endif

	FInstanceDynamicData InstanceDynamicData = CalculateInstanceDynamicData(NaniteView, InstanceData);

	FCluster Cluster = GetCluster(VisibleCluster.PageIndex, VisibleCluster.ClusterIndex);

	// A zero-length range from the bin means "whole cluster".
	if( TriRange.Num == 0 )
		TriRange.Num = Cluster.NumTris;

	FMaterialShader MaterialShader;
	MaterialShader.PrimitiveData = PrimitiveData;
	MaterialShader.InstanceData = InstanceData;
	MaterialShader.InstanceDynamicData = InstanceDynamicData;
	MaterialShader.NaniteView = NaniteView;
	MaterialShader.Cluster = Cluster;
	MaterialShader.VisibleCluster = VisibleCluster;
	MaterialShader.VertTransforms = CalculateNaniteVertexTransforms( InstanceData, InstanceDynamicData, NaniteView );
#if MATERIAL_SHADER_HAS_DISPLACEMENT
	MaterialShader.InitDisplacement(RasterBinMeta[GetRasterBin()].MaterialDisplacementParams);
#endif

	FRaster Raster = CreateRaster( NaniteView, VisibleCluster );

#if VIRTUAL_TEXTURE_TARGET && NANITE_LATE_VSM_PAGE_TRANSLATION
	// Cooperatively fill the page table cache before any thread samples it.
	if (!Raster.bSinglePage)
	{
		UNROLL
		for (uint Offset = 0; Offset < NANITE_VSM_PAGE_TABLE_CACHE_DIM * NANITE_VSM_PAGE_TABLE_CACHE_DIM; Offset += THREADGROUP_SIZE)
		{
			FetchAndCachePageTableEntry(NaniteView, VisibleCluster.vPage, VisibleCluster.vPageEnd, Offset + GroupThreadIndex);
		}
		GroupMemoryBarrierWithGroupSync();
	}
#endif

#if NANITE_TESSELLATION
	float LowTessDistance = 0.0f;
#if USES_DISPLACEMENT
	LowTessDistance = CalcDisplacementLowTessDistance(PrimitiveData, InstanceData, NaniteView);
#endif

	// One triangle per lane.
	uint TriIndex = TriRange.Start + GroupThreadIndex;
	bool bTriValid = GroupThreadIndex < TriRange.Num;

	uint3 VertIndexes = 0;
	if( bTriValid )
	{
		VertIndexes = DecodeTriangleIndices(Cluster, TriIndex);
	}

	// Assign each unique vertex of the wave's triangles to a lane.
	uint NumUniqueVerts;
	uint LaneVertIndex;
	uint3 VertLaneIndexes;
	DeduplicateVertIndexes( VertIndexes, GroupThreadIndex, bTriValid, NumUniqueVerts, LaneVertIndex, VertLaneIndexes );

	FNaniteTransformedVert Vert;
	float3 PointView;
	if (GroupThreadIndex < NumUniqueVerts)
	{
		Vert = FetchTransformedNaniteVertex( PrimitiveData, InstanceData, GetInstanceViewData(InstanceData.InstanceId, NaniteView.SceneRendererPrimaryViewId), MaterialShader.VertTransforms, Cluster, VisibleCluster, LaneVertIndex, bEvaluateWPO );
		PointView = mul( float4( Vert.PointWorld, 1 ), NaniteView.TranslatedWorldToView ).xyz;
	}

	// Gather this lane's triangle corners from the lanes that transformed them.
	float3 TriPointView[3];
	TriPointView[0] = WaveReadLaneAt( PointView, VertLaneIndexes[0] );
	TriPointView[1] = WaveReadLaneAt( PointView, VertLaneIndexes[1] );
	TriPointView[2] = WaveReadLaneAt( PointView, VertLaneIndexes[2] );

	float3 TessFactors = GetTessFactors( NaniteView, TriPointView, LowTessDistance );

	const uint ImmediateSplitLimit = 8;

	// Patches small enough for the immediate tessellation table are diced in place.
	bool bCanDice = max3( TessFactors.x, TessFactors.y, TessFactors.z ) <= NANITE_TESSELLATION_TABLE_IMMEDIATE_SIZE;
	if( WaveActiveAnyTrue( bCanDice ) )
	{
		FDiceTask DiceTask;
		DiceTask.Raster = Raster;
		DiceTask.Shader = MaterialShader;
		DiceTask.PixelValue = ( VisibleIndex + 1 ) << 7;
		DiceTask.VisualizeValues = GetVisualizeValues();
		DiceTask.UVDensities = GetMaterialUVDensities( Cluster, InstanceData.PrimitiveId, TriRange.Start );
		DiceTask.bReverseWinding = bReverseWindingOrder;
		DiceTask.Vert = Vert;
		DiceTask.CacheToLDS();

		uint NumVerts = 0;
		uint NumTris = 0;
		if( bTriValid && bCanDice )
		{
			DiceTask.Init( TessFactors, VertLaneIndexes, TriIndex );
			NumVerts = DiceTask.TessellatedPatch.GetNumVerts();
			NumTris = DiceTask.TessellatedPatch.GetNumTris();
		}

		BRANCH
		if ((RenderFlags & NANITE_RENDER_FLAG_WRITE_STATS) != 0u)
		{
			WaveInterlockedAdd(OutStatsBuffer[0].NumDicedTrianglesClusters, NumTris);
			WaveInterlockedAddScalar(OutStatsBuffer[0].NumImmediatePatches, 1);
		}

		DistributeWork( DiceTask, GroupThreadIndex, NumTris );
	}

	if( VIRTUAL_TEXTURE_TARGET == 0 )
	{
		// Non-VSM: split oversized patches locally.
		FClusterSplitTask SplitTask;

		uint NumVerts = 0;
		uint NumTris = 0;
		if( bTriValid && !bCanDice )
		{
			float3 SplitFactors = min( GetSplitFactors( TessFactors ), ImmediateSplitLimit );
			SplitTask.Init( SplitFactors, VisibleIndex, TriIndex );
			NumVerts = SplitTask.TessellatedPatch.GetNumVerts();
			NumTris = SplitTask.TessellatedPatch.GetNumTris();
		}

		DistributeWork( SplitTask, GroupThreadIndex, NumTris );
	}
	else if( bTriValid && !bCanDice )
	{
		// VSM: push the whole patch onto the global split queue instead.
		uint WriteOffset = SplitWorkQueue.Add();
		if( WriteOffset < SplitWorkQueue.Size )
		{
			uint4 Encoded;
			Encoded.x = ( VisibleIndex << 7 ) | TriIndex;
			Encoded.y = BarycentricMax;
			Encoded.z = BarycentricMax << 16;
			Encoded.w = 0;
			checkSlow( Encoded.x != ~0u && Encoded.y != ~0u && Encoded.z != ~0u && Encoded.w != ~0u );
			SplitWorkQueue.DataBuffer_Store4( WriteOffset * 16, Encoded );
		}
	}
#elif NANITE_PIXEL_PROGRAMMABLE
	// We can assume wave size >= 32 here as we force HW raster for hardware that can use smaller wave sizes
	FCachedVertex TriangleVerts[3];
	FNaniteTransformedVert CachedTransformedVerts[2];

	// TODO: DXC doesn't manage to strip all the unused groupshared arrays, which is very bad for performance.
	// When manually stripped, the groupshared version is faster, so we should revisit once this has been fixed.
	const bool bGroupsharedCache = !COMPILER_DXC;

	uint NumCachedVerts = 0;
	for( uint FirstTriIndex = 0; FirstTriIndex < TriRange.Num; FirstTriIndex += 32 )
	{
		const uint LocalTriIndex = FirstTriIndex + GroupThreadIndex;
		const uint TriIndex = TriRange.Start + LocalTriIndex;
		const bool bTriValid = LocalTriIndex < TriRange.Num;

		uint3 VertIndexes = 0;
		if( bTriValid )
		{
			VertIndexes = DecodeTriangleIndices(Cluster, TriIndex);
			if( bReverseWindingOrder )
				VertIndexes.yz = VertIndexes.zy;
		}

		// First pass: fetch corners that are already resident in the rolling cache.
		UNROLL
		for( uint k = 0; k < 3; k++ )
		{
			const uint Index = VertIndexes[k];

			BRANCH
			if( bGroupsharedCache )
			{
				TriangleVerts[k] = LoadVertexFromLDS( Index );
			}
			else
			{
				// Wave-shuffle variant: the last two 32-vertex batches live in registers.
				const FNaniteTransformedVert A = WaveReadLaneAt( CachedTransformedVerts[0], Index & 31 );
				const FNaniteTransformedVert B = WaveReadLaneAt( CachedTransformedVerts[1], Index & 31 );

				FCachedVertex Vert;
				if( (Index - NumCachedVerts ) & 32 )
					Vert.TransformedVert = A;
				else
					Vert.TransformedVert = B;
				Vert.PointSubpixelClip = VertexCache_PointSubpixelClip[Index & 63];

				TriangleVerts[k] = Vert;
			}
		}

		// Refill the cache until every referenced vertex index has been produced.
		const uint MaxVertIndex = max( VertIndexes.y, VertIndexes.z );
		while( WaveActiveAnyTrue( MaxVertIndex >= NumCachedVerts ) )
		{
			// Transform and store next batch of vertices
			{
				const uint LaneVertIndex = NumCachedVerts + GroupThreadIndex;

				FCachedVertex Vert;

				BRANCH
				if( LaneVertIndex < Cluster.NumVerts ) // Ideally, we would be testing against the number of verts for the range, not the whole cluster.
				{
					Vert.TransformedVert = FetchTransformedNaniteVertex( PrimitiveData, InstanceData, GetInstanceViewData(InstanceData.InstanceId, NaniteView.SceneRendererPrimaryViewId), MaterialShader.VertTransforms, Cluster, VisibleCluster, LaneVertIndex, bEvaluateWPO );
					Vert.PointSubpixelClip = CalculateSubpixelCoordinates( Raster, Vert.TransformedVert.PointClip );
				}

				GroupMemoryBarrierWithGroupSync();

				BRANCH
				if( bGroupsharedCache )
				{
					StoreVertexToLDS( LaneVertIndex, Vert );
				}
				else
				{
					CachedTransformedVerts[1] = CachedTransformedVerts[0];
					CachedTransformedVerts[0] = Vert.TransformedVert;
					VertexCache_PointSubpixelClip[LaneVertIndex & 63] = Vert.PointSubpixelClip;
				}

				GroupMemoryBarrierWithGroupSync();
			}

			// Second pass: pick up corners that only just became available.
			UNROLL
			for( uint k = 0; k < 3; k++ )
			{
				const uint Index = VertIndexes[k];

				FCachedVertex Vert;
				if( bGroupsharedCache )
				{
					Vert = LoadVertexFromLDS( Index );
				}
				else
				{
					Vert.TransformedVert = WaveReadLaneAt( CachedTransformedVerts[0], Index & 31 ); // After refill any new vertex will be in CachedVertex[0]
					Vert.PointSubpixelClip = VertexCache_PointSubpixelClip[Index & 63];
				}

				if( Index >= NumCachedVerts )
					TriangleVerts[k] = Vert;
			}

			NumCachedVerts += 32;
		}

		float4 Verts[3];

		UNROLL
		for( uint k = 0; k < 3; k++ )
		{
			MaterialShader.TransformedTri.Verts[k] = TriangleVerts[k].TransformedVert;
			Verts[k] = TriangleVerts[k].PointSubpixelClip;
		}

		FRasterTri Tri = SetupTriangle< NANITE_SUBPIXEL_SAMPLES, !NANITE_TWO_SIDED >( Raster.ScissorRect, Verts );

		if( Tri.bIsValid && bTriValid )
		{
			uint PixelValue = (VisibleIndex + 1) << 7;
			PixelValue |= TriIndex;

			uint2 VisualizeValues = GetVisualizeValues();

#if VIRTUAL_TEXTURE_TARGET && NANITE_LATE_VSM_PAGE_TRANSLATION
			if (!Raster.bSinglePage)
			{
				// @lh-todo: Explicitly initialize structs with empty struct fields until DXC/SPIR-V can handle it properly
				TNaniteWritePixel< FMaterialShader, FCachedPageTable > NaniteWritePixel;
				NaniteWritePixel.Raster = Raster;
				NaniteWritePixel.Shader = MaterialShader;
				NaniteWritePixel.PixelValue = PixelValue;
				NaniteWritePixel.VisualizeValues = VisualizeValues;

				RasterizeTri_Adaptive( Tri, NaniteWritePixel );
			}
			else
#endif
			{
				// @lh-todo: Explicitly initialize structs with empty struct fields until DXC/SPIR-V can handle it properly
				TNaniteWritePixel< FMaterialShader > NaniteWritePixel;
				NaniteWritePixel.Raster = Raster;
				NaniteWritePixel.Shader = MaterialShader;
				NaniteWritePixel.PixelValue = PixelValue;
				NaniteWritePixel.VisualizeValues = VisualizeValues;

				RasterizeTri_Adaptive( Tri, NaniteWritePixel );
			}
		}
	}
#else
	// Fixed-function path: positions only, cached in GroupVerts.
	UNROLL
	for( uint i = 0; i < VERTEX_CACHE_SIZE; i += THREADGROUP_SIZE )
	{
		const uint VertIndex = GroupThreadIndex + i;

		BRANCH
		if (VertIndex >= Cluster.NumVerts)
			break;

		// Transform vertex and store in group shared memory.
		FNanitePostDeformVertex InputVert = FetchAndDeformLocalNaniteVertex(PrimitiveData, InstanceData, GetInstanceViewData(InstanceData.InstanceId, NaniteView.SceneRendererPrimaryViewId), Cluster, VisibleCluster, VertIndex, NANITE_NUM_TEXCOORDS_TO_DECODE);

#if MATERIAL_SHADER_HAS_DISPLACEMENT
		MaterialShader.ApplyFallbackDisplacement(InputVert);
#endif

		float3 WorldPositionOffset = 0.0f;
#if NANITE_VERTEX_PROGRAMMABLE
		BRANCH
		if (bEvaluateWPO)
		{
			MaterialShader.InitVertexParameters(InputVert);
			WorldPositionOffset = MaterialShader.EvaluateWorldPositionOffset();
		}
#endif

		const float3 PointTranslatedWorld = mul( float4( InputVert.Position, 1 ), InstanceDynamicData.LocalToTranslatedWorld ).xyz + WorldPositionOffset;
		const float4 PointClip = mul( float4( PointTranslatedWorld, 1 ), NaniteView.TranslatedWorldToClip );

		GroupVerts[VertIndex] = CalculateSubpixelCoordinates( Raster, PointClip ).xyz;
	}

	GroupMemoryBarrierWithGroupSync();

	UNROLL
	for( uint j = 0; j < NANITE_MAX_CLUSTER_TRIANGLES; j += THREADGROUP_SIZE )
	{
		const uint ThreadIndex = GroupThreadIndex + j;
		const uint TriIndex = ThreadIndex + TriRange.Start;

		uint3 VertIndexes = DecodeTriangleIndices(Cluster, TriIndex);
		if( bReverseWindingOrder )
			VertIndexes.yz = VertIndexes.zy;

		float4 Verts[3];
		Verts[0] = float4( GroupVerts[ VertIndexes.x ], 1 );
		Verts[1] = float4( GroupVerts[ VertIndexes.y ], 1 );
		Verts[2] = float4( GroupVerts[ VertIndexes.z ], 1 );

		BRANCH
		if (ThreadIndex >= TriRange.Num)
			break;

		FRasterTri Tri = SetupTriangle< NANITE_SUBPIXEL_SAMPLES, !NANITE_TWO_SIDED >( Raster.ScissorRect, Verts );

		if( Tri.bIsValid )
		{
			uint PixelValue = (VisibleIndex + 1) << 7;
			PixelValue |= TriIndex;

			uint2 VisualizeValues = GetVisualizeValues();

#if VIRTUAL_TEXTURE_TARGET && NANITE_LATE_VSM_PAGE_TRANSLATION
			if (!Raster.bSinglePage)
			{
				// @lh-todo: Explicitly initialize structs with empty struct fields until DXC/SPIR-V can handle it properly
				TNaniteWritePixel< FMaterialShader, FCachedPageTable > NaniteWritePixel;
				NaniteWritePixel.Raster = Raster;
				NaniteWritePixel.Shader = MaterialShader;
				NaniteWritePixel.PixelValue = PixelValue;
				NaniteWritePixel.VisualizeValues = VisualizeValues;

				RasterizeTri_Adaptive( Tri, NaniteWritePixel );
			}
			else
#endif
			{
				// @lh-todo: Explicitly initialize structs with empty struct fields until DXC/SPIR-V can handle it properly
				TNaniteWritePixel< FMaterialShader > NaniteWritePixel;
				NaniteWritePixel.Raster = Raster;
				NaniteWritePixel.Shader = MaterialShader;
				NaniteWritePixel.PixelValue = PixelValue;
				NaniteWritePixel.VisualizeValues = VisualizeValues;

				RasterizeTri_Adaptive( Tri, NaniteWritePixel );
			}
		}
	}
#endif
}

// Rasterizes tessellated split patches. Setup runs three lanes per patch (one per patch
// corner); rasterization then loops over the patches with the whole group cooperating.
void PatchRasterize( uint GroupID, uint GroupThreadIndex )
{
#if NANITE_TESSELLATION
	if(GroupThreadIndex >= WaveGetLaneCount()) // Workaround for wave sizes smaller than 32
	{
		return;
	}
	const uint ThreadGroupSize = min(THREADGROUP_SIZE, WaveGetLaneCount());

	const uint TotalPatches = RasterBinMeta[GetRasterBin()].BinSWCount;
	const uint PatchStartIndex = min(GroupID * MaxPatchesPerGroup, TotalPatches);
	const uint PatchEndIndex = min(PatchStartIndex + MaxPatchesPerGroup, TotalPatches);
	const uint NumPatches = PatchEndIndex - PatchStartIndex;

	// Stuff that gets calculated during the patch setup phase
	uint4 Patches_EncodedPatch;
	bool Patches_bReverseWindingOrders;
	FInstanceSceneData Patches_InstanceData;
	FInstanceDynamicData Patches_InstanceDynamicData;
	FSplitPatch Patches_SplitPatch;
	FTessellatedPatch Patches_TessellatedPatch;
	FNaniteVertTransforms Patches_VertTransforms;
	FNaniteTransformedVert Patches_Verts;
	float4 Patches_UVDensities;

	// Patch setup phase: three lanes per patch, one per patch corner.
	if (GroupThreadIndex < NumPatches * 3u)
	{
		const uint LocalPatchIndex = GroupThreadIndex / 3u;
		const uint PatchCornerIndex = GroupThreadIndex - LocalPatchIndex * 3u;
		const uint PatchIndex = PatchStartIndex + LocalPatchIndex;
		const uint PatchStartLane = LocalPatchIndex * 3;

		const uint4 RasterBin = FetchSWRasterBin(PatchIndex);
		const uint VisibleIndex = RasterBin.x;

#if NANITE_TESSELLATION_PATCH_REFS
		const uint2 VisiblePatch = VisiblePatches.Load2(VisibleIndex * 8);
		Patches_EncodedPatch = SplitWorkQueue.DataBuffer_Load4(VisiblePatch.x * 16);
#else
		Patches_EncodedPatch = VisiblePatches.Load4(VisibleIndex * 16);
#endif
		Patches_SplitPatch.Decode(Patches_EncodedPatch);

		const FVisibleCluster VisibleCluster = GetVisibleCluster(Patches_SplitPatch.VisibleClusterIndex, VIRTUAL_TEXTURE_TARGET);
		const FCluster Cluster = GetCluster(VisibleCluster.PageIndex, VisibleCluster.ClusterIndex);

		FPrimitiveSceneData PrimitiveData;
		GetNaniteMaterialSceneData(VisibleCluster, PrimitiveData, Patches_InstanceData);

		const FNaniteView NaniteView = GetNaniteView(VisibleCluster.ViewId);

		float LowTessDistance = 0.0f;
		// NOTE(review): this checks USE_DISPLACEMENT while ClusterRasterize checks USES_DISPLACEMENT —
		// confirm the difference is intentional and not a typo that disables this path.
#if USE_DISPLACEMENT
		LowTessDistance = CalcDisplacementLowTessDistance(PrimitiveData, Patches_InstanceData, NaniteView);
#endif

#if NANITE_VERTEX_PROGRAMMABLE || NANITE_PIXEL_PROGRAMMABLE
		ResolvedView = ResolveView(NaniteView);
#endif

		Patches_InstanceDynamicData = CalculateInstanceDynamicData(NaniteView, Patches_InstanceData);
		Patches_bReverseWindingOrders = ReverseWindingOrder(NaniteView, PrimitiveData, Patches_InstanceData);

#if NANITE_VERTEX_PROGRAMMABLE || NANITE_PIXEL_PROGRAMMABLE
		Patches_VertTransforms = CalculateNaniteVertexTransforms(Patches_InstanceData, Patches_InstanceDynamicData, NaniteView);
#endif

#if ALWAYS_EVALUATE_WORLD_POSITION_OFFSET
		const bool bEvaluateWPO = true;
#else
		const bool bEvaluateWPO = (VisibleCluster.Flags & NANITE_CULLING_FLAG_ENABLE_WPO) != 0;
#endif

		// Each of the three lanes transforms its own corner of the base triangle.
		const uint3 VertIndexes = DecodeTriangleIndices(Cluster, Patches_SplitPatch.TriIndex);
		Patches_Verts = FetchTransformedNaniteVertex(PrimitiveData, Patches_InstanceData, GetInstanceViewData(Patches_InstanceData.InstanceId, NaniteView.SceneRendererPrimaryViewId), Patches_VertTransforms, Cluster, VisibleCluster, VertIndexes[PatchCornerIndex], bEvaluateWPO);

		Patches_UVDensities = GetMaterialUVDensities(Cluster, Patches_InstanceData.PrimitiveId, Patches_SplitPatch.TriIndex);

#if NANITE_TESSELLATION_PATCH_REFS
		Patches_TessellatedPatch.Init(VisiblePatch.y, false);
#else
		// Interpolate the split patch's corner from the outer triangle corners using its barycentrics.
		const float3 OuterPatchCornersView = mul(float4(Patches_Verts.PointWorld, 1), NaniteView.TranslatedWorldToView).xyz;
		const float3 InnerPatchCornersView =
			WaveReadLaneAt(OuterPatchCornersView, PatchStartLane + 0) * Patches_SplitPatch.Barycentrics[PatchCornerIndex].x +
			WaveReadLaneAt(OuterPatchCornersView, PatchStartLane + 1) * Patches_SplitPatch.Barycentrics[PatchCornerIndex].y +
			WaveReadLaneAt(OuterPatchCornersView, PatchStartLane + 2) * Patches_SplitPatch.Barycentrics[PatchCornerIndex].z;

		float3 CornersView[3];
		CornersView[0] = WaveReadLaneAt(InnerPatchCornersView, PatchStartLane + 0);
		CornersView[1] = WaveReadLaneAt(InnerPatchCornersView, PatchStartLane + 1);
		CornersView[2] = WaveReadLaneAt(InnerPatchCornersView, PatchStartLane + 2);

		const float3 TessFactors = GetTessFactors(NaniteView, CornersView, LowTessDistance);
		Patches_TessellatedPatch.Init(TessFactors, Patches_EncodedPatch.yzw, false);
		Patches_SplitPatch.Decode(Patches_EncodedPatch);
#endif
	}

	for (uint i = 0; i < NumPatches; i++)
	{
		const uint PatchStartLane = i * 3;

		// Read values from patch setup
		const bool bReverseWindingOrder = WaveReadLaneAt(Patches_bReverseWindingOrders, PatchStartLane);
		const FSplitPatch SplitPatch = WaveReadLaneAt(Patches_SplitPatch, PatchStartLane);
const FTessellatedPatch TessellatedPatch = WaveReadLaneAt(Patches_TessellatedPatch, PatchStartLane); const float4 UVDensities = WaveReadLaneAt(Patches_UVDensities, PatchStartLane); // The following values can be used in a shader, but will most likely be dead code eliminated const FInstanceSceneData InstanceData = WaveReadLaneAt(Patches_InstanceData, PatchStartLane); const FInstanceDynamicData InstanceDynamicData = WaveReadLaneAt(Patches_InstanceDynamicData, PatchStartLane); const FNaniteVertTransforms VertTransforms = WaveReadLaneAt(Patches_VertTransforms, PatchStartLane); #if VISUALIZE const uint4 PatchEncoded = WaveReadLaneAt(Patches_EncodedPatch, PatchStartLane); #endif const FVisibleCluster VisibleCluster = GetVisibleCluster(SplitPatch.VisibleClusterIndex, VIRTUAL_TEXTURE_TARGET); const FCluster Cluster = GetCluster(VisibleCluster.PageIndex, VisibleCluster.ClusterIndex); const FNaniteView NaniteView = GetNaniteView(VisibleCluster.ViewId); #if NANITE_VERTEX_PROGRAMMABLE || NANITE_PIXEL_PROGRAMMABLE ResolvedView = ResolveView(NaniteView); #endif FMaterialShader MaterialShader; MaterialShader.PrimitiveData = GetPrimitiveData(InstanceData.PrimitiveId); MaterialShader.InstanceData = InstanceData; MaterialShader.InstanceDynamicData = InstanceDynamicData; MaterialShader.NaniteView = NaniteView; MaterialShader.Cluster = Cluster; MaterialShader.VisibleCluster = VisibleCluster; MaterialShader.VertTransforms = VertTransforms; MaterialShader.TransformedTri = MakeTransformedNaniteTriangle(Patches_Verts, PatchStartLane + uint3(0, 1, 2)); #if MATERIAL_SHADER_HAS_DISPLACEMENT MaterialShader.InitDisplacement(RasterBinMeta[GetRasterBin()].MaterialDisplacementParams); #endif uint PixelValue = (SplitPatch.VisibleClusterIndex + 1) << 7; uint NumVerts = TessellatedPatch.GetNumVerts(); uint NumTris = TessellatedPatch.GetNumTris(); FRaster Raster = CreateRaster( NaniteView, VisibleCluster ); GroupMemoryBarrierWithGroupSync(); #if VIRTUAL_TEXTURE_TARGET && 
NANITE_LATE_VSM_PAGE_TRANSLATION if (!Raster.bSinglePage) { for (uint Offset = 0; Offset < NANITE_VSM_PAGE_TABLE_CACHE_DIM * NANITE_VSM_PAGE_TABLE_CACHE_DIM; Offset += ThreadGroupSize) { FetchAndCachePageTableEntry(NaniteView, VisibleCluster.vPage, VisibleCluster.vPageEnd, Offset + GroupThreadIndex); } GroupMemoryBarrierWithGroupSync(); } #endif for( uint VertIndex = GroupThreadIndex; VertIndex < NumVerts; VertIndex += ThreadGroupSize ) { FBarycentrics Barycentrics; Barycentrics.Value = TessellatedPatch.GetVert( VertIndex ); Barycentrics.Value_dx = 0;//float3( -1, 1, 0 ) / TessFactors.x; Barycentrics.Value_dy = 0;//float3( 0, -1, 1 ) / TessFactors.y; Barycentrics = SplitPatch.TransformBarycentrics( Barycentrics ); GroupVerts[ VertIndex ] = CalculateSubpixelCoordinates( Raster, MaterialShader.EvaluateDomain( UVDensities, Barycentrics ) ).xyz; } GroupMemoryBarrierWithGroupSync(); for( uint TriIndex = GroupThreadIndex; TriIndex < NumTris; TriIndex += ThreadGroupSize ) { uint3 VertIndexes = TessellatedPatch.GetIndexes( TriIndex ); if( bReverseWindingOrder ) VertIndexes.yz = VertIndexes.zy; float4 Verts[3]; Verts[0] = float4( GroupVerts[ VertIndexes.x ], 1 ); Verts[1] = float4( GroupVerts[ VertIndexes.y ], 1 ); Verts[2] = float4( GroupVerts[ VertIndexes.z ], 1 ); FRasterTri Tri = SetupTriangle< NANITE_SUBPIXEL_SAMPLES, !NANITE_TWO_SIDED >( Raster.ScissorRect, Verts ); if( max3( Verts[0].z, Verts[1].z, Verts[2].z ) > 1 ) Tri.bIsValid = false; if( Tri.bIsValid ) { #if VISUALIZE const uint SubPatch = (Rand3DPCG32(PatchEncoded.yzw).x & 0xff0000u) >> 16u; const uint MicroTri = TriIndex & 0xffu; const uint2 VisualizeValues = GetVisualizeValues(1u /* AddValue */, SubPatch, MicroTri); #else const uint2 VisualizeValues = uint2(0, 0); #endif RasterizeDicedTri( Tri, Raster, MaterialShader, PixelValue | SplitPatch.TriIndex, VisualizeValues ); } } } #endif } #if NANITE_VOXELS #include "Voxel/Voxel.ush" bool IntersectBox(float3 RayOrigin, float3 RayDir, float3 BoxCenter, float3 
BoxHalfSize, inout float OutIntersectionTime)
{
	// Slab test: intersect the ray against the three axis-aligned slab pairs of the box.
	// NOTE(review): RayDir components of 0 produce +/-inf via rcp(); callers clamp the
	// direction away from zero first (see the Epsilon select in ProcessBrickPixel) — confirm for any new caller.
	float3 InvDir = rcp(RayDir);
	float3 LocalBoxCenter = BoxCenter - RayOrigin;	// Box center relative to the ray origin
	float3 PlaneIntersect0 = (LocalBoxCenter - BoxHalfSize) * InvDir;	// Times at the min-side planes
	float3 PlaneIntersect1 = (LocalBoxCenter + BoxHalfSize) * InvDir;	// Times at the max-side planes
	float3 MinIntersection = min(PlaneIntersect0, PlaneIntersect1);
	float3 MaxIntersection = max(PlaneIntersect0, PlaneIntersect1);
	// Latest entry and earliest exit across the three slabs
	float MaxMin = max3(MinIntersection.x, MinIntersection.y, MinIntersection.z);
	float MinMax = min3(MaxIntersection.x, MaxIntersection.y, MaxIntersection.z);
	// Entry time is written even on a miss; only meaningful when the function returns true.
	// NOTE(review): there is no MinMax >= 0 test, so a box entirely behind the ray origin
	// still reports an intersection (with negative time) — confirm callers cull that case.
	OutIntersectionTime = MaxMin;
	return MaxMin < MinMax;
}

// Writes a single visibility-buffer pixel (payload value + depth), routing the position
// through the cached VSM page table when rasterizing to a virtual texture target.
void PlotPixel(FRaster Raster, int2 PixelCoord, uint PixelValue, float DeviceZ)
{
	FVisBufferPixel Pixel = CreateVisBufferPixel(PixelCoord, PixelValue, DeviceZ);
#if VISUALIZE
	Pixel.VisualizeValues = GetVisualizeValues();
#endif
#if VIRTUAL_TEXTURE_TARGET && NANITE_LATE_VSM_PAGE_TRANSLATION
	Pixel.PhysicalPosition.xy = Pixel.Position;
	Pixel.PhysicalPosition.z = Raster.ArrayIndex;
	if (!Raster.bSinglePage)
	{
		FCachedPageTable PageTranslation;
		// Pixel falls on an unmapped page: nothing to write.
		if (!PageTranslation(Pixel))
		{
			return;
		}
	}
#endif
	Pixel.WriteOverdraw();
	Pixel.Write();
}

// Builds a ray through the given SV_Position, expressed in the instance's local space.
// Ortho views unproject the pixel at both depth planes; perspective views start at the camera origin.
FRay GetLocalRay( FNaniteView NaniteView, FInstanceSceneData InstanceData, float4 SvPosition, bool bIsOrtho )
{
	FDFVector3 RayWorldOrigin;
	float3 RayWorldDirection;
#if 1
	if( bIsOrtho )
	{
		// Unproject at z=1 (named near) and z=0 (named far) — reversed-Z convention, per the variable names.
		float3 NearPoint = mul( float4( SvPosition.xy, 1, 1 ), NaniteView.SVPositionToTranslatedWorld ).xyz;
		float3 FarPoint = mul( float4( SvPosition.xy, 0, 1 ), NaniteView.SVPositionToTranslatedWorld ).xyz;
		RayWorldOrigin = DFFastSubtract( NearPoint, NaniteView.PreViewTranslation );
		RayWorldDirection = FarPoint - NearPoint;
	}
	else
	{
		RayWorldOrigin = NaniteView.WorldCameraOrigin;
		RayWorldDirection = mul( float4( SvPosition.xy, 0, 1 ), NaniteView.SVPositionToTranslatedWorld ).xyz;
	}
#else
	// Disabled general path: unproject both planes with a full perspective divide.
	float4 NearPoint = mul( float4( SvPosition.xy, 1, 1 ), NaniteView.SVPositionToTranslatedWorld );
	float4 FarPoint = mul( float4( SvPosition.xy, 0, 1 ), NaniteView.SVPositionToTranslatedWorld );
	RayWorldOrigin = DFFastSubtract( NearPoint.xyz / NearPoint.w, NaniteView.PreViewTranslation );
	RayWorldDirection = normalize( NearPoint.w * FarPoint.xyz - FarPoint.w * NearPoint.xyz );
#endif

	// Transform the world-space ray into the instance's local space.
	FRay RayLocal;
	RayLocal.Origin = DFMultiplyDemote( RayWorldOrigin, InstanceData.WorldToLocal );
	RayLocal.Direction = DFMultiplyVector( RayWorldDirection, InstanceData.WorldToLocal );
	RayLocal.Time[0] = 0;	// TODO NaniteView.NearPlane
	RayLocal.Time[1] = 1e24;
	return RayLocal;
}

// Traces one screen pixel's ray through a voxel brick and writes the visibility buffer on a hit.
// ReverseBrickBits: the brick's 64-bit occupancy mask (stored bit-reversed — presumably so a
// left shift exposes the current voxel's bit in the sign bit; see the DDA loop below).
// LocalVoxelBoundsExtent: the brick's half-size in voxel units.
void ProcessBrickPixel(
	FNaniteView NaniteView,
	FRaster Raster,
	FInstanceSceneData InstanceData,
	FRay Ray,
	bool bIsOrtho,
	int2 PixelPos,
	uint PixelValue,
	uint2 ReverseBrickBits,
	float3 LocalVoxelBoundsExtent,
	float VoxelSize,
	float RcpVoxelSize,
	float Bias )
{
	FVisBufferPixel Pixel = CreateVisBufferPixel( PixelPos, PixelValue, 0.0f /*Unused*/ );
#if VIRTUAL_TEXTURE_TARGET
	Pixel.PhysicalPosition.xy = Pixel.Position;
	Pixel.PhysicalPosition.z = Raster.ArrayIndex;
	if (!Raster.bSinglePage)
	{
		FCachedPageTable PageTranslation;
		// NOTE(review): the translation result is ignored here, while PlotPixel early-outs
		// when it fails — confirm an unmapped page is benign on this path.
		PageTranslation(Pixel);
	}
#endif
	Ray.Time = float2(0, 1e24f);	// TODO NaniteView.NearPlane

	// Keep direction components away from zero so the slab intersection's rcp() stays finite.
	const float Epsilon = 1e-8;
#if 1
	Ray.Direction = select( abs( Ray.Direction ) < Epsilon, Epsilon, Ray.Direction );
#elif 0
	// Alternative: clamp the magnitude while preserving the sign bit.
	Ray.Direction = asfloat( asuint( max( abs( Ray.Direction ), Epsilon ) ) | ( asuint( Ray.Direction ) & 0x80000000u ) );	// v_max, v_and_or
#elif 0
	float3 Replacement = select( Ray.Direction > 0, Epsilon, -Epsilon );
	Ray.Direction = select( abs( Ray.Direction ) < Epsilon, Replacement, Ray.Direction );
#endif

	// Clip the ray to the brick bounds (box centered at its extent, i.e. corner at the origin).
	Ray.Time = Intersect( Ray, LocalVoxelBoundsExtent, LocalVoxelBoundsExtent );
#if VISUALIZE
	Pixel.VisualizeValues = GetVisualizeValues();
#endif
	if( Ray.Time[0] < Ray.Time[1] )
	{
#if 1
		// Pull the interval slightly inwards to avoid grazing the brick faces.
		Ray.Time += float2( Bias, -Bias );
#elif 0
		Ray.Time = float2( lerp( Ray.Time[0], Ray.Time[1], 0.05 ), lerp( Ray.Time[1], Ray.Time[0], 0.05 ) );
#elif 0
		if( bIsOrtho )
			Ray.Time += float2(1e-7, -1e-7);
		else
			Ray.Time += float2(5e-4, -5e-4);
#endif
		FDDA DDA = InitDDA( Ray );
		StartDDA( DDA, 1, Ray );
		// Shifting the (bit-reversed) 64-bit occupancy mask left by the DDA voxel index
		// places the current voxel's occupancy bit in the sign bit.
		const UlongType ReverseVoxelMask64 = PackUlongType( ReverseBrickBits );

		int Hit = 0;	// Negative means hit

		// Step the DDA up to 3*3+1 voxels — the longest straight walk through what appears
		// to be a 4x4x4 (64-voxel) brick, given the 64-bit mask. TODO(review): confirm brick dims.
		UNROLL
		for( uint Tests = 0; Tests < 3*3 + 1; Tests++ )
		{
#if COMPILER_SUPPORTS_ULONG_TYPES
			Hit = UnpackUlongType( ReverseVoxelMask64 << DDA.VoxelIndex ).y;
#else
			// Emulated 64-bit shift: pick the half holding the voxel's bit, shift within it.
			Hit = ( DDA.VoxelIndex < 32 ? ReverseBrickBits.y : ReverseBrickBits.x ) << ( DDA.VoxelIndex & 31 );
#endif
			BRANCH
			if( Hit < 0 )
				break;

			StepDDA( DDA, 1 );

			// Walked out of the clipped ray interval: miss.
			BRANCH
			if (DDA.Time[0] >= DDA.Time[1])
				break;
		}

		if( Hit < 0 )
		{
			// Report depth at the midpoint of the occupied voxel crossing.
			DDA.Time[0] = 0.5 * ( DDA.Time[0] + NextTime( DDA ) );

			if( bIsOrtho )
				Pixel.Depth = 1 - DDA.Time[0];
			else
				Pixel.Depth = NaniteView.ViewToClip[3][2] / DDA.Time[0] + NaniteView.ViewToClip[2][2];

			Pixel.WriteOverdraw();
			Pixel.Write();
		}
	}
}

// LDS storage for the brick-pixel work redistribution queue (64-entry ring buffer).
groupshared uint GroupWorkEnd[32];
groupshared uint3 GroupBrickData[32];
groupshared uint GroupSourceLaneAndPixelPos[64];

// Drains one batch of up to 32 queued brick pixels: each lane reads a queue entry,
// reconstructs that pixel's ray from the queuing (source) lane's per-brick values via
// wave reads, and traces it with ProcessBrickPixel.
void ProcessBrickPixelBatchFromQueue(
	inout int QueueNumElements,
	inout uint QueueReadOffset,
	// Uniform inputs
	bool bIsOrtho,
	FNaniteView NaniteView,
	FRaster Raster,
	FInstanceSceneData InstanceData,
	FCluster Cluster,
	uint VisibleIndex,
	float VoxelSize,
	float RcpVoxelSize,
	float Bias,
	// Uniform or variable depending on mode
	float3 RayDirection,
	float3 RayDirection_dx,
	float3 RayDirection_dy,
	float3 RayOrigin,
	float3 RayOrigin_dx,
	float3 RayOrigin_dy,
	// Variable inputs
	uint2 ReverseBrickBits,
	uint BrickMax_VertIndex,
	float CenterPixelClipW,
	uint GroupThreadIndex )
{
	// Ring-buffer read; queue capacity is 64 entries.
	const uint ReadIndex = ( QueueReadOffset + GroupThreadIndex ) & 63;
	const uint PackedSourceLaneAndPixelPos = GroupSourceLaneAndPixelPos[ ReadIndex ];
	// Entry packing: bits [0,5) source lane, [5,19) pixel x (14b), [19,32) pixel y (13b).
	const uint SourceLane = PackedSourceLaneAndPixelPos & 31u;
	const int2 PixelPos = int2( BitFieldExtractU32( PackedSourceLaneAndPixelPos, 14, 5 ), BitFieldExtractU32( PackedSourceLaneAndPixelPos, 13, 19 ) );

	// Fetch per-brick values from the lane that queued this pixel.
	const float3 SourceRayOrigin = WaveReadLaneAt( RayOrigin, SourceLane );
	const uint2 SourceReverseBrickBits = WaveReadLaneAt( ReverseBrickBits, SourceLane );
	const uint SourceBrickMax_VertIndex = WaveReadLaneAt(
BrickMax_VertIndex, SourceLane );
	// BrickMax_VertIndex packs the brick extents (3 x 8 bits) with the vert index in the top 8 bits.
	const float3 SourceHalfBrickMax = float3( BitFieldExtractU32( SourceBrickMax_VertIndex, 8, 0 ), BitFieldExtractU32( SourceBrickMax_VertIndex, 8, 8 ), BitFieldExtractU32( SourceBrickMax_VertIndex, 8, 16 ) ) * 0.5f;
	const uint SourceVertIndex = SourceBrickMax_VertIndex >> 24;
	// Visibility-buffer payload: (VisibleIndex+1) shifted up 7 bits, OR'd with the vert index.
	const uint SourcePixelValue = ( ( VisibleIndex + 1 ) << 7 ) | SourceVertIndex;

	// Rebuild this pixel's ray. Which inputs are per-lane vs uniform depends on the
	// projection mode and NANITE_PER_VOXEL_BRICK_SKINNING (see the parameter comments).
	FRay Ray;
	BRANCH
	if( CONSTANT_DIR || bIsOrtho )
	{
		// Ortho / constant-dir: direction is fixed per brick, origin varies per pixel.
#if NANITE_PER_VOXEL_BRICK_SKINNING
		const float3 SourceRayDirection = WaveReadLaneAt( RayDirection, SourceLane );
		const float3 SourceRayOrigin_dx = WaveReadLaneAt( RayOrigin_dx, SourceLane );
		const float3 SourceRayOrigin_dy = WaveReadLaneAt( RayOrigin_dy, SourceLane );
		Ray.Origin = SourceRayOrigin + PixelPos.x * SourceRayOrigin_dx + PixelPos.y * SourceRayOrigin_dy;
		Ray.Direction = SourceRayDirection;
#elif CONSTANT_DIR
		const float3 SourceRayDirection = WaveReadLaneAt( RayDirection, SourceLane );
		const float SourceCenterPixelClipW = WaveReadLaneAt( CenterPixelClipW, SourceLane );
		// Per-pixel origin step is scaled by the brick center's clip W (set up in the CONSTANT_DIR path).
		Ray.Origin = SourceRayOrigin + ( PixelPos.x * SourceCenterPixelClipW ) * RayDirection_dx + ( PixelPos.y * SourceCenterPixelClipW ) * RayDirection_dy;
		Ray.Direction = SourceRayDirection;
#else
		Ray.Origin = SourceRayOrigin + PixelPos.x * RayOrigin_dx + PixelPos.y * RayOrigin_dy;
		Ray.Direction = RayDirection;
#endif
	}
	else
	{
		// Perspective: origin is fixed per brick, direction varies per pixel.
		Ray.Origin = SourceRayOrigin;
#if NANITE_PER_VOXEL_BRICK_SKINNING
		const float3 SourceRayDirection = WaveReadLaneAt( RayDirection, SourceLane );
		const float3 SourceRayDirection_dx = WaveReadLaneAt( RayDirection_dx, SourceLane );
		const float3 SourceRayDirection_dy = WaveReadLaneAt( RayDirection_dy, SourceLane );
		Ray.Direction = SourceRayDirection + SourceRayDirection_dx * PixelPos.x + SourceRayDirection_dy * PixelPos.y;
#else
		Ray.Direction = RayDirection + RayDirection_dx * PixelPos.x + RayDirection_dy * PixelPos.y;
#endif
	}

	// Only lanes holding valid queue entries trace; all lanes participated in the wave reads above.
	if( GroupThreadIndex < QueueNumElements )
	{
		ProcessBrickPixel(NaniteView, Raster, InstanceData, Ray, bIsOrtho, PixelPos,
SourcePixelValue, SourceReverseBrickBits, SourceHalfBrickMax, VoxelSize, RcpVoxelSize, Bias );
	}

	// One batch of (up to) 32 entries consumed.
	QueueNumElements -= 32;
	QueueReadOffset += 32;
}

// Early depth/page test for a candidate pixel before it is queued for brick tracing.
// Returns false when the pixel lands on an unmapped VSM page or fails the early depth test.
bool OcclusionTestPixel( FRaster Raster, int2 PixelPos, float Depth )
{
	FVisBufferPixel Pixel = CreateVisBufferPixel( PixelPos, 0, Depth );
	bool bActive = true;
#if VIRTUAL_TEXTURE_TARGET
	Pixel.PhysicalPosition.xy = Pixel.Position;
	Pixel.PhysicalPosition.z = Raster.ArrayIndex;
	if( !Raster.bSinglePage )
	{
		FCachedPageTable PageTranslation;
		if( !PageTranslation( Pixel ) )
			bActive = false;
	}
#endif
	if( bActive )
		bActive = Pixel.EarlyDepthTest();
	return bActive;
}

// Ray traces all voxel bricks of one visible cluster: one brick per lane, with the resulting
// per-pixel traces optionally redistributed across the wave (BRICK_TRACE_WORK_REDISTRIBUTION).
void ClusterTraceBricks( uint VisibleIndex, uint GroupThreadIndex )
{
	GetIndexAndTriRangeSW( VisibleIndex );

	FVisibleCluster VisibleCluster = GetVisibleCluster( VisibleIndex, VIRTUAL_TEXTURE_TARGET );
	FInstanceSceneData InstanceData = GetInstanceSceneDataUnchecked( VisibleCluster.InstanceId );
	FNaniteView NaniteView = GetNaniteView( VisibleCluster.ViewId );
	FCluster Cluster = GetCluster( VisibleCluster.PageIndex, VisibleCluster.ClusterIndex );
	FRaster Raster = CreateRaster( NaniteView, VisibleCluster );
	FInstanceDynamicData InstanceDynamicData = CalculateInstanceDynamicData( NaniteView, InstanceData );

	// Ray through the center of pixel (0,0); per-pixel deltas are applied later.
	float4 SvPositionStart = float4( 0.5, 0.5, 0, 1 );
#if VIRTUAL_TEXTURE_TARGET
	SvPositionStart.xy -= Raster.vTranslation.xy;
#endif

	// TODO: optimize for perspective main view?
bool bIsOrtho = IsOrthoProjection( NaniteView.ViewToClip ); const float RcpVoxelSize = rcp( Cluster.LODError ); // Calculate ray in voxel space of local cluster FRay RayBase = GetLocalRay( NaniteView, InstanceData, SvPositionStart, bIsOrtho ); float3 RayDirection_dx, RayDirection_dy; float3 RayOrigin_dx, RayOrigin_dy; { float3 Ray_dx = DFMultiplyVector( NaniteView.SVPositionToTranslatedWorld[0].xyz, InstanceData.WorldToLocal ) * RcpVoxelSize; float3 Ray_dy = DFMultiplyVector( NaniteView.SVPositionToTranslatedWorld[1].xyz, InstanceData.WorldToLocal ) * RcpVoxelSize; BRANCH if( bIsOrtho ) { RayOrigin_dx = Ray_dx; RayOrigin_dy = Ray_dy; RayDirection_dx = 0; RayDirection_dy = 0; } else { RayOrigin_dx = 0; RayOrigin_dy = 0; RayDirection_dx = Ray_dx; RayDirection_dy = Ray_dy; } } float4x4 LocalToClip = mul( InstanceDynamicData.LocalToTranslatedWorld, NaniteView.TranslatedWorldToClip ); float4x4 LocalVoxelToPixelClip = LocalToClip; LocalVoxelToPixelClip._m00_m10_m20_m30 = Raster.ViewportScale.x * LocalVoxelToPixelClip._m00_m10_m20_m30 + Raster.ViewportBias.x * LocalVoxelToPixelClip._m03_m13_m23_m33; LocalVoxelToPixelClip._m01_m11_m21_m31 = Raster.ViewportScale.y * LocalVoxelToPixelClip._m01_m11_m21_m31 + Raster.ViewportBias.y * LocalVoxelToPixelClip._m03_m13_m23_m33; #if USE_SKINNING FNaniteSkinningHeader SkinningHeader = LoadNaniteSkinningHeader(InstanceData.PrimitiveId); FBoneInfluenceHeader BoneInfluenceHeader = GetBoneInfluenceHeader(Cluster); #if !NANITE_PER_VOXEL_BRICK_SKINNING { const float4x3 SkinningTransform4x3 = SampleVoxelSkinningTransform( InstanceData, Cluster, SkinningHeader ); const float3x3 InvSkinningTransform3x3 = Inverse( float3x3( SkinningTransform4x3[0], SkinningTransform4x3[1], SkinningTransform4x3[2] ) ); const float4x4 SkinningTransform4x4 = float4x4( float4( SkinningTransform4x3[0], 0 ), float4( SkinningTransform4x3[1], 0 ), float4( SkinningTransform4x3[2], 0 ), float4( SkinningTransform4x3[3], 1 ) ); const float3x3 SkinningTransform3x3 = 
float3x3( SkinningTransform4x3[0], SkinningTransform4x3[1], SkinningTransform4x3[2] ); LocalVoxelToPixelClip = mul( SkinningTransform4x4, LocalVoxelToPixelClip ); RayBase.Origin = mul( RayBase.Origin - SkinningTransform4x3[3], InvSkinningTransform3x3 ); RayBase.Direction = mul( RayBase.Direction, InvSkinningTransform3x3 ); RayDirection_dx = mul( RayDirection_dx, InvSkinningTransform3x3 ); RayDirection_dy = mul( RayDirection_dy, InvSkinningTransform3x3 ); RayOrigin_dx = mul( RayOrigin_dx, InvSkinningTransform3x3 ); RayOrigin_dy = mul( RayOrigin_dy, InvSkinningTransform3x3 ); } #endif #endif RayBase.Origin *= RcpVoxelSize; RayBase.Direction *= RcpVoxelSize; const float Bias = 0.04 / length(RayBase.Direction); // VOXELTODO: Get approximate ray length from matrix directly? LocalVoxelToPixelClip[0] *= Cluster.LODError; LocalVoxelToPixelClip[1] *= Cluster.LODError; LocalVoxelToPixelClip[2] *= Cluster.LODError; LocalVoxelToPixelClip[0] = ToScalarMemory( LocalVoxelToPixelClip[0] ); LocalVoxelToPixelClip[1] = ToScalarMemory( LocalVoxelToPixelClip[1] ); LocalVoxelToPixelClip[2] = ToScalarMemory( LocalVoxelToPixelClip[2] ); LocalVoxelToPixelClip[3] = ToScalarMemory( LocalVoxelToPixelClip[3] ); #if VIRTUAL_TEXTURE_TARGET && NANITE_LATE_VSM_PAGE_TRANSLATION if (!Raster.bSinglePage) { UNROLL for (uint Offset = 0; Offset < NANITE_VSM_PAGE_TABLE_CACHE_DIM * NANITE_VSM_PAGE_TABLE_CACHE_DIM; Offset += THREADGROUP_SIZE) { FetchAndCachePageTableEntry(NaniteView, VisibleCluster.vPage, VisibleCluster.vPageEnd, Offset + GroupThreadIndex); } GroupMemoryBarrierWithGroupSync(); } #endif #if 0 Cluster.NumVerts = min( Cluster.NumVerts, 4096 ); for( uint BrickIndex = GroupThreadIndex; BrickIndex < Cluster.NumVerts; BrickIndex += THREADGROUP_SIZE ) { const uint PixelValue = ((VisibleIndex + 1) << 7) | (BrickIndex & 127); const float3 BoundsCenter = FetchLocalNaniteVertexPosition( InstanceData, Cluster, VisibleCluster, BrickIndex ); const float3 BoundsExtent = Cluster.LODError * 0.5f; 
FFrustumCullData FrustumCull = BoxCullFrustum( BoundsCenter, BoundsExtent, LocalToClip, NaniteView.ViewToClip, bIsOrtho, !bIsOrtho, true ); float4 Rect = ( float4( FrustumCull.RectMin.xy, FrustumCull.RectMax.xy ) * Raster.ViewportScale.xyxy + Raster.ViewportBias.xyxy ).xwzy; // Round to nearest pixel int2 MinPixels = (int2)floor( Rect.xy + 0.5 ); int2 MaxPixels = (int2)floor( Rect.zw - 0.5 ); // inclusive! // Scissor MinPixels = max( MinPixels, Raster.ScissorRect.xy ); MaxPixels = min( MaxPixels, Raster.ScissorRect.zw - 1 ); // Limit the rasterizer bounds to a sensible max. MaxPixels = min( MaxPixels, MinPixels + 16 ); for( int y = MinPixels.y; y < MaxPixels.y; y++ ) { for( int x = MinPixels.x; x < MaxPixels.x; x++ ) { int2 PixelPos = int2(x,y); float4 SvPosition = SvPositionStart; SvPosition.xy += PixelPos; FRay Ray = GetLocalRay( NaniteView, InstanceData, SvPosition, bIsOrtho ); const float Epsilon = 1e-8; Ray.Direction = select( abs( Ray.Direction ) < Epsilon, Epsilon, Ray.Direction ); #if 1 Ray.Time = Intersect( Ray, BoundsCenter, BoundsExtent ); if( Ray.Time[0] >= Ray.Time[1] ) continue; float DeviceZ; if( bIsOrtho ) DeviceZ = 1 - Ray.Time[0]; else DeviceZ = NaniteView.ViewToClip[3][2] / Ray.Time[0] + NaniteView.ViewToClip[2][2]; #else float DeviceZ = FrustumCull.RectMax.z; #endif PlotPixel( Raster, PixelPos, PixelValue, DeviceZ ); } } } return; #endif for( uint BrickIndexBase = 0; BrickIndexBase < NANITE_MAX_CLUSTER_TRIANGLES; BrickIndexBase += THREADGROUP_SIZE ) { BRANCH if( BrickIndexBase >= Cluster.BrickDataNum ) break; const uint BrickIndex = BrickIndexBase + GroupThreadIndex; const uint FetchBrickIndex = min( BrickIndex, Cluster.BrickDataNum - 1 ); const FBrick Brick = DecodeBrick( Cluster, FetchBrickIndex ); const float3 LocalVoxelPosition = (float3)Brick.StartPos; const float3 LocalVoxelBoundsExtent = Brick.BrickMax * 0.5f; const float3 LocalVoxelBoundsCenter = LocalVoxelPosition + LocalVoxelBoundsExtent; float4x4 Brick_LocalVoxelToPixelClip = 
LocalVoxelToPixelClip; FRay Brick_RayBase = RayBase; float3 Brick_RayDirection_dx = RayDirection_dx; float3 Brick_RayDirection_dy = RayDirection_dy; float3 Brick_RayOrigin_dx = RayOrigin_dx; float3 Brick_RayOrigin_dy = RayOrigin_dy; #if USE_SKINNING && NANITE_PER_VOXEL_BRICK_SKINNING const float4x3 SkinningTransform4x3 = SampleSkinningTransform( InstanceData, SkinningHeader, BoneInfluenceHeader, Brick.VertOffset ); const float3 SkinningTranslation = SkinningTransform4x3[3] * RcpVoxelSize; const float3x3 InvSkinningTransform3x3 = Inverse( float3x3( SkinningTransform4x3[0], SkinningTransform4x3[1], SkinningTransform4x3[2] ) ); const float4x4 SkinningTransform4x4 = float4x4( float4( SkinningTransform4x3[0], 0 ), float4( SkinningTransform4x3[1], 0 ), float4( SkinningTransform4x3[2], 0 ), float4( SkinningTranslation, 1 ) ); const float3x3 SkinningTransform3x3 = float3x3( SkinningTransform4x3[0], SkinningTransform4x3[1], SkinningTransform4x3[2] ); Brick_LocalVoxelToPixelClip = mul( SkinningTransform4x4, Brick_LocalVoxelToPixelClip ); Brick_RayBase.Origin = mul( Brick_RayBase.Origin - SkinningTranslation, InvSkinningTransform3x3 ); Brick_RayBase.Direction = mul( Brick_RayBase.Direction, InvSkinningTransform3x3 ); Brick_RayDirection_dx = mul( Brick_RayDirection_dx, InvSkinningTransform3x3 ); Brick_RayDirection_dy = mul( Brick_RayDirection_dy, InvSkinningTransform3x3 ); Brick_RayOrigin_dx = mul( Brick_RayOrigin_dx, InvSkinningTransform3x3 ); Brick_RayOrigin_dy = mul( Brick_RayOrigin_dy, InvSkinningTransform3x3 ); #endif const float4 CenterPixelClip = mul( float4( LocalVoxelBoundsCenter, 1.0 ), Brick_LocalVoxelToPixelClip ); const float3 CenterPixel = CenterPixelClip.xyz / CenterPixelClip.w; #if CONSTANT_DIR // 0.5 to counter the half pixel shift from SvPositionStart float2 CenterPixelXY = CenterPixel.xy - 0.5f; // Constant direction picked as brick center Brick_RayBase.Direction += Brick_RayDirection_dx * CenterPixelXY.x; Brick_RayBase.Direction += Brick_RayDirection_dy * 
CenterPixelXY.y; // Make ray with fixed direction hit same point at mid brick depth, CenterClip.w. // Position = Origin + Direction * Time, Time = w. Brick_RayOrigin_dx = Brick_RayDirection_dx * CenterPixelClip.w; Brick_RayOrigin_dy = Brick_RayDirection_dy * CenterPixelClip.w; Brick_RayBase.Origin -= LocalVoxelPosition + CenterPixelXY.x * Brick_RayOrigin_dx + CenterPixelXY.y * Brick_RayOrigin_dy; #else Brick_RayBase.Origin -= LocalVoxelPosition; #endif #if CONSTANT_DIR_RECT // Apply shear to counter ray direction const float2 RayShear = CenterPixel.xy; const float2 ExtentClipXY = abs( LocalVoxelBoundsExtent.x * ( Brick_LocalVoxelToPixelClip[0].xy - Brick_LocalVoxelToPixelClip[0].w * RayShear ) ) + abs( LocalVoxelBoundsExtent.y * ( Brick_LocalVoxelToPixelClip[1].xy - Brick_LocalVoxelToPixelClip[1].w * RayShear ) ) + abs( LocalVoxelBoundsExtent.z * ( Brick_LocalVoxelToPixelClip[2].xy - Brick_LocalVoxelToPixelClip[2].w * RayShear ) ); const float ExtentClipW = LocalVoxelBoundsExtent.x * Brick_LocalVoxelToPixelClip[0].w + LocalVoxelBoundsExtent.y * Brick_LocalVoxelToPixelClip[1].w + LocalVoxelBoundsExtent.z * Brick_LocalVoxelToPixelClip[2].w; const float MinW = CenterPixelClip.w - ExtentClipW; const float MaxW = CenterPixelClip.w + ExtentClipW; FFrustumCullData FrustumCull; #if CONSTANT_DIR FrustumCull.RectMin.xy = CenterPixel.xy - ExtentClipXY / CenterPixelClip.w; FrustumCull.RectMax.xy = CenterPixel.xy + ExtentClipXY / CenterPixelClip.w; #else // Project near face of skewed box for conservative rect const float2 Center = CenterPixelClip.xy + ( MinW - CenterPixelClip.w ) * RayShear; FrustumCull.RectMin.xy = ( Center - ExtentClipXY ) / MinW; FrustumCull.RectMax.xy = ( Center + ExtentClipXY ) / MinW; #endif const float MinZ = NaniteView.ViewToClip[2][2] + NaniteView.ViewToClip[3][2] / MaxW; const float MaxZ = NaniteView.ViewToClip[2][2] + NaniteView.ViewToClip[3][2] / MinW; FrustumCull.RectMin.z = MinZ; FrustumCull.RectMax.z = MaxZ; const float4 Rect = float4( 
FrustumCull.RectMin.xy, FrustumCull.RectMax.xy ); #else // TODO: Unify with existing Frustum culling functions FFrustumCullData FrustumCull; const float3 Extent = LocalVoxelBoundsExtent; BRANCH if( bIsOrtho ) { const float3 PixelClipDelta = abs( Extent.x * Brick_LocalVoxelToPixelClip[0].xyz ) + abs( Extent.y * Brick_LocalVoxelToPixelClip[1].xyz ) + abs( Extent.z * Brick_LocalVoxelToPixelClip[2].xyz ); FrustumCull.RectMin = CenterPixelClip.xyz - PixelClipDelta; FrustumCull.RectMax = CenterPixelClip.xyz + PixelClipDelta; } else { const float4 DeltaX = ( 2.0f * Extent.x ) * Brick_LocalVoxelToPixelClip[0]; const float4 DeltaY = ( 2.0f * Extent.y ) * Brick_LocalVoxelToPixelClip[1]; const float4 DeltaZ = ( 2.0f * Extent.z ) * Brick_LocalVoxelToPixelClip[2]; float MinW = +INFINITE_FLOAT; float MaxW = -INFINITE_FLOAT; FrustumCull.RectMin.xy = +INFINITE_FLOAT; FrustumCull.RectMax.xy = -INFINITE_FLOAT; #define EVAL_X01( _PointClip ) \ { \ const float4 Clip0 = ( _PointClip ); \ const float4 Clip1 = ( _PointClip ) + DeltaX; \ const float2 Screen0 = Clip0.xy / Clip0.w; \ const float2 Screen1 = Clip1.xy / Clip1.w; \ MinW = min3( MinW, Clip0.w, Clip1.w ); \ MaxW = max3( MaxW, Clip0.w, Clip1.w ); \ FrustumCull.RectMin.xy = min3( FrustumCull.RectMin.xy, Screen0, Screen1 ); \ FrustumCull.RectMax.xy = max3( FrustumCull.RectMax.xy, Screen0, Screen1 ); \ } const float4 Clip000 = CenterPixelClip - 0.5f * ( DeltaX + DeltaY + DeltaZ ); EVAL_X01( Clip000 ); EVAL_X01( Clip000 + DeltaY ); EVAL_X01( Clip000 + DeltaZ ); EVAL_X01( Clip000 + DeltaY + DeltaZ ); const float MinZ = MaxW * NaniteView.ViewToClip[2][2] + NaniteView.ViewToClip[3][2]; const float MaxZ = MinW * NaniteView.ViewToClip[2][2] + NaniteView.ViewToClip[3][2]; FrustumCull.RectMin.z = MinZ / MaxW; FrustumCull.RectMax.z = MaxZ / MinW; #undef EVAL_X01 } const float4 Rect = float4( FrustumCull.RectMin.xy, FrustumCull.RectMax.xy ); #endif // Round to nearest pixel int2 MinPixels = (int2)floor( Rect.xy + 0.5 ); int2 MaxPixels = 
(int2)floor( Rect.zw - 0.5 ); // inclusive! // Scissor MinPixels = max( MinPixels, Raster.ScissorRect.xy ); MaxPixels = min( MaxPixels, Raster.ScissorRect.zw - 1 ); MaxPixels = min( MaxPixels, MinPixels + ( BRICK_TRACE_APPROXIMATE_DIVIDE ? 30 : 128 ) ); #if BRICK_TRACE_WORK_REDISTRIBUTION uint QueueReadOffset = 0; int QueueNumElements = 0; const int2 RectSize = max( MaxPixels - MinPixels + 1, 0 ); const uint BrickMax_BrickIndex = ( Brick.BrickMax.x ) | (Brick.BrickMax.y << 8 ) | (Brick.BrickMax.z << 16 ) | ( BrickIndex << 24 ); #if BRICK_TRACE_TRANSPOSE const int NumPixels = MulU24( RectSize.x, RectSize.y ); const uint PixelStartOffset = WavePrefixSum( NumPixels ); const uint TotalPixels = WaveReadLaneLast( PixelStartOffset + NumPixels ); const uint LaneMask = 0xFFFFFFFFu << GroupThreadIndex; const uint PixelEndOffset = PixelStartOffset + NumPixels - 1u; const uint PackedMinPixels = MinPixels.x | ( MinPixels.y << 16 ); #if BRICK_TRACE_APPROXIMATE_DIVIDE const uint IntRcpRectWidth = ceil(0x8000u * (1.0f / RectSize.x)); const uint RectMulValues = IntRcpRectWidth | ( -RectSize.x << 16 ); #else const uint RectMulValues = -RectSize.x; #endif GroupMemoryBarrierWithGroupSync(); GroupBrickData[ GroupThreadIndex ] = uint3( RectMulValues, asuint( FrustumCull.RectMax.z ), PackedMinPixels ); const uint AcceptThreshold = NumPixels ? 
31 + (int)NumPixels : 0; const uint QueueWriteValue = GroupThreadIndex | ( PixelStartOffset << 8 ); for( uint PixelIndexBase = 0; PixelIndexBase < TotalPixels; PixelIndexBase += 32 ) { const uint PixelIndex = PixelIndexBase + GroupThreadIndex; GroupMemoryBarrierWithGroupSync(); GroupWorkEnd[ GroupThreadIndex ] = 0xFFFFFFFFu; GroupMemoryBarrierWithGroupSync(); const int RelativeIndex = int( PixelEndOffset - PixelIndexBase ); if( (uint)RelativeIndex < AcceptThreshold ) GroupWorkEnd[ min( RelativeIndex, 31 ) ] = QueueWriteValue; const uint MarkBufferValue = GroupWorkEnd[ GroupThreadIndex ]; const uint BrickStartMask = WaveBallot( MarkBufferValue != 0xFFFFFFFFu ).x; const int BrickStartIndex = firstbitlow( BrickStartMask & LaneMask ); const uint BrickLaneData = WaveReadLaneAt( MarkBufferValue, BrickStartIndex ); const uint BrickLane = BrickLaneData & 0xFFu; const uint BrickThread = PixelIndex - ( BrickLaneData >> 8 ); const uint3 BrickData = GroupBrickData[ BrickLane ]; #if BRICK_TRACE_APPROXIMATE_DIVIDE const int BrickY = MulU24( BrickThread, BrickData.x & 0xFFFFu ) >> 15; const int BrickX = MadI24( (int)BrickY, ( (int)BrickData.x >> 16 ), BrickThread); #else const int BrickY = floor( ( BrickThread + 0.5f ) / -(int)BrickData.x ); const int BrickX = MadI24( BrickY, BrickData.x, BrickThread ); #endif const float BrickRectMaxZ = asfloat( BrickData.y ); const int2 BrickPixelPos = int2( BrickData.z & 0xFFFF, BrickData.z >> 16 ) + int2( BrickX, BrickY ); bool bActive = PixelIndex < TotalPixels; BRANCH if( bActive ) { bActive = OcclusionTestPixel( Raster, BrickPixelPos, BrickRectMaxZ ); } BRANCH if( WaveActiveAnyTrue( bActive ) ) { if( bActive ) { const uint TaskIndex = QueueReadOffset + QueueNumElements + WavePrefixCountBits( true ); const uint WriteIndex = TaskIndex & 63; GroupSourceLaneAndPixelPos[ WriteIndex ] = BrickLane | ( BrickPixelPos.x << 5 ) | ( BrickPixelPos.y << 19 ); } QueueNumElements += WaveActiveCountBits( bActive ); BRANCH if( QueueNumElements >= 32 ) { 
GroupMemoryBarrierWithGroupSync(); ProcessBrickPixelBatchFromQueue( QueueNumElements, QueueReadOffset, bIsOrtho, NaniteView, Raster, InstanceData, Cluster, VisibleIndex, Cluster.LODError, RcpVoxelSize, Bias, Brick_RayBase.Direction, Brick_RayDirection_dx, Brick_RayDirection_dy, Brick_RayBase.Origin, Brick_RayOrigin_dx, Brick_RayOrigin_dy, Brick.ReverseBrickBits, BrickMax_BrickIndex, CenterPixelClip.w, GroupThreadIndex ); } } } #else // !BRICK_TRACE_TRANSPOSE int2 PixelPos = MinPixels; bool bLaneActive = BrickIndex < Cluster.BrickDataNum; while( WaveActiveAnyTrue( bLaneActive ) ) { bool bActive = bLaneActive; BRANCH if( bActive ) { bActive = OcclusionTestPixel( Raster, PixelPos, FrustumCull.RectMax.z ); } BRANCH if( WaveActiveAnyTrue( bActive ) ) { if( bActive ) { const uint TaskIndex = QueueReadOffset + QueueNumElements + WavePrefixCountBits( bActive ); const uint WriteIndex = TaskIndex & 63; GroupSourceLaneAndPixelPos[ WriteIndex ] = GroupThreadIndex | ( PixelPos.x << 5 ) | ( PixelPos.y << 19 ); } QueueNumElements += WaveActiveCountBits( bActive ); BRANCH if (QueueNumElements >= 32) { GroupMemoryBarrierWithGroupSync(); ProcessBrickPixelBatchFromQueue( QueueNumElements, QueueReadOffset, bIsOrtho, NaniteView, Raster, InstanceData, Cluster, VisibleIndex, Cluster.LODError, RcpVoxelSize, Bias, Brick_RayBase.Direction, Brick_RayDirection_dx, Brick_RayDirection_dy, Brick_RayBase.Origin, Brick_RayOrigin_dx, Brick_RayOrigin_dy, Brick.ReverseBrickBits, BrickMax_BrickIndex, CenterPixelClip.w, GroupThreadIndex ); GroupMemoryBarrierWithGroupSync(); } } if( PixelPos.x < MaxPixels.x ) { PixelPos.x++; } else if( PixelPos.y < MaxPixels.y ) { PixelPos.y++; PixelPos.x = MinPixels.x; } else { bLaneActive = false; } } #endif BRANCH if( QueueNumElements > 0 ) { GroupMemoryBarrierWithGroupSync(); ProcessBrickPixelBatchFromQueue( QueueNumElements, QueueReadOffset, bIsOrtho, NaniteView, Raster, InstanceData, Cluster, VisibleIndex, Cluster.LODError, RcpVoxelSize, Bias, 
Brick_RayBase.Direction, Brick_RayDirection_dx, Brick_RayDirection_dy,
				Brick_RayBase.Origin, Brick_RayOrigin_dx, Brick_RayOrigin_dy,
				Brick.ReverseBrickBits, BrickMax_BrickIndex, CenterPixelClip.w, GroupThreadIndex );
		}
#else // !BRICK_TRACE_WORK_REDISTRIBUTION
		// No work redistribution: each lane walks its own brick's pixel rect serially.
		BRANCH
		if( BrickIndex >= Cluster.BrickDataNum )
			break;

		const uint PixelValue = ( ( VisibleIndex + 1 ) << 7 ) | BrickIndex;

		int2 PixelPos = MinPixels;
		while( true )
		{
			FVisBufferPixel Pixel = CreateVisBufferPixel( PixelPos, PixelValue, FrustumCull.RectMax.z );
			bool bDepthPassed = true;
#if VIRTUAL_TEXTURE_TARGET
			Pixel.PhysicalPosition.xy = Pixel.Position;
			Pixel.PhysicalPosition.z = Raster.ArrayIndex;
			if( !Raster.bSinglePage )
			{
				FCachedPageTable PageTranslation;
				if( !PageTranslation( Pixel ) )
					bDepthPassed = false;
			}
#endif
			// Early depth test with the brick's conservative max depth before tracing.
			if( bDepthPassed )
				bDepthPassed = Pixel.EarlyDepthTest();

			BRANCH
			if( bDepthPassed )
			{
				FRay Ray = Brick_RayBase;
				if( CONSTANT_DIR || bIsOrtho )
				{
					// Origin varies per pixel, direction is fixed for the brick.
					Ray.Origin += Brick_RayOrigin_dx * PixelPos.x + Brick_RayOrigin_dy * PixelPos.y;
				}
				else
				{
					// Direction varies per pixel, origin is fixed.
					Ray.Direction += Brick_RayDirection_dx * PixelPos.x + Brick_RayDirection_dy * PixelPos.y;
				}

				ProcessBrickPixel( NaniteView, Raster, InstanceData, Ray, bIsOrtho, PixelPos, PixelValue, Brick.ReverseBrickBits, LocalVoxelBoundsExtent, Cluster.LODError, RcpVoxelSize, Bias );
			}

			// Advance through the scissored rect in row-major order.
			if( PixelPos.x < MaxPixels.x )
			{
				PixelPos.x++;
			}
			else if( PixelPos.y < MaxPixels.y )
			{
				PixelPos.y++;
				PixelPos.x = MinPixels.x;
			}
			else
			{
				break;
			}
		}
#endif
	}
}
#endif

// Compute-shader entry point. One threadgroup per visible cluster/patch (GroupID is the
// visible index), dispatched to the voxel-brick, tessellated-patch, or cluster path.
#if WORKGRAPH_NODE
[Shader("node")]
[NodeLaunch("broadcasting")]
[NodeMaxDispatchGrid(65535,1,1)]
#endif
[numthreads(THREADGROUP_SIZE, 1, 1)]
void MicropolyRasterize(
	uint DispatchThreadID : SV_DispatchThreadID,
	uint GroupID : SV_GroupID,
	uint GroupIndex : SV_GroupIndex
#if WORKGRAPH_NODE
	, DispatchNodeInputRecord InputRecord
#endif
	)
{
#if NANITE_VOXELS
	ClusterTraceBricks( GroupID, GroupIndex );
#elif PATCHES
	PatchRasterize( GroupID, GroupIndex );
#else
	ClusterRasterize( GroupID, GroupIndex );
#endif
}

#define
VERTEX_TO_TRIANGLE_MASKS (NANITE_PRIM_SHADER && (!DEPTH_ONLY || NANITE_PIXEL_PROGRAMMABLE)) #ifndef NANITE_ALLOW_SV_BARYCENTRICS #define NANITE_ALLOW_SV_BARYCENTRICS 1 #endif // Use barycentric intrinsics when available, otherwise prefer SV_Barycentrics. // If all else fails export them explicitly (incompatible with vertex reuse). #define BARYCENTRIC_MODE_NONE (!NANITE_PIXEL_PROGRAMMABLE) #define BARYCENTRIC_MODE_INTRINSICS (!BARYCENTRIC_MODE_NONE && (NANITE_MESH_SHADER || NANITE_PRIM_SHADER) && COMPILER_SUPPORTS_BARYCENTRIC_INTRINSICS) #define BARYCENTRIC_MODE_SV_BARYCENTRICS (!BARYCENTRIC_MODE_NONE && NANITE_MESH_SHADER && NANITE_ALLOW_SV_BARYCENTRICS && !COMPILER_SUPPORTS_BARYCENTRIC_INTRINSICS) #define BARYCENTRIC_MODE_EXPORT (!BARYCENTRIC_MODE_NONE && !BARYCENTRIC_MODE_INTRINSICS && !BARYCENTRIC_MODE_SV_BARYCENTRICS) struct PrimitiveAttributes { uint PixelValue; uint ViewId; bool bSwapVW; uint MipLevel; uint ArrayIndex; uint LevelOffset; uint4 ViewRect; }; struct PrimitiveAttributesPacked { // Use uint4 to prevent compiler from erroneously packing per-vertex and per-prim attributes together nointerpolation uint4 PixelValue_ViewId_SwapVW_Mip_ArrayIndex_LevelOffset_ViewRect : TEXCOORD1; }; struct VSOut { #if NANITE_HW_RASTER_INTERPOLATE_DEPTH float2 ClipZW : TEXCOORD0; #endif #if !NANITE_MESH_SHADER PrimitiveAttributesPacked PrimitivePacked; #endif #if VERTEX_TO_TRIANGLE_MASKS #if NANITE_VERT_REUSE_BATCH CUSTOM_INTERPOLATION uint2 ToTriangleMask_TriRangeStart : TEXCOORD3; #else CUSTOM_INTERPOLATION uint4 ToTriangleMasks : TEXCOORD3; #endif #endif #if BARYCENTRIC_MODE_INTRINSICS CUSTOM_INTERPOLATION uint VertexID : TEXCOORD4; #elif BARYCENTRIC_MODE_SV_BARYCENTRICS && PIXELSHADER float3 Barycentrics : SV_Barycentrics; #elif BARYCENTRIC_MODE_EXPORT float2 BarycentricsUV : TEXCOORD4; #endif #if NANITE_PIXEL_PROGRAMMABLE float4 TexCoords : TEXCOORD5; #endif float4 Position : SV_Position; #if USE_GLOBAL_CLIP_PLANE && !PIXELSHADER float OutGlobalClipPlaneDistance : 
SV_ClipDistance;
#endif
};

// Packs PrimitiveAttributes into the single nointerpolation uint4 interpolant.
// Layout: x = PixelValue; y = ViewId (16b) | bSwapVW (bit 16) [+ MipLevel/ArrayIndex on VSM];
// z/w = either the VSM page rect encoding or the plain pixel view rect.
PrimitiveAttributesPacked PackPrimitiveAttributes(PrimitiveAttributes In)
{
	uint4 PackedData = uint4(In.PixelValue, In.ViewId, 0u, 0u);
	PackedData.y |= (In.bSwapVW ? (1u << 16) : 0u);
#if VIRTUAL_TEXTURE_TARGET
	PackedData.y |= (In.MipLevel << 18) | (In.ArrayIndex << 23);
	PackedData.z = In.LevelOffset;

	// xy: VisibleCluster.vPage * VSM_PAGE_SIZE
	// zw: VisibleCluster.vPageEnd * VSM_PAGE_SIZE + VSM_PAGE_SIZE
	const uint2 vPage = In.ViewRect.xy / VSM_PAGE_SIZE;
	const uint2 vPageEnd = (In.ViewRect.zw - VSM_PAGE_SIZE) / VSM_PAGE_SIZE;
	const uint2 vPageDelta = vPageEnd - vPage;

	// 3-bit delta. This must match the logic in UnpackVisibleCluster() in NaniteDecode.ush
	PackedData.w = ((vPageDelta.y << 29u) | (vPageDelta.x << 26u) | (vPage.y << 13u) | vPage.x);
#else
	PackedData.zw = uint2((In.ViewRect.y << 16u) | In.ViewRect.x, (In.ViewRect.w << 16u) | In.ViewRect.z);
#endif
	PrimitiveAttributesPacked Out;
	Out.PixelValue_ViewId_SwapVW_Mip_ArrayIndex_LevelOffset_ViewRect = PackedData;
	return Out;
}

// Exact inverse of PackPrimitiveAttributes (same bit layout).
PrimitiveAttributes UnpackPrimitiveAttributes(PrimitiveAttributesPacked In)
{
	const uint4 PackedData = In.PixelValue_ViewId_SwapVW_Mip_ArrayIndex_LevelOffset_ViewRect;

	PrimitiveAttributes Out = (PrimitiveAttributes)0;
	Out.PixelValue = PackedData.x;
	Out.ViewId = BitFieldExtractU32(PackedData.y, 16, 0);
	Out.bSwapVW = BitFieldExtractU32(PackedData.y, 1, 16);
#if VIRTUAL_TEXTURE_TARGET
	Out.MipLevel = BitFieldExtractU32(PackedData.y, 5, 18);
	Out.ArrayIndex = PackedData.y >> 23;
	Out.LevelOffset = PackedData.z;

	// Reconstruct the page rect from vPage plus the 3-bit per-axis delta.
	const uint2 vPage = uint2(BitFieldExtractU32(PackedData.w, 13, 0), BitFieldExtractU32(PackedData.w, 13, 13));
	const uint2 vPageDelta = uint2(BitFieldExtractU32(PackedData.w, 3, 26), BitFieldExtractU32(PackedData.w, 3, 29));
	const uint2 vPageEnd = vPage + vPageDelta;

	Out.ViewRect = uint4(vPage * VSM_PAGE_SIZE, vPageEnd * VSM_PAGE_SIZE + VSM_PAGE_SIZE);
#else
	Out.ViewRect.x = BitFieldExtractU32(PackedData.z, 16, 0);
	Out.ViewRect.y = BitFieldExtractU32(PackedData.z, 16, 16);
	Out.ViewRect.z = BitFieldExtractU32(PackedData.w, 16, 0);
	Out.ViewRect.w = BitFieldExtractU32(PackedData.w, 16, 16);
#endif
	return Out;
}

// Builds the per-primitive attribute set for a visible cluster in a given view.
PrimitiveAttributes MakePrimitiveAttributes(FNaniteView NaniteView, FVisibleCluster VisibleCluster, uint PixelValue, bool bReverseWindingOrder)
{
	PrimitiveAttributes Out = (PrimitiveAttributes)0;
	Out.PixelValue = PixelValue;
	Out.ViewId = VisibleCluster.ViewId;
#if BARYCENTRIC_MODE_SV_BARYCENTRICS || BARYCENTRIC_MODE_EXPORT
	// Set SwapVW flag to indicate that the V and W barycentrics need to be swapped in the PS to compensate for the swapping of the i1 and i2 vertices.
	// BARYCENTRIC_MODE_EXPORT doesn't need this as it compensates by flipping the exported barycentrics instead.
	Out.bSwapVW = bReverseWindingOrder;
#endif
#if VIRTUAL_TEXTURE_TARGET
	Out.MipLevel = NaniteView.TargetMipLevel;
	const bool bCacheAsStatic = (VisibleCluster.Flags & NANITE_CULLING_FLAG_CACHE_AS_STATIC) != 0u;
	Out.ArrayIndex = bCacheAsStatic ? GetVirtualShadowMapStaticArrayIndex() : 0;
	Out.LevelOffset = CalcPageTableLevelOffset(NaniteView.TargetLayerIndex, NaniteView.TargetMipLevel).GetPacked();
	Out.ViewRect = uint4(VisibleCluster.vPage * VSM_PAGE_SIZE, VisibleCluster.vPageEnd * VSM_PAGE_SIZE + VSM_PAGE_SIZE);
#else
	Out.ViewRect = NaniteView.ViewRect;
#endif
	return Out;
}

// Shared per-vertex work for all HW raster entry points (VS / MS / prim shader):
// fetch + deform the Nanite vertex, optionally evaluate world position offset and
// displacement, project to clip space, apply the view's clip-space scale/offset
// (plus VSM page addressing), and fill the VSOut interpolants.
VSOut CommonRasterizerVS(FNaniteView NaniteView, FPrimitiveSceneData PrimitiveData, FInstanceSceneData InstanceData, FVisibleCluster VisibleCluster, FCluster Cluster, uint VertIndex, uint PixelValue, bool bReverseWindingOrder)
{
	VSOut Out;

	FNanitePostDeformVertex InputVert = FetchAndDeformLocalNaniteVertex(PrimitiveData, InstanceData, GetInstanceViewData(InstanceData.InstanceId, NaniteView.SceneRendererPrimaryViewId), Cluster, VisibleCluster, VertIndex, NANITE_NUM_TEXCOORDS_TO_DECODE_HW_VS);

	float3 WorldPositionOffset = 0.0f;

	FMaterialShader MaterialShader;
	MaterialShader.PrimitiveData = PrimitiveData;
	MaterialShader.InstanceData = InstanceData;
	MaterialShader.InstanceDynamicData = CalculateInstanceDynamicData(NaniteView, InstanceData);
	MaterialShader.NaniteView = NaniteView;
	MaterialShader.Cluster = Cluster;
	MaterialShader.VisibleCluster = VisibleCluster;
#if MATERIAL_SHADER_HAS_DISPLACEMENT
	MaterialShader.InitDisplacement(RasterBinMeta[GetRasterBin()].MaterialDisplacementParams);
	MaterialShader.ApplyFallbackDisplacement(InputVert);
#endif
	MaterialShader.InitVertexParameters(InputVert);

	// Only evaluate WPO when culling determined it is enabled for this cluster.
	BRANCH
	if ( (VisibleCluster.Flags & NANITE_CULLING_FLAG_ENABLE_WPO) != 0 )
	{
		WorldPositionOffset = MaterialShader.EvaluateWorldPositionOffset();
	}

	const float3 PointTranslatedWorld = DFTransformLocalToTranslatedWorld(InputVert.Position, InstanceData.LocalToWorld, NaniteView.PreViewTranslation).xyz + WorldPositionOffset;
	float4 PointClip = mul( float4( PointTranslatedWorld, 1 ), NaniteView.TranslatedWorldToClip );

#if VIRTUAL_TEXTURE_TARGET
	/*
	float2 vUV = PointClip.xy * float2(0.5, -0.5) + 0.5 * PointClip.w;
	float2 vPixels = vUV * NaniteView.ViewSizeAndInvSize.xy;
	float2 LocalPixels = vPixels - VisibleCluster.vPage * VSM_PAGE_SIZE * PointClip.w;
	float2 LocalUV = LocalPixels / ( 4 * VSM_PAGE_SIZE );
	float2 LocalClip = LocalUV * float2(2, -2) + float2(-1, 1) * PointClip.w;
	PointClip.xy = LocalClip;
	*/
	PointClip.xy = NaniteView.ClipSpaceScaleOffset.xy * PointClip.xy + NaniteView.ClipSpaceScaleOffset.zw * PointClip.w;

	// Offset 0,0 to be at vPage for a 0, VSM_PAGE_SIZE * VSM_RASTER_WINDOW_PAGES viewport.
	PointClip.xy += PointClip.w * ( float2(-2, 2) / VSM_RASTER_WINDOW_PAGES ) * VisibleCluster.vPage;
#else
	PointClip.xy = NaniteView.ClipSpaceScaleOffset.xy * PointClip.xy + NaniteView.ClipSpaceScaleOffset.zw * PointClip.w;
#endif

#if !NANITE_MESH_SHADER
	// Mesh shaders export per-primitive attributes separately; everything else packs
	// them into the per-vertex output.
	Out.PrimitivePacked = PackPrimitiveAttributes(MakePrimitiveAttributes(NaniteView, VisibleCluster, PixelValue, bReverseWindingOrder));
#endif

#if NANITE_PIXEL_PROGRAMMABLE && NUM_TEX_COORD_INTERPOLATORS > 0
	float2 CustomizedUVs[NUM_TEX_COORD_INTERPOLATORS];
	MaterialShader.GetCustomizedUVs(CustomizedUVs);
#endif

#if NANITE_PIXEL_PROGRAMMABLE
	// Pick customized UVs when the material provides them, raw attribute UVs otherwise.
#if NUM_TEX_COORD_INTERPOLATORS > 1
	Out.TexCoords.xy = CustomizedUVs[0];
	Out.TexCoords.zw = CustomizedUVs[1];
#elif NUM_TEX_COORD_INTERPOLATORS > 0
	Out.TexCoords.xy = CustomizedUVs[0];
	Out.TexCoords.zw = InputVert.RawAttributeData.TexCoords[1];
#else
	Out.TexCoords.xy = InputVert.RawAttributeData.TexCoords[0];
	Out.TexCoords.zw = InputVert.RawAttributeData.TexCoords[1];
#endif
#endif

#if MATERIAL_CACHE
#if NUM_MATERIAL_OUTPUTS_GETMATERIALCACHE > 0
	float2 MaterialCacheUV = GetMaterialCache1(MaterialShader.VertexParameters);
#else // NUM_MATERIAL_OUTPUTS_GETMATERIALCACHE > 0
	float2 MaterialCacheUV = InputVert.RawAttributeData.TexCoords[0];
#endif // NUM_MATERIAL_OUTPUTS_GETMATERIALCACHE
	// Material cache rasterizes in UV-unwrap space instead of view clip space.
	PointClip = GetMaterialCacheUnwrapClipPosition(MaterialCacheUV, NaniteView.MaterialCacheUnwrapMinAndInvSize, NaniteView.MaterialCachePageAdvanceAndInvCount.xy);
#endif // MATERIAL_CACHE

#if !PIXELSHADER
	Out.Position = PointClip;
#if NANITE_HW_RASTER_INTERPOLATE_DEPTH
	Out.ClipZW = PointClip.zw;
#endif
	const bool bNearClip = ((NaniteView.Flags & NANITE_VIEW_FLAG_NEAR_CLIP) != 0u);
	if (!bNearClip)
	{
		// Shader workaround to avoid HW depth clipping. Should be replaced with rasterizer state ideally.
		Out.Position.z = 0.5f * Out.Position.w;
	}
#endif

#if BARYCENTRIC_MODE_INTRINSICS
	Out.VertexID = VertIndex;
#endif

#if USE_GLOBAL_CLIP_PLANE && !PIXELSHADER
	Out.OutGlobalClipPlaneDistance = GetGlobalClipPlaneDistance(NaniteView, PointTranslatedWorld);
#endif

	return Out;
}

#if NANITE_PRIM_SHADER

#pragma argument(realtypes)

struct PrimitiveInput
{
	uint Index : PRIM_SHADER_SEM_VERT_INDEX;
#if !NANITE_VERT_REUSE_BATCH
	uint WaveIndex : PRIM_SHADER_SEM_WAVE_INDEX;
#endif
};

struct PrimitiveOutput
{
	VSOut Out;
	uint PrimExport : PRIM_SHADER_SEM_PRIM_EXPORT;
	uint VertCount : PRIM_SHADER_SEM_VERT_COUNT;
	uint PrimCount : PRIM_SHADER_SEM_PRIM_COUNT;
};

// Packs three 10-bit vertex indices into one uint for primitive export.
uint PackTriangleExport(uint3 TriangleIndices)
{
	return TriangleIndices.x | (TriangleIndices.y << 10) | (TriangleIndices.z << 20);
}

// Inverse of PackTriangleExport.
uint3 UnpackTriangleExport(uint Packed)
{
	const uint Index0 = (Packed & 0x3FF);
	const uint Index1 = (Packed >> 10) & 0x3FF;
	const uint Index2 = (Packed >> 20);
	return uint3(Index0, Index1, Index2);
}

#define NUM_VERTEX_MASKS ((NANITE_MAX_CLUSTER_VERTICES + 31)/32)

// LDS shared between two mutually-exclusive lifetimes (see sync comment in HWRasterizeVS).
groupshared union
{
#if VERTEX_TO_TRIANGLE_MASKS
	uint VertexToTriangleMasks[NANITE_MAX_CLUSTER_VERTICES][4];
#endif
	struct
	{
		uint ClusterIndex; // NOTE: Overlapping ClusterIndex with VertexToTriangleMasks reduces peak LDS usage because of allocation granularity.
uint ReferencedVerticesMasks[NUM_VERTEX_MASKS];       // Bitmask of vertices referenced by exported triangles
		uint ReferencedVerticesPrefixSums[NUM_VERTEX_MASKS];  // Per-dword prefix sums over the masks
		uchar NewToOldVertex[NANITE_MAX_CLUSTER_VERTICES];    // Compact index -> original index
		uchar OldToNewVertex[NANITE_MAX_CLUSTER_VERTICES];    // Original index -> compact index
	} S;
} LDS;

groupshared uint GroupVertToTriMasks[32];

// Primitive-shader HW rasterization entry point. Exports one vertex and one triangle
// per thread; when the cluster exports only a subset of its triangles (multi-material
// clusters), vertices are compacted first since unreferenced vertices are not allowed.
PRIM_SHADER_OUTPUT_TRIANGLES
PRIM_SHADER_PRIM_COUNT(1)
PRIM_SHADER_VERT_COUNT(1)
#if NANITE_VERT_REUSE_BATCH
PRIM_SHADER_VERT_LIMIT(32)
PRIM_SHADER_AMP_FACTOR(32)
#else
PRIM_SHADER_VERT_LIMIT(256)
PRIM_SHADER_AMP_FACTOR(128)
#endif
PRIM_SHADER_AMP_ENABLE
PrimitiveOutput HWRasterizeVS(PrimitiveInput Input)
{
	const uint LaneIndex = WaveGetLaneIndex();
	const uint LaneCount = WaveGetLaneCount();

#if NANITE_VERT_REUSE_BATCH
	const uint GroupThreadID = LaneIndex;
	uint VisibleIndex = WaveReadLaneAt(Input.Index, 0);
#else
	const uint GroupThreadID = LaneIndex + Input.WaveIndex * LaneCount;
	if (GroupThreadID == 0)
	{
		// Input index is only initialized for lane 0, so we need to manually communicate it to all other threads in subgroup (not just wavefront).
		LDS.S.ClusterIndex = Input.Index;
	}
	GroupMemoryBarrierWithGroupSync();
	uint VisibleIndex = LDS.S.ClusterIndex;
#endif

	FTriRange TriRange = GetIndexAndTriRangeHW( VisibleIndex ); // Should be all scalar.

	FVisibleCluster VisibleCluster = GetVisibleCluster( VisibleIndex, VIRTUAL_TEXTURE_TARGET );
	FPrimitiveSceneData PrimitiveData;
	FInstanceSceneData InstanceData;
	GetNaniteMaterialSceneData(VisibleCluster, PrimitiveData, InstanceData);
	FNaniteView NaniteView = GetNaniteView( VisibleCluster.ViewId );
	const bool bReverseWindingOrder = ReverseWindingOrder(NaniteView, PrimitiveData, InstanceData);

#if NANITE_VERTEX_PROGRAMMABLE
	ResolvedView = ResolveView(NaniteView);
#endif

	FCluster Cluster = GetCluster(VisibleCluster.PageIndex, VisibleCluster.ClusterIndex);
	// A zero range means "whole cluster".
	if( TriRange.Num == 0 )
		TriRange.Num = Cluster.NumTris;

#if NANITE_VERT_REUSE_BATCH
#if VERTEX_TO_TRIANGLE_MASKS
	GroupVertToTriMasks[GroupThreadID] = 0;
#endif
	const uint TriIndex = TriRange.Start + GroupThreadID;
	bool bTriValid = GroupThreadID < TriRange.Num;
	uint3 VertIndexes = 0;
	if (bTriValid)
	{
		VertIndexes = DecodeTriangleIndices(Cluster, TriIndex);
		if( bReverseWindingOrder )
			VertIndexes.yz = VertIndexes.zy;
	}

	// Deduplicate the up-to-3*32 vertex references into a unique per-lane vertex set.
	uint NumUniqueVerts;
	uint3 VertLaneIndexes;
	uint LaneVertIndex;
	DeduplicateVertIndexes(VertIndexes, GroupThreadID, bTriValid, NumUniqueVerts, LaneVertIndex, VertLaneIndexes);

	PrimitiveOutput PrimOutput;
	PrimOutput.VertCount = NumUniqueVerts;
	PrimOutput.PrimCount = TriRange.Num;

	if (GroupThreadID < NumUniqueVerts)
	{
		const uint PixelValue = (VisibleIndex + 1) << 7;
		PrimOutput.Out = CommonRasterizerVS(NaniteView, PrimitiveData, InstanceData, VisibleCluster, Cluster, LaneVertIndex, PixelValue, bReverseWindingOrder);
	}

	if (bTriValid)
	{
		PrimOutput.PrimExport = PackTriangleExport(VertLaneIndexes);
	}

#if VERTEX_TO_TRIANGLE_MASKS
	// Accumulate, per exported vertex, the mask of triangles that reference it.
	if (bTriValid)
	{
		InterlockedOr(GroupVertToTriMasks[VertLaneIndexes.x], 1 << GroupThreadID);
		InterlockedOr(GroupVertToTriMasks[VertLaneIndexes.y], 1 << GroupThreadID);
		InterlockedOr(GroupVertToTriMasks[VertLaneIndexes.z], 1 << GroupThreadID);
	}
	GroupMemoryBarrier();
	if (GroupThreadID < NumUniqueVerts)
	{
		PrimOutput.Out.ToTriangleMask_TriRangeStart = uint2(GroupVertToTriMasks[GroupThreadID], TriRange.Start);
	}
#endif
#else // !NANITE_VERT_REUSE_BATCH
	uint NumExportVertices = Cluster.NumVerts;
	bool bNeedsCompaction = (TriRange.Num != Cluster.NumTris);

	uint SrcVertexIndex = GroupThreadID;
	uint3 VertIndexes;
	if (GroupThreadID < TriRange.Num)
	{
		VertIndexes = DecodeTriangleIndices(Cluster, TriRange.Start + GroupThreadID);
		if( bReverseWindingOrder )
			VertIndexes.yz = VertIndexes.zy;
	}

	BRANCH
	if (bNeedsCompaction)
	{
		// Programmable raster renders a single material at a time, so clusters with multiple materials need to only
		// export triangles from the current material. Unreferenced vertices are not allowed in primitive shaders,
		// so we need to compact the vertices and remap any references.
		// The expectation is that this path is going to be rare as most clusters will have just a single material and
		// most materials will not need programmable raster.
		if (GroupThreadID < NUM_VERTEX_MASKS)
		{
			// Clear vertex reference masks
			LDS.S.ReferencedVerticesMasks[GroupThreadID] = 0u;
		}
		GroupMemoryBarrierWithGroupSync();

		if (GroupThreadID < TriRange.Num)
		{
			// Mark referenced vertices
			InterlockedOr(LDS.S.ReferencedVerticesMasks[VertIndexes.x >> 5], 1u << (VertIndexes.x & 31));
			InterlockedOr(LDS.S.ReferencedVerticesMasks[VertIndexes.y >> 5], 1u << (VertIndexes.y & 31));
			InterlockedOr(LDS.S.ReferencedVerticesMasks[VertIndexes.z >> 5], 1u << (VertIndexes.z & 31));
		}
		GroupMemoryBarrierWithGroupSync();

		if (GroupThreadID < NUM_VERTEX_MASKS)
		{
			// Calculate dword prefix sums
			const uint NumMaskBits = countbits(LDS.S.ReferencedVerticesMasks[GroupThreadID]);
			LDS.S.ReferencedVerticesPrefixSums[GroupThreadID] = WavePrefixSum(NumMaskBits);
		}
		GroupMemoryBarrierWithGroupSync();

		// Update export vertices to number of referenced vertices
		NumExportVertices = LDS.S.ReferencedVerticesPrefixSums[NUM_VERTEX_MASKS - 1] + countbits(LDS.S.ReferencedVerticesMasks[NUM_VERTEX_MASKS - 1]);

		if (GroupThreadID < Cluster.NumVerts)
		{
			const uint DwordIndex = GroupThreadID >> 5;
			const uint BitIndex = GroupThreadID & 31;
			if (LDS.S.ReferencedVerticesMasks[DwordIndex] & (1u << BitIndex))
			{
				// Fill mappings between old and new (compact) vertex indices
				const uint NewVertexIndex = LDS.S.ReferencedVerticesPrefixSums[DwordIndex] + countbits(BitFieldExtractU32(LDS.S.ReferencedVerticesMasks[DwordIndex], BitIndex, 0));
				LDS.S.OldToNewVertex[GroupThreadID] = (uchar)NewVertexIndex;
				LDS.S.NewToOldVertex[NewVertexIndex] = (uchar)GroupThreadID;
			}
		}
		GroupMemoryBarrierWithGroupSync();

		if (GroupThreadID < TriRange.Num)
		{
			// Remap triangles to new vertex indices
			VertIndexes = uint3(LDS.S.OldToNewVertex[VertIndexes.x], LDS.S.OldToNewVertex[VertIndexes.y], LDS.S.OldToNewVertex[VertIndexes.z]);
		}

		if (GroupThreadID < NumExportVertices)
		{
			// Remap source vertex from compact to old
			SrcVertexIndex = LDS.S.NewToOldVertex[GroupThreadID];
		}
	}

	PrimitiveOutput PrimOutput;
	PrimOutput.VertCount = NumExportVertices;
	PrimOutput.PrimCount = TriRange.Num;

	if (GroupThreadID < TriRange.Num)
	{
		PrimOutput.PrimExport = PackTriangleExport(VertIndexes);
	}

	if (GroupThreadID < NumExportVertices)
	{
		const uint PixelValue = ((VisibleIndex + 1) << 7);
		PrimOutput.Out = CommonRasterizerVS(NaniteView, PrimitiveData, InstanceData, VisibleCluster, Cluster, SrcVertexIndex, PixelValue, bReverseWindingOrder);
	}

#if VERTEX_TO_TRIANGLE_MASKS
	GroupMemoryBarrierWithGroupSync(); // Sync to make sure there is no lifetime overlap with LDS.S

	if (GroupThreadID < NumExportVertices)
	{
		LDS.VertexToTriangleMasks[GroupThreadID][0] = 0;
		LDS.VertexToTriangleMasks[GroupThreadID][1] = 0;
		LDS.VertexToTriangleMasks[GroupThreadID][2] = 0;
		LDS.VertexToTriangleMasks[GroupThreadID][3] = 0;
	}
	GroupMemoryBarrierWithGroupSync();

	if (GroupThreadID < TriRange.Num)
	{
		// Record, per vertex, a 128-bit mask of which triangles reference it.
		const uint TriangleID = TriRange.Start + GroupThreadID;
		const uint DwordIndex = (TriangleID >> 5) & 3;
		const uint TriangleMask = 1 << (TriangleID & 31);
		InterlockedOr(LDS.VertexToTriangleMasks[VertIndexes.x][DwordIndex], TriangleMask);
		InterlockedOr(LDS.VertexToTriangleMasks[VertIndexes.y][DwordIndex], TriangleMask);
		InterlockedOr(LDS.VertexToTriangleMasks[VertIndexes.z][DwordIndex], TriangleMask);
	}
	GroupMemoryBarrierWithGroupSync();

	if (GroupThreadID < NumExportVertices)
	{
		PrimOutput.Out.ToTriangleMasks = uint4(
			LDS.VertexToTriangleMasks[GroupThreadID][0],
			LDS.VertexToTriangleMasks[GroupThreadID][1],
			LDS.VertexToTriangleMasks[GroupThreadID][2],
			LDS.VertexToTriangleMasks[GroupThreadID][3]);
	}
#endif
#endif // NANITE_VERT_REUSE_BATCH

	return PrimOutput;
}

#elif NANITE_MESH_SHADER

#if MESHSHADER || WORKGRAPH_NODE

// Mesh-shader HW rasterization entry point (optionally a work graph mesh node).
// One group per visible HW cluster (or triangle batch in vert-reuse mode).
#if WORKGRAPH_NODE
[Shader("node")]
[NodeLaunch("mesh")]
[NodeMaxDispatchGrid(65535,1,1)]
#endif
MESH_SHADER_TRIANGLE_ATTRIBUTES(NANITE_MESH_SHADER_TG_SIZE)
void HWRasterizeMS(
	uint GroupThreadID : SV_GroupThreadID,
	uint3 GroupID : SV_GroupID,
#if WORKGRAPH_NODE
	DispatchNodeInputRecord InputRecord,
#endif
#if NANITE_VERT_REUSE_BATCH
	MESH_SHADER_VERTEX_EXPORT(VSOut, 32),
	MESH_SHADER_TRIANGLE_EXPORT(32),
	MESH_SHADER_PRIMITIVE_EXPORT(PrimitiveAttributesPacked, 32)
#else
	MESH_SHADER_VERTEX_EXPORT(VSOut, 256),
	MESH_SHADER_TRIANGLE_EXPORT(128),
	MESH_SHADER_PRIMITIVE_EXPORT(PrimitiveAttributesPacked, 128)
#endif
)
{
	bool bValidIndex = true;

#if PLATFORM_REQUIRES_UNWRAPPED_MESH_SHADER_ARGS
	uint VisibleIndex = GroupID.x;
#else
	// Avoid overflowing the 64k limit on single dimension of SV_GroupID
	uint VisibleIndex = GetUnWrappedDispatchGroupId(GroupID);

	BRANCH
	if (GroupID.y > 0 || GroupID.z > 0)
	{
		// Due to wrapping, the visible index can be out of range
		bValidIndex = (VisibleIndex < RasterBinMeta[GetRasterBin()].BinHWCount);
	}
#endif

	// NOTE: Doing a simple early out here doesn't work. Likely because divergent control
	// flow is not allowed around SetMeshOutputCounts, even if the condition is uniform for
	// the group. The compiler succeeds but corruption occurs.
FTriRange TriRange;
	FVisibleCluster VisibleCluster;
	FInstanceSceneData InstanceData;
	FPrimitiveSceneData PrimitiveData;
	FNaniteView NaniteView;
	uint NumUniqueVerts = 0;
	uint3 VertIndexes = 0;
	TriRange.Num = 0;
	uint TriIndex = 0;
	FCluster Cluster;
	uint LaneVertIndex = 0;
	bool bReverseWindingOrder = false;

	BRANCH
	if (bValidIndex)
	{
		TriRange = GetIndexAndTriRangeHW(VisibleIndex);
		VisibleCluster = GetVisibleCluster(VisibleIndex, VIRTUAL_TEXTURE_TARGET);
		GetNaniteMaterialSceneData(VisibleCluster, PrimitiveData, InstanceData);
		NaniteView = GetNaniteView(VisibleCluster.ViewId);
		bReverseWindingOrder = ReverseWindingOrder(NaniteView, PrimitiveData, InstanceData);

#if NANITE_VERTEX_PROGRAMMABLE
		ResolvedView = ResolveView(NaniteView);
#endif

		Cluster = GetCluster(VisibleCluster.PageIndex, VisibleCluster.ClusterIndex);
		// A zero range means "whole cluster".
		if( TriRange.Num == 0 )
			TriRange.Num = Cluster.NumTris;

		TriIndex = TriRange.Start + GroupThreadID;
		bool bTriValid = GroupThreadID < TriRange.Num;
		if (bTriValid)
		{
			VertIndexes = DecodeTriangleIndices(Cluster, TriIndex);
			if( bReverseWindingOrder )
				VertIndexes.yz = VertIndexes.zy;
		}

#if NANITE_VERT_REUSE_BATCH
		DeduplicateVertIndexes(VertIndexes, GroupThreadID, bTriValid, NumUniqueVerts, LaneVertIndex, VertIndexes);
#else
		LaneVertIndex = GroupThreadID;
		NumUniqueVerts = Cluster.NumVerts;
#endif
	}

	// Must be called group-uniformly (see NOTE above about divergence around it).
	SetMeshOutputCounts(NumUniqueVerts, TriRange.Num);

	BRANCH
	if (bValidIndex)
	{
		// One triangle + per-primitive attributes per thread.
		uint PrimExportIndex = GroupThreadID;
		if (PrimExportIndex < TriRange.Num)
		{
			MESH_SHADER_WRITE_TRIANGLE(PrimExportIndex, VertIndexes);

			const uint PixelValue = ((VisibleIndex + 1) << 7) | TriIndex;
			PrimitiveAttributes Attributes = MakePrimitiveAttributes(NaniteView, VisibleCluster, PixelValue, bReverseWindingOrder);
			PrimitiveAttributesPacked AttributesPacked = PackPrimitiveAttributes(Attributes);
			MESH_SHADER_WRITE_PRIMITIVE(PrimExportIndex, AttributesPacked);
		}

		uint VertExportIndex = GroupThreadID;
		if (VertExportIndex < Cluster.NumVerts)
		{
			VSOut VertexOutput = CommonRasterizerVS(NaniteView, PrimitiveData, InstanceData, VisibleCluster, Cluster, LaneVertIndex, 0u, bReverseWindingOrder);
			MESH_SHADER_WRITE_VERTEX(VertExportIndex, VertexOutput);
		}

#if NANITE_MESH_SHADER_TG_SIZE == 128
		// 128-thread groups export up to 256 vertices: second vertex per thread.
		VertExportIndex += 128;
		if (VertExportIndex < Cluster.NumVerts)
		{
			VSOut VertexOutput = CommonRasterizerVS(NaniteView, PrimitiveData, InstanceData, VisibleCluster, Cluster, LaneVertIndex + 128, 0u, bReverseWindingOrder);
			MESH_SHADER_WRITE_VERTEX(VertExportIndex, VertexOutput);
		}
#endif
	}
}

#endif // MESHSHADER || WORKGRAPH_NODE

#else // NANITE_MESH_SHADER / NANITE_PRIM_SHADER

// Classic vertex-shader HW rasterization path: one instance per visible cluster,
// three vertices per triangle (no vertex reuse).
VSOut HWRasterizeVS(
	uint VertexID : SV_VertexID,
	uint VisibleIndex : SV_InstanceID
	)
{
	FTriRange TriRange = GetIndexAndTriRangeHW( VisibleIndex );

	// Split the flat vertex id into (triangle, corner).
	uint LocalTriIndex = VertexID / 3;
	VertexID = VertexID - LocalTriIndex * 3;

	VSOut Out;
#if !PIXELSHADER
	// Degenerate position for out-of-range triangles so they are culled.
	Out.Position = float4(0,0,0,1);
#endif

	FVisibleCluster VisibleCluster = GetVisibleCluster( VisibleIndex, VIRTUAL_TEXTURE_TARGET );
	FPrimitiveSceneData PrimitiveData;
	FInstanceSceneData InstanceData;
	GetNaniteMaterialSceneData(VisibleCluster, PrimitiveData, InstanceData);
	FNaniteView NaniteView = GetNaniteView( VisibleCluster.ViewId );
	const bool bReverseWindingOrder = ReverseWindingOrder(NaniteView, PrimitiveData, InstanceData);

#if NANITE_VERTEX_PROGRAMMABLE
	ResolvedView = ResolveView(NaniteView);
#endif

	FCluster Cluster = GetCluster(VisibleCluster.PageIndex, VisibleCluster.ClusterIndex);
	// A zero range means "whole cluster".
	if( TriRange.Num == 0 )
		TriRange.Num = Cluster.NumTris;

	BRANCH
	if( LocalTriIndex < TriRange.Num )
	{
		const uint TriIndex = TriRange.Start + LocalTriIndex;
		uint3 VertIndexes = DecodeTriangleIndices(Cluster, TriIndex);
		if( bReverseWindingOrder )
			VertIndexes.yz = VertIndexes.zy;

		const uint PixelValue = ((VisibleIndex + 1) << 7) | TriIndex;
		Out = CommonRasterizerVS(NaniteView, PrimitiveData, InstanceData, VisibleCluster, Cluster, VertIndexes[VertexID], PixelValue, bReverseWindingOrder);
#if BARYCENTRIC_MODE_EXPORT
		// Flip which corner carries V when winding was reversed (compensates for yz swap).
		const uint VIndex = bReverseWindingOrder ? 2 : 1;
		Out.BarycentricsUV = float2(VertexID == 0, VertexID == VIndex);
#endif
	}

	return Out;
}

#endif // NANITE_PRIM_SHADER

// Returns true if Expr is true on any lane of the 2x2 pixel quad.
bool QuadActiveAnyTrue(bool Expr)
{
	// https://microsoft.github.io/DirectX-Specs/d3d/HLSL_SM_6_7_QuadAny_QuadAll.html
	// NOTE: From that blog post, it seems like this approach is somewhat blessed, but the docs for
	// QuadReadAcrossX state that the result is undefined when reading an inactive lane.
	// So it seems, according to the docs, this could potentially give false positives, but never false negatives.
	// Helper lanes are defined to be active, so this should only ever be an issue if the lanes of
	// a quad are made partially inactive by an earlier branch. For platforms where the undefined value
	// isn't just zero, this could result in false positives, which should still be safe
	// in the context of how this is currently used.
	const uint UIntExpr = (uint)Expr;
	uint Result = UIntExpr;
	Result |= QuadReadAcrossX(UIntExpr);
	Result |= QuadReadAcrossY(UIntExpr);
	Result |= QuadReadAcrossDiagonal(UIntExpr);
	return Result != 0u;
}

// HW rasterization pixel shader: recovers the triangle id, applies manual scissoring,
// optional VSM page translation, early depth test, and (for programmable materials)
// pixel depth offset / masking before writing the visibility buffer.
void HWRasterizePS(VSOut In
#if NANITE_MESH_SHADER
	, PrimitiveAttributesPacked PrimitivePacked
#endif
#if MATERIAL_TWOSIDED
	, bool bFrontFace : SV_IsFrontFace
#endif
	)
{
#if NANITE_HW_RASTER_INTERPOLATE_DEPTH
	// Interpolating SV_Position attributes manually can be significantly faster than having the hardware set up the registers.
	// Unfortunately, it has also shown to have precision problems on some hardware for extremely long and narrow triangles.
	// The compromise is to always use SV_Position for .xy, so it is guaranteed to always hit the right pixels,
	// but interpolate depth for shadow rendering, which is usually the more HW raster heavy pass.
	// For visibility buffer rendering the depth imprecision alone has shown to cause issues for extremely narrow triangles (UE-177564),
	// so there SV_Position is also used for depth.
	// TODO: Have the builder detect and fix the problematic cases, so we can always safely interpolate?
float4 SvPosition = float4(In.Position.xy, In.ClipZW.x / In.ClipZW.y, In.ClipZW.y);
#else
	float4 SvPosition = In.Position;
#endif

	uint2 PixelPos = (uint2)SvPosition.xy;

	PrimitiveAttributes Primitive;
#if NANITE_MESH_SHADER
	Primitive = UnpackPrimitiveAttributes(PrimitivePacked);
#else
	Primitive = UnpackPrimitiveAttributes(In.PrimitivePacked);
#endif

	uint PixelValue = Primitive.PixelValue;

#if VERTEX_TO_TRIANGLE_MASKS
	// Recover the triangle index as the unique bit set in the intersection of the
	// three per-vertex triangle masks (only one triangle references all three verts).
#if NANITE_VERT_REUSE_BATCH
	uint2 Mask_TriRangeStart = GetAttributeAtVertex0( In.ToTriangleMask_TriRangeStart );
	uint Mask0 = Mask_TriRangeStart.x;
	uint Mask1 = GetAttributeAtVertex1( In.ToTriangleMask_TriRangeStart ).x;
	uint Mask2 = GetAttributeAtVertex2( In.ToTriangleMask_TriRangeStart ).x;
	uint Mask = Mask0 & Mask1 & Mask2;
	uint TriangleIndex = Mask_TriRangeStart.y + firstbitlow(Mask);
	PixelValue += TriangleIndex;
#else
	uint4 Masks0 = GetAttributeAtVertex0( In.ToTriangleMasks );
	uint4 Masks1 = GetAttributeAtVertex1( In.ToTriangleMasks );
	uint4 Masks2 = GetAttributeAtVertex2( In.ToTriangleMasks );
	uint4 Masks = Masks0 & Masks1 & Masks2;
	uint TriangleIndex = Masks.x ? firstbitlow( Masks.x ) :
						 Masks.y ? firstbitlow( Masks.y ) + 32 :
						 Masks.z ? firstbitlow( Masks.z ) + 64 :
						 firstbitlow( Masks.w ) + 96;
	PixelValue += TriangleIndex;
#endif
#endif

#if VIRTUAL_TEXTURE_TARGET
	PixelPos += Primitive.ViewRect.xy;
	if (all(PixelPos < Primitive.ViewRect.zw))
#else
	// In multi-view mode every view has its own scissor, so we have to scissor manually.
	if( all( (PixelPos >= Primitive.ViewRect.xy) & (PixelPos < Primitive.ViewRect.zw) ) )
#endif
	{
		const uint ViewId = Primitive.ViewId;
		const bool bSwapVW = Primitive.bSwapVW;

		float MaterialMask = 1.0f;

		FVisBufferPixel Pixel = CreateVisBufferPixel( PixelPos, PixelValue, SvPosition.z );

#if VISUALIZE
		Pixel.VisualizeValues = GetVisualizeValues();
#endif

#if VIRTUAL_TEXTURE_TARGET
		const uint MipLevel = Primitive.MipLevel;
		const uint ArrayIndex = Primitive.ArrayIndex;
		const uint LevelOffset = Primitive.LevelOffset;

		if( !VirtualToPhysicalTexelForRendering( FVirtualSMLevelOffset::Unpack(LevelOffset), MipLevel, Pixel.Position, Pixel.PhysicalPosition.xy ) )
		{
			// Not committed or should not be rendered into
			return;
		}
		Pixel.PhysicalPosition.z = ArrayIndex;
#endif

		Pixel.WriteOverdraw();

#if ENABLE_EARLY_Z_TEST
		// Quad-wide test keeps helper lanes alive for derivative computations below.
		BRANCH
		if( !QuadActiveAnyTrue( Pixel.EarlyDepthTest() ) )
		{
			return;
		}
#endif

		// Note: NANITE_PIXEL_PROGRAMMABLE is currently too conservative and PDO / Masking needs to be checked explicitly to remove unused code
		// See ShouldCompileProgrammablePermutation in NaniteCullRaster.cpp
#if NANITE_PIXEL_PROGRAMMABLE && (WANT_PIXEL_DEPTH_OFFSET || MATERIALBLENDING_MASKED)
		const FNaniteView NaniteView = GetNaniteView(ViewId);
		ResolvedView = ResolveView(NaniteView);

		const uint DepthInt = asuint(SvPosition.z);
		const UlongType PackedPixel = PackUlongType(uint2(PixelValue, DepthInt));

		FVertexFactoryInterpolantsVSToPS Interpolants = (FVertexFactoryInterpolantsVSToPS)0;

		// Material parameter inputs
		FBarycentrics Barycentrics = (FBarycentrics)0;
		bool bCalcVertIndexes = true;
		uint3 VertIndexes = 0;

#if BARYCENTRIC_MODE_INTRINSICS
		const uint VertexID0 = GetAttributeAtVertex0(In.VertexID);
		const uint VertexID1 = GetAttributeAtVertex1(In.VertexID);
		const uint VertexID2 = GetAttributeAtVertex2(In.VertexID);
		VertIndexes = uint3(VertexID0, VertexID1, VertexID2);

		// Recover barycentrics from hardware ViVj:
		// v = v0 + I (v1 - v0) + J (v2 - v0) = (1 - I - J) v0 + I v1 + J v2
		const float2 ViVj = GetViVjPerspectiveCenter();
		const float3 UVW = float3(1.0f - ViVj.x - ViVj.y, ViVj);

		// The vertex order can be rotated during the rasterization process,
		// so the original order needs to be recovered to make sense of the barycentrics.
		// Fortunately, for compression purposes, triangle indices already have the form (base, base+a, base+b), where a,b>0.
		// This turns out to be convenient as it allows us to recover the original vertex order by simply rotating
		// the lowest vertex index into the first position. This saves an export compared to the usual provoking vertex trick
		// that compares with an additional nointerpolation export.
		const uint MinVertexID = min3(VertexID0, VertexID1, VertexID2);

		Barycentrics.Value = (MinVertexID == VertexID1) ? UVW.yzx :
							 (MinVertexID == VertexID2) ? UVW.zxy :
							 UVW;

		// As we already have the indices on hand, so we might as well use them instead of decoding them again from memory
		VertIndexes = (MinVertexID == VertexID1) ? VertIndexes.yzx :
					  (MinVertexID == VertexID2) ? VertIndexes.zxy :
					  VertIndexes;

		if (bSwapVW)
		{
			Barycentrics.Value.yz = Barycentrics.Value.zy;
			VertIndexes.yz = VertIndexes.zy;
		}
		bCalcVertIndexes = false;
#elif BARYCENTRIC_MODE_SV_BARYCENTRICS && PIXELSHADER
		Barycentrics.Value = In.Barycentrics;
		if (bSwapVW)
		{
			Barycentrics.Value.yz = Barycentrics.Value.zy;
		}
#elif BARYCENTRIC_MODE_EXPORT
		Barycentrics.Value = float3(In.BarycentricsUV, 1.0f - In.BarycentricsUV.x - In.BarycentricsUV.y);
#endif

		FMaterialPixelParameters MaterialParameters = FetchNaniteMaterialPixelParameters(NaniteView, PackedPixel, VIRTUAL_TEXTURE_TARGET, Barycentrics, false, VertIndexes, bCalcVertIndexes, Interpolants, SvPosition );

#if MATERIAL_TWOSIDED
		MaterialParameters.TwoSidedSign = bFrontFace ? -1.0f : 1.0f;
#endif

#if NUM_TEX_COORD_INTERPOLATORS > 0
		MaterialParameters.TexCoords[0] = In.TexCoords.xy;
		MaterialParameters.TexCoords_DDX[0] = ddx( In.TexCoords.xy );
		MaterialParameters.TexCoords_DDY[0] = ddy( In.TexCoords.xy );
#endif
#if NUM_TEX_COORD_INTERPOLATORS > 1
		MaterialParameters.TexCoords[1] = In.TexCoords.zw;
		MaterialParameters.TexCoords_DDX[1] = ddx( In.TexCoords.zw );
		MaterialParameters.TexCoords_DDY[1] = ddy( In.TexCoords.zw );
#endif

		FPixelMaterialInputs PixelMaterialInputs;
#if USE_WORLD_POSITION_EXCLUDING_SHADER_OFFSETS
		CalcMaterialParametersEx(MaterialParameters, PixelMaterialInputs, SvPosition, MaterialParameters.ScreenPosition, true, MaterialParameters.WorldPosition_CamRelative, MaterialParameters.WorldPosition_NoOffsets_CamRelative);
#else
		CalcMaterialParameters(MaterialParameters, PixelMaterialInputs, SvPosition, true /*bIsFrontFace*/);
#endif

		// NOTE: Disable PDO in shadow passes (it does undesirable things and has always been disabled in these passes in Unreal)
#if WANT_PIXEL_DEPTH_OFFSET && SHADOW_DEPTH_SHADER == 0
		ApplyPixelDepthOffsetToMaterialParameters(MaterialParameters, PixelMaterialInputs, Pixel.Depth);
#endif

#if MATERIALBLENDING_MASKED
		MaterialMask = GetMaterialMask(PixelMaterialInputs);
#endif
#endif // NANITE_PIXEL_PROGRAMMABLE && (WANT_PIXEL_DEPTH_OFFSET || MATERIALBLENDING_MASKED)

		BRANCH
		if (MaterialMask >= 0)
		{
			Pixel.Write();
		}
	}
}