// Copyright Epic Games, Inc. All Rights Reserved.

#include "../Common.ush"
#include "../SceneData.ush"
#include "../WaveOpUtil.ush"
#include "../ComputeShaderUtils.ush"

#define SPLIT_WORK_QUEUE (PATCHES)

// TODO: Remove once shader rewriter has been fixed (UE-202409)
#ifndef NANITE_TESSELLATION
#define NANITE_TESSELLATION PATCHES
#endif

#include "NaniteAttributeDecode.ush"
#include "NaniteTessellation.ush"

#define RASTER_BIN_INIT			(RASTER_BIN_PASS == NANITE_RASTER_BIN_INIT)
#define RASTER_BIN_COUNT		(RASTER_BIN_PASS == NANITE_RASTER_BIN_COUNT)
#define RASTER_BIN_SCATTER		(RASTER_BIN_PASS == NANITE_RASTER_BIN_SCATTER)
#define RASTER_BIN_RESERVE		(RASTER_BIN_PASS == NANITE_RASTER_BIN_RESERVE)
#define RASTER_BIN_DEPTHBLOCK	(RASTER_BIN_PASS == NANITE_RASTER_BIN_DEPTHBLOCK)
#define RASTER_BIN_FINALIZE		(RASTER_BIN_PASS == NANITE_RASTER_BIN_FINALIZE)

#ifndef VIRTUAL_TEXTURE_TARGET
#define VIRTUAL_TEXTURE_TARGET 0
#endif

#ifndef FORCE_BATCHING
#define FORCE_BATCHING 0
#endif

#ifndef DEPTH_BUCKETING
#define DEPTH_BUCKETING 1
#endif

uint MeshPassIndex;
uint MinSupportedWaveSize;
uint MaxVisiblePatches;
uint MaxClusterIndirections;

RWStructuredBuffer<FNaniteRasterBinMeta> OutRasterBinMeta;
RWStructuredBuffer<uint> OutDepthBuckets;

FNaniteMaterialFlags GetRasterBinMaterialFlags(uint BinIndex)
{
	return UnpackNaniteMaterialFlags(OutRasterBinMeta[BinIndex].MaterialFlags_DepthBlock & 0xFFFFu);
}

uint GetRasterBinDepthBlock(uint BinIndex)
{
	return OutRasterBinMeta[BinIndex].MaterialFlags_DepthBlock >> 16;
}

void InitRasterBin(uint BinIndex)
{
	OutRasterBinMeta[BinIndex].ClusterOffset = 0;
	OutRasterBinMeta[BinIndex].BinSWCount = 0;
	OutRasterBinMeta[BinIndex].BinHWCount = 0;
}

uint2 GetRasterBinCount(uint BinIndex)
{
	return uint2(OutRasterBinMeta[BinIndex].BinSWCount, OutRasterBinMeta[BinIndex].BinHWCount);
}

uint GetRasterBinCapacity(uint BinIndex)
{
	const uint2 BinCount = GetRasterBinCount(BinIndex);
	return (BinCount.x + BinCount.y);
}

void SetRasterBinOffset(uint BinIndex, uint BinOffset)
{
	OutRasterBinMeta[BinIndex].ClusterOffset = BinOffset;
}

uint GetRasterBinOffset(uint BinIndex)
{
	return OutRasterBinMeta[BinIndex].ClusterOffset;
}

uint GetRemappedRasterBinFromIndex(
	uint InRelativeMaterialIndex,
	uint InPrimitiveIndex,
	uint InRegularRasterBinCount,
	uint InRenderFlags,
	bool bFallbackRasterBin,
	bool bVoxel)
{
	uint RasterBin = GetMaterialRasterBinFromIndex(
		InRelativeMaterialIndex,
		InPrimitiveIndex,
		MeshPassIndex,
		InRegularRasterBinCount,
		bFallbackRasterBin
	);

	const FNaniteMaterialFlags MaterialFlags = GetRasterBinMaterialFlags(RasterBin);
	return RemapRasterBin(RasterBin, InRenderFlags, MaterialFlags, bVoxel);
}

#if NANITE_TESSELLATION

[numthreads(1, 1, 1)]
void InitVisiblePatchesArgs()
{
	const uint NumVisiblePatches = min( RWVisiblePatchesArgs[3], MaxVisiblePatches );
	RWVisiblePatchesArgs[0] = ( NumVisiblePatches + 63u ) / 64u;
	RWVisiblePatchesArgs[1] = 1;
	RWVisiblePatchesArgs[2] = 1;
}

#endif

#if RASTER_BIN_INIT

uint RasterBinCount;

[numthreads(64, 1, 1)]
void RasterBinInit(uint RasterBinIndex : SV_DispatchThreadID)
{
	if (RasterBinIndex < RasterBinCount)
	{
		InitRasterBin(RasterBinIndex);
	}
}

#elif RASTER_BIN_RESERVE

uint RasterBinCount;

RWStructuredBuffer<uint> OutRangeAllocator;
RWBuffer<uint> OutRasterBinArgsSWHW;

uint AllocateRasterBinRange(uint ClusterCount)
{
	uint RangeStart;
	WaveInterlockedAdd_(OutRangeAllocator[0], ClusterCount, RangeStart);
	return RangeStart;
}

[numthreads(64, 1, 1)]
void RasterBinReserve(uint RasterBinIndex : SV_DispatchThreadID)
{
	if (RasterBinIndex < RasterBinCount)
	{
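		// Reserve a contiguous range of the shared cluster indirection buffer for this bin.
		// The bin's capacity (SW + HW counts gathered by the counting pass) is bump-allocated
		// atomically; the scatter pass later fills SW entries from the front of the range and
		// HW entries after them.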
		const uint RasterBinCapacity = GetRasterBinCapacity(RasterBinIndex);
		const uint RasterBinOffset = AllocateRasterBinRange(RasterBinCapacity);
		SetRasterBinOffset(RasterBinIndex, RasterBinOffset);

		const uint ArgsOffset = (RasterBinIndex * NANITE_RASTERIZER_ARG_COUNT);
		WriteRasterizerArgsSWHW(OutRasterBinArgsSWHW, ArgsOffset, 0u, 0u);
	}
}

#elif RASTER_BIN_FINALIZE

uint RasterBinCount;
uint FinalizeMode;

RWBuffer<uint> OutRasterBinArgsSWHW;

[numthreads(64, 1, 1)]
void RasterBinFinalize(uint RasterBinIndex : SV_DispatchThreadID)
{
	if (RasterBinIndex < RasterBinCount)
	{
		const uint Offset = (RasterBinIndex * NANITE_RASTERIZER_ARG_COUNT);
		const uint HWCounterIndex = GetHWClusterCounterIndex(RenderFlags);

		const uint BinOffset = GetRasterBinOffset(RasterBinIndex);
		const uint SWClusterCount = OutRasterBinArgsSWHW[Offset + 0u];
		const uint HWClusterCount = OutRasterBinArgsSWHW[Offset + HWCounterIndex];

		// Clamp the counts so the bin's range never reads past MaxClusterIndirections.
		const uint ClampedSWClusterCount = min(BinOffset + SWClusterCount, MaxClusterIndirections) - min(BinOffset, MaxClusterIndirections);
		const uint ClampedHWClusterCount = min(BinOffset + SWClusterCount + HWClusterCount, MaxClusterIndirections) - min(BinOffset + SWClusterCount, MaxClusterIndirections);

		OutRasterBinMeta[RasterBinIndex].BinSWCount = ClampedSWClusterCount;
		OutRasterBinMeta[RasterBinIndex].BinHWCount = ClampedHWClusterCount;

		OutRasterBinArgsSWHW[Offset + 0u] = ClampedSWClusterCount;

		const bool bMeshShader = (RenderFlags & NANITE_RENDER_FLAG_MESH_SHADER) != 0;

		BRANCH
		if (FinalizeMode == 0 && bMeshShader) // Wrapping
		{
		#if !PLATFORM_REQUIRES_UNWRAPPED_MESH_SHADER_ARGS
			// Need to span the launch size across X,Y,Z so any single dimension does not exceed the 64k
			// limit as per https://microsoft.github.io/DirectX-Specs/d3d/MeshShader.html#dispatchmesh-api
			// The product of X*Y*Z can be 2^22, so we convert 1D->3D->1D to work around this limitation.
			const uint DimensionLimit = (1u << 16u) - 1u;
			const uint3 GroupCount = GetGroupCountWrapped(ClampedHWClusterCount, DimensionLimit.xx);

			// HW: ThreadGroupCountXYZ
			OutRasterBinArgsSWHW[Offset + 4u] = GroupCount.x;
			OutRasterBinArgsSWHW[Offset + 5u] = GroupCount.y;
			OutRasterBinArgsSWHW[Offset + 6u] = GroupCount.z;
		#else
			OutRasterBinArgsSWHW[Offset + 4u] = ClampedHWClusterCount;
		#endif
		}
		else if (FinalizeMode == 1 && bMeshShader) // VK_NV_mesh_shader
		{
			// VK_NV_mesh_shader draw indirect command (count, first)
			// https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkDrawMeshTasksIndirectCommandNV.html
			OutRasterBinArgsSWHW[Offset + 4u] = ClampedHWClusterCount;
			OutRasterBinArgsSWHW[Offset + 5u] = 0u;
		}
		else
		{
			OutRasterBinArgsSWHW[Offset + HWCounterIndex] = ClampedHWClusterCount;
		}
	}
}

#elif RASTER_BIN_DEPTHBLOCK

// TODO: Permute based on wave width for slightly better perf?
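// Converts the per-bucket counts in each depth block into exclusive prefix sums, using a
// single wave per block. PrevSum carries the running total across loop iterations so the
// scatter pass can turn (block, bucket) pairs into stable offsets within a bin's range.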
[numthreads(64, 1, 1)]
void RasterBinDepthBlock(uint DepthBlockIndex : SV_GroupID, uint ThreadIndex : SV_GroupIndex)
{
	const uint LaneIndex = WaveGetLaneIndex();
	const uint LaneCount = WaveGetLaneCount();

	// Make sure this is just a single wave
	if (ThreadIndex >= LaneCount)
	{
		return;
	}

	uint PrevSum = 0;
	for (uint BaseIndex = 0; BaseIndex < NANITE_NUM_DEPTH_BUCKETS_PER_BLOCK; BaseIndex += LaneCount)
	{
		uint Index = DepthBlockIndex * NANITE_NUM_DEPTH_BUCKETS_PER_BLOCK + BaseIndex + LaneIndex;

		const uint Count = OutDepthBuckets[Index];
		const uint PrefixSum = PrevSum + WavePrefixSum(Count);
		PrevSum = WaveReadLaneLast(PrefixSum + Count);
		OutDepthBuckets[Index] = PrefixSum;
	}
}

#else

Buffer<uint> InClusterOffsetSWHW;
StructuredBuffer<uint2> InClusterCountSWHW;
StructuredBuffer<uint2> InTotalPrevDrawClusters;

uint bUsePrimOrMeshShader;
uint RegularMaterialRasterBinCount;

#if RASTER_BIN_SCATTER
RWBuffer<uint> OutRasterBinArgsSWHW;
RWStructuredBuffer<uint2> OutRasterBinData;
#endif

uint ExtractVertReuseBatchInfo(uint2 PackedData, uint BitOffset)
{
	uint BitConsumed = min(5, 32 - BitOffset);
	uint BatchTriCount = BitFieldExtractU32(PackedData.x, BitConsumed, BitOffset);
	if (5 - BitConsumed > 0)
	{
		BatchTriCount |= (BitFieldExtractU32(PackedData.y, 5 - BitConsumed, 0) << BitConsumed);
	}
	return BatchTriCount + 1;
}

uint DecodeVertReuseBatchInfo(FCluster Cluster, uint BatchInfoOffset, uint BatchIndex, bool bInlined)
{
	uint BitOffset = CondMask(bInlined, 12u, Cluster.VertReuseBatchCountTableSize * 4u) + (BatchInfoOffset + BatchIndex) * 5;
	uint DwordOffset = BitOffset >> 5;
	BitOffset -= DwordOffset * 32;

	uint2 PackedData;
	if (bInlined)
	{
		PackedData = uint2(Cluster.VertReuseBatchInfo[DwordOffset], Cluster.VertReuseBatchInfo[DwordOffset + 1]);
	}
	else
	{
		PackedData = ClusterPageData.Load2(Cluster.PageBaseAddress + (Cluster.VertReuseBatchCountTableOffset + DwordOffset) * 4);
	}
	return ExtractVertReuseBatchInfo(PackedData, BitOffset);
}

#if RASTER_BIN_COUNT

void IncrementRasterBinCount(uint BinIndex, bool bSoftware, uint BatchCount, uint FlatDepthBucket)
{
	// One atomic per lane causes severe contention that ends up dominating execution time.
	// This path uses wave ops to reduce it to one atomic per unique bin in the wave.
	// As clusters are allocated in ranges at group granularity (currently ~32 clusters),
	// the expectation is that most waves will only have one or at most a few unique bins.
	bool bDone = false;
	while (WaveActiveAnyTrue(!bDone))
	{
		if (!bDone)
		{
			const uint UniformBinIndex = WaveReadLaneFirst(BinIndex);
			const bool bUniformSoftware = (bool)WaveReadLaneFirst((uint)bSoftware);
			if (BinIndex == UniformBinIndex && bSoftware == bUniformSoftware)
			{
				if (bUniformSoftware)
				{
					WaveInterlockedAdd(OutRasterBinMeta[UniformBinIndex].BinSWCount, BatchCount);
				}
				else
				{
					WaveInterlockedAdd(OutRasterBinMeta[UniformBinIndex].BinHWCount, BatchCount);
				}
				bDone = true;
			}
		}
	}

#if DEPTH_BUCKETING
	BRANCH
	if (FlatDepthBucket != 0xFFFFFFFFu)
	{
		// TODO: We could merge this with the raster counters above if all the counters were in the same buffer. Worthwhile?
		// TODO: Does scalarization make sense here?
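		// Count how many entries will land in this depth bucket; the depth block pass
		// converts these counts into prefix sums before the scatter pass consumes them.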
		InterlockedAdd(OutDepthBuckets[FlatDepthBucket], BatchCount);
	}
#endif
}

#elif RASTER_BIN_SCATTER

uint AllocateRasterBinCluster(uint BinIndex, bool bSoftware, uint BatchCount, uint FlatDepthBucket)
{
	uint ClusterOffset = 0u;

	uint Offset = (BinIndex * NANITE_RASTERIZER_ARG_COUNT);
	if (!bSoftware)
	{
		Offset += CondMask((RenderFlags & NANITE_RENDER_FLAG_MESH_SHADER) || (RenderFlags & NANITE_RENDER_FLAG_PRIMITIVE_SHADER), 4u, 5u);
	}

	bool bDone = false;
	while (WaveActiveAnyTrue(!bDone))
	{
		if (!bDone)
		{
			const uint UniformOffset = WaveReadLaneFirst(Offset);
			if (Offset == UniformOffset)
			{
				WaveInterlockedAdd_(OutRasterBinArgsSWHW[UniformOffset], BatchCount, ClusterOffset);
				bDone = true;
			}
		}
	}

	const uint2 BinCount = GetRasterBinCount(BinIndex);

#if DEPTH_BUCKETING
	BRANCH
	if (FlatDepthBucket != 0xFFFFFFFFu)
	{
		// TODO: We could merge this with the raster counters above if all the counters were in the same buffer. Worthwhile?
		// TODO: Does scalarization make sense here? Initial test says no.
		// TODO: Consider still using the reverse order inside the buckets for slightly better order?
		InterlockedAdd(OutDepthBuckets[FlatDepthBucket], BatchCount, ClusterOffset);
		ClusterOffset = (!bSoftware ? BinCount.x : 0u) + ClusterOffset;
	}
	else
#endif
	{
		if (bSoftware)
		{
			ClusterOffset = BinCount.x - ClusterOffset - BatchCount; // Write clusters in reverse order for better z-test rejection for voxels and masked materials
		}
		else
		{
			ClusterOffset = BinCount.x + ClusterOffset; // HW cluster list already reversed, so write from beginning
		}
	}

	return GetRasterBinOffset(BinIndex) + ClusterOffset;
}

#endif

void ExportRasterBin(uint RasterBin, FNaniteMaterialFlags BinMaterialFlags, uint ClusterIndex, uint RangeStart, uint RangeEnd, uint BatchCount, uint BatchInfoOffset, FCluster Cluster, bool bBatchInfoInlined, bool bSoftware, uint DepthBucket)
{
	if (MinSupportedWaveSize < 32 && BinMaterialFlags.bPixelProgrammable)
	{
		// SW pixel programmable path requires wave size >= 32. Fall back to HW raster when that is not guaranteed.
		bSoftware = false;
	}

	if (!BinMaterialFlags.bNoDerivativeOps)
	{
		// Materials requiring finite differences must run the HW path (SW lanes are processing random triangles)
		bSoftware = false;
	}

#if FORCE_BATCHING
	const bool bForceBatching = true;
#else
	const bool bForceBatching = false;
#endif

	bool bUseBatch = !bSoftware && bUsePrimOrMeshShader && (bForceBatching || BinMaterialFlags.bVertexProgrammable);

	if (BinMaterialFlags.bDisplacement)
	{
		// Displacement forces 100% software, and also the vert reuse batching
		bSoftware = true;
		bUseBatch = true;
	}

	if (Cluster.bVoxel)
	{
		bSoftware = true;
		bUseBatch = false;
	}

	BatchCount = CondMask(bUseBatch, BatchCount, 1u);

	uint FlatDepthBucket = 0xFFFFFFFFu;
#if DEPTH_BUCKETING
	uint DepthBlock = GetRasterBinDepthBlock(RasterBin);
	if (DepthBlock != 0xFFFFu)
	{
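		// SW and HW clusters go to adjacent depth blocks (SW at the base index, HW one
		// after it), so the two raster paths keep independent bucket orderings.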
		DepthBlock += (bSoftware ? 0u : 1u);
		FlatDepthBucket = DepthBlock * NANITE_NUM_DEPTH_BUCKETS_PER_BLOCK + DepthBucket;
	}
#endif

#if RASTER_BIN_COUNT
	IncrementRasterBinCount(RasterBin, bSoftware, BatchCount, FlatDepthBucket);
#elif RASTER_BIN_SCATTER
	const uint BinClusterMapping = AllocateRasterBinCluster(RasterBin, bSoftware, BatchCount, FlatDepthBucket);

	// Trim any overflow exports
	BatchCount = min(BinClusterMapping + BatchCount, MaxClusterIndirections) - min(BinClusterMapping, MaxClusterIndirections);

	for (uint BatchIndex = 0; BatchIndex < BatchCount; ++BatchIndex)
	{
		if (bUseBatch)
		{
			uint BatchTriCount = DecodeVertReuseBatchInfo(Cluster, BatchInfoOffset, BatchIndex, bBatchInfoInlined);
			RangeEnd = RangeStart + BatchTriCount;
		}

		OutRasterBinData[BinClusterMapping + BatchIndex].x = ClusterIndex;
		OutRasterBinData[BinClusterMapping + BatchIndex].y = (RangeStart << 16) | RangeEnd;

		RangeStart = RangeEnd;
	}
#endif
}

template< typename T, typename FFunction >
void ScalarizeForEachMatching( T Value, FFunction Function )
{
	bool bDone = false;
	while( WaveActiveAnyTrue( !bDone ) )
	{
		if( !bDone )
		{
			T UniformValue = WaveReadLaneFirst( Value );
			if( UniformValue == Value )
			{
				Function( UniformValue );
				bDone = true;
			}
		}
	}
}

bool IsNaniteMaterialMergingAllowed(FNaniteMaterialFlags MaterialFlags)
{
	return !MaterialFlags.bDisplacement; // Merging ranges is incompatible with tessellation. Cluster rasterizer assumes UVDensities is uniform.
}

[numthreads(64, 1, 1)]
void RasterBinBuild(uint RelativeClusterIndex : SV_DispatchThreadID, uint GroupThreadIndex : SV_GroupIndex)
{
#if PATCHES
	const uint VisibleIndex = RelativeClusterIndex;

	const uint NumVisiblePatches = min( VisiblePatchesArgs[3], MaxVisiblePatches );
	if( VisibleIndex < NumVisiblePatches )
	{
	#if NANITE_TESSELLATION_PATCH_REFS
		const uint PatchIndex = VisiblePatches.Load(VisibleIndex * 8);
		const uint PatchEncoded = SplitWorkQueue.DataBuffer_Load(PatchIndex * 16);
	#else
		const uint PatchEncoded = VisiblePatches.Load(VisibleIndex * 16);
	#endif
		uint VisibleClusterIndex = PatchEncoded.x >> 7;
		uint TriIndex = PatchEncoded.x & 0x7F;

		FVisibleCluster VisibleCluster = GetVisibleCluster( VisibleClusterIndex, VIRTUAL_TEXTURE_TARGET );
		FInstanceSceneData InstanceData = GetInstanceSceneDataUnchecked( VisibleCluster.InstanceId );
		FCluster Cluster = GetCluster( VisibleCluster.PageIndex, VisibleCluster.ClusterIndex );

		const uint RasterBin = GetRemappedRasterBinFromIndex(
			GetRelativeMaterialIndex(Cluster, TriIndex),
			InstanceData.PrimitiveId,
			RegularMaterialRasterBinCount,
			RenderFlags,
			/* bFallbackRasterBin */ false,
			Cluster.bVoxel
		);

		uint IndexInBin = 0;

		// Scalarize atomics
		bool bDone = false;
		while( WaveActiveAnyTrue( !bDone ) )
		{
			if( !bDone )
			{
				const uint UniformBinIndex = WaveReadLaneFirst( RasterBin );
				if( UniformBinIndex == RasterBin )
				{
				#if RASTER_BIN_COUNT
					WaveInterlockedAddScalar(OutRasterBinMeta[UniformBinIndex].BinSWCount, 1);
				#elif RASTER_BIN_SCATTER
					WaveInterlockedAddScalar_( OutRasterBinArgsSWHW[ UniformBinIndex * NANITE_RASTERIZER_ARG_COUNT + 3 ], 1, IndexInBin );

					const uint Num = IndexInBin + WaveActiveCountBits(true);
					if(WaveIsFirstLane())
					{
						const uint NumGroups = (Num + MaxPatchesPerGroup - 1) / MaxPatchesPerGroup; // Slow integer divide. Fix me?
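						// Grow the bin's SW dispatch group count (arg 0) to cover every patch
						// assigned so far (arg 3); only the first lane issues the atomic.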
						InterlockedMax(OutRasterBinArgsSWHW[ UniformBinIndex * NANITE_RASTERIZER_ARG_COUNT + 0 ], NumGroups);
					}
				#endif
					bDone = true;
				}
			}
		}

	#if RASTER_BIN_SCATTER
		const uint BinMapping = GetRasterBinOffset( RasterBin ) + IndexInBin;
		OutRasterBinData[ BinMapping ].x = VisibleIndex;
		OutRasterBinData[ BinMapping ].y = 0;
	#endif
	}
#else
	const uint SWClusterCount = InClusterCountSWHW[0].x;
	const uint HWClusterCount = InClusterCountSWHW[0].y;

	const bool bSoftware = RelativeClusterIndex < SWClusterCount;

	const uint ClusterCount = SWClusterCount + HWClusterCount;
	if (RelativeClusterIndex < ClusterCount)
	{
		RelativeClusterIndex = CondMask(bSoftware, RelativeClusterIndex, RelativeClusterIndex - SWClusterCount);

		uint ClusterOffset = 0;

		const bool bHasPrevDrawData = (RenderFlags & NANITE_RENDER_FLAG_HAS_PREV_DRAW_DATA) != 0u;
		BRANCH
		if (bHasPrevDrawData)
		{
			ClusterOffset += CondMask(bSoftware, InTotalPrevDrawClusters[0].x, InTotalPrevDrawClusters[0].y);
		}

	#if IS_POST_PASS
		ClusterOffset += CondMask(bSoftware, InClusterOffsetSWHW[0], InClusterOffsetSWHW[GetHWClusterCounterIndex(RenderFlags)]);
	#endif

		uint VisibleClusterIndex = RelativeClusterIndex + ClusterOffset;

		// HW clusters are written from the top
		VisibleClusterIndex = CondMask(bSoftware, VisibleClusterIndex, (MaxVisibleClusters - 1) - VisibleClusterIndex);

		FVisibleCluster VisibleCluster = GetVisibleCluster(VisibleClusterIndex, VIRTUAL_TEXTURE_TARGET);
		FInstanceSceneData InstanceData = GetInstanceSceneDataUnchecked(VisibleCluster);
		FCluster Cluster = GetCluster(VisibleCluster.PageIndex, VisibleCluster.ClusterIndex);

		const bool bFallbackBin = (VisibleCluster.Flags & NANITE_CULLING_FLAG_FALLBACK_RASTER) != 0;
		const uint DepthBucket = VisibleCluster.DepthBucket;

		BRANCH
		if (IsMaterialFastPath(Cluster))
		{
			uint RasterBin0 = 0;
			uint RasterBin1 = 0;
			uint RasterBin2 = 0;

			uint RasterLen0 = 0;
			uint RasterLen1 = 0;
			uint RasterLen2 = 0;

			uint BatchCount0 = BitFieldExtractU32(Cluster.VertReuseBatchInfo.x, 4, 0);
			uint BatchCount1 = BitFieldExtractU32(Cluster.VertReuseBatchInfo.x, 4, 4);
			uint BatchCount2 = BitFieldExtractU32(Cluster.VertReuseBatchInfo.x, 4, 8);

			uint BatchInfoOffset0 = 0;
			uint BatchInfoOffset1 = BatchCount0;
			uint BatchInfoOffset2 = BatchCount0 + BatchCount1;

			FNaniteMaterialFlags BinMaterialFlags0 = (FNaniteMaterialFlags)0;
			FNaniteMaterialFlags BinMaterialFlags1 = (FNaniteMaterialFlags)0;
			FNaniteMaterialFlags BinMaterialFlags2 = (FNaniteMaterialFlags)0;

			// Length is remaining triangles after Material0 and Material1
			const uint Material2Length = Cluster.MaterialTotalLength - (Cluster.Material0Length + Cluster.Material1Length);

			// The 0th material range is always non-zero length
			{
				RasterBin0 = GetRemappedRasterBinFromIndex(Cluster.Material0Index, InstanceData.PrimitiveId, RegularMaterialRasterBinCount, RenderFlags, bFallbackBin, Cluster.bVoxel);
				BinMaterialFlags0 = GetRasterBinMaterialFlags(RasterBin0);
			}

			BRANCH
			if (Cluster.Material1Length > 0u)
			{
				RasterBin1 = GetRemappedRasterBinFromIndex(Cluster.Material1Index, InstanceData.PrimitiveId, RegularMaterialRasterBinCount, RenderFlags, bFallbackBin, Cluster.bVoxel);
				BinMaterialFlags1 = GetRasterBinMaterialFlags(RasterBin1);
			}

			BRANCH
			if (Material2Length > 0)
			{
				RasterBin2 = GetRemappedRasterBinFromIndex(Cluster.Material2Index, InstanceData.PrimitiveId, RegularMaterialRasterBinCount, RenderFlags, bFallbackBin, Cluster.bVoxel);
				BinMaterialFlags2 = GetRasterBinMaterialFlags(RasterBin2);
			}

			const bool bAllowMerging0 = IsNaniteMaterialMergingAllowed(BinMaterialFlags0);
			const bool bAllowMerging1 = IsNaniteMaterialMergingAllowed(BinMaterialFlags1);
			const bool bAllowMerging2 = IsNaniteMaterialMergingAllowed(BinMaterialFlags2);
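			// Fast path: up to three material ranges are stored inline in the cluster.
			// Adjacent ranges that resolve to the same raster bin are merged into a single
			// export (when the bin allows it) to reduce indirection entries.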
			BRANCH
			if (RasterBin0 == RasterBin1 && bAllowMerging0 && bAllowMerging1)
			{
				if (RasterBin0 == RasterBin2 && bAllowMerging2)
				{
					RasterLen0 = Cluster.MaterialTotalLength;
					BatchCount0 += BatchCount1 + BatchCount2;
				}
				else
				{
					RasterLen0 = (Cluster.Material0Length + Cluster.Material1Length);
					RasterLen2 = Material2Length;
					BatchCount0 += BatchCount1;
				}
			}
			else if (RasterBin1 == RasterBin2 && bAllowMerging1 && bAllowMerging2)
			{
				RasterLen0 = Cluster.Material0Length;
				RasterLen1 = Cluster.MaterialTotalLength - Cluster.Material0Length;
				BatchCount1 += BatchCount2;
			}
			else
			{
				RasterLen0 = Cluster.Material0Length;
				RasterLen1 = Cluster.Material1Length;
				RasterLen2 = Material2Length;
			}

			// The 0th material range is always non-zero length
			{
				ExportRasterBin(RasterBin0, BinMaterialFlags0, VisibleClusterIndex, 0u, RasterLen0, BatchCount0, BatchInfoOffset0, Cluster, true, bSoftware, DepthBucket);
			}

			BRANCH
			if (RasterLen1 > 0)
			{
				const uint Range1Start = RasterLen0;
				const uint Range1End = Range1Start + RasterLen1;
				ExportRasterBin(RasterBin1, BinMaterialFlags1, VisibleClusterIndex, Range1Start, Range1End, BatchCount1, BatchInfoOffset1, Cluster, true, bSoftware, DepthBucket);
			}

			BRANCH
			if (RasterLen2 > 0)
			{
				const uint Range2Start = (RasterLen0 + RasterLen1);
				const uint Range2End = Range2Start + RasterLen2;
				ExportRasterBin(RasterBin2, BinMaterialFlags2, VisibleClusterIndex, Range2Start, Range2End, BatchCount2, BatchInfoOffset2, Cluster, true, bSoftware, DepthBucket);
			}
		}
		else
		{
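			// Slow path: walk the full material range table, growing a run while consecutive
			// entries resolve to the same raster bin and flushing it whenever the bin changes
			// or merging is disallowed (e.g. for displacement).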
			uint CurrentRangeBin = 0xFFFFFFFFu;
			uint CurrentRangeStart = 0;
			uint CurrentRangeEnd = 0;
			uint CurrentBatchCount = 0;
			uint CurrentBatchInfoOffset = 0;
			FNaniteMaterialFlags CurrentBinMaterialFlags = (FNaniteMaterialFlags)0;
			bool bCurrentAllowMerging = false;

			FBitStreamReaderState BatchCountStreamState = BitStreamReader_Create_Aligned(Cluster.PageBaseAddress + Cluster.VertReuseBatchCountTableOffset * 4, 0, NANITE_MAX_CLUSTER_MATERIALS * 4);

			uint TableOffset = Cluster.PageBaseAddress + Cluster.MaterialTableOffset * 4u;
			LOOP
			for (uint TableEntry = 0; TableEntry < Cluster.MaterialTableLength; ++TableEntry)
			{
				const uint EncodedRange = ClusterPageData.Load(TableOffset);
				TableOffset += 4;

				uint TriStart;
				uint TriLength;
				uint MaterialIndex;
				DecodeMaterialRange(EncodedRange, TriStart, TriLength, MaterialIndex);

				const uint RasterBinN = GetRemappedRasterBinFromIndex(MaterialIndex, InstanceData.PrimitiveId, RegularMaterialRasterBinCount, RenderFlags, bFallbackBin, Cluster.bVoxel);
				const FNaniteMaterialFlags BinMaterialFlags = GetRasterBinMaterialFlags(RasterBinN);
				const bool bAllowMerging = IsNaniteMaterialMergingAllowed(BinMaterialFlags);

				const uint BatchCount = BitStreamReader_Read_RO(ClusterPageData, BatchCountStreamState, 4, 4);

				// Check if raster slot matches the current run, and that the triangle range is contiguous.
				if ((RasterBinN == CurrentRangeBin) && bAllowMerging && bCurrentAllowMerging)
				{
					// Update current range
					CurrentRangeEnd = TriStart + TriLength;
					CurrentBatchCount += BatchCount;
				}
				else
				{
					// Not merging previous range, and previous range has valid triangles that need to be flushed
					BRANCH
					if (CurrentRangeEnd > 0)
					{
						ExportRasterBin(CurrentRangeBin, CurrentBinMaterialFlags, VisibleClusterIndex, CurrentRangeStart, CurrentRangeEnd, CurrentBatchCount, CurrentBatchInfoOffset, Cluster, false, bSoftware, DepthBucket);
					}

					CurrentRangeBin = RasterBinN;
					CurrentRangeStart = TriStart;
					CurrentRangeEnd = CurrentRangeStart + TriLength;
					CurrentBatchInfoOffset += CurrentBatchCount;
					CurrentBatchCount = BatchCount;
					CurrentBinMaterialFlags = BinMaterialFlags;
					bCurrentAllowMerging = bAllowMerging;
				}
			}

			// Need to flush current range
			ExportRasterBin(CurrentRangeBin, CurrentBinMaterialFlags, VisibleClusterIndex, CurrentRangeStart, CurrentRangeEnd, CurrentBatchCount, CurrentBatchInfoOffset, Cluster, false, bSoftware, DepthBucket);
		}
	}
#endif
}

#endif