// Copyright Epic Games, Inc. All Rights Reserved.

#include "/Engine/Shared/NaniteDefinitions.h"

#ifndef NANITE_HIERARCHY_TRAVERSAL
#	define NANITE_HIERARCHY_TRAVERSAL 0
#endif

#define DEFINE_ITERATE_CLUSTER_SEGMENTS (1)

#if NANITE_HIERARCHY_TRAVERSAL
#	define NANITE_HIERARCHY_TRAVERSAL_TYPE (CULLING_TYPE)
#	define GROUP_NODE_SIZE 3
#	include "NaniteHierarchyTraversal.ush"
#endif

#include "../Common.ush"
//#include "../SceneData.ush"
#include "../WaveOpUtil.ush"
#include "../ComputeShaderUtils.ush"
#include "NaniteDataDecode.ush"
#include "NaniteAttributeDecode.ush"
#include "NaniteCulling.ush"

/*
*
* Per mesh BLAS:
* - Update vertex/index allocators
* - Init Queue
* - Count pass
*	- Vertex count
*	- Triangles per segment/material
* - Allocate vertex / index ranges
* - Init Queue
* - Stream out data
*
*/

// One stream-out request per Nanite mesh (BLAS).
struct FNaniteStreamOutRequest
{
	uint PrimitiveId;
	uint NumMaterials;
	uint NumSegments;
	uint SegmentMappingOffset;	// start of this mesh's material->segment remap in SegmentMappingBuffer
	uint AuxiliaryDataOffset;	// base offset for this mesh in AuxiliaryDataBufferRW (one entry per triangle)
	uint MeshDataOffset;		// base offset for this mesh in MeshDataBuffer (layout documented below)
};

// NOTE(review): the template arguments on the buffer declarations in this file were
// stripped by text extraction (e.g. "StructuredBuffer StreamOutRequests;"); they have
// been restored from usage. Element types that cannot be proven from this file alone
// are flagged inline.
StructuredBuffer<FNaniteStreamOutRequest> StreamOutRequests;
uint NumRequests;

// Flat uint remap table indexed by SegmentMappingOffset + MaterialIndex.
StructuredBuffer<uint> SegmentMappingBuffer;

// Error threshold used to select the LOD cut during traversal (see ShouldVisitChild / ProcessCluster).
float StreamOutCutError;

// MeshDataBuffer layout (relative to MeshDataOffset) — matches the GetMesh*Index accessors below:
// 0 - num clusters
// 1 - vertex buffer offset
// 2 - index buffer offset
// 3 - vertex count
// 4 - index count
// 5 - segment 0 index count
// 6 - segment 0 index buffer offset
//   - segment ... index count
//   - segment ... index buffer offset
//   - segment N index count
//   - segment N index buffer offset
uint GetMeshClusterCountIndex(uint MeshDataOffset)			{ return MeshDataOffset + 0; }
uint GetMeshVertexBufferOffsetIndex(uint MeshDataOffset)	{ return MeshDataOffset + 1; }
uint GetMeshIndexBufferOffsetIndex(uint MeshDataOffset)		{ return MeshDataOffset + 2; }
uint GetMeshVertexCountIndex(uint MeshDataOffset)			{ return MeshDataOffset + 3; }
uint GetMeshIndexCountIndex(uint MeshDataOffset)			{ return MeshDataOffset + 4; }
uint GetMeshSegmentIndexCountIndex(uint MeshDataOffset, uint SegmentIndex)			{ return MeshDataOffset + 5 + SegmentIndex * 2 + 0; }
uint GetMeshSegmentIndexBufferOffsetIndex(uint MeshDataOffset, uint SegmentIndex)	{ return MeshDataOffset + 5 + SegmentIndex * 2 + 1; }

RWStructuredBuffer<uint> MeshDataBuffer;
RWStructuredBuffer<float> VertexBuffer; // can't use RWStructuredBuffer<float3> because of bDisableScalarBlockLayout
RWStructuredBuffer<uint> IndexBuffer;

// Per-triangle auxiliary data (packed page/cluster/triangle id), written by WriteAuxiliaryData.
RWStructuredBuffer<uint> AuxiliaryDataBufferRW;

// Clusters surviving the traversal cut, cached for the later stream-out pass.
RWByteAddressBuffer OutputClustersRW;
// [0] = number of cached clusters, [1] = consume cursor for NaniteStreamOutCS.
RWStructuredBuffer<uint> OutputClustersStateRW;

// | current vertex marker | current index marker |
RWStructuredBuffer<uint> VertexAndIndexAllocator;
uint VertexBufferSize;
uint IndexBufferSize;

// Atomically bump-allocates Size elements from VertexAndIndexAllocator[AllocatorEntry].
// Returns false when the allocation would run past BufferSize. Note: the counter is
// advanced even on failure, so a failed allocation leaves the marker past the end.
bool Allocate(uint Size, uint BufferSize, uint AllocatorEntry, out uint OutOffset)
{
	InterlockedAdd(VertexAndIndexAllocator[AllocatorEntry], Size, OutOffset);
	if (OutOffset + Size > BufferSize)
	{
		return false;
	}
	return true;
}

uint Align16(uint Value)
{
	return (Value + 15) & ~15;
}

// Allocates a vertex range and an index range (both 16-element aligned) for one mesh.
// On vertex-range failure OutIndexOffset is zeroed so both outputs are always written.
bool AllocateVertexAndIndexRanges(uint VertexSize, uint IndexSize, out uint OutVertexOffset, out uint OutIndexOffset)
{
	// allocate vertex range
	if (!Allocate(Align16(VertexSize), VertexBufferSize, 0, OutVertexOffset))
	{
		OutIndexOffset = 0;
		return false;
	}

	// allocate index range
	if (!Allocate(Align16(IndexSize), IndexBufferSize, 1, OutIndexOffset))
	{
		return false;
	}

	return true;
}

// A candidate cluster reference produced by the traversal: which page/cluster,
// which primitive it belongs to and which stream-out request spawned it.
struct FCandidateCluster
{
	uint PageIndex;
	uint ClusterIndex;
	uint PrimitiveId;
	uint RequestIndex;
};

// Packs a candidate cluster into 3 of the 4 uints (12 bytes — see GetCandidateClusterSizeRT).
uint4 PackCandidateClusterRT(uint ClusterIdx, FCandidateCluster CandidateCluster)
{
	uint4 RawData = 0;
	uint BitPos = 0;
	WriteBits(RawData, BitPos, CandidateCluster.PageIndex, NANITE_MAX_GPU_PAGES_BITS);
	WriteBits(RawData, BitPos, CandidateCluster.ClusterIndex, NANITE_MAX_CLUSTERS_PER_PAGE_BITS);
	WriteBits(RawData, BitPos, CandidateCluster.PrimitiveId, NANITE_MAX_INSTANCES_BITS); // TODO: Should be NANITE_MAX_PRIMITIVE_BITS
	WriteBits(RawData, BitPos, CandidateCluster.RequestIndex, 20); // TODO: Enforce this limit on CPU
	return RawData;
}

FCandidateCluster UnpackCandidateClusterRT(uint4 RawData)
{
	uint BitPos = 0;
	FCandidateCluster CandidateCluster;
	CandidateCluster.PageIndex = ReadBits(RawData, BitPos, NANITE_MAX_GPU_PAGES_BITS);
	CandidateCluster.ClusterIndex = ReadBits(RawData, BitPos, NANITE_MAX_CLUSTERS_PER_PAGE_BITS);
	CandidateCluster.PrimitiveId = ReadBits(RawData, BitPos, NANITE_MAX_INSTANCES_BITS);
	CandidateCluster.RequestIndex = ReadBits(RawData, BitPos, 20); // TODO: Enforce this limit on CPU
	return CandidateCluster;
}

// A candidate hierarchy node queued for traversal.
struct FCandidateNodeRT
{
	uint NodeIndex;
	uint PrimitiveId;
	uint RequestIndex;
};

uint4 PackCandidateNodeRT(FCandidateNodeRT Node)
{
	// Leave at least one bit unused in each of the fields, so 0xFFFFFFFFu is never a valid value.
	uint4 RawData = 0;
	uint BitPos = 0;
	WriteBits(RawData, BitPos, Node.NodeIndex, NANITE_MAX_NODES_PER_PRIMITIVE_BITS);
	WriteBits(RawData, BitPos, Node.PrimitiveId, NANITE_MAX_INSTANCES_BITS); // TODO: Should be NANITE_MAX_PRIMITIVE_BITS
	WriteBits(RawData, BitPos, Node.RequestIndex, 20); // TODO: Enforce this limit on CPU
	checkSlow(RawData.x != 0xFFFFFFFFu && RawData.y != 0xFFFFFFFFu && RawData.z != 0xFFFFFFFFu);
	return RawData;
}

FCandidateNodeRT UnpackCandidateNodeRT(uint4 RawData)
{
	uint BitPos = 0;
	FCandidateNodeRT Node;
	Node.NodeIndex = ReadBits(RawData, BitPos, NANITE_MAX_NODES_PER_PRIMITIVE_BITS);
	Node.PrimitiveId = ReadBits(RawData, BitPos, NANITE_MAX_INSTANCES_BITS);
	Node.RequestIndex = ReadBits(RawData, BitPos, 20); // TODO: Enforce this limit on CPU
	return Node;
}

// Both records occupy 3 uints (12 bytes) in their ByteAddressBuffers; only .xyz of the
// packed uint4 is ever stored/loaded.
uint GetCandidateNodeSizeRT() { return 12u; }
uint GetCandidateClusterSizeRT() { return 12u; }

FCandidateCluster LoadCandidateClusterRT(ByteAddressBuffer CandidateClusters, uint ClusterIndex)
{
	uint4 RawData = uint4(CandidateClusters.Load3(ClusterIndex * GetCandidateClusterSizeRT()), 0u);
	return UnpackCandidateClusterRT(RawData);
}

void StorePackedCandidateClusterRT(RWByteAddressBuffer CandidateClusters, uint ClusterIndex, uint4 PackedCluster)
{
	CandidateClusters.Store3(ClusterIndex * GetCandidateClusterSizeRT(), PackedCluster.xyz);
}

void StoreCandidateClusterNoCheckRT(RWByteAddressBuffer CandidateClusters, uint ClusterIndex, FCandidateCluster CandidateCluster)
{
	uint4 RawData = PackCandidateClusterRT(ClusterIndex, CandidateCluster);
	CandidateClusters.Store3(ClusterIndex * GetCandidateClusterSizeRT(), RawData.xyz);
}

void StoreCandidateClusterRT(RWByteAddressBuffer CandidateClusters, uint ClusterIndex, FCandidateCluster CandidateCluster)
{
	checkSlow(ClusterIndex < MaxCandidateClusters);
	StoreCandidateClusterNoCheckRT(CandidateClusters, ClusterIndex, CandidateCluster);
}

uint4 LoadCandidateNodeDataRT(RWByteAddressBuffer InNodes, uint NodeIndex)
{
	checkSlow(NodeIndex < MaxNodes);
	return uint4(InNodes.Load3(NodeIndex * GetCandidateNodeSizeRT()), 0);
}

void StoreCandidateNodeDataRT(RWByteAddressBuffer InNodes, uint NodeIndex, uint4 RawData)
{
	checkSlow(NodeIndex < MaxNodes);
	InNodes.Store3(NodeIndex * GetCandidateNodeSizeRT(), RawData.xyz);
}

void StoreCandidateNodeRT(RWByteAddressBuffer InNodes, uint NodeIndex, FCandidateNodeRT Node)
{
	checkSlow(NodeIndex < MaxNodes);
	StoreCandidateNodeDataRT(InNodes, NodeIndex, PackCandidateNodeRT(Node));
}

// 0xFFFFFFFFu marks an empty slot; PackCandidateNodeRT guarantees valid nodes never produce it.
void ClearCandidateNodeRT(RWByteAddressBuffer InNodes, uint NodeIndex)
{
	checkSlow(NodeIndex < MaxNodes);
	StoreCandidateNodeDataRT(InNodes, NodeIndex, 0xFFFFFFFFu);
}

// Packs page index, cluster index and triangle index into one uint of auxiliary data per triangle.
void WriteAuxiliaryData(FCandidateCluster CandidateCluster, uint TriangleIndex, uint AuxiliaryDataBufferOffset)
{
	uint AuxiliaryData = 0;
	AuxiliaryData |= CandidateCluster.PageIndex;
	AuxiliaryData |= CandidateCluster.ClusterIndex << NANITE_MAX_GPU_PAGES_BITS;
	AuxiliaryData |= TriangleIndex << (NANITE_MAX_GPU_PAGES_BITS + NANITE_MAX_CLUSTERS_PER_PAGE_BITS);

	AuxiliaryDataBufferRW[AuxiliaryDataBufferOffset] = AuxiliaryData;
}

// Decodes NumTriangles triangles starting at StartTriangle and writes their (rebased)
// indices plus one auxiliary-data entry per triangle.
void WriteTriangles(FCluster Cluster, FCandidateCluster CandidateCluster, uint StartTriangle, uint NumTriangles, uint BaseIndex, uint IndexBufferOffset, uint AuxiliaryDataBufferOffset)
{
	for (uint TriIndex = 0; TriIndex < NumTriangles; ++TriIndex)
	{
		const uint3 TriIndices = DecodeTriangleIndices(Cluster, StartTriangle + TriIndex);

		IndexBuffer[IndexBufferOffset + TriIndex * 3 + 0] = BaseIndex + TriIndices.x;
		IndexBuffer[IndexBufferOffset + TriIndex * 3 + 1] = BaseIndex + TriIndices.y;
		IndexBuffer[IndexBufferOffset + TriIndex * 3 + 2] = BaseIndex + TriIndices.z;

		WriteAuxiliaryData(CandidateCluster, StartTriangle + TriIndex, AuxiliaryDataBufferOffset + TriIndex);
	}
}

// Streams out one material range (TriStart/TriLength) of a cluster into its segment's
// index range. The per-segment index counter doubles as the write cursor: each cluster
// atomically claims its slice with InterlockedAdd.
void WriteSegment(uint MaterialIndex, uint TriStart, uint TriLength, uint ClusterVertexOffset, uint MeshDataOffset, uint IndexBufferOffset, uint AuxiliaryDataOffset, FCluster Cluster, FCandidateCluster CandidateCluster, FNaniteStreamOutRequest RequestData)
{
	checkSlow(MaterialIndex < RequestData.NumMaterials);
	const uint SegmentIndex = SegmentMappingBuffer[RequestData.SegmentMappingOffset + MaterialIndex];

	const uint MeshSegmentIndexCountIndex = GetMeshSegmentIndexCountIndex(MeshDataOffset, SegmentIndex);
	const uint MeshSegmentIndexBufferOffsetIndex = GetMeshSegmentIndexBufferOffsetIndex(MeshDataOffset, SegmentIndex);

	const uint SegmentBaseIndexBufferOffset = MeshDataBuffer[MeshSegmentIndexBufferOffsetIndex];

	uint ClusterIndexBufferOffset;
	InterlockedAdd(MeshDataBuffer[MeshSegmentIndexCountIndex], TriLength * 3, ClusterIndexBufferOffset);

	const uint SegmentStartTriangle = TriStart;
	const uint SegmentNumTriangles = TriLength;
	const uint SegmentBaseIndex = ClusterVertexOffset;
	const uint SegmentIndexBufferOffset = IndexBufferOffset + SegmentBaseIndexBufferOffset + ClusterIndexBufferOffset;
	// One auxiliary entry per triangle, hence the /3 on the index-based offset.
	const uint SegmentAuxiliaryDataBufferOffset = AuxiliaryDataOffset + (SegmentBaseIndexBufferOffset + ClusterIndexBufferOffset) / 3;

	WriteTriangles(Cluster, CandidateCluster, SegmentStartTriangle, SegmentNumTriangles, SegmentBaseIndex, SegmentIndexBufferOffset, SegmentAuxiliaryDataBufferOffset);
}

// IterateClusterSegments callback that streams out each material range of a cluster.
struct FStreamOutClusterSegmentProcessor
{
	uint ClusterVertexOffset;
	uint MeshDataOffset;
	uint IndexBufferOffset;
	uint AuxiliaryDataOffset;

	FCluster Cluster;
	FCandidateCluster CandidateCluster;
	FNaniteStreamOutRequest RequestData;

	void Process(uint TriStart, uint TriLength, uint MaterialIndex)
	{
		WriteSegment(MaterialIndex, TriStart, TriLength, ClusterVertexOffset, MeshDataOffset, IndexBufferOffset, AuxiliaryDataOffset, Cluster, CandidateCluster, RequestData);
	}
};

// Streams out one cluster: claims a vertex range via the mesh vertex counter, writes
// decoded positions, then writes each segment's triangles. Bails out if range
// allocation previously failed (offsets stamped 0xFFFFFFFFu by AllocateRangesCommon).
void StreamOutClusterCommon(FCluster Cluster, FCandidateCluster CandidateCluster, FNaniteStreamOutRequest RequestData)
{
	// TODO experiments:
	// - Output using one group per cluster
	const uint MeshDataOffset = RequestData.MeshDataOffset;

	const uint MeshVertexBufferOffsetIndex = GetMeshVertexBufferOffsetIndex(MeshDataOffset);
	const uint MeshIndexBufferOffsetIndex = GetMeshIndexBufferOffsetIndex(MeshDataOffset);
	const uint MeshVertexCountIndex = GetMeshVertexCountIndex(MeshDataOffset);

	const uint VertexBufferOffset = MeshDataBuffer[MeshVertexBufferOffsetIndex];
	const uint IndexBufferOffset = MeshDataBuffer[MeshIndexBufferOffsetIndex];

	if (VertexBufferOffset == 0xFFFFFFFFu || IndexBufferOffset == 0xFFFFFFFFu)
	{
		return;
	}

	uint ClusterVertexOffset;
	InterlockedAdd(MeshDataBuffer[MeshVertexCountIndex], Cluster.NumVerts, ClusterVertexOffset);

	for (uint VertexIndex = 0; VertexIndex < Cluster.NumVerts; ++VertexIndex)
	{
		// TODO: Nanite-Assemblies: Do we need to apply assembly transform here?
		const float3 Pos = DecodePosition(VertexIndex, Cluster);

		// VertexBuffer is a flat float stream: 3 floats per vertex.
		const uint IndexInVertexBuffer = (VertexBufferOffset + ClusterVertexOffset + VertexIndex) * 3;
		VertexBuffer[IndexInVertexBuffer + 0] = Pos.x;
		VertexBuffer[IndexInVertexBuffer + 1] = Pos.y;
		VertexBuffer[IndexInVertexBuffer + 2] = Pos.z;
	}

	FStreamOutClusterSegmentProcessor Processor;
	Processor.ClusterVertexOffset = ClusterVertexOffset;
	Processor.MeshDataOffset = MeshDataOffset;
	Processor.IndexBufferOffset = IndexBufferOffset;
	Processor.AuxiliaryDataOffset = RequestData.AuxiliaryDataOffset;
	Processor.Cluster = Cluster;
	Processor.CandidateCluster = CandidateCluster;
	Processor.RequestData = RequestData;

	IterateClusterSegments(Cluster, ClusterPageData, Processor);
}

#if NANITE_HIERARCHY_TRAVERSAL

RWByteAddressBuffer Nodes;
RWByteAddressBuffer CandidateClusters;

// IterateClusterSegments callback for the counting pass: accumulates per-segment index counts.
struct FCountTrianglesClusterSegmentProcessor
{
	uint MeshDataOffset;
	FNaniteStreamOutRequest RequestData;

	void Process(uint TriStart, uint TriLength, uint MaterialIndex)
	{
		checkSlow(MaterialIndex < RequestData.NumMaterials);
		const uint SegmentIndex = SegmentMappingBuffer[RequestData.SegmentMappingOffset + MaterialIndex];

		const uint MeshSegmentIndexCountIndex = GetMeshSegmentIndexCountIndex(MeshDataOffset, SegmentIndex);
		InterlockedAdd(MeshDataBuffer[MeshSegmentIndexCountIndex], TriLength * 3);
	}
};

// Callback structure consumed by NaniteHierarchyTraversal.ush (NodeCull/ClusterCull).
struct FNaniteTraversalStreamOutCallback
{
	FCandidateNodeRT CandidateNode;
	FPrimitiveSceneData PrimitiveData;

	void Init(uint InChildIndex, uint InLocalNodeIndex, uint GroupNodeFetchIndex)
	{
		const uint4 NodeData = GetGroupNodeData(GroupNodeFetchIndex);
		CandidateNode = UnpackCandidateNodeRT(NodeData);
		PrimitiveData = GetPrimitiveData(CandidateNode.PrimitiveId);
	}

	uint GetHierarchyNodeOffset()
	{
		return ::GetHierarchyNodeOffset(PrimitiveData.NaniteHierarchyOffset, CandidateNode.NodeIndex);
	}

	// Descend only while the parent LOD error still exceeds the stream-out cut error.
	bool ShouldVisitChild(FHierarchyNodeSlice HierarchyNodeSlice, bool bInVisible)
	{
		bool bShouldVisitChild = bInVisible;

		BRANCH
		if (bShouldVisitChild)
		{
			bShouldVisitChild = StreamOutCutError < HierarchyNodeSlice.MaxParentLODError;
		}

		return bShouldVisitChild;
	}

	void OnPreProcessNodeBatch(uint GroupIndex)
	{
		// Nothing to do
	}

	void OnPostNodeVisit(FHierarchyNodeSlice HierarchyNodeSlice)
	{
		// Nothing to do
	}

	void StoreChildNode(uint StoreIndex, FHierarchyNodeSlice HierarchyNodeSlice)
	{
		FCandidateNodeRT Node;
		Node.NodeIndex = HierarchyNodeSlice.ChildStartReference;
		Node.PrimitiveId = CandidateNode.PrimitiveId;
		Node.RequestIndex = CandidateNode.RequestIndex;
		StoreCandidateNodeRT(Nodes, StoreIndex, Node);
	}

	void StoreCluster(uint StoreIndex, FHierarchyNodeSlice HierarchyNodeSlice, uint ClusterIndex)
	{
		FCandidateCluster CandidateCluster;
		CandidateCluster.PrimitiveId = CandidateNode.PrimitiveId;
		CandidateCluster.RequestIndex = CandidateNode.RequestIndex;
		CandidateCluster.PageIndex = HierarchyNodeSlice.ChildStartReference >> NANITE_MAX_CLUSTERS_PER_PAGE_BITS;
		CandidateCluster.ClusterIndex = ClusterIndex;
		StoreCandidateClusterNoCheckRT(CandidateClusters, StoreIndex, CandidateCluster); //TODO: NoCheck to fix issue compilation issue with FXC
	}

	uint4 LoadPackedCluster(uint CandidateIndex)
	{
		checkSlow(CandidateIndex < MaxCandidateClusters);
		return uint4(CandidateClusters.Load3(CandidateIndex * GetCandidateClusterSizeRT()), 0u);
	}

	// A cleared slot is all 0xFFFFFFFFu (see ClearCandidateNodeRT); any other value is a valid node.
	bool IsNodeDataReady(uint4 RawData)
	{
		return RawData.x != 0xFFFFFFFFu && RawData.y != 0xFFFFFFFFu && RawData.z != 0xFFFFFFFFu;
	}

	bool LoadCandidateNodeDataToGroup(uint NodeIndex, uint GroupIndex, bool bCheckIfReady = true)
	{
		uint4 NodeData = LoadCandidateNodeDataRT(Nodes, NodeIndex);

		bool bNodeReady = IsNodeDataReady(NodeData);
		if (!bCheckIfReady || bNodeReady)
		{
			SetGroupNodeData(GroupIndex, NodeData);
		}

		return bNodeReady;
	}

	void ClearCandidateNodeData(uint NodeIndex)
	{
		ClearCandidateNodeRT(Nodes, NodeIndex);
	}

	// Cluster batching is not used by this traversal; these must never be called.
	void AddToClusterBatch(uint BatchIndex, uint Num)
	{
		check(false);
	}

	void ClearClusterBatch(uint BatchIndex)
	{
		check(false);
	}

	uint LoadClusterBatch(uint BatchIndex)
	{
		check(false);
		return 0;
	}

	// Accepts a cluster when it is below the cut error (or is a streaming leaf), then
	// either counts/caches it or streams it out directly depending on compile flags.
	void ProcessCluster(uint4 PackedCluster)
	{
		FCandidateCluster CandidateCluster = UnpackCandidateClusterRT(PackedCluster);

		FCluster Cluster = GetCluster(CandidateCluster.PageIndex, CandidateCluster.ClusterIndex);

		bool bSmallEnoughToDraw = StreamOutCutError > Cluster.LODError;
		bool bVisible = bSmallEnoughToDraw || (Cluster.Flags & NANITE_CLUSTER_FLAG_STREAMING_LEAF);

		BRANCH
		if (bVisible)
		{
			const FNaniteStreamOutRequest RequestData = StreamOutRequests[CandidateCluster.RequestIndex];
			const uint MeshDataOffset = RequestData.MeshDataOffset;
			const uint MeshVertexCountIndex = GetMeshVertexCountIndex(MeshDataOffset);

#if NANITE_STREAM_OUT_CACHE_CLUSTERS
			uint OutputClusterIndex = 0;
			WaveInterlockedAddScalar_(OutputClustersStateRW[0], 1, OutputClusterIndex);
			StorePackedCandidateClusterRT(OutputClustersRW, OutputClusterIndex, PackedCluster);

			const uint MeshClusterCountIndex = GetMeshClusterCountIndex(MeshDataOffset);
			InterlockedAdd(MeshDataBuffer[MeshClusterCountIndex], 1);
#endif

#if NANITE_STREAM_OUT_COUNT_VERTICES_AND_TRIANGLES
			InterlockedAdd(MeshDataBuffer[MeshVertexCountIndex], Cluster.NumVerts);

			FCountTrianglesClusterSegmentProcessor Processor;
			Processor.MeshDataOffset = MeshDataOffset;
			Processor.RequestData = RequestData;
			IterateClusterSegments(Cluster, ClusterPageData, Processor);
#else // !NANITE_STREAM_OUT_COUNT_VERTICES_AND_TRIANGLES
			StreamOutClusterCommon(Cluster, CandidateCluster, RequestData);
#endif // !NANITE_STREAM_OUT_COUNT_VERTICES_AND_TRIANGLES
		}
	}
};

[numthreads(NANITE_PERSISTENT_CLUSTER_CULLING_GROUP_SIZE, 1, 1)]
void NaniteStreamOutTraversalCS(uint GroupID : SV_GroupID, uint GroupIndex : SV_GroupIndex)
{
#if CULLING_TYPE == NANITE_CULLING_TYPE_NODES
	NodeCull(GroupID, GroupIndex, 0);
#elif CULLING_TYPE == NANITE_CULLING_TYPE_CLUSTERS
	ClusterCull(GroupID, GroupIndex, 0);
#endif
}

#endif

#if !NANITE_HIERARCHY_TRAVERSAL

// Converts the per-segment index counts gathered by the counting pass into segment base
// offsets, allocates the mesh's vertex/index ranges, and resets the counters so the
// stream-out pass can reuse them as write cursors. On allocation failure the buffer
// offsets are stamped 0xFFFFFFFFu, which StreamOutClusterCommon treats as "skip".
bool AllocateRangesCommon(FNaniteStreamOutRequest RequestData)
{
	const uint MeshDataOffset = RequestData.MeshDataOffset;

	const uint MeshVertexBufferOffsetIndex = GetMeshVertexBufferOffsetIndex(MeshDataOffset);
	const uint MeshIndexBufferOffsetIndex = GetMeshIndexBufferOffsetIndex(MeshDataOffset);
	const uint MeshVertexCountIndex = GetMeshVertexCountIndex(MeshDataOffset);
	const uint MeshIndexCountIndex = GetMeshIndexCountIndex(MeshDataOffset);

	uint NumVertices = MeshDataBuffer[MeshVertexCountIndex];

	// Prefix-sum the per-segment counts into segment base offsets.
	uint NumIndices = 0;
	for (uint SegmentIndex = 0; SegmentIndex < RequestData.NumSegments; ++SegmentIndex)
	{
		const uint MeshSegmentIndexCountIndex = GetMeshSegmentIndexCountIndex(MeshDataOffset, SegmentIndex);
		const uint MeshSegmentIndexBufferOffsetIndex = GetMeshSegmentIndexBufferOffsetIndex(MeshDataOffset, SegmentIndex);

		const uint SegmentNumIndices = MeshDataBuffer[MeshSegmentIndexCountIndex];
		MeshDataBuffer[MeshSegmentIndexCountIndex] = 0; // reset counter back to zero so that each cluster can determine its offset during stream out pass
		MeshDataBuffer[MeshSegmentIndexBufferOffsetIndex] = NumIndices; // segment first index

		NumIndices += SegmentNumIndices;
	}

	MeshDataBuffer[MeshIndexCountIndex] = NumIndices;

	uint BaseVertexOffset = 0u;
	uint BaseIndexOffset = 0u;
	if (AllocateVertexAndIndexRanges(NumVertices, NumIndices, BaseVertexOffset, BaseIndexOffset))
	{
		MeshDataBuffer[MeshVertexBufferOffsetIndex] = BaseVertexOffset;
		MeshDataBuffer[MeshIndexBufferOffsetIndex] = BaseIndexOffset;
		MeshDataBuffer[MeshVertexCountIndex] = 0; // reset counter back to zero so that each cluster can determine its offset during stream out pass
		return true;
	}
	else
	{
		MeshDataBuffer[MeshVertexBufferOffsetIndex] = 0xFFFFFFFFu;
		MeshDataBuffer[MeshIndexBufferOffsetIndex] = 0xFFFFFFFFu;
		return false;
	}
}

// NOTE(review): element type restored from usage (QueueState[0].PassState[0].NodeWriteOffset);
// FQueueState is the queue-state struct declared in the shared Nanite decode headers — confirm.
RWStructuredBuffer<FQueueState> QueueState;
RWByteAddressBuffer Nodes;

// Seeds the traversal queue with the root node (NodeIndex 0) of every request.
[numthreads(64, 1, 1)]
void InitQueue(uint GroupIndex : SV_GroupIndex, uint3 GroupId : SV_GroupID)
{
	const uint Index = GetUnWrappedDispatchThreadId(GroupId, GroupIndex, 64);

	if (Index < NumRequests)
	{
		const FNaniteStreamOutRequest RequestData = StreamOutRequests[Index];

#if ALLOCATE_VERTICES_AND_TRIANGLES_RANGES
		if (!AllocateRangesCommon(RequestData))
		{
			// don't add request to the queue if allocation failed
			return;
		}
#endif

		uint NodeOffset = 0;
		WaveInterlockedAddScalar_(QueueState[0].PassState[0].NodeWriteOffset, 1, NodeOffset);
		WaveInterlockedAddScalar(QueueState[0].PassState[0].NodeCount, 1);

		{
			FCandidateNodeRT Node;
			Node.NodeIndex = 0;
			Node.PrimitiveId = RequestData.PrimitiveId;
			Node.RequestIndex = Index;
			StoreCandidateNodeRT(Nodes, NodeOffset, Node);
		}
	}
}

#define STREAM_OUT_GROUP_SIZE (64)
#define STREAM_OUT_NUM_CLUSTER_PER_THREAD (8)
#define STREAM_OUT_NUM_GROUPS_DIVISOR (STREAM_OUT_GROUP_SIZE * STREAM_OUT_NUM_CLUSTER_PER_THREAD)

RWBuffer<uint> StreamOutDispatchIndirectArgsRW;

// Allocates ranges for every request and (thread 0) writes the indirect dispatch args
// for NaniteStreamOutCS based on the number of cached clusters.
[numthreads(64, 1, 1)]
void AllocateRangesCS(uint GroupIndex : SV_GroupIndex, uint3 GroupId : SV_GroupID)
{
	const uint Index = GetUnWrappedDispatchThreadId(GroupId, GroupIndex, 64);

	if (Index == 0)
	{
		const uint NumClusters = OutputClustersStateRW[0];
		WriteDispatchIndirectArgs(StreamOutDispatchIndirectArgsRW, 0, (NumClusters + STREAM_OUT_NUM_GROUPS_DIVISOR - 1) / STREAM_OUT_NUM_GROUPS_DIVISOR, 1, 1);
	}

	if (Index < NumRequests)
	{
		const FNaniteStreamOutRequest RequestData = StreamOutRequests[Index];
		AllocateRangesCommon(RequestData);
	}
}

ByteAddressBuffer OutputClusters;

// Streams out the cached clusters. Threads cooperatively consume clusters by atomically
// advancing the shared cursor in OutputClustersStateRW[1] until the list is exhausted.
[numthreads(STREAM_OUT_GROUP_SIZE, 1, 1)]
void NaniteStreamOutCS(uint GroupIndex : SV_GroupIndex, uint3 GroupId : SV_GroupID, uint3 DispatchThreadId : SV_DispatchThreadID)
{
	const uint NumClusters = OutputClustersStateRW[0];

	while (true)
	{
		uint ClusterIndex;
		WaveInterlockedAddScalar_(OutputClustersStateRW[1], 1, ClusterIndex);

		if (ClusterIndex >= NumClusters)
		{
			break;
		}

		const FCandidateCluster CandidateCluster = LoadCandidateClusterRT(OutputClusters, ClusterIndex);
		const FCluster Cluster = GetCluster(CandidateCluster.PageIndex, CandidateCluster.ClusterIndex);

		const FNaniteStreamOutRequest RequestData = StreamOutRequests[CandidateCluster.RequestIndex];

		StreamOutClusterCommon(Cluster, CandidateCluster, RequestData);
	}
}

#endif