Files
UnrealEngine/Engine/Shaders/Private/Nanite/NaniteStreamOut.usf
2025-05-18 13:04:45 +08:00

693 lines
22 KiB
HLSL

// Copyright Epic Games, Inc. All Rights Reserved.
#include "/Engine/Shared/NaniteDefinitions.h"
#ifndef NANITE_HIERARCHY_TRAVERSAL
# define NANITE_HIERARCHY_TRAVERSAL 0
#endif
#define DEFINE_ITERATE_CLUSTER_SEGMENTS (1)
#if NANITE_HIERARCHY_TRAVERSAL
# define NANITE_HIERARCHY_TRAVERSAL_TYPE (CULLING_TYPE)
# define GROUP_NODE_SIZE 3
# include "NaniteHierarchyTraversal.ush"
#endif
#include "../Common.ush"
//#include "../SceneData.ush"
#include "../WaveOpUtil.ush"
#include "../ComputeShaderUtils.ush"
#include "NaniteDataDecode.ush"
#include "NaniteAttributeDecode.ush"
#include "NaniteCulling.ush"
/*
*
* Per mesh BLAS:
* - Update vertex/index allocators
* - Init Queue
* - Count pass
* - Vertex count
* - Triangles per segment/material
* - Allocate vertex / index ranges
* - Init Queue
* - Stream out data
*
*/
// Per-request parameters for streaming out one mesh. Instances are read from StreamOutRequests.
struct FNaniteStreamOutRequest
{
// Primitive to traverse; InitQueue copies it into the root candidate node.
uint PrimitiveId;
// Number of materials; used as the upper bound for MaterialIndex in checkSlow validation.
uint NumMaterials;
// Number of output segments; AllocateRangesCommon loops over this many segment entries.
uint NumSegments;
// Base offset into SegmentMappingBuffer; adding a material index yields that material's segment index.
uint SegmentMappingOffset;
// Base offset into AuxiliaryDataBufferRW for this request's per-triangle auxiliary data.
uint AuxiliaryDataOffset;
// Base offset of this request's record inside MeshDataBuffer (see the GetMesh*Index helpers).
uint MeshDataOffset;
};
// Stream out requests, indexed by request index (the dispatch thread index in InitQueue / AllocateRangesCS).
StructuredBuffer<FNaniteStreamOutRequest> StreamOutRequests;
// Number of valid entries in StreamOutRequests; threads with a larger index do no per-request work.
uint NumRequests;
// Maps (SegmentMappingOffset + MaterialIndex) to a segment index.
StructuredBuffer<uint> SegmentMappingBuffer;
// Error threshold for the LOD cut: compared against MaxParentLODError for nodes and LODError for clusters.
float StreamOutCutError;
// MeshDataBuffer layout (offsets relative to the request's MeshDataOffset)
// 0 - num clusters
// 1 - vertex buffer offset
// 2 - index buffer offset
// 3 - vertex count
// 4 - index count (total over all segments)
// 5 - segment 0 index count
// 6 - segment 0 index buffer offset
// - segment ... index count
// - segment ... index buffer offset
// 5 + 2N - segment N index count
// 6 + 2N - segment N index buffer offset
// Returns the MeshDataBuffer index of the cluster counter (slot 0 of the mesh record).
uint GetMeshClusterCountIndex(uint MeshDataOffset)
{
	const uint ClusterCountSlot = 0;
	return MeshDataOffset + ClusterCountSlot;
}
// Returns the MeshDataBuffer index holding the mesh's base offset in the vertex buffer (slot 1).
uint GetMeshVertexBufferOffsetIndex(uint MeshDataOffset)
{
	const uint VertexBufferOffsetSlot = 1;
	return MeshDataOffset + VertexBufferOffsetSlot;
}
// Returns the MeshDataBuffer index holding the mesh's base offset in the index buffer (slot 2).
uint GetMeshIndexBufferOffsetIndex(uint MeshDataOffset)
{
	const uint IndexBufferOffsetSlot = 2;
	return MeshDataOffset + IndexBufferOffsetSlot;
}
// Returns the MeshDataBuffer index of the mesh's vertex counter (slot 3).
uint GetMeshVertexCountIndex(uint MeshDataOffset)
{
	const uint VertexCountSlot = 3;
	return MeshDataOffset + VertexCountSlot;
}
// Returns the MeshDataBuffer index of the mesh's total index count (slot 4).
uint GetMeshIndexCountIndex(uint MeshDataOffset)
{
	const uint IndexCountSlot = 4;
	return MeshDataOffset + IndexCountSlot;
}
// Returns the MeshDataBuffer index of a segment's index counter.
// Segment entries start at slot 5 and occupy two slots each: [index count, index buffer offset].
uint GetMeshSegmentIndexCountIndex(uint MeshDataOffset, uint SegmentIndex)
{
	const uint SegmentEntryStart = MeshDataOffset + 5 + SegmentIndex * 2;
	return SegmentEntryStart;
}
// Returns the MeshDataBuffer index of a segment's index buffer offset.
// Segment entries start at slot 5 and occupy two slots each: [index count, index buffer offset].
uint GetMeshSegmentIndexBufferOffsetIndex(uint MeshDataOffset, uint SegmentIndex)
{
	const uint SegmentEntryStart = MeshDataOffset + 5 + SegmentIndex * 2;
	return SegmentEntryStart + 1;
}
// Per-mesh counters and offsets; addressed through the GetMesh*Index helpers above.
RWStructuredBuffer<uint> MeshDataBuffer;
// Output positions, written as three consecutive floats (x, y, z) per vertex.
RWStructuredBuffer<float> VertexBuffer; // can't use RWStructuredBuffer<float3> because of bDisableScalarBlockLayout
// Output triangle indices, three entries per triangle.
RWStructuredBuffer<uint> IndexBuffer;
// One packed uint per output triangle (page index | cluster index | triangle index), see WriteAuxiliaryData.
RWStructuredBuffer<uint> AuxiliaryDataBufferRW;
// Packed candidate clusters appended by the traversal when NANITE_STREAM_OUT_CACHE_CLUSTERS is enabled.
RWByteAddressBuffer OutputClustersRW;
// [0] = number of clusters appended to the output cluster list, [1] = next cluster to consume in NaniteStreamOutCS.
RWStructuredBuffer<uint> OutputClustersStateRW;
// | current vertex marker | current index marker |
RWStructuredBuffer<uint> VertexAndIndexAllocator;
// Capacity that vertex allocations are checked against in Allocate (presumably in vertices - TODO confirm units).
uint VertexBufferSize;
// Capacity that index allocations are checked against in Allocate (presumably in indices - TODO confirm units).
uint IndexBufferSize;
// Reserves Size elements from the allocator marker selected by AllocatorEntry.
// OutOffset receives the start of the reserved range (the marker value before the add).
// Returns false if the range does not fit in BufferSize; note the marker is advanced even on failure.
bool Allocate(uint Size, uint BufferSize, uint AllocatorEntry, out uint OutOffset)
{
	InterlockedAdd(VertexAndIndexAllocator[AllocatorEntry], Size, OutOffset);

	const uint RangeEnd = OutOffset + Size;
	return RangeEnd <= BufferSize;
}
// Rounds Value up to the next multiple of 16 (values already on a multiple are unchanged).
uint Align16(uint Value)
{
	const uint AlignmentMask = 15u;
	return (Value + AlignmentMask) & ~AlignmentMask;
}
// Reserves a vertex range and then an index range, both rounded up to a multiple of 16 elements.
// Returns true only if both ranges fit. If the vertex allocation fails, no index allocation is
// attempted and OutIndexOffset is set to 0.
bool AllocateVertexAndIndexRanges(uint VertexSize, uint IndexSize, out uint OutVertexOffset, out uint OutIndexOffset)
{
	// Default for the early-out path below; overwritten by the index allocation otherwise.
	OutIndexOffset = 0;

	// Allocator entry 0 is the vertex marker.
	const bool bVertexRangeFits = Allocate(Align16(VertexSize), VertexBufferSize, 0, OutVertexOffset);
	if (!bVertexRangeFits)
	{
		return false;
	}

	// Allocator entry 1 is the index marker.
	return Allocate(Align16(IndexSize), IndexBufferSize, 1, OutIndexOffset);
}
// A cluster selected during traversal, stored in packed form in the candidate/output cluster buffers.
struct FCandidateCluster
{
// GPU page index; passed to GetCluster together with ClusterIndex.
uint PageIndex;
// Index of the cluster within its page.
uint ClusterIndex;
// Primitive the cluster belongs to.
uint PrimitiveId;
// Index into StreamOutRequests of the request that produced this cluster (packed into 20 bits).
uint RequestIndex;
};
// Packs a candidate cluster into a bit stream: page index, cluster index, primitive id, request index.
// ClusterIdx is not used by the packing. The field order and widths must match UnpackCandidateClusterRT.
uint4 PackCandidateClusterRT(uint ClusterIdx, FCandidateCluster CandidateCluster)
{
	uint4 Packed = 0;
	uint Cursor = 0;

	WriteBits(Packed, Cursor, CandidateCluster.PageIndex,    NANITE_MAX_GPU_PAGES_BITS);
	WriteBits(Packed, Cursor, CandidateCluster.ClusterIndex, NANITE_MAX_CLUSTERS_PER_PAGE_BITS);
	WriteBits(Packed, Cursor, CandidateCluster.PrimitiveId,  NANITE_MAX_INSTANCES_BITS);	// TODO: Should be NANITE_MAX_PRIMITIVE_BITS
	WriteBits(Packed, Cursor, CandidateCluster.RequestIndex, 20);							// TODO: Enforce this limit on CPU

	return Packed;
}
// Inverse of PackCandidateClusterRT: reads the fields back in the same order and with the same widths.
FCandidateCluster UnpackCandidateClusterRT(uint4 RawData)
{
	FCandidateCluster Result;
	uint Cursor = 0;

	Result.PageIndex    = ReadBits(RawData, Cursor, NANITE_MAX_GPU_PAGES_BITS);
	Result.ClusterIndex = ReadBits(RawData, Cursor, NANITE_MAX_CLUSTERS_PER_PAGE_BITS);
	Result.PrimitiveId  = ReadBits(RawData, Cursor, NANITE_MAX_INSTANCES_BITS);
	Result.RequestIndex = ReadBits(RawData, Cursor, 20);	// TODO: Enforce this limit on CPU

	return Result;
}
// A hierarchy node queued for traversal, stored in packed form in the Nodes buffer.
struct FCandidateNodeRT
{
// Node index within the primitive's hierarchy (0 for the root node queued by InitQueue).
uint NodeIndex;
// Primitive whose hierarchy is being traversed.
uint PrimitiveId;
// Index into StreamOutRequests of the originating request (packed into 20 bits).
uint RequestIndex;
};
// Packs a candidate node into a bit stream: node index, primitive id, request index.
// The field order and widths must match UnpackCandidateNodeRT.
uint4 PackCandidateNodeRT(FCandidateNodeRT Node)
{
	uint4 Packed = 0;
	uint Cursor = 0;

	// Leave at least one bit unused in each of the fields, so 0xFFFFFFFFu is never a valid value.
	WriteBits(Packed, Cursor, Node.NodeIndex,    NANITE_MAX_NODES_PER_PRIMITIVE_BITS);
	WriteBits(Packed, Cursor, Node.PrimitiveId,  NANITE_MAX_INSTANCES_BITS);	// TODO: Should be NANITE_MAX_PRIMITIVE_BITS
	WriteBits(Packed, Cursor, Node.RequestIndex, 20);							// TODO: Enforce this limit on CPU

	// 0xFFFFFFFFu in a stored word is reserved as the "cleared / not ready" marker.
	checkSlow(Packed.x != 0xFFFFFFFFu && Packed.y != 0xFFFFFFFFu && Packed.z != 0xFFFFFFFFu);
	return Packed;
}
// Inverse of PackCandidateNodeRT: reads the fields back in the same order and with the same widths.
FCandidateNodeRT UnpackCandidateNodeRT(uint4 RawData)
{
	FCandidateNodeRT Result;
	uint Cursor = 0;

	Result.NodeIndex    = ReadBits(RawData, Cursor, NANITE_MAX_NODES_PER_PRIMITIVE_BITS);
	Result.PrimitiveId  = ReadBits(RawData, Cursor, NANITE_MAX_INSTANCES_BITS);
	Result.RequestIndex = ReadBits(RawData, Cursor, 20);	// TODO: Enforce this limit on CPU

	return Result;
}
// Size in bytes of one packed candidate node record: three 32-bit words, matching the Load3/Store3 accesses.
uint GetCandidateNodeSizeRT()
{
	const uint WordsPerNode = 3u;
	const uint BytesPerWord = 4u;
	return WordsPerNode * BytesPerWord;
}
// Size in bytes of one packed candidate cluster record: three 32-bit words, matching the Load3/Store3 accesses.
uint GetCandidateClusterSizeRT()
{
	const uint WordsPerCluster = 3u;
	const uint BytesPerWord = 4u;
	return WordsPerCluster * BytesPerWord;
}
// Loads the three-word record at ClusterIndex from a read-only cluster buffer and unpacks it.
FCandidateCluster LoadCandidateClusterRT(ByteAddressBuffer CandidateClusters, uint ClusterIndex)
{
	const uint ByteOffset = ClusterIndex * GetCandidateClusterSizeRT();
	const uint3 Words = CandidateClusters.Load3(ByteOffset);

	// Only three words are stored; the fourth component is padded with zero.
	return UnpackCandidateClusterRT(uint4(Words, 0u));
}
// Stores an already packed cluster at slot ClusterIndex; only the first three words are written.
void StorePackedCandidateClusterRT(RWByteAddressBuffer CandidateClusters, uint ClusterIndex, uint4 PackedCluster)
{
	const uint ByteOffset = ClusterIndex * GetCandidateClusterSizeRT();
	CandidateClusters.Store3(ByteOffset, PackedCluster.xyz);
}
// Packs CandidateCluster and stores it at slot ClusterIndex without validating the slot index.
void StoreCandidateClusterNoCheckRT(RWByteAddressBuffer CandidateClusters, uint ClusterIndex, FCandidateCluster CandidateCluster)
{
	const uint4 Packed = PackCandidateClusterRT(ClusterIndex, CandidateCluster);
	const uint ByteOffset = ClusterIndex * GetCandidateClusterSizeRT();
	CandidateClusters.Store3(ByteOffset, Packed.xyz);
}
// Checked variant of StoreCandidateClusterNoCheckRT: validates the slot index against MaxCandidateClusters,
// then packs and stores the cluster.
void StoreCandidateClusterRT(RWByteAddressBuffer CandidateClusters, uint ClusterIndex, FCandidateCluster CandidateCluster)
{
	checkSlow(ClusterIndex < MaxCandidateClusters);

	const uint4 Packed = PackCandidateClusterRT(ClusterIndex, CandidateCluster);
	CandidateClusters.Store3(ClusterIndex * GetCandidateClusterSizeRT(), Packed.xyz);
}
// Loads the raw three-word node record at NodeIndex; the fourth component of the result is zero.
uint4 LoadCandidateNodeDataRT(RWByteAddressBuffer InNodes, uint NodeIndex)
{
	checkSlow(NodeIndex < MaxNodes);

	const uint ByteOffset = NodeIndex * GetCandidateNodeSizeRT();
	const uint3 Words = InNodes.Load3(ByteOffset);
	return uint4(Words, 0);
}
// Stores a raw node record at NodeIndex; only the first three words of RawData are written.
void StoreCandidateNodeDataRT(RWByteAddressBuffer InNodes, uint NodeIndex, uint4 RawData)
{
	checkSlow(NodeIndex < MaxNodes);

	const uint ByteOffset = NodeIndex * GetCandidateNodeSizeRT();
	InNodes.Store3(ByteOffset, RawData.xyz);
}
// Packs Node and stores it at slot NodeIndex of the node buffer.
void StoreCandidateNodeRT(RWByteAddressBuffer InNodes, uint NodeIndex, FCandidateNodeRT Node)
{
	checkSlow(NodeIndex < MaxNodes);

	const uint4 Packed = PackCandidateNodeRT(Node);
	StoreCandidateNodeDataRT(InNodes, NodeIndex, Packed);
}
// Marks slot NodeIndex as cleared by filling its stored words with 0xFFFFFFFFu,
// a value PackCandidateNodeRT never produces for a valid node.
void ClearCandidateNodeRT(RWByteAddressBuffer InNodes, uint NodeIndex)
{
	checkSlow(NodeIndex < MaxNodes);

	const uint4 ClearedMarker = uint4(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu);
	StoreCandidateNodeDataRT(InNodes, NodeIndex, ClearedMarker);
}
// Writes one packed uint describing the source of an output triangle:
// page index in the low bits, then the cluster index, then the triangle index within the cluster.
void WriteAuxiliaryData(FCandidateCluster CandidateCluster, uint TriangleIndex, uint AuxiliaryDataBufferOffset)
{
	const uint ClusterShift  = NANITE_MAX_GPU_PAGES_BITS;
	const uint TriangleShift = NANITE_MAX_GPU_PAGES_BITS + NANITE_MAX_CLUSTERS_PER_PAGE_BITS;

	const uint Packed =
		CandidateCluster.PageIndex |
		(CandidateCluster.ClusterIndex << ClusterShift) |
		(TriangleIndex << TriangleShift);

	AuxiliaryDataBufferRW[AuxiliaryDataBufferOffset] = Packed;
}
// Decodes NumTriangles triangles of Cluster starting at StartTriangle and writes them out.
// Each triangle's cluster-local vertex indices are rebased by BaseIndex and written to IndexBuffer
// starting at IndexBufferOffset; one auxiliary entry per triangle is written starting at
// AuxiliaryDataBufferOffset.
void WriteTriangles(FCluster Cluster, FCandidateCluster CandidateCluster, uint StartTriangle, uint NumTriangles, uint BaseIndex, uint IndexBufferOffset, uint AuxiliaryDataBufferOffset)
{
	for (uint LocalTri = 0; LocalTri < NumTriangles; ++LocalTri)
	{
		const uint SourceTriangle = StartTriangle + LocalTri;
		const uint FirstOutIndex = IndexBufferOffset + LocalTri * 3;

		const uint3 LocalIndices = DecodeTriangleIndices(Cluster, SourceTriangle);

		IndexBuffer[FirstOutIndex + 0] = BaseIndex + LocalIndices.x;
		IndexBuffer[FirstOutIndex + 1] = BaseIndex + LocalIndices.y;
		IndexBuffer[FirstOutIndex + 2] = BaseIndex + LocalIndices.z;

		WriteAuxiliaryData(CandidateCluster, SourceTriangle, AuxiliaryDataBufferOffset + LocalTri);
	}
}
// Streams out one material range of a cluster into the output segment mapped to MaterialIndex.
// The cluster atomically reserves TriLength * 3 indices from the segment's counter, then writes
// its triangles and the matching auxiliary data at the reserved position.
void WriteSegment(uint MaterialIndex, uint TriStart, uint TriLength, uint ClusterVertexOffset, uint MeshDataOffset, uint IndexBufferOffset, uint AuxiliaryDataOffset, FCluster Cluster, FCandidateCluster CandidateCluster, FNaniteStreamOutRequest RequestData)
{
	checkSlow(MaterialIndex < RequestData.NumMaterials);

	// Material -> output segment.
	const uint SegmentIndex = SegmentMappingBuffer[RequestData.SegmentMappingOffset + MaterialIndex];
	const uint CounterSlot = GetMeshSegmentIndexCountIndex(MeshDataOffset, SegmentIndex);
	const uint OffsetSlot = GetMeshSegmentIndexBufferOffsetIndex(MeshDataOffset, SegmentIndex);

	// First index of the segment, relative to the mesh's index range.
	const uint SegmentFirstIndex = MeshDataBuffer[OffsetSlot];

	// Reserve this cluster's slice of the segment; the counter's previous value is the slice start.
	uint OffsetInSegment;
	InterlockedAdd(MeshDataBuffer[CounterSlot], TriLength * 3, OffsetInSegment);

	// Position relative to the mesh's index range; auxiliary data has one entry per triangle (3 indices).
	const uint MeshRelativeIndex = SegmentFirstIndex + OffsetInSegment;

	WriteTriangles(
		Cluster,
		CandidateCluster,
		TriStart,
		TriLength,
		ClusterVertexOffset,
		IndexBufferOffset + MeshRelativeIndex,
		AuxiliaryDataOffset + MeshRelativeIndex / 3);
}
// Processor handed to IterateClusterSegments during stream out: forwards each
// (triangle range, material) reported by the iteration to WriteSegment together with the captured state.
struct FStreamOutClusterSegmentProcessor
{
// Offset of the cluster's first vertex relative to the mesh's vertex range; added to every triangle index.
uint ClusterVertexOffset;
// Base offset of the mesh record in MeshDataBuffer.
uint MeshDataOffset;
// Base offset of the mesh's range in IndexBuffer.
uint IndexBufferOffset;
// Base offset of the request's range in AuxiliaryDataBufferRW.
uint AuxiliaryDataOffset;
FCluster Cluster;
FCandidateCluster CandidateCluster;
FNaniteStreamOutRequest RequestData;
// Called with a triangle range of the cluster and the material index it uses.
void Process(uint TriStart, uint TriLength, uint MaterialIndex)
{
WriteSegment(MaterialIndex, TriStart, TriLength, ClusterVertexOffset, MeshDataOffset, IndexBufferOffset, AuxiliaryDataOffset, Cluster, CandidateCluster, RequestData);
}
};
// Streams out one cluster: decoded vertex positions go to VertexBuffer and, per material range,
// rebased triangle indices and auxiliary data go to IndexBuffer / AuxiliaryDataBufferRW.
// Does nothing if the mesh's vertex or index buffer offset is 0xFFFFFFFFu (range allocation failed).
void StreamOutClusterCommon(FCluster Cluster, FCandidateCluster CandidateCluster, FNaniteStreamOutRequest RequestData)
{
	// TODO experiments:
	// - Output using one group per cluster

	const uint MeshDataOffset = RequestData.MeshDataOffset;

	const uint MeshVertexBase = MeshDataBuffer[GetMeshVertexBufferOffsetIndex(MeshDataOffset)];
	const uint MeshIndexBase = MeshDataBuffer[GetMeshIndexBufferOffsetIndex(MeshDataOffset)];

	const bool bHasVertexRange = MeshVertexBase != 0xFFFFFFFFu;
	const bool bHasIndexRange = MeshIndexBase != 0xFFFFFFFFu;
	if (!(bHasVertexRange && bHasIndexRange))
	{
		return;
	}

	// Reserve this cluster's vertices; the counter's previous value is the cluster's first vertex.
	uint FirstClusterVertex;
	InterlockedAdd(MeshDataBuffer[GetMeshVertexCountIndex(MeshDataOffset)], Cluster.NumVerts, FirstClusterVertex);

	const uint FirstOutVertex = MeshVertexBase + FirstClusterVertex;
	for (uint Vert = 0; Vert < Cluster.NumVerts; ++Vert)
	{
		// TODO: Nanite-Assemblies: Do we need to apply assembly transform here?
		const float3 Position = DecodePosition(Vert, Cluster);

		// Three floats per vertex.
		const uint FloatIndex = (FirstOutVertex + Vert) * 3;
		VertexBuffer[FloatIndex + 0] = Position.x;
		VertexBuffer[FloatIndex + 1] = Position.y;
		VertexBuffer[FloatIndex + 2] = Position.z;
	}

	FStreamOutClusterSegmentProcessor SegmentWriter;
	SegmentWriter.Cluster = Cluster;
	SegmentWriter.CandidateCluster = CandidateCluster;
	SegmentWriter.RequestData = RequestData;
	SegmentWriter.MeshDataOffset = MeshDataOffset;
	SegmentWriter.ClusterVertexOffset = FirstClusterVertex;
	SegmentWriter.IndexBufferOffset = MeshIndexBase;
	SegmentWriter.AuxiliaryDataOffset = RequestData.AuxiliaryDataOffset;

	IterateClusterSegments(Cluster, ClusterPageData, SegmentWriter);
}
#if NANITE_HIERARCHY_TRAVERSAL
// Packed candidate nodes (12 bytes each); read, written and cleared by the traversal callback below.
RWByteAddressBuffer Nodes;
// Packed candidate clusters (12 bytes each) produced by node traversal and loaded via LoadPackedCluster.
RWByteAddressBuffer CandidateClusters;
// Processor handed to IterateClusterSegments during the count pass: accumulates the number of
// indices (3 per triangle) each cluster contributes to the segment its material maps to.
struct FCountTrianglesClusterSegmentProcessor
{
	uint MeshDataOffset;
	FNaniteStreamOutRequest RequestData;

	// Called with a triangle range of the cluster and the material index it uses.
	void Process(uint TriStart, uint TriLength, uint MaterialIndex)
	{
		checkSlow(MaterialIndex < RequestData.NumMaterials);

		// Material -> output segment -> that segment's index counter.
		const uint MappingSlot = RequestData.SegmentMappingOffset + MaterialIndex;
		const uint CounterSlot = GetMeshSegmentIndexCountIndex(MeshDataOffset, SegmentMappingBuffer[MappingSlot]);

		const uint NumIndices = TriLength * 3;
		InterlockedAdd(MeshDataBuffer[CounterSlot], NumIndices);
	}
};
// Callback object passed as the template argument to NodeCull / ClusterCull (see NaniteStreamOutTraversalCS).
// It selects the hierarchy cut using StreamOutCutError and, per selected cluster, either counts or streams
// out its geometry depending on the NANITE_STREAM_OUT_* permutation defines.
// NOTE(review): the exact call order of these methods is defined by NaniteHierarchyTraversal.ush, which is not
// visible here - confirm against that file.
struct FNaniteTraversalStreamOutCallback
{
// Node currently being processed, unpacked in Init.
FCandidateNodeRT CandidateNode;
// Primitive data for CandidateNode.PrimitiveId, fetched in Init.
FPrimitiveSceneData PrimitiveData;
// Fetches the packed node at GroupNodeFetchIndex from group node data, unpacks it and loads its primitive data.
// InChildIndex and InLocalNodeIndex are not used by this callback.
void Init(uint InChildIndex, uint InLocalNodeIndex, uint GroupNodeFetchIndex)
{
const uint4 NodeData = GetGroupNodeData(GroupNodeFetchIndex);
CandidateNode = UnpackCandidateNodeRT(NodeData);
PrimitiveData = GetPrimitiveData(CandidateNode.PrimitiveId);
}
// Returns the offset of the current node, computed from the primitive's NaniteHierarchyOffset and the node index.
uint GetHierarchyNodeOffset()
{
return ::GetHierarchyNodeOffset(PrimitiveData.NaniteHierarchyOffset, CandidateNode.NodeIndex);
}
// A child is visited only if it was already considered visible and the cut error is below
// the child's MaxParentLODError.
bool ShouldVisitChild(FHierarchyNodeSlice HierarchyNodeSlice, bool bInVisible)
{
bool bShouldVisitChild = bInVisible;
BRANCH
if (bShouldVisitChild)
{
bShouldVisitChild = StreamOutCutError < HierarchyNodeSlice.MaxParentLODError;
}
return bShouldVisitChild;
}
void OnPreProcessNodeBatch(uint GroupIndex)
{
// Nothing to do
}
void OnPostNodeVisit(FHierarchyNodeSlice HierarchyNodeSlice)
{
// Nothing to do
}
// Queues a child node at StoreIndex of Nodes, inheriting the primitive and request of the current node.
void StoreChildNode(uint StoreIndex, FHierarchyNodeSlice HierarchyNodeSlice)
{
FCandidateNodeRT Node;
Node.NodeIndex = HierarchyNodeSlice.ChildStartReference;
Node.PrimitiveId = CandidateNode.PrimitiveId;
Node.RequestIndex = CandidateNode.RequestIndex;
StoreCandidateNodeRT(Nodes, StoreIndex, Node);
}
// Stores a candidate cluster at StoreIndex of CandidateClusters. The page index is taken from the bits of
// ChildStartReference above the low NANITE_MAX_CLUSTERS_PER_PAGE_BITS bits.
void StoreCluster(uint StoreIndex, FHierarchyNodeSlice HierarchyNodeSlice, uint ClusterIndex)
{
FCandidateCluster CandidateCluster;
CandidateCluster.PrimitiveId = CandidateNode.PrimitiveId;
CandidateCluster.RequestIndex = CandidateNode.RequestIndex;
CandidateCluster.PageIndex = HierarchyNodeSlice.ChildStartReference >> NANITE_MAX_CLUSTERS_PER_PAGE_BITS;
CandidateCluster.ClusterIndex = ClusterIndex;
StoreCandidateClusterNoCheckRT(CandidateClusters, StoreIndex, CandidateCluster); // TODO: Uses the NoCheck variant to work around a compilation issue with FXC
}
// Loads the raw three-word cluster record at CandidateIndex; the fourth component is zero.
uint4 LoadPackedCluster(uint CandidateIndex)
{
checkSlow(CandidateIndex < MaxCandidateClusters);
return uint4(CandidateClusters.Load3(CandidateIndex * GetCandidateClusterSizeRT()), 0u);
}
// A node record is ready when none of its three stored words is the 0xFFFFFFFFu cleared marker.
bool IsNodeDataReady(uint4 RawData)
{
return RawData.x != 0xFFFFFFFFu && RawData.y != 0xFFFFFFFFu && RawData.z != 0xFFFFFFFFu;
}
// Loads the node record at NodeIndex and copies it into group node data at GroupIndex.
// With bCheckIfReady the copy only happens if the record is ready. Returns whether the record was ready.
bool LoadCandidateNodeDataToGroup(uint NodeIndex, uint GroupIndex, bool bCheckIfReady = true)
{
uint4 NodeData = LoadCandidateNodeDataRT(Nodes, NodeIndex);
bool bNodeReady = IsNodeDataReady(NodeData);
if (!bCheckIfReady || bNodeReady)
{
SetGroupNodeData(GroupIndex, NodeData);
}
return bNodeReady;
}
// Resets the node slot to the cleared marker.
void ClearCandidateNodeData(uint NodeIndex)
{
ClearCandidateNodeRT(Nodes, NodeIndex);
}
// The three cluster batch methods below assert if called; presumably cluster batches are not used
// by the stream out traversal - TODO confirm.
void AddToClusterBatch(uint BatchIndex, uint Num)
{
check(false);
}
void ClearClusterBatch(uint BatchIndex)
{
check(false);
}
uint LoadClusterBatch(uint BatchIndex)
{
check(false);
return 0;
}
// Handles one candidate cluster. The cluster is part of the cut if its LODError is below the cut error
// or it is flagged as a streaming leaf. Selected clusters are then, depending on the permutation:
// - appended to the output cluster list and counted per mesh (NANITE_STREAM_OUT_CACHE_CLUSTERS),
// - counted (vertices and per-segment indices) (NANITE_STREAM_OUT_COUNT_VERTICES_AND_TRIANGLES),
// - or streamed out immediately otherwise.
void ProcessCluster(uint4 PackedCluster)
{
FCandidateCluster CandidateCluster = UnpackCandidateClusterRT(PackedCluster);
FCluster Cluster = GetCluster(CandidateCluster.PageIndex, CandidateCluster.ClusterIndex);
bool bSmallEnoughToDraw = StreamOutCutError > Cluster.LODError;
bool bVisible = bSmallEnoughToDraw || (Cluster.Flags & NANITE_CLUSTER_FLAG_STREAMING_LEAF);
BRANCH
if (bVisible)
{
const FNaniteStreamOutRequest RequestData = StreamOutRequests[CandidateCluster.RequestIndex];
const uint MeshDataOffset = RequestData.MeshDataOffset;
const uint MeshVertexCountIndex = GetMeshVertexCountIndex(MeshDataOffset);
#if NANITE_STREAM_OUT_CACHE_CLUSTERS
// Append the packed cluster to the output list; OutputClustersStateRW[0] is the append counter.
uint OutputClusterIndex = 0;
WaveInterlockedAddScalar_(OutputClustersStateRW[0], 1, OutputClusterIndex);
StorePackedCandidateClusterRT(OutputClustersRW, OutputClusterIndex, PackedCluster);
const uint MeshClusterCountIndex = GetMeshClusterCountIndex(MeshDataOffset);
InterlockedAdd(MeshDataBuffer[MeshClusterCountIndex], 1);
#endif
#if NANITE_STREAM_OUT_COUNT_VERTICES_AND_TRIANGLES
// Count pass: accumulate the vertex count and the per-segment index counts.
InterlockedAdd(MeshDataBuffer[MeshVertexCountIndex], Cluster.NumVerts);
FCountTrianglesClusterSegmentProcessor Processor;
Processor.MeshDataOffset = MeshDataOffset;
Processor.RequestData = RequestData;
IterateClusterSegments(Cluster, ClusterPageData, Processor);
#else // !NANITE_STREAM_OUT_COUNT_VERTICES_AND_TRIANGLES
StreamOutClusterCommon(Cluster, CandidateCluster, RequestData);
#endif // !NANITE_STREAM_OUT_COUNT_VERTICES_AND_TRIANGLES
}
}
};
// Traversal entry point. Depending on the CULLING_TYPE permutation it runs either the node pass (NodeCull)
// or the cluster pass (ClusterCull), both driven by FNaniteTraversalStreamOutCallback.
// For any other CULLING_TYPE value the shader body is empty.
[numthreads(NANITE_PERSISTENT_CLUSTER_CULLING_GROUP_SIZE, 1, 1)]
void NaniteStreamOutTraversalCS(uint GroupID : SV_GroupID, uint GroupIndex : SV_GroupIndex)
{
#if CULLING_TYPE == NANITE_CULLING_TYPE_NODES
NodeCull<FNaniteTraversalStreamOutCallback>(GroupID, GroupIndex, 0);
#elif CULLING_TYPE == NANITE_CULLING_TYPE_CLUSTERS
ClusterCull<FNaniteTraversalStreamOutCallback>(GroupID, GroupIndex, 0);
#endif
}
#endif
#if !NANITE_HIERARCHY_TRAVERSAL
// Turns the totals gathered by the count pass into output ranges for one mesh:
// - converts the per-segment index counts into per-segment start offsets and resets the counts,
// - stores the total index count,
// - allocates the vertex and index ranges and stores their base offsets.
// On allocation failure both base offsets are set to 0xFFFFFFFFu and false is returned.
bool AllocateRangesCommon(FNaniteStreamOutRequest RequestData)
{
	const uint MeshDataOffset = RequestData.MeshDataOffset;

	const uint VertexBaseSlot = GetMeshVertexBufferOffsetIndex(MeshDataOffset);
	const uint IndexBaseSlot = GetMeshIndexBufferOffsetIndex(MeshDataOffset);
	const uint VertexCountSlot = GetMeshVertexCountIndex(MeshDataOffset);
	const uint IndexCountSlot = GetMeshIndexCountIndex(MeshDataOffset);

	const uint TotalVertices = MeshDataBuffer[VertexCountSlot];

	// Exclusive prefix sum over the segment index counts.
	uint TotalIndices = 0;
	for (uint Segment = 0; Segment < RequestData.NumSegments; ++Segment)
	{
		const uint SegmentCountSlot = GetMeshSegmentIndexCountIndex(MeshDataOffset, Segment);
		const uint SegmentOffsetSlot = GetMeshSegmentIndexBufferOffsetIndex(MeshDataOffset, Segment);

		const uint SegmentIndices = MeshDataBuffer[SegmentCountSlot];

		// Reset the counter so that each cluster can determine its offset during the stream out pass.
		MeshDataBuffer[SegmentCountSlot] = 0;
		// First index of the segment, relative to the mesh's index range.
		MeshDataBuffer[SegmentOffsetSlot] = TotalIndices;

		TotalIndices += SegmentIndices;
	}
	MeshDataBuffer[IndexCountSlot] = TotalIndices;

	uint VertexRangeStart = 0u;
	uint IndexRangeStart = 0u;
	const bool bAllocated = AllocateVertexAndIndexRanges(TotalVertices, TotalIndices, VertexRangeStart, IndexRangeStart);

	if (!bAllocated)
	{
		// Invalid offsets make StreamOutClusterCommon skip this mesh.
		MeshDataBuffer[VertexBaseSlot] = 0xFFFFFFFFu;
		MeshDataBuffer[IndexBaseSlot] = 0xFFFFFFFFu;
		return false;
	}

	MeshDataBuffer[VertexBaseSlot] = VertexRangeStart;
	MeshDataBuffer[IndexBaseSlot] = IndexRangeStart;
	// Reset the counter so that each cluster can determine its offset during the stream out pass.
	MeshDataBuffer[VertexCountSlot] = 0;
	return true;
}
// Traversal queue state; InitQueue advances PassState[0].NodeWriteOffset and PassState[0].NodeCount.
RWStructuredBuffer<FQueueState> QueueState;
// Packed candidate nodes (12 bytes each); InitQueue stores one root node per accepted request.
RWByteAddressBuffer Nodes;
// One thread per request: queues the root hierarchy node (node index 0) of the request's primitive.
// With ALLOCATE_VERTICES_AND_TRIANGLES_RANGES the output ranges are allocated first, and requests
// whose allocation fails are not queued.
[numthreads(64, 1, 1)]
void InitQueue(uint GroupIndex : SV_GroupIndex, uint3 GroupId : SV_GroupID)
{
	const uint RequestIndex = GetUnWrappedDispatchThreadId(GroupId, GroupIndex, 64);
	if (RequestIndex < NumRequests)
	{
		const FNaniteStreamOutRequest Request = StreamOutRequests[RequestIndex];

	#if ALLOCATE_VERTICES_AND_TRIANGLES_RANGES
		const bool bRangesAllocated = AllocateRangesCommon(Request);
		if (!bRangesAllocated)
		{
			// don't add request to the queue if allocation failed
			return;
		}
	#endif

		// Reserve a node slot and account for the new node in pass 0.
		uint QueueSlot = 0;
		WaveInterlockedAddScalar_(QueueState[0].PassState[0].NodeWriteOffset, 1, QueueSlot);
		WaveInterlockedAddScalar(QueueState[0].PassState[0].NodeCount, 1);

		FCandidateNodeRT RootNode;
		RootNode.NodeIndex = 0;
		RootNode.PrimitiveId = Request.PrimitiveId;
		RootNode.RequestIndex = RequestIndex;
		StoreCandidateNodeRT(Nodes, QueueSlot, RootNode);
	}
}
// Thread group size of NaniteStreamOutCS.
#define STREAM_OUT_GROUP_SIZE (64)
// Number of clusters budgeted per thread when sizing the indirect dispatch.
#define STREAM_OUT_NUM_CLUSTER_PER_THREAD (8)
// Clusters budgeted per thread group; AllocateRangesCS divides the cluster count by this (rounding up).
#define STREAM_OUT_NUM_GROUPS_DIVISOR (STREAM_OUT_GROUP_SIZE * STREAM_OUT_NUM_CLUSTER_PER_THREAD)
// Indirect dispatch arguments written by AllocateRangesCS (presumably consumed by the NaniteStreamOutCS dispatch - TODO confirm).
RWBuffer<uint> StreamOutDispatchIndirectArgsRW;
// One thread per request: allocates the request's vertex / index ranges.
// Thread 0 additionally writes the indirect dispatch arguments, with a group count derived from
// the number of clusters in the output cluster list.
[numthreads(64, 1, 1)]
void AllocateRangesCS(uint GroupIndex : SV_GroupIndex, uint3 GroupId : SV_GroupID)
{
	const uint RequestIndex = GetUnWrappedDispatchThreadId(GroupId, GroupIndex, 64);

	if (RequestIndex == 0)
	{
		const uint CachedClusterCount = OutputClustersStateRW[0];
		// Round up so that a partially filled last group is still dispatched.
		const uint NumGroups = (CachedClusterCount + STREAM_OUT_NUM_GROUPS_DIVISOR - 1) / STREAM_OUT_NUM_GROUPS_DIVISOR;
		WriteDispatchIndirectArgs(StreamOutDispatchIndirectArgsRW, 0, NumGroups, 1, 1);
	}

	if (RequestIndex < NumRequests)
	{
		// Failure is recorded in MeshDataBuffer as invalid offsets, so the return value is not needed here.
		AllocateRangesCommon(StreamOutRequests[RequestIndex]);
	}
}
// Read-only list of packed candidate clusters (12 bytes each) consumed by NaniteStreamOutCS.
ByteAddressBuffer OutputClusters;
// Streams out the clusters of the output cluster list. Threads repeatedly grab the next unprocessed
// cluster through the shared cursor in OutputClustersStateRW[1] until the list is exhausted.
[numthreads(STREAM_OUT_GROUP_SIZE, 1, 1)]
void NaniteStreamOutCS(uint GroupIndex : SV_GroupIndex, uint3 GroupId : SV_GroupID, uint3 DispatchThreadId : SV_DispatchThreadID)
{
	// OutputClustersStateRW[0] holds the number of clusters in the list.
	const uint TotalClusters = OutputClustersStateRW[0];

	while (true)
	{
		uint NextCluster;
		WaveInterlockedAddScalar_(OutputClustersStateRW[1], 1, NextCluster);

		if (NextCluster >= TotalClusters)
		{
			break;
		}

		const FCandidateCluster Candidate = LoadCandidateClusterRT(OutputClusters, NextCluster);
		const FCluster ClusterData = GetCluster(Candidate.PageIndex, Candidate.ClusterIndex);
		const FNaniteStreamOutRequest Request = StreamOutRequests[Candidate.RequestIndex];

		StreamOutClusterCommon(ClusterData, Candidate, Request);
	}
}
#endif