// Copyright Epic Games, Inc. All Rights Reserved.

#include "../Common.ush"
#include "../SceneData.ush"
#include "../WaveOpUtil.ush"
#include "../ComputeShaderUtils.ush"

#define SPLIT_WORK_QUEUE (PATCHES) // TODO: Remove once shader rewriter has been fixed (UE-202409)

#ifndef NANITE_TESSELLATION
#define NANITE_TESSELLATION PATCHES
#endif

#include "NaniteAttributeDecode.ush"
#include "NaniteTessellation.ush"

#define RASTER_BIN_INIT (RASTER_BIN_PASS == NANITE_RASTER_BIN_INIT)
#define RASTER_BIN_COUNT (RASTER_BIN_PASS == NANITE_RASTER_BIN_COUNT)
#define RASTER_BIN_SCATTER (RASTER_BIN_PASS == NANITE_RASTER_BIN_SCATTER)
#define RASTER_BIN_RESERVE (RASTER_BIN_PASS == NANITE_RASTER_BIN_RESERVE)
#define RASTER_BIN_DEPTHBLOCK (RASTER_BIN_PASS == NANITE_RASTER_BIN_DEPTHBLOCK)
#define RASTER_BIN_FINALIZE (RASTER_BIN_PASS == NANITE_RASTER_BIN_FINALIZE)
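
// This file implements the raster binning compute passes, selected at compile time via RASTER_BIN_PASS:
//  - INIT       clears the per-bin metadata (offset and SW/HW counts).
//  - COUNT      accumulates the number of SW/HW cluster entries per bin.
//  - RESERVE    allocates each bin's contiguous range in the cluster indirection buffer.
//  - SCATTER    writes the (cluster, triangle range) entries into the reserved ranges.
//  - DEPTHBLOCK prefix-sums the per-block depth bucket counters.
//  - FINALIZE   clamps the per-bin counts and builds the indirect rasterizer arguments.
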
#ifndef VIRTUAL_TEXTURE_TARGET
#define VIRTUAL_TEXTURE_TARGET 0
#endif

#ifndef FORCE_BATCHING
#define FORCE_BATCHING 0
#endif

#ifndef DEPTH_BUCKETING
#define DEPTH_BUCKETING 1
#endif

uint MeshPassIndex;
uint MinSupportedWaveSize;
uint MaxVisiblePatches;
uint MaxClusterIndirections;

RWStructuredBuffer<FNaniteRasterBinMeta> OutRasterBinMeta;
RWStructuredBuffer<uint> OutDepthBuckets;

FNaniteMaterialFlags GetRasterBinMaterialFlags(uint BinIndex)
{
	return UnpackNaniteMaterialFlags(OutRasterBinMeta[BinIndex].MaterialFlags_DepthBlock & 0xFFFFu);
}

uint GetRasterBinDepthBlock(uint BinIndex)
{
	return OutRasterBinMeta[BinIndex].MaterialFlags_DepthBlock >> 16;
}

void InitRasterBin(uint BinIndex)
{
	OutRasterBinMeta[BinIndex].ClusterOffset = 0;
	OutRasterBinMeta[BinIndex].BinSWCount = 0;
	OutRasterBinMeta[BinIndex].BinHWCount = 0;
}

uint2 GetRasterBinCount(uint BinIndex)
{
	return uint2(OutRasterBinMeta[BinIndex].BinSWCount, OutRasterBinMeta[BinIndex].BinHWCount);
}

uint GetRasterBinCapacity(uint BinIndex)
{
	const uint2 BinCount = GetRasterBinCount(BinIndex);
	return (BinCount.x + BinCount.y);
}

void SetRasterBinOffset(uint BinIndex, uint BinOffset)
{
	OutRasterBinMeta[BinIndex].ClusterOffset = BinOffset;
}

uint GetRasterBinOffset(uint BinIndex)
{
	return OutRasterBinMeta[BinIndex].ClusterOffset;
}

uint GetRemappedRasterBinFromIndex(
	uint InRelativeMaterialIndex,
	uint InPrimitiveIndex,
	uint InRegularRasterBinCount,
	uint InRenderFlags,
	bool bFallbackRasterBin,
	bool bVoxel)
{
	uint RasterBin = GetMaterialRasterBinFromIndex(
		InRelativeMaterialIndex,
		InPrimitiveIndex,
		MeshPassIndex,
		InRegularRasterBinCount,
		bFallbackRasterBin
	);

	const FNaniteMaterialFlags MaterialFlags = GetRasterBinMaterialFlags(RasterBin);
	return RemapRasterBin(RasterBin, InRenderFlags, MaterialFlags, bVoxel);
}

#if NANITE_TESSELLATION

[numthreads(1, 1, 1)]
void InitVisiblePatchesArgs()
{
	const uint NumVisiblePatches = min( RWVisiblePatchesArgs[3], MaxVisiblePatches );

	RWVisiblePatchesArgs[0] = ( NumVisiblePatches + 63u ) / 64u;
	RWVisiblePatchesArgs[1] = 1;
	RWVisiblePatchesArgs[2] = 1;
}

#endif

#if RASTER_BIN_INIT

uint RasterBinCount;

[numthreads(64, 1, 1)]
void RasterBinInit(uint RasterBinIndex : SV_DispatchThreadID)
{
	if (RasterBinIndex < RasterBinCount)
	{
		InitRasterBin(RasterBinIndex);
	}
}

#elif RASTER_BIN_RESERVE

uint RasterBinCount;

RWStructuredBuffer<uint> OutRangeAllocator;
RWBuffer<uint> OutRasterBinArgsSWHW;

uint AllocateRasterBinRange(uint ClusterCount)
{
	uint RangeStart;
	WaveInterlockedAdd_(OutRangeAllocator[0], ClusterCount, RangeStart);
	return RangeStart;
}

[numthreads(64, 1, 1)]
void RasterBinReserve(uint RasterBinIndex : SV_DispatchThreadID)
{
	if (RasterBinIndex < RasterBinCount)
	{
		const uint RasterBinCapacity = GetRasterBinCapacity(RasterBinIndex);
		const uint RasterBinOffset = AllocateRasterBinRange(RasterBinCapacity);
		SetRasterBinOffset(RasterBinIndex, RasterBinOffset);

		const uint ArgsOffset = (RasterBinIndex * NANITE_RASTERIZER_ARG_COUNT);
		WriteRasterizerArgsSWHW(OutRasterBinArgsSWHW, ArgsOffset, 0u, 0u);
	}
}

#elif RASTER_BIN_FINALIZE

uint RasterBinCount;
uint FinalizeMode;

RWBuffer<uint> OutRasterBinArgsSWHW;

[numthreads(64, 1, 1)]
void RasterBinFinalize(uint RasterBinIndex : SV_DispatchThreadID)
{
	if (RasterBinIndex < RasterBinCount)
	{
		const uint Offset = (RasterBinIndex * NANITE_RASTERIZER_ARG_COUNT);
		const uint HWCounterIndex = GetHWClusterCounterIndex(RenderFlags);

		const uint BinOffset = GetRasterBinOffset(RasterBinIndex);

		const uint SWClusterCount = OutRasterBinArgsSWHW[Offset + 0u];
		const uint HWClusterCount = OutRasterBinArgsSWHW[Offset + HWCounterIndex];

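		// Clamp so that [BinOffset, BinOffset + SW + HW) never extends past MaxClusterIndirections; bins that
		// start beyond the limit end up with zero counts. For example, with MaxClusterIndirections = 100,
		// BinOffset = 90, SW = 6, HW = 10: ClampedSW = min(96,100) - min(90,100) = 6 and
		// ClampedHW = min(106,100) - min(96,100) = 4, so only 4 of the 10 HW entries survive.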
		const uint ClampedSWClusterCount = min(BinOffset + SWClusterCount, MaxClusterIndirections) - min(BinOffset, MaxClusterIndirections);
		const uint ClampedHWClusterCount = min(BinOffset + SWClusterCount + HWClusterCount, MaxClusterIndirections) - min(BinOffset + SWClusterCount, MaxClusterIndirections);

		OutRasterBinMeta[RasterBinIndex].BinSWCount = ClampedSWClusterCount;
		OutRasterBinMeta[RasterBinIndex].BinHWCount = ClampedHWClusterCount;

		OutRasterBinArgsSWHW[Offset + 0u] = ClampedSWClusterCount;

		const bool bMeshShader = (RenderFlags & NANITE_RENDER_FLAG_MESH_SHADER) != 0;

		BRANCH
		if (FinalizeMode == 0 && bMeshShader) // Wrapping
		{
#if !PLATFORM_REQUIRES_UNWRAPPED_MESH_SHADER_ARGS
			// Need to spread the launch size across X,Y,Z so that no single dimension exceeds the 64k
			// limit as per https://microsoft.github.io/DirectX-Specs/d3d/MeshShader.html#dispatchmesh-api
			// The product X*Y*Z may be up to 2^22, so we convert 1D->3D->1D to work around this limitation.
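			// For example, a clamped HW cluster count of 100,000 cannot be launched as (100000, 1, 1);
			// GetGroupCountWrapped folds it into a 3D group count whose dimensions all stay within DimensionLimit.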
			const uint DimensionLimit = (1u << 16u) - 1u;

			const uint3 GroupCount = GetGroupCountWrapped(ClampedHWClusterCount, DimensionLimit.xx);

			// HW: ThreadGroupCountXYZ
			OutRasterBinArgsSWHW[Offset + 4u] = GroupCount.x;
			OutRasterBinArgsSWHW[Offset + 5u] = GroupCount.y;
			OutRasterBinArgsSWHW[Offset + 6u] = GroupCount.z;
#else
			OutRasterBinArgsSWHW[Offset + 4u] = ClampedHWClusterCount;
#endif
		}
		else if (FinalizeMode == 1 && bMeshShader) // VK_NV_mesh_shader
		{
			// VK_NV_mesh_shader draw indirect command (count, first, <unused>)
			// https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkDrawMeshTasksIndirectCommandNV.html
			OutRasterBinArgsSWHW[Offset + 4u] = ClampedHWClusterCount;
			OutRasterBinArgsSWHW[Offset + 5u] = 0u;
		}
		else
		{
			OutRasterBinArgsSWHW[Offset + HWCounterIndex] = ClampedHWClusterCount;
		}
	}
}

#elif RASTER_BIN_DEPTHBLOCK

// TODO: Permute based on wave width for slightly better perf?
[numthreads(64, 1, 1)]
void RasterBinDepthBlock(uint DepthBlockIndex : SV_GroupID, uint ThreadIndex : SV_GroupIndex)
{
	const uint LaneIndex = WaveGetLaneIndex();
	const uint LaneCount = WaveGetLaneCount();

	// Make sure this is just a single wave
	if (ThreadIndex >= LaneCount)
	{
		return;
	}

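	// Convert the per-bucket counts of this depth block into an exclusive prefix sum, one wave per block.
	// PrevSum carries the running total across loop iterations when the block has more buckets than lanes.
	// For example, counts [3, 1, 4, ...] become offsets [0, 3, 4, ...].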
	uint PrevSum = 0;
	for (uint BaseIndex = 0; BaseIndex < NANITE_NUM_DEPTH_BUCKETS_PER_BLOCK; BaseIndex += LaneCount)
	{
		uint Index = DepthBlockIndex * NANITE_NUM_DEPTH_BUCKETS_PER_BLOCK + BaseIndex + LaneIndex;
		const uint Count = OutDepthBuckets[Index];

		const uint PrefixSum = PrevSum + WavePrefixSum(Count);
		PrevSum = WaveReadLaneLast(PrefixSum + Count);

		OutDepthBuckets[Index] = PrefixSum;
	}
}

#else

Buffer<uint> InClusterOffsetSWHW;

StructuredBuffer<uint2> InClusterCountSWHW;
StructuredBuffer<uint2> InTotalPrevDrawClusters;

uint bUsePrimOrMeshShader;

uint RegularMaterialRasterBinCount;

#if RASTER_BIN_SCATTER
RWBuffer<uint> OutRasterBinArgsSWHW;
RWStructuredBuffer<uint2> OutRasterBinData;
#endif

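// Vert reuse batch triangle counts are stored as a bitstream of 5-bit values, biased by -1 (a stored 0
// means 1 triangle). A value can straddle a dword boundary: e.g. with BitOffset = 30, the low 2 bits come
// from PackedData.x and the remaining 3 bits from PackedData.y.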
uint ExtractVertReuseBatchInfo(uint2 PackedData, uint BitOffset)
{
	uint BitConsumed = min(5, 32 - BitOffset);
	uint BatchTriCount = BitFieldExtractU32(PackedData.x, BitConsumed, BitOffset);
	if (5 - BitConsumed > 0)
	{
		BatchTriCount |= (BitFieldExtractU32(PackedData.y, 5 - BitConsumed, 0) << BitConsumed);
	}
	return BatchTriCount + 1;
}

uint DecodeVertReuseBatchInfo(FCluster Cluster, uint BatchInfoOffset, uint BatchIndex, bool bInlined)
{
	uint BitOffset = CondMask(bInlined, 12u, Cluster.VertReuseBatchCountTableSize * 4u) + (BatchInfoOffset + BatchIndex) * 5;
	uint DwordOffset = BitOffset >> 5;
	BitOffset -= DwordOffset * 32;

	uint2 PackedData;
	if (bInlined)
	{
		PackedData = uint2(Cluster.VertReuseBatchInfo[DwordOffset], Cluster.VertReuseBatchInfo[DwordOffset + 1]);
	}
	else
	{
		PackedData = ClusterPageData.Load2(Cluster.PageBaseAddress + (Cluster.VertReuseBatchCountTableOffset + DwordOffset) * 4);
	}

	return ExtractVertReuseBatchInfo(PackedData, BitOffset);
}

#if RASTER_BIN_COUNT
void IncrementRasterBinCount(uint BinIndex, bool bSoftware, uint BatchCount, uint FlatDepthBucket)
{
	// One atomic per lane causes severe contention that ends up dominating execution time.

	// This path uses wave ops to reduce it to one atomic per unique bin in the wave.
	// As clusters are allocated in ranges at group granularity (currently ~32 clusters),
	// the expectation is that most waves will only have one or at most a few unique bins.
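	// Each loop iteration peels off one unique (bin, SW/HW) combination: e.g. a wave whose lanes target
	// bins {A, A, A, B} issues two wave-aggregated atomics instead of four per-lane atomics.
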
	bool bDone = false;
	while (WaveActiveAnyTrue(!bDone))
	{
		if (!bDone)
		{
			const uint UniformBinIndex = WaveReadLaneFirst(BinIndex);
			const bool bUniformSoftware = (bool)WaveReadLaneFirst((uint)bSoftware);
			if (BinIndex == UniformBinIndex && bSoftware == bUniformSoftware)
			{
				if (bUniformSoftware)
				{
					WaveInterlockedAdd(OutRasterBinMeta[UniformBinIndex].BinSWCount, BatchCount);
				}
				else
				{
					WaveInterlockedAdd(OutRasterBinMeta[UniformBinIndex].BinHWCount, BatchCount);
				}

				bDone = true;
			}
		}
	}

#if DEPTH_BUCKETING
	BRANCH
	if (FlatDepthBucket != 0xFFFFFFFFu)
	{
		// TODO: We could merge this with the raster counters above if all the counters were in the same buffer. Worthwhile?
		// TODO: Does scalarization make sense here?
		InterlockedAdd(OutDepthBuckets[FlatDepthBucket], BatchCount);
	}
#endif
}
#elif RASTER_BIN_SCATTER
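// Allocates BatchCount consecutive slots for this cluster inside its bin's reserved range and returns the
// absolute offset into OutRasterBinData. Within a bin's range, the first BinSWCount slots hold the SW
// entries and the remaining BinHWCount slots hold the HW entries; the ordering inside each half depends on
// whether depth bucketing is active (see below).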
uint AllocateRasterBinCluster(uint BinIndex, bool bSoftware, uint BatchCount, uint FlatDepthBucket)
{
	uint ClusterOffset = 0u;
	uint Offset = (BinIndex * NANITE_RASTERIZER_ARG_COUNT);

	if (!bSoftware)
	{
		Offset += CondMask((RenderFlags & NANITE_RENDER_FLAG_MESH_SHADER) || (RenderFlags & NANITE_RENDER_FLAG_PRIMITIVE_SHADER), 4u, 5u);
	}

	bool bDone = false;
	while (WaveActiveAnyTrue(!bDone))
	{
		if (!bDone)
		{
			const uint UniformOffset = WaveReadLaneFirst(Offset);
			if (Offset == UniformOffset)
			{
				WaveInterlockedAdd_(OutRasterBinArgsSWHW[UniformOffset], BatchCount, ClusterOffset);
				bDone = true;
			}
		}
	}

	const uint2 BinCount = GetRasterBinCount(BinIndex);
#if DEPTH_BUCKETING
	BRANCH
	if (FlatDepthBucket != 0xFFFFFFFFu)
	{
		// TODO: We could merge this with the raster counters above if all the counters were in the same buffer. Worthwhile?
		// TODO: Does scalarization make sense here? Initial test says no.
		// TODO: Consider still using the reverse order inside the buckets for slightly better order?
		InterlockedAdd(OutDepthBuckets[FlatDepthBucket], BatchCount, ClusterOffset);
		ClusterOffset = (!bSoftware ? BinCount.x : 0u) + ClusterOffset;
	}
	else
#endif
	{
		if (bSoftware)
		{
			ClusterOffset = BinCount.x - ClusterOffset - BatchCount; // Write clusters in reverse order for better z-test rejection for voxels and masked materials
		}
		else
		{
			ClusterOffset = BinCount.x + ClusterOffset; // HW cluster list already reversed, so write from beginning
		}
	}

	return GetRasterBinOffset(BinIndex) + ClusterOffset;
}
#endif

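// Routes one contiguous triangle range of a visible cluster into its raster bin. In the COUNT pass this
// only bumps the bin's SW/HW counters (and the depth bucket counter); in the SCATTER pass it allocates
// slots and writes one (cluster index, packed [RangeStart:RangeEnd]) entry per exported batch.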
void ExportRasterBin(uint RasterBin, FNaniteMaterialFlags BinMaterialFlags, uint ClusterIndex, uint RangeStart, uint RangeEnd, uint BatchCount, uint BatchInfoOffset, FCluster Cluster, bool bBatchInfoInlined, bool bSoftware, uint DepthBucket)
{
	if (MinSupportedWaveSize < 32 && BinMaterialFlags.bPixelProgrammable)
	{
		// SW pixel programmable path requires wave size >= 32. Fall back to HW raster when that is not guaranteed.
		bSoftware = false;
	}

	if (!BinMaterialFlags.bNoDerivativeOps)
	{
		// Materials requiring finite differences must run the HW path (SW lanes are processing random triangles)
		bSoftware = false;
	}

#if FORCE_BATCHING
	const bool bForceBatching = true;
#else
	const bool bForceBatching = false;
#endif
	bool bUseBatch = !bSoftware && bUsePrimOrMeshShader && (bForceBatching || BinMaterialFlags.bVertexProgrammable);
	if (BinMaterialFlags.bDisplacement)
	{
		// Displacement always takes the SW path and always uses vert reuse batching
		bSoftware = true;
		bUseBatch = true;
	}

	if (Cluster.bVoxel)
	{
		bSoftware = true;
		bUseBatch = false;
	}

	BatchCount = CondMask(bUseBatch, BatchCount, 1u);

	uint FlatDepthBucket = 0xFFFFFFFFu;
#if DEPTH_BUCKETING
	uint DepthBlock = GetRasterBinDepthBlock(RasterBin);
	if (DepthBlock != 0xFFFFu)
	{
		DepthBlock += (bSoftware ? 0u : 1u);
		FlatDepthBucket = DepthBlock * NANITE_NUM_DEPTH_BUCKETS_PER_BLOCK + DepthBucket;
	}
#endif

#if RASTER_BIN_COUNT
	IncrementRasterBinCount(RasterBin, bSoftware, BatchCount, FlatDepthBucket);
#elif RASTER_BIN_SCATTER
	const uint BinClusterMapping = AllocateRasterBinCluster(RasterBin, bSoftware, BatchCount, FlatDepthBucket);

	// Trim any overflow exports
	BatchCount = min(BinClusterMapping + BatchCount, MaxClusterIndirections) - min(BinClusterMapping, MaxClusterIndirections);

	for (uint BatchIndex = 0; BatchIndex < BatchCount; ++BatchIndex)
	{
		if (bUseBatch)
		{
			uint BatchTriCount = DecodeVertReuseBatchInfo(Cluster, BatchInfoOffset, BatchIndex, bBatchInfoInlined);
			RangeEnd = RangeStart + BatchTriCount;
		}
		OutRasterBinData[BinClusterMapping + BatchIndex].x = ClusterIndex;
		OutRasterBinData[BinClusterMapping + BatchIndex].y = (RangeStart << 16) | RangeEnd;
		RangeStart = RangeEnd;
	}
#endif
}

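// Generic wave scalarization helper: each lane invokes Function exactly once with its Value scalarized
// (wave-uniform); lanes sharing the same value run together in the same iteration of the loop.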
template< typename T, typename FFunction >
void ScalarizeForEachMatching( T Value, FFunction Function )
{
	bool bDone = false;
	while( WaveActiveAnyTrue( !bDone ) )
	{
		if( !bDone )
		{
			T UniformValue = WaveReadLaneFirst( Value );
			if( UniformValue == Value )
			{
				Function( UniformValue );
				bDone = true;
			}
		}
	}
}

bool IsNaniteMaterialMergingAllowed(FNaniteMaterialFlags MaterialFlags)
{
	return !MaterialFlags.bDisplacement; // Merging ranges is incompatible with tessellation. Cluster rasterizer assumes UVDensities is uniform.
}

[numthreads(64, 1, 1)]
void RasterBinBuild(uint RelativeClusterIndex : SV_DispatchThreadID, uint GroupThreadIndex : SV_GroupIndex)
{
#if PATCHES
	const uint VisibleIndex = RelativeClusterIndex;
	const uint NumVisiblePatches = min( VisiblePatchesArgs[3], MaxVisiblePatches );

	if( VisibleIndex < NumVisiblePatches )
	{
#if NANITE_TESSELLATION_PATCH_REFS
		const uint PatchIndex = VisiblePatches.Load(VisibleIndex * 8);
		const uint PatchEncoded = SplitWorkQueue.DataBuffer_Load(PatchIndex * 16);
#else
		const uint PatchEncoded = VisiblePatches.Load(VisibleIndex * 16);
#endif

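		// Each visible patch packs the visible cluster index in the upper bits and the triangle index
		// within that cluster in the low 7 bits.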
		uint VisibleClusterIndex = PatchEncoded.x >> 7;
		uint TriIndex = PatchEncoded.x & 0x7F;

		FVisibleCluster VisibleCluster = GetVisibleCluster( VisibleClusterIndex, VIRTUAL_TEXTURE_TARGET );
		FInstanceSceneData InstanceData = GetInstanceSceneDataUnchecked( VisibleCluster.InstanceId );
		FCluster Cluster = GetCluster( VisibleCluster.PageIndex, VisibleCluster.ClusterIndex );

		const uint RasterBin = GetRemappedRasterBinFromIndex(
			GetRelativeMaterialIndex(Cluster, TriIndex),
			InstanceData.PrimitiveId,
			RegularMaterialRasterBinCount,
			RenderFlags,
			/* bFallbackRasterBin */ false,
			Cluster.bVoxel
		);

		uint IndexInBin = 0;

		// Scalarize atomics
		bool bDone = false;
		while( WaveActiveAnyTrue( !bDone ) )
		{
			if( !bDone )
			{
				const uint UniformBinIndex = WaveReadLaneFirst( RasterBin );
				if( UniformBinIndex == RasterBin )
				{
#if RASTER_BIN_COUNT
					WaveInterlockedAddScalar(OutRasterBinMeta[UniformBinIndex].BinSWCount, 1);
#elif RASTER_BIN_SCATTER
					WaveInterlockedAddScalar_( OutRasterBinArgsSWHW[ UniformBinIndex * NANITE_RASTERIZER_ARG_COUNT + 3 ], 1, IndexInBin );
					const uint Num = IndexInBin + WaveActiveCountBits(true);
					if (WaveIsFirstLane())
					{
						const uint NumGroups = (Num + MaxPatchesPerGroup - 1) / MaxPatchesPerGroup; // Slow integer divide. Fix me?
						InterlockedMax(OutRasterBinArgsSWHW[ UniformBinIndex * NANITE_RASTERIZER_ARG_COUNT + 0 ], NumGroups);
					}
#endif
					bDone = true;
				}
			}
		}

#if RASTER_BIN_SCATTER
		const uint BinMapping = GetRasterBinOffset( RasterBin ) + IndexInBin;

		OutRasterBinData[ BinMapping ].x = VisibleIndex;
		OutRasterBinData[ BinMapping ].y = 0;
#endif
	}
#else
	const uint SWClusterCount = InClusterCountSWHW[0].x;
	const uint HWClusterCount = InClusterCountSWHW[0].y;

	const bool bSoftware = RelativeClusterIndex < SWClusterCount;
	const uint ClusterCount = SWClusterCount + HWClusterCount;

	if (RelativeClusterIndex < ClusterCount)
	{
		RelativeClusterIndex = CondMask(bSoftware, RelativeClusterIndex, RelativeClusterIndex - SWClusterCount);

		uint ClusterOffset = 0;

		const bool bHasPrevDrawData = (RenderFlags & NANITE_RENDER_FLAG_HAS_PREV_DRAW_DATA) != 0u;
		BRANCH
		if (bHasPrevDrawData)
		{
			ClusterOffset += CondMask(bSoftware, InTotalPrevDrawClusters[0].x, InTotalPrevDrawClusters[0].y);
		}

#if IS_POST_PASS
		ClusterOffset += CondMask(bSoftware, InClusterOffsetSWHW[0], InClusterOffsetSWHW[GetHWClusterCounterIndex(RenderFlags)]);
#endif

		uint VisibleClusterIndex = RelativeClusterIndex + ClusterOffset;

		// HW clusters are written from the top
		VisibleClusterIndex = CondMask(bSoftware, VisibleClusterIndex, (MaxVisibleClusters - 1) - VisibleClusterIndex);

		FVisibleCluster VisibleCluster = GetVisibleCluster(VisibleClusterIndex, VIRTUAL_TEXTURE_TARGET);
		FInstanceSceneData InstanceData = GetInstanceSceneDataUnchecked(VisibleCluster);
		FCluster Cluster = GetCluster(VisibleCluster.PageIndex, VisibleCluster.ClusterIndex);
		const bool bFallbackBin = (VisibleCluster.Flags & NANITE_CULLING_FLAG_FALLBACK_RASTER) != 0;
		const uint DepthBucket = VisibleCluster.DepthBucket;

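		// Fast path: the cluster's material ranges are stored inline as up to three ranges (Material0/1/2).
		// Slow path: decode the full material range table from the cluster page data.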
		BRANCH
		if (IsMaterialFastPath(Cluster))
		{
			uint RasterBin0 = 0;
			uint RasterBin1 = 0;
			uint RasterBin2 = 0;

			uint RasterLen0 = 0;
			uint RasterLen1 = 0;
			uint RasterLen2 = 0;

			uint BatchCount0 = BitFieldExtractU32(Cluster.VertReuseBatchInfo.x, 4, 0);
			uint BatchCount1 = BitFieldExtractU32(Cluster.VertReuseBatchInfo.x, 4, 4);
			uint BatchCount2 = BitFieldExtractU32(Cluster.VertReuseBatchInfo.x, 4, 8);
			uint BatchInfoOffset0 = 0;
			uint BatchInfoOffset1 = BatchCount0;
			uint BatchInfoOffset2 = BatchCount0 + BatchCount1;

			FNaniteMaterialFlags BinMaterialFlags0 = (FNaniteMaterialFlags)0;
			FNaniteMaterialFlags BinMaterialFlags1 = (FNaniteMaterialFlags)0;
			FNaniteMaterialFlags BinMaterialFlags2 = (FNaniteMaterialFlags)0;

			// Length is remaining triangles after Material0 and Material1
			const uint Material2Length = Cluster.MaterialTotalLength - (Cluster.Material0Length + Cluster.Material1Length);

			// The 0th material range is always non-zero length
			{
				RasterBin0 = GetRemappedRasterBinFromIndex(Cluster.Material0Index, InstanceData.PrimitiveId, RegularMaterialRasterBinCount, RenderFlags, bFallbackBin, Cluster.bVoxel);
				BinMaterialFlags0 = GetRasterBinMaterialFlags(RasterBin0);
			}

			BRANCH
			if (Cluster.Material1Length > 0u)
			{
				RasterBin1 = GetRemappedRasterBinFromIndex(Cluster.Material1Index, InstanceData.PrimitiveId, RegularMaterialRasterBinCount, RenderFlags, bFallbackBin, Cluster.bVoxel);
				BinMaterialFlags1 = GetRasterBinMaterialFlags(RasterBin1);
			}

			BRANCH
			if (Material2Length > 0)
			{
				RasterBin2 = GetRemappedRasterBinFromIndex(Cluster.Material2Index, InstanceData.PrimitiveId, RegularMaterialRasterBinCount, RenderFlags, bFallbackBin, Cluster.bVoxel);
				BinMaterialFlags2 = GetRasterBinMaterialFlags(RasterBin2);
			}

			const bool bAllowMerging0 = IsNaniteMaterialMergingAllowed(BinMaterialFlags0);
			const bool bAllowMerging1 = IsNaniteMaterialMergingAllowed(BinMaterialFlags1);
			const bool bAllowMerging2 = IsNaniteMaterialMergingAllowed(BinMaterialFlags2);

			BRANCH
			if (RasterBin0 == RasterBin1 && bAllowMerging0 && bAllowMerging1)
			{
				if (RasterBin0 == RasterBin2 && bAllowMerging2)
				{
					RasterLen0 = Cluster.MaterialTotalLength;
					BatchCount0 += BatchCount1 + BatchCount2;
				}
				else
				{
					RasterLen0 = (Cluster.Material0Length + Cluster.Material1Length);
					RasterLen2 = Material2Length;
					BatchCount0 += BatchCount1;
				}
			}
			else if (RasterBin1 == RasterBin2 && bAllowMerging1 && bAllowMerging2)
			{
				RasterLen0 = Cluster.Material0Length;
				RasterLen1 = Cluster.MaterialTotalLength - Cluster.Material0Length;
				BatchCount1 += BatchCount2;
			}
			else
			{
				RasterLen0 = Cluster.Material0Length;
				RasterLen1 = Cluster.Material1Length;
				RasterLen2 = Material2Length;
			}

			// The 0th material range is always non-zero length
			{
				ExportRasterBin(RasterBin0, BinMaterialFlags0, VisibleClusterIndex, 0u, RasterLen0, BatchCount0, BatchInfoOffset0, Cluster, true, bSoftware, DepthBucket);
			}

			BRANCH
			if (RasterLen1 > 0)
			{
				const uint Range1Start = RasterLen0;
				const uint Range1End = Range1Start + RasterLen1;
				ExportRasterBin(RasterBin1, BinMaterialFlags1, VisibleClusterIndex, Range1Start, Range1End, BatchCount1, BatchInfoOffset1, Cluster, true, bSoftware, DepthBucket);
			}

			BRANCH
			if (RasterLen2 > 0)
			{
				const uint Range2Start = (RasterLen0 + RasterLen1);
				const uint Range2End = Range2Start + RasterLen2;
				ExportRasterBin(RasterBin2, BinMaterialFlags2, VisibleClusterIndex, Range2Start, Range2End, BatchCount2, BatchInfoOffset2, Cluster, true, bSoftware, DepthBucket);
			}
		}
		else
		{
			uint CurrentRangeBin = 0xFFFFFFFFu;
			uint CurrentRangeStart = 0;
			uint CurrentRangeEnd = 0;
			uint CurrentBatchCount = 0;
			uint CurrentBatchInfoOffset = 0;

			FNaniteMaterialFlags CurrentBinMaterialFlags = (FNaniteMaterialFlags)0;
			bool bCurrentAllowMerging = false;

			FBitStreamReaderState BatchCountStreamState = BitStreamReader_Create_Aligned(Cluster.PageBaseAddress + Cluster.VertReuseBatchCountTableOffset * 4, 0, NANITE_MAX_CLUSTER_MATERIALS * 4);

			uint TableOffset = Cluster.PageBaseAddress + Cluster.MaterialTableOffset * 4u;
			LOOP for (uint TableEntry = 0; TableEntry < Cluster.MaterialTableLength; ++TableEntry)
			{
				const uint EncodedRange = ClusterPageData.Load(TableOffset);
				TableOffset += 4;

				uint TriStart;
				uint TriLength;
				uint MaterialIndex;
				DecodeMaterialRange(EncodedRange, TriStart, TriLength, MaterialIndex);

				const uint RasterBinN = GetRemappedRasterBinFromIndex(MaterialIndex, InstanceData.PrimitiveId, RegularMaterialRasterBinCount, RenderFlags, bFallbackBin, Cluster.bVoxel);
				const FNaniteMaterialFlags BinMaterialFlags = GetRasterBinMaterialFlags(RasterBinN);
				const bool bAllowMerging = IsNaniteMaterialMergingAllowed(BinMaterialFlags);

				const uint BatchCount = BitStreamReader_Read_RO(ClusterPageData, BatchCountStreamState, 4, 4);

				// Check if raster slot matches the current run, and that the triangle range is contiguous.
				if ((RasterBinN == CurrentRangeBin) && bAllowMerging && bCurrentAllowMerging)
				{
					// Update current range
					CurrentRangeEnd = TriStart + TriLength;
					CurrentBatchCount += BatchCount;
				}
				else
				{
					// Not merging previous range, and previous range has valid triangles that need to be flushed
					BRANCH
					if (CurrentRangeEnd > 0)
					{
						ExportRasterBin(CurrentRangeBin, CurrentBinMaterialFlags, VisibleClusterIndex, CurrentRangeStart, CurrentRangeEnd, CurrentBatchCount, CurrentBatchInfoOffset, Cluster, false, bSoftware, DepthBucket);
					}

					CurrentRangeBin = RasterBinN;
					CurrentRangeStart = TriStart;
					CurrentRangeEnd = CurrentRangeStart + TriLength;
					CurrentBatchInfoOffset += CurrentBatchCount;
					CurrentBatchCount = BatchCount;
					CurrentBinMaterialFlags = BinMaterialFlags;
					bCurrentAllowMerging = bAllowMerging;
				}
			}

			// Need to flush current range
			ExportRasterBin(CurrentRangeBin, CurrentBinMaterialFlags, VisibleClusterIndex, CurrentRangeStart, CurrentRangeEnd, CurrentBatchCount, CurrentBatchInfoOffset, Cluster, false, bSoftware, DepthBucket);
		}
	}
#endif
}

#endif