// UnrealEngine/Engine/Shaders/Private/Nanite/NaniteTranscode.usf
// Copyright Epic Games, Inc. All Rights Reserved.
#include "../Common.ush"
#include "../SceneData.ush"
#include "../ComputeShaderUtils.ush"
#include "NaniteDataDecode.ush"
#include "NaniteAttributeDecode.ush"
struct FClusterInstallInfo
{
	uint LocalPageIndex;
	uint LocalClusterIndex;
	uint SrcPageOffset;
	uint DstPageOffset;
	uint PageDependenciesOffset;
};

struct FPackedClusterInstallInfo
{
	uint LocalPageIndex_LocalClusterIndex;
	uint SrcPageOffset;
	uint DstPageOffset;
	uint PageDependenciesOffset;
};

struct FPageDiskHeader
{
	uint NumClusters;
	uint NumRawFloat4s;
	uint NumVertexRefs;
	uint DecodeInfoOffset;
	uint StripBitmaskOffset;
	uint VertexRefBitmaskOffset;
};
#define SIZEOF_PAGE_DISK_HEADER (6*4)

struct FClusterDiskHeader
{
	uint IndexDataOffset;
	uint PageClusterMapOffset;
	uint VertexRefDataOffset;
	uint LowBytesDataOffset;
	uint MidBytesDataOffset;
	uint HighBytesDataOffset;
	uint NumVertexRefs;
	uint NumPrevRefVerticesBeforeDwords;
	uint NumPrevNewVerticesBeforeDwords;
};
#define SIZEOF_CLUSTER_DISK_HEADER (9*4)
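
// Dispatch parameters: the range of cluster installs to process, the source
// (staging) and destination (GPU page pool) buffers, and the page dependency
// table used to resolve parent page references.
// ZeroUniform is presumably bound as zero; it exists only for the compiler
// workaround in GetClusterInstallInfo below.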
uint StartClusterIndex;
uint NumClusters;
uint ZeroUniform;
StructuredBuffer<FPackedClusterInstallInfo> ClusterInstallInfoBuffer;
StructuredBuffer<uint> PageDependenciesBuffer;
ByteAddressBuffer SrcPageBuffer;
RWByteAddressBuffer DstPageBuffer;
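
// Loads the 24-byte page disk header (SIZEOF_PAGE_DISK_HEADER) from the start of a source page.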
FPageDiskHeader GetPageDiskHeader(uint PageBaseOffset)
{
	const uint4 Data0 = SrcPageBuffer.Load4(PageBaseOffset + 0);
	const uint2 Data1 = SrcPageBuffer.Load2(PageBaseOffset + 16);

	FPageDiskHeader DiskHeader;
	DiskHeader.NumClusters = Data0.x;
	DiskHeader.NumRawFloat4s = Data0.y;
	DiskHeader.NumVertexRefs = Data0.z;
	DiskHeader.DecodeInfoOffset = Data0.w;
	DiskHeader.StripBitmaskOffset = Data1.x;
	DiskHeader.VertexRefBitmaskOffset = Data1.y;
	return DiskHeader;
}
FClusterDiskHeader GetClusterDiskHeader(uint PageBaseOffset, uint ClusterIndex)
{
	const uint ByteOffset = PageBaseOffset + SIZEOF_PAGE_DISK_HEADER + ClusterIndex * SIZEOF_CLUSTER_DISK_HEADER;
	const uint4 Data0 = SrcPageBuffer.Load4(ByteOffset);
	const uint4 Data1 = SrcPageBuffer.Load4(ByteOffset + 16);
	const uint Data2 = SrcPageBuffer.Load(ByteOffset + 32);

	FClusterDiskHeader Header;
	Header.IndexDataOffset = Data0.x;
	Header.PageClusterMapOffset = Data0.y;
	Header.VertexRefDataOffset = Data0.z;
	Header.LowBytesDataOffset = Data0.w;
	Header.MidBytesDataOffset = Data1.x;
	Header.HighBytesDataOffset = Data1.y;
	Header.NumVertexRefs = Data1.z;
	Header.NumPrevRefVerticesBeforeDwords = Data1.w;
	Header.NumPrevNewVerticesBeforeDwords = Data2;
	return Header;
}
FClusterInstallInfo GetClusterInstallInfo(uint Index)
{
	const FPackedClusterInstallInfo PackedData = ClusterInstallInfoBuffer[Index];

	FClusterInstallInfo Info;
	Info.LocalPageIndex = PackedData.LocalPageIndex_LocalClusterIndex >> NANITE_MAX_CLUSTERS_PER_PAGE_BITS;
	Info.LocalClusterIndex = BitFieldExtractU32(PackedData.LocalPageIndex_LocalClusterIndex, NANITE_MAX_CLUSTERS_PER_PAGE_BITS, 0);
	Info.LocalClusterIndex += ZeroUniform;	// Temporary workaround: force promotion from 24-bit to 32-bit arithmetic to sidestep a console shader compiler bug
	Info.SrcPageOffset = PackedData.SrcPageOffset;
	Info.DstPageOffset = PackedData.DstPageOffset;
	Info.PageDependenciesOffset = PackedData.PageDependenciesOffset;
	return Info;
}
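
// Reads a full dword starting at an arbitrary bit offset by loading the two
// aligned dwords that straddle it and funnel-shifting them together.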
uint ReadUnalignedDword(ByteAddressBuffer InputBuffer, uint BaseAddressInBytes, int BitOffset)
{
	const uint ByteAddress = BaseAddressInBytes + (BitOffset >> 3);
	const uint AlignedByteAddress = ByteAddress & ~3u;
	BitOffset = ((ByteAddress & 3u) << 3) | (BitOffset & 7u);

	const uint2 Data = InputBuffer.Load2(AlignedByteAddress);
	return BitAlignU32(Data.y, Data.x, BitOffset);
}
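
// Reads NumComponents consecutive bytes starting at an arbitrary byte address
// and returns them as the components of a uint4 (unused components are zero).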
template<uint NumComponents>
uint4 ReadBytes(ByteAddressBuffer InputBuffer, uint BaseAddress, uint Index)
{
	const uint Address = BaseAddress + Index * NumComponents;

	uint4 Result = 0;
	if (NumComponents == 1)
	{
		const uint Data = InputBuffer.Load(Address & ~3u);
		Result.x = ByteAlignU32(0, Data, Address) & 0xFF;
	}
	else
	{
		const uint2 Data = InputBuffer.Load2(Address & ~3u);
		const uint AlignedData = ByteAlignU32(Data.y, Data.x, Address);
		Result.x = AlignedData & 0xFFu;
		Result.y = NumComponents >= 2 ? BitFieldExtractU32(AlignedData, 8, 8) : 0u;
		Result.z = NumComponents >= 3 ? BitFieldExtractU32(AlignedData, 8, 16) : 0u;
		Result.w = NumComponents >= 4 ? (AlignedData >> 24) : 0u;
	}
	return Result;
}
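
// Decodes a zigzag-encoded value: 0, 1, 2, 3, 4, ... map to 0, -1, 1, -2, 2, ...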
int DecodeZigZag(uint Data)
{
	//return int(Data >> 1) ^ -int(Data & 1);
	return int(Data >> 1) ^ BitFieldExtractI32(Data, 1, 0);
}
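
// Delta values are stored as separate low/mid/high byte planes. Returns how far
// each plane's offset advances after Num values of NumBytesPerValue bytes each;
// planes that don't exist at this byte width don't advance.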
uint3 LowMidHighIncrement(uint NumBytesPerValue, uint Num)
{
	return uint3(NumBytesPerValue >= 1 ? Num : 0u,
				 NumBytesPerValue >= 2 ? Num : 0u,
				 NumBytesPerValue >= 3 ? Num : 0u);
}
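
// Unpacks the Index-th set of per-component deltas from the byte planes,
// zigzag-decodes them and integrates them into absolute values with a wave
// prefix sum. PrevLastValue carries the running total across loop iterations,
// one wave-width of vertices per pass.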
template<uint NumComponents, uint MaxBytesPerDelta>
int4 UnpackZigZagDeltas(ByteAddressBuffer InputBuffer, uint3 LowMidHighOffsets, uint BytesPerDelta, uint Index, inout int4 PrevLastValue)
{
	uint4 PackedValues = 0;
	if (MaxBytesPerDelta >= 3 && BytesPerDelta >= 3)
	{
		PackedValues |= ReadBytes<NumComponents>(InputBuffer, LowMidHighOffsets.z, Index) << 16;
	}
	if (MaxBytesPerDelta >= 2 && BytesPerDelta >= 2)
	{
		PackedValues |= ReadBytes<NumComponents>(InputBuffer, LowMidHighOffsets.y, Index) << 8;
	}
	if (MaxBytesPerDelta >= 1 && BytesPerDelta >= 1)
	{
		PackedValues |= ReadBytes<NumComponents>(InputBuffer, LowMidHighOffsets.x, Index);
	}

	int4 Value = 0;
	Value.x = WaveInclusivePrefixSum(DecodeZigZag(PackedValues.x));
	if (NumComponents >= 2) Value.y = WaveInclusivePrefixSum(DecodeZigZag(PackedValues.y));
	if (NumComponents >= 3) Value.z = WaveInclusivePrefixSum(DecodeZigZag(PackedValues.z));
	if (NumComponents >= 4) Value.w = WaveInclusivePrefixSum(DecodeZigZag(PackedValues.w));

	Value += PrevLastValue;
	PrevLastValue = WaveReadLaneLast(Value);
	return Value;
}
// Debug only. Performance doesn't matter.
void CopyDwords(RWByteAddressBuffer DstBuffer, uint DstAddress, ByteAddressBuffer SrcBuffer, uint SrcAddress, uint NumDwords)
{
	for (uint i = 0; i < NumDwords; i++)
	{
		DstBuffer.Store(DstAddress + i * 4, SrcBuffer.Load(SrcAddress + i * 4));
	}
}
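
// Decodes the three vertex indices of a strip-encoded triangle. Per-dword
// bitmasks classify each triangle (start / left / new-vertex), and 5-bit
// backward references in the index data resolve reuse of previously decoded
// vertices.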
uint3 UnpackStripIndices(uint SrcPageBaseOffset, FPageDiskHeader PageDiskHeader, FClusterDiskHeader ClusterDiskHeader, uint LocalClusterIndex, uint TriIndex)
{
	const uint DwordIndex = TriIndex >> 5;
	const uint BitIndex = TriIndex & 31u;

	// Bitmask.x: bIsStart, Bitmask.y: bIsLeft, Bitmask.z: bIsNewVertex
	const uint3 StripBitmasks = SrcPageBuffer.Load3(SrcPageBaseOffset + PageDiskHeader.StripBitmaskOffset + (LocalClusterIndex * (NANITE_MAX_CLUSTER_TRIANGLES / 32) + DwordIndex) * 12);

	const uint SMask = StripBitmasks.x;
	const uint LMask = StripBitmasks.y;
	const uint WMask = StripBitmasks.z;
	const uint SLMask = SMask & LMask;

	//const uint HeadRefVertexMask = ( SMask & LMask & WMask ) | ( ~SMask & WMask );
	const uint HeadRefVertexMask = (SLMask | ~SMask) & WMask;	// 1 if head of triangle is ref. S case with 3 refs or L/R case with 1 ref.

	const uint PrevBitsMask = (1u << BitIndex) - 1u;

	const uint NumPrevRefVerticesBeforeDword = DwordIndex ? BitFieldExtractU32(ClusterDiskHeader.NumPrevRefVerticesBeforeDwords, 10u, DwordIndex * 10u - 10u) : 0u;
	const uint NumPrevNewVerticesBeforeDword = DwordIndex ? BitFieldExtractU32(ClusterDiskHeader.NumPrevNewVerticesBeforeDwords, 10u, DwordIndex * 10u - 10u) : 0u;

	int CurrentDwordNumPrevRefVertices = (countbits(SLMask & PrevBitsMask) << 1) + countbits(WMask & PrevBitsMask);
	int CurrentDwordNumPrevNewVertices = (countbits(SMask & PrevBitsMask) << 1) + BitIndex - CurrentDwordNumPrevRefVertices;

	int NumPrevRefVertices = NumPrevRefVerticesBeforeDword + CurrentDwordNumPrevRefVertices;
	int NumPrevNewVertices = NumPrevNewVerticesBeforeDword + CurrentDwordNumPrevNewVertices;

	const int IsStart = BitFieldExtractI32(SMask, 1, BitIndex);	// -1: true, 0: false
	const int IsLeft = BitFieldExtractI32(LMask, 1, BitIndex);	// -1: true, 0: false
	const int IsRef = BitFieldExtractI32(WMask, 1, BitIndex);	// -1: true, 0: false

	const uint BaseVertex = NumPrevNewVertices - 1u;

	uint3 OutIndices;
	uint ReadBaseAddress = SrcPageBaseOffset + ClusterDiskHeader.IndexDataOffset;
	uint IndexData = ReadUnalignedDword(SrcPageBuffer, ReadBaseAddress, (NumPrevRefVertices + ~IsStart) * 5);	// -1 if not Start

	if (IsStart)
	{
		const int MinusNumRefVertices = (IsLeft << 1) + IsRef;
		uint NextVertex = NumPrevNewVertices;

		if (MinusNumRefVertices <= -1) { OutIndices.x = BaseVertex - (IndexData & 31u); IndexData >>= 5; }
		else { OutIndices[0] = NextVertex++; }
		if (MinusNumRefVertices <= -2) { OutIndices.y = BaseVertex - (IndexData & 31u); IndexData >>= 5; }
		else { OutIndices[1] = NextVertex++; }
		if (MinusNumRefVertices <= -3) { OutIndices.z = BaseVertex - (IndexData & 31u); }
		else { OutIndices[2] = NextVertex++; }
	}
	else
	{
		// Handle the first two vertices
		const uint PrevBitIndex = BitIndex - 1u;
		const int IsPrevStart = BitFieldExtractI32(SMask, 1, PrevBitIndex);
		const int IsPrevHeadRef = BitFieldExtractI32(HeadRefVertexMask, 1, PrevBitIndex);
		//const int NumPrevNewVerticesInTriangle = IsPrevStart ? ( 3u - ( bfe_u32( /*SLMask*/ LMask, PrevBitIndex, 1 ) << 1 ) - bfe_u32( /*SMask &*/ WMask, PrevBitIndex, 1 ) ) : /*1u - IsPrevRefVertex*/ 0u;
		const int NumPrevNewVerticesInTriangle = IsPrevStart & (3u - ((BitFieldExtractU32( /*SLMask*/ LMask, 1, PrevBitIndex) << 1) | BitFieldExtractU32( /*SMask &*/ WMask, 1, PrevBitIndex)));

		//OutIndices[ 1 ] = IsPrevRefVertex ? ( BaseVertex - ( IndexData & 31u ) + NumPrevNewVerticesInTriangle ) : BaseVertex;	// BaseVertex = ( NumPrevNewVertices - 1 );
		OutIndices.y = BaseVertex + (IsPrevHeadRef & (NumPrevNewVerticesInTriangle - (IndexData & 31u)));
		//OutIndices[ 2 ] = IsRefVertex ? ( BaseVertex - bfe_u32( IndexData, 5, 5 ) ) : NumPrevNewVertices;
		OutIndices.z = NumPrevNewVertices + (IsRef & (-1 - BitFieldExtractU32(IndexData, 5, 5)));

		// We have to search for the third vertex.
		// Left triangles search for previous Right/Start. Right triangles search for previous Left/Start.
		const uint SearchMask = SMask | (LMask ^ IsLeft);	// SMask | ( IsRight ? LMask : RMask );
		const uint FoundBitIndex = firstbithigh(SearchMask & PrevBitsMask);
		const int IsFoundCaseS = BitFieldExtractI32(SMask, 1, FoundBitIndex);	// -1: true, 0: false

		const uint FoundPrevBitsMask = (1u << FoundBitIndex) - 1u;
		int FoundCurrentDwordNumPrevRefVertices = (countbits(SLMask & FoundPrevBitsMask) << 1) + countbits(WMask & FoundPrevBitsMask);
		int FoundCurrentDwordNumPrevNewVertices = (countbits(SMask & FoundPrevBitsMask) << 1) + FoundBitIndex - FoundCurrentDwordNumPrevRefVertices;

		int FoundNumPrevNewVertices = NumPrevNewVerticesBeforeDword + FoundCurrentDwordNumPrevNewVertices;
		int FoundNumPrevRefVertices = NumPrevRefVerticesBeforeDword + FoundCurrentDwordNumPrevRefVertices;

		const uint FoundNumRefVertices = (BitFieldExtractU32(LMask, 1, FoundBitIndex) << 1) + BitFieldExtractU32(WMask, 1, FoundBitIndex);
		const uint IsBeforeFoundRefVertex = BitFieldExtractU32(HeadRefVertexMask, 1, FoundBitIndex - 1);

		// ReadOffset: Where is the vertex relative to triangle we searched for?
		const int ReadOffset = IsFoundCaseS ? IsLeft : 1;
		const uint FoundIndexData = ReadUnalignedDword(SrcPageBuffer, ReadBaseAddress, (FoundNumPrevRefVertices - ReadOffset) * 5);
		const uint FoundIndex = (FoundNumPrevNewVertices - 1u) - BitFieldExtractU32(FoundIndexData, 5, 0);

		bool bCondition = IsFoundCaseS ? ((int)FoundNumRefVertices >= 1 - IsLeft) : IsBeforeFoundRefVertex;
		int FoundNewVertex = FoundNumPrevNewVertices + (IsFoundCaseS ? (IsLeft & (FoundNumRefVertices == 0)) : -1);
		OutIndices.x = bCondition ? FoundIndex : FoundNewVertex;

		// Would it be better to code New verts instead of Ref verts?
		// HeadRefVertexMask would just be WMask?

		if (IsLeft)
		{
			OutIndices.yz = OutIndices.zy;
		}
	}

	return OutIndices;
}
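
// Transcodes one vertex's attributes (normal, tangent, color, UVs) by reading
// them from an already-installed reference cluster and requantizing color and
// UV values into the destination cluster's ranges.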
template<uint NumTexCoords>
void TranscodeVertexAttributes(FPageDiskHeader PageDiskHeader, FCluster Cluster, uint DstPageBaseOffset, uint LocalClusterIndex, uint VertexIndex,
							   FCluster RefCluster, uint RefPageBaseOffset, uint RefVertexIndex, uint SrcPageBaseOffset)
{
	const uint CompileTimeMaxAttributeBits = CalculateMaxAttributeBits(NumTexCoords);

	FBitStreamWriterState OutputStream = BitStreamWriter_Create_Aligned(DstPageBaseOffset + Cluster.AttributeOffset, VertexIndex * Cluster.BitsPerAttribute);
	FBitStreamReaderState InputStream = BitStreamReader_Create_Aligned(RefPageBaseOffset + RefCluster.AttributeOffset, RefVertexIndex * RefCluster.BitsPerAttribute, CompileTimeMaxAttributeBits);

	// Normal
	const uint PackedNormal = BitStreamReader_Read_RW(DstPageBuffer, InputStream, 2 * Cluster.NormalPrecision, 2 * NANITE_MAX_NORMAL_QUANTIZATION_BITS);
	BitStreamWriter_Writer(DstPageBuffer, OutputStream, PackedNormal, 2 * Cluster.NormalPrecision, 2 * NANITE_MAX_NORMAL_QUANTIZATION_BITS);

	// Tangent
	const int NumTangentBits = (Cluster.bHasTangents ? (1 + Cluster.TangentPrecision) : 0);
	const uint PackedTangent = BitStreamReader_Read_RW(DstPageBuffer, InputStream, NumTangentBits, 1 + NANITE_MAX_TANGENT_QUANTIZATION_BITS);
	BitStreamWriter_Writer(DstPageBuffer, OutputStream, PackedTangent, NumTangentBits, 1 + NANITE_MAX_TANGENT_QUANTIZATION_BITS);

	// Color
	{
		const uint4 SrcComponentBits = UnpackToUint4(RefCluster.ColorBits, 4);
		const uint4 SrcColorDelta = BitStreamReader_Read4_RW(DstPageBuffer, InputStream, SrcComponentBits, 8);
		if (Cluster.ColorMode == NANITE_VERTEX_COLOR_MODE_VARIABLE)
		{
			const uint SrcPackedColorDelta = SrcColorDelta.x | (SrcColorDelta.y << 8) | (SrcColorDelta.z << 16) | (SrcColorDelta.w << 24);
			const uint PackedColor = RefCluster.ColorMin + SrcPackedColorDelta;

			const uint4 DstComponentBits = UnpackToUint4(Cluster.ColorBits, 4);
			const uint DstPackedColorDelta = PackedColor - Cluster.ColorMin;

			const uint PackedDeltaColor = BitFieldExtractU32(DstPackedColorDelta, 8, 0) |
										  (BitFieldExtractU32(DstPackedColorDelta, 8, 8) << (DstComponentBits.x)) |
										  (BitFieldExtractU32(DstPackedColorDelta, 8, 16) << (DstComponentBits.x + DstComponentBits.y)) |
										  (BitFieldExtractU32(DstPackedColorDelta, 8, 24) << (DstComponentBits.x + DstComponentBits.y + DstComponentBits.z));

			BitStreamWriter_Writer(DstPageBuffer, OutputStream, PackedDeltaColor, DstComponentBits.x + DstComponentBits.y + DstComponentBits.z + DstComponentBits.w, 4 * NANITE_MAX_COLOR_QUANTIZATION_BITS);
		}
	}

	const uint Stride = NumTexCoords * SIZEOF_PACKED_UV_HEADER + (Cluster.bSkinning ? SIZEOF_PACKED_BONE_INFLUENCE_HEADER : 0u);	// Assumes bSkinning is the same for all clusters on the page.

	// UVs
	UNROLL_N(NANITE_MAX_UVS)
	for (uint TexCoordIndex = 0; TexCoordIndex < NumTexCoords; TexCoordIndex++)
	{
		const FUVHeader SrcUVHeader = GetUVHeader(DstPageBuffer, RefPageBaseOffset + RefCluster.DecodeInfoOffset, TexCoordIndex);
		const FUVHeader DstUVHeader = GetUVHeader(SrcPageBuffer, SrcPageBaseOffset + PageDiskHeader.DecodeInfoOffset + LocalClusterIndex * Stride, TexCoordIndex);

		const uint2 SrcLocalUV = BitStreamReader_Read2_RW(DstPageBuffer, InputStream, SrcUVHeader.NumBits, NANITE_MAX_TEXCOORD_COMPONENT_BITS);
		const uint2 DstLocalUV = SrcLocalUV + SrcUVHeader.Min - DstUVHeader.Min;

		BitStreamWriter_Writer(DstPageBuffer, OutputStream, DstLocalUV.x, DstUVHeader.NumBits.x, NANITE_MAX_TEXCOORD_COMPONENT_BITS);
		BitStreamWriter_Writer(DstPageBuffer, OutputStream, DstLocalUV.y, DstUVHeader.NumBits.y, NANITE_MAX_TEXCOORD_COMPONENT_BITS);
	}

	BitStreamWriter_Flush(DstPageBuffer, OutputStream);
}
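
// Groupshared lookup tables mapping the n-th ref (resp. non-ref) vertex of the
// cluster to its actual vertex index. BuildRefTable fills them by scanning the
// vertex ref bitmask with wave prefix counts.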
groupshared uint GroupRefToVertex[NANITE_MAX_CLUSTER_VERTICES];
groupshared uint GroupNonRefToVertex[NANITE_MAX_CLUSTER_VERTICES];
template<bool bRefToVertex, bool bNonRefToVertex>
void BuildRefTable(uint AlignedBitmaskOffset, uint NumVerts, uint WaveNumActiveLanes, uint GroupIndex)
{
	NumVerts = min(NumVerts, NANITE_MAX_CLUSTER_VERTICES);

	uint NumPrevPassRefs = 0u;
	for (uint VertexIndex = GroupIndex; VertexIndex < NumVerts; VertexIndex += WaveNumActiveLanes)
	{
		const uint RefMask = SrcPageBuffer.Load(AlignedBitmaskOffset + (VertexIndex >> 5) * 4);
		const bool bRef = BitFieldExtractU32(RefMask, 1, VertexIndex & 31u) != 0;

		const uint NumMaskedRefs = WavePrefixCountBits(bRef);
		const uint RefIndex = NumPrevPassRefs + NumMaskedRefs;
		const uint NonRefIndex = VertexIndex - RefIndex;

		if (bRefToVertex && bRef)
		{
			GroupRefToVertex[RefIndex] = VertexIndex;
		}
		if (bNonRefToVertex && !bRef)
		{
			GroupNonRefToVertex[NonRefIndex] = VertexIndex;
		}

		NumPrevPassRefs += WaveActiveCountBits(bRef);
	}
}
FPageHeader PageHeaderFromPageDiskHeader(FPageDiskHeader PageDiskHeader)
{
	FPageHeader PageHeader;
	PageHeader.NumClusters = PageDiskHeader.NumClusters;
	// Other members are not needed for transcode (yet)
	return PageHeader;
}
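
// Independent pass: transcodes everything that depends only on this page: the
// raw FPackedCluster/material/DecodeInfo block, the index buffer, and all
// vertices that are not refs into other clusters.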
void TranscodePageIndependent(uint ClusterInstallIndex, uint WaveNumActiveLanes, uint GroupIndex)
{
	const FClusterInstallInfo ClusterInstallInfo = GetClusterInstallInfo(ClusterInstallIndex);
	const uint SrcPageBaseOffset = ClusterInstallInfo.SrcPageOffset;
	const uint DstPageBaseOffset = ClusterInstallInfo.DstPageOffset;

	const FPageDiskHeader PageDiskHeader = GetPageDiskHeader(SrcPageBaseOffset);
	const uint SrcPackedClusterOffset = SrcPageBaseOffset + SIZEOF_PAGE_DISK_HEADER + PageDiskHeader.NumClusters * SIZEOF_CLUSTER_DISK_HEADER;
	const uint DstPackedClusterOffset = DstPageBaseOffset;

	// Raw copy: FPackedClusters, Material Dwords and DecodeInfo.
	if (ClusterInstallInfo.LocalClusterIndex == 0)
	{
		const uint NumRawFloat4s = PageDiskHeader.NumRawFloat4s;
		for (uint i = GroupIndex; i < NumRawFloat4s; i += WaveNumActiveLanes)
		{
			uint4 Data = SrcPageBuffer.Load4(SrcPackedClusterOffset + i * 16);
			DstPageBuffer.Store4(DstPackedClusterOffset + i * 16, Data);
		}
	}

	const uint LocalClusterIndex = ClusterInstallInfo.LocalClusterIndex;
	const FClusterDiskHeader ClusterDiskHeader = GetClusterDiskHeader(SrcPageBaseOffset, LocalClusterIndex);
	const FPageHeader PageHeader = PageHeaderFromPageDiskHeader(PageDiskHeader);
	const FCluster Cluster = GetCluster(SrcPageBuffer, PageHeader, SrcPackedClusterOffset, LocalClusterIndex);

	const uint BitsPerTriangle = Cluster.BitsPerIndex + 2 * 5;

	const uint AlignedBitmaskOffset = SrcPageBaseOffset + PageDiskHeader.VertexRefBitmaskOffset + LocalClusterIndex * (NANITE_MAX_CLUSTER_VERTICES / 8);
	BuildRefTable<false, true>(AlignedBitmaskOffset, Cluster.NumVerts, WaveNumActiveLanes, GroupIndex);
	GroupMemoryBarrierWithGroupSync();

	// Decode indices
	for (uint TriangleIndex = GroupIndex; TriangleIndex < Cluster.NumTris; TriangleIndex += WaveNumActiveLanes)
	{
#if NANITE_USE_STRIP_INDICES
		uint3 Indices = UnpackStripIndices(SrcPageBaseOffset, PageDiskHeader, ClusterDiskHeader, LocalClusterIndex, TriangleIndex);
#else
		FBitStreamReaderState InputStream = BitStreamReader_Create_Aligned(SrcPageBaseOffset + ClusterDiskHeader.IndexDataOffset, TriangleIndex * 24, 24);
		uint Indices24 = BitStreamReader_Read_RO(SrcPageBuffer, InputStream, 24, 24);
		uint3 Indices = uint3(Indices24 & 0xFF, (Indices24 >> 8) & 0xFF, (Indices24 >> 16) & 0xFF);
#endif
		// Rotate triangle so first vertex has the lowest index
		if (Indices.y < min(Indices.x, Indices.z)) Indices = Indices.yzx;
		else if (Indices.z < min(Indices.x, Indices.y)) Indices = Indices.zxy;

		// Store triangle as one base index and two 5-bit offsets. Cluster constraints guarantee that the offsets are never larger than 5 bits.
		uint BaseIndex = Indices.x;
		uint Delta0 = Indices.y - BaseIndex;
		uint Delta1 = Indices.z - BaseIndex;

		uint PackedIndices = BaseIndex | (Delta0 << Cluster.BitsPerIndex) | (Delta1 << (Cluster.BitsPerIndex + 5));
		PutBits(DstPageBuffer, DstPageBaseOffset + Cluster.IndexOffset, TriangleIndex * BitsPerTriangle, PackedIndices, BitsPerTriangle);
	}

	// Non-Ref vertices
	const uint NumNonRefVertices = Cluster.NumVerts - ClusterDiskHeader.NumVertexRefs;
#if NANITE_USE_UNCOMPRESSED_VERTEX_DATA
	for (uint VertexIndex = 0; VertexIndex < NumNonRefVertices; VertexIndex++)
	{
		// Position
		uint3 PositionData = SrcPageBuffer.Load3(SrcPageBaseOffset + ClusterDiskHeader.LowBytesDataOffset + VertexIndex * 12);
		DstPageBuffer.Store3(DstPageBaseOffset + Cluster.PositionOffset + VertexIndex * 12, PositionData);

		// Attributes
		CopyDwords(DstPageBuffer, DstPageBaseOffset + Cluster.AttributeOffset + VertexIndex * Cluster.BitsPerAttribute / 8,
				   SrcPageBuffer, SrcPageBaseOffset + ClusterDiskHeader.MidBytesDataOffset + VertexIndex * Cluster.BitsPerAttribute / 8,
				   Cluster.BitsPerAttribute / 32);
	}
#else
	const uint PositionBitsPerVertex = Cluster.PosBits.x + Cluster.PosBits.y + Cluster.PosBits.z;
	const uint PositionBytesPerValue = (max3(Cluster.PosBits.x, Cluster.PosBits.y, Cluster.PosBits.z) + 7) / 8;

	// Lay out the low/mid/high byte-plane offsets for each attribute stream in turn
	uint3 NextLowMidHighOffsets = SrcPageBaseOffset + uint3(ClusterDiskHeader.LowBytesDataOffset, ClusterDiskHeader.MidBytesDataOffset, ClusterDiskHeader.HighBytesDataOffset);

	const uint3 PositionLowMidHighOffsets = NextLowMidHighOffsets;
	NextLowMidHighOffsets += LowMidHighIncrement(PositionBytesPerValue, 3 * NumNonRefVertices);

	const uint NormalBytesPerValue = (Cluster.NormalPrecision + 7) / 8;
	const uint TangentBytesPerValue = (Cluster.TangentPrecision + 1 + 7) / 8;
	const uint3 NormalLowMidHighOffsets = NextLowMidHighOffsets;
	NextLowMidHighOffsets += LowMidHighIncrement(NormalBytesPerValue, 2 * NumNonRefVertices);

	const uint3 TangentLowMidHighOffsets = NextLowMidHighOffsets;
	if (Cluster.bHasTangents)
	{
		NextLowMidHighOffsets += LowMidHighIncrement(TangentBytesPerValue, NumNonRefVertices);
	}

	const uint3 VertexColorLowMidHighOffsets = NextLowMidHighOffsets;
	if (Cluster.ColorMode == NANITE_VERTEX_COLOR_MODE_VARIABLE)
	{
		NextLowMidHighOffsets += LowMidHighIncrement(1, 4 * NumNonRefVertices);
	}

	const uint3 TexCoordLowMidHighBaseOffsets = NextLowMidHighOffsets;

	int4 PrevPassPosition = int4(int3(int(1) << Cluster.PosBits) >> 1, 0);
	int4 PrevPassNormal = 0;
	int4 PrevPassTangent = 0;
	int4 PrevPassVertexColor = 0;
	int4 PrevPassUVs[NANITE_MAX_UVS] = { int4(0, 0, 0, 0), int4(0, 0, 0, 0), int4(0, 0, 0, 0), int4(0, 0, 0, 0) };

	for (uint NonRefVertexIndex = GroupIndex; NonRefVertexIndex < NumNonRefVertices; NonRefVertexIndex += WaveNumActiveLanes)
	{
		uint VertexIndex = NonRefVertexIndex;
		if (Cluster.NumTris > 0)
		{
			VertexIndex = GroupNonRefToVertex[NonRefVertexIndex];
		}

		// Position stream
		{
			FBitStreamWriterState PositionStream = BitStreamWriter_Create_Aligned(DstPageBaseOffset + Cluster.PositionOffset, VertexIndex * PositionBitsPerVertex);
			const int3 DstPosition = UnpackZigZagDeltas<3, NANITE_MAX_POSITION_QUANTIZATION_BYTES>(SrcPageBuffer, PositionLowMidHighOffsets, PositionBytesPerValue, NonRefVertexIndex, PrevPassPosition).xyz & ((int(1) << Cluster.PosBits) - 1);
			BitStreamWriter_Writer(DstPageBuffer, PositionStream, DstPosition.x, Cluster.PosBits.x, NANITE_MAX_POSITION_QUANTIZATION_BITS);
			BitStreamWriter_Writer(DstPageBuffer, PositionStream, DstPosition.y, Cluster.PosBits.y, NANITE_MAX_POSITION_QUANTIZATION_BITS);
			BitStreamWriter_Writer(DstPageBuffer, PositionStream, DstPosition.z, Cluster.PosBits.z, NANITE_MAX_POSITION_QUANTIZATION_BITS);
			BitStreamWriter_Flush(DstPageBuffer, PositionStream);
		}

		// Attribute stream
		FBitStreamWriterState AttribStream = BitStreamWriter_Create_Aligned(DstPageBaseOffset + Cluster.AttributeOffset, VertexIndex * Cluster.BitsPerAttribute);

		const int2 DstNormal = UnpackZigZagDeltas<2, NANITE_MAX_NORMAL_QUANTIZATION_BYTES>(SrcPageBuffer, NormalLowMidHighOffsets, NormalBytesPerValue, NonRefVertexIndex, PrevPassNormal).xy & ((int(1) << Cluster.NormalPrecision) - 1);
		const uint PackedNormal = (DstNormal.y << Cluster.NormalPrecision) | DstNormal.x;
		BitStreamWriter_Writer(DstPageBuffer, AttribStream, PackedNormal, 2 * Cluster.NormalPrecision, 2 * NANITE_MAX_NORMAL_QUANTIZATION_BITS);

		if (Cluster.bHasTangents)
		{
			const uint NumTangentBits = Cluster.TangentPrecision + 1;
			const int4 PackedTangent = UnpackZigZagDeltas<1, NANITE_MAX_TANGENT_AND_SIGN_QUANTIZATION_BYTES>(SrcPageBuffer, TangentLowMidHighOffsets, TangentBytesPerValue, NonRefVertexIndex, PrevPassTangent) & ((1u << NumTangentBits) - 1u);
			BitStreamWriter_Writer(DstPageBuffer, AttribStream, PackedTangent.x, NumTangentBits, NANITE_MAX_TANGENT_QUANTIZATION_BITS + 1);
		}

		if (Cluster.ColorMode == NANITE_VERTEX_COLOR_MODE_VARIABLE)
		{
			const uint4 DstComponentBits = UnpackToUint4(Cluster.ColorBits, 4);
			const int4 VertexColor = UnpackZigZagDeltas<4, NANITE_MAX_COLOR_QUANTIZATION_BYTES>(SrcPageBuffer, VertexColorLowMidHighOffsets, 1, NonRefVertexIndex, PrevPassVertexColor) & ((1u << DstComponentBits) - 1u);
			const uint PackedDeltaColor = VertexColor.r |
										  (VertexColor.g << (DstComponentBits.x)) |
										  (VertexColor.b << (DstComponentBits.x + DstComponentBits.y)) |
										  (VertexColor.a << (DstComponentBits.x + DstComponentBits.y + DstComponentBits.z));
			BitStreamWriter_Writer(DstPageBuffer, AttribStream, PackedDeltaColor, DstComponentBits.x + DstComponentBits.y + DstComponentBits.z + DstComponentBits.w, 4 * NANITE_MAX_COLOR_QUANTIZATION_BITS);
		}

		const uint Stride = Cluster.NumUVs * SIZEOF_PACKED_UV_HEADER + (Cluster.bSkinning ? SIZEOF_PACKED_BONE_INFLUENCE_HEADER : 0u);	// Assumes bSkinning is the same for all clusters on the page.
		uint3 TexCoordLowMidHighOffsets = TexCoordLowMidHighBaseOffsets;

		UNROLL_N(NANITE_MAX_UVS)
		for (uint TexCoordIndex = 0; TexCoordIndex < Cluster.NumUVs; TexCoordIndex++)
		{
			const FUVHeader UVHeader = GetUVHeader(SrcPageBuffer, SrcPageBaseOffset + PageDiskHeader.DecodeInfoOffset + LocalClusterIndex * Stride, TexCoordIndex);
			const uint TexCoordBytesPerValue = (max(UVHeader.NumBits.x, UVHeader.NumBits.y) + 7) / 8;

			const int2 UV = UnpackZigZagDeltas<2, NANITE_MAX_TEXCOORD_COMPONENT_BYTES>(SrcPageBuffer, TexCoordLowMidHighOffsets, TexCoordBytesPerValue, NonRefVertexIndex, PrevPassUVs[TexCoordIndex]).xy & ((int(1) << int2(UVHeader.NumBits)) - 1);
			BitStreamWriter_Writer(DstPageBuffer, AttribStream, UV.x, UVHeader.NumBits.x, NANITE_MAX_TEXCOORD_COMPONENT_BITS);
			BitStreamWriter_Writer(DstPageBuffer, AttribStream, UV.y, UVHeader.NumBits.y, NANITE_MAX_TEXCOORD_COMPONENT_BITS);

			TexCoordLowMidHighOffsets += LowMidHighIncrement(TexCoordBytesPerValue, 2 * NumNonRefVertices);
		}

		BitStreamWriter_Flush(DstPageBuffer, AttribStream);
	}
#endif
}
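
// Parent-dependent pass: transcodes ref vertices. Each ref resolves to a
// (page, cluster, vertex) triple in this page or an already-installed parent
// page, from which position and attributes are requantized into this cluster.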
void TranscodePageParentDependent(uint ClusterInstallIndex, uint WaveNumActiveLanes, uint GroupIndex)
{
	const FClusterInstallInfo ClusterInstallInfo = GetClusterInstallInfo(ClusterInstallIndex);
	const uint SrcPageBaseOffset = ClusterInstallInfo.SrcPageOffset;
	const uint DstPageBaseOffset = ClusterInstallInfo.DstPageOffset;

	const FPageDiskHeader PageDiskHeader = GetPageDiskHeader(SrcPageBaseOffset);
	const FPageHeader PageHeader = PageHeaderFromPageDiskHeader(PageDiskHeader);
	const uint SrcPackedClusterOffset = SrcPageBaseOffset + SIZEOF_PAGE_DISK_HEADER + PageDiskHeader.NumClusters * SIZEOF_CLUSTER_DISK_HEADER;

	const uint LocalClusterIndex = ClusterInstallInfo.LocalClusterIndex;
	const FClusterDiskHeader ClusterDiskHeader = GetClusterDiskHeader(SrcPageBaseOffset, LocalClusterIndex);
	const FCluster Cluster = GetCluster(SrcPageBuffer, PageHeader, SrcPackedClusterOffset, LocalClusterIndex);

	const uint AlignedBitmaskOffset = SrcPageBaseOffset + PageDiskHeader.VertexRefBitmaskOffset + LocalClusterIndex * (NANITE_MAX_CLUSTER_VERTICES / 8);
	BuildRefTable<true, false>(AlignedBitmaskOffset, Cluster.NumVerts, WaveNumActiveLanes, GroupIndex);
	GroupMemoryBarrierWithGroupSync();

	// Ref vertices
	int4 PrevRefVertexIndex = 0;
	for (uint RefIndex = GroupIndex; RefIndex < ClusterDiskHeader.NumVertexRefs; RefIndex += WaveNumActiveLanes)
	{
		const uint VertexIndex = GroupRefToVertex[RefIndex];
		const uint PageClusterIndex = ReadBytes<1>(SrcPageBuffer, SrcPageBaseOffset + ClusterDiskHeader.VertexRefDataOffset, RefIndex).x;
		const uint PageClusterData = SrcPageBuffer.Load(SrcPageBaseOffset + ClusterDiskHeader.PageClusterMapOffset + PageClusterIndex * 4);

		const uint RefPageIndex = PageClusterData >> NANITE_MAX_CLUSTERS_PER_PAGE_BITS;
		const uint RefLocalClusterIndex = BitFieldExtractU32(PageClusterData, NANITE_MAX_CLUSTERS_PER_PAGE_BITS, 0);
		const uint RefVertexIndex = UnpackZigZagDeltas<1, 1>(SrcPageBuffer, uint3(SrcPageBaseOffset + ClusterDiskHeader.VertexRefDataOffset + PageDiskHeader.NumVertexRefs, 0, 0), 1, RefIndex, PrevRefVertexIndex).x & 0xFF;

		uint RefPageBaseOffset = 0;
		if (RefPageIndex != 0)
		{
			// Parent ref
			RefPageBaseOffset = GPUPageIndexToGPUOffset(PageDependenciesBuffer[ClusterInstallInfo.PageDependenciesOffset + (RefPageIndex - 1)]);
		}
		else
		{
			// Same page ref
			RefPageBaseOffset = DstPageBaseOffset;
		}

		const FPageHeader RefPageHeader = GetPageHeader(DstPageBuffer, RefPageBaseOffset);
		const FCluster RefCluster = GetCluster(DstPageBuffer, RefPageHeader, RefPageBaseOffset, RefLocalClusterIndex);

		// Transcode position
		{
			const uint RefPositionBitsPerVertex = RefCluster.PosBits.x + RefCluster.PosBits.y + RefCluster.PosBits.z;
			FBitStreamReaderState InputStream = BitStreamReader_Create_Aligned(RefPageBaseOffset + RefCluster.PositionOffset, RefVertexIndex * RefPositionBitsPerVertex, 3 * NANITE_MAX_POSITION_QUANTIZATION_BITS);
			const int3 RefPosition = BitStreamReader_Read3_RW(DstPageBuffer, InputStream, RefCluster.PosBits, NANITE_MAX_POSITION_QUANTIZATION_BITS);

			const int3 DstPosition = RefPosition + RefCluster.PosStart - Cluster.PosStart;

			const uint PositionBitsPerVertex = Cluster.PosBits.x + Cluster.PosBits.y + Cluster.PosBits.z;
			FBitStreamWriterState OutputStream = BitStreamWriter_Create_Aligned(DstPageBaseOffset + Cluster.PositionOffset, VertexIndex * PositionBitsPerVertex);
			BitStreamWriter_Writer(DstPageBuffer, OutputStream, DstPosition.x, Cluster.PosBits.x, NANITE_MAX_POSITION_QUANTIZATION_BITS);
			BitStreamWriter_Writer(DstPageBuffer, OutputStream, DstPosition.y, Cluster.PosBits.y, NANITE_MAX_POSITION_QUANTIZATION_BITS);
			BitStreamWriter_Writer(DstPageBuffer, OutputStream, DstPosition.z, Cluster.PosBits.z, NANITE_MAX_POSITION_QUANTIZATION_BITS);
			BitStreamWriter_Flush(DstPageBuffer, OutputStream);
		}

		// Specialize vertex transcoding codegen for each of the possible values for NumTexCoords
		const uint NumTexCoords = Cluster.NumUVs;
		if (NumTexCoords == 0)
		{
			TranscodeVertexAttributes<0>(PageDiskHeader, Cluster, DstPageBaseOffset, LocalClusterIndex, VertexIndex,
										 RefCluster, RefPageBaseOffset, RefVertexIndex, SrcPageBaseOffset);
		}
		else if (NumTexCoords == 1)
		{
			TranscodeVertexAttributes<1>(PageDiskHeader, Cluster, DstPageBaseOffset, LocalClusterIndex, VertexIndex,
										 RefCluster, RefPageBaseOffset, RefVertexIndex, SrcPageBaseOffset);
		}
		else if (NumTexCoords == 2)
		{
			TranscodeVertexAttributes<2>(PageDiskHeader, Cluster, DstPageBaseOffset, LocalClusterIndex, VertexIndex,
										 RefCluster, RefPageBaseOffset, RefVertexIndex, SrcPageBaseOffset);
		}
		else if (NumTexCoords == 3)
		{
			TranscodeVertexAttributes<3>(PageDiskHeader, Cluster, DstPageBaseOffset, LocalClusterIndex, VertexIndex,
										 RefCluster, RefPageBaseOffset, RefVertexIndex, SrcPageBaseOffset);
		}
		else if (NumTexCoords == 4)
		{
			TranscodeVertexAttributes<4>(PageDiskHeader, Cluster, DstPageBaseOffset, LocalClusterIndex, VertexIndex,
										 RefCluster, RefPageBaseOffset, RefVertexIndex, SrcPageBaseOffset);
		}
	}
}
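
// Entry point: one thread group per cluster install. The transcode pass is
// selected at compile time via NANITE_TRANSCODE_PASS.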
#if COMPILER_SUPPORTS_WAVE_SIZE
WAVESIZE(GROUP_SIZE)
#endif
[numthreads(GROUP_SIZE, 1, 1)]
void TranscodePageToGPU(uint3 GroupID : SV_GroupID, uint GroupIndex : SV_GroupIndex)
{
	const uint FlatGroupID = GetUnWrappedDispatchGroupId(GroupID);
	if (FlatGroupID >= NumClusters)
	{
		return;
	}

	const uint ClusterInstallIndex = StartClusterIndex + FlatGroupID;

	if (GroupIndex >= WaveGetLaneCount())
	{
		// Workaround for any platform that might not support setting the wave size explicitly
		return;
	}

	const uint WaveNumActiveLanes = min(WaveGetLaneCount(), GROUP_SIZE);

#if NANITE_TRANSCODE_PASS == NANITE_TRANSCODE_PASS_INDEPENDENT
	TranscodePageIndependent(ClusterInstallIndex, WaveNumActiveLanes, GroupIndex);
#elif NANITE_TRANSCODE_PASS == NANITE_TRANSCODE_PASS_PARENT_DEPENDENT
	TranscodePageParentDependent(ClusterInstallIndex, WaveNumActiveLanes, GroupIndex);
#endif
}