// Copyright Epic Games, Inc. All Rights Reserved.

#include "../Common.ush"
#include "../SceneData.ush"
#include "../ComputeShaderUtils.ush"
#include "NaniteDataDecode.ush"
#include "NaniteAttributeDecode.ush"

struct FClusterInstallInfo
{
	uint LocalPageIndex;
	uint LocalClusterIndex;
	uint SrcPageOffset;
	uint DstPageOffset;
	uint PageDependenciesOffset;
};

struct FPackedClusterInstallInfo
{
	uint LocalPageIndex_LocalClusterIndex;
	uint SrcPageOffset;
	uint DstPageOffset;
	uint PageDependenciesOffset;
};

struct FPageDiskHeader
{
	uint NumClusters;
	uint NumRawFloat4s;
	uint NumVertexRefs;
	uint DecodeInfoOffset;
	uint StripBitmaskOffset;
	uint VertexRefBitmaskOffset;
};
#define SIZEOF_PAGE_DISK_HEADER (6*4)

struct FClusterDiskHeader
{
	uint IndexDataOffset;
	uint PageClusterMapOffset;
	uint VertexRefDataOffset;
	uint LowBytesDataOffset;
	uint MidBytesDataOffset;
	uint HighBytesDataOffset;
	uint NumVertexRefs;
	uint NumPrevRefVerticesBeforeDwords;
	uint NumPrevNewVerticesBeforeDwords;
};
#define SIZEOF_CLUSTER_DISK_HEADER (9*4)

uint StartClusterIndex;
uint NumClusters;
uint ZeroUniform;

StructuredBuffer<FPackedClusterInstallInfo>	ClusterInstallInfoBuffer;
StructuredBuffer<uint>						PageDependenciesBuffer;
ByteAddressBuffer							SrcPageBuffer;
RWByteAddressBuffer							DstPageBuffer;

FPageDiskHeader GetPageDiskHeader(uint PageBaseOffset)
{
	const uint4 Data0 = SrcPageBuffer.Load4(PageBaseOffset + 0);
	const uint2 Data1 = SrcPageBuffer.Load2(PageBaseOffset + 16);

	FPageDiskHeader DiskHeader;
	DiskHeader.NumClusters				= Data0.x;
	DiskHeader.NumRawFloat4s			= Data0.y;
	DiskHeader.NumVertexRefs			= Data0.z;
	DiskHeader.DecodeInfoOffset			= Data0.w;
	DiskHeader.StripBitmaskOffset		= Data1.x;
	DiskHeader.VertexRefBitmaskOffset	= Data1.y;
	return DiskHeader;
}

FClusterDiskHeader GetClusterDiskHeader(uint PageBaseOffset, uint ClusterIndex)
{
	const uint ByteOffset = PageBaseOffset + SIZEOF_PAGE_DISK_HEADER + ClusterIndex * SIZEOF_CLUSTER_DISK_HEADER;
	const uint4 Data0 = SrcPageBuffer.Load4(ByteOffset);
	const uint4 Data1 = SrcPageBuffer.Load4(ByteOffset + 16);
	const uint  Data2 = SrcPageBuffer.Load(ByteOffset + 32);

	FClusterDiskHeader Header;
	Header.IndexDataOffset					= Data0.x;
	Header.PageClusterMapOffset				= Data0.y;
	Header.VertexRefDataOffset				= Data0.z;
	Header.LowBytesDataOffset				= Data0.w;
	Header.MidBytesDataOffset				= Data1.x;
	Header.HighBytesDataOffset				= Data1.y;
	Header.NumVertexRefs					= Data1.z;
	Header.NumPrevRefVerticesBeforeDwords	= Data1.w;
	Header.NumPrevNewVerticesBeforeDwords	= Data2;
	return Header;
}

FClusterInstallInfo GetClusterInstallInfo(uint Index)
{
	const FPackedClusterInstallInfo PackedData = ClusterInstallInfoBuffer[Index];

	FClusterInstallInfo Info;
	Info.LocalPageIndex		= PackedData.LocalPageIndex_LocalClusterIndex >> NANITE_MAX_CLUSTERS_PER_PAGE_BITS;
	Info.LocalClusterIndex	= BitFieldExtractU32(PackedData.LocalPageIndex_LocalClusterIndex, NANITE_MAX_CLUSTERS_PER_PAGE_BITS, 0);
	Info.LocalClusterIndex += ZeroUniform;	// Temporary workaround for compiler bug: Force promotion from 24bit to 32bit to work around console compiler bug
	Info.SrcPageOffset		= PackedData.SrcPageOffset;
	Info.DstPageOffset		= PackedData.DstPageOffset;
	Info.PageDependenciesOffset = PackedData.PageDependenciesOffset;
	return Info;
}

uint ReadUnalignedDword(ByteAddressBuffer InputBuffer, uint BaseAddressInBytes, int BitOffset)
{
	const uint ByteAddress = BaseAddressInBytes + (BitOffset >> 3);
	const uint AlignedByteAddress = ByteAddress & ~3u;
	BitOffset = ((ByteAddress & 3u) << 3) | (BitOffset & 7u);

	const uint2 Data = InputBuffer.Load2(AlignedByteAddress);
	return BitAlignU32(Data.y, Data.x, BitOffset);
}
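
// Vertex deltas are stored as separate low/mid/high byte planes so a cluster
// only pays for the bytes its value range actually needs. ReadBytes gathers
// NumComponents consecutive bytes from one plane; UnpackZigZagDeltas
// recombines the planes, zigzag-decodes them and integrates the deltas with a
// wave prefix sum. ZigZag maps signed to unsigned values as 0,-1,1,-2,2 ->
// 0,1,2,3,4, e.g. DecodeZigZag(3) = (3 >> 1) ^ -(3 & 1) = 1 ^ ~0 = -2.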
template<uint NumComponents>
uint4 ReadBytes(ByteAddressBuffer InputBuffer, uint BaseAddress, uint Index)
{
	const uint Address = BaseAddress + Index * NumComponents;

	uint4 Result = 0;
	if (NumComponents == 1)
	{
		const uint Data = InputBuffer.Load(Address & ~3u);
		Result.x = ByteAlignU32(0, Data, Address) & 0xFF;
	}
	else
	{
		const uint2 Data = InputBuffer.Load2(Address & ~3u);
		const uint AlignedData = ByteAlignU32(Data.y, Data.x, Address);
		Result.x = AlignedData & 0xFFu;
		Result.y = NumComponents >= 2 ? BitFieldExtractU32(AlignedData, 8, 8) : 0u;
		Result.z = NumComponents >= 3 ? BitFieldExtractU32(AlignedData, 8, 16) : 0u;
		Result.w = NumComponents >= 4 ? (AlignedData >> 24) : 0u;
	}
	return Result;
}

int DecodeZigZag(uint Data)
{
	//return int(Data >> 1) ^ -int(Data & 1);
	return int(Data >> 1) ^ BitFieldExtractI32(Data, 1, 0);
}

uint3 LowMidHighIncrement(uint NumBytesPerValue, uint Num)
{
	return uint3(	NumBytesPerValue >= 1 ? Num : 0u,
					NumBytesPerValue >= 2 ? Num : 0u,
					NumBytesPerValue >= 3 ? Num : 0u);
}

template<uint NumComponents, uint MaxBytesPerDelta>
int4 UnpackZigZagDeltas(ByteAddressBuffer InputBuffer, uint3 LowMidHighOffsets, uint BytesPerDelta, uint Index, inout int4 PrevLastValue)
{
	uint4 PackedValues = 0;
	if (MaxBytesPerDelta >= 3 && BytesPerDelta >= 3)
	{
		PackedValues |= ReadBytes<NumComponents>(InputBuffer, LowMidHighOffsets.z, Index) << 16;
	}
	if (MaxBytesPerDelta >= 2 && BytesPerDelta >= 2)
	{
		PackedValues |= ReadBytes<NumComponents>(InputBuffer, LowMidHighOffsets.y, Index) << 8;
	}
	if (MaxBytesPerDelta >= 1 && BytesPerDelta >= 1)
	{
		PackedValues |= ReadBytes<NumComponents>(InputBuffer, LowMidHighOffsets.x, Index);
	}

	int4 Value = 0;
	Value.x = WaveInclusivePrefixSum(DecodeZigZag(PackedValues.x));
	if (NumComponents >= 2) Value.y = WaveInclusivePrefixSum(DecodeZigZag(PackedValues.y));
	if (NumComponents >= 3) Value.z = WaveInclusivePrefixSum(DecodeZigZag(PackedValues.z));
	if (NumComponents >= 4) Value.w = WaveInclusivePrefixSum(DecodeZigZag(PackedValues.w));

	Value += PrevLastValue;
	PrevLastValue = WaveReadLaneLast(Value);
	return Value;
}

// Debug only. Performance doesn't matter.
void CopyDwords(RWByteAddressBuffer DstBuffer, uint DstAddress, ByteAddressBuffer SrcBuffer, uint SrcAddress, uint NumDwords)
{
	for (uint i = 0; i < NumDwords; i++)
	{
		DstBuffer.Store(DstAddress + i * 4, SrcBuffer.Load(SrcAddress + i * 4));
	}
}
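
// Strip index decoding: for every dword of 32 triangles, three bitmasks
// record whether a triangle starts a new strip (SMask), whether it attaches
// to the left edge (LMask) and whether its head vertex is a reference
// (WMask). Running ref/new-vertex counts for the preceding dwords are packed
// as 10-bit fields in the cluster disk header, so every lane can decode its
// triangle without a serial walk over the whole strip.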
uint3 UnpackStripIndices(uint SrcPageBaseOffset, FPageDiskHeader PageDiskHeader, FClusterDiskHeader ClusterDiskHeader, uint LocalClusterIndex, uint TriIndex)
{
	const uint DwordIndex = TriIndex >> 5;
	const uint BitIndex = TriIndex & 31u;

	// Bitmask.x: bIsStart, Bitmask.y: bIsLeft, Bitmask.z: bIsNewVertex
	const uint3 StripBitmasks = SrcPageBuffer.Load3(SrcPageBaseOffset + PageDiskHeader.StripBitmaskOffset + (LocalClusterIndex * (NANITE_MAX_CLUSTER_TRIANGLES / 32) + DwordIndex) * 12);

	const uint SMask = StripBitmasks.x;
	const uint LMask = StripBitmasks.y;
	const uint WMask = StripBitmasks.z;
	const uint SLMask = SMask & LMask;

	//const uint HeadRefVertexMask = ( SMask & LMask & WMask ) | ( ~SMask & WMask );
	const uint HeadRefVertexMask = (SLMask | ~SMask) & WMask;	// 1 if head of triangle is ref. S case with 3 refs or L/R case with 1 ref.

	const uint PrevBitsMask = (1u << BitIndex) - 1u;

	const uint NumPrevRefVerticesBeforeDword = DwordIndex ? BitFieldExtractU32(ClusterDiskHeader.NumPrevRefVerticesBeforeDwords, 10u, DwordIndex * 10u - 10u) : 0u;
	const uint NumPrevNewVerticesBeforeDword = DwordIndex ? BitFieldExtractU32(ClusterDiskHeader.NumPrevNewVerticesBeforeDwords, 10u, DwordIndex * 10u - 10u) : 0u;

	int CurrentDwordNumPrevRefVertices = (countbits(SLMask & PrevBitsMask) << 1) + countbits(WMask & PrevBitsMask);
	int CurrentDwordNumPrevNewVertices = (countbits(SMask & PrevBitsMask) << 1) + BitIndex - CurrentDwordNumPrevRefVertices;

	int NumPrevRefVertices = NumPrevRefVerticesBeforeDword + CurrentDwordNumPrevRefVertices;
	int NumPrevNewVertices = NumPrevNewVerticesBeforeDword + CurrentDwordNumPrevNewVertices;

	const int IsStart = BitFieldExtractI32(SMask, 1, BitIndex);	// -1: true, 0: false
	const int IsLeft  = BitFieldExtractI32(LMask, 1, BitIndex);	// -1: true, 0: false
	const int IsRef   = BitFieldExtractI32(WMask, 1, BitIndex);	// -1: true, 0: false

	const uint BaseVertex = NumPrevNewVertices - 1u;

	uint3 OutIndices;
	uint ReadBaseAddress = SrcPageBaseOffset + ClusterDiskHeader.IndexDataOffset;
	uint IndexData = ReadUnalignedDword(SrcPageBuffer, ReadBaseAddress, (NumPrevRefVertices + ~IsStart) * 5);	// -1 if not Start

	if (IsStart)
	{
		const int MinusNumRefVertices = (IsLeft << 1) + IsRef;
		uint NextVertex = NumPrevNewVertices;

		if (MinusNumRefVertices <= -1) { OutIndices.x = BaseVertex - (IndexData & 31u); IndexData >>= 5; }
		else { OutIndices[0] = NextVertex++; }
		if (MinusNumRefVertices <= -2) { OutIndices.y = BaseVertex - (IndexData & 31u); IndexData >>= 5; }
		else { OutIndices[1] = NextVertex++; }
		if (MinusNumRefVertices <= -3) { OutIndices.z = BaseVertex - (IndexData & 31u); }
		else { OutIndices[2] = NextVertex++; }
	}
	else
	{
		// Handle two first vertices
		const uint PrevBitIndex = BitIndex - 1u;
		const int IsPrevStart = BitFieldExtractI32(SMask, 1, PrevBitIndex);
		const int IsPrevHeadRef = BitFieldExtractI32(HeadRefVertexMask, 1, PrevBitIndex);
		//const int NumPrevNewVerticesInTriangle = IsPrevStart ? ( 3u - ( bfe_u32( /*SLMask*/ LMask, PrevBitIndex, 1 ) << 1 ) - bfe_u32( /*SMask &*/ WMask, PrevBitIndex, 1 ) ) : /*1u - IsPrevRefVertex*/ 0u;
		const int NumPrevNewVerticesInTriangle = IsPrevStart & (3u - ((BitFieldExtractU32( /*SLMask*/ LMask, 1, PrevBitIndex) << 1) | BitFieldExtractU32( /*SMask &*/ WMask, 1, PrevBitIndex)));

		//OutIndices[ 1 ] = IsPrevRefVertex ? ( BaseVertex - ( IndexData & 31u ) + NumPrevNewVerticesInTriangle ) : BaseVertex;	// BaseVertex = ( NumPrevNewVertices - 1 );
		OutIndices.y = BaseVertex + (IsPrevHeadRef & (NumPrevNewVerticesInTriangle - (IndexData & 31u)));
		//OutIndices[ 2 ] = IsRefVertex ? ( BaseVertex - bfe_u32( IndexData, 5, 5 ) ) : NumPrevNewVertices;
		OutIndices.z = NumPrevNewVertices + (IsRef & (-1 - BitFieldExtractU32(IndexData, 5, 5)));

		// We have to search for the third vertex.
		// Left triangles search for previous Right/Start. Right triangles search for previous Left/Start.
		const uint SearchMask = SMask | (LMask ^ IsLeft);	// SMask | ( IsRight ? LMask : RMask );
		const uint FoundBitIndex = firstbithigh(SearchMask & PrevBitsMask);
		const int IsFoundCaseS = BitFieldExtractI32(SMask, 1, FoundBitIndex);	// -1: true, 0: false

		const uint FoundPrevBitsMask = (1u << FoundBitIndex) - 1u;
		int FoundCurrentDwordNumPrevRefVertices = (countbits(SLMask & FoundPrevBitsMask) << 1) + countbits(WMask & FoundPrevBitsMask);
		int FoundCurrentDwordNumPrevNewVertices = (countbits(SMask & FoundPrevBitsMask) << 1) + FoundBitIndex - FoundCurrentDwordNumPrevRefVertices;

		int FoundNumPrevNewVertices = NumPrevNewVerticesBeforeDword + FoundCurrentDwordNumPrevNewVertices;
		int FoundNumPrevRefVertices = NumPrevRefVerticesBeforeDword + FoundCurrentDwordNumPrevRefVertices;

		const uint FoundNumRefVertices = (BitFieldExtractU32(LMask, 1, FoundBitIndex) << 1) + BitFieldExtractU32(WMask, 1, FoundBitIndex);
		const uint IsBeforeFoundRefVertex = BitFieldExtractU32(HeadRefVertexMask, 1, FoundBitIndex - 1);

		// ReadOffset: Where is the vertex relative to triangle we searched for?
		const int ReadOffset = IsFoundCaseS ? IsLeft : 1;
		const uint FoundIndexData = ReadUnalignedDword(SrcPageBuffer, ReadBaseAddress, (FoundNumPrevRefVertices - ReadOffset) * 5);
		const uint FoundIndex = (FoundNumPrevNewVertices - 1u) - BitFieldExtractU32(FoundIndexData, 5, 0);

		bool bCondition = IsFoundCaseS ? ((int)FoundNumRefVertices >= 1 - IsLeft) : IsBeforeFoundRefVertex;
		int FoundNewVertex = FoundNumPrevNewVertices + (IsFoundCaseS ? (IsLeft & (FoundNumRefVertices == 0)) : -1);
		OutIndices.x = bCondition ? FoundIndex : FoundNewVertex;

		// Would it be better to code New verts instead of Ref verts?
		// HeadRefVertexMask would just be WMask?

		if (IsLeft)
		{
			OutIndices.yz = OutIndices.zy;
		}
	}

	return OutIndices;
}
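
// Transcodes the attributes of one ref vertex: reads them from the referenced,
// already-installed cluster (on this page or on a dependency page) and
// re-encodes them against this cluster's quantization ranges. Templated on
// NumTexCoords so each specialization can fully unroll its UV loop.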
template<uint NumTexCoords>
void TranscodeVertexAttributes(FPageDiskHeader PageDiskHeader, FCluster Cluster, uint DstPageBaseOffset, uint LocalClusterIndex, uint VertexIndex,
							   FCluster RefCluster, uint RefPageBaseOffset, uint RefVertexIndex, uint SrcPageBaseOffset)
{
	const uint CompileTimeMaxAttributeBits = CalculateMaxAttributeBits(NumTexCoords);

	FBitStreamWriterState OutputStream = BitStreamWriter_Create_Aligned(DstPageBaseOffset + Cluster.AttributeOffset, VertexIndex * Cluster.BitsPerAttribute);
	FBitStreamReaderState InputStream = BitStreamReader_Create_Aligned(RefPageBaseOffset + RefCluster.AttributeOffset, RefVertexIndex * RefCluster.BitsPerAttribute, CompileTimeMaxAttributeBits);

	// Normal
	const uint PackedNormal = BitStreamReader_Read_RW(DstPageBuffer, InputStream, 2 * Cluster.NormalPrecision, 2 * NANITE_MAX_NORMAL_QUANTIZATION_BITS);
	BitStreamWriter_Writer(DstPageBuffer, OutputStream, PackedNormal, 2 * Cluster.NormalPrecision, 2 * NANITE_MAX_NORMAL_QUANTIZATION_BITS);

	// Tangent
	const int NumTangentBits = (Cluster.bHasTangents ? (1 + Cluster.TangentPrecision) : 0);
	const uint PackedTangent = BitStreamReader_Read_RW(DstPageBuffer, InputStream, NumTangentBits, 1 + NANITE_MAX_TANGENT_QUANTIZATION_BITS);
	BitStreamWriter_Writer(DstPageBuffer, OutputStream, PackedTangent, NumTangentBits, 1 + NANITE_MAX_TANGENT_QUANTIZATION_BITS);

	// Color
	{
		const uint4 SrcComponentBits = UnpackToUint4(RefCluster.ColorBits, 4);
		const uint4 SrcColorDelta = BitStreamReader_Read4_RW(DstPageBuffer, InputStream, SrcComponentBits, 8);

		if (Cluster.ColorMode == NANITE_VERTEX_COLOR_MODE_VARIABLE)
		{
			const uint SrcPackedColorDelta = SrcColorDelta.x | (SrcColorDelta.y << 8) | (SrcColorDelta.z << 16) | (SrcColorDelta.w << 24);
			const uint PackedColor = RefCluster.ColorMin + SrcPackedColorDelta;

			const uint4 DstComponentBits = UnpackToUint4(Cluster.ColorBits, 4);
			const uint DstPackedColorDelta = PackedColor - Cluster.ColorMin;

			const uint PackedDeltaColor =	BitFieldExtractU32(DstPackedColorDelta, 8, 0) |
											(BitFieldExtractU32(DstPackedColorDelta, 8, 8) << (DstComponentBits.x)) |
											(BitFieldExtractU32(DstPackedColorDelta, 8, 16) << (DstComponentBits.x + DstComponentBits.y)) |
											(BitFieldExtractU32(DstPackedColorDelta, 8, 24) << (DstComponentBits.x + DstComponentBits.y + DstComponentBits.z));

			BitStreamWriter_Writer(DstPageBuffer, OutputStream, PackedDeltaColor, DstComponentBits.x + DstComponentBits.y + DstComponentBits.z + DstComponentBits.w, 4 * NANITE_MAX_COLOR_QUANTIZATION_BITS);
		}
	}

	const uint Stride = NumTexCoords * SIZEOF_PACKED_UV_HEADER + (Cluster.bSkinning ? SIZEOF_PACKED_BONE_INFLUENCE_HEADER : 0u);	// Assumes bSkinning is the same for all clusters on the page.

	// UVs
	UNROLL_N(NANITE_MAX_UVS)
	for (uint TexCoordIndex = 0; TexCoordIndex < NumTexCoords; TexCoordIndex++)
	{
		const FUVHeader SrcUVHeader = GetUVHeader(DstPageBuffer, RefPageBaseOffset + RefCluster.DecodeInfoOffset, TexCoordIndex);
		const FUVHeader DstUVHeader = GetUVHeader(SrcPageBuffer, SrcPageBaseOffset + PageDiskHeader.DecodeInfoOffset + LocalClusterIndex * Stride, TexCoordIndex);

		const uint2 SrcLocalUV = BitStreamReader_Read2_RW(DstPageBuffer, InputStream, SrcUVHeader.NumBits, NANITE_MAX_TEXCOORD_COMPONENT_BITS);
		const uint2 DstLocalUV = SrcLocalUV + SrcUVHeader.Min - DstUVHeader.Min;

		BitStreamWriter_Writer(DstPageBuffer, OutputStream, DstLocalUV.x, DstUVHeader.NumBits.x, NANITE_MAX_TEXCOORD_COMPONENT_BITS);
		BitStreamWriter_Writer(DstPageBuffer, OutputStream, DstLocalUV.y, DstUVHeader.NumBits.y, NANITE_MAX_TEXCOORD_COMPONENT_BITS);
	}

	BitStreamWriter_Flush(DstPageBuffer, OutputStream);
}

groupshared uint GroupRefToVertex[NANITE_MAX_CLUSTER_VERTICES];
groupshared uint GroupNonRefToVertex[NANITE_MAX_CLUSTER_VERTICES];

template<bool bRefToVertex, bool bNonRefToVertex>
void BuildRefTable(uint AlignedBitmaskOffset, uint NumVerts, uint WaveNumActiveLanes, uint GroupIndex)
{
	NumVerts = min(NumVerts, NANITE_MAX_CLUSTER_VERTICES);

	uint NumPrevPassRefs = 0u;
	for (uint VertexIndex = GroupIndex; VertexIndex < NumVerts; VertexIndex += WaveNumActiveLanes)
	{
		const uint RefMask = SrcPageBuffer.Load(AlignedBitmaskOffset + (VertexIndex >> 5) * 4);
		const bool bRef = BitFieldExtractU32(RefMask, 1, VertexIndex & 31u) != 0;

		const uint NumMaskedRefs = WavePrefixCountBits(bRef);
		const uint RefIndex = NumPrevPassRefs + NumMaskedRefs;
		const uint NonRefIndex = VertexIndex - RefIndex;

		if (bRefToVertex && bRef)
		{
			GroupRefToVertex[RefIndex] = VertexIndex;
		}

		if (bNonRefToVertex && !bRef)
		{
			GroupNonRefToVertex[NonRefIndex] = VertexIndex;
		}

		NumPrevPassRefs += WaveActiveCountBits(bRef);
	}
}
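
// Transcoding runs as two passes over the installed clusters:
//  - Independent: raw-copies the packed cluster data, decodes index data and
//    transcodes non-ref vertices, none of which depend on other pages.
//  - Parent-dependent: resolves ref vertices against clusters that may live
//    on already-installed dependency pages (or on this same page).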
FPageHeader PageHeaderFromPageDiskHeader(FPageDiskHeader PageDiskHeader)
{
	FPageHeader PageHeader;
	PageHeader.NumClusters = PageDiskHeader.NumClusters;
	// Other members are not needed for transcode (yet)
	return PageHeader;
}

void TranscodePageIndependent(uint ClusterInstallIndex, uint WaveNumActiveLanes, uint GroupIndex)
{
	const FClusterInstallInfo ClusterInstallInfo = GetClusterInstallInfo(ClusterInstallIndex);

	const uint SrcPageBaseOffset = ClusterInstallInfo.SrcPageOffset;
	const uint DstPageBaseOffset = ClusterInstallInfo.DstPageOffset;

	const FPageDiskHeader PageDiskHeader = GetPageDiskHeader(SrcPageBaseOffset);
	const uint SrcPackedClusterOffset = SrcPageBaseOffset + SIZEOF_PAGE_DISK_HEADER + PageDiskHeader.NumClusters * SIZEOF_CLUSTER_DISK_HEADER;
	const uint DstPackedClusterOffset = DstPageBaseOffset;

	// Raw copy: FPackedClusters, Material Dwords and DecodeInfo.
	if (ClusterInstallInfo.LocalClusterIndex == 0)
	{
		const uint NumRawFloat4s = PageDiskHeader.NumRawFloat4s;
		for (uint i = GroupIndex; i < NumRawFloat4s; i += WaveNumActiveLanes)
		{
			uint4 Data = SrcPageBuffer.Load4(SrcPackedClusterOffset + i * 16);
			DstPageBuffer.Store4(DstPackedClusterOffset + i * 16, Data);
		}
	}

	const uint LocalClusterIndex = ClusterInstallInfo.LocalClusterIndex;
	const FClusterDiskHeader ClusterDiskHeader = GetClusterDiskHeader(SrcPageBaseOffset, LocalClusterIndex);
	const FPageHeader PageHeader = PageHeaderFromPageDiskHeader(PageDiskHeader);
	const FCluster Cluster = GetCluster(SrcPageBuffer, PageHeader, SrcPackedClusterOffset, LocalClusterIndex);

	const uint BitsPerTriangle = Cluster.BitsPerIndex + 2 * 5;

	const uint AlignedBitmaskOffset = SrcPageBaseOffset + PageDiskHeader.VertexRefBitmaskOffset + LocalClusterIndex * (NANITE_MAX_CLUSTER_VERTICES / 8);
	BuildRefTable<false, true>(AlignedBitmaskOffset, Cluster.NumVerts, WaveNumActiveLanes, GroupIndex);	// Only GroupNonRefToVertex is needed in this pass.
	GroupMemoryBarrierWithGroupSync();

	// Decode indices
	for (uint TriangleIndex = GroupIndex; TriangleIndex < Cluster.NumTris; TriangleIndex += WaveNumActiveLanes)
	{
#if NANITE_USE_STRIP_INDICES
		uint3 Indices = UnpackStripIndices(SrcPageBaseOffset, PageDiskHeader, ClusterDiskHeader, LocalClusterIndex, TriangleIndex);
#else
		FBitStreamReaderState InputStream = BitStreamReader_Create_Aligned(SrcPageBaseOffset + ClusterDiskHeader.IndexDataOffset, TriangleIndex * 24, 24);
		uint Indices24 = BitStreamReader_Read_RO(SrcPageBuffer, InputStream, 24, 24);
		uint3 Indices = uint3(Indices24 & 0xFF, (Indices24 >> 8) & 0xFF, (Indices24 >> 16) & 0xFF);
#endif

		// Rotate triangle so first vertex has the lowest index
		if (Indices.y < min(Indices.x, Indices.z)) Indices = Indices.yzx;
		else if (Indices.z < min(Indices.x, Indices.y)) Indices = Indices.zxy;

		// Store triangle as one base index and two 5-bit offsets. Cluster constraints guarantee that the offsets are never larger than 5 bits.
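		// e.g. with BitsPerIndex == 8 the per-triangle layout is
		// [Delta1:5 | Delta0:5 | BaseIndex:8] (MSB to LSB), so BitsPerTriangle = 18.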
		uint BaseIndex = Indices.x;
		uint Delta0 = Indices.y - BaseIndex;
		uint Delta1 = Indices.z - BaseIndex;
		uint PackedIndices = BaseIndex | (Delta0 << Cluster.BitsPerIndex) | (Delta1 << (Cluster.BitsPerIndex + 5));
		PutBits(DstPageBuffer, DstPageBaseOffset + Cluster.IndexOffset, TriangleIndex * BitsPerTriangle, PackedIndices, BitsPerTriangle);
	}

	// Non-Ref vertices
	const uint NumNonRefVertices = Cluster.NumVerts - ClusterDiskHeader.NumVertexRefs;

#if NANITE_USE_UNCOMPRESSED_VERTEX_DATA
	for (uint VertexIndex = 0; VertexIndex < NumNonRefVertices; VertexIndex++)
	{
		// Position
		uint3 PositionData = SrcPageBuffer.Load3(SrcPageBaseOffset + ClusterDiskHeader.LowBytesDataOffset + VertexIndex * 12);
		DstPageBuffer.Store3(DstPageBaseOffset + Cluster.PositionOffset + VertexIndex * 12, PositionData);

		// Attributes
		CopyDwords(	DstPageBuffer, DstPageBaseOffset + Cluster.AttributeOffset + VertexIndex * Cluster.BitsPerAttribute / 8,
					SrcPageBuffer, SrcPageBaseOffset + ClusterDiskHeader.MidBytesDataOffset + VertexIndex * Cluster.BitsPerAttribute / 8,
					Cluster.BitsPerAttribute / 32);
	}
#else
	const uint PositionBitsPerVertex = Cluster.PosBits.x + Cluster.PosBits.y + Cluster.PosBits.z;
	const uint PositionBytesPerValue = (max3(Cluster.PosBits.x, Cluster.PosBits.y, Cluster.PosBits.z) + 7) / 8;

	uint3 NextLowMidHighOffsets = SrcPageBaseOffset + uint3(ClusterDiskHeader.LowBytesDataOffset, ClusterDiskHeader.MidBytesDataOffset, ClusterDiskHeader.HighBytesDataOffset);

	const uint3 PositionLowMidHighOffsets = NextLowMidHighOffsets;
	NextLowMidHighOffsets += LowMidHighIncrement(PositionBytesPerValue, 3 * NumNonRefVertices);

	const uint NormalBytesPerValue = (Cluster.NormalPrecision + 7) / 8;
	const uint TangentBytesPerValue = (Cluster.TangentPrecision + 1 + 7) / 8;

	const uint3 NormalLowMidHighOffsets = NextLowMidHighOffsets;
	NextLowMidHighOffsets += LowMidHighIncrement(NormalBytesPerValue, 2 * NumNonRefVertices);

	const uint3 TangentLowMidHighOffsets = NextLowMidHighOffsets;
	if (Cluster.bHasTangents)
	{
		NextLowMidHighOffsets += LowMidHighIncrement(TangentBytesPerValue, NumNonRefVertices);
	}

	const uint3 VertexColorLowMidHighOffsets = NextLowMidHighOffsets;
	if (Cluster.ColorMode == NANITE_VERTEX_COLOR_MODE_VARIABLE)
	{
		NextLowMidHighOffsets += LowMidHighIncrement(1, 4 * NumNonRefVertices);
	}

	const uint3 TexCoordLowMidHighBaseOffsets = NextLowMidHighOffsets;

	int4 PrevPassPosition = int4(int3(int(1) << Cluster.PosBits) >> 1, 0);
	int4 PrevPassNormal = 0;
	int4 PrevPassTangent = 0;
	int4 PrevPassVertexColor = 0;
	int4 PrevPassUVs[NANITE_MAX_UVS] = { int4(0, 0, 0, 0), int4(0, 0, 0, 0), int4(0, 0, 0, 0), int4(0, 0, 0, 0) };

	for (uint NonRefVertexIndex = GroupIndex; NonRefVertexIndex < NumNonRefVertices; NonRefVertexIndex += WaveNumActiveLanes)
	{
		uint VertexIndex = NonRefVertexIndex;
		if (Cluster.NumTris > 0)
			VertexIndex = GroupNonRefToVertex[NonRefVertexIndex];

		// Position stream
		{
			FBitStreamWriterState PositionStream = BitStreamWriter_Create_Aligned(DstPageBaseOffset + Cluster.PositionOffset, VertexIndex * PositionBitsPerVertex);
			const int3 DstPosition = UnpackZigZagDeltas<3, NANITE_MAX_POSITION_QUANTIZATION_BYTES>(SrcPageBuffer, PositionLowMidHighOffsets, PositionBytesPerValue, NonRefVertexIndex, PrevPassPosition).xyz & ((int(1) << Cluster.PosBits) - 1);
			BitStreamWriter_Writer(DstPageBuffer, PositionStream, DstPosition.x, Cluster.PosBits.x, NANITE_MAX_POSITION_QUANTIZATION_BITS);
			BitStreamWriter_Writer(DstPageBuffer, PositionStream, DstPosition.y, Cluster.PosBits.y, NANITE_MAX_POSITION_QUANTIZATION_BITS);
			BitStreamWriter_Writer(DstPageBuffer, PositionStream, DstPosition.z, Cluster.PosBits.z, NANITE_MAX_POSITION_QUANTIZATION_BITS);
			BitStreamWriter_Flush(DstPageBuffer, PositionStream);
		}
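
		// The attribute stream below writes components in the same order
		// TranscodeVertexAttributes reads them back for ref vertices:
		// normal, tangent (if present), color (if variable), then UVs.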
		// Attribute stream
		FBitStreamWriterState AttribStream = BitStreamWriter_Create_Aligned(DstPageBaseOffset + Cluster.AttributeOffset, VertexIndex * Cluster.BitsPerAttribute);

		const int2 DstNormal = UnpackZigZagDeltas<2, NANITE_MAX_NORMAL_QUANTIZATION_BYTES>(SrcPageBuffer, NormalLowMidHighOffsets, NormalBytesPerValue, NonRefVertexIndex, PrevPassNormal).xy & ((int(1) << Cluster.NormalPrecision) - 1);
		const uint PackedNormal = (DstNormal.y << Cluster.NormalPrecision) | DstNormal.x;
		BitStreamWriter_Writer(DstPageBuffer, AttribStream, PackedNormal, 2 * Cluster.NormalPrecision, 2 * NANITE_MAX_NORMAL_QUANTIZATION_BITS);

		if (Cluster.bHasTangents)
		{
			const uint NumTangentBits = Cluster.TangentPrecision + 1;
			const int4 PackedTangent = UnpackZigZagDeltas<1, NANITE_MAX_TANGENT_AND_SIGN_QUANTIZATION_BYTES>(SrcPageBuffer, TangentLowMidHighOffsets, TangentBytesPerValue, NonRefVertexIndex, PrevPassTangent) & ((1u << NumTangentBits) - 1u);
			BitStreamWriter_Writer(DstPageBuffer, AttribStream, PackedTangent.x, NumTangentBits, NANITE_MAX_TANGENT_QUANTIZATION_BITS + 1);
		}

		if (Cluster.ColorMode == NANITE_VERTEX_COLOR_MODE_VARIABLE)
		{
			const uint4 DstComponentBits = UnpackToUint4(Cluster.ColorBits, 4);
			const int4 VertexColor = UnpackZigZagDeltas<4, NANITE_MAX_COLOR_QUANTIZATION_BYTES>(SrcPageBuffer, VertexColorLowMidHighOffsets, 1, NonRefVertexIndex, PrevPassVertexColor) & ((1u << DstComponentBits) - 1u);

			const uint PackedDeltaColor =	VertexColor.r |
											(VertexColor.g << (DstComponentBits.x)) |
											(VertexColor.b << (DstComponentBits.x + DstComponentBits.y)) |
											(VertexColor.a << (DstComponentBits.x + DstComponentBits.y + DstComponentBits.z));

			BitStreamWriter_Writer(DstPageBuffer, AttribStream, PackedDeltaColor, DstComponentBits.x + DstComponentBits.y + DstComponentBits.z + DstComponentBits.w, 4 * NANITE_MAX_COLOR_QUANTIZATION_BITS);
		}

		const uint Stride = Cluster.NumUVs * SIZEOF_PACKED_UV_HEADER + (Cluster.bSkinning ? SIZEOF_PACKED_BONE_INFLUENCE_HEADER : 0u);	// Assumes bSkinning is the same for all clusters on the page.
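
		// Each texcoord channel has its own FUVHeader (per-component bit widths
		// and mins) in the page's decode info; the UV delta planes use a byte
		// count derived from the wider of the two components.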
		uint3 TexCoordLowMidHighOffsets = TexCoordLowMidHighBaseOffsets;
		UNROLL_N(NANITE_MAX_UVS)
		for (uint TexCoordIndex = 0; TexCoordIndex < Cluster.NumUVs; TexCoordIndex++)
		{
			const FUVHeader UVHeader = GetUVHeader(SrcPageBuffer, SrcPageBaseOffset + PageDiskHeader.DecodeInfoOffset + LocalClusterIndex * Stride, TexCoordIndex);
			const uint TexCoordBytesPerValue = (max(UVHeader.NumBits.x, UVHeader.NumBits.y) + 7) / 8;

			const int2 UV = UnpackZigZagDeltas<2, NANITE_MAX_TEXCOORD_COMPONENT_BYTES>(SrcPageBuffer, TexCoordLowMidHighOffsets, TexCoordBytesPerValue, NonRefVertexIndex, PrevPassUVs[TexCoordIndex]).xy & ((int(1) << int2(UVHeader.NumBits)) - 1);
			BitStreamWriter_Writer(DstPageBuffer, AttribStream, UV.x, UVHeader.NumBits.x, NANITE_MAX_TEXCOORD_COMPONENT_BITS);
			BitStreamWriter_Writer(DstPageBuffer, AttribStream, UV.y, UVHeader.NumBits.y, NANITE_MAX_TEXCOORD_COMPONENT_BITS);

			TexCoordLowMidHighOffsets += LowMidHighIncrement(TexCoordBytesPerValue, 2 * NumNonRefVertices);
		}

		BitStreamWriter_Flush(DstPageBuffer, AttribStream);
	}
#endif
}

void TranscodePageParentDependent(uint ClusterInstallIndex, uint WaveNumActiveLanes, uint GroupIndex)
{
	const FClusterInstallInfo ClusterInstallInfo = GetClusterInstallInfo(ClusterInstallIndex);

	const uint SrcPageBaseOffset = ClusterInstallInfo.SrcPageOffset;
	const uint DstPageBaseOffset = ClusterInstallInfo.DstPageOffset;

	const FPageDiskHeader PageDiskHeader = GetPageDiskHeader(SrcPageBaseOffset);
	const FPageHeader PageHeader = PageHeaderFromPageDiskHeader(PageDiskHeader);
	const uint SrcPackedClusterOffset = SrcPageBaseOffset + SIZEOF_PAGE_DISK_HEADER + PageDiskHeader.NumClusters * SIZEOF_CLUSTER_DISK_HEADER;

	const uint LocalClusterIndex = ClusterInstallInfo.LocalClusterIndex;
	const FClusterDiskHeader ClusterDiskHeader = GetClusterDiskHeader(SrcPageBaseOffset, LocalClusterIndex);
	const FCluster Cluster = GetCluster(SrcPageBuffer, PageHeader, SrcPackedClusterOffset, LocalClusterIndex);

	const uint AlignedBitmaskOffset = SrcPageBaseOffset + PageDiskHeader.VertexRefBitmaskOffset + LocalClusterIndex * (NANITE_MAX_CLUSTER_VERTICES / 8);
	BuildRefTable<true, false>(AlignedBitmaskOffset, Cluster.NumVerts, WaveNumActiveLanes, GroupIndex);	// Only GroupRefToVertex is needed in this pass.
	GroupMemoryBarrierWithGroupSync();

	// Ref vertices
	int4 PrevRefVertexIndex = 0;
	for (uint RefIndex = GroupIndex; RefIndex < ClusterDiskHeader.NumVertexRefs; RefIndex += WaveNumActiveLanes)
	{
		const uint VertexIndex = GroupRefToVertex[RefIndex];
		const uint PageClusterIndex = ReadBytes<1>(SrcPageBuffer, SrcPageBaseOffset + ClusterDiskHeader.VertexRefDataOffset, RefIndex).x;
		const uint PageClusterData = SrcPageBuffer.Load(SrcPageBaseOffset + ClusterDiskHeader.PageClusterMapOffset + PageClusterIndex * 4);

		const uint RefPageIndex = PageClusterData >> NANITE_MAX_CLUSTERS_PER_PAGE_BITS;
		const uint RefLocalClusterIndex = BitFieldExtractU32(PageClusterData, NANITE_MAX_CLUSTERS_PER_PAGE_BITS, 0);
		const uint RefVertexIndex = UnpackZigZagDeltas<1, 1>(SrcPageBuffer, uint3(SrcPageBaseOffset + ClusterDiskHeader.VertexRefDataOffset + PageDiskHeader.NumVertexRefs, 0, 0), 1, RefIndex, PrevRefVertexIndex).x & 0xFF;

		uint RefPageBaseOffset = 0;
		if (RefPageIndex != 0)
		{
			// Parent ref
			RefPageBaseOffset = GPUPageIndexToGPUOffset(PageDependenciesBuffer[ClusterInstallInfo.PageDependenciesOffset + (RefPageIndex - 1)]);
		}
		else
		{
			// Same page ref
			RefPageBaseOffset = DstPageBaseOffset;
		}

		const FPageHeader RefPageHeader = GetPageHeader(DstPageBuffer, RefPageBaseOffset);
		const FCluster RefCluster = GetCluster(DstPageBuffer, RefPageHeader, RefPageBaseOffset, RefLocalClusterIndex);
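
		// Re-quantize from the referenced cluster's domain into this cluster's:
		// positions are rebased from RefCluster.PosStart to Cluster.PosStart
		// below, and the remaining attributes go through TranscodeVertexAttributes.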
		// Transcode position
		{
			const uint RefPositionBitsPerVertex = RefCluster.PosBits.x + RefCluster.PosBits.y + RefCluster.PosBits.z;
			FBitStreamReaderState InputStream = BitStreamReader_Create_Aligned(RefPageBaseOffset + RefCluster.PositionOffset, RefVertexIndex * RefPositionBitsPerVertex, 3 * NANITE_MAX_POSITION_QUANTIZATION_BITS);
			const int3 RefPosition = BitStreamReader_Read3_RW(DstPageBuffer, InputStream, RefCluster.PosBits, NANITE_MAX_POSITION_QUANTIZATION_BITS);

			const int3 DstPosition = RefPosition + RefCluster.PosStart - Cluster.PosStart;

			const uint PositionBitsPerVertex = Cluster.PosBits.x + Cluster.PosBits.y + Cluster.PosBits.z;
			FBitStreamWriterState OutputStream = BitStreamWriter_Create_Aligned(DstPageBaseOffset + Cluster.PositionOffset, VertexIndex * PositionBitsPerVertex);
			BitStreamWriter_Writer(DstPageBuffer, OutputStream, DstPosition.x, Cluster.PosBits.x, NANITE_MAX_POSITION_QUANTIZATION_BITS);
			BitStreamWriter_Writer(DstPageBuffer, OutputStream, DstPosition.y, Cluster.PosBits.y, NANITE_MAX_POSITION_QUANTIZATION_BITS);
			BitStreamWriter_Writer(DstPageBuffer, OutputStream, DstPosition.z, Cluster.PosBits.z, NANITE_MAX_POSITION_QUANTIZATION_BITS);
			BitStreamWriter_Flush(DstPageBuffer, OutputStream);
		}

		// Specialize vertex transcoding codegen for each of the possible values for NumTexCoords
		const uint NumTexCoords = Cluster.NumUVs;
		if (NumTexCoords == 0)
		{
			TranscodeVertexAttributes<0>(PageDiskHeader, Cluster, DstPageBaseOffset, LocalClusterIndex, VertexIndex, RefCluster, RefPageBaseOffset, RefVertexIndex, SrcPageBaseOffset);
		}
		else if (NumTexCoords == 1)
		{
			TranscodeVertexAttributes<1>(PageDiskHeader, Cluster, DstPageBaseOffset, LocalClusterIndex, VertexIndex, RefCluster, RefPageBaseOffset, RefVertexIndex, SrcPageBaseOffset);
		}
		else if (NumTexCoords == 2)
		{
			TranscodeVertexAttributes<2>(PageDiskHeader, Cluster, DstPageBaseOffset, LocalClusterIndex, VertexIndex, RefCluster, RefPageBaseOffset, RefVertexIndex, SrcPageBaseOffset);
		}
		else if (NumTexCoords == 3)
		{
			TranscodeVertexAttributes<3>(PageDiskHeader, Cluster, DstPageBaseOffset, LocalClusterIndex, VertexIndex, RefCluster, RefPageBaseOffset, RefVertexIndex, SrcPageBaseOffset);
		}
		else if (NumTexCoords == 4)
		{
			TranscodeVertexAttributes<4>(PageDiskHeader, Cluster, DstPageBaseOffset, LocalClusterIndex, VertexIndex, RefCluster, RefPageBaseOffset, RefVertexIndex, SrcPageBaseOffset);
		}
	}
}

#if COMPILER_SUPPORTS_WAVE_SIZE
WAVESIZE(GROUP_SIZE)
#endif
[numthreads(GROUP_SIZE, 1, 1)]
void TranscodePageToGPU(uint3 GroupID : SV_GroupID, uint GroupIndex : SV_GroupIndex)
{
	const uint FlatGroupID = GetUnWrappedDispatchGroupId(GroupID);
	if (FlatGroupID >= NumClusters)
	{
		return;
	}

	const uint ClusterInstallIndex = StartClusterIndex + FlatGroupID;
	if (GroupIndex >= WaveGetLaneCount())
	{
		// Workaround for any platform that might not support setting the wave size explicitly
		return;
	}
	const uint WaveNumActiveLanes = min(WaveGetLaneCount(), GROUP_SIZE);

#if NANITE_TRANSCODE_PASS == NANITE_TRANSCODE_PASS_INDEPENDENT
	TranscodePageIndependent(ClusterInstallIndex, WaveNumActiveLanes, GroupIndex);
#elif NANITE_TRANSCODE_PASS == NANITE_TRANSCODE_PASS_PARENT_DEPENDENT
	TranscodePageParentDependent(ClusterInstallIndex, WaveNumActiveLanes, GroupIndex);
#endif
}