// Copyright Epic Games, Inc. All Rights Reserved.

#pragma once

#include "../Common.ush"
#include "../BitPacking.ush"
#include "../SceneData.ush"
#include "../WaveOpUtil.ush"
#include "../BoneTransform.ush"
#include "/Engine/Shared/NaniteDefinitions.h"
#include "/Engine/Shared/SkinningDefinitions.h"
#include "/Engine/Shared/HLSLStaticAssert.h"
#include "NanitePackedNaniteView.ush"

#ifndef DEBUG_FLAGS
#define DEBUG_FLAGS 0
#endif

// Returns the index of the HW cluster counter used by the current pass.
uint GetHWClusterCounterIndex(uint InRenderFlags)
{
	// Ensure rasterizer uses compile time constants.
#ifdef NANITE_HW_COUNTER_INDEX
	return NANITE_HW_COUNTER_INDEX;
#else
	// Other passes use a uniform branch to minimize permutations.
	return CondMask(InRenderFlags & (NANITE_RENDER_FLAG_MESH_SHADER | NANITE_RENDER_FLAG_PRIMITIVE_SHADER), 4u, 5u);
#endif
}

// A cluster that survived culling for a given view/instance.
struct FVisibleCluster
{
	uint	Flags;
	uint	ViewId;
	uint	InstanceId;
	uint	PageIndex;
	uint	ClusterIndex;
	uint	AssemblyTransformIndex;
	uint	DepthBucket;
	uint2	vPage;
	uint2	vPageEnd;	// Last page to render (inclusive). Only used during SW rasterization currently
};

// Decoded GPU page header (see UnpackPageHeader for the packed layout).
struct FPageHeader
{
	uint	NumClusters;
	uint	MaxClusterBoneInfluences;
	uint	MaxVoxelBoneInfluences;
};

// Fully decoded cluster description (see UnpackCluster for the packed layout).
struct FCluster
{
	uint	PageBaseAddress;

	uint	NumVerts;
	uint	PositionOffset;

	uint	NumTris;
	uint	IndexOffset;

	int3	PosStart;
	uint	BitsPerIndex;
	int		PosPrecision;
	uint3	PosBits;
	uint	NormalPrecision;
	uint	TangentPrecision;
	float	PosScale;
	float	PosRcpScale;

	float4	LODBounds;

	float3	BoxBoundsCenter;
	float	LODError;
	float	EdgeLength;

	float3	BoxBoundsExtent;
	uint	Flags;

	uint	AttributeOffset;
	uint	BitsPerAttribute;
	uint	DecodeInfoOffset;
	bool	bHasTangents;
	bool	bSkinning;
	bool	bVoxel;
	uint	NumUVs;
	uint	ColorMode;
	uint	UVBitOffsets;
	uint	ColorMin;
	uint	ColorBits;
	uint	GroupIndex;		// Debug only

	uint	NumClusterBoneInfluences;
	uint	ClusterBoneInfluenceAddress;
	uint	ClusterBoneInfluenceStride;

	// Material Slow path
	uint	MaterialTableOffset;
	uint	MaterialTableLength;

	uint	VertReuseBatchCountTableOffset;	// dword offset from page base
	uint	VertReuseBatchCountTableSize;	// number of entries, each 4-bit

	// Material Fast path
	uint	Material0Length;
	uint	Material0Index;
	uint	Material1Length;
	uint	Material1Index;
	uint	Material2Index;

	uint	MaterialTotalLength;

	uint4	VertReuseBatchInfo;

	uint	ExtendedDataOffset;
	uint	ExtendedDataNum;

	uint	BrickDataOffset;
	uint	BrickDataNum;
};

// One voxel brick entry (see DecodeBrick for the packed layout).
struct FBrick
{
	uint2	ReverseBrickBits;
	int3	StartPos;
	uint3	BrickMax;
	uint	VertOffset;
};

// Per-cluster bone influence record, mirrored by the builder.
struct FClusterBoneInfluence
{
	uint	BoneIndex;
#if NANITE_USE_PRECISE_SKINNING_BOUNDS
	float	MinWeight;
	float	MaxWeight;
	float3	BoundMin;
	float3	BoundMax;
#endif
};

// Per-voxel bone influence record.
struct FVoxelBoneInfluence
{
	uint	BoneIndex;
	float	Weight;
};

// One child slice of a BVH hierarchy node (see UnpackHierarchyNodeSlice).
struct FHierarchyNodeSlice
{
	float4	LODBounds;
	float3	BoxBoundsCenter;
	float3	BoxBoundsExtent;
	float	MinLODError;
	float	MaxParentLODError;
	uint	ChildStartReference;	// Can be node (index) or cluster (page:cluster)
	uint	NumChildren;
	uint	StartPageIndex;
	uint	NumPages;
	uint	AssemblyTransformIndex;
	bool	bEnabled;
	bool	bLoaded;
	bool	bLeaf;
};

// Per-instance transforms for the current and previous frame.
struct FInstanceDynamicData
{
	float4x4	LocalToTranslatedWorld;
	float4x4	PrevLocalToTranslatedWorld;
	bool		bHasMoved;
};

// Unpacked per-view data used throughout Nanite culling/rasterization/shading.
struct FNaniteView
{
	float4x4	SVPositionToTranslatedWorld;
	float4x4	ViewToTranslatedWorld;
	float4x4	TranslatedWorldToView;
	float4x4	TranslatedWorldToClip;
	float4x4	ViewToClip;
	FDFMatrix	ClipToWorld;
	float4x4	PrevTranslatedWorldToView;
	float4x4	PrevTranslatedWorldToClip;
	float4x4	PrevViewToClip;
	FDFMatrix	PrevClipToWorld;
	float3x3	FirstPersonTransform;
	float4		TranslatedGlobalClipPlane;
	int4		ViewRect;
	float4		ViewSizeAndInvSize;
	float4		ClipSpaceScaleOffset;
	float4		MaterialCacheUnwrapMinAndInvSize;
	float4		MaterialCachePageAdvanceAndInvCount;
	FDFVector3	PreViewTranslation;
	FDFVector3	PrevPreViewTranslation;
	FDFVector3	WorldCameraOrigin;
	float3		CullingViewOriginTranslatedWorld;
	float3		ViewForward;
	float3		ViewOriginHigh;
	float		NearPlane;
	float		LODScale;
	float		LODScaleHW;
	float		CullingViewMinRadiusTestFactorSq;
	uint		StreamingPriorityCategory;
	uint		Flags;
	int			TargetLayerIndex;
	int			TargetMipLevel;
	int			TargetNumMipLevels;
	int			TargetPrevLayerIndex;
	float		RangeBasedCullingDistance;
	int4		HZBTestViewRect;
	float		CullingViewScreenMultipleSq;
	uint		InstanceOcclusionQueryMask;
	bool		bUseLightingChannelMask;
	uint		LightingChannelMask;
	int			SceneRendererPrimaryViewId;
	float2		DynamicDepthCullRange;
};

// An (instance, view) pair to be drawn.
struct FInstanceDraw
{
	uint InstanceId;
	uint ViewId;
};

// VS->PS interpolants for Nanite fullscreen passes.
struct FNaniteFullscreenVSToPS
{
#if INSTANCED_STEREO
	nointerpolation uint EyeIndex  : PACKED_EYE_INDEX;
#endif
	nointerpolation uint ViewIndex : PACKED_VIEW_INDEX;
	nointerpolation uint TileIndex : MACRO_TILE_INDEX;
};

#if NANITE_USE_RAYTRACING_UNIFORM_BUFFER

#define PageConstants			NaniteRayTracing.PageConstants
#define MaxNodes				NaniteRayTracing.MaxNodes
#define ClusterPageData			NaniteRayTracing.ClusterPageData
#define HierarchyBuffer			NaniteRayTracing.HierarchyBuffer
#define RayTracingDataBuffer	NaniteRayTracing.RayTracingDataBuffer

// These parameters shouldn't be used in RT shaders
#define RenderFlags 0

//uint MaxVisibleClusters;
//uint DebugFlags;
//ByteAddressBuffer VisibleClustersSWHW;

#else

#if NANITE_USE_RASTER_UNIFORM_BUFFER

#define PageConstants			NaniteRaster.PageConstants
#define MaxNodes				NaniteRaster.MaxNodes
#define MaxVisibleClusters		NaniteRaster.MaxVisibleClusters
#define MaxPatchesPerGroup		NaniteRaster.MaxPatchesPerGroup
#define MeshPass				NaniteRaster.MeshPass
#define InvDiceRate				NaniteRaster.InvDiceRate
#define RenderFlags				NaniteRaster.RenderFlags
#define DebugFlags				NaniteRaster.DebugFlags

#else

uint4	PageConstants;
uint	MaxNodes;
uint	MaxVisibleClusters;
uint	MaxPatchesPerGroup;
uint	MeshPass;
float	InvDiceRate;
uint	RenderFlags;
uint	DebugFlags;

#endif

#if NANITE_USE_SHADING_UNIFORM_BUFFER

#define ClusterPageData			NaniteShading.ClusterPageData
#define VisibleClustersSWHW		NaniteShading.VisibleClustersSWHW
#define HierarchyBuffer			NaniteShading.HierarchyBuffer

#else // !NANITE_USE_SHADING_UNIFORM_BUFFER

ByteAddressBuffer	ClusterPageData;
ByteAddressBuffer	VisibleClustersSWHW;
ByteAddressBuffer	HierarchyBuffer;
StructuredBuffer	RayTracingDataBuffer;

#endif // !NANITE_USE_SHADING_UNIFORM_BUFFER

#endif

#if FEATURE_LEVEL >= FEATURE_LEVEL_SM6 || PLATFORM_SUPPORTS_SM6_0_WAVE_OPERATIONS
// TODO: This header can be included from SM5 from BuildInstanceDrawCommands.usf. Refactor it.
HLSL_STATIC_ASSERT(sizeof(FInstanceDynamicData) == 132, "Unexpected size of FInstanceDynamicData. Update WaveReadLaneAt to reflect changes.");

// Broadcasts an FInstanceDynamicData from lane SrcIndex across the wave.
FInstanceDynamicData WaveReadLaneAt(FInstanceDynamicData In, uint SrcIndex)
{
	FInstanceDynamicData Result;
	Result.LocalToTranslatedWorld		= WaveReadLaneAtMatrix(In.LocalToTranslatedWorld, SrcIndex);
	Result.PrevLocalToTranslatedWorld	= WaveReadLaneAtMatrix(In.PrevLocalToTranslatedWorld, SrcIndex);
	Result.bHasMoved					= WaveReadLaneAt(In.bHasMoved, SrcIndex);
	return Result;
}
#endif

// Converts a view-space linear depth to clip-space Z using the projection matrix.
float ClipZFromLinearZ(FNaniteView NaniteView, float LinearZ)
{
	return LinearZ * NaniteView.ViewToClip[2][2] + NaniteView.ViewToClip[3][2];	// TODO: Pack coefficients into single load?
}

// Packs a (PageIndex, ClusterIndex) pair into a flat index based on max clusters per page.
// Streaming pages and root pages reserve different numbers of cluster bits.
uint PackPoolClusterRef(uint PageIndex, uint ClusterIndex)
{
	const uint MaxStreamingPages = PageConstants.y;
	return (min(PageIndex, MaxStreamingPages) << NANITE_STREAMING_PAGE_MAX_CLUSTERS_BITS) +
		((uint)max((int)PageIndex - (int)MaxStreamingPages, 0) << NANITE_ROOT_PAGE_MAX_CLUSTERS_BITS) +
		ClusterIndex;
}

// Inverse of PackPoolClusterRef: recovers the page index and local cluster index.
void UnpackPoolClusterRef(uint PackedClusterRef, inout uint PageIndex, inout uint ClusterIndex)
{
	const uint MaxStreamingPages = PageConstants.y;
	const uint MaxStreamingClusters = MaxStreamingPages << NANITE_STREAMING_PAGE_MAX_CLUSTERS_BITS;
	if (PackedClusterRef < MaxStreamingClusters)
	{
		// Streaming page range
		PageIndex		= PackedClusterRef >> NANITE_STREAMING_PAGE_MAX_CLUSTERS_BITS;
		ClusterIndex	= PackedClusterRef & ((1u << NANITE_STREAMING_PAGE_MAX_CLUSTERS_BITS) - 1u);
	}
	else
	{
		// Root page range
		PackedClusterRef -= MaxStreamingClusters;
		PageIndex		= MaxStreamingPages + (PackedClusterRef >> NANITE_ROOT_PAGE_MAX_CLUSTERS_BITS);
		ClusterIndex	= PackedClusterRef & ((1u << NANITE_ROOT_PAGE_MAX_CLUSTERS_BITS) - 1u);
	}
}

// Packs a visible cluster into up to four dwords (page data only present when bHasPageData).
uint4 PackVisibleCluster(FVisibleCluster VisibleCluster, bool bHasPageData)
{
	uint4 RawData = 0;
	uint BitPos = 0;
	const uint PackedClusterRef = PackPoolClusterRef(VisibleCluster.PageIndex, VisibleCluster.ClusterIndex);
	WriteBits(RawData, BitPos, VisibleCluster.Flags, NANITE_NUM_CULLING_FLAG_BITS);
	WriteBits(RawData, BitPos, VisibleCluster.ViewId, NANITE_MAX_VIEWS_PER_CULL_RASTERIZE_PASS_BITS);
	WriteBits(RawData, BitPos, VisibleCluster.InstanceId, NANITE_MAX_INSTANCES_BITS);
	WriteBits(RawData, BitPos, PackedClusterRef, NANITE_POOL_CLUSTER_REF_BITS);
#if NANITE_EXTENDED_VISIBLE_CLUSTERS
	WriteBits(RawData, BitPos, VisibleCluster.AssemblyTransformIndex, NANITE_ASSEMBLY_TRANSFORM_INDEX_BITS);
	WriteBits(RawData, BitPos, VisibleCluster.DepthBucket, NANITE_NUM_DEPTH_BUCKETS_PER_BLOCK_BITS);	// This is not needed for candidate clusters.
// We could make a separate CandidateCluster struct if/when this makes a difference. #endif if (bHasPageData) { WriteBits(RawData, BitPos, VisibleCluster.vPage.x, 13); WriteBits(RawData, BitPos, VisibleCluster.vPage.y, 13); uint2 Delta = (VisibleCluster.vPageEnd - VisibleCluster.vPage) & 0x7; WriteBits(RawData, BitPos, Delta.x, 3); WriteBits(RawData, BitPos, Delta.y, 3); } return RawData; } FVisibleCluster UnpackVisibleCluster(uint4 RawData, bool bHasPageData = false) { uint BitPos = 0; FVisibleCluster VisibleCluster; VisibleCluster.Flags = ReadBits( RawData, BitPos, NANITE_NUM_CULLING_FLAG_BITS ); VisibleCluster.ViewId = ReadBits( RawData, BitPos, NANITE_MAX_VIEWS_PER_CULL_RASTERIZE_PASS_BITS ); VisibleCluster.InstanceId = ReadBits( RawData, BitPos, NANITE_MAX_INSTANCES_BITS ); const uint PackedClusterRef = ReadBits( RawData, BitPos, NANITE_POOL_CLUSTER_REF_BITS ); #if NANITE_EXTENDED_VISIBLE_CLUSTERS VisibleCluster.AssemblyTransformIndex = ReadBits( RawData, BitPos, NANITE_ASSEMBLY_TRANSFORM_INDEX_BITS ); VisibleCluster.DepthBucket = ReadBits( RawData, BitPos, NANITE_NUM_DEPTH_BUCKETS_PER_BLOCK_BITS ); #else VisibleCluster.AssemblyTransformIndex = 0xFFFFFFFFu; VisibleCluster.DepthBucket = 0; #endif UnpackPoolClusterRef( PackedClusterRef, VisibleCluster.PageIndex, VisibleCluster.ClusterIndex ); if( bHasPageData ) { VisibleCluster.vPage.x = ReadBits( RawData, BitPos, 13 ); VisibleCluster.vPage.y = ReadBits( RawData, BitPos, 13 ); VisibleCluster.vPageEnd.x = ReadBits( RawData, BitPos, 3 ); VisibleCluster.vPageEnd.y = ReadBits( RawData, BitPos, 3 ); VisibleCluster.vPageEnd += VisibleCluster.vPage; } else { VisibleCluster.vPage = 0; } return VisibleCluster; } FVisibleCluster GetVisibleCluster( ByteAddressBuffer VisibleClusters, uint ClusterIdx, bool bHasPageData = false ) { uint4 RawData; #if NANITE_EXTENDED_VISIBLE_CLUSTERS if( bHasPageData ) RawData = VisibleClusters.Load4( ClusterIdx * 16 ); else RawData = uint4( VisibleClusters.Load3( ClusterIdx * 12 ), 0 ); #else 
if( bHasPageData ) RawData = uint4( VisibleClusters.Load3( ClusterIdx * 12 ), 0 ); else RawData = uint4( VisibleClusters.Load2( ClusterIdx * 8 ), 0, 0 ); #endif return UnpackVisibleCluster(RawData, bHasPageData); } FVisibleCluster GetVisibleCluster( uint ClusterIdx, bool bHasPageData ) { #if NANITE_USE_RAYTRACING_UNIFORM_BUFFER return (FVisibleCluster)0; #else return GetVisibleCluster( VisibleClustersSWHW, ClusterIdx, bHasPageData ); #endif } bool IsVisibleClusterIndexImposter(uint ClusterIndex) { #if NANITE_IMPOSTERS_SUPPORTED return ClusterIndex >= (1 << 24); #else return false; #endif } FVisibleCluster GetVisibleCluster( uint ClusterIndex ) { FVisibleCluster VisibleCluster; #if NANITE_IMPOSTERS_SUPPORTED if( IsVisibleClusterIndexImposter(ClusterIndex) ) { // Couldn't have been stored so signals this is an imposter VisibleCluster.Flags = 1 << NANITE_NUM_CULLING_FLAG_BITS; VisibleCluster.ViewId = 0; // TODO VisibleCluster.InstanceId = BitFieldExtractU32( ClusterIndex, NANITE_MAX_INSTANCES_BITS - 1, 1 ); VisibleCluster.AssemblyTransformIndex = uint32(-1); VisibleCluster.PageIndex = 0; VisibleCluster.ClusterIndex = ClusterIndex & 1; } else #endif { VisibleCluster = GetVisibleCluster( ClusterIndex, false ); } return VisibleCluster; } bool IsValidAssemblyTransformIndex(uint AssemblyTransformIndex) { // NOTE: This should effectively cause all assembly transformation to DCE without having to #ifdef callsites #if NANITE_ASSEMBLY_DATA return AssemblyTransformIndex < NANITE_MAX_ASSEMBLY_TRANSFORMS; #else return false; #endif } bool IsAssemblyPartCluster(FVisibleCluster VisibleCluster) { return IsValidAssemblyTransformIndex(VisibleCluster.AssemblyTransformIndex); } FCluster UnpackCluster(uint4 ClusterData[NANITE_NUM_PACKED_CLUSTER_FLOAT4S], FPageHeader PageHeader, uint PageBaseAddress, uint LocalClusterIndex) { FCluster Cluster; Cluster.PageBaseAddress = PageBaseAddress; Cluster.NumVerts = BitFieldExtractU32(ClusterData[0].x, 14, 0); Cluster.PositionOffset = 
BitFieldExtractU32(ClusterData[0].x, 18, 14); Cluster.NumTris = BitFieldExtractU32(ClusterData[0].y, 8, 0); Cluster.IndexOffset = BitFieldExtractU32(ClusterData[0].y, 24, 8); Cluster.ColorMin = ClusterData[0].z; Cluster.ColorBits = BitFieldExtractU32(ClusterData[0].w, 16, 0); Cluster.GroupIndex = BitFieldExtractU32(ClusterData[0].w, 16, 16); // Debug only Cluster.PosStart = ClusterData[1].xyz; Cluster.BitsPerIndex = BitFieldExtractU32(ClusterData[1].w, 3, 0) + 1; Cluster.PosPrecision = (int)BitFieldExtractU32(ClusterData[1].w, 6, 3) + NANITE_MIN_POSITION_PRECISION; Cluster.PosBits.x = BitFieldExtractU32(ClusterData[1].w, 5, 9); Cluster.PosBits.y = BitFieldExtractU32(ClusterData[1].w, 5, 14); Cluster.PosBits.z = BitFieldExtractU32(ClusterData[1].w, 5, 19); Cluster.NormalPrecision = BitFieldExtractU32(ClusterData[1].w, 4, 24); Cluster.TangentPrecision = BitFieldExtractU32(ClusterData[1].w, 4, 28); Cluster.PosScale = asfloat(asint(1.0f) - (Cluster.PosPrecision << 23)); Cluster.PosRcpScale = asfloat(asint(1.0f) + (Cluster.PosPrecision << 23)); Cluster.LODBounds = asfloat(ClusterData[2]); Cluster.BoxBoundsCenter = asfloat(ClusterData[3].xyz); Cluster.LODError = f16tof32(ClusterData[3].w); Cluster.EdgeLength = f16tof32(ClusterData[3].w >> 16); Cluster.BoxBoundsExtent = asfloat(ClusterData[4].xyz); Cluster.Flags = BitFieldExtractU32(ClusterData[4].w, 4, 0); Cluster.NumClusterBoneInfluences = BitFieldExtractU32(ClusterData[4].w, 5, 4); Cluster.AttributeOffset = BitFieldExtractU32(ClusterData[5].x, 22, 0); Cluster.BitsPerAttribute = BitFieldExtractU32(ClusterData[5].x, 10, 22); Cluster.DecodeInfoOffset = BitFieldExtractU32(ClusterData[5].y, 22, 0); Cluster.bHasTangents = BitFieldExtractU32(ClusterData[5].y, 1, 22); Cluster.bSkinning = BitFieldExtractU32(ClusterData[5].y, 1, 23); Cluster.bVoxel = (Cluster.NumTris == 0) && NANITE_VOXEL_DATA; Cluster.NumUVs = BitFieldExtractU32(ClusterData[5].y, 3, 24); Cluster.ColorMode = BitFieldExtractU32(ClusterData[5].y, 1, 27); 
Cluster.UVBitOffsets = ClusterData[5].z; const uint MaterialEncoding = ClusterData[5].w; Cluster.ExtendedDataOffset = BitFieldExtractU32(ClusterData[6].x, 22, 0); Cluster.ExtendedDataNum = BitFieldExtractU32(ClusterData[6].x, 10, 22); Cluster.BrickDataOffset = BitFieldExtractU32(ClusterData[6].y, 22, 0); Cluster.BrickDataNum = BitFieldExtractU32(ClusterData[6].y, 10, 22); // Material Table Range Encoding (32 bits) // uint TriStart : 8; // max 128 triangles // uint TriLength : 8; // max 128 triangles // uint MaterialIndex : 6; // max 64 materials // uint Padding : 10; // Material Packed Range - Fast Path (32 bits) // uint Material0Index : 6; // max 64 materials (0:Material0Length) // uint Material1Index : 6; // max 64 materials (Material0Length:Material1Length) // uint Material2Index : 6; // max 64 materials (remainder) // uint Material0Length : 7; // max 128 triangles (num minus one) // uint Material1Length : 7; // max 64 triangles (materials are sorted, so at most 128/2) // Material Packed Range - Slow Path (32 bits) // uint BufferIndex : 19; // 2^19 max value (tons, it's per prim) // uint BufferLength : 6; // max 64 ranges (num minus one) // uint Padding : 7; // always 127 for slow path. 
corresponds to Material1Length=127 in fast path BRANCH if (MaterialEncoding < 0xFE000000u) { // Fast inline path Cluster.MaterialTableOffset = 0; Cluster.MaterialTableLength = 0; Cluster.Material0Index = BitFieldExtractU32(MaterialEncoding, 6, 0); Cluster.Material1Index = BitFieldExtractU32(MaterialEncoding, 6, 6); Cluster.Material2Index = BitFieldExtractU32(MaterialEncoding, 6, 12); Cluster.Material0Length = BitFieldExtractU32(MaterialEncoding, 7, 18) + 1; Cluster.Material1Length = BitFieldExtractU32(MaterialEncoding, 7, 25); Cluster.VertReuseBatchCountTableOffset = 0; Cluster.VertReuseBatchCountTableSize = 0; Cluster.VertReuseBatchInfo = ClusterData[7]; } else { // Slow global search path Cluster.MaterialTableOffset = BitFieldExtractU32(MaterialEncoding, 19, 0); Cluster.MaterialTableLength = BitFieldExtractU32(MaterialEncoding, 6, 19) + 1; Cluster.Material0Index = 0; Cluster.Material1Index = 0; Cluster.Material2Index = 0; Cluster.Material0Length = 0; Cluster.Material1Length = 0; Cluster.VertReuseBatchCountTableOffset = ClusterData[7].x; Cluster.VertReuseBatchCountTableSize = ClusterData[7].y; Cluster.VertReuseBatchInfo = 0; } Cluster.MaterialTotalLength = Cluster.bVoxel ? Cluster.BrickDataNum : Cluster.NumTris; const uint ClusterBoneInfluenceAddress = PageBaseAddress + NANITE_GPU_PAGE_HEADER_SIZE + NANITE_NUM_PACKED_CLUSTER_FLOAT4S * 16 * PageHeader.NumClusters; uint VoxelBoneInfluenceAddress = ClusterBoneInfluenceAddress + PageHeader.NumClusters * PageHeader.MaxClusterBoneInfluences * (uint)sizeof(FClusterBoneInfluence); VoxelBoneInfluenceAddress = (VoxelBoneInfluenceAddress + 15) & ~15u; // Align to match builder behavior. //VOXELTODO: We don't seem to actually need more than 4 byte alignment. Fix the builder and save the ~8 bytes per page instead? 
// TODO: Unify these two paths and formats into one if/when we finally kill NANITE_USE_PRECISE_SKINNING_BOUNDS if (Cluster.bVoxel) { Cluster.ClusterBoneInfluenceAddress = VoxelBoneInfluenceAddress + LocalClusterIndex * 4; Cluster.ClusterBoneInfluenceStride = PageHeader.NumClusters * 4; } else { Cluster.ClusterBoneInfluenceAddress = ClusterBoneInfluenceAddress + LocalClusterIndex * (uint)sizeof(FClusterBoneInfluence); Cluster.ClusterBoneInfluenceStride = PageHeader.NumClusters * (uint)sizeof(FClusterBoneInfluence); } return Cluster; } uint GPUPageIndexToGPUOffset(uint PageIndex) { const uint MaxStreamingPages = PageConstants.y; return (min(PageIndex, MaxStreamingPages) << NANITE_STREAMING_PAGE_GPU_SIZE_BITS) + ((uint)max((int)PageIndex - (int)MaxStreamingPages, 0) << NANITE_ROOT_PAGE_GPU_SIZE_BITS); } FPageHeader UnpackPageHeader(uint4 Data) { FPageHeader Header; Header.NumClusters = BitFieldExtractU32(Data.x, 16, 0); Header.MaxClusterBoneInfluences = BitFieldExtractU32(Data.x, 8, 16); Header.MaxVoxelBoneInfluences = BitFieldExtractU32(Data.x, 8, 24); return Header; } FPageHeader GetPageHeader(ByteAddressBuffer InputBuffer, uint PageAddress) { return UnpackPageHeader(InputBuffer.Load4(PageAddress)); } FPageHeader GetPageHeader(RWByteAddressBuffer InputBuffer, uint PageAddress) { return UnpackPageHeader(InputBuffer.Load4(PageAddress)); } FCluster GetCluster(ByteAddressBuffer InputBuffer, FPageHeader PageHeader, uint SrcBaseOffset, uint ClusterIndex) { const uint ClusterSOAStride = (PageHeader.NumClusters << 4); const uint ClusterBaseAddress = SrcBaseOffset + ( ClusterIndex << 4 ); uint4 ClusterData[NANITE_NUM_PACKED_CLUSTER_FLOAT4S]; UNROLL for(int i = 0; i < NANITE_NUM_PACKED_CLUSTER_FLOAT4S; i++) { ClusterData[i] = InputBuffer.Load4( ClusterBaseAddress + i * ClusterSOAStride + NANITE_GPU_PAGE_HEADER_SIZE ); // Adding NANITE_GPU_PAGE_HEADER_SIZE inside the loop prevents compiler confusion about offset modifier and generates better code } return 
UnpackCluster(ClusterData, PageHeader, SrcBaseOffset, ClusterIndex); } FCluster GetCluster(RWByteAddressBuffer InputBuffer, FPageHeader PageHeader, uint PageBaseAddress, uint ClusterIndex) { const uint ClusterSOAStride = (PageHeader.NumClusters << 4); const uint ClusterBaseAddress = PageBaseAddress + (ClusterIndex << 4); uint4 ClusterData[NANITE_NUM_PACKED_CLUSTER_FLOAT4S]; UNROLL for (int i = 0; i < NANITE_NUM_PACKED_CLUSTER_FLOAT4S; i++) { ClusterData[i] = InputBuffer.Load4( ClusterBaseAddress + i * ClusterSOAStride + NANITE_GPU_PAGE_HEADER_SIZE ); // Adding NANITE_GPU_PAGE_HEADER_SIZE inside the loop prevents compiler confusion about offset modifier and generates better code } return UnpackCluster(ClusterData, PageHeader, PageBaseAddress, ClusterIndex); } FCluster GetCluster(uint PageIndex, uint ClusterIndex) { uint PageBaseAddress = GPUPageIndexToGPUOffset(PageIndex); FPageHeader PageHeader = GetPageHeader(ClusterPageData, PageBaseAddress); return GetCluster(ClusterPageData, PageHeader, PageBaseAddress, ClusterIndex); } FHierarchyNodeSlice UnpackHierarchyNodeSlice(uint4 RawData0, uint4 RawData1, uint4 RawData2, uint2 RawData3) { const uint4 Misc0 = RawData1; const uint4 Misc1 = RawData2; const uint2 Misc2 = RawData3; FHierarchyNodeSlice Node; Node.LODBounds = asfloat(RawData0); Node.BoxBoundsCenter = asfloat(Misc0.xyz); Node.BoxBoundsExtent = asfloat(Misc1.xyz); Node.MinLODError = f16tof32(Misc0.w); Node.MaxParentLODError = f16tof32(Misc0.w >> 16); Node.ChildStartReference = Misc1.w; // When changing this, remember to also update StoreHierarchyNodeChildStartReference Node.bLoaded = (Misc1.w != 0xFFFFFFFFu); Node.NumChildren = BitFieldExtractU32(Misc2.x, NANITE_MAX_CLUSTERS_PER_GROUP_BITS, 0); Node.NumPages = BitFieldExtractU32(Misc2.x, NANITE_MAX_GROUP_PARTS_BITS, NANITE_MAX_CLUSTERS_PER_GROUP_BITS); Node.StartPageIndex = BitFieldExtractU32(Misc2.x, NANITE_MAX_RESOURCE_PAGES_BITS, NANITE_MAX_CLUSTERS_PER_GROUP_BITS + NANITE_MAX_GROUP_PARTS_BITS); Node.bEnabled 
= Misc2.x != 0u; Node.bLeaf = Misc2.x != 0xFFFFFFFFu; Node.AssemblyTransformIndex = Misc2.y; return Node; } uint GetHierarchyNodeOffset(uint RootOffset, uint NodeIndex) { return RootOffset + NodeIndex * NANITE_HIERARCHY_NODE_SLICE_SIZE_DWORDS; } FHierarchyNodeSlice GetHierarchyNodeSlice(ByteAddressBuffer InputBuffer, uint NodeOffset, uint ChildIndex) { // NOTE: Offset is expected in dwords const uint BaseAddress = NodeOffset * 4; const uint4 RawData0 = InputBuffer.Load4(BaseAddress + 16 * ChildIndex); const uint4 RawData1 = InputBuffer.Load4(BaseAddress + (NANITE_MAX_BVH_NODE_FANOUT * 16) + 16 * ChildIndex); const uint4 RawData2 = InputBuffer.Load4(BaseAddress + (NANITE_MAX_BVH_NODE_FANOUT * 32) + 16 * ChildIndex); #if NANITE_ASSEMBLY_DATA const uint2 RawData3 = InputBuffer.Load2(BaseAddress + (NANITE_MAX_BVH_NODE_FANOUT * 48) + 8 * ChildIndex); #else const uint2 RawData3 = uint2(InputBuffer.Load(BaseAddress + (NANITE_MAX_BVH_NODE_FANOUT * 48) + 4 * ChildIndex), 0xFFFFFFFFu); #endif return UnpackHierarchyNodeSlice(RawData0, RawData1, RawData2, RawData3); } FHierarchyNodeSlice GetHierarchyNodeSlice(RWByteAddressBuffer InputBuffer, uint NodeOffset, uint ChildIndex) { // NOTE: Offset is expected in dwords const uint BaseAddress = NodeOffset * 4; const uint4 RawData0 = InputBuffer.Load4(BaseAddress + 16 * ChildIndex); const uint4 RawData1 = InputBuffer.Load4(BaseAddress + (NANITE_MAX_BVH_NODE_FANOUT * 16) + 16 * ChildIndex); const uint4 RawData2 = InputBuffer.Load4(BaseAddress + (NANITE_MAX_BVH_NODE_FANOUT * 32) + 16 * ChildIndex); #if NANITE_ASSEMBLY_DATA const uint2 RawData3 = InputBuffer.Load2(BaseAddress + (NANITE_MAX_BVH_NODE_FANOUT * 48) + 8 * ChildIndex); #else const uint2 RawData3 = uint2(InputBuffer.Load(BaseAddress + (NANITE_MAX_BVH_NODE_FANOUT * 48) + 4 * ChildIndex), 0xFFFFFFFFu); #endif return UnpackHierarchyNodeSlice(RawData0, RawData1, RawData2, RawData3); } FHierarchyNodeSlice GetHierarchyNodeSlice(uint NodeOffset, uint ChildIndex) { return 
GetHierarchyNodeSlice(HierarchyBuffer, NodeOffset, ChildIndex); } void StoreHierarchyNodeChildStartReference(RWByteAddressBuffer OutputBuffer, uint NodeOffset, uint ChildIndex, uint ChildStartReference) { const uint Address = NodeOffset * 4 + NANITE_MAX_BVH_NODE_FANOUT * 32 + 16 * ChildIndex + 12; OutputBuffer.Store(Address, ChildStartReference); } // Decode triangle that is represented by one base index and two 5-bit offsets. uint3 DecodeTriangleIndices(FCluster Cluster, uint TriIndex) { if( Cluster.bVoxel ) return uint3( TriIndex, TriIndex, TriIndex ); const uint BitsPerTriangle = Cluster.BitsPerIndex + 2 * 5; FBitStreamReaderState BitStreamReader = BitStreamReader_Create_Aligned(Cluster.PageBaseAddress + Cluster.IndexOffset, TriIndex * BitsPerTriangle, 8 + 2*5); uint BaseIndex = BitStreamReader_Read_RO(ClusterPageData, BitStreamReader, Cluster.BitsPerIndex, 8); uint Delta0 = BitStreamReader_Read_RO(ClusterPageData, BitStreamReader, 5, 5); uint Delta1 = BitStreamReader_Read_RO(ClusterPageData, BitStreamReader, 5, 5); return BaseIndex + uint3(0, Delta0, Delta1); } FBrick DecodeBrick(FCluster Cluster, uint BrickIndex) { FBrick Brick; const uint4 BrickData0 = ClusterPageData.Load4( Cluster.PageBaseAddress + Cluster.BrickDataOffset + 20 * BrickIndex ); const uint BrickData1 = ClusterPageData.Load ( Cluster.PageBaseAddress + Cluster.BrickDataOffset + 20 * BrickIndex + 16 ); Brick.ReverseBrickBits = BrickData0.xy; Brick.BrickMax = uint3(BitFieldExtractU32( BrickData0.z, 2, 0 ), BitFieldExtractU32( BrickData0.z, 2, 2 ), BitFieldExtractU32( BrickData0.z, 2, 4 ) ) + 1u; Brick.StartPos = int3( BitFieldExtractI32( BrickData0.z, 19, 6 ), BitFieldExtractI32( BitAlignU32( BrickData0.w, BrickData0.z, 25 ), 19, 0 ), BitFieldExtractI32( BrickData0.w, 19, 12 ) ); Brick.VertOffset = BrickData1; return Brick; } struct FShadingMask { bool bIsNanitePixel; bool bIsDecalReceiver; bool bHasDistanceField; uint LightingChannels; uint ShadingBin; uint ShadingRate; }; uint PackShadingMask( 
uint ShadingBin, uint ShadingRate, bool bIsDecalReceiver, bool bHasDistanceField, uint LightingChannels ) { uint Packed = 0x1; // Is Nanite Packed |= (BitFieldMaskU32(3, 1) & (LightingChannels << 1u)); // 3 bits for channels 0,1,2 Packed |= (BitFieldMaskU32(14, 4) & (ShadingBin << 4u)); Packed |= select(bIsDecalReceiver, 1u << 18u, 0u); Packed |= select(bHasDistanceField, 1u << 19u, 0u); Packed |= (BitFieldMaskU32(4, 20) & (ShadingRate << 20u)); // 4 bits for 2x2 tier2 VRS return Packed; } uint PackShadingMask(FShadingMask Mask) { return PackShadingMask( Mask.ShadingBin, Mask.ShadingRate, Mask.bIsDecalReceiver, Mask.bHasDistanceField, Mask.LightingChannels ); } FShadingMask UnpackShadingMask(uint Packed) { FShadingMask UnpackedMask; UnpackedMask.bIsNanitePixel = BitFieldExtractU32(Packed.x, 1, 0) != 0; UnpackedMask.LightingChannels = BitFieldExtractU32(Packed.x, 3, 1); UnpackedMask.ShadingBin = BitFieldExtractU32(Packed.x, 14, 4); UnpackedMask.bIsDecalReceiver = BitFieldExtractU32(Packed.x, 1, 18) != 0; UnpackedMask.bHasDistanceField = BitFieldExtractU32(Packed.x, 1, 19) != 0; UnpackedMask.ShadingRate = BitFieldExtractU32(Packed.x, 4, 20); return UnpackedMask; } void UnpackVisPixel( UlongType Pixel, out uint DepthInt, out uint VisibleClusterIndex, out uint TriIndex ) { const uint2 Unpacked = UnpackUlongType(Pixel); VisibleClusterIndex = Unpacked.x >> 7; TriIndex = Unpacked.x & 0x7F; DepthInt = Unpacked.y; VisibleClusterIndex--; } void UnpackVisPixel( UlongType Pixel, out uint DepthInt, out uint VisibleClusterIndex, out uint TriIndex, out bool bIsImposter ) { const uint2 Unpacked = UnpackUlongType(Pixel); VisibleClusterIndex = Unpacked.x >> 7; TriIndex = Unpacked.x & 0x7F; DepthInt = Unpacked.y; #if NANITE_IMPOSTERS_SUPPORTED bIsImposter = (Unpacked.x >> 31); #else bIsImposter = false; #endif VisibleClusterIndex--; } void UnpackDbgPixel( UlongType Pixel, out uint DepthInt, out uint DebugValue ) { const uint2 Unpacked = UnpackUlongType(Pixel); DebugValue = 
Unpacked.x; DepthInt = Unpacked.y; } uint3 GetClusterPosition(uint VertIndex, FCluster Cluster) { const uint BitsPerVertex = Cluster.PosBits.x + Cluster.PosBits.y + Cluster.PosBits.z; const uint BitOffset = MulU24( VertIndex, BitsPerVertex ); uint3 Data = ClusterPageData.Load3(Cluster.PageBaseAddress + Cluster.PositionOffset + ((BitOffset >> 5) << 2)); uint2 Packed = uint2(BitAlignU32(Data.y, Data.x, BitOffset), BitAlignU32(Data.z, Data.y, BitOffset)); uint3 Pos; Pos.x = BitFieldExtractU32(Packed.x, Cluster.PosBits.x, 0); Packed.x = BitAlignU32(Packed.y, Packed.x, Cluster.PosBits.x); Packed.y >>= Cluster.PosBits.x; Pos.y = BitFieldExtractU32(Packed.x, Cluster.PosBits.y, 0); Packed.x = BitAlignU32(Packed.y, Packed.x, Cluster.PosBits.y); Pos.z = BitFieldExtractU32(Packed.x, Cluster.PosBits.z, 0); return Pos; } float3 DecodePosition(uint VertIndex, FCluster Cluster) { #if NANITE_USE_UNCOMPRESSED_VERTEX_DATA return asfloat(ClusterPageData.Load3(Cluster.PageBaseAddress + Cluster.PositionOffset + VertIndex * 12)); #else const uint3 ClusterPos = GetClusterPosition(VertIndex, Cluster); return ((int3)ClusterPos + Cluster.PosStart) * Cluster.PosScale; #endif } FNaniteView UnpackNaniteView(FPackedNaniteView PackedView) { const float3 ViewOriginHigh = { PackedView.ViewOriginHighX, PackedView.ViewOriginHighY, PackedView.ViewOriginHighZ }; FNaniteView NaniteView; NaniteView.SVPositionToTranslatedWorld = PackedView.SVPositionToTranslatedWorld; NaniteView.ViewToTranslatedWorld = PackedView.ViewToTranslatedWorld; NaniteView.ViewOriginHigh = ViewOriginHigh; NaniteView.TranslatedWorldToView = PackedView.TranslatedWorldToView; NaniteView.TranslatedWorldToClip = PackedView.TranslatedWorldToClip; NaniteView.ViewToClip = PackedView.ViewToClip; NaniteView.ClipToWorld = MakeDFMatrix(ViewOriginHigh, PackedView.ClipToRelativeWorld); NaniteView.PrevTranslatedWorldToView = PackedView.PrevTranslatedWorldToView; NaniteView.PrevTranslatedWorldToClip = PackedView.PrevTranslatedWorldToClip; 
NaniteView.PrevViewToClip = PackedView.PrevViewToClip; NaniteView.PrevClipToWorld = MakeDFMatrix(ViewOriginHigh, PackedView.PrevClipToRelativeWorld); NaniteView.TranslatedGlobalClipPlane = PackedView.TranslatedGlobalClipPlane; NaniteView.ViewRect = PackedView.ViewRect; NaniteView.ViewSizeAndInvSize = PackedView.ViewSizeAndInvSize; NaniteView.ClipSpaceScaleOffset = PackedView.ClipSpaceScaleOffset; NaniteView.MaterialCacheUnwrapMinAndInvSize = PackedView.MaterialCacheUnwrapMinAndInvSize; NaniteView.MaterialCachePageAdvanceAndInvCount = PackedView.MaterialCachePageAdvanceAndInvCount; NaniteView.PreViewTranslation = MakeDFVector3(PackedView.PreViewTranslationHigh, PackedView.PreViewTranslationLow); NaniteView.PrevPreViewTranslation = MakeDFVector3(PackedView.PrevPreViewTranslationHigh, PackedView.PrevPreViewTranslationLow); NaniteView.WorldCameraOrigin = MakeDFVector3(ViewOriginHigh, PackedView.ViewOriginLow); NaniteView.CullingViewOriginTranslatedWorld = PackedView.CullingViewOriginTranslatedWorld; NaniteView.ViewForward = PackedView.ViewForward; NaniteView.NearPlane = PackedView.NearPlane; NaniteView.LODScale = PackedView.LODScales.x; NaniteView.LODScaleHW = PackedView.LODScales.y; NaniteView.CullingViewMinRadiusTestFactorSq = PackedView.CullingViewMinRadiusTestFactorSq; NaniteView.CullingViewScreenMultipleSq = PackedView.CullingViewScreenMultipleSq; NaniteView.StreamingPriorityCategory = PackedView.StreamingPriorityCategory_AndFlags & NANITE_STREAMING_PRIORITY_CATEGORY_MASK; NaniteView.Flags = PackedView.StreamingPriorityCategory_AndFlags >> NANITE_NUM_STREAMING_PRIORITY_CATEGORY_BITS; NaniteView.TargetLayerIndex = PackedView.TargetLayerIdX_AndMipLevelY_AndNumMipLevelsZ.x; NaniteView.TargetMipLevel = PackedView.TargetLayerIdX_AndMipLevelY_AndNumMipLevelsZ.y; NaniteView.TargetNumMipLevels = PackedView.TargetLayerIdX_AndMipLevelY_AndNumMipLevelsZ.z; NaniteView.TargetPrevLayerIndex = PackedView.TargetLayerIdX_AndMipLevelY_AndNumMipLevelsZ.w; 
NaniteView.RangeBasedCullingDistance = PackedView.RangeBasedCullingDistance; NaniteView.HZBTestViewRect = PackedView.HZBTestViewRect; NaniteView.InstanceOcclusionQueryMask = PackedView.InstanceOcclusionQueryMask; NaniteView.bUseLightingChannelMask = (PackedView.LightingChannelMask & 0x8u) > 0; // 0b1000 with COMPILER_SUPPORTS_HLSL2021 NaniteView.LightingChannelMask = (PackedView.LightingChannelMask & 0x7u); // 0b0111 NaniteView.FirstPersonTransform = float3x3( f16tof32(PackedView.FirstPersonTransformRowsExceptRow2Z.x ), f16tof32(PackedView.FirstPersonTransformRowsExceptRow2Z.x >> 16u), f16tof32(PackedView.FirstPersonTransformRowsExceptRow2Z.y ), f16tof32(PackedView.FirstPersonTransformRowsExceptRow2Z.y >> 16u), f16tof32(PackedView.FirstPersonTransformRowsExceptRow2Z.z ), f16tof32(PackedView.FirstPersonTransformRowsExceptRow2Z.z >> 16u), f16tof32(PackedView.FirstPersonTransformRowsExceptRow2Z.w ), f16tof32(PackedView.FirstPersonTransformRowsExceptRow2Z.w >> 16u), f16tof32(PackedView.FirstPersonTransformRow2Z)); NaniteView.SceneRendererPrimaryViewId = PackedView.SceneRendererPrimaryViewId; NaniteView.DynamicDepthCullRange = PackedView.DynamicDepthCullRange; return NaniteView; } StructuredBuffer< FPackedNaniteView > InViews; FNaniteView GetNaniteView( uint ViewIndex ) { #if NANITE_USE_VIEW_UNIFORM_BUFFER #if INSTANCED_STEREO ViewState LocalView = GetInstancedView(ViewIndex); #else ViewState LocalView = GetPrimaryView(); #endif FNaniteView NaniteView; NaniteView.SVPositionToTranslatedWorld = LocalView.SVPositionToTranslatedWorld; NaniteView.ViewToTranslatedWorld = LocalView.ViewToTranslatedWorld; NaniteView.ViewOriginHigh = LocalView.ViewOriginHigh; NaniteView.TranslatedWorldToView = LocalView.TranslatedWorldToView; NaniteView.TranslatedWorldToClip = LocalView.TranslatedWorldToClip; NaniteView.ViewToClip = LocalView.ViewToClip; NaniteView.ClipToWorld = LocalView.ClipToWorld; NaniteView.PrevTranslatedWorldToView = LocalView.PrevTranslatedWorldToView; 
// (continuation of GetNaniteView's NANITE_USE_VIEW_UNIFORM_BUFFER branch)
	NaniteView.PrevTranslatedWorldToClip = LocalView.PrevTranslatedWorldToClip;
	NaniteView.PrevViewToClip = LocalView.PrevViewToClip;
	NaniteView.PrevClipToWorld = LocalView.PrevClipToWorld;
	NaniteView.TranslatedGlobalClipPlane = LocalView.GlobalClippingPlane;
	NaniteView.ViewSizeAndInvSize = LocalView.ViewSizeAndInvSize;
	// ViewRectMin is a float2 carrying a half-texel offset; +0.5f rounds it back
	// to integer texel coordinates for the int4 rect.
	NaniteView.ViewRect = int4(int2(LocalView.ViewRectMin.xy + 0.5f), int2(LocalView.ViewRectMin.xy + LocalView.ViewSizeAndInvSize.xy + 0.5f));
	NaniteView.PreViewTranslation = LocalView.PreViewTranslation;
	NaniteView.PrevPreViewTranslation = LocalView.PrevPreViewTranslation;
	NaniteView.WorldCameraOrigin = LocalView.WorldCameraOrigin;
	NaniteView.ViewForward = LocalView.ViewForward;
	NaniteView.NearPlane = LocalView.NearPlane;

	// Fields with no uniform-buffer counterpart get fixed defaults on this path.
	NaniteView.LODScale = 1.0f;
	NaniteView.LODScaleHW = 1.0f;
	NaniteView.CullingViewMinRadiusTestFactorSq = 0.0f;
	NaniteView.StreamingPriorityCategory = 3;
	NaniteView.Flags = NANITE_VIEW_FLAG_HZBTEST | NANITE_VIEW_FLAG_NEAR_CLIP;
	NaniteView.TargetLayerIndex = -1; // INDEX_NONE
	NaniteView.TargetMipLevel = 0;
	NaniteView.TargetNumMipLevels = 0;
	NaniteView.TargetPrevLayerIndex = -1; // INDEX_NONE
	NaniteView.RangeBasedCullingDistance = 0.0f; // 0 = range-based culling disabled
	NaniteView.HZBTestViewRect = NaniteView.ViewRect;
	NaniteView.InstanceOcclusionQueryMask = 0;
	//This path isn't used for the shadow passes but initializing to same value as in GetDefaultLightingChannelMask() for consistency
	NaniteView.LightingChannelMask = 0x1;
	NaniteView.FirstPersonTransform = (float3x3)LocalView.FirstPersonTransform;
	NaniteView.SceneRendererPrimaryViewId = LocalView.GPUSceneViewId;
#else // !NANITE_USE_VIEW_UNIFORM_BUFFER
#if NANITE_MULTI_VIEW
	FPackedNaniteView PackedView = InViews[ViewIndex];
#else
	FPackedNaniteView PackedView = InViews[0];
#endif
	FNaniteView NaniteView = UnpackNaniteView(PackedView);
#endif // NANITE_USE_VIEW_UNIFORM_BUFFER
	return NaniteView;
}

// Fill ViewState using data from a NaniteView
// Overwrites the view transforms, rect, translations and camera fields of
// InOutView with the NaniteView equivalents; other ViewState fields are left
// untouched.
void PatchViewState(FNaniteView NaniteView, inout ViewState InOutView)
{
	InOutView.SVPositionToTranslatedWorld = NaniteView.SVPositionToTranslatedWorld;
	InOutView.ViewToTranslatedWorld = NaniteView.ViewToTranslatedWorld;
	InOutView.ViewOriginHigh = NaniteView.ViewOriginHigh;
	InOutView.TranslatedWorldToView = NaniteView.TranslatedWorldToView;
	InOutView.TranslatedWorldToClip = NaniteView.TranslatedWorldToClip;
	InOutView.ViewToClip = NaniteView.ViewToClip;
	InOutView.ClipToWorld = NaniteView.ClipToWorld;
	InOutView.PrevTranslatedWorldToView = NaniteView.PrevTranslatedWorldToView;
	InOutView.PrevTranslatedWorldToClip = NaniteView.PrevTranslatedWorldToClip;
	InOutView.PrevViewToClip = NaniteView.PrevViewToClip;
	InOutView.PrevClipToWorld = NaniteView.PrevClipToWorld;
	InOutView.ViewSizeAndInvSize = NaniteView.ViewSizeAndInvSize;
	InOutView.ViewRectMin.xy = NaniteView.ViewRect.xy - 0.5f; // Convert from float2 with a half texel offset to an int2 texel coord
	InOutView.PreViewTranslation = NaniteView.PreViewTranslation;
	InOutView.PrevPreViewTranslation = NaniteView.PrevPreViewTranslation;
	InOutView.WorldCameraOrigin = NaniteView.WorldCameraOrigin;
	InOutView.ViewForward = NaniteView.ViewForward;
	InOutView.NearPlane = NaniteView.NearPlane;
#if VIEW_HAS_TILEOFFSET_DATA
	// Keep the tile-offset mirrors of the double-float translations in sync.
	InOutView.TileOffset.PreViewTranslation = DFToTileOffset(InOutView.PreViewTranslation); //DF_TODO: should we upload TO data?
	InOutView.TileOffset.PrevPreViewTranslation = DFToTileOffset(InOutView.PrevPreViewTranslation);
	//InOutView.TileOffset.WorldViewOrigin = DFToTileOffset(InOutView.WorldViewOrigin);
	//InOutView.TileOffset.PrevWorldViewOrigin = DFToTileOffset(InOutView.PrevWorldViewOrigin);
	InOutView.TileOffset.WorldCameraOrigin = DFToTileOffset(InOutView.WorldCameraOrigin);
	//InOutView.TileOffset.PrevWorldCameraOrigin = DFToTileOffset(InOutView.PrevWorldCameraOrigin);
#endif
}

// Projects a world-space edge length to screen at depth ViewZ, pre-scaled by
// the view's LOD scale. Orthographic views (ViewToClip[3][3] >= 1) skip the
// perspective divide.
float GetProjectedEdgeLengthAtDepth(float InLength, float ViewZ, FNaniteView NaniteView)
{
	const bool bOrtho = NaniteView.ViewToClip[3][3] >= 1;
	return (InLength * NaniteView.LODScale) / (bOrtho ? 1.0f : ViewZ);
}

// Writes two consecutive sets of indirect compute-dispatch arguments (SW then
// HW rasterizer bins), each rounded up to 64-thread groups and padded to a
// 4-dword stride.
// NOTE(review): RasterizerArgsSWHW appears in this excerpt as an untyped
// RWBuffer — the element type (likely RWBuffer<uint>) seems stripped; confirm
// against the original file.
void WriteDispatchArgsSWHW(RWBuffer RasterizerArgsSWHW, uint ArgsOffset, uint NumClustersSW, uint NumClustersHW)
{
	RasterizerArgsSWHW[ArgsOffset + 0] = (NumClustersSW + 63u) / 64u; // SW: ThreadGroupCountX
	RasterizerArgsSWHW[ArgsOffset + 1] = 1; // SW: ThreadGroupCountY
	RasterizerArgsSWHW[ArgsOffset + 2] = 1; // SW: ThreadGroupCountZ
	RasterizerArgsSWHW[ArgsOffset + 3] = 0; // padding
	RasterizerArgsSWHW[ArgsOffset + 4] = (NumClustersHW + 63u) / 64u; // HW: ThreadGroupCountX
	RasterizerArgsSWHW[ArgsOffset + 5] = 1; // HW: ThreadGroupCountY
	RasterizerArgsSWHW[ArgsOffset + 6] = 1; // HW: ThreadGroupCountZ
	RasterizerArgsSWHW[ArgsOffset + 7] = 0; // padding
}

// Writes rasterizer indirect args: a SW dispatch at ArgsOffset+0..3, then HW
// args at +4..7 whose layout depends on the active HW raster path (mesh
// shader dispatch, primitive shader draw, or plain vertex-shader draw with
// NANITE_MAX_CLUSTER_TRIANGLES * 3 vertices per instance).
// RenderFlags is a global declared elsewhere in this file.
void WriteRasterizerArgsSWHW(RWBuffer RasterizerArgsSWHW, uint ArgsOffset, uint NumClustersSW, uint NumClustersHW)
{
	RasterizerArgsSWHW[ArgsOffset + 0] = NumClustersSW; // SW: ThreadGroupCountX
	RasterizerArgsSWHW[ArgsOffset + 1] = 1; // SW: ThreadGroupCountY
	RasterizerArgsSWHW[ArgsOffset + 2] = 1; // SW: ThreadGroupCountZ
	RasterizerArgsSWHW[ArgsOffset + 3] = 0; // padding

	uint3 HWArgs; // Assign to local before writing to RasterizerArgsSWHW to work around an FXC issue where the write to RasterizerArgsSWHW[ArgsOffset + 4] would be omitted
	if (RenderFlags & NANITE_RENDER_FLAG_MESH_SHADER)
	{
		HWArgs.x = NumClustersHW; // HW: ThreadGroupCountX
		HWArgs.y = 1; // HW: ThreadGroupCountY
		HWArgs.z = 1; // HW: ThreadGroupCountZ
	}
	else if (RenderFlags & NANITE_RENDER_FLAG_PRIMITIVE_SHADER)
	{
		HWArgs.x = NumClustersHW; // HW: VertexCountPerInstance
		HWArgs.y = 1; // HW: InstanceCount
		HWArgs.z = 0; // HW: StartVertexLocation
	}
	else
	{
		HWArgs.x = NANITE_MAX_CLUSTER_TRIANGLES * 3; // HW: VertexCountPerInstance
		HWArgs.y = NumClustersHW; // HW: InstanceCount
		HWArgs.z = 0; // HW: StartVertexLocation
	}
	RasterizerArgsSWHW[ArgsOffset + 4] = HWArgs.x;
	RasterizerArgsSWHW[ArgsOffset + 5] = HWArgs.y;
	RasterizerArgsSWHW[ArgsOffset + 6] = HWArgs.z;
	RasterizerArgsSWHW[ArgsOffset + 7] = 0; // HW: StartInstanceLocation
}

#if COMPILER_SUPPORTS_HLSL2021

// Reads the per-primitive skinning header from the scene uniform buffer.
// NOTE(review): several buffer Load/Store calls below appear without explicit
// template arguments (e.g. Load<T>) in this excerpt — likely stripped during
// extraction; confirm against the original file.
FNaniteSkinningHeader LoadNaniteSkinningHeader(uint InPrimitiveIndex)
{
	const uint Offset = InPrimitiveIndex * (uint)sizeof(FNaniteSkinningHeader);
	return SceneUB(NaniteSkinning).SkinningHeaders.Load(Offset);
}

#if USE_COMPRESSED_BONE_TRANSFORM

//TODO: Move these outside Nanite and use them for Non-Nanite animation sampling?

// Unpacks a 32-byte compressed bone transform: Data0.xyz holds the full-float
// translation (row 3), the remaining words pack the 3x3 rotation/scale rows as
// nine half floats (two per uint, low then high 16 bits).
float4x3 UnpackCompressedBoneTransform(uint4 Data0, uint4 Data1)
{
	float4x3 Result;
	Result[0] = float3(f16tof32(Data0.w), f16tof32(Data0.w >> 16), f16tof32(Data1.x));
	Result[1] = float3(f16tof32(Data1.x >> 16), f16tof32(Data1.y), f16tof32(Data1.y >> 16));
	Result[2] = float3(f16tof32(Data1.z), f16tof32(Data1.z >> 16), f16tof32(Data1.w));
	Result[3] = asfloat(Data0.xyz);
	return Result;
}

// Loads one compressed bone transform (32-byte stride) from a read-only buffer.
float4x3 LoadCompressedBoneTransform(ByteAddressBuffer SrcBuffer, uint BaseOffset, uint BoneIndex)
{
	const uint Offset = BaseOffset + BoneIndex * 32u;
	const uint4 Data0 = SrcBuffer.Load4(Offset);
	const uint4 Data1 = SrcBuffer.Load4(Offset + 16);
	return UnpackCompressedBoneTransform(Data0, Data1);
}

// RW-buffer overload of the above (same layout and stride).
float4x3 LoadCompressedBoneTransform(RWByteAddressBuffer SrcBuffer, uint BaseOffset, uint BoneIndex)
{
	const uint Offset = BaseOffset + BoneIndex * 32u;
	const uint4 Data0 = SrcBuffer.Load4(Offset);
	const uint4 Data1 = SrcBuffer.Load4(Offset + 16);
	return UnpackCompressedBoneTransform(Data0, Data1);
}

// Inverse of UnpackCompressedBoneTransform: packs rows 0-2 to half floats and
// stores the translation row as full floats, 32 bytes per bone.
void StoreCompressedBoneTransform(RWByteAddressBuffer DstBuffer, uint BaseOffset, uint BoneIndex, float4x3 BoneTransform)
{
	const uint Offset = BaseOffset + BoneIndex * 32u;
	const uint3 XAxis = f32tof16(BoneTransform[0]);
	const uint3 YAxis = f32tof16(BoneTransform[1]);
	const uint3 ZAxis = f32tof16(BoneTransform[2]);
	const uint4 Data0 = uint4(asuint(BoneTransform[3]), XAxis.x | (XAxis.y << 16));
	const uint4 Data1 = uint4(XAxis.z | (YAxis.x << 16), YAxis.y | (YAxis.z << 16), ZAxis.x | (ZAxis.y << 16), ZAxis.z);
	DstBuffer.Store4(Offset, Data0);
	DstBuffer.Store4(Offset + 16, Data1);
}

#else

// Uncompressed path: bone transforms are stored transposed (float3x4) and
// transposed back on load.
float4x3 LoadCompressedBoneTransform(ByteAddressBuffer SrcBuffer, uint BaseOffset, uint BoneIndex)
{
	return transpose(SrcBuffer.Load(BaseOffset + BoneIndex * (uint)sizeof(float3x4)));
}

// RW-buffer overload of the uncompressed load.
float4x3 LoadCompressedBoneTransform(RWByteAddressBuffer SrcBuffer, uint BaseOffset, uint BoneIndex)
{
	return transpose(SrcBuffer.Load(BaseOffset + BoneIndex * (uint)sizeof(float3x4)));
}

// Uncompressed store: writes the transform transposed as float3x4, using the
// typed store intrinsic where the compiler supports it.
void StoreCompressedBoneTransform(RWByteAddressBuffer DstBuffer, uint BaseOffset, uint BoneIndex, float4x3 BoneTransform)
{
	const uint Address = BaseOffset + BoneIndex * (uint)sizeof(float3x4);
	const float3x4 Tmp = transpose(BoneTransform);
#if COMPILER_SUPPORTS_TYPEDSTORE
	DstBuffer.TypedStore(Address, Tmp);
#else
	DstBuffer.Store(Address, Tmp);
#endif
}

#endif

// Loads bone transform TransformIndex from the scene skinning buffer
// (compressed or not per USE_COMPRESSED_BONE_TRANSFORM above).
float4x3 LoadNaniteBoneTransform(uint TransformIndex)
{
	return LoadCompressedBoneTransform(SceneUB(NaniteSkinning).BoneTransforms, 0, TransformIndex);
}

// Loads an object-space bone transform with scale. BufferOffset is expressed
// in floats (dwords) and converted to bytes here.
FBoneTransformWithScale LoadNaniteBoneObjectSpaceWithScale(uint BufferOffset, uint BoneIndex)
{
	const uint BufferOffsetBytes = BufferOffset * (uint)sizeof(float);
	return SceneUB(NaniteSkinning).BoneObjectSpace.Load(BufferOffsetBytes + BoneIndex * (uint)sizeof(FBoneTransformWithScale));
}

// Scale-less variant of the above; note the different element stride.
FBoneTransform LoadNaniteBoneObjectSpace(uint BufferOffset, uint BoneIndex)
{
	const uint BufferOffsetBytes = BufferOffset * (uint)sizeof(float);
	return SceneUB(NaniteSkinning).BoneObjectSpace.Load(BufferOffsetBytes + BoneIndex * (uint)sizeof(FBoneTransform));
}

// Loads an assembly transform from the hierarchy buffer as a full float4x4
// (stored transposed as float3x4; the last row is fixed to (0,0,0,1)).
// HierarchyBufferOffset is in dwords, hence the * 4u byte conversion.
// Returns identity when assembly data is compiled out.
float4x4 LoadNaniteAssemblyTransform(uint HierarchyBufferOffset, uint TransformIndex)
{
#if NANITE_ASSEMBLY_DATA
	const uint BufferAddress = HierarchyBufferOffset * 4u + TransformIndex * (uint)sizeof(float3x4);
	const float3x4 TransposedTransform = HierarchyBuffer.Load(BufferAddress);
	return transpose(float4x4(
		TransposedTransform[0],
		TransposedTransform[1],
		TransposedTransform[2],
		float4(0, 0, 0, 1)
	));
#else
	return float4x4(
		float4(1, 0, 0, 0),
		float4(0, 1, 0, 0),
		float4(0, 0, 1, 0),
		float4(0, 0, 0, 1)
	);
#endif
}

#endif