Files
UnrealEngine/Engine/Shaders/Private/Nanite/NaniteDataDecode.ush
2025-05-18 13:04:45 +08:00

1227 lines
45 KiB
HLSL

// Copyright Epic Games, Inc. All Rights Reserved.
#pragma once
#include "../Common.ush"
#include "../BitPacking.ush"
#include "../SceneData.ush"
#include "../WaveOpUtil.ush"
#include "../BoneTransform.ush"
#include "/Engine/Shared/NaniteDefinitions.h"
#include "/Engine/Shared/SkinningDefinitions.h"
#include "/Engine/Shared/HLSLStaticAssert.h"
#include "NanitePackedNaniteView.ush"
#ifndef DEBUG_FLAGS
#define DEBUG_FLAGS 0
#endif
// Returns the dword slot of the HW rasterizer cluster counter.
// Mesh/primitive shader rasterization and legacy VS rasterization use different slots (4 vs 5).
uint GetHWClusterCounterIndex(uint InRenderFlags)
{
// Ensure rasterizer uses compile time constants.
#ifdef NANITE_HW_COUNTER_INDEX
return NANITE_HW_COUNTER_INDEX;
#else
// Other passes use a uniform branch to minimize permutations.
return CondMask(InRenderFlags & (NANITE_RENDER_FLAG_MESH_SHADER | NANITE_RENDER_FLAG_PRIMITIVE_SHADER), 4u, 5u);
#endif
}
// One cluster that survived culling: which view/instance it was visible in and
// which page/cluster holds its data. Packed/unpacked by Pack/UnpackVisibleCluster.
struct FVisibleCluster
{
uint Flags; // Culling flags (NANITE_NUM_CULLING_FLAG_BITS wide when packed)
uint ViewId;
uint InstanceId;
uint PageIndex;
uint ClusterIndex;
uint AssemblyTransformIndex; // 0xFFFFFFFFu when the cluster is not an assembly part
uint DepthBucket;
uint2 vPage; // First page to render
uint2 vPageEnd; // Last page to render (inclusive). Only used during SW rasterization currently
};
// Decoded GPU page header (first 16 bytes of a page). See UnpackPageHeader for the bit layout.
struct FPageHeader
{
uint NumClusters; // Number of clusters stored in this page
uint MaxClusterBoneInfluences;
uint MaxVoxelBoneInfluences;
};
// Decoded per-cluster header, unpacked from NANITE_NUM_PACKED_CLUSTER_FLOAT4S uint4s
// by UnpackCluster. Offsets are byte offsets relative to PageBaseAddress unless noted.
struct FCluster
{
uint PageBaseAddress; // GPU byte address of the page this cluster lives in
uint NumVerts;
uint PositionOffset; // Offset of the packed vertex position stream
uint NumTris;
uint IndexOffset; // Offset of the packed index stream
int3 PosStart; // Quantization origin added back in DecodePosition
uint BitsPerIndex;
int PosPrecision;
uint3 PosBits; // Bits per quantized position component
uint NormalPrecision;
uint TangentPrecision;
float PosScale; // 2^-PosPrecision (constructed via float bit tricks in UnpackCluster)
float PosRcpScale;
float4 LODBounds;
float3 BoxBoundsCenter;
float LODError;
float EdgeLength;
float3 BoxBoundsExtent;
uint Flags;
uint AttributeOffset;
uint BitsPerAttribute;
uint DecodeInfoOffset;
bool bHasTangents;
bool bSkinning;
bool bVoxel; // True when NumTris == 0 and NANITE_VOXEL_DATA is enabled
uint NumUVs;
uint ColorMode;
uint UVBitOffsets;
uint ColorMin;
uint ColorBits;
uint GroupIndex; // Debug only
uint NumClusterBoneInfluences;
uint ClusterBoneInfluenceAddress;
uint ClusterBoneInfluenceStride;
// Material Slow path
uint MaterialTableOffset;
uint MaterialTableLength;
uint VertReuseBatchCountTableOffset; // dword offset from page base
uint VertReuseBatchCountTableSize; // number of entries, each 4-bit
// Material Fast path
uint Material0Length;
uint Material0Index;
uint Material1Length;
uint Material1Index;
uint Material2Index;
uint MaterialTotalLength; // BrickDataNum for voxel clusters, otherwise NumTris
uint4 VertReuseBatchInfo;
uint ExtendedDataOffset;
uint ExtendedDataNum;
uint BrickDataOffset;
uint BrickDataNum;
};
// One voxel brick record, decoded from 20 bytes by DecodeBrick.
struct FBrick
{
uint2 ReverseBrickBits; // 64-bit occupancy payload -- presumably one bit per voxel; TODO confirm against builder
int3 StartPos;
uint3 BrickMax; // Brick dimensions, each in [1,4] (2-bit fields + 1)
uint VertOffset;
};
// Per-cluster bone influence record. The precise-bounds members exist only when
// NANITE_USE_PRECISE_SKINNING_BOUNDS is enabled, changing sizeof() accordingly.
struct FClusterBoneInfluence
{
uint BoneIndex;
#if NANITE_USE_PRECISE_SKINNING_BOUNDS
float MinWeight;
float MaxWeight;
float3 BoundMin;
float3 BoundMax;
#endif
};
// Per-voxel bone influence: a bone index and its skinning weight.
struct FVoxelBoneInfluence
{
uint BoneIndex;
float Weight;
};
// One child slice of a BVH hierarchy node, decoded by UnpackHierarchyNodeSlice.
struct FHierarchyNodeSlice
{
float4 LODBounds;
float3 BoxBoundsCenter;
float3 BoxBoundsExtent;
float MinLODError;
float MaxParentLODError;
uint ChildStartReference; // Can be node (index) or cluster (page:cluster)
uint NumChildren;
uint StartPageIndex;
uint NumPages;
uint AssemblyTransformIndex;
bool bEnabled; // False when the packed slice dword is 0
bool bLoaded; // False while the referenced child data is still streaming (0xFFFFFFFFu sentinel)
bool bLeaf;
};
// Per-instance dynamic transforms. sizeof must stay 132 bytes -- see the
// HLSL_STATIC_ASSERT guarding the WaveReadLaneAt overload below.
struct FInstanceDynamicData
{
float4x4 LocalToTranslatedWorld;
float4x4 PrevLocalToTranslatedWorld;
bool bHasMoved;
};
// Fully unpacked per-view state used by Nanite culling/rasterization.
// Produced from FPackedNaniteView by UnpackNaniteView, or synthesized from the
// view uniform buffer by GetNaniteView.
struct FNaniteView
{
float4x4 SVPositionToTranslatedWorld;
float4x4 ViewToTranslatedWorld;
float4x4 TranslatedWorldToView;
float4x4 TranslatedWorldToClip;
float4x4 ViewToClip;
FDFMatrix ClipToWorld;
float4x4 PrevTranslatedWorldToView;
float4x4 PrevTranslatedWorldToClip;
float4x4 PrevViewToClip;
FDFMatrix PrevClipToWorld;
float3x3 FirstPersonTransform;
float4 TranslatedGlobalClipPlane;
int4 ViewRect;
float4 ViewSizeAndInvSize;
float4 ClipSpaceScaleOffset;
float4 MaterialCacheUnwrapMinAndInvSize;
float4 MaterialCachePageAdvanceAndInvCount;
FDFVector3 PreViewTranslation;
FDFVector3 PrevPreViewTranslation;
FDFVector3 WorldCameraOrigin;
float3 CullingViewOriginTranslatedWorld;
float3 ViewForward;
float3 ViewOriginHigh;
float NearPlane;
float LODScale;
float LODScaleHW;
float CullingViewMinRadiusTestFactorSq;
uint StreamingPriorityCategory;
uint Flags; // NANITE_VIEW_FLAG_* bits
int TargetLayerIndex;
int TargetMipLevel;
int TargetNumMipLevels;
int TargetPrevLayerIndex;
float RangeBasedCullingDistance;
int4 HZBTestViewRect;
float CullingViewScreenMultipleSq;
uint InstanceOcclusionQueryMask;
bool bUseLightingChannelMask;
uint LightingChannelMask; // Low 3 bits only (see UnpackNaniteView)
int SceneRendererPrimaryViewId;
float2 DynamicDepthCullRange;
};
// A single (instance, view) draw request.
struct FInstanceDraw
{
uint InstanceId;
uint ViewId;
};
// Interpolants passed from the Nanite fullscreen VS to the PS; all flat (nointerpolation).
struct FNaniteFullscreenVSToPS
{
#if INSTANCED_STEREO
nointerpolation uint EyeIndex : PACKED_EYE_INDEX;
#endif
nointerpolation uint ViewIndex : PACKED_VIEW_INDEX;
nointerpolation uint TileIndex : MACRO_TILE_INDEX;
};
// ---------------------------------------------------------------------------
// Shared Nanite resource bindings. Depending on the pass, these are aliased to
// the ray tracing, raster, or shading uniform buffer via #define; otherwise
// they are declared as loose shader parameters.
// ---------------------------------------------------------------------------
#if NANITE_USE_RAYTRACING_UNIFORM_BUFFER
#define PageConstants NaniteRayTracing.PageConstants
#define MaxNodes NaniteRayTracing.MaxNodes
#define ClusterPageData NaniteRayTracing.ClusterPageData
#define HierarchyBuffer NaniteRayTracing.HierarchyBuffer
#define RayTracingDataBuffer NaniteRayTracing.RayTracingDataBuffer
// These parameters shouldn't be used in RT shaders
#define RenderFlags 0
//uint MaxVisibleClusters;
//uint DebugFlags;
//ByteAddressBuffer VisibleClustersSWHW;
#else
#if NANITE_USE_RASTER_UNIFORM_BUFFER
#define PageConstants NaniteRaster.PageConstants
#define MaxNodes NaniteRaster.MaxNodes
#define MaxVisibleClusters NaniteRaster.MaxVisibleClusters
#define MaxPatchesPerGroup NaniteRaster.MaxPatchesPerGroup
#define MeshPass NaniteRaster.MeshPass
#define InvDiceRate NaniteRaster.InvDiceRate
#define RenderFlags NaniteRaster.RenderFlags
#define DebugFlags NaniteRaster.DebugFlags
#else
// Loose parameters (no uniform buffer available for this pass).
uint4 PageConstants; // .y = max streaming pages (see PackPoolClusterRef / GPUPageIndexToGPUOffset)
uint MaxNodes;
uint MaxVisibleClusters;
uint MaxPatchesPerGroup;
uint MeshPass;
float InvDiceRate;
uint RenderFlags;
uint DebugFlags;
#endif
#if NANITE_USE_SHADING_UNIFORM_BUFFER
#define ClusterPageData NaniteShading.ClusterPageData
#define VisibleClustersSWHW NaniteShading.VisibleClustersSWHW
#define HierarchyBuffer NaniteShading.HierarchyBuffer
#else // !NANITE_USE_SHADING_UNIFORM_BUFFER
ByteAddressBuffer ClusterPageData;
ByteAddressBuffer VisibleClustersSWHW;
ByteAddressBuffer HierarchyBuffer;
StructuredBuffer<uint> RayTracingDataBuffer;
#endif // !NANITE_USE_SHADING_UNIFORM_BUFFER
#endif
#if FEATURE_LEVEL >= FEATURE_LEVEL_SM6 || PLATFORM_SUPPORTS_SM6_0_WAVE_OPERATIONS // TODO: This header can be included from SM5 from BuildInstanceDrawCommands.usf. Refactor it.
HLSL_STATIC_ASSERT(sizeof(FInstanceDynamicData) == 132, "Unexpected size of FInstanceDynamicData. Update WaveReadLaneAt to reflect changes.");
// Broadcasts another lane's FInstanceDynamicData member-wise via wave ops.
FInstanceDynamicData WaveReadLaneAt(FInstanceDynamicData In, uint SrcIndex)
{
FInstanceDynamicData Result;
Result.LocalToTranslatedWorld = WaveReadLaneAtMatrix(In.LocalToTranslatedWorld, SrcIndex);
Result.PrevLocalToTranslatedWorld = WaveReadLaneAtMatrix(In.PrevLocalToTranslatedWorld, SrcIndex);
Result.bHasMoved = WaveReadLaneAt(In.bHasMoved, SrcIndex);
return Result;
}
#endif
// Converts view-space (linear) depth to clip-space Z using the projection
// terms of the view's ViewToClip matrix.
float ClipZFromLinearZ(FNaniteView NaniteView, float LinearZ)
{
	const float DepthScale = NaniteView.ViewToClip[2][2];
	const float DepthOffset = NaniteView.ViewToClip[3][2]; // TODO: Pack coefficients into single load?
	return DepthScale * LinearZ + DepthOffset;
}
// Packs a (PageIndex, ClusterIndex) pair into a flat index based on max clusters per page.
// Streaming pages (index < PageConstants.y) contribute NANITE_STREAMING_PAGE_MAX_CLUSTERS_BITS
// per page; pages beyond that (root pages) contribute NANITE_ROOT_PAGE_MAX_CLUSTERS_BITS.
// Inverse of UnpackPoolClusterRef.
uint PackPoolClusterRef(uint PageIndex, uint ClusterIndex)
{
const uint MaxStreamingPages = PageConstants.y;
// min/max split PageIndex into its streaming-page and root-page contributions.
return (min(PageIndex, MaxStreamingPages) << NANITE_STREAMING_PAGE_MAX_CLUSTERS_BITS) +
((uint)max((int)PageIndex - (int)MaxStreamingPages, 0) << NANITE_ROOT_PAGE_MAX_CLUSTERS_BITS) +
ClusterIndex;
}
// Inverse of PackPoolClusterRef: recovers (PageIndex, ClusterIndex) from the flat index.
// Refs below MaxStreamingClusters belong to streaming pages; the remainder maps to root pages.
void UnpackPoolClusterRef(uint PackedClusterRef, inout uint PageIndex, inout uint ClusterIndex)
{
const uint MaxStreamingPages = PageConstants.y;
const uint MaxStreamingClusters = MaxStreamingPages << NANITE_STREAMING_PAGE_MAX_CLUSTERS_BITS;
if (PackedClusterRef < MaxStreamingClusters)
{
// Streaming page range
PageIndex = PackedClusterRef >> NANITE_STREAMING_PAGE_MAX_CLUSTERS_BITS;
ClusterIndex = PackedClusterRef & ((1u << NANITE_STREAMING_PAGE_MAX_CLUSTERS_BITS) - 1u);
}
else
{
// Root page range (offset past all streaming pages)
PackedClusterRef -= MaxStreamingClusters;
PageIndex = MaxStreamingPages + (PackedClusterRef >> NANITE_ROOT_PAGE_MAX_CLUSTERS_BITS);
ClusterIndex = PackedClusterRef & ((1u << NANITE_ROOT_PAGE_MAX_CLUSTERS_BITS) - 1u);
}
}
// Serializes an FVisibleCluster into a uint4 bit stream (read back by UnpackVisibleCluster).
// Field order and widths must stay in sync with UnpackVisibleCluster.
// When bHasPageData is true, the SW rasterizer page rect is appended: two 13-bit
// vPage coordinates plus vPageEnd as a 3-bit per-axis delta from vPage.
uint4 PackVisibleCluster(FVisibleCluster VisibleCluster, bool bHasPageData)
{
uint4 RawData = 0;
uint BitPos = 0;
const uint PackedClusterRef = PackPoolClusterRef(VisibleCluster.PageIndex, VisibleCluster.ClusterIndex);
WriteBits(RawData, BitPos, VisibleCluster.Flags, NANITE_NUM_CULLING_FLAG_BITS);
WriteBits(RawData, BitPos, VisibleCluster.ViewId, NANITE_MAX_VIEWS_PER_CULL_RASTERIZE_PASS_BITS);
WriteBits(RawData, BitPos, VisibleCluster.InstanceId, NANITE_MAX_INSTANCES_BITS);
WriteBits(RawData, BitPos, PackedClusterRef, NANITE_POOL_CLUSTER_REF_BITS);
#if NANITE_EXTENDED_VISIBLE_CLUSTERS
WriteBits(RawData, BitPos, VisibleCluster.AssemblyTransformIndex, NANITE_ASSEMBLY_TRANSFORM_INDEX_BITS);
WriteBits(RawData, BitPos, VisibleCluster.DepthBucket, NANITE_NUM_DEPTH_BUCKETS_PER_BLOCK_BITS); // This is not needed for candidate clusters.
// We could make a separate CandidateCluster struct if/when this makes a difference.
#endif
if (bHasPageData)
{
WriteBits(RawData, BitPos, VisibleCluster.vPage.x, 13);
WriteBits(RawData, BitPos, VisibleCluster.vPage.y, 13);
// vPageEnd is stored as a wrapped 3-bit delta per axis relative to vPage
uint2 Delta = (VisibleCluster.vPageEnd - VisibleCluster.vPage) & 0x7;
WriteBits(RawData, BitPos, Delta.x, 3);
WriteBits(RawData, BitPos, Delta.y, 3);
}
return RawData;
}
// Deserializes an FVisibleCluster from the bit stream written by PackVisibleCluster.
// Field order and widths must stay in sync with PackVisibleCluster.
// When bHasPageData is true, the SW rasterizer page rect (vPage/vPageEnd) is also decoded;
// vPageEnd is stored as a 3-bit per-axis delta relative to vPage.
FVisibleCluster UnpackVisibleCluster(uint4 RawData, bool bHasPageData = false)
{
	uint BitPos = 0;
	FVisibleCluster VisibleCluster;
	VisibleCluster.Flags = ReadBits( RawData, BitPos, NANITE_NUM_CULLING_FLAG_BITS );
	VisibleCluster.ViewId = ReadBits( RawData, BitPos, NANITE_MAX_VIEWS_PER_CULL_RASTERIZE_PASS_BITS );
	VisibleCluster.InstanceId = ReadBits( RawData, BitPos, NANITE_MAX_INSTANCES_BITS );
	const uint PackedClusterRef = ReadBits( RawData, BitPos, NANITE_POOL_CLUSTER_REF_BITS );
#if NANITE_EXTENDED_VISIBLE_CLUSTERS
	VisibleCluster.AssemblyTransformIndex = ReadBits( RawData, BitPos, NANITE_ASSEMBLY_TRANSFORM_INDEX_BITS );
	VisibleCluster.DepthBucket = ReadBits( RawData, BitPos, NANITE_NUM_DEPTH_BUCKETS_PER_BLOCK_BITS );
#else
	VisibleCluster.AssemblyTransformIndex = 0xFFFFFFFFu; // Invalid index: not an assembly part
	VisibleCluster.DepthBucket = 0;
#endif
	UnpackPoolClusterRef( PackedClusterRef, VisibleCluster.PageIndex, VisibleCluster.ClusterIndex );
	if( bHasPageData )
	{
		VisibleCluster.vPage.x = ReadBits( RawData, BitPos, 13 );
		VisibleCluster.vPage.y = ReadBits( RawData, BitPos, 13 );
		VisibleCluster.vPageEnd.x = ReadBits( RawData, BitPos, 3 );
		VisibleCluster.vPageEnd.y = ReadBits( RawData, BitPos, 3 );
		VisibleCluster.vPageEnd += VisibleCluster.vPage;
	}
	else
	{
		VisibleCluster.vPage = 0;
		// FIX: also zero vPageEnd so the returned struct is never partially
		// uninitialized (it was previously left undefined on this path).
		VisibleCluster.vPageEnd = 0;
	}
	return VisibleCluster;
}
// Loads and unpacks a visible cluster record from the given buffer.
// Record size depends on the configuration: 16 or 12 bytes with extended
// visible clusters (with/without page data), otherwise 12 or 8 bytes.
FVisibleCluster GetVisibleCluster( ByteAddressBuffer VisibleClusters, uint ClusterIdx, bool bHasPageData = false )
{
uint4 RawData;
#if NANITE_EXTENDED_VISIBLE_CLUSTERS
if( bHasPageData )
RawData = VisibleClusters.Load4( ClusterIdx * 16 );
else
RawData = uint4( VisibleClusters.Load3( ClusterIdx * 12 ), 0 );
#else
if( bHasPageData )
RawData = uint4( VisibleClusters.Load3( ClusterIdx * 12 ), 0 );
else
RawData = uint4( VisibleClusters.Load2( ClusterIdx * 8 ), 0, 0 );
#endif
return UnpackVisibleCluster(RawData, bHasPageData);
}
// Loads a visible cluster from the bound VisibleClustersSWHW buffer.
// RT shaders have no visible cluster buffer, so they get a zeroed struct.
FVisibleCluster GetVisibleCluster( uint ClusterIdx, bool bHasPageData )
{
#if NANITE_USE_RAYTRACING_UNIFORM_BUFFER
return (FVisibleCluster)0;
#else
return GetVisibleCluster( VisibleClustersSWHW, ClusterIdx, bHasPageData );
#endif
}
// True when a packed visible-cluster index encodes an imposter draw rather
// than a stored cluster (imposters occupy the range at and above 1 << 24).
bool IsVisibleClusterIndexImposter(uint ClusterIndex)
{
#if NANITE_IMPOSTERS_SUPPORTED
	// Equivalent to ClusterIndex >= (1 << 24): any of the top 8 bits set.
	return (ClusterIndex >> 24) != 0u;
#else
	return false;
#endif
}
// Fetches and unpacks a visible cluster by flat index.
// When imposters are supported, indices with any of the top 8 bits set encode
// an imposter draw (instance id + low bit cluster selector) and are decoded inline.
FVisibleCluster GetVisibleCluster( uint ClusterIndex )
{
	FVisibleCluster VisibleCluster;
#if NANITE_IMPOSTERS_SUPPORTED
	if( IsVisibleClusterIndexImposter(ClusterIndex) )
	{
		// Couldn't have been stored so signals this is an imposter
		VisibleCluster.Flags = 1 << NANITE_NUM_CULLING_FLAG_BITS;
		VisibleCluster.ViewId = 0; // TODO
		VisibleCluster.InstanceId = BitFieldExtractU32( ClusterIndex, NANITE_MAX_INSTANCES_BITS - 1, 1 );
		VisibleCluster.AssemblyTransformIndex = 0xFFFFFFFFu; // FIX: same invalid-index literal as UnpackVisibleCluster (was uint32(-1))
		VisibleCluster.PageIndex = 0;
		VisibleCluster.ClusterIndex = ClusterIndex & 1;
		// FIX: initialize the remaining members, which were previously left
		// undefined on the imposter path.
		VisibleCluster.DepthBucket = 0;
		VisibleCluster.vPage = 0;
		VisibleCluster.vPageEnd = 0;
	}
	else
#endif
	{
		VisibleCluster = GetVisibleCluster( ClusterIndex, false );
	}
	return VisibleCluster;
}
// Whether AssemblyTransformIndex refers to a real assembly transform.
bool IsValidAssemblyTransformIndex(uint AssemblyTransformIndex)
{
	// NOTE: This should effectively cause all assembly transformation to DCE without having to #ifdef callsites
#if NANITE_ASSEMBLY_DATA
	const bool bInRange = (AssemblyTransformIndex < NANITE_MAX_ASSEMBLY_TRANSFORMS);
	return bInRange;
#else
	return false;
#endif
}
// A visible cluster is part of an assembly when it carries a valid transform index.
bool IsAssemblyPartCluster(FVisibleCluster VisibleCluster)
{
	const uint TransformIndex = VisibleCluster.AssemblyTransformIndex;
	return IsValidAssemblyTransformIndex(TransformIndex);
}
// Decodes the packed per-cluster header (NANITE_NUM_PACKED_CLUSTER_FLOAT4S uint4s)
// into an FCluster. Bit widths/offsets must stay in sync with the CPU-side builder.
// Also resolves the bone influence addresses, which live after the cluster array
// within the page.
FCluster UnpackCluster(uint4 ClusterData[NANITE_NUM_PACKED_CLUSTER_FLOAT4S], FPageHeader PageHeader, uint PageBaseAddress, uint LocalClusterIndex)
{
FCluster Cluster;
Cluster.PageBaseAddress = PageBaseAddress;
Cluster.NumVerts = BitFieldExtractU32(ClusterData[0].x, 14, 0);
Cluster.PositionOffset = BitFieldExtractU32(ClusterData[0].x, 18, 14);
Cluster.NumTris = BitFieldExtractU32(ClusterData[0].y, 8, 0);
Cluster.IndexOffset = BitFieldExtractU32(ClusterData[0].y, 24, 8);
Cluster.ColorMin = ClusterData[0].z;
Cluster.ColorBits = BitFieldExtractU32(ClusterData[0].w, 16, 0);
Cluster.GroupIndex = BitFieldExtractU32(ClusterData[0].w, 16, 16); // Debug only
Cluster.PosStart = ClusterData[1].xyz;
Cluster.BitsPerIndex = BitFieldExtractU32(ClusterData[1].w, 3, 0) + 1;
Cluster.PosPrecision = (int)BitFieldExtractU32(ClusterData[1].w, 6, 3) + NANITE_MIN_POSITION_PRECISION;
Cluster.PosBits.x = BitFieldExtractU32(ClusterData[1].w, 5, 9);
Cluster.PosBits.y = BitFieldExtractU32(ClusterData[1].w, 5, 14);
Cluster.PosBits.z = BitFieldExtractU32(ClusterData[1].w, 5, 19);
Cluster.NormalPrecision = BitFieldExtractU32(ClusterData[1].w, 4, 24);
Cluster.TangentPrecision = BitFieldExtractU32(ClusterData[1].w, 4, 28);
// Exact powers of two built by adjusting the float exponent field directly:
// PosScale = 2^-PosPrecision, PosRcpScale = 2^PosPrecision.
Cluster.PosScale = asfloat(asint(1.0f) - (Cluster.PosPrecision << 23));
Cluster.PosRcpScale = asfloat(asint(1.0f) + (Cluster.PosPrecision << 23));
Cluster.LODBounds = asfloat(ClusterData[2]);
Cluster.BoxBoundsCenter = asfloat(ClusterData[3].xyz);
Cluster.LODError = f16tof32(ClusterData[3].w);
Cluster.EdgeLength = f16tof32(ClusterData[3].w >> 16);
Cluster.BoxBoundsExtent = asfloat(ClusterData[4].xyz);
Cluster.Flags = BitFieldExtractU32(ClusterData[4].w, 4, 0);
Cluster.NumClusterBoneInfluences = BitFieldExtractU32(ClusterData[4].w, 5, 4);
Cluster.AttributeOffset = BitFieldExtractU32(ClusterData[5].x, 22, 0);
Cluster.BitsPerAttribute = BitFieldExtractU32(ClusterData[5].x, 10, 22);
Cluster.DecodeInfoOffset = BitFieldExtractU32(ClusterData[5].y, 22, 0);
Cluster.bHasTangents = BitFieldExtractU32(ClusterData[5].y, 1, 22);
Cluster.bSkinning = BitFieldExtractU32(ClusterData[5].y, 1, 23);
Cluster.bVoxel = (Cluster.NumTris == 0) && NANITE_VOXEL_DATA;
Cluster.NumUVs = BitFieldExtractU32(ClusterData[5].y, 3, 24);
Cluster.ColorMode = BitFieldExtractU32(ClusterData[5].y, 1, 27);
Cluster.UVBitOffsets = ClusterData[5].z;
const uint MaterialEncoding = ClusterData[5].w;
Cluster.ExtendedDataOffset = BitFieldExtractU32(ClusterData[6].x, 22, 0);
Cluster.ExtendedDataNum = BitFieldExtractU32(ClusterData[6].x, 10, 22);
Cluster.BrickDataOffset = BitFieldExtractU32(ClusterData[6].y, 22, 0);
Cluster.BrickDataNum = BitFieldExtractU32(ClusterData[6].y, 10, 22);
// Material Table Range Encoding (32 bits)
// uint TriStart : 8; // max 128 triangles
// uint TriLength : 8; // max 128 triangles
// uint MaterialIndex : 6; // max 64 materials
// uint Padding : 10;
// Material Packed Range - Fast Path (32 bits)
// uint Material0Index : 6; // max 64 materials (0:Material0Length)
// uint Material1Index : 6; // max 64 materials (Material0Length:Material1Length)
// uint Material2Index : 6; // max 64 materials (remainder)
// uint Material0Length : 7; // max 128 triangles (num minus one)
// uint Material1Length : 7; // max 64 triangles (materials are sorted, so at most 128/2)
// Material Packed Range - Slow Path (32 bits)
// uint BufferIndex : 19; // 2^19 max value (tons, it's per prim)
// uint BufferLength : 6; // max 64 ranges (num minus one)
// uint Padding : 7; // always 127 for slow path. corresponds to Material1Length=127 in fast path
BRANCH
if (MaterialEncoding < 0xFE000000u)
{
// Fast inline path
Cluster.MaterialTableOffset = 0;
Cluster.MaterialTableLength = 0;
Cluster.Material0Index = BitFieldExtractU32(MaterialEncoding, 6, 0);
Cluster.Material1Index = BitFieldExtractU32(MaterialEncoding, 6, 6);
Cluster.Material2Index = BitFieldExtractU32(MaterialEncoding, 6, 12);
Cluster.Material0Length = BitFieldExtractU32(MaterialEncoding, 7, 18) + 1;
Cluster.Material1Length = BitFieldExtractU32(MaterialEncoding, 7, 25);
Cluster.VertReuseBatchCountTableOffset = 0;
Cluster.VertReuseBatchCountTableSize = 0;
Cluster.VertReuseBatchInfo = ClusterData[7];
}
else
{
// Slow global search path
Cluster.MaterialTableOffset = BitFieldExtractU32(MaterialEncoding, 19, 0);
Cluster.MaterialTableLength = BitFieldExtractU32(MaterialEncoding, 6, 19) + 1;
Cluster.Material0Index = 0;
Cluster.Material1Index = 0;
Cluster.Material2Index = 0;
Cluster.Material0Length = 0;
Cluster.Material1Length = 0;
Cluster.VertReuseBatchCountTableOffset = ClusterData[7].x;
Cluster.VertReuseBatchCountTableSize = ClusterData[7].y;
Cluster.VertReuseBatchInfo = 0;
}
Cluster.MaterialTotalLength = Cluster.bVoxel ? Cluster.BrickDataNum : Cluster.NumTris;
// Bone influence data follows the SOA cluster array within the page.
const uint ClusterBoneInfluenceAddress = PageBaseAddress + NANITE_GPU_PAGE_HEADER_SIZE + NANITE_NUM_PACKED_CLUSTER_FLOAT4S * 16 * PageHeader.NumClusters;
uint VoxelBoneInfluenceAddress = ClusterBoneInfluenceAddress + PageHeader.NumClusters * PageHeader.MaxClusterBoneInfluences * (uint)sizeof(FClusterBoneInfluence);
VoxelBoneInfluenceAddress = (VoxelBoneInfluenceAddress + 15) & ~15u; // Align to match builder behavior. //VOXELTODO: We don't seem to actually need more than 4 byte alignment. Fix the builder and save the ~8 bytes per page instead?
// TODO: Unify these two paths and formats into one if/when we finally kill NANITE_USE_PRECISE_SKINNING_BOUNDS
if (Cluster.bVoxel)
{
Cluster.ClusterBoneInfluenceAddress = VoxelBoneInfluenceAddress + LocalClusterIndex * 4;
Cluster.ClusterBoneInfluenceStride = PageHeader.NumClusters * 4;
}
else
{
Cluster.ClusterBoneInfluenceAddress = ClusterBoneInfluenceAddress + LocalClusterIndex * (uint)sizeof(FClusterBoneInfluence);
Cluster.ClusterBoneInfluenceStride = PageHeader.NumClusters * (uint)sizeof(FClusterBoneInfluence);
}
return Cluster;
}
// Byte offset of a GPU page within the cluster page pool. Streaming pages come
// first (each 1 << NANITE_STREAMING_PAGE_GPU_SIZE_BITS bytes), followed by root
// pages (each 1 << NANITE_ROOT_PAGE_GPU_SIZE_BITS bytes).
uint GPUPageIndexToGPUOffset(uint PageIndex)
{
	const uint MaxStreamingPages = PageConstants.y;
	const uint StreamingPart = min(PageIndex, MaxStreamingPages) << NANITE_STREAMING_PAGE_GPU_SIZE_BITS;
	const uint NumRootPages = (uint)max((int)PageIndex - (int)MaxStreamingPages, 0);
	return StreamingPart + (NumRootPages << NANITE_ROOT_PAGE_GPU_SIZE_BITS);
}
// Decodes the packed GPU page header: cluster count in the low 16 bits of .x,
// followed by two 8-bit max bone influence counts.
FPageHeader UnpackPageHeader(uint4 Data)
{
	FPageHeader Header = (FPageHeader)0;
	Header.NumClusters = BitFieldExtractU32(Data.x, 16, 0);
	Header.MaxClusterBoneInfluences = BitFieldExtractU32(Data.x, 8, 16);
	Header.MaxVoxelBoneInfluences = BitFieldExtractU32(Data.x, 8, 24);
	return Header;
}
// Loads and decodes the page header at the start of a page (read-only buffer).
FPageHeader GetPageHeader(ByteAddressBuffer InputBuffer, uint PageAddress)
{
	const uint4 PackedHeader = InputBuffer.Load4(PageAddress);
	return UnpackPageHeader(PackedHeader);
}
// Loads and decodes the page header at the start of a page (read-write buffer).
FPageHeader GetPageHeader(RWByteAddressBuffer InputBuffer, uint PageAddress)
{
	const uint4 PackedHeader = InputBuffer.Load4(PageAddress);
	return UnpackPageHeader(PackedHeader);
}
// Gathers one cluster's packed header from the page's SOA layout and unpacks it.
// Each of the NANITE_NUM_PACKED_CLUSTER_FLOAT4S uint4s is stored in its own
// array of NumClusters entries (stride = NumClusters * 16 bytes).
FCluster GetCluster(ByteAddressBuffer InputBuffer, FPageHeader PageHeader, uint SrcBaseOffset, uint ClusterIndex)
{
const uint ClusterSOAStride = (PageHeader.NumClusters << 4);
const uint ClusterBaseAddress = SrcBaseOffset + ( ClusterIndex << 4 );
uint4 ClusterData[NANITE_NUM_PACKED_CLUSTER_FLOAT4S];
UNROLL
for(int i = 0; i < NANITE_NUM_PACKED_CLUSTER_FLOAT4S; i++)
{
ClusterData[i] = InputBuffer.Load4( ClusterBaseAddress + i * ClusterSOAStride + NANITE_GPU_PAGE_HEADER_SIZE ); // Adding NANITE_GPU_PAGE_HEADER_SIZE inside the loop prevents compiler confusion about offset modifier and generates better code
}
return UnpackCluster(ClusterData, PageHeader, SrcBaseOffset, ClusterIndex);
}
// RWByteAddressBuffer variant of GetCluster above; same SOA gather and unpack.
FCluster GetCluster(RWByteAddressBuffer InputBuffer, FPageHeader PageHeader, uint PageBaseAddress, uint ClusterIndex)
{
const uint ClusterSOAStride = (PageHeader.NumClusters << 4);
const uint ClusterBaseAddress = PageBaseAddress + (ClusterIndex << 4);
uint4 ClusterData[NANITE_NUM_PACKED_CLUSTER_FLOAT4S];
UNROLL
for (int i = 0; i < NANITE_NUM_PACKED_CLUSTER_FLOAT4S; i++)
{
ClusterData[i] = InputBuffer.Load4( ClusterBaseAddress + i * ClusterSOAStride + NANITE_GPU_PAGE_HEADER_SIZE ); // Adding NANITE_GPU_PAGE_HEADER_SIZE inside the loop prevents compiler confusion about offset modifier and generates better code
}
return UnpackCluster(ClusterData, PageHeader, PageBaseAddress, ClusterIndex);
}
// Convenience overload: resolves the page address from the page index and
// decodes the cluster from the bound ClusterPageData buffer.
FCluster GetCluster(uint PageIndex, uint ClusterIndex)
{
	const uint PageBaseAddress = GPUPageIndexToGPUOffset(PageIndex);
	const FPageHeader PageHeader = GetPageHeader(ClusterPageData, PageBaseAddress);
	return GetCluster(ClusterPageData, PageHeader, PageBaseAddress, ClusterIndex);
}
// Decodes one child slice of a BVH hierarchy node from its four raw records
// (three uint4s of bounds/LOD data plus a uint2 of child/page metadata).
FHierarchyNodeSlice UnpackHierarchyNodeSlice(uint4 RawData0, uint4 RawData1, uint4 RawData2, uint2 RawData3)
{
const uint4 Misc0 = RawData1;
const uint4 Misc1 = RawData2;
const uint2 Misc2 = RawData3;
FHierarchyNodeSlice Node;
Node.LODBounds = asfloat(RawData0);
Node.BoxBoundsCenter = asfloat(Misc0.xyz);
Node.BoxBoundsExtent = asfloat(Misc1.xyz);
Node.MinLODError = f16tof32(Misc0.w);
Node.MaxParentLODError = f16tof32(Misc0.w >> 16);
Node.ChildStartReference = Misc1.w; // When changing this, remember to also update StoreHierarchyNodeChildStartReference
Node.bLoaded = (Misc1.w != 0xFFFFFFFFu); // 0xFFFFFFFFu = child data not streamed in yet
Node.NumChildren = BitFieldExtractU32(Misc2.x, NANITE_MAX_CLUSTERS_PER_GROUP_BITS, 0);
Node.NumPages = BitFieldExtractU32(Misc2.x, NANITE_MAX_GROUP_PARTS_BITS, NANITE_MAX_CLUSTERS_PER_GROUP_BITS);
Node.StartPageIndex = BitFieldExtractU32(Misc2.x, NANITE_MAX_RESOURCE_PAGES_BITS, NANITE_MAX_CLUSTERS_PER_GROUP_BITS + NANITE_MAX_GROUP_PARTS_BITS);
// Misc2.x doubles as sentinel storage: 0 = disabled slice, 0xFFFFFFFFu = non-leaf
Node.bEnabled = Misc2.x != 0u;
Node.bLeaf = Misc2.x != 0xFFFFFFFFu;
Node.AssemblyTransformIndex = Misc2.y;
return Node;
}
// Dword offset of hierarchy node NodeIndex relative to the resource's root node offset.
uint GetHierarchyNodeOffset(uint RootOffset, uint NodeIndex)
{
	const uint NodeSizeDwords = NANITE_HIERARCHY_NODE_SLICE_SIZE_DWORDS;
	return RootOffset + (NodeIndex * NodeSizeDwords);
}
// Loads one child slice of a hierarchy node. The node stores its per-child
// records in separate SOA sections of NANITE_MAX_BVH_NODE_FANOUT entries each.
FHierarchyNodeSlice GetHierarchyNodeSlice(ByteAddressBuffer InputBuffer, uint NodeOffset, uint ChildIndex)
{
// NOTE: Offset is expected in dwords
const uint BaseAddress = NodeOffset * 4;
const uint4 RawData0 = InputBuffer.Load4(BaseAddress + 16 * ChildIndex);
const uint4 RawData1 = InputBuffer.Load4(BaseAddress + (NANITE_MAX_BVH_NODE_FANOUT * 16) + 16 * ChildIndex);
const uint4 RawData2 = InputBuffer.Load4(BaseAddress + (NANITE_MAX_BVH_NODE_FANOUT * 32) + 16 * ChildIndex);
#if NANITE_ASSEMBLY_DATA
const uint2 RawData3 = InputBuffer.Load2(BaseAddress + (NANITE_MAX_BVH_NODE_FANOUT * 48) + 8 * ChildIndex);
#else
// No assembly data: the second metadata dword is synthesized as an invalid transform index.
const uint2 RawData3 = uint2(InputBuffer.Load(BaseAddress + (NANITE_MAX_BVH_NODE_FANOUT * 48) + 4 * ChildIndex), 0xFFFFFFFFu);
#endif
return UnpackHierarchyNodeSlice(RawData0, RawData1, RawData2, RawData3);
}
// RWByteAddressBuffer variant of GetHierarchyNodeSlice above; same layout.
FHierarchyNodeSlice GetHierarchyNodeSlice(RWByteAddressBuffer InputBuffer, uint NodeOffset, uint ChildIndex)
{
// NOTE: Offset is expected in dwords
const uint BaseAddress = NodeOffset * 4;
const uint4 RawData0 = InputBuffer.Load4(BaseAddress + 16 * ChildIndex);
const uint4 RawData1 = InputBuffer.Load4(BaseAddress + (NANITE_MAX_BVH_NODE_FANOUT * 16) + 16 * ChildIndex);
const uint4 RawData2 = InputBuffer.Load4(BaseAddress + (NANITE_MAX_BVH_NODE_FANOUT * 32) + 16 * ChildIndex);
#if NANITE_ASSEMBLY_DATA
const uint2 RawData3 = InputBuffer.Load2(BaseAddress + (NANITE_MAX_BVH_NODE_FANOUT * 48) + 8 * ChildIndex);
#else
// No assembly data: the second metadata dword is synthesized as an invalid transform index.
const uint2 RawData3 = uint2(InputBuffer.Load(BaseAddress + (NANITE_MAX_BVH_NODE_FANOUT * 48) + 4 * ChildIndex), 0xFFFFFFFFu);
#endif
return UnpackHierarchyNodeSlice(RawData0, RawData1, RawData2, RawData3);
}
// Convenience overload reading from the bound hierarchy buffer.
FHierarchyNodeSlice GetHierarchyNodeSlice(uint NodeOffset, uint ChildIndex)
{
	const FHierarchyNodeSlice Slice = GetHierarchyNodeSlice(HierarchyBuffer, NodeOffset, ChildIndex);
	return Slice;
}
// Patches a child slice's ChildStartReference in place. NodeOffset is in dwords;
// the field lives in the .w component (byte 12) of the third 16-byte record
// (RawData2/Misc1) for this child -- keep in sync with UnpackHierarchyNodeSlice.
void StoreHierarchyNodeChildStartReference(RWByteAddressBuffer OutputBuffer, uint NodeOffset, uint ChildIndex, uint ChildStartReference)
{
	const uint NodeBaseAddress = NodeOffset * 4;
	const uint FieldAddress = NodeBaseAddress + (NANITE_MAX_BVH_NODE_FANOUT * 32) + (16 * ChildIndex) + 12;
	OutputBuffer.Store(FieldAddress, ChildStartReference);
}
// Decode triangle that is represented by one base index and two 5-bit offsets.
// Voxel clusters have no index data; they return a degenerate (i,i,i) triple.
uint3 DecodeTriangleIndices(FCluster Cluster, uint TriIndex)
{
if( Cluster.bVoxel )
return uint3( TriIndex, TriIndex, TriIndex );
const uint BitsPerTriangle = Cluster.BitsPerIndex + 2 * 5;
// Stream reader is seeded with the worst-case read size (8-bit base + two 5-bit deltas).
FBitStreamReaderState BitStreamReader = BitStreamReader_Create_Aligned(Cluster.PageBaseAddress + Cluster.IndexOffset, TriIndex * BitsPerTriangle, 8 + 2*5);
uint BaseIndex = BitStreamReader_Read_RO(ClusterPageData, BitStreamReader, Cluster.BitsPerIndex, 8);
uint Delta0 = BitStreamReader_Read_RO(ClusterPageData, BitStreamReader, 5, 5);
uint Delta1 = BitStreamReader_Read_RO(ClusterPageData, BitStreamReader, 5, 5);
return BaseIndex + uint3(0, Delta0, Delta1);
}
// Decodes a 20-byte voxel brick record from the cluster's brick data stream.
FBrick DecodeBrick(FCluster Cluster, uint BrickIndex)
{
FBrick Brick;
const uint4 BrickData0 = ClusterPageData.Load4( Cluster.PageBaseAddress + Cluster.BrickDataOffset + 20 * BrickIndex );
const uint BrickData1 = ClusterPageData.Load ( Cluster.PageBaseAddress + Cluster.BrickDataOffset + 20 * BrickIndex + 16 );
Brick.ReverseBrickBits = BrickData0.xy;
// Dimensions: three 2-bit fields, biased by one (range [1,4])
Brick.BrickMax = uint3(BitFieldExtractU32( BrickData0.z, 2, 0 ),
BitFieldExtractU32( BrickData0.z, 2, 2 ),
BitFieldExtractU32( BrickData0.z, 2, 4 ) ) + 1u;
// Signed 19-bit start position; Y straddles the dword boundary, hence the BitAlignU32
Brick.StartPos = int3( BitFieldExtractI32( BrickData0.z, 19, 6 ),
BitFieldExtractI32( BitAlignU32( BrickData0.w, BrickData0.z, 25 ), 19, 0 ),
BitFieldExtractI32( BrickData0.w, 19, 12 ) );
Brick.VertOffset = BrickData1;
return Brick;
}
// Per-pixel shading classification, packed into 24 bits by PackShadingMask.
struct FShadingMask
{
bool bIsNanitePixel; // Bit 0: pixel was written by Nanite
bool bIsDecalReceiver; // Bit 18
bool bHasDistanceField; // Bit 19
uint LightingChannels; // Bits 1-3
uint ShadingBin; // Bits 4-17
uint ShadingRate; // Bits 20-23 (2x2 tier2 VRS)
};
// Packs the shading classification into a single uint.
// Layout: [0]=IsNanite (always set), [1:3]=LightingChannels, [4:17]=ShadingBin,
// [18]=DecalReceiver, [19]=HasDistanceField, [20:23]=ShadingRate.
// Inverse of UnpackShadingMask.
uint PackShadingMask(
uint ShadingBin,
uint ShadingRate,
bool bIsDecalReceiver,
bool bHasDistanceField,
uint LightingChannels
)
{
uint Packed = 0x1; // Is Nanite
Packed |= (BitFieldMaskU32(3, 1) & (LightingChannels << 1u)); // 3 bits for channels 0,1,2
Packed |= (BitFieldMaskU32(14, 4) & (ShadingBin << 4u));
Packed |= select(bIsDecalReceiver, 1u << 18u, 0u);
Packed |= select(bHasDistanceField, 1u << 19u, 0u);
Packed |= (BitFieldMaskU32(4, 20) & (ShadingRate << 20u)); // 4 bits for 2x2 tier2 VRS
return Packed;
}
// Struct convenience overload: forwards the fields to the scalar packer.
uint PackShadingMask(FShadingMask Mask)
{
	const uint PackedMask = PackShadingMask(
		Mask.ShadingBin,
		Mask.ShadingRate,
		Mask.bIsDecalReceiver,
		Mask.bHasDistanceField,
		Mask.LightingChannels);
	return PackedMask;
}
// Inverse of PackShadingMask. Layout: [0]=IsNanite, [1:3]=LightingChannels,
// [4:17]=ShadingBin, [18]=DecalReceiver, [19]=HasDistanceField, [20:23]=ShadingRate.
FShadingMask UnpackShadingMask(uint Packed)
{
	FShadingMask Mask;
	Mask.bIsNanitePixel = BitFieldExtractU32(Packed, 1, 0) != 0;
	Mask.LightingChannels = BitFieldExtractU32(Packed, 3, 1);
	Mask.ShadingBin = BitFieldExtractU32(Packed, 14, 4);
	Mask.bIsDecalReceiver = BitFieldExtractU32(Packed, 1, 18) != 0;
	Mask.bHasDistanceField = BitFieldExtractU32(Packed, 1, 19) != 0;
	Mask.ShadingRate = BitFieldExtractU32(Packed, 4, 20);
	return Mask;
}
// Unpacks a 64-bit visibility buffer pixel: high dword is depth, low dword packs
// (VisibleClusterIndex + 1) << 7 | TriIndex. The +1 bias reserves 0 for "no cluster",
// hence the trailing decrement.
void UnpackVisPixel(
UlongType Pixel,
out uint DepthInt,
out uint VisibleClusterIndex,
out uint TriIndex
)
{
const uint2 Unpacked = UnpackUlongType(Pixel);
VisibleClusterIndex = Unpacked.x >> 7;
TriIndex = Unpacked.x & 0x7F;
DepthInt = Unpacked.y;
VisibleClusterIndex--;
}
// Variant of UnpackVisPixel that also reports whether the pixel was rasterized
// from an imposter (top bit of the low dword, when imposters are supported).
void UnpackVisPixel(
UlongType Pixel,
out uint DepthInt,
out uint VisibleClusterIndex,
out uint TriIndex,
out bool bIsImposter
)
{
const uint2 Unpacked = UnpackUlongType(Pixel);
VisibleClusterIndex = Unpacked.x >> 7;
TriIndex = Unpacked.x & 0x7F;
DepthInt = Unpacked.y;
#if NANITE_IMPOSTERS_SUPPORTED
bIsImposter = (Unpacked.x >> 31);
#else
bIsImposter = false;
#endif
// Cluster index is stored with a +1 bias (0 = no cluster)
VisibleClusterIndex--;
}
// Unpacks a 64-bit debug pixel: high dword is depth, low dword is the debug payload.
void UnpackDbgPixel(
	UlongType Pixel,
	out uint DepthInt,
	out uint DebugValue
)
{
	const uint2 ValueAndDepth = UnpackUlongType(Pixel);
	DebugValue = ValueAndDepth.x;
	DepthInt = ValueAndDepth.y;
}
// Reads the quantized (unsigned) cluster-space position of a vertex from the
// bit-packed position stream. Components are PosBits.x/y/z bits wide and the
// stream is not dword aligned, so two shifted dwords are extracted in sequence.
uint3 GetClusterPosition(uint VertIndex, FCluster Cluster)
{
const uint BitsPerVertex = Cluster.PosBits.x + Cluster.PosBits.y + Cluster.PosBits.z;
const uint BitOffset = MulU24( VertIndex, BitsPerVertex );
// Load 96 bits covering the vertex and funnel-shift to the start bit
uint3 Data = ClusterPageData.Load3(Cluster.PageBaseAddress + Cluster.PositionOffset + ((BitOffset >> 5) << 2));
uint2 Packed = uint2(BitAlignU32(Data.y, Data.x, BitOffset), BitAlignU32(Data.z, Data.y, BitOffset));
uint3 Pos;
Pos.x = BitFieldExtractU32(Packed.x, Cluster.PosBits.x, 0);
// Shift consumed bits out before extracting the next component
Packed.x = BitAlignU32(Packed.y, Packed.x, Cluster.PosBits.x);
Packed.y >>= Cluster.PosBits.x;
Pos.y = BitFieldExtractU32(Packed.x, Cluster.PosBits.y, 0);
Packed.x = BitAlignU32(Packed.y, Packed.x, Cluster.PosBits.y);
Pos.z = BitFieldExtractU32(Packed.x, Cluster.PosBits.z, 0);
return Pos;
}
// Returns the vertex position in cluster-local float space: either raw floats
// (uncompressed path) or dequantized via PosStart/PosScale.
float3 DecodePosition(uint VertIndex, FCluster Cluster)
{
#if NANITE_USE_UNCOMPRESSED_VERTEX_DATA
return asfloat(ClusterPageData.Load3(Cluster.PageBaseAddress + Cluster.PositionOffset + VertIndex * 12));
#else
const uint3 ClusterPos = GetClusterPosition(VertIndex, Cluster);
return ((int3)ClusterPos + Cluster.PosStart) * Cluster.PosScale;
#endif
}
// Expands an FPackedNaniteView (GPU upload format) into the working FNaniteView.
// Large-world coordinates are reconstituted as double-float (DF) types from
// their high/low parts.
FNaniteView UnpackNaniteView(FPackedNaniteView PackedView)
{
const float3 ViewOriginHigh =
{
PackedView.ViewOriginHighX,
PackedView.ViewOriginHighY,
PackedView.ViewOriginHighZ
};
FNaniteView NaniteView;
NaniteView.SVPositionToTranslatedWorld = PackedView.SVPositionToTranslatedWorld;
NaniteView.ViewToTranslatedWorld = PackedView.ViewToTranslatedWorld;
NaniteView.ViewOriginHigh = ViewOriginHigh;
NaniteView.TranslatedWorldToView = PackedView.TranslatedWorldToView;
NaniteView.TranslatedWorldToClip = PackedView.TranslatedWorldToClip;
NaniteView.ViewToClip = PackedView.ViewToClip;
NaniteView.ClipToWorld = MakeDFMatrix(ViewOriginHigh, PackedView.ClipToRelativeWorld);
NaniteView.PrevTranslatedWorldToView = PackedView.PrevTranslatedWorldToView;
NaniteView.PrevTranslatedWorldToClip = PackedView.PrevTranslatedWorldToClip;
NaniteView.PrevViewToClip = PackedView.PrevViewToClip;
NaniteView.PrevClipToWorld = MakeDFMatrix(ViewOriginHigh, PackedView.PrevClipToRelativeWorld);
NaniteView.TranslatedGlobalClipPlane = PackedView.TranslatedGlobalClipPlane;
NaniteView.ViewRect = PackedView.ViewRect;
NaniteView.ViewSizeAndInvSize = PackedView.ViewSizeAndInvSize;
NaniteView.ClipSpaceScaleOffset = PackedView.ClipSpaceScaleOffset;
NaniteView.MaterialCacheUnwrapMinAndInvSize = PackedView.MaterialCacheUnwrapMinAndInvSize;
NaniteView.MaterialCachePageAdvanceAndInvCount = PackedView.MaterialCachePageAdvanceAndInvCount;
NaniteView.PreViewTranslation = MakeDFVector3(PackedView.PreViewTranslationHigh, PackedView.PreViewTranslationLow);
NaniteView.PrevPreViewTranslation = MakeDFVector3(PackedView.PrevPreViewTranslationHigh, PackedView.PrevPreViewTranslationLow);
NaniteView.WorldCameraOrigin = MakeDFVector3(ViewOriginHigh, PackedView.ViewOriginLow);
NaniteView.CullingViewOriginTranslatedWorld = PackedView.CullingViewOriginTranslatedWorld;
NaniteView.ViewForward = PackedView.ViewForward;
NaniteView.NearPlane = PackedView.NearPlane;
NaniteView.LODScale = PackedView.LODScales.x;
NaniteView.LODScaleHW = PackedView.LODScales.y;
NaniteView.CullingViewMinRadiusTestFactorSq = PackedView.CullingViewMinRadiusTestFactorSq;
NaniteView.CullingViewScreenMultipleSq = PackedView.CullingViewScreenMultipleSq;
// Priority category and view flags share a single packed uint
NaniteView.StreamingPriorityCategory = PackedView.StreamingPriorityCategory_AndFlags & NANITE_STREAMING_PRIORITY_CATEGORY_MASK;
NaniteView.Flags = PackedView.StreamingPriorityCategory_AndFlags >> NANITE_NUM_STREAMING_PRIORITY_CATEGORY_BITS;
NaniteView.TargetLayerIndex = PackedView.TargetLayerIdX_AndMipLevelY_AndNumMipLevelsZ.x;
NaniteView.TargetMipLevel = PackedView.TargetLayerIdX_AndMipLevelY_AndNumMipLevelsZ.y;
NaniteView.TargetNumMipLevels = PackedView.TargetLayerIdX_AndMipLevelY_AndNumMipLevelsZ.z;
NaniteView.TargetPrevLayerIndex = PackedView.TargetLayerIdX_AndMipLevelY_AndNumMipLevelsZ.w;
NaniteView.RangeBasedCullingDistance = PackedView.RangeBasedCullingDistance;
NaniteView.HZBTestViewRect = PackedView.HZBTestViewRect;
NaniteView.InstanceOcclusionQueryMask = PackedView.InstanceOcclusionQueryMask;
NaniteView.bUseLightingChannelMask = (PackedView.LightingChannelMask & 0x8u) > 0; // 0b1000 with COMPILER_SUPPORTS_HLSL2021
NaniteView.LightingChannelMask = (PackedView.LightingChannelMask & 0x7u); // 0b0111
// 3x3 first-person transform stored as packed fp16 pairs plus one loose fp32-as-fp16 element
NaniteView.FirstPersonTransform = float3x3(
f16tof32(PackedView.FirstPersonTransformRowsExceptRow2Z.x ), f16tof32(PackedView.FirstPersonTransformRowsExceptRow2Z.x >> 16u), f16tof32(PackedView.FirstPersonTransformRowsExceptRow2Z.y ),
f16tof32(PackedView.FirstPersonTransformRowsExceptRow2Z.y >> 16u), f16tof32(PackedView.FirstPersonTransformRowsExceptRow2Z.z ), f16tof32(PackedView.FirstPersonTransformRowsExceptRow2Z.z >> 16u),
f16tof32(PackedView.FirstPersonTransformRowsExceptRow2Z.w ), f16tof32(PackedView.FirstPersonTransformRowsExceptRow2Z.w >> 16u), f16tof32(PackedView.FirstPersonTransformRow2Z));
NaniteView.SceneRendererPrimaryViewId = PackedView.SceneRendererPrimaryViewId;
NaniteView.DynamicDepthCullRange = PackedView.DynamicDepthCullRange;
return NaniteView;
}
// Per-pass array of packed views, unpacked on demand via UnpackNaniteView.
StructuredBuffer< FPackedNaniteView > InViews;
// Builds the FNaniteView used by culling/rasterization for a given view index.
// Two compile-time paths:
//  - NANITE_USE_VIEW_UNIFORM_BUFFER: synthesizes the view from the (optionally instanced)
//    view uniform buffer and fills the Nanite-only fields with conservative defaults.
//  - otherwise: unpacks the view from the InViews structured buffer
//    (always element 0 unless NANITE_MULTI_VIEW is enabled).
FNaniteView GetNaniteView( uint ViewIndex )
{
#if NANITE_USE_VIEW_UNIFORM_BUFFER
#if INSTANCED_STEREO
	ViewState LocalView = GetInstancedView(ViewIndex);
#else
	ViewState LocalView = GetPrimaryView();
#endif
	FNaniteView NaniteView;
	NaniteView.SVPositionToTranslatedWorld = LocalView.SVPositionToTranslatedWorld;
	NaniteView.ViewToTranslatedWorld = LocalView.ViewToTranslatedWorld;
	NaniteView.ViewOriginHigh = LocalView.ViewOriginHigh;
	NaniteView.TranslatedWorldToView = LocalView.TranslatedWorldToView;
	NaniteView.TranslatedWorldToClip = LocalView.TranslatedWorldToClip;
	NaniteView.ViewToClip = LocalView.ViewToClip;
	NaniteView.ClipToWorld = LocalView.ClipToWorld;
	NaniteView.PrevTranslatedWorldToView = LocalView.PrevTranslatedWorldToView;
	NaniteView.PrevTranslatedWorldToClip = LocalView.PrevTranslatedWorldToClip;
	NaniteView.PrevViewToClip = LocalView.PrevViewToClip;
	NaniteView.PrevClipToWorld = LocalView.PrevClipToWorld;
	NaniteView.TranslatedGlobalClipPlane = LocalView.GlobalClippingPlane;
	NaniteView.ViewSizeAndInvSize = LocalView.ViewSizeAndInvSize;
	// ViewRectMin holds texel coords with a half-texel offset; +0.5f rounds back to integer texels.
	NaniteView.ViewRect = int4(int2(LocalView.ViewRectMin.xy + 0.5f), int2(LocalView.ViewRectMin.xy + LocalView.ViewSizeAndInvSize.xy + 0.5f));
	NaniteView.PreViewTranslation = LocalView.PreViewTranslation;
	NaniteView.PrevPreViewTranslation = LocalView.PrevPreViewTranslation;
	NaniteView.WorldCameraOrigin = LocalView.WorldCameraOrigin;
	NaniteView.ViewForward = LocalView.ViewForward;
	NaniteView.NearPlane = LocalView.NearPlane;
	// Nanite-specific fields below have no uniform-buffer source; use neutral defaults.
	NaniteView.LODScale = 1.0f;
	NaniteView.LODScaleHW = 1.0f;
	NaniteView.CullingViewMinRadiusTestFactorSq = 0.0f;
	NaniteView.StreamingPriorityCategory = 3;
	NaniteView.Flags = NANITE_VIEW_FLAG_HZBTEST | NANITE_VIEW_FLAG_NEAR_CLIP;
	NaniteView.TargetLayerIndex = -1; // INDEX_NONE
	NaniteView.TargetMipLevel = 0;
	NaniteView.TargetNumMipLevels = 0;
	NaniteView.TargetPrevLayerIndex = -1; // INDEX_NONE
	NaniteView.RangeBasedCullingDistance = 0.0f; // 0 disables range-based culling
	NaniteView.HZBTestViewRect = NaniteView.ViewRect;
	NaniteView.InstanceOcclusionQueryMask = 0;
	//This path isn't used for the shadow passes but initializing to same value as in GetDefaultLightingChannelMask() for consistency
	NaniteView.LightingChannelMask = 0x1;
	// NOTE(review): CullingViewScreenMultipleSq, bUseLightingChannelMask and DynamicDepthCullRange
	// are assigned on the packed-view path (UnpackNaniteView) but left uninitialized here --
	// confirm no consumer reads them when NANITE_USE_VIEW_UNIFORM_BUFFER is set.
	NaniteView.FirstPersonTransform = (float3x3)LocalView.FirstPersonTransform;
	NaniteView.SceneRendererPrimaryViewId = LocalView.GPUSceneViewId;
#else // !NANITE_USE_VIEW_UNIFORM_BUFFER
#if NANITE_MULTI_VIEW
	FPackedNaniteView PackedView = InViews[ViewIndex];
#else
	FPackedNaniteView PackedView = InViews[0];
#endif
	FNaniteView NaniteView = UnpackNaniteView(PackedView);
#endif // NANITE_USE_VIEW_UNIFORM_BUFFER
	return NaniteView;
}
// Fill ViewState using data from a NaniteView
// Fill ViewState using data from a NaniteView.
// Overwrites the transform/translation/rect members of InOutView so that shading code
// consuming a ViewState sees the Nanite view's matrices; other ViewState fields are untouched.
void PatchViewState(FNaniteView NaniteView, inout ViewState InOutView)
{
	InOutView.SVPositionToTranslatedWorld = NaniteView.SVPositionToTranslatedWorld;
	InOutView.ViewToTranslatedWorld = NaniteView.ViewToTranslatedWorld;
	InOutView.ViewOriginHigh = NaniteView.ViewOriginHigh;
	InOutView.TranslatedWorldToView = NaniteView.TranslatedWorldToView;
	InOutView.TranslatedWorldToClip = NaniteView.TranslatedWorldToClip;
	InOutView.ViewToClip = NaniteView.ViewToClip;
	InOutView.ClipToWorld = NaniteView.ClipToWorld;
	InOutView.PrevTranslatedWorldToView = NaniteView.PrevTranslatedWorldToView;
	InOutView.PrevTranslatedWorldToClip = NaniteView.PrevTranslatedWorldToClip;
	InOutView.PrevViewToClip = NaniteView.PrevViewToClip;
	InOutView.PrevClipToWorld = NaniteView.PrevClipToWorld;
	InOutView.ViewSizeAndInvSize = NaniteView.ViewSizeAndInvSize;
	InOutView.ViewRectMin.xy = NaniteView.ViewRect.xy - 0.5f; // Convert from float2 with a half texel offset to an int2 texel coord
	InOutView.PreViewTranslation = NaniteView.PreViewTranslation;
	InOutView.PrevPreViewTranslation = NaniteView.PrevPreViewTranslation;
	InOutView.WorldCameraOrigin = NaniteView.WorldCameraOrigin;
	InOutView.ViewForward = NaniteView.ViewForward;
	InOutView.NearPlane = NaniteView.NearPlane;
#if VIEW_HAS_TILEOFFSET_DATA
	// Keep the TileOffset mirrors in sync with the DF values patched above.
	InOutView.TileOffset.PreViewTranslation = DFToTileOffset(InOutView.PreViewTranslation); //DF_TODO: should we upload TO data?
	InOutView.TileOffset.PrevPreViewTranslation = DFToTileOffset(InOutView.PrevPreViewTranslation);
	//InOutView.TileOffset.WorldViewOrigin = DFToTileOffset(InOutView.WorldViewOrigin);
	//InOutView.TileOffset.PrevWorldViewOrigin = DFToTileOffset(InOutView.PrevWorldViewOrigin);
	InOutView.TileOffset.WorldCameraOrigin = DFToTileOffset(InOutView.WorldCameraOrigin);
	//InOutView.TileOffset.PrevWorldCameraOrigin = DFToTileOffset(InOutView.PrevWorldCameraOrigin);
#endif
}
// Projects a world-space edge length to screen units at the given view depth.
// Orthographic views (ViewToClip[3][3] >= 1) have no perspective divide, so depth is ignored.
float GetProjectedEdgeLengthAtDepth(float InLength, float ViewZ, FNaniteView NaniteView)
{
	const float ScaledLength = InLength * NaniteView.LODScale;
	if (NaniteView.ViewToClip[3][3] >= 1)
	{
		return ScaledLength;
	}
	return ScaledLength / ViewZ;
}
// Writes indirect-dispatch arguments for the SW and HW rasterizer passes.
// Layout: 8 uints = SW (X, Y, Z, pad) followed by HW (X, Y, Z, pad).
void WriteDispatchArgsSWHW(RWBuffer<uint> RasterizerArgsSWHW, uint ArgsOffset, uint NumClustersSW, uint NumClustersHW)
{
	// Round up to the number of 64-thread groups needed to cover the clusters.
	const uint NumGroupsSW = (NumClustersSW + 63u) >> 6u;
	const uint NumGroupsHW = (NumClustersHW + 63u) >> 6u;

	// SW: ThreadGroupCount XYZ + padding
	RasterizerArgsSWHW[ArgsOffset + 0] = NumGroupsSW;
	RasterizerArgsSWHW[ArgsOffset + 1] = 1;
	RasterizerArgsSWHW[ArgsOffset + 2] = 1;
	RasterizerArgsSWHW[ArgsOffset + 3] = 0;

	// HW: ThreadGroupCount XYZ + padding
	RasterizerArgsSWHW[ArgsOffset + 4] = NumGroupsHW;
	RasterizerArgsSWHW[ArgsOffset + 5] = 1;
	RasterizerArgsSWHW[ArgsOffset + 6] = 1;
	RasterizerArgsSWHW[ArgsOffset + 7] = 0;
}
// Writes indirect arguments for the SW (compute) and HW rasterizer passes.
// The HW argument meaning depends on the active pipeline (mesh shader, primitive
// shader, or classic vertex shader instancing), selected via RenderFlags.
// Layout: 8 uints = SW args (4) followed by HW args (4).
void WriteRasterizerArgsSWHW(RWBuffer<uint> RasterizerArgsSWHW, uint ArgsOffset, uint NumClustersSW, uint NumClustersHW)
{
	RasterizerArgsSWHW[ArgsOffset + 0] = NumClustersSW; // SW: ThreadGroupCountX
	RasterizerArgsSWHW[ArgsOffset + 1] = 1; // SW: ThreadGroupCountY
	RasterizerArgsSWHW[ArgsOffset + 2] = 1; // SW: ThreadGroupCountZ
	RasterizerArgsSWHW[ArgsOffset + 3] = 0; // padding
	uint3 HWArgs; // Assign to local before writing to RasterizerArgsSWHW to work around an FXC issue where the write to RasterizerArgsSWHW[ArgsOffset + 4] would be omitted
	if (RenderFlags & NANITE_RENDER_FLAG_MESH_SHADER)
	{
		// Mesh shader path: one group per cluster.
		HWArgs.x = NumClustersHW; // HW: ThreadGroupCountX
		HWArgs.y = 1; // HW: ThreadGroupCountY
		HWArgs.z = 1; // HW: ThreadGroupCountZ
	}
	else if (RenderFlags & NANITE_RENDER_FLAG_PRIMITIVE_SHADER)
	{
		// Primitive shader path: clusters expressed as vertex count, single instance.
		HWArgs.x = NumClustersHW; // HW: VertexCountPerInstance
		HWArgs.y = 1; // HW: InstanceCount
		HWArgs.z = 0; // HW: StartVertexLocation
	}
	else
	{
		// Classic VS path: worst-case vertex count per cluster, one instance per cluster.
		HWArgs.x = NANITE_MAX_CLUSTER_TRIANGLES * 3; // HW: VertexCountPerInstance
		HWArgs.y = NumClustersHW; // HW: InstanceCount
		HWArgs.z = 0; // HW: StartVertexLocation
	}
	RasterizerArgsSWHW[ArgsOffset + 4] = HWArgs.x;
	RasterizerArgsSWHW[ArgsOffset + 5] = HWArgs.y;
	RasterizerArgsSWHW[ArgsOffset + 6] = HWArgs.z;
	RasterizerArgsSWHW[ArgsOffset + 7] = 0; // HW: StartInstanceLocation
}
#if COMPILER_SUPPORTS_HLSL2021
// Loads the per-primitive skinning header; headers are stored as a flat array indexed by primitive.
FNaniteSkinningHeader LoadNaniteSkinningHeader(uint InPrimitiveIndex)
{
	return SceneUB(NaniteSkinning).SkinningHeaders.Load<FNaniteSkinningHeader>(InPrimitiveIndex * (uint)sizeof(FNaniteSkinningHeader));
}
#if USE_COMPRESSED_BONE_TRANSFORM
//TODO: Move these outside Nanite and use them for Non-Nanite animation sampling?
// Decodes a 32-byte compressed bone transform (two uint4s) into a float4x3.
// Layout (mirrors StoreCompressedBoneTransform):
//   Data0.xyz = translation as raw float bits,
//   the three 3x3 axes packed as 9 consecutive fp16 values starting at Data0.w:
//     Data0.w = X.x | X.y<<16, Data1.x = X.z | Y.x<<16, Data1.y = Y.y | Y.z<<16,
//     Data1.z = Z.x | Z.y<<16, Data1.w = Z.z (low 16 bits only).
// Rows 0..2 of the result are the axes; row 3 is the translation.
float4x3 UnpackCompressedBoneTransform(uint4 Data0, uint4 Data1)
{
	float4x3 Result;
	Result[0] = float3(f16tof32(Data0.w), f16tof32(Data0.w >> 16), f16tof32(Data1.x));
	Result[1] = float3(f16tof32(Data1.x >> 16), f16tof32(Data1.y), f16tof32(Data1.y >> 16));
	Result[2] = float3(f16tof32(Data1.z), f16tof32(Data1.z >> 16), f16tof32(Data1.w));
	Result[3] = asfloat(Data0.xyz); // translation kept at full fp32 precision
	return Result;
}
// Loads one compressed bone transform (32 bytes per bone) from a read-only buffer.
float4x3 LoadCompressedBoneTransform(ByteAddressBuffer SrcBuffer, uint BaseOffset, uint BoneIndex)
{
	const uint ByteAddress = BaseOffset + (BoneIndex << 5u); // 32 bytes per transform
	return UnpackCompressedBoneTransform(SrcBuffer.Load4(ByteAddress), SrcBuffer.Load4(ByteAddress + 16u));
}
// Loads one compressed bone transform (32 bytes per bone) from a read-write buffer.
float4x3 LoadCompressedBoneTransform(RWByteAddressBuffer SrcBuffer, uint BaseOffset, uint BoneIndex)
{
	const uint ByteAddress = BaseOffset + (BoneIndex << 5u); // 32 bytes per transform
	return UnpackCompressedBoneTransform(SrcBuffer.Load4(ByteAddress), SrcBuffer.Load4(ByteAddress + 16u));
}
// Encodes a float4x3 bone transform into the 32-byte compressed layout and stores it.
// Axes (rows 0..2) are quantized to fp16; the translation (row 3) keeps full fp32 bits.
// Packing mirrors UnpackCompressedBoneTransform exactly.
void StoreCompressedBoneTransform(RWByteAddressBuffer DstBuffer, uint BaseOffset, uint BoneIndex, float4x3 BoneTransform)
{
	const uint Offset = BaseOffset + BoneIndex * 32u; // 32 bytes per transform
	const uint3 XAxis = f32tof16(BoneTransform[0]);
	const uint3 YAxis = f32tof16(BoneTransform[1]);
	const uint3 ZAxis = f32tof16(BoneTransform[2]);
	// Data0 = translation bits (xyz) + first two fp16 axis values; Data1 = remaining seven fp16 values.
	const uint4 Data0 = uint4(asuint(BoneTransform[3]), XAxis.x | (XAxis.y << 16));
	const uint4 Data1 = uint4(XAxis.z | (YAxis.x << 16), YAxis.y | (YAxis.z << 16), ZAxis.x | (ZAxis.y << 16), ZAxis.z);
	DstBuffer.Store4(Offset, Data0);
	DstBuffer.Store4(Offset + 16, Data1);
}
#else
// Uncompressed path: transforms are stored transposed as float3x4, one per bone.
float4x3 LoadCompressedBoneTransform(ByteAddressBuffer SrcBuffer, uint BaseOffset, uint BoneIndex)
{
	const uint ByteAddress = BaseOffset + BoneIndex * (uint)sizeof(float3x4);
	const float3x4 TransposedTransform = SrcBuffer.Load<float3x4>(ByteAddress);
	return transpose(TransposedTransform);
}
// Uncompressed path (read-write buffer): transforms are stored transposed as float3x4, one per bone.
float4x3 LoadCompressedBoneTransform(RWByteAddressBuffer SrcBuffer, uint BaseOffset, uint BoneIndex)
{
	const uint ByteAddress = BaseOffset + BoneIndex * (uint)sizeof(float3x4);
	const float3x4 TransposedTransform = SrcBuffer.Load<float3x4>(ByteAddress);
	return transpose(TransposedTransform);
}
// Uncompressed path: stores the transform transposed as a full-precision float3x4.
void StoreCompressedBoneTransform(RWByteAddressBuffer DstBuffer, uint BaseOffset, uint BoneIndex, float4x3 BoneTransform)
{
	const uint Address = BaseOffset + BoneIndex * (uint)sizeof(float3x4);
	const float3x4 Tmp = transpose(BoneTransform);
#if COMPILER_SUPPORTS_TYPEDSTORE
	DstBuffer.TypedStore<float3x4>(Address, Tmp);
#else
	DstBuffer.Store<float3x4>(Address, Tmp);
#endif
}
#endif
// Loads a bone transform from the scene-wide Nanite skinning transform buffer.
// The buffer holds a single flat array, so the base byte offset is always zero.
float4x3 LoadNaniteBoneTransform(uint TransformIndex)
{
	const uint BaseOffset = 0u;
	return LoadCompressedBoneTransform(SceneUB(NaniteSkinning).BoneTransforms, BaseOffset, TransformIndex);
}
// Loads an object-space bone transform (with scale).
// BufferOffset is expressed in floats, so convert to bytes before indexing.
FBoneTransformWithScale LoadNaniteBoneObjectSpaceWithScale(uint BufferOffset, uint BoneIndex)
{
	const uint ByteAddress = BufferOffset * (uint)sizeof(float) + BoneIndex * (uint)sizeof(FBoneTransformWithScale);
	return SceneUB(NaniteSkinning).BoneObjectSpace.Load<FBoneTransformWithScale>(ByteAddress);
}
// Loads an object-space bone transform (no scale).
// BufferOffset is expressed in floats, so convert to bytes before indexing.
FBoneTransform LoadNaniteBoneObjectSpace(uint BufferOffset, uint BoneIndex)
{
	const uint ByteAddress = BufferOffset * (uint)sizeof(float) + BoneIndex * (uint)sizeof(FBoneTransform);
	return SceneUB(NaniteSkinning).BoneObjectSpace.Load<FBoneTransform>(ByteAddress);
}
// Loads an assembly transform from the hierarchy buffer as a full 4x4 matrix.
// Transforms are stored transposed as float3x4; the missing row is (0,0,0,1).
// When assembly data is compiled out, returns identity.
float4x4 LoadNaniteAssemblyTransform(uint HierarchyBufferOffset, uint TransformIndex)
{
#if NANITE_ASSEMBLY_DATA
	// HierarchyBufferOffset is in uints (4 bytes each); TransformIndex strides by a float3x4.
	const uint BufferAddress = HierarchyBufferOffset * 4u + TransformIndex * (uint)sizeof(float3x4);
	const float3x4 TransposedTransform = HierarchyBuffer.Load<float3x4>(BufferAddress);
	return transpose(float4x4(
		TransposedTransform[0],
		TransposedTransform[1],
		TransposedTransform[2],
		float4(0, 0, 0, 1)
	));
#else
	return float4x4(
		float4(1, 0, 0, 0),
		float4(0, 1, 0, 0),
		float4(0, 0, 1, 0),
		float4(0, 0, 0, 1)
	);
#endif
}
#endif