// Copyright Epic Games, Inc. All Rights Reserved.

#include "/Engine/Public/Platform.ush"
#include "../ColorMap.ush"
#include "HairStrandsAABBCommon.ush"
#include "HairStrandsClusterCommon.ush"
#include "HairStrandsVertexFactoryCommon.ush"

#ifndef USE_HAIR_TRIANGLE_STRIP
	#error Hair triangle geometry type needs to be defined
#endif

#define COMPACTCS_NUMTHREAD 64

// According to https://docs.microsoft.com/en-us/windows/win32/direct3d11/overviews-direct3d-11-devices-downlevel-compute-shaders#thread-group-shared-memory-tgsm
// a thread can access 32 bytes of group shared memory when a group has 512 threads. This is largely enough for this use case.
#define PREFIXSUM_PARALLEL 512
#define PREFIXSUM_LASTTHREAD (PREFIXSUM_PARALLEL-1)

#define D_USE_CONTINUOUSLOD 1
#define D_LOD_DEBUG_SCALE 1000.0f

// Cluster data in ClusterAABB buffers is packed as {uint3 AABBMin, uint3 AABBMax}

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Utility functions

// Splits a linear group count into a roughly square 2D grid (X, Y, 1) with X * Y >= GroupCount.
// X is floor(sqrt(GroupCount)); Y adds enough extra rows of X groups to cover the remainder.
// An empty workload (GroupCount == 0) returns (1, 1, 1) instead of dividing by zero.
uint3 ComputeDispatchGroupCount2D(uint GroupCount)
{
	// Clamp to [1, 65535]: the lower bound avoids a division by zero for GroupCount == 0,
	// the upper bound keeps X * X representable in 32 bits.
	uint DispatchCountX = clamp(uint(floor(sqrt(float(GroupCount)))), 1u, 65535u);

	// float only carries 24 bits of mantissa, so for large counts sqrt() can round up to the
	// next integer. Step back so that X * X never exceeds GroupCount (which would underflow below).
	if (DispatchCountX > 1u && DispatchCountX * DispatchCountX > GroupCount)
	{
		--DispatchCountX;
	}

	const uint CoveredCount   = DispatchCountX * DispatchCountX;
	const uint RemainingCount = GroupCount > CoveredCount ? GroupCount - CoveredCount : 0u;
	const uint DispatchCountY = DispatchCountX + DivideAndRoundUp(RemainingCount, DispatchCountX);

	return uint3(DispatchCountX, DispatchCountY, 1);
}

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// Clear group/cluster AABBs

#ifdef SHADER_CLEARAABB

uint AABBCount;
// NOTE(review): the resource declarations in this file carry no element type (e.g. RWBuffer<int>);
// this looks like an extraction artifact - confirm against the original file.
RWBuffer OutAABBBuffer;

// Resets AABBCount boxes stored as 6 consecutive values each: the first three (min) are set to
// 2147483647 and the last three (max) to -2147483648. One thread writes one value.
[numthreads(64, 1, 1)]
void MainClearAABBCS(uint2 DispatchThreadId : SV_DispatchThreadID)
{
	if (DispatchThreadId.x < AABBCount*6)
	{
		const int ClusterId = DispatchThreadId.x / 6;
		// Components 0..2 of an entry hold the min corner, 3..5 the max corner
		const bool bIsMin = (DispatchThreadId.x - ClusterId * 6) < 3 ? 1 : 0;
		checkBufferAccessSlow(OutAABBBuffer, DispatchThreadId.x);
		OutAABBBuffer[DispatchThreadId.x] = bIsMin ? 2147483647 : -2147483648;
	}
}

#endif

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Write out culled index buffer

#if SHADER_CLUSTER_CULL

#include "../ShaderPrint.ush"

// GROUP_SIZE drives NUM_THREADS_PER_GROUP and the curve-per-group macros below,
// so check it before its first use.
#ifndef GROUP_SIZE
	#error GROUP_SIZE needs to be defined
#endif

#define NUM_THREADS_PER_GROUP GROUP_SIZE
#include "../ThreadGroupPrefixSum.ush"

//uint ForceLOD;
uint ClusterGroupIndex;
float LODIndex;
float LODBias;
uint CurveCount;
uint CurvePerGroup;
uint DispatchCountX;
float4 ClusterInfoParameters;

ByteAddressBuffer CurveBuffer;
Buffer CurveToClusterIdBuffer;
Buffer PointLODBuffer;
StructuredBuffer ClusterInfoBuffer;

RWStructuredBuffer OutPointCounter;
RWBuffer OutIndexBuffer;
RWByteAddressBuffer OutCulledCurveBuffer;

#define CURVE_PER_GROUP (GROUP_SIZE / PERMUTATION_POINT_PER_CURVE)
#define MAX_CURVE CURVE_PER_GROUP
#define MAX_POINT PERMUTATION_POINT_PER_CURVE

// Maps a group-local curve slot to the global curve index.
// The thread groups are laid out on a 2D grid that is DispatchCountX groups wide.
// - 64-point permutation: one curve per group, so the linear group index is the curve index.
// - Other permutations: each group handles CURVE_PER_GROUP consecutive curves and
//   ThreadIndex selects the curve slot within the group.
uint GetGlobalCurveIndex(uint ThreadIndex, uint2 GroupIndex)
{
	const uint LinearGroupIndex = GroupIndex.x + GroupIndex.y * DispatchCountX;
#if PERMUTATION_POINT_PER_CURVE == 64
	return LinearGroupIndex;
#else
	return ThreadIndex + LinearGroupIndex * CURVE_PER_GROUP;
#endif
}

//-------------------------------------------------------------------------------------------------
#if PERMUTATION_POINT_PER_CURVE < 64

// Permutation for curves having less than 32 control points
// In this permutation we process several curves with the same thread group
groupshared uint CurvePointIndex[MAX_CURVE];
groupshared uint CurvePointCount[MAX_CURVE];
groupshared uint ClusterLOD[MAX_CURVE];
groupshared uint LODCount[MAX_CURVE];

groupshared uint AllocationOffset;
groupshared uint AllocationCount;

// Store locally the culled point index (encoded into 8 bits, and store 4 per uint)
groupshared uint CulledPointIndices[MAX_CURVE * MAX_POINT];
groupshared uint ValidPoints[MAX_CURVE][MAX_POINT];
groupshared uint WriteOffset0[MAX_CURVE];
groupshared uint WriteOffsetN[MAX_CURVE];
// Culling entry point for the permutations that pack several curves in one thread group.
// Each thread owns one (curve slot, point slot) pair: it marks the point as visible or not for
// the cluster LOD, the visible points are compacted with a group-wide prefix sum, the group
// reserves a range in OutIndexBuffer with a single atomic add, and finally writes the point
// indices plus one culled-curve record per valid curve.
[numthreads(GROUP_SIZE, 1, 1)]
void Main(uint3 DispatchThreadId : SV_DispatchThreadID, uint ThreadIndex : SV_GroupIndex, uint2 GroupIndex : SV_GroupID)
{
	// 1. Read the Curve data (one thread per curve slot)
	if (ThreadIndex < CURVE_PER_GROUP)
	{
		const uint LocalCurveIndex  = ThreadIndex;
		const uint GlobalCurveIndex = GetGlobalCurveIndex(ThreadIndex, GroupIndex);
		const bool bValidCurveIndex = GlobalCurveIndex < CurveCount;

		// 1.1 Cluster ID
		// 1.3 Get cluster info (i.e. map all info per LOD for a given cluster ID) and derive effective LOD data
		FHairClusterLOD ClusterLODInfo = (FHairClusterLOD)0;
		if (bValidCurveIndex)
		{
			const uint ClusterId = CurveToClusterIdBuffer[GlobalCurveIndex];
			ClusterLODInfo = GetHairClusterLOD(ClusterInfoBuffer[ClusterId], ClusterInfoParameters, LODIndex);
		}
		ClusterLOD[LocalCurveIndex] = ClusterLODInfo.LOD;
		LODCount[LocalCurveIndex]   = ClusterLODInfo.LODCount;

		// 1.4 Curve's point count/offset (left at 0 for slots past the end of the curve list)
		CurvePointIndex[LocalCurveIndex] = 0;
		CurvePointCount[LocalCurveIndex] = 0;
		if (bValidCurveIndex)
		{
			const FHairCurve Curve = ReadHairCurve(CurveBuffer, GlobalCurveIndex);
			CurvePointIndex[LocalCurveIndex] = Curve.PointIndex;
			CurvePointCount[LocalCurveIndex] = Curve.PointCount;
		}
	}

	if (ThreadIndex == 0)
	{
		AllocationCount = 0;
		AllocationOffset = 0;
	}
	GroupMemoryBarrierWithGroupSync();

	// 2. Mark valid point and compute their final position
	{
		const uint CurveIndex        = ThreadIndex / uint(MAX_POINT);
		const uint CurrentPointIndex = ThreadIndex % uint(MAX_POINT);

		// CurveIndex is a group-local slot: it has to be converted to a global curve index
		// before being compared against the total curve count.
		const bool bIsCurveValid = GetGlobalCurveIndex(CurveIndex, GroupIndex) < CurveCount;

		// 2.2 Mark point having their LOD visibility smaller than the requested LOD level
		ValidPoints[CurveIndex][CurrentPointIndex] = 0;
		if (bIsCurveValid && CurrentPointIndex < CurvePointCount[CurveIndex])
		{
			const uint PointIndex = CurrentPointIndex + CurvePointIndex[CurveIndex];
			const uint MinLOD     = GetHairControlPointMinLOD(PointIndex, PointLODBuffer);

			// Prune control point based on their MinLOD value
			if (IsHairControlPointActive(MinLOD, ClusterLOD[CurveIndex]))
			{
				ValidPoints[CurveIndex][CurrentPointIndex] = 1;
			}
		}

		// 2.3 Count/Compact visible points
		{
			uint LocalAllocationCount = 0;
			const uint bIsValid  = ValidPoints[CurveIndex][CurrentPointIndex];
			const uint PrefixSum = ThreadGroupPrefixSum(bIsValid, ThreadIndex, LocalAllocationCount);
			if (ThreadIndex == 0)
			{
				AllocationCount = LocalAllocationCount;
			}

			if (bIsValid)
			{
				CulledPointIndices[PrefixSum] = CurrentPointIndex + CurvePointIndex[CurveIndex];
			}

			// Record the compacted position of the first and last point of each curve.
			// NOTE(review): deriving the culled point count from these two offsets presumably relies on
			// the first and last control points of a curve never being pruned - confirm.
			if (bIsCurveValid)
			{
				if (CurrentPointIndex == 0)
				{
					WriteOffset0[CurveIndex] = PrefixSum;
				}
				if (CurrentPointIndex == CurvePointCount[CurveIndex]-1)
				{
					WriteOffsetN[CurveIndex] = PrefixSum;
				}
			}
		}
	}

	// 4. Global allocate (one atomic per group; AllocationOffset receives the start of the reserved range)
	if (ThreadIndex == 0)
	{
		InterlockedAdd(OutPointCounter[ClusterGroupIndex], AllocationCount, AllocationOffset);
	}
	GroupMemoryBarrierWithGroupSync();

	// 5. Write out index/scale
	if (ThreadIndex < AllocationCount)
	{
		OutIndexBuffer[AllocationOffset + ThreadIndex] = CulledPointIndices[ThreadIndex];
	}

	// 6. Write out curve data with visible points for subsequent passes
	if (ThreadIndex < CURVE_PER_GROUP)
	{
		const uint LocalCurveIndex  = ThreadIndex;
		const uint GlobalCurveIndex = GetGlobalCurveIndex(ThreadIndex, GroupIndex);
		if (GlobalCurveIndex < CurveCount)
		{
			// NOTE(review): PointIndex is a group-local compacted offset here, whereas the 64-point
			// permutation writes the global AllocationOffset - confirm what the consumers expect.
			FHairCurve CulledCurve;
			CulledCurve.PointIndex = WriteOffset0[LocalCurveIndex];
			CulledCurve.PointCount =(WriteOffsetN[LocalCurveIndex] - WriteOffset0[LocalCurveIndex]) + 1;
			WriteHairCurve(OutCulledCurveBuffer, GlobalCurveIndex, CulledCurve);
		}
	}
}
#endif // PERMUTATION_POINT_PER_CURVE < 64

//-------------------------------------------------------------------------------------------------
#if PERMUTATION_POINT_PER_CURVE == 64

// Permutation is for curve having more than 32 control point per curve
// In this permutation we process 1 curve per thread group
groupshared uint CurvePointIndex;
groupshared uint CurvePointCount;
groupshared float ClusterLOD;
groupshared uint LODCount;

groupshared uint AllocationOffset;
groupshared uint AllocationCount;

// Store locally the culled point index (encoded into 8 bits, and store 4 per uint)
groupshared uint CulledPointIndices[HAIR_MAX_NUM_POINT_PER_CURVE];
groupshared uint ValidPoints[HAIR_MAX_NUM_POINT_PER_CURVE];

// Version for curve with >32 points
// One thread group processes one curve: the group walks the curve's points GROUP_SIZE at a time,
// compacts the visible ones into CulledPointIndices, reserves a range in OutIndexBuffer with a
// single atomic add, then writes the point indices and the culled-curve record.
[numthreads(GROUP_SIZE, 1, 1)]
void Main(uint3 DispatchThreadId : SV_DispatchThreadID, uint ThreadIndex : SV_GroupIndex, uint2 GroupIndex : SV_GroupID)
{
	// 1. Read the Curve data
	// The curve index only depends on the group index, so the whole group leaves together
	// and the barriers below are never reached by a partial group.
	const uint CurveIndex = GetGlobalCurveIndex(ThreadIndex, GroupIndex);
	if (CurveIndex >= CurveCount)
	{
		return;
	}

	if (ThreadIndex == 0)
	{
		// 1.1 Cluster ID
		// 1.2 Get cluster info (i.e. map all info per LOD for a given cluster ID) and derive effective LOD data
		const uint ClusterId = CurveToClusterIdBuffer[CurveIndex];
		const FHairClusterLOD ClusterLODInfo = GetHairClusterLOD(ClusterInfoBuffer[ClusterId], ClusterInfoParameters, LODIndex);
		ClusterLOD = ClusterLODInfo.LOD;
		LODCount   = ClusterLODInfo.LODCount;

		// 1.4 Curve's point count/offset
		const FHairCurve Curve = ReadHairCurve(CurveBuffer, CurveIndex);
		CurvePointIndex = Curve.PointIndex;
		CurvePointCount = Curve.PointCount;

		AllocationCount = 0;
		AllocationOffset = 0;
	}
	GroupMemoryBarrierWithGroupSync();

	// 2. Mark valid point and compute their final position
	{
		const uint LoopCount = DivideAndRoundUp(CurvePointCount, GROUP_SIZE);
		for (uint LoopIt = 0; LoopIt < LoopCount; ++LoopIt)
		{
			const uint CurrentPointIndex = ThreadIndex + LoopIt * GROUP_SIZE;

			// 2.2 Mark point having their LOD visibility smaller than the requested LOD level
			ValidPoints[CurrentPointIndex] = 0;
			if (CurrentPointIndex < CurvePointCount)
			{
				const uint PointIndex = CurrentPointIndex + CurvePointIndex;
				const uint MinLOD     = GetHairControlPointMinLOD(PointIndex, PointLODBuffer);

				// Prune control point based on their MinLOD value
				if (IsHairControlPointActive(MinLOD, ClusterLOD))
				{
					ValidPoints[CurrentPointIndex] = 1;
				}
			}

			// 2.3 Count/Compact visible points
			{
				// Points kept by the previous iterations
				const uint IndexOffset = AllocationCount;

				uint LocalAllocationCount = 0;
				const uint bIsValid  = ValidPoints[CurrentPointIndex];
				const uint PrefixSum = ThreadGroupPrefixSum(bIsValid, ThreadIndex, LocalAllocationCount); // Has a ThreadMemoryGroupSync internally
				if (bIsValid)
				{
					CulledPointIndices[IndexOffset + PrefixSum] = CurrentPointIndex;
				}

				if (ThreadIndex == 0)
				{
					AllocationCount += LocalAllocationCount;
				}
			}

			// Make the updated AllocationCount visible to every thread before the next
			// iteration reads it as its IndexOffset.
			GroupMemoryBarrierWithGroupSync();
		}
	}

	// 4. Global allocate (one atomic per group; AllocationOffset receives the start of the reserved range)
	if (ThreadIndex == 0)
	{
		InterlockedAdd(OutPointCounter[ClusterGroupIndex], AllocationCount, AllocationOffset);
	}
	GroupMemoryBarrierWithGroupSync();

	// 5. Write out index/scale
	{
		const uint LoopCount = DivideAndRoundUp(AllocationCount, GROUP_SIZE);
		for (uint LoopIt = 0; LoopIt < LoopCount; ++LoopIt)
		{
			const uint CurrentPointIndex = ThreadIndex + LoopIt * GROUP_SIZE;
			if (CurrentPointIndex < AllocationCount)
			{
				// CulledPointIndices holds curve-local indices: rebase them onto the curve's first point
				const uint CullPointIndex = CulledPointIndices[CurrentPointIndex];
				OutIndexBuffer[AllocationOffset + CurrentPointIndex] = CurvePointIndex + CullPointIndex;
			}
		}
	}

	// 6. Write out curve data with visible points for subsequent passes
	if (ThreadIndex == 0)
	{
		FHairCurve CulledCurve;
		CulledCurve.PointCount = AllocationCount;
		CulledCurve.PointIndex = AllocationOffset;
		WriteHairCurve(OutCulledCurveBuffer, CurveIndex, CulledCurve);
	}
}
#endif // PERMUTATION_POINT_PER_CURVE == 64
//-------------------------------------------------------------------------------------------------

#endif // SHADER_CLUSTER_CULL

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Build indirect Draw/Dispatch based on culled curve output

#if SHADER_CLUSTER_CULL_ARGS

uint InstanceRegisteredIndex;
uint ClusterGroupIndex;
StructuredBuffer PointCounterBuffer;
RWBuffer RWIndirectDrawArgsBuffer;
RWBuffer RWIndirectDispatchArgsBuffer;
RWBuffer RWIndirectDispatchArgsGlobalBuffer;

// Single-thread pass turning the visible point count of a cluster group into indirect arguments:
// a 4-value draw record, a 4-value dispatch record, and the same dispatch record stored at slot
// InstanceRegisteredIndex of the global dispatch buffer.
[numthreads(1, 1, 1)]
void Main(uint DispatchThreadId : SV_DispatchThreadID, uint ThreadIndex : SV_GroupIndex)
{
	const uint PointCount    = PointCounterBuffer[ClusterGroupIndex];
	const uint DispatchCount = DivideAndRoundUp(PointCount, HAIR_VERTEXCOUNT_GROUP_SIZE);

	RWIndirectDrawArgsBuffer[0] = PointCount * HAIR_POINT_TO_VERTEX;
	RWIndirectDrawArgsBuffer[1] = 1; // 1 instance
	RWIndirectDrawArgsBuffer[2] = 0;
	RWIndirectDrawArgsBuffer[3] = 0;

	// The 4th value carries the point count alongside the group counts
	RWIndirectDispatchArgsBuffer[0] = DispatchCount;
	RWIndirectDispatchArgsBuffer[1] = 1;
	RWIndirectDispatchArgsBuffer[2] = 1;
	RWIndirectDispatchArgsBuffer[3] = PointCount;

	const uint GlobalArgsOffset = InstanceRegisteredIndex * 4;
	RWIndirectDispatchArgsGlobalBuffer[GlobalArgsOffset + 0] = DispatchCount;
	RWIndirectDispatchArgsGlobalBuffer[GlobalArgsOffset + 1] = 1;
	RWIndirectDispatchArgsGlobalBuffer[GlobalArgsOffset + 2] = 1;
	RWIndirectDispatchArgsGlobalBuffer[GlobalArgsOffset + 3] = PointCount;
}

#endif // SHADER_CLUSTER_CULL_ARGS

///////////////////////////////////////////////////////////////////////////////////////////////////
// Hair clusters update

#if SHADER_CLUSTERAABB

uint InstanceRegisteredIndex;
uint CurveCount;
uint ClusterOffset;
uint ClusterCount;
float ClusterScale;
float3 CPUBoundMin;
float3 CPUBoundMax;
float LODIndex;
float4x4 LocalToTranslatedWorldMatrix;

ByteAddressBuffer RenCurveBuffer;
Buffer RenPointLODBuffer;
ByteAddressBuffer RenderDeformedPositionBuffer;
StructuredBuffer RenderDeformedOffsetBuffer;

RWBuffer OutClusterAABBBuffer; // Cluster data packed as {uint3 AABBMin, uint3 AABBMax}
RWBuffer OutGroupAABBBuffer;   // Group data packed as {uint3 AABBMin, uint3 AABBMax}

groupshared float3 SharedClusterAABBMin[GROUP_SIZE];
groupshared float3 SharedClusterAABBMax[GROUP_SIZE];

groupshared uint CurvePointIndex;
groupshared uint CurvePointCount;
groupshared float3 OutHairPositionOffset;

// Computes hair bounds.
// - PERMUTATION_USE_CPU_AABB: a single thread writes the CPU-provided bound as the group AABB.
// - Otherwise: each thread group processes the curve whose index is GroupId.x, builds the
//   translated-world-space AABB of its active, finite control points, scales it by ClusterScale,
//   writes it as cluster (ClusterOffset + GroupId.x) and merges it into the group AABB.
// The shared-memory reduction assumes GROUP_SIZE is a power of two.
[numthreads(GROUP_SIZE, 1, 1)]
void ClusterAABBEvaluationCS(uint3 DispatchThreadId : SV_DispatchThreadID, uint3 GroupId : SV_GroupID, uint3 GroupThreadId : SV_GroupThreadID, uint ThreadIndex : SV_GroupIndex)
{
#if PERMUTATION_USE_CPU_AABB
	// Only update the full group AABB
	if (all(DispatchThreadId == 0))
	{
		FHairAABB Bound;
		Bound.Min = CPUBoundMin;
		Bound.Max = CPUBoundMax;
		WriteHairAABB(InstanceRegisteredIndex, Bound, OutGroupAABBBuffer);
	}
#else // PERMUTATION_USE_CPU_AABB

	// Use the first K curves to compute cluster bound. This is an approximation,
	// but since curves are now shuffled, it covers most of the groom
	const uint ClusterIndex = GroupId.x;

	if (ThreadIndex == 0)
	{
		CurvePointIndex = 0;
		CurvePointCount = 0;

		const uint CurveIndex = GroupId.x;
		if (CurveIndex < CurveCount)
		{
			const FHairCurve RenCurve = ReadHairCurve(RenCurveBuffer, CurveIndex);
			CurvePointIndex = RenCurve.PointIndex;
			CurvePointCount = RenCurve.PointCount;
		}

		OutHairPositionOffset = ReadRenPositionOffset(RenderDeformedOffsetBuffer, InstanceRegisteredIndex);
	}
	GroupMemoryBarrierWithGroupSync();

	// Per-thread bound, starting from an "empty" box
	int3 ClusterAABBMin = 9999999.0f;
	int3 ClusterAABBMax =-9999999.0f;

	const uint LoopCount = DivideAndRoundUp(CurvePointCount, GROUP_SIZE);
	for (uint LoopIt = 0; LoopIt < LoopCount; ++LoopIt)
	{
		const uint LocalPointIndex = ThreadIndex + LoopIt * GROUP_SIZE;
		if (LocalPointIndex < CurvePointCount)
		{
			const uint GlobalPointIndex = CurvePointIndex + LocalPointIndex;
			const bool bIsActive = IsHairControlPointActive(GlobalPointIndex, RenPointLODBuffer, LODIndex);

			const FHairControlPoint CP = ReadHairControlPoint(RenderDeformedPositionBuffer, GlobalPointIndex, OutHairPositionOffset, 1, 1, 1);
			if (bIsActive && all(IsFinite(CP.Position)))
			{
				// Grow the bound by one unit on each side of the truncated integer position
				const int3 WorldSpaceCentimeters    = int3(mul(float4(CP.Position, 1.0f), LocalToTranslatedWorldMatrix).xyz);
				const int3 WorldSpaceCentimetersMin = WorldSpaceCentimeters - 1;
				const int3 WorldSpaceCentimetersMax = WorldSpaceCentimeters + 1;

				ClusterAABBMin = min(ClusterAABBMin, WorldSpaceCentimetersMin);
				ClusterAABBMax = max(ClusterAABBMax, WorldSpaceCentimetersMax);
			}
		}
	}

	// Scaling (around the box center)
	const float3 ClusterAABBCenter = float3(ClusterAABBMin + ClusterAABBMax) * 0.5f;
	const float3 ClusterAABBExtent = float3(ClusterAABBMax - ClusterAABBMin) * 0.5f;
	ClusterAABBMin = ClusterAABBCenter - ClusterAABBExtent * ClusterScale;
	ClusterAABBMax = ClusterAABBCenter + ClusterAABBExtent * ClusterScale;

	// Write each thread result to shared memory
	SharedClusterAABBMin[ThreadIndex] = ClusterAABBMin;
	SharedClusterAABBMax[ThreadIndex] = ClusterAABBMax;

	// Do a local reduction in shared memory. Assumes ClusterLOD0.VertexCount>64 to have all min&max values valid.
	// Each step folds the upper half of the live range onto the lower half. Every step is followed
	// by a group sync: HLSL gives no lock-step guarantee between threads of a group, so the
	// partial results have to be made visible explicitly before the next step reads them.
	GroupMemoryBarrierWithGroupSync();
	for (uint Stride = GROUP_SIZE / 2; Stride > 0; Stride >>= 1)
	{
		if (ThreadIndex < Stride)
		{
			SharedClusterAABBMin[ThreadIndex] = min(SharedClusterAABBMin[ThreadIndex], SharedClusterAABBMin[ThreadIndex + Stride]);
			SharedClusterAABBMax[ThreadIndex] = max(SharedClusterAABBMax[ThreadIndex], SharedClusterAABBMax[ThreadIndex + Stride]);
		}
		GroupMemoryBarrierWithGroupSync();
	}

	// Write out hair cluster AABB
	if (ThreadIndex == 0)
	{
		const float3 ReducedMin = SharedClusterAABBMin[0];
		const float3 ReducedMax = SharedClusterAABBMax[0];

		const uint ClusterIdx6 = (ClusterOffset + ClusterIndex) * 6;
		OutClusterAABBBuffer[ClusterIdx6 + 0] = ReducedMin.x;
		OutClusterAABBBuffer[ClusterIdx6 + 1] = ReducedMin.y;
		OutClusterAABBBuffer[ClusterIdx6 + 2] = ReducedMin.z;
		OutClusterAABBBuffer[ClusterIdx6 + 3] = ReducedMax.x;
		OutClusterAABBBuffer[ClusterIdx6 + 4] = ReducedMax.y;
		OutClusterAABBBuffer[ClusterIdx6 + 5] = ReducedMax.z;

		// And contribute to the group full AABB
		FHairAABB Bound;
		Bound.Min = ReducedMin;
		Bound.Max = ReducedMax;
		InterlockHairAABB(InstanceRegisteredIndex, Bound, OutGroupAABBBuffer);
	}

#if DEBUG_ENABLE
	if (ThreadIndex == 0)
	{
		// NOTE(review): LocalToWorldMatrix is not declared in this section (only
		// LocalToTranslatedWorldMatrix is) - confirm it is provided elsewhere before enabling DEBUG_ENABLE.
		const float4 DebugColor = float4(ColorMapViridis(ClusterIndex / float(ClusterCount)),1);
		AddAABBTWS(SharedClusterAABBMin[0], SharedClusterAABBMax[0], DebugColor);
		AddReferentialWS(LocalToWorldMatrix, 10);
	}
#endif

	// Min/Max group AABB is done later by another pass
#endif // PERMUTATION_USE_CPU_AABB
}

#endif // SHADER_CLUSTERAABB