// Copyright Epic Games, Inc. All Rights Reserved.

#define HAIR_STRANDS_PARAMETERS 1

#include "../Common.ush"
#include "../Matrices.ush"
#include "../SceneTextureParameters.ush"
#include "../PositionReconstructionCommon.ush"
#include "../DeferredShadingCommon.ush"
#include "../ShaderPrint.ush"
#include "HairStrandsAABBCommon.ush"
#include "HairStrandsVisibilityCommon.ush"
#include "HairStrandsClusterCommon.ush"
#include "HairStrandsVertexFactoryCommon.ush"
#include "HairStrandsVoxelPageCommon.ush"
#include "HairStrandsDeepShadowCommonStruct.ush"

#define GLOBAL_PAGE_COUNTER_INDEX 0
#define GROUPS_PAGE_COUNTER_INDEX 1

///////////////////////////////////////////////////////////////////////////////////////////////////
// Page allocation

#if SHADER_ALLOCATEPAGEINDEX

float CPUPageWorldSize;
float CPUVoxelWorldSize;
uint bUseCPUVoxelWorldSize; // When adaptive voxel size is disabled, we use the CPU voxel size value
uint TotalPageIndexCount;   // Max page index count
uint PageResolution;        // Resolution of a page
uint MacroGroupCount;
uint IndirectDispatchGroupSize;
uint bDoesMacroGroupSupportVoxelization;

// For testing parity with the CPU version
float4 CPU_TranslatedWorldMinAABB[MAX_HAIR_MACROGROUP_COUNT];
float4 CPU_TranslatedWorldMaxAABB[MAX_HAIR_MACROGROUP_COUNT];
int4 CPU_PageIndexResolution[MAX_HAIR_MACROGROUP_COUNT];
uint CPU_bUseCPUData;

Buffer<float> GPUVoxelWorldSize;
Buffer<float> MacroGroupVoxelSizeBuffer;
Buffer<int> MacroGroupAABBBuffer;
RWBuffer<int> MacroGroupVoxelAlignedAABBBuffer;
RWBuffer<uint4> OutPageIndexResolutionAndOffsetBuffer;
RWBuffer<uint> OutPageIndexAllocationIndirectBufferArgs;

#if GROUP_SIZE != MAX_HAIR_MACROGROUP_COUNT
#error GROUP_SIZE needs to match MAX_HAIR_MACROGROUP_COUNT
#endif

#define INVALID_OFFSET 0xFFFFFFFF

groupshared uint PageIndexOffsets[MAX_HAIR_MACROGROUP_COUNT];

// This code assumes there are fewer than 32 macro groups (so that they fit into a single CU/SM)
[numthreads(GROUP_SIZE, 1, 1)]
void AllocatePageIndex(uint2 DispatchThreadId : SV_DispatchThreadID)
{
	const uint MacroGroupId = DispatchThreadId.x;

	FHairAABB Bound = InitHairAABB();
	float PageWorldSize = CPUPageWorldSize;
	bool bIsValid = MacroGroupId < MacroGroupCount;
	if (bIsValid)
	{
		const bool bSupportVoxelization = (bDoesMacroGroupSupportVoxelization >> MacroGroupId) & 0x1;

		if (CPU_bUseCPUData > 0)
		{
			Bound.Min = CPU_TranslatedWorldMinAABB[MacroGroupId].xyz;
			Bound.Max = CPU_TranslatedWorldMaxAABB[MacroGroupId].xyz;
		}
		else
		{
			Bound = ReadHairAABB(MacroGroupId, MacroGroupAABBBuffer);
		}

		const float VoxelWorldSize = QuantizeVoxelWorldSize(bUseCPUVoxelWorldSize ? CPUVoxelWorldSize : max(GPUVoxelWorldSize[0], MacroGroupVoxelSizeBuffer[MacroGroupId]));
		PageWorldSize = VoxelWorldSize * PageResolution;

		if (any(Bound.Min > Bound.Max) || !bSupportVoxelization)
		{
			Bound.Min = 0;
			Bound.Max = 0;
			bIsValid = false;
		}
	}

	// Page index allocation
	int3 PageIndexResolution = 0;
	{
		// Snap the max AABB to the voxel size.
		// The contents of MacroGroupAABBBuffer (tight-fitting AABBs) and MacroGroupVoxelAlignedAABBBuffer diverge here,
		// because the macro group AABBs used for voxelization need to be snapped to the voxel page boundary.
		// Allocate enough pages to cover the AABB, where page (0,0,0)'s origin sits on MinAABB.
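		//
		// Illustrative example (hypothetical numbers): with VoxelWorldSize = 0.3 and PageResolution = 32,
		// PageWorldSize = 9.6. A macro group AABB of size (30, 20, 10) then yields
		// PageIndexResolution = ceil(Size / PageWorldSize) = (4, 3, 2), i.e. 24 page indices,
		// and Bound.Max is pushed out to Bound.Min + (38.4, 28.8, 19.2).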
		if (bIsValid)
		{
			float3 MacroGroupSize = Bound.Max - Bound.Min;
			if (CPU_bUseCPUData > 0)
			{
				PageIndexResolution = CPU_PageIndexResolution[MacroGroupId].xyz;
			}
			else
			{
				PageIndexResolution = ceil(MacroGroupSize / PageWorldSize);
			}
			Bound.Max = (PageIndexResolution * PageWorldSize) + Bound.Min; // Snap Bound's Max to the page size
		}

		const uint TotalPageIndex = PageIndexResolution.x * PageIndexResolution.y * PageIndexResolution.z;
		PageIndexOffsets[MacroGroupId] = TotalPageIndex;
		GroupMemoryBarrierWithGroupSync();

		// Serial prefix sum; instance groups are always ordered by index
		if (DispatchThreadId.x == 0)
		{
			bool bValidAllocation = true;
			uint PageIndexOffset = 0;
			for (uint LocalMacroGroupId = 0; LocalMacroGroupId < MacroGroupCount; ++LocalMacroGroupId)
			{
				const uint PageCount = PageIndexOffsets[LocalMacroGroupId];
				bValidAllocation = bValidAllocation && (PageIndexOffset + PageCount <= TotalPageIndexCount);
				PageIndexOffsets[LocalMacroGroupId] = bValidAllocation ? PageIndexOffset : INVALID_OFFSET;
				PageIndexOffset += PageCount;
			}
		}
		GroupMemoryBarrierWithGroupSync();

		const uint PageIndexOffset = PageIndexOffsets[MacroGroupId];
		bIsValid = bIsValid && (PageIndexOffset != INVALID_OFFSET);
		if (bIsValid)
		{
			OutPageIndexResolutionAndOffsetBuffer[MacroGroupId] = uint4(PageIndexResolution, PageIndexOffset);
			WriteHairAABB(MacroGroupId, Bound, MacroGroupVoxelAlignedAABBBuffer);
		}
		else
		{
			// Clear all outputs if the allocation is not valid
			OutPageIndexResolutionAndOffsetBuffer[MacroGroupId] = uint4(0, 0, 0, 0);
			WriteDispatchIndirectArgs(OutPageIndexAllocationIndirectBufferArgs, MacroGroupId, 0, 1, 1);
		}
	}

	if (!bIsValid)
	{
		return;
	}

	// Prepare the indirect buffer for doing the actual page index allocation and filling the page indices
	{
		const uint AllocatedPageIndexCount = PageIndexResolution.x * PageIndexResolution.y * PageIndexResolution.z;
		WriteDispatchIndirectArgs(OutPageIndexAllocationIndirectBufferArgs, MacroGroupId, DivideAndRoundUp(AllocatedPageIndexCount, IndirectDispatchGroupSize), 1, 1);
	}
}
#endif

///////////////////////////////////////////////////////////////////////////////////////////////////

#if SHADER_MARKVALID_PREPARE

uint InstanceRegisteredIndex;
uint ClusterOffset;
uint ClusterCount;
uint MacroGroupId;

uint bUseMacroGroupBoundCPU;
float3 MacroGroupBoundCPU_TranslatedWorldMinAABB;
float3 MacroGroupBoundCPU_TranslatedWorldMaxAABB;
float3 TranslatedWorldOffsetCorrection;

Buffer<int> GroupAABBsBuffer;
Buffer<int> ClusterAABBsBuffer;
Buffer<int> MacroGroupVoxelAlignedAABBBuffer;
Buffer<uint4> PageIndexResolutionAndOffsetBuffer;

RWBuffer<uint> OutValidPageIndexBuffer;

// PageIndexBuffer is sampled with a linear coordinate computed from the 3D page coordinate. VALID NODES ARE NOT COMPACTED. It contains the LINEAR PAGE INDEX (to map to the 3D volume).
// PageIndexCoordBuffer is sampled with a linear coordinate for allocated nodes. VALID NODES ARE COMPACTED. It contains the 3D page coordinate and the MacroGroupId. Only used for opaque voxel injection.
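//
// Illustrative addressing example (assuming CoordToIndex uses the usual X-major linearization, i.e.
// Index = Offset + X + Y * Res.x + Z * Res.x * Res.y): for a macro group with PageIndexResolution = (4, 3, 2)
// and PageIndexOffset = 100, page coordinate (1, 2, 0) maps to linear page index 100 + 1 + 2*4 + 0*12 = 109.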
#if PERMUTATION_USE_CLUSTER

[numthreads(GROUP_SIZE, 1, 1)]
void MarkValid_PrepareCS(uint2 DispatchThreadId : SV_DispatchThreadID)
{
	const uint ClusterIndex = DispatchThreadId.x;
	if (ClusterIndex >= ClusterCount)
	{
		return;
	}

	const uint BaseClusterIndex = 6 * (ClusterOffset + ClusterIndex);
	FHairAABB ClusterBound;
	ClusterBound.Min.x = float(ClusterAABBsBuffer[BaseClusterIndex + 0]);
	ClusterBound.Min.y = float(ClusterAABBsBuffer[BaseClusterIndex + 1]);
	ClusterBound.Min.z = float(ClusterAABBsBuffer[BaseClusterIndex + 2]);
	ClusterBound.Max.x = float(ClusterAABBsBuffer[BaseClusterIndex + 3]);
	ClusterBound.Max.y = float(ClusterAABBsBuffer[BaseClusterIndex + 4]);
	ClusterBound.Max.z = float(ClusterAABBsBuffer[BaseClusterIndex + 5]);

	if (any(ClusterBound.Min >= ClusterBound.Max)) return;
	if (any(!IsFinite(ClusterBound.Min)) || any(!IsFinite(ClusterBound.Max))) return;

	const uint4 PageIndexResolutionAndOffset = PageIndexResolutionAndOffsetBuffer.Load(MacroGroupId);
	FHairAABB MacroGroupBound = ReadHairAABB(MacroGroupId, MacroGroupVoxelAlignedAABBBuffer);
	const int3 PageIndexResolution = PageIndexResolutionAndOffset.xyz;
	const uint PageIndexOffset = PageIndexResolutionAndOffset.w;

	if (any(MacroGroupBound.Min >= MacroGroupBound.Max)) return;
	if (any(!IsFinite(MacroGroupBound.Min)) || any(!IsFinite(MacroGroupBound.Max))) return;

	uint3 MinCoord = PositionToCoord(ClusterBound.Min, MacroGroupBound.Min, MacroGroupBound.Max, PageIndexResolution);
	uint3 MaxCoord = PositionToCoord(ClusterBound.Max, MacroGroupBound.Min, MacroGroupBound.Max, PageIndexResolution);

	uint3 PageIndexResolutionMinusOne = uint3(PageIndexResolution - 1);
	MinCoord = clamp(MinCoord, uint3(0, 0, 0), PageIndexResolutionMinusOne);
	MaxCoord = clamp(MaxCoord, uint3(0, 0, 0), PageIndexResolutionMinusOne);

	const uint3 CoordResolution = (MaxCoord - MinCoord) + 1;
	const uint ScatterCount = CoordResolution.x * CoordResolution.y * CoordResolution.z;

	// Arbitrarily large number (e.g. 100x10x10 pages covered). This guards against degenerate cases,
	// where the simulation could deform strands to very large positions, making the cluster arbitrarily large.
	if (ScatterCount > 10000) return;

	if (any(!IsFinite(float3(MinCoord)))) return;
	if (any(!IsFinite(float3(MaxCoord)))) return;

	// Find a good sweet spot
	for (uint z = MinCoord.z; z <= MaxCoord.z; ++z)
	{
		for (uint y = MinCoord.y; y <= MaxCoord.y; ++y)
		{
			for (uint x = MinCoord.x; x <= MaxCoord.x; ++x)
			{
				const uint3 PageIndexCoord = uint3(x, y, z);
				const uint LinearPageIndexCoord = CoordToIndex(PageIndexCoord, PageIndexResolution, PageIndexOffset);
				InterlockedOr(OutValidPageIndexBuffer[LinearPageIndexCoord], 1u);
			}
		}
	}
}

#else // PERMUTATION_USE_CLUSTER

[numthreads(GROUP_SIZE, GROUP_SIZE, GROUP_SIZE)]
void MarkValid_PrepareCS(uint3 DispatchThreadId : SV_DispatchThreadID)
{
	const uint4 PageIndexResolutionAndOffset = PageIndexResolutionAndOffsetBuffer.Load(MacroGroupId);
	const uint3 PageIndexResolution = PageIndexResolutionAndOffset.xyz;
	const uint PageIndexOffset = PageIndexResolutionAndOffset.w;

	const uint3 Coord = DispatchThreadId;
	if (any(Coord >= PageIndexResolution))
		return;

	FHairAABB MacroGroupBound;
	FHairAABB GroupBound;
	if (bUseMacroGroupBoundCPU)
	{
		MacroGroupBound.Min = MacroGroupBoundCPU_TranslatedWorldMinAABB;
		MacroGroupBound.Max = MacroGroupBoundCPU_TranslatedWorldMaxAABB;

		// HAIR_TODO: Can we have a reliable primitive AABB to get tighter bounds?
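		// When the CPU-provided bound is used, the per-group GPU AABB is not available, so the macro group
		// bound also serves as the group bound below, and every page covering it gets marked as valid.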
		GroupBound.Min = MacroGroupBoundCPU_TranslatedWorldMinAABB;
		GroupBound.Max = MacroGroupBoundCPU_TranslatedWorldMaxAABB;
	}
	else
	{
		MacroGroupBound = ReadHairAABB(MacroGroupId, MacroGroupVoxelAlignedAABBBuffer);
		GroupBound = ReadHairAABB(InstanceRegisteredIndex, GroupAABBsBuffer);

		// Correct the View0 translated-world offset to the ViewX translated-world offset
		GroupBound.Min += TranslatedWorldOffsetCorrection;
		GroupBound.Max += TranslatedWorldOffsetCorrection;
	}

	const uint3 MinCoord = PositionToCoord(GroupBound.Min, MacroGroupBound.Min, MacroGroupBound.Max, PageIndexResolution);
	const uint3 MaxCoord = PositionToCoord(GroupBound.Max, MacroGroupBound.Min, MacroGroupBound.Max, PageIndexResolution);

	if (any(!IsFinite(float3(MinCoord)))) return;
	if (any(!IsFinite(float3(MaxCoord)))) return;

	if (all(Coord >= MinCoord) && all(Coord <= MaxCoord))
	{
		const uint3 PageIndexCoord = uint3(Coord.x, Coord.y, Coord.z);
		const uint LinearPageIndexCoord = CoordToIndex(PageIndexCoord, PageIndexResolution, PageIndexOffset);
		InterlockedOr(OutValidPageIndexBuffer[LinearPageIndexCoord], 1u);
	}
}
#endif // PERMUTATION_USE_CLUSTER

#endif // SHADER_MARKVALID_PREPARE

///////////////////////////////////////////////////////////////////////////////////////////////////

#if SHADER_ALLOCATE

uint MacroGroupId;
uint PageCount;
uint CPU_PageIndexCount;
uint CPU_PageIndexOffset;
uint3 CPU_PageIndexResolution;

Buffer<uint4> PageIndexResolutionAndOffsetBuffer;
Buffer<uint> IndirectBufferArgs;

RWBuffer<uint> RWPageIndexGlobalCounter;
RWBuffer<uint> RWPageIndexBuffer;
RWBuffer<uint> RWPageToPageIndexBuffer;
RWBuffer<uint4> RWPageIndexCoordBuffer;

groupshared uint LocalCounter;
groupshared uint GroupBase[2];

[numthreads(GROUP_SIZE, 1, 1)]
void AllocateCS(uint GroupIndex : SV_GroupIndex, uint3 DispatchThreadId : SV_DispatchThreadID)
{
	if (GroupIndex == 0)
	{
		GroupBase[0] = 0;
		GroupBase[1] = 0;
		LocalCounter = 0;
	}
	GroupMemoryBarrierWithGroupSync();

#if PERMUTATION_GPU_DRIVEN == 1
	const uint4 PageIndexResolutionAndOffset = PageIndexResolutionAndOffsetBuffer.Load(MacroGroupId);
	const uint3 PageIndexResolution = PageIndexResolutionAndOffset.xyz;
	const uint PageIndexOffset = PageIndexResolutionAndOffset.w;
	const uint PageIndexCount = PageIndexResolution.x * PageIndexResolution.y * PageIndexResolution.z;
#else
	const uint3 PageIndexResolution = CPU_PageIndexResolution;
	const uint PageIndexOffset = CPU_PageIndexOffset;
	const uint PageIndexCount = CPU_PageIndexCount;
#endif

	const uint GridIndex = DispatchThreadId.x + PageIndexOffset;
	bool bIsValid = false;
	if (DispatchThreadId.x < PageIndexCount)
	{
		bIsValid = RWPageIndexBuffer[GridIndex] > 0;
	}

	uint Offset = 0;
	if (bIsValid)
	{
		InterlockedAdd(LocalCounter, 1u, Offset);
	}
	GroupMemoryBarrierWithGroupSync();

	if (GroupIndex == 0)
	{
		// * Add the page count to the global counter for global tracking
		// * Add the page count to the group counter for per-group work
		InterlockedAdd(RWPageIndexGlobalCounter[GLOBAL_PAGE_COUNTER_INDEX], LocalCounter, GroupBase[0]);
		InterlockedAdd(RWPageIndexGlobalCounter[GROUPS_PAGE_COUNTER_INDEX + MacroGroupId], LocalCounter, GroupBase[1]);
	}
	GroupMemoryBarrierWithGroupSync();

	if (bIsValid)
	{
		const uint PageIndex0 = GroupBase[0] + Offset; // Global page index
		const uint PageIndex1 = GroupBase[1] + Offset; // Group page index
		const bool bIsAllocationValid = PageIndex0 < PageCount;

		RWPageIndexBuffer[GridIndex] = bIsAllocationValid ? PageIndex0 : INVALID_VOXEL_PAGE_INDEX;
		if (bIsAllocationValid)
		{
			RWPageToPageIndexBuffer[PageIndex0] = GridIndex;
		}

		// Output the coordinates of the allocated page for indirect dispatch usage.
		// If the allocation failed (we ran out of pages), mark the IndexCoord with an invalid GroupId.
		const uint LinearIndex = DispatchThreadId.x;
		const uint3 PageIndexCoord = IndexToCoord(LinearIndex, PageIndexResolution);
		RWPageIndexCoordBuffer[PageIndexOffset + PageIndex1] = uint4(PageIndexCoord, bIsAllocationValid ? MacroGroupId : INVALID_MACRO_GROUP_ID);
	}
	// Mark the page index as invalid.
	// Ensure that even if we write more than needed (due to a larger dispatch count), we do not stomp other instance groups' page indices.
	else if (DispatchThreadId.x < PageIndexCount)
	{
		RWPageIndexBuffer[GridIndex] = INVALID_VOXEL_PAGE_INDEX;
	}
}
#endif

///////////////////////////////////////////////////////////////////////////////////////////////////

#if SHADER_ADDDESC

float3 CPU_TranslatedWorldMinAABB;
uint MacroGroupId;
float3 CPU_TranslatedWorldMaxAABB;
uint CPU_PageIndexOffset;
int3 CPU_PageIndexResolution;
float CPU_VoxelWorldSize;
uint bUseCPUVoxelWorldSize; // When adaptive voxel size is disabled, we use the CPU voxel size value

Buffer<float> GPU_VoxelWorldSize;
Buffer<int> MacroGroupVoxelAlignedAABBBuffer;
Buffer<float> MacroGroupVoxelSizeBuffer;
Buffer<uint4> PageIndexResolutionAndOffsetBuffer;
RWStructuredBuffer<FPackedVirtualVoxelNodeDesc> OutNodeDescBuffer;

[numthreads(1, 1, 1)]
void AddDescCS(uint GroupIndex : SV_GroupIndex, uint3 DispatchThreadId : SV_DispatchThreadID)
{
	FVirtualVoxelNodeDesc Node;
#if PERMUTATION_GPU_DRIVEN == 1
	const uint4 PageIndexResolutionAndOffset = PageIndexResolutionAndOffsetBuffer.Load(MacroGroupId);
	const FHairAABB TranslatedWorldBound = ReadHairAABB(MacroGroupId, MacroGroupVoxelAlignedAABBBuffer);
	const float VoxelWorldSize = MacroGroupVoxelSizeBuffer[MacroGroupId];

	Node.TranslatedWorldMinAABB = TranslatedWorldBound.Min;
	Node.TranslatedWorldMaxAABB = TranslatedWorldBound.Max;
	Node.PageIndexResolution = PageIndexResolutionAndOffset.xyz;
	Node.PageIndexOffset = PageIndexResolutionAndOffset.w;
	Node.VoxelWorldSize = bUseCPUVoxelWorldSize ? CPU_VoxelWorldSize : max(GPU_VoxelWorldSize[0], VoxelWorldSize);
#else
	Node.TranslatedWorldMinAABB = CPU_TranslatedWorldMinAABB;
	Node.TranslatedWorldMaxAABB = CPU_TranslatedWorldMaxAABB;
	Node.PageIndexResolution = CPU_PageIndexResolution;
	Node.PageIndexOffset = CPU_PageIndexOffset;
	Node.VoxelWorldSize = CPU_VoxelWorldSize;
#endif

	FPackedVirtualVoxelNodeDesc PackedNode = PackVoxelNode(Node);
	OutNodeDescBuffer[MacroGroupId] = PackedNode;
}
#endif

///////////////////////////////////////////////////////////////////////////////////////////////////
// Prepare indirect buffer

#if SHADER_ADDINDIRECTBUFFER

uint PageResolution;
uint MacroGroupCount;
int3 IndirectGroupSize;
Buffer<uint> PageIndexGlobalCounter;
RWBuffer<uint> OutIndirectArgsBuffer;

void WriteArgs(uint WriteIndex, uint AllocatedPageCount)
{
	const uint VoxelCountPerPage = PageResolution * PageResolution * PageResolution;
	const uint DispatchCountX = DivideAndRoundUp(VoxelCountPerPage, IndirectGroupSize.x);
	const uint DispatchCountZ = DivideAndRoundUp(AllocatedPageCount, IndirectGroupSize.z);
	WriteDispatchIndirectArgs(OutIndirectArgsBuffer, WriteIndex, DispatchCountX, 1, DispatchCountZ);
}

[numthreads(GROUP_SIZE, 1, 1)]
void AddIndirectBufferCS(uint GroupIndex : SV_GroupIndex, uint3 DispatchThreadId : SV_DispatchThreadID)
{
	// Total pages allocated across *all* macro groups
	if (DispatchThreadId.x == 0)
	{
		WriteArgs(GLOBAL_PAGE_COUNTER_INDEX, PageIndexGlobalCounter[GLOBAL_PAGE_COUNTER_INDEX]);
	}

	// Pages allocated for a particular macro group
	const uint MacroGroupId = DispatchThreadId.x;
	if (MacroGroupId < MacroGroupCount)
	{
		WriteArgs(GROUPS_PAGE_COUNTER_INDEX + MacroGroupId, PageIndexGlobalCounter[GROUPS_PAGE_COUNTER_INDEX + MacroGroupId]);
	}
}
#endif

///////////////////////////////////////////////////////////////////////////////////////////////////
// Indirect clear

#if SHADER_INDPAGECLEAR

Buffer<uint> PageIndexGlobalCounter;

uint VirtualVoxelParams_PageResolution;
int3 VirtualVoxelParams_PageCountResolution;
Buffer<uint4> VirtualVoxelParams_PageIndexCoordBuffer;
RWTexture3D<uint> OutPageTexture;

[numthreads(GROUP_SIZE_X, 1, GROUP_SIZE_Z)]
void VoxelIndPageClearCS(uint3 DispatchThreadId : SV_DispatchThreadID, uint3 GroupId : SV_GroupID, uint3 GroupThreadId : SV_GroupThreadID)
{
	const uint TotalAllocatedPageCount = PageIndexGlobalCounter[GLOBAL_PAGE_COUNTER_INDEX];
	const uint VoxelCountPerPage = VirtualVoxelParams_PageResolution * VirtualVoxelParams_PageResolution * VirtualVoxelParams_PageResolution;

	const uint LinearVoxelCoord = DispatchThreadId.x;
	const uint AllocatedPageIndex = DispatchThreadId.z;
	if (AllocatedPageIndex < TotalAllocatedPageCount && LinearVoxelCoord < VoxelCountPerPage)
	{
		const uint3 VoxelCoordOffset = IndexToCoord(LinearVoxelCoord, VirtualVoxelParams_PageResolution.xxx);
		const uint PageIndex = AllocatedPageIndex;

		// PageIndexBuffer is not needed: we already know these tiles are allocated linearly in 3D within OutPageTexture.
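		// Illustrative mapping (assuming IndexToCoord uses the usual X-major linearization): with a hypothetical
		// PageCountResolution of (4, 4, 4), AllocatedPageIndex 9 maps to PageCoord (1, 2, 0), so this dispatch slice
		// clears the voxel range [PageCoord * PageResolution, PageCoord * PageResolution + PageResolution) of the page texture.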
		const uint3 PageCoord = IndexToCoord(PageIndex, VirtualVoxelParams_PageCountResolution);
		const int3 VoxelPageBase = PageCoord * VirtualVoxelParams_PageResolution;
		const int3 VoxelCoord = VoxelPageBase + VoxelCoordOffset;
		OutPageTexture[VoxelCoord] = 0;
	}
}
#endif

///////////////////////////////////////////////////////////////////////////////////////////////////

#if SHADER_ADAPTIVE_FEEDBACK

#define DEBUG_ENABLE 0

#if DEBUG_ENABLE
#include "../ShaderPrint.ush"
#endif

uint CPUAllocatedPageCount;
float CPUMinVoxelWorldSize;
float AdaptiveCorrectionThreshold;
float AdaptiveCorrectionSpeed;

Buffer<uint> PageIndexGlobalCounter;
Buffer<float> CurrGPUMinVoxelWorldSize;
RWBuffer<float> NextGPUMinVoxelWorldSize;

float RoundHairVoxelSize(float In)
{
	// Round the voxel size to 0.001f to avoid oscillation issues
	return floor(In * 1000.f + 0.5f) * 0.001f;
}

[numthreads(1, 1, 1)]
void FeedbackCS(uint3 DispatchThreadId : SV_DispatchThreadID)
{
	const float CurrVoxelWorldSize = RoundHairVoxelSize(CurrGPUMinVoxelWorldSize[0]);

	// Voxel pages represent a volume. To derive a better estimate of the ratio by which the voxel size
	// needs to be scaled, compute the cubic root of this ratio:
	//
	//  AllocatedPage     AllocatedRes^3
	//  -------------  =  --------------  =  VolumeRatio  =  LinearRatio^3
	//     MaxPage           MaxRes^3

	// Ratio used for predicting voxel size increase
	const uint GPUAllocatedPageCount = PageIndexGlobalCounter[GLOBAL_PAGE_COUNTER_INDEX];
	const float VolumeRatio = float(GPUAllocatedPageCount) / float(CPUAllocatedPageCount);
	const float LinearRatio = pow(VolumeRatio, 1.f / 3.f);

	// Ratio used for predicting voxel size decrease (i.e. when the requested allocation fits,
	// but the voxel size does not match the (more precise) target).
	// In this case, we add a threshold/margin to the target, so that there is no oscillation.
	const float VolumeRatio_Thres = float(GPUAllocatedPageCount) / float(CPUAllocatedPageCount * AdaptiveCorrectionThreshold);
	const float LinearRatio_Thres = pow(max(VolumeRatio_Thres, 0.f), 1.f / 3.f);

	// If the page pool is not large enough, increase the voxel size
	float NextVoxelWorldSize = CPUMinVoxelWorldSize;
	if (GPUAllocatedPageCount > CPUAllocatedPageCount)
	{
		//NextVoxelWorldSize = CurrVoxelWorldSize * LinearRatio;
		NextVoxelWorldSize = CurrVoxelWorldSize * LinearRatio_Thres;
	}
	// If the page pool is large enough but the voxels are larger than the requested size, decrease the voxel size
	else if (GPUAllocatedPageCount < CPUAllocatedPageCount && CurrVoxelWorldSize > CPUMinVoxelWorldSize)
	{
		const float TargetVoxelWorldSize = CurrVoxelWorldSize * LinearRatio_Thres;
		NextVoxelWorldSize = max(CPUMinVoxelWorldSize, lerp(CurrVoxelWorldSize, TargetVoxelWorldSize, AdaptiveCorrectionSpeed));
	}
	//else if (GPUAllocatedPageCount > CPUAllocatedPageCount * AdaptiveCorrectionThreshold)
	//{
	//	const float TargetVoxelWorldSize = CurrVoxelWorldSize * LinearRatio_Thres;
	//	NextVoxelWorldSize = max(CPUMinVoxelWorldSize, lerp(CurrVoxelWorldSize, TargetVoxelWorldSize, AdaptiveCorrectionSpeed));
	//}
	else
	{
		NextVoxelWorldSize = CPUMinVoxelWorldSize;
	}

	// Clamp the voxel size to a reasonable range (e.g. 0.1mm - 100mm)
	const float ClampMinVoxelWorldSize = 0.01f;
	const float ClampMaxVoxelWorldSize = 10.0f;
	NextVoxelWorldSize = clamp(RoundHairVoxelSize(NextVoxelWorldSize), ClampMinVoxelWorldSize, ClampMaxVoxelWorldSize);

	// Debug
#if DEBUG_ENABLE
	FFontColor CPUColor = FontEmerald;
	FFontColor GPUColor = FontOrange;
	FFontColor CstColor = FontSilver;

	FShaderPrintContext Context = InitShaderPrintContext(true, uint2(700, 50));
	Print(Context, TEXT(" ------------------------------- "), FontSilver); Newline(Context);
	Print(Context, TEXT("| Allocations |"), FontSilver); Newline(Context);
	Print(Context, TEXT(" ------------------------------- "), FontSilver); Newline(Context);
	Print(Context, TEXT("GPU Allocated "), GPUColor); Print(Context, GPUAllocatedPageCount, GPUColor); Newline(Context);
	Print(Context, TEXT("CPU Allocated "), CPUColor); Print(Context, CPUAllocatedPageCount, CPUColor); Newline(Context);
	Print(Context, TEXT("GPU Curr Min. Size "), GPUColor); Print(Context, CurrVoxelWorldSize, GPUColor); Newline(Context);
	Print(Context, TEXT("GPU Next Min. Size "), GPUColor); Print(Context, NextVoxelWorldSize, GPUColor); Newline(Context);
	Print(Context, TEXT("CPU Min. Size "), CPUColor); Print(Context, CPUMinVoxelWorldSize, CPUColor); Newline(Context);
	Print(Context, TEXT("Correction Thres. "), CstColor); Print(Context, AdaptiveCorrectionThreshold, CstColor); Newline(Context);
	Print(Context, TEXT("Correction Speed "), CstColor); Print(Context, AdaptiveCorrectionSpeed, CstColor); Newline(Context);
#endif

	// Update state data
	NextGPUMinVoxelWorldSize[0] = RoundHairVoxelSize(NextVoxelWorldSize);
}
#endif // SHADER_ADAPTIVE_FEEDBACK

///////////////////////////////////////////////////////////////////////////
// Voxel Raster Compute

#if SHADER_RASTERCOMPUTE

uint MaxRasterCount;
uint FrameIdMod8;
uint MacroGroupId;
uint VertexCount;

uint VirtualVoxelParams_PageIndexCount;
uint VirtualVoxelParams_PageResolution;
uint3 VirtualVoxelParams_PageCountResolution;
uint3 VirtualVoxelParams_PageTextureResolution;
Buffer<uint> VirtualVoxelParams_PageIndexBuffer;
StructuredBuffer<FPackedVirtualVoxelNodeDesc> VirtualVoxelParams_NodeDescBuffer;

RWTexture3D<uint> OutPageTexture;

float CoverageScale;

#define JITTER_ENABLE 0

float3 GetHairVoxelJitter(uint2 PixelCoord, uint Seed)
{
	return float3(
		InterleavedGradientNoise(PixelCoord.xy, Seed),
		InterleavedGradientNoise(PixelCoord.xy, Seed * 117),
		InterleavedGradientNoise(PixelCoord.xy, Seed * 7901));
}

[numthreads(GROUP_SIZE, 1, 1)]
void MainCS(uint2 DispatchThreadID : SV_DispatchThreadID)
{
	uint VertexIndex0 = DispatchThreadID.x;
	uint VertexIndex1 = VertexIndex0 + 1;

	bool bIsValid = VertexIndex0 < VertexCount && VertexIndex1 < VertexCount;
	if (!bIsValid)
		return;

#if PERMUTATION_CULLING == 1
	if (HairStrandsVF_bCullingEnable)
	{
		const uint VertexCountAfterCulling = HairStrandsVF_CullingIndirectBuffer[3];
		uint FetchIndex0 = VertexIndex0;
		uint FetchIndex1 = VertexIndex1;
		bIsValid = FetchIndex0 < VertexCountAfterCulling && FetchIndex1 < VertexCountAfterCulling;
		if (!bIsValid)
		{
			return;
		}
		FetchIndex1 = min(FetchIndex0 + 1, VertexCountAfterCulling - 1);

		VertexIndex0 = HairStrandsVF_CullingIndexBuffer[FetchIndex0];
		VertexIndex1 = HairStrandsVF_CullingIndexBuffer[FetchIndex1];
	}
#endif

	const float3 PositionOffset = HairStrandsVF_GetHairInstancePositionOffset();
	const FHairControlPoint CP0 = ReadHairControlPoint(
		HairStrandsVF_PositionBuffer,
		VertexIndex0,
		PositionOffset,
		HairStrandsVF_Radius,
		HairStrandsVF_RootScale,
		HairStrandsVF_TipScale);

	if (CP0.Type == HAIR_CONTROLPOINT_END)
		return;

	const FHairControlPoint CP1 = ReadHairControlPoint(
		HairStrandsVF_PositionBuffer,
		VertexIndex1,
		PositionOffset,
		HairStrandsVF_Radius,
		HairStrandsVF_RootScale,
		HairStrandsVF_TipScale);
	const FVirtualVoxelNodeDesc NodeDesc = UnpackVoxelNode(VirtualVoxelParams_NodeDescBuffer[MacroGroupId], VirtualVoxelParams_PageResolution);

	const float DiameterToRadius = 0.5f;
	const float3 TranslatedWP0 = mul(float4(CP0.Position, 1), HairStrandsVF_LocalToTranslatedWorldPrimitiveTransform).xyz;
	const float HairCoverage0 = CP0.WorldRadius / max(CP0.WorldRadius, DiameterToRadius * NodeDesc.VoxelWorldSize);

	const float3 TranslatedWP1 = mul(float4(CP1.Position, 1), HairStrandsVF_LocalToTranslatedWorldPrimitiveTransform).xyz;
	const float HairCoverage1 = CP1.WorldRadius / max(CP1.WorldRadius, DiameterToRadius * NodeDesc.VoxelWorldSize);

	// In order to reduce aliasing, we increase the number of steps. This makes the result more comparable to the raster pass.
	const float LineStepMultiplier = 1.5f;
	const float ScaledVoxelWorldSize = NodeDesc.VoxelWorldSize / LineStepMultiplier;

	FVirtualVoxelCommonDesc CommonDesc;
	CommonDesc.PageCountResolution = VirtualVoxelParams_PageCountResolution;
	CommonDesc.PageTextureResolution = VirtualVoxelParams_PageTextureResolution;
	CommonDesc.PageResolution = VirtualVoxelParams_PageResolution;
	CommonDesc.PageResolutionLog2 = VirtualVoxel.PageResolutionLog2;

	// Count the number of fibers which are within a cylinder defined by the voxel size,
	// and the distance between the origin and the extent of the volume.
	// This assumes that the voxel volume is cubic (i.e. equal dimensions on all sides).
	const float3 LineSegment = TranslatedWP1 - TranslatedWP0;
	const float LineLength = length(LineSegment);
	const float3 StepD = normalize(LineSegment) * ScaledVoxelWorldSize; // Step according to the voxel size

	int3 CurrentPageIndexCoord = -1;
	bool bIsPageValid = false;
	uint3 PageCoord = 0;

	// If we have long segments we could break them into batches (e.g. 8 voxels long), queue them, and dispatch them indirectly.
	// This would make the workload more uniform/coherent. Currently, breaking into smaller batches does not seem to save a lot of cost.
#if JITTER_ENABLE
	const float3 Jitter = GetHairVoxelJitter(frac(CP0.Position.xy), FrameIdMod8, VirtualVoxel.JitterMode) * 2 - 1;
#else
	const float3 Jitter = 0;
#endif

	const float fMaxStep = LineLength / ScaledVoxelWorldSize;
	const float MaxStep = float(min(ceil(fMaxStep), MaxRasterCount));
	int3 PreviousCoord = -1;
	for (float StepIt = 0.0f; StepIt < MaxStep; ++StepIt)
	{
		const float U = (StepIt + 0.5f) / float(MaxStep);
		const float Radius = lerp(CP0.WorldRadius, CP1.WorldRadius, U);
		const float3 HitP = TranslatedWP0 + StepIt * StepD + Jitter * Radius;
		const int3 VolumeCoord = clamp((HitP - NodeDesc.TranslatedWorldMinAABB) / NodeDesc.VoxelWorldSize, 0, NodeDesc.VirtualResolution - 1);
		const int3 PageIndexCoord = VolumeCoord / CommonDesc.PageResolution;

		// Update the page index only when needed
		const bool bHasPageIndexChanged = any(PageIndexCoord != CurrentPageIndexCoord);
		if (bHasPageIndexChanged)
		{
			CurrentPageIndexCoord = PageIndexCoord;
			const uint LinearPageIndexCoord = CoordToIndex(PageIndexCoord, NodeDesc.PageIndexResolution, NodeDesc.PageIndexOffset);
			const uint PageIndex = VirtualVoxelParams_PageIndexBuffer.Load(LinearPageIndexCoord);
			bIsPageValid = PageIndex != INVALID_VOXEL_PAGE_INDEX;
			if (bIsPageValid)
			{
				PageCoord = IndexToCoord(PageIndex, CommonDesc.PageCountResolution);
			}
		}

		if (bIsPageValid)
		{
			const int3 VoxelPageBase = PageCoord * CommonDesc.PageResolution;
			const int3 VoxelPageOffset = VolumeCoord - PageIndexCoord * CommonDesc.PageResolution;
			const int3 VoxelPageCoord = VoxelPageBase + VoxelPageOffset;

			// Ensure we don't write multiple times into the same voxel.
			// This can happen for small hair segments, where both the start & end points can be written into the same voxel.
			const bool bRasterize = any(VoxelPageCoord != PreviousCoord);
			if (bRasterize)
			{
				const float VoxelFixPointScale = GetVoxelDensityFixPointScale();
				const float HairCoverage = lerp(HairCoverage0, HairCoverage1, U);
				uint RawData = HairCoverage * VoxelFixPointScale * HairStrandsVF_Density * CoverageScale;
				InterlockedAdd(OutPageTexture[VoxelPageCoord], RawData);

				// Grooms with raytraced geometry cast shadows onto opaque geometry with their RT geometry, not with their
				// voxelization. To avoid double/incorrect shadowing, we mark these voxels with the no-shadow-casting flag.
				// This adds a significant cost when used.
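				// (See ComputeMipDensity below: the flag is only propagated to a coarser mip when all non-empty child voxels carry it.)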
				if (HasHairFlags(HairStrandsVF_Flags, HAIR_FLAGS_RAYTRACING_GEOMETRY))
				{
					InterlockedOr(OutPageTexture[VoxelPageCoord], VOXEL_CAST_NO_SHADOW_MASK);
				}
			}
			PreviousCoord = VoxelPageCoord;
		}
	}
}
#endif // SHADER_RASTERCOMPUTE

///////////////////////////////////////////////////////////////////////////
// Inject opaque surface into voxels

#if SHADER_INJECTOPAQUE_VIRTUALVOXEL

uint MacroGroupId;
float2 SceneDepthResolution;
uint VoxelBiasCount;
uint VoxelMarkCount;
RWTexture3D<uint> OutPageTexture;

uint VirtualVoxelParams_PageCount;
uint VirtualVoxelParams_PageIndexCount;
uint VirtualVoxelParams_PageResolution;
uint3 VirtualVoxelParams_PageCountResolution;
Buffer<uint> VirtualVoxelParams_PageIndexBuffer;
Buffer<uint> VirtualVoxelParams_AllocatedPageCountBuffer;
Buffer<uint4> VirtualVoxelParams_PageIndexCoordBuffer;
StructuredBuffer<FPackedVirtualVoxelNodeDesc> VirtualVoxelParams_NodeDescBuffer;

[numthreads(GROUP_SIZE_X, 1, GROUP_SIZE_Z)]
void MainCS(uint3 DispatchThreadId : SV_DispatchThreadID)
{
	const uint LinearVoxelCoord = DispatchThreadId.x;
	const uint AllocatedPageIndex = DispatchThreadId.z;
	const uint VoxelCountPerPage = VirtualVoxelParams_PageResolution.x * VirtualVoxelParams_PageResolution.x * VirtualVoxelParams_PageResolution.x;
	const bool bValidVoxel =
		AllocatedPageIndex < VirtualVoxelParams_AllocatedPageCountBuffer[0] &&
		LinearVoxelCoord < VirtualVoxelParams_PageCount * VoxelCountPerPage;
	if (!bValidVoxel)
	{
		return;
	}

	uint3 VoxelCoordOffset = IndexToCoord(LinearVoxelCoord, VirtualVoxelParams_PageResolution.xxx);
	VoxelCoordOffset.y = VoxelCoordOffset.y % VirtualVoxelParams_PageResolution.x;
	VoxelCoordOffset.z = VoxelCoordOffset.z % VirtualVoxelParams_PageResolution.x;

	const FVirtualVoxelNodeDesc NodeDesc = UnpackVoxelNode(VirtualVoxelParams_NodeDescBuffer[MacroGroupId], VirtualVoxelParams_PageResolution);

	const uint4 PageIndexCoord = VirtualVoxelParams_PageIndexCoordBuffer[NodeDesc.PageIndexOffset + AllocatedPageIndex];
	const uint LinearPageIndexCoord = CoordToIndex(PageIndexCoord.xyz, NodeDesc.PageIndexResolution, NodeDesc.PageIndexOffset);

	// PageIndexCoord has an invalid .w component if we ran out of available pages during allocation
	const bool bIsValid = LinearPageIndexCoord < VirtualVoxelParams_PageIndexCount && PageIndexCoord.w != INVALID_MACRO_GROUP_ID;
	if (bIsValid)
	{
		const uint PageIndex = VirtualVoxelParams_PageIndexBuffer.Load(LinearPageIndexCoord);
		if (PageIndex != INVALID_VOXEL_PAGE_INDEX)
		{
			const uint3 VoxelCoordBase = PageIndexCoord.xyz * VirtualVoxelParams_PageResolution;
			const uint3 VoxelCoord = VoxelCoordBase + VoxelCoordOffset;
			const float3 TranslatedWorldPosition = VoxelCoord * NodeDesc.VoxelWorldSize + NodeDesc.TranslatedWorldMinAABB;

			float4 ClipPos = mul(float4(TranslatedWorldPosition, 1), PrimaryView.TranslatedWorldToClip);
			ClipPos /= ClipPos.w;

			const float DepthBias = VoxelBiasCount * NodeDesc.VoxelWorldSize;
			const float VoxelDepth = ConvertFromDeviceZ(ClipPos.z) - DepthBias;

			float2 SceneUV = float2(0.5f * (ClipPos.x + 1), 1 - 0.5f * (ClipPos.y + 1));
			SceneUV = ViewportUVToBufferUV(SceneUV);
			const float2 ScenePixelCoord = SceneUV * SceneDepthResolution;

			const bool bIsOnScreen = SceneUV.x >= 0 && SceneUV.x < 1 && SceneUV.y >= 0 && SceneUV.y < 1;
			if (!bIsOnScreen)
				return;

			const float ClosestDepth = ConvertFromDeviceZ(SceneDepthTexture.Load(uint3(ScenePixelCoord, 0)).x);
			const float3 SceneTranslatedWorldPos = ReconstructTranslatedWorldPositionFromDepth(SceneUV, ClosestDepth);

			const bool bIsInVolume =
				SceneTranslatedWorldPos.x >= NodeDesc.TranslatedWorldMinAABB.x && SceneTranslatedWorldPos.x < NodeDesc.TranslatedWorldMaxAABB.x &&
				SceneTranslatedWorldPos.y >= NodeDesc.TranslatedWorldMinAABB.y && SceneTranslatedWorldPos.y < NodeDesc.TranslatedWorldMaxAABB.y &&
				SceneTranslatedWorldPos.z >= NodeDesc.TranslatedWorldMinAABB.z && SceneTranslatedWorldPos.z < NodeDesc.TranslatedWorldMaxAABB.z;
			if (!bIsInVolume)
				return;

			// Inject opaque depth on a thin layer (Dist < DistThreshold) to avoid incorrect projection
			if (ClosestDepth < VoxelDepth && abs(ClosestDepth - VoxelDepth) < VoxelMarkCount * NodeDesc.VoxelWorldSize)
			{
				const uint3 VoxelPageIndexCoord = VoxelCoord / VirtualVoxelParams_PageResolution;
				const uint3 VoxelIndexCoordBase = VoxelPageIndexCoord * VirtualVoxelParams_PageResolution;
				const uint3 VoxelPageOffset = VoxelCoord - VoxelIndexCoordBase;
				const uint3 PageCoord = IndexToCoord(PageIndex, VirtualVoxelParams_PageCountResolution);
				const int3 VoxelPageBase = PageCoord * VirtualVoxelParams_PageResolution;
				const int3 VoxelPageCoord = VoxelPageOffset + VoxelPageBase;
				InterlockedOr(OutPageTexture[VoxelPageCoord], VOXEL_OPAQUE_ADD);
			}
		}
	}
}
#endif // SHADER_INJECTOPAQUE_VIRTUALVOXEL

///////////////////////////////////////////////////////////////////////////

#if SHADER_DEPTH_INJECTION

float2 OutputResolution;
uint MacroGroupId;
uint AtlasSlotIndex;
float3 LightDirection;
uint bIsDirectional;
float3 TranslatedLightPosition;

StructuredBuffer<FDeepShadowViewInfo> DeepShadowViewInfoBuffer;

void MainVS(
	uint VertexId : SV_VertexID,
	out float4 OutPosition : SV_POSITION,
	out float3 OutTranslatedWorldPosition : WORLD_POSITION)
{
	const FPackedVirtualVoxelNodeDesc PackedNode = VirtualVoxel.NodeDescBuffer[MacroGroupId];
	const FVirtualVoxelNodeDesc NodeDesc = UnpackVoxelNode(PackedNode, VirtualVoxel.PageResolution);

	// Move this to an actual vertex/index buffer
	const float3 Min = NodeDesc.TranslatedWorldMinAABB;
	const float3 Max = NodeDesc.TranslatedWorldMaxAABB;
	const float3 Center = (Min + Max) * 0.5f;
	const float3 Extent = (Max - Min) * 0.5f;

	const float3 Position0 = Center + float3(-Extent.x, -Extent.y, -Extent.z);
	const float3 Position1 = Center + float3(+Extent.x, -Extent.y, -Extent.z);
	const float3 Position2 = Center + float3(+Extent.x, +Extent.y, -Extent.z);
	const float3 Position3 = Center + float3(-Extent.x, +Extent.y, -Extent.z);
	const float3 Position4 = Center + float3(-Extent.x, -Extent.y, +Extent.z);
	const float3 Position5 = Center + float3(+Extent.x, -Extent.y, +Extent.z);
	const float3 Position6 = Center + float3(+Extent.x, +Extent.y, +Extent.z);
	const float3 Position7 = Center + float3(-Extent.x, +Extent.y, +Extent.z);

	float3 TranslatedWorldPosition = 0;
	switch (VertexId)
	{
	case  0: TranslatedWorldPosition = Position0; break;
	case  1: TranslatedWorldPosition = Position1; break;
	case  2: TranslatedWorldPosition = Position2; break;
	case  3: TranslatedWorldPosition = Position0; break;
	case  4: TranslatedWorldPosition = Position2; break;
	case  5: TranslatedWorldPosition = Position3; break;
	case  6: TranslatedWorldPosition = Position4; break;
	case  7: TranslatedWorldPosition = Position5; break;
	case  8: TranslatedWorldPosition = Position6; break;
	case  9: TranslatedWorldPosition = Position4; break;
	case 10: TranslatedWorldPosition = Position6; break;
	case 11: TranslatedWorldPosition = Position7; break;
	case 12: TranslatedWorldPosition = Position0; break;
	case 13: TranslatedWorldPosition = Position1; break;
	case 14: TranslatedWorldPosition = Position5; break;
	case 15: TranslatedWorldPosition = Position0; break;
	case 16: TranslatedWorldPosition = Position5; break;
	case 17: TranslatedWorldPosition = Position4; break;
	case 18: TranslatedWorldPosition = Position2; break;
	case 19: TranslatedWorldPosition = Position3; break;
	case 20: TranslatedWorldPosition = Position7; break;
	case 21: TranslatedWorldPosition = Position2; break;
	case 22: TranslatedWorldPosition = Position7; break;
	case 23: TranslatedWorldPosition = Position6; break;
	case 24: TranslatedWorldPosition = Position1; break;
	case 25: TranslatedWorldPosition = Position2; break;
	case 26: TranslatedWorldPosition = Position6; break;
	case 27: TranslatedWorldPosition = Position1; break;
	case 28: TranslatedWorldPosition = Position6; break;
	case 29: TranslatedWorldPosition = Position5; break;
	case 30: TranslatedWorldPosition = Position3; break;
	case 31: TranslatedWorldPosition = Position0; break;
	case 32: TranslatedWorldPosition = Position4; break;
	case 33: TranslatedWorldPosition = Position3; break;
	case 34: TranslatedWorldPosition = Position4; break;
	case 35: TranslatedWorldPosition = Position7; break;
	}

	const FDeepShadowViewInfo DeepShadowViewInfo = DeepShadowViewInfoBuffer[AtlasSlotIndex];
	const float4x4 TranslatedWorldToClipMatrix = DeepShadowViewInfo.TranslatedWorldToClipScaledBiased;

	OutTranslatedWorldPosition = TranslatedWorldPosition;
	OutPosition = mul(float4(TranslatedWorldPosition, 1), TranslatedWorldToClipMatrix);
}

//#define VOXEL_TRAVERSAL_TYPE VOXEL_TRAVERSAL_LINEAR_MIPMAP
#define VOXEL_TRAVERSAL_TYPE VOXEL_TRAVERSAL_LINEAR
#include "HairStrandsVoxelPageTraversal.ush"

void MainPS(
	in float4 InPosition : SV_POSITION,
	in float3 InTranslatedWorldPosition : WORLD_POSITION,
	out float OutDepth : SV_DEPTH)
{
	OutDepth = 0;

	const float2 PixelCoord = InPosition.xy;
	const float2 UV = PixelCoord / float2(OutputResolution); // todo view rect offset

	const float DistanceThreshold = 1000;
	const bool bDebugEnabled = false;
	const float3 SampleRandom = GetHairVoxelJitter(PixelCoord, View.StateFrameIndexMod8, VirtualVoxel.JitterMode);

	const float3 TracingDirection = bIsDirectional ? LightDirection : normalize(InTranslatedWorldPosition - TranslatedLightPosition);
	const float3 TranslatedWP0 = InTranslatedWorldPosition;
	const float3 TranslatedWP1 = InTranslatedWorldPosition + TracingDirection * DistanceThreshold;

	FVirtualVoxelCommonDesc CommonDesc;
	CommonDesc.PageCountResolution = VirtualVoxel.PageCountResolution;
	CommonDesc.PageTextureResolution = VirtualVoxel.PageTextureResolution;
	CommonDesc.PageResolution = VirtualVoxel.PageResolution;
	CommonDesc.PageResolutionLog2 = VirtualVoxel.PageResolutionLog2;

	const FPackedVirtualVoxelNodeDesc PackedNode = VirtualVoxel.NodeDescBuffer[MacroGroupId];
	const FVirtualVoxelNodeDesc NodeDesc = UnpackVoxelNode(PackedNode, VirtualVoxel.PageResolution);

	FHairTraversalSettings TraversalSettings = InitHairTraversalSettings();
	TraversalSettings.DensityScale = VirtualVoxel.DensityScale;
	TraversalSettings.CountThreshold = 0.9f; // GetOpaqueVoxelValue();
	TraversalSettings.DistanceThreshold = DistanceThreshold;
	TraversalSettings.bDebugEnabled = bDebugEnabled;
	TraversalSettings.SteppingScale = VirtualVoxel.SteppingScale_Shadow;
	TraversalSettings.Random = SampleRandom;
	TraversalSettings.TanConeAngle = 0;
	TraversalSettings.bIsPrimaryRay = true;
	TraversalSettings.bUseOpaqueVisibility = true;
	TraversalSettings.PixelRadius = -1;
	TraversalSettings.ForcedMip = -1;

	const FHairTraversalResult TraversalResult = ComputeHairCountVirtualVoxel(
		TranslatedWP0,
		TranslatedWP1,
		CommonDesc,
		NodeDesc,
		VirtualVoxel.PageIndexBuffer,
		VirtualVoxel.PageTexture,
		TraversalSettings);

	bool bIsValid = TraversalResult.HairCount > 0;
	if (bIsValid)
	{
		const FDeepShadowViewInfo DeepShadowViewInfo = DeepShadowViewInfoBuffer[AtlasSlotIndex];
		const float4x4 TranslatedWorldToClipMatrix = DeepShadowViewInfo.TranslatedWorldToClipScaledBiased;

		const float3 HitP = TranslatedWP0 + normalize(TranslatedWP1 - TranslatedWP0) * TraversalResult.HitT;
		float4 ClipP = mul(float4(HitP, 1), TranslatedWorldToClipMatrix);
		OutDepth = ClipP.z /= ClipP.w;
	}
	else
	{
		discard;
	}
}
#endif // SHADER_DEPTH_INJECTION

///////////////////////////////////////////////////////////////////////////
// Common function for mipmapping voxels

#if SHADER_MIP_VIRTUALVOXEL || SHADER_MIP_INDIRECTARGS

uint ComputeMipDensity(
	const uint RawDensity0,
	const uint RawDensity1,
	const uint RawDensity2,
	const uint RawDensity3,
	const uint RawDensity4,
	const uint RawDensity5,
	const uint RawDensity6,
	const uint RawDensity7)
{
	const float TotalOpaque =
		((RawDensity0 & VOXEL_OPAQUE_MASK) >> VOXEL_OPAQUE_SHIFT) +
		((RawDensity1 & VOXEL_OPAQUE_MASK) >> VOXEL_OPAQUE_SHIFT) +
		((RawDensity2 & VOXEL_OPAQUE_MASK) >> VOXEL_OPAQUE_SHIFT) +
		((RawDensity3 & VOXEL_OPAQUE_MASK) >> VOXEL_OPAQUE_SHIFT) +
		((RawDensity4 & VOXEL_OPAQUE_MASK) >> VOXEL_OPAQUE_SHIFT) +
		((RawDensity5 & VOXEL_OPAQUE_MASK) >> VOXEL_OPAQUE_SHIFT) +
		((RawDensity6 & VOXEL_OPAQUE_MASK) >> VOXEL_OPAQUE_SHIFT) +
		((RawDensity7 & VOXEL_OPAQUE_MASK) >> VOXEL_OPAQUE_SHIFT);
	const uint OutTotalOpaque = uint(clamp(TotalOpaque / 8.f, TotalOpaque > 0 ? 1 : 0, 0x7F)) << VOXEL_OPAQUE_SHIFT;
	// Propagate the no-shadow-casting flag only if all valid/non-empty voxels are also non-shadow-casting
	const bool bHasShadowCaster =
		((RawDensity0 & VOXEL_HAIR_MASK) > 0 && (RawDensity0 & VOXEL_CAST_NO_SHADOW_MASK) == 0) ||
		((RawDensity1 & VOXEL_HAIR_MASK) > 0 && (RawDensity1 & VOXEL_CAST_NO_SHADOW_MASK) == 0) ||
		((RawDensity2 & VOXEL_HAIR_MASK) > 0 && (RawDensity2 & VOXEL_CAST_NO_SHADOW_MASK) == 0) ||
		((RawDensity3 & VOXEL_HAIR_MASK) > 0 && (RawDensity3 & VOXEL_CAST_NO_SHADOW_MASK) == 0) ||
		((RawDensity4 & VOXEL_HAIR_MASK) > 0 && (RawDensity4 & VOXEL_CAST_NO_SHADOW_MASK) == 0) ||
		((RawDensity5 & VOXEL_HAIR_MASK) > 0 && (RawDensity5 & VOXEL_CAST_NO_SHADOW_MASK) == 0) ||
		((RawDensity6 & VOXEL_HAIR_MASK) > 0 && (RawDensity6 & VOXEL_CAST_NO_SHADOW_MASK) == 0) ||
		((RawDensity7 & VOXEL_HAIR_MASK) > 0 && (RawDensity7 & VOXEL_CAST_NO_SHADOW_MASK) == 0);

	uint TotalHair =
		(RawDensity0 & VOXEL_HAIR_MASK) +
		(RawDensity1 & VOXEL_HAIR_MASK) +
		(RawDensity2 & VOXEL_HAIR_MASK) +
		(RawDensity3 & VOXEL_HAIR_MASK) +
		(RawDensity4 & VOXEL_HAIR_MASK) +
		(RawDensity5 & VOXEL_HAIR_MASK) +
		(RawDensity6 & VOXEL_HAIR_MASK) +
		(RawDensity7 & VOXEL_HAIR_MASK);
	const bool bHasData = TotalHair > 0;
	TotalHair /= 8;

	// Ensure that if a voxel contains some hair data, its total hair remains > 0 after averaging.
	// This is important for pruning invalid/empty pages later on, so that non-empty pages are not removed
	// (which could otherwise happen due to numerical precision).
	return min(uint(VOXEL_HAIR_MASK), bHasData ? max(TotalHair, 1u) : 0u) | OutTotalOpaque | (bHasShadowCaster ? 0u : VOXEL_CAST_NO_SHADOW_MASK);
}
#endif // SHADER_MIP_VIRTUALVOXEL || SHADER_MIP_INDIRECTARGS

///////////////////////////////////////////////////////////////////////////

#if SHADER_MIP_VIRTUALVOXEL

#include "../MortonCode.ush"

uint bPatchEmptyPage;
int3 PageCountResolution;
uint PageResolution;
uint SourceMip;
uint TargetMip;

Buffer<uint> AllocatedPageCountBuffer;
Texture3D<uint> InDensityTexture;
RWTexture3D<uint> OutDensityTexture;

uint MortonEncode3(uint3 Voxel)
{
	return MortonCode3(Voxel.x) | MortonCode3(Voxel.y) << 1 | MortonCode3(Voxel.z) << 2;
}

uint3 MortonDecode3(uint Morton)
{
	uint3 Voxel = uint3(ReverseMortonCode3(Morton), ReverseMortonCode3(Morton >> 1), ReverseMortonCode3(Morton >> 2));
	return Voxel;
}

#if PERMUTATION_MIP_AGGREGATE
Buffer<uint> PageToPageIndexBuffer;
RWBuffer<uint> OutPageIndexBuffer;
RWTexture3D<uint> OutDensityTexture2;
RWTexture3D<uint> OutDensityTexture1;

groupshared uint g_Density4[64][GROUP_SIZE_Z];
groupshared uint g_Density2[8][GROUP_SIZE_Z];
#endif

// GroupSize is [64u, 1u, 16u]
[numthreads(GROUP_SIZE_X, 1, GROUP_SIZE_Z)]
void MainCS(uint3 DispatchThreadId : SV_DispatchThreadID, uint3 GroupThreadId : SV_GroupThreadID)
{
	const uint PageIndex = DispatchThreadId.z;
	const uint3 PageCoord = IndexToCoord(PageIndex, PageCountResolution);
	const bool bValid = PageIndex < AllocatedPageCountBuffer[0];

	// Dummy initialization to please the compiler
#if PERMUTATION_MIP_AGGREGATE && COMPILER_FXC
	if (GroupThreadId.x < 8 && GroupThreadId.z < GROUP_SIZE_Z)
	{
		g_Density2[GroupThreadId.x][GroupThreadId.z] = 0;
	}
#endif

	uint Total = 0;
	if (bValid)
	{
		const uint InPageResolution = PageResolution >> SourceMip;
		const uint OutPageResolution = PageResolution >> TargetMip;
		const uint TotalVoxelPerOutPageResolution = OutPageResolution * OutPageResolution * OutPageResolution;
		if (DispatchThreadId.x < TotalVoxelPerOutPageResolution)
		{
			const uint VoxelIndex = DispatchThreadId.x;
			const uint3 OutVoxelCoordOffset = MortonDecode3(VoxelIndex);
			const uint3 InVoxelCoordOffset = OutVoxelCoordOffset << 1;
			const uint3 OutVoxelCoord = PageCoord * OutPageResolution + OutVoxelCoordOffset;
			const uint3 InVoxelCoord = PageCoord * InPageResolution + InVoxelCoordOffset;

			const uint3 InVoxelCoord0 = InVoxelCoord;
			const uint3 InVoxelCoord1 = InVoxelCoord0 + uint3(1, 0, 0);
			const uint3 InVoxelCoord2 = InVoxelCoord0 + uint3(0, 1, 0);
			const uint3 InVoxelCoord3 = InVoxelCoord0 + uint3(1, 1, 0);
			const uint3 InVoxelCoord4 = InVoxelCoord0 + uint3(0, 0, 1);
			const uint3 InVoxelCoord5 = InVoxelCoord0 + uint3(1, 0, 1);
			const uint3 InVoxelCoord6 = InVoxelCoord0 + uint3(0, 1, 1);
			const uint3 InVoxelCoord7 = InVoxelCoord0 + uint3(1, 1, 1);

			const uint RawDensity0 = InDensityTexture[InVoxelCoord0];
			const uint RawDensity1 = InDensityTexture[InVoxelCoord1];
			const uint RawDensity2 = InDensityTexture[InVoxelCoord2];
			const uint RawDensity3 = InDensityTexture[InVoxelCoord3];
			const uint RawDensity4 = InDensityTexture[InVoxelCoord4];
			const uint RawDensity5 = InDensityTexture[InVoxelCoord5];
			const uint RawDensity6 = InDensityTexture[InVoxelCoord6];
			const uint RawDensity7 = InDensityTexture[InVoxelCoord7];

			Total = ComputeMipDensity(
				RawDensity0,
				RawDensity1,
				RawDensity2,
				RawDensity3,
				RawDensity4,
				RawDensity5,
				RawDensity6,
				RawDensity7);

			OutDensityTexture[OutVoxelCoord] = Total;

#if PERMUTATION_MIP_AGGREGATE
			// Store 4x4x4 values
			if (GroupThreadId.x < 64)
			{
				uint StoreIndex = MortonEncode3(OutVoxelCoordOffset);
				g_Density4[StoreIndex][GroupThreadId.z] = Total;
			}
#endif
		}
	}

#if PERMUTATION_MIP_AGGREGATE
	GroupMemoryBarrierWithGroupSync();

	// Target Page Res: 2x2x2
	if (bValid && GroupThreadId.x < 8)
	{
		const uint Total4 = ComputeMipDensity(
			g_Density4[GroupThreadId.x * 8 + 0][GroupThreadId.z],
			g_Density4[GroupThreadId.x * 8 + 1][GroupThreadId.z],
			g_Density4[GroupThreadId.x * 8 + 2][GroupThreadId.z],
			g_Density4[GroupThreadId.x * 8 + 3][GroupThreadId.z],
			g_Density4[GroupThreadId.x * 8 + 4][GroupThreadId.z],
			g_Density4[GroupThreadId.x * 8 + 5][GroupThreadId.z],
			g_Density4[GroupThreadId.x * 8 + 6][GroupThreadId.z],
			g_Density4[GroupThreadId.x * 8 + 7][GroupThreadId.z]);

		const uint3 StoreVoxel = MortonDecode3(GroupThreadId.x * 8) >> 1;
		const uint StoreIndex = MortonEncode3(StoreVoxel); // GroupThreadId.x >> 3;
		g_Density2[StoreIndex][GroupThreadId.z] = Total4;

		const uint OutPageResolution = 2;
		const uint3 OutVoxelCoord = PageCoord * OutPageResolution + StoreVoxel;
		OutDensityTexture2[OutVoxelCoord] = Total4;
	}

	GroupMemoryBarrierWithGroupSync();

	// Target Page Res: 1x1x1
	if (bValid && GroupThreadId.x < 1)
	{
		const uint Total2 = ComputeMipDensity(
			g_Density2[0][GroupThreadId.z],
			g_Density2[1][GroupThreadId.z],
			g_Density2[2][GroupThreadId.z],
			g_Density2[3][GroupThreadId.z],
			g_Density2[4][GroupThreadId.z],
			g_Density2[5][GroupThreadId.z],
			g_Density2[6][GroupThreadId.z],
			g_Density2[7][GroupThreadId.z]);

		const uint OutPageResolution = 1;
		const uint3 OutVoxelCoord = PageCoord * OutPageResolution + 0;
		OutDensityTexture1[OutVoxelCoord] = Total2;

		// Mark the page index as invalid if the page does not contain any hair data.
		// This saves tracing cost when evaluating the transmittance.
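		// For example, a page whose 1x1x1 mip value has no bits set in VOXEL_HAIR_MASK is unlinked from the
		// page index below, so traversal can skip it without touching the page texture.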
		const bool bIsEmpty = (Total2 & VOXEL_HAIR_MASK) == 0;
		if (bPatchEmptyPage > 0 && bIsEmpty)
		{
			const uint PageIndexOffset = PageToPageIndexBuffer[PageIndex];
			OutPageIndexBuffer[PageIndexOffset] = INVALID_VOXEL_PAGE_INDEX;
		}
	}
#endif
}
#endif

///////////////////////////////////////////////////////////////////////////

#if SHADER_MIP_INDIRECTARGS

#include "HairStrandsVoxelPageCommon.ush"

uint PageResolution;
uint TargetMipIndex;
int3 DispatchGroupSize;
Buffer<uint> InIndirectArgs;
RWBuffer<uint> OutIndirectArgs;

[numthreads(1, 1, 1)]
void MainCS(uint3 DispatchThreadId : SV_DispatchThreadID)
{
	const uint MacroGroupId = DispatchThreadId.x;
	const uint TargetPageResolution = PageResolution >> TargetMipIndex;
	const uint TotalVoxelCount = TargetPageResolution * TargetPageResolution * TargetPageResolution;
	const uint DispatchX = DivideAndRoundUp(TotalVoxelCount, DispatchGroupSize.x);
	WriteDispatchIndirectArgs(OutIndirectArgs, 0, DispatchX, InIndirectArgs[1], InIndirectArgs[2]);
}
#endif

///////////////////////////////////////////////////////////////////////////