// Copyright Epic Games, Inc. All Rights Reserved.

#define HAIR_STRANDS_PARAMETERS 1

#include "../Common.ush"
#include "../WaveOpUtil.ush"
#include "HairStrandsClusterCommon.ush"
#include "HairStrandsVertexFactoryCommon.ush"
#include "HairStrandsVisibilityCommon.ush"

///////////////////////////////////////////////////////////////////////////
// Common parameters

uint TileSizeAsShift;
uint TileSize;
float RcpTileSize;
uint SqrTileSize;
uint HalfTileSize;
float RcpHalfTileSize;
uint SqrHalfTileSize;
int2 TileRes;
uint NumBinners;
float RcpNumBinners;
uint NumRasterizers;
float RcpNumRasterizers;
uint MaxRasterCount;
uint FrameIdMod8;
uint ResolutionMultiplier;
int2 OutputResolution;
float2 OutputResolutionf;

///////////////////////////////////////////////////////////////////////////

#if SHADER_RASTERCOMPUTE_BINNING || SHADER_RASTERCOMPUTE_COMPACTION || SHADER_RASTERCOMPUTE_RASTER || SHADER_RASTERCOMPUTE_RASTER_MULTI_SAMPLE || SHADER_RASTERCOMPUTE_DEPTH_GRID

///////////////////////////////////////////////////////////////////////////

/*
// use untyped buffer for segment tiles to reduce VGPR usage - 16 bytes
struct FVisTile
{
	uint PrimOffset;
	uint PrimCount;
	uint TileCoord;
	uint MinDepth;
};
*/

// Layout of one tile record inside the raw tile buffers: VT_SIZE dwords, addressed by the VT_* dword offsets.
#define VT_SIZE 4
#define VT_PrimOffset 0
#define VT_PrimCount 1
#define VT_Coord 2
#define VT_MinWriteIndex 3

// Packs a tile coordinate into 16 bits (8 bits per axis). Coordinates above 255 are truncated by the mask.
uint PackVisTileCoord(uint2 Coord)
{
	return uint(((Coord.x & 0xff) << 0) | (((Coord.y) & 0xff) << 8));
}

// Inverse of PackVisTileCoord.
uint2 UnpackVisTileCoord(uint Packed)
{
	return uint2(((Packed >> 0) & 0xff), ((Packed >> 8) & 0xff));
}

///////////////////////////////////////////////////////////////////////////

uint MacroGroupId;
uint HairMaterialId;
// Read as a scalar float everywhere in this file.
Texture2D<float> SceneDepthTexture;
uint VertexCount;
float CoverageScale;

// Converts a clip-space position to (pixel.x, pixel.y, NDC depth) using the view's ScreenPositionScaleBias and OutputResolution.
float3 NDCToPixelCoord(float4 InDC)
{
	const float3 NDC = InDC.xyz / InDC.w;
	float2 UV = NDC.xy * ResolvedView.ScreenPositionScaleBias.xy + ResolvedView.ScreenPositionScaleBias.wz;
	return float3(UV * OutputResolution, NDC.z);
}

// Reads control point InPointIndex and outputs its clip-space position (HP) and control point type (Type).
// PBO is forwarded to ReadHairControlPoint as the position offset.
void CalcHomogenousPos(in uint InPointIndex, in float3 PBO, out float4 HP, out uint Type)
{
	const FHairControlPoint CP = ReadHairControlPoint(
		HairStrandsVF_PositionBuffer,
		InPointIndex,
		PBO,
		HairStrandsVF_Radius,
		HairStrandsVF_RootScale,
		HairStrandsVF_TipScale);

	const float3 WP = mul(float4(CP.Position, 1.0f), HairStrandsVF_LocalToWorldPrimitiveTransform).xyz;
	HP = mul(float4(WP, 1.0f), DFHackToFloat(PrimaryView.WorldToClip));
	Type = CP.Type;
}

// Same as CalcHomogenousPos, and additionally outputs a scaled radius (Rad) derived from the control point's world radius.
void CalcHomogenousPosAndRad(in uint InPointIndex, in float3 PBO, out float4 HP, out float Rad, out uint Type)
{
	const FHairControlPoint CP = ReadHairControlPoint(
		HairStrandsVF_PositionBuffer,
		InPointIndex,
		PBO,
		HairStrandsVF_Radius,
		HairStrandsVF_RootScale,
		HairStrandsVF_TipScale);

	const float3 WP = mul(float4(CP.Position, 1.0f), HairStrandsVF_LocalToWorldPrimitiveTransform).xyz;
	HP = mul(float4(WP, 1.0f), DFHackToFloat(PrimaryView.WorldToClip));
	Rad = CP.WorldRadius * 2000.0; // OutputResolutionf.x; //TODO: figure this out correctly?
	Type = CP.Type;
}

// Returns the clamped [0,1] parameter of the projection of the pixel center of Coord onto the segment P0-P1.
// SegmentLenSqRcp must be 1 / dot(P1 - P0, P1 - P0).
float ComputeLerpAlpha(int2 Coord, float2 P0, float2 P1, float SegmentLenSqRcp)
{
	// Project P onto line segment and compute the lerp alpha between P0 and P1
	// Simplification of:
	// A = P - P0
	// B = P1 - P0
	// Alpha = dot(A, B) / dot(B, B)
	const float2 P = Coord + 0.5f;
	const float Alpha = saturate(dot(P - P0, P1 - P0) * SegmentLenSqRcp);
	return Alpha;
}

// Interpolates the radius between both end points with perspective correction. RcpW0/RcpW1 are the reciprocal clip-space w of the end points.
float ComputePerspectiveCorrectRadius(float Rad0, float Rad1, float Alpha, float RcpW0, float RcpW1)
{
	// Alpha value for perspective correct interpolation. We store the reciprocal of w in the w component of P0 and P1,
	// so this is a simplification of:
	// (Alpha / w1) / ((1 - Alpha) / w0 + Alpha / w1)
	const float LerpedRcpW = lerp(RcpW0, RcpW1, Alpha);
	const float PerspectiveAlpha = (Alpha * RcpW1) / LerpedRcpW;

	// Divide by W to make thickness dependent on screen space depth? This division was kept from the previous line rasterization algorithm.
	const float Rad = lerp(Rad0, Rad1, PerspectiveAlpha) * LerpedRcpW;
	return Rad;
}

// Line clipping based on "CLIPPING USING HOMOGENEOUS COORDINATES" by Blinn et al.
// Clips the clip-space segment P0-P1 against the six planes listed below. Returns false (leaving P0/P1 untouched)
// when no part of the segment survives; otherwise P0/P1 are replaced by the clipped end points and true is returned.
bool BlinnLineClipping(inout float4 P0, inout float4 P1)
{
	float2 T = float2(0.0f, 1.0f);
	bool bIsRemoved = P0.w < 0.0f && P1.w < 0.0f; // Both points behind near plane

	bool bSign = false;

	UNROLL for (uint PlaneIdx = 0; PlaneIdx < 6; ++PlaneIdx)
	{
		// Compute boundary coordinates of both points (w+x, w-x, w+y, w-y, z, w-z)
		bSign = !bSign;
		const uint CompIdx = PlaneIdx / 2;
		const float Sign = bSign ? 1.0f : -1.0f;
		const float WFactor = PlaneIdx != 4 ? 1.0f : 0.0f;
		const float2 BC = WFactor * float2(P0.w, P1.w) + Sign * float2(P0[CompIdx], P1[CompIdx]);

		float Num = BC.x;
		float Denom = BC.x - BC.y;
		bIsRemoved = bIsRemoved || (BC.x < 0.0f && BC.y < 0.0f); // Both points outside the plane
		float Alpha = Num / Denom;

		// If the denominator is negative, P0 has a smaller boundary coordinate than P1, so we can assume
		// that P1 is inside the plane (or bIsRemoved is true), so we need to update the alpha for P0.
		// The reverse is true if the denominator is positive.
		if (Denom < 0.0f)
		{
			T.x = max(T.x, Alpha);
		}
		else
		{
			T.y = min(T.y, Alpha);
		}
	}

	// The surviving parameter interval is [T.x, T.y]. If it is empty, the segment crosses individual planes but never
	// lies inside all of them at once (e.g. it passes diagonally outside a frustum corner), so it must be rejected
	// instead of producing end points that are both outside the frustum.
	bIsRemoved = bIsRemoved || (T.x > T.y);

	if (!bIsRemoved)
	{
		const float4 P0Clipped = lerp(P0, P1, T.x);
		const float4 P1Clipped = lerp(P0, P1, T.y);
		P0 = P0Clipped;
		P1 = P1Clipped;
	}

	return !bIsRemoved;
}

// Intersects the 2D segment P0.xy-P1.xy with an AABB.
// T receives the entry/exit parameters along the segment, bClipped reports per end point whether it needs to be moved.
// Returns false (with bClipped = true for both end points) if the segment does not overlap the AABB.
bool ClipRaySegment(float2 AABBMin, float2 AABBMax, float4 P0, float4 P1, out float2 T, out bool2 bClipped)
{
	bClipped = false;
	T = float2(0.0f, 1.0f);
	const bool bP0Outside = any(P0.xy < AABBMin) || any(P0.xy > AABBMax);
	const bool bP1Outside = any(P1.xy < AABBMin) || any(P1.xy > AABBMax);
	if (!bP0Outside && !bP1Outside)
	{
		return true;
	}

	const float2 Origin = P0.xy;
	const float2 Dir = P1.xy - P0.xy;
	const float2 RcpDir = 1.0f / Dir;
	const float2 T0 = (AABBMin - Origin) * RcpDir;
	const float2 T1 = (AABBMax - Origin) * RcpDir;

	T.x = max(min(T0.x, T1.x), min(T0.y, T1.y));
	T.y = min(max(T0.x, T1.x), max(T0.y, T1.y));

	// Ray intersects the AABB but the segment is completely outside or no intersection at all:
	// the box is behind P0 (T.y < 0), beyond P1 (T.x > 1), or missed by the ray (T.x > T.y).
	if (T.y < 0.0f || T.x > 1.0f || T.x > T.y)
	{
		bClipped = true;
		return false;
	}

	if (bP0Outside && T.x > 0.0f && T.x < 1.0f)
	{
		bClipped.x = true;
	}
	if (bP1Outside && T.y > 0.0f && T.y < 1.0f)
	{
		bClipped.y = true;
	}

	return true;
}

// Overload that applies the clipping: end points lying outside the AABB are moved onto its boundary and their radii
// are interpolated accordingly. P0/P1/Rad0/Rad1 are left unchanged when false is returned.
bool ClipRaySegment(float2 AABBMin, float2 AABBMax, inout float4 P0, inout float4 P1, inout float Rad0, inout float Rad1, out bool2 bClipped)
{
	float2 T;
	bool bIsValid = ClipRaySegment(AABBMin, AABBMax, P0, P1, T, bClipped);
	if (bIsValid)
	{
		const bool bP0Outside = any(P0.xy < AABBMin) || any(P0.xy > AABBMax);
		const bool bP1Outside = any(P1.xy < AABBMin) || any(P1.xy > AABBMax);

		// Compute both new end points from the original ones before writing anything back.
		float4 P0New = P0;
		float4 P1New = P1;
		float Rad0New = Rad0;
		float Rad1New = Rad1;
		if (bP0Outside && T.x > 0.0f && T.x < 1.0f)
		{
			P0New = lerp(P0, P1, T.x);
			Rad0New = lerp(Rad0, Rad1, T.x);
			bClipped.x = true;
		}
		if (bP1Outside && T.y > 0.0f && T.y < 1.0f)
		{
			P1New = lerp(P0, P1, T.y);
			Rad1New = lerp(Rad0, Rad1, T.y);
			bClipped.y = true;
		}

		P0 = P0New;
		P1 = P1New;
		Rad0 = Rad0New;
		Rad1 = Rad1New;
	}

	return bIsValid;
}

#endif // Common rasterizer helper function & parameters

///////////////////////////////////////////////////////////////////////////

#if SHADER_RASTERCOMPUTE_DEPTH_GRID

// Element types are uint: both textures receive packed depth values, one of them through InterlockedMax.
RWTexture2D<uint> OutVisTileDepthGrid;
RWTexture2DArray<uint> OutDepthCovTexture;
uint NumSamples;

groupshared uint group_FurthestDepth; // (4 bytes)

// One workgroup per tile: writes the smallest packed scene depth of the tile to OutVisTileDepthGrid and seeds
// every sample slice of OutDepthCovTexture with the packed scene depth.
[numthreads(1024, 1, 1)]
void PrepareDepthGridCS(uint DispatchThreadID : SV_DispatchThreadID, uint GroupThreadID : SV_GroupThreadID, uint2 GroupID : SV_GroupID)
{
	if (GroupThreadID == 0)
	{
		group_FurthestDepth = 0xFFFFFFFF;
	}
	GroupMemoryBarrierWithGroupSync();

	// Assumes a workgroup size of 1024 threads to cover a maximum tile size of 32x32.
	if (GroupThreadID < SqrTileSize)
	{
		// Linear thread index -> pixel coordinate inside the tile, then offset by the tile origin.
		uint2 Coord;
		Coord.y = (GroupThreadID + 0.5f) * RcpTileSize;
		Coord.x = GroupThreadID - (Coord.y * TileSize);
		Coord += GroupID * TileSize;

		if (all(Coord < (uint2)OutputResolution))
		{
			const float Depth = SceneDepthTexture.Load(uint3(Coord, 0));
			const uint PackedDepth = PackHairVisDepthCoverage(Depth, 1.0);

			// Compute furthest depth inside this tile
			WaveInterlockedMin(group_FurthestDepth, PackedDepth);

			// Copy scene depth to (multisampled) hair depth output texture
			for (uint SampleIdx = 0; SampleIdx < NumSamples; ++SampleIdx)
			{
				InterlockedMax(OutDepthCovTexture[uint3(Coord, SampleIdx)], PackedDepth);
			}
		}
	}

	GroupMemoryBarrierWithGroupSync();

	if (GroupThreadID == 0)
	{
		OutVisTileDepthGrid[GroupID] = group_FurthestDepth;
	}
}

#endif //SHADER_RASTERCOMPUTE_DEPTH_GRID

///////////////////////////////////////////////////////////////////////////

#if SHADER_RASTERCOMPUTE_BINNING

#ifndef PERMUTATION_INDIRECT_PRIM_IDS
#define PERMUTATION_INDIRECT_PRIM_IDS 0
#endif

// All typed resources below hold uint data (counters updated with atomics, primitive ids, packed depth).
RWTexture2DArray<uint> OutVisTileBinningGrid;
RWBuffer<uint> OutVisTilePrims;
RWBuffer<uint> OutVisTileArgs;
RWByteAddressBuffer OutVisTileData;

Texture2D<uint> VisTileDepthGrid;

ByteAddressBuffer IndirectPrimIDCount;
Buffer<uint> IndirectPrimIDs;

// TODO: Without setting a limit, we are sometimes caught in an infinite loop even though this should not happen.
#define DDA_MAX_ITERATIONS 256

// State of a 2D grid traversal (DDA) that visits every cell touched by a ray.
struct FDDAContext
{
	float2 Coord;		// Current cell
	float2 DeltaDist;	// Ray parameter distance between two consecutive cell boundaries, per axis
	float2 Step;		// Cell increment per axis (sign of the ray direction)
	float2 SideDist;	// Ray parameter at which the next cell boundary is crossed, per axis
};

// Sets up a traversal starting in the cell containing RayStart and moving along RayDir.
FDDAContext DDACreateContext(float2 RayStart, float2 RayDir)
{
	const float2 RayDirRcp = 1.0f / RayDir;

	FDDAContext Context;
	Context.Coord = floor(RayStart);
	Context.DeltaDist = abs(RayDirRcp);
	Context.Step = sign(RayDir);
	Context.SideDist = (Context.Coord - RayStart + max(Context.Step, 0.0f)) * RayDirRcp;
	return Context;
}

// Moves to the next cell along the ray by crossing whichever cell boundary is nearer.
void DDAAdvance(inout FDDAContext Context)
{
	if (Context.SideDist.x < Context.SideDist.y)
	{
		Context.SideDist.x += Context.DeltaDist.x;
		Context.Coord.x += Context.Step.x;
	}
	else
	{
		Context.SideDist.y += Context.DeltaDist.y;
		Context.Coord.y += Context.Step.y;
	}
}

// Reads dword 'offset' (one of VT_*) of tile record 'index'.
uint LoadOutVisTileData(uint index, uint offset)
{
	return OutVisTileData.Load((((index)) * VT_SIZE * 4) + ((offset) * 4));
}

// Writes dword 'offset' (one of VT_*) of tile record 'index'.
void StoreOutVisTileData(uint index, uint offset, uint value)
{
	OutVisTileData.Store((((index)) * VT_SIZE * 4) + ((offset) * 4), (value));
}

groupshared uint group_LoopNum;
groupshared uint group_VerticesNum;
groupshared uint group_BatchNum;

#define TILES_TO_ALLOCATE_MAX 1024
groupshared uint group_TilesToAllocate[TILES_TO_ALLOCATE_MAX];
groupshared uint group_TilesToAllocateCount;

// The total number of line segments (VertexCount) is divided up equally between N binners - each binner = a workgroup which loops through the designated set segments in batches of 1024
// NB there is still potential to use LDS to prevent/reduce contention when adding segments to the binning grid and this may improve perf
//
// Per batch the work is done in three phases separated by group barriers:
//  1. count the segments per touched tile and collect the tiles which need a new 1024-entry tile record,
//  2. allocate those tile records,
//  3. traverse the tiles again and write the primitive ids into the records.
[numthreads(1024, 1, 1)]
void BinningCS(uint DispatchThreadID : SV_DispatchThreadID, uint GroupThreadID : SV_GroupThreadID, uint GroupID : SV_GroupID)
{
	ResolvedView = ResolveView();

	if (GroupThreadID == 0)
	{
#if PERMUTATION_INDIRECT_PRIM_IDS
		group_VerticesNum = IndirectPrimIDCount.Load(0);
#else // PERMUTATION_INDIRECT_PRIM_IDS
#if PERMUTATION_CULLING
		group_VerticesNum = HairStrandsVF_bCullingEnable ? HairStrandsVF_CullingIndirectBuffer[3] : VertexCount;
#else // PERMUTATION_CULLING
		group_VerticesNum = VertexCount;
#endif //PERMUTATION_CULLING
#endif // PERMUTATION_INDIRECT_PRIM_IDS

		group_BatchNum = (group_VerticesNum + 1023) / 1024;
		group_LoopNum = (group_BatchNum + (NumBinners - 1)) * RcpNumBinners;
	}

	GroupMemoryBarrierWithGroupSync();

	LOOP for (uint LoopIndex = 0; LoopIndex < group_LoopNum; LoopIndex++)
	{
		const uint BatchIndex = LoopIndex + (GroupID * group_LoopNum);
		bool bSegValid = (BatchIndex < group_BatchNum);

#if PERMUTATION_INDIRECT_PRIM_IDS
		uint PrimID = 0;
		const uint PrimIDIndex = BatchIndex * 1024 + GroupThreadID;
		bSegValid = bSegValid && (PrimIDIndex < group_VerticesNum);
		if (bSegValid)
		{
			PrimID = IndirectPrimIDs[PrimIDIndex];
		}
#else // PERMUTATION_INDIRECT_PRIM_IDS
#if PERMUTATION_CULLING
		uint PrimID = BatchIndex * 1024 + GroupThreadID;
		bSegValid = bSegValid && (PrimID < group_VerticesNum);
		if (bSegValid && HairStrandsVF_bCullingEnable)
		{
			const uint FetchIndex0 = PrimID;
			const uint FetchIndex1 = min(FetchIndex0 + 1, group_VerticesNum - 1);

			const uint VertexIndex0 = HairStrandsVF_CullingIndexBuffer[FetchIndex0];
			const uint VertexIndex1 = HairStrandsVF_CullingIndexBuffer[FetchIndex1];

			// Only consecutive vertex indices form a segment.
			if (VertexIndex1 != VertexIndex0 + 1)
			{
				bSegValid = false;
			}
			else
			{
				PrimID = VertexIndex0;
			}
		}
#else // PERMUTATION_CULLING
		const uint PrimID = BatchIndex * 1024 + GroupThreadID;
		bSegValid = bSegValid && (PrimID < VertexCount);
#endif // PERMUTATION_CULLING
#endif // PERMUTATION_INDIRECT_PRIM_IDS

		const uint SegmentCountLayerIdx = GroupID; // Stores number of segments per tile per workgroup.
		const uint TmpSegmentCountLayerIdx = SegmentCountLayerIdx + NumBinners; // Also stores number of segments per tile per workgroup. Used as second counter for this two pass algorithm.
		const uint TileAllocInfoLayerIdx = SegmentCountLayerIdx + NumBinners * 2; // Stores per tile per workgroup allocation info.

		uint NearestDepth = 0;
		float2 TileCoord0F = 0.0f;
		float2 TileCoord1F = 0.0f;

		// Project segment end points and clip them to the screen
		if (bSegValid)
		{
			const float3 InstancePositionOffset = HairStrandsVF_GetHairInstancePositionOffset();

			float4 H0 = 0.0f;
			float4 H1 = 0.0f;
			uint Type = -1;

			CalcHomogenousPos(PrimID, InstancePositionOffset, H0, Type);

			// The last control point of a strand does not start a segment.
			bool bIsEndCV = (Type == HAIR_CONTROLPOINT_END);
			bSegValid = !bIsEndCV;

			if (bSegValid)
			{
				CalcHomogenousPos(PrimID + 1, InstancePositionOffset, H1, Type);

				// Do clipping in homogenous coordinates
				bSegValid = BlinnLineClipping(H0, H1);

				if (bSegValid)
				{
					// Pixel coordinates -> (fractional) tile coordinates
					float3 SP0 = NDCToPixelCoord(H0);
					float3 SP1 = NDCToPixelCoord(H1);
					SP0.xy *= RcpTileSize;
					SP1.xy *= RcpTileSize;

					// For peace of mind, make sure these are actually clamped to a valid range.
					SP0 = clamp(SP0, 0.0f, float3(TileRes, 1.0f));
					SP1 = clamp(SP1, 0.0f, float3(TileRes, 1.0f));

					NearestDepth = PackHairVisDepthCoverage(max(SP0.z, SP1.z), 1.0f);
					TileCoord0F = SP0.xy;
					TileCoord1F = SP1.xy;
				}
			}
		}

		if (GroupThreadID == 0)
		{
			group_TilesToAllocateCount = 0;
		}

		GroupMemoryBarrierWithGroupSync();

		// Increment per workgroup per tile counters and add tiles to be allocated
		if (bSegValid)
		{
			FDDAContext DDAContext = DDACreateContext(TileCoord0F, normalize(TileCoord1F - TileCoord0F));
			const int2 EndCoord = (int2)floor(TileCoord1F);

			for (int DDAIt = 0; DDAIt < DDA_MAX_ITERATIONS; ++DDAIt)
			{
				const int2 TileCoord = (int2)floor(DDAContext.Coord);

				// Skip tiles in which the segment fails the test against the tile's depth grid value.
				BRANCH if (NearestDepth > VisTileDepthGrid[TileCoord])
				{
					uint OldTileSegmentCount;
					InterlockedAdd(OutVisTileBinningGrid[uint3(TileCoord, SegmentCountLayerIdx)], 1, OldTileSegmentCount);

					// Every 1024th segment of a tile starts a new tile record.
					BRANCH if ((OldTileSegmentCount % 1024) == 0)
					{
						uint WritePos;
						InterlockedAdd(group_TilesToAllocateCount, 1, WritePos);
						if (WritePos < TILES_TO_ALLOCATE_MAX)
						{
							group_TilesToAllocate[WritePos] = PackVisTileCoord(TileCoord);
						}
					}
				}

				if (all(TileCoord == EndCoord))
				{
					break;
				}

				DDAAdvance(DDAContext);
			}
		}

		GroupMemoryBarrierWithGroupSync();

		// Allocate tiles
		const uint TilesToAllocateCount = min(TILES_TO_ALLOCATE_MAX, group_TilesToAllocateCount);
		for (uint TileIdx = GroupThreadID; TileIdx < TilesToAllocateCount; TileIdx += 1024)
		{
			const uint PackedTileCoord = group_TilesToAllocate[TileIdx];
			const uint2 TileCoord = UnpackVisTileCoord(PackedTileCoord);
			const uint TotalNewWriteCount = OutVisTileBinningGrid[uint3(TileCoord, SegmentCountLayerIdx)];
			const uint TotalOldWriteCount = OutVisTileBinningGrid[uint3(TileCoord, TmpSegmentCountLayerIdx)];

			uint NewTile;
			WaveInterlockedAddScalar_(OutVisTileArgs[0], 1, NewTile);

			StoreOutVisTileData(NewTile, VT_Coord, PackedTileCoord);
			// Round down the count to the start of the tile and later compare against this to decide which tile to write to.
			StoreOutVisTileData(NewTile, VT_MinWriteIndex, TotalNewWriteCount & ~1023u);

			const uint PrevTile = (OutVisTileBinningGrid[uint3(TileCoord, TileAllocInfoLayerIdx)] & 0xffff);

			// A new record is only opened once the previous one is full.
			if (TotalOldWriteCount > 0)
			{
				StoreOutVisTileData(PrevTile, VT_PrimCount, 1024);
			}

			// Low 16 bits: current tile record, high 16 bits: previous tile record.
			OutVisTileBinningGrid[uint3(TileCoord, TileAllocInfoLayerIdx)] = (PrevTile << 16) | (NewTile & 0xffff);
		}

		GroupMemoryBarrierWithGroupSync();

		// Write PrimID to tiles
		if (bSegValid)
		{
			FDDAContext DDAContext = DDACreateContext(TileCoord0F, normalize(TileCoord1F - TileCoord0F));
			const int2 EndCoord = (int2)floor(TileCoord1F);

			for (int DDAIt = 0; DDAIt < DDA_MAX_ITERATIONS; ++DDAIt)
			{
				const int2 TileCoord = (int2)floor(DDAContext.Coord);

				BRANCH if (NearestDepth > VisTileDepthGrid[TileCoord])
				{
					const uint PackedTiles = OutVisTileBinningGrid[uint3(TileCoord, TileAllocInfoLayerIdx)];
					const uint CurTile = (PackedTiles & 0xffff);
					const uint PrevTile = ((PackedTiles >> 16) & 0xffff);

					// Currently we need this to get our write position, but maybe there is a cheaper way to keep track of that?
					uint OldTileSegmentCount;
					InterlockedAdd(OutVisTileBinningGrid[uint3(TileCoord, TmpSegmentCountLayerIdx)], 1, OldTileSegmentCount);

					const bool bWriteToCurTile = OldTileSegmentCount >= LoadOutVisTileData(CurTile, VT_MinWriteIndex);
					const uint LocalWritePos = OldTileSegmentCount % 1024;
					const uint WritePos = (bWriteToCurTile ? CurTile : PrevTile) * 1024 + LocalWritePos;

					OutVisTilePrims[WritePos] = PrimID;

					BRANCH if (bWriteToCurTile)
					{
						// The thread performing the last write of this batch publishes the fill level of the current record.
						if ((OldTileSegmentCount + 1) == OutVisTileBinningGrid[uint3(TileCoord, SegmentCountLayerIdx)])
						{
							// Number of entries in the current record is the local index of the last write plus one (1..1024).
							// Using (OldTileSegmentCount + 1) % 1024 here would yield 0 for every completely filled record
							// (last write at 1023, 2047, 3071, ...) and drop its primitives.
							StoreOutVisTileData(CurTile, VT_PrimCount, (OldTileSegmentCount % 1024) + 1);
						}
					}
				}

				if (all(TileCoord == EndCoord))
				{
					break;
				}

				DDAAdvance(DDAContext);
			}
		}
	}
}

#endif //SHADER_RASTERCOMPUTE_BINNING

///////////////////////////////////////////////////////////////////////////

#if SHADER_RASTERCOMPUTE_COMPACTION

ByteAddressBuffer VisTileData;
// Typed buffers hold uint data (primitive ids / tile counters updated with atomics).
Buffer<uint> VisTilePrims;
Buffer<uint> VisTileArgs;

RWByteAddressBuffer OutCompactedVisTileData;
RWBuffer<uint> OutCompactedVisTilePrims;
RWBuffer<uint> OutCompactedVisTileArgs;

// Reads dword 'offset' (one of VT_*) of input tile record 'index'.
uint LoadVisTileData(uint index, uint offset)
{
	return VisTileData.Load((((index)) * VT_SIZE * 4) + ((offset) * 4));
}

// Writes dword 'offset' (one of VT_*) of compacted tile record 'index'.
void StoreCompactedVisTileData(uint index, uint offset, uint value)
{
	OutCompactedVisTileData.Store((((index)) * VT_SIZE * 4) + ((offset) * 4), (value));
}

groupshared uint group_TotalPrimCount;
groupshared uint group_PrimWriteOffset;
groupshared uint group_NumTiles;
groupshared uint group_TilesToCompact[1024];
groupshared uint group_MaxLDSTileIdx;

// One workgroup per tile coordinate (GroupID): gathers all input tile records with that coordinate and copies
// their primitive ids back to back into newly allocated compacted tile records.
[numthreads(1024, 1, 1)]
void CompactionCS(uint DispatchThreadID : SV_DispatchThreadID, uint GroupThreadID : SV_GroupThreadID, uint2 GroupID : SV_GroupID)
{
	if (GroupThreadID == 0)
	{
		group_TotalPrimCount = 0;
		group_NumTiles = 0;
		group_MaxLDSTileIdx = 0;
	}

	GroupMemoryBarrierWithGroupSync();

	const uint NumTiles = VisTileArgs[0];
	const uint PackedCoord = PackVisTileCoord(GroupID);

	// Compute total number of primitives at this tile coordinate
	uint LocalPrimCount = 0;
	{
		for (uint TileIdx = GroupThreadID; TileIdx < NumTiles; TileIdx += 1024)
		{
			const uint TilePackedCoord = LoadVisTileData(TileIdx, VT_Coord);
			if (PackedCoord == TilePackedCoord)
			{
				LocalPrimCount += LoadVisTileData(TileIdx, VT_PrimCount);

				// Remember up to 1024 matching records in LDS so they do not have to be searched for again.
				uint WritePos;
				WaveInterlockedAddScalar_(group_NumTiles, 1, WritePos);
				if (WritePos < 1024)
				{
					group_TilesToCompact[WritePos] = TileIdx;
					WaveInterlockedMax(group_MaxLDSTileIdx, TileIdx);
				}
			}
		}
	}

	GroupMemoryBarrierWithGroupSync();

	if (LocalPrimCount > 0)
	{
		WaveInterlockedAdd(group_TotalPrimCount, LocalPrimCount);
	}

	GroupMemoryBarrierWithGroupSync();

	// Read from group shared memory after a barrier, so all threads of the group take the same branch.
	const uint TotalPrimCount = group_TotalPrimCount;
	if (TotalPrimCount == 0)
	{
		return;
	}

	// Allocate space
	if (GroupThreadID == 0)
	{
		const uint NumTilesToAllocate = (TotalPrimCount + 1023) / 1024;
		uint FirstCompactedTile;
		InterlockedAdd(OutCompactedVisTileArgs[0], NumTilesToAllocate, FirstCompactedTile);
		group_PrimWriteOffset = FirstCompactedTile * 1024;

		// Initialize new tiles
		for (uint TileIdx = 0; TileIdx < NumTilesToAllocate; ++TileIdx)
		{
			const uint CompactedTile = FirstCompactedTile + TileIdx;
			const uint PrimCount = min(TotalPrimCount - TileIdx * 1024, 1024);
			StoreCompactedVisTileData(CompactedTile, VT_PrimCount, PrimCount);
			StoreCompactedVisTileData(CompactedTile, VT_Coord, PackedCoord);
		}
	}

	GroupMemoryBarrierWithGroupSync();

	// Copy PrimIDs to compacted memory
	{
		uint CurrentWriteOffset = group_PrimWriteOffset;

		// First process the LDS list of tiles
		const uint NumInputTiles = min(group_NumTiles, 1024);
		for (uint LDSIdx = 0; LDSIdx < NumInputTiles; ++LDSIdx)
		{
			const uint TileIdx = group_TilesToCompact[LDSIdx];
			const uint TilePrimOffset = TileIdx * 1024;
			const uint TilePrimCount = LoadVisTileData(TileIdx, VT_PrimCount);
			if (GroupThreadID < TilePrimCount)
			{
				OutCompactedVisTilePrims[CurrentWriteOffset + GroupThreadID] = VisTilePrims[TilePrimOffset + GroupThreadID];
			}
			CurrentWriteOffset += TilePrimCount;
		}

		// Check any remaining tiles
		// NOTE(review): this assumes every matching record which did not fit into the LDS list has an index greater
		// than group_MaxLDSTileIdx - confirm that this holds for the order in which LDS slots are handed out.
		if (group_NumTiles > 1024)
		{
			for (uint TileIdx = group_MaxLDSTileIdx + 1; TileIdx < NumTiles; ++TileIdx)
			{
				const uint TilePackedCoord = LoadVisTileData(TileIdx, VT_Coord);
				if (PackedCoord == TilePackedCoord)
				{
					const uint TilePrimOffset = TileIdx * 1024;
					const uint TilePrimCount = LoadVisTileData(TileIdx, VT_PrimCount);
					if (GroupThreadID < TilePrimCount)
					{
						OutCompactedVisTilePrims[CurrentWriteOffset + GroupThreadID] = VisTilePrims[TilePrimOffset + GroupThreadID];
					}
					CurrentWriteOffset += TilePrimCount;
				}
			}
		}
	}
}

#endif // SHADER_RASTERCOMPUTE_COMPACTION

///////////////////////////////////////////////////////////////////////////

#if SHADER_RASTERCOMPUTE_RASTER

// Wave size
// NOTE(review): both group sizes currently map to a wave size of 32. This only affects how thread counts are
// rounded up below, but confirm whether 64 was intended for PERMUTATION_GROUP_SIZE == 64.
#if PERMUTATION_GROUP_SIZE == 64
#define WAVE_SIZE 32
#elif PERMUTATION_GROUP_SIZE == 32
#define WAVE_SIZE 32
#else
#error Unknown group size
#endif

// Simple rasterization algorithm that lerps between line endpoints. Is currently more robust than the Wu algorithm
// and optionally supports anti-aliasing similar to the Wu algorithm.
#define RASTER_LINEAR 0
// Implementation of Wu's line rasterization algorithm. Currently this implementation has tile shaped artifacts when the line segment is
// clipped against the tile which is why we use the simple linear algorithm at the moment.
#define RASTER_WU 1

// Set to 1 to enable writing to two pixels straddling the line segment when using the linear rasterization algorithm.
#define ENABLE_RASTER_LINEAR_AA 0 #define RASTER_ALGO RASTER_LINEAR Buffer VisTilePrims; Buffer VisTileArgs; ByteAddressBuffer VisTileData; RWTexture2DArray OutHairCountTexture; RWTexture2DArray OutDepthCovTexture; RWTexture2DArray OutPrimMatTexture; uint LoadVisTileData(uint index, uint offset) { return VisTileData.Load((((index)) * VT_SIZE * 4) + ((offset) * 4)); } groupshared uint4 group_SubTile[1024]; //(32 x 32 x 4 x 4 bytes = 16k bytes) groupshared float3 group_PositionOffset; groupshared float group_ooTileLODScale; groupshared uint group_LoopNum; groupshared uint group_TileNum; groupshared uint group_ThreadsPerSeg; #define GS_SEGS 320 //this number is limited by group shared memory groupshared float4 group_SP0[GS_SEGS]; groupshared float4 group_SP1[GS_SEGS]; groupshared float group_Rad0[GS_SEGS]; groupshared float group_Rad1[GS_SEGS]; groupshared uint group_PrimMatID[GS_SEGS]; groupshared uint group_TileIndex; void PlotInternal(int2 Coords, float AntiAliasingFactor, float4 P0, float4 P1, float Rad0, float Rad1, float SegmentLenSqRcp, uint PackedTileMin, uint PrimMatID) { const int2 IntraTileCoord = Coords - int2(((PackedTileMin >> 0) & 0xffff), ((PackedTileMin >> 16) & 0xffff)); if (all(IntraTileCoord >= 0) && all(IntraTileCoord < int2(TileSize,TileSize))) { const float Alpha = ComputeLerpAlpha(Coords, P0.xy, P1.xy, SegmentLenSqRcp); const float Depth = lerp(P0.z, P1.z, Alpha); const uint PackedDepthCov = PackHairVisDepthCoverage(Depth, 1.0f); const uint LinearIndex = IntraTileCoord.x + IntraTileCoord.y * TileSize; // Write Depth + PrimMatID if depth test against hair depths is passed uint OldValue; InterlockedMax(group_SubTile[LinearIndex].x, PackedDepthCov, OldValue); if (PackedDepthCov > OldValue) { group_SubTile[LinearIndex].y = PrimMatID; } // Add hair count if depth test against scene depth is passed if (PackedDepthCov > group_SubTile[LinearIndex].w) { const float Rad = ComputePerspectiveCorrectRadius(Rad0, Rad1, Alpha, P0.w, P1.w); 
InterlockedAdd(group_SubTile[LinearIndex].z, min(Rad, 0.5f) * 2.0f * 1000.0f * CoverageScale * AntiAliasingFactor); } } } void Plot(int2 Coord, float FracY, float AntiAliasingFactor, bool bIsSteep, float4 P0, float4 P1, float Rad0, float Rad1, float SegmentLenSqRcp, uint PackedTileMin, uint PrimMatID) { // First pixel { float AAFactor = AntiAliasingFactor * (1.0f - FracY); PlotInternal(bIsSteep ? Coord.yx : Coord.xy, AAFactor, P0, P1, Rad0, Rad1, SegmentLenSqRcp, PackedTileMin, PrimMatID); } // Second pixel { float AAFactor = AntiAliasingFactor * FracY; Coord.y += 1; PlotInternal(bIsSteep ? Coord.yx : Coord.xy, AAFactor, P0, P1, Rad0, Rad1, SegmentLenSqRcp, PackedTileMin, PrimMatID); } } [numthreads(1024, 1, 1)] void RasterCS(uint DispatchThreadID : SV_DispatchThreadID, uint GroupThreadID : SV_GroupThreadID, uint GroupID : SV_GroupID) { ResolvedView = ResolveView(); if (GroupThreadID == 0) { group_TileNum = VisTileArgs[0]; group_LoopNum = (float(group_TileNum) + float(NumRasterizers - 1)) * RcpNumRasterizers; group_PositionOffset = HairStrandsVF_GetHairInstancePositionOffset(); /* no longer in use - keep for ref? 
Moving these values to group shared memory did seem to reduce VGPRs - more experimentation needed group_RadScale = (((HairStrandsVF_TipScale - HairStrandsVF_RootScale) * HairStrandsVF_Radius * OutputResolutionf.x) / 63.0) / 255.0; group_RadOffset = (HairStrandsVF_RootScale * HairStrandsVF_Radius * OutputResolutionf.x)/63.0; */ } GroupMemoryBarrierWithGroupSync(); LOOP for (uint LoopIndex = 0; LoopIndex < group_LoopNum; LoopIndex++) { if (GroupThreadID == 0) { group_TileIndex = LoopIndex + (GroupID * group_LoopNum); } GroupMemoryBarrierWithGroupSync(); bool bTileValid = (group_TileIndex < group_TileNum); uint PrimOffset = group_TileIndex * 1024; uint PrimCount = LoadVisTileData(group_TileIndex, VT_PrimCount); uint PackedCoord = LoadVisTileData(group_TileIndex, VT_Coord); uint2 SubTileMin = UnpackVisTileCoord(PackedCoord) * TileSize; uint PackedTileMin = ((SubTileMin.x & 0xffff) << 0) | ((SubTileMin.y & 0xffff) << 16); if (GroupThreadID == 0) { group_ThreadsPerSeg = 1; if (PrimCount <= 512) group_ThreadsPerSeg = 2; if (PrimCount <= 341) group_ThreadsPerSeg = 3; if (PrimCount <= 256) group_ThreadsPerSeg = 4; if (PrimCount <= 204) group_ThreadsPerSeg = 5; if (PrimCount <= 170) group_ThreadsPerSeg = 6; if (PrimCount <= 146) group_ThreadsPerSeg = 7; if (PrimCount <= 128) group_ThreadsPerSeg = 8; if (PrimCount <= 64) group_ThreadsPerSeg = 16; if (PrimCount <= 32) group_ThreadsPerSeg = 32; } GroupMemoryBarrierWithGroupSync(); bool bThreadValid = (bTileValid && (GroupThreadID < (PrimCount * group_ThreadsPerSeg))); uint WaveCount = ((PrimCount * group_ThreadsPerSeg) + (WAVE_SIZE - 1) ) / WAVE_SIZE; uint WaveThreadCount = WaveCount * WAVE_SIZE; bool bWaveThreadValid = (bTileValid && (GroupThreadID < WaveThreadCount)); bool bUseGroupSPs = (bThreadValid && (GroupThreadID < (min(PrimCount, GS_SEGS) * group_ThreadsPerSeg))); bool bGenGroupSPs = (bThreadValid && (GroupThreadID < (min(PrimCount, GS_SEGS)))); if (bGenGroupSPs) { uint Prim = GroupThreadID; uint PrimID = 
VisTilePrims[PrimOffset + Prim]; group_PrimMatID[Prim] = PackHairVisControlPointMaterialId(PrimID, HairMaterialId); uint TypeDummy; CalcHomogenousPosAndRad(PrimID, group_PositionOffset, group_SP0[Prim], group_Rad0[Prim], TypeDummy); CalcHomogenousPosAndRad(PrimID+1, group_PositionOffset, group_SP1[Prim], group_Rad1[Prim], TypeDummy); } if (bWaveThreadValid) { for (uint LinearIndex = GroupThreadID; LinearIndex < SqrTileSize; LinearIndex += WaveThreadCount) { uint2 Coord; Coord.y = (float(LinearIndex) + 0.5f) * RcpTileSize; Coord.x = LinearIndex - (Coord.y * TileSize); Coord += uint2(((PackedTileMin >> 0) & 0xffff), ((PackedTileMin >> 16) & 0xffff)); group_SubTile[LinearIndex].x = OutDepthCovTexture[uint3(Coord, 0)]; group_SubTile[LinearIndex].y = GetInvalidHairControlPointId(); group_SubTile[LinearIndex].z = 0; group_SubTile[LinearIndex].w = PackHairVisDepthCoverage(SceneDepthTexture.Load(uint3(Coord, 0)), 1.0f); } } GroupMemoryBarrierWithGroupSync(); if (bThreadValid) { uint Prim = uint((float(GroupThreadID) + 0.5f) / float(group_ThreadsPerSeg)); uint PModTPS = GroupThreadID - (Prim * group_ThreadsPerSeg); uint PrimMatID; float4 SP0; float4 SP1; float Rad0; float Rad1; if (bUseGroupSPs) { PrimMatID = group_PrimMatID[Prim]; SP0 = group_SP0[Prim]; SP1 = group_SP1[Prim]; Rad0 = group_Rad0[Prim]; Rad1 = group_Rad1[Prim]; } else { uint PrimID = VisTilePrims[PrimOffset + Prim]; PrimMatID = PackHairVisControlPointMaterialId(PrimID, HairMaterialId); uint TypeDummy; CalcHomogenousPosAndRad(PrimID, group_PositionOffset, SP0, Rad0, TypeDummy); CalcHomogenousPosAndRad(PrimID + 1, group_PositionOffset, SP1, Rad1, TypeDummy); } // Clipping { SP0 = float4(NDCToPixelCoord(SP0), 1.0f / SP0.w); SP1 = float4(NDCToPixelCoord(SP1), 1.0f / SP1.w); // Clip against tile const float2 TileMin = float2(((PackedTileMin >> 0) & 0xffff), ((PackedTileMin >> 16) & 0xffff)); const float2 TileMax = TileMin + TileSize; bool2 bClipped = false; ClipRaySegment(TileMin - 0.5f, TileMax + 0.5f, SP0, SP1, 
Rad0, Rad1, bClipped); } const float SegmentLenSqRcp = 1.0f / dot(SP1.xy - SP0.xy, SP1.xy - SP0.xy); #if RASTER_ALGO == RASTER_LINEAR const bool bIsSteep = abs(SP1.x - SP0.x) < abs(SP1.y - SP0.y); const float X0 = bIsSteep ? min(SP0.y, SP1.y) : min(SP0.x, SP1.x); const float X1 = bIsSteep ? max(SP0.y, SP1.y) : max(SP0.x, SP1.x); const int NumSteps = (int)(ceil(X1) - floor(X0)); const float RcpNumSteps = 1.0f / (X1 - X0); LOOP for (int J = PModTPS; J < NumSteps; J += group_ThreadsPerSeg) { const float Alpha = saturate(J * RcpNumSteps); const float4 SP = lerp(SP0, SP1, Alpha); const float AntiAliasingFactor = 1.0f; #if !ENABLE_RASTER_LINEAR_AA PlotInternal(SP.xy, AntiAliasingFactor, SP0, SP1, Rad0, Rad1, SegmentLenSqRcp, PackedTileMin, PrimMatID); #else const float2 Coord = (bIsSteep ? SP.yx : SP.xy) - 0.5f; const float FracY = frac(Coord.y); Plot(Coord, FracY, AntiAliasingFactor, bIsSteep, SP0, SP1, Rad0, Rad1, SegmentLenSqRcp, PackedTileMin, PrimMatID); #endif // !ENABLE_RASTER_LINEAR_AA } #elif RASTER_ALGO == RASTER_WU // Wu's line algorithm. Currently this has some weird artifacts when clipping to tiles. // TODO: Remove this entirely or fix the artifacts. { const bool bIsSteep = abs(SP1.y - SP0.y) > abs(SP1.x - SP0.x); if (bIsSteep) { SP0.xy = SP0.yx; SP1.xy = SP1.yx; } if (SP0.x > SP1.x) { float4 Tmp = SP0; SP0 = SP1; SP1 = Tmp; } const float2 D = SP1.xy - SP0.xy; const float Gradient = abs(D.x) < 1e-5f ? 1.0f : D.y / D.x; float DeltaY = 0.0f; // First endpoint int2 Px0; { const float2 SP0Int = SP0.xy - 0.5f; // transform to integer grid with pixel centers at 0.0 instead of 0.5. 
float2 End; End.x = floor(SP0Int.x); End.y = SP0Int.y + Gradient * (End.x - SP0Int.x); const float GapX = 1.0f;// 1.0f - frac(SP0Int.x + 0.5f); Px0 = int2(End.x, floor(End.y)); if (PModTPS == 0) { const float FracY = frac(End.y); Plot(Px0, FracY, GapX, bIsSteep, SP0, SP1, Rad0, Rad1, SegmentLenSqRcp, PackedTileMin, PrimMatID); } DeltaY = End.y + Gradient; // First y-intersection for the main loop } // Second endpoint int2 Px1; { const float2 SP1Int = SP1.xy - 0.5f; // transform to integer grid with pixel centers at 0.0 instead of 0.5. float2 End; End.x = floor(SP1Int.x); End.y = SP1Int.y + Gradient * (End.x - SP1Int.x); const float GapX = 1.0f;// frac(SP1Int.x + 0.5f); Px1 = float2(End.x, floor(End.y)); if (PModTPS == 0) { const float FracY = frac(End.y); Plot(Px1, FracY, GapX, bIsSteep, SP0, SP1, Rad0, Rad1, SegmentLenSqRcp, PackedTileMin, PrimMatID); } } // Main loop const int XBegin = Px0.x + 1 + PModTPS; const int XEnd = Px1.x; DeltaY += PModTPS * Gradient; for (int X = XBegin; X < XEnd; X += group_ThreadsPerSeg) { const int2 Coord = int2(X, floor(DeltaY)); const float FracY = frac(DeltaY); Plot(Coord, FracY, 1.0f, bIsSteep, SP0, SP1, Rad0, Rad1, SegmentLenSqRcp, PackedTileMin, PrimMatID); DeltaY += group_ThreadsPerSeg * Gradient; } } #endif // RASTER_ALGO == RASTER_LINEAR } GroupMemoryBarrierWithGroupSync(); if (bWaveThreadValid) { for (uint LinearIndex = GroupThreadID; LinearIndex < SqrTileSize; LinearIndex += WaveThreadCount) { uint2 Coord; Coord.y = (float(LinearIndex) + 0.5f) * RcpTileSize; Coord.x = LinearIndex - (Coord.y * TileSize); Coord += uint2(((PackedTileMin >> 0) & 0xffff), ((PackedTileMin >> 16) & 0xffff)); if (group_SubTile[LinearIndex].y != GetInvalidHairControlPointId()) { uint oldValue; InterlockedMax(OutDepthCovTexture[uint3(Coord, 0)], group_SubTile[LinearIndex].x, oldValue); if (group_SubTile[LinearIndex].x > oldValue) { OutPrimMatTexture[uint3(Coord, 0)] = group_SubTile[LinearIndex].y; } } InterlockedAdd(OutHairCountTexture[uint3(Coord, 
0)], group_SubTile[LinearIndex].z); } } GroupMemoryBarrierWithGroupSync(); } }
#endif //SHADER_RASTERCOMPUTE_RASTER

///////////////////////////////////////////////////////////////////////////
#if SHADER_RASTERCOMPUTE_RASTER_MULTI_SAMPLE

// Fall back to a single sample per pixel when the permutation does not provide a sample count.
#ifndef PERMUTATION_MULTI_SAMPLE_COUNT
#define PERMUTATION_MULTI_SAMPLE_COUNT 1
#endif

// Wave size
// NOTE(review): both supported group sizes select WAVE_SIZE 32 - confirm this is intended for the 64 case.
#if PERMUTATION_GROUP_SIZE == 64
#define WAVE_SIZE 32
#elif PERMUTATION_GROUP_SIZE == 32
#define WAVE_SIZE 32
#else
#error Unknown group size
#endif

// Shader resources.
// NOTE(review): element types are not visible in this view; the atomics below operate on uint values - confirm the declarations.
Buffer VisTilePrims;
Buffer VisTileArgs;
ByteAddressBuffer VisTileData;
RWTexture2D OutHairCountTexture;
RWTexture2DArray OutDepthCovTexture;
RWTexture2DArray OutPrimMatTexture;

// Reads one dword field of a tile record from the untyped tile buffer.
//  index  : tile record index (each record is VT_SIZE dwords)
//  offset : dword offset of the field inside the record (one of the VT_* constants)
uint LoadVisTileData(uint index, uint offset)
{
	const uint RecordByteAddress = index * VT_SIZE * 4;
	const uint FieldByteOffset = offset * 4;
	return VisTileData.Load(RecordByteAddress + FieldByteOffset);
}

// Per sub-tile scratch storage, one entry per pixel of the sub-tile.
groupshared uint group_SubTileSceneDepth[256]; // (16 x 16 x 4 bytes = 1k bytes)
groupshared uint group_SubTileHairCount[256]; // (16 x 16 x 4 bytes = 1k bytes)
groupshared uint group_SubTileHairDepth[PERMUTATION_MULTI_SAMPLE_COUNT][256]; // (16 x 16 x 4 bytes = 1k bytes) per sample
groupshared uint group_SubTilePrimMatID[PERMUTATION_MULTI_SAMPLE_COUNT][256]; // (16 x 16 x 4 bytes = 1k bytes) per sample

// Values computed once per group by thread 0 and shared with the other threads.
groupshared float3 group_PositionOffset;
groupshared uint group_LoopNum;
groupshared uint group_TileNum;

// Returns the distance from point P1 to the infinite line going through P2 and P3.
// RcpLineSegLength must be 1 / |P3 - P2|.
float GetDistanceToLine(float2 P1, float2 P2, float2 P3, float RcpLineSegLength)
{
	// The distance is the height h of the triangle (P1, P2, P3) over its base (P2, P3).
	// With A = 0.5 * h * b (b being the base length), h = (2 * A) / b.
	// The determinant of the three points gives 2 * A directly, so only its absolute value is needed.
	const float Det = P1.x * (P2.y - P3.y) + P2.x * (P3.y - P1.y) + P3.x * (P1.y - P2.y);
	const float TwiceArea = abs(Det);
	return TwiceArea * RcpLineSegLength;
}

// Builds a bit mask of the sub-pixel samples of pixel PixelCoord that are covered by the line going through P0 and P1.
// Bit i is set when sample i lies within 1 / PERMUTATION_MULTI_SAMPLE_COUNT pixels of the line.
uint GetCoverageMask(int2 PixelCoord, float2 P0, float2 P1)
{
	// Sample positions relative to the pixel origin; the array index is the bit index in the returned mask.
#if PERMUTATION_MULTI_SAMPLE_COUNT == 1
	static const float2 SamplePositions[1] =
	{
		float2(0.5f, 0.5f)
	};
#elif PERMUTATION_MULTI_SAMPLE_COUNT == 2
	static const float2 SamplePositions[2] =
	{
		float2(0.75f, 0.75f),
		float2(0.25f, 0.25f)
	};
#elif PERMUTATION_MULTI_SAMPLE_COUNT == 4
	static const float2 SamplePositions[4] =
	{
		float2(0.375f, 0.125f),
		float2(0.875f, 0.375f),
		float2(0.125f, 0.625f),
		float2(0.625f, 0.875f)
	};
#elif PERMUTATION_MULTI_SAMPLE_COUNT == 8
	static const float2 SamplePositions[8] =
	{
		float2(0.5625f, 0.3125f),
		float2(0.4375f, 0.6875f),
		float2(0.8125f, 0.5625f),
		float2(0.3125f, 0.1875f),
		float2(0.1875f, 0.8125f),
		float2(0.0625f, 0.4375f),
		float2(0.6875f, 0.9375f),
		float2(0.9375f, 0.0625f)
	};
#else
#error Unsupported PERMUTATION_MULTI_SAMPLE_COUNT! Must be 1, 2, 4 or 8!
#endif

	const float MaxSampleDistance = 1.0f / PERMUTATION_MULTI_SAMPLE_COUNT; // In pixel units

	// Set origin to PixelCoord
	const float2 LocalP0 = P0 - PixelCoord;
	const float2 LocalP1 = P1 - PixelCoord;
	const float RcpLineSegLength = 1.0f / distance(LocalP0, LocalP1);

	uint Mask = 0;
	UNROLL
	for (uint SampleIdx = 0; SampleIdx < PERMUTATION_MULTI_SAMPLE_COUNT; ++SampleIdx)
	{
		if (GetDistanceToLine(SamplePositions[SampleIdx], LocalP0, LocalP1, RcpLineSegLength) <= MaxSampleDistance)
		{
			Mask |= (1u << SampleIdx);
		}
	}
	return Mask;
}

// Rasterizes one pixel of a segment into the groupshared sub-tile storage.
//  Coord             : pixel coordinate to plot
//  P0, P1            : segment end points (xy used for interpolation/coverage, z for depth, w for the radius correction)
//  Rad0, Rad1        : radius at each end point
//  SegmentLenSqRcp   : reciprocal of the squared segment length, forwarded to ComputeLerpAlpha
//  PackedHalfTileMin : sub-tile origin, x in the low 16 bits and y in the high 16 bits
//  PrimMatID         : packed primitive/material id stored for samples that win the depth test
void Plot(int2 Coord, float4 P0, float4 P1, float Rad0, float Rad1, float SegmentLenSqRcp, uint PackedHalfTileMin, uint PrimMatID)
{
	const int2 SubTileOrigin = int2(((PackedHalfTileMin >> 0) & 0xffff), ((PackedHalfTileMin >> 16) & 0xffff));
	const int2 LocalCoord = Coord - SubTileOrigin;

	// Ignore pixels falling outside of the sub-tile currently being processed
	if (all(LocalCoord >= 0) && all(LocalCoord < int2(HalfTileSize,HalfTileSize)))
	{
		const float SegAlpha = ComputeLerpAlpha(Coord, P0.xy, P1.xy, SegmentLenSqRcp);
		const float SegDepth = lerp(P0.z, P1.z, SegAlpha);
		const uint PackedDepth = PackHairVisDepthCoverage(SegDepth, 1.0f);
		const uint LdsIndex = LocalCoord.x + LocalCoord.y * HalfTileSize;

		// Test against scene depth
		if (PackedDepth > group_SubTileSceneDepth[LdsIndex])
		{
			const float Rad = ComputePerspectiveCorrectRadius(Rad0, Rad1, SegAlpha, P0.w, P1.w);
			const uint HairCount = min(Rad, 0.5f) * 2.0f * 1000.0f * CoverageScale;
			const uint SampleMask = GetCoverageMask(Coord, P0.xy, P1.xy);

			// Accumulate hair count (once per pixel, as soon as any sample is covered)
			if (SampleMask)
			{
				InterlockedAdd(group_SubTileHairCount[LdsIndex], HairCount);
			}

			UNROLL
			for (uint SampleIdx = 0; SampleIdx < PERMUTATION_MULTI_SAMPLE_COUNT; ++SampleIdx)
			{
				if (SampleMask & (1u << SampleIdx))
				{
					// Write Depth + PrimMatID if depth test against hair depths is passed
					uint PreviousDepth;
					InterlockedMax(group_SubTileHairDepth[SampleIdx][LdsIndex], PackedDepth, PreviousDepth);
					if (PackedDepth > PreviousDepth)
					{
						group_SubTilePrimMatID[SampleIdx][LdsIndex] = PrimMatID;
} } } } } }

// Maps a linear thread index to the pixel coordinate it owns inside a HalfTileSize x HalfTileSize sub-tile.
//  LinearIndex      : row-major index inside the sub-tile
//  PackedSubTileMin : sub-tile origin, x in the low 16 bits and y in the high 16 bits
uint2 GetSubTilePixelCoord(uint LinearIndex, uint PackedSubTileMin)
{
	uint2 Coord;
	// Row = LinearIndex / HalfTileSize, computed with a float multiply; the +0.5 bias keeps the truncation on the right row.
	Coord.y = (float(LinearIndex) + 0.5f) * RcpHalfTileSize;
	Coord.x = LinearIndex - (Coord.y * HalfTileSize);
	Coord += uint2(((PackedSubTileMin >> 0u) & 0xFFFF), ((PackedSubTileMin >> 16u) & 0xFFFF));
	return Coord;
}

// Multi-sample rasterizer: each group walks a contiguous range of binned tiles and rasterizes the segments of
// each tile into groupshared memory, one sub-tile at a time, before merging the result into the output textures.
[numthreads(1024, 1, 1)]
void RasterMultiSampleCS(uint DispatchThreadID : SV_DispatchThreadID, uint GroupThreadID : SV_GroupThreadID, uint GroupID : SV_GroupID)
{
	ResolvedView = ResolveView();

	// Thread 0 publishes the per-group constants
	if (GroupThreadID == 0)
	{
		group_TileNum = VisTileArgs[0];
		// Number of tiles handled by each group: TileNum / NumRasterizers rounded up (float math, truncated on store)
		group_LoopNum = (float(group_TileNum) + float(NumRasterizers - 1)) * RcpNumRasterizers;
		group_PositionOffset = HairStrandsVF_GetHairInstancePositionOffset();
	}

	GroupMemoryBarrierWithGroupSync();

	LOOP
	for (uint LoopIndex = 0; LoopIndex < group_LoopNum; LoopIndex++)
	{
		// TileIndex only depends on group-wide values, so the whole group exits together
		const uint TileIndex = LoopIndex + (GroupID * group_LoopNum);
		if (TileIndex >= group_TileNum)
		{
			return;
		}

		const uint PrimOffset = TileIndex * 1024;
		const uint PrimCount = LoadVisTileData(TileIndex, VT_PrimCount);
		const uint PackedCoord = LoadVisTileData(TileIndex, VT_Coord);
		const uint2 TileMin = UnpackVisTileCoord(PackedCoord) * TileSize;

		// Share each segment between as many threads as possible while keeping PrimCount * ThreadsPerSeg <= 1024
		uint ThreadsPerSeg = 1;
		if (PrimCount <= 512) ThreadsPerSeg = 2;
		if (PrimCount <= 341) ThreadsPerSeg = 3;
		if (PrimCount <= 256) ThreadsPerSeg = 4;
		if (PrimCount <= 204) ThreadsPerSeg = 5;
		if (PrimCount <= 170) ThreadsPerSeg = 6;
		if (PrimCount <= 146) ThreadsPerSeg = 7;
		if (PrimCount <= 128) ThreadsPerSeg = 8;
		if (PrimCount <= 64) ThreadsPerSeg = 16;
		if (PrimCount <= 32) ThreadsPerSeg = 32;

		const bool bThreadValid = (GroupThreadID < (PrimCount * ThreadsPerSeg));
		// Segment handled by this thread, and rank of the thread among the threads sharing that segment
		const uint Prim = uint((float(GroupThreadID) + 0.5f) / float(ThreadsPerSeg));
		const uint PModTPS = GroupThreadID - (Prim * ThreadsPerSeg);

		float4 SP0 = 0;
		float4 SP1 = 0;
		float Rad0 = 0;
		float Rad1 = 0;
		bool bIsEndPoint = false;
		uint PrimMatID = ~0;

		if (bThreadValid)
		{
			const uint PrimID = VisTilePrims[PrimOffset + Prim];
			PrimMatID = PackHairVisControlPointMaterialId(PrimID, HairMaterialId);

			uint Type;
			CalcHomogenousPosAndRad(PrimID, group_PositionOffset, SP0, Rad0, Type);
			CalcHomogenousPosAndRad(PrimID + 1, group_PositionOffset, SP1, Rad1, Type);
			bIsEndPoint = (Type == HAIR_CONTROLPOINT_END);

			// xyz: pixel coordinate + depth, w: 1 / clip-space w
			SP0 = float4(NDCToPixelCoord(SP0), 1.0f / SP0.w);
			SP1 = float4(NDCToPixelCoord(SP1), 1.0f / SP1.w);
		}

		// Split 32x32 tile into 4 16x16 tiles that are processed one after another.
		// This is to reduce LDS memory pressure.
		UNROLL
		for (uint SubTileIdx = 0; SubTileIdx < 4; ++SubTileIdx)
		{
			// Only the packed origin is kept alive; it is unpacked again where it is needed
			const uint2 SubTileOrigin = TileMin + uint2((SubTileIdx == 0 || SubTileIdx == 2) ? 0 : HalfTileSize, SubTileIdx < 2 ? 0 : HalfTileSize);
			const uint PackedSubTileMin = ((SubTileOrigin.x & 0xFFFF) << 0u) | ((SubTileOrigin.y & 0xFFFF) << 16u);

			// Initialize LDS
			if (GroupThreadID < SqrHalfTileSize)
			{
				const uint2 Coord = GetSubTilePixelCoord(GroupThreadID, PackedSubTileMin);

				group_SubTileSceneDepth[GroupThreadID] = PackHairVisDepthCoverage(SceneDepthTexture.Load(uint3(Coord, 0)), 1.0f);
				group_SubTileHairCount[GroupThreadID] = 0;

				UNROLL
				for (uint SampleIdx = 0; SampleIdx < PERMUTATION_MULTI_SAMPLE_COUNT; ++SampleIdx)
				{
					// Seed with the depth already in the output so that only closer hair is recorded
					const uint HairDepth = OutDepthCovTexture[uint3(Coord, SampleIdx)];
					group_SubTileHairDepth[SampleIdx][GroupThreadID] = HairDepth;
					group_SubTilePrimMatID[SampleIdx][GroupThreadID] = GetInvalidHairControlPointId();
				}
			}

			GroupMemoryBarrierWithGroupSync();

			// Rasterize to LDS
			if (bThreadValid)
			{
				const uint2 SubTileMin = uint2(((PackedSubTileMin >> 0u) & 0xFFFF), ((PackedSubTileMin >> 16u) & 0xFFFF));
				const uint2 SubTileMax = SubTileMin + HalfTileSize;

				bool2 bClipped;
				float2 T;
				const bool bVisible = ClipRaySegment(SubTileMin - 0.5f, SubTileMax + 0.5f, SP0, SP1, T, bClipped);
				T = saturate(T);

				if (bVisible)
				{
					const float2 SP0Clipped = lerp(SP0, SP1, T.x).xy;
					const float2 SP1Clipped = lerp(SP0, SP1, T.y).xy;

					// Step along the major axis of the segment
					const bool bIsSteep = abs(SP1.x - SP0.x) < abs(SP1.y - SP0.y);
					const float X0 = bIsSteep ? min(SP0Clipped.y, SP1Clipped.y) : min(SP0Clipped.x, SP1Clipped.x);
					const float X1 = bIsSteep ? max(SP0Clipped.y, SP1Clipped.y) : max(SP0Clipped.x, SP1Clipped.x);

					const int NumSteps = (int)(ceil(X1) - floor(X0));
					// One step is one pixel along the major axis, so J * RcpNumSteps is the fraction of the clipped span
					const float RcpNumSteps = 1.0f / (X1 - X0);
					const float SegmentLenSqRcp = 1.0f / dot(SP1.xy - SP0.xy, SP1.xy - SP0.xy);

					// The last step is only taken when the segment was clipped at its end or is the last one of its strand
					const int End = !bClipped.y && !bIsEndPoint ? (NumSteps - 1) : NumSteps;

					LOOP
					for (int J = PModTPS; J < End; J += ThreadsPerSeg)
					{
						const float Alpha = lerp(T.x, T.y, saturate(J * RcpNumSteps));
						const float2 SP = lerp(SP0.xy, SP1.xy, Alpha);
						Plot(SP, SP0, SP1, Rad0, Rad1, SegmentLenSqRcp, PackedSubTileMin, PrimMatID);
					}
				}
			}

			GroupMemoryBarrierWithGroupSync();

			// Write out to global memory
			if (GroupThreadID < SqrHalfTileSize)
			{
				const uint2 Coord = GetSubTilePixelCoord(GroupThreadID, PackedSubTileMin);

				const uint HairCount = group_SubTileHairCount[GroupThreadID];
				if (HairCount != 0)
				{
					InterlockedAdd(OutHairCountTexture[Coord], HairCount);
				}

				UNROLL
				for (uint SampleIdx = 0; SampleIdx < PERMUTATION_MULTI_SAMPLE_COUNT; ++SampleIdx)
				{
					const uint3 SampleCoord = uint3(Coord, SampleIdx);
					// An invalid id means no segment of this tile won the sample
					const uint WinningPrimMatID = group_SubTilePrimMatID[SampleIdx][GroupThreadID];
					if (WinningPrimMatID != GetInvalidHairControlPointId())
					{
						const uint HairDepth = group_SubTileHairDepth[SampleIdx][GroupThreadID];
						uint OldValue;
						InterlockedMax(OutDepthCovTexture[SampleCoord], HairDepth, OldValue);
						if (HairDepth > OldValue)
						{
							OutPrimMatTexture[SampleCoord] = WinningPrimMatID;
						}
					}
				}
			}

			// LDS is re-initialized by the next sub-tile
			GroupMemoryBarrierWithGroupSync();
		}
	}
}

#endif //SHADER_RASTERCOMPUTE_RASTER_MULTI_SAMPLE

///////////////////////////////////////////////////////////////////////////
#if SHADER_RASTERCOMPUTE_DEBUG

#include "../ShaderPrint.ush"

Texture2D VisTileDepthGrid;
Texture2DArray VisTileBinningGrid;
Buffer VisTileArgs;
uint MacroGroupId;
uint PrimitiveInfoIndex;
uint
TotalPrimitiveInfoCount;

// Half a tile, in pixels; added to the quad origin when placing the per-bin text.
#define TilePrintOffset (TileSize >> 1)

// Returns Color with its alpha replaced by 0.5.
float4 Transparent(float4 Color)
{
	float4 Result = Color;
	Result.w = 0.5f;
	return Result;
}

// Sums the segment counts stored for TileCoord in every slice of the binning grid.
// When bPrintDetails is set, one quad per bin showing its individual count is also drawn.
uint GetTileTotalSegment(uint2 TileCoord, bool bPrintDetails)
{
	const float TileDisplayScale = 1.5f;
	const uint DisplayTileSize = TileSize * TileDisplayScale;
	const uint BinCount = NumBinners;// * 2; // Each binner fill in 2 bins, see binning algo.

	uint SegmentSum = 0;
	uint2 DetailCell = uint2(0, 0);
	for (uint BinIt = 0; BinIt < BinCount; ++BinIt)
	{
		const uint BinSegments = VisTileBinningGrid.Load(uint4(TileCoord, BinIt, 0));
		SegmentSum += BinSegments;

		if (bPrintDetails)
		{
			const uint2 QuadMin = DetailCell * DisplayTileSize;
			const uint2 QuadMax = (DetailCell + 1) * DisplayTileSize;

			// Green when the bin holds segments for this tile, red otherwise
			AddFilledQuadSS(QuadMin, QuadMax, BinSegments > 0 ? Transparent(ColorLightGreen) : Transparent(ColorLightRed));
			AddQuadSS(QuadMin, QuadMax, ColorYellow);

			FShaderPrintContext Context = InitShaderPrintContext(true, QuadMin + TilePrintOffset);
			Print(Context, BinSegments, FontWhite);

			++DetailCell.x;

			// Span details onto 2 lines
			if (BinIt == NumBinners-1)
			{
				DetailCell.x = 0;
				++DetailCell.y;
			}
		}
	}
	return SegmentSum;
}

// Draws the overlay of one tile: a filled quad (green when TotalSegments > 0, red otherwise) and,
// when bPrintText is set, the segment count and a yellow outline.
void PrintTile(uint2 TileCoord, uint TotalSegments, bool bPrintText)
{
	const uint2 QuadMin = TileCoord * TileSize;
	const uint2 QuadMax = (TileCoord + 1) * TileSize;

	AddFilledQuadSS(QuadMin, QuadMax, TotalSegments > 0 ? Transparent(ColorLightGreen) : Transparent(ColorLightRed));

	if (bPrintText)
	{
		FShaderPrintContext Context = InitShaderPrintContext(true, QuadMin + uint2(0, TileSize * 1.5f));
		Print(Context, TotalSegments, FontWhite);
		AddQuadSS(QuadMin, QuadMax, ColorYellow);
	}
}

// Debug visualization entry point: thread (0,0,0) prints the configuration and the details of the tile under
// the cursor, and every thread overlays the tile matching its dispatch id when that tile holds segments.
[numthreads(8, 8, 1)]
void MainCS(uint3 ThreadId : SV_DispatchThreadID)
{
	const bool bIsFirstThread = all(ThreadId == 0);

	// Info/Stats
	if (bIsFirstThread)
	{
		FShaderPrintContext Ctx = InitShaderPrintContext(true, uint2(50, 110));

		Print(Ctx, TEXT("Raster compute "), FontYellow);
		Newline(Ctx);

		Print(Ctx, TEXT("Macro Group Id : "), FontSilver);
		Print(Ctx, MacroGroupId, FontWhite);
		Newline(Ctx);

		Print(Ctx, TEXT("Primitive Info : "), FontSilver);
		Print(Ctx, PrimitiveInfoIndex, FontWhite, 2, 0);
		Print(Ctx, TEXT("/"), FontSilver);
		Print(Ctx, TotalPrimitiveInfoCount, FontWhite, 2, 0);
		Newline(Ctx);

		Newline(Ctx);

		Print(Ctx, TEXT("Configuration "), FontYellow);
		Newline(Ctx);

		Print(Ctx, TEXT("Output Resolution : "), FontSilver);
		Print(Ctx, OutputResolution, FontWhite);
		Newline(Ctx);

		Print(Ctx, TEXT("Resolution Multiplier: "), FontSilver);
		Print(Ctx, ResolutionMultiplier, FontWhite);
		Newline(Ctx);

		Newline(Ctx);

		Print(Ctx, TEXT("Tile Size : "), FontSilver);
		Print(Ctx, TileSize, FontWhite);
		Newline(Ctx);

		Print(Ctx, TEXT("Tile Res : "), FontSilver);
		Print(Ctx, TileRes.x, FontWhite, 2, 0);
		Print(Ctx, TEXT("x"), FontSilver);
		Print(Ctx, TileRes.y, FontWhite, 2, 0);
		Newline(Ctx);

		Newline(Ctx);

		Print(Ctx, TEXT("Num Binners : "), FontSilver);
		Print(Ctx, NumBinners, FontWhite);
		Newline(Ctx);

		Print(Ctx, TEXT("Num Rasterizers : "), FontSilver);
		Print(Ctx, NumRasterizers, FontWhite);
		Newline(Ctx);

		Print(Ctx, TEXT("Max Raster Count : "), FontSilver);
		Print(Ctx, MaxRasterCount, FontWhite);
		Newline(Ctx);

		Newline(Ctx);

		Print(Ctx, TEXT("Allocated Tile Count : "), FontSilver);
		Print(Ctx, VisTileArgs[0], FontWhite);
		Newline(Ctx);
	}

	// Cursor info
	if (bIsFirstThread && all(ShaderPrintData.CursorCoord >= 0))
	{
		const uint2 CursorPixel = ShaderPrintData.CursorCoord;
		const uint2 CursorTile = CursorPixel >> TileSizeAsShift;
		const uint CursorSegments = GetTileTotalSegment(CursorTile, true);
		PrintTile(CursorTile, CursorSegments, true);
	}

	// All tile
	{
		const uint2 ThreadTile = ThreadId.xy;
		const uint ThreadTileSegments = GetTileTotalSegment(ThreadTile, false);
		if (ThreadTileSegments != 0)
		{
			PrintTile(ThreadTile, ThreadTileSegments, false);
		}
	}
}

#endif //SHADER_RASTERCOMPUTE_DEBUG