UnrealEngine/Engine/Shaders/Private/HairStrands/HairStrandsVisibilityRasterCompute.usf

// Copyright Epic Games, Inc. All Rights Reserved.

#define HAIR_STRANDS_PARAMETERS 1

#include "../Common.ush"
#include "../WaveOpUtil.ush"
#include "HairStrandsClusterCommon.ush"
#include "HairStrandsVertexFactoryCommon.ush"
#include "HairStrandsVisibilityCommon.ush"

///////////////////////////////////////////////////////////////////////////
// Common parameters

// Tile layout parameters shared by the raster-compute passes.
// NOTE(review): not referenced in the visible part of this file; presumably log2(TileSize) - confirm against the C++ side.
uint	TileSizeAsShift;
// Tile edge length in pixels; tile coordinates are multiplied by it to obtain pixel coordinates.
uint	TileSize;
// Scale applied to pixel coordinates / linear pixel indices to obtain tile coordinates / row indices (presumably 1 / TileSize).
float	RcpTileSize;
// Upper bound for linear pixel indices inside one tile (presumably TileSize * TileSize).
uint	SqrTileSize;
// NOTE(review): the three half-tile parameters are not referenced in the visible part of this file - confirm their users.
uint	HalfTileSize;
float	RcpHalfTileSize;
uint	SqrHalfTileSize;
// Upper bound used when clamping tile-space segment end points during binning.
int2	TileRes;

// Number of binning workgroups; also used as the layer stride of the binning grid.
uint	NumBinners;
// Used to turn the batch count into a per-binner loop count (presumably 1 / NumBinners).
float	RcpNumBinners;
// Number of rasterizer workgroups and the scale used to derive the per-rasterizer loop count.
uint	NumRasterizers;
float	RcpNumRasterizers;

// NOTE(review): MaxRasterCount, FrameIdMod8 and ResolutionMultiplier are not referenced in the visible part of this file.
uint	MaxRasterCount;
uint	FrameIdMod8;
uint	ResolutionMultiplier;
// Output size in pixels; used to scale UVs to pixel coordinates and to bounds-check pixel coordinates.
int2	OutputResolution;
// NOTE(review): only mentioned in comments in the visible part of this file; presumably OutputResolution as float.
float2	OutputResolutionf;

///////////////////////////////////////////////////////////////////////////

#if SHADER_RASTERCOMPUTE_BINNING || SHADER_RASTERCOMPUTE_COMPACTION || SHADER_RASTERCOMPUTE_RASTER || SHADER_RASTERCOMPUTE_RASTER_MULTI_SAMPLE || SHADER_RASTERCOMPUTE_DEPTH_GRID

///////////////////////////////////////////////////////////////////////////

/*
// use untyped buffer for segment tiles to reduce VGPR usage - 16 bytes
struct FVisTile
{
	uint PrimOffset;
	uint PrimCount;
	uint TileCoord;
	uint MinDepth;
};
*/
#define VT_SIZE 4

#define VT_PrimOffset 0
#define VT_PrimCount 1
#define VT_Coord 2
#define VT_MinWriteIndex 3

// Packs a 2D tile coordinate into a single uint.
// Only the low 8 bits of each component are kept: x goes to bits [0,7], y to bits [8,15].
uint PackVisTileCoord(uint2 Coord)
{
	const uint LowX = Coord.x & 0xff;
	const uint LowY = Coord.y & 0xff;
	return uint(LowX | (LowY << 8));
}

// Extracts a 2D tile coordinate from a packed uint: x from bits [0,7], y from bits [8,15].
uint2 UnpackVisTileCoord(uint Packed)
{
	uint2 Coord;
	Coord.x = Packed & 0xff;
	Coord.y = (Packed >> 8) & 0xff;
	return Coord;
}

///////////////////////////////////////////////////////////////////////////

uint			MacroGroupId;
uint			HairMaterialId;


Texture2D<float> SceneDepthTexture;

uint 			VertexCount;
float			CoverageScale;

// Converts a homogeneous clip-space position to output pixel coordinates.
// Returns (pixel x, pixel y, post-divide depth). Performs the perspective divide, remaps xy with
// ResolvedView.ScreenPositionScaleBias and scales the result by OutputResolution.
float3 NDCToPixelCoord(float4 InDC)
{
	const float3 Projected = InDC.xyz / InDC.w;
	const float2 ScreenUV = Projected.xy * ResolvedView.ScreenPositionScaleBias.xy + ResolvedView.ScreenPositionScaleBias.wz;
	const float2 PixelPos = ScreenUV * OutputResolution;
	return float3(PixelPos, Projected.z);
}

// Reads control point InPointIndex and transforms it to clip space.
//  InPointIndex - index of the control point in HairStrandsVF_PositionBuffer
//  PBO          - position offset forwarded to ReadHairControlPoint
//  HP           - [out] homogeneous clip-space position (no perspective divide applied)
//  Type         - [out] control point type as stored in the control point
void CalcHomogenousPos(in uint InPointIndex, in float3 PBO, out float4 HP, out uint Type)
{
	const FHairControlPoint ControlPoint = ReadHairControlPoint(HairStrandsVF_PositionBuffer, InPointIndex, PBO, HairStrandsVF_Radius, HairStrandsVF_RootScale, HairStrandsVF_TipScale);

	Type = ControlPoint.Type;

	// Local -> world -> clip
	const float4 LocalPos = float4(ControlPoint.Position, 1.0f);
	const float3 WorldPos = mul(LocalPos, HairStrandsVF_LocalToWorldPrimitiveTransform).xyz;
	HP = mul(float4(WorldPos, 1.0f), DFHackToFloat(PrimaryView.WorldToClip));
}

// Reads control point InPointIndex, transforms it to clip space and also returns a scaled radius.
//  InPointIndex - index of the control point in HairStrandsVF_PositionBuffer
//  PBO          - position offset forwarded to ReadHairControlPoint
//  HP           - [out] homogeneous clip-space position (no perspective divide applied)
//  Rad          - [out] control point world radius multiplied by a fixed factor of 2000
//  Type         - [out] control point type as stored in the control point
void CalcHomogenousPosAndRad(in uint InPointIndex, in float3 PBO, out float4 HP, out float Rad, out uint Type)
{
	const FHairControlPoint ControlPoint = ReadHairControlPoint(HairStrandsVF_PositionBuffer, InPointIndex, PBO, HairStrandsVF_Radius, HairStrandsVF_RootScale, HairStrandsVF_TipScale);

	Type = ControlPoint.Type;
	Rad = ControlPoint.WorldRadius * 2000.0; // OutputResolutionf.x; //TODO: figure this out correctly?

	// Local -> world -> clip
	const float4 LocalPos = float4(ControlPoint.Position, 1.0f);
	const float3 WorldPos = mul(LocalPos, HairStrandsVF_LocalToWorldPrimitiveTransform).xyz;
	HP = mul(float4(WorldPos, 1.0f), DFHackToFloat(PrimaryView.WorldToClip));
}

// Returns the interpolation factor in [0,1] of the center of pixel Coord projected onto segment P0->P1.
// SegmentLenSqRcp must be 1 / dot(P1 - P0, P1 - P0); the result is
// saturate(dot(P - P0, P1 - P0) / dot(P1 - P0, P1 - P0)) with P being the pixel center.
float ComputeLerpAlpha(int2 Coord, float2 P0, float2 P1, float SegmentLenSqRcp)
{
	const float2 PixelCenter = Coord + 0.5f;
	const float2 ToPixel = PixelCenter - P0;
	const float2 SegmentVec = P1 - P0;
	return saturate(dot(ToPixel, SegmentVec) * SegmentLenSqRcp);
}

// Interpolates the radius between two segment end points with perspective correction.
//  Rad0, Rad1   - radii at the end points
//  Alpha        - screen-space interpolation factor between the end points
//  RcpW0, RcpW1 - reciprocal clip-space w of the end points
// The perspective-correct factor is (Alpha / w1) / ((1 - Alpha) / w0 + Alpha / w1).
float ComputePerspectiveCorrectRadius(float Rad0, float Rad1, float Alpha, float RcpW0, float RcpW1)
{
	const float InterpolatedRcpW = lerp(RcpW0, RcpW1, Alpha);
	const float CorrectedAlpha = (Alpha * RcpW1) / InterpolatedRcpW;
	// The interpolated radius is additionally scaled by 1/w, which makes the thickness depth dependent.
	// This scaling was kept from the previous line rasterization algorithm.
	return lerp(Rad0, Rad1, CorrectedAlpha) * InterpolatedRcpW;
}

// Line clipping based on "CLIPPING USING HOMOGENEOUS COORDINATES" by Blinn et al.
// Clips the homogeneous segment P0->P1 against the six planes w+x, w-x, w+y, w-y, z and w-z.
// Returns true and overwrites P0/P1 with the clipped end points if any part of the segment is inside;
// returns false and leaves P0/P1 untouched if the segment is entirely outside.
bool BlinnLineClipping(inout float4 P0, inout float4 P1)
{
	// Parametric range of the visible part of the segment: T.x = entry, T.y = exit.
	float2 T = float2(0.0f, 1.0f);
	bool bIsRemoved = P0.w < 0.0f && P1.w < 0.0f; // Both points behind near plane

	bool bSign = false;

	UNROLL
	for (uint PlaneIdx = 0; PlaneIdx < 6; ++PlaneIdx)
	{
		// Compute boundary coordinates of both points (w+x, w-x, w+y, w-y, z, w-z)
		bSign = !bSign;
		const uint CompIdx = PlaneIdx / 2;
		const float Sign = bSign ? 1.0f : -1.0f;
		const float WFactor = PlaneIdx != 4 ? 1.0f : 0.0f;
		const float2 BC = WFactor * float2(P0.w, P1.w) + Sign * float2(P0[CompIdx], P1[CompIdx]);

		float Num = BC.x;
		float Denom = BC.x - BC.y;
		bIsRemoved = bIsRemoved || (BC.x < 0.0f && BC.y < 0.0f); // Both points outside the plane
		float Alpha = Num / Denom;

		// If the denominator is negative, P0 has a smaller boundary coordinate than P1, so we can assume
		// that P1 is inside the plane (or bIsRemoved is true), so we need to update the alpha for P0.
		// The reverse is true if the denominator is positive.
		if (Denom < 0.0f)
		{
			T.x = max(T.x, Alpha);
		}
		else
		{
			T.y = min(T.y, Alpha);
		}
	}

	// An empty parametric range means the segment leaves the clip volume before it enters it, i.e. it misses
	// the volume even though no single plane has both end points on its outside (e.g. passing by a corner).
	bIsRemoved = bIsRemoved || (T.x > T.y);

	if (!bIsRemoved)
	{
		const float4 P0Clipped = lerp(P0, P1, T.x);
		const float4 P1Clipped = lerp(P0, P1, T.y);
		P0 = P0Clipped;
		P1 = P1Clipped;
	}

	return !bIsRemoved;
}

// Intersects the 2D segment P0.xy->P1.xy with an axis-aligned box using a slab test.
//  AABBMin, AABBMax - box extents
//  P0, P1           - segment end points (only xy is used)
//  T                - [out] parametric entry (x) and exit (y) of the supporting ray; (0,1) if both end points are inside
//  bClipped         - [out] per end point: true if that end point is outside the box and has to be moved to T.
//                     Both components are set to true when the segment is rejected.
// Returns false if the segment does not touch the box at all.
bool ClipRaySegment(float2 AABBMin, float2 AABBMax, float4 P0, float4 P1, out float2 T, out bool2 bClipped)
{
	bClipped = false;
	T = float2(0.0f, 1.0f);
	const bool bP0Outside = any(P0.xy < AABBMin) || any(P0.xy > AABBMax);
	const bool bP1Outside = any(P1.xy < AABBMin) || any(P1.xy > AABBMax);
	if (!bP0Outside && !bP1Outside)
	{
		return true;
	}

	const float2 Origin = P0.xy;
	const float2 Dir = P1.xy - P0.xy;
	const float2 RcpDir = 1.0f / Dir;

	const float2 T0 = (AABBMin - Origin) * RcpDir;
	const float2 T1 = (AABBMax - Origin) * RcpDir;

	T.x = max(min(T0.x, T1.x), min(T0.y, T1.y));
	T.y = min(max(T0.x, T1.x), max(T0.y, T1.y));

	// Reject if the ray misses the box (T.x > T.y) or if it hits the box only outside of the segment:
	// entirely before P0 (T.y < 0) or entirely after P1 (T.x > 1).
	if (T.y < 0.0f || T.x > 1.0f || T.x > T.y)
	{
		bClipped = true;
		return false;
	}

	if (bP0Outside && T.x > 0.0f && T.x < 1.0f)
	{
		bClipped.x = true;
	}
	if (bP1Outside && T.y > 0.0f && T.y < 1.0f)
	{
		bClipped.y = true;
	}

	return true;
}

// Clips the segment P0->P1 (and the radii at its end points) against an axis-aligned 2D box.
// End points lying outside the box are moved to the intersection with the box and their radius is
// interpolated accordingly; bClipped reports which end points were moved.
// Returns the validity flag of the parametric overload; inputs are left untouched when it is false.
bool ClipRaySegment(float2 AABBMin, float2 AABBMax, inout float4 P0, inout float4 P1, inout float Rad0, inout float Rad1, out bool2 bClipped)
{
	float2 T;
	const bool bIsValid = ClipRaySegment(AABBMin, AABBMax, P0, P1, T, bClipped);
	if (!bIsValid)
	{
		return bIsValid;
	}

	// Decide per end point whether it has to be moved, based on the unmodified inputs.
	const bool bP0Outside = any(P0.xy < AABBMin) || any(P0.xy > AABBMax);
	const bool bP1Outside = any(P1.xy < AABBMin) || any(P1.xy > AABBMax);
	const bool bMoveP0 = bP0Outside && T.x > 0.0f && T.x < 1.0f;
	const bool bMoveP1 = bP1Outside && T.y > 0.0f && T.y < 1.0f;

	// Keep copies of the inputs so that both interpolations use the original end points.
	const float4 SrcP0 = P0;
	const float4 SrcP1 = P1;
	const float SrcRad0 = Rad0;
	const float SrcRad1 = Rad1;

	if (bMoveP0)
	{
		P0 = lerp(SrcP0, SrcP1, T.x);
		Rad0 = lerp(SrcRad0, SrcRad1, T.x);
		bClipped.x = true;
	}
	if (bMoveP1)
	{
		P1 = lerp(SrcP0, SrcP1, T.y);
		Rad1 = lerp(SrcRad0, SrcRad1, T.y);
		bClipped.y = true;
	}

	return bIsValid;
}

#endif // Common rasterizer helper functions & parameters

///////////////////////////////////////////////////////////////////////////

#if SHADER_RASTERCOMPUTE_DEPTH_GRID

RWTexture2D<uint>			OutVisTileDepthGrid;
RWTexture2DArray<uint>		OutDepthCovTexture;
uint						NumSamples;

groupshared uint group_FurthestDepth; // (4 bytes)

// One workgroup per tile: reduces the packed scene depth of all pixels of the tile to the furthest value,
// stores it in OutVisTileDepthGrid and seeds every sample of OutDepthCovTexture with the packed scene depth.
[numthreads(1024, 1, 1)]
void PrepareDepthGridCS(uint DispatchThreadID : SV_DispatchThreadID, uint GroupThreadID : SV_GroupThreadID, uint2 GroupID : SV_GroupID)
{
	const bool bIsFirstThread = (GroupThreadID == 0);

	// Seed the shared reduction target before any thread contributes to it.
	if (bIsFirstThread)
	{
		group_FurthestDepth = 0xFFFFFFFF;
	}

	GroupMemoryBarrierWithGroupSync();

	// Assumes a workgroup size of 1024 threads to cover a maximum tile size of 32x32.
	if (GroupThreadID < SqrTileSize)
	{
		// Linear thread index -> pixel coordinate inside the tile -> pixel coordinate in the output.
		uint2 PixelCoord;
		PixelCoord.y = (GroupThreadID + 0.5f) * RcpTileSize;
		PixelCoord.x = GroupThreadID - (PixelCoord.y * TileSize);
		PixelCoord += GroupID * TileSize;

		const bool bInsideOutput = all(PixelCoord < (uint2)OutputResolution);
		if (bInsideOutput)
		{
			const float SceneDepth = SceneDepthTexture.Load(uint3(PixelCoord, 0));
			const uint PackedDepth = PackHairVisDepthCoverage(SceneDepth, 1.0);

			// Compute furthest depth inside this tile
			WaveInterlockedMin(group_FurthestDepth, PackedDepth);

			// Copy scene depth to (multisampled) hair depth output texture
			for (uint Sample = 0; Sample < NumSamples; ++Sample)
			{
				InterlockedMax(OutDepthCovTexture[uint3(PixelCoord, Sample)], PackedDepth);
			}
		}
	}

	GroupMemoryBarrierWithGroupSync();

	// All contributions are in; publish the per-tile result.
	if (bIsFirstThread)
	{
		OutVisTileDepthGrid[GroupID] = group_FurthestDepth;
	}
}

#endif //SHADER_RASTERCOMPUTE_DEPTH_GRID

///////////////////////////////////////////////////////////////////////////

#if SHADER_RASTERCOMPUTE_BINNING

#ifndef PERMUTATION_INDIRECT_PRIM_IDS
#define PERMUTATION_INDIRECT_PRIM_IDS 0
#endif

RWTexture2DArray<uint> 				OutVisTileBinningGrid;
RWBuffer<uint>						OutVisTilePrims;
RWBuffer<uint>						OutVisTileArgs;
RWByteAddressBuffer					OutVisTileData;
Texture2D<uint>						VisTileDepthGrid;
ByteAddressBuffer					IndirectPrimIDCount;
Buffer<uint>						IndirectPrimIDs;

// TODO: Without setting a limit, we are sometimes caught in an infinite loop even though this should not happen.
#define DDA_MAX_ITERATIONS 256

// State of a 2D grid traversal (DDA) along a ray.
struct FDDAContext
{
	// Current grid cell (integer valued, stored as float).
	float2 Coord;
	// Per axis: absolute value of the reciprocal ray direction; added to SideDist on every step along that axis.
	float2 DeltaDist;
	// Per axis: sign of the ray direction; added to Coord on every step along that axis.
	float2 Step;
	// Per axis: ray parameter at which the next cell boundary is crossed; the axis with the smaller value is stepped next.
	float2 SideDist;
};

// Sets up a grid traversal starting in the cell containing RayStart and moving along RayDir.
FDDAContext DDACreateContext(float2 RayStart, float2 RayDir)
{
	const float2 InvDir = 1.0f / RayDir;
	const float2 StartCell = floor(RayStart);
	const float2 StepDir = sign(RayDir);

	FDDAContext Result;
	Result.Coord = StartCell;
	Result.DeltaDist = abs(InvDir);
	Result.Step = StepDir;
	// Ray parameter of the first boundary crossing per axis: the far cell edge for positive directions, the near edge otherwise.
	Result.SideDist = (StartCell - RayStart + max(StepDir, 0.0f)) * InvDir;

	return Result;
}

// Moves the traversal to the next cell: steps along the axis whose next boundary crossing is closest (y on ties).
void DDAAdvance(inout FDDAContext Context)
{
	const bool bStepAlongX = Context.SideDist.x < Context.SideDist.y;
	if (!bStepAlongX)
	{
		Context.SideDist.y += Context.DeltaDist.y;
		Context.Coord.y += Context.Step.y;
		return;
	}

	Context.SideDist.x += Context.DeltaDist.x;
	Context.Coord.x += Context.Step.x;
}

// Reads one uint field of a vis tile record from OutVisTileData.
//  index  - tile record index (records are VT_SIZE uints wide)
//  offset - field index inside the record (one of the VT_* constants)
uint LoadOutVisTileData(uint index, uint offset)
{
	const uint ByteAddress = (index * VT_SIZE * 4) + (offset * 4);
	return OutVisTileData.Load(ByteAddress);
}

// Writes one uint field of a vis tile record in OutVisTileData.
//  index  - tile record index (records are VT_SIZE uints wide)
//  offset - field index inside the record (one of the VT_* constants)
//  value  - value to store
void StoreOutVisTileData(uint index, uint offset, uint value)
{
	const uint ByteAddress = (index * VT_SIZE * 4) + (offset * 4);
	OutVisTileData.Store(ByteAddress, value);
}

groupshared uint group_LoopNum;
groupshared uint group_VerticesNum;
groupshared uint group_BatchNum;

#define TILES_TO_ALLOCATE_MAX 1024
groupshared uint group_TilesToAllocate[TILES_TO_ALLOCATE_MAX];
groupshared uint group_TilesToAllocateCount;

// Bins hair segments into screen tiles.
// The total number of line segments (VertexCount) is divided up equally between N binners - each binner = a workgroup which loops through its designated set of segments in batches of 1024 (one segment per thread).
// Every batch runs three phases that are separated by group barriers:
//  1. Count:    each segment walks the tiles it touches (DDA) and increments the per binner per tile segment counter.
//               Whenever a counter crosses a multiple of 1024, the tile is queued for allocation of a new 1024 entry block.
//  2. Allocate: queued tiles get a new block (vis tile) from OutVisTileArgs[0]; the previous block of the tile is marked as full.
//  3. Write:    each segment walks its tiles again and writes its PrimID into the current or previous block of the tile.
// NB there is still potential to use LDS to prevent/reduce contention when adding segments to the binning grid and this may improve perf

[numthreads(1024, 1, 1)]
void BinningCS(uint DispatchThreadID : SV_DispatchThreadID, uint GroupThreadID : SV_GroupThreadID, uint GroupID : SV_GroupID)
{
	ResolvedView = ResolveView();
	if (GroupThreadID == 0)
	{
		// Total number of input segments/vertices, depending on where the primitive list comes from.
#if PERMUTATION_INDIRECT_PRIM_IDS
		group_VerticesNum = IndirectPrimIDCount.Load(0);
#else // PERMUTATION_INDIRECT_PRIM_IDS
	#if PERMUTATION_CULLING
		group_VerticesNum = HairStrandsVF_bCullingEnable ? HairStrandsVF_CullingIndirectBuffer[3] : VertexCount;
	#else // PERMUTATION_CULLING
		group_VerticesNum = VertexCount;
	#endif //PERMUTATION_CULLING
#endif // PERMUTATION_INDIRECT_PRIM_IDS

		// Number of 1024-wide batches in total and number of batches each binner has to process.
		group_BatchNum = (group_VerticesNum + 1023) / 1024;
		group_LoopNum = (group_BatchNum + (NumBinners - 1)) * RcpNumBinners;
	}

	GroupMemoryBarrierWithGroupSync();

	LOOP for (uint LoopIndex = 0; LoopIndex < group_LoopNum; LoopIndex++)
	{
		const uint BatchIndex = LoopIndex + (GroupID * group_LoopNum);
		bool bSegValid = (BatchIndex < group_BatchNum);

#if PERMUTATION_INDIRECT_PRIM_IDS
		uint PrimID = 0;
		const uint PrimIDIndex = BatchIndex * 1024 + GroupThreadID;
		bSegValid = bSegValid && (PrimIDIndex < group_VerticesNum);
		if (bSegValid)
		{
			PrimID = IndirectPrimIDs[PrimIDIndex];
		}
#else // PERMUTATION_INDIRECT_PRIM_IDS
	#if PERMUTATION_CULLING
		uint PrimID = BatchIndex * 1024 + GroupThreadID;
		bSegValid = bSegValid && (PrimID < group_VerticesNum);

		if (bSegValid && HairStrandsVF_bCullingEnable)
		{
			const uint FetchIndex0 = PrimID;
			const uint FetchIndex1 = min(FetchIndex0 + 1, group_VerticesNum - 1);

			const uint VertexIndex0 = HairStrandsVF_CullingIndexBuffer[FetchIndex0];
			const uint VertexIndex1 = HairStrandsVF_CullingIndexBuffer[FetchIndex1];

			// Only consecutive vertices form a segment.
			if (VertexIndex1 != VertexIndex0 + 1)
			{
				bSegValid = false;
			}
			else
			{
				PrimID = VertexIndex0;
			}
		}
	#else // PERMUTATION_CULLING
		const uint PrimID = BatchIndex * 1024 + GroupThreadID;
		bSegValid = bSegValid && (PrimID < VertexCount);
	#endif // PERMUTATION_CULLING
#endif // PERMUTATION_INDIRECT_PRIM_IDS

		const uint SegmentCountLayerIdx = GroupID; // Stores number of segments per tile per workgroup.
		const uint TmpSegmentCountLayerIdx = SegmentCountLayerIdx + NumBinners; // Also stores number of segments per tile per workgroup. Used as second counter for this two pass algorithm.
		const uint TileAllocInfoLayerIdx = SegmentCountLayerIdx + NumBinners * 2; // Stores per tile per workgroup allocation info.


		uint NearestDepth = 0;
		float2 TileCoord0F = 0.0f;
		float2 TileCoord1F = 0.0f;

		// Project segment end points and clip them to the screen
		if (bSegValid)
		{
			const float3 InstancePositionOffset = HairStrandsVF_GetHairInstancePositionOffset();
			float4 H0 = 0.0f;
			float4 H1 = 0.0f;
			uint Type = -1;
			CalcHomogenousPos(PrimID, InstancePositionOffset, H0, Type);

			// The last control point of a strand does not start a segment.
			bool bIsEndCV = (Type == HAIR_CONTROLPOINT_END);
			bSegValid = !bIsEndCV;

			if (bSegValid)
			{
				CalcHomogenousPos(PrimID + 1, InstancePositionOffset, H1, Type);

				// Do clipping in homogenous coordinates
				bSegValid = BlinnLineClipping(H0, H1);

				if (bSegValid)
				{
					// Pixel coordinates -> tile coordinates
					float3 SP0 = NDCToPixelCoord(H0);
					float3 SP1 = NDCToPixelCoord(H1);
					SP0.xy *= RcpTileSize;
					SP1.xy *= RcpTileSize;

					// For peace of mind, make sure these are actually clamped to a valid range.
					// NOTE(review): the upper bound is inclusive, so a coordinate of exactly TileRes floors to a tile
					// index one past TileRes - 1; confirm that TileRes/the grid size account for this.
					SP0 = clamp(SP0, 0.0f, float3(TileRes, 1.0f));
					SP1 = clamp(SP1, 0.0f, float3(TileRes, 1.0f));

					NearestDepth = PackHairVisDepthCoverage(max(SP0.z, SP1.z), 1.0f);
					TileCoord0F = SP0.xy;
					TileCoord1F = SP1.xy;
				}
			}
		}

		if (GroupThreadID == 0)
		{
			group_TilesToAllocateCount = 0;
		}

		GroupMemoryBarrierWithGroupSync();

		// Increment per workgroup per tile counters and add tiles to be allocated
		if (bSegValid)
		{
			FDDAContext DDAContext = DDACreateContext(TileCoord0F, normalize(TileCoord1F - TileCoord0F));
			const int2 EndCoord = (int2)floor(TileCoord1F);

			for (int DDAIt = 0; DDAIt < DDA_MAX_ITERATIONS; ++DDAIt)
			{
				const int2 TileCoord = (int2)floor(DDAContext.Coord);

				// Skip tiles in which the segment is behind the furthest scene depth.
				BRANCH
				if (NearestDepth > VisTileDepthGrid[TileCoord])
				{
					uint OldTileSegmentCount;
					InterlockedAdd(OutVisTileBinningGrid[uint3(TileCoord, SegmentCountLayerIdx)], 1, OldTileSegmentCount);

					// The segment that starts a new block of 1024 entries requests the allocation of that block.
					BRANCH
					if ((OldTileSegmentCount % 1024) == 0)
					{
						uint WritePos;
						InterlockedAdd(group_TilesToAllocateCount, 1, WritePos);
						if (WritePos < TILES_TO_ALLOCATE_MAX)
						{
							group_TilesToAllocate[WritePos] = PackVisTileCoord(TileCoord);
						}
					}
				}

				if (all(TileCoord == EndCoord))
				{
					break;
				}

				DDAAdvance(DDAContext);
			}
		}

		GroupMemoryBarrierWithGroupSync();

		// Allocate tiles
		const uint TilesToAllocateCount = min(TILES_TO_ALLOCATE_MAX, group_TilesToAllocateCount);
		for (uint TileIdx = GroupThreadID; TileIdx < TilesToAllocateCount; TileIdx += 1024)
		{
			const uint PackedTileCoord = group_TilesToAllocate[TileIdx];
			const uint2 TileCoord = UnpackVisTileCoord(PackedTileCoord);

			const uint TotalNewWriteCount = OutVisTileBinningGrid[uint3(TileCoord, SegmentCountLayerIdx)];
			const uint TotalOldWriteCount = OutVisTileBinningGrid[uint3(TileCoord, TmpSegmentCountLayerIdx)];

			uint NewTile;
			WaveInterlockedAddScalar_(OutVisTileArgs[0], 1, NewTile);

			StoreOutVisTileData(NewTile, VT_Coord, PackedTileCoord);
			// Round down the count to the start of the tile and later compare against this to decide which tile to write to.
			StoreOutVisTileData(NewTile, VT_MinWriteIndex, TotalNewWriteCount & ~1023u);

			const uint PrevTile = (OutVisTileBinningGrid[uint3(TileCoord, TileAllocInfoLayerIdx)] & 0xffff);

			// A previous block exists and is completely filled once a new block is needed.
			if (TotalOldWriteCount > 0)
			{
				StoreOutVisTileData(PrevTile, VT_PrimCount, 1024);
			}

			// Allocation info: previous block index in the upper 16 bits, current block index in the lower 16 bits.
			OutVisTileBinningGrid[uint3(TileCoord, TileAllocInfoLayerIdx)] = (PrevTile << 16) | (NewTile & 0xffff);
		}

		GroupMemoryBarrierWithGroupSync();

		// Write PrimID to tiles
		if (bSegValid)
		{
			FDDAContext DDAContext = DDACreateContext(TileCoord0F, normalize(TileCoord1F - TileCoord0F));
			const int2 EndCoord = (int2)floor(TileCoord1F);

			for (int DDAIt = 0; DDAIt < DDA_MAX_ITERATIONS; ++DDAIt)
			{
				const int2 TileCoord = (int2)floor(DDAContext.Coord);

				BRANCH
				if (NearestDepth > VisTileDepthGrid[TileCoord])
				{
					const uint PackedTiles = OutVisTileBinningGrid[uint3(TileCoord, TileAllocInfoLayerIdx)];
					const uint CurTile = (PackedTiles & 0xffff);
					const uint PrevTile = ((PackedTiles >> 16) & 0xffff);

					// Currently we need this to get our write position, but maybe there is a cheaper way to keep track of that?
					uint OldTileSegmentCount;
					InterlockedAdd(OutVisTileBinningGrid[uint3(TileCoord, TmpSegmentCountLayerIdx)], 1, OldTileSegmentCount);

					// Entries below the start index of the current block belong to the previous block.
					const bool bWriteToCurTile = OldTileSegmentCount >= LoadOutVisTileData(CurTile, VT_MinWriteIndex);
					const uint LocalWritePos = OldTileSegmentCount % 1024;
					const uint WritePos = (bWriteToCurTile ? CurTile : PrevTile) * 1024 + LocalWritePos;

					OutVisTilePrims[WritePos] = PrimID;

					BRANCH
					if (bWriteToCurTile)
					{
						// The segment that wrote the last entry of this tile publishes the fill level of the current block.
						if ((OldTileSegmentCount + 1) == OutVisTileBinningGrid[uint3(TileCoord, SegmentCountLayerIdx)])
						{
							// LocalWritePos + 1 is in [1, 1024]. A block that ends up exactly full must report 1024
							// (and not 0) for every block of the tile, not only for the first one.
							StoreOutVisTileData(CurTile, VT_PrimCount, LocalWritePos + 1);
						}
					}
				}

				if (all(TileCoord == EndCoord))
				{
					break;
				}

				DDAAdvance(DDAContext);
			}
		}
	}
}
#endif //SHADER_RASTERCOMPUTE_BINNING

///////////////////////////////////////////////////////////////////////////

#if SHADER_RASTERCOMPUTE_COMPACTION

ByteAddressBuffer					VisTileData;
Buffer<uint> 						VisTilePrims;
Buffer<uint> 						VisTileArgs;
RWByteAddressBuffer					OutCompactedVisTileData;
RWBuffer<uint> 						OutCompactedVisTilePrims;
RWBuffer<uint> 						OutCompactedVisTileArgs;

// Reads one uint field of a vis tile record from VisTileData.
//  index  - tile record index (records are VT_SIZE uints wide)
//  offset - field index inside the record (one of the VT_* constants)
uint LoadVisTileData(uint index, uint offset)
{
	const uint ByteAddress = (index * VT_SIZE * 4) + (offset * 4);
	return VisTileData.Load(ByteAddress);
}

// Writes one uint field of a vis tile record in OutCompactedVisTileData.
//  index  - tile record index (records are VT_SIZE uints wide)
//  offset - field index inside the record (one of the VT_* constants)
//  value  - value to store
void StoreCompactedVisTileData(uint index, uint offset, uint value)
{
	const uint ByteAddress = (index * VT_SIZE * 4) + (offset * 4);
	OutCompactedVisTileData.Store(ByteAddress, value);
}

groupshared uint group_TotalPrimCount;
groupshared uint group_PrimWriteOffset;
groupshared uint group_NumTiles;
groupshared uint group_TilesToCompact[1024];
groupshared uint group_MaxLDSTileIdx;

// Merges all input vis tiles that share one tile coordinate into densely packed output tiles.
// Each workgroup handles the tile coordinate given by its GroupID: it scans all input tiles for that coordinate,
// sums their primitive counts, allocates ceil(total / 1024) compacted tiles and copies the primitive IDs
// back to back into them.
[numthreads(1024, 1, 1)]
void CompactionCS(uint DispatchThreadID : SV_DispatchThreadID, uint GroupThreadID : SV_GroupThreadID, uint2 GroupID : SV_GroupID)
{
	// Reset the shared accumulators before any thread uses them.
	if (GroupThreadID == 0)
	{
		group_TotalPrimCount = 0;
		group_NumTiles = 0;
		group_MaxLDSTileIdx = 0;
	}

	GroupMemoryBarrierWithGroupSync();

	const uint NumTiles = VisTileArgs[0];
	const uint PackedCoord = PackVisTileCoord(GroupID);

	// Compute total number of primitives at this tile coordinate
	uint LocalPrimCount = 0;
	{
		// Each thread checks every 1024th input tile.
		for (uint TileIdx = GroupThreadID; TileIdx < NumTiles; TileIdx += 1024)
		{
			const uint TilePackedCoord = LoadVisTileData(TileIdx, VT_Coord);
			if (PackedCoord == TilePackedCoord)
			{
				LocalPrimCount += LoadVisTileData(TileIdx, VT_PrimCount);

				// Remember up to 1024 matching tiles in LDS; group_NumTiles still counts all matches.
				uint WritePos;
				WaveInterlockedAddScalar_(group_NumTiles, 1, WritePos);
				if (WritePos < 1024)
				{
					group_TilesToCompact[WritePos] = TileIdx;
					WaveInterlockedMax(group_MaxLDSTileIdx, TileIdx);
				}
			}
		}
	}

	GroupMemoryBarrierWithGroupSync();

	if (LocalPrimCount > 0)
	{
		WaveInterlockedAdd(group_TotalPrimCount, LocalPrimCount);
	}

	GroupMemoryBarrierWithGroupSync();

	const uint TotalPrimCount = group_TotalPrimCount;

	// The value comes from group shared memory, so all threads of the group take this exit together.
	if (TotalPrimCount == 0)
	{
		return;
	}

	// Allocate space
	if (GroupThreadID == 0)
	{
		const uint NumTilesToAllocate = (TotalPrimCount + 1023) / 1024;

		uint FirstCompactedTile;
		InterlockedAdd(OutCompactedVisTileArgs[0], NumTilesToAllocate, FirstCompactedTile);

		// Primitive slots of a tile start at TileIndex * 1024.
		group_PrimWriteOffset = FirstCompactedTile * 1024;

		// Initialize new tiles
		for (uint TileIdx = 0; TileIdx < NumTilesToAllocate; ++TileIdx)
		{
			const uint CompactedTile = FirstCompactedTile + TileIdx;

			// All tiles are full except possibly the last one, which holds the remainder.
			const uint PrimCount = min(TotalPrimCount - TileIdx * 1024, 1024);
			StoreCompactedVisTileData(CompactedTile, VT_PrimCount, PrimCount);
			StoreCompactedVisTileData(CompactedTile, VT_Coord, PackedCoord);
		}
	}

	GroupMemoryBarrierWithGroupSync();

	// Copy PrimIDs to compacted memory
	{
		// Every thread tracks the same running offset; thread i copies entry i of each input tile.
		uint CurrentWriteOffset = group_PrimWriteOffset;

		// First process the LDS list of tiles
		const uint NumInputTiles = min(group_NumTiles, 1024);
		for (uint LDSIdx = 0; LDSIdx < NumInputTiles; ++LDSIdx)
		{
			const uint TileIdx = group_TilesToCompact[LDSIdx];

			const uint TilePrimOffset = TileIdx * 1024;
			const uint TilePrimCount = LoadVisTileData(TileIdx, VT_PrimCount);

			if (GroupThreadID < TilePrimCount)
			{
				OutCompactedVisTilePrims[CurrentWriteOffset + GroupThreadID] = VisTilePrims[TilePrimOffset + GroupThreadID];
			}

			CurrentWriteOffset += TilePrimCount;
		}

		// Check any remaining tiles
		// NOTE(review): this scan starts after the largest tile index stored in LDS. It looks like a matching tile
		// that did not fit into LDS but has a smaller index would be skipped - confirm whether that can happen.
		if (group_NumTiles > 1024)
		{
			for (uint TileIdx = group_MaxLDSTileIdx + 1; TileIdx < NumTiles; ++TileIdx)
			{
				const uint TilePackedCoord = LoadVisTileData(TileIdx, VT_Coord);
				if (PackedCoord == TilePackedCoord)
				{
					const uint TilePrimOffset = TileIdx * 1024;
					const uint TilePrimCount = LoadVisTileData(TileIdx, VT_PrimCount);

					if (GroupThreadID < TilePrimCount)
					{
						OutCompactedVisTilePrims[CurrentWriteOffset + GroupThreadID] = VisTilePrims[TilePrimOffset + GroupThreadID];
					}

					CurrentWriteOffset += TilePrimCount;
				}
			}
		}
	}
}

#endif // SHADER_RASTERCOMPUTE_COMPACTION

///////////////////////////////////////////////////////////////////////////

#if SHADER_RASTERCOMPUTE_RASTER

// Wave size: mirrors PERMUTATION_GROUP_SIZE. It is used to round thread counts up to a whole number of waves.
#if PERMUTATION_GROUP_SIZE == 64
#define WAVE_SIZE 64
#elif PERMUTATION_GROUP_SIZE == 32
#define WAVE_SIZE 32
#else
#error Unknown group size
#endif

// Simple rasterization algorithm that lerps between line endpoints. Is currently more robust than the Wu algorithm
// and optionally supports anti-aliasing similar to the Wu algorithm.
#define RASTER_LINEAR 0
// Implementation of Wu's line rasterization algorithm. Currently this implementation has tile shaped artifacts when the line segment is
// clipped against the tile which is why we use the simple linear algorithm at the moment.
#define RASTER_WU 1
 // Set to 1 to enable writing to two pixels straddling the line segment when using the linear rasterization algorithm.
#define ENABLE_RASTER_LINEAR_AA 0

#define RASTER_ALGO RASTER_LINEAR

Buffer<uint> 				VisTilePrims;
Buffer<uint> 				VisTileArgs;
ByteAddressBuffer			VisTileData;
RWTexture2DArray<uint>		OutHairCountTexture;
RWTexture2DArray<uint>		OutDepthCovTexture;
RWTexture2DArray<uint>		OutPrimMatTexture;

// Reads one uint field of a vis tile record from VisTileData.
//  index  - tile record index (records are VT_SIZE uints wide)
//  offset - field index inside the record (one of the VT_* constants)
uint LoadVisTileData(uint index, uint offset)
{
	const uint ByteAddress = (index * VT_SIZE * 4) + (offset * 4);
	return VisTileData.Load(ByteAddress);
}

groupshared uint4 group_SubTile[1024]; //(32 x 32 x 4 x 4 bytes = 16k bytes)

groupshared float3 group_PositionOffset;
groupshared float group_ooTileLODScale;

groupshared uint group_LoopNum;
groupshared uint group_TileNum;

groupshared uint group_ThreadsPerSeg;

#define GS_SEGS 320 //this number is limited by group shared memory

groupshared float4 group_SP0[GS_SEGS];
groupshared float4 group_SP1[GS_SEGS];
groupshared float group_Rad0[GS_SEGS];
groupshared float group_Rad1[GS_SEGS];
groupshared uint group_PrimMatID[GS_SEGS];

groupshared uint group_TileIndex;

// Rasterizes one pixel of segment P0->P1 into the group shared sub tile.
//  Coords             - pixel coordinate in output space
//  AntiAliasingFactor - weight applied to the hair count contribution
//  P0, P1             - segment end points (xy = pixel position, z = depth, w = reciprocal clip-space w)
//  Rad0, Rad1         - radii at the end points
//  SegmentLenSqRcp    - reciprocal squared length of the segment in pixels
//  PackedTileMin      - pixel coordinate of the tile origin (x in the lower, y in the upper 16 bits)
//  PrimMatID          - packed primitive/material ID written when the pixel wins the depth test
// Pixels outside of the tile are ignored.
void PlotInternal(int2 Coords, float AntiAliasingFactor, float4 P0, float4 P1, float Rad0, float Rad1, float SegmentLenSqRcp, uint PackedTileMin, uint PrimMatID)
{
	const int2 TileOrigin = int2(PackedTileMin & 0xffff, (PackedTileMin >> 16) & 0xffff);
	const int2 LocalCoord = Coords - TileOrigin;

	const bool bInsideTile = all(LocalCoord >= 0) && all(LocalCoord < int2(TileSize,TileSize));
	if (!bInsideTile)
	{
		return;
	}

	const float Alpha = ComputeLerpAlpha(Coords, P0.xy, P1.xy, SegmentLenSqRcp);
	const uint PackedDepthCov = PackHairVisDepthCoverage(lerp(P0.z, P1.z, Alpha), 1.0f);
	const uint PixelIndex = LocalCoord.x + LocalCoord.y * TileSize;

	// Write Depth + PrimMatID if depth test against hair depths is passed
	uint PrevPackedDepth;
	InterlockedMax(group_SubTile[PixelIndex].x, PackedDepthCov, PrevPackedDepth);
	if (PackedDepthCov > PrevPackedDepth)
	{
		group_SubTile[PixelIndex].y = PrimMatID;
	}

	// Add hair count if depth test against scene depth is passed
	if (PackedDepthCov > group_SubTile[PixelIndex].w)
	{
		const float Rad = ComputePerspectiveCorrectRadius(Rad0, Rad1, Alpha, P0.w, P1.w);
		InterlockedAdd(group_SubTile[PixelIndex].z, min(Rad, 0.5f) * 2.0f * 1000.0f * CoverageScale * AntiAliasingFactor);
	}
}

// Plots the two pixels straddling the line at Coord: the pixel at Coord weighted by (1 - FracY) and the
// pixel at Coord + (0, 1) weighted by FracY. For steep lines x and y of the coordinate are swapped before plotting.
void Plot(int2 Coord, float FracY, float AntiAliasingFactor, bool bIsSteep, float4 P0, float4 P1, float Rad0, float Rad1, float SegmentLenSqRcp, uint PackedTileMin, uint PrimMatID)
{
	const float FirstWeight = AntiAliasingFactor * (1.0f - FracY);
	const float SecondWeight = AntiAliasingFactor * FracY;

	const int2 FirstPixel = Coord;
	const int2 SecondPixel = int2(Coord.x, Coord.y + 1);

	PlotInternal(bIsSteep ? FirstPixel.yx : FirstPixel.xy, FirstWeight, P0, P1, Rad0, Rad1, SegmentLenSqRcp, PackedTileMin, PrimMatID);
	PlotInternal(bIsSteep ? SecondPixel.yx : SecondPixel.xy, SecondWeight, P0, P1, Rad0, Rad1, SegmentLenSqRcp, PackedTileMin, PrimMatID);
}

[numthreads(1024, 1, 1)]
void RasterCS(uint DispatchThreadID : SV_DispatchThreadID, uint GroupThreadID : SV_GroupThreadID, uint GroupID : SV_GroupID)
{
	ResolvedView = ResolveView();

	if (GroupThreadID == 0)
	{
		group_TileNum = VisTileArgs[0];
		group_LoopNum = (float(group_TileNum) + float(NumRasterizers - 1)) * RcpNumRasterizers;

		group_PositionOffset = HairStrandsVF_GetHairInstancePositionOffset();

		/* no longer in use - keep for ref? Moving these values to group shared memory did seem to reduce VGPRs - more experimentation needed
		group_RadScale = (((HairStrandsVF_TipScale - HairStrandsVF_RootScale) * HairStrandsVF_Radius * OutputResolutionf.x) / 63.0) / 255.0;
		group_RadOffset = (HairStrandsVF_RootScale * HairStrandsVF_Radius * OutputResolutionf.x)/63.0;
		*/
	}

	GroupMemoryBarrierWithGroupSync();

	LOOP
	for (uint LoopIndex = 0; LoopIndex < group_LoopNum; LoopIndex++)
	{
		if (GroupThreadID == 0)
		{
			group_TileIndex = LoopIndex + (GroupID * group_LoopNum);
		}

		GroupMemoryBarrierWithGroupSync();

		bool bTileValid = (group_TileIndex < group_TileNum);

		uint PrimOffset = group_TileIndex * 1024;
		uint PrimCount = LoadVisTileData(group_TileIndex, VT_PrimCount);

		uint PackedCoord = LoadVisTileData(group_TileIndex, VT_Coord);
		uint2 SubTileMin = UnpackVisTileCoord(PackedCoord) * TileSize;

		uint PackedTileMin = ((SubTileMin.x & 0xffff) << 0) | ((SubTileMin.y & 0xffff) << 16);

		if (GroupThreadID == 0)
		{
			group_ThreadsPerSeg = 1;

			if (PrimCount <= 512)
				group_ThreadsPerSeg = 2;
			if (PrimCount <= 341)
				group_ThreadsPerSeg = 3;
			if (PrimCount <= 256)
				group_ThreadsPerSeg = 4;
			if (PrimCount <= 204)
				group_ThreadsPerSeg = 5;
			if (PrimCount <= 170)
				group_ThreadsPerSeg = 6;
			if (PrimCount <= 146)
				group_ThreadsPerSeg = 7;
			if (PrimCount <= 128)
				group_ThreadsPerSeg = 8;
			if (PrimCount <= 64)
				group_ThreadsPerSeg = 16;
			if (PrimCount <= 32)
				group_ThreadsPerSeg = 32;
		}

		GroupMemoryBarrierWithGroupSync();

		bool bThreadValid = (bTileValid && (GroupThreadID < (PrimCount * group_ThreadsPerSeg)));

		uint WaveCount = ((PrimCount * group_ThreadsPerSeg) + (WAVE_SIZE - 1) ) / WAVE_SIZE;
		uint WaveThreadCount = WaveCount * WAVE_SIZE;

		bool bWaveThreadValid = (bTileValid && (GroupThreadID < WaveThreadCount));

		bool bUseGroupSPs = (bThreadValid && (GroupThreadID < (min(PrimCount, GS_SEGS) * group_ThreadsPerSeg)));

		bool bGenGroupSPs = (bThreadValid && (GroupThreadID < (min(PrimCount, GS_SEGS))));

		if (bGenGroupSPs)
		{
			uint Prim = GroupThreadID;
			uint PrimID = VisTilePrims[PrimOffset + Prim];

			group_PrimMatID[Prim] = PackHairVisControlPointMaterialId(PrimID, HairMaterialId);

			uint TypeDummy;
			CalcHomogenousPosAndRad(PrimID, group_PositionOffset, group_SP0[Prim], group_Rad0[Prim], TypeDummy);
			CalcHomogenousPosAndRad(PrimID+1, group_PositionOffset, group_SP1[Prim], group_Rad1[Prim], TypeDummy);
		}

		if (bWaveThreadValid)
		{
			for (uint LinearIndex = GroupThreadID; LinearIndex < SqrTileSize; LinearIndex += WaveThreadCount)
			{
				uint2 Coord;

				Coord.y = (float(LinearIndex) + 0.5f) * RcpTileSize;
				Coord.x = LinearIndex - (Coord.y * TileSize);

				Coord += uint2(((PackedTileMin >> 0) & 0xffff), ((PackedTileMin >> 16) & 0xffff));

				group_SubTile[LinearIndex].x = OutDepthCovTexture[uint3(Coord, 0)];
				group_SubTile[LinearIndex].y = GetInvalidHairControlPointId();
				group_SubTile[LinearIndex].z = 0;
				group_SubTile[LinearIndex].w = PackHairVisDepthCoverage(SceneDepthTexture.Load(uint3(Coord, 0)), 1.0f);
			}
		}

		GroupMemoryBarrierWithGroupSync();

		if (bThreadValid)
		{
			uint Prim = uint((float(GroupThreadID) + 0.5f) / float(group_ThreadsPerSeg));
			uint PModTPS = GroupThreadID - (Prim * group_ThreadsPerSeg);

			uint PrimMatID;
			float4 SP0;
			float4 SP1;
			float Rad0;
			float Rad1;

			if (bUseGroupSPs)
			{
				PrimMatID = group_PrimMatID[Prim];
				SP0 = group_SP0[Prim];
				SP1 = group_SP1[Prim];
				Rad0 = group_Rad0[Prim];
				Rad1 = group_Rad1[Prim];
			}
			else
			{
				uint PrimID = VisTilePrims[PrimOffset + Prim];
				PrimMatID = PackHairVisControlPointMaterialId(PrimID, HairMaterialId);

				uint TypeDummy;
				CalcHomogenousPosAndRad(PrimID, group_PositionOffset, SP0, Rad0, TypeDummy);
				CalcHomogenousPosAndRad(PrimID + 1, group_PositionOffset, SP1, Rad1, TypeDummy);
			}

			// Clipping
			{
				SP0 = float4(NDCToPixelCoord(SP0), 1.0f / SP0.w);
				SP1 = float4(NDCToPixelCoord(SP1), 1.0f / SP1.w);

				// Clip against tile
				const float2 TileMin = float2(((PackedTileMin >> 0) & 0xffff), ((PackedTileMin >> 16) & 0xffff));
				const float2 TileMax = TileMin + TileSize;
				bool2 bClipped = false;
				ClipRaySegment(TileMin - 0.5f, TileMax + 0.5f, SP0, SP1, Rad0, Rad1, bClipped);
			}

			const float SegmentLenSqRcp = 1.0f / dot(SP1.xy - SP0.xy, SP1.xy - SP0.xy);

#if RASTER_ALGO == RASTER_LINEAR

			const bool bIsSteep = abs(SP1.x - SP0.x) < abs(SP1.y - SP0.y);
			const float X0 = bIsSteep ? min(SP0.y, SP1.y) : min(SP0.x, SP1.x);
			const float X1 = bIsSteep ? max(SP0.y, SP1.y) : max(SP0.x, SP1.x);
			const int NumSteps = (int)(ceil(X1) - floor(X0));
			const float RcpNumSteps = 1.0f / (X1 - X0);

			LOOP
			for (int J = PModTPS; J < NumSteps; J += group_ThreadsPerSeg)
			{
				const float Alpha = saturate(J * RcpNumSteps);
				const float4 SP = lerp(SP0, SP1, Alpha);

				const float AntiAliasingFactor = 1.0f;
#if !ENABLE_RASTER_LINEAR_AA
				PlotInternal(SP.xy, AntiAliasingFactor, SP0, SP1, Rad0, Rad1, SegmentLenSqRcp, PackedTileMin, PrimMatID);
#else
				const float2 Coord = (bIsSteep ? SP.yx : SP.xy) - 0.5f;
				const float FracY = frac(Coord.y);
				Plot(Coord, FracY, AntiAliasingFactor, bIsSteep, SP0, SP1, Rad0, Rad1, SegmentLenSqRcp, PackedTileMin, PrimMatID);
#endif // !ENABLE_RASTER_LINEAR_AA
			}
#elif RASTER_ALGO == RASTER_WU
			// Wu's line algorithm. Currently this has some weird artifacts when clipping to tiles.
			// TODO: Remove this entirely or fix the artifacts.
			{
				const bool bIsSteep = abs(SP1.y - SP0.y) > abs(SP1.x - SP0.x);

				if (bIsSteep)
				{
					SP0.xy = SP0.yx;
					SP1.xy = SP1.yx;
				}
				if (SP0.x > SP1.x)
				{
					float4 Tmp = SP0;
					SP0 = SP1;
					SP1 = Tmp;
				}

				const float2 D = SP1.xy - SP0.xy;
				const float Gradient = abs(D.x) < 1e-5f ? 1.0f : D.y / D.x;

				float DeltaY = 0.0f;

				// First endpoint
				int2 Px0;
				{
					const float2 SP0Int = SP0.xy - 0.5f; // transform to integer grid with pixel centers at 0.0 instead of 0.5.
					float2 End;
					End.x = floor(SP0Int.x);
					End.y = SP0Int.y + Gradient * (End.x - SP0Int.x);

					const float GapX = 1.0f;// 1.0f - frac(SP0Int.x + 0.5f);

					Px0 = int2(End.x, floor(End.y));

					if (PModTPS == 0)
					{
						const float FracY = frac(End.y);
						Plot(Px0, FracY, GapX, bIsSteep, SP0, SP1, Rad0, Rad1, SegmentLenSqRcp, PackedTileMin, PrimMatID);
					}

					DeltaY = End.y + Gradient; // First y-intersection for the main loop
				}

				// Second endpoint
				int2 Px1;
				{
					const float2 SP1Int = SP1.xy - 0.5f; // transform to integer grid with pixel centers at 0.0 instead of 0.5.
					float2 End;
					End.x = floor(SP1Int.x);
					End.y = SP1Int.y + Gradient * (End.x - SP1Int.x);
					const float GapX = 1.0f;// frac(SP1Int.x + 0.5f);

					Px1 = float2(End.x, floor(End.y));

					if (PModTPS == 0)
					{
						const float FracY = frac(End.y);
						Plot(Px1, FracY, GapX, bIsSteep, SP0, SP1, Rad0, Rad1, SegmentLenSqRcp, PackedTileMin, PrimMatID);
					}
				}

				// Main loop
				const int XBegin = Px0.x + 1 + PModTPS;
				const int XEnd = Px1.x;
				DeltaY += PModTPS * Gradient;
				for (int X = XBegin; X < XEnd; X += group_ThreadsPerSeg)
				{
					const int2 Coord = int2(X, floor(DeltaY));
					const float FracY = frac(DeltaY);
					Plot(Coord, FracY, 1.0f, bIsSteep, SP0, SP1, Rad0, Rad1, SegmentLenSqRcp, PackedTileMin, PrimMatID);
					DeltaY += group_ThreadsPerSeg * Gradient;
				}
			}
#endif // RASTER_ALGO == RASTER_LINEAR
		}

		GroupMemoryBarrierWithGroupSync();

		if (bWaveThreadValid)
		{
			for (uint LinearIndex = GroupThreadID; LinearIndex < SqrTileSize; LinearIndex += WaveThreadCount)
			{
				uint2 Coord;

				Coord.y = (float(LinearIndex) + 0.5f) * RcpTileSize;
				Coord.x = LinearIndex - (Coord.y * TileSize);

				Coord += uint2(((PackedTileMin >> 0) & 0xffff), ((PackedTileMin >> 16) & 0xffff));

				if (group_SubTile[LinearIndex].y != GetInvalidHairControlPointId())
				{
					uint oldValue;
					InterlockedMax(OutDepthCovTexture[uint3(Coord, 0)], group_SubTile[LinearIndex].x, oldValue);
					if (group_SubTile[LinearIndex].x > oldValue)
					{
						OutPrimMatTexture[uint3(Coord, 0)] = group_SubTile[LinearIndex].y;
					}
				}
				InterlockedAdd(OutHairCountTexture[uint3(Coord, 0)], group_SubTile[LinearIndex].z);
			}
		}

		GroupMemoryBarrierWithGroupSync();
	}
}

#endif //SHADER_RASTERCOMPUTE_RASTER

///////////////////////////////////////////////////////////////////////////

#if SHADER_RASTERCOMPUTE_RASTER_MULTI_SAMPLE

#ifndef PERMUTATION_MULTI_SAMPLE_COUNT
#define PERMUTATION_MULTI_SAMPLE_COUNT 1
#endif

// Wave size: mirrors the group size selected by the shader permutation, so each supported
// PERMUTATION_GROUP_SIZE value maps to the same number of lanes (64 -> 64, 32 -> 32).
// Any other group size is rejected at compile time.
#if PERMUTATION_GROUP_SIZE == 64
#define WAVE_SIZE 64
#elif PERMUTATION_GROUP_SIZE == 32
#define WAVE_SIZE 32
#else
#error Unknown group size
#endif

// Resources used by the multi-sample rasterizer (RasterMultiSampleCS).
// VisTilePrims        : primitive IDs; each tile reads its entries at (TileIndex * 1024) + Prim.
// VisTileArgs         : element 0 is read as the number of tiles to rasterize.
// VisTileData         : raw buffer of per-tile records (VT_SIZE uints each), read through LoadVisTileData().
// OutHairCountTexture : per-pixel hair count, accumulated with InterlockedAdd.
// OutDepthCovTexture  : packed depth/coverage, one array slice per sample, updated with InterlockedMax.
// OutPrimMatTexture   : packed primitive/material ID, one array slice per sample, written when the depth update wins.
Buffer<uint> 				VisTilePrims;
Buffer<uint> 				VisTileArgs;
ByteAddressBuffer			VisTileData;
RWTexture2D<uint>			OutHairCountTexture;
RWTexture2DArray<uint>		OutDepthCovTexture;
RWTexture2DArray<uint>		OutPrimMatTexture;

// Reads one 32-bit field of a tile record stored in VisTileData.
//   index  : record index; every record occupies VT_SIZE uints.
//   offset : uint slot inside the record (callers pass one of the VT_* constants).
// Returns the raw uint stored in that slot.
uint LoadVisTileData(uint index, uint offset)
{
	const uint RecordByteOffset = index * VT_SIZE * 4;
	const uint FieldByteOffset = offset * 4;
	return VisTileData.Load(RecordByteOffset + FieldByteOffset);
}

// LDS working set for one sub-tile. Each array holds 256 entries, indexed by the linear
// intra-sub-tile coordinate (x + y * HalfTileSize) computed in Plot().

// Packed scene depth per pixel; hair fragments are depth-tested against it in Plot().
groupshared uint group_SubTileSceneDepth[256]; // (16 x 16 x 4 bytes = 1k bytes)
// Hair count accumulated per pixel; reset to 0 before each sub-tile is rasterized.
groupshared uint group_SubTileHairCount[256]; // (16 x 16 x 4 bytes = 1k bytes)
// Packed hair depth/coverage per sample, seeded from OutDepthCovTexture and updated with InterlockedMax.
groupshared uint group_SubTileHairDepth[PERMUTATION_MULTI_SAMPLE_COUNT][256]; // (16 x 16 x 4 bytes = 1k bytes) per sample
// Packed primitive/material ID per sample; stays at GetInvalidHairControlPointId() until a fragment wins the depth test.
groupshared uint group_SubTilePrimMatID[PERMUTATION_MULTI_SAMPLE_COUNT][256]; // (16 x 16 x 4 bytes = 1k bytes) per sample

// Position offset of the hair instance, fetched once per group by thread 0.
groupshared float3 group_PositionOffset;

// Number of tiles each group iterates over, and total tile count (VisTileArgs[0]); both written by thread 0.
groupshared uint group_LoopNum;
groupshared uint group_TileNum;

// Distance from point P1 to the infinite line through P2 and P3.
//
// The three points span a triangle whose height over the base (P2, P3) is the distance we want:
//   Area = 0.5 * Height * Base   =>   Height = (2 * Area) / Base
// and 2 * Area is the absolute value of the determinant of the three points.
// RcpLineSegLength must be 1 / |P3 - P2|, supplied by the caller so it can be shared across samples.
float GetDistanceToLine(float2 P1, float2 P2, float2 P3, float RcpLineSegLength)
{
	const float Term1 = P1.x * (P2.y - P3.y);
	const float Term2 = P2.x * (P3.y - P1.y);
	const float Term3 = P3.x * (P1.y - P2.y);

	// Twice the triangle area (absolute determinant).
	const float TwiceArea = abs(Term1 + Term2 + Term3);
	return TwiceArea * RcpLineSegLength;
}

// Builds a per-sample coverage bitmask for the line through P0/P1 inside pixel PixelCoord.
// Bit i is set when sample i of the pixel lies within (1 / PERMUTATION_MULTI_SAMPLE_COUNT) pixels
// of the line. P0 and P1 are given in the same pixel space as PixelCoord.
uint GetCoverageMask(int2 PixelCoord, float2 P0, float2 P1)
{
	// Sample positions inside the pixel, listed in bit order of the returned mask.
#if PERMUTATION_MULTI_SAMPLE_COUNT == 1
	const float2 SamplePositions[1] =
	{
		float2(0.5f, 0.5f)
	};
#elif PERMUTATION_MULTI_SAMPLE_COUNT == 2
	const float2 SamplePositions[2] =
	{
		float2(0.75f, 0.75f),
		float2(0.25f, 0.25f)
	};
#elif PERMUTATION_MULTI_SAMPLE_COUNT == 4
	const float2 SamplePositions[4] =
	{
		float2(0.375f, 0.125f),
		float2(0.875f, 0.375f),
		float2(0.125f, 0.625f),
		float2(0.625f, 0.875f)
	};
#elif PERMUTATION_MULTI_SAMPLE_COUNT == 8
	const float2 SamplePositions[8] =
	{
		float2(0.5625f, 0.3125f),
		float2(0.4375f, 0.6875f),
		float2(0.8125f, 0.5625f),
		float2(0.3125f, 0.1875f),
		float2(0.1875f, 0.8125f),
		float2(0.0625f, 0.4375f),
		float2(0.6875f, 0.9375f),
		float2(0.9375f, 0.0625f)
	};
#else
#error Unsupported PERMUTATION_MULTI_SAMPLE_COUNT! Must be 1, 2, 4 or 8!
#endif

	// Maximum sample-to-line distance that still counts as covered, in pixel units.
	const float MaxDistance = 1.0f / PERMUTATION_MULTI_SAMPLE_COUNT;

	// Work relative to the pixel origin so the sample positions can be used directly.
	const float2 LocalP0 = P0 - PixelCoord;
	const float2 LocalP1 = P1 - PixelCoord;
	const float RcpLineSegLength = 1.0f / distance(LocalP0, LocalP1);

	uint Mask = 0;
	UNROLL for (uint SampleIdx = 0; SampleIdx < PERMUTATION_MULTI_SAMPLE_COUNT; ++SampleIdx)
	{
		if (GetDistanceToLine(SamplePositions[SampleIdx], LocalP0, LocalP1, RcpLineSegLength) <= MaxDistance)
		{
			Mask |= (1u << SampleIdx);
		}
	}

	return Mask;
}

// Rasterizes one pixel of a hair segment into the LDS sub-tile.
//   Coord             : pixel coordinate to plot.
//   P0, P1            : segment end points (xy = pixel position, z = depth, w is forwarded to the radius interpolation).
//   Rad0, Rad1        : radii at the two end points.
//   SegmentLenSqRcp   : reciprocal of the squared 2D segment length.
//   PackedHalfTileMin : sub-tile origin, x in the low 16 bits and y in the high 16 bits.
//   PrimMatID         : packed primitive/material ID stored for samples that win the depth test.
// Pixels outside the sub-tile, or behind the scene depth, are discarded.
void Plot(int2 Coord, float4 P0, float4 P1, float Rad0, float Rad1, float SegmentLenSqRcp, uint PackedHalfTileMin, uint PrimMatID)
{
	const int2 SubTileOrigin = int2(((PackedHalfTileMin >> 0) & 0xffff), ((PackedHalfTileMin >> 16) & 0xffff));
	const int2 LocalCoord = Coord - SubTileOrigin;

	// Reject pixels that fall outside of the current sub-tile.
	const bool bInsideSubTile = all(LocalCoord >= 0) && all(LocalCoord < int2(HalfTileSize,HalfTileSize));
	if (!bInsideSubTile)
	{
		return;
	}

	const float SegmentAlpha = ComputeLerpAlpha(Coord, P0.xy, P1.xy, SegmentLenSqRcp);
	const float PixelDepth = lerp(P0.z, P1.z, SegmentAlpha);
	const uint PackedDepthCov = PackHairVisDepthCoverage(PixelDepth, 1.0f);
	const uint LdsIndex = LocalCoord.x + LocalCoord.y * HalfTileSize;

	// Scene depth test: only fragments with a larger packed depth than the scene survive.
	if (PackedDepthCov > group_SubTileSceneDepth[LdsIndex])
	{
		const uint CoverageMask = GetCoverageMask(Coord, P0.xy, P1.xy);

		const float Rad = ComputePerspectiveCorrectRadius(Rad0, Rad1, SegmentAlpha, P0.w, P1.w);
		const uint HairCount = min(Rad, 0.5f) * 2.0f * 1000.0f * CoverageScale;

		// The hair count is accumulated once per pixel, as soon as any sample is covered.
		if (CoverageMask != 0)
		{
			InterlockedAdd(group_SubTileHairCount[LdsIndex], HairCount);
		}

		// Per-sample depth test against the hair depth already stored in LDS.
		UNROLL for (uint SampleIdx = 0; SampleIdx < PERMUTATION_MULTI_SAMPLE_COUNT; ++SampleIdx)
		{
			const bool bSampleCovered = (CoverageMask & (1u << SampleIdx)) != 0;
			if (bSampleCovered)
			{
				uint PreviousDepth;
				InterlockedMax(group_SubTileHairDepth[SampleIdx][LdsIndex], PackedDepthCov, PreviousDepth);

				// Only the fragment that raised the stored depth records its primitive/material ID.
				if (PackedDepthCov > PreviousDepth)
				{
					group_SubTilePrimMatID[SampleIdx][LdsIndex] = PrimMatID;
				}
			}
		}
	}
}

// Multi-sample line rasterizer.
// Each 1024-thread group walks a contiguous range of group_LoopNum tiles. For every tile, the
// tile's segments are distributed over the group's threads (ThreadsPerSeg threads per segment),
// and the tile is rasterized as four sub-tiles of HalfTileSize x HalfTileSize pixels through LDS:
//   1. load scene depth / existing hair depth into LDS,
//   2. rasterize the segments into LDS with atomics (see Plot()),
//   3. merge the LDS results into the output textures.
[numthreads(1024, 1, 1)]
void RasterMultiSampleCS(uint DispatchThreadID : SV_DispatchThreadID, uint GroupThreadID : SV_GroupThreadID, uint GroupID : SV_GroupID)
{
	ResolvedView = ResolveView();

	// Thread 0 fetches the group-wide constants; the barrier below publishes them to the whole group.
	if (GroupThreadID == 0)
	{
		group_TileNum = VisTileArgs[0];
		// Tiles per group, rounded up. NOTE(review): assumes RcpNumRasterizers == 1 / NumRasterizers - confirm on the CPU side.
		group_LoopNum = (float(group_TileNum) + float(NumRasterizers - 1)) * RcpNumRasterizers;

		group_PositionOffset = HairStrandsVF_GetHairInstancePositionOffset();
	}

	GroupMemoryBarrierWithGroupSync();

	LOOP for (uint LoopIndex = 0; LoopIndex < group_LoopNum; LoopIndex++)
	{
		const uint TileIndex = LoopIndex + (GroupID * group_LoopNum);

		// TileIndex only depends on group-wide values, so every thread of the group takes this
		// exit together and no thread is left waiting on one of the barriers below.
		if (TileIndex >= group_TileNum)
		{
			return;
		}

		// Each tile reads its primitive IDs from a fixed 1024-entry slice of VisTilePrims.
		const uint PrimOffset = TileIndex * 1024;
		const uint PrimCount = LoadVisTileData(TileIndex, VT_PrimCount);

		const uint PackedCoord = LoadVisTileData(TileIndex, VT_Coord);
		const uint2 TileMin = UnpackVisTileCoord(PackedCoord) * TileSize;

		const uint PackedTileMin = ((TileMin.x & 0xffff) << 0) | ((TileMin.y & 0xffff) << 16);

		// Pick how many threads cooperate on one segment. Every threshold is floor(1024 / N),
		// so PrimCount * ThreadsPerSeg never exceeds the 1024 threads of the group.
		// The checks are ordered so that the last matching (largest) value wins.
		uint ThreadsPerSeg = 1;

		if (PrimCount <= 512)
			ThreadsPerSeg = 2;
		if (PrimCount <= 341)
			ThreadsPerSeg = 3;
		if (PrimCount <= 256)
			ThreadsPerSeg = 4;
		if (PrimCount <= 204)
			ThreadsPerSeg = 5;
		if (PrimCount <= 170)
			ThreadsPerSeg = 6;
		if (PrimCount <= 146)
			ThreadsPerSeg = 7;
		if (PrimCount <= 128)
			ThreadsPerSeg = 8;
		if (PrimCount <= 64)
			ThreadsPerSeg = 16;
		if (PrimCount <= 32)
			ThreadsPerSeg = 32;

		// Prim    = GroupThreadID / ThreadsPerSeg (evaluated in float, +0.5 guards against rounding down).
		// PModTPS = GroupThreadID % ThreadsPerSeg, i.e. this thread's lane within its segment.
		const bool bThreadValid = (GroupThreadID < (PrimCount * ThreadsPerSeg));
		const uint Prim = uint((float(GroupThreadID) + 0.5f) / float(ThreadsPerSeg));
		const uint PModTPS = GroupThreadID - (Prim * ThreadsPerSeg);

		float4 SP0 = 0;
		float4 SP1 = 0;
		float Rad0 = 0;
		float Rad1 = 0;
		bool bIsEndPoint = false;
		uint PrimMatID = ~0;

		// Fetch and project the segment once per tile; it is reused for all four sub-tiles.
		if (bThreadValid)
		{
			const uint PrimID = VisTilePrims[PrimOffset + Prim];
			PrimMatID = PackHairVisControlPointMaterialId(PrimID, HairMaterialId);

			// Type is written by both calls, so after the second call it describes control point PrimID + 1.
			uint Type;
			CalcHomogenousPosAndRad(PrimID, group_PositionOffset, SP0, Rad0, Type);
			CalcHomogenousPosAndRad(PrimID + 1, group_PositionOffset, SP1, Rad1, Type);
			bIsEndPoint = (Type == HAIR_CONTROLPOINT_END);

			// xyz becomes the pixel-space position/depth, w keeps the reciprocal of the homogeneous w.
			SP0 = float4(NDCToPixelCoord(SP0), 1.0f / SP0.w);
			SP1 = float4(NDCToPixelCoord(SP1), 1.0f / SP1.w);
		}

		// Split 32x32 tile into 4 16x16 tiles that are processed one after another.
		// This is to reduce LDS memory pressure.
		// Sub-tile order: 0 = (0,0), 1 = (Half,0), 2 = (0,Half), 3 = (Half,Half), relative to TileMin.
		UNROLL for (uint SubTileIdx = 0; SubTileIdx < 4; ++SubTileIdx)
		{
			const uint2 SubTileMin = TileMin + uint2((SubTileIdx == 0 || SubTileIdx == 2) ? 0 : HalfTileSize, SubTileIdx < 2 ? 0 : HalfTileSize);
			const uint2 SubTileMax = SubTileMin + HalfTileSize;
			const uint PackedSubTileMin = ((SubTileMin.x & 0xFFFF) << 0u) | ((SubTileMin.y & 0xFFFF) << 16u);

			// Initialize LDS
			// One thread per sub-tile pixel; GroupThreadID doubles as the linear LDS index.
			if (GroupThreadID < SqrHalfTileSize)
			{
				// Linear index -> 2D pixel coordinate (row via multiply by the reciprocal, column as remainder).
				uint2 Coord;
				Coord.y = (float(GroupThreadID) + 0.5f) * RcpHalfTileSize;
				Coord.x = GroupThreadID - (Coord.y * HalfTileSize);
				Coord += uint2(((PackedSubTileMin >> 0u) & 0xFFFF), ((PackedSubTileMin >> 16u) & 0xFFFF));

				group_SubTileSceneDepth[GroupThreadID] = PackHairVisDepthCoverage(SceneDepthTexture.Load(uint3(Coord, 0)), 1.0f);
				group_SubTileHairCount[GroupThreadID] = 0;

				// Seed the hair depth with what is already in the output, and mark the ID as "nothing written yet".
				UNROLL for (uint SampleIdx = 0; SampleIdx < PERMUTATION_MULTI_SAMPLE_COUNT; ++SampleIdx)
				{
					const uint HairDepth = OutDepthCovTexture[uint3(Coord, SampleIdx)];
					group_SubTileHairDepth[SampleIdx][GroupThreadID] = HairDepth;
					group_SubTilePrimMatID[SampleIdx][GroupThreadID] = GetInvalidHairControlPointId();
				}
			}

			// LDS must be fully initialized before any thread starts rasterizing into it.
			GroupMemoryBarrierWithGroupSync();

			// Rasterize to LDS
			if (bThreadValid)
			{
				// NOTE(review): these two locals shadow the identically named (and identically valued)
				// constants declared at the top of the sub-tile loop.
				const uint2 SubTileMin = uint2(((PackedSubTileMin >> 0u) & 0xFFFF), ((PackedSubTileMin >> 16u) & 0xFFFF));
				const uint2 SubTileMax = SubTileMin + HalfTileSize;
				bool2 bClipped;
				float2 T;
				// Clip the segment against the sub-tile, extended by half a pixel on each side.
				// T receives the parametric range of the segment that remains; it is clamped to [0,1] below.
				const bool bVisible = ClipRaySegment(SubTileMin - 0.5f, SubTileMax + 0.5f, SP0, SP1, T, bClipped);
				T = saturate(T);

				if (bVisible)
				{
					const float2 SP0Clipped = lerp(SP0, SP1, T.x).xy;
					const float2 SP1Clipped = lerp(SP0, SP1, T.y).xy;
					// Step along the major axis of the (unclipped) segment: y when steep, x otherwise.
					const bool bIsSteep = abs(SP1.x - SP0.x) < abs(SP1.y - SP0.y);
					const float X0 = bIsSteep ? min(SP0Clipped.y, SP1Clipped.y) : min(SP0Clipped.x, SP1Clipped.x);
					const float X1 = bIsSteep ? max(SP0Clipped.y, SP1Clipped.y) : max(SP0Clipped.x, SP1Clipped.x);
					const int NumSteps = (int)(ceil(X1) - floor(X0));
					const float RcpNumSteps = 1.0f / (X1 - X0);

					const float SegmentLenSqRcp = 1.0f / dot(SP1.xy - SP0.xy, SP1.xy - SP0.xy);

					// The last step is skipped unless the far end was clipped or the segment ends the strand.
					// NOTE(review): presumably this avoids plotting the pixel shared with the next segment twice - confirm.
					const int End = !bClipped.y && !bIsEndPoint ? (NumSteps - 1) : NumSteps;
					// The threads assigned to this segment interleave the steps (stride ThreadsPerSeg).
					LOOP for (int J = PModTPS; J < End; J += ThreadsPerSeg)
					{
						// Remap the step position into the clipped parametric range of the full segment.
						const float Alpha = lerp(T.x, T.y, saturate(J * RcpNumSteps));
						const float2 SP = lerp(SP0.xy, SP1.xy, Alpha);
						// SP is implicitly converted to the int2 pixel coordinate expected by Plot().
						Plot(SP, SP0, SP1, Rad0, Rad1, SegmentLenSqRcp, PackedSubTileMin, PrimMatID);
					}
				}
			}

			// All LDS atomics must have landed before the results are read back.
			GroupMemoryBarrierWithGroupSync();

			// Write out to global memory
			if (GroupThreadID < SqrHalfTileSize)
			{
				uint2 Coord;
				Coord.y = (float(GroupThreadID) + 0.5f) * RcpHalfTileSize;
				Coord.x = GroupThreadID - (Coord.y * HalfTileSize);
				Coord += uint2(((PackedSubTileMin >> 0u) & 0xFFFF), ((PackedSubTileMin >> 16u) & 0xFFFF));

				// Skip the global atomic for pixels that received no hair.
				const uint HairCount = group_SubTileHairCount[GroupThreadID];
				if (HairCount != 0)
				{
					InterlockedAdd(OutHairCountTexture[Coord], HairCount);
				}

				UNROLL for (uint SampleIdx = 0; SampleIdx < PERMUTATION_MULTI_SAMPLE_COUNT; ++SampleIdx)
				{
					const uint3 SampleCoord = uint3(Coord, SampleIdx);

					// A valid ID means at least one fragment of this tile won the LDS depth test for this sample.
					const uint PrimMatID = group_SubTilePrimMatID[SampleIdx][GroupThreadID];
					if (PrimMatID != GetInvalidHairControlPointId())
					{
						const uint HairDepth = group_SubTileHairDepth[SampleIdx][GroupThreadID];

						// Merge with the global depth; the ID is only written if this depth raised the stored value.
						uint OldValue;
						InterlockedMax(OutDepthCovTexture[SampleCoord], HairDepth, OldValue);
						if (HairDepth > OldValue)
						{
							OutPrimMatTexture[SampleCoord] = PrimMatID;
						}
					}
				}
			}

			// Keep the next sub-tile's LDS initialization from overwriting data still being written out.
			GroupMemoryBarrierWithGroupSync();
		}
	}
}

#endif //SHADER_RASTERCOMPUTE_RASTER_MULTI_SAMPLE

///////////////////////////////////////////////////////////////////////////

#if SHADER_RASTERCOMPUTE_DEBUG

#include "../ShaderPrint.ush"

// Inputs of the raster-compute debug pass (MainCS).
// VisTileDepthGrid        : declared for this pass; not read by the code visible in this section.
// VisTileBinningGrid      : per-tile segment count, one array slice per bin (see GetTileTotalSegment()).
// VisTileArgs             : element 0 is printed as the allocated tile count.
// MacroGroupId / PrimitiveInfoIndex / TotalPrimitiveInfoCount : values printed in the stats header.
Texture2D<uint>						VisTileDepthGrid;
Texture2DArray<uint> 				VisTileBinningGrid;
Buffer<uint>						VisTileArgs;
uint								MacroGroupId;
uint								PrimitiveInfoIndex;
uint								TotalPrimitiveInfoCount;

// Pixel offset (half a tile) applied to the per-bin text position in GetTileTotalSegment().
#define TilePrintOffset (TileSize >> 1)

// Returns Color with its alpha channel replaced by 0.5.
float4 Transparent(float4 Color)
{
	return float4(Color.x, Color.y, Color.z, 0.5f);
}

// Sums the segment counts of every bin of tile TileCoord in VisTileBinningGrid.
// When bPrintDetails is set, each bin is additionally drawn as a colored cell (green when it
// holds segments, red when empty) with its count, laid out from the top-left corner of the screen.
// Returns the total number of segments binned into the tile.
uint GetTileTotalSegment(uint2 TileCoord, bool bPrintDetails)
{
	const float TileDisplayScale = 1.5f;
	const uint DisplayTileSize = TileSize * TileDisplayScale;
	const uint BinCount = NumBinners;// * 2; // Each binner fill in 2 bins, see binning algo.

	// Cell (column, row) at which the next bin's details are drawn.
	uint2 CellCoord = uint2(0, 0);
	uint SegmentSum = 0;

	for (uint BinIt = 0; BinIt < BinCount; ++BinIt)
	{
		const uint BinSegments = VisTileBinningGrid.Load(uint4(TileCoord, BinIt, 0));
		SegmentSum += BinSegments;

		if (bPrintDetails)
		{
			const uint2 CellMin = CellCoord * DisplayTileSize;
			const uint2 CellMax = (CellCoord + 1) * DisplayTileSize;
			const float4 FillColor = BinSegments > 0 ? Transparent(ColorLightGreen) : Transparent(ColorLightRed);

			AddFilledQuadSS(CellMin, CellMax, FillColor);
			AddQuadSS(CellMin, CellMax, ColorYellow);

			FShaderPrintContext Context = InitShaderPrintContext(true, CellMin + TilePrintOffset);
			Print(Context, BinSegments, FontWhite);
			CellCoord.x += 1;

			// Span details onto 2 lines
			if (BinIt == NumBinners-1)
			{
				CellCoord = uint2(0, CellCoord.y + 1);
			}
		}
	}

	return SegmentSum;
}

// Draws the debug overlay of one tile: a filled quad (green when the tile holds segments, red
// otherwise) and, when bPrintText is set, the segment count plus a yellow outline.
void PrintTile(uint2 TileCoord, uint TotalSegments, bool bPrintText)
{
	const uint2 TileMinPx = TileCoord * TileSize;
	const uint2 TileMaxPx = (TileCoord + 1) * TileSize;
	const float4 FillColor = TotalSegments > 0 ? Transparent(ColorLightGreen) : Transparent(ColorLightRed);

	AddFilledQuadSS(TileMinPx, TileMaxPx, FillColor);

	if (!bPrintText)
	{
		return;
	}

	// The count is printed 1.5 tiles below the top edge of the tile.
	FShaderPrintContext Context = InitShaderPrintContext(true, TileMinPx + uint2(0, TileSize * 1.5f));
	Print(Context, TotalSegments, FontWhite);

	AddQuadSS(TileMinPx, TileMaxPx, ColorYellow);
}

// Debug visualization of the raster-compute pass.
// The first thread of the dispatch prints the configuration/statistics and the per-bin details of
// the tile under the cursor; every thread then overlays the tile matching its dispatch coordinate
// when that tile holds at least one segment.
[numthreads(8, 8, 1)]
void MainCS(uint3 ThreadId : SV_DispatchThreadID)
{
	const bool bFirstThread = all(ThreadId == 0);

	// Info/Stats
	if (bFirstThread)
	{
		FShaderPrintContext Context = InitShaderPrintContext(true, uint2(50, 110));

		Print(Context, TEXT("Raster compute         "), FontYellow);
		Newline(Context);

		Print(Context, TEXT("Macro Group Id       : "), FontSilver);
		Print(Context, MacroGroupId, FontWhite);
		Newline(Context);

		Print(Context, TEXT("Primitive Info       : "), FontSilver);
		Print(Context, PrimitiveInfoIndex, FontWhite, 2, 0);
		Print(Context, TEXT("/"), FontSilver);
		Print(Context, TotalPrimitiveInfoCount, FontWhite, 2, 0);
		Newline(Context);
		Newline(Context);

		Print(Context, TEXT("Configuration          "), FontYellow);
		Newline(Context);

		Print(Context, TEXT("Output Resolution    : "), FontSilver);
		Print(Context, OutputResolution, FontWhite);
		Newline(Context);

		Print(Context, TEXT("Resolution Multiplier: "), FontSilver);
		Print(Context, ResolutionMultiplier, FontWhite);
		Newline(Context);
		Newline(Context);

		Print(Context, TEXT("Tile Size            : "), FontSilver);
		Print(Context, TileSize, FontWhite);
		Newline(Context);

		Print(Context, TEXT("Tile Res             : "), FontSilver);
		Print(Context, TileRes.x, FontWhite, 2, 0);
		Print(Context, TEXT("x"), FontSilver);
		Print(Context, TileRes.y, FontWhite, 2, 0);
		Newline(Context);
		Newline(Context);

		Print(Context, TEXT("Num Binners          : "), FontSilver);
		Print(Context, NumBinners, FontWhite);
		Newline(Context);

		Print(Context, TEXT("Num Rasterizers      : "), FontSilver);
		Print(Context, NumRasterizers, FontWhite);
		Newline(Context);

		Print(Context, TEXT("Max Raster Count     : "), FontSilver);
		Print(Context, MaxRasterCount, FontWhite);
		Newline(Context);
		Newline(Context);

		Print(Context, TEXT("Allocated Tile Count : "), FontSilver);
		Print(Context, VisTileArgs[0], FontWhite);
		Newline(Context);
	}

	// Cursor info: detailed per-bin breakdown of the tile under the cursor.
	if (bFirstThread && all(ShaderPrintData.CursorCoord >= 0))
	{
		const uint2 CursorPixel = ShaderPrintData.CursorCoord;
		const uint2 CursorTile = CursorPixel >> TileSizeAsShift;

		const uint CursorTileSegments = GetTileTotalSegment(CursorTile, true);
		PrintTile(CursorTile, CursorTileSegments, true);
	}

	// All tiles: one thread per tile, only non-empty tiles are drawn.
	{
		const uint2 ThreadTile = ThreadId.xy;
		const uint ThreadTileSegments = GetTileTotalSegment(ThreadTile, false);
		if (ThreadTileSegments != 0)
		{
			PrintTile(ThreadTile, ThreadTileSegments, false);
		}
	}
}
#endif //SHADER_RASTERCOMPUTE_DEBUG