// Copyright Epic Games, Inc. All Rights Reserved.

#define VF_SUPPORTS_PRIMITIVE_SCENE_DATA 1

#include "/Engine/Public/Platform.ush"
#include "/Engine/Private/Common.ush"
#include "/Engine/Private/SceneData.ush"
#include "../Nanite/NaniteHZBCull.ush"
#include "../ColorMap.ush"

#if PERMUTATION_DEBUG
#include "../ShaderPrint.ush"
#endif

// Bin tile
#define BIN_TILE_SIZE 32
#define BIN_TILE_INV_SIZE (1.f / float(BIN_TILE_SIZE))
#define BIN_TILE_SIZE_DIV_AS_SHIFT 5

// Raster tile
#define RASTER_TILE_SIZE 8
#define BIN_RASTER_INV_SIZE (1.f / float(RASTER_TILE_SIZE))
#define RASTER_TILE_SIZE_DIV_AS_SHIFT 3

// 
#define NUM_CURVE_PER_CLUSTER 64
#define RENDER_CURVE_PRIMITIVE_DATA_STRIDE_IN_BYTES 16

#define FPackedSegmentType uint4

#ifndef THREADGROUP_SIZE
#error THREADGROUP_SIZE needs to be defined
#endif

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Debug
#if PERMUTATION_DEBUG

// Creates a ShaderPrint context positioned at InBaseCoord + InOffset * Slot, where Slot is obtained by
// atomically incrementing entry 3 of the ShaderPrint entry buffer (the "free counter").
// * bActive     : when false no slot is reserved and the returned context is positioned at InBaseCoord.
// * InBaseCoord : base cursor coordinate.
// * InOffset    : per-slot cursor increment.
FShaderPrintContext InitShaderPrintContextUnique(bool bActive, uint2 InBaseCoord, uint2 InOffset)
{
	FShaderPrintContext TmpCtx = InitShaderPrintContext(bActive, uint2(0, 0));

	// Must be initialized: the atomic below only writes it for active contexts, but it is read unconditionally.
	uint UniqueOffset = 0;
	if (bActive)
	{
		SHADER_PRINT_INTERLOCKEDADD(SHADER_PRINT_RWENTRYBUFFER(TmpCtx, 3) /* Free counter */, 1, UniqueOffset);
	}
	return InitShaderPrintContext(bActive, InBaseCoord + InOffset * UniqueOffset);
}


// Cursor-driven variant of InitShaderPrintContextUnique: the context is created with
// InitShaderPrintContextAtCursor(ActiveCoord, ...), and a unique slot is only reserved (by atomically
// incrementing entry 3 of the ShaderPrint entry buffer) when that context reports bIsActive.
// The returned context is positioned at InBaseCoord + InOffset * Slot.
FShaderPrintContext InitShaderPrintContextAtCursorUnique(uint2 ActiveCoord, uint2 InBaseCoord, uint2 InOffset)
{
	FShaderPrintContext TmpCtx = InitShaderPrintContextAtCursor(ActiveCoord, uint2(0, 0));

	// Must be initialized: the atomic below only writes it for active contexts, but it is read unconditionally.
	uint UniqueOffset = 0;
	if (TmpCtx.bIsActive)
	{
		SHADER_PRINT_INTERLOCKEDADD(SHADER_PRINT_RWENTRYBUFFER(TmpCtx, 3) /* Free counter */, 1, UniqueOffset);
	}
	return InitShaderPrintContextAtCursor(ActiveCoord, InBaseCoord + InOffset * UniqueOffset);
}

// Prints a one-glyph marker for a boolean: a green "x " when set, a red ". " otherwise.
void PlotCondition(inout FShaderPrintContext Ctx, bool bCondition)
{
	if (!bCondition)
	{
		Print(Ctx, TEXT(". "), FontRed);
		return;
	}

	Print(Ctx, TEXT("x "), FontGreen);
}

#endif // PERMUTATION_DEBUG

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Pack/unpack

// Quantizes a value clamped to [0,1] to a 7-bit integer (truncating).
uint PackR7(float In)
{
	const float Scaled = saturate(In) * 127.f;
	return uint(Scaled) & 0x7F;
}
// Converts the low 7 bits of In back to a float in [0,1].
float UnpackR7(uint In)
{
	const uint Bits = In & 0x7F;
	return Bits / 127.f;
}
// Quantizes a value clamped to [0,1] to an 18-bit integer (truncating).
uint PackR18(float In)
{
	const float Scaled = saturate(In) * 262143.f;
	return uint(Scaled) & 0x3FFFF;
}
// Converts the low 18 bits of In back to a float in [0,1].
float UnpackR18(uint In)
{
	const uint Bits = In & 0x3FFFF;
	return Bits / 262143.f;
}

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Instance
// Per-instance data for a render-curve primitive, as filled by GetRenderCurveInstanceData().
struct FRenderCurveInstanceData
{
	bool bIsValid;                 // True when the requested index was below Scene.RenderCurve.InstanceCount
	uint PersistentPrimitiveIndex; // Index passed to GetPrimitiveData()
	uint InstanceSceneDataOffset;  // Offset passed to GetInstanceSceneDataUnchecked()
	uint ClusterOffset;            // Index of the instance's first cluster
	uint ClusterCount;             // Number of clusters owned by the instance

	float4 TranslatedWorldBoundCenterAndRadius; // xyz: primitive object position in translated world space, w: primitive object radius
	float3 LocalBoundsCenter;      // Instance local bounds center
	float3 LocalBoundsExtent;      // Instance local bounds extent
	float4x4 LocalToTranslatedWorld; // Instance local-to-translated-world transform
};

// Loads and decodes the render-curve instance entry at InPrimitiveIndex.
// Returns a zeroed entry (bIsValid == false) when the index is out of range.
FRenderCurveInstanceData GetRenderCurveInstanceData(uint InPrimitiveIndex)
{
	FRenderCurveInstanceData Result = (FRenderCurveInstanceData)0;
	if (InPrimitiveIndex >= Scene.RenderCurve.InstanceCount)
	{
		return Result;
	}

	// One uint4 per instance: primitive index, instance scene data offset, cluster offset, cluster count.
	const uint ByteOffset = InPrimitiveIndex * RENDER_CURVE_PRIMITIVE_DATA_STRIDE_IN_BYTES;
	const uint4 Raw = Scene.RenderCurve.RenderCurveInstanceData.Load4(ByteOffset);

	Result.bIsValid                 = true;
	Result.PersistentPrimitiveIndex = Raw.x;
	Result.InstanceSceneDataOffset  = Raw.y;
	Result.ClusterOffset            = Raw.z;
	Result.ClusterCount             = Raw.w;

	const FPrimitiveSceneData Primitive = GetPrimitiveData(Result.PersistentPrimitiveIndex);
	const FInstanceSceneData Instance = GetInstanceSceneDataUnchecked(Result.InstanceSceneDataOffset);

	// Transform and bounds, expressed relative to the view's pre-view translation.
	Result.LocalToTranslatedWorld = DFFastToTranslatedWorld(Instance.LocalToWorld, ResolvedView.PreViewTranslation);
	Result.TranslatedWorldBoundCenterAndRadius = float4(
		DFFastToTranslatedWorld(Primitive.ObjectWorldPosition, ResolvedView.PreViewTranslation),
		Primitive.ObjectRadius);
	Result.LocalBoundsCenter = Instance.LocalBoundsCenter;
	Result.LocalBoundsExtent = Instance.LocalBoundsExtent;

	return Result;
}

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Cluster Header
// Decoded cluster header, as filled by GetClusterHeader() from the first 32 bytes of a cluster.
struct FClusterHeader
{
	float3 Center;           // Stored as three full floats; also used as the position offset of the cluster points
	uint CurveCount;         // 8-bit field
	uint PointPerCurve;      // 8-bit field
	float MaxLength;         // Stored as half float
	float MaxRadius;         // Stored as half float; scales the normalized per-point radius
	float3 LocalBoundCenter; // Stored as half floats relative to Center; Center is added back on load
	float3 LocalBoundExtent; // Stored as half floats
};

// Loads and decodes the 32-byte header of cluster InClusterIndex.
// Returns a zeroed header when the index is out of range.
FClusterHeader GetClusterHeader(uint InClusterIndex)
{
	FClusterHeader Header = (FClusterHeader)0;
	if (InClusterIndex >= Scene.RenderCurve.ClusterCount)
	{
		return Header;
	}

	const uint4 Word0 = Scene.RenderCurve.ClusterData.Load4(InClusterIndex * Scene.RenderCurve.MaxClusterStrideInBytes);
	const uint4 Word1 = Scene.RenderCurve.ClusterData.Load4(InClusterIndex * Scene.RenderCurve.MaxClusterStrideInBytes + 16u);

	// First 16 bytes: center (3 x float) + packed counts
	Header.Center        = asfloat(Word0.xyz);
	Header.CurveCount    = BitFieldExtractU32(Word0.w, 8, 0);
	Header.PointPerCurve = BitFieldExtractU32(Word0.w, 8, 8);

	// Second 16 bytes: eight half floats
	Header.MaxLength = f16tof32(BitFieldExtractU32(Word1.x, 16,  0));
	Header.MaxRadius = f16tof32(BitFieldExtractU32(Word1.x, 16, 16));

	const float3 RelativeBoundCenter = float3(
		f16tof32(BitFieldExtractU32(Word1.y, 16,  0)),
		f16tof32(BitFieldExtractU32(Word1.y, 16, 16)),
		f16tof32(BitFieldExtractU32(Word1.z, 16,  0)));

	Header.LocalBoundExtent = float3(
		f16tof32(BitFieldExtractU32(Word1.z, 16, 16)),
		f16tof32(BitFieldExtractU32(Word1.w, 16,  0)),
		f16tof32(BitFieldExtractU32(Word1.w, 16, 16)));

	// The bound center is stored relative to the cluster center
	Header.LocalBoundCenter = RelativeBoundCenter + Header.Center;

	return Header;
}

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Min/Max-Z

// Depth range and derived remapping terms, as filled by UnpackMinMaxZ().
struct FMinMaxZ
{
	float MinZ;     // Lower depth bound (after applying the optional SceneDepthMinZ floor)
	float MaxZ;     // Upper depth bound
	float Range;    // MaxZ - MinZ
	float InvRange; // 1 / max(Range, 1e-5)
	float Offset;   // -MinZ * InvRange, so that Z * Scale + Offset == (Z - MinZ) * InvRange
	float Scale;    // Same value as InvRange
};

// Builds a FMinMaxZ from two depth values stored as raw float bits.
// * In0 / In1       : bit patterns of the min / max depth
// * SceneDepthMinZ  : lower bound applied to the min depth
FMinMaxZ UnpackMinMaxZ(uint In0, uint In1, float SceneDepthMinZ=0.f)
{
	FMinMaxZ Result;
	Result.MinZ  = max(asfloat(In0), SceneDepthMinZ);
	Result.MaxZ  = asfloat(In1);
	Result.Range = Result.MaxZ - Result.MinZ;

	// Guard the reciprocal against an empty/degenerate range
	const float RcpRange = 1.f / max(Result.Range, 1e-5f);
	Result.InvRange = RcpRange;
	Result.Offset   = -Result.MinZ * RcpRange;
	Result.Scale    = RcpRange;
	return Result;
}

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Cluster Data - Point

// Decoded curve control point, as filled by UnpackCurvePoint().
struct FCurvePoint
{
	float3 Position; // Decoded position plus the position offset supplied at unpack time
	float  Radius;   // Decoded from a 6-bit field and scaled by the max radius supplied at unpack time
	float  UCoord;   // Decoded from an 8-bit field
	bool   bValid;   // Bit 30 of the second packed word
};

// Decodes a 64-bit packed curve point.
// * In.x       : position XY (decoded with UnpackFloat2FromUInt)
// * In.y[0:15] : position Z as half float
// * In.y[16:23]: U coordinate (8 bits)
// * In.y[24:29]: radius (6 bits), scaled by InMaxRadius
// * In.y[30]   : valid flag
FCurvePoint UnpackCurvePoint(uint2 In, float3 InPositionOffset, float InMaxRadius)
{
	const float2 PositionXY = UnpackFloat2FromUInt(In.x);
	const float  PositionZ  = f16tof32(BitFieldExtractU32(In.y, 16, 0));

	FCurvePoint Point = (FCurvePoint)0;
	Point.Position = float3(PositionXY, PositionZ) + InPositionOffset;
	Point.UCoord   = UnpackR8(BitFieldExtractU32(In.y, 8, 16));
	Point.Radius   = UnpackR6(BitFieldExtractU32(In.y, 6, 24)) * InMaxRadius;
	Point.bValid   = BitFieldExtractU32(In.y, 1, 30);
	return Point;
}

// Loads point PointIndex of curve CurveIndex from cluster ClusterIndex.
// Points follow the 32-byte cluster header and are laid out point-major:
// all curves' point 0, then all curves' point 1, etc. (8 bytes per point).
FCurvePoint GetClusterPoint(FClusterHeader Header, uint ClusterIndex, uint CurveIndex, uint PointIndex)
{
	const uint HeaderSizeInBytes = 32u;
	const uint PointSizeInBytes  = 8u;

	const uint ClusterBaseInBytes = ClusterIndex * Scene.RenderCurve.MaxClusterStrideInBytes;
	const uint LinearPointIndex   = Header.CurveCount * PointIndex + CurveIndex;
	const uint PointByteOffset    = LinearPointIndex * PointSizeInBytes;

	const uint2 Raw = Scene.RenderCurve.ClusterData.Load2(ClusterBaseInBytes + HeaderSizeInBytes + PointByteOffset);
	return UnpackCurvePoint(Raw, Header.Center, Header.MaxRadius);
}

///////////////////////////////////////////////////////////////////////////
// Segment

// 64bit segment encoding
// Anchor point top-left tile corner 
// * P0.xy = 7/7bit pos + 18bit depth = 32bit
// * P1.xy = 7/7bit pos + 18bit depth = 32bit
// Unpacked segment. PackSegment()/UnpackSegment() treat xy as pixel coordinates (stored relative to a bin tile)
// and z as a depth value remapped through a FMinMaxZ range.
struct FSegment
{	
	float3 P0;    // First endpoint
	float3 P1;    // Second endpoint
	float3 Color; // Stored with PackR10G10B10F
};

bool ClipSegment(float2 AABBMin, float2 AABBMax, inout float3 P0, inout float3 P1);

// Clips a segment to the bin tile TileCoord and quantizes it relative to that tile.
// Output layout:
// * x : P0 = 7 bit X | 7 bit Y | 18 bit depth
// * y : P1 = 7 bit X | 7 bit Y | 18 bit depth
// * z : color (PackR10G10B10F)
// * w : unused (0)
FPackedSegmentType PackSegment(uint2 TileCoord, FMinMaxZ MinMaxZ, FSegment In)
{
	// Tile rectangle in pixels
	const float2 TileMin = TileCoord     * BIN_TILE_SIZE;
	const float2 TileMax = (TileCoord+1) * BIN_TILE_SIZE;

	// Clip the endpoints to the tile rectangle (In is a by-value copy)
	ClipSegment(TileMin, TileMax, In.P0, In.P1);

	// Normalized position within the tile
	const float2 UV0 = (In.P0.xy - TileMin) / BIN_TILE_SIZE;
	const float2 UV1 = (In.P1.xy - TileMin) / BIN_TILE_SIZE;

	// Normalized depth within the min/max range
	const float Depth0 = (In.P0.z - MinMaxZ.MinZ) * MinMaxZ.InvRange;
	const float Depth1 = (In.P1.z - MinMaxZ.MinZ) * MinMaxZ.InvRange;

	// Quantize and pack each endpoint into 32 bits
	const uint PackedP0 = PackR7(UV0.x) | (PackR7(UV0.y) << 7) | (PackR18(Depth0) << 14);
	const uint PackedP1 = PackR7(UV1.x) | (PackR7(UV1.y) << 7) | (PackR18(Depth1) << 14);

	return FPackedSegmentType(PackedP0, PackedP1, PackR10G10B10F(In.Color), 0);
}

// Inverse of PackSegment(): rebuilds pixel-space endpoints, depth and color from the packed form.
FSegment UnpackSegment(uint2 TileCoord, FMinMaxZ MinMaxZ, FPackedSegmentType In)
{
	const float2 TileOrigin = TileCoord * BIN_TILE_SIZE;

	// Endpoint 0 (In.x)
	const float X0 = UnpackR7 (BitFieldExtractU32(In.x,  7,  0)) * BIN_TILE_SIZE + TileOrigin.x;
	const float Y0 = UnpackR7 (BitFieldExtractU32(In.x,  7,  7)) * BIN_TILE_SIZE + TileOrigin.y;
	const float Z0 = UnpackR18(BitFieldExtractU32(In.x, 18, 14)) * MinMaxZ.Range + MinMaxZ.MinZ;

	// Endpoint 1 (In.y)
	const float X1 = UnpackR7 (BitFieldExtractU32(In.y,  7,  0)) * BIN_TILE_SIZE + TileOrigin.x;
	const float Y1 = UnpackR7 (BitFieldExtractU32(In.y,  7,  7)) * BIN_TILE_SIZE + TileOrigin.y;
	const float Z1 = UnpackR18(BitFieldExtractU32(In.y, 18, 14)) * MinMaxZ.Range + MinMaxZ.MinZ;

	FSegment Segment;
	Segment.P0    = float3(X0, Y0, Z0);
	Segment.P1    = float3(X1, Y1, Z1);
	Segment.Color = UnpackR10G10B10F(In.z);
	return Segment;
}

///////////////////////////////////////////////////////////////////////////
// DDA helper
 
// TODO: Without setting a limit, we are sometimes caught in an infinite loop even though this should not happen.
#define DDA_MAX_ITERATIONS 256

// State of a 2D grid traversal (DDA), created by DDACreateContext() and stepped by DDAAdvance().
struct FDDAContext
{
	float2 Coord;     // Current cell (integer-valued, starts at floor(RayStart))
	float2 DeltaDist; // |1 / RayDir|: ray parameter increment needed to cross one cell along each axis
	float2 Step;      // sign(RayDir): cell increment along each axis
	float2 SideDist;  // Ray parameter at which the next cell boundary is crossed along each axis
};

// Initializes a grid traversal starting in the cell containing RayStart and moving along RayDir.
FDDAContext DDACreateContext(float2 RayStart, float2 RayDir)
{
	const float2 InvDir = 1.0f / RayDir;
	const float2 StartCell = floor(RayStart);
	const float2 StepSign = sign(RayDir);

	FDDAContext Ctx;
	Ctx.Coord     = StartCell;
	Ctx.DeltaDist = abs(InvDir);
	Ctx.Step      = StepSign;
	// Distance (in ray parameter) from the start to the first boundary crossed along each axis
	Ctx.SideDist  = (StartCell - RayStart + max(StepSign, 0.0f)) * InvDir;
	return Ctx;
}

// Steps the traversal into the next cell: crosses whichever axis boundary is closest (Y on ties).
void DDAAdvance(inout FDDAContext Context)
{
	const bool bCrossX = Context.SideDist.x < Context.SideDist.y;
	if (bCrossX)
	{
		Context.Coord.x    += Context.Step.x;
		Context.SideDist.x += Context.DeltaDist.x;
	}
	else
	{
		Context.Coord.y    += Context.Step.y;
		Context.SideDist.y += Context.DeltaDist.y;
	}
}

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

#define USE_SEGMENT_LUT 0
Texture2D<uint2> SegmentLUT;

// Output an 8x8 bitmask of the rasterized segment
// * Pos0/Pos1 are in [0..1]
// * Pos0/Pos1 are clipped to border
uint2 GetSegmentBits(Texture2D<float> InSceneDepthTexture, uint2 InCoordOffset, float2 Pos0, float2 Pos1, float PosZ0, float PosZ1, bool bDepthTestEnable)
{
	// Returns the 64-bit coverage mask as (bits 0..31, bits 32..63); the bit index of cell (x,y) is x + y * 8.
	// * InSceneDepthTexture / InCoordOffset : scene depth is loaded at InCoordOffset + cell
	// * PosZ0 / PosZ1                       : depth at Pos0 / Pos1, interpolated along the segment
	// * bDepthTestEnable                    : when set, a cell is only marked if the segment depth is greater than the scene depth
#if USE_SEGMENT_LUT
	Pos0 *= BIN_RASTER_INV_SIZE;
	Pos1 *= BIN_RASTER_INV_SIZE;

	// Sample a 256x256 LUT 4D texture order as follow:
	//    <-----------16------------>
	//     16                       A
	//    [  ] | [  ] | [  ] | [  ] |
	// 16 [  ] | [  ] | [  ] | [  ] |
	//    ---- |----- |----- |----- | 16
	//    [  ] | [  ] | [  ] | [  ] |
	//    [  ] | [  ] | [  ] | [  ] |
	//                              v
	// NOTE(review): iPos0 is clamped after the x16 scale, so a clamped value (0xFF) is not a multiple of 16 - confirm against the LUT layout.
	const uint2 iPos0 = min(uint2(Pos0 * 16u) * 16u, 0xFF);
	const uint2 iPos1 = min(uint2(Pos1 * 16u), 0xF);
	const uint2 Coord = iPos0 + iPos1;
	return SegmentLUT[Coord];
#else
	uint2 Out = 0;
	FDDAContext DDAContext = DDACreateContext(Pos0.xy, normalize(Pos1.xy - Pos0.xy));
	const int2 StartCoord = (int2)floor(Pos0.xy);
	const int2 EndCoord = (int2)floor(Pos1.xy);

	// Loop invariant. This is zero when both endpoints fall into the same cell, in which case the
	// interpolation factor below would be 0/0; the depth of the first endpoint is used instead.
	const float CellSpan = length(EndCoord - StartCoord);

	for (uint DDAIt = 0; DDAIt < 16u; ++DDAIt)
	{
		int2 TileCoord = (int2)floor(DDAContext.Coord);

		TileCoord = clamp(TileCoord, 0, 7);

		// TODO make this more optimal
		// On a simple example this cost 0.7ms
		#if 1
		const float s = CellSpan > 0.f ? clamp(length(TileCoord - StartCoord) / CellSpan, 0, 1) : 0.f;
		const float SceneDepth = InSceneDepthTexture.Load(uint3(InCoordOffset + TileCoord, 0));
		const float SegmentDepth = lerp(PosZ0, PosZ1, s);
		const bool bVisible = bDepthTestEnable ? SegmentDepth > SceneDepth : true;
		#else
		const bool bVisible = true;
		#endif
		if (bVisible)
		{
			// Linear cell index within the 8x8 block, split across the two 32-bit words
			const uint l = TileCoord.x + TileCoord.y * 8u;
			if (l < 32u)
			{
				Out.x |= 1u << l;
			}
			else
			{
				Out.y |= 1u << (l - 32);
			}
		}

		if (all(TileCoord == EndCoord))
		{
			break;
		}
		DDAAdvance(DDAContext);
	}
	return Out;
#endif
}

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// HZB
// HZB occlusion test for a frustum-culled box.
// Boxes crossing the near plane skip the test and are reported visible.
bool HZB(FFrustumCullData FrustumCull)
{
	int4 ViewRect = ResolvedView.ViewRectMinAndSize;
	FScreenRect ScreenRect = GetScreenRect(ViewRect, FrustumCull, 4);

	BRANCH
	if (FrustumCull.bCrossesNearPlane)
	{
		return true;
	}

	return IsVisibleHZB(ScreenRect, true);
}

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

// Reinterprets a pair of depth values as raw bits (no quantization).
uint2 PackDepth(float2 In)
{
	const uint2 Bits = asuint(In);
	return Bits;
}

// Reinterprets raw bits written by PackDepth() back into a pair of depth values.
float2 UnpackDepth(uint2 In)
{
	const float2 Depths = asfloat(In);
	return Depths;
}

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Projection

// Converts a clip-space position to (pixel X, pixel Y, NDC depth).
// XY are remapped with ResolvedView.ScreenPositionScaleBias and scaled by InResolution.
float3 NDCToPixelCoord(float4 InDC, uint2 InResolution)
{
	// Perspective divide
	const float3 DeviceCoord = InDC.xyz / InDC.w;

	const float2 UVScale = ResolvedView.ScreenPositionScaleBias.xy;
	const float2 UVBias  = ResolvedView.ScreenPositionScaleBias.wz;
	const float2 ScreenUV = DeviceCoord.xy * UVScale + UVBias;

	return float3(ScreenUV * InResolution, DeviceCoord.z);
}

// Line clipping based on "CLIPPING USING HOMOGENEOUS COORDINATES" by Blinn et al.
// Clips the homogeneous segment [P0,P1] against the six clip planes (w+x, w-x, w+y, w-y, z, w-z >= 0).
// Returns true when part of the segment survives, in which case P0/P1 are replaced by the clipped endpoints.
// Returns false when the segment is entirely clipped away; P0/P1 are left untouched in that case.
bool BlinnLineClipping(inout float4 P0, inout float4 P1)
{
	// Parametric interval [T.x, T.y] of the segment that is still inside all planes processed so far
	float2 T = float2(0.0f, 1.0f);
	bool bIsRemoved = P0.w < 0.0f && P1.w < 0.0f; // Both points behind near plane

	bool bSign = false;

	UNROLL
	for (uint PlaneIdx = 0; PlaneIdx < 6; ++PlaneIdx)
	{
		// Compute boundary coordinates of both points (w+x, w-x, w+y, w-y, z, w-z)
		bSign = !bSign;
		const uint CompIdx = PlaneIdx / 2;
		const float Sign = bSign ? 1.0f : -1.0f;
		const float WFactor = PlaneIdx != 4 ? 1.0f : 0.0f;
		const float2 BC = WFactor * float2(P0.w, P1.w) + Sign * float2(P0[CompIdx], P1[CompIdx]);

		float Num = BC.x;
		float Denom = BC.x - BC.y;
		bIsRemoved = bIsRemoved || (BC.x < 0.0f && BC.y < 0.0f); // Both points outside the plane
		float Alpha = Num / Denom;
		
		// If the denominator is negative, P0 has a smaller boundary coordinate than P1, so we can assume
		// that P1 is inside the plane (or bIsRemoved is true), so we need to update the alpha for P0.
		// The reverse is true if the denominator is positive.
		if (Denom < 0.0f)
		{
			T.x = max(T.x, Alpha);
		}
		else
		{
			T.y = min(T.y, Alpha);
		}
	}

	// The endpoints can each be outside a *different* plane while the segment still misses the clip volume
	// (e.g. it cuts across a corner). This shows up as an empty parametric interval and must be rejected,
	// otherwise the 'clipped' endpoints would both lie outside of the volume.
	bIsRemoved = bIsRemoved || (T.x > T.y);

	if (!bIsRemoved)
	{
		const float4 P0Clipped = lerp(P0, P1, T.x);
		const float4 P1Clipped = lerp(P0, P1, T.y);
		P0 = P0Clipped;
		P1 = P1Clipped;
	}

	return !bIsRemoved;
}

// Intersects the 2D segment [P0,P1] with an axis-aligned box (slab test).
// * T        : parametric entry (x) / exit (y) of the supporting line through the box; (0,1) when both endpoints are inside
// * bClipped : per-endpoint flag set when that endpoint is outside and its intersection lies strictly within the segment;
//              both flags are set when the segment misses the box
// Returns false when the segment does not touch the box.
bool InternalClipSegment(float2 AABBMin, float2 AABBMax, float2 P0, float2 P1, out float2 T, out bool2 bClipped)
{
	T = float2(0.0f, 1.0f);
	bClipped = false;

	const bool bP0Outside = any(P0 < AABBMin) || any(P0 > AABBMax);
	const bool bP1Outside = any(P1 < AABBMin) || any(P1 > AABBMax);

	// Fully inside: nothing to clip
	if (!(bP0Outside || bP1Outside))
	{
		return true;
	}

	// Per-axis parametric distances to the min/max slabs
	const float2 InvDelta = 1.0f / (P1 - P0);
	const float2 TSlabMin = (AABBMin - P0) * InvDelta;
	const float2 TSlabMax = (AABBMax - P0) * InvDelta;

	const float2 TEnter = min(TSlabMin, TSlabMax);
	const float2 TExit  = max(TSlabMin, TSlabMax);
	T.x = max(TEnter.x, TEnter.y);
	T.y = min(TExit.x, TExit.y);

	// Ray intersects the AABB but the segment is completely outside or no intersection at all.
	const bool bMissed = T.y < 0.0f || T.x > T.y || T.x > 1.f;
	if (bMissed)
	{
		bClipped = true;
		return false;
	}

	bClipped.x = bP0Outside && T.x > 0.0f && T.x < 1.0f;
	bClipped.y = bP1Outside && T.y > 0.0f && T.y < 1.0f;
	return true;
}

// Clips the segment [P0,P1] against a 2D axis-aligned box using the XY components; Z is interpolated along.
// Endpoints are only moved when they lie outside of the box and their intersection is strictly within the segment.
// Returns false (leaving P0/P1 untouched) when the segment does not touch the box.
bool ClipSegment(float2 AABBMin, float2 AABBMax, inout float3 P0, inout float3 P1)
{
	float2 T = 0;
	bool2 bClipped = false;
	if (!InternalClipSegment(AABBMin, AABBMax, P0.xy, P1.xy, T, bClipped))
	{
		return false;
	}

	// Both new endpoints are interpolated from the *original* endpoints
	const float3 Start = P0;
	const float3 End   = P1;

	const bool bStartOutside = any(Start.xy < AABBMin) || any(Start.xy > AABBMax);
	const bool bEndOutside   = any(End.xy < AABBMin) || any(End.xy > AABBMax);

	if (bStartOutside && T.x > 0.0f && T.x < 1.0f)
	{
		P0 = lerp(Start, End, T.x);
	}
	if (bEndOutside && T.y > 0.0f && T.y < 1.0f)
	{
		P1 = lerp(Start, End, T.y);
	}

	return true;
}

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Debug color of a curve: Viridis color map indexed by the curve's global index (64 curves per cluster) over 2048.
float3 GetCurveColor(uint ClusterIndex, uint CurveIndex)
{
	const uint GlobalCurveIndex = ClusterIndex * NUM_CURVE_PER_CLUSTER + CurveIndex;
	return ColorMapViridis(float(GlobalCurveIndex) / 2048);
}

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Common parameters

int2 Resolution;

uint BinTileSize;
int2 BinTileRes;
uint NumBinners;

uint RasterTileSize;
int2 RasterTileRes;
uint NumRasterizers;

uint MaxTileDataCount;
uint MaxSegmentDataCount;
uint MaxZBinDataCount;
uint MaxRasterWorkCount;
uint MaxZBinSegmentDataCount;

float MinCoverageThreshold;

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

#ifdef InstanceCullingCS

RWBuffer<uint> RWVisibleInstanceArgs;
RWStructuredBuffer<uint> RWVisibleInstances;
RWStructuredBuffer<uint> RWMinMaxZ;

// One thread per render-curve instance: frustum + (optional) HZB culling.
// Visible instance indices are appended to RWVisibleInstances; RWVisibleInstanceArgs[0] holds the running count.
[numthreads(64, 1, 1)]
void InstanceCullingCS(uint2 DispatchThreadId : SV_DispatchThreadID)
{
	ResolvedView = ResolveView(); 

	// The very first thread sets the Y/Z arguments and resets the min/max-Z accumulators
	const bool bIsFirstThread = all(DispatchThreadId == 0);
	if (bIsFirstThread)
	{
		RWVisibleInstanceArgs[1] = 1;
		RWVisibleInstanceArgs[2] = 1;

		RWMinMaxZ[0] = ~0u;
		RWMinMaxZ[1] =  0u;
	}

	const uint PrimitiveIndex = DispatchThreadId.x;
	const FRenderCurveInstanceData Instance = GetRenderCurveInstanceData(PrimitiveIndex);

	if (Instance.bIsValid)
	{
		// 1. Distance culling
		// TODO?

		// 2. Frustum culling
		const FFrustumCullData CullData = BoxCullFrustum( 
			Instance.LocalBoundsCenter, 
			Instance.LocalBoundsExtent, 
			Instance.LocalToTranslatedWorld, 
			ResolvedView.TranslatedWorldToClip, 
			ResolvedView.ViewToClip, 
			false /*bIsOrtho*/, 
			true  /*bNearClip*/, 
			false /*bSkipCullFrustum*/);
		bool bPassed = CullData.bIsVisible;

		// 3. HZB culling (only evaluated for instances which passed the frustum test)
		if (bIsHZBValid && bPassed)
		{
			bPassed = HZB(CullData);
		}

		if (bPassed)
		{
			// Reserve an output slot and record the instance
			uint OutputSlot = 0;
			WaveInterlockedAddScalar_(RWVisibleInstanceArgs[0], 1, OutputSlot);
			RWVisibleInstances[OutputSlot] = PrimitiveIndex;

			#if PERMUTATION_DEBUG
			{
				const float3 BoundCenter = Instance.TranslatedWorldBoundCenterAndRadius.xyz;
				const float3 BoundRadius = Instance.TranslatedWorldBoundCenterAndRadius.www;

				FShaderPrintContext Ctx = InitShaderPrintContext(true, uint2(50, 80 + OutputSlot * 15));
				AddAABBTWS(Ctx, BoundCenter - BoundRadius, BoundCenter + BoundRadius, ColorYellow);
				AddOBBTWS(Ctx, Instance.LocalBoundsCenter - Instance.LocalBoundsExtent, Instance.LocalBoundsCenter + Instance.LocalBoundsExtent, ColorGreen, Instance.LocalToTranslatedWorld);
			}
			#endif
		}
	}
}

#endif // InstanceCullingCS

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

#ifdef ClusterCullingCS

Buffer<uint> VisibleInstanceArgs;
StructuredBuffer<uint> VisibleInstances;

RWBuffer<uint> RWVisibleClusterArgs;
RWStructuredBuffer<uint2> RWVisibleClusters;
RWStructuredBuffer<uint> RWMinMaxZ;

// One thread group per visible instance (GroupId.x indexes VisibleInstances). The threads of the group stride
// over the instance's clusters, cull each one (frustum + optional HZB), and append surviving clusters to
// RWVisibleClusters as (PrimitiveIndex, ClusterIndex). The clamped depth bounds of every processed cluster
// are accumulated into RWMinMaxZ.
[numthreads(THREADGROUP_SIZE, 1, 1)]
void ClusterCullingCS(uint3 GroupId : SV_GroupID, uint3 DispatchThreadId : SV_DispatchThreadID, uint LinearThreadIndex : SV_GroupIndex)
{
	ResolvedView = ResolveView(); 

	// The very first thread sets the Y/Z arguments
	if (all(DispatchThreadId == 0))
	{
		RWVisibleClusterArgs[1] = 1;
		RWVisibleClusterArgs[2] = 1;
	}

	// Groups beyond the number of visible instances have nothing to do
	const uint VisibleInstanceCount = VisibleInstanceArgs[0];
	const uint VisibleInstanceIndex = GroupId.x;
	if (VisibleInstanceIndex >= VisibleInstanceCount)
	{
		return;
	}

	const uint PrimitiveIndex = VisibleInstances[VisibleInstanceIndex];
	const FRenderCurveInstanceData RenderCurveInstanceData = GetRenderCurveInstanceData(PrimitiveIndex);

	
	#if PERMUTATION_DEBUG
	if (0)
	{
		//FShaderPrintContext Ctx = InitShaderPrintContext(true, uint2(500,50));
		//Print(Ctx, TEXT("CLUSTER CULLING"), FontRed); Newline(Ctx);
		//const float4 ClusterColor = float4(ColorMapMagma(float(ClusterIt) / RenderCurveInstanceData.ClusterCount), 1);
		//AddOBBTWS(Ctx, LocalBoundsCenter - LocalBoundsExtent, LocalBoundsCenter + LocalBoundsExtent, ClusterColor, RenderCurveInstanceData.LocalToTranslatedWorld);
	}
	#endif

	// TODO: change the traversal to be hierarchical and maybe using persistent thread
	// Each thread processes clusters LinearThreadIndex, LinearThreadIndex + THREADGROUP_SIZE, ...
	for (uint ClusterIt = LinearThreadIndex; ClusterIt < RenderCurveInstanceData.ClusterCount; ClusterIt += THREADGROUP_SIZE)
	{
		const uint ClusterIndex = RenderCurveInstanceData.ClusterOffset + ClusterIt;
		const FClusterHeader ClusterHeader = GetClusterHeader(ClusterIndex);

		if (RenderCurveInstanceData.bIsValid)
		{
			// 1. Distance culling
			// TODO?

			const float3 LocalBoundsCenter = ClusterHeader.LocalBoundCenter;
			const float3 LocalBoundsExtent = ClusterHeader.LocalBoundExtent;

			// 2. Frustum culling
			const FFrustumCullData FrustumCullData = BoxCullFrustum( 
				LocalBoundsCenter, 
				LocalBoundsExtent, 
				RenderCurveInstanceData.LocalToTranslatedWorld, 
				ResolvedView.TranslatedWorldToClip, 
				ResolvedView.ViewToClip, 
				false /*bIsOrtho*/, 
				true  /*bNearClip*/, 
				false /*bSkipCullFrustum*/);
			bool bIsVisible = FrustumCullData.bIsVisible;

			// 3. HZB culling
			if (bIsHZBValid && bIsVisible)
			{
				bIsVisible = bIsVisible && HZB(FrustumCullData);
			}

			// Append the surviving cluster to the visible list
			if (bIsVisible)
			{
				uint WriteOffset = 0;
				WaveInterlockedAddScalarInGroups(RWVisibleClusterArgs[3], RWVisibleClusterArgs[0], THREADGROUP_SIZE, 1, WriteOffset);
				RWVisibleClusters[WriteOffset] = uint2(PrimitiveIndex, ClusterIndex);
			}

			#if PERMUTATION_DEBUG
			if (0)
			{
				FShaderPrintContext Ctx = InitShaderPrintContext(true, uint2(500,500));
				const float4 ClusterColor = float4(ColorMapMagma(float(ClusterIt) / RenderCurveInstanceData.ClusterCount), 1);
				AddOBBTWS(Ctx, LocalBoundsCenter - LocalBoundsExtent, LocalBoundsCenter + LocalBoundsExtent, ClusterColor, RenderCurveInstanceData.LocalToTranslatedWorld);
			}
			#endif

			// Accumulate the depth bounds, clamped to [0,1] so that the float bit patterns order like unsigned integers.
			// NOTE(review): this runs for culled clusters as well (it is outside of the bIsVisible branch) - confirm this is intended.
			WaveInterlockedMin(RWMinMaxZ[0], asuint(max(FrustumCullData.RectMin.z, 0.0f)));
			WaveInterlockedMax(RWMinMaxZ[1], asuint(min(FrustumCullData.RectMax.z, 1.0f)));
		}
	}
}

#endif // ClusterCullingCS

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

#ifdef SceneTileDepthCS

Texture2D<float> SceneDepthTexture;
RWTexture2D<uint2> OutSceneTileDepthTexture;
groupshared uint group_MinDepth; // (4 bytes)
groupshared uint group_MaxDepth; // (4 bytes)

// 32x32 tile
#define BIN_THREAD_COUNT THREADGROUP_SIZE

#if THREADGROUP_SIZE != (BIN_TILE_SIZE * BIN_TILE_SIZE)
#error Assumes a workgroup size of 1024 threads to cover a maximum tile size of 32x32.
#endif

// Converts a linear index into a (x,y) coordinate within a row-major BIN_TILE_SIZE-wide tile.
uint2 LinearTo2D_Bin(uint In)
{
	const uint Row = In >> BIN_TILE_SIZE_DIV_AS_SHIFT;
	const uint Column = In - Row * BIN_TILE_SIZE;
	return uint2(Column, Row);
}

// One thread group per bin tile, one thread per pixel: reduces the scene depth of the tile to its
// min/max (compared as raw float bits) and writes the pair to OutSceneTileDepthTexture[GroupID].
[numthreads(THREADGROUP_SIZE, 1, 1)]
void SceneTileDepthCS(uint DispatchThreadID : SV_DispatchThreadID, uint GroupThreadID : SV_GroupThreadID, uint2 GroupID : SV_GroupID) 
{
	const bool bIsFirstThread = (GroupThreadID == 0);

	// Reset the group accumulators
	if (bIsFirstThread)
	{
		group_MinDepth = 0xFFFFFFFF; // Inverse-Z
		group_MaxDepth = 0;
	}

	GroupMemoryBarrierWithGroupSync();

	if (GroupThreadID < THREADGROUP_SIZE)
	{
		const uint2 TileOrigin = GroupID * BIN_TILE_SIZE;
		const uint2 PixelCoord = LinearTo2D_Bin(GroupThreadID) + TileOrigin;

		// Pixels outside of the render resolution do not contribute
		if (all(PixelCoord < (uint2)Resolution))
		{
			const float PixelDepth = SceneDepthTexture.Load(uint3(PixelCoord, 0));

			// Accumulate the depth bounds of the tile
			WaveInterlockedMin(group_MinDepth, asuint(PixelDepth)); // Inverse-Z
			WaveInterlockedMax(group_MaxDepth, asuint(PixelDepth)); // Inverse-Z
		}
	}

	GroupMemoryBarrierWithGroupSync();

	// Publish the result for the tile
	if (bIsFirstThread)
	{
		OutSceneTileDepthTexture[GroupID] = uint2(group_MinDepth, group_MaxDepth);
	}
}

#endif // SceneTileDepthCS

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Tile data

// Visibility tile data are stored as:
//   ______________________________________________________________________________________________________________________________________________________________________
// ||                      Tile 0                          ||                      Tile 1                          ||                      Tile 2                          ||
// ||______________________________________________________||______________________________________________________||______________________________________________________||
// ||            |           |             |               ||            |           |             |               ||            |           |             |               ||
// || PrimOffset | PrimCount | (Tile)Coord | MinWriteIndex || PrimOffset | PrimCount | (Tile)Coord | MinWriteIndex || PrimOffset | PrimCount | (Tile)Coord | MinWriteIndex ||
#define VT_PrimOffset 0
#define VT_PrimCount 1
#define VT_Coord 2
#define VT_MinWriteIndex 3
#define VT_SIZE 4

// Writes field VTEntry (one of the VT_* indices) of tile record Index.
void StoreTileData(RWStructuredBuffer<uint> OutBuffer, uint Index, uint VTEntry, uint Value)
{
	OutBuffer[Index * VT_SIZE + VTEntry] = Value;
}
// Reads field VTEntry (one of the VT_* indices) of tile record Index from a read/write buffer.
uint LoadTileData(RWStructuredBuffer<uint> OutBuffer, uint Index, uint VTEntry)
{
	return OutBuffer[Index * VT_SIZE + VTEntry];
}
// Reads field VTEntry (one of the VT_* indices) of tile record Index from a read-only buffer.
uint LoadTileData(StructuredBuffer<uint> OutBuffer, uint Index, uint VTEntry)
{
	return OutBuffer[Index * VT_SIZE + VTEntry];
}

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

#define SEGMENT_COUNT_PER_ALLOC 1024
#if SEGMENT_COUNT_PER_ALLOC != 1024
#error Update binning and compaction code
#endif

#ifdef BinningCS

#define MAX_TILES_TO_ALLOCATE 1024
#define MAX_THREAD_ITERATION_COUNT 4096

StructuredBuffer<uint>    ViewMinMaxZ;
Texture2D<uint2>          SceneTileDepthTexture;
StructuredBuffer<uint2>   VisibleClusters;
StructuredBuffer<uint>    VisibleClustersCount;
RWStructuredBuffer<uint>  VisibleClustersQueue;

RWTexture2DArray<uint>    RWTileSegmentCount;
RWStructuredBuffer<uint>  RWTileData;
RWStructuredBuffer<FPackedSegmentType> RWSegmentData;
RWStructuredBuffer<uint>  RWTileDataAllocatedCount;

groupshared uint group_TilesToAllocate[MAX_TILES_TO_ALLOCATE];
groupshared uint group_TilesToAllocateCount;
groupshared uint group_ClusterIndex;
groupshared uint group_ClusterFetchIndex;
groupshared float4x4 group_LocalToClip;
groupshared FClusterHeader group_ClusterHeader;

// TODO: must add a permutation for this
#define PERMUTATION_NUM_POINT_PER_CURVE 16

// Debug state threaded through BinCluster(). The print context only exists in debug permutations.
struct FDebug
{
	#if PERMUTATION_DEBUG
	FShaderPrintContext Ctx; // ShaderPrint context used for debug output
	#endif
	uint GroupID;            // Group index of the binner (only assigned in debug permutations by BinningCS)
	uint Dummy;              // Not referenced by the code in this file
};

// * Each binner fetches work from the visible cluster queue. 
// * Each binner (= a workgroup) loops through all segments of a cluster
// NB there is still potential to use LDS to prevent/reduce contention when adding segments to the binning grid and this may improve perf

// Bins the segments of one cluster into the screen-space bin tiles. Must be called by all threads of the
// group (it contains group barriers); each thread handles the segment [PointIt0, PointIt1] of curve CurveIt.
// * ClusterHeader : decoded header of the cluster
// * BinnerIndex   : index of the calling binner; selects this binner's layers in RWTileSegmentCount
// * ClusterIndex  : index of the cluster in the cluster data
// * CurveIt       : curve handled by this thread
// * PointIt0/1    : indices of the two end points of the segment handled by this thread
// * LocalToClip   : cluster local space to clip space transform
// * GroupThreadID : linear index of the thread within the group
// * Debug         : debug state (only used in debug permutations)
// NOTE(review): tile coordinates are packed with PackTileCoord8bits and tile indices are stored on 16 bits
// in the alloc-info layer - presumably this bounds the supported tile grid / tile count; confirm against the host code.
void BinCluster(FClusterHeader ClusterHeader, uint BinnerIndex, uint ClusterIndex, uint CurveIt, uint PointIt0, uint PointIt1, float4x4 LocalToClip, uint GroupThreadID, inout FDebug Debug)
{
	// 1. Project segment and clip to screen
	// Each thread of the group is processing a segment of the cluster
	// * GroupThread.x : Curve index
	// * GroupThread.y : Point index
	//
	//     C0 C1 C2 C3 ... C63
	// P0  x  x  x  x      x 
	//     |  |  |  |      |
	// P1  x  x  x  x      x 
	//     |  |  |  |      |
	// P2  x  x  x  x      x 
	//     |  |  |  |      |
	// P3  x  x  x  x      x 
	// ... 	
	#if PERMUTATION_DEBUG
	FShaderPrintContext CtxU = InitShaderPrintContext(true, uint2(500 + CurveIt * 10, 200 + PointIt0 * 10));
	#endif

	// bValid stays false for threads without a segment (last point of a curve, invalid points, fully clipped segment)
	bool bValid = false;
	float3 SP0 = 0;
	float3 SP1 = 0;
	if (PointIt1 < ClusterHeader.PointPerCurve)
	{
		const FCurvePoint Point0 = GetClusterPoint(ClusterHeader, ClusterIndex, CurveIt, PointIt0);
		const FCurvePoint Point1 = GetClusterPoint(ClusterHeader, ClusterIndex, CurveIt, PointIt1);
		if (Point0.bValid && Point1.bValid)
		{
			float4 ClipPosition0 = mul(float4(Point0.Position, 1), LocalToClip);
			float4 ClipPosition1 = mul(float4(Point1.Position, 1), LocalToClip);

			// Do clipping in homogenous coordinates
			bValid = true;
			#if 1
			bValid = BlinnLineClipping(ClipPosition0, ClipPosition1); // TODO Is this expensive? Could it be made faster?
			#endif
			// Screen-space end points: xy in pixels, z is the NDC depth
			SP0 = NDCToPixelCoord(ClipPosition0, Resolution);
			SP1 = NDCToPixelCoord(ClipPosition1, Resolution);

			#if PERMUTATION_DEBUG
			if (0)
			{
				FShaderPrintContext Ctx = InitShaderPrintContext(Debug.GroupID == 0, uint2(0, 0));
				AddLineSS(Ctx, SP0.xy, SP1.xy, ColorGreen, ColorBlue);
			}
			#endif
		}
	}
	#if PERMUTATION_DEBUG
	//PlotCondition(CtxU, bValid);
	#endif
	
	// 2. Reset allocation counter
	if (GroupThreadID == 0)
	{
		group_TilesToAllocateCount = 0;
	}
	GroupMemoryBarrierWithGroupSync();

	// 3. Increment per workgroup per tile counters and add tiles to be allocated
	const float MinZ = min(SP0.z, SP1.z);
	// Segment end points expressed in bin tile units
	const float2 TileCoord0 = SP0.xy / BIN_TILE_SIZE;
	const float2 TileCoord1 = SP1.xy / BIN_TILE_SIZE;
	if (bValid)
	{
		// Walk all the bin tiles crossed by the segment
		FDDAContext DDAContext = DDACreateContext(TileCoord0.xy, normalize(TileCoord1.xy - TileCoord0.xy));
		const int2 EndCoord = (int2)floor(TileCoord1.xy);

		for (int DDAIt = 0; DDAIt < DDA_MAX_ITERATIONS; ++DDAIt)
		{
			uint DebugInsertMode = 0;
			const int2 TileCoord = (int2)floor(DDAContext.Coord);
			// Skip the tile when the segment's smallest depth value is below the tile's min depth
			BRANCH
			if (MinZ >= UnpackDepth(SceneTileDepthTexture[TileCoord]).x)  // Inverse-Z
			{
				// Add segment to global counter
				uint OldTileSegmentCount;
				InterlockedAdd(RWTileSegmentCount[uint3(TileCoord, BinnerIndex)], 1, OldTileSegmentCount);
				DebugInsertMode = 1;

				// If global counter reach current span limit (1k segment), queue a span allocation
				BRANCH
				if ((OldTileSegmentCount % 1024) == 0)
				{
					uint WritePos;
					InterlockedAdd(group_TilesToAllocateCount, 1, WritePos);
					// Requests beyond the capacity of the group queue are dropped
					if (WritePos < MAX_TILES_TO_ALLOCATE)
					{
						group_TilesToAllocate[WritePos] = PackTileCoord8bits(TileCoord);
						DebugInsertMode = 2;
					}
				}
			}

			#if PERMUTATION_DEBUG
			if (0)
			{
				float4 DebugColor;
				FShaderPrintContext Ctx = InitShaderPrintContext(Debug.GroupID == 0/* && GroupThreadID == 0*/, uint2(0, 0));
				if (DebugInsertMode == 0) DebugColor = ColorRed;
				if (DebugInsertMode == 1) DebugColor = ColorGreen;
				if (DebugInsertMode == 2) DebugColor = ColorYellow;
				AddFilledQuadSS(Ctx, TileCoord * BIN_TILE_SIZE, (TileCoord + 1) * BIN_TILE_SIZE, float4(DebugColor.xyz, 0.01f));
				AddQuadSS(Ctx, TileCoord * BIN_TILE_SIZE, (TileCoord + 1) * BIN_TILE_SIZE, DebugColor);
			}
			#endif

			// Stop once the tile containing the end point has been processed
			if (all(TileCoord == EndCoord))
			{
				break;
			}

			DDAAdvance(DDAContext);
		}
	}

	// Make the queued allocation requests visible to the whole group
	GroupMemoryBarrierWithGroupSync();

	// 4. Allocate new span tiles
	// Segment count has 3 layers:
	// * Tile segment count
	// * Temp segment count
	// * Tile info
	const uint SegmentCountLayerIdx    = BinnerIndex;
	const uint TmpSegmentCountLayerIdx = BinnerIndex + NumBinners * 1;
	const uint TileAllocInfoLayerIdx   = BinnerIndex + NumBinners * 2;

	const uint TilesToAllocateCount = min(MAX_TILES_TO_ALLOCATE, group_TilesToAllocateCount);
	//#if PERMUTATION_DEBUG
	//PrintLineN(Debug.Ctx, TilesToAllocateCount);
	//#endif

	// DEBUG
	// The threads of the group stride over the queued requests
	for (uint TileIdx = GroupThreadID; TileIdx < TilesToAllocateCount; TileIdx += THREADGROUP_SIZE)
	//for (uint TileIdx = 0; TileIdx < TilesToAllocateCount; TileIdx++)
	{
		//if (GroupThreadID < 1)
		{
			const uint PackedTileCoord = group_TilesToAllocate[TileIdx];
			const uint2 TileCoord = UnpackTileCoord8bits(PackedTileCoord);

			// New count: incremented during step 3. Old count: incremented during step 5 (segments actually written so far).
			const uint TotalNewWriteCount = RWTileSegmentCount[uint3(TileCoord,    SegmentCountLayerIdx)];
			const uint TotalOldWriteCount = RWTileSegmentCount[uint3(TileCoord, TmpSegmentCountLayerIdx)];

			// Reserve a new tile record; the allocation is dropped when the tile data buffer is full
			uint NewTileIndex;
			WaveInterlockedAddScalar_(RWTileDataAllocatedCount[0], 1, NewTileIndex);
			if (NewTileIndex < MaxTileDataCount)
			{
				StoreTileData(RWTileData, NewTileIndex, VT_Coord, PackedTileCoord);
				// Round down the count to the start of the tile and later compare against this to decide which tile to write to.
				StoreTileData(RWTileData, NewTileIndex, VT_MinWriteIndex, TotalNewWriteCount & ~1023u);

				// The previously allocated tile record (if it received any segment) is marked as full
				const uint PrevTileIndex = (RWTileSegmentCount[uint3(TileCoord, TileAllocInfoLayerIdx)] & 0xffff);
				if (TotalOldWriteCount > 0)
				{
					StoreTileData(RWTileData, PrevTileIndex, VT_PrimCount, 1024);
				}

				// Alloc info layout: previous tile index in the upper 16 bits, current tile index in the lower 16 bits
				RWTileSegmentCount[uint3(TileCoord, TileAllocInfoLayerIdx)] = (PrevTileIndex << 16) | (NewTileIndex & 0xffff);
			}
		}
	}

	// Visualize allocated tile
	#if PERMUTATION_DEBUG
	if (0)
	//if (Debug.Ctx.bIsActive)
	{
		PrintLineN(Debug.Ctx, Debug.GroupID);
		PrintLineN(Debug.Ctx, TilesToAllocateCount);
		for (uint TileIdx = 0; TileIdx < TilesToAllocateCount; TileIdx++)
		{
			const uint PackedTileCoord = group_TilesToAllocate[TileIdx];
			const uint2 TileCoord = UnpackTileCoord8bits(PackedTileCoord);
			PrintLineN(Debug.Ctx, TileCoord);
			AddQuadSS(Debug.Ctx, TileCoord * BIN_TILE_SIZE, (TileCoord + 1) * BIN_TILE_SIZE, ColorGreen);
		}
	}
	#endif

	GroupMemoryBarrierWithGroupSync();

	// 5. Write segment to tiles
	const FMinMaxZ MinMaxZ = UnpackMinMaxZ(ViewMinMaxZ[0], ViewMinMaxZ[1]);
	if (bValid)
	{
		// Same tile walk as in step 3
		FDDAContext DDAContext = DDACreateContext(TileCoord0, normalize(TileCoord1 - TileCoord0));
		const int2 EndCoord = (int2)floor(TileCoord1);

		for (int DDAIt = 0; DDAIt < DDA_MAX_ITERATIONS; ++DDAIt)
		{
			const int2 TileCoord = (int2)floor(DDAContext.Coord);

			BRANCH
			if (MinZ >= UnpackDepth(SceneTileDepthTexture[TileCoord]).x)  // Inverse-Z
			{
				const uint PackedTiles = RWTileSegmentCount[uint3(TileCoord, TileAllocInfoLayerIdx)];
				const uint CurTile     = BitFieldExtractU32(PackedTiles, 16, 0);
				const uint PrevTile    = BitFieldExtractU32(PackedTiles, 16, 16);

				// Currently we need this to get our write position, but maybe there is a cheaper way to keep track of that?
				uint OldTileSegmentCount;
				InterlockedAdd(RWTileSegmentCount[uint3(TileCoord, TmpSegmentCountLayerIdx)], 1, OldTileSegmentCount);

				// Segments whose index is below the current record's first index belong to the previous record
				const bool bWriteToCurTile = OldTileSegmentCount >= LoadTileData(RWTileData, CurTile, VT_MinWriteIndex);
				const uint LocalWritePos = OldTileSegmentCount % 1024;
				const uint WritePos = (bWriteToCurTile ? CurTile : PrevTile) * 1024 + LocalWritePos;

				// Segments which do not fit in the segment buffer are dropped
				if (WritePos < MaxSegmentDataCount)
				{
					FSegment Segment;
					Segment.P0 = SP0;
					Segment.P1 = SP1;
					Segment.Color = GetCurveColor(ClusterIndex, CurveIt);
					RWSegmentData[WritePos] = PackSegment(TileCoord, MinMaxZ, Segment);
				}
				BRANCH
				if (bWriteToCurTile)
				{
					// The writer of the last counted segment stores the record's segment count
					if ((OldTileSegmentCount + 1) == RWTileSegmentCount[uint3(TileCoord, SegmentCountLayerIdx)])
					{
						StoreTileData(RWTileData, CurTile, VT_PrimCount, (OldTileSegmentCount == 1023) ? 1024 : ((OldTileSegmentCount + 1) % 1024));
					}
				}
			}

			if (all(TileCoord == EndCoord))
			{
				break;
			}

			DDAAdvance(DDAContext);
		}
	}
}

// Persistent binner: each thread group repeatedly fetches a visible cluster from the shared queue
// (VisibleClustersQueue[0]) and bins its segments with BinCluster(), until the queue is exhausted
// or MAX_THREAD_ITERATION_COUNT clusters have been processed.
// * GroupThreadID2D.x : curve index within the cluster
// * GroupThreadID2D.y : index of the first point of the segment handled by the thread
// * GroupID           : binner index
[numthreads(THREADGROUP_SIZE_X, THREADGROUP_SIZE_Y, 1)]
void BinningCS(uint2 GroupThreadID2D : SV_GroupThreadID, uint GroupThread1D : SV_GroupIndex, uint GroupID : SV_GroupID)
{
	ResolvedView = ResolveView();

	const uint BinnerIndex = GroupID;
	const uint VisibleClusterCount = VisibleClustersCount[0];

	// Debug is passed as inout to BinCluster(), so all its plain members are initialized in every permutation.
	FDebug Debug;
	Debug.GroupID = GroupID;
	Debug.Dummy = 0;
	#if PERMUTATION_DEBUG
	const bool bDebugEnabled = GroupID <= uint(View.GeneralPurposeTweak) && GroupThread1D == 0;
	//Debug.Ctx = InitShaderPrintContext(bDebugEnabled, uint2(50 + GroupID * 250, 250));
	Debug.Ctx = InitShaderPrintContext(bDebugEnabled, uint2(350 + GroupID * 250, 50));
	#endif

	// Persistent thread loop for binning the clusters queue
	// Note: the group-shared fetch state is intentionally NOT reset here by every thread. Thread 0 always
	// publishes it before the barrier below, and an unsynchronized reset issued by a slower wave could
	// otherwise overwrite the values thread 0 has just published.
	uint IterationIt = 0;
	while (IterationIt < MAX_THREAD_ITERATION_COUNT)
	{
		// Thread 0 fetches the next cluster and publishes it to the group
		if (GroupThread1D == 0)
		{
			uint ClusterFetchIndex = 0;
			InterlockedAdd(VisibleClustersQueue[0], 1, ClusterFetchIndex);

			// The fetch index is always published since it drives the (group-uniform) loop termination
			group_ClusterFetchIndex = ClusterFetchIndex;

			// Only read the cluster when the fetch is within the visible range
			if (ClusterFetchIndex < VisibleClusterCount)
			{
				const uint2 VisibleData = VisibleClusters[ClusterFetchIndex];
				const uint  PrimitiveIndex = VisibleData.x;
				const uint  ClusterIndex   = VisibleData.y;
				const FRenderCurveInstanceData RenderCurveInstanceData = GetRenderCurveInstanceData(PrimitiveIndex);
				const FClusterHeader ClusterHeader = GetClusterHeader(ClusterIndex);

				group_ClusterIndex  = ClusterIndex;
				group_ClusterHeader = ClusterHeader;
				group_LocalToClip   = mul(RenderCurveInstanceData.LocalToTranslatedWorld, ResolvedView.TranslatedWorldToClip);
			}
		}
		GroupMemoryBarrierWithGroupSync();

		// The condition is uniform across the group, so the barriers within BinCluster() are reached by all threads
		if (group_ClusterFetchIndex < VisibleClusterCount)
		{
			const uint CurveIt = GroupThreadID2D.x;
			const uint PointIt0 = GroupThreadID2D.y;
			const uint PointIt1 = PointIt0+1;
			BinCluster(group_ClusterHeader, BinnerIndex, group_ClusterIndex, CurveIt, PointIt0, PointIt1, group_LocalToClip, GroupThread1D, Debug);
		}
		else
		{
			break;
		}

		++IterationIt;
	}
}
#endif // BinningCS

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

// Work item produced by the compaction pass and consumed by the rasterizer:
// one bin tile plus the range of ZBin entries to rasterize for that tile.
// Stored packed as a uint2 (see PackRasterWork/UnpackRasterWork).
struct FRasterWork
{
	// Coordinate of the bin tile (packed with PackTileCoord8bits when stored)
	uint2 TileCoord;
	// Index of the first ZBin entry of this tile (stored on a full 32 bits)
	uint ZBinOffset;
	// Number of consecutive ZBin entries for this tile (only the low 16 bits survive packing)
	uint ZBinCount;
};

// Encode a raster work item on 2 x 32 bits:
//  .x = ZBin offset
//  .y = packed tile coord in the upper 16 bits | ZBin count in the lower 16 bits
uint2 PackRasterWork(FRasterWork In)
{
	const uint CoordBits = PackTileCoord8bits(In.TileCoord) << 16u;
	const uint CountBits = In.ZBinCount & 0xFFFF;

	uint2 Out;
	Out.x = In.ZBinOffset;
	Out.y = CoordBits | CountBits;
	return Out;
}

// Decode a raster work item encoded by PackRasterWork:
//  .x = ZBin offset
//  .y = packed tile coord in the upper 16 bits | ZBin count in the lower 16 bits
FRasterWork UnpackRasterWork(uint2 In)
{
	const uint CoordBits = In.y >> 16u;
	const uint CountBits = In.y & 0xFFFF;

	FRasterWork Out;
	Out.TileCoord  = UnpackTileCoord8bits(CoordBits);
	Out.ZBinOffset = In.x;
	Out.ZBinCount  = CountBits;
	return Out;
}

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

// Description of one depth bin: which depth slice it is and which range of primitives it references.
// Stored packed as a uint2 (see PackZBin/UnpackZBin).
struct FZBin
{
	// Depth bin index (only the low 10 bits survive packing)
	uint BinZIndex;
	// Offset of the first primitive of the bin (stored on a full 32 bits)
	uint PrimOffset;
	// Number of primitives in the bin (stored in the upper 22 bits of the second word)
	uint PrimCount;
};

// Encode a ZBin on 2 x 32 bits:
//  .x = primitive offset
//  .y = bin Z index in the lower 10 bits | primitive count shifted above them
uint2 PackZBin(FZBin In)
{
	const uint IndexBits = In.BinZIndex & 0x3FF;
	const uint CountBits = In.PrimCount << 10;

	uint2 Out;
	Out.x = In.PrimOffset;
	Out.y = IndexBits | CountBits;
	return Out;
}

// Decode a ZBin encoded by PackZBin:
//  .x = primitive offset
//  .y = bin Z index in the lower 10 bits | primitive count shifted above them
FZBin UnpackZBin(uint2 In)
{
	const uint PackedIndexAndCount = In.y;

	FZBin Out;
	Out.BinZIndex  = PackedIndexAndCount & 0x3FF;
	Out.PrimCount  = PackedIndexAndCount >> 10;
	Out.PrimOffset = In.x;
	return Out;
}

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

// DEBUG
// Filled ZBin count
// Filled Tile data
// Primitive count in tile
// Occupancy of primitive within tile 8x8/16x16/32x32

#define MAX_SEGMENT_COUNT_PER_ZBIN 1024
#define MAX_ALLOCATED_ZBIN_COUNT 16
#define MAX_TILE_TO_COMPACT 1024

#ifdef CompactionCS

// TODO reduce this?
#define COMPACTION_DEPTH_BUCKET 256

#if COMPACTION_DEPTH_BUCKET > THREADGROUP_SIZE
#error THREADGROUP_SIZE needs to be larger or equal to COMPACTION_DEPTH_BUCKET in order to reset correctly depth bucket values
#endif

Texture2D<uint2> 			SceneTileDepthTexture;
StructuredBuffer<uint> 		ViewMinMaxZ;
Texture2DArray<uint>		TileSegmentCount;
StructuredBuffer<uint>		TileData;
StructuredBuffer<FPackedSegmentType> SegmentData;
StructuredBuffer<uint> 		TileDataAllocatedCount;
							
RWStructuredBuffer<uint> 	RWZBinDataAllocatedCount;
RWStructuredBuffer<uint2>	RWZBinData;
RWStructuredBuffer<uint> 	RWZBinSegmentAllocatedCount;
RWStructuredBuffer<FPackedSegmentType> RWZBinSegmentData;
							
RWStructuredBuffer<uint> 	RWRasterWorkAllocatedCount;
RWStructuredBuffer<uint2> 	RWRasterWork; // Offset & Count + tile coord

groupshared uint group_TilePrimCount;
groupshared uint group_TilePrimOffset;
groupshared uint group_TileToCompactCount;
groupshared uint group_TileToCompact[MAX_TILE_TO_COMPACT];

groupshared uint group_MaxZBinIndex;
groupshared uint group_ZBinOffset[COMPACTION_DEPTH_BUCKET];
groupshared uint group_ZBinCount[COMPACTION_DEPTH_BUCKET];

groupshared uint group_ZBinAllocatedOffset[MAX_ALLOCATED_ZBIN_COUNT];
groupshared uint group_ZBinAllocatedCount[MAX_ALLOCATED_ZBIN_COUNT];

// Map a depth value to one of the COMPACTION_DEPTH_BUCKET depth buckets.
// The depth is remapped with InMinMaxZ.Scale/Offset, saturated to [0..1], quantized, and then the
// bucket order is flipped (Inverse-Z) so that the largest remapped depth lands in bucket 0.
uint GetZBinIndex(float InDepth, FMinMaxZ InMinMaxZ)
{
	const uint LastBucket = COMPACTION_DEPTH_BUCKET - 1;

	const float NormalizedDepth = saturate(InDepth * InMinMaxZ.Scale + InMinMaxZ.Offset);
	const float QuantizedDepth  = NormalizedDepth * COMPACTION_DEPTH_BUCKET;

	// NormalizedDepth == 1 would quantize to COMPACTION_DEPTH_BUCKET, hence the clamp to the last bucket
	const uint DepthIt = clamp(QuantizedDepth, 0, COMPACTION_DEPTH_BUCKET - 1);

	// Inverse-Z
	return LastBucket - DepthIt;
}

// Launched based on the CPU-side BinTileResX x BinTileResY: one group per screen tile.
// All the threads of a group process the same screen tile (GroupID is the tile coordinate).
// Several allocated tiles can cover the same screen area: the group gathers all of them, buckets
// their segments by depth, writes the segments grouped per depth bucket into RWZBinSegmentData,
// and emits the ZBin entries plus one raster work item for the tile.
[numthreads(THREADGROUP_SIZE, 1, 1)]
void CompactionCS(uint DispatchThreadID : SV_DispatchThreadID, uint GroupThreadID : SV_GroupThreadID, uint2 GroupID : SV_GroupID)
{
	// 0. Reset the group shared state
	if (GroupThreadID == 0)
	{
		group_TilePrimCount = 0;
		group_TilePrimOffset = 0;
		group_TileToCompactCount = 0;
	}

	if (GroupThreadID < COMPACTION_DEPTH_BUCKET)
	{
		group_ZBinOffset[GroupThreadID] = 0;
		group_ZBinCount[GroupThreadID] = 0;
	}

	if (GroupThreadID < MAX_ALLOCATED_ZBIN_COUNT)
	{
		group_ZBinAllocatedOffset[GroupThreadID] = 0;
		group_ZBinAllocatedCount[GroupThreadID] = 0;
	}

	GroupMemoryBarrierWithGroupSync();

	const uint  TileCount = TileDataAllocatedCount[0];
	const uint2 TileCoord = GroupID;
	const uint  TilePackedCoord = PackTileCoord8bits(GroupID); // All thread will process the same tile
	const float SceneMinZ = UnpackDepth(SceneTileDepthTexture.Load(uint3(TileCoord, 0))).x;
	const FMinMaxZ MinMaxZ = UnpackMinMaxZ(ViewMinMaxZ[0], ViewMinMaxZ[1], SceneMinZ);

	// 1. Compute total number of primitives at this tile coordinate, and gather the matching tiles into LDS
	uint LocalPrimCount = 0;
	for (uint TileIdx = GroupThreadID; TileIdx < TileCount; TileIdx += THREADGROUP_SIZE)
	{
		const uint CurrentTilePackedCoord = LoadTileData(TileData, TileIdx, VT_Coord);

		if (TilePackedCoord == CurrentTilePackedCoord)
		{
			LocalPrimCount += LoadTileData(TileData, TileIdx, VT_PrimCount);
				
			// Tiles beyond MAX_TILE_TO_COMPACT are counted but not recorded (see 3.5)
			uint WritePos;
			WaveInterlockedAddScalar_(group_TileToCompactCount, 1, WritePos);
			if (WritePos < MAX_TILE_TO_COMPACT)
			{
				group_TileToCompact[WritePos] = TileIdx;
			}
		}
	}

	if (LocalPrimCount > 0)
	{
		WaveInterlockedAdd(group_TilePrimCount, LocalPrimCount);
	}

	GroupMemoryBarrierWithGroupSync();

	// Nothing to compact for this tile. The value comes from LDS after a barrier, so the exit is group-uniform
	const uint TotalPrimCount = group_TilePrimCount;
	if (TotalPrimCount == 0)
	{
		return;
	}

	// 2. Allocate space for all the segments of the tile in the compacted segment buffer
	if (GroupThreadID == 0)
	{
		InterlockedAdd(RWZBinSegmentAllocatedCount[0], group_TilePrimCount, group_TilePrimOffset);
	}
	GroupMemoryBarrierWithGroupSync();

	#if PERMUTATION_DEBUG
	FShaderPrintContext Ctx = InitShaderPrintContext(all(GetCursorPos()/BIN_TILE_SIZE == TileCoord) && GroupThreadID == 0, uint2(1500, 200));
	Print(Ctx, TEXT("Compaction"), FontRed); Newline(Ctx);
	PrintLineN(Ctx, TotalPrimCount);
	PrintLineN(Ctx, SceneMinZ);
	PrintLineN(Ctx, MinMaxZ.MinZ);
	PrintLineN(Ctx, MinMaxZ.MaxZ);
	Newline(Ctx);
	#endif

	// 3. Copy PrimIDs to compacted memory
	{
		const uint NumInputTiles = min(group_TileToCompactCount, MAX_TILE_TO_COMPACT);

		// 3.1 First process the LDS list of tiles: count the segments falling into each depth bucket
		for (uint LDSIdx = 0; LDSIdx < NumInputTiles; ++LDSIdx)
		{
			// Per-tile segment storage stride. Passes 3.1 and 3.4 must use the same value, and it must
			// match the stride used when the segments were written (assumed to be 1024 - TODO confirm).
			const uint TileIdx = group_TileToCompact[LDSIdx];
			const uint TilePrimOffset = TileIdx * SEGMENT_COUNT_PER_ALLOC;
			// Clamp so that a tile never reads into the storage of the next tile
			const uint TilePrimCount = min(LoadTileData(TileData, TileIdx, VT_PrimCount), SEGMENT_COUNT_PER_ALLOC);

			// Strided loop: all the segments are visited even if the group has fewer threads than a tile has segments
			for (uint SegIt = GroupThreadID; SegIt < TilePrimCount; SegIt += THREADGROUP_SIZE)
			{
				const FSegment Segment = UnpackSegment(TileCoord, MinMaxZ, SegmentData[TilePrimOffset + SegIt]);
				const float SegmentNearZ = max(Segment.P0.z, Segment.P1.z);
				const uint ZBinIndex = GetZBinIndex(SegmentNearZ, MinMaxZ);
				InterlockedAdd(group_ZBinCount[ZBinIndex], 1);
			}
		}

		GroupMemoryBarrierWithGroupSync();

		// 3.2 Prefix sum of bin count
		// TODO Change to waveops prefixsum
		if (GroupThreadID == 0)
		{
			// 3.2.1 Compute ZBin offset and count.
			// Consecutive depth buckets are merged into the same ZBin while the accumulated segment
			// count stays below MAX_SEGMENT_COUNT_PER_ZBIN, otherwise a new ZBin is started.
			uint ZBinAllocatedCount = 0;
			{
				uint ZBinAllocatedIndex = 0;
				uint AccSegmentCount = 0;
				uint GlobalOffset = 0;

				group_ZBinAllocatedCount[0]  = 0;
				group_ZBinAllocatedOffset[0] = 0;
				group_MaxZBinIndex = COMPACTION_DEPTH_BUCKET-1;
				for (uint It=0; It < COMPACTION_DEPTH_BUCKET;++It)
				{
					group_ZBinOffset[It] = GlobalOffset;

					const uint CurrentSegmentCount = group_ZBinCount[It];

					if ((AccSegmentCount + CurrentSegmentCount) < MAX_SEGMENT_COUNT_PER_ZBIN)
					{
						// Accumulate segment count
						group_ZBinAllocatedCount[ZBinAllocatedIndex] += CurrentSegmentCount;
					}
					else
					{
						// If we have reached the limit of ZBins we can allocate per tile, mark the max ZBinIndex.
						// Segments in the remaining depth buckets are dropped by the insertion pass (3.4).
						if (ZBinAllocatedIndex+1 >= MAX_ALLOCATED_ZBIN_COUNT)
						{
							group_MaxZBinIndex = It-1u;
							break;
						}

						// New ZBin
						ZBinAllocatedIndex++;

						// Initialize segment offset/count
						group_ZBinAllocatedOffset[ZBinAllocatedIndex] = GlobalOffset;
						group_ZBinAllocatedCount[ZBinAllocatedIndex]  = CurrentSegmentCount;
						AccSegmentCount = 0;

					}
					AccSegmentCount += CurrentSegmentCount;
					GlobalOffset += CurrentSegmentCount;
				}

				ZBinAllocatedCount = ZBinAllocatedIndex + 1;
			}
	
			{
				// 3.2.2 Allocate ZBins
				uint ZBinOffset_Global = 0;
				InterlockedAdd(RWZBinDataAllocatedCount[0], ZBinAllocatedCount, ZBinOffset_Global);

				// The last written entry is (ZBinOffset_Global + ZBinAllocatedCount - 1), so an exact fit is valid
				const bool bZBinsFit = (ZBinOffset_Global + ZBinAllocatedCount) <= MaxZBinDataCount;
				if (bZBinsFit)
				{
					// 3.2.3 Write ZBins
					for (uint It=0; It < ZBinAllocatedCount;++It)
					{
						const uint SegmentOffset = group_TilePrimOffset + group_ZBinAllocatedOffset[It];
						const uint SegmentCount  = group_ZBinAllocatedCount[It];

						#if PERMUTATION_DEBUG
						PrintLineN(Ctx, It);
						PrintLineN(Ctx, SegmentOffset);
						PrintLineN(Ctx, SegmentCount);
						Newline(Ctx);
						#endif

						RWZBinData[ZBinOffset_Global + It] = uint2(SegmentOffset, SegmentCount);
					}

					// 3.2.4 Write raster work.
					// Only emitted when the ZBins above were actually written: otherwise the rasterizer
					// would consume ZBin entries which were never filled for this tile.
					FRasterWork RasterWork;
					RasterWork.TileCoord = TileCoord;
					RasterWork.ZBinOffset= ZBinOffset_Global;
					RasterWork.ZBinCount = ZBinAllocatedCount;
					
					uint WriteOffset = 0;
					InterlockedAdd(RWRasterWorkAllocatedCount[0], 1, WriteOffset);
					if (WriteOffset < MaxRasterWorkCount)
					{
						RWRasterWork[WriteOffset] = PackRasterWork(RasterWork);
					}
				}
			}
		}

		GroupMemoryBarrierWithGroupSync();

		// 3.3 Clear insertion counter (reused below as the per-bucket write cursor)
		if (GroupThreadID < COMPACTION_DEPTH_BUCKET)
		{
			group_ZBinCount[GroupThreadID] = 0;
		}

		GroupMemoryBarrierWithGroupSync();

		// 3.4 Insert primitive into bins
		for (uint LDSIdx = 0; LDSIdx < NumInputTiles; ++LDSIdx)
		{
			// Same stride/clamp as the counting pass (3.1), so both passes visit exactly the same segments
			const uint TileIdx = group_TileToCompact[LDSIdx];
			const uint TilePrimOffset = TileIdx * SEGMENT_COUNT_PER_ALLOC;
			const uint TilePrimCount = min(LoadTileData(TileData, TileIdx, VT_PrimCount), SEGMENT_COUNT_PER_ALLOC);

			for (uint SegIt = GroupThreadID; SegIt < TilePrimCount; SegIt += THREADGROUP_SIZE)
			{
				const FPackedSegmentType PackedSegment = SegmentData[TilePrimOffset + SegIt];
				const FSegment Segment = UnpackSegment(TileCoord, MinMaxZ, PackedSegment);
				const float SegmentNearZ = max(Segment.P0.z, Segment.P1.z); // TODO: always order segment P0 to have nearest Z to avoid loading both points?
				const uint ZBinIndex = GetZBinIndex(SegmentNearZ, MinMaxZ);
				// TODO remap so that we get ZBin filled up to max

				// Depth buckets past group_MaxZBinIndex did not get any ZBin (see 3.2.1)
				if (ZBinIndex <= group_MaxZBinIndex)
				{
					uint LocalOffset = 0;
					InterlockedAdd(group_ZBinCount[ZBinIndex], 1, LocalOffset);
					const uint WriteIndex = group_TilePrimOffset + group_ZBinOffset[ZBinIndex] + LocalOffset;
					RWZBinSegmentData[WriteIndex] = PackedSegment;
				}
			}
		}

		// 3.5 Check any remaining tiles (Unlikely?)
		//if (group_TileToCompactCount > 1024)
		//{
		//	for (uint TileIdx = group_MaxLDSTileIdx + 1; TileIdx < TileCount; ++TileIdx)
		//	{
		//		const uint TilePackedCoord = LoadVisTileData(TileData, TileIdx, VT_Coord);
		//		if (PackedCoord == TilePackedCoord)
		//		{
		//			const uint TilePrimOffset = TileIdx * 1024;
		//			const uint TilePrimCount = LoadVisTileData(TileData, TileIdx, VT_PrimCount);
		//
		//			if (GroupThreadID < TilePrimCount)
		//			{
		//				RWZBinSegmentData[CurrentWriteOffset + GroupThreadID] = SegmentData[TilePrimOffset + GroupThreadID];
		//			}
		//
		//			CurrentWriteOffset += TilePrimCount;
		//		}
		//	}
		//}
	}
}

#endif // CompactionCS

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

#ifdef RasterizerCS

#define MAX_THREAD_ITERATION_COUNT 4096
#define RASTER_TILE_COUNT_2D (BIN_TILE_SIZE / RASTER_TILE_SIZE)           // = 4x4
#define RASTER_TILE_COUNT_1D (RASTER_TILE_COUNT_2D*RASTER_TILE_COUNT_2D)  // = 16

#define MAX_SEGMENT_PER_RASTER_STEP 32

#define WAVE_RASTER 1

// Sanity check
#if RASTER_TILE_COUNT_1D != 16
	#error Update code
#endif
#if RASTER_TILE_COUNT_2D != 4
	#error Update code
#endif
#if MAX_SEGMENT_COUNT_PER_ZBIN > THREADGROUP_SIZE
	#error MAX_SEGMENT_COUNT_PER_ZBIN needs to be smaller than THREADGROUP_SIZE to ensure all segment of a given ZBin could be loaded in one iteration
#endif
#if (MAX_SEGMENT_PER_RASTER_STEP * RASTER_TILE_COUNT_1D) > THREADGROUP_SIZE
	#error MAX_SEGMENT_PER_RASTER_STEP is too large, clearing won't be done in a single step
#endif
#if MAX_SEGMENT_PER_RASTER_STEP != 32
	#error  Update code, as waveops 32 are used for segment rasterization
#endif

StructuredBuffer<uint> 	ViewMinMaxZ;
							
StructuredBuffer<uint> 	ZBinDataAllocatedCount;
StructuredBuffer<uint2>	ZBinData;
StructuredBuffer<uint> 	ZBinSegmentAllocatedCount;
StructuredBuffer<FPackedSegmentType> ZBinSegmentData;
							
StructuredBuffer<uint> 	RasterWorkAllocatedCount;
StructuredBuffer<uint2> RasterWork; // Offset & Count + tile coord
RWStructuredBuffer<uint>RasterWorkQueue;

Texture2D<uint2> 		SceneTileDepthTexture;
Texture2D<float> 		SceneDepthTexture;
RWTexture2D<float4> 	OutputTexture;

groupshared uint group_WorkFetchIndex;
groupshared uint group_Valid;
groupshared FRasterWork group_Work;

groupshared FPackedSegmentType group_PackedSegments[MAX_SEGMENT_COUNT_PER_ZBIN];
#if WAVE_RASTER
#define THREADGROUP_WAVE_COUNT (THREADGROUP_SIZE / 32)
#if THREADGROUP_WAVE_COUNT * MAX_SEGMENT_PER_RASTER_STEP > THREADGROUP_SIZE
#error Update code as we expect a certain number of wave size to rasterize the segment
#endif
groupshared uint2 group_SegmentsBits[THREADGROUP_SIZE];
groupshared float group_SegmentsColor[THREADGROUP_SIZE];
groupshared uint group_CompletedWaves[RASTER_TILE_COUNT_1D * 2]; // Wave
#else
groupshared uint2 group_SegmentsBits[RASTER_TILE_COUNT_1D][MAX_SEGMENT_PER_RASTER_STEP]; 	// 8x8 bit mask per segments. 32 segments
#endif
//groupshared float group_Coverage[RASTER_TILE_COUNT_1D][RASTER_TILE_SIZE][RASTER_TILE_SIZE]; // 16 tiles of 8x8 - Needs to be reduce -> 8bit compaction for coverage?
//groupshared float3 group_Color[RASTER_TILE_COUNT_1D][RASTER_TILE_SIZE][RASTER_TILE_SIZE];

groupshared uint group_SceneMaxZ[RASTER_TILE_COUNT_1D];

#define COVERAGE_CULLING 1

// 8x256 = 2048
// 8x32  = 256  -> x16  =4096
// 8x8   = 64   -> x16  =1024
// ------------
// LDS 2.5k per group
// x16 = 32k 
// Coordinate of a thread expressed relative to a tile grid, in both 2D and linearized (1D) form.
struct FTileThreadCoord
{
	// 2D coordinate of the tile
	uint2 Tile;
	// 2D coordinate of the thread within the tile
	uint2 Thread;

	// Linearized tile index
	uint Tile1d;
	// Linearized thread index within the tile
	uint Thread1d;
};

// Output location of a rasterizer thread.
struct FOutputCoord
{
	// Pixel coordinate used to read the scene depth and to write the output texture
	uint2 PixelCoord;
};

// All the coordinates of a rasterizer thread: within the bin tile, within the raster tile, and in the output.
struct FCoord
{
	// Coordinates relative to the bin tile (BIN_TILE_SIZE x BIN_TILE_SIZE)
	FTileThreadCoord Bin;
	// Coordinates relative to the raster tile (RASTER_TILE_SIZE x RASTER_TILE_SIZE) inside the bin tile
	FTileThreadCoord Raster;
	// Output pixel coordinate
	FOutputCoord Out;
};

// Debug helper: draw a 64-bit mask as an 8x8 block of pixels.
// Bit (x + y * 8) maps to pixel OutBaseCoord + (x, y): bits 0..31 come from In.x, bits 32..63 from In.y.
// Set bits are written in green, pixels outside OutResolution are skipped.
void DrawBitLine(RWTexture2D<float4> Out, uint2 OutResolution, uint2 OutBaseCoord, uint2 In)
{	
	for (uint BitIt = 0; BitIt < 64u; ++BitIt)
	{
		// Select the 32-bit word holding the bit, and the bit position within that word
		const uint Word = BitIt < 32u ? In.x : In.y;
		const uint BitInWord = BitIt & 31u;
		if (((Word >> BitInWord) & 0x1) == 0)
		{
			continue;
		}

		const uint2 OutCoord = OutBaseCoord + uint2(BitIt % 8u, BitIt / 8u);
		if (all(OutCoord < OutResolution))
		{
			Out[OutCoord] = float4(0,1,0,1);
		}
	}
}
#if PERMUTATION_DEBUG
// Debug helper meant to print the 8x8 coverage of a raster tile.
// Currently a no-op: the implementation below is disabled, and the group shared arrays it reads
// (group_Coverage / group_RasterTileMinCoverage) are not declared anymore.
void PlotRasterTileCoverage(inout FShaderPrintContext Ctx, uint RasterTile1d)
{
	//const float RasterTileMinCoverage = asfloat(group_RasterTileMinCoverage[RasterTile1d]);
	//PrintLineN(Ctx, RasterTileMinCoverage);
	//Newline(Ctx);
	//for (uint y = 0; y < 8; ++y)
	//{
	//	for (uint x = 0; x < 8; ++x)
	//	{
	//		const float Cov = group_Coverage[RasterTile1d][x][y];
	//		if (Cov > 0)
	//		Print(Ctx, TEXT("x "), FontGreen);
	//		else
	//		Print(Ctx, TEXT(". "), FontWhite);
	//	}
	//	Newline(Ctx);
	//}
}

// Debug helper: print the work item currently processed by the group (read from group_Work),
// followed by the bin/raster/output coordinates of the calling thread.
void PlotWorkInfo(inout FShaderPrintContext Ctx, FCoord InCoord)
{
	// Current work item
	PrintLineN(Ctx, group_Work.TileCoord);
	PrintLineN(Ctx, group_Work.ZBinOffset);
	//PrintLineN(Ctx, group_Work.ZBinCount);
	// ZBin count is printed manually so that it can be highlighted in red
	Print(Ctx, TEXT("ZBinCount :"), FontRed); Print(Ctx, group_Work.ZBinCount, FontRed); Newline(Ctx);
	Newline(Ctx);

	// Thread coordinates
	PrintLineN(Ctx, InCoord.Bin.Tile);
	PrintLineN(Ctx, InCoord.Bin.Thread);
	PrintLineN(Ctx, InCoord.Raster.Tile);
	PrintLineN(Ctx, InCoord.Raster.Thread);
	PrintLineN(Ctx, InCoord.Out.PixelCoord);
	Newline(Ctx);
}

// Debug helper: draw, in red, the screen-space bounds of the raster tile described by InCoord.
void PlotRasterTileAABB(inout FShaderPrintContext Ctx, FCoord InCoord)
{
	// Origin of the bin tile, in pixels
	const uint2 BinTileOrigin = InCoord.Bin.Tile * BIN_TILE_SIZE;

	// Bounds of the raster tile within the bin tile, in pixels
	const float2 AABBMin = BinTileOrigin +  InCoord.Raster.Tile      * RASTER_TILE_SIZE;
	const float2 AABBMax = BinTileOrigin + (InCoord.Raster.Tile + 1) * RASTER_TILE_SIZE;

	AddQuadSS(Ctx, AABBMin, AABBMax, ColorRed);
}

// Debug helper: draw, in purple, the segment stored at index SegIt of group_PackedSegments,
// unpacked relative to the bin tile of InCoord and without any clipping.
void PlotUnclippedSegment(FCoord InCoord, uint SegIt, FMinMaxZ MinMaxZ)
{
	const FPackedSegmentType PackedSegment = group_PackedSegments[SegIt];
	const FSegment Segment = UnpackSegment(InCoord.Bin.Tile, MinMaxZ, PackedSegment);

	// Always-active context: only used for emitting the line primitive
	FShaderPrintContext CtxD = InitShaderPrintContext(true, 0);
	AddLineSS(CtxD, Segment.P0.xy, Segment.P1.xy, ColorPurple);
}

// Debug helper: build a print context which is active only when the cursor lies within the raster
// tile of Coord. The print position is offset per thread (20 pixels per thread of the raster tile).
FShaderPrintContext GetShaderPrintContextPerRasterThread(FCoord Coord)
{
	const uint2 CursorCoord = GetCursorPos();

	// Bounds of the raster tile, in pixels
	const float2 AABBMin = Coord.Bin.Tile * BIN_TILE_SIZE +  Coord.Raster.Tile    * RASTER_TILE_SIZE;
	const float2 AABBMax = Coord.Bin.Tile * BIN_TILE_SIZE + (Coord.Raster.Tile+1) * RASTER_TILE_SIZE;

	const bool bCursorInTile = all(CursorCoord >= AABBMin) && all(CursorCoord < AABBMax);
	const uint2 PrintCoord = uint2(450, 450) + Coord.Raster.Thread * 20;
	return InitShaderPrintContext(bCursorInTile, PrintCoord);
}
#endif

// Persistent-thread rasterizer entry point.
// Each group repeatedly pops one raster work item (a bin tile + its list of ZBins) from RasterWorkQueue.
// The bin tile is split into 4x4 raster tiles of 8x8 pixels and each thread owns one pixel: it
// accumulates, ZBin after ZBin, the segments covering its pixel and writes the result to OutputTexture.
// The group exits once the fetched work index reaches the number of allocated work items.
[numthreads(THREADGROUP_SIZE, 1, 1)]
void RasterizerCS(uint DispatchThreadID : SV_DispatchThreadID, uint GroupThread1D : SV_GroupIndex, uint GroupID : SV_GroupID)
{
	ResolvedView = ResolveView();

	// Compute all the coordinates (Bin/Raster/Output - Tile/Thread/Thread1d)
	FCoord Coord;
	{
		// Use hardcoded value for clarity. Ensure the code is coherent
		#if RASTER_TILE_COUNT_2D != 4u
		#error Update tile code
		#endif

		const uint2 ThreadBlock8x256       = uint2(GroupThread1D % RASTER_TILE_SIZE, GroupThread1D / RASTER_TILE_SIZE);
		const uint2 ThreadBlock32x32       = uint2(GroupThread1D % BIN_TILE_SIZE, GroupThread1D / BIN_TILE_SIZE);
		const uint  Block8x8               = GroupThread1D / (RASTER_TILE_SIZE*RASTER_TILE_SIZE);

		const uint2 LocalThreadCoord_Bin   = ThreadBlock32x32;
		const uint2 GlobalTileCoord_Bin = uint2(GroupID % 16, GroupID / 16); // For debug
		
		const uint2 LocalTileCoord_Raster   = uint2(Block8x8 % 4u, Block8x8 / 4u); // Each bin tile is divided in to 16 (=4x4) raster tiles
		const uint2 LocalThreadCoord_Raster = uint2(ThreadBlock8x256.x, ThreadBlock8x256.y % RASTER_TILE_SIZE);

		// Bin coord
		Coord.Bin.Tile     = 0;
		Coord.Bin.Thread   = LocalThreadCoord_Bin;
		Coord.Bin.Tile1d   = 0;
		Coord.Bin.Thread1d = Coord.Bin.Thread.x + Coord.Bin.Thread.y * BIN_TILE_SIZE;

		// Raster coord
		Coord.Raster.Tile     = LocalTileCoord_Raster; // Local 4x4 tile coord
		Coord.Raster.Thread   = LocalThreadCoord_Raster;
		Coord.Raster.Tile1d   = Coord.Raster.Tile.x + Coord.Raster.Tile.y * RASTER_TILE_COUNT_2D;
		Coord.Raster.Thread1d = Coord.Raster.Thread.x + Coord.Raster.Thread.y * RASTER_TILE_SIZE;

		// Output coord. Setup later for each work item
		Coord.Out.PixelCoord = 0;
	}

	const uint RasterizerIndex = GroupID;
	const uint WorkCount = min(RasterWorkAllocatedCount[0], MaxRasterWorkCount);
	const FMinMaxZ MinMaxZ = UnpackMinMaxZ(ViewMinMaxZ[0], ViewMinMaxZ[1]);

	group_Valid = true;
	GroupMemoryBarrierWithGroupSync();

	// Persistent thread loop for rasterizing the work queue
	for (uint IterationIt=0; IterationIt<MAX_THREAD_ITERATION_COUNT; ++IterationIt)
	{
		// 1. Fetch work. Thread 0 pops the next work item and publishes it through LDS
		if (GroupThread1D == 0)
		{
			uint WorkFetchIndex = 0;
			InterlockedAdd(RasterWorkQueue[0], 1, WorkFetchIndex);
			group_WorkFetchIndex = WorkFetchIndex;
			if (WorkFetchIndex < WorkCount)
			{
				group_Work = UnpackRasterWork(RasterWork[WorkFetchIndex]);
			}
		}
		GroupMemoryBarrierWithGroupSync();

		// Queue exhausted: the condition is group-uniform, so all threads leave together
		const bool bValidWork = group_WorkFetchIndex < WorkCount;
		if (!bValidWork)
		{
			return;
		}

		// 2. Clear shared data
		{
			// Segment's bitmasks
			#if WAVE_RASTER == 0
			if (GroupThread1D < RASTER_TILE_COUNT_1D * MAX_SEGMENT_PER_RASTER_STEP)
			{
				const uint RasterTile1d = GroupThread1D / MAX_SEGMENT_PER_RASTER_STEP;
				const uint SliceIndex = GroupThread1D % MAX_SEGMENT_PER_RASTER_STEP;
				group_SegmentsBits[RasterTile1d][SliceIndex] = 0;
			}
			#endif

			// Scene max Z: one entry per raster tile. The guard must match the array size
			// (RASTER_TILE_COUNT_1D), otherwise threads would write past the end of the LDS array.
			if (GroupThread1D < RASTER_TILE_COUNT_1D)
			{
				group_SceneMaxZ[GroupThread1D] = 0;
			}

			#if WAVE_RASTER == 1
			if (GroupThread1D < THREADGROUP_SIZE)
			{
				group_SegmentsBits[GroupThread1D] = 0;
				group_SegmentsColor[GroupThread1D] = 0;
			}
			#endif

			if (GroupThread1D < RASTER_TILE_COUNT_1D * 2)
			{
				group_CompletedWaves[GroupThread1D] = 0;
			}
		}

		// The clears above are done by the first threads of the group, while group_SceneMaxZ is
		// accumulated below by every wave: make sure all the clears have landed first.
		GroupMemoryBarrierWithGroupSync();

		float3 OutColor = 0;
		float OutTransmittance = 1.f;

		// 2.1 Update coord with the current work item data
		Coord.Bin.Tile       = group_Work.TileCoord;
		Coord.Bin.Tile1d     = 0; // Not used
		Coord.Out.PixelCoord = group_Work.TileCoord * BIN_TILE_SIZE + Coord.Raster.Tile * RASTER_TILE_SIZE + Coord.Raster.Thread;

		// 2.2 Compute scene max Z for each raster tile
		{
			const float SceneDepth = SceneDepthTexture.Load(uint3(Coord.Out.PixelCoord, 0));
			WaveInterlockedMax(group_SceneMaxZ[Coord.Raster.Tile1d], asuint(SceneDepth));
		}

		GroupMemoryBarrierWithGroupSync();

		// 3. Process work or exit if done
		if (bValidWork)
		{
			#if PERMUTATION_DEBUG
			FShaderPrintContext Ctx = InitShaderPrintContextAtCursorUnique(Coord.Out.PixelCoord, uint2(650, 200), uint2(350, 0));
			PlotWorkInfo(Ctx, Coord);
			PrintLineN(Ctx, group_WorkFetchIndex);
			PrintLineN(Ctx, IterationIt);

			Newline(Ctx);
			PrintLineN(Ctx, DispatchThreadID);
			PrintLineN(Ctx, GroupThread1D);
			PrintLineN(Ctx, GroupID);
			Newline(Ctx);
			#endif

			// 3.1 Load & raster segments in front-to-back order
			for (uint ZBinIt = 0; ZBinIt < group_Work.ZBinCount; ZBinIt++)
			{
				// All rasterizers within the tile are done.
				// Each lane checks one entry of group_CompletedWaves, so that the wave as a whole checks all of them.
				#if COVERAGE_CULLING
				{
					const uint LaneIndex = WaveGetLaneIndex();
					const uint WavePerRasterTile = WaveGetLaneCount() == 32u ? 2u : 1u;
					const uint RasterCoverageIndex = LaneIndex % (RASTER_TILE_COUNT_1D * WavePerRasterTile);
					const bool bTileDone = WaveActiveAllTrue(group_CompletedWaves[RasterCoverageIndex] == 1);
					if (bTileDone)
					{
						break;
					}
				}
				#endif

				#if PERMUTATION_DEBUG
				Print(Ctx, TEXT("ZBinIt : "), FontOrange); Print(Ctx, ZBinIt, FontOrange); Newline(Ctx);
				PrintLineN(Ctx, group_WorkFetchIndex);
				PrintLineN(Ctx, IterationIt);
				Newline(Ctx);
				#endif

				// ZBin entry: .x = offset of the first segment, .y = segment count
				const uint2 Data = ZBinData[group_Work.ZBinOffset + ZBinIt];
				const uint SegmentOffset = Data.x;
				const uint SegmentCount  = min(Data.y, MAX_SEGMENT_COUNT_PER_ZBIN);

				// Load all the segments of the ZBin into LDS, one segment per thread
				if (GroupThread1D < SegmentCount)
				{
					group_PackedSegments[GroupThread1D] = ZBinSegmentData[SegmentOffset + GroupThread1D];
				}

				// Sort segments
				{
					// Count segment per bin
					// Compute bin offset/size
					// InsertSegment
				}
				GroupMemoryBarrierWithGroupSync();

				#if PERMUTATION_DEBUG
				PrintLineN(Ctx, SegmentOffset);
				PrintLineN(Ctx, SegmentCount);
				const uint ActualNumSegmentToLoad = Data.y;
				PrintLineN(Ctx, ActualNumSegmentToLoad);
				#endif

				// Raster 8x8
				const bool bRasterize = true;
				if (bRasterize)
				{
					// Bounds of the raster tile, relative to the bin tile
					const float2 RasterTileAABBMin =  Coord.Raster.Tile    * RASTER_TILE_SIZE;
					const float2 RasterTileAABBMax = (Coord.Raster.Tile+1) * RASTER_TILE_SIZE;

					const uint WaveIndex = GroupThread1D / WaveGetLaneCount();
					const uint WaveOffset = WaveIndex * WaveGetLaneCount();
					const uint LaneIndex = WaveGetLaneIndex();
					const uint LaneCount = WaveGetLaneCount();

					#if PERMUTATION_DEBUG
					//Newline(Ctx);
					//PrintLineN(Ctx, WaveIndex);
					//PrintLineN(Ctx, WaveOffset);
					//PrintLineN(Ctx, LaneIndex);
					//PrintLineN(Ctx, LaneCount);
					#endif

					// ... 
					#if PERMUTATION_DEBUG
					const uint2 CursorPos = GetCursorPos();
					const float2 GlobalRasterTileAABBMin = Coord.Bin.Tile * BIN_TILE_SIZE +  Coord.Raster.Tile    * RASTER_TILE_SIZE;
					const float2 GlobalRasterTileAABBMax = Coord.Bin.Tile * BIN_TILE_SIZE + (Coord.Raster.Tile+1) * RASTER_TILE_SIZE;
					//PrintLineN(Ctx, GlobalRasterTileAABBMin);
					//PrintLineN(Ctx, GlobalRasterTileAABBMax);
					AddQuadSS(Ctx, GlobalRasterTileAABBMin, GlobalRasterTileAABBMax, ColorYellow);
					const bool bDebugRasterTile = all(CursorPos >= GlobalRasterTileAABBMin) && all(CursorPos < GlobalRasterTileAABBMax) && ZBinIt == 0 && IterationIt == 0;
					const uint2 LanePacked = (WaveIndex & 1) == 0 ? uint2(0,0) : uint2(0, 4);
					//FShaderPrintContext CtxU = InitShaderPrintContext(bDebugRasterTile, uint2(500 + LaneIndex * 20, 200 + Yoff * 10));
					const uint2 LaneXY = uint2(LaneIndex % 8, LaneIndex / 8);
					//FShaderPrintContext CtxU = InitShaderPrintContext(bDebugRasterTile, uint2(500, 200) + (LaneXY + LanePacked) * 20);
					//FShaderPrintContext CtxU = InitShaderPrintContext(bDebugRasterTile, uint2(500, 200) + (LaneXY + LanePacked) * uint2(120, 40));
					#endif

					// Constant per-segment coverage
					const float Cov = 0.2f;
					for (uint SegOffset=0; SegOffset<SegmentCount; SegOffset+=MAX_SEGMENT_PER_RASTER_STEP) // sweep all segments, 32 at a time
					{
						// Compute segment's bitmask - in batch of MAX_SEGMENT_PER_RASTER_STEP(32)
						// Each wave compute the bitmask for 32 segments
						if (LaneIndex < MAX_SEGMENT_PER_RASTER_STEP)
						{
							uint2 SegmentBitmask = 0;
							float3 SegmentColor = 0;
							if (SegOffset + LaneIndex < SegmentCount)
							{
								// Segment is loaded relative to the current bin tile. We don't need the absolute coord.
								const FSegment Segment = UnpackSegment(0/*Coord.Bin.Tile*/, MinMaxZ, group_PackedSegments[SegOffset + LaneIndex]);
								SegmentColor = Segment.Color;

								float3 ClipP0 = Segment.P0;
								float3 ClipP1 = Segment.P1;
								if (ClipSegment(RasterTileAABBMin, RasterTileAABBMax, ClipP0, ClipP1))
								{
									// Clipped end points relative to the raster tile, kept strictly inside the tile
									const float2 nP0 = clamp(ClipP0.xy - RasterTileAABBMin, 0, RASTER_TILE_SIZE* 0.999f);
									const float2 nP1 = clamp(ClipP1.xy - RasterTileAABBMin, 0, RASTER_TILE_SIZE* 0.999f);

									const float PMinZ = min(ClipP0.z, ClipP1.z);
									const float PMaxZ = max(ClipP0.z, ClipP1.z);
									// The per-pixel depth test is only needed when the segment can be behind the scene depth of the tile
									const bool bNeedFineDepthTest = PMinZ < asfloat(group_SceneMaxZ[Coord.Raster.Tile1d]);
									#if PERMUTATION_DEBUG
									if (0)//(bNeedFineDepthTest)
									{
										FShaderPrintContext CtxD = InitShaderPrintContext(true, 0);
										AddQuadSS(CtxD, GlobalRasterTileAABBMin, GlobalRasterTileAABBMax, ColorOrange);
									}
									#endif
									SegmentBitmask = GetSegmentBits(SceneDepthTexture, Coord.Bin.Tile * BIN_TILE_SIZE + RasterTileAABBMin, nP0, nP1, ClipP0.z, ClipP1.z, bNeedFineDepthTest);
								}
							}
							group_SegmentsBits[WaveOffset + LaneIndex] = SegmentBitmask;
							group_SegmentsColor[WaveOffset + LaneIndex] = SegmentColor.x;
						}

						// Compute raster tile bitmask (8x8)
						if (WaveGetLaneCount() == 32)
						{
							const uint ThreadMask = 1u << LaneIndex;

							// A raster tile (8x8) is covered by two consecutive waves of 32 lanes:
							//  * even wave -> rows 0..3 of the tile, i.e. bits  0..31 of the mask (.x)
							//  * odd wave  -> rows 4..7 of the tile, i.e. bits 32..63 of the mask (.y)
							const bool bFirstHalf = (WaveIndex&1) == 0;

							for (uint SliceIt = 0; SliceIt < 32u; ++SliceIt)
							{
								const uint2 SliceBits = group_SegmentsBits[WaveOffset + SliceIt];
								const uint HalfBits = bFirstHalf ? SliceBits.x : SliceBits.y;
								const bool bVisible = (HalfBits & ThreadMask) != 0;
								if (bVisible)
								{
									// Front-to-back compositing: the segment is weighted by the transmittance
									// accumulated in front of it, i.e. before its own coverage is applied.
									const float CurrentTransmittance = OutTransmittance;
									OutTransmittance *= (1-Cov);

									const float3 SegmentColor = group_SegmentsColor[WaveOffset + SliceIt];
									OutColor += CurrentTransmittance * Cov * SegmentColor.xyz;
								}
							}
						}
						else if (WaveGetLaneCount() == 64)
						{
							// TODO
						}

						#if COVERAGE_CULLING
						// Current rasterizer is done: every pixel of the wave is below the transmittance threshold
						if (WaveActiveAllTrue(OutTransmittance < MinCoverageThreshold))
						{
							if (WaveIsFirstLane())
							{
								#if PERMUTATION_DEBUG
								FShaderPrintContext CtxD = InitShaderPrintContext(true, 0);
								const float3 TileColor = ColorMapTurbo(0.5f + (SegOffset / float(SegmentCount)) * 0.5f);
								//AddQuadSS(CtxD, GlobalRasterTileAABBMin, GlobalRasterTileAABBMax, float4(TileColor,1));
								PrintLineN(Ctx, OutTransmittance);
								PrintLine(Ctx, SegOffset); PrintLineN(Ctx, SegmentCount);
								#endif

								group_CompletedWaves[WaveIndex] = 1;
							}
							break;
						}
						#endif
					} // for (SegOffset)
					GroupMemoryBarrierWithGroupSync();
				} // if (bRasterize)
			} // for(ZBin)

			GroupMemoryBarrierWithGroupSync();

			#if PERMUTATION_DEBUG
			Newline(Ctx);
			PrintLineN(Ctx, OutColor);
			PrintLineN(Ctx, OutTransmittance);
			#endif

			// Write out
			OutputTexture[Coord.Out.PixelCoord] = float4(OutColor, 1.f);

		} // if (bValidWork)

	} // for (IterationIt)
}
#endif // RasterizerCS

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

#ifdef DebugDrawingCS

#ifndef PERMUTATION_DEBUG
#error PERMUTATION_DEBUG needs to be defined 
#endif

uint TotalBufferMemoryInMBytes;
uint TotalTextureMemoryInMBytes;

Buffer<uint> VisibleInstanceArgs;
StructuredBuffer<uint2> VisibleInstances;

Buffer<uint> VisibleClusterArgs;
StructuredBuffer<uint2> VisibleClusters;

Texture2D<uint2> SceneTileDepthTexture;

Texture2DArray<uint> TileSegmentCount;
StructuredBuffer<uint> TileDataAllocatedCount;
StructuredBuffer<uint> ViewMinMaxZ;

StructuredBuffer<uint2> ZBinData;
StructuredBuffer<uint2> RasterWork;
StructuredBuffer<uint> RasterWorkAllocatedCount;
StructuredBuffer<FPackedSegmentType> ZBinSegmentData;
StructuredBuffer<uint> ZBinSegmentAllocatedCount;
StructuredBuffer<uint> ZBinDataAllocatedCount;

// Debug helper: print "In/InMax" with InDigit digits for each number.
// The first number turns red when it exceeds InMax, otherwise both numbers are yellow.
void PrintRatio(inout FShaderPrintContext Ctx, uint In, uint InMax, uint InDigit)
{
	const bool bWithinLimit = In <= InMax;

	Print(Ctx, In, Select(bWithinLimit, FontYellow, FontRed), InDigit, 0);
	Print(Ctx, TEXT("/"), FontWhite);
	Print(Ctx, InMax, FontYellow, InDigit, 0);
}

// Debug entry point of the render-curve raster pipeline.
// The only work that is currently active:
//  * Thread (0,0) prints a statistics panel with ShaderPrint (instance/cluster counts, min/max Z, ZBin/segment allocations,
//    binner and rasterizer setup, memory usage).
//  * Threads whose flattened index is below the visible cluster count fetch their cluster data; the drawing code that would
//    consume it is compiled out (#if 0).
// All other visualizations in this function are disabled, either at runtime (if (0)) or at compile time (#if 0).
[numthreads(THREADGROUP_SIZE, 1, 1)]
void DebugDrawingCS(uint2 DispatchThreadId : SV_DispatchThreadID)
{
	ResolvedView = ResolveView(); 

	// Counters produced by the previous passes. 
	// NOTE(review): the cluster count is read at index 3 whereas the instance count is read at index 0 - presumably the 
	// cluster args buffer stores its count after other data; confirm against the pass filling VisibleClusterArgs.
	const uint VisibleInstanceCount = VisibleInstanceArgs[0];
	const uint VisibleClusterCount = VisibleClusterArgs[3];
	const uint TileDataAllocCount = TileDataAllocatedCount[0];
	const uint RasterWorkAllocCount = RasterWorkAllocatedCount[0];
	const uint ZBinSegAllocatedCount = ZBinSegmentAllocatedCount[0];
	const uint ZBinDatAllocatedCount = ZBinDataAllocatedCount[0];

	const FMinMaxZ MinMaxZ = UnpackMinMaxZ(ViewMinMaxZ[0], ViewMinMaxZ[1]);

	// Draw main stats
	// Only the first thread of the dispatch prints, so that the panel is emitted once.
	if (all(DispatchThreadId == 0))
	{
		// Draw main stats
		// The 'active' argument is always true here, since it repeats the condition of the enclosing branch.
		FShaderPrintContext Ctx = InitShaderPrintContext(all(DispatchThreadId == 0), uint2(50, 50));
		Print(Ctx, TEXT("Render Curve Raster Pipeline"), FontRed); Newline(Ctx);
		Newline(Ctx);

		// Visible counts vs. scene totals (PrintRatio turns the value red when it exceeds the total)
		Print(Ctx, TEXT("Instance/Cluster"), FontOrange); Newline(Ctx);
		Print(Ctx, TEXT("Visible Instance    : "), FontWhite); PrintRatio(Ctx, VisibleInstanceCount, Scene.RenderCurve.InstanceCount, 3); Newline(Ctx);
		Print(Ctx, TEXT("Visible Cluster     : "), FontWhite); PrintRatio(Ctx, VisibleClusterCount, Scene.RenderCurve.ClusterCount, 6); Newline(Ctx);
		Print(Ctx, TEXT("Max ClusterStride   : "), FontWhite); Print(Ctx, Scene.RenderCurve.MaxClusterStrideInBytes, FontYellow);  Newline(Ctx);
		Newline(Ctx);

		Print(Ctx, TEXT("Min/Max Z"), FontOrange); Newline(Ctx);
		Print(Ctx, TEXT("MinZ                : "), FontWhite); Print(Ctx, MinMaxZ.MinZ, FontYellow); Newline(Ctx);
		Print(Ctx, TEXT("MaxZ                : "), FontWhite); Print(Ctx, MinMaxZ.MaxZ, FontYellow); Newline(Ctx);
		Newline(Ctx);

		// Allocated entries vs. buffer capacity
		Print(Ctx, TEXT("Segment & ZBin"), FontOrange); Newline(Ctx);
		Print(Ctx, TEXT("MaxSegmentDataCount : "), FontWhite); Print(Ctx, MaxSegmentDataCount, FontYellow); Newline(Ctx);
		Print(Ctx, TEXT("ZBin Data alloc.    : "), FontWhite); PrintRatio(Ctx, ZBinDatAllocatedCount, MaxZBinDataCount, 9); Newline(Ctx);
		Print(Ctx, TEXT("ZBin Segment alloc. : "), FontWhite); PrintRatio(Ctx, ZBinSegAllocatedCount, MaxZBinSegmentDataCount, 9); Newline(Ctx);
		Newline(Ctx);

		Print(Ctx, TEXT("Binners"), FontOrange); Newline(Ctx);
		Print(Ctx, TEXT("Num Binners         : "), FontWhite); Print(Ctx, NumBinners, FontYellow); Newline(Ctx);
		Print(Ctx, TEXT("Bin Tile Size       : "), FontWhite); Print(Ctx, uint(BIN_TILE_SIZE), FontYellow); Newline(Ctx);
		Print(Ctx, TEXT("Bin Res             : "), FontWhite); Print(Ctx, BinTileRes.x, FontYellow, 3, 0);  Print(Ctx, TEXT("x"), FontWhite); Print(Ctx, BinTileRes.y, FontYellow, 3, 0);Newline(Ctx);
		Print(Ctx, TEXT("Tile data allocated : "), FontWhite); PrintRatio(Ctx, TileDataAllocCount, MaxTileDataCount, 8); Newline(Ctx);
		Newline(Ctx);

		Print(Ctx, TEXT("Rasterizers"), FontOrange); Newline(Ctx);
		Print(Ctx, TEXT("Num Rasterizers     : "), FontWhite); Print(Ctx, NumRasterizers, FontYellow); Newline(Ctx);
		Print(Ctx, TEXT("Raster Tile Size    : "), FontWhite); Print(Ctx, uint(RASTER_TILE_SIZE), FontYellow); Newline(Ctx);
		Print(Ctx, TEXT("Raster Res          : "), FontWhite); Print(Ctx, RasterTileRes.x, FontYellow, 3, 0);  Print(Ctx, TEXT("x"), FontWhite); Print(Ctx, RasterTileRes.y, FontYellow, 3, 0);Newline(Ctx);
		Print(Ctx, TEXT("Raster Work         : "), FontWhite); PrintRatio(Ctx, RasterWorkAllocCount, MaxRasterWorkCount, 6); Newline(Ctx);
		// Average number of raster work items per rasterizer (no guard against NumRasterizers == 0)
		Print(Ctx, TEXT("Raster Load         : "), FontWhite); Print(Ctx, RasterWorkAllocCount / float(NumRasterizers), FontYellow); Newline(Ctx);
		Newline(Ctx);

		Print(Ctx, TEXT("Memory"), FontOrange); Newline(Ctx);
		Print(Ctx, TEXT("Buffer Memory(MB)   : "), FontWhite); Print(Ctx, TotalBufferMemoryInMBytes, FontYellow); Newline(Ctx);
		Print(Ctx, TEXT("Texture Memory(MB)  : "), FontWhite); Print(Ctx, TotalTextureMemoryInMBytes, FontYellow); Newline(Ctx);
		Newline(Ctx);

		// Cursor bin
		// Disabled at runtime. When enabled: outlines the bin tile under the cursor and prints its segment count summed over all binners.
		if (0)
		{
			const uint2 BinCoord = uint2(GetCursorPos()) >> BIN_TILE_SIZE_DIV_AS_SHIFT;
			uint SegmentCount = 0; 
			for (uint BinnerIt = 0; BinnerIt < NumBinners; ++BinnerIt)
			{
				SegmentCount += TileSegmentCount.Load(uint4(BinCoord, BinnerIt, 0));
			}
			AddQuadSS(Ctx, BinCoord * BIN_TILE_SIZE, (BinCoord + 1) * BIN_TILE_SIZE, ColorRed);
			Print(Ctx, TEXT("SegmentCount        : "), FontWhite); Print(Ctx, SegmentCount, FontYellow); Newline(Ctx);
		}
	}

	// Draw binning tiles
	// Compiled out. One thread per bin tile; would fill each tile with a color derived from its segment count (summed over all binners).
	#if 0
	if (all(DispatchThreadId.xy < BinTileRes))
	{
		const uint2 BinCoord = DispatchThreadId.xy;
		// All bin
		if (0)
		{
			uint SegmentCount = 0; 
			for (uint BinnerIt = 0; BinnerIt < NumBinners; ++BinnerIt)
			{
				SegmentCount += TileSegmentCount.Load(uint4(BinCoord, BinnerIt, 0));
			}

			FShaderPrintContext Ctx = InitShaderPrintContext(true, uint2(50, 350));
			AddFilledQuadSS(Ctx, float2(BinCoord) * BinTileSize, float2(BinCoord + 1) * BinTileSize, float4(ColorMapMagma(SegmentCount/2048.f), 0.5));
			//AddQuadSS(Ctx, BinCoord * BIN_TILE_SIZE, (BinCoord + 1) * BIN_TILE_SIZE, ColorRed);
			//PrintLineN(Ctx, SegmentCount);
			
			//const uint2 BinCoord = uint2(BinX, BinY);
			//const float Depth = UnpackDepth(SceneTileDepthTexture.Load(uint3(BinCoord, 0))).x;
			//AddFilledQuadSS(Ctx, float2(BinCoord) * BinTileSize, float2(BinCoord + 1) * BinTileSize, float4(ColorMapMagma(Depth), 0.5));

			//const uint SegmentCount = TileSegmentCount.Load(uint4(BinCoord, ResolvedView.GeneralPurposeTweak, 0));
		}
	}
	#endif

	// Draw ZBin tiles
	// Compiled in, but disabled at runtime by the if (0) below (the commented-out condition would restrict it to the first thread).
	// When enabled: walks every raster work item, every ZBin of that item, and draws each of its segments as a screen-space line.
	#if 1
	if (0)
	//if (all(DispatchThreadId == 0))
	{
		FShaderPrintContext Ctx = InitShaderPrintContext(all(DispatchThreadId == 0), uint2(50, 350));

		// Note: RasterWorkCount and this MinMaxZ re-read the same values as RasterWorkAllocCount / MinMaxZ at the top of the function.
		const uint RasterWorkCount = RasterWorkAllocatedCount[0];
		const FMinMaxZ MinMaxZ = UnpackMinMaxZ(ViewMinMaxZ[0], ViewMinMaxZ[1]);
		for (uint WorkIt = 0; WorkIt < RasterWorkCount; ++WorkIt)
		{
			const FRasterWork Work = UnpackRasterWork(RasterWork[WorkIt]);
		
			#if PERMUTATION_DEBUG
			//PrintLine(Ctx, Work.TileCoord);
			//PrintLine(Ctx, Work.ZBinOffset);
			//PrintLine(Ctx, Work.ZBinCount);
			for (uint ZBinIt = 0; ZBinIt < Work.ZBinCount; ZBinIt++)
			{
				// ZBin entry: x = offset of the first segment, y = number of segments
				const uint2 Data = ZBinData[Work.ZBinOffset + ZBinIt];
				const uint SegmentOffset = Data.x;
				const uint SegmentCount  = Data.y;
				//PrintLine(Ctx, SegmentOffset);
				//PrintLine(Ctx, SegmentCount);

				for (uint SegIt = 0; SegIt < SegmentCount; SegIt++)
				//uint SegIt = 0;
				//if (SegmentCount>0)
				{
					const FPackedSegmentType PackedSegment = ZBinSegmentData[SegmentOffset + SegIt];
					const FSegment Segment = UnpackSegment(Work.TileCoord, MinMaxZ, PackedSegment);
					AddLineSS(Ctx, Segment.P0.xy, Segment.P1.xy, ColorPurple);

					//PrintLine(Ctx, Segment.P0.xy);
					//PrintLine(Ctx, Segment.P1.xy);
				}
			}
			Newline(Ctx);
			#endif
		}
	}
	#endif

	// Draw clusters
	// One thread per visible cluster: the 2D thread id is flattened using Resolution.x as the row pitch.
	// Both visualizations below are compiled out, so the fetched data is currently unused.
	#if 1
	const uint VisibleClusterFetchIndex = DispatchThreadId.x + DispatchThreadId.y * Resolution.x;
	if (VisibleClusterFetchIndex < VisibleClusterCount)
	{
		// Visible cluster entry: x = primitive index, y = cluster index
		const uint2 VisibleData = VisibleClusters[VisibleClusterFetchIndex];
		const uint  PrimitiveIndex = VisibleData.x;
		const uint  ClusterIndex = VisibleData.y;

		const FRenderCurveInstanceData RenderCurveInstanceData = GetRenderCurveInstanceData(PrimitiveIndex);
		const FClusterHeader ClusterHeader = GetClusterHeader(ClusterIndex);

		const float3 LocalBoundsCenter = ClusterHeader.LocalBoundCenter;
		const float3 LocalBoundsExtent = ClusterHeader.LocalBoundExtent;

		// Cluster bounds
		// Compiled out. Would draw the cluster's local bounds as an oriented box, colored by the cluster's fetch index.
		#if 0
		{
			const float3 ClusterColor = ColorMapMagma(VisibleClusterFetchIndex / float(VisibleClusterCount));
			FShaderPrintContext Ctx = InitShaderPrintContext(true, uint2(0,0));
			AddOBBTWS(Ctx, LocalBoundsCenter - LocalBoundsExtent, LocalBoundsCenter + LocalBoundsExtent, float4(ClusterColor,1), RenderCurveInstanceData.LocalToTranslatedWorld);
		}
		#endif

		// Cluster curves
		// Compiled out. Would draw each curve of the cluster as a polyline, linking consecutive points.
		#if 0
		for (uint CurveIt = 0; CurveIt < ClusterHeader.CurveCount; ++CurveIt)
		{
			float3 PrevPoint = 0;
			for (uint PointIt = 0; PointIt < ClusterHeader.PointPerCurve; ++PointIt)
			{
				const FCurvePoint Point = GetClusterPoint(ClusterHeader, ClusterIndex, CurveIt, PointIt);
				const float3 TranslatedWorldPosition = mul(float4(Point.Position, 1), RenderCurveInstanceData.LocalToTranslatedWorld).xyz;
				if (PointIt > 0 && Point.bValid)
				{
					AddLineTWS(PrevPoint, TranslatedWorldPosition, lerp(ColorOrange, ColorBlue, float(PointIt) / ClusterHeader.PointPerCurve));
				}
				PrevPoint = TranslatedWorldPosition;
			}
		}
		#endif
	}
	#endif

	// Compiled out. Older variant mapping one thread group per cluster and one thread per curve.
	// NOTE(review): it references GroupId and LinearThreadIndex, which are not inputs of this entry point, so it would not 
	// compile if re-enabled as-is.
	#if 0
	const uint2 VisibleData = VisibleClusters[GroupId];
	const uint  PrimitiveIndex = VisibleData.x;
	const uint  ClusterIndex = VisibleData.y;
	const uint  CurveIndex = LinearThreadIndex; // expect curve count == THREADGROUP_SIZE. Add validation code for this

	const FRenderCurveInstanceData RenderCurveInstanceData = GetRenderCurveInstanceData(PrimitiveIndex);
	const FClusterHeader ClusterHeader = GetClusterHeader(ClusterIndex);

	//float3 PrevPoint = 0;
	//for (uint32 PointIt = 0; PointIt < ClusterHeader.PointPerCurve; ++PointIt)
	//{
	//	const float3 Position = GetClusterPoint(ClusterHeader, CurveIndex, PointIt);
	//	const float3 TranslatedWorldPosition = mul(Position, RenderCurveInstanceData.LocalToTranslatedWorld);
	//	if (PointIt > 0)
	//	{
	//		AddLineTWS(PrevPoint, TranslatedWorldPosition, lerp(ColorOrange, ColorBlue, float(PointIt)/ClusterHeader.PointPerCurve));
	//	}
	//}
	#endif
}

#endif // DebugDrawingCS

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

#ifdef SegmentLUTCS

#if PERMUTATION_DEBUG
// Debug helper: plots a 64-bit mask as an 8x8 block of texels.
//  * Out           : texture receiving the plot
//  * OutResolution : size of Out, texels at or beyond it are skipped
//  * OutBaseCoord  : top-left texel of the 8x8 block
//  * In            : mask, In.x holds bits 0..31 and In.y holds bits 32..63. Bit (x + y * 8) maps to texel (x, y)
// Texels whose bit is set are written in opaque green; texels whose bit is clear are left untouched.
void DrawBitLine(RWTexture2D<float4> Out, uint2 OutResolution, uint2 OutBaseCoord, uint2 In)
{
	// Walk the 64 bits in row-major order
	for (uint BitIndex = 0; BitIndex < 64u; ++BitIndex)
	{
		// Pick the 32-bit word holding the bit, then the bit within that word
		const uint Word = (BitIndex < 32u) ? In.x : In.y;
		const uint BitValue = (Word >> (BitIndex & 31u)) & 0x1;
		if (BitValue == 0)
		{
			continue;
		}

		// Bit index -> (column, row) within the 8x8 block
		const uint2 CellCoord = uint2(BitIndex & 7u, BitIndex >> 3u);
		const uint2 OutCoord = OutBaseCoord + CellCoord;
		if (all(OutCoord < OutResolution))
		{
			Out[OutCoord] = float4(0,1,0,1);
		}
	}
}
#endif

// Shader parameters of SegmentLUTCS.
// DebugOutputResolution : size of RWDebugOutput, used by DrawBitLine() to skip out-of-range texels.
// RWDebugOutput         : target of the (runtime-disabled) debug plot of the LUT entries.
// RWSegmentLUT          : output LUT, one 64-bit coverage mask (stored as uint2) written per dispatch thread.
uint2 DebugOutputResolution;
RWTexture2D<float4> RWDebugOutput;
RWTexture2D<uint2>  RWSegmentLUT;

// The LUT is built on a square grid whose side is the thread group width.
// SegmentLUTCS uses LUT_RESOLUTION for both group dimensions, so a non-square group size is a configuration error 
// and must stop the compilation with an explicit message.
#define LUT_RESOLUTION THREADGROUP_SIZE_X
#if THREADGROUP_SIZE_X != THREADGROUP_SIZE_Y
#error THREADGROUP_SIZE_X and THREADGROUP_SIZE_Y need to have the same size
#endif

// Builds one entry of the segment coverage LUT per thread.
// Each thread rasterizes the line going from the center of cell 'GroupID' (P0) to the center of cell 'GroupThreadID2D' (P1)
// on a LUT_RESOLUTION x LUT_RESOLUTION grid, reduces the result by 2x2 blocks, and stores the reduced coverage as a 64-bit 
// mask (uint2) at 'DispatchThreadId' in RWSegmentLUT.
// The mask uses a row stride of 8 bits, which matches a 16x16 grid reduced to 8x8.
[numthreads(LUT_RESOLUTION, LUT_RESOLUTION, 1)]
void SegmentLUTCS(uint2 DispatchThreadId : SV_DispatchThreadID, uint2 GroupThreadID2D : SV_GroupThreadID, uint2 GroupID : SV_GroupID)
{
	// Line end points, at cell centers
	const float2 P0 = GroupID.xy + 0.5f;
	const float2 P1 = GroupThreadID2D.xy + 0.5f;

	// 1. Clear the full resolution coverage grid
	uint Coverage[LUT_RESOLUTION][LUT_RESOLUTION];
	for (uint ClearX = 0; ClearX < LUT_RESOLUTION; ++ClearX)
	{
		for (uint ClearY = 0; ClearY < LUT_RESOLUTION; ++ClearY)
		{
			Coverage[ClearX][ClearY] = 0;
		}
	}

	// 2. Walk the line with a DDA, flagging every visited cell, until the cell containing P1 is reached
	const int2 LastCell = (int2)floor(P1.xy);
	FDDAContext Walker = DDACreateContext(P0, normalize(P1 - P0));
	for (int StepIt = 0; StepIt < DDA_MAX_ITERATIONS; ++StepIt)
	{
		const int2 Cell = (int2)floor(Walker.Coord);

		const bool bInsideGrid = all(Cell >= 0) && all(Cell < LUT_RESOLUTION);
		if (bInsideGrid)
		{
			Coverage[Cell.x][Cell.y] = 1;
		}

		// The end cell is flagged before leaving the loop
		if (all(Cell == LastCell))
		{
			break;
		}

		DDAAdvance(Walker);
	}

	// 3. Reduce each 2x2 block to a single bit: the bit is set when at least 2 of the 4 cells are covered
	uint2 BitOutput = 0;
	for (uint SrcY = 0; SrcY < LUT_RESOLUTION; SrcY += 2)
	{
		for (uint SrcX = 0; SrcX < LUT_RESOLUTION; SrcX += 2)
		{
			const uint CoveredCount =
				Coverage[SrcX    ][SrcY    ] +
				Coverage[SrcX + 1][SrcY    ] +
				Coverage[SrcX + 1][SrcY + 1] +
				Coverage[SrcX    ][SrcY + 1];

			if (CoveredCount < 2)
			{
				continue;
			}

			// Bit index within the 64-bit mask: bits 0..31 live in .x, bits 32..63 in .y
			const uint BitIndex = (SrcX >> 1u) + (SrcY >> 1u) * 8u;
			if (BitIndex < 32)
			{
				BitOutput.x |= 1u << BitIndex;
			}
			else
			{
				BitOutput.y |= 1u << (BitIndex - 32u);
			}
		}
	}

	// 4. Write output
	RWSegmentLUT[DispatchThreadId.xy] = BitOutput;

	// 5. Plot output in 2D (disabled at runtime)
	#if PERMUTATION_DEBUG
	if (0)
	{
		const uint2 TileSize = 16u;
		const uint2 BaseCoord = DispatchThreadId.xy * 16u;
		const uint2 CursorCoord = uint2(ShaderPrintData.CursorCoord);

		float4 DebugColor = float4(1, 0, 0, 1);
		FShaderPrintContext Ctx;

		// Print the end points of the entry located under the cursor, and outline it
		const bool bUnderCursor = all(BaseCoord <= CursorCoord) && all(CursorCoord <= BaseCoord + TileSize);
		if (bUnderCursor)
		{
			Ctx = InitShaderPrintContext(true, uint2(500, 50));
			Print(Ctx, TEXT("SegmentLUT"), FontRed); Newline(Ctx);
			PrintLineN(Ctx, P0);
			PrintLineN(Ctx, P1);

			AddQuadSS(Ctx, BaseCoord, BaseCoord + 16u, ColorYellow);
			DebugColor = float4(0, 1, 0, 1);
		}

		// Every entry plots its mask into the debug texture
		DrawBitLine(RWDebugOutput, DebugOutputResolution, BaseCoord, BitOutput);
	}
	#endif // PERMUTATION_DEBUG
}
#endif // SegmentLUTCS