UnrealEngine/Engine/Shaders/Private/DiaphragmDOF/DOFRecombine.usf

// Copyright Epic Games, Inc. All Rights Reserved.

/*=============================================================================
	DiaphragmDOF/DOFRecombine.usf: Recombine lower res convolution with full
	res scene color.
=============================================================================*/


#include "DOFCommon.ush"
#include "../SceneTexturesCommon.ush"
#include "../Random.ush"
#include "../MonteCarlo.ush"
#include "../SobolRandom.ush"


//------------------------------------------------------- DEBUG COMPILE TIME CONFIG

// When on, color the output pixels according to how expensive they were.
#define DEBUG_FAST_PATHS 0


//------------------------------------------------------- ENUM VALUES

/** Slight out of focus gathering method (value of CONFIG_SLIGHT_FOCUS_METHOD). */
	// No slight out of focus gathering.
	#define SLIGHT_FOCUS_METHOD_DISABLED 0

	// Accumulate foreground and background slight out of focus in a single (unique) convolution.
	#define SLIGHT_FOCUS_METHOD_UNIQUE_CONVOLUTIONS 1

	// Accumulate foreground and background slight out of focus separately.
	#define SLIGHT_FOCUS_METHOD_SEPARATE_CONVOLUTIONS 2


/** Method used to analyze the full resolution neighborhood (value of NEIGHBORHOOD_ANALISIS_METHOD). */
	// Uses an integer atomic (InterlockedMax) on a groupshared uint.
	#define NEIGHBORHOOD_ANALISIS_ATOMIC 0

	// Uses a wave instruction (WaveActiveMax).
	#define NEIGHBORHOOD_ANALISIS_WAVE 1

	// Uses an LDS (groupshared array) reduction.
	#define NEIGHBORHOOD_ANALISIS_LDS_REDUCE 2


// Compositing method for the background (value of CONFIG_COMPOSITING_METHOD).
// COMPOSITING_METHOD_BILINEAR_BKG samples the background convolution with a bilinear sampler;
// COMPOSITING_METHOD_NONE does not sample it.
// TODO: shader permutation to scale down.
#define COMPOSITING_METHOD_NONE 0
#define COMPOSITING_METHOD_BILINEAR_BKG 1


//------------------------------------------------------- COMPILE TIME CONFIG

// Configures across layer processing permutations.
// Only the foreground-only permutation skips the lower resolution background compositing.
#if DIM_LAYER_PROCESSING == LAYER_PROCESSING_FOREGROUND_ONLY
	#define CONFIG_COMPOSITING_METHOD (COMPOSITING_METHOD_NONE)

#elif DIM_LAYER_PROCESSING == LAYER_PROCESSING_BACKGROUND_ONLY
	#define CONFIG_COMPOSITING_METHOD (COMPOSITING_METHOD_BILINEAR_BKG)

#elif DIM_LAYER_PROCESSING == LAYER_PROCESSING_COMBINED
	#define CONFIG_COMPOSITING_METHOD (COMPOSITING_METHOD_BILINEAR_BKG)

#else
	#error Unknown layer processing.
#endif


// Configures across quality permutations.
// CONFIG_GATHER_PAIR_COUNT is the upper bound on the number of full resolution sample pairs.
// CONFIG_FETCH_FULLRES_COC_FROM_ALPHA is only set for qualities 1 and 2; quality 0 falls back
// to the default of 0 defined further below.
#if DIM_QUALITY == 0
	#define CONFIG_GATHER_PAIR_COUNT 0
	#define CONFIG_SLIGHT_FOCUS_METHOD (SLIGHT_FOCUS_METHOD_DISABLED)
	#define CONFIG_HOLE_FILLING_METHOD (HOLE_FILLING_METHOD_OPACITY_AMEND)

#elif DIM_QUALITY == 1
	#define CONFIG_GATHER_PAIR_COUNT 12
	#define CONFIG_SLIGHT_FOCUS_METHOD (SLIGHT_FOCUS_METHOD_UNIQUE_CONVOLUTIONS)
	#define CONFIG_HOLE_FILLING_METHOD (HOLE_FILLING_METHOD_SEPARATE_GATHER)
	#define CONFIG_FETCH_FULLRES_COC_FROM_ALPHA (!CONFIG_DOF_ALPHA)

#elif DIM_QUALITY == 2
	#define CONFIG_GATHER_PAIR_COUNT 16
	#define CONFIG_SLIGHT_FOCUS_METHOD (SLIGHT_FOCUS_METHOD_UNIQUE_CONVOLUTIONS)
	#define CONFIG_HOLE_FILLING_METHOD (HOLE_FILLING_METHOD_SEPARATE_GATHER)
	#define CONFIG_FETCH_FULLRES_COC_FROM_ALPHA (!CONFIG_DOF_ALPHA)

#else
	#error Unknown quality.
#endif


// Configures the neighborhood analysis method to use for slight out of focus early out.
#if COMPILER_SUPPORTS_WAVE_MINMAX && (PS4_PROFILE || XBOXONE_PROFILE)
	// GCN only optimisation
	#define NEIGHBORHOOD_ANALISIS_METHOD (NEIGHBORHOOD_ANALISIS_WAVE)

#elif COMPILER_HLSLCC
	// Compiler does not like NEIGHBORHOOD_ANALISIS_ATOMIC at all.
	#define NEIGHBORHOOD_ANALISIS_METHOD (NEIGHBORHOOD_ANALISIS_LDS_REDUCE)

#else
	#define NEIGHBORHOOD_ANALISIS_METHOD (NEIGHBORHOOD_ANALISIS_ATOMIC)

#endif

// Clamp the full res gathering so that specular hits remain temporally stable.
#define CONFIG_CLAMP_FULLRES_GATHER 1

// Clamp scene buffer UVs to View.BufferBilinearUVMinMax.
#define CONFIG_CLAMP_SCENE_BUFFER_UV 1

// Clamp DOF buffer UVs to DOFBufferUVMax.
#define CONFIG_CLAMP_DOF_BUFFER_UV 1

// Fetch CoC radius from full res scene color's alpha channel.
// Defaults to 0 (CoC radius derived from scene depth) when the quality permutation did not set it.
#ifndef CONFIG_FETCH_FULLRES_COC_FROM_ALPHA
	#define CONFIG_FETCH_FULLRES_COC_FROM_ALPHA 0
#endif


//------------------------------------------------------- COMPILE TIME CONSTS

// Epsilon used to compare opacity values.
#define OPACITY_EPSILON 0.01


#define GROUP_BORDER_SIZE     (DEFAULT_GROUP_BORDER_SIZE)
#define THREADGROUP_TOTALSIZE (GROUP_BORDER_SIZE * GROUP_BORDER_SIZE)


//------------------------------------------------------- PARAMETERS

// .zw scales (DispatchThreadId + 0.5) into the viewport UV; presumably .xy = size and .zw = inverse size (TODO confirm).
float4 ViewportSize;
// Threads for which ViewportRect.xy + DispatchThreadId >= ViewportRect.zw return early in RecombineMainCS().
uint4 ViewportRect;

// Screen transform applied to the dispatch thread id to obtain the DOF buffer UV.
FScreenTransform DispatchThreadIdToDOFBufferUV;
// Upper bound applied to DOF buffer UVs when CONFIG_CLAMP_DOF_BUFFER_UV is enabled.
float2 DOFBufferUVMax;

// NOTE(review): the two separate translucency parameters below are not referenced in the part of
// the file reviewed here; presumably consumed by the translucency compositing — confirm.
float4 SeparateTranslucencyBilinearUVMinMax;
uint SeparateTranslucencyUpscaling;
// Scale applied to the scene color alpha to get a CoC radius when CONFIG_FETCH_FULLRES_COC_FROM_ALPHA is set.
float EncodedCocRadiusToRecombineCocRadius;
// Absolute CoC radius below which a sample is considered by the full resolution gather.
float MaxRecombineAbsCocRadius;

// Lookup table whose .x channel provides the sample distance when bokeh simulation is enabled.
Texture2D BokehLUT;

// Full resolution scene color, point sampled by the recombine.
Texture2D SceneColorInput;
// Device Z depth, sampled by SampleWorldDepth().
Texture2D SceneDepthTexture;
Texture2D SceneSeparateCoc;

Texture2D<float> LowResDepthTexture;
Texture2D<float> FullResDepthTexture;

Texture2D SceneSeparateTranslucency;
Texture2D SceneSeparateTranslucencyModulateColor;

// Copied into FRecombineInputParameters::DOFBufferSize (buffer size and inverse size of the DOF inputs).
float4 ConvolutionInputSize;

// Lower resolution convolution outputs, sampled at the DOF buffer UV.
// The *_SeparateAlpha textures are only read when CONFIG_DOF_ALPHA is enabled.
Texture2D ForegroundConvolution_SceneColor;
Texture2D ForegroundConvolution_SeparateAlpha;

Texture2D ForegroundHoleFillingConvolution_SceneColor;
Texture2D ForegroundHoleFillingConvolution_SeparateAlpha;

Texture2D SlightOutOfFocusConvolution_SceneColor;
Texture2D SlightOutOfFocusConvolution_SeparateAlpha;

Texture2D BackgroundConvolution_SceneColor;
Texture2D BackgroundConvolution_SeparateAlpha;


// Read-write output texture (presumably the recombined scene color — the write is outside the reviewed part).
RWTexture2D<float4>	SceneColorOutput;

// Utilities to upsample translucency to full resolution
#include "../SeparateTranslucency.ush"
float2 SeparateTranslucencyTextureLowResExtentInverse;

//------------------------------------------------------- INTERMEDIARY STRUCTURES

// Intermediary results of the recombine.
// Filled in once per thread at the top of RecombineMainCS() and handed to the gather helpers.
struct FRecombineInputParameters
{
	// The viewport UV of the output pixel, computed as (DispatchThreadId + 0.5) * ViewportSize.zw.
	float2 ViewportUV;

	// Buffer UV to sample scene texture buffers (ViewportUV converted with ViewportUVToBufferUV()).
	float2 SceneBufferUV;

	// Buffer UV to sample DOF buffers (dispatch thread id transformed by DispatchThreadIdToDOFBufferUV).
	float2 DOFBufferUV;

	// Buffer size and inv size for the DOF inputs (copy of ConvolutionInputSize).
	float4 DOFBufferSize;

	// Random seeds, generated with Rand3DPCG16() and consumed by Hammersley16() in the gather loop.
	uint2 Seed0;
};


//------------------------------------------------------- FUNCTIONS

// Point samples SceneDepthTexture at BufferUV (mip 0) and converts the fetched
// device Z through ConvertFromDeviceZ().
float SampleWorldDepth(float2 BufferUV)
{
	const float DeviceZ = SceneDepthTexture.SampleLevel(GlobalPointClampedSampler, BufferUV, 0).x;
	return ConvertFromDeviceZ(DeviceZ);
}

// Hash style noise: offsets N by X, then takes the fractional part of a scaled sine.
// Output is in the {0 to 1} range.
float NoizNorm(float2 N, float X)
{
	const float2 ShiftedN = N + X;
	const float Phase = dot(ShiftedN.xy, float2(12.9898, 78.233));
	return frac(sin(Phase) * 43758.5453);
}

// Returns the 2D vector of length Radius pointing at angle Radians (polar to cartesian).
float2 RotVec(float Radius, float Radians)
{
	const float2 UnitDirection = float2(cos(Radians), sin(Radians));
	return Radius * UnitDirection;
}

// Caps Translucency to MaxTranslucency and rescales the additive Color by the resulting
// opacity ratio (1 - new translucency) / (1 - old translucency).
// Nothing is changed when Translucency is not below 1, which also avoids dividing by zero.
void AmendAdditiveColorWithMaxTranslucency(inout float4 Color, inout float Translucency, float MaxTranslucency)
{
	if (Translucency < 1)
	{
		const float CappedTranslucency = min(Translucency, MaxTranslucency);
		const float OpacityRatio = (1 - CappedTranslucency) / (1 - Translucency);

		Color *= OpacityRatio;
		Translucency = CappedTranslucency;
	}
}

// TODO: most of the math below uses CocRadius as expressed in physical pixels, rather than scaling-independent 'encoded' units.
// It's not clear if the math is always accounting for this, as many of these expressions contain magic numbers.


// Compute a sample's weight according to its Coc radius.
//
// The weight is rcp(4 * PI * CocRadius^2), capped to the weight obtained for the Coc distance
// returned by FullResPixelDistanceToCocDistance(0.5) so very small radii do not dominate.
float ComputeSampleWeight(float CocRadius)
{
	#if 0 // DEBUG: uniform weight.
		return 1;

	#else
		// Radius at which the weight is capped.
		const float CapRadius = FullResPixelDistanceToCocDistance(0.5);
		const float WeightCap = rcp((4 * PI) * CapRadius * CapRadius);

		const float UncappedWeight = rcp((4 * PI) * CocRadius * CocRadius);

		return min(UncappedWeight, WeightCap);
	#endif
}

// Returns in [0, 1] how much a sample with the given Coc radius overlaps the output pixel
// located SampleDistance away from it.
float ComputeSampleIntersection(float SampleCocRadius, float SampleDistance)
{
	#if 0 // DEBUG
		return SampleDistance < SampleCocRadius ? 1 : 0;
	#endif

	// Slope of the linear ramp applied to (abs Coc radius - SampleDistance).
	const float RampSlope = 4.0;

	// Bias of the ramp: when the abs Coc radius equals SampleDistance the linear overlap is 50%.
	const float RampBias = 0.5;

	// Lower bound on the abs Coc radius, so that SampleCocRadius=0 & SampleDistance=0 does not return < 1.0.
	const float AbsCocRadiusFloor = 0.25;

	const float ClampedAbsCocRadius = max(abs(SampleCocRadius), AbsCocRadiusFloor);

	// Linear overlap.
	const float SignedOverlap = ClampedAbsCocRadius - SampleDistance;
	const float LinearOverlap = saturate(SignedOverlap * RampSlope + RampBias);

	// Pixels are approximated as disks, so soften the linear overlap with a smoothstep
	// to make the intersection of two disks look better.
	return smoothstep(0, 1, LinearOverlap);
}

// Returns the opacity to use to transition to background: 1 while CocRadius is at least one unit
// below MaxRecombineAbsCocRadius, fading linearly to 0 as CocRadius reaches MaxRecombineAbsCocRadius.
float ComputeBackgroundSampleOpacity(float CocRadius)
{
	const float DistanceToMaxRadius = MaxRecombineAbsCocRadius - CocRadius;
	return saturate(DistanceToMaxRadius);
}

// Returns a [0, 1] factor telling how much a sample counts as slight out of focus background:
// the background opacity, faded out as CocRadius goes from 0.25 down to 0.125.
float IsConsideredBackgroundSample(float CocRadius)
{
	const float FarFade = ComputeBackgroundSampleOpacity(CocRadius);
	const float NearFade = saturate((CocRadius - 0.125) * 8);
	return FarFade * NearFade;
}

// Compute opacity of the in focus sample: 1 while abs(CocRadius) <= 0.25, fading linearly
// to 0 at abs(CocRadius) == 0.5.
// NOTE(review): an earlier TODO here read "should be 4*", while the code already multiplies by 4 — confirm intent.
float ComputeInFocusOpacity(float CocRadius)
{
	const float AbsCocRadius = abs(CocRadius);
	return saturate(2 - 4 * AbsCocRadius);
}

// Returns the opacity to use to transition foreground slight out of focus over in focus:
// 0 for CocRadius >= -0.125, rising linearly to 1 at CocRadius == -0.25.
float ComputeForegroundSampleOpacity(float CocRadius)
{
	const float Ramp = -1 - 8 * CocRadius;
	return saturate(Ramp);
}

// Returns a [0, 1] factor telling how much a sample counts as slight out of focus foreground:
// the foreground opacity, faded out as CocRadius approaches -MaxRecombineAbsCocRadius.
float IsConsideredForegroundSample(float CocRadius)
{
	const float NearFade = ComputeForegroundSampleOpacity(CocRadius);
	const float FarFade = saturate(MaxRecombineAbsCocRadius + CocRadius);
	return NearFade * FarFade;
}


//------------------------------------------------------- ACCUMULATOR

/** Struct that holds data about a sample for gathering. */
struct FGatherSample
{
	// Sample's scene color (and optionally alpha channel).
	// The callers flip alpha from translucency to opacity (1 - a) right after fetching.
	float4 Color;

	// Sample's radius of the Coc. The opacity helpers treat negative values as foreground
	// and positive values as background.
	float CocRadius;

	// Sample's intersection with the output pixel, as returned by ComputeSampleIntersection()
	// (set to 1.0 for the center sample).
	float Intersection;
};

/** Gathering parameters in recombine pass */
struct FFullResGatherParameters
{
	// Radius size in FULL res pixels. Scales the unit disk sample positions in GatherToAccumulator().
	float KernelPixelRadius;

	// Number of pair of gathering samples (each pair is a sample and its mirror around the output pixel).
	uint SamplePairCount;

};

/** Gathering accumulator for recombine pass */
struct FFullResGatherAccumulator
{
	// Parameters of the full res gather.
	FFullResGatherParameters Parameters;

	// Sum of sample colors multiplied by their color weight, and the sum of those weights.
	float4 Color;
	float ColorWeight;

	// Sum of sample opacities multiplied by their opacity weight, and the sum of those weights.
	float Opacity;
	float OpacityWeight;

	// One of the LAYER_PROCESSING_* values; selects how samples are weighted and considered.
	uint LayerProcessing;
};

// Builds an empty accumulator for the given gather parameters.
// All sums start at zero and the layer defaults to LAYER_PROCESSING_BACKGROUND_ONLY;
// callers override LayerProcessing when gathering another layer.
FFullResGatherAccumulator CreateFullResGatherAccumulator(in FFullResGatherParameters GatherParameters)
{
	FFullResGatherAccumulator EmptyAccumulator;

	// Nothing accumulated yet.
	EmptyAccumulator.Color = 0.0;
	EmptyAccumulator.ColorWeight = 0.0;
	EmptyAccumulator.Opacity = 0.0;
	EmptyAccumulator.OpacityWeight = 0.0;

	EmptyAccumulator.LayerProcessing = LAYER_PROCESSING_BACKGROUND_ONLY;
	EmptyAccumulator.Parameters = GatherParameters;

	return EmptyAccumulator;
}

// Per sample values derived from its Coc radius for the layer being processed
// (see ComputeSampleDerivates()).
struct FGatherSampleDerivedParameters
{
	// Weight of the sample's color, from ComputeSampleWeight() (forced to 1 for hole filling).
	float Weight;
	// [0, 1] factor telling how much the sample's color belongs to the layer being processed.
	float IsConsidered;
	// Opacity contribution of the sample for the layer being processed.
	float Opacity;
};

// Derives the weight, opacity and "is considered" factor of sample A for the layer the
// accumulator is processing (Accumulator.LayerProcessing).
//
// Opacity and IsConsidered default to 0 so the returned struct is always fully initialized,
// including when LayerProcessing is not one of the four layers handled below
// (the sample is then neither opaque nor considered).
FGatherSampleDerivedParameters ComputeSampleDerivates(in FFullResGatherAccumulator Accumulator, in FGatherSample A)
{
	FGatherSampleDerivedParameters DerivedA;
	DerivedA.Weight = ComputeSampleWeight(A.CocRadius);
	DerivedA.Opacity = 0;
	DerivedA.IsConsidered = 0;

	if (Accumulator.LayerProcessing == LAYER_PROCESSING_FOREGROUND_ONLY)
	{
		DerivedA.Opacity = ComputeForegroundSampleOpacity(A.CocRadius);
		DerivedA.IsConsidered = IsConsideredForegroundSample(A.CocRadius);
	}
	else if (Accumulator.LayerProcessing == LAYER_PROCESSING_FOREGROUND_HOLE_FILLING)
	{
		// Don't care about weight for hole filling
		DerivedA.Weight = 1;

		// Hole filling considers exactly what the foreground layer does not.
		DerivedA.Opacity = ComputeForegroundSampleOpacity(A.CocRadius);
		DerivedA.IsConsidered = 1 - IsConsideredForegroundSample(A.CocRadius);
	}
	else if (Accumulator.LayerProcessing == LAYER_PROCESSING_SLIGHT_OUT_OF_FOCUS)
	{
		// Considered on both sides of the focus plane, hence the abs().
		DerivedA.Opacity = ComputeBackgroundSampleOpacity(A.CocRadius);
		DerivedA.IsConsidered = saturate(MaxRecombineAbsCocRadius - abs(A.CocRadius));
	}
	else if (Accumulator.LayerProcessing == LAYER_PROCESSING_BACKGROUND_ONLY)
	{
		DerivedA.Opacity = ComputeBackgroundSampleOpacity(A.CocRadius);
		DerivedA.IsConsidered = IsConsideredBackgroundSample(A.CocRadius);
	}

	return DerivedA;
}

// Hole fills sample A with data of the closer sample of its mirror pair.
//
// - Foreground, foreground hole filling and slight out of focus layers: A takes the closer
//   sample's intersection and weight.
// - Foreground hole filling additionally takes the closer sample's opacity.
// - Any other layer: A is left untouched.
void HoleFillCloserSample(
	in FFullResGatherAccumulator Accumulator,
	inout FGatherSample A, inout FGatherSampleDerivedParameters DerivedA,
	in FGatherSample Closer, in FGatherSampleDerivedParameters DerivedCloser)
{
	const bool bForeground = Accumulator.LayerProcessing == LAYER_PROCESSING_FOREGROUND_ONLY;
	const bool bForegroundHoleFilling = Accumulator.LayerProcessing == LAYER_PROCESSING_FOREGROUND_HOLE_FILLING;
	const bool bSlightOutOfFocus = Accumulator.LayerProcessing == LAYER_PROCESSING_SLIGHT_OUT_OF_FOCUS;

	if (bForeground || bForegroundHoleFilling || bSlightOutOfFocus)
	{
		A.Intersection = Closer.Intersection;
		DerivedA.Weight = DerivedCloser.Weight;
	}

	// The foreground layer keeps its own IsConsidered and Opacity: its holes are handled by
	// the LAYER_PROCESSING_FOREGROUND_HOLE_FILLING layer instead.
	if (bForegroundHoleFilling)
	{
		DerivedA.Opacity = DerivedCloser.Opacity;
	}
}

// Adds one sample to the accumulator's running color and opacity sums.
//
// The color is weighted by Intersection * Weight * IsConsidered, the opacity by Intersection only.
// For every layer except slight out of focus, the opacity is first rescaled by the ratio between
// the sample's weight and the weight of a sample whose Coc radius is half the kernel radius.
void AccumulateSample(
	inout FFullResGatherAccumulator Accumulator,
	in FGatherSample A,
	in FGatherSampleDerivedParameters DerivedA)
{
	const float SampleColorWeight = A.Intersection * DerivedA.Weight * DerivedA.IsConsidered;
	const float SampleOpacityWeight = A.Intersection;

	float SampleOpacity = DerivedA.Opacity;

	if (Accumulator.LayerProcessing != LAYER_PROCESSING_SLIGHT_OUT_OF_FOCUS)
	{
		// This works really well to have smaller out of focus than the gathering kernel.
		const float ReferenceWeight = ComputeSampleWeight(Accumulator.Parameters.KernelPixelRadius * 0.5);
		SampleOpacity *= DerivedA.Weight * rcp(ReferenceWeight);
	}

	Accumulator.Color += SampleColorWeight * A.Color;
	Accumulator.ColorWeight += SampleColorWeight;

	Accumulator.Opacity += SampleOpacityWeight * SampleOpacity;
	Accumulator.OpacityWeight += SampleOpacityWeight;
}

/**
 * Accumulates mirror samples.
 *
 * Derives the parameters of both samples, lets the sample with the larger Coc radius be hole
 * filled from the one with the smaller Coc radius, then accumulates sample 0 followed by sample 1.
 */
void AccumulateMirrorSamples(inout FFullResGatherAccumulator Accumulator, in FGatherSample S[2])
{
	FGatherSample Sample0 = S[0];
	FGatherSample Sample1 = S[1];

	FGatherSampleDerivedParameters Derived0 = ComputeSampleDerivates(Accumulator, Sample0);
	FGatherSampleDerivedParameters Derived1 = ComputeSampleDerivates(Accumulator, Sample1);

	// Mirror hole filling: nothing happens when both Coc radii are equal.
	#if 1
		if (Sample1.CocRadius > Sample0.CocRadius)
		{
			HoleFillCloserSample(Accumulator, Sample1, Derived1, Sample0, Derived0);
		}
		else if (Sample0.CocRadius > Sample1.CocRadius)
		{
			HoleFillCloserSample(Accumulator, Sample0, Derived0, Sample1, Derived1);
		}
	#else // Variant that only hole fills from a foreground closer sample.
		if (IsForeground(Sample0.CocRadius) && Sample1.CocRadius > Sample0.CocRadius)
		{
			HoleFillCloserSample(Accumulator, Sample1, Derived1, Sample0, Derived0);
		}
		else if (IsForeground(Sample1.CocRadius) && Sample0.CocRadius > Sample1.CocRadius)
		{
			HoleFillCloserSample(Accumulator, Sample0, Derived0, Sample1, Derived1);
		}
	#endif

	AccumulateSample(Accumulator, Sample0, Derived0);
	AccumulateSample(Accumulator, Sample1, Derived1);
}

/**
 * Accumulates center sample.
 *
 * The center sample is always fully considered, which guarantees there is a resolved color
 * when the pixel is in focus or background.
 */
void AccumulateCenterSample(inout FFullResGatherAccumulator Accumulator, in FGatherSample A)
{
	FGatherSampleDerivedParameters CenterDerived = ComputeSampleDerivates(Accumulator, A);
	CenterDerived.IsConsidered = 1;

	AccumulateSample(Accumulator, A, CenterDerived);
}

/**
 * Resolves the slightly out of focus gather.
 *
 * OutGatherBackgroundUnpremultipliedColor: accumulated color divided by the accumulated color weight.
 * OutGatherBackgroundOpacity: accumulated opacity divided by the accumulated opacity weight,
 *   saturated, and forced to 0 when no color was accumulated (ColorWeight <= 0).
 *
 * Every layer normalizes the opacity by the accumulated opacity weight. The previous per layer
 * branching ended in an alternative normalization by the sample count that could never be
 * selected, so it has been removed together with its unused SampleCount local.
 */
void ResolveAccumulator(
	in FFullResGatherAccumulator Accumulator,
	out float4 OutGatherBackgroundUnpremultipliedColor,
	out float OutGatherBackgroundOpacity)
{
	const float Opacity = saturate(Accumulator.Opacity * SafeRcp(Accumulator.OpacityWeight));

	OutGatherBackgroundOpacity = Accumulator.ColorWeight > 0 ? Opacity : 0;
	OutGatherBackgroundUnpremultipliedColor = Accumulator.Color * (SafeRcp(Accumulator.ColorWeight));
}


//------------------------------------------------------- KERNEL

#if CONFIG_SLIGHT_FOCUS_METHOD != SLIGHT_FOCUS_METHOD_DISABLED

// Fetches the two full resolution samples located at +PixelOffset and -PixelOffset around the
// output pixel, computes their Coc radius and intersection with the output pixel, and
// accumulates them as a mirror pair.
void FetchAndAccumulateSamplePair(
	in const FRecombineInputParameters InputParameters,
	in float2 PixelOffset,
	inout FFullResGatherAccumulator Accumulator)
{
	// Without bokeh simulation, snap the offset to the nearest whole pixel so the intersection is
	// evaluated at pixel centers. With bokeh simulation the distance comes from BokehLUT instead.
	#if DIM_BOKEH_SIMULATION == BOKEH_SIMULATION_DISABLED
		PixelOffset = sign(PixelOffset) * floor(abs(PixelOffset) + 0.5);
	#endif

	// Distance between the samples and the output pixel, converted with FullResPixelDistanceToCocDistance().
	float DistanceToSample = FullResPixelDistanceToCocDistance(length(PixelOffset));

	// Scene buffer UV offset of the +PixelOffset sample.
	const float2 MirrorUVOffset = PixelOffset * View.BufferSizeAndInvSize.zw * float2(CocInvSqueeze, 1.0);

	// The pair of samples gathered together.
	FGatherSample MirrorSamples[2];

	UNROLL
	for (uint SampleId = 0; SampleId < 2; SampleId++)
	{
		// Sample 0 sits at +PixelOffset, sample 1 is its mirror at -PixelOffset.
		const float MirrorSign = (SampleId == 0) ? 1.0 : -1.0;

		// Fetch the sample distance from the lookup table. Unless the general bokeh simulation is
		// used, only sample 0 does the lookup and sample 1 reuses its distance.
		#if DIM_BOKEH_SIMULATION != BOKEH_SIMULATION_DISABLED
		if (SampleId == 0 || DIM_BOKEH_SIMULATION == BOKEH_SIMULATION_GENERAL)
		{
			const float LutTexelSize = rcp(float(BOKEH_LUT_SIZE));

			const float2 LutUV = (0.5 + 0.5 * LutTexelSize) + PixelOffset * (MirrorSign * LutTexelSize) * float2(CocInvSqueeze, 1.0);
			DistanceToSample = BokehLUT.SampleLevel(GlobalPointClampedSampler, LutUV, 0).x;
		}
		#endif

		// Sample position, always clamped to the valid part of the scene buffer. TODO.
		float2 SampleUV = InputParameters.SceneBufferUV + MirrorSign * MirrorUVOffset;
		SampleUV = clamp(SampleUV, View.BufferBilinearUVMinMax.xy, View.BufferBilinearUVMinMax.zw);

		FGatherSample Fetched;

		// Full res color; the Coc radius comes either from its alpha channel or from scene depth.
		Fetched.Color = SceneColorInput.SampleLevel(GlobalPointClampedSampler, SampleUV, 0);

		#if CONFIG_FETCH_FULLRES_COC_FROM_ALPHA
			Fetched.CocRadius = Fetched.Color.a * EncodedCocRadiusToRecombineCocRadius;
		#else
			Fetched.CocRadius = SceneDepthToCocRadius(SampleWorldDepth(SampleUV));
		#endif

		// Convert scene color alpha from translucency to opacity.
		Fetched.Color.a = 1 - Fetched.Color.a;

		// Background only permutation: foreground (negative) Coc radii are clamped to in focus.
		#if DIM_LAYER_PROCESSING == LAYER_PROCESSING_BACKGROUND_ONLY
			Fetched.CocRadius = max(Fetched.CocRadius, 0);
		#endif

		Fetched.Intersection = ComputeSampleIntersection(Fetched.CocRadius, DistanceToSample);

		MirrorSamples[SampleId] = Fetched;
	}

	AccumulateMirrorSamples(Accumulator, MirrorSamples);
}

// Gathers GatherParameters.SamplePairCount mirror sample pairs at full resolution into Accumulator.
// The function body is selected by the preprocessor: a disabled brute force variant (debugging aid)
// and the active variant that distributes the pairs over a disk of radius KernelPixelRadius.
void GatherToAccumulator(
	in const FRecombineInputParameters InputParameters,
	in const FFullResGatherParameters GatherParameters,
	inout FFullResGatherAccumulator Accumulator)
#if 0 // brute force the gathering kernel.
{
	// NOTE(review): the leading 0 * makes QuadSize 0, so this disabled variant would not gather anything as written.
	int QuadSize = 0 * 2 * MAX_RECOMBINE_ABS_COC_RADIUS;

	// Only iterate over half of the quad: the other half is covered by the mirror samples.
	UNROLL
	for (int x = -QuadSize; x <= QuadSize; x++)
	UNROLL
	for (int y = 0; y <= QuadSize; y++)
	{
		// Skip the center pixel and the left half of the center row.
		if (y == 0 && x <= 0)
		{
			continue;
		}

		const float2 PixelOffset = float2(x, y);
		const float PixelDistance = length(PixelOffset);

		// Keep the kernel circular.
		if (PixelDistance > QuadSize)
		{
			continue;
		}

		FetchAndAccumulateSamplePair(InputParameters, PixelOffset, Accumulator);
	}
}
#else
{
	// Samples at full resolution.
	LOOP
	for(uint SamplePairId = 0; SamplePairId < GatherParameters.SamplePairCount; SamplePairId++)
	{
		// NOTE(review): the sequence is generated for CONFIG_GATHER_PAIR_COUNT samples even though the loop
		// may run fewer iterations (SamplePairCount); confirm this is intended.
		float2 E = Hammersley16(SamplePairId, CONFIG_GATHER_PAIR_COUNT, InputParameters.Seed0);
		float2 DiskRandom = UniformSampleDiskConcentricApprox(E);

		// Scale the disk sample to the kernel radius, in full res pixels.
		float2 PixelOffset = GatherParameters.KernelPixelRadius * DiskRandom;

		// We already sampled the center pixels, and there is no point sampling it again with very small Coc.
		// Therefore clipped the offset so that it does not sample the center again.
		// NOTE: currently disabled by the trailing "&& 0".
		//FLATTEN
		if (any(abs(PixelOffset) <= 0.5) && 0)
		{
			PixelOffset = clamp(PixelOffset * SafeRcp(max(abs(PixelOffset.x), abs(PixelOffset.y))), -1, 1);
		}

		FetchAndAccumulateSamplePair(InputParameters, PixelOffset, Accumulator);
	}
}
#endif

#endif // CONFIG_SLIGHT_FOCUS_METHOD != SLIGHT_FOCUS_METHOD_DISABLED


//------------------------------------------------------- ENTRY POINT

// Group shared storage used by RecombineMainCS() to find the largest considered abs Coc radius
// of the thread group. Nothing is declared for the wave based method.
#if NEIGHBORHOOD_ANALISIS_METHOD == NEIGHBORHOOD_ANALISIS_ATOMIC

// Bit pattern of a non negative float, reduced with InterlockedMax() as if it was a uint.
groupshared uint SharedMaxConsideredAbsCocRadius;


#elif NEIGHBORHOOD_ANALISIS_METHOD == NEIGHBORHOOD_ANALISIS_LDS_REDUCE

// One slot per thread of the group, reduced in place towards index 0.
groupshared float SharedMaxConsideredAbsCocRadius[GROUP_BORDER_SIZE * GROUP_BORDER_SIZE];


#endif

[numthreads(GROUP_BORDER_SIZE, GROUP_BORDER_SIZE, 1)]
void RecombineMainCS(
	uint2 DispatchThreadId : SV_DispatchThreadID,
	uint2 GroupThreadId : SV_GroupThreadID,
	uint GroupThreadIndex : SV_GroupIndex)
{
	float4 Debug = 0;

	// Setup input parameters.
	FRecombineInputParameters InputParameters;
	{
		InputParameters.DOFBufferSize = ConvolutionInputSize;

		InputParameters.ViewportUV = (DispatchThreadId + 0.5) * ViewportSize.zw;
		InputParameters.SceneBufferUV = ViewportUVToBufferUV(InputParameters.ViewportUV);

		if (CONFIG_CLAMP_SCENE_BUFFER_UV)
		{
			InputParameters.SceneBufferUV = clamp(InputParameters.SceneBufferUV, View.BufferBilinearUVMinMax.xy, View.BufferBilinearUVMinMax.zw);
		}

		// - 0.5 * TemporalJitterPixels because DOF buffer is non temporally jittering, thanks to half res TAA pass.
		InputParameters.DOFBufferUV = ApplyScreenTransform(float2(DispatchThreadId), DispatchThreadIdToDOFBufferUV);

		if (CONFIG_CLAMP_DOF_BUFFER_UV)
		{
			InputParameters.DOFBufferUV = min(InputParameters.DOFBufferUV, DOFBufferUVMax);
		}

		InputParameters.Seed0 = Rand3DPCG16(int3(DispatchThreadId, View.StateFrameIndexMod8)).xy;
	}

	//Fetch foreground layer first to early return if ForegroundTranslucency == 0.0.
	float4 ForegroundColor;
	float ForegroundTranslucency;
	{
		#if DIM_LAYER_PROCESSING == LAYER_PROCESSING_BACKGROUND_ONLY
			ForegroundColor = 0;
			ForegroundTranslucency = 1;

		#elif CONFIG_DOF_ALPHA
			// Sample premultiplied RGBA foreground.
			ForegroundColor = ForegroundConvolution_SceneColor.SampleLevel(
				GlobalBilinearClampedSampler, InputParameters.DOFBufferUV, 0);
			ForegroundTranslucency = 1 - ForegroundConvolution_SeparateAlpha.SampleLevel(
				GlobalBilinearClampedSampler, InputParameters.DOFBufferUV, 0).r;
		#else
			// Sample premultiplied RGBA foreground.
			ForegroundColor = ForegroundConvolution_SceneColor.SampleLevel(
				GlobalBilinearClampedSampler, InputParameters.DOFBufferUV, 0);
			ForegroundTranslucency = 1 - ForegroundColor.a;

		#endif
	}


	// Get full res color and coc radius.
	FGatherSample CenterSample;
	CenterSample.Color = SceneColorInput.SampleLevel(GlobalPointClampedSampler, InputParameters.SceneBufferUV, 0);
	CenterSample.Intersection = 1.0;

	#if CONFIG_FETCH_FULLRES_COC_FROM_ALPHA
		CenterSample.CocRadius = CenterSample.Color.a * EncodedCocRadiusToRecombineCocRadius;
	#else
		CenterSample.CocRadius = SceneDepthToCocRadius(SampleWorldDepth(InputParameters.SceneBufferUV));
	#endif

	// Convert scene color alpha from translucency to opacity.
	CenterSample.Color.a = 1 - CenterSample.Color.a;

	// Whether can display solly foreground.
	bool bCanReturnForegroundOnly = ForegroundTranslucency < OPACITY_EPSILON;

	// Group constant: Whether should do full resolution gathering for slight out of focus.
	bool bGatherFullRes = false;

	#if CONFIG_SLIGHT_FOCUS_METHOD != SLIGHT_FOCUS_METHOD_DISABLED
	// Full resolution gather's parameters.
	FFullResGatherParameters GatherParameters;
	{
		#if NEIGHBORHOOD_ANALISIS_METHOD == NEIGHBORHOOD_ANALISIS_ATOMIC
		{
			SharedMaxConsideredAbsCocRadius = 0;
			GroupMemoryBarrierWithGroupSync();
		}
		#endif


		// Grab the smallest slightly out of focus Coc radius of the tile.
		float TileMaxConsideredAbsCocRadius;
		{
			float MaxConsideredAbsCocRadius = abs(CenterSample.CocRadius) < MaxRecombineAbsCocRadius ? abs(CenterSample.CocRadius) : 0;

			for (uint j = 0; j < 4; j++)
			{
				const float2 SamplePixelOffset = float2(kOffsetsCross3x3[j]) * CocDistanceToFullResPixelDistance(MaxRecombineAbsCocRadius);

				float2 SampleUVOffset = View.BufferSizeAndInvSize.zw * SamplePixelOffset;
				float2 SampleUV = InputParameters.SceneBufferUV + SampleUVOffset;

				if (CONFIG_CLAMP_SCENE_BUFFER_UV)
				{
					SampleUV = clamp(SampleUV, View.BufferBilinearUVMinMax.xy, View.BufferBilinearUVMinMax.zw);
				}

				#if CONFIG_FETCH_FULLRES_COC_FROM_ALPHA
					float SampleCocRadius = SceneColorInput.SampleLevel(GlobalPointClampedSampler, SampleUV, 0).a * EncodedCocRadiusToRecombineCocRadius;
				#else
					float SampleCocRadius = SceneDepthToCocRadius(SampleWorldDepth(SampleUV));
				#endif

				float SampleAbsCocRadius = abs(SampleCocRadius);

				#if 0
				// Compute the minimum CocRadius to overlap with the group's tile, to reduce amount of tiles gathering uselessly.
				// TODO: have not witnessed any performance regression or improvement with this yet.
				{
					// Compute the minimum CocRadius to overlap with the group's tile.
					float2 ThreadDistanceToGroupBorder = lerp(GroupThreadId, (GROUP_BORDER_SIZE - 1) - GroupThreadId, kSquare2x2[j]);
					float2 OutsideGroupPixelOffset = abs(SamplePixelOffset) - ThreadDistanceToGroupBorder;
					float2 OutsideGroupCocOffset = FullResPixelDistanceToCocDistance(OutsideGroupPixelOffset);

					float MinCocRadiusSquare = dot(OutsideGroupCocOffset, OutsideGroupCocOffset);

					// Not interested if the CocRadius is too large, or does not overlap with the group's tile.
					if (SampleAbsCocRadius < MaxRecombineAbsCocRadius &&
						SampleAbsCocRadius * SampleAbsCocRadius > MinCocRadiusSquare)
					{
						MaxConsideredAbsCocRadius = max(MaxConsideredAbsCocRadius, SampleAbsCocRadius);
					}
				}
				#else
				{
					MaxConsideredAbsCocRadius = max(MaxConsideredAbsCocRadius, SampleAbsCocRadius < MaxRecombineAbsCocRadius ? SampleAbsCocRadius : MaxConsideredAbsCocRadius);
				}
				#endif
			}

			#if NEIGHBORHOOD_ANALISIS_METHOD == NEIGHBORHOOD_ANALISIS_ATOMIC
			{
				// Do atomic min and max of the positive or null float MaxConsideredAbsCocRadius
				// as if they were uint.
				uint Unused;
				InterlockedMax(SharedMaxConsideredAbsCocRadius, asuint(MaxConsideredAbsCocRadius), Unused);

				GroupMemoryBarrierWithGroupSync();

				// Read atomic counters.
				TileMaxConsideredAbsCocRadius = asfloat(SharedMaxConsideredAbsCocRadius);
			}
			#elif NEIGHBORHOOD_ANALISIS_METHOD == NEIGHBORHOOD_ANALISIS_LDS_REDUCE
			{
				SharedMaxConsideredAbsCocRadius[GroupThreadIndex] = MaxConsideredAbsCocRadius;
				GroupMemoryBarrierWithGroupSync();

				// Safe for vector sizes 32 or larger, AMD and NV
				// TODO Intel variable size vector
				UNROLL
				for (uint i = 0; i < 5; i++)
				{
					const uint ReduceSize = 32u >> i;
					if (GroupThreadIndex < ReduceSize)
					{
						MaxConsideredAbsCocRadius = max(MaxConsideredAbsCocRadius, SharedMaxConsideredAbsCocRadius[GroupThreadIndex + ReduceSize]);
						SharedMaxConsideredAbsCocRadius[GroupThreadIndex] = MaxConsideredAbsCocRadius;
					}
				}

				TileMaxConsideredAbsCocRadius = SharedMaxConsideredAbsCocRadius[0];
			}
			#elif NEIGHBORHOOD_ANALISIS_METHOD == NEIGHBORHOOD_ANALISIS_WAVE
			{
				TileMaxConsideredAbsCocRadius = WaveActiveMax(MaxConsideredAbsCocRadius);
			}
			#else
				#error Unknown neighborhood analisis method to use.
			#endif
		}

		// Determines what should be done.
		{
			// Gather at full resolution only if we know there is considered neighborhood that a COC radius big enough.
			bGatherFullRes = TileMaxConsideredAbsCocRadius > 0.125;

			// No need to gather at full res for this pixel if totally occluded by foreground.
			bGatherFullRes = bGatherFullRes && !bCanReturnForegroundOnly;
		}

		// Set up gathering parameters.
		{
			float FullResKernelRadius = CocDistanceToFullResPixelDistance(TileMaxConsideredAbsCocRadius);

			// Set the size of the kernel to exactly the max convolution that needs to be done.
			GatherParameters.KernelPixelRadius = ceil(FullResKernelRadius);

			// Increase the size of the kernel radius to avoid the gather Point sampler to create Coc step artifacts.
			GatherParameters.KernelPixelRadius += 0.5;

			float KenelArea = Pow2(FullResKernelRadius) * PI;

			float RecommendedPairCount = KenelArea * 0.5;

			// Number of pair of sample.
			GatherParameters.SamplePairCount = min(uint(CONFIG_GATHER_PAIR_COUNT), uint(RecommendedPairCount));
		}
	}
	#endif // CONFIG_SLIGHT_FOCUS_METHOD != SLIGHT_FOCUS_METHOD_DISABLED

	if (any((ViewportRect.xy + DispatchThreadId) >= ViewportRect.zw))
	{
		return;
	}

	float GatherBackgroundOpacity = ComputeInFocusOpacity(CenterSample.CocRadius);
	float4 GatherBackgroundUnpremultipliedColor = CenterSample.Color;

	#if CONFIG_SLIGHT_FOCUS_METHOD == SLIGHT_FOCUS_METHOD_DISABLED
	{
		GatherBackgroundOpacity = ComputeBackgroundSampleOpacity(CenterSample.CocRadius);
	}
	#elif CONFIG_SLIGHT_FOCUS_METHOD == SLIGHT_FOCUS_METHOD_UNIQUE_CONVOLUTIONS
	BRANCH
	if (bGatherFullRes)
	{
		// Full resolution's opacity with background.
		FFullResGatherAccumulator Accumulator = CreateFullResGatherAccumulator(GatherParameters);
		Accumulator.LayerProcessing = LAYER_PROCESSING_SLIGHT_OUT_OF_FOCUS;

		// Accumulate center sample first to reduce VGPR pressure.
		AccumulateCenterSample(Accumulator, CenterSample);

		// TODO: Adaptive number of sample.
		GatherToAccumulator(
			InputParameters, GatherParameters,
			Accumulator);

		// The full resolution gathering kernel is sampling directly the full res scene color, that is jittering
		// and potentially flickering on spec hits. To avoids issues with TAA's clamping box, we clamp this with the
		// with prefiltering scene color for temporal stability.
		//
		// TODO: should this be done in YCoCg or LCoCg?
		#if CONFIG_CLAMP_FULLRES_GATHER
		{
			float4 Min;
			float4 Max;

			float2 ClampUVBox = InputParameters.DOFBufferSize.zw;

			UNROLL
			for (uint i = 0; i < 4; i++)
			{
				float2 SampleUV = InputParameters.DOFBufferUV + (0.5 * kOffsetsCross3x3[i]) * ClampUVBox;

				if (CONFIG_CLAMP_DOF_BUFFER_UV)
				{
					SampleUV = min(SampleUV, DOFBufferUVMax);
				}

				float4 StableSampleColor = SlightOutOfFocusConvolution_SceneColor.SampleLevel(GlobalPointClampedSampler, SampleUV, 0);

				if (i == 0)
				{
					Min = Max = StableSampleColor;
				}
				else
				{
					Min = min(Min, StableSampleColor);
					Max = max(Max, StableSampleColor);
				}
			}

			// TODO: Increase constrast of limit a little to workaround to strong denoise at near-in-focus (stolen from CircleDOF).
			#if 1
			{
				float4 HD = Min;
				float Small = 0.125 * (1.0 - saturate(CenterSample.CocRadius * CenterSample.CocRadius * rcp(64.0)));
				Max += HD * Small;
				Min -= HD * Small;

				// Ensures the temporally stable opacity remains between 0-1.
				// Uses saturate() instead of min(0 and max() to be optimised as
				// saturate() MAD post modifier on GCN.
				Min.a = saturate(Min.a);
				Max.a = saturate(Max.a);
			}
			#endif

			float ClampWeight = saturate(CenterSample.CocRadius * CenterSample.CocRadius * 4.0);

			// Clamp color
			float3 ClampedColor = clamp(Accumulator.Color.rgb, Min.rgb * Accumulator.ColorWeight, Max.rgb * Accumulator.ColorWeight);

			Accumulator.Color.rgb = lerp(Accumulator.Color.rgb, ClampedColor, ClampWeight);

			// Clamp opacity.
			float ClampedOpacity = clamp(Accumulator.Opacity, Min.a * Accumulator.OpacityWeight, Max.a * Accumulator.OpacityWeight);

			Accumulator.Opacity = lerp(Accumulator.Opacity, ClampedOpacity, ClampWeight);
		}
		#endif

		// Resolve full res gather.
		ResolveAccumulator(Accumulator, GatherBackgroundUnpremultipliedColor, GatherBackgroundOpacity);
	}
	#endif

	// Compose lower res foreground with full res gather foreground.
	float4 GatherForegroundAdditiveColor = ForegroundColor;
	float GatherForegroundTranslucency = ForegroundTranslucency;

	// Sample lower res background, if necessary.
	float4 BackgroundColor = 0.0;
	float BackgroundValidity = 0.0;

	// Separate foreground hole filling, exposed mainly for debugging purposes.
	float4 HoleFillingAdditiveColor = 0;
	float HoleFillingTranslucency = 1;

	BRANCH
	if ((GatherForegroundTranslucency < OPACITY_EPSILON || bCanReturnForegroundOnly) && 0)
	{
		GatherForegroundAdditiveColor *= SafeRcp(1 - GatherForegroundTranslucency);
		GatherForegroundTranslucency = 0;
	}
	else
	{
		#if CONFIG_COMPOSITING_METHOD == COMPOSITING_METHOD_BILINEAR_BKG
		{
			BackgroundColor = BackgroundConvolution_SceneColor.SampleLevel(GlobalBilinearClampedSampler, InputParameters.DOFBufferUV, 0);

			#if CONFIG_DOF_ALPHA
				BackgroundValidity = BackgroundConvolution_SeparateAlpha.SampleLevel(GlobalBilinearClampedSampler, InputParameters.DOFBufferUV, 0).r;
			#else
				//Background = float4(0, 0, 0, 1);
				BackgroundValidity = BackgroundColor.a;
			#endif

			// Make sure the background color is always normalized, or unrendered.
			BackgroundColor *= SafeRcp(BackgroundValidity);
		}
		#endif

		// Hole fill the background in output final scene color before composing foreground on top.
		#if CONFIG_HOLE_FILLING_METHOD == HOLE_FILLING_METHOD_SEPARATE_GATHER
		{
			HoleFillingAdditiveColor = ForegroundHoleFillingConvolution_SceneColor.SampleLevel(GlobalBilinearClampedSampler, InputParameters.DOFBufferUV, 0);

			#if CONFIG_DOF_ALPHA
				HoleFillingTranslucency = ForegroundHoleFillingConvolution_SeparateAlpha.SampleLevel(GlobalBilinearClampedSampler, InputParameters.DOFBufferUV, 0).r;
			#else
				HoleFillingTranslucency = HoleFillingAdditiveColor.a;
			#endif

			float MaxTranslucency = 1 - IsConsideredForegroundSample(CenterSample.CocRadius);

			// Force the hole filling translucency to 0 if the background is unrendered.
			if (BackgroundValidity <= 0.001)
			{
				if (HoleFillingTranslucency < 1.0)
				{
					float MaxTranslucency = 1 - IsConsideredForegroundSample(CenterSample.CocRadius);

					MaxTranslucency = min(MaxTranslucency, BackgroundValidity);
					AmendAdditiveColorWithMaxTranslucency(HoleFillingAdditiveColor, HoleFillingTranslucency, MaxTranslucency);
				}
				else
				{
					GatherBackgroundOpacity = 1;
				}
			}

			BackgroundColor = BackgroundColor * HoleFillingTranslucency + HoleFillingAdditiveColor;
		}
		#else
		if (BackgroundValidity <= 0.001)
		{
			GatherBackgroundOpacity = 1;
		}
		#endif
	}

	// Compose background lower res gather and full res gather.
	float4 OutputFinalSceneColor = BackgroundColor * (1 - GatherBackgroundOpacity) + GatherBackgroundOpacity * GatherBackgroundUnpremultipliedColor;

	// Forces foreground translucency to 0 when large out of focus high res foreground, to
	// avoid background leaking.
	#if CONFIG_HOLE_FILLING_METHOD == HOLE_FILLING_METHOD_OPACITY_AMEND
	if (GatherForegroundTranslucency < 1 && BackgroundValidity != 1.0)
	{
		#if CONFIG_GATHER_PAIR_COUNT == 0
			float MaxTranslucency = saturate(MaxRecombineAbsCocRadius + CenterSample.CocRadius);
		#else
			float MaxTranslucency = 1 - ComputeForegroundSampleOpacity(CenterSample.CocRadius);
		#endif

		AmendAdditiveColorWithMaxTranslucency(GatherForegroundAdditiveColor, GatherForegroundTranslucency, MaxTranslucency);
	}
	#endif

	// Compose foreground.
	OutputFinalSceneColor = OutputFinalSceneColor * GatherForegroundTranslucency + GatherForegroundAdditiveColor;

	// Compose separate translucency.
	if (1)
	{
		NearestDepthNeighborUpsamplingResult UpsampleResult;
		if (SeparateTranslucencyUpscaling == 0)
		{
			UpsampleResult.bUsePointSampler = true;
			UpsampleResult.UV = InputParameters.SceneBufferUV;
		}
		else
		{
			float2 PixelPos = (View.ViewRectMin.xy + DispatchThreadId) + 0.5;
			UpsampleResult = NearestDepthNeighborUpsampling(
				LowResDepthTexture,
				FullResDepthTexture,
				PixelPos,
				InputParameters.SceneBufferUV,
				SeparateTranslucencyTextureLowResExtentInverse);
		}

		UpsampleResult.UV = clamp(UpsampleResult.UV, SeparateTranslucencyBilinearUVMinMax.xy, SeparateTranslucencyBilinearUVMinMax.zw);

		float4 SeparateTranslucencyColor = 0;
		float4 SeparateTranslucencyModulateColor = 0;
		if (UpsampleResult.bUsePointSampler)
		{
			SeparateTranslucencyColor = SceneSeparateTranslucency.SampleLevel(GlobalPointClampedSampler, UpsampleResult.UV, 0);
			SeparateTranslucencyModulateColor = SceneSeparateTranslucencyModulateColor.SampleLevel(GlobalPointClampedSampler, UpsampleResult.UV, 0);
		}
		else
		{
			SeparateTranslucencyColor = SceneSeparateTranslucency.SampleLevel(GlobalBilinearClampedSampler, UpsampleResult.UV, 0);
			SeparateTranslucencyModulateColor = SceneSeparateTranslucencyModulateColor.SampleLevel(GlobalBilinearClampedSampler, UpsampleResult.UV, 0);
		}
		float SeparateTranslucencyBackgroundVisibility = SeparateTranslucencyColor.a;
		float GreyScaleModulateColorBackgroundVisibility = dot(SeparateTranslucencyModulateColor.rgb, float3(1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f));

		// This matches what is done in ComposeSeparateTranslucency.usf
		OutputFinalSceneColor.rgb = OutputFinalSceneColor.rgb * SeparateTranslucencyBackgroundVisibility * SeparateTranslucencyModulateColor.rgb + SeparateTranslucencyColor.rgb;
		// Also stores BackgroundVisibility (=transmittance) in alpha
		float FinalSceneVisibility = 1.0 - OutputFinalSceneColor.a;
		OutputFinalSceneColor.a = FinalSceneVisibility * SeparateTranslucencyBackgroundVisibility * GreyScaleModulateColorBackgroundVisibility;

		// Convert from visibility to coverage to comply with the following process
		OutputFinalSceneColor.a = 1.0f - OutputFinalSceneColor.a;
	}

	// Convert alpha channel from opacity back to translucency.
	OutputFinalSceneColor.a = 1 - OutputFinalSceneColor.a;

	// Ensure that alpha values that are expected to be opaque (but are only close to opaque) are forced to be opaque.
	// (0.005 chosen to accommodate handling of 1/255)
	#if CONFIG_DOF_ALPHA
		OutputFinalSceneColor.a = select(OutputFinalSceneColor.a < 0.005, 0.0, OutputFinalSceneColor.a);
		OutputFinalSceneColor.a = select(OutputFinalSceneColor.a > 0.995, 1.0, OutputFinalSceneColor.a);
	#endif

	// Debug optimisation colors.
	#if 0
	{
		float3 DebugColor;

		if (bGatherFullRes)
		{
			// RED: Full res gather.
			Debug = float4(1.0, 0.0, 0.0, 0.0);
		}
		else if (bCanReturnForegroundOnly)
		{
			// GREEN: Foreground is the cheapest.
			Debug = float4(0.0, 1.0, 0.0, 0.0);
		}
		else
		{
			// BLUE: Fetch foreground and background.
			Debug = float4(0.0, 0.0, 1.0, 0.0);
		}
	}
	#elif 0
	{
		if (bGatherFullRes)
		{
			Debug = float4(1.0, 0.0, 0.0, 0.0);
		}
		else
		{
			Debug = float4(0.0, 1.0, 0.0, 0.0);
		}
	}
	#endif

	#if 1 // Lower VGPR footprint.
		uint2 OutputPixelPosition = InputParameters.SceneBufferUV * View.BufferSizeAndInvSize.xy;
	#else
		uint2 OutputPixelPosition = ViewportRect.xy + DispatchThreadId;
	#endif

	#if CONFIG_DOF_ALPHA
		SceneColorOutput[OutputPixelPosition] = OutputFinalSceneColor;
	#else
		SceneColorOutput[OutputPixelPosition] = float4(OutputFinalSceneColor.rgb, 0);
	#endif


	#if DEBUG_OUTPUT
	{
		DebugOutput[OutputPixelPosition] = Debug;
	}
	#endif
}