UnrealEngine/Engine/Shaders/Private/GPUFastFourierTransform.usf

// Copyright Epic Games, Inc. All Rights Reserved.

/*=============================================================================
GPUFastFourierTransform.usf: Core Fast Fourier Transform Code
=============================================================================*/

#pragma once

#include "Common.ush"  // for ReverseBits32()


// Requires the following define:
// SCAN_LINE_LENGTH

// Map SCAN_LINE_LENGTH to the RADIX and NUMTHREADSX used by the shaders below.

// Select the per-thread radix for the requested scan line length.
#if MIXED_RADIX == 1 && SCAN_LINE_LENGTH > 8
	// Mixed-radix variant: every line longer than 8 uses radix 8.
	#define RADIX 8
#elif SCAN_LINE_LENGTH == 2
	#define RADIX 2
#elif SCAN_LINE_LENGTH == 4
	#define RADIX 4
#elif SCAN_LINE_LENGTH == 8
	#define RADIX 8
#elif SCAN_LINE_LENGTH == 16
	#define RADIX 4
#elif SCAN_LINE_LENGTH == 32
	#define RADIX 2
#elif SCAN_LINE_LENGTH == 64
	#define RADIX 8
#elif SCAN_LINE_LENGTH == 128
	#define RADIX 2
#elif SCAN_LINE_LENGTH == 256
	#define RADIX 4
#elif SCAN_LINE_LENGTH == 512
	#define RADIX 8
#elif SCAN_LINE_LENGTH == 1024
	#define RADIX 4
#elif SCAN_LINE_LENGTH == 2048
	#define RADIX 2
#elif SCAN_LINE_LENGTH == 4096
	#define RADIX 8
#endif

// NB: This fallback is only needed for the non-group-shared shaders below,
// which may be compiled without a supported SCAN_LINE_LENGTH.
// Need to make that work and re-factor to fix this.
#ifndef RADIX
	#define RADIX 4
	#define SCAN_LINE_LENGTH  1024
#endif

// Threads per group, and the distance between consecutive elements owned by one thread.
#define NUMTHREADSX ( SCAN_LINE_LENGTH / RADIX )
#define STRIDE ( SCAN_LINE_LENGTH / RADIX )


// Simple utility function
float ConvertToLuma(in float3 ColorValue)
{
	// Rec 709 luma weights for the red, green and blue components.
	const float3 Rec709Weights = float3(0.2126, 0.7152, 0.0722);

	return dot(ColorValue, Rec709Weights);
}


// BrightPixelGain.x = Min Threshold: only pixels brighter than this get boosted
// BrightPixelGain.y = Max Threshold: the maximal value brightness
// BrightPixelGain.z = Multiplier applied to brightness above Min Threshold

float3 BrightPixelGain;

// Rescales the rgb of PixelValue when its luma exceeds Filter.x.
//   Filter.x = minimum luma threshold
//   Filter.y = upper bound for the resulting luma
//   Filter.z = multiplier applied to the luma above the threshold
// Returns true when PixelValue was modified.
bool FilterPixel(in float3 Filter, inout float4 PixelValue)
{
	const float Luma = ConvertToLuma(PixelValue.xyz);

	// Pixels that are not brighter than the threshold are left untouched.
	if (!(Luma > Filter.x))
	{
		return false;
	}

	// Amplify the part of the luma above the threshold, then clamp to the maximum.
	const float BoostedLuma = Filter.z * (Luma - Filter.x) + Filter.x;
	const float TargetLuma  = min(BoostedLuma, Filter.y);

	// Scale the color so that its luma becomes TargetLuma; alpha is not touched.
	PixelValue.rgb *= (TargetLuma / Luma);

	return true;
}


// The Actual FFT code written to use local registers.


#include "GPUFastFourierTransform2DCore.ush"


// Returns P unchanged for a horizontal transform, or with x and y swapped otherwise.
uint2 Coord(in uint2 P, in bool bHorizontal)
{
	if (bHorizontal)
	{
		return P;
	}

	return P.yx;
}

// Reverses the order of the lowest 'BitRange' bits of Idx.
uint BitReverse(in uint Idx, in uint BitRange)
{
	// reversebits() flips all 32 bits of the uint, which leaves the bits of
	// interest at the top; shift them back down into the low BitRange bits.
	const uint UnusedBits = 32 - BitRange;

	return reversebits(Idx) >> UnusedBits;
}

// When bScrubNaNs is set, replaces InOutValue with -min(-InOutValue, 0), i.e. clamps
// every component from below at zero.
// NOTE(review): presumably relies on min() returning the non-NaN operand to remove NaNs - confirm.
void ScrubNaNs(inout float4 InOutValue, bool bScrubNaNs)
{
	if (!bScrubNaNs)
	{
		return;
	}

	const float4 Zero = float4(0.f, 0.f, 0.f,0.f);
	InOutValue = -min(-InOutValue, Zero);
}

// Used to ensure the convolution doesn't produce energy (i.e. normalizing the kernel)
//  NB: This function relies on knowledge of the transform order and data layout to identify the 'dc' term in the fft.
void GetKernelSum(in Texture2DType KernelTexture, in bool bIsHorizontal, uint NumScanlines, inout Complex Integral[2])
{
	// The second sample sits at index (NumScanlines - 2) along the scanline axis.
	// For the non-horizontal case this is the second transform, so the first must
	// have been horizontal: assume the data layout from a 2-for-1.
	const uint  SecondIdx   = NumScanlines - 2;
	const uint2 SecondTexel = (bIsHorizontal) ? uint2(0, SecondIdx) : uint2(SecondIdx, 0);

	// Only the .x and .z components of each texel are used.
	Integral[0] = KernelTexture[uint2(0, 0)].xz;  // RB sums
	Integral[1] = KernelTexture[SecondTexel].xz;
}


#ifdef INCLUDE_GROUP_SHARED_COMPLEX_FFT

// ---------------------------------------------------------------------------------------------------------------------------------------
//				FFT Compute Shader for two channels of complex data in an image
//              and inverse.
//              And Helpers
// ---------------------------------------------------------------------------------------------------------------------------------------

// Bounds of the source window; forwarded to CopyDataSrcWindowToLocal().
uint2 SrcRectMin;
uint2 SrcRectMax;
// Destination extent; forwarded to CopyDataLocalToDstWindow().
uint2 DstExtent;
// Not referenced by GroupSharedComplexFFTCS below.
uint4 DstRect;


// CS Entry Point:
// Compute shader that performs (two) complex FFTs of the data in 'SrcTexture'
//
// uint2 TransformDef; determines the type (forward / inverse) and direction of the transform.
//       TransformType & 1 == {1,0} transforms the data in the {Horizontal , Vertical} direction.
//       TransformType & 2 == {1,0} performs a {Forward , Inverse} transform.
//
// The 'r & g' channels are transformed as one complex signal 'r + I * g'
// The 'b & a' channels are transformed as a second complex signal 'b + I * a'
//
// uint2 SrcRectMin,  SrcRectMax;
// define the subsection of 'SrcTexture' to be transformed.
//
//
// The FFT transforms a signal with a power-of-two length N.
// A "Horizontal / Vertical" transform,  will transform horizontal/vertical scanlines independently.
// The scanlines have length N = NUMTHREADSX * RADIX.
//
// The input data is composed of scanlines from a windowed region of SrcTexture (SrcRectMin/Max).
// Since SrcRectMax - SrcRectMin  is generally smaller than N, the scanlines are padded with zero.
//
// NB:   uint2 TransformSize is the target buffer size.
//       It is assumed that TransformSize.x == NUMTHREADSX * RADIX
//                          TransformSize.y == Number of thread groups

uint TransformType;
[numthreads(NUMTHREADSX, 1, 1)]
void GroupSharedComplexFFTCS(uint3 GroupID : SV_GroupID, uint3 GroupThreadID : SV_GroupThreadID )
{
	// Decode the transform flags: bit 0 selects horizontal, bit 1 selects forward.
	const bool bIsForward    = (TransformType & 0x2);
	const bool bIsHorizontal = (TransformType & 0x1);

	// The scan line comes from the group id, the position within it from the
	// (1d) thread id.
	const uint ScanIdx   = GroupID.z;
	const uint ThreadIdx = GroupThreadID.x;

	// Length of the signal being transformed.
	const uint SignalLength = SCAN_LINE_LENGTH;

	// Memory access pattern of this thread: start at Head and step by Stride.
	const uint Stride = STRIDE;
	uint Head = ThreadIdx;

	// Thread-local storage: LocalBuffer[0][] holds .xy, LocalBuffer[1][] holds .zw.
	Complex LocalBuffer[2][RADIX];

	// Load from the source window of the image buffer.
	CopyDataSrcWindowToLocal(LocalBuffer, bIsHorizontal, ScanIdx, Head, Stride, SrcRectMin, SrcRectMax);

	// Fourier transform the data.
	// This uses the group shared memory and has appropriate syncs.
	GroupSharedFFT(bIsForward, LocalBuffer, SignalLength, ThreadIdx);

	// Write the result to the destination buffer.
	CopyDataLocalToDstWindow(LocalBuffer, bIsHorizontal, ScanIdx, Head, Stride, DstExtent);
}

#endif  // #ifdef INCLUDE_GROUP_SHARED_COMPLEX_FFT


#ifdef INCLUDE_GROUP_SHARED_TWO_FOR_ONE_FFT

// ---------------------------------------------------------------------------------------------------------------------------------------
//				FFT Compute Shader for four channels of real data in an image, resulting in four 1/2 length channels of complex data
//              and inverse.
//              And needed helper functions
// ---------------------------------------------------------------------------------------------------------------------------------------


// Runs FilterPixel() with BrightPixelGain over every element held by this thread.
void ModifyInput(inout Complex LocalBuffer[2][RADIX])
{
	for (uint Idx = 0; Idx < RADIX; ++Idx)
	{
		// Re-assemble the four channel pixel: [0] carries .xy, [1] carries .zw.
		float4 Pixel = float4(LocalBuffer[0][Idx], LocalBuffer[1][Idx]);

		// Only write back when the filter actually changed the pixel.
		if (FilterPixel(BrightPixelGain, Pixel))
		{
			LocalBuffer[0][Idx] = Pixel.xy;
			LocalBuffer[1][Idx] = Pixel.zw;
		}
	}
}


// CS Entry Point:
// Compute shader that performs (four) real FFTs of the data in 'SrcTexture' using group shared
// memory.
//
// uint2 TransformType; determines the type (forward / inverse) and direction of the transform.
//       TransformType & 1 == {1,0} transforms the data in the {Horizontal , Vertical} direction.
//       TransformType & 2 == {1,0} performs a {Forward , Inverse} transform.
//
// For the Forward Transform:
//		The 'r & g' channels are transformed as one complex signal 'r + I * g'
//		The 'b & a' channels are transformed as a second complex signal 'b + I * a'
//		and then the two-for-one unpacking is applied.   so the output layout will
//      be columns of float4 = (R, B), followed by columns of float4 = (G, A)
//      where R,G,B,A are the transforms of r,g,b,a and are complex (stored as float2).
//
// The Inverse Transform:
//      Consumes two-for-one unpacked data (see above), and inverts the two-for-one transform,
//      to recover the original signal.
//
// uint2 SrcRectMin,  SrcRectMax;
// define the subsection of 'SrcTexture' to be transformed.
//
//
// The FFT transforms a signal with a power-of-two length N.
// A "Horizontal / Vertical" transform,  will transform horizontal/vertical scanlines independently.
// The scanlines have length N = NUMTHREADSX * RADIX.
//
// The input data is composed of scanlines from a windowed region of SrcTexture (SrcRectMin/Max).
// Since SrcRectMax - SrcRectMin  is generally smaller than N, the scanlines are padded with zero.
//
// NB:   uint2 TransformSize is the target buffer size.
//       It is assumed that TransformSize.x == NUMTHREADSX * RADIX
//                          TransformSize.y == Number of thread groups
uint TransformType;
uint2 SrcRectMin;
uint2 SrcRectMax;
uint4 DstRect;

// Element [0] is the per-channel scale applied to the result of the inverse transform.
StructuredBuffer<float4> DstPostFilterParameters;

[numthreads(NUMTHREADSX, 1, 1)]
void GroupSharedTwoForOneFFTCS(uint3 GroupID : SV_GroupID, uint3 GroupThreadID : SV_GroupThreadID )
{
	// TransformType bits: 0x1 = horizontal, 0x2 = forward, 0x4 = modify (boost) the input pixels.
	const bool bIsHorizontal = (TransformType & 0x1);
	const bool bIsForward    = (TransformType & 0x2);
	const bool bModifyInput  = (TransformType & 0x4);

	// Threads are defined in a 1d array.

	const uint ThreadIdx = GroupThreadID.x;

	// The (horizontal / vertical) scan line this thread works on

	const uint ScanIdx  = GroupID.z;


	// The length of the signal to be transformed

	const uint SignalLength = SCAN_LINE_LENGTH;

	// The main memory access pattern for this thread.

	uint Head = ThreadIdx;
	const uint Stride = STRIDE;

	// Thread-local memory.  Reserve two arrays since we split .xy and .zw channels

	Complex LocalBuffer[2][RADIX];


	// Load the local memory from the source texture
	// LocalBuffer[0][] holds .xy,  LocalBuffer[1][] holds .zw

	if (bIsForward)
	{
		// Pack the window as (min.xy, max.xy) for the uint4 overload of the copy.
		uint4 SrcRect;
		SrcRect.xy = SrcRectMin.xy;
		SrcRect.zw = SrcRectMax.xy;
		// Read from an image buffer

		CopyDataSrcWindowToLocal(LocalBuffer, bIsHorizontal, ScanIdx, Head, Stride, SrcRect);

		// NB: need to revisit
		// Note: this only works for colors. Any negatives or Nans will be set to zero

		ScrubNANs(LocalBuffer);

		if (bModifyInput)
		{
			// The input colors may be modified to boost the bright pixels.
			ModifyInput(LocalBuffer);
		}
	}
	else
	{
		// Read a frequency space buffer with two-for-one data layout
		ReadTwoForOneFrequencyData(bIsHorizontal, LocalBuffer, ScanIdx, Head, Stride, SignalLength);
	}

	// Fourier Transform  the data
	// This uses the group shared memory and has appropriate syncs

	GroupSharedFFT(bIsForward, LocalBuffer, SignalLength, ThreadIdx);

	// Copy data to target buffer

	if (bIsForward)
	{
		// Write a frequency space buffer with two-for-one data layout
		WriteTwoForOneFrequencyData(bIsHorizontal, LocalBuffer, ScanIdx, Head, Stride, SignalLength);
	}
	else
	{
		// Write image space data.

		// This is specialized for images, where floating point errors may have
		// resulted in (very very small) negative color values.
		ScrubNANs(LocalBuffer);

		float4 Scale = DstPostFilterParameters[0];

		// Apply the per-channel scale.  LocalBuffer[0] carries the .xy channels and
		// LocalBuffer[1] carries the .zw channels, so each half must be scaled by the
		// matching half of Scale (previously the .zw half was scaled by Scale.xw,
		// which applied the first channel's scale to the third channel).
		UNROLL
		for (uint r = 0; r < RADIX; r++)
		{
			LocalBuffer[0][r] *= Scale.xy;
			LocalBuffer[1][r] *= Scale.zw;
		}

		CopyDataLocalToDstWindow(LocalBuffer, bIsHorizontal, ScanIdx, Head, Stride, DstRect);
	}
}

#endif // #ifdef INCLUDE_GROUP_SHARED_TWO_FOR_ONE_FFT


#ifdef INCLUDE_GROUP_SHARED_CONVOLUTION_WITH_TEXTURE

// ---------------------------------------------------------------------------------------------------------------------------------------
//				FFT Compute Shader for convolution of image with a pre-transformed kernel.
//
//              This shader is really the second of three steps to perform a 2D image convolution.
//					1)  Horizontal TwoForOneFFT(RealImage) -> ComplexData0
//					2)  Vertical ConvolutionWithTexture(ComplexData0) -> ComplexData1
//					3)  Horizontal Inverse TwoForOneFFT(ComplexData1)  -> RealImage
//
//				This assumes the incoming buffer has a data layout of four 1/2-length channels of complex data
//              that resulted from 1D transform of 4 real channels (i.e. the image).
//				Also assumes that the pre-transformed kernel was computed in the same way
//              (e.g. Vertical ComplexFFT ( Horizontal TwoForOneFFT(Kernel)))
//
//
//              And needed helper functions
// ---------------------------------------------------------------------------------------------------------------------------------------

// Input SRV:
Texture2DType FilterTexture;


// Loads RADIX texels of the filter texture (here the pre-transformed convolution kernel)
// into Filter: .xy of each texel goes to Filter[0][], .zw goes to Filter[1][].
// The walk starts at 'Head' on scan line 'ScanIdx' and advances by 'Stride' along the
// transform direction.  'N' is not used.
void CopyFilterTextureToFilterBuffer(inout Complex Filter[2][RADIX], bool bIsHorizontal, in uint N, in uint Head, in uint Stride, in uint ScanIdx)
{
	// Horizontal transforms walk along x, vertical ones along y.
	uint2 Pixel      = (bIsHorizontal) ? uint2(Head, ScanIdx) : uint2(ScanIdx, Head);
	const uint2 Step = (bIsHorizontal) ? uint2(Stride, 0)     : uint2(0, Stride);

	UNROLL
	for (uint r = 0; r < RADIX; ++r)
	{
		const float4 TextureValue = FilterTexture[Pixel];
		Filter[0][r] = TextureValue.xy;
		Filter[1][r] = TextureValue.zw;

		Pixel += Step;
	}
}

// Complex-multiplies LocalBuffer by Filter, element by element.
// LocalBuffer[0] is always multiplied.  LocalBuffer[1] is left untouched only when this
// is a G/A group and bUseAlpha is false.
void ComplexMultTexture( bool bUseAlpha, bool bIsGAGroup,  in Complex Filter[2][RADIX], inout Complex LocalBuffer[2][RADIX])
{
	const bool bMultiplySecond = bUseAlpha || !bIsGAGroup;

	UNROLL for (uint r = 0; r < RADIX; ++r)
	{
		LocalBuffer[0][r] = ComplexMult(LocalBuffer[0][r], Filter[0][r]);

		if (bMultiplySecond)
		{
			LocalBuffer[1][r] = ComplexMult(LocalBuffer[1][r], Filter[1][r]);
		}
	}
}
// Convenience overload: reads this thread's filter values from FilterTexture and then
// multiplies LocalBuffer by them.
void ComplexMultTexture(bool bIsHorizontal, bool bUseAlpha, bool bIsGAGroup, in uint N, in uint Head, in uint Stride, in uint ScanIdx, inout Complex LocalBuffer[2][RADIX])
{
	Complex FilterValues[2][RADIX];
	CopyFilterTextureToFilterBuffer(FilterValues, bIsHorizontal, N, Head, Stride, ScanIdx);

	ComplexMultTexture(bUseAlpha, bIsGAGroup, FilterValues, LocalBuffer);
}


//float4 FilterTint;
// Scales every element of LocalBuffer[0] by Tint.x and of LocalBuffer[1] by Tint.y.
void ApplyTint(in Complex Tint, inout Complex LocalBuffer[2][RADIX])
{
	for (uint r = 0; r < RADIX; ++r)
	{
		LocalBuffer[0][r] *= Tint.x;
		LocalBuffer[1][r] *= Tint.y;
	}
}


// CS Entry Point:
// Compute shader that does a convolution by applying complex FFTs on the data in 'SrcTexture'
// multiplies a texture (the transform of the physical space kernel), and then inverts the transform.
//
//  Expected usage:
//           input buffer of 4 float channels.
//           1) Horizontal_Forward Two-For-One Transform
//           2) Convolution ( Vertical_Forward complex transform, complex multiply Vertical_Inverse transform)
//           3) Horizontal_Inverse Two-For-One Transform.
//
// uint2 TransformType; determines the type (forward / inverse) and direction of the first transform in the
//       convolution (cf. Vertical_Forward in step 2 above).
//
//       TransformType & 1 == {1,0} transforms the data in the {Horizontal , Vertical} direction.
//       TransformType & 2 == {1,0} performs a {Forward , Inverse} transform.
//
// NB: The kernel texture must have been the result of the same forward steps
// e.g. Horizontal_Forward two-for-one followed by Vertical_Forward complex in the above example.
//
// uint2 SrcRectMin,  SrcRectMax;
// define the subsection of 'SrcTexture' to be transformed.
//
//
// The FFT steps transform a signal with a power-of-two length N.
// A "Horizontal / Vertical" transform,  will transform horizontal/vertical scanlines independently.
// The scanlines have length N = NUMTHREADSX * RADIX.
//
// The input data is composed of scanlines from a windowed region of SrcTexture (SrcRectMin/Max).
// Since SrcRectMax - SrcRectMin  is generally smaller than N, the scanlines are padded with zero.
//
// NB:   uint2 TransformSize is the target buffer size.
//       It is assumed that TransformSize.x == NUMTHREADSX * RADIX
//                          TransformSize.y == Number of thread groups
uint TransformType;
uint2 DstExtent;
uint2 SrcRectMin;
uint2 SrcRectMax;

[numthreads(NUMTHREADSX, 1, 1)]
void GSConvolutionWithTextureCS(uint3 GroupID : SV_GroupID, uint3 GroupThreadID : SV_GroupThreadID )
{
	// Flags decoded from TransformType; these are constant across all thread groups.
	// 0x1 = horizontal, 0x2 = forward (for the first transform), 0x8 = also convolve alpha.
	const bool bIsHorizontal = (TransformType & 0x1);
	const bool bIsForward    = (TransformType & 0x2);
	const bool bUseAlpha     = (TransformType & 0x8);

	// Threads are defined in a 1d array; the group id selects the scan line.
	const uint ThreadIdx = GroupThreadID.x;
	const uint ScanIdx   = GroupID.z;

	const uint NumScanlines = (bIsHorizontal) ? DstExtent.y : DstExtent.x;

	// The two-for-one transform results in a data layout with complex coefficients
	// R G B A (representing the 1-d transform of r g b a).
	// In half of the buffer
	//       R = SrcTexture.xy, B = SrcTexture.zw
	// and in the other half
	//       G = SrcTexture.xy, A = SrcTexture.zw
	// With this layout
	// R/B = columns [0, .., NumScanlines/2 -1]
	// G/A = columns [NumScanlines/2, .., NumScanlines-1]
	//
	// This data is loaded into the LocalBuffer[2][RADIX] as
	// LocalBuffer[0] = {R | G};
	// LocalBuffer[1] = {B | A};
	//
	// The thread groups in this shader act on the columns; this flag is set when
	// the group acts on the columns of G/A.
	const bool bIsGAGroup = (2 * ScanIdx > NumScanlines - 2 );

	// The tint is currently fixed to one for every channel, and the alpha 'tint'
	// is forced to do nothing.
	const float4 FilterTint = float4(1, 1, 1, 1);
	const float2 Tint = (bIsGAGroup) ? float2(FilterTint.y, 1.f) : FilterTint.xz;

	// The length of the signal to be transformed.
	const uint SignalLength = SCAN_LINE_LENGTH;

	// The main memory access pattern for this thread.
	const uint Stride = STRIDE;
	uint Head = ThreadIdx;

	// Load the filter values this thread needs.
	Complex Filter[2][RADIX];
	CopyFilterTextureToFilterBuffer(Filter, bIsHorizontal, SignalLength, Head, Stride, ScanIdx);

	// Thread-local memory, loaded from the source texture.
	// LocalBuffer[0][] holds .xy,  LocalBuffer[1][] holds .zw
	Complex LocalBuffer[2][RADIX];
	CopyDataSrcWindowToLocal(LocalBuffer, bIsHorizontal, ScanIdx, Head, Stride, SrcRectMin, SrcRectMax);

	// Fourier transform the data.
	// This uses the group shared memory and has appropriate syncs.
	GroupSharedFFT(bIsForward, LocalBuffer, SignalLength, Head);

	// ---- Convolution in frequency space is a multiply.
	// Here we multiply against the transform of a physical space kernel, but special case the
	// thread groups that are working on Green and Alpha.
	ComplexMultTexture( bUseAlpha, bIsGAGroup,  Filter, LocalBuffer);

	// The input kernel might not have been normalized.
	// This applies the correct normalization to the local buffer.
	{
		float2 Norm[2];
		GetKernelSum(FilterTexture, bIsHorizontal, NumScanlines, Norm);

		// The selection below reads the sums as
		// redSum = Norm[0].x;  greenSum = Norm[0].y; blueSum = Norm[1].x; alphaSum = Norm[1].y
		// NOTE(review): GetKernelSum() labels Integral[0] as the 'RB sums' (the .xz of one
		// texel), which would make Norm[0] = (red, blue) and Norm[1] = (green, alpha).
		// Confirm which layout is intended before relying on per-channel normalization.
		const float NormalRorG = (bIsGAGroup) ? Norm[0].y :  Norm[0].x;
		const float NormalBorA = (bIsGAGroup) ? Norm[1].y :  Norm[1].x;

		for (uint r = 0; r < RADIX; ++r)
		{
			// This is the R or G channel
			LocalBuffer[0][r] /= NormalRorG;

			// This is the B or A channel
			LocalBuffer[1][r] /= NormalBorA;
		}
	}

	// ---- Transform back ---- //
	GroupSharedFFT(!bIsForward, LocalBuffer, SignalLength, Head);

	// Apply additional tinting to the convolution result.
	ApplyTint(Tint, LocalBuffer);

	// Copy data back to main memory (dst).
	CopyDataLocalToDstWindow(LocalBuffer, bIsHorizontal, ScanIdx, Head, Stride, DstExtent);
}

#endif  // #ifdef INCLUDE_GROUP_SHARED_CONVOLUTION_WITH_TEXTURE


// ---------------------------------------------------------------------------------------------------------------------------------------
//	     The following compute shaders are used by the multi-pass variant of the FFT.
//       The multipass is only used when the image scanlines are too large for the group shared memory
//       implementation and will be much slower than the group-shared versions.
//
//       ReorderFFTPassCS() :
//       The multipass works by first performing a "Decimation in Time" that reorders the input data.
//       This reordering is equivalent to recursively segregating elements into Odd / Even groups until the group size
//       fits in the group shared memory.
//
//       GroupSharedSubComplexFFTCS() :
//       Then a group-shared pass operates on each subgroup independently.
//
//       ComplexFFTPassCS():
//       Followed by the appropriate number of "Butterfly" passes to join the results.
//
//       PackTwoForOneFFTPassCS():
//       Should the input or result of the complex FFT need to be interpreted as real data,
//       a pass can either split or merge the data.
// ---------------------------------------------------------------------------------------------------------------------------------------

// Reads SrcTexture at Texel when Texel lies inside Window, and returns zero otherwise.
// Window.xy is the inclusive lower corner and Window.zw the exclusive upper corner.
float4 WindowRead(in uint2 Texel, in uint4 Window, in Texture2DType SrcTexture)
{
	const bool bInsideX = (Texel.x >= Window.x) && (Texel.x < Window.z);
	const bool bInsideY = (Texel.y >= Window.y) && (Texel.y < Window.w);

	if (bInsideX && bInsideY)
	{
		return SrcTexture[Texel];
	}

	// not in window
	return float4(0.f, 0.f, 0.f, 0.f);
}

#ifdef INCLUDE_REORDER_FFT_PASS

// Input SRV:
Texture2DType   SrcSRV;
// Output UAV:
RWTexture2DType DstUAV;


// Number of threads per group (numthreads x) used by ReorderFFTPassCS.
#define NUMTHREADS_PER_COL 32

// This pass re-orders the data of length 2^LogTwoLength
// into 2^BitCount disjoint regions in preparation for 2^BitCount
// independent FFTs.
// The partition is equivalent to recursively splitting into even and odd
// subregions.
// Bit Count (1) :  two groups: even, odd entries
// Bit Count (2) :  four groups: each of the two groups above is split again into its even and odd entries.
// etc.

// e.g. input f(0), f(1), f(2), f(3), f(4), f(5), f(6), f(7)
// Bit Count 1 ->  f(0), f(2), f(4), f(6) . f(1), f(3), f(5), f(7)
// Bit Count 2 ->  f(0), f(4) . f(2), f(6) . f(1), f(5) . f(3), f(7)

// Source window: .xy is the offset added to source texels and .zw bounds the window used for reads.
uint4 SrcRect;
// Destination window: only .xy (the offset added to destination texels) is used by ReorderFFTPassCS.
uint4 DstRect;
// Bit flags: 0x1 = horizontal, 0x2 = forward, 0x4 = scrub NaNs.
uint TransformType;
// Not read by ReorderFFTPassCS, which derives the length from LogTwoLength instead.
uint TransformLenght;
uint BitCount; // log(2, Pow2SubLengthCount);
uint LogTwoLength; // log(2, TransformLength);

// Reverse the last 'BitReverseCount' values and move them to the
// High bits.  Assumes BitReverse count < 32
// Treats InValue as a 'BitRange'-bit number: its lowest 'BitReverseCount' bits are
// reversed and placed in the high bits of the result, while the remaining bits are
// shifted down into the low bits.
uint PartialBitReverse(in uint InValue, in uint BitReverseCount, in uint BitRange )
{
	uint Reversed  = 0;
	uint Remaining = InValue;

	// Peel bits off the bottom of Remaining and push them onto the bottom of Reversed.
	for (uint Bit = 0; Bit < BitReverseCount; ++Bit)
	{
		Reversed  = (Reversed << 1) | (Remaining & 0x1);
		Remaining = Remaining >> 1;
	}

	// Move the reversed bits to the top of the range and keep the rest below them.
	return (Reversed << (BitRange - BitReverseCount)) | Remaining;
}

// Inverse of PartialBitReverse(): maps a re-ordered index back to the index it came from.
// Implemented by applying PartialBitReverse() between two full 'BitRange'-bit reversals.
// (The disabled 'if (false && ...)' test branch was removed: it never executed and its
// result was unconditionally overwritten.)
uint InversePartialBitReverse(in uint InValue, in uint BitReverseCount, in uint BitRange)
{
	// This could be made faster..
	uint ResultValue = BitReverse(InValue, BitRange);
	ResultValue = PartialBitReverse(ResultValue, BitReverseCount, BitRange);
	ResultValue = BitReverse(ResultValue, BitRange);

	return ResultValue;
}

[numthreads(NUMTHREADS_PER_COL, 1, 1)]
void ReorderFFTPassCS(uint3 GroupID : SV_GroupID, uint3 GroupThreadID : SV_GroupThreadID )
{
	const uint ThreadsPerGroup = NUMTHREADS_PER_COL;

	// TransformType bits: 0x1 = horizontal, 0x2 = forward, 0x4 = scrub NaNs.
	const bool bIsHorizontal = (TransformType & 0x1);
	const bool bIsForward    = (TransformType & 0x2);
	const bool bScrubNaNs    = (TransformType & 0x4);

	// Number of disjoint sub-transforms the data is split into.
	const uint NumSubLengths = 1u << BitCount;

	// When inverting the transform the correct scale is 1 / TransformLength.
	// But the group shared sub transforms will scale by NumSubLengths / TransformLength.
	// So we need to account for the additional factor here.
	const float Scale = (bIsForward) ? 1.f : 1.f / float(NumSubLengths);

	// NB: The names assume we are doing a horizontal transform, in which case thread
	// groups operate on columns of data.

	// Number of elements in the transverse direction.
	const uint NumRows = (bIsHorizontal) ? SrcRect.w - SrcRect.y : SrcRect.z - SrcRect.x;

	// Every group handles a pair of adjacent destination columns.
	const uint DstColA = 2 * GroupID.z;
	const uint DstColB = DstColA + 1;

	// Bit reverse to find the source columns that land in those destinations.
	const uint SrcColA = InversePartialBitReverse(DstColA, BitCount, LogTwoLength);
	const uint SrcColB = InversePartialBitReverse(DstColB, BitCount, LogTwoLength);

	const uint2 SrcOffset = SrcRect.xy;
	const uint2 DstOffset = DstRect.xy;

	// Loop over the rows that this thread owns.
	for (uint Row = GroupThreadID.x; Row < NumRows; Row += ThreadsPerGroup)
	{
		// Coord() swaps (column, row) into (row, column) for the vertical case.
		const uint2 SrcTexelA = SrcOffset + Coord(uint2(SrcColA, Row), bIsHorizontal);
		const uint2 SrcTexelB = SrcOffset + Coord(uint2(SrcColB, Row), bIsHorizontal);

		const uint2 DstTexelA = DstOffset + Coord(uint2(DstColA, Row), bIsHorizontal);
		const uint2 DstTexelB = DstOffset + Coord(uint2(DstColB, Row), bIsHorizontal);

		// Reads outside of the source window return zero.
		float4 SrcValueA = WindowRead(SrcTexelA, SrcRect, SrcSRV);
		ScrubNaNs(SrcValueA, bScrubNaNs);

		float4 SrcValueB = WindowRead(SrcTexelB, SrcRect, SrcSRV);
		ScrubNaNs(SrcValueB, bScrubNaNs);

		// Write out
		DstUAV[DstTexelA] = Scale * SrcValueA;
		DstUAV[DstTexelB] = Scale * SrcValueB;
	}
}

#endif //INCLUDE_REORDER_FFT_PASS


#ifdef INCLUDE_GROUP_SHARED_SUB_COMPLEX_FFT

// The FFT transforms a signal with a power-of-two length N.
// A "Horizontal / Vertical" transform,  will transform horizontal/vertical scanlines independently.
// The scanlines have length N = NUMTHREADSX * RADIX.
//
// The input data is composed of scanlines from a windowed region of SrcTexture (SrcRectMin/Max).
// Since SrcRectMax - SrcRectMin  is generally smaller than N, the scanlines are padded with zero.
//
// NB:   uint2 TransformSize is the target buffer size.
//       It is assumed that TransformSize.x == NUMTHREADSX * RADIX
//                          TransformSize.y == Number of thread groups
// Input SRV:
//Texture2DType   SrcSRV;
//RWTexture2DType DstUAV;

uint NumSubRegions;
uint TransformLength;
uint4 SrcWindow;
uint TransformType;
[numthreads(NUMTHREADSX, 1, 1)]
void GroupSharedSubComplexFFTCS(uint3 GroupID : SV_GroupID, uint3 GroupThreadID : SV_GroupThreadID )
{
	// TransformType bits: 0x1 = horizontal, 0x2 = forward.
	const bool bIsHorizontal = (TransformType & 0x1);
	const bool bIsForward    = (TransformType & 0x2);

	// Threads are defined in a 1d array; the group id selects the scan line.
	const uint ThreadIdx = GroupThreadID.x;
	const uint ScanIdx   = GroupID.z;

	// The length of each signal to be transformed.
	const uint SignalLength = SCAN_LINE_LENGTH;

	// The main memory access pattern for this thread.
	const uint Stride = STRIDE;
	uint Head = ThreadIdx;

	// Each sub region covers this many elements along the transform direction,
	// so the window advances by that amount (min and max together) per iteration.
	const uint SubLength    = TransformLength / NumSubRegions;
	const uint4 WindowStep  = (bIsHorizontal) ? uint4(SubLength, 0, SubLength, 0)
	                                          : uint4(0, SubLength, 0, SubLength);

	// Thread-local memory.  Reserve two arrays since we split .xy and .zw channels:
	// LocalBuffer[0][] holds .xy,  LocalBuffer[1][] holds .zw
	Complex LocalBuffer[2][RADIX];

	// Do 'NumSubRegions' independent (spatially disjoint) transforms that together cover the
	// entire domain.  The same window is used for the read and the write.
	uint4 Window = SrcWindow;

	for (uint SubRegionId = 0; SubRegionId < NumSubRegions; ++SubRegionId, Window += WindowStep)
	{
		FFTMemoryBarrier();

		// Read from an image buffer
		CopyDataSrcWindowToLocal(LocalBuffer, bIsHorizontal, ScanIdx, Head, Stride, Window.xy, Window.zw);

		// Fourier transform the data.
		// This uses the group shared memory and has appropriate syncs.
		// NB: bIsForward == false case applies 1/SignalLength scaling
		// for the inverse.
		GroupSharedFFT(bIsForward, LocalBuffer, SignalLength, ThreadIdx);

		// Copy data to target buffer
		CopyDataLocalToDstWindow(LocalBuffer, bIsHorizontal, ScanIdx, Head, Stride, Window);
	}
}

#endif  // #ifdef INCLUDE_GROUP_SHARED_SUB_COMPLEX_FFT


#ifdef  INCLUDE_COMPLEX_FFT_PASS
// FFT without group shared memory; does multiple passes


// Input SRV:
Texture2DType   SrcSRV;
RWTexture2DType DstUAV;


#define NUMTHREADS_PER_COL 32

uint4 SrcRect;
uint4 DstRect;
uint TransformType;
uint BitCount;  // log(2, TransformLength) + 1;
uint PowTwoLength;  // Power of two.  2^p where p is the pass number

StructuredBuffer<float4> DstPostFilterParameters;

// One radix-2 butterfly pass of the multi-pass FFT (no group-shared memory).
//
// Each thread group, indexed by GroupID.z, owns one butterfly pair of positions (A, B = A + Ns)
// along the transform axis.  The threads of the group stride over the elements in the
// transverse direction, and for each one compute
//
//     A' = (A + Twiddle * B) * Scale
//     B' = (A - Twiddle * B) * Scale
//
// The .xy and .zw channels of a texel are treated as two independent complex values.
//
// On the first pass (Ns == 1) the inputs are fetched from bit-reversed positions, and for the
// inverse transform the 1 / TransformLength normalization is folded into Scale on that pass.
[numthreads(NUMTHREADS_PER_COL, 1, 1)]
void ComplexFFTPassCS(uint3 GroupID : SV_GroupID, uint3 GroupThreadID : SV_GroupThreadID )
{
	const uint ThreadsPerGroup = NUMTHREADS_PER_COL;
	const bool bIsHorizontal = (TransformType & 0x1);
	const bool bIsForward    = (TransformType & 0x2);
	const bool bScrubNaNs    = (TransformType & 0x4);

	// This pass zips together pairs of transforms of length Ns to make new ones of length 2*Ns.
	// Ns = 2^pass.  E.g. pass 0: Ns = 1, pass 1: Ns = 2, pass 2: Ns = 4...
	const uint Ns = PowTwoLength;
	const uint TwoNs = 2 * Ns;

	// NB: The names assume a horizontal transform, in which case the thread groups operate on
	// columns of data.

	// Number of elements in the transverse direction.
	const uint NumRows = (bIsHorizontal) ? SrcRect.w - SrcRect.y : SrcRect.z - SrcRect.x;

	// All the threads in this group operate on this column.
	const uint ColNum = GroupID.z;

	// The radix-2 points written by this group.
	const uint DstColA = (ColNum / Ns) * TwoNs + ColNum % Ns;
	const uint DstColB = DstColA + Ns;

	// Twiddle factor for the 'B' input.  The sign of the angle flips for the inverse transform.
	float Angle = 6.283185307179586f * ( float(DstColA % TwoNs) / float(TwoNs) );
	if (!bIsForward)
	{
		Angle *= -1.f;
	}

	Complex Twiddle = Complex(1.f, 1.f);
	sincos(Angle, Twiddle.y, Twiddle.x);

	// The inputs come from the same positions as the outputs, except on the first pass where
	// they are read in bit-reversed order.
	uint SrcColA = DstColA;
	uint SrcColB = DstColB;
	if (Ns == 1)
	{
		SrcColA = BitReverse(DstColA, BitCount - 1);
		SrcColB = BitReverse(DstColB, BitCount - 1);
	}

	const uint TransformLength = 1u << (BitCount - 1);

	// Post-filter multiplier, with the inverse-transform normalization applied on the first pass.
	const float4 Scale = DstPostFilterParameters[0] * ((Ns == 1 && !bIsForward) ? 1.f / float(TransformLength) : 1.f);

	// Unit steps along the transform axis ("column") and the transverse axis ("row").
	// This lets a single loop serve both the horizontal and the vertical transform.
	const uint2 ColAxis = (bIsHorizontal) ? uint2(1, 0) : uint2(0, 1);
	const uint2 RowAxis = ColAxis.yx;

	// Loop-invariant part of the texel addresses.
	const uint2 SrcBaseA = SrcRect.xy + SrcColA * ColAxis;
	const uint2 SrcBaseB = SrcRect.xy + SrcColB * ColAxis;
	const uint2 DstBaseA = DstRect.xy + DstColA * ColAxis;
	const uint2 DstBaseB = DstRect.xy + DstColB * ColAxis;

	// Loop over the rows that this thread owns.
	for (uint CurRowIdx = GroupThreadID.x; CurRowIdx < NumRows; CurRowIdx += ThreadsPerGroup)
	{
		const uint2 RowOffset = CurRowIdx * RowAxis;

		// B input, rotated by the twiddle factor (both complex channels).
		float4 SrcValueB = WindowRead(SrcBaseB + RowOffset, SrcRect, SrcSRV);
		ScrubNaNs(SrcValueB, bScrubNaNs);
		SrcValueB = float4(ComplexMult(Twiddle, SrcValueB.xy), ComplexMult(Twiddle, SrcValueB.zw));

		// A input
		float4 SrcValueA = WindowRead(SrcBaseA + RowOffset, SrcRect, SrcSRV);
		ScrubNaNs(SrcValueA, bScrubNaNs);

		// Butterfly
		const float4 ResultValueA = (SrcValueA + SrcValueB) * Scale;
		const float4 ResultValueB = (SrcValueA - SrcValueB) * Scale;

		// Write out
		DstUAV[DstBaseA + RowOffset] = ResultValueA;
		DstUAV[DstBaseB + RowOffset] = ResultValueB;
	}
}
#endif  //ifdef   INCLUDE_COMPLEX_FFT_PASS

#ifdef INCLUDE_PACK_TWOFORONE_FFT_PASS

// Thread-group width for PackTwoForOneFFTPassCS: each thread strides over the
// transverse direction by this amount.
#define NUMTHREADS_PER_COL 512


// Source texture holding the coefficients to split / merge, and the target for the result.
Texture2DType   SrcSRV;
RWTexture2DType DstUAV;

// Window of the pass.  The same rectangle is used for both the reads and the writes.
uint4 DstRect;
// Bit flags: 0x1 = horizontal transform, 0x2 = forward transform.
uint TransformType;

// "Two-for-one" pass: converts between the transform of a single complex signal Z = X + iY
// and the individual transforms of the two real signals X and Y packed in it.
// The .xy and .zw channels of a texel are treated as two independent complex values.
//
// Each thread group, indexed by GroupID.z, owns one frequency K in [0, N/2 + 1) and handles
// the pair of positions (K, N - K).  The threads of the group stride over the elements in
// the transverse direction.
//
// Forward (split), for 0 < K < N/2:
//     texel K     <-  (1/2) ( Z_K + Conjugate(Z_{N-K}) )           first signal at K
//     texel N - K <- -(i/2) ( Z_{N-K} - Conjugate(Z_K) )           second signal at N - K
// For K == 0 and K == N/2 the source value splits into its real part (kept at K) and its
// imaginary part (stored in the two extra positions N and N + 1 respectively).
//
// Inverse (merge): the reverse re-arrangement, producing a single complex signal again.
[numthreads(NUMTHREADS_PER_COL, 1, 1)]
void PackTwoForOneFFTPassCS(uint3 GroupID : SV_GroupID, uint3 GroupThreadID : SV_GroupThreadID )
{
	// currently assume that the src and result have the same offset.
	const uint4 SrcRect = DstRect;

	const uint ThreadsPerGroup = NUMTHREADS_PER_COL;
	const bool bIsHorizontal   = (TransformType & 0x1);
	const bool bIsForward      = (TransformType & 0x2);

	// Number of elements in the transverse direction.
	const uint NumRows = (bIsHorizontal) ? DstRect.w - DstRect.y : DstRect.z - DstRect.x;

	// All the threads in this group operate on this "column":
	// for both forward and inverse, K is expected in [0, TransformLength / 2 + 1)
	const uint K      = GroupID.z;
	const uint RowIdx = GroupThreadID.x;

	// This thread is responsible for the elements (K, RowIdx + i * ThreadsPerGroup)

	// The forward result is two elements longer than the transform: the extra positions
	// N and N + 1 hold the values split off from K == 0 and K == N/2.
	const uint NumDstCol = (bIsHorizontal) ? DstRect.z - DstRect.x : DstRect.w - DstRect.y;
	const uint N    = (bIsForward) ? NumDstCol - 2 : NumDstCol;
	const uint Non2 = N / 2;

	// Forward case: Two Real Signals were transformed as a single complex signal, split them apart.
	if (bIsForward) // dispatch-level
	{
		if (K != 0 && K != Non2)  // group-level
		{
			// Group K also writes position N - K.  A group beyond N/2 would therefore write
			// different values into texels owned by group N - K, so it must do nothing.
			if (K > Non2) return;

			const uint NmK = N - K;

			for (uint CurRowIdx = RowIdx; CurRowIdx < NumRows; CurRowIdx += ThreadsPerGroup)
			{
				// I need values at K and N - K
				const uint2 TexelK   = Coord( uint2(K, CurRowIdx),   bIsHorizontal );
				const uint2 TexelNmK = Coord( uint2(NmK, CurRowIdx), bIsHorizontal );

				const float4 ZAtK   = SrcSRV[ SrcRect.xy + TexelK ];
				const float4 ZAtNmK = SrcSRV[ SrcRect.xy + TexelNmK ];

				{
					// First signal at K:  (1/2) ( Z_k + Conjugate(Z_{N-k}) )
					float4 FAtK = ZAtK + float4(ZAtNmK.x, -ZAtNmK.y, ZAtNmK.z, -ZAtNmK.w);
					FAtK *= float4(0.5f, 0.5f, 0.5f, 0.5f);

					DstUAV[DstRect.xy + TexelK] = FAtK;
				}

				{
					// Second signal at N-k:  -(i/2) ( Z_{N-k} - Conjugate(Z_k) )
					float4 FAtNmK = ZAtNmK - float4(ZAtK.x, -ZAtK.y, ZAtK.z, -ZAtK.w);
					FAtNmK *= -float4(0.5f, 0.5f, 0.5f, 0.5f);

					// mult by 'i'
					FAtNmK = float4(-FAtNmK.y, FAtNmK.x, -FAtNmK.w, FAtNmK.z);
					DstUAV[DstRect.xy + TexelNmK] = FAtNmK;
				}
			}
		}
		else // K == 0 || K == N/2
		{
			for (uint CurRowIdx = RowIdx; CurRowIdx < NumRows; CurRowIdx += ThreadsPerGroup)
			{
				const uint2 TexelK = Coord( uint2(K, CurRowIdx), bIsHorizontal );
				const float4 ZAtK  = SrcSRV[ SrcRect.xy + TexelK ];

				// The real parts stay at K...
				DstUAV[DstRect.xy + TexelK] = float4(ZAtK.x, 0.f, ZAtK.z, 0.f);

				// ...and the imaginary parts go to the extra positions: N for K == 0, N + 1 for K == N/2.
				const uint2 Texel2 = ( K == 0 ) ? Coord(uint2(N, CurRowIdx), bIsHorizontal) : Coord(uint2(N + 1, CurRowIdx), bIsHorizontal);

				DstUAV[DstRect.xy + Texel2] = float4(ZAtK.y, 0.f, ZAtK.w, 0.f);
			}
		}
	}
	else // Inverse case: Merge the coefficients from two real transforms into a single complex signal.
	{
		// for the inverse K is in [0, N/2 + 1)
		if (K != 0 && K != Non2)
		{
			if (K > Non2) return;

			const uint NmK = N - K;

			for (uint CurRowIdx = RowIdx; CurRowIdx < NumRows; CurRowIdx += ThreadsPerGroup)
			{
				// I need values at K and N - K
				const uint2 TexelK   = Coord(uint2(K,   CurRowIdx), bIsHorizontal);
				const uint2 TexelNmK = Coord(uint2(NmK, CurRowIdx), bIsHorizontal);

				const float4 ZAtNmK = SrcSRV[ SrcRect.xy + TexelNmK ];
				const float4 ZAtK   = SrcSRV[ SrcRect.xy + TexelK ];

				{
					// Add the value at N-k with its real and imaginary parts swapped.
					const float4 FAtK = ZAtK + float4(ZAtNmK.y, ZAtNmK.x, ZAtNmK.w, ZAtNmK.z);
					DstUAV[ DstRect.xy + TexelK ] = FAtK;
				}
				{
					// Subtract the value at k with its real and imaginary parts swapped...
					float4 FAtNmK = ZAtNmK - float4(ZAtK.y, ZAtK.x, ZAtK.w, ZAtK.z);
					// ...then mult by 'i'
					FAtNmK = float4(-FAtNmK.y, FAtNmK.x, -FAtNmK.w, FAtNmK.z);
					DstUAV[ DstRect.xy + TexelNmK ] = FAtNmK;
				}
			}
		}
		else // K == 0 || K == N/2
		{
			for (uint CurRowIdx = RowIdx; CurRowIdx < NumRows; CurRowIdx += ThreadsPerGroup)
			{
				const uint2 TexelK = Coord(uint2(K, CurRowIdx), bIsHorizontal);
				const float4 ZAtK  = SrcSRV[ SrcRect.xy + TexelK ];

				// The matching value lives in the extra positions: N for K == 0, N + 1 for K == N/2.
				const uint2 Texel2 = ( K == 0 ) ? Coord(uint2(N, CurRowIdx), bIsHorizontal) : Coord(uint2(N + 1, CurRowIdx), bIsHorizontal);

				const float4 ZAt2 = SrcSRV[ SrcRect.xy + Texel2 ];

				// Recombine: real part from K, imaginary part from the extra position.
				DstUAV[DstRect.xy + TexelK] = float4(ZAtK.x, ZAt2.x, ZAtK.z, ZAt2.z);
			}
		}
	}
}

#endif //INCLUDE_PACK_TWOFORONE_FFT_PASS

#ifdef INCLUDE_COPY_WINDOW


// Source texture to copy from, and the target to copy into.
Texture2DType   SrcSRV;
RWTexture2DType DstUAV;

// Destination window: .xy is the write offset, and writes are restricted to [.xy, .zw).
uint4 DstRect;
// Source window: .xy is the read offset.  The rect is also handed to WindowRead.
uint4 SrcRect;
// Thread-group dimensions of CopyWindowCS: each group covers a 1 x 32 tile of pixels.
#define X_THREAD_COUNT 1
#define Y_THREAD_COUNT 32
// Copies the SrcRect window of SrcSRV into the DstRect window of DstUAV, one pixel per thread.
// When BrightPixelGain describes a non-empty threshold range (Max > Min) each pixel is passed
// through FilterPixel before it is written.  Pixels that land outside of DstRect are dropped.
[numthreads(X_THREAD_COUNT, Y_THREAD_COUNT, 1)]
void CopyWindowCS(uint3 GroupID : SV_GroupID, uint3 GroupThreadID : SV_GroupThreadID )
{
	// Window-relative pixel handled by this thread: tile origin + position inside the tile.
	const uint2 TileOrigin  = uint2(GroupID.x * X_THREAD_COUNT, GroupID.y * Y_THREAD_COUNT);
	const uint2 WindowPixel = TileOrigin + GroupThreadID.xy;

	// The read is windowed by WindowRead.
	float4 PixelValue = WindowRead(SrcRect.xy + WindowPixel, SrcRect, SrcSRV);

	// The filter is only applied when the Max threshold is above the Min threshold.
	if (BrightPixelGain.y > BrightPixelGain.x)
	{
		FilterPixel(BrightPixelGain, PixelValue);
	}

	// The write is windowed here: only texels inside of [DstRect.xy, DstRect.zw) are touched.
	const uint2 DstTexel = DstRect.xy + WindowPixel;

	const bool bInsideX = (DstTexel.x >= DstRect.x) && (DstTexel.x < DstRect.z);
	const bool bInsideY = (DstTexel.y >= DstRect.y) && (DstTexel.y < DstRect.w);

	if (bInsideX && bInsideY)
	{
		DstUAV[DstTexel] = PixelValue;
	}
}

#endif //INCLUDE_COPY_WINDOW

#ifdef INCLUDE_COMPLEX_MULTIPLY_IMAGES

// Source image and kernel image (both are read at the same texel), and the target that
// receives their complex product.
Texture2DType   SrcSRV;
Texture2DType   KnlSRV;
RWTexture2DType DstUAV;

// Window processed by the pass.  Results are written to the same texels that are read.
uint4 SrcRect;
uint DataLayout;  // 1 for horizontal

// Thread-group width: each thread strides along the scan-line by this amount.
#define NUMTHREADS_PER_COL 32

// Multiplies two images as complex numbers: Dst = (Src * InvNorm) x Knl, where the .xy and .zw
// channels of each texel are treated as two independent complex values.
//
// Each thread group, indexed by GroupID.z, owns one scan-line of the SrcRect window and the
// threads of the group stride along it.  The source is divided by the kernel sum reported
// by GetKernelSum; which of the sums is used depends on the scan-line index (see below).
[numthreads(NUMTHREADS_PER_COL, 1, 1)]
void ComplexMultiplyImagesCS(uint3 GroupID : SV_GroupID, uint3 GroupThreadID : SV_GroupThreadID )
{
	const uint ThreadCount = NUMTHREADS_PER_COL;

	// Are our scan-lines horizontal?
	const bool bIsHorizontal = (DataLayout == 1);

	const uint NumScanlines   = (bIsHorizontal) ? SrcRect.w - SrcRect.y  :  SrcRect.z - SrcRect.x;
	const uint ScanlineLength = (bIsHorizontal) ? SrcRect.z - SrcRect.x  :  SrcRect.w - SrcRect.y;

	// All the threads in this group operate on this scan-line.
	const uint ScanLineIdx = GroupID.z;

	// Get the values needed for normalization.
	// redSum = Norm[0].x;  greenSum = Norm[0].y; blueSum = Norm[1].x; alphaSum = Norm[1].y
	Complex Norm[2];
	GetKernelSum(KnlSRV, bIsHorizontal, NumScanlines, Norm);

	// Scan-lines past the mid point are normalized with the green / alpha sums, the others
	// with the red / blue sums.
	// NOTE(review): the comparison is unsigned, so (NumScanlines - 2) wraps when
	// NumScanlines < 2 and the test is then never true - confirm callers never do this.
	const bool bIsGAGroup = ( (2 * ScanLineIdx) > (NumScanlines - 2) );

	// We normalize each complex channel independently: .xy by one sum, .zw by the other.
	const float InvNormXY = rcp((bIsGAGroup) ? Norm[0].y : Norm[0].x);
	const float InvNormZW = rcp((bIsGAGroup) ? Norm[1].y : Norm[1].x);
	const float4 InvNorm  = float4(InvNormXY, InvNormXY, InvNormZW, InvNormZW);

	// Unit step along the scan-line.  The swizzle gives the step across scan-lines, which
	// lets a single loop serve both layouts.
	const uint2 AlongAxis = (bIsHorizontal) ? uint2(1, 0) : uint2(0, 1);

	// First texel of this group's scan-line.
	const uint2 ScanlineOrigin = SrcRect.xy + ScanLineIdx * AlongAxis.yx;

	// This thread is responsible for the elements Loc = GroupThreadID.x + i * ThreadCount
	for (uint Loc = GroupThreadID.x; Loc < ScanlineLength; Loc += ThreadCount)
	{
		const uint2 Pixel = ScanlineOrigin + Loc * AlongAxis;

		const float4 SrcValue = SrcSRV[Pixel] * InvNorm;
		const float4 KnlValue = KnlSRV[Pixel];

		const float2 RorG = ComplexMult(SrcValue.xy, KnlValue.xy);
		const float2 BorA = ComplexMult(SrcValue.zw, KnlValue.zw);

		// Write to the same size/location window.
		DstUAV[Pixel] = float4(RorG, BorA);
	}
}

#endif // INCLUDE_COMPLEX_MULTIPLY_IMAGES