Files
UnrealEngine/Engine/Shaders/Private/GPUFastFourierTransform2DCore.ush
2025-05-18 13:04:45 +08:00

438 lines
12 KiB
HLSL

// Copyright Epic Games, Inc. All Rights Reserved.
/*=============================================================================
GPUFastFourierTransform2DCore.usf: Core 2D Fast Fourier Transform Code
=============================================================================*/
#pragma once
// This file requires the two values to be defined.
// SCAN_LINE_LENGTH must be a power of RADIX and RADIX in turn must be a power of two
// Need define: RADIX and SCAN_LINE_LENGTH
// The core FFT functionality
#include "GPUFastFourierTransformCore.ush"
// ---------------------------------------------------------------------------------------------------------------------------------------
// For the 2D FFT - Utilities used copying data between main memory and local registers where each thread of the FFT works.
// ---------------------------------------------------------------------------------------------------------------------------------------
// The Src and Dst buffers contain two types of data:
// A real 4 channel color (r,g,b,a) or two complex numbers float4.{xy, zw} = (Complex, Complex)
#define SRCTYPE float4
#define DSTTYPE float4
#define Texture2DType Texture2D<SRCTYPE>
#define RWTexture2DType RWTexture2D<DSTTYPE>
// Input SRV:
Texture2DType SrcTexture;
// Output: Real and Imaginary Parts (UAV)
RWTexture2DType DstTexture;
// Utility to replace any NaNs with zeros.
// NB: the expression used, -min(-x, 0), is a per-component max(x, 0), so negative components are flushed to zero as well.
// NOTE(review): the NaN scrubbing presumably relies on min() returning its non-NaN operand - confirm on the target shader platforms.
void ScrubNANs(inout Complex LocalBuffer[2][RADIX])
{
	const Complex Zero = Complex(0,0);

	UNROLL
	for (uint Element = 0; Element < RADIX; ++Element)
	{
		const Complex NegatedFirst = -LocalBuffer[0][Element];
		const Complex NegatedSecond = -LocalBuffer[1][Element];

		LocalBuffer[0][Element] = -min(NegatedFirst, Zero);
		LocalBuffer[1][Element] = -min(NegatedSecond, Zero);
	}
}
// Copy Data from main memory (src texture) to local
// Local element 'n' is fetched from offset (Loc + n * Stride) along the transform direction:
// along x within row ScanIdx when bIsHorizontal, otherwise along y within column ScanIdx.
// The .xy channels of each texel fill LocalBuffer[0] and the .zw channels fill LocalBuffer[1].
void CopyDataSrcToLocal(inout Complex LocalBuffer[2][RADIX], bool bIsHorizontal, uint ScanIdx, uint Loc, uint Stride )
{
	// First texel touched by this thread, and the per-element step along the scan line.
	const uint2 FirstTexel = bIsHorizontal ? uint2(Loc, ScanIdx) : uint2(ScanIdx, Loc);
	const uint2 TexelStep = bIsHorizontal ? uint2(Stride, 0) : uint2(0, Stride);

	UNROLL
	for (uint Element = 0; Element < RADIX; ++Element)
	{
		const float4 Texel = SrcTexture[FirstTexel + Element * TexelStep];
		LocalBuffer[0][Element] = Texel.xy;
		LocalBuffer[1][Element] = Texel.zw;
	}
}
// Copy Data back to main memory (dst)
// Local element 'n' is stored at offset (Loc + n * Stride) along the transform direction:
// along x within row ScanIdx when bIsHorizontal, otherwise along y within column ScanIdx.
// LocalBuffer[0] is packed into the .xy channels and LocalBuffer[1] into the .zw channels.
void CopyDataLocalToDst(in Complex LocalBuffer[2][RADIX], bool bIsHorizontal, uint ScanIdx, uint Loc, uint Stride)
{
	// First texel written by this thread, and the per-element step along the scan line.
	const uint2 FirstTexel = bIsHorizontal ? uint2(Loc, ScanIdx) : uint2(ScanIdx, Loc);
	const uint2 TexelStep = bIsHorizontal ? uint2(Stride, 0) : uint2(0, Stride);

	UNROLL
	for (uint Element = 0; Element < RADIX; ++Element)
	{
		DstTexture[FirstTexel + Element * TexelStep] = float4(LocalBuffer[0][Element], LocalBuffer[1][Element]);
	}
}
// Copy Data from main memory (src texture) to local buffer
// Loads zero values for areas outside the window.
//
// Window.xy is added to the read position on both axes. An element is fetched only while its coordinate
// along the transform direction is below the window max (Window.z when horizontal, Window.w otherwise);
// the coordinate across the transform direction is not tested.
void CopyDataSrcWindowToLocal(inout Complex LocalBuffer[2][RADIX], bool bIsHorizontal, in uint ScanIdx, uint Loc, uint Stride, uint4 Window)
{
	// No up-front clear of LocalBuffer is required: each loop below assigns every element
	// of both signals, either from the texture or with an explicit zero.
	if (bIsHorizontal)
	{
		// offset for window start
		uint2 Pixel = uint2(Loc, ScanIdx) + Window.xy;
		UNROLL
		for (uint i = 0; i < RADIX; ++i, Pixel.x += Stride)
		{
			const bool InWindow = Pixel.x < Window.z;
			if (InWindow)
			{
				const float4 SrcValue = SrcTexture[Pixel];
				LocalBuffer[0][ i ] = SrcValue.xy;
				LocalBuffer[1][ i ] = SrcValue.zw;
			}
			else
			{
				LocalBuffer[0][ i ] = float2(0.f, 0.f);
				LocalBuffer[1][ i ] = float2(0.f, 0.f);
			}
		}
	}
	else
	{
		// offset for window start
		uint2 Pixel = uint2(ScanIdx, Loc) + Window.xy;
		UNROLL
		for (uint i = 0; i < RADIX; ++i, Pixel.y += Stride)
		{
			const bool InWindow = Pixel.y < Window.w;
			if (InWindow)
			{
				const float4 SrcValue = SrcTexture[Pixel];
				LocalBuffer[0][ i ] = SrcValue.xy;
				LocalBuffer[1][ i ] = SrcValue.zw;
			}
			else
			{
				LocalBuffer[0][ i ] = float2(0.f, 0.f);
				LocalBuffer[1][ i ] = float2(0.f, 0.f);
			}
		}
	}
}
// Copy Data from main memory (src texture) to local buffer
// Loads zero values for areas outside the window.
//
// Only the component of WindowMin along the transform direction offsets the read position
// (WindowMin.x when horizontal, WindowMin.y otherwise); ScanIdx is used as given.
// An element is fetched only while its coordinate along the transform direction is below the
// matching component of WindowMax.
void CopyDataSrcWindowToLocal(inout Complex LocalBuffer[2][RADIX], bool bIsHorizontal, uint ScanIdx, uint Loc, uint Stride, uint2 WindowMin, uint2 WindowMax )
{
	// No up-front clear of LocalBuffer is required: each loop below assigns every element
	// of both signals, either from the texture or with an explicit zero.
	if (bIsHorizontal)
	{
		uint2 Pixel = uint2(Loc, ScanIdx) + uint2(WindowMin.x, 0);
		UNROLL
		for (uint i = 0; i < RADIX; ++i, Pixel.x += Stride)
		{
			const bool InWindow = Pixel.x < WindowMax.x;
			if (InWindow)
			{
				const float4 SrcValue = SrcTexture[Pixel];
				LocalBuffer[0][ i ] = SrcValue.xy;
				LocalBuffer[1][ i ] = SrcValue.zw;
			}
			else
			{
				LocalBuffer[0][ i ] = float2(0.f, 0.f);
				LocalBuffer[1][ i ] = float2(0.f, 0.f);
			}
		}
	}
	else
	{
		uint2 Pixel = uint2(ScanIdx, Loc) + uint2(0, WindowMin.y);
		UNROLL
		for (uint i = 0; i < RADIX; ++i, Pixel.y += Stride)
		{
			const bool InWindow = Pixel.y < WindowMax.y;
			if (InWindow)
			{
				const float4 SrcValue = SrcTexture[Pixel];
				LocalBuffer[0][ i ] = SrcValue.xy;
				LocalBuffer[1][ i ] = SrcValue.zw;
			}
			else
			{
				LocalBuffer[0][ i ] = float2(0.f, 0.f);
				LocalBuffer[1][ i ] = float2(0.f, 0.f);
			}
		}
	}
}
// Copy windowed Data back to main memory aligned with ROIRect.xy
// ROIRect.xy is added to the write position on both axes. Writing stops at the first element whose
// coordinate along the transform direction reaches ROIRect.z (horizontal) or ROIRect.w (vertical).
void CopyDataLocalToDstWindow(in Complex LocalBuffer[2][RADIX], bool bIsHorizontal, in uint ScanIdx, uint Loc, uint Stride, uint4 ROIRect)
{
	if (bIsHorizontal)
	{
		const uint Row = ScanIdx + ROIRect.y;
		uint Column = Loc + ROIRect.x;

		UNROLL_N(RADIX)
		for (uint Element = 0; Element < RADIX && Column < ROIRect.z; ++Element, Column += Stride)
		{
			DstTexture[uint2(Column, Row)] = float4(LocalBuffer[0][Element], LocalBuffer[1][Element]);
		}
	}
	else
	{
		const uint Column = ScanIdx + ROIRect.x;
		uint Row = Loc + ROIRect.y;

		UNROLL_N(RADIX)
		for (uint Element = 0; Element < RADIX && Row < ROIRect.w; ++Element, Row += Stride)
		{
			DstTexture[uint2(Column, Row)] = float4(LocalBuffer[0][Element], LocalBuffer[1][Element]);
		}
	}
}
// Copy windowed Data back to main aligned with 0,0
// Forwards to the uint4 overload with the region (0, 0, Extent.x, Extent.y).
void CopyDataLocalToDstWindow(in Complex LocalBuffer[2][RADIX], bool bIsHorizontal, uint ScanIdx, uint Loc, uint Stride, uint2 Extent)
{
	CopyDataLocalToDstWindow(LocalBuffer, bIsHorizontal, ScanIdx, Loc, Stride, uint4(0, 0, Extent.x, Extent.y));
}
// ---------------------------------------------------------------------------------------------------------------------------------------
// For the 2D FFT - Specialized Utilities that manage the main memory layout of "two-for-one" results.
// when transforming 4 channels (r.g.b.a) as two complex signals (r+ig, b+ia)
// and unpacking the 4 resulting 1/2 length complex transforms R, G, B, A
// ---------------------------------------------------------------------------------------------------------------------------------------
// Writes the (thread local) data from a 2-for-1 transform (transform of real f and g packed as f+i*g) to main memory, by first splitting F and G
// where 'F' is the transform of 'f' and 'G' is the transform of 'g.'
//
// NB: This requires the buffer be of length SignalLength + 2.
// NB: F_o, G_o, F_N/2 and G_N/2 will be real.. All other coefficients are complex
//
// here N = SignalLength. The resulting data layout will be
//
// float2 value: {F_o, 0}, {F_1}, {F_2},..,{F_{N/2-1}}, {F_N/2, 0}, {G_{N/2 +1}}, {G_{N/2 +2}} . .{ G_{N-2}}, {G_{N-1}}, {G_o, 0}, {G_N/2,0}
// offset: 0 , 1 , 2 ,..., N/2-1, N/2, N/2 +1, N/2 +2, .. N-2, N-1, N, N + 1
//
void WriteTwoForOneFrequencyData(in bool bIsHorizontal, inout Complex LocalBuffer[2][RADIX], uint ScanIdx, uint Loc, uint Stride, uint N)
{
	FFTMemoryBarrier();

	// Decompose the transforms. The '0' and 'N/2' offsets are still mixed afterwards and are handled explicitly below.
	SplitTwoForOne(LocalBuffer, Loc, Stride, N);

	const uint HalfN = N / 2;

	if (bIsHorizontal)
	{
		UNROLL
		for (uint r = 0; r < RADIX; ++r)
		{
			const uint2 Texel = uint2(Loc + r * Stride, ScanIdx);
			const float4 Packed = float4(LocalBuffer[ 0 ][ r ], LocalBuffer[ 1 ][ r ]);

			DstTexture[Texel] = Packed;

			// The N/2 element holds F_N/2 + I G_N/2
			// Keep F_N/2 in this column, and move G_N/2 to the last column (offset N + 1)
			if (Texel.x == HalfN)
			{
				DstTexture[Texel] = float4(Packed.x, 0.f, Packed.z, 0.f);
				DstTexture[uint2(N + 1, ScanIdx)] = float4(Packed.y, 0.f, Packed.w, 0.f);
			}
		}

		// First element holds F_o + iG_o.
		// Keep F_o in column 0 and move G_o to the second to last column (offset N).
		if (Loc == 0)
		{
			const float4 DcPacked = float4(LocalBuffer[ 0 ][ 0 ], LocalBuffer[ 1 ][ 0 ]);
			DstTexture[uint2(0, ScanIdx)] = float4(DcPacked.x, 0.f, DcPacked.z, 0.f); // F_o
			DstTexture[uint2(N, ScanIdx)] = float4(DcPacked.y, 0.f, DcPacked.w, 0.f); // G_o
		}
	}
	else
	{
		UNROLL
		for (uint r = 0; r < RADIX; ++r)
		{
			const uint2 Texel = uint2(ScanIdx, Loc + r * Stride);
			const float4 Packed = float4(LocalBuffer[ 0 ][ r ], LocalBuffer[ 1 ][ r ]);

			DstTexture[Texel] = Packed;

			// The N/2 element holds F_N/2 + I G_N/2
			// Keep F_N/2 in this row, and move G_N/2 to the last row (offset N + 1)
			if (Texel.y == HalfN)
			{
				DstTexture[Texel] = float4(Packed.x, 0.f, Packed.z, 0.f);
				DstTexture[uint2(ScanIdx, N + 1)] = float4(Packed.y, 0.f, Packed.w, 0.f);
			}
		}

		// First element holds F_o + iG_o.
		// Keep F_o in row 0 and move G_o to the second to last row (offset N).
		if (Loc == 0)
		{
			const float4 DcPacked = float4(LocalBuffer[ 0 ][ 0 ], LocalBuffer[ 1 ][ 0 ]);
			DstTexture[uint2(ScanIdx, 0)] = float4(DcPacked.x, 0.f, DcPacked.z, 0.f); // F_o
			DstTexture[uint2(ScanIdx, N)] = float4(DcPacked.y, 0.f, DcPacked.w, 0.f); // G_o
		}
	}
}
// The inverse of WriteTwoForOneFrequencyData()
// Reads into local registers, data written in the TwoForOneFrequency layout back into the form consistent with the transform of a single complex signal.
//
// bIsHorizontal - the scan line runs along x (row ScanIdx) when true, along y (column ScanIdx) otherwise.
// LocalBuffer   - receives the data; [0] from the .xy channels and [1] from the .zw channels of each texel.
// ScanIdx       - index of the scan line being read.
// Loc, Stride   - local element 'n' is read from offset Loc + n * Stride along the scan line.
// N             - signal length; the two extra entries at offsets N and N + 1 are read as well.
void ReadTwoForOneFrequencyData(bool bIsHorizontal, inout Complex LocalBuffer[2][RADIX], in uint ScanIdx, in uint Loc, in uint Stride, in uint N)
{
	const bool bIsFirstElement = (Loc == 0);
	const uint Non2 = N / 2;

	if (bIsHorizontal)
	{
		// last two values: WriteTwoForOneFrequencyData() stores these as (#,0,#,0)
		const float4 NValue = SrcTexture[uint2(N, ScanIdx)];
		const float4 NppValue = SrcTexture[uint2(N + 1, ScanIdx)];

		uint2 Pixel = uint2(Loc, ScanIdx);
		UNROLL
		for (uint i = 0; i < RADIX; ++i, Pixel.x += Stride)
		{
			const float4 SrcValue = SrcTexture[Pixel];
			LocalBuffer[ 0 ][ i ] = SrcValue.xy;
			LocalBuffer[ 1 ][ i ] = SrcValue.zw;

			if (Pixel.x == Non2)
			{
				// local buffer will be pure real with F_N/2, need to add I * G_N/2 (G_N/2 is real ie float2(G_r, 0)).
				// The .yx / .wz swizzles move the stored real value into the imaginary slot.
				LocalBuffer[ 0 ][ i ] += NppValue.yx;
				LocalBuffer[ 1 ][ i ] += NppValue.wz;
			}
		}

		if (bIsFirstElement)
		{
			// Element 0 holds F_o only; add I * G_o from the entry at offset N.
			LocalBuffer[ 0 ][ 0 ] += NValue.yx;
			LocalBuffer[ 1 ][ 0 ] += NValue.wz;
		}
	}
	else
	{
		// last two values: WriteTwoForOneFrequencyData() stores these as (#,0,#,0)
		const float4 NValue = SrcTexture[uint2(ScanIdx, N)];
		const float4 NppValue = SrcTexture[uint2(ScanIdx, N + 1)];

		uint2 Pixel = uint2(ScanIdx, Loc);
		UNROLL
		for (uint i = 0; i < RADIX; ++i, Pixel.y += Stride)
		{
			const float4 SrcValue = SrcTexture[Pixel];
			LocalBuffer[ 0 ][ i ] = SrcValue.xy;
			LocalBuffer[ 1 ][ i ] = SrcValue.zw;

			if (Pixel.y == Non2)
			{
				// local buffer will be pure real with F_N/2, need to add I * G_N/2
				LocalBuffer[ 0 ][ i ] += NppValue.yx;
				LocalBuffer[ 1 ][ i ] += NppValue.wz;
			}
		}

		if (bIsFirstElement)
		{
			// Element 0 holds F_o only; add I * G_o from the entry at offset N.
			LocalBuffer[ 0 ][ 0 ] += NValue.yx;
			LocalBuffer[ 1 ][ 0 ] += NValue.wz;
		}
	}

	// Combine the transforms of the two real signals (F,G) as Z = F + I G
	MergeTwoForOne(LocalBuffer, Loc, Stride, N);

	// Done with the group shared memory that was used in the merge
	FFTMemoryBarrier();
}