Files
UnrealEngine/Engine/Source/Runtime/SignalProcessing/Private/VectorFFT.cpp
2025-05-18 13:04:45 +08:00

1266 lines
53 KiB
C++

// Copyright Epic Games, Inc. All Rights Reserved.
#include "VectorFFT.h"
#include "SignalProcessingModule.h"
#include "Templates/UniquePtr.h"
#include "DSP/FFTAlgorithm.h"
#include "DSP/FloatArrayMath.h"
namespace Audio
{
// Implementation of a complex FFT.
class FVectorComplexFFT
{
public:
// Minimum size of fft is required to support 2 radix-4 fft stages.
static const int32 MinLog2FFTSize = 4;
// Maximum size of fft is set to avoid exceedingly large allocs and support
// various logic for indexing blocks internally.
static const int32 MaxLog2FFTSize = 16;
// Constructor
//
// @param InLog2FFTSize - Log2 size of the FFT.
FVectorComplexFFT(int32 InLog2FFTSize)
: Log2FFTSize(InLog2FFTSize)
, FFTSize(0)
, NumFloats(0)
{
check(Log2FFTSize >= MinLog2FFTSize);
check(Log2FFTSize <= MaxLog2FFTSize);
FFTSize = 1 << Log2FFTSize;
NumFloats = 2 * FFTSize; // Takes 2 floats to represent a complex number
InverseWorkBuffer.AddUninitialized(NumFloats);
// Pregenerate weights needed to calculate this size of FFT.
GenerateRadix4Weights();
GenerateFinalIndices();
GenerateFinalWeights();
}
~FVectorComplexFFT()
{
}
// Perform forward complex FFT
//
// @param InComplex - Interleaved complex data with (2 * FFTSize) num floats.
// @param OutComplex - Interleaved complex data with (2 * FFTSize) num floats.
void ForwardComplexToComplex(const float* RESTRICT InComplex, float* RESTRICT OutComplex)
{
// To perform FFT, must complete Log2FFTSize stages. Each radix pass performs 2^m stages
// where 2^m is the radix number. So a radix-4 stage is radix-2^m or radix-2^2. Hence radix
// 4 performs two stages. Radix-8 is Radix-2^3, so it performs 3 stages.
int32 CompletedStages = 0;
if (Log2FFTSize & 1)
{
// If we have an odd number of stages, start with a radix-8 to
// perform first 3 stages.
Radix8ButterflyConstantWeight(InComplex, OutComplex, Log2FFTSize);
CompletedStages = 3;
}
else
{
// Ifawe have an even number of stages, start with a radix-4 to
// perform first 2 stages.
Radix4ButterflyConstantWeight(InComplex, OutComplex, Log2FFTSize);
CompletedStages = 2;
}
// Fit in a few more constant weight radix4s if possible. This routine is faster
// than the default Radix4Butterfly because it does not need weights.
for (int32 StageIndex = CompletedStages; StageIndex < Log2FFTSize - 4 ; StageIndex += 2 )
{
Radix4ButterflyConstantWeight(OutComplex, OutComplex, Log2FFTSize - StageIndex);
}
// Perform a bunch of radix4 ffts with varying weights
// Logically you would arrange this loop to first iterate over stage indices, and then over
// butterfly indices, but by reorganizing the loop the code is more cache coherent.
for (int32 ButterflyIndex = 1 ; CompletedStages < (Log2FFTSize - 4) ; CompletedStages += 2)
{
for ( ; ButterflyIndex < (1 << CompletedStages); ++ButterflyIndex )
{
for (int32 StageIndex = CompletedStages; StageIndex < Log2FFTSize - 4 ; StageIndex += 2 )
{
Radix4Butterfly(OutComplex, ButterflyIndex, Log2FFTSize - StageIndex, Radix4Weights[ButterflyIndex]);
}
}
}
if (CompletedStages < (Log2FFTSize - 2))
{
// Special case for 2nd to last stage of fft which has better cache coherency
Radix4Butterfly2ndToFinal(OutComplex, Log2FFTSize - 4);
}
// Special case for last stage of fft with cache coherency tricks and index reversal built in.
Radix4ButterflyFinal(OutComplex, Log2FFTSize - 2);
}
void InverseComplexToComplex(const float* RESTRICT InComplex, float* RESTRICT OutComplex)
{
// Perform inverse FFT by complex conjugating the input and output.
float* WorkData = InverseWorkBuffer.GetData();
ScaledComplexConjugate(InComplex, 1.f, WorkData, NumFloats);
ForwardComplexToComplex(WorkData, OutComplex);
const float Scale = 1.f / static_cast<float>(FFTSize);
ScaledComplexConjugate(OutComplex, Scale, OutComplex, NumFloats);
}
private:
// Weight structure for general radix 4 pass.
struct FRadix4Weight
{
alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W1R[4];
alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W1RNeg[4]; // Negative version of W1R.
// These have specialized sign flips for calculations of D2 and D3
alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W1RD2[4];
alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W1RD3[4];
alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W1I[4];
alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W2R[4];
alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W2RNeg[4]; // Negative Version of W2R
alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W2I[4];
alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W3R[4];
alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W3RNeg[4]; // Negative Version of W4R
alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W3I[4];
};
// Structure to hold loaded inputs on final radix 4 pass
struct FFinalInputs
{
VectorRegister4Float A0;
VectorRegister4Float A1;
VectorRegister4Float A2;
VectorRegister4Float A3;
VectorRegister4Float A4;
VectorRegister4Float A5;
VectorRegister4Float A6;
VectorRegister4Float A7;
};
// Structure to hold loaded outputs on final radix 4 pass
struct FFinalOutputs
{
VectorRegister4Float D0;
VectorRegister4Float D1;
VectorRegister4Float D2;
VectorRegister4Float D3;
VectorRegister4Float D4;
VectorRegister4Float D5;
VectorRegister4Float D6;
VectorRegister4Float D7;
};
// Weight structure for final Radix4 pass
struct FFinalWeights
{
alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W1R[4];
alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W1I[4];
alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W2R[4];
alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W2I[4];
alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W3R[4];
alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W3I[4];
alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W4R[4];
alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W4I[4];
alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W5R[4];
alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W5I[4];
alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W6R[4];
alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W6I[4];
alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W7R[4];
alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W7I[4];
alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W2RNeg[4]; // Negative version of W2R
alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W2RD4[4]; // Special case for calculating D4
alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W2RD6[4]; // Special case for calculating D5
alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W3RD5[4]; // Special case for calculating D6
alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W3RD7[4]; // Special case for calculating D7
alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W3RNeg[4]; // Negative version of W3R
alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W4RNeg[4]; // Negative version of W4R
alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W5RNeg[4]; // Negative version of W5R
alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W6RNeg[4]; // Negative version of W6R
alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W7RNeg[4]; // Negative version of W7R
};
// Radix4 index locations for performing final Radix4 passes and bit reversal
// without overwriting needed data.
struct FFinalIndices
{
int32 ReadIndex;
int32 WriteIndex;
};
// Perform complex conjugate as well as scale.
//
// @param InValues - Array of floats representing complex values in interleave format.
// @param Scale - Scale to apply.
// @param OutValues - Array of floats representing complex values in interleave format.
// @param Num - Number of floats in array (NOT number of complex values).
void ScaledComplexConjugate(const float* InValues, float Scale, float* OutValues, int32 Num)
{
ArrayScaledComplexConjugate(InValues, Num, OutValues, Scale);
}
// Perform a radix-4 butterfly which uses constant weights.
void Radix4ButterflyConstantWeight(const float* InValues, float* OutValues, int32 InStageIndex)
{
// This routine only supported when stage index is greater than or equal to 3.
// This comes about for two reasons,
// 1. A Radix-4 processes two stages. (m = StageCount, 2^m = 4).
// 2. SIMD optimizations processes two butterflies in parallel, so must have an
// even number of butterflies. This sets the mimum stage index to 3, which will
// result in minimally 2 radix-4 butterflies being calculated.
check(InStageIndex >= 3);
// Calculate number of constant weight butterflies in this stage.
const int NumButterflies = 1 << (InStageIndex - 2);
const VectorRegister4Float SignFlipImag = MakeVectorRegisterFloat(1.f, -1.f, 1.f, -1.f);
const int32 Offset0 = 0;
const int32 Offset1 = 2 * NumButterflies;
const int32 Offset2 = 4 * NumButterflies;
const int32 Offset3 = 6 * NumButterflies;
for (int32 i = 0; i < NumButterflies; i += 2)
{
const int32 Pos = 2 * i;
const int32 Pos0 = Offset0 + Pos;
const int32 Pos1 = Offset1 + Pos;
const int32 Pos2 = Offset2 + Pos;
const int32 Pos3 = Offset3 + Pos;
VectorRegister4Float A0 = VectorLoad(&InValues[Pos0]);
VectorRegister4Float A1 = VectorLoad(&InValues[Pos1]);
VectorRegister4Float A2 = VectorLoad(&InValues[Pos2]);
VectorRegister4Float A3 = VectorLoad(&InValues[Pos3]);
VectorRegister4Float C0 = VectorAdd(A0, A2);
VectorRegister4Float C2 = VectorSubtract(A0, A2);
VectorRegister4Float C1 = VectorAdd(A1, A3);
VectorRegister4Float C3 = VectorSubtract(A1, A3);
VectorRegister4Float C3Conj = VectorMultiply(C3, SignFlipImag);
VectorRegister4Float C3ConjSwizzle = VectorSwizzle(C3Conj, 1, 0, 3, 2);
VectorRegister4Float D0 = VectorAdd(C1, C0);
VectorRegister4Float D1 = VectorSubtract(C0, C1);
VectorRegister4Float D2 = VectorAdd(C2, C3ConjSwizzle);
VectorRegister4Float D3 = VectorSubtract(C2, C3ConjSwizzle);
VectorStore(D0, &OutValues[Pos0]);
VectorStore(D1, &OutValues[Pos1]);
VectorStore(D2, &OutValues[Pos2]);
VectorStore(D3, &OutValues[Pos3]);
}
}
// Perform a radix-8 butterfly which uses constant weights.
void Radix8ButterflyConstantWeight(const float* InValues, float* OutValues, int32 InStageIndex)
{
const float Sqrt2D2 = .7071067811865475244f;
// This routine only supported when stage index is greater than or equal to four.
// This comes about for two reasons,
// 1. A Radix-8 processes three stages. (m = StageCount, 2^m = 8).
// 2. SIMD optimizations processes two butterflies in parallel, so must have an
// even number of butterflies. This sets the mimum stage index to 4, which will
// result in minimally 2 radix-8 butterflies being calculated.
check(InStageIndex >= 4);
// Calculate number of constant weight butterflies in this stage.
const int32 NumButterflies = 1 << (InStageIndex - 3);
const VectorRegister4Float SignFlipImag = MakeVectorRegisterFloat(1.f, -1.f, 1.f, -1.f);
const VectorRegister4Float VectorSqrt2D2 = MakeVectorRegisterFloat(Sqrt2D2, Sqrt2D2, Sqrt2D2, Sqrt2D2);
const VectorRegister4Float VectorNegSqrt2D2 = MakeVectorRegisterFloat(-Sqrt2D2, -Sqrt2D2, -Sqrt2D2, -Sqrt2D2);
const int32 Offset0 = 0;
const int32 Offset1 = 2 * NumButterflies;
const int32 Offset2 = 4 * NumButterflies;
const int32 Offset3 = 6 * NumButterflies;
const int32 Offset4 = 8 * NumButterflies;
const int32 Offset5 = 10 * NumButterflies;
const int32 Offset6 = 12 * NumButterflies;
const int32 Offset7 = 14 * NumButterflies;
for (int32 i = 0; i < NumButterflies; i += 2)
{
const int32 Pos = 2 * i;
const int32 Pos0 = Pos;
const int32 Pos1 = Offset1 + Pos;
const int32 Pos2 = Offset2 + Pos;
const int32 Pos3 = Offset3 + Pos;
const int32 Pos4 = Offset4 + Pos;
const int32 Pos5 = Offset5 + Pos;
const int32 Pos6 = Offset6 + Pos;
const int32 Pos7 = Offset7 + Pos;
VectorRegister4Float A0 = VectorLoad(&InValues[Pos0]);
VectorRegister4Float A1 = VectorLoad(&InValues[Pos1]);
VectorRegister4Float A2 = VectorLoad(&InValues[Pos2]);
VectorRegister4Float A3 = VectorLoad(&InValues[Pos3]);
VectorRegister4Float A4 = VectorLoad(&InValues[Pos4]);
VectorRegister4Float A5 = VectorLoad(&InValues[Pos5]);
VectorRegister4Float A6 = VectorLoad(&InValues[Pos6]);
VectorRegister4Float A7 = VectorLoad(&InValues[Pos7]);
VectorRegister4Float B0 = VectorAdd(A0, A4);
VectorRegister4Float B1 = VectorAdd(A1, A5);
VectorRegister4Float B2 = VectorAdd(A2, A6);
VectorRegister4Float B3 = VectorAdd(A3, A7);
VectorRegister4Float B4 = VectorSubtract(A0, A4);
VectorRegister4Float B5 = VectorSubtract(A1, A5);
VectorRegister4Float B6 = VectorSubtract(A2, A6);
VectorRegister4Float B7 = VectorSubtract(A3, A7);
VectorRegister4Float B6Conj = VectorMultiply(SignFlipImag, B6);
VectorRegister4Float B6ConjSwizzle = VectorSwizzle(B6Conj, 1, 0, 3, 2);
VectorRegister4Float B7Conj = VectorMultiply(SignFlipImag, B7);
VectorRegister4Float B7ConjSwizzle = VectorSwizzle(B7Conj, 1, 0, 3, 2);
VectorRegister4Float C0 = VectorAdd(B0, B2);
VectorRegister4Float C1 = VectorAdd(B1, B3);
VectorRegister4Float C2 = VectorSubtract(B0, B2);
VectorRegister4Float C3 = VectorSubtract(B1, B3);
VectorRegister4Float C4 = VectorAdd(B4, B6ConjSwizzle);
VectorRegister4Float C5 = VectorAdd(B5, B7ConjSwizzle);
VectorRegister4Float C6 = VectorSubtract(B4, B6ConjSwizzle);
VectorRegister4Float C7 = VectorSubtract(B5, B7ConjSwizzle);
VectorRegister4Float C3Conj = VectorMultiply(SignFlipImag, C3);
VectorRegister4Float C3ConjSwizzle = VectorSwizzle(C3Conj, 1, 0, 3, 2);
VectorRegister4Float C5Conj = VectorMultiply(SignFlipImag, C5);
VectorRegister4Float C5ConjSwizzle = VectorSwizzle(C5Conj, 1, 0, 3, 2);
VectorRegister4Float T5 = VectorAdd(C5, C5ConjSwizzle);
VectorRegister4Float C7Swizzle = VectorSwizzle(C7, 1, 0, 3, 2);
VectorRegister4Float T7 = VectorMultiplyAdd(SignFlipImag, C7, C7Swizzle);
VectorRegister4Float T7Conj = VectorMultiply(T7, SignFlipImag);
VectorRegister4Float D0 = VectorAdd(C0, C1);
VectorRegister4Float D1 = VectorSubtract(C0, C1);
VectorRegister4Float D2 = VectorAdd(C2, C3ConjSwizzle);
VectorRegister4Float D3 = VectorSubtract(C2, C3ConjSwizzle);
VectorRegister4Float D4 = VectorMultiplyAdd(T5, VectorSqrt2D2, C4);
VectorRegister4Float D5 = VectorMultiplyAdd(T5, VectorNegSqrt2D2, C4);
VectorRegister4Float D6 = VectorMultiplyAdd(VectorNegSqrt2D2, T7Conj, C6);
VectorRegister4Float D7 = VectorMultiplyAdd(VectorSqrt2D2, T7Conj, C6);
VectorStore(D0, &OutValues[Pos0]);
VectorStore(D1, &OutValues[Pos1]);
VectorStore(D2, &OutValues[Pos2]);
VectorStore(D3, &OutValues[Pos3]);
VectorStore(D4, &OutValues[Pos4]);
VectorStore(D5, &OutValues[Pos5]);
VectorStore(D6, &OutValues[Pos6]);
VectorStore(D7, &OutValues[Pos7]);
}
}
// Perform a radix4 butterfly with dynamic weights.
void Radix4Butterfly(float* InOutValues, int32 ButterflyIndex, int32 InStageIndex, const FRadix4Weight& Weights)
{
// Number of values between butterflies.
const int32 Stride = 1 << InStageIndex;
const int32 NumButterflies = 1 << (InStageIndex - 2);
// Load weights for butterfly
const VectorRegister4Float Weight1Real = VectorLoad(Weights.W1R);
const VectorRegister4Float Weight1Imag = VectorLoad(Weights.W1I);
const VectorRegister4Float Weight2Real = VectorLoad(Weights.W2R);
const VectorRegister4Float Weight2Imag = VectorLoad(Weights.W2I);
const VectorRegister4Float Weight3Real = VectorLoad(Weights.W3R);
const VectorRegister4Float Weight3Imag = VectorLoad(Weights.W3I);
const VectorRegister4Float Weight3RealNeg = VectorLoad(Weights.W3RNeg);
const VectorRegister4Float Weight2RealNeg = VectorLoad(Weights.W2RNeg);
const VectorRegister4Float Weight1RealNeg = VectorLoad(Weights.W1RNeg);
const VectorRegister4Float Weight1RealD2 = VectorLoad(Weights.W1RD2);
const VectorRegister4Float Weight1RealD3 = VectorLoad(Weights.W1RD3);
// Perform butterflies.
for (int32 i = 0; i < NumButterflies; i += 2)
{
const int32 Pos0 = 2 * (Stride * ButterflyIndex + i);
const int32 Pos1 = 2 * (Stride * ButterflyIndex + 1 * NumButterflies + i);
const int32 Pos2 = 2 * (Stride * ButterflyIndex + 2 * NumButterflies + i);
const int32 Pos3 = 2 * (Stride * ButterflyIndex + 3 * NumButterflies + i);
VectorRegister4Float A0 = VectorLoad(&InOutValues[Pos0]);
VectorRegister4Float A1 = VectorLoad(&InOutValues[Pos1]);
VectorRegister4Float A2 = VectorLoad(&InOutValues[Pos2]);
VectorRegister4Float A3 = VectorLoad(&InOutValues[Pos3]);
VectorRegister4Float A1Swizzle = VectorSwizzle(A1, 1, 0, 3, 2);
VectorRegister4Float A2Swizzle = VectorSwizzle(A2, 1, 0, 3, 2);
VectorRegister4Float A3Swizzle = VectorSwizzle(A3, 1, 0, 3, 2);
VectorRegister4Float B1 = VectorMultiplyAdd(A1Swizzle, Weight1Imag, A1);
VectorRegister4Float B2 = VectorMultiplyAdd(A2Swizzle, Weight2Imag, A2);
VectorRegister4Float B3 = VectorMultiplyAdd(A3Swizzle, Weight3Imag, A3);
VectorRegister4Float C0 = VectorMultiplyAdd(B2, Weight2Real, A0);
VectorRegister4Float C2 = VectorMultiplyAdd(B2, Weight2RealNeg, A0);
VectorRegister4Float C1 = VectorMultiplyAdd(B3, Weight3Real, B1);
VectorRegister4Float C3 = VectorMultiplyAdd(B3, Weight3RealNeg, B1);
VectorRegister4Float C3Swizzle = VectorSwizzle(C3, 1, 0, 3, 2);
VectorRegister4Float D0 = VectorMultiplyAdd(C1, Weight1Real, C0);
VectorRegister4Float D1 = VectorMultiplyAdd(C1, Weight1RealNeg, C0);
VectorRegister4Float D2 = VectorMultiplyAdd(C3Swizzle, Weight1RealD2, C2);
VectorRegister4Float D3 = VectorMultiplyAdd(C3Swizzle, Weight1RealD3, C2);
VectorStore(D0, &InOutValues[Pos0]);
VectorStore(D1, &InOutValues[Pos1]);
VectorStore(D2, &InOutValues[Pos2]);
VectorStore(D3, &InOutValues[Pos3]);
}
}
// Special case of 2nd to last radix4 which has to load new weights for
// each iteration.
void Radix4Butterfly2ndToFinal(float* InOutValues, int32 StageIndex)
{
int32 NumPasses = 1 << StageIndex;
// Elements between passes.
const int32 Stride = 16;
for (int32 i = 0; i < NumPasses; ++i)
{
// Load values for current weight.
const FRadix4Weight& Weights = Radix4Weights[i];
const VectorRegister4Float Weight1Real = VectorLoad(Weights.W1R);
const VectorRegister4Float Weight1Imag = VectorLoad(Weights.W1I);
const VectorRegister4Float Weight2Real = VectorLoad(Weights.W2R);
const VectorRegister4Float Weight2Imag = VectorLoad(Weights.W2I);
const VectorRegister4Float Weight3Real = VectorLoad(Weights.W3R);
const VectorRegister4Float Weight3Imag = VectorLoad(Weights.W3I);
const VectorRegister4Float Weight3RealNeg = VectorLoad(Weights.W3RNeg);
const VectorRegister4Float Weight2RealNeg = VectorLoad(Weights.W2RNeg);
const VectorRegister4Float Weight1RealNeg = VectorLoad(Weights.W1RNeg);
const VectorRegister4Float Weight1RealD2 = VectorLoad(Weights.W1RD2);
const VectorRegister4Float Weight1RealD3 = VectorLoad(Weights.W1RD3);
for (int32 j = 0; j < 4 ; j += 2)
{
const int32 Pos0 = 2 * (Stride * i + j);
const int32 Pos1 = 2 * (Stride * i + 4 + j);
const int32 Pos2 = 2 * (Stride * i + 8 + j);
const int32 Pos3 = 2 * (Stride * i + 12 + j);
VectorRegister4Float A0 = VectorLoad(&InOutValues[Pos0]);
VectorRegister4Float A1 = VectorLoad(&InOutValues[Pos1]);
VectorRegister4Float A2 = VectorLoad(&InOutValues[Pos2]);
VectorRegister4Float A3 = VectorLoad(&InOutValues[Pos3]);
VectorRegister4Float A1Swizzle = VectorSwizzle(A1, 1, 0, 3, 2);
VectorRegister4Float A2Swizzle = VectorSwizzle(A2, 1, 0, 3, 2);
VectorRegister4Float A3Swizzle = VectorSwizzle(A3, 1, 0, 3, 2);
VectorRegister4Float B1 = VectorMultiplyAdd(A1Swizzle, Weight1Imag, A1);
VectorRegister4Float B2 = VectorMultiplyAdd(A2Swizzle, Weight2Imag, A2);
VectorRegister4Float B3 = VectorMultiplyAdd(A3Swizzle, Weight3Imag, A3);
VectorRegister4Float C0 = VectorMultiplyAdd(B2, Weight2Real, A0);
VectorRegister4Float C1 = VectorMultiplyAdd(B3, Weight3Real, B1);
VectorRegister4Float C2 = VectorMultiplyAdd(B2, Weight2RealNeg, A0);
VectorRegister4Float C3 = VectorMultiplyAdd(B3, Weight3RealNeg, B1);
VectorRegister4Float C3Swizzle = VectorSwizzle(C3, 1, 0, 3, 2);
VectorRegister4Float D0 = VectorMultiplyAdd(C1, Weight1Real, C0);
VectorRegister4Float D1 = VectorMultiplyAdd(C1, Weight1RealNeg, C0);
VectorRegister4Float D2 = VectorMultiplyAdd(C3Swizzle, Weight1RealD2, C2);
VectorRegister4Float D3 = VectorMultiplyAdd(C3Swizzle, Weight1RealD3, C2);
VectorStore(D0, &InOutValues[Pos0]);
VectorStore(D1, &InOutValues[Pos1]);
VectorStore(D2, &InOutValues[Pos2]);
VectorStore(D3, &InOutValues[Pos3]);
}
}
}
// Read data for final butterfly. Performs part of bit reversal order.
void ReadFinalButterflyInputs(const float* InValues, int32 InNumButterflies, int32 InReadIndex, FFinalInputs& OutValues)
{
const int32 Pos0 = 2 * (0 * InNumButterflies + 4 * InReadIndex);
const int32 Pos1 = Pos0 + 4;
const int32 Pos2 = 2 * (2 * InNumButterflies + 4 * InReadIndex);
const int32 Pos3 = Pos2 + 4;
const int32 Pos4 = 2 * (1 * InNumButterflies + 4 * InReadIndex);
const int32 Pos5 = Pos4 + 4;
const int32 Pos6 = 2 * (3 * InNumButterflies + 4 * InReadIndex);
const int32 Pos7 = Pos6 + 4;
VectorRegister4Float T0 = VectorLoad(&InValues[Pos0]);
VectorRegister4Float T1 = VectorLoad(&InValues[Pos1]);
VectorRegister4Float T2 = VectorLoad(&InValues[Pos2]);
VectorRegister4Float T3 = VectorLoad(&InValues[Pos3]);
VectorRegister4Float T4 = VectorLoad(&InValues[Pos4]);
VectorRegister4Float T5 = VectorLoad(&InValues[Pos5]);
VectorRegister4Float T6 = VectorLoad(&InValues[Pos6]);
VectorRegister4Float T7 = VectorLoad(&InValues[Pos7]);
OutValues.A0 = VectorShuffle(T0, T2, 0, 1, 0, 1);
OutValues.A1 = VectorShuffle(T4, T6, 0, 1, 0, 1);
OutValues.A2 = VectorShuffle(T0, T2, 2, 3, 2, 3);
OutValues.A3 = VectorShuffle(T4, T6, 2, 3, 2, 3);
OutValues.A4 = VectorShuffle(T1, T3, 0, 1, 0, 1);
OutValues.A5 = VectorShuffle(T5, T7, 0, 1, 0, 1);
OutValues.A6 = VectorShuffle(T1, T3, 2, 3, 2, 3);
OutValues.A7 = VectorShuffle(T5, T7, 2, 3, 2, 3);
}
// Write data for final butterfly. Performs part of bit reversal order.
void WriteFinalButterflyOutputs(const FFinalOutputs& InResult, int32 InNumButterflies, int32 InWriteIndex, float* OutValues)
{
const int32 Pos0 = 2 * (0 * InNumButterflies + 4 * InWriteIndex);
const int32 Pos1 = Pos0 + 4;
const int32 Pos2 = 2 * (2 * InNumButterflies + 4 * InWriteIndex);
const int32 Pos3 = Pos2 + 4;
const int32 Pos4 = 2 * (1 * InNumButterflies + 4 * InWriteIndex);
const int32 Pos5 = Pos4 + 4;
const int32 Pos6 = 2 * (3 * InNumButterflies + 4 * InWriteIndex);
const int32 Pos7 = Pos6 + 4;
VectorStore(InResult.D0, &OutValues[Pos0]);
VectorStore(InResult.D1, &OutValues[Pos1]);
VectorStore(InResult.D2, &OutValues[Pos2]);
VectorStore(InResult.D3, &OutValues[Pos3]);
VectorStore(InResult.D4, &OutValues[Pos4]);
VectorStore(InResult.D5, &OutValues[Pos5]);
VectorStore(InResult.D6, &OutValues[Pos6]);
VectorStore(InResult.D7, &OutValues[Pos7]);
}
// Compute butterfly in final stage.
void Radix4ButterflyFinalIteration(const FFinalInputs& Inputs, const FFinalWeights& InWeights, FFinalOutputs& Outputs)
{
// Note: Some weights are altered to bake in sign flips to avoid an extra multiply later on.
const VectorRegister4Float W2I = VectorLoad(InWeights.W2I);
const VectorRegister4Float W2R = VectorLoad(InWeights.W2R);
const VectorRegister4Float W3I = VectorLoad(InWeights.W3I);
const VectorRegister4Float W3R = VectorLoad(InWeights.W3R);
const VectorRegister4Float W4I = VectorLoad(InWeights.W4I);
const VectorRegister4Float W4R = VectorLoad(InWeights.W4R);
const VectorRegister4Float W5I = VectorLoad(InWeights.W5I);
const VectorRegister4Float W5R = VectorLoad(InWeights.W5R);
const VectorRegister4Float W6I = VectorLoad(InWeights.W6I);
const VectorRegister4Float W6R = VectorLoad(InWeights.W6R);
const VectorRegister4Float W7I = VectorLoad(InWeights.W7I);
const VectorRegister4Float W7R = VectorLoad(InWeights.W7R);
const VectorRegister4Float W2RNeg = VectorLoad(InWeights.W2RNeg);
const VectorRegister4Float W3RNeg = VectorLoad(InWeights.W3RNeg);
const VectorRegister4Float W4RNeg = VectorLoad(InWeights.W4RNeg);
const VectorRegister4Float W5RNeg = VectorLoad(InWeights.W5RNeg);
const VectorRegister4Float W6RNeg = VectorLoad(InWeights.W6RNeg);
const VectorRegister4Float W7RNeg = VectorLoad(InWeights.W7RNeg);
const VectorRegister4Float W2RD4 = VectorLoad(InWeights.W2RD4);
const VectorRegister4Float W2RD6 = VectorLoad(InWeights.W2RD6);
const VectorRegister4Float W3RD5 = VectorLoad(InWeights.W3RD5);
const VectorRegister4Float W3RD7 = VectorLoad(InWeights.W3RD7);
VectorRegister4Float A2Swizzle = VectorSwizzle(Inputs.A2, 1, 0, 3, 2);
VectorRegister4Float A3Swizzle = VectorSwizzle(Inputs.A3, 1, 0, 3, 2);
VectorRegister4Float A4Swizzle = VectorSwizzle(Inputs.A4, 1, 0, 3, 2);
VectorRegister4Float A5Swizzle = VectorSwizzle(Inputs.A5, 1, 0, 3, 2);
VectorRegister4Float A6Swizzle = VectorSwizzle(Inputs.A6, 1, 0, 3, 2);
VectorRegister4Float A7Swizzle = VectorSwizzle(Inputs.A7, 1, 0, 3, 2);
VectorRegister4Float B2 = VectorMultiplyAdd(A2Swizzle, W2I, Inputs.A2);
VectorRegister4Float B3 = VectorMultiplyAdd(A3Swizzle, W3I, Inputs.A3);
VectorRegister4Float B4 = VectorMultiplyAdd(A4Swizzle, W4I, Inputs.A4);
VectorRegister4Float B5 = VectorMultiplyAdd(A5Swizzle, W5I, Inputs.A5);
VectorRegister4Float B6 = VectorMultiplyAdd(A6Swizzle, W6I, Inputs.A6);
VectorRegister4Float B7 = VectorMultiplyAdd(A7Swizzle, W7I, Inputs.A7);
VectorRegister4Float C0 = VectorMultiplyAdd(B4, W4R, Inputs.A0);
VectorRegister4Float C1 = VectorMultiplyAdd(B5, W5R, Inputs.A1);
VectorRegister4Float C2 = VectorMultiplyAdd(B6, W6R, B2);
VectorRegister4Float C3 = VectorMultiplyAdd(B7, W7R, B3);
VectorRegister4Float C4 = VectorMultiplyAdd(B4, W4RNeg, Inputs.A0);
VectorRegister4Float C5 = VectorMultiplyAdd(B5, W5RNeg, Inputs.A1);
VectorRegister4Float C6 = VectorMultiplyAdd(B6, W6RNeg, B2);
VectorRegister4Float C7 = VectorMultiplyAdd(B7, W7RNeg, B3);
VectorRegister4Float C6Swizzle = VectorSwizzle(C6, 1, 0, 3, 2);
VectorRegister4Float C7Swizzle = VectorSwizzle(C7, 1, 0, 3, 2);
Outputs.D0 = VectorMultiplyAdd(C2, W2R, C0);
Outputs.D1 = VectorMultiplyAdd(C3, W3R, C1);
Outputs.D2 = VectorMultiplyAdd(C2, W2RNeg, C0);
Outputs.D3 = VectorMultiplyAdd(C3, W3RNeg, C1);
Outputs.D4 = VectorMultiplyAdd(C6Swizzle, W2RD4, C4);
Outputs.D5 = VectorMultiplyAdd(C7Swizzle, W3RD5, C5);
Outputs.D6 = VectorMultiplyAdd(C6Swizzle, W2RD6, C4);
Outputs.D7 = VectorMultiplyAdd(C7Swizzle, W3RD7, C5);
}
// Perform last set of radix 4 butterflies.
//
// This method is special since it also performs bit order reversal in a
// moderately cache coherent manner.
void Radix4ButterflyFinal(float* InOutValues, int InStageIndex)
{
int32 NumButterflies = 1 << InStageIndex;
int32 NumIterations = NumButterflies >> 2;
FFinalInputs Inputs;
FFinalOutputs Outputs;
int32 Iteration = 0;
ReadFinalButterflyInputs(InOutValues, NumButterflies, FinalIndices[Iteration].ReadIndex, Inputs);
Radix4ButterflyFinalIteration(Inputs, FinalWeights[Iteration], Outputs);
for (Iteration = 1; Iteration < NumIterations; ++Iteration)
{
ReadFinalButterflyInputs(InOutValues, NumButterflies, FinalIndices[Iteration].ReadIndex, Inputs);
WriteFinalButterflyOutputs(Outputs, NumButterflies, FinalIndices[Iteration - 1].WriteIndex, InOutValues);
Radix4ButterflyFinalIteration(Inputs, FinalWeights[Iteration], Outputs);
}
WriteFinalButterflyOutputs(Outputs, NumButterflies, FinalIndices[Iteration - 1].WriteIndex, InOutValues);
}
int32 IntLog2(int32 InValue)
{
check(InValue > 0);
check(FMath::CountBits(InValue) == 1);
return FMath::CountTrailingZeros(InValue);
}
void GenerateFinalIndices()
{
// Each pass of the final radix 4 operates on 16 complex values.
const int32 FinalPassSize = 16;
const int32 NumFinalIndices = FFTSize / FinalPassSize;
// We need to ensure that FFTSize is at least 16 or else final
// FFT pass go past end of buffer.
check(FFTSize >= FinalPassSize);
FinalIndices.Reset();
FinalIndices.AddUninitialized(NumFinalIndices);
// These indices perform part of the bit order reversal on 16 element boundaries.
// Need to shift bits to get the bit reversed order on 16 element blocks.
const int32 Shift = 32 - (IntLog2(FFTSize) - 4);
int32 Index = 0;
for (int32 ReadIndex = 0; (ReadIndex < NumFinalIndices) && (Index < NumFinalIndices); ++ReadIndex)
{
// Get bit reversed order on 16 element block
const int32 WriteIndex = ReverseBits(ReadIndex) >> Shift;
// If ReadIndex > WriteIndex, then ReadIndex in a previous iteration had the
// value WriteIndex has now, and we do not want to repeat it
if (ReadIndex == WriteIndex)
{
// If equal, add one entry to read and write to same index.
FinalIndices[Index] = { ReadIndex, WriteIndex };
Index++;
}
else if (ReadIndex < WriteIndex)
{
// If ReadIndex < WriteIndex, add table entries in both orders.
// Loop logic in final radix pass will make sure that nothing
// gets overwritten.
FinalIndices[Index] = { ReadIndex, WriteIndex };
Index++;
FinalIndices[Index] = { WriteIndex, ReadIndex };
Index++;
}
}
}
void GenerateFinalWeights()
{
// Each pass of the final radix 4 operates on 16 complex values.
const int32 FinalPassSize = 16;
const int32 NumFinalWeights = FFTSize / FinalPassSize;
// We need to ensure that FFTSize is at least 16 or else final
// FFT pass go past end of buffer.
check(FFTSize >= FinalPassSize);
FinalWeights.Reset();
FinalWeights.AddUninitialized(NumFinalWeights);
const double Scale = 1. / static_cast<double>(FFTSize);
for (int32 Pass = 0; Pass < NumFinalWeights; ++Pass)
{
const int ReadIndex = FinalIndices[Pass].ReadIndex;
const double RotatedBitFraction = RotateBitsAroundPoint(4 * ReadIndex);
const double Phase = 2. * PI * (RotatedBitFraction + 0 * Scale);
const double Phase1 = 2. * PI * (RotatedBitFraction + 1 * Scale);
const double Phase2 = 2. * PI * (RotatedBitFraction + 2 * Scale);
const double Phase3 = 2. * PI * (RotatedBitFraction + 3 * Scale);
FinalWeights[Pass].W2I[0] = -FMath::Tan(Phase);
FinalWeights[Pass].W2I[1] = FMath::Tan(Phase);
FinalWeights[Pass].W2I[2] = -FMath::Tan(Phase1);
FinalWeights[Pass].W2I[3] = FMath::Tan(Phase1);
FinalWeights[Pass].W3I[0] = -FMath::Tan(Phase2);
FinalWeights[Pass].W3I[1] = FMath::Tan(Phase2);
FinalWeights[Pass].W3I[2] = -FMath::Tan(Phase3);
FinalWeights[Pass].W3I[3] = FMath::Tan(Phase3);
FinalWeights[Pass].W2R[0] = FMath::Cos(Phase);
FinalWeights[Pass].W2R[1] = FMath::Cos(Phase);
FinalWeights[Pass].W2R[2] = FMath::Cos(Phase1);
FinalWeights[Pass].W2R[3] = FMath::Cos(Phase1);
FinalWeights[Pass].W2RD4[0] = -FMath::Cos(Phase);
FinalWeights[Pass].W2RD4[1] = FMath::Cos(Phase);
FinalWeights[Pass].W2RD4[2] = -FMath::Cos(Phase1);
FinalWeights[Pass].W2RD4[3] = FMath::Cos(Phase1);
FinalWeights[Pass].W2RD6[0] = FMath::Cos(Phase);
FinalWeights[Pass].W2RD6[1] = -FMath::Cos(Phase);
FinalWeights[Pass].W2RD6[2] = FMath::Cos(Phase1);
FinalWeights[Pass].W2RD6[3] = -FMath::Cos(Phase1);
FinalWeights[Pass].W2RNeg[0] = -FMath::Cos(Phase);
FinalWeights[Pass].W2RNeg[1] = -FMath::Cos(Phase);
FinalWeights[Pass].W2RNeg[2] = -FMath::Cos(Phase1);
FinalWeights[Pass].W2RNeg[3] = -FMath::Cos(Phase1);
FinalWeights[Pass].W3R[0] = FMath::Cos(Phase2);
FinalWeights[Pass].W3R[1] = FMath::Cos(Phase2);
FinalWeights[Pass].W3R[2] = FMath::Cos(Phase3);
FinalWeights[Pass].W3R[3] = FMath::Cos(Phase3);
FinalWeights[Pass].W3RD5[0] = -FMath::Cos(Phase2);
FinalWeights[Pass].W3RD5[1] = FMath::Cos(Phase2);
FinalWeights[Pass].W3RD5[2] = -FMath::Cos(Phase3);
FinalWeights[Pass].W3RD5[3] = FMath::Cos(Phase3);
FinalWeights[Pass].W3RD7[0] = FMath::Cos(Phase2);
FinalWeights[Pass].W3RD7[1] = -FMath::Cos(Phase2);
FinalWeights[Pass].W3RD7[2] = FMath::Cos(Phase3);
FinalWeights[Pass].W3RD7[3] = -FMath::Cos(Phase3);
FinalWeights[Pass].W3RNeg[0] = -FMath::Cos(Phase2);
FinalWeights[Pass].W3RNeg[1] = -FMath::Cos(Phase2);
FinalWeights[Pass].W3RNeg[2] = -FMath::Cos(Phase3);
FinalWeights[Pass].W3RNeg[3] = -FMath::Cos(Phase3);
FinalWeights[Pass].W4I[0] = -FMath::Tan(Phase + Phase);
FinalWeights[Pass].W4I[1] = FMath::Tan(Phase + Phase);
FinalWeights[Pass].W4I[2] = -FMath::Tan(Phase1 + Phase1);
FinalWeights[Pass].W4I[3] = FMath::Tan(Phase1 + Phase1);
FinalWeights[Pass].W5I[0] = -FMath::Tan(Phase2 + Phase2);
FinalWeights[Pass].W5I[1] = FMath::Tan(Phase2 + Phase2);
FinalWeights[Pass].W5I[2] = -FMath::Tan(Phase3 + Phase3);
FinalWeights[Pass].W5I[3] = FMath::Tan(Phase3 + Phase3);
FinalWeights[Pass].W4R[0] = FMath::Cos(Phase + Phase);
FinalWeights[Pass].W4R[1] = FMath::Cos(Phase + Phase);
FinalWeights[Pass].W4R[2] = FMath::Cos(Phase1 + Phase1);
FinalWeights[Pass].W4R[3] = FMath::Cos(Phase1 + Phase1);
FinalWeights[Pass].W4RNeg[0] = -FMath::Cos(Phase + Phase);
FinalWeights[Pass].W4RNeg[1] = -FMath::Cos(Phase + Phase);
FinalWeights[Pass].W4RNeg[2] = -FMath::Cos(Phase1 + Phase1);
FinalWeights[Pass].W4RNeg[3] = -FMath::Cos(Phase1 + Phase1);
FinalWeights[Pass].W5R[0] = FMath::Cos(Phase2 + Phase2);
FinalWeights[Pass].W5R[1] = FMath::Cos(Phase2 + Phase2);
FinalWeights[Pass].W5R[2] = FMath::Cos(Phase3 + Phase3);
FinalWeights[Pass].W5R[3] = FMath::Cos(Phase3 + Phase3);
FinalWeights[Pass].W5RNeg[0] = -FMath::Cos(Phase2 + Phase2);
FinalWeights[Pass].W5RNeg[1] = -FMath::Cos(Phase2 + Phase2);
FinalWeights[Pass].W5RNeg[2] = -FMath::Cos(Phase3 + Phase3);
FinalWeights[Pass].W5RNeg[3] = -FMath::Cos(Phase3 + Phase3);
FinalWeights[Pass].W6I[0] = -FMath::Tan(3. * Phase);
FinalWeights[Pass].W6I[1] = FMath::Tan(3. * Phase);
FinalWeights[Pass].W6I[2] = -FMath::Tan(3. * Phase1);
FinalWeights[Pass].W6I[3] = FMath::Tan(3. * Phase1);
FinalWeights[Pass].W7I[0] = -FMath::Tan(3. * Phase2);
FinalWeights[Pass].W7I[1] = FMath::Tan(3. * Phase2);
FinalWeights[Pass].W7I[2] = -FMath::Tan(3. * Phase3);
FinalWeights[Pass].W7I[3] = FMath::Tan(3. * Phase3);
FinalWeights[Pass].W6R[0] = 2 * FMath::Cos(Phase + Phase) - 1;
FinalWeights[Pass].W6R[1] = 2 * FMath::Cos(Phase + Phase) - 1;
FinalWeights[Pass].W6R[2] = 2 * FMath::Cos(Phase1 + Phase1) - 1;
FinalWeights[Pass].W6R[3] = 2 * FMath::Cos(Phase1 + Phase1) - 1;
FinalWeights[Pass].W6RNeg[0] = -(2 * FMath::Cos(Phase + Phase) - 1);
FinalWeights[Pass].W6RNeg[1] = -(2 * FMath::Cos(Phase + Phase) - 1);
FinalWeights[Pass].W6RNeg[2] = -(2 * FMath::Cos(Phase1 + Phase1) - 1);
FinalWeights[Pass].W6RNeg[3] = -(2 * FMath::Cos(Phase1 + Phase1) - 1);
FinalWeights[Pass].W7R[0] = 2 * FMath::Cos(Phase2 + Phase2) - 1;
FinalWeights[Pass].W7R[1] = 2 * FMath::Cos(Phase2 + Phase2) - 1;
FinalWeights[Pass].W7R[2] = 2 * FMath::Cos(Phase3 + Phase3) - 1;
FinalWeights[Pass].W7R[3] = 2 * FMath::Cos(Phase3 + Phase3) - 1;
FinalWeights[Pass].W7RNeg[0] = -(2 * FMath::Cos(Phase2 + Phase2) - 1);
FinalWeights[Pass].W7RNeg[1] = -(2 * FMath::Cos(Phase2 + Phase2) - 1);
FinalWeights[Pass].W7RNeg[2] = -(2 * FMath::Cos(Phase3 + Phase3) - 1);
FinalWeights[Pass].W7RNeg[3] = -(2 * FMath::Cos(Phase3 + Phase3) - 1);
}
}
void GenerateRadix4Weights()
{
const int32 Radix4PassSize = 16;
check(FFTSize >= Radix4PassSize);
// Each radix-4 butterfly
int32 MaxNumButterfliesInStage = FFTSize / Radix4PassSize;
Radix4Weights.Reset();
Radix4Weights.AddUninitialized(MaxNumButterfliesInStage);
for (int32 ButterflyIndex = 0; ButterflyIndex < MaxNumButterfliesInStage; ++ButterflyIndex)
{
const double Phase = 2. * PI * RotateBitsAroundPoint(4 * ButterflyIndex);
const float W1R = static_cast<float>(FMath::Cos(Phase));
const float W1I = static_cast<float>(FMath::Tan(Phase));
const float W2R = static_cast<float>(FMath::Cos(Phase + Phase));
const float W2I = static_cast<float>(FMath::Tan(Phase + Phase));
const float W3R = static_cast<float>(2. * W2R - 1.);
const float W3I = static_cast<float>(FMath::Tan(3. * Phase));
Radix4Weights[ButterflyIndex].W1R[0] = W1R;
Radix4Weights[ButterflyIndex].W1I[0] = -W1I;
Radix4Weights[ButterflyIndex].W2R[0] = W2R;
Radix4Weights[ButterflyIndex].W2I[0] = -W2I;
Radix4Weights[ButterflyIndex].W3R[0] = W3R;
Radix4Weights[ButterflyIndex].W3I[0] = -W3I;
Radix4Weights[ButterflyIndex].W1RNeg[0] = -W1R;
Radix4Weights[ButterflyIndex].W1RD2[0] = -W1R;
Radix4Weights[ButterflyIndex].W1RD3[0] = W1R;
Radix4Weights[ButterflyIndex].W2RNeg[0] = -W2R;
Radix4Weights[ButterflyIndex].W3RNeg[0] = -W3R;
Radix4Weights[ButterflyIndex].W1R[1] = W1R;
Radix4Weights[ButterflyIndex].W1I[1] = W1I;
Radix4Weights[ButterflyIndex].W2R[1] = W2R;
Radix4Weights[ButterflyIndex].W2I[1] = W2I;
Radix4Weights[ButterflyIndex].W3R[1] = W3R;
Radix4Weights[ButterflyIndex].W3I[1] = W3I;
Radix4Weights[ButterflyIndex].W1RNeg[1] = -W1R;
Radix4Weights[ButterflyIndex].W1RD2[1] = W1R;
Radix4Weights[ButterflyIndex].W1RD3[1] = -W1R;
Radix4Weights[ButterflyIndex].W2RNeg[1] = -W2R;
Radix4Weights[ButterflyIndex].W3RNeg[1] = -W3R;
Radix4Weights[ButterflyIndex].W1R[2] = W1R;
Radix4Weights[ButterflyIndex].W1I[2] = -W1I;
Radix4Weights[ButterflyIndex].W2R[2] = W2R;
Radix4Weights[ButterflyIndex].W2I[2] = -W2I;
Radix4Weights[ButterflyIndex].W3R[2] = W3R;
Radix4Weights[ButterflyIndex].W3I[2] = -W3I;
Radix4Weights[ButterflyIndex].W1RNeg[2] = -W1R;
Radix4Weights[ButterflyIndex].W1RD2[2] = -W1R;
Radix4Weights[ButterflyIndex].W1RD3[2] = W1R;
Radix4Weights[ButterflyIndex].W2RNeg[2] = -W2R;
Radix4Weights[ButterflyIndex].W3RNeg[2] = -W3R;
Radix4Weights[ButterflyIndex].W1R[3] = W1R;
Radix4Weights[ButterflyIndex].W1I[3] = W1I;
Radix4Weights[ButterflyIndex].W2R[3] = W2R;
Radix4Weights[ButterflyIndex].W2I[3] = W2I;
Radix4Weights[ButterflyIndex].W3R[3] = W3R;
Radix4Weights[ButterflyIndex].W3I[3] = W3I;
Radix4Weights[ButterflyIndex].W1RNeg[3] = -W1R;
Radix4Weights[ButterflyIndex].W1RD2[3] = W1R;
Radix4Weights[ButterflyIndex].W1RD3[3] = -W1R;
Radix4Weights[ButterflyIndex].W2RNeg[3] = -W2R;
Radix4Weights[ButterflyIndex].W3RNeg[3] = -W3R;
}
}
// A funny but useful function which reverses bits and
// places them behind a point. For example
// 0101 is transformed to 0.1010
double RotateBitsAroundPoint(uint32 InValue)
{
uint32 ReversedValue = ReverseBits(InValue);
double OutValue = ReversedValue / 4294967296.;
OutValue = 1. / 4294967296. * ReversedValue;
return OutValue;
}
uint32 ReverseBits(uint32 InValue)
{
static const uint8 ByteReversal[256] = {
0, 128, 64, 192, 32, 160, 96, 224, 16, 144, 80, 208, 48, 176, 112, 240,
8, 136, 72, 200, 40, 168, 104, 232, 24, 152, 88, 216, 56, 184, 120, 248,
4, 132, 68, 196, 36, 164, 100, 228, 20, 148, 84, 212, 52, 180, 116, 244,
12, 140, 76, 204, 44, 172, 108, 236, 28, 156, 92, 220, 60, 188, 124, 252,
2, 130, 66, 194, 34, 162, 98, 226, 18, 146, 82, 210, 50, 178, 114, 242,
10, 138, 74, 202, 42, 170, 106, 234, 26, 154, 90, 218, 58, 186, 122, 250,
6, 134, 70, 198, 38, 166, 102, 230, 22, 150, 86, 214, 54, 182, 118, 246,
14, 142, 78, 206, 46, 174, 110, 238, 30, 158, 94, 222, 62, 190, 126, 254,
1, 129, 65, 193, 33, 161, 97, 225, 17, 145, 81, 209, 49, 177, 113, 241,
9, 137, 73, 201, 41, 169, 105, 233, 25, 153, 89, 217, 57, 185, 121, 249,
5, 133, 69, 197, 37, 165, 101, 229, 21, 149, 85, 213, 53, 181, 117, 245,
13, 141, 77, 205, 45, 173, 109, 237, 29, 157, 93, 221, 61, 189, 125, 253,
3, 131, 67, 195, 35, 163, 99, 227, 19, 147, 83, 211, 51, 179, 115, 243,
11, 139, 75, 203, 43, 171, 107, 235, 27, 155, 91, 219, 59, 187, 123, 251,
7, 135, 71, 199, 39, 167, 103, 231, 23, 151, 87, 215, 55, 183, 119, 247,
15, 143, 79, 207, 47, 175, 111, 239, 31, 159, 95, 223, 63, 191, 127, 255
};
uint8 Byte0 = ByteReversal[InValue >> 0*8 & 0xff];
uint8 Byte1 = ByteReversal[InValue >> 1*8 & 0xff];
uint8 Byte2 = ByteReversal[InValue >> 2*8 & 0xff];
uint8 Byte3 = ByteReversal[InValue >> 3*8 & 0xff];
uint32 OutValue = Byte0 << 3*8 | Byte1 << 2*8 | Byte2 << 1*8 | Byte3 << 0*8;
return OutValue;
}
int32 Log2FFTSize;
int32 FFTSize;
int32 NumFloats;
TArray<FRadix4Weight> Radix4Weights;
TArray<FFinalIndices> FinalIndices;
TArray<FFinalWeights> FinalWeights;
FAlignedFloatBuffer InverseWorkBuffer;
};
// Maximum log 2 size of fft
const int32 FVectorRealToComplexFFT::MinLog2FFTSize = FVectorComplexFFT::MinLog2FFTSize + 1;
// Maximum log 2 size of fft
const int32 FVectorRealToComplexFFT::MaxLog2FFTSize = FVectorComplexFFT::MaxLog2FFTSize + 1;
void FVectorRealToComplexFFT::InitRealSequenceConversionBuffers()
{
// Conversion buffers for performing a real valued FFT using a complex fft
// The values in the buffer are setup to support SIMD operations resulting
// in some duplicate data.
ForwardConvBuffers.AlphaReal.AddUninitialized(FFTSize);
ForwardConvBuffers.AlphaImag.AddUninitialized(FFTSize);
ForwardConvBuffers.BetaReal.AddUninitialized(FFTSize);
ForwardConvBuffers.BetaImag.AddUninitialized(FFTSize);
InverseConvBuffers.AlphaReal.AddUninitialized(FFTSize);
InverseConvBuffers.AlphaImag.AddUninitialized(FFTSize);
InverseConvBuffers.BetaReal.AddUninitialized(FFTSize);
InverseConvBuffers.BetaImag.AddUninitialized(FFTSize);
float* AlphaRealForwardBufferData = ForwardConvBuffers.AlphaReal.GetData();
float* AlphaImagForwardBufferData = ForwardConvBuffers.AlphaImag.GetData();
float* BetaRealForwardBufferData = ForwardConvBuffers.BetaReal.GetData();
float* BetaImagForwardBufferData = ForwardConvBuffers.BetaImag.GetData();
float* AlphaRealInverseBufferData = InverseConvBuffers.AlphaReal.GetData();
float* AlphaImagInverseBufferData = InverseConvBuffers.AlphaImag.GetData();
float* BetaRealInverseBufferData = InverseConvBuffers.BetaReal.GetData();
float* BetaImagInverseBufferData = InverseConvBuffers.BetaImag.GetData();
float PhaseIncrement = PI / static_cast<float>(FFTSize);
for (int32 i = 0; i < FFTSize; i += 2)
{
const float Phase = PhaseIncrement * i;
const float BetaReal = 0.5 * (1. - FMath::Sin(Phase));
const float BetaImag = -0.5 * FMath::Cos(Phase);
const float AlphaReal = 0.5 * (1. + FMath::Sin(Phase));
const float AlphaImag = 0.5 * FMath::Cos(Phase);
AlphaRealForwardBufferData[i] = AlphaReal;
AlphaRealForwardBufferData[i + 1] = -AlphaReal;// Sign flipped to simplify SIMD math
AlphaImagForwardBufferData[i] = AlphaImag;
AlphaImagForwardBufferData[i + 1] = AlphaImag;
BetaRealForwardBufferData[i] = BetaReal;
BetaRealForwardBufferData[i + 1] = BetaReal;
BetaImagForwardBufferData[i] = -BetaImag; // Sign flipped to simplify SIMD math
BetaImagForwardBufferData[i + 1] = BetaImag;
AlphaRealInverseBufferData[i] = AlphaReal;
AlphaRealInverseBufferData[i + 1] = -AlphaReal; // Sign flipped to simplify SIMD math
AlphaImagInverseBufferData[i] = AlphaImag;
AlphaImagInverseBufferData[i + 1] = AlphaImag;
BetaRealInverseBufferData[i] = BetaReal;
BetaRealInverseBufferData[i + 1] = BetaReal;
BetaImagInverseBufferData[i] = -BetaImag; // Sign flipped to simplify SIMD math
BetaImagInverseBufferData[i + 1] = BetaImag;
}
}
// Performs conversion of buffers required to do real fft using complex fft.
void FVectorRealToComplexFFT::ConvertSequence(const FConversionBuffers& InBuffers, const float* RESTRICT InValues, int32 InStartIndex, float* RESTRICT OutValues)
{
const float* AlphaRealData = InBuffers.AlphaReal.GetData();
const float* AlphaImagData = InBuffers.AlphaImag.GetData();
const float* BetaRealData = InBuffers.BetaReal.GetData();
const float* BetaImagData = InBuffers.BetaImag.GetData();
if (FFTSize > InStartIndex)
{
VectorRegister4Float VInRev1 = VectorLoad(&InValues[FFTSize - InStartIndex]);
for (int32 i = InStartIndex; i < FFTSize; i += 4)
{
VectorRegister4Float VIn = VectorLoad(&InValues[i]);
VectorRegister4Float VInRISwap = VectorSwizzle(VIn, 1, 0, 3, 2);
VectorRegister4Float VInRev2 = VectorLoad(&InValues[FFTSize - i - 4]);
VectorRegister4Float VInRev = VectorShuffle(VInRev1, VInRev2, 0, 1, 2, 3);
VInRev1 = VInRev2;
VectorRegister4Float VInRevRISwap = VectorSwizzle(VInRev, 1, 0, 3, 2);
VectorRegister4Float VAlphaReal = VectorLoad(&AlphaRealData[i]);
VectorRegister4Float VAlphaImag = VectorLoad(&AlphaImagData[i]);
VectorRegister4Float VBetaReal = VectorLoad(&BetaRealData[i]);
VectorRegister4Float VBetaImag = VectorLoad(&BetaImagData[i]);
// Out1 = [ R * Ar, I * Ar]
// Out2 = [ I * Ai, R * Ai]
// Out3 = [NR * Br, NI * Br]
// Out4 = [NI * Bi, NR * Bi]
//VectorRegister4Float Out1 = VectorMultiply(VIn, VAlphaReal);
VectorRegister4Float Out2 = VectorMultiply(VInRISwap, VAlphaImag);
//VectorRegister4Float Out3 = VectorMultiply(VInRev, VBetaReal);
VectorRegister4Float Out4 = VectorMultiply(VInRevRISwap, VBetaImag);
// Out12 = [(R * Ar) + (I * Ai), (I * Ar) + (R * Ai)]
VectorRegister4Float Out12 = VectorMultiplyAdd(VIn, VAlphaReal, Out2);
// Out34 = [(NR * Br) + (NI * Bi), (NR * Bi) + (NI * Br)]
VectorRegister4Float Out34 = VectorMultiplyAdd(VInRev, VBetaReal, Out4);
// Out = [
// (R * Ar) + (I * Ai) + (NR * Br) + (NI * Bi),
// (I * Ar) + (R * Ai) + (NR * Bi) + (NI * Br)
// ]
VectorRegister4Float Out = VectorAdd(Out12, Out34);
VectorStore(Out, &OutValues[i]);
}
}
}
FVectorRealToComplexFFT::FVectorRealToComplexFFT(int32 InLog2FFTSize)
: FFTSize(1 << InLog2FFTSize)
, Log2FFTSize(InLog2FFTSize)
, ComplexFFT(new FVectorComplexFFT(InLog2FFTSize - 1)) // Utilize a N/2 length complex fft to perform N length fft.
{
WorkBuffer.AddUninitialized(FFTSize);
InitRealSequenceConversionBuffers();
}
FVectorRealToComplexFFT::~FVectorRealToComplexFFT()
{
}
int32 FVectorRealToComplexFFT::Size() const
{
return FFTSize;
}
/** Scaling applied when performing forward FFT. */
EFFTScaling FVectorRealToComplexFFT::ForwardScaling() const
{
return EFFTScaling::MultipliedBySqrtFFTSize;
}
/** Scaling applied when performing inverse FFT. */
EFFTScaling FVectorRealToComplexFFT::InverseScaling() const
{
return EFFTScaling::DividedBySqrtFFTSize;
}
void FVectorRealToComplexFFT::ForwardRealToComplex(const float* RESTRICT InReal, float* RESTRICT OutComplex)
{
// Performs a N sized real-to-complex FFT using an N/2 complex-to-complex FFT.
float* WorkData = WorkBuffer.GetData();
ComplexFFT->ForwardComplexToComplex(InReal, WorkData);
const float* AlphaRealForwardData = ForwardConvBuffers.AlphaReal.GetData();
const float* AlphaImagForwardData = ForwardConvBuffers.AlphaImag.GetData();
const float* BetaRealForwardData = ForwardConvBuffers.BetaReal.GetData();
const float* BetaImagForwardData = ForwardConvBuffers.BetaImag.GetData();
// Handle special case of this math to account for cyclical index math.
OutComplex[0] = (WorkData[0] * AlphaRealForwardData[0])
+ (WorkData[1] * AlphaImagForwardData[0])
+ (WorkData[0] * BetaRealForwardData[0])
+ (WorkData[1] * BetaImagForwardData[0]);
OutComplex[1] = (WorkData[1] * AlphaRealForwardData[1])
+ (WorkData[0] * AlphaImagForwardData[1])
+ (WorkData[0] * BetaImagForwardData[1])
+ (WorkData[1] * BetaRealForwardData[1]);
OutComplex[2] = (WorkData[2] * AlphaRealForwardData[2])
+ (WorkData[3] * AlphaImagForwardData[2])
+ (WorkData[FFTSize - 2] * BetaRealForwardData[2])
+ (WorkData[FFTSize - 1] * BetaImagForwardData[2]);
OutComplex[3] = (WorkData[3] * AlphaRealForwardData[3])
+ (WorkData[2] * AlphaImagForwardData[3])
+ (WorkData[FFTSize - 2] * BetaImagForwardData[3])
+ (WorkData[FFTSize - 1] * BetaRealForwardData[3]);
// Convert all other values using optimized SIMD
ConvertSequence(ForwardConvBuffers, WorkData, 4, OutComplex);
// Handle special case of nyquist frequency
OutComplex[FFTSize] = WorkData[0] - WorkData[1];
OutComplex[FFTSize + 1] = 0.f;
}
void FVectorRealToComplexFFT::InverseComplexToReal(const float* RESTRICT InComplex, float* RESTRICT OutReal)
{
// Performs a N sized complex-to-real FFT using an N/2 complex-to-complex FFT.
float* WorkData = WorkBuffer.GetData();
const float* AlphaRealInverseData = InverseConvBuffers.AlphaReal.GetData();
const float* AlphaImagInverseData = InverseConvBuffers.AlphaImag.GetData();
const float* BetaRealInverseData = InverseConvBuffers.BetaReal.GetData();
const float* BetaImagInverseData = InverseConvBuffers.BetaImag.GetData();
// Handle special case of this math to account for cyclical index math.
WorkData[0] = (InComplex[0] * AlphaRealInverseData[0])
+ (InComplex[1] * AlphaImagInverseData[0])
+ (InComplex[FFTSize] * BetaRealInverseData[0])
+ (InComplex[FFTSize + 1] * BetaImagInverseData[0]);
WorkData[1] = (InComplex[1] * AlphaRealInverseData[1])
+ (InComplex[0] * AlphaImagInverseData[1])
+ (InComplex[FFTSize] * BetaImagInverseData[1])
+ (InComplex[FFTSize + 1] * BetaRealInverseData[0]);
WorkData[2] = (InComplex[2] * AlphaRealInverseData[2])
+ (InComplex[3] * AlphaImagInverseData[2])
+ (InComplex[FFTSize - 2] * BetaRealInverseData[2])
+ (InComplex[FFTSize - 1] * BetaImagInverseData[2]);
WorkData[3] = (InComplex[3] * AlphaRealInverseData[3])
+ (InComplex[2] * AlphaImagInverseData[3])
+ (InComplex[FFTSize - 2] * BetaImagInverseData[3])
+ (InComplex[FFTSize - 1] * BetaRealInverseData[3]);
// Convert all other values using optimized SIMD
ConvertSequence(ForwardConvBuffers, InComplex, 4, WorkData);
// Perform Inverse FFT
ComplexFFT->InverseComplexToComplex(WorkData, OutReal);
}
void FVectorRealToComplexFFT::BatchForwardRealToComplex(int32 InCount, const float* const RESTRICT InReal[], float* RESTRICT OutComplex[])
{
for (int32 i = 0; i < InCount; i++)
{
ForwardRealToComplex(InReal[i], OutComplex[i]);
}
}
void FVectorRealToComplexFFT::BatchInverseComplexToReal(int32 InCount, const float* const RESTRICT InComplex[], float* RESTRICT OutReal[])
{
for (int32 i = 0; i < InCount; i++)
{
InverseComplexToReal(InComplex[i], OutReal[i]);
}
}
/*************************************************************************************************/
/**************************************** FVectorFFTFactory **************************************/
/*************************************************************************************************/
FVectorFFTFactory::~FVectorFFTFactory()
{
}
/** Name of this particular factory. */
FName FVectorFFTFactory::GetFactoryName() const
{
static const FName FactoryName = FName(TEXT("FVectorFFTFactory"));
return FactoryName;
}
/** If true, this implementation uses hardware acceleration. */
bool FVectorFFTFactory::IsHardwareAccelerated() const
{
return false;
}
/** If true, this implementation requires input and output arrays to be 128 bit aligned. */
bool FVectorFFTFactory::Expects128BitAlignedArrays() const
{
return false;
}
/** Returns true if the input settings are supported by this factory. */
bool FVectorFFTFactory::AreFFTSettingsSupported(const FFFTSettings& InSettings) const
{
// Supports FFT sizes of 5 to 16, though an FFT
bool bIsMinSizeSupported = InSettings.Log2Size >= FVectorRealToComplexFFT::MinLog2FFTSize;
bool bIsMaxSizeSupported = InSettings.Log2Size <= FVectorRealToComplexFFT::MaxLog2FFTSize;
bool bIsAlignmentSupported = InSettings.bArrays128BitAligned;
return bIsMinSizeSupported && bIsAlignmentSupported && bIsMaxSizeSupported;
}
/** Creates a new FFT algorithm. */
TUniquePtr<IFFTAlgorithm> FVectorFFTFactory::NewFFTAlgorithm(const FFFTSettings& InSettings)
{
if (AreFFTSettingsSupported(InSettings))
{
return MakeUnique<FVectorRealToComplexFFT>(InSettings.Log2Size);
}
return TUniquePtr<IFFTAlgorithm>();
}
}