// Copyright Epic Games, Inc. All Rights Reserved.

#include "VectorFFT.h"
#include "SignalProcessingModule.h"
#include "Templates/UniquePtr.h"
#include "DSP/FFTAlgorithm.h"
#include "DSP/FloatArrayMath.h"

namespace Audio
{
	// Implementation of a complex FFT.
	class FVectorComplexFFT
	{
	public:
		// Minimum size of fft required to support 2 radix-4 fft stages.
		static const int32 MinLog2FFTSize = 4;

		// Maximum size of fft is set to avoid exceedingly large allocs and support
		// various logic for indexing blocks internally.
		static const int32 MaxLog2FFTSize = 16;

		// Constructor
		//
		// @param InLog2FFTSize - Log2 size of the FFT.
		FVectorComplexFFT(int32 InLog2FFTSize)
		:	Log2FFTSize(InLog2FFTSize)
		,	FFTSize(0)
		,	NumFloats(0)
		{
			check(Log2FFTSize >= MinLog2FFTSize);
			check(Log2FFTSize <= MaxLog2FFTSize);

			FFTSize = 1 << Log2FFTSize;
			NumFloats = 2 * FFTSize; // Takes 2 floats to represent a complex number

			InverseWorkBuffer.AddUninitialized(NumFloats);

			// Pregenerate weights needed to calculate this size of FFT.
			GenerateRadix4Weights();
			GenerateFinalIndices();
			GenerateFinalWeights();
		}

		~FVectorComplexFFT()
		{
		}

		// Perform forward complex FFT
		//
		// @param InComplex - Interleaved complex data with (2 * FFTSize) num floats.
		// @param OutComplex - Interleaved complex data with (2 * FFTSize) num floats.
		void ForwardComplexToComplex(const float* RESTRICT InComplex, float* RESTRICT OutComplex)
		{
			// To perform FFT, must complete Log2FFTSize stages. Each radix pass performs 2^m stages
			// where 2^m is the radix number. So a radix-4 stage is radix-2^m or radix-2^2. Hence radix
			// 4 performs two stages. Radix-8 is Radix-2^3, so it performs 3 stages.
			int32 CompletedStages = 0;

			if (Log2FFTSize & 1)
			{
				// If we have an odd number of stages, start with a radix-8 to
				// perform first 3 stages.
				Radix8ButterflyConstantWeight(InComplex, OutComplex, Log2FFTSize);
				CompletedStages = 3;
			}
			else
			{
				// If we have an even number of stages, start with a radix-4 to
				// perform first 2 stages.
				Radix4ButterflyConstantWeight(InComplex, OutComplex, Log2FFTSize);
				CompletedStages = 2;
			}

			// Fit in a few more constant weight radix4s if possible. This routine is faster
			// than the default Radix4Butterfly because it does not need weights.
			for (int32 StageIndex = CompletedStages; StageIndex < Log2FFTSize - 4; StageIndex += 2)
			{
				Radix4ButterflyConstantWeight(OutComplex, OutComplex, Log2FFTSize - StageIndex);
			}

			// Perform a bunch of radix4 ffts with varying weights.
			// Logically you would arrange this loop to first iterate over stage indices, and then over
			// butterfly indices, but by reorganizing the loop the code is more cache coherent.
			for (int32 ButterflyIndex = 1; CompletedStages < (Log2FFTSize - 4); CompletedStages += 2)
			{
				for (; ButterflyIndex < (1 << CompletedStages); ++ButterflyIndex)
				{
					for (int32 StageIndex = CompletedStages; StageIndex < Log2FFTSize - 4; StageIndex += 2)
					{
						Radix4Butterfly(OutComplex, ButterflyIndex, Log2FFTSize - StageIndex, Radix4Weights[ButterflyIndex]);
					}
				}
			}

			if (CompletedStages < (Log2FFTSize - 2))
			{
				// Special case for 2nd to last stage of fft which has better cache coherency
				Radix4Butterfly2ndToFinal(OutComplex, Log2FFTSize - 4);
			}

			// Special case for last stage of fft with cache coherency tricks and index reversal built in.
			Radix4ButterflyFinal(OutComplex, Log2FFTSize - 2);
		}

		void InverseComplexToComplex(const float* RESTRICT InComplex, float* RESTRICT OutComplex)
		{
			// Perform inverse FFT by complex conjugating the input and output.
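			// The standard conjugation identity is used here:
			//   IFFT(X) = (1/N) * conj(FFT(conj(X)))
			// so the forward transform can be reused unchanged; the 1/N scale is folded into
			// the final ScaledComplexConjugate call.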
			float* WorkData = InverseWorkBuffer.GetData();

			ScaledComplexConjugate(InComplex, 1.f, WorkData, NumFloats);
			ForwardComplexToComplex(WorkData, OutComplex);

			const float Scale = 1.f / static_cast<float>(FFTSize);
			ScaledComplexConjugate(OutComplex, Scale, OutComplex, NumFloats);
		}

	private:

		// Weight structure for general radix 4 pass.
		struct FRadix4Weight
		{
			alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W1R[4];
			alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W1RNeg[4]; // Negative version of W1R.

			// These have specialized sign flips for calculations of D2 and D3
			alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W1RD2[4];
			alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W1RD3[4];

			alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W1I[4];
			alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W2R[4];
			alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W2RNeg[4]; // Negative version of W2R
			alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W2I[4];
			alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W3R[4];
			alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W3RNeg[4]; // Negative version of W3R
			alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W3I[4];
		};

		// Structure to hold loaded inputs on final radix 4 pass
		struct FFinalInputs
		{
			VectorRegister4Float A0;
			VectorRegister4Float A1;
			VectorRegister4Float A2;
			VectorRegister4Float A3;
			VectorRegister4Float A4;
			VectorRegister4Float A5;
			VectorRegister4Float A6;
			VectorRegister4Float A7;
		};

		// Structure to hold loaded outputs on final radix 4 pass
		struct FFinalOutputs
		{
			VectorRegister4Float D0;
			VectorRegister4Float D1;
			VectorRegister4Float D2;
			VectorRegister4Float D3;
			VectorRegister4Float D4;
			VectorRegister4Float D5;
			VectorRegister4Float D6;
			VectorRegister4Float D7;
		};

		// Weight structure for final Radix4 pass
		struct FFinalWeights
		{
			alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W1R[4];
			alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W1I[4];
			alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W2R[4];
			alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W2I[4];
			alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W3R[4];
			alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W3I[4];
			alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W4R[4];
			alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W4I[4];
			alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W5R[4];
			alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W5I[4];
			alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W6R[4];
			alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W6I[4];
			alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W7R[4];
			alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W7I[4];

			alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W2RNeg[4]; // Negative version of W2R
			alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W2RD4[4];  // Special case for calculating D4
			alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W2RD6[4];  // Special case for calculating D6
			alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W3RD5[4];  // Special case for calculating D5
			alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W3RD7[4];  // Special case for calculating D7
			alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W3RNeg[4]; // Negative version of W3R
			alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W4RNeg[4]; // Negative version of W4R
			alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W5RNeg[4]; // Negative version of W5R
			alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W6RNeg[4]; // Negative version of W6R
			alignas(AUDIO_SIMD_BYTE_ALIGNMENT) float W7RNeg[4]; // Negative version of W7R
		};

		// Radix4 index locations for performing final Radix4 passes and bit reversal
		// without overwriting needed data.
		struct FFinalIndices
		{
			int32 ReadIndex;
			int32 WriteIndex;
		};

		// Perform complex conjugate as well as scale.
		//
		// @param InValues - Array of floats representing complex values in interleave format.
		// @param Scale - Scale to apply.
		// @param OutValues - Array of floats representing complex values in interleave format.
		// @param Num - Number of floats in array (NOT number of complex values).
		void ScaledComplexConjugate(const float* InValues, float Scale, float* OutValues, int32 Num)
		{
			ArrayScaledComplexConjugate(InValues, Num, OutValues, Scale);
		}

		// Perform a radix-4 butterfly which uses constant weights.
		void Radix4ButterflyConstantWeight(const float* InValues, float* OutValues, int32 InStageIndex)
		{
			// This routine is only supported when the stage index is greater than or equal to 3.
			// This comes about for two reasons:
			// 1. A Radix-4 processes two stages. (m = StageCount, 2^m = 4).
			// 2. SIMD optimizations process two butterflies in parallel, so there must be an
			//    even number of butterflies. This sets the minimum stage index to 3, which will
			//    result in minimally 2 radix-4 butterflies being calculated.
			check(InStageIndex >= 3);

			// Calculate number of constant weight butterflies in this stage.
			const int NumButterflies = 1 << (InStageIndex - 2);

			const VectorRegister4Float SignFlipImag = MakeVectorRegisterFloat(1.f, -1.f, 1.f, -1.f);

			const int32 Offset0 = 0;
			const int32 Offset1 = 2 * NumButterflies;
			const int32 Offset2 = 4 * NumButterflies;
			const int32 Offset3 = 6 * NumButterflies;

			for (int32 i = 0; i < NumButterflies; i += 2)
			{
				const int32 Pos = 2 * i;
				const int32 Pos0 = Offset0 + Pos;
				const int32 Pos1 = Offset1 + Pos;
				const int32 Pos2 = Offset2 + Pos;
				const int32 Pos3 = Offset3 + Pos;

				VectorRegister4Float A0 = VectorLoad(&InValues[Pos0]);
				VectorRegister4Float A1 = VectorLoad(&InValues[Pos1]);
				VectorRegister4Float A2 = VectorLoad(&InValues[Pos2]);
				VectorRegister4Float A3 = VectorLoad(&InValues[Pos3]);

				VectorRegister4Float C0 = VectorAdd(A0, A2);
				VectorRegister4Float C2 = VectorSubtract(A0, A2);
				VectorRegister4Float C1 = VectorAdd(A1, A3);
				VectorRegister4Float C3 = VectorSubtract(A1, A3);

				VectorRegister4Float C3Conj = VectorMultiply(C3, SignFlipImag);
				VectorRegister4Float C3ConjSwizzle = VectorSwizzle(C3Conj, 1, 0, 3, 2);

				VectorRegister4Float D0 = VectorAdd(C1, C0);
				VectorRegister4Float D1 = VectorSubtract(C0, C1);
				VectorRegister4Float D2 = VectorAdd(C2, C3ConjSwizzle);
				VectorRegister4Float D3 = VectorSubtract(C2, C3ConjSwizzle);

				VectorStore(D0, &OutValues[Pos0]);
				VectorStore(D1, &OutValues[Pos1]);
				VectorStore(D2, &OutValues[Pos2]);
				VectorStore(D3, &OutValues[Pos3]);
			}
		}

		// Perform a radix-8 butterfly which uses constant weights.
		void Radix8ButterflyConstantWeight(const float* InValues, float* OutValues, int32 InStageIndex)
		{
			const float Sqrt2D2 = .7071067811865475244f;

			// This routine is only supported when the stage index is greater than or equal to four.
			// This comes about for two reasons:
			// 1. A Radix-8 processes three stages. (m = StageCount, 2^m = 8).
			// 2. SIMD optimizations process two butterflies in parallel, so there must be an
			//    even number of butterflies. This sets the minimum stage index to 4, which will
			//    result in minimally 2 radix-8 butterflies being calculated.
			check(InStageIndex >= 4);

			// Calculate number of constant weight butterflies in this stage.
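			// (A radix-8 butterfly spans 8 complex values, and two butterflies are processed per
			// loop iteration below. Sqrt2D2 = sqrt(2)/2 is the magnitude of the real and imaginary
			// parts of the odd eighth roots of unity, e.g. e^(-i*pi/4), which are the only
			// non-trivial twiddle factors a constant-weight radix-8 stage needs.)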
const int32 NumButterflies = 1 << (InStageIndex - 3); const VectorRegister4Float SignFlipImag = MakeVectorRegisterFloat(1.f, -1.f, 1.f, -1.f); const VectorRegister4Float VectorSqrt2D2 = MakeVectorRegisterFloat(Sqrt2D2, Sqrt2D2, Sqrt2D2, Sqrt2D2); const VectorRegister4Float VectorNegSqrt2D2 = MakeVectorRegisterFloat(-Sqrt2D2, -Sqrt2D2, -Sqrt2D2, -Sqrt2D2); const int32 Offset0 = 0; const int32 Offset1 = 2 * NumButterflies; const int32 Offset2 = 4 * NumButterflies; const int32 Offset3 = 6 * NumButterflies; const int32 Offset4 = 8 * NumButterflies; const int32 Offset5 = 10 * NumButterflies; const int32 Offset6 = 12 * NumButterflies; const int32 Offset7 = 14 * NumButterflies; for (int32 i = 0; i < NumButterflies; i += 2) { const int32 Pos = 2 * i; const int32 Pos0 = Pos; const int32 Pos1 = Offset1 + Pos; const int32 Pos2 = Offset2 + Pos; const int32 Pos3 = Offset3 + Pos; const int32 Pos4 = Offset4 + Pos; const int32 Pos5 = Offset5 + Pos; const int32 Pos6 = Offset6 + Pos; const int32 Pos7 = Offset7 + Pos; VectorRegister4Float A0 = VectorLoad(&InValues[Pos0]); VectorRegister4Float A1 = VectorLoad(&InValues[Pos1]); VectorRegister4Float A2 = VectorLoad(&InValues[Pos2]); VectorRegister4Float A3 = VectorLoad(&InValues[Pos3]); VectorRegister4Float A4 = VectorLoad(&InValues[Pos4]); VectorRegister4Float A5 = VectorLoad(&InValues[Pos5]); VectorRegister4Float A6 = VectorLoad(&InValues[Pos6]); VectorRegister4Float A7 = VectorLoad(&InValues[Pos7]); VectorRegister4Float B0 = VectorAdd(A0, A4); VectorRegister4Float B1 = VectorAdd(A1, A5); VectorRegister4Float B2 = VectorAdd(A2, A6); VectorRegister4Float B3 = VectorAdd(A3, A7); VectorRegister4Float B4 = VectorSubtract(A0, A4); VectorRegister4Float B5 = VectorSubtract(A1, A5); VectorRegister4Float B6 = VectorSubtract(A2, A6); VectorRegister4Float B7 = VectorSubtract(A3, A7); VectorRegister4Float B6Conj = VectorMultiply(SignFlipImag, B6); VectorRegister4Float B6ConjSwizzle = VectorSwizzle(B6Conj, 1, 0, 3, 2); VectorRegister4Float B7Conj = VectorMultiply(SignFlipImag, B7); VectorRegister4Float B7ConjSwizzle = VectorSwizzle(B7Conj, 1, 0, 3, 2); VectorRegister4Float C0 = VectorAdd(B0, B2); VectorRegister4Float C1 = VectorAdd(B1, B3); VectorRegister4Float C2 = VectorSubtract(B0, B2); VectorRegister4Float C3 = VectorSubtract(B1, B3); VectorRegister4Float C4 = VectorAdd(B4, B6ConjSwizzle); VectorRegister4Float C5 = VectorAdd(B5, B7ConjSwizzle); VectorRegister4Float C6 = VectorSubtract(B4, B6ConjSwizzle); VectorRegister4Float C7 = VectorSubtract(B5, B7ConjSwizzle); VectorRegister4Float C3Conj = VectorMultiply(SignFlipImag, C3); VectorRegister4Float C3ConjSwizzle = VectorSwizzle(C3Conj, 1, 0, 3, 2); VectorRegister4Float C5Conj = VectorMultiply(SignFlipImag, C5); VectorRegister4Float C5ConjSwizzle = VectorSwizzle(C5Conj, 1, 0, 3, 2); VectorRegister4Float T5 = VectorAdd(C5, C5ConjSwizzle); VectorRegister4Float C7Swizzle = VectorSwizzle(C7, 1, 0, 3, 2); VectorRegister4Float T7 = VectorMultiplyAdd(SignFlipImag, C7, C7Swizzle); VectorRegister4Float T7Conj = VectorMultiply(T7, SignFlipImag); VectorRegister4Float D0 = VectorAdd(C0, C1); VectorRegister4Float D1 = VectorSubtract(C0, C1); VectorRegister4Float D2 = VectorAdd(C2, C3ConjSwizzle); VectorRegister4Float D3 = VectorSubtract(C2, C3ConjSwizzle); VectorRegister4Float D4 = VectorMultiplyAdd(T5, VectorSqrt2D2, C4); VectorRegister4Float D5 = VectorMultiplyAdd(T5, VectorNegSqrt2D2, C4); VectorRegister4Float D6 = VectorMultiplyAdd(VectorNegSqrt2D2, T7Conj, C6); VectorRegister4Float D7 = VectorMultiplyAdd(VectorSqrt2D2, T7Conj, 
C6); VectorStore(D0, &OutValues[Pos0]); VectorStore(D1, &OutValues[Pos1]); VectorStore(D2, &OutValues[Pos2]); VectorStore(D3, &OutValues[Pos3]); VectorStore(D4, &OutValues[Pos4]); VectorStore(D5, &OutValues[Pos5]); VectorStore(D6, &OutValues[Pos6]); VectorStore(D7, &OutValues[Pos7]); } } // Perform a radix4 butterfly with dynamic weights. void Radix4Butterfly(float* InOutValues, int32 ButterflyIndex, int32 InStageIndex, const FRadix4Weight& Weights) { // Number of values between butterflies. const int32 Stride = 1 << InStageIndex; const int32 NumButterflies = 1 << (InStageIndex - 2); // Load weights for butterfly const VectorRegister4Float Weight1Real = VectorLoad(Weights.W1R); const VectorRegister4Float Weight1Imag = VectorLoad(Weights.W1I); const VectorRegister4Float Weight2Real = VectorLoad(Weights.W2R); const VectorRegister4Float Weight2Imag = VectorLoad(Weights.W2I); const VectorRegister4Float Weight3Real = VectorLoad(Weights.W3R); const VectorRegister4Float Weight3Imag = VectorLoad(Weights.W3I); const VectorRegister4Float Weight3RealNeg = VectorLoad(Weights.W3RNeg); const VectorRegister4Float Weight2RealNeg = VectorLoad(Weights.W2RNeg); const VectorRegister4Float Weight1RealNeg = VectorLoad(Weights.W1RNeg); const VectorRegister4Float Weight1RealD2 = VectorLoad(Weights.W1RD2); const VectorRegister4Float Weight1RealD3 = VectorLoad(Weights.W1RD3); // Perform butterflies. for (int32 i = 0; i < NumButterflies; i += 2) { const int32 Pos0 = 2 * (Stride * ButterflyIndex + i); const int32 Pos1 = 2 * (Stride * ButterflyIndex + 1 * NumButterflies + i); const int32 Pos2 = 2 * (Stride * ButterflyIndex + 2 * NumButterflies + i); const int32 Pos3 = 2 * (Stride * ButterflyIndex + 3 * NumButterflies + i); VectorRegister4Float A0 = VectorLoad(&InOutValues[Pos0]); VectorRegister4Float A1 = VectorLoad(&InOutValues[Pos1]); VectorRegister4Float A2 = VectorLoad(&InOutValues[Pos2]); VectorRegister4Float A3 = VectorLoad(&InOutValues[Pos3]); VectorRegister4Float A1Swizzle = VectorSwizzle(A1, 1, 0, 3, 2); VectorRegister4Float A2Swizzle = VectorSwizzle(A2, 1, 0, 3, 2); VectorRegister4Float A3Swizzle = VectorSwizzle(A3, 1, 0, 3, 2); VectorRegister4Float B1 = VectorMultiplyAdd(A1Swizzle, Weight1Imag, A1); VectorRegister4Float B2 = VectorMultiplyAdd(A2Swizzle, Weight2Imag, A2); VectorRegister4Float B3 = VectorMultiplyAdd(A3Swizzle, Weight3Imag, A3); VectorRegister4Float C0 = VectorMultiplyAdd(B2, Weight2Real, A0); VectorRegister4Float C2 = VectorMultiplyAdd(B2, Weight2RealNeg, A0); VectorRegister4Float C1 = VectorMultiplyAdd(B3, Weight3Real, B1); VectorRegister4Float C3 = VectorMultiplyAdd(B3, Weight3RealNeg, B1); VectorRegister4Float C3Swizzle = VectorSwizzle(C3, 1, 0, 3, 2); VectorRegister4Float D0 = VectorMultiplyAdd(C1, Weight1Real, C0); VectorRegister4Float D1 = VectorMultiplyAdd(C1, Weight1RealNeg, C0); VectorRegister4Float D2 = VectorMultiplyAdd(C3Swizzle, Weight1RealD2, C2); VectorRegister4Float D3 = VectorMultiplyAdd(C3Swizzle, Weight1RealD3, C2); VectorStore(D0, &InOutValues[Pos0]); VectorStore(D1, &InOutValues[Pos1]); VectorStore(D2, &InOutValues[Pos2]); VectorStore(D3, &InOutValues[Pos3]); } } // Special case of 2nd to last radix4 which has to load new weights for // each iteration. void Radix4Butterfly2ndToFinal(float* InOutValues, int32 StageIndex) { int32 NumPasses = 1 << StageIndex; // Elements between passes. const int32 Stride = 16; for (int32 i = 0; i < NumPasses; ++i) { // Load values for current weight. 
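				// Each pass of this stage covers only 16 complex values (4 per butterfly leg),
				// so a fresh weight set is needed for every pass rather than being hoisted
				// outside the loop as in Radix4Butterfly.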
const FRadix4Weight& Weights = Radix4Weights[i]; const VectorRegister4Float Weight1Real = VectorLoad(Weights.W1R); const VectorRegister4Float Weight1Imag = VectorLoad(Weights.W1I); const VectorRegister4Float Weight2Real = VectorLoad(Weights.W2R); const VectorRegister4Float Weight2Imag = VectorLoad(Weights.W2I); const VectorRegister4Float Weight3Real = VectorLoad(Weights.W3R); const VectorRegister4Float Weight3Imag = VectorLoad(Weights.W3I); const VectorRegister4Float Weight3RealNeg = VectorLoad(Weights.W3RNeg); const VectorRegister4Float Weight2RealNeg = VectorLoad(Weights.W2RNeg); const VectorRegister4Float Weight1RealNeg = VectorLoad(Weights.W1RNeg); const VectorRegister4Float Weight1RealD2 = VectorLoad(Weights.W1RD2); const VectorRegister4Float Weight1RealD3 = VectorLoad(Weights.W1RD3); for (int32 j = 0; j < 4 ; j += 2) { const int32 Pos0 = 2 * (Stride * i + j); const int32 Pos1 = 2 * (Stride * i + 4 + j); const int32 Pos2 = 2 * (Stride * i + 8 + j); const int32 Pos3 = 2 * (Stride * i + 12 + j); VectorRegister4Float A0 = VectorLoad(&InOutValues[Pos0]); VectorRegister4Float A1 = VectorLoad(&InOutValues[Pos1]); VectorRegister4Float A2 = VectorLoad(&InOutValues[Pos2]); VectorRegister4Float A3 = VectorLoad(&InOutValues[Pos3]); VectorRegister4Float A1Swizzle = VectorSwizzle(A1, 1, 0, 3, 2); VectorRegister4Float A2Swizzle = VectorSwizzle(A2, 1, 0, 3, 2); VectorRegister4Float A3Swizzle = VectorSwizzle(A3, 1, 0, 3, 2); VectorRegister4Float B1 = VectorMultiplyAdd(A1Swizzle, Weight1Imag, A1); VectorRegister4Float B2 = VectorMultiplyAdd(A2Swizzle, Weight2Imag, A2); VectorRegister4Float B3 = VectorMultiplyAdd(A3Swizzle, Weight3Imag, A3); VectorRegister4Float C0 = VectorMultiplyAdd(B2, Weight2Real, A0); VectorRegister4Float C1 = VectorMultiplyAdd(B3, Weight3Real, B1); VectorRegister4Float C2 = VectorMultiplyAdd(B2, Weight2RealNeg, A0); VectorRegister4Float C3 = VectorMultiplyAdd(B3, Weight3RealNeg, B1); VectorRegister4Float C3Swizzle = VectorSwizzle(C3, 1, 0, 3, 2); VectorRegister4Float D0 = VectorMultiplyAdd(C1, Weight1Real, C0); VectorRegister4Float D1 = VectorMultiplyAdd(C1, Weight1RealNeg, C0); VectorRegister4Float D2 = VectorMultiplyAdd(C3Swizzle, Weight1RealD2, C2); VectorRegister4Float D3 = VectorMultiplyAdd(C3Swizzle, Weight1RealD3, C2); VectorStore(D0, &InOutValues[Pos0]); VectorStore(D1, &InOutValues[Pos1]); VectorStore(D2, &InOutValues[Pos2]); VectorStore(D3, &InOutValues[Pos3]); } } } // Read data for final butterfly. Performs part of bit reversal order. 
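		// The loads below gather pairs of complex values from the four quarter-offsets of the
		// buffer and shuffle them into the register layout expected by
		// Radix4ButterflyFinalIteration; the InReadIndex-based addressing together with the
		// shuffles is what performs that partial reversal.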
void ReadFinalButterflyInputs(const float* InValues, int32 InNumButterflies, int32 InReadIndex, FFinalInputs& OutValues) { const int32 Pos0 = 2 * (0 * InNumButterflies + 4 * InReadIndex); const int32 Pos1 = Pos0 + 4; const int32 Pos2 = 2 * (2 * InNumButterflies + 4 * InReadIndex); const int32 Pos3 = Pos2 + 4; const int32 Pos4 = 2 * (1 * InNumButterflies + 4 * InReadIndex); const int32 Pos5 = Pos4 + 4; const int32 Pos6 = 2 * (3 * InNumButterflies + 4 * InReadIndex); const int32 Pos7 = Pos6 + 4; VectorRegister4Float T0 = VectorLoad(&InValues[Pos0]); VectorRegister4Float T1 = VectorLoad(&InValues[Pos1]); VectorRegister4Float T2 = VectorLoad(&InValues[Pos2]); VectorRegister4Float T3 = VectorLoad(&InValues[Pos3]); VectorRegister4Float T4 = VectorLoad(&InValues[Pos4]); VectorRegister4Float T5 = VectorLoad(&InValues[Pos5]); VectorRegister4Float T6 = VectorLoad(&InValues[Pos6]); VectorRegister4Float T7 = VectorLoad(&InValues[Pos7]); OutValues.A0 = VectorShuffle(T0, T2, 0, 1, 0, 1); OutValues.A1 = VectorShuffle(T4, T6, 0, 1, 0, 1); OutValues.A2 = VectorShuffle(T0, T2, 2, 3, 2, 3); OutValues.A3 = VectorShuffle(T4, T6, 2, 3, 2, 3); OutValues.A4 = VectorShuffle(T1, T3, 0, 1, 0, 1); OutValues.A5 = VectorShuffle(T5, T7, 0, 1, 0, 1); OutValues.A6 = VectorShuffle(T1, T3, 2, 3, 2, 3); OutValues.A7 = VectorShuffle(T5, T7, 2, 3, 2, 3); } // Write data for final butterfly. Performs part of bit reversal order. void WriteFinalButterflyOutputs(const FFinalOutputs& InResult, int32 InNumButterflies, int32 InWriteIndex, float* OutValues) { const int32 Pos0 = 2 * (0 * InNumButterflies + 4 * InWriteIndex); const int32 Pos1 = Pos0 + 4; const int32 Pos2 = 2 * (2 * InNumButterflies + 4 * InWriteIndex); const int32 Pos3 = Pos2 + 4; const int32 Pos4 = 2 * (1 * InNumButterflies + 4 * InWriteIndex); const int32 Pos5 = Pos4 + 4; const int32 Pos6 = 2 * (3 * InNumButterflies + 4 * InWriteIndex); const int32 Pos7 = Pos6 + 4; VectorStore(InResult.D0, &OutValues[Pos0]); VectorStore(InResult.D1, &OutValues[Pos1]); VectorStore(InResult.D2, &OutValues[Pos2]); VectorStore(InResult.D3, &OutValues[Pos3]); VectorStore(InResult.D4, &OutValues[Pos4]); VectorStore(InResult.D5, &OutValues[Pos5]); VectorStore(InResult.D6, &OutValues[Pos6]); VectorStore(InResult.D7, &OutValues[Pos7]); } // Compute butterfly in final stage. void Radix4ButterflyFinalIteration(const FFinalInputs& Inputs, const FFinalWeights& InWeights, FFinalOutputs& Outputs) { // Note: Some weights are altered to bake in sign flips to avoid an extra multiply later on. 
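			// The complex twiddle multiplies are factored as W = cos(p) * (1 + i*tan(p)): the W*I
			// arrays hold +/-tan of the twiddle phase and the W*R arrays hold matching cosine-based
			// factors (or sign-flipped copies), so each weighted term reduces to fused
			// multiply-adds on interleaved [re, im] pairs instead of a full complex multiply.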
const VectorRegister4Float W2I = VectorLoad(InWeights.W2I); const VectorRegister4Float W2R = VectorLoad(InWeights.W2R); const VectorRegister4Float W3I = VectorLoad(InWeights.W3I); const VectorRegister4Float W3R = VectorLoad(InWeights.W3R); const VectorRegister4Float W4I = VectorLoad(InWeights.W4I); const VectorRegister4Float W4R = VectorLoad(InWeights.W4R); const VectorRegister4Float W5I = VectorLoad(InWeights.W5I); const VectorRegister4Float W5R = VectorLoad(InWeights.W5R); const VectorRegister4Float W6I = VectorLoad(InWeights.W6I); const VectorRegister4Float W6R = VectorLoad(InWeights.W6R); const VectorRegister4Float W7I = VectorLoad(InWeights.W7I); const VectorRegister4Float W7R = VectorLoad(InWeights.W7R); const VectorRegister4Float W2RNeg = VectorLoad(InWeights.W2RNeg); const VectorRegister4Float W3RNeg = VectorLoad(InWeights.W3RNeg); const VectorRegister4Float W4RNeg = VectorLoad(InWeights.W4RNeg); const VectorRegister4Float W5RNeg = VectorLoad(InWeights.W5RNeg); const VectorRegister4Float W6RNeg = VectorLoad(InWeights.W6RNeg); const VectorRegister4Float W7RNeg = VectorLoad(InWeights.W7RNeg); const VectorRegister4Float W2RD4 = VectorLoad(InWeights.W2RD4); const VectorRegister4Float W2RD6 = VectorLoad(InWeights.W2RD6); const VectorRegister4Float W3RD5 = VectorLoad(InWeights.W3RD5); const VectorRegister4Float W3RD7 = VectorLoad(InWeights.W3RD7); VectorRegister4Float A2Swizzle = VectorSwizzle(Inputs.A2, 1, 0, 3, 2); VectorRegister4Float A3Swizzle = VectorSwizzle(Inputs.A3, 1, 0, 3, 2); VectorRegister4Float A4Swizzle = VectorSwizzle(Inputs.A4, 1, 0, 3, 2); VectorRegister4Float A5Swizzle = VectorSwizzle(Inputs.A5, 1, 0, 3, 2); VectorRegister4Float A6Swizzle = VectorSwizzle(Inputs.A6, 1, 0, 3, 2); VectorRegister4Float A7Swizzle = VectorSwizzle(Inputs.A7, 1, 0, 3, 2); VectorRegister4Float B2 = VectorMultiplyAdd(A2Swizzle, W2I, Inputs.A2); VectorRegister4Float B3 = VectorMultiplyAdd(A3Swizzle, W3I, Inputs.A3); VectorRegister4Float B4 = VectorMultiplyAdd(A4Swizzle, W4I, Inputs.A4); VectorRegister4Float B5 = VectorMultiplyAdd(A5Swizzle, W5I, Inputs.A5); VectorRegister4Float B6 = VectorMultiplyAdd(A6Swizzle, W6I, Inputs.A6); VectorRegister4Float B7 = VectorMultiplyAdd(A7Swizzle, W7I, Inputs.A7); VectorRegister4Float C0 = VectorMultiplyAdd(B4, W4R, Inputs.A0); VectorRegister4Float C1 = VectorMultiplyAdd(B5, W5R, Inputs.A1); VectorRegister4Float C2 = VectorMultiplyAdd(B6, W6R, B2); VectorRegister4Float C3 = VectorMultiplyAdd(B7, W7R, B3); VectorRegister4Float C4 = VectorMultiplyAdd(B4, W4RNeg, Inputs.A0); VectorRegister4Float C5 = VectorMultiplyAdd(B5, W5RNeg, Inputs.A1); VectorRegister4Float C6 = VectorMultiplyAdd(B6, W6RNeg, B2); VectorRegister4Float C7 = VectorMultiplyAdd(B7, W7RNeg, B3); VectorRegister4Float C6Swizzle = VectorSwizzle(C6, 1, 0, 3, 2); VectorRegister4Float C7Swizzle = VectorSwizzle(C7, 1, 0, 3, 2); Outputs.D0 = VectorMultiplyAdd(C2, W2R, C0); Outputs.D1 = VectorMultiplyAdd(C3, W3R, C1); Outputs.D2 = VectorMultiplyAdd(C2, W2RNeg, C0); Outputs.D3 = VectorMultiplyAdd(C3, W3RNeg, C1); Outputs.D4 = VectorMultiplyAdd(C6Swizzle, W2RD4, C4); Outputs.D5 = VectorMultiplyAdd(C7Swizzle, W3RD5, C5); Outputs.D6 = VectorMultiplyAdd(C6Swizzle, W2RD6, C4); Outputs.D7 = VectorMultiplyAdd(C7Swizzle, W3RD7, C5); } // Perform last set of radix 4 butterflies. // // This method is special since it also performs bit order reversal in a // moderately cache coherent manner. 
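		// The iteration below is pipelined one step deep: the inputs for iteration i are read
		// before the outputs of iteration i - 1 are written, and GenerateFinalIndices pairs each
		// read index with its bit-reversed write index (emitting both orders when they differ),
		// so no butterfly's inputs are overwritten before they have been consumed.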
		void Radix4ButterflyFinal(float* InOutValues, int InStageIndex)
		{
			int32 NumButterflies = 1 << InStageIndex;
			int32 NumIterations = NumButterflies >> 2;

			FFinalInputs Inputs;
			FFinalOutputs Outputs;

			int32 Iteration = 0;

			ReadFinalButterflyInputs(InOutValues, NumButterflies, FinalIndices[Iteration].ReadIndex, Inputs);
			Radix4ButterflyFinalIteration(Inputs, FinalWeights[Iteration], Outputs);

			for (Iteration = 1; Iteration < NumIterations; ++Iteration)
			{
				ReadFinalButterflyInputs(InOutValues, NumButterflies, FinalIndices[Iteration].ReadIndex, Inputs);
				WriteFinalButterflyOutputs(Outputs, NumButterflies, FinalIndices[Iteration - 1].WriteIndex, InOutValues);
				Radix4ButterflyFinalIteration(Inputs, FinalWeights[Iteration], Outputs);
			}

			WriteFinalButterflyOutputs(Outputs, NumButterflies, FinalIndices[Iteration - 1].WriteIndex, InOutValues);
		}

		int32 IntLog2(int32 InValue)
		{
			check(InValue > 0);
			check(FMath::CountBits(InValue) == 1);

			return FMath::CountTrailingZeros(InValue);
		}

		void GenerateFinalIndices()
		{
			// Each pass of the final radix 4 operates on 16 complex values.
			const int32 FinalPassSize = 16;
			const int32 NumFinalIndices = FFTSize / FinalPassSize;

			// We need to ensure that FFTSize is at least 16 or else the final
			// FFT pass would go past the end of the buffer.
			check(FFTSize >= FinalPassSize);

			FinalIndices.Reset();
			FinalIndices.AddUninitialized(NumFinalIndices);

			// These indices perform part of the bit order reversal on 16 element boundaries.
			// Need to shift bits to get the bit reversed order on 16 element blocks.
			const int32 Shift = 32 - (IntLog2(FFTSize) - 4);

			int32 Index = 0;
			for (int32 ReadIndex = 0; (ReadIndex < NumFinalIndices) && (Index < NumFinalIndices); ++ReadIndex)
			{
				// Get bit reversed order on 16 element block
				const int32 WriteIndex = ReverseBits(ReadIndex) >> Shift;

				// If ReadIndex > WriteIndex, then ReadIndex in a previous iteration had the
				// value WriteIndex has now, and we do not want to repeat it
				if (ReadIndex == WriteIndex)
				{
					// If equal, add one entry to read and write to same index.
					FinalIndices[Index] = { ReadIndex, WriteIndex };
					Index++;
				}
				else if (ReadIndex < WriteIndex)
				{
					// If ReadIndex < WriteIndex, add table entries in both orders.
					// Loop logic in final radix pass will make sure that nothing
					// gets overwritten.
					FinalIndices[Index] = { ReadIndex, WriteIndex };
					Index++;
					FinalIndices[Index] = { WriteIndex, ReadIndex };
					Index++;
				}
			}
		}

		void GenerateFinalWeights()
		{
			// Each pass of the final radix 4 operates on 16 complex values.
			const int32 FinalPassSize = 16;
			const int32 NumFinalWeights = FFTSize / FinalPassSize;

			// We need to ensure that FFTSize is at least 16 or else the final
			// FFT pass would go past the end of the buffer.
			check(FFTSize >= FinalPassSize);

			FinalWeights.Reset();
			FinalWeights.AddUninitialized(NumFinalWeights);

			const double Scale = 1. / static_cast<double>(FFTSize);

			for (int32 Pass = 0; Pass < NumFinalWeights; ++Pass)
			{
				const int ReadIndex = FinalIndices[Pass].ReadIndex;
				const double RotatedBitFraction = RotateBitsAroundPoint(4 * ReadIndex);

				const double Phase = 2. * PI * (RotatedBitFraction + 0 * Scale);
				const double Phase1 = 2. * PI * (RotatedBitFraction + 1 * Scale);
				const double Phase2 = 2. * PI * (RotatedBitFraction + 2 * Scale);
				const double Phase3 = 2.
* PI * (RotatedBitFraction + 3 * Scale); FinalWeights[Pass].W2I[0] = -FMath::Tan(Phase); FinalWeights[Pass].W2I[1] = FMath::Tan(Phase); FinalWeights[Pass].W2I[2] = -FMath::Tan(Phase1); FinalWeights[Pass].W2I[3] = FMath::Tan(Phase1); FinalWeights[Pass].W3I[0] = -FMath::Tan(Phase2); FinalWeights[Pass].W3I[1] = FMath::Tan(Phase2); FinalWeights[Pass].W3I[2] = -FMath::Tan(Phase3); FinalWeights[Pass].W3I[3] = FMath::Tan(Phase3); FinalWeights[Pass].W2R[0] = FMath::Cos(Phase); FinalWeights[Pass].W2R[1] = FMath::Cos(Phase); FinalWeights[Pass].W2R[2] = FMath::Cos(Phase1); FinalWeights[Pass].W2R[3] = FMath::Cos(Phase1); FinalWeights[Pass].W2RD4[0] = -FMath::Cos(Phase); FinalWeights[Pass].W2RD4[1] = FMath::Cos(Phase); FinalWeights[Pass].W2RD4[2] = -FMath::Cos(Phase1); FinalWeights[Pass].W2RD4[3] = FMath::Cos(Phase1); FinalWeights[Pass].W2RD6[0] = FMath::Cos(Phase); FinalWeights[Pass].W2RD6[1] = -FMath::Cos(Phase); FinalWeights[Pass].W2RD6[2] = FMath::Cos(Phase1); FinalWeights[Pass].W2RD6[3] = -FMath::Cos(Phase1); FinalWeights[Pass].W2RNeg[0] = -FMath::Cos(Phase); FinalWeights[Pass].W2RNeg[1] = -FMath::Cos(Phase); FinalWeights[Pass].W2RNeg[2] = -FMath::Cos(Phase1); FinalWeights[Pass].W2RNeg[3] = -FMath::Cos(Phase1); FinalWeights[Pass].W3R[0] = FMath::Cos(Phase2); FinalWeights[Pass].W3R[1] = FMath::Cos(Phase2); FinalWeights[Pass].W3R[2] = FMath::Cos(Phase3); FinalWeights[Pass].W3R[3] = FMath::Cos(Phase3); FinalWeights[Pass].W3RD5[0] = -FMath::Cos(Phase2); FinalWeights[Pass].W3RD5[1] = FMath::Cos(Phase2); FinalWeights[Pass].W3RD5[2] = -FMath::Cos(Phase3); FinalWeights[Pass].W3RD5[3] = FMath::Cos(Phase3); FinalWeights[Pass].W3RD7[0] = FMath::Cos(Phase2); FinalWeights[Pass].W3RD7[1] = -FMath::Cos(Phase2); FinalWeights[Pass].W3RD7[2] = FMath::Cos(Phase3); FinalWeights[Pass].W3RD7[3] = -FMath::Cos(Phase3); FinalWeights[Pass].W3RNeg[0] = -FMath::Cos(Phase2); FinalWeights[Pass].W3RNeg[1] = -FMath::Cos(Phase2); FinalWeights[Pass].W3RNeg[2] = -FMath::Cos(Phase3); FinalWeights[Pass].W3RNeg[3] = -FMath::Cos(Phase3); FinalWeights[Pass].W4I[0] = -FMath::Tan(Phase + Phase); FinalWeights[Pass].W4I[1] = FMath::Tan(Phase + Phase); FinalWeights[Pass].W4I[2] = -FMath::Tan(Phase1 + Phase1); FinalWeights[Pass].W4I[3] = FMath::Tan(Phase1 + Phase1); FinalWeights[Pass].W5I[0] = -FMath::Tan(Phase2 + Phase2); FinalWeights[Pass].W5I[1] = FMath::Tan(Phase2 + Phase2); FinalWeights[Pass].W5I[2] = -FMath::Tan(Phase3 + Phase3); FinalWeights[Pass].W5I[3] = FMath::Tan(Phase3 + Phase3); FinalWeights[Pass].W4R[0] = FMath::Cos(Phase + Phase); FinalWeights[Pass].W4R[1] = FMath::Cos(Phase + Phase); FinalWeights[Pass].W4R[2] = FMath::Cos(Phase1 + Phase1); FinalWeights[Pass].W4R[3] = FMath::Cos(Phase1 + Phase1); FinalWeights[Pass].W4RNeg[0] = -FMath::Cos(Phase + Phase); FinalWeights[Pass].W4RNeg[1] = -FMath::Cos(Phase + Phase); FinalWeights[Pass].W4RNeg[2] = -FMath::Cos(Phase1 + Phase1); FinalWeights[Pass].W4RNeg[3] = -FMath::Cos(Phase1 + Phase1); FinalWeights[Pass].W5R[0] = FMath::Cos(Phase2 + Phase2); FinalWeights[Pass].W5R[1] = FMath::Cos(Phase2 + Phase2); FinalWeights[Pass].W5R[2] = FMath::Cos(Phase3 + Phase3); FinalWeights[Pass].W5R[3] = FMath::Cos(Phase3 + Phase3); FinalWeights[Pass].W5RNeg[0] = -FMath::Cos(Phase2 + Phase2); FinalWeights[Pass].W5RNeg[1] = -FMath::Cos(Phase2 + Phase2); FinalWeights[Pass].W5RNeg[2] = -FMath::Cos(Phase3 + Phase3); FinalWeights[Pass].W5RNeg[3] = -FMath::Cos(Phase3 + Phase3); FinalWeights[Pass].W6I[0] = -FMath::Tan(3. * Phase); FinalWeights[Pass].W6I[1] = FMath::Tan(3. 
				* Phase);
				FinalWeights[Pass].W6I[2] = -FMath::Tan(3. * Phase1);
				FinalWeights[Pass].W6I[3] = FMath::Tan(3. * Phase1);

				FinalWeights[Pass].W7I[0] = -FMath::Tan(3. * Phase2);
				FinalWeights[Pass].W7I[1] = FMath::Tan(3. * Phase2);
				FinalWeights[Pass].W7I[2] = -FMath::Tan(3. * Phase3);
				FinalWeights[Pass].W7I[3] = FMath::Tan(3. * Phase3);

				FinalWeights[Pass].W6R[0] = 2 * FMath::Cos(Phase + Phase) - 1;
				FinalWeights[Pass].W6R[1] = 2 * FMath::Cos(Phase + Phase) - 1;
				FinalWeights[Pass].W6R[2] = 2 * FMath::Cos(Phase1 + Phase1) - 1;
				FinalWeights[Pass].W6R[3] = 2 * FMath::Cos(Phase1 + Phase1) - 1;

				FinalWeights[Pass].W6RNeg[0] = -(2 * FMath::Cos(Phase + Phase) - 1);
				FinalWeights[Pass].W6RNeg[1] = -(2 * FMath::Cos(Phase + Phase) - 1);
				FinalWeights[Pass].W6RNeg[2] = -(2 * FMath::Cos(Phase1 + Phase1) - 1);
				FinalWeights[Pass].W6RNeg[3] = -(2 * FMath::Cos(Phase1 + Phase1) - 1);

				FinalWeights[Pass].W7R[0] = 2 * FMath::Cos(Phase2 + Phase2) - 1;
				FinalWeights[Pass].W7R[1] = 2 * FMath::Cos(Phase2 + Phase2) - 1;
				FinalWeights[Pass].W7R[2] = 2 * FMath::Cos(Phase3 + Phase3) - 1;
				FinalWeights[Pass].W7R[3] = 2 * FMath::Cos(Phase3 + Phase3) - 1;

				FinalWeights[Pass].W7RNeg[0] = -(2 * FMath::Cos(Phase2 + Phase2) - 1);
				FinalWeights[Pass].W7RNeg[1] = -(2 * FMath::Cos(Phase2 + Phase2) - 1);
				FinalWeights[Pass].W7RNeg[2] = -(2 * FMath::Cos(Phase3 + Phase3) - 1);
				FinalWeights[Pass].W7RNeg[3] = -(2 * FMath::Cos(Phase3 + Phase3) - 1);
			}
		}

		void GenerateRadix4Weights()
		{
			const int32 Radix4PassSize = 16;

			check(FFTSize >= Radix4PassSize);

			// Each radix-4 butterfly weight set covers a block of Radix4PassSize complex values
			// at the densest stage, so at most FFTSize / Radix4PassSize weight sets are needed.
			int32 MaxNumButterfliesInStage = FFTSize / Radix4PassSize;

			Radix4Weights.Reset();
			Radix4Weights.AddUninitialized(MaxNumButterfliesInStage);

			for (int32 ButterflyIndex = 0; ButterflyIndex < MaxNumButterfliesInStage; ++ButterflyIndex)
			{
				const double Phase = 2. * PI * RotateBitsAroundPoint(4 * ButterflyIndex);

				const float W1R = static_cast<float>(FMath::Cos(Phase));
				const float W1I = static_cast<float>(FMath::Tan(Phase));
				const float W2R = static_cast<float>(FMath::Cos(Phase + Phase));
				const float W2I = static_cast<float>(FMath::Tan(Phase + Phase));
				const float W3R = static_cast<float>(2. * W2R - 1.);
				const float W3I = static_cast<float>(FMath::Tan(3.
* Phase)); Radix4Weights[ButterflyIndex].W1R[0] = W1R; Radix4Weights[ButterflyIndex].W1I[0] = -W1I; Radix4Weights[ButterflyIndex].W2R[0] = W2R; Radix4Weights[ButterflyIndex].W2I[0] = -W2I; Radix4Weights[ButterflyIndex].W3R[0] = W3R; Radix4Weights[ButterflyIndex].W3I[0] = -W3I; Radix4Weights[ButterflyIndex].W1RNeg[0] = -W1R; Radix4Weights[ButterflyIndex].W1RD2[0] = -W1R; Radix4Weights[ButterflyIndex].W1RD3[0] = W1R; Radix4Weights[ButterflyIndex].W2RNeg[0] = -W2R; Radix4Weights[ButterflyIndex].W3RNeg[0] = -W3R; Radix4Weights[ButterflyIndex].W1R[1] = W1R; Radix4Weights[ButterflyIndex].W1I[1] = W1I; Radix4Weights[ButterflyIndex].W2R[1] = W2R; Radix4Weights[ButterflyIndex].W2I[1] = W2I; Radix4Weights[ButterflyIndex].W3R[1] = W3R; Radix4Weights[ButterflyIndex].W3I[1] = W3I; Radix4Weights[ButterflyIndex].W1RNeg[1] = -W1R; Radix4Weights[ButterflyIndex].W1RD2[1] = W1R; Radix4Weights[ButterflyIndex].W1RD3[1] = -W1R; Radix4Weights[ButterflyIndex].W2RNeg[1] = -W2R; Radix4Weights[ButterflyIndex].W3RNeg[1] = -W3R; Radix4Weights[ButterflyIndex].W1R[2] = W1R; Radix4Weights[ButterflyIndex].W1I[2] = -W1I; Radix4Weights[ButterflyIndex].W2R[2] = W2R; Radix4Weights[ButterflyIndex].W2I[2] = -W2I; Radix4Weights[ButterflyIndex].W3R[2] = W3R; Radix4Weights[ButterflyIndex].W3I[2] = -W3I; Radix4Weights[ButterflyIndex].W1RNeg[2] = -W1R; Radix4Weights[ButterflyIndex].W1RD2[2] = -W1R; Radix4Weights[ButterflyIndex].W1RD3[2] = W1R; Radix4Weights[ButterflyIndex].W2RNeg[2] = -W2R; Radix4Weights[ButterflyIndex].W3RNeg[2] = -W3R; Radix4Weights[ButterflyIndex].W1R[3] = W1R; Radix4Weights[ButterflyIndex].W1I[3] = W1I; Radix4Weights[ButterflyIndex].W2R[3] = W2R; Radix4Weights[ButterflyIndex].W2I[3] = W2I; Radix4Weights[ButterflyIndex].W3R[3] = W3R; Radix4Weights[ButterflyIndex].W3I[3] = W3I; Radix4Weights[ButterflyIndex].W1RNeg[3] = -W1R; Radix4Weights[ButterflyIndex].W1RD2[3] = W1R; Radix4Weights[ButterflyIndex].W1RD3[3] = -W1R; Radix4Weights[ButterflyIndex].W2RNeg[3] = -W2R; Radix4Weights[ButterflyIndex].W3RNeg[3] = -W3R; } } // A funny but useful function which reverses bits and // places them behind a point. For example // 0101 is transformed to 0.1010 double RotateBitsAroundPoint(uint32 InValue) { uint32 ReversedValue = ReverseBits(InValue); double OutValue = ReversedValue / 4294967296.; OutValue = 1. / 4294967296. 
				* ReversedValue;

			return OutValue;
		}

		uint32 ReverseBits(uint32 InValue)
		{
			static const uint8 ByteReversal[256] = {
				0, 128, 64, 192, 32, 160, 96, 224, 16, 144, 80, 208, 48, 176, 112, 240,
				8, 136, 72, 200, 40, 168, 104, 232, 24, 152, 88, 216, 56, 184, 120, 248,
				4, 132, 68, 196, 36, 164, 100, 228, 20, 148, 84, 212, 52, 180, 116, 244,
				12, 140, 76, 204, 44, 172, 108, 236, 28, 156, 92, 220, 60, 188, 124, 252,
				2, 130, 66, 194, 34, 162, 98, 226, 18, 146, 82, 210, 50, 178, 114, 242,
				10, 138, 74, 202, 42, 170, 106, 234, 26, 154, 90, 218, 58, 186, 122, 250,
				6, 134, 70, 198, 38, 166, 102, 230, 22, 150, 86, 214, 54, 182, 118, 246,
				14, 142, 78, 206, 46, 174, 110, 238, 30, 158, 94, 222, 62, 190, 126, 254,
				1, 129, 65, 193, 33, 161, 97, 225, 17, 145, 81, 209, 49, 177, 113, 241,
				9, 137, 73, 201, 41, 169, 105, 233, 25, 153, 89, 217, 57, 185, 121, 249,
				5, 133, 69, 197, 37, 165, 101, 229, 21, 149, 85, 213, 53, 181, 117, 245,
				13, 141, 77, 205, 45, 173, 109, 237, 29, 157, 93, 221, 61, 189, 125, 253,
				3, 131, 67, 195, 35, 163, 99, 227, 19, 147, 83, 211, 51, 179, 115, 243,
				11, 139, 75, 203, 43, 171, 107, 235, 27, 155, 91, 219, 59, 187, 123, 251,
				7, 135, 71, 199, 39, 167, 103, 231, 23, 151, 87, 215, 55, 183, 119, 247,
				15, 143, 79, 207, 47, 175, 111, 239, 31, 159, 95, 223, 63, 191, 127, 255
			};

			uint8 Byte0 = ByteReversal[InValue >> 0*8 & 0xff];
			uint8 Byte1 = ByteReversal[InValue >> 1*8 & 0xff];
			uint8 Byte2 = ByteReversal[InValue >> 2*8 & 0xff];
			uint8 Byte3 = ByteReversal[InValue >> 3*8 & 0xff];

			uint32 OutValue = Byte0 << 3*8 | Byte1 << 2*8 | Byte2 << 1*8 | Byte3 << 0*8;

			return OutValue;
		}

		int32 Log2FFTSize;
		int32 FFTSize;
		int32 NumFloats;

		TArray<FRadix4Weight> Radix4Weights;
		TArray<FFinalIndices> FinalIndices;
		TArray<FFinalWeights> FinalWeights;

		FAlignedFloatBuffer InverseWorkBuffer;
	};

	// Minimum log 2 size of fft
	const int32 FVectorRealToComplexFFT::MinLog2FFTSize = FVectorComplexFFT::MinLog2FFTSize + 1;

	// Maximum log 2 size of fft
	const int32 FVectorRealToComplexFFT::MaxLog2FFTSize = FVectorComplexFFT::MaxLog2FFTSize + 1;

	void FVectorRealToComplexFFT::InitRealSequenceConversionBuffers()
	{
		// Conversion buffers for performing a real valued FFT using a complex fft.
		// The values in the buffer are set up to support SIMD operations, resulting
		// in some duplicate data.
		ForwardConvBuffers.AlphaReal.AddUninitialized(FFTSize);
		ForwardConvBuffers.AlphaImag.AddUninitialized(FFTSize);
		ForwardConvBuffers.BetaReal.AddUninitialized(FFTSize);
		ForwardConvBuffers.BetaImag.AddUninitialized(FFTSize);

		InverseConvBuffers.AlphaReal.AddUninitialized(FFTSize);
		InverseConvBuffers.AlphaImag.AddUninitialized(FFTSize);
		InverseConvBuffers.BetaReal.AddUninitialized(FFTSize);
		InverseConvBuffers.BetaImag.AddUninitialized(FFTSize);

		float* AlphaRealForwardBufferData = ForwardConvBuffers.AlphaReal.GetData();
		float* AlphaImagForwardBufferData = ForwardConvBuffers.AlphaImag.GetData();
		float* BetaRealForwardBufferData = ForwardConvBuffers.BetaReal.GetData();
		float* BetaImagForwardBufferData = ForwardConvBuffers.BetaImag.GetData();

		float* AlphaRealInverseBufferData = InverseConvBuffers.AlphaReal.GetData();
		float* AlphaImagInverseBufferData = InverseConvBuffers.AlphaImag.GetData();
		float* BetaRealInverseBufferData = InverseConvBuffers.BetaReal.GetData();
		float* BetaImagInverseBufferData = InverseConvBuffers.BetaImag.GetData();

		float PhaseIncrement = PI / static_cast<float>(FFTSize);

		for (int32 i = 0; i < FFTSize; i += 2)
		{
			const float Phase = PhaseIncrement * i;

			const float BetaReal = 0.5 * (1. - FMath::Sin(Phase));
			const float BetaImag = -0.5 * FMath::Cos(Phase);
			const float AlphaReal = 0.5 * (1.
+ FMath::Sin(Phase)); const float AlphaImag = 0.5 * FMath::Cos(Phase); AlphaRealForwardBufferData[i] = AlphaReal; AlphaRealForwardBufferData[i + 1] = -AlphaReal;// Sign flipped to simplify SIMD math AlphaImagForwardBufferData[i] = AlphaImag; AlphaImagForwardBufferData[i + 1] = AlphaImag; BetaRealForwardBufferData[i] = BetaReal; BetaRealForwardBufferData[i + 1] = BetaReal; BetaImagForwardBufferData[i] = -BetaImag; // Sign flipped to simplify SIMD math BetaImagForwardBufferData[i + 1] = BetaImag; AlphaRealInverseBufferData[i] = AlphaReal; AlphaRealInverseBufferData[i + 1] = -AlphaReal; // Sign flipped to simplify SIMD math AlphaImagInverseBufferData[i] = AlphaImag; AlphaImagInverseBufferData[i + 1] = AlphaImag; BetaRealInverseBufferData[i] = BetaReal; BetaRealInverseBufferData[i + 1] = BetaReal; BetaImagInverseBufferData[i] = -BetaImag; // Sign flipped to simplify SIMD math BetaImagInverseBufferData[i + 1] = BetaImag; } } // Performs conversion of buffers required to do real fft using complex fft. void FVectorRealToComplexFFT::ConvertSequence(const FConversionBuffers& InBuffers, const float* RESTRICT InValues, int32 InStartIndex, float* RESTRICT OutValues) { const float* AlphaRealData = InBuffers.AlphaReal.GetData(); const float* AlphaImagData = InBuffers.AlphaImag.GetData(); const float* BetaRealData = InBuffers.BetaReal.GetData(); const float* BetaImagData = InBuffers.BetaImag.GetData(); if (FFTSize > InStartIndex) { VectorRegister4Float VInRev1 = VectorLoad(&InValues[FFTSize - InStartIndex]); for (int32 i = InStartIndex; i < FFTSize; i += 4) { VectorRegister4Float VIn = VectorLoad(&InValues[i]); VectorRegister4Float VInRISwap = VectorSwizzle(VIn, 1, 0, 3, 2); VectorRegister4Float VInRev2 = VectorLoad(&InValues[FFTSize - i - 4]); VectorRegister4Float VInRev = VectorShuffle(VInRev1, VInRev2, 0, 1, 2, 3); VInRev1 = VInRev2; VectorRegister4Float VInRevRISwap = VectorSwizzle(VInRev, 1, 0, 3, 2); VectorRegister4Float VAlphaReal = VectorLoad(&AlphaRealData[i]); VectorRegister4Float VAlphaImag = VectorLoad(&AlphaImagData[i]); VectorRegister4Float VBetaReal = VectorLoad(&BetaRealData[i]); VectorRegister4Float VBetaImag = VectorLoad(&BetaImagData[i]); // Out1 = [ R * Ar, I * Ar] // Out2 = [ I * Ai, R * Ai] // Out3 = [NR * Br, NI * Br] // Out4 = [NI * Bi, NR * Bi] //VectorRegister4Float Out1 = VectorMultiply(VIn, VAlphaReal); VectorRegister4Float Out2 = VectorMultiply(VInRISwap, VAlphaImag); //VectorRegister4Float Out3 = VectorMultiply(VInRev, VBetaReal); VectorRegister4Float Out4 = VectorMultiply(VInRevRISwap, VBetaImag); // Out12 = [(R * Ar) + (I * Ai), (I * Ar) + (R * Ai)] VectorRegister4Float Out12 = VectorMultiplyAdd(VIn, VAlphaReal, Out2); // Out34 = [(NR * Br) + (NI * Bi), (NR * Bi) + (NI * Br)] VectorRegister4Float Out34 = VectorMultiplyAdd(VInRev, VBetaReal, Out4); // Out = [ // (R * Ar) + (I * Ai) + (NR * Br) + (NI * Bi), // (I * Ar) + (R * Ai) + (NR * Bi) + (NI * Br) // ] VectorRegister4Float Out = VectorAdd(Out12, Out34); VectorStore(Out, &OutValues[i]); } } } FVectorRealToComplexFFT::FVectorRealToComplexFFT(int32 InLog2FFTSize) : FFTSize(1 << InLog2FFTSize) , Log2FFTSize(InLog2FFTSize) , ComplexFFT(new FVectorComplexFFT(InLog2FFTSize - 1)) // Utilize a N/2 length complex fft to perform N length fft. { WorkBuffer.AddUninitialized(FFTSize); InitRealSequenceConversionBuffers(); } FVectorRealToComplexFFT::~FVectorRealToComplexFFT() { } int32 FVectorRealToComplexFFT::Size() const { return FFTSize; } /** Scaling applied when performing forward FFT. 
*/ EFFTScaling FVectorRealToComplexFFT::ForwardScaling() const { return EFFTScaling::MultipliedBySqrtFFTSize; } /** Scaling applied when performing inverse FFT. */ EFFTScaling FVectorRealToComplexFFT::InverseScaling() const { return EFFTScaling::DividedBySqrtFFTSize; } void FVectorRealToComplexFFT::ForwardRealToComplex(const float* RESTRICT InReal, float* RESTRICT OutComplex) { // Performs a N sized real-to-complex FFT using an N/2 complex-to-complex FFT. float* WorkData = WorkBuffer.GetData(); ComplexFFT->ForwardComplexToComplex(InReal, WorkData); const float* AlphaRealForwardData = ForwardConvBuffers.AlphaReal.GetData(); const float* AlphaImagForwardData = ForwardConvBuffers.AlphaImag.GetData(); const float* BetaRealForwardData = ForwardConvBuffers.BetaReal.GetData(); const float* BetaImagForwardData = ForwardConvBuffers.BetaImag.GetData(); // Handle special case of this math to account for cyclical index math. OutComplex[0] = (WorkData[0] * AlphaRealForwardData[0]) + (WorkData[1] * AlphaImagForwardData[0]) + (WorkData[0] * BetaRealForwardData[0]) + (WorkData[1] * BetaImagForwardData[0]); OutComplex[1] = (WorkData[1] * AlphaRealForwardData[1]) + (WorkData[0] * AlphaImagForwardData[1]) + (WorkData[0] * BetaImagForwardData[1]) + (WorkData[1] * BetaRealForwardData[1]); OutComplex[2] = (WorkData[2] * AlphaRealForwardData[2]) + (WorkData[3] * AlphaImagForwardData[2]) + (WorkData[FFTSize - 2] * BetaRealForwardData[2]) + (WorkData[FFTSize - 1] * BetaImagForwardData[2]); OutComplex[3] = (WorkData[3] * AlphaRealForwardData[3]) + (WorkData[2] * AlphaImagForwardData[3]) + (WorkData[FFTSize - 2] * BetaImagForwardData[3]) + (WorkData[FFTSize - 1] * BetaRealForwardData[3]); // Convert all other values using optimized SIMD ConvertSequence(ForwardConvBuffers, WorkData, 4, OutComplex); // Handle special case of nyquist frequency OutComplex[FFTSize] = WorkData[0] - WorkData[1]; OutComplex[FFTSize + 1] = 0.f; } void FVectorRealToComplexFFT::InverseComplexToReal(const float* RESTRICT InComplex, float* RESTRICT OutReal) { // Performs a N sized complex-to-real FFT using an N/2 complex-to-complex FFT. float* WorkData = WorkBuffer.GetData(); const float* AlphaRealInverseData = InverseConvBuffers.AlphaReal.GetData(); const float* AlphaImagInverseData = InverseConvBuffers.AlphaImag.GetData(); const float* BetaRealInverseData = InverseConvBuffers.BetaReal.GetData(); const float* BetaImagInverseData = InverseConvBuffers.BetaImag.GetData(); // Handle special case of this math to account for cyclical index math. 
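		// The half-size complex spectrum is rebuilt with the usual split-real recombination,
		// roughly Out[k] = In[k] * Alpha[k] + In[N/2 - k] * Beta[k] (with the exact conjugations
		// and sign flips baked into the conversion buffers); bins 0 and 1 are expanded by hand
		// because their partners wrap around to the Nyquist and topmost bins.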
		WorkData[0] = (InComplex[0] * AlphaRealInverseData[0]) + (InComplex[1] * AlphaImagInverseData[0]) + (InComplex[FFTSize] * BetaRealInverseData[0]) + (InComplex[FFTSize + 1] * BetaImagInverseData[0]);
		WorkData[1] = (InComplex[1] * AlphaRealInverseData[1]) + (InComplex[0] * AlphaImagInverseData[1]) + (InComplex[FFTSize] * BetaImagInverseData[1]) + (InComplex[FFTSize + 1] * BetaRealInverseData[1]);
		WorkData[2] = (InComplex[2] * AlphaRealInverseData[2]) + (InComplex[3] * AlphaImagInverseData[2]) + (InComplex[FFTSize - 2] * BetaRealInverseData[2]) + (InComplex[FFTSize - 1] * BetaImagInverseData[2]);
		WorkData[3] = (InComplex[3] * AlphaRealInverseData[3]) + (InComplex[2] * AlphaImagInverseData[3]) + (InComplex[FFTSize - 2] * BetaImagInverseData[3]) + (InComplex[FFTSize - 1] * BetaRealInverseData[3]);

		// Convert all other values using optimized SIMD
		ConvertSequence(InverseConvBuffers, InComplex, 4, WorkData);

		// Perform Inverse FFT
		ComplexFFT->InverseComplexToComplex(WorkData, OutReal);
	}

	void FVectorRealToComplexFFT::BatchForwardRealToComplex(int32 InCount, const float* const RESTRICT InReal[], float* RESTRICT OutComplex[])
	{
		for (int32 i = 0; i < InCount; i++)
		{
			ForwardRealToComplex(InReal[i], OutComplex[i]);
		}
	}

	void FVectorRealToComplexFFT::BatchInverseComplexToReal(int32 InCount, const float* const RESTRICT InComplex[], float* RESTRICT OutReal[])
	{
		for (int32 i = 0; i < InCount; i++)
		{
			InverseComplexToReal(InComplex[i], OutReal[i]);
		}
	}

	/*************************************************************************************************/
	/**************************************** FVectorFFTFactory **************************************/
	/*************************************************************************************************/

	FVectorFFTFactory::~FVectorFFTFactory()
	{
	}

	/** Name of this particular factory. */
	FName FVectorFFTFactory::GetFactoryName() const
	{
		static const FName FactoryName = FName(TEXT("FVectorFFTFactory"));
		return FactoryName;
	}

	/** If true, this implementation uses hardware acceleration. */
	bool FVectorFFTFactory::IsHardwareAccelerated() const
	{
		return false;
	}

	/** If true, this implementation requires input and output arrays to be 128 bit aligned. */
	bool FVectorFFTFactory::Expects128BitAlignedArrays() const
	{
		return false;
	}

	/** Returns true if the input settings are supported by this factory. */
	bool FVectorFFTFactory::AreFFTSettingsSupported(const FFFTSettings& InSettings) const
	{
		// Supports Log2Size values from FVectorRealToComplexFFT::MinLog2FFTSize to MaxLog2FFTSize, inclusive.
		bool bIsMinSizeSupported = InSettings.Log2Size >= FVectorRealToComplexFFT::MinLog2FFTSize;
		bool bIsMaxSizeSupported = InSettings.Log2Size <= FVectorRealToComplexFFT::MaxLog2FFTSize;
		bool bIsAlignmentSupported = InSettings.bArrays128BitAligned;

		return bIsMinSizeSupported && bIsAlignmentSupported && bIsMaxSizeSupported;
	}

	/** Creates a new FFT algorithm. */
	TUniquePtr<IFFTAlgorithm> FVectorFFTFactory::NewFFTAlgorithm(const FFFTSettings& InSettings)
	{
		if (AreFFTSettingsSupported(InSettings))
		{
			return MakeUnique<FVectorRealToComplexFFT>(InSettings.Log2Size);
		}

		return TUniquePtr<IFFTAlgorithm>();
	}
}
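
/*
	Illustrative usage sketch (not part of the module): creating and running an FFT through the
	factory. The IFFTAlgorithm interface and the caller's InReal / OutComplex buffers are assumed
	from the surrounding SignalProcessing API rather than defined in this file.

	Audio::FVectorFFTFactory Factory;

	Audio::FFFTSettings Settings;
	Settings.Log2Size = 10;                 // 1024-point real FFT
	Settings.bArrays128BitAligned = true;   // required by AreFFTSettingsSupported

	if (TUniquePtr<Audio::IFFTAlgorithm> FFT = Factory.NewFFTAlgorithm(Settings))
	{
		// InReal holds FFT->Size() floats; OutComplex holds FFT->Size() + 2 floats
		// (interleaved complex bins 0 .. N/2).
		FFT->ForwardRealToComplex(InReal.GetData(), OutComplex.GetData());
	}
*/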