// Copyright Epic Games Tools, LLC. All Rights Reserved. #ifndef _CRT_SECURE_NO_WARNINGS #define _CRT_SECURE_NO_WARNINGS #endif #include #include #include #include #include #include "radaudio_mdct.h" #include "radaudio_mdct_internal.h" #include "radaudio_mdct_internal.inl" #include "rrbits.h" // The FFT alg used here was designed to be very FMA-friendly, but because we can't assume FMAs are present on // all target HW and want consistent results everywhere, we're using FMA-less algorithms for this application. // Notation throughout this file: // // Let z = a + bi. Then conj(z) = a - bi. // // We can swap the real and imaginary parts of z to yield s(z) = b + ai ("swap"). // Now because // // iz = i(a + bi) = -b + ai // // we get s(z) = i z*, which is convenient to manipulate this algebraically. Now // obviously from the definition, we have // // s(s(z)) = z // // Regular complex arithmetic rules further give the identities // // s(z + w) = i conj(z + w) = i conj(z) + i conj(w) = s(z) + s(w) // s(zw) = i conj(zw) = i conj(z) conj(w) = s(z) conj(w) = conj(z) s(w) // // Note the "swap identity" (by the multiplication rule applied twice) // // s(s(z) s(w)) = s(s(z) i conj(w)) = s(i s(zw)) = conj(i) s(s(zw)) = -i zw // // We mostly work with split real/imaginary parts throughout this file, so these // swaps are "free" (just a matter of renaming variables). This lets us reduce // complex multiplications -izw to regular complex multiplications zw with some // swapping of the real/imaginary parts. (izw can also be handled by computing // -izw as noted, and then folding a negate into the uses.) namespace radaudio_fft_impl { // tables generated by radaudio_mdct_build_tables.cpp #include "radaudio_mdct_tables.inl" // The FFT kernel is parameterized by an "Elem" type that gives shared functionality and determines // the vector width. The bitrev + initial passes need to work slightly differently as the vector width // increases, which is why that is inside here as well. namespace { struct ElemF32 { static constexpr size_t kCount = 1; float v; ElemF32() {} explicit ElemF32(float f) : v(f) {} static ElemF32 load(float const* ptr) { return ElemF32(*ptr); } void store(float* ptr) { *ptr = v; } ElemF32 operator+(ElemF32 b) const { return ElemF32(v + b.v); } ElemF32 operator-(ElemF32 b) const { return ElemF32(v - b.v); } ElemF32 operator*(ElemF32 b) const { return ElemF32(v * b.v); } ElemF32 reverse() const { return *this; } static RADFORCEINLINE void radix2_twiddle( ElemF32& ar, ElemF32& ai, ElemF32& br, ElemF32& bi, ElemF32 wr, ElemF32 wi ) { radix2_twiddle_unfused(ar, ai, br, bi, wr, wi); } static RADFORCEINLINE void load_deinterleave(ElemF32& re, ElemF32& im, float const* ptr) { re.v = ptr[0]; im.v = ptr[1]; } static size_t bitrev_initial_radix4(float *out, float const *in, size_t N, FftSign sign) { size_t Nbits = rrCtz64(N); size_t shift_amt = kMaxFFTLog2 - Nbits; size_t step = N / 4; float const * inA = in; float const * inB = in + burst_swizzle(2 * step); // note: 2 not 1 because it's bit-reversed float const * inC = in + burst_swizzle(1 * step); // note: 1 not 2 because it's bit-reversed float const * inD = in + burst_swizzle(3 * step); // This was originally written for the negative sign variant, but all we need to do // to toggle the sign is to swap inC and inD pointers if (sign == FftSign_Positive) swap(inC, inD); // Apply the initial permutation along with the initial radix-4 butterflies // (which are special because the twiddles are with +-1 and +-i only, i.e. trivial) for (size_t i = 0; i < N; i += 4) { size_t is = burst_swizzle(i); // dest index size_t j = s_bit_reverse[i] >> shift_amt; size_t js = burst_swizzle(j); // source index float ar = inA[js + 0*kBurstSize]; float ai = inA[js + 1*kBurstSize]; float br = inB[js + 0*kBurstSize]; float bi = inB[js + 1*kBurstSize]; float cr = inC[js + 0*kBurstSize]; float ci = inC[js + 1*kBurstSize]; float dr = inD[js + 0*kBurstSize]; float di = inD[js + 1*kBurstSize]; dft4_bfly_permuted(ar, ai, br, bi, cr, ci, dr, di); out[is + 0 + 0*kBurstSize] = ar; out[is + 0 + 1*kBurstSize] = ai; out[is + 1 + 0*kBurstSize] = br; out[is + 1 + 1*kBurstSize] = bi; out[is + 2 + 0*kBurstSize] = cr; out[is + 2 + 1*kBurstSize] = ci; out[is + 3 + 0*kBurstSize] = dr; out[is + 3 + 1*kBurstSize] = di; } if (Nbits & 1) { size_t const swiz_N = burst_swizzle(N); float const * twiddle_i = &s_fft_twiddles[8 + 4]; float const * twiddle_r = &s_fft_twiddles[8 + 2]; float * outA = out; float * outB = out + burst_swizzle(4); size_t swiz_dec = burst_swizzle(~size_t(4)); static_assert(kBurstSize >= 4, "Twiddle addressing assumes kBurstSwizzle >= 4"); for (size_t j = 0; j < swiz_N; j = (j - swiz_dec) & swiz_dec) { float ar = outA[j + 0*kBurstSize]; float ai = outA[j + 1*kBurstSize]; float br = outB[j + 0*kBurstSize]; float bi = outB[j + 1*kBurstSize]; // Twiddle index size_t const k = j & 3; radix2_twiddle_unfused(ar, ai, br, bi, twiddle_r[k], twiddle_i[k]); outA[j + 0*kBurstSize] = ar; outA[j + 1*kBurstSize] = ai; outB[j + 0*kBurstSize] = br; outB[j + 1*kBurstSize] = bi; } return 8; } else return 4; } static void store_interleaved(float * dest, ElemF32 re, ElemF32 im) { dest[0] = re.v; dest[1] = im.v; } }; } // anon namespace FftKernelSet const kernels_scalar = { ElemF32::bitrev_initial_radix4, burst_r4_fft_single_pass, burst_imdct_prefft, burst_imdct_postfft, }; } // namespace radaudio_fft_impl static radaudio_fft_impl::FftKernelSet const * choose_kernels(radaudio_cpu_features cpu) { using namespace radaudio_fft_impl; #if defined(DO_BUILD_SSE4) || defined(DO_BUILD_AVX2) if (cpu.has_sse2) { #if defined(DO_BUILD_AVX2) if (cpu.has_avx2) return &kernels_avx2; else #endif return &kernels_sse2; } #endif #ifdef DO_BUILD_NEON return &kernels_neon; #else return &kernels_scalar; #endif } // The complex FFT driver func. static void radaudio_fft(float *out, float const *in, size_t N, FftSign sign, radaudio_fft_impl::FftKernelSet const * kernels) { using namespace radaudio_fft_impl; const size_t swiz_N = burst_swizzle(N); RR_ASSERT(16 <= N && N <= kMaxFFTN); RR_ASSERT((N & (N - 1)) == 0); // checks for pow2 size_t const initial_step = kernels->initial(out, in, N, sign); // For the size we support here, an iterative FFT is always fine since we're // comfortably in the L1D cache (the largest FFT we do is 512 complex elements, // which is 4K of data). // Iteratively do all the CT passes for increasing N (DIT order), indexed by step size (which is N/4) for (size_t step = initial_step; step <= N / 4; step *= 4) kernels->cfft_pass(out, step, swiz_N, sign); } // ---- static void get_mdct_twiddles(const float **out_tw_re, const float **out_tw_im, size_t N) { const float *base = nullptr; if (N == RADAUDIO_LONG_BLOCK_LEN) { base = radaudio_fft_impl::s_mdct_long_twiddles; } else if (N == RADAUDIO_SHORT_BLOCK_LEN) { base = radaudio_fft_impl::s_mdct_short_twiddles; } else { RR_BREAK(); } *out_tw_re = base; *out_tw_im = base + N/2; } // Computes N MDCT coeffs from 2N input values signal0:signal1 // needs N floats worth of aligned workspace at "work" // // N must be even, >=4. void radaudio_mdct_fft(radaudio_cpu_features cpu, float *mdct_coef, size_t N, float const *signal0, float const *signal1, float *work) { size_t M1 = N>>2; size_t M2 = N>>1; // Pre-pass turns signal0/signal1 into complex input to FFT (in w0c = mdct_coef) // then FFT w0c -> w1c // then post-pass w1c -> mdct_coef // The high-level reduction is MDCT -> DCT-IV -> FFT, the latter using the approach described // in R. Gluth, "Regular FFT-related transform kernels for DCT/DST-based polyphase filter banks" (1991) // // Consider the input signal evenly partitioned into 4 parts [a b c d] // to compute a MDCT, compute the DCT-IV of x = [-c^R-d a-b^R] (where ^R denotes sequence reversal) // // Then compute the N-coeff DCT-IV using a N/2-coeff complex FFT as follows: (Matlab for a column vector x) // nrows = size(x,1); // if mod(nrows,2) ~= 0 || nrows<4 // error('Only even-sized DCT-IVs of size at least 4 are supported'); // end // % Interleave even and reversed odd sub-sequences into complex values // u_prime = x(1:2:end,:) + j*flipud(x(2:2:end,:)); // % Calculate twiddles // twiddle = exp(-j*pi/nrows*((0:nrows/2-1)' + 1/8)); // % The main calc // f = twiddle .* fft(twiddle .* u_prime); // % Build the result // y = zeros(nrows,size(x,2)); // y(1:2:end,:) = real(f); // y(2:2:end,:) = -flipud(imag(f)); using namespace radaudio_fft_impl; FftKernelSet const * kernels = choose_kernels(cpu); const float *tw_re = nullptr; const float *tw_im = nullptr; get_mdct_twiddles(&tw_re, &tw_im, N); // Pre-pass for (size_t i = 0; i < M1; i++) { size_t j = M2 - 1 - i; size_t k = i*2; size_t is = burst_swizzle(i); size_t js = burst_swizzle(j); float w0re = tw_re[i]; float w0im = tw_im[i]; float w1re = tw_re[j]; float w1im = tw_im[j]; float re0 = signal1[M2-k-1] + signal1[M2+k+0]; float im0 = signal0[M2+k+0] - signal0[M2-k-1]; float re1 = signal0[M2+k+1] - signal0[M2-k-2]; float im1 = signal1[M2-k-2] + signal1[M2+k+1]; mdct_coef[is + 0*kBurstSize] = w0re*re0 - w0im*im0; mdct_coef[is + 1*kBurstSize] = w0re*im0 + w0im*re0; mdct_coef[js + 0*kBurstSize] = w1re*re1 - w1im*im1; mdct_coef[js + 1*kBurstSize] = w1re*im1 + w1im*re1; } // Size-N/2 complex FFT radaudio_fft(work, mdct_coef, M2, FftSign_Negative, kernels); // Post-pass for (size_t i = 0; i < M2; i++) { size_t is = burst_swizzle(i); size_t j = i*2; float wre = tw_re[i]; float wim = tw_im[i]; float re = work[is + 0*kBurstSize]; float im = work[is + 1*kBurstSize]; mdct_coef[j] = wim*im - wre*re; mdct_coef[N-1-j] = wre*im + wim*re; } } // Abstractly, computes 2N IMDCT results signal0:signal1 from N input coeffs // Practically, computes N IMDCT results [--:sig0:sig1:--] from N input coeffs // and packs them as [sig0:sig1] in signal_both. // // needs N floats worth of aligned workspace at "work" // // both signal outputs are packed into a single buffer to allow the signal buffer // to be used as an additional work buffer. // // N must be even, >=4. void radaudio_imdct_fft_only_middle(radaudio_cpu_features cpu, float *signal_both, float *mdct_coef, size_t N) { size_t M2 = N>>1; float *signal0 = signal_both; float *signal1 = signal_both + M2; using namespace radaudio_fft_impl; FftKernelSet const * kernels = choose_kernels(cpu); const float *tw_re = nullptr; const float *tw_im = nullptr; get_mdct_twiddles(&tw_re, &tw_im, N); // The first step is a DCT-IV, so we start with the interleave/twiddle dance // NOTE: since our twiddles are negated, we pick up a -1 scale factor here. // This is harmless and gets cancelled out immediately in the post-twiddle. kernels->imdct_pre(signal_both, mdct_coef, tw_re, tw_im, N); // now mdct_coef = new work buffer radaudio_fft(mdct_coef, signal_both, M2, FftSign_Negative, kernels); kernels->imdct_post(signal0, signal1, mdct_coef, tw_re, tw_im, N); }