Files
UnrealEngine/Engine/Source/Runtime/BinkAudioDecoder/SDK/BinkAudio/Src/radfft_sse3.inl
2025-05-18 13:04:45 +08:00

243 lines
8.4 KiB
C++

// Copyright Epic Games, Inc. All Rights Reserved.
// We really don't use much from SSE3, and what we use is actually
// reasonably easy to synthesize using at most two SSE2 instructions.
// So the idea is to put that code in here and then #include it twice,
// once for real SSE3, once for SSE2 with a few #defines. See radfft.cpp
// for how that's accomplished.
// SSE3 real FFT post-pass.
// SSE3 real FFT post-pass.
//
// Post-processes the half-length complex FFT of a real input so the bins
// become the spectrum of the real signal. Walks complex bin pairs from both
// ends of the array toward the middle, two bins per end per iteration.
//
// out  - bin array, updated in place; holds 2*N4 complex values.
// kEnd - complex bin index where the vectorized low-side walk stops
//        (loop covers bins [2, kEnd); presumably even since the loop
//        advances two bins at a time -- TODO confirm with callers).
// N4   - quarter of the real transform length.
static void RADFFT_SSE3_PREFIX(rfpost)(rfft_complex out[], UINTa kEnd, UINTa N4)
{
    if (N4 < 2)
    {
        // Too few bins for the 4-float-wide loop; scalar path does it all.
        scalar_rfpost(out, N4, N4);
        return;
    }
    F32 const *twiddle = (F32 const *) (s_twiddles + (N4 + 2));
    F32 *out0 = (F32 *) (out + 2);            // low-side cursor, starts at bin 2
    F32 *out1 = (F32 *) (out + (N4 * 2) - 3); // high-side (mirror) cursor; odd complex index, so only 8-byte aligned
    F32 *out0_end = (F32 *) (out + kEnd);
    __m128 conjflip = _mm_setr_ps(0.0f, -0.0f, 0.0f, -0.0f); // XOR mask: negates the imaginary lanes
    __m128 half = _mm_setr_ps(0.5f, -0.5f, 0.5f, -0.5f);
    // Handle first pair of bins scalar.
    scalar_rfpost(out, 2, N4);
    while (out0 < out0_end)
    {
        __m128 a = _mm_load_ps(out0);
#if !defined(_MSC_VER) || _MSC_VER != 1600 || defined(__RAD64__) // VC2010 32bit fails to compile this correctly.
        // Aligned load at out1-2 supplies the high half, loadl_pi patches
        // out1[2..3] into the low half: the two mirror-side complex bins
        // with their pair order swapped, without an unaligned load.
        __m128 b = _mm_loadl_pi(_mm_load_ps(out1 - 2), (__m64 const *) (out1 + 2)); // ==shuffle(loadu(out1), zwxy)
#else
        // this does the same thing but doesn't generate bad code on
        // VC++ 2010 SP1 32-bit
        __m128 b = _mm_loadu_ps(out1);
        b = _mm_shuffle_ps(b, b, 0x4e);
#endif
        __m128 w = _mm_load_ps(twiddle);
        __m128 w_re = _mm_moveldup_ps(w); // { w0.re, w0.re, w1.re, w1.re }
        __m128 w_im = _mm_movehdup_ps(w); // { w0.im, w0.im, w1.im, w1.im }
        // Half-scaled butterfly
        __m128 d = _mm_mul_ps(_mm_addsub_ps(b, a), half);
        __m128 e = _mm_add_ps(a, d);
        // Twiddle to get -conj(o) (easier than getting o with what we have)
        // 0xb1 swaps re/im within each complex pair; addsub then completes
        // the complex multiply across the even(-)/odd(+) lanes.
        __m128 noconj = _mm_addsub_ps(_mm_mul_ps(_mm_shuffle_ps(d, d, 0xb1), w_re), _mm_mul_ps(d, w_im));
        // Generate outputs
        __m128 o0 = _mm_addsub_ps(e, noconj); // e + o
        __m128 o1 = _mm_add_ps(_mm_xor_ps(e, conjflip), noconj);
        // Store. out1 is not 16-byte aligned, and 0x4e swaps the two complex
        // values back into mirror order before the unaligned store.
        _mm_store_ps(out0, o0);
        _mm_storeu_ps(out1, _mm_shuffle_ps(o1, o1, 0x4e));
        out0 += 4;
        out1 -= 4;
        twiddle += 4;
    }
}
// SSE3 real IFFT pre-pass.
// SSE3 real IFFT pre-pass.
//
// Inverse counterpart of rfpost above: modulates the real-signal spectrum so
// a half-length complex IFFT can reconstruct the time-domain samples. Walks
// complex bin pairs from both ends of the array toward the middle, two bins
// per end per iteration.
//
// out  - bin array, updated in place; holds 2*N4 complex values.
// kEnd - complex bin index where the vectorized low-side walk stops
//        (loop covers bins [2, kEnd)).
// N4   - quarter of the real transform length.
static void RADFFT_SSE3_PREFIX(ripre)(rfft_complex out[], UINTa kEnd, UINTa N4)
{
    if (N4 < 2)
    {
        // Too few bins for the 4-float-wide loop; scalar path does it all.
        // BUGFIX: this previously called scalar_rfpost -- the *forward*
        // post-pass, copy-pasted from rfpost above -- so tiny inverse
        // transforms went through the wrong modulation. It must match the
        // scalar_ripre call used for the first bin pair below.
        scalar_ripre(out, N4, N4);
        return;
    }
    F32 const *twiddle = (F32 const *) (s_twiddles + (N4 + 2));
    F32 *out0 = (F32 *) (out + 2);            // low-side cursor, starts at bin 2
    F32 *out1 = (F32 *) (out + (N4 * 2) - 3); // high-side (mirror) cursor; odd complex index, so only 8-byte aligned
    F32 *out0_end = (F32 *) (out + kEnd);
    __m128 conjflip = _mm_setr_ps(0.0f, -0.0f, 0.0f, -0.0f); // XOR mask: negates the imaginary lanes
    __m128 half = _mm_setr_ps(-0.5f, 0.5f, -0.5f, 0.5f);     // signs flipped vs. rfpost (inverse direction)
    // Handle first pair of bins scalar
    scalar_ripre(out, 2, N4);
    while (out0 < out0_end)
    {
        __m128 a = _mm_load_ps(out0);
#if !defined(_MSC_VER) || _MSC_VER != 1600 || defined(__RAD64__) // VC2010 32bit fails to compile this correctly.
        // Aligned load at out1-2 supplies the high half, loadl_pi patches
        // out1[2..3] into the low half: the two mirror-side complex bins
        // with their pair order swapped, without an unaligned load.
        __m128 b = _mm_loadl_pi(_mm_load_ps(out1 - 2), (__m64 const *) (out1 + 2)); // ==shuffle(loadu(out1), zwxy)
#else
        // this does the same thing but doesn't generate bad code on
        // VC++ 2010 SP1 32-bit
        __m128 b = _mm_loadu_ps(out1);
        b = _mm_shuffle_ps(b, b, 0x4e);
#endif
        __m128 w = _mm_load_ps(twiddle);
        __m128 w_re = _mm_moveldup_ps(w); // { w0.re, w0.re, w1.re, w1.re }
        __m128 w_im = _mm_movehdup_ps(w); // { w0.im, w0.im, w1.im, w1.im }
        // Flip the odd imaginary lanes so the plain _mm_add_ps below twiddles
        // in the opposite (inverse) direction from rfpost's _mm_addsub_ps.
        w_im = _mm_xor_ps(w_im, conjflip);
        // Half-scaled butterfly
        __m128 d = _mm_mul_ps(_mm_addsub_ps(b, a), half);
        __m128 e = _mm_sub_ps(a, d);
        // Twiddle to get -conj(o) (easier than getting o with what we have)
        __m128 noconj = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(d, d, 0xb1), w_re), _mm_mul_ps(d, w_im));
        // Generate outputs
        __m128 o0 = _mm_addsub_ps(e, noconj); // e + o
        __m128 o1 = _mm_add_ps(_mm_xor_ps(e, conjflip), noconj);
        // Store. out1 is not 16-byte aligned, and 0x4e swaps the two complex
        // values back into mirror order before the unaligned store.
        _mm_store_ps(out0, o0);
        _mm_storeu_ps(out1, _mm_shuffle_ps(o1, o1, 0x4e));
        out0 += 4;
        out1 -= 4;
        twiddle += 4;
    }
}
// SSE3 complex forward conjugate split-radix reduction pass. This is the main workhorse inner loop for FFTs.
// SSE3 complex forward conjugate split-radix reduction pass. This is the main workhorse inner loop for FFTs.
//
// Walks the precomputed plan: each PlanElement names a sub-transform
// (offset + loop count) whose four quarter-blocks at float offsets 0, step,
// 2*step and 3*step get one twiddle + butterfly pass. Plan elements are
// processed until one with Nloop >= Nover4 has been handled.
//
// data   - interleaved (re,im) complex work buffer, updated in place.
// plan   - plan element array; iteration starts at plan[0].
// Nover4 - N/4; terminates the plan walk.
static void RADFFT_SSE3_PREFIX(cfpass)(rfft_complex data[], PlanElement const *plan, UINTa Nover4)
{
    __m128 conjflip = _mm_setr_ps(0.0f, -0.0f, 0.0f, -0.0f); // XOR mask: negates the imaginary lanes
    --plan; // pre-decrement so the loop body can unconditionally ++plan first
    do
    {
        ++plan;
        UINTa N1 = plan->Nloop;
        UINTa step = N1 * 2; // quarter-block size in floats (N1 complex values)
        F32 *out = (F32 *) (data + plan->offs);
        F32 *out_end = out + step;
        F32 const *twiddle = (F32 const *) (s_twiddles + N1);
        do
        {
            // Each register holds two interleaved complex values.
            __m128 Zk = _mm_load_ps(out + 2*step);
            __m128 Zpk = _mm_load_ps(out + 3*step);
            __m128 w = _mm_load_ps(twiddle);
            __m128 w_re = _mm_moveldup_ps(w); // { w0.re, w0.re, w1.re, w1.re }
            __m128 w_im = _mm_movehdup_ps(w); // { w0.im, w0.im, w1.im, w1.im }
            // Twiddle Zk, Z'k. 0xb1 swaps re/im within each complex pair;
            // addsub (even lanes -, odd lanes +) completes the complex
            // multiplies: Zk becomes Zk*w, Zpk becomes Z'k*conj(w) with its
            // re/im lanes swapped.
            Zk = _mm_addsub_ps(_mm_mul_ps(Zk, w_re), _mm_mul_ps(_mm_shuffle_ps(Zk, Zk, 0xb1 /* yxwz */), w_im));
            Zpk = _mm_addsub_ps(_mm_mul_ps(_mm_shuffle_ps(Zpk, Zpk, 0xb1), w_re), _mm_mul_ps(Zpk, w_im));
            __m128 Zsum = _mm_add_ps(_mm_shuffle_ps(Zpk, Zpk, 0xb1), Zk);
            __m128 Zdif = _mm_sub_ps(_mm_shuffle_ps(Zk, Zk, 0xb1), Zpk);
            // Even inputs
            __m128 Uk0 = _mm_load_ps(out + 0*step);
            __m128 Uk1 = _mm_load_ps(out + 1*step);
            // Output butterflies
            _mm_store_ps(out + 0*step, _mm_add_ps(Uk0, Zsum));
            _mm_store_ps(out + 1*step, _mm_add_ps(Uk1, _mm_xor_ps(Zdif, conjflip)));
            _mm_store_ps(out + 2*step, _mm_sub_ps(Uk0, Zsum));
            _mm_store_ps(out + 3*step, _mm_addsub_ps(Uk1, Zdif));
            out += 4;
            twiddle += 4;
        } while (out < out_end);
    } while (plan->Nloop < Nover4);
}
// SSE3 complex inverse conjugate split-radix reduction pass. This is the main workhorse inner loop for IFFTs.
// SSE3 complex inverse conjugate split-radix reduction pass. This is the main workhorse inner loop for IFFTs.
//
// Mirror image of cfpass above: identical plan walk and butterfly structure,
// but the roles of Zk and Z'k in the twiddle step are exchanged, which flips
// the direction of the transform.
//
// data   - interleaved (re,im) complex work buffer, updated in place.
// plan   - plan element array; iteration starts at plan[0].
// Nover4 - N/4; terminates the plan walk.
static void RADFFT_SSE3_PREFIX(cipass)(rfft_complex data[], PlanElement const *plan, UINTa Nover4)
{
    __m128 conjflip = _mm_setr_ps(0.0f, -0.0f, 0.0f, -0.0f); // XOR mask: negates the imaginary lanes
    --plan; // pre-decrement so the loop body can unconditionally ++plan first
    do
    {
        ++plan;
        UINTa N1 = plan->Nloop;
        UINTa step = N1 * 2; // quarter-block size in floats (N1 complex values)
        F32 *out = (F32 *) (data + plan->offs);
        F32 *out_end = out + step;
        F32 const *twiddle = (F32 const *) (s_twiddles + N1);
        do
        {
            // Each register holds two interleaved complex values.
            __m128 Zk = _mm_load_ps(out + 2*step);
            __m128 Zpk = _mm_load_ps(out + 3*step);
            __m128 w = _mm_load_ps(twiddle);
            __m128 w_re = _mm_moveldup_ps(w); // { w0.re, w0.re, w1.re, w1.re }
            __m128 w_im = _mm_movehdup_ps(w); // { w0.im, w0.im, w1.im, w1.im }
            // Twiddle Zk, Z'k. Same complex-multiply idiom as cfpass but with
            // the operands exchanged: Zpk becomes Z'k*w, Zk becomes Zk*conj(w)
            // with its re/im lanes swapped (0xb1 swaps within each pair).
            Zpk = _mm_addsub_ps(_mm_mul_ps(Zpk, w_re), _mm_mul_ps(_mm_shuffle_ps(Zpk, Zpk, 0xb1), w_im));
            Zk = _mm_addsub_ps(_mm_mul_ps(_mm_shuffle_ps(Zk, Zk, 0xb1), w_re), _mm_mul_ps(Zk, w_im));
            __m128 Zsum = _mm_add_ps(_mm_shuffle_ps(Zk, Zk, 0xb1), Zpk);
            __m128 Zdif = _mm_sub_ps(_mm_shuffle_ps(Zpk, Zpk, 0xb1), Zk);
            // Even inputs
            __m128 Uk0 = _mm_load_ps(out + 0*step);
            __m128 Uk1 = _mm_load_ps(out + 1*step);
            // Output butterflies
            _mm_store_ps(out + 0*step, _mm_add_ps(Uk0, Zsum));
            _mm_store_ps(out + 1*step, _mm_add_ps(Uk1, _mm_xor_ps(Zdif, conjflip)));
            _mm_store_ps(out + 2*step, _mm_sub_ps(Uk0, Zsum));
            _mm_store_ps(out + 3*step, _mm_addsub_ps(Uk1, Zdif));
            out += 4;
            twiddle += 4;
        } while (out < out_end);
    } while (plan->Nloop < Nover4);
}
// DCT-II modulation pass: twiddles the complex half-spectrum and scatters the
// results into their final DCT output order. Each iteration handles two
// complex bins (k, k+1): two floats go to the front of the output (out[k],
// out[k+1]) and two to the mirrored back half (out[N-1-k], out[N-k]).
//
// out     - length-N real output array.
// in      - complex input bins, interleaved (re,im), read as floats.
// N       - output length.
// Nlast   - end of the modulation range; asserted to be a multiple of 4, >= 4.
// twiddle - per-bin complex twiddle factors.
static void RADFFT_SSE3_PREFIX(dct2_modulate)(F32 out[], F32 const in[], UINTa N, UINTa Nlast, rfft_complex const *twiddle)
{
    if (N < 8)
    {
        // Too small for the vector loop; the scalar routine handles all of it.
        scalar_dct2_modulate(out, in, N, Nlast, twiddle);
        return;
    }
    // First few bins have exceptional cases, let scalar routine handle it
    FFTASSERT((Nlast % 4) == 0 && Nlast >= 4);
    scalar_dct2_modulate(out, in, N, 4, twiddle);
    for (UINTa k = 4; k < Nlast; k += 2)
    {
        __m128 v = _mm_load_ps(in + k*2);           // bins k and k+1, interleaved
        __m128 w = _mm_load_ps(&twiddle[k].re);     // matching twiddle pair
        __m128 w_re = _mm_moveldup_ps(w);           // { w0.re, w0.re, w1.re, w1.re }
        __m128 w_im = _mm_movehdup_ps(w);           // { w0.im, w0.im, w1.im, w1.im }
        // Complex-multiply idiom (0xb1 swaps re/im in each pair, addsub
        // subtracts in even lanes / adds in odd lanes).
        __m128 x = _mm_addsub_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, 0xb1), w_re), _mm_mul_ps(v, w_im));
        // 0x2d reorders to { x1, x3, x2, x0 } so the front pair and the
        // mirrored back pair each land in one contiguous 64-bit store.
        __m128 s = _mm_shuffle_ps(x, x, 0x2d);
        _mm_storel_pi((__m64 *) (out + k), s);           // out[k], out[k+1]
        _mm_storeh_pi((__m64 *) (out + N - 1 - k), s);   // out[N-1-k], out[N-k]
    }
}
// vim:et:sts=4:sw=4