// Copyright Epic Games, Inc. All Rights Reserved.

// We really don't use much from SSE3, and what we use is actually
// reasonably easy to synthesize using at most two SSE2 instructions.
// So the idea is to put that code in here and then #include it twice,
// once for real SSE3, once for SSE2 with a few #defines. See radfft.cpp
// for how that's accomplished.
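//
// For illustration, a minimal sketch of that include-twice setup. The macro
// and file names below are assumptions for the example, not the actual
// definitions from radfft.cpp:
//
//   // SSE3 build: compile against the real intrinsics.
//   #define RADFFT_SSE3_PREFIX(name) sse3_##name
//   #include "radfft_sse3.inl"
//   #undef RADFFT_SSE3_PREFIX
//
//   // SSE2 build: synthesize the three SSE3 intrinsics used here
//   // (one shuffle each for the dups, xor+add for addsub), then
//   // compile the same code again under a different prefix.
//   #define _mm_moveldup_ps(a)  _mm_shuffle_ps((a), (a), 0xa0) // xxzz
//   #define _mm_movehdup_ps(a)  _mm_shuffle_ps((a), (a), 0xf5) // yyww
//   #define _mm_addsub_ps(a, b) _mm_add_ps((a), _mm_xor_ps((b), _mm_setr_ps(-0.0f, 0.0f, -0.0f, 0.0f)))
//   #define RADFFT_SSE3_PREFIX(name) sse2_##name
//   #include "radfft_sse3.inl"
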
// SSE3 real FFT post-pass.
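//
// What this computes (a reference sketch of the standard real-FFT unpacking,
// not lifted verbatim from radfft.cpp): after the N/2-point complex FFT of
// the even/odd-interleaved real input, with Z[] the packed spectrum and
// w_k = e^{-2*pi*i*k/N},
//
//   e_k = (Z[k] + conj(Z[N/2-k])) / 2      // spectrum of even samples
//   o_k = (Z[k] - conj(Z[N/2-k])) / (2i)   // spectrum of odd samples
//   X[k]     = e_k + w_k * o_k
//   X[N/2-k] = conj(e_k - w_k * o_k)
//
// The loop below produces two mirrored bin pairs per iteration, with the
// 1/2 folded into "half" and the conjugations folded into addsub/xor
// sign tricks.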
static void RADFFT_SSE3_PREFIX(rfpost)(rfft_complex out[], UINTa kEnd, UINTa N4)
{
    if (N4 < 2)
    {
        scalar_rfpost(out, N4, N4);
        return;
    }

    F32 const *twiddle = (F32 const *) (s_twiddles + (N4 + 2));
    F32 *out0 = (F32 *) (out + 2);
    F32 *out1 = (F32 *) (out + (N4 * 2) - 3);
    F32 *out0_end = (F32 *) (out + kEnd);

    __m128 conjflip = _mm_setr_ps(0.0f, -0.0f, 0.0f, -0.0f);
    __m128 half = _mm_setr_ps(0.5f, -0.5f, 0.5f, -0.5f);

    // Handle first pair of bins scalar.
    scalar_rfpost(out, 2, N4);

    while (out0 < out0_end)
    {
        __m128 a = _mm_load_ps(out0);
#if !defined(_MSC_VER) || _MSC_VER != 1600 || defined(__RAD64__) // VC2010 32bit fails to compile this correctly.
        __m128 b = _mm_loadl_pi(_mm_load_ps(out1 - 2), (__m64 const *) (out1 + 2)); // ==shuffle(loadu(out1), zwxy)
#else
        // this does the same thing but doesn't generate bad code on
        // VC++ 2010 SP1 32-bit
        __m128 b = _mm_loadu_ps(out1);
        b = _mm_shuffle_ps(b, b, 0x4e);
#endif
        __m128 w = _mm_load_ps(twiddle);
        __m128 w_re = _mm_moveldup_ps(w);
        __m128 w_im = _mm_movehdup_ps(w);

        // Half-scaled butterfly
        __m128 d = _mm_mul_ps(_mm_addsub_ps(b, a), half);
        __m128 e = _mm_add_ps(a, d);

        // Twiddle to get -conj(o) (easier than getting o with what we have)
        __m128 noconj = _mm_addsub_ps(_mm_mul_ps(_mm_shuffle_ps(d, d, 0xb1), w_re), _mm_mul_ps(d, w_im));

        // Generate outputs
        __m128 o0 = _mm_addsub_ps(e, noconj); // e + o
        __m128 o1 = _mm_add_ps(_mm_xor_ps(e, conjflip), noconj);

        // Store
        _mm_store_ps(out0, o0);
        _mm_storeu_ps(out1, _mm_shuffle_ps(o1, o1, 0x4e));

        out0 += 4;
        out1 -= 4;
        twiddle += 4;
    }
}

// SSE3 real IFFT pre-pass.
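// A sketch of the relationship (assuming the usual convention): this runs
// the post-pass above backwards, with the butterfly sign and the twiddle
// conjugation flipped, so that applying it to rfpost's output recovers the
// packed half-length spectrum up to the scaling handled elsewhere.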
static void RADFFT_SSE3_PREFIX(ripre)(rfft_complex out[], UINTa kEnd, UINTa N4)
{
    if (N4 < 2)
    {
        scalar_ripre(out, N4, N4);
        return;
    }

    F32 const *twiddle = (F32 const *) (s_twiddles + (N4 + 2));
    F32 *out0 = (F32 *) (out + 2);
    F32 *out1 = (F32 *) (out + (N4 * 2) - 3);
    F32 *out0_end = (F32 *) (out + kEnd);

    __m128 conjflip = _mm_setr_ps(0.0f, -0.0f, 0.0f, -0.0f);
    __m128 half = _mm_setr_ps(-0.5f, 0.5f, -0.5f, 0.5f);

    // Handle first pair of bins scalar.
    scalar_ripre(out, 2, N4);

    while (out0 < out0_end)
    {
        __m128 a = _mm_load_ps(out0);
#if !defined(_MSC_VER) || _MSC_VER != 1600 || defined(__RAD64__) // VC2010 32bit fails to compile this correctly.
        __m128 b = _mm_loadl_pi(_mm_load_ps(out1 - 2), (__m64 const *) (out1 + 2)); // ==shuffle(loadu(out1), zwxy)
#else
        // this does the same thing but doesn't generate bad code on
        // VC++ 2010 SP1 32-bit
        __m128 b = _mm_loadu_ps(out1);
        b = _mm_shuffle_ps(b, b, 0x4e);
#endif
        __m128 w = _mm_load_ps(twiddle);
        __m128 w_re = _mm_moveldup_ps(w);
        __m128 w_im = _mm_movehdup_ps(w);

        w_im = _mm_xor_ps(w_im, conjflip);

        // Half-scaled butterfly
        __m128 d = _mm_mul_ps(_mm_addsub_ps(b, a), half);
        __m128 e = _mm_sub_ps(a, d);

        // Twiddle to get -conj(o) (easier than getting o with what we have)
        __m128 noconj = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(d, d, 0xb1), w_re), _mm_mul_ps(d, w_im));

        // Generate outputs
        __m128 o0 = _mm_addsub_ps(e, noconj); // e + o
        __m128 o1 = _mm_add_ps(_mm_xor_ps(e, conjflip), noconj);

        // Store
        _mm_store_ps(out0, o0);
        _mm_storeu_ps(out1, _mm_shuffle_ps(o1, o1, 0x4e));

        out0 += 4;
        out1 -= 4;
        twiddle += 4;
    }
}

// SSE3 complex forward conjugate split-radix reduction pass. This is the main workhorse inner loop for FFTs.
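//
// One reduction step combines a half-size FFT U (first two quarters of the
// block, in place) with two quarter-size FFTs Z, Z' (last two quarters).
// As a reference sketch of the standard conjugate-pair recurrence
// (w = e^{-2*pi*i/N}, N the current block size):
//
//   t_k = w^k Z[k] + conj(w^k) Z'[k]
//   u_k = w^k Z[k] - conj(w^k) Z'[k]
//   X[k]     = U[k]     + t_k        X[k+N/2]  = U[k]     - t_k
//   X[k+N/4] = U[k+N/4] - i*u_k      X[k+3N/4] = U[k+N/4] + i*u_k
//
// Zsum below is t_k; Zdif holds u_k with re/im swapped so the +/-i
// rotations fall out of an xor and an addsub.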
static void RADFFT_SSE3_PREFIX(cfpass)(rfft_complex data[], PlanElement const *plan, UINTa Nover4)
{
    __m128 conjflip = _mm_setr_ps(0.0f, -0.0f, 0.0f, -0.0f);
    --plan;

    do
    {
        ++plan;
        UINTa N1 = plan->Nloop;
        UINTa step = N1 * 2;
        F32 *out = (F32 *) (data + plan->offs);
        F32 *out_end = out + step;

        F32 const *twiddle = (F32 const *) (s_twiddles + N1);
        do
        {
            __m128 Zk = _mm_load_ps(out + 2*step);
            __m128 Zpk = _mm_load_ps(out + 3*step);
            __m128 w = _mm_load_ps(twiddle);
            __m128 w_re = _mm_moveldup_ps(w);
            __m128 w_im = _mm_movehdup_ps(w);

            // Twiddle Zk, Z'k
            Zk = _mm_addsub_ps(_mm_mul_ps(Zk, w_re), _mm_mul_ps(_mm_shuffle_ps(Zk, Zk, 0xb1 /* yxwz */), w_im));
            Zpk = _mm_addsub_ps(_mm_mul_ps(_mm_shuffle_ps(Zpk, Zpk, 0xb1), w_re), _mm_mul_ps(Zpk, w_im));

            __m128 Zsum = _mm_add_ps(_mm_shuffle_ps(Zpk, Zpk, 0xb1), Zk);
            __m128 Zdif = _mm_sub_ps(_mm_shuffle_ps(Zk, Zk, 0xb1), Zpk);

            // Even inputs
            __m128 Uk0 = _mm_load_ps(out + 0*step);
            __m128 Uk1 = _mm_load_ps(out + 1*step);

            // Output butterflies
            _mm_store_ps(out + 0*step, _mm_add_ps(Uk0, Zsum));
            _mm_store_ps(out + 1*step, _mm_add_ps(Uk1, _mm_xor_ps(Zdif, conjflip)));
            _mm_store_ps(out + 2*step, _mm_sub_ps(Uk0, Zsum));
            _mm_store_ps(out + 3*step, _mm_addsub_ps(Uk1, Zdif));

            out += 4;
            twiddle += 4;
        } while (out < out_end);
    } while (plan->Nloop < Nover4);
}

// SSE3 complex inverse conjugate split-radix reduction pass. This is the main workhorse inner loop for IFFTs.
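// Same structure as cfpass; only the Zk/Z'k roles in the twiddle and
// sum/difference forms are exchanged, which conjugates the twiddles
// (w^k -> w^-k) as the inverse transform requires, without needing a
// second twiddle table.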
static void RADFFT_SSE3_PREFIX(cipass)(rfft_complex data[], PlanElement const *plan, UINTa Nover4)
{
    __m128 conjflip = _mm_setr_ps(0.0f, -0.0f, 0.0f, -0.0f);
    --plan;

    do
    {
        ++plan;
        UINTa N1 = plan->Nloop;
        UINTa step = N1 * 2;
        F32 *out = (F32 *) (data + plan->offs);
        F32 *out_end = out + step;

        F32 const *twiddle = (F32 const *) (s_twiddles + N1);
        do
        {
            __m128 Zk = _mm_load_ps(out + 2*step);
            __m128 Zpk = _mm_load_ps(out + 3*step);
            __m128 w = _mm_load_ps(twiddle);
            __m128 w_re = _mm_moveldup_ps(w);
            __m128 w_im = _mm_movehdup_ps(w);

            // Twiddle Zk, Z'k
            Zpk = _mm_addsub_ps(_mm_mul_ps(Zpk, w_re), _mm_mul_ps(_mm_shuffle_ps(Zpk, Zpk, 0xb1), w_im));
            Zk = _mm_addsub_ps(_mm_mul_ps(_mm_shuffle_ps(Zk, Zk, 0xb1), w_re), _mm_mul_ps(Zk, w_im));

            __m128 Zsum = _mm_add_ps(_mm_shuffle_ps(Zk, Zk, 0xb1), Zpk);
            __m128 Zdif = _mm_sub_ps(_mm_shuffle_ps(Zpk, Zpk, 0xb1), Zk);

            // Even inputs
            __m128 Uk0 = _mm_load_ps(out + 0*step);
            __m128 Uk1 = _mm_load_ps(out + 1*step);

            // Output butterflies
            _mm_store_ps(out + 0*step, _mm_add_ps(Uk0, Zsum));
            _mm_store_ps(out + 1*step, _mm_add_ps(Uk1, _mm_xor_ps(Zdif, conjflip)));
            _mm_store_ps(out + 2*step, _mm_sub_ps(Uk0, Zsum));
            _mm_store_ps(out + 3*step, _mm_addsub_ps(Uk1, Zdif));

            out += 4;
            twiddle += 4;
        } while (out < out_end);
    } while (plan->Nloop < Nover4);
}

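// SSE3 DCT-II modulation pass: rotates pairs of complex bins from in[] by
// the per-bin twiddles, then scatters each rotated pair across the front
// (out + k) and mirrored back (out + N - 1 - k) of the output.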
static void RADFFT_SSE3_PREFIX(dct2_modulate)(F32 out[], F32 const in[], UINTa N, UINTa Nlast, rfft_complex const *twiddle)
{
    if (N < 8)
    {
        scalar_dct2_modulate(out, in, N, Nlast, twiddle);
        return;
    }

    // The first few bins have exceptional cases; let the scalar routine handle them.
    FFTASSERT((Nlast % 4) == 0 && Nlast >= 4);
    scalar_dct2_modulate(out, in, N, 4, twiddle);

    for (UINTa k = 4; k < Nlast; k += 2)
    {
        __m128 v = _mm_load_ps(in + k*2);
        __m128 w = _mm_load_ps(&twiddle[k].re);
        __m128 w_re = _mm_moveldup_ps(w);
        __m128 w_im = _mm_movehdup_ps(w);

        __m128 x = _mm_addsub_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, 0xb1), w_re), _mm_mul_ps(v, w_im));
        __m128 s = _mm_shuffle_ps(x, x, 0x2d);

        _mm_storel_pi((__m64 *) (out + k), s);
        _mm_storeh_pi((__m64 *) (out + N - 1 - k), s);
    }
}

// vim:et:sts=4:sw=4