// Copyright Epic Games, Inc. All Rights Reserved.

#include "DSP/VectorLinearResampler.h"

//
// There's a lot here that's about making sure we never have
// to convert to float in the loop. So we keep two advancing
// states - the position in INT space, and the fraction in
// VECTOR space. The difficulty is we need to mask off the
// upper bits as we advance the fraction. The trickery is
// to do this in the actual IEEE float representation. Since
// the most the step can advance is by 65535.0f / 65536.0f,
// we can place the addition operation in an area of the float
// space that prevents the exponent from changing in response
// to that addition. In this case, [2, 3). Addition can only
// ever change the highest mantissa bit, so we can just directly
// mask off that single bit (0xffbfffff).
//
// To do this, we have to take advantage of the mod math identity:
// (a + b) mod c == ((a mod c) + (b mod c)) mod c.
// so we precompute the frac steps, all modded off (0xffff), and the
// step advance vector, also modded off, then scale them all down to [0, 1).
//
// Now, getting the lerp fraction is an AND, and a SUB to get from [2,3) to
// [0, 1).
//
// On top of that, we use 64 bit loads to get the adjacent samples
// for lerping.
//
//
// Linearly resamples one channel of floats, walking the source with a 16.16
// fixed point position (upper 16 bits: source frame index, lower 16 bits:
// lerp fraction).
//
// OutputFramesNeeded   - number of output frames to write. Zero is a no-op
//                        that returns 0 and leaves the resampler state alone.
// FixedPointSampleRate - advance through the source per output frame, in
//                        16.16 fixed point (65536 == exactly one source frame).
// SourceFrames         - source samples. Each output frame reads the sample at
//                        its integer position and the one right after it, so
//                        index (return value + 1) has to be readable.
// OutputFrames         - destination, receives OutputFramesNeeded floats.
//
// Returns the integer source index (position >> 16) used by the last output
// frame. The start position of the next run, relative to that index, is
// stored in CurrentFrameFraction.
//
uint32 Audio::FVectorLinearResampler::ResampleMono(uint32 OutputFramesNeeded, uint32 FixedPointSampleRate, float const* SourceFrames, float* OutputFrames)
{
	// Nothing to produce. Without this guard (OutputFramesNeeded - 1) wraps
	// around to 0xFFFFFFFF, and both the returned position and the stored
	// CurrentFrameFraction would be derived from that wrapped value.
	if (OutputFramesNeeded == 0)
	{
		return 0;
	}

	// we CANT mask off the upper bits, as it's possible the fraction could
	// jump an entire sample and we need to retain that offset.
	const uint32 FarthestPositionFixed = CurrentFrameFraction + (OutputFramesNeeded - 1) * FixedPointSampleRate;
	const uint32 NextRunStartPositionFixed = FarthestPositionFixed + FixedPointSampleRate;

	float const* LeftSamples = SourceFrames;
	uint32 OutputFrame = 0;
	uint32 CurrentFrameFixed = CurrentFrameFraction;

#if (PLATFORM_ENABLE_VECTORINTRINSICS || PLATFORM_ENABLE_VECTORINTRINSICS_NEON) // SIMD path.
	{
		// Only the low 16 bits take part in the float-space fraction; the
		// integer part of the position is tracked by CurrentFrameFixed.
		float CurrentFrameFractionFloat = (float)(CurrentFrameFraction & 0xffff);

		VectorRegister4Float ScaleVec = VectorSetFloat1(1.0f / 65536.0f);
		VectorRegister4Float TwoVec = VectorSetFloat1(2.0f);
		VectorRegister4Float ThreeVec = VectorSetFloat1(3.0f);

		// Fractional offsets of the four lanes relative to the first lane.
		VectorRegister4Float CurFracStepVec = VectorSet(
			(float)((0 * FixedPointSampleRate) & 0xffff),
			(float)((1 * FixedPointSampleRate) & 0xffff),
			(float)((2 * FixedPointSampleRate) & 0xffff),
			(float)((3 * FixedPointSampleRate) & 0xffff));

		// Scale to [0, 2) and bias by 2. The mask applied at the top of the
		// loop folds any lane that landed in [3, 4) back into [2, 3).
		VectorRegister4Float CurFracVec = VectorSetFloat1(CurrentFrameFractionFloat);
		CurFracVec = VectorAdd(CurFracVec, CurFracStepVec);
		CurFracVec = VectorMultiply(CurFracVec, ScaleVec);
		CurFracVec = VectorAdd(CurFracVec, TwoVec);

		// Clears the highest mantissa bit of each lane.
		VectorRegister4Float MaskVec = MakeVectorRegisterFloat(0xffbfffff, 0xffbfffff, 0xffbfffff, 0xffbfffff);

		// Fractional part of a four frame advance, scaled to [0, 1).
		VectorRegister4Float StepVec = VectorSetFloat1((float)((FixedPointSampleRate * 4) & 0xffff));
		StepVec = VectorMultiply(StepVec, ScaleVec);

		// Largest multiple of four that fits; the scalar loop handles the rest.
		uint32 OutputFramesNeededSIMD = OutputFramesNeeded & ~3;

		for (; OutputFrame < OutputFramesNeededSIMD; OutputFrame += 4)
		{
			uint32 SourceOffsets[4] = {
				CurrentFrameFixed >> 16,
				(CurrentFrameFixed + FixedPointSampleRate) >> 16,
				(CurrentFrameFixed + FixedPointSampleRate + FixedPointSampleRate) >> 16,
				(CurrentFrameFixed + FixedPointSampleRate + FixedPointSampleRate + FixedPointSampleRate) >> 16
			};

			// [0, 0+1, 1, 1+1]
			VectorRegister4Float LeftSamples01 = VectorLoadTwoPairsFloat(LeftSamples + SourceOffsets[0], LeftSamples + SourceOffsets[1]);
			// [2, 2+1, 3, 3+1]
			VectorRegister4Float LeftSamples23 = VectorLoadTwoPairsFloat(LeftSamples + SourceOffsets[2], LeftSamples + SourceOffsets[3]);

			// [0, 1, 2, 3]
			// [0+1, 1+1, 2+1, 3+1]
			VectorRegister4Float LeftSamplesFrom, LeftSamplesTo;
			VectorDeinterleave(LeftSamplesFrom, LeftSamplesTo, LeftSamples01, LeftSamples23);

			// our lerp vector is CurFracVec, masking off the 1 bit in the mantissa, subtract 2.
			CurFracVec = VectorBitwiseAnd(CurFracVec, MaskVec);
			VectorRegister4Float LerpFactor = VectorSubtract(CurFracVec, TwoVec);
			// 3 - (2 + f) == 1 - f
			VectorRegister4Float OneMinusLerpFactor = VectorSubtract(ThreeVec, CurFracVec);

			CurFracVec = VectorAdd(CurFracVec, StepVec);
			VectorRegister4Float OutputVec = VectorAdd(VectorMultiply(LeftSamplesFrom, OneMinusLerpFactor), VectorMultiply(LeftSamplesTo, LerpFactor));

			VectorStore(OutputVec, OutputFrames);

			CurrentFrameFixed += 4 * FixedPointSampleRate;
			OutputFrames += 4;
		}
	}
#endif

	// Remnants: whatever the SIMD loop did not cover (everything when the
	// SIMD path is compiled out).
	for (; OutputFrame < OutputFramesNeeded; OutputFrame++)
	{
		uint32 SourceOffset = CurrentFrameFixed >> 16;
		uint32 CurFrac = CurrentFrameFixed & 0xffff;

		float LerpFactor = (float)(CurFrac) * (1.0f / 65536.0f);

		CurrentFrameFixed += FixedPointSampleRate;

		float Sample1 = LeftSamples[SourceOffset];
		float Sample2 = LeftSamples[SourceOffset + 1];
		OutputFrames[0] = Sample1 * (1 - LerpFactor) + (LerpFactor)* Sample2;

		OutputFrames++;
	}

	// Keep the next start position relative to the whole frame index that is
	// returned below. This can exceed 0xffff when the step is more than one
	// source frame.
	CurrentFrameFraction = NextRunStartPositionFixed - (FarthestPositionFixed & ~0xffff);

	return FarthestPositionFixed >> 16;
}

//
// Linearly resamples two channels of floats with a shared 16.16 fixed point
// position (upper 16 bits: source frame index, lower 16 bits: lerp fraction).
// Both channels use the same source offsets and lerp factors.
//
// OutputFramesNeeded       - number of output frames to write per channel. Zero
//                            is a no-op that returns 0 and leaves the resampler
//                            state alone.
// FixedPointSampleRate     - advance through the source per output frame, in
//                            16.16 fixed point (65536 == exactly one source frame).
// SourceFrames             - first channel's source samples. Each output frame
//                            reads the sample at its integer position and the
//                            one right after it, so index (return value + 1)
//                            has to be readable in both channels.
// SourceFramesStrideFloats - offset, in floats, from SourceFrames to the second
//                            channel's source samples.
// OutputFrames             - first channel's destination, receives
//                            OutputFramesNeeded floats.
// OutputFramesStrideFloats - offset, in floats, from OutputFrames to the second
//                            channel's destination.
//
// Returns the integer source index (position >> 16) used by the last output
// frame. The start position of the next run, relative to that index, is
// stored in CurrentFrameFraction.
//
uint32 Audio::FVectorLinearResampler::ResampleStereo(uint32 OutputFramesNeeded, uint32 FixedPointSampleRate, float const* SourceFrames, uint32 SourceFramesStrideFloats, float* OutputFrames, uint32 OutputFramesStrideFloats)
{
	// Nothing to produce. Without this guard (OutputFramesNeeded - 1) wraps
	// around to 0xFFFFFFFF, and both the returned position and the stored
	// CurrentFrameFraction would be derived from that wrapped value.
	if (OutputFramesNeeded == 0)
	{
		return 0;
	}

	// we CANT mask off the upper bits, as it's possible the fraction could
	// jump an entire sample and we need to retain that offset.
	const uint32 FarthestPositionFixed = CurrentFrameFraction + (OutputFramesNeeded - 1) * FixedPointSampleRate;
	const uint32 NextRunStartPositionFixed = FarthestPositionFixed + FixedPointSampleRate;

	float const* LeftSamples = SourceFrames;
	float const* RightSamples = SourceFrames + SourceFramesStrideFloats;
	uint32 OutputFrame = 0;
	uint32 CurrentFrameFixed = CurrentFrameFraction;

#if (PLATFORM_ENABLE_VECTORINTRINSICS || PLATFORM_ENABLE_VECTORINTRINSICS_NEON) // SIMD path.
	{
		// Only the low 16 bits take part in the float-space fraction; the
		// integer part of the position is tracked by CurrentFrameFixed.
		float CurrentFrameFractionFloat = (float)(CurrentFrameFraction & 0xffff);

		VectorRegister4Float ScaleVec = VectorSetFloat1(1.0f / 65536.0f);
		VectorRegister4Float TwoVec = VectorSetFloat1(2.0f);
		VectorRegister4Float ThreeVec = VectorSetFloat1(3.0f);

		// Fractional offsets of the four lanes relative to the first lane.
		VectorRegister4Float CurFracStepVec = VectorSet(
			(float)((0 * FixedPointSampleRate) & 0xffff),
			(float)((1 * FixedPointSampleRate) & 0xffff),
			(float)((2 * FixedPointSampleRate) & 0xffff),
			(float)((3 * FixedPointSampleRate) & 0xffff));

		// Scale to [0, 2) and bias by 2. The mask applied at the top of the
		// loop folds any lane that landed in [3, 4) back into [2, 3).
		VectorRegister4Float CurFracVec = VectorSetFloat1(CurrentFrameFractionFloat);
		CurFracVec = VectorAdd(CurFracVec, CurFracStepVec);
		CurFracVec = VectorMultiply(CurFracVec, ScaleVec);
		CurFracVec = VectorAdd(CurFracVec, TwoVec);

		// Clears the highest mantissa bit of each lane.
		VectorRegister4Float MaskVec = MakeVectorRegisterFloat(0xffbfffff, 0xffbfffff, 0xffbfffff, 0xffbfffff);

		// Fractional part of a four frame advance, scaled to [0, 1).
		VectorRegister4Float StepVec = VectorSetFloat1((float)((FixedPointSampleRate * 4) & 0xffff));
		StepVec = VectorMultiply(StepVec, ScaleVec);

		// Largest multiple of four that fits; the scalar loop handles the rest.
		uint32 OutputFramesNeededSIMD = OutputFramesNeeded & ~3;

		for (; OutputFrame < OutputFramesNeededSIMD; OutputFrame += 4)
		{
			const uint32 SourceOffsets[4] = {
				CurrentFrameFixed >> 16,
				(CurrentFrameFixed + FixedPointSampleRate) >> 16,
				(CurrentFrameFixed + FixedPointSampleRate + FixedPointSampleRate) >> 16,
				(CurrentFrameFixed + FixedPointSampleRate + FixedPointSampleRate + FixedPointSampleRate) >> 16
			};

			// [0, 0+1, 1, 1+1]
			VectorRegister4Float LeftSamples01 = VectorLoadTwoPairsFloat(LeftSamples + SourceOffsets[0], LeftSamples + SourceOffsets[1]);
			VectorRegister4Float RightSamples01 = VectorLoadTwoPairsFloat(RightSamples + SourceOffsets[0], RightSamples + SourceOffsets[1]);

			// [2, 2+1, 3, 3+1]
			VectorRegister4Float LeftSamples23 = VectorLoadTwoPairsFloat(LeftSamples + SourceOffsets[2], LeftSamples + SourceOffsets[3]);
			VectorRegister4Float RightSamples23 = VectorLoadTwoPairsFloat(RightSamples + SourceOffsets[2], RightSamples + SourceOffsets[3]);

			// want [0, 1, 2, 3]
			// [0+1, 1+1, 2+1, 3+1]
			VectorRegister4Float LeftSamplesFrom, LeftSamplesTo;
			VectorDeinterleave(LeftSamplesFrom, LeftSamplesTo, LeftSamples01, LeftSamples23);
			VectorRegister4Float RightSamplesFrom, RightSamplesTo;
			VectorDeinterleave(RightSamplesFrom, RightSamplesTo, RightSamples01, RightSamples23);

			// our lerp vector is CurFracVec, masking off the 1 bit in the mantissa, subtract 2.
			CurFracVec = VectorBitwiseAnd(CurFracVec, MaskVec);
			VectorRegister4Float LerpFactor = VectorSubtract(CurFracVec, TwoVec);
			// 3 - (2 + f) == 1 - f
			VectorRegister4Float OneMinusLerpFactor = VectorSubtract(ThreeVec, CurFracVec);

			CurFracVec = VectorAdd(CurFracVec, StepVec);

			VectorRegister4Float LeftOutputVec = VectorAdd(VectorMultiply(LeftSamplesFrom, OneMinusLerpFactor), VectorMultiply(LeftSamplesTo, LerpFactor));
			VectorRegister4Float RightOutputVec = VectorAdd(VectorMultiply(RightSamplesFrom, OneMinusLerpFactor), VectorMultiply(RightSamplesTo, LerpFactor));

			VectorStore(LeftOutputVec, OutputFrames);
			VectorStore(RightOutputVec, OutputFrames + OutputFramesStrideFloats);

			CurrentFrameFixed += 4 * FixedPointSampleRate;
			OutputFrames += 4;
		}
	}
#endif

	// Remnants: whatever the SIMD loop did not cover (everything when the
	// SIMD path is compiled out).
	for (; OutputFrame < OutputFramesNeeded; OutputFrame++)
	{
		uint32 SourceOffset = CurrentFrameFixed >> 16;
		uint32 CurFrac = CurrentFrameFixed & 0xffff;

		float LerpFactor = (float)(CurFrac) * (1.0f / 65536.0f);

		CurrentFrameFixed += FixedPointSampleRate;

		float Sample1 = LeftSamples[SourceOffset];
		float Sample2 = LeftSamples[SourceOffset + 1];
		OutputFrames[0] = Sample1 * (1 - LerpFactor) + (LerpFactor)* Sample2;

		Sample1 = RightSamples[SourceOffset];
		Sample2 = RightSamples[SourceOffset + 1];
		OutputFrames[OutputFramesStrideFloats] = Sample1 * (1 - LerpFactor) + (LerpFactor)* Sample2;

		OutputFrames++;
	}

	// Keep the next start position relative to the whole frame index that is
	// returned below. This can exceed 0xffff when the step is more than one
	// source frame.
	CurrentFrameFraction = NextRunStartPositionFixed - (FarthestPositionFixed & ~0xffff);

	return FarthestPositionFixed >> 16;
}