// Copyright Epic Games, Inc. All Rights Reserved.

// Builds an FDFType directly from the supplied High and Low parts.
// The parts are stored verbatim: the input floats are NOT 'rebalanced'.
// Use DFTwoSum instead if the input is not already in the correct high/low format.
FDFType DFConstructor(FFloatType High, FFloatType Low)
{
	FDFType Packed;
	Packed.Low = Low;
	Packed.High = High;
	return Packed;
}

// Identity overload: the value already is a double float.
FDFType DFPromote(FDFType Value)
{
	return Value;
}

// Convert to double float: the input becomes the High part and the Low part is zero.
FDFType DFPromote(FFloatType Value)
{
	const FFloatType ZeroLow = (FFloatType)0;
	return DFConstructor(Value, ZeroLow);
}

// Reduce to a single precision float by summing the High and Low parts.
// (This could just return Value.High if you're sure the input is in the correct DF format)
FFloatType DFDemote(FDFType Value)
{
	const FFloatType Collapsed = Value.High + Value.Low;
	return Collapsed;
}

// Identity overload: the value already is single precision.
FFloatType DFDemote(FFloatType Value)
{
	return Value;
}

// Negates a double float by flipping the sign of both of its parts.
FDFType DFNegate(FDFType Value)
{
	const FFloatType NegatedHigh = -Value.High;
	const FFloatType NegatedLow = -Value.Low;
	return DFConstructor(NegatedHigh, NegatedLow);
}

// Sign of a double float, derived from the High part only (the Low part is not inspected).
FFloatType DFSign(FDFType Value)
{
	const FFloatType HighSign = (FFloatType)sign(Value.High);
	return HighSign;
}

// Absolute value: both parts are multiplied by the sign reported by DFSign,
// so the Low part keeps its orientation relative to the High part.
FDFType DFAbs(FDFType Value)
{
	const FFloatType Factor = DFSign(Value);
	const FFloatType AbsHigh = Factor * Value.High;
	const FFloatType AbsLow = Factor * Value.Low;
	return DFConstructor(AbsHigh, AbsLow);
}

/**************/
/** ADDITION **/
/**************/

// Adds two single precision values and returns the sum as a double float:
// High holds the rounded sum, Low holds the recovered rounding error.
// [1] Algorithm 2
FDFType DFTwoSum(FFloatType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
	return DFPromote(Lhs + Rhs);
#else
	const FFloatType Sum = INVARIANT_ADD(Lhs, Rhs);
	// Split the rounded sum back into the portions contributed by each operand...
	const FFloatType RhsPortion = INVARIANT_SUB(Sum, Lhs);
	const FFloatType LhsPortion = INVARIANT_SUB(Sum, RhsPortion);
	// ...and measure how far each operand is from its portion.
	const FFloatType LhsError = INVARIANT_SUB(Lhs, LhsPortion);
	const FFloatType RhsError = INVARIANT_SUB(Rhs, RhsPortion);
	const FFloatType Error = INVARIANT_ADD(LhsError, RhsError);
	return DFConstructor(Sum, Error);
#endif
}

// A cheaper variant of DFTwoSum, valid under the assumption that
// a = 0 or b = 0, or e1 >= e2, where e1 and e2 are the exponents of a (Lhs) and b (Rhs) respectively.
// [1] Algorithm 1
// If this assumption is violated, the worst case is just regular fp32 precision.
FDFType DFFastTwoSum(FFloatType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
	return DFPromote(Lhs + Rhs);
#else
	const FFloatType Sum = INVARIANT_ADD(Lhs, Rhs);
	const FFloatType RhsPortion = INVARIANT_SUB(Sum, Lhs);
	const FFloatType Error = INVARIANT_SUB(Rhs, RhsPortion);
	return DFConstructor(Sum, Error);
#endif
}

// Alias for DFTwoSum
FDFType DFAdd(FFloatType Lhs, FFloatType Rhs)
{
	return DFTwoSum(Lhs, Rhs);
}
FDFType DFFastAdd(FFloatType Lhs, FFloatType Rhs) { return DFTwoSum(Lhs, Rhs); }

// Adds two double floats.
// [1] Algorithm 6
FDFType DFAdd(FDFType Lhs, FDFType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
	return DFPromote(Lhs.High + Rhs.High);
#else
	// Lossless add of the High parts and of the Low parts
	const FDFType HighSum = DFTwoSum(Lhs.High, Rhs.High);
	const FDFType LowSum = DFTwoSum(Lhs.Low, Rhs.Low);
	// Merge and rebalance, folding in one half of LowSum at a time
	FDFType Merged = DFFastTwoSum(HighSum.High, HighSum.Low + LowSum.High);
	Merged = DFFastTwoSum(Merged.High, Merged.Low + LowSum.Low);
	return Merged;
#endif
}

// Adds a single precision value to a double float.
// [1] Algorithm 4
FDFType DFAdd(FDFType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
	return DFPromote(Lhs.High + Rhs);
#else
	const FDFType HighSum = DFTwoSum(Lhs.High, Rhs);
	const FFloatType LowSum = Lhs.Low + HighSum.Low;
	return DFFastTwoSum(HighSum.High, LowSum);
#endif
}

FDFType DFAdd(FFloatType Lhs, FDFType Rhs) { return DFAdd(Rhs, Lhs); }

// Adds two double floats and returns the result as a single precision float.
// Same steps as DFAdd(FDFType, FDFType), except that the final rebalance is replaced by a plain add.
FFloatType DFAddDemote(FDFType Lhs, FDFType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
	return Lhs.High + Rhs.High;
#else
	// Lossless add of the High parts and of the Low parts
	const FDFType HighSum = DFTwoSum(Lhs.High, Rhs.High);
	const FDFType LowSum = DFTwoSum(Lhs.Low, Rhs.Low);
	// Merge and rebalance once
	const FDFType Merged = DFFastTwoSum(HighSum.High, HighSum.Low + LowSum.High);
	// Fold in the remaining low term and collapse to fp32
	return Merged.High + (Merged.Low + LowSum.Low);
#endif
}

// Adds a single precision value to a double float and returns the result as a single precision float.
FFloatType DFAddDemote(FDFType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
	return Lhs.High + Rhs;
#else
	const FDFType HighSum = DFTwoSum(Lhs.High, Rhs);
	const FFloatType LowSum = Lhs.Low + HighSum.Low;
	return HighSum.High + LowSum;
#endif
}

// Mixed overload: swaps the operands and forwards to DFAddDemote(FDFType, FFloatType).
FFloatType DFAddDemote(FFloatType Lhs, FDFType Rhs)
{
	return DFAddDemote(Rhs, Lhs);
}

// Single precision overload: a plain fp32 add.
FFloatType DFAddDemote(FFloatType Lhs, FFloatType Rhs)
{
	const FFloatType Sum = Lhs + Rhs;
	return Sum;
}

// Cheaper double float addition: the Low parts are added with plain fp32 ops
// instead of a second DFTwoSum, and only one rebalance is performed.
FDFType DFFastAdd(FDFType Lhs, FDFType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
	return DFPromote(Lhs.High + Rhs.High);
#else
	const FDFType HighSum = DFTwoSum(Lhs.High, Rhs.High);
	const FFloatType LowSum = HighSum.Low + (Lhs.Low + Rhs.Low);
	return DFFastTwoSum(HighSum.High, LowSum);
#endif
}

// Cheaper addition of a single precision value to a double float.
FDFType DFFastAdd(FDFType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
	return DFPromote(Lhs.High + Rhs);
#else
	const FDFType HighSum = DFTwoSum(Lhs.High, Rhs);
	const FFloatType LowSum = HighSum.Low + Lhs.Low;
	return DFFastTwoSum(HighSum.High, LowSum);
#endif
}

FDFType DFFastAdd(FFloatType Lhs, FDFType Rhs) { return DFFastAdd(Rhs, Lhs); }

// Cheaper double float addition that returns a single precision result.
FFloatType DFFastAddDemote(FDFType Lhs, FDFType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
	return Lhs.High + Rhs.High;
#else
	const FDFType HighSum = DFTwoSum(Lhs.High, Rhs.High);
	const FFloatType LowSum = HighSum.Low + (Lhs.Low + Rhs.Low);
	return HighSum.High + LowSum;
#endif
}

// For a single precision right-hand side the 'fast' variant forwards to DFAddDemote.
FFloatType DFFastAddDemote(FDFType Lhs, FFloatType Rhs)
{
	return DFAddDemote(Lhs, Rhs);
}

// Mixed overload: swaps the operands and forwards to DFFastAddDemote(FDFType, FFloatType).
FFloatType DFFastAddDemote(FFloatType Lhs, FDFType Rhs)
{
	return DFFastAddDemote(Rhs, Lhs);
}

// Single precision overload: a plain fp32 add.
FFloatType DFFastAddDemote(FFloatType Lhs, FFloatType Rhs)
{
	const FFloatType Sum = Rhs + Lhs;
	return Sum;
}

/*****************/
/** SUBTRACTION **/
/*****************/

// Subtraction is implemented as DFAdd with a negated right-hand side.
FDFType DFSubtract(FFloatType Lhs, FFloatType Rhs)
{
	return DFAdd(Lhs, -Rhs);
}

FDFType DFSubtract(FDFType Lhs, FDFType Rhs)
{
	const FDFType NegatedRhs = DFNegate(Rhs);
	return DFAdd(Lhs, NegatedRhs);
}

FDFType DFSubtract(FDFType Lhs, FFloatType Rhs)
{
	return DFAdd(Lhs, -Rhs);
}

FDFType DFSubtract(FFloatType Lhs, FDFType Rhs)
{
	const FDFType NegatedRhs = DFNegate(Rhs);
	return DFAdd(Lhs, NegatedRhs);
}

// Subtraction with a single precision result.
// The single precision overload is a plain fp32 subtract; the others call DFAddDemote with a negated right-hand side.
FFloatType DFSubtractDemote(FFloatType Lhs, FFloatType Rhs)
{
	const FFloatType Difference = Lhs - Rhs;
	return Difference;
}

FFloatType DFSubtractDemote(FDFType Lhs, FDFType Rhs)
{
	const FDFType NegatedRhs = DFNegate(Rhs);
	return DFAddDemote(Lhs, NegatedRhs);
}

FFloatType DFSubtractDemote(FDFType Lhs, FFloatType Rhs)
{
	return DFAddDemote(Lhs, -Rhs);
}

FFloatType DFSubtractDemote(FFloatType Lhs, FDFType Rhs)
{
	const FDFType NegatedRhs = DFNegate(Rhs);
	return DFAddDemote(Lhs, NegatedRhs);
}

// 'Fast' subtraction is implemented as DFFastAdd with a negated right-hand side.
FDFType DFFastSubtract(FFloatType Lhs, FFloatType Rhs)
{
	return DFFastAdd(Lhs, -Rhs);
}

FDFType DFFastSubtract(FDFType Lhs, FDFType Rhs)
{
	const FDFType NegatedRhs = DFNegate(Rhs);
	return DFFastAdd(Lhs, NegatedRhs);
}

FDFType DFFastSubtract(FDFType Lhs, FFloatType Rhs)
{
	return DFFastAdd(Lhs, -Rhs);
}

FDFType DFFastSubtract(FFloatType Lhs, FDFType Rhs)
{
	const FDFType NegatedRhs = DFNegate(Rhs);
	return DFFastAdd(Lhs, NegatedRhs);
}

// 'Fast' subtraction with a single precision result.
// The single precision overload is a plain fp32 subtract; the others call DFFastAddDemote with a negated right-hand side.
FFloatType DFFastSubtractDemote(FFloatType Lhs, FFloatType Rhs)
{
	const FFloatType Difference = Lhs - Rhs;
	return Difference;
}

FFloatType DFFastSubtractDemote(FDFType Lhs, FDFType Rhs)
{
	const FDFType NegatedRhs = DFNegate(Rhs);
	return DFFastAddDemote(Lhs, NegatedRhs);
}

FFloatType DFFastSubtractDemote(FDFType Lhs, FFloatType Rhs)
{
	return DFFastAddDemote(Lhs, -Rhs);
}

FFloatType DFFastSubtractDemote(FFloatType Lhs, FDFType Rhs)
{
	const FDFType NegatedRhs = DFNegate(Rhs);
	return DFFastAddDemote(Lhs, NegatedRhs);
}

// Subtract using only 2 ops instead of ~20, and convert the result to fp32 (1 op).
// CAREFUL: the inputs must satisfy strict preconditions for the output to have any precision at all.
// Assuming you require a precision of 2^-4, the precondition for Lhs and Rhs is
//   (abs(Lhs) < 2^20 && abs(Rhs) < 2^20) ||
//   ( abs(Lhs) < 2^43 && abs(Rhs) < 2^43 && Sign(Lhs) == Sign(Rhs) && abs(trunc(log2(abs(Lhs))) - trunc(log2(abs(Rhs)))) < 1 )
// As the preconditions imply, this does not work for addition due to FP rounding.
// When UE_DF_NO_FAST_MATH is set, this forwards to DFSubtractDemote instead.
FFloatType DFFastLocalSubtractDemote(FDFType Lhs, FDFType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
	return Lhs.High - Rhs.High;
#elif UE_DF_NO_FAST_MATH
	return DFSubtractDemote(Lhs, Rhs);
#else
	// Subtract the parts pairwise, then collapse the two differences to fp32.
	const FFloatType HighDiff = INVARIANT_SUB(Lhs.High, Rhs.High);
	const FFloatType LowDiff = INVARIANT_SUB(Lhs.Low, Rhs.Low);
	return INVARIANT_ADD(HighDiff, LowDiff);
#endif
}

// Cheap subtraction of a single precision value from a double float, returning fp32.
// Rhs is subtracted from the High part and the Low part is added afterwards.
// When UE_DF_NO_FAST_MATH is set, this forwards to DFSubtractDemote instead.
FFloatType DFFastLocalSubtractDemote(FDFType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
	return Lhs.High - Rhs;
#elif UE_DF_NO_FAST_MATH
	return DFSubtractDemote(Lhs, Rhs);
#else
	const FFloatType HighDiff = INVARIANT_SUB(Lhs.High, Rhs);
	return INVARIANT_ADD(HighDiff, Lhs.Low);
#endif
}

// Mixed overload: promotes Lhs to a double float and forwards to the (FDFType, FDFType) overload.
FFloatType DFFastLocalSubtractDemote(FFloatType Lhs, FDFType Rhs)
{
	const FDFType PromotedLhs = DFPromote(Lhs);
	return DFFastLocalSubtractDemote(PromotedLhs, Rhs);
}

/********************/
/** MULTIPLICATION **/
/********************/

// Multiplies two single precision values into a double float without using an FMA instruction.
// Each operand is split into a head (Hx/Hy) and a tail (Tx/Ty); the partial products are then recombined.
// The statement order is significant: every step is wrapped in an INVARIANT_* macro.
// [4] Algorithm 'mul12' (Page 241)
FDFType DFTwoProductNoFMA(FFloatType Lhs, FFloatType Rhs)
{
	const FFloatType Constant = 4097; // = 2^(t - t/2) + 1, where t is the number of mantissa bits
	// Split Lhs into head Hx and tail Tx (Lhs = Hx + Tx)
	const FFloatType Px = INVARIANT_MUL(Lhs, Constant);
	const FFloatType Hx = INVARIANT_ADD(INVARIANT_SUB(Lhs, Px), Px);
	const FFloatType Tx = INVARIANT_SUB(Lhs, Hx);
	// Split Rhs into head Hy and tail Ty (Rhs = Hy + Ty)
	const FFloatType Py = INVARIANT_MUL(Rhs, Constant);
	const FFloatType Hy = INVARIANT_ADD(INVARIANT_SUB(Rhs, Py), Py);
	const FFloatType Ty = INVARIANT_SUB(Rhs, Hy);
	// Head*head product and the sum of the two cross products
	const FFloatType P = INVARIANT_MUL(Hx, Hy);
	const FFloatType Q = INVARIANT_ADD(INVARIANT_MUL(Hx, Ty), INVARIANT_MUL(Tx, Hy));
	// Z is the leading part of the product; E gathers what was rounded away plus the tail*tail product
	const FFloatType Z = INVARIANT_ADD(P, Q);
	const FFloatType E = INVARIANT_ADD(INVARIANT_ADD(INVARIANT_SUB(P, Z), Q), INVARIANT_MUL(Tx, Ty));
	return DFFastTwoSum(Z, E);
}

// Multiplies two single precision values into one double float. (AKA Fast2Mult)
// High holds the fp32 product; Low holds the difference recovered with a fused multiply-add.
// When PLATFORM_SUPPORTS_FMA is not set, this forwards to DFTwoProductNoFMA.
// [1] Algorithm 3
FDFType DFTwoProduct(FFloatType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
	return DFPromote(Lhs * Rhs);
#elif !PLATFORM_SUPPORTS_FMA
	return DFTwoProductNoFMA(Lhs, Rhs);
#else
	const FFloatType Product = Lhs * Rhs;
	const FFloatType Error = INVARIANT_FMA(Lhs, Rhs, -Product);
	return DFConstructor(Product, Error);
#endif
}

// Alias for DFTwoProduct
FDFType DFMultiply(FFloatType Lhs, FFloatType Rhs)
{
	return DFTwoProduct(Lhs, Rhs);
}

// Multiplies two double floats. The Low*Low term is not computed.
// [1] Algorithm 11.
// If no FMA is available, mad() expansion makes this equivalent to [1] Algorithm 10
FDFType DFMultiply(FDFType Lhs, FDFType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
	return DFPromote(Lhs.High * Rhs.High);
#else
	const FDFType HighProduct = DFTwoProduct(Lhs.High, Rhs.High);
	// Accumulate both cross terms, then the error of the High*High product
	const FFloatType CrossHighLow = INVARIANT_MUL(Lhs.High, Rhs.Low);
	const FFloatType CrossSum = INVARIANT_FMA(Lhs.Low, Rhs.High, CrossHighLow);
	const FFloatType LowSum = INVARIANT_ADD(CrossSum, HighProduct.Low);

	return DFFastTwoSum(HighProduct.High, LowSum);
#endif
}

// Multiplies a double float by a single precision value.
// [1] Algorithm 9
// If no FMA is available, mad() expansion makes this equivalent to [1] Algorithm 8
FDFType DFMultiply(FDFType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
	return DFPromote(Lhs.High * Rhs);
#else
	const FDFType HighProduct = DFTwoProduct(Lhs.High, Rhs);
	const FFloatType LowSum = INVARIANT_FMA(Lhs.Low, Rhs, HighProduct.Low);
	return DFFastTwoSum(HighProduct.High, LowSum);
#endif
}

FDFType DFMultiply(FFloatType Lhs, FDFType Rhs) { return DFMultiply(Rhs, Lhs); }

// Multiplies two double floats and returns the result as a single precision float.
// Same steps as DFMultiply(FDFType, FDFType), with the final rebalance replaced by a single add.
FFloatType DFMultiplyDemote(FDFType Lhs, FDFType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
	return Lhs.High * Rhs.High;
#else
	const FDFType HighProduct = DFTwoProduct(Lhs.High, Rhs.High);
	const FFloatType CrossHighLow = INVARIANT_MUL(Lhs.High, Rhs.Low);
	const FFloatType CrossSum = INVARIANT_FMA(Lhs.Low, Rhs.High, CrossHighLow);
	const FFloatType LowSum = INVARIANT_ADD(CrossSum, HighProduct.Low);
	return INVARIANT_ADD(HighProduct.High, LowSum);
#endif
}

// Multiplies a double float by a single precision value and returns the result as a single precision float.
FFloatType DFMultiplyDemote(FDFType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
	return Lhs.High * Rhs;
#else
	const FDFType HighProduct = DFTwoProduct(Lhs.High, Rhs);
	const FFloatType LowSum = INVARIANT_FMA(Lhs.Low, Rhs, HighProduct.Low);
	return INVARIANT_ADD(HighProduct.High, LowSum);
#endif
}

// Mixed overload: swaps the operands and forwards to DFMultiplyDemote(FDFType, FFloatType).
FFloatType DFMultiplyDemote(FFloatType Lhs, FDFType Rhs)
{
	return DFMultiplyDemote(Rhs, Lhs);
}

// Single precision overload: a plain fp32 multiply.
FFloatType DFMultiplyDemote(FFloatType Lhs, FFloatType Rhs)
{
	const FFloatType Product = Lhs * Rhs;
	return Product;
}

// Fast multiplication that assumes the factor (Rhs) is a power of two.
// Both parts are scaled independently; if the assumption holds, no precision is lost.
FDFType DFMultiplyByPow2(FDFType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
	return DFPromote(Lhs.High * Rhs);
#else
	const FFloatType ScaledHigh = Lhs.High * Rhs;
	const FFloatType ScaledLow = Lhs.Low * Rhs;
	return DFConstructor(ScaledHigh, ScaledLow);
#endif
}

// Square of a double float: multiplies the value by itself.
FDFType DFSqr(FDFType V)
{
	return DFMultiply(V, V);
}

// Square of a single precision value, returned as a double float via DFTwoProduct.
FDFType DFSqr(FFloatType V)
{
	return DFTwoProduct(V, V);
}

/**************/
/** DIVISION **/
/**************/

FDFType DFFastDivide(FDFType Lhs, FDFType Rhs);

// Divides two double floats by refining a reciprocal of Rhs and multiplying it with Lhs.
// When PLATFORM_SUPPORTS_FMA is not set, this forwards to DFFastDivide instead.
// [1] Algorithm 18
FDFType DFDivide(FDFType Lhs, FDFType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
	return DFPromote(Lhs.High / Rhs.High);
#elif !PLATFORM_SUPPORTS_FMA
	return DFFastDivide(Lhs, Rhs);
#else
	// fp32 approximation of 1 / Rhs
	const FFloatType ApproxRcp = INVARIANT_DIV(1.0, Rhs.High);
	// Residual 1 - Rhs * ApproxRcp, computed separately for the High and Low parts of Rhs
	const FFloatType ResidualHigh = INVARIANT_FMA(-Rhs.High, ApproxRcp, (FFloatType)1);
	const FFloatType ResidualLow = INVARIANT_MUL(-Rhs.Low, ApproxRcp);
	const FDFType Residual = DFFastTwoSum(ResidualHigh, ResidualLow);
	// Correct the approximation: Rcp = ApproxRcp + Residual * ApproxRcp
	const FDFType Correction = DFMultiply(Residual, ApproxRcp);
	const FDFType RefinedRcp = DFAdd(Correction, ApproxRcp);
	return DFMultiply(Lhs, RefinedRcp);
#endif
}

// Divides a double float by a single precision value.
// [1] Algorithm 15
FDFType DFDivide(FDFType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
	return DFPromote(Lhs.High / Rhs);
#else
	// First quotient estimate from the High part
	const FFloatType QuotHigh = INVARIANT_DIV(Lhs.High, Rhs);
	// Remainder: Lhs - QuotHigh * Rhs, with the product kept as a double float
	const FDFType Product = DFTwoProduct(QuotHigh, Rhs);
	const FFloatType RemHigh = INVARIANT_SUB(Lhs.High, Product.High);
	const FFloatType RemMinusErr = INVARIANT_SUB(RemHigh, Product.Low);
	const FFloatType Remainder = INVARIANT_ADD(RemMinusErr, Lhs.Low);
	// Second quotient term from the remainder
	const FFloatType QuotLow = INVARIANT_DIV(Remainder, Rhs);
	return DFFastTwoSum(QuotHigh, QuotLow);
#endif
}

// Divides two single precision values and returns the quotient as a double float.
// Lhs is promoted and the work is done by DFDivide(FDFType, FFloatType).
FDFType DFDivide(FFloatType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
	return DFPromote(Lhs / Rhs);
#else
	const FDFType PromotedLhs = DFPromote(Lhs);
	return DFDivide(PromotedLhs, Rhs);
#endif
}

// Divides a single precision value by a double float.
// Lhs is promoted and the work is done by DFDivide(FDFType, FDFType).
FDFType DFDivide(FFloatType Lhs, FDFType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
	return DFPromote(Lhs / Rhs.High);
#else
	const FDFType PromotedLhs = DFPromote(Lhs);
	return DFDivide(PromotedLhs, Rhs);
#endif
}

// Divides two double floats.
// Less precise than 18, but almost half the ops
// [1] Algorithm 17
//
// With UE_DF_NO_FAST_MATH set, this redirects to the more precise DFDivide, but only when
// PLATFORM_SUPPORTS_FMA is also set: without FMA, DFDivide(FDFType, FDFType) itself forwards
// to this function, so an unconditional redirect would make the two functions call each other
// forever (and HLSL does not permit recursion).
FDFType DFFastDivide(FDFType Lhs, FDFType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
	return DFPromote(Lhs.High / Rhs.High);
#elif UE_DF_NO_FAST_MATH && PLATFORM_SUPPORTS_FMA
	return DFDivide(Lhs, Rhs);
#else
	// First quotient estimate from the High parts
	const FFloatType Th = INVARIANT_DIV(Lhs.High, Rhs.High);
	// Remainder: Lhs - Rhs * Th, evaluated part by part
	const FDFType R = DFMultiply(Rhs, Th);
	const FFloatType Ph = INVARIANT_SUB(Lhs.High, R.High);
	const FFloatType Dl = INVARIANT_SUB(Lhs.Low, R.Low);
	const FFloatType D = INVARIANT_ADD(Ph, Dl);
	// Second quotient term from the remainder
	const FFloatType Tl = INVARIANT_DIV(D, Rhs.High);
	return DFFastTwoSum(Th, Tl);
#endif
}

// Cheaper division of a double float by a single precision value.
//
// With UE_DF_NO_FAST_MATH set, this redirects to DFDivide(FDFType, FFloatType).
// That overload never calls back into DFFastDivide, so the redirect does not need to
// depend on PLATFORM_SUPPORTS_FMA; this matches DFFastDivideDemote(FDFType, FFloatType).
FDFType DFFastDivide(FDFType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
	return DFPromote(Lhs.High / Rhs);
#elif UE_DF_NO_FAST_MATH
	return DFDivide(Lhs, Rhs);
#else
	// First quotient estimate from the High part
	const FFloatType Th = INVARIANT_DIV(Lhs.High, Rhs);
	// Remainder: Lhs - Rhs * Th, evaluated part by part
	const FDFType R = DFTwoProduct(Rhs, Th);
	const FFloatType Ph = INVARIANT_SUB(Lhs.High, R.High);
	const FFloatType Dl = INVARIANT_SUB(Lhs.Low, R.Low);
	const FFloatType D = INVARIANT_ADD(Ph, Dl);
	// Second quotient term from the remainder
	const FFloatType Tl = INVARIANT_DIV(D, Rhs);
	return DFFastTwoSum(Th, Tl);
#endif
}

// Mixed overload: promotes Lhs and forwards to DFFastDivide(FDFType, FDFType).
FDFType DFFastDivide(FFloatType Lhs, FDFType Rhs)
{
	const FDFType PromotedLhs = DFPromote(Lhs);
	return DFFastDivide(PromotedLhs, Rhs);
}

// Single precision overload: promotes Lhs and forwards to DFFastDivide(FDFType, FFloatType).
FDFType DFFastDivide(FFloatType Lhs, FFloatType Rhs)
{
	const FDFType PromotedLhs = DFPromote(Lhs);
	return DFFastDivide(PromotedLhs, Rhs);
}

// Cheaper division of two double floats, returning a single precision result.
// Same steps as DFFastDivide(FDFType, FDFType), with the final rebalance replaced by a plain add.
// When UE_DF_NO_FAST_MATH is set, this demotes the result of DFDivide instead.
FFloatType DFFastDivideDemote(FDFType Lhs, FDFType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
	return Lhs.High / Rhs.High;
#elif UE_DF_NO_FAST_MATH
	return DFDemote(DFDivide(Lhs, Rhs));
#else
	// First quotient estimate from the High parts
	const FFloatType QuotHigh = INVARIANT_DIV(Lhs.High, Rhs.High);
	// Remainder: Lhs - Rhs * QuotHigh, evaluated part by part
	const FDFType Product = DFMultiply(Rhs, QuotHigh);
	const FFloatType RemHigh = INVARIANT_SUB(Lhs.High, Product.High);
	const FFloatType RemLow = INVARIANT_SUB(Lhs.Low, Product.Low);
	const FFloatType Remainder = INVARIANT_ADD(RemHigh, RemLow);
	// Second quotient term from the remainder
	const FFloatType QuotLow = INVARIANT_DIV(Remainder, Rhs.High);
	return QuotHigh + QuotLow;
#endif
}

// Cheaper division of a double float by a single precision value, returning a single precision result.
// When UE_DF_NO_FAST_MATH is set, this demotes the result of DFDivide instead.
FFloatType DFFastDivideDemote(FDFType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
	return Lhs.High / Rhs;
#elif UE_DF_NO_FAST_MATH
	return DFDemote(DFDivide(Lhs, Rhs));
#else
	// First quotient estimate from the High part
	const FFloatType QuotHigh = INVARIANT_DIV(Lhs.High, Rhs);
	// Remainder: Lhs - Rhs * QuotHigh, evaluated part by part
	const FDFType Product = DFTwoProduct(Rhs, QuotHigh);
	const FFloatType RemHigh = INVARIANT_SUB(Lhs.High, Product.High);
	const FFloatType RemLow = INVARIANT_SUB(Lhs.Low, Product.Low);
	const FFloatType Remainder = INVARIANT_ADD(RemHigh, RemLow);
	// Second quotient term from the remainder
	const FFloatType QuotLow = INVARIANT_DIV(Remainder, Rhs);
	return QuotHigh + QuotLow;
#endif
}

// Mixed overload: promotes Lhs and forwards to DFFastDivideDemote(FDFType, FDFType).
FFloatType DFFastDivideDemote(FFloatType Lhs, FDFType Rhs)
{
	const FDFType PromotedLhs = DFPromote(Lhs);
	return DFFastDivideDemote(PromotedLhs, Rhs);
}

// Single precision overload: a plain fp32 divide.
FFloatType DFFastDivideDemote(FFloatType Lhs, FFloatType Rhs)
{
	const FFloatType Quotient = Lhs / Rhs;
	return Quotient;
}

// Fast division that assumes the divisor (Rhs) is a power of two.
// Both parts are divided independently; if the assumption holds, no precision is lost.
FDFType DFDivideByPow2(FDFType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
	return DFPromote(Lhs.High / Rhs);
#else
	const FFloatType ScaledHigh = Lhs.High / Rhs;
	const FFloatType ScaledLow = Lhs.Low / Rhs;
	return DFConstructor(ScaledHigh, ScaledLow);
#endif
}

FDFType DFRcp(FDFType V) { return DFFastDivide((FFloatType)1.0, V); }
FFloatType DFRcpDemote(FDFType V) { return rcp(V.High); }

/****************/
/** COMPARISON **/
/****************/

// Approximate equality: true when the absolute difference |Lhs - Rhs| (demoted to fp32) is below Threshold.
// The absolute value is required: comparing the signed difference would report any Lhs far below Rhs
// (a large negative difference) as approximately equal.
FBoolType DFEqualsApprox(FDFType Lhs, FDFType Rhs, float Threshold)
{
	return abs(DFSubtractDemote(Lhs, Rhs)) < Threshold;
}
FBoolType DFEqualsApprox(FDFType Lhs, FFloatType Rhs, float Threshold)
{
	return abs(DFSubtractDemote(Lhs, Rhs)) < Threshold;
}
FBoolType DFEqualsApprox(FFloatType Lhs, FDFType Rhs, float Threshold)
{
	return abs(DFSubtractDemote(Lhs, Rhs)) < Threshold;
}

// Exact equality: both the High and the Low parts must match.
// Note that this compares the stored parts, so both inputs are expected to be in the same (rebalanced) format.
FBoolType DFEquals(FDFType Lhs, FDFType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
	return Lhs.High == Rhs.High;
#else
	const FBoolType bHighEqual = (Lhs.High == Rhs.High);
	const FBoolType bLowEqual = (Lhs.Low == Rhs.Low);
#if COMPILER_SUPPORTS_HLSL2021
	return and(bHighEqual, bLowEqual);
#else
	return bHighEqual && bLowEqual;
#endif
#endif
}
// Mixed overloads: the single precision operand is promoted, then compared with DFEquals(FDFType, FDFType).
FBoolType DFEquals(FDFType Lhs, FFloatType Rhs)
{
	const FDFType PromotedRhs = DFPromote(Rhs);
	return DFEquals(Lhs, PromotedRhs);
}
FBoolType DFEquals(FFloatType Lhs, FDFType Rhs)
{
	const FDFType PromotedLhs = DFPromote(Lhs);
	return DFEquals(PromotedLhs, Rhs);
}

// Selects Lhs where S is true and Rhs otherwise, applying select() to the High and Low parts separately.
FDFType DFSelect(FBoolType S, FDFType Lhs, FDFType Rhs)
{
	const FFloatType PickedHigh = select(S, Lhs.High, Rhs.High);
	const FFloatType PickedLow = select(S, Lhs.Low, Rhs.Low);
	return DFConstructor(PickedHigh, PickedLow);
}

// Single precision Rhs: it is treated as a double float with a zero Low part.
FDFType DFSelect(FBoolType S, FDFType Lhs, FFloatType Rhs)
{
	const FFloatType PickedHigh = select(S, Lhs.High, Rhs);
	const FFloatType PickedLow = select(S, Lhs.Low, (FFloatType)0);
	return DFConstructor(PickedHigh, PickedLow);
}

// Single precision Lhs: it is treated as a double float with a zero Low part.
FDFType DFSelect(FBoolType S, FFloatType Lhs, FDFType Rhs)
{
	const FFloatType PickedHigh = select(S, Lhs, Rhs.High);
	const FFloatType PickedLow = select(S, (FFloatType)0, Rhs.Low);
	return DFConstructor(PickedHigh, PickedLow);
}

// Lhs > Rhs: compares the High parts first and falls back to the Low parts when the High parts are equal.
FBoolType DFGreater(FDFType Lhs, FDFType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
	return Lhs.High > Rhs.High;
#else
	const FBoolType bHighGreater = (Lhs.High > Rhs.High);
	const FBoolType bHighEqual = (Lhs.High == Rhs.High);
	const FBoolType bLowGreater = (Lhs.Low > Rhs.Low);
#if COMPILER_SUPPORTS_HLSL2021
	return or(bHighGreater, and(bHighEqual, bLowGreater));
#else
	return bHighGreater || (bHighEqual && bLowGreater);
#endif
#endif
}

// Lhs < Rhs: compares the High parts first and falls back to the Low parts when the High parts are equal.
FBoolType DFLess(FDFType Lhs, FDFType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
	return Lhs.High < Rhs.High;
#else
	const FBoolType bHighLess = (Lhs.High < Rhs.High);
	const FBoolType bHighEqual = (Lhs.High == Rhs.High);
	const FBoolType bLowLess = (Lhs.Low < Rhs.Low);
#if COMPILER_SUPPORTS_HLSL2021
	return or(bHighLess, and(bHighEqual, bLowLess));
#else
	return bHighLess || (bHighEqual && bLowLess);
#endif
#endif
}

// Lhs > Rhs for a single precision Rhs, which is treated as having a zero Low part:
// when the High part equals Rhs, the sign of Lhs.Low decides.
FBoolType DFGreater(FDFType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
	return Lhs.High > Rhs;
#else
	const FBoolType bHighGreater = (Lhs.High > Rhs);
	const FBoolType bHighEqual = (Lhs.High == Rhs);
	const FBoolType bLowPositive = (Lhs.Low > 0.0);
#if COMPILER_SUPPORTS_HLSL2021
	return or(bHighGreater, and(bHighEqual, bLowPositive));
#else
	return bHighGreater || (bHighEqual && bLowPositive);
#endif
#endif
}

// Lhs < Rhs for a single precision Rhs, which is treated as having a zero Low part:
// when the High part equals Rhs, the sign of Lhs.Low decides.
FBoolType DFLess(FDFType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
	return Lhs.High < Rhs;
#else
	const FBoolType bHighLess = (Lhs.High < Rhs);
	const FBoolType bHighEqual = (Lhs.High == Rhs);
	const FBoolType bLowNegative = (Lhs.Low < 0.0);
#if COMPILER_SUPPORTS_HLSL2021
	return or(bHighLess, and(bHighEqual, bLowNegative));
#else
	return bHighLess || (bHighEqual && bLowNegative);
#endif
#endif
}

FBoolType DFGreater(FFloatType Lhs, FDFType Rhs) { return DFLess(Rhs, Lhs); }
FBoolType DFLess(FFloatType Lhs, FDFType Rhs) { return DFGreater(Rhs, Lhs); }

// Lhs >= Rhs, expressed as the negation of DFLess(Lhs, Rhs).
FBoolType DFGreaterEqual(FDFType Lhs, FDFType Rhs)
{
	const FBoolType bLess = DFLess(Lhs, Rhs);
	return !bLess;
}

FBoolType DFGreaterEqual(FDFType Lhs, FFloatType Rhs)
{
	const FBoolType bLess = DFLess(Lhs, Rhs);
	return !bLess;
}

FBoolType DFGreaterEqual(FFloatType Lhs, FDFType Rhs)
{
	const FBoolType bLess = DFLess(Lhs, Rhs);
	return !bLess;
}

// Lhs <= Rhs, expressed as the negation of DFGreater(Lhs, Rhs).
FBoolType DFLessEqual(FDFType Lhs, FFloatType Rhs)
{
	const FBoolType bGreater = DFGreater(Lhs, Rhs);
	return !bGreater;
}

FBoolType DFLessEqual(FDFType Lhs, FDFType Rhs)
{
	const FBoolType bGreater = DFGreater(Lhs, Rhs);
	return !bGreater;
}

FBoolType DFLessEqual(FFloatType Lhs, FDFType Rhs)
{
	const FBoolType bGreater = DFGreater(Lhs, Rhs);
	return !bGreater;
}

// Minimum of two values as a double float: picks Lhs where Lhs < Rhs, otherwise Rhs.
FDFType DFMin(FDFType Lhs, FDFType Rhs)
{
	const FBoolType bLhsSmaller = DFLess(Lhs, Rhs);
	return DFSelect(bLhsSmaller, Lhs, Rhs);
}

// Single precision Rhs is promoted before selection.
FDFType DFMin(FDFType Lhs, FFloatType Rhs)
{
	const FBoolType bLhsSmaller = DFLess(Lhs, Rhs);
	return DFSelect(bLhsSmaller, Lhs, DFPromote(Rhs));
}

// Single precision Lhs is promoted before selection.
FDFType DFMin(FFloatType Lhs, FDFType Rhs)
{
	const FBoolType bLhsSmaller = DFLess(Lhs, Rhs);
	return DFSelect(bLhsSmaller, DFPromote(Lhs), Rhs);
}

// Maximum of two values as a double float: picks Rhs where Lhs < Rhs, otherwise Lhs.
FDFType DFMax(FDFType Lhs, FDFType Rhs)
{
	const FBoolType bLhsSmaller = DFLess(Lhs, Rhs);
	return DFSelect(bLhsSmaller, Rhs, Lhs);
}

// Single precision Rhs is promoted before selection.
FDFType DFMax(FDFType Lhs, FFloatType Rhs)
{
	const FBoolType bLhsSmaller = DFLess(Lhs, Rhs);
	return DFSelect(bLhsSmaller, DFPromote(Rhs), Lhs);
}

// Single precision Lhs is promoted before selection.
FDFType DFMax(FFloatType Lhs, FDFType Rhs)
{
	const FBoolType bLhsSmaller = DFLess(Lhs, Rhs);
	return DFSelect(bLhsSmaller, Rhs, DFPromote(Lhs));
}

/**********/
/** MISC **/
/**********/

// Bundles a double float value with two single precision companion values.
// NOTE(review): going by the field names, Ddx/Ddy presumably hold the screen-space
// derivatives of Value along x and y; nothing in this file populates them, so confirm against callers.
struct FDFTypeDeriv
{
	FDFType Value;  // the double float value itself
	FFloatType Ddx; // presumably d(Value)/dx, stored in single precision
	FFloatType Ddy; // presumably d(Value)/dy, stored in single precision
};

// Square root of a double float, returned as a double float.
// Starts from the fp32 square root of the High part and applies one correction term.
// Note: the correction divides by 2 * sqrt(V.High), so a zero High part divides by zero.
// [2] Algorithm 8
FDFType DFSqrt(FDFType V)
{
#if UE_DF_FORCE_FP32_OPS
	return DFPromote(sqrt(V.High));
#else
	const FFloatType RootHigh = sqrt(V.High);
	// Residual V - RootHigh^2, High part first and then the Low part
	const FFloatType ResidualHigh = INVARIANT_FMA(-RootHigh, RootHigh, V.High); // Note: precision loss here if FMA is not supported
	const FFloatType Residual = INVARIANT_ADD(V.Low, ResidualHigh);

	const FFloatType RootLow = INVARIANT_DIV(Residual, (2.0 * RootHigh));
	return DFFastTwoSum(RootHigh, RootLow);

	// equivalent to (remark 3.5)
	//precise FFloatType Tl = P2 / Sh;
	//precise FFloatType Zh = FMA((FFloatType)0.5, Tl, Sh);
	//precise FFloatType D = Zh - Sh;
	//precise FFloatType Zl = FMA((FFloatType)0.5, Tl, -D);
	//return DFConstructor(Zh, Zl);
#endif
}

// Square root of a double float, returned as a single precision float.
// Same correction step as DFSqrt, with the final rebalance replaced by a plain add.
// [2] Algorithm 9
FFloatType DFSqrtDemote(FDFType V)
{
#if UE_DF_FORCE_FP32_OPS
	return sqrt(V.High);
#else
	const FFloatType RootHigh = sqrt(V.High);
	// Residual V - RootHigh^2, High part first and then the Low part
	const FFloatType ResidualHigh = INVARIANT_FMA(-RootHigh, RootHigh, V.High); // Note: precision loss here if FMA is not supported
	const FFloatType Residual = INVARIANT_ADD(V.Low, ResidualHigh);

	const FFloatType RootLow = INVARIANT_DIV(Residual, (2.0 * RootHigh));
	return RootHigh + RootLow;
#endif
}

// Reciprocal square root of a double float, computed as 1 / DFSqrt(V).
FDFType DFRsqrt(FDFType V)
{
#if UE_DF_FORCE_FP32_OPS
	return DFPromote(rsqrt(V.High));
#else
	const FDFType Root = DFSqrt(V);
	return DFDivide((FFloatType)1.0, Root);
#endif
}
FFloatType DFRsqrtDemote(FDFType V) { return rsqrt(V.High); }

// Sine of a double float, returned in single precision.
// Evaluates sin/cos of the High and Low parts separately and combines them with the angle-sum identity.
FFloatType DFSin(FDFType V)
{
#if UE_DF_FORCE_FP32_OPS
	return sin(V.High);
#else
	FFloatType SinHigh, CosHigh;
	sincos(V.High, SinHigh, CosHigh);
	FFloatType SinLow, CosLow;
	sincos(V.Low, SinLow, CosLow);
	return SinHigh*CosLow + CosHigh*SinLow; // sin(h+l) = sin(h)cos(l) + cos(h)sin(l)
#endif
}

// Cosine of a double float, returned in single precision.
// Evaluates sin/cos of the High and Low parts separately and combines them with the angle-sum identity.
FFloatType DFCos(FDFType V)
{
#if UE_DF_FORCE_FP32_OPS
	return cos(V.High);
#else
	FFloatType SinHigh, CosHigh;
	sincos(V.High, SinHigh, CosHigh);
	FFloatType SinLow, CosLow;
	sincos(V.Low, SinLow, CosLow);
	return CosHigh*CosLow - SinHigh*SinLow; // cos(h+l) = cos(h)cos(l) - sin(h)sin(l)
#endif
}

// Computes the sine and cosine of a double float in one call, both in single precision.
// Sin and Cos are output parameters. The FMA macro used here is expected to be defined by the including file.
void DFSinCos(FDFType V, out FFloatType Sin, out FFloatType Cos)
{
#if UE_DF_FORCE_FP32_OPS
	sincos(V.High, Sin, Cos);
#else
	FFloatType SinHigh, CosHigh;
	sincos(V.High, SinHigh, CosHigh);
	FFloatType SinLow, CosLow;
	sincos(V.Low, SinLow, CosLow);
	Sin = FMA(SinHigh, CosLow,  CosHigh*SinLow); // sin(h+l) = sin(h)cos(l) + cos(h)sin(l)
	Cos = FMA(CosHigh, CosLow, -SinHigh*SinLow); // cos(h+l) = cos(h)cos(l) - sin(h)sin(l)
#endif
}

// Tangent of a double float, returned in single precision.
// Evaluates tan of the High and Low parts separately and combines them with the angle-sum identity.
FFloatType DFTan(FDFType V)
{
#if UE_DF_FORCE_FP32_OPS
	return tan(V.High);
#else
	const FFloatType TanHigh = tan(V.High);
	const FFloatType TanLow = tan(V.Low);
	return (TanHigh + TanLow) / (1.0 - TanHigh * TanLow); // tan(h+l) = (tan(h) + tan(l)) / (1 - tan(h)tan(l))
#endif
}

// Arcsine: the input is demoted to single precision first, then asin() is applied.
FFloatType DFAsin(FDFType V)
{
	const FFloatType Demoted = DFDemote(V);
	return asin(Demoted);
}

// Arccosine: the input is demoted to single precision first, then acos() is applied.
FFloatType DFAcos(FDFType V)
{
	const FFloatType Demoted = DFDemote(V);
	return acos(Demoted);
}

// Arctangent: the input is demoted to single precision first, then atan() is applied.
FFloatType DFAtan(FDFType V)
{
	const FFloatType Demoted = DFDemote(V);
	return atan(Demoted);
}

// Splits a double float into an integer part (output parameter, as a double float)
// and a fractional part (return value, single precision).
// modf() is applied to the High and Low parts separately; the two fractions are summed and split once more
// so that any whole number they form together is carried into the integer part.
FFloatType DFModf(FDFType V, out FDFType Integer)
{
#if UE_DF_FORCE_FP32_OPS
	Integer.Low = 0;
	return modf(V.High, Integer.High);
#else
	FFloatType WholeHigh;
	const FFloatType FractionHigh = modf(V.High, WholeHigh);
	FFloatType WholeLow;
	const FFloatType FractionLow = modf(V.Low, WholeLow);
	FFloatType WholeCarry;
	const FFloatType Fraction = modf(FractionHigh + FractionLow, WholeCarry);
	Integer = DFTwoSum(WholeHigh, WholeLow + WholeCarry);
	return Fraction;
#endif
}

// Splits a double float into an integer part (output parameter) and a fractional part (return value),
// both in single precision. Same steps as DFModf, with the integer parts summed using plain fp32 adds.
FFloatType DFModfDemote(FDFType V, out FFloatType Integer)
{
#if UE_DF_FORCE_FP32_OPS
	return modf(V.High, Integer);
#else
	FFloatType WholeHigh;
	const FFloatType FractionHigh = modf(V.High, WholeHigh);
	FFloatType WholeLow;
	const FFloatType FractionLow = modf(V.Low, WholeLow);
	FFloatType WholeCarry;
	const FFloatType Fraction = modf(FractionHigh + FractionLow, WholeCarry);
	Integer = WholeHigh + WholeLow + WholeCarry;
	return Fraction;
#endif
}

// Ceiling of a double float.
// The integer parts of High and Low are kept; ceil() of the summed fractions is added on top.
FDFType DFCeil(FDFType V)
{
#if UE_DF_FORCE_FP32_OPS
	return DFPromote(ceil(V.High));
#else
	FFloatType WholeHigh;
	const FFloatType FractionHigh = modf(V.High, WholeHigh);
	FFloatType WholeLow;
	const FFloatType FractionLow = modf(V.Low, WholeLow);
	const FFloatType AdjustedLow = WholeLow + ceil(FractionHigh + FractionLow);
	return DFTwoSum(WholeHigh, AdjustedLow);
#endif
}

// Floor of a double float.
// The integer parts of High and Low are kept; floor() of the summed fractions is added on top.
FDFType DFFloor(FDFType V)
{
#if UE_DF_FORCE_FP32_OPS
	return DFPromote(floor(V.High));
#else
	FFloatType WholeHigh;
	const FFloatType FractionHigh = modf(V.High, WholeHigh);
	FFloatType WholeLow;
	const FFloatType FractionLow = modf(V.Low, WholeLow);
	const FFloatType AdjustedLow = WholeLow + floor(FractionHigh + FractionLow);
	return DFTwoSum(WholeHigh, AdjustedLow);
#endif
}

// Rounds a double float to the nearest integer.
// Works on the absolute value (both parts multiplied by the sign), rounds via floor(x + 0.5),
// then re-applies the sign to the result.
FDFType DFRound(FDFType V)
{
#if UE_DF_FORCE_FP32_OPS
	return DFPromote(round(V.High));
#else
	const FFloatType SignFactor = DFSign(V);
	const FDFType Magnitude = DFConstructor(SignFactor*V.High, SignFactor*V.Low);

	FFloatType WholeHigh;
	const FFloatType FractionHigh = modf(Magnitude.High, WholeHigh);
	FFloatType WholeLow;
	const FFloatType FractionLow = modf(Magnitude.Low, WholeLow);

	const FFloatType AdjustedLow = WholeLow + floor(FractionHigh + FractionLow + 0.5);
	return DFTwoSum(SignFactor * WholeHigh, SignFactor * AdjustedLow);
#endif
}

// Truncates a double float towards zero.
// Works on the absolute value (both parts multiplied by the sign), floors it,
// then re-applies the sign to the result.
FDFType DFTrunc(FDFType V)
{
#if UE_DF_FORCE_FP32_OPS
	return DFPromote(trunc(V.High));
#else
	const FFloatType SignFactor = DFSign(V);
	const FDFType Magnitude = DFConstructor(SignFactor*V.High, SignFactor*V.Low);

	FFloatType WholeHigh;
	const FFloatType FractionHigh = modf(Magnitude.High, WholeHigh);
	FFloatType WholeLow;
	const FFloatType FractionLow = modf(Magnitude.Low, WholeLow);

	const FFloatType AdjustedLow = WholeLow + floor(FractionHigh + FractionLow);
	return DFTwoSum(SignFactor * WholeHigh, SignFactor * AdjustedLow);
#endif
}

// Fractional part of a double float, computed as V - DFFloor(V) and returned as a double float.
FDFType DFFrac(FDFType V)
{
#if UE_DF_FORCE_FP32_OPS
	return DFPromote(frac(V.High));
#else
	const FDFType Floored = DFFloor(V);
	return DFSubtract(V, Floored);
#endif
}

// Fractional part of a double float, returned in single precision.
// Technically, this function has very low precision due to discontinuity+rounding
// (e.g. 0.99.. ~= 1.0 => frac(0.99..) = frac(1.0) = 0.0)
// but this is irrelevant in practical cases
// TODO: test negative values
FFloatType DFFracDemote(FDFType V)
{
#if UE_DF_FORCE_FP32_OPS
	return frac(V.High);
#else
	const FFloatType FractionSum = frac(V.High) + frac(V.Low);
	return frac(FractionSum);
#endif
}

// Floating point remainder of a double float divided by a single precision value.
// Similar to HLSL fmod, this is equivalent to `lhs - rhs * trunc(lhs / rhs)`
// so fmod(-0.1, 1.0) returns -0.1, not 0.9 (as mod does in GLSL, which is the true mathematical modulo operator)
FDFType DFFmod(FDFType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
	return DFPromote(fmod(Lhs.High, Rhs));
#else
	const FDFType WholeQuotient = DFTrunc(DFDivide(Lhs, Rhs));
	const FDFType WholeMultiple = DFMultiply(WholeQuotient, Rhs);
	return DFSubtract(Lhs, WholeMultiple);
#endif
}

// Floating point remainder of a double float divided by a single precision value, returned in single precision.
// Computed as Lhs - Rhs * trunc(Lhs / Rhs), with the final subtraction demoted.
FFloatType DFFmodDemote(FDFType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
	return fmod(Lhs.High, Rhs);
#else
	//DF_TODO: optimize. Is equal to frac(Lhs/Rhs)*Rhs?
	//not equal to fmod(fmod(high)+fmod(low)) due to how fmod handles negative values
	const FDFType WholeQuotient = DFTrunc(DFDivide(Lhs, Rhs));
	const FDFType WholeMultiple = DFMultiply(WholeQuotient, Rhs);
	return DFSubtractDemote(Lhs, WholeMultiple);
#endif
}

// Remainder for a divisor that is used with DFDivideByPow2 (i.e. expected to be a power of two),
// returned in single precision. Computed as frac(Lhs / Rhs) * Rhs.
FFloatType DFFmodByPow2Demote(FDFType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
	return fmod(Lhs.High, Rhs);
#else
	const FDFType Quotient = DFDivideByPow2(Lhs, Rhs);
	return DFFracDemote(Quotient) * Rhs;
#endif
}

// Linear interpolation between two double floats by a single precision factor S.
// The High and Low parts are interpolated independently with lerp() and then summed with DFTwoSum.
FDFType DFLerp(FDFType Lhs, FDFType Rhs, FFloatType S)
{
#if UE_DF_FORCE_FP32_OPS
	return DFPromote(lerp(Lhs.High, Rhs.High, S));
#else
	//DF_TODO: precision issues
	const FFloatType BlendedHigh = lerp(Lhs.High, Rhs.High, S);
	const FFloatType BlendedLow = lerp(Lhs.Low, Rhs.Low, S);
	return DFTwoSum(BlendedHigh, BlendedLow);
#endif
}

// Clamps a double float to the range [0, 1].
// The upper bound is applied first, then the lower bound.
FDFType DFSaturate(FDFType V)
{
#if UE_DF_FORCE_FP32_OPS
	return DFPromote(saturate(V.High));
#else
	const FDFType UpperClamped = DFSelect(DFLess(V, 1.0), V, 1.0);
	return DFSelect(DFLess(UpperClamped, 0.0), 0.0, UpperClamped);
#endif
}
FFloatType DFSaturateDemote(FDFType V) { return saturate(DFDemote(V)); }

// Smoothstep between edges Lhs and Rhs evaluated at S, returned as a double float.
// T = saturate((S - Lhs) / (Rhs - Lhs)); result = T^2 * (3 - 2T).
FDFType DFSmoothStep(FDFType Lhs, FDFType Rhs, FDFType S)
{
#if UE_DF_FORCE_FP32_OPS
	return DFPromote(smoothstep(Lhs.High, Rhs.High, S.High));
#else
	const FDFType Offset = DFSubtract(S, Lhs);
	const FDFType Range = DFSubtract(Rhs, Lhs);
	const FDFType Param = DFSaturate(DFDivide(Offset, Range));
	return DFMultiply(DFSqr(Param), DFSubtract(3.0f, DFMultiplyByPow2(Param, 2.0f)));
#endif
}
// Smoothstep between edges Lhs and Rhs evaluated at S, returned in single precision.
// The interpolation parameter is computed in double float, demoted, and the polynomial is evaluated in fp32.
FFloatType DFSmoothStepDemote(FDFType Lhs, FDFType Rhs, FDFType S)
{
#if UE_DF_FORCE_FP32_OPS
	return smoothstep(Lhs.High, Rhs.High, S.High);
#else
	const FDFType Offset = DFSubtract(S, Lhs);
	const FDFType Range = DFSubtract(Rhs, Lhs);
	const FFloatType Param = DFSaturateDemote(DFDivide(Offset, Range));
	return Param*Param*(3.0f - (2.0f*Param));
#endif
}

// Step function: returns 1 where Rhs >= Lhs and 0 otherwise.
FFloatType DFStep(FDFType Lhs, FDFType Rhs)
{
	const FBoolType bPastEdge = DFGreaterEqual(Rhs, Lhs);
	return select(bPastEdge, (FFloatType)1.0f, (FFloatType)0.0f);
}

FFloatType DFStep(FDFType Lhs, FFloatType Rhs)
{
	const FBoolType bPastEdge = DFGreaterEqual(Rhs, Lhs);
	return select(bPastEdge, (FFloatType)1.0f, (FFloatType)0.0f);
}

FFloatType DFStep(FFloatType Lhs, FDFType Rhs)
{
	const FBoolType bPastEdge = DFGreaterEqual(Rhs, Lhs);
	return select(bPastEdge, (FFloatType)1.0f, (FFloatType)0.0f);
}

// No ddxy inside ray tracing shaders
#if RAYHITGROUPSHADER || RAYMISSHADER || RAYCALLABLESHADER || USE_FORCE_TEXTURE_MIP
// Stub derivatives for shader configurations where the enclosing #if is true: they always return zero.
FDFType DFDdx(FDFType V)
{
	return (FDFType)0;
}

FDFType DFDdy(FDFType V)
{
	return (FDFType)0;
}

FFloatType DFDdxDemote(FDFType V)
{
	return (FFloatType)0;
}

FFloatType DFDdyDemote(FDFType V)
{
	return (FFloatType)0;
}
#else

// Derivative along x of a double float, returned as a double float.
// ddx() is applied to the High and Low parts separately and the results are summed with DFTwoSum.
FDFType DFDdx(FDFType V)
{
#if UE_DF_FORCE_FP32_OPS
	return DFPromote(ddx(V.High));
#else
	const FFloatType HighDeriv = ddx(V.High);
	const FFloatType LowDeriv = ddx(V.Low);
	return DFTwoSum(HighDeriv, LowDeriv);
#endif
}

// Derivative along y of a double float, returned as a double float.
// ddy() is applied to the High and Low parts separately and the results are summed with DFTwoSum.
FDFType DFDdy(FDFType V)
{
#if UE_DF_FORCE_FP32_OPS
	return DFPromote(ddy(V.High));
#else
	const FFloatType HighDeriv = ddy(V.High);
	const FFloatType LowDeriv = ddy(V.Low);
	return DFTwoSum(HighDeriv, LowDeriv);
#endif
}

// Derivative along x of a double float, returned in single precision.
// ddx() is applied to the High and Low parts separately and the results are added in fp32.
FFloatType DFDdxDemote(FDFType V)
{
#if UE_DF_FORCE_FP32_OPS
	return ddx(V.High);
#else
	const FFloatType HighDeriv = ddx(V.High);
	const FFloatType LowDeriv = ddx(V.Low);
	return HighDeriv + LowDeriv;
#endif
}

// Derivative along y of a double float, returned in single precision.
// ddy() is applied to the High and Low parts separately and the results are added in fp32.
FFloatType DFDdyDemote(FDFType V)
{
#if UE_DF_FORCE_FP32_OPS
	return ddy(V.High);
#else
	const FFloatType HighDeriv = ddy(V.High);
	const FFloatType LowDeriv = ddy(V.Low);
	return HighDeriv + LowDeriv;
#endif
}
#endif