Files
UnrealEngine/Engine/Shaders/Private/DoubleFloatOperations.ush
2025-05-18 13:04:45 +08:00

1004 lines
27 KiB
HLSL

// Copyright Epic Games, Inc. All Rights Reserved.
// Builds an FDFType directly from the given high and low parts.
// The parts are stored as-is, without any 'rebalancing'.
// Use DFTwoSum instead if the input is not already in the correct high/low format.
FDFType DFConstructor(FFloatType High, FFloatType Low)
{
    FDFType Packed;
    Packed.Low = Low;
    Packed.High = High;
    return Packed;
}
// Identity overload: the value already is a double float.
FDFType DFPromote(FDFType Value)
{
    return Value;
}
// Convert to double float: the input becomes the high part, the low part is zero.
FDFType DFPromote(FFloatType Value)
{
    const FFloatType ZeroLow = (FFloatType)0;
    return DFConstructor(Value, ZeroLow);
}
// Truncate to single precision float by adding the high and low parts.
// (This could just return Value.High if you're sure the input is in the correct DF format)
FFloatType DFDemote(FDFType Value)
{
    const FFloatType Collapsed = Value.High + Value.Low;
    return Collapsed;
}
// Identity overload: the value already is single precision.
FFloatType DFDemote(FFloatType Value)
{
    return Value;
}
// Negates a double float by flipping the sign of both parts.
FDFType DFNegate(FDFType Value)
{
    const FFloatType NegHigh = -Value.High;
    const FFloatType NegLow = -Value.Low;
    return DFConstructor(NegHigh, NegLow);
}
// Returns sign() of the high part; the low part is not inspected.
FFloatType DFSign(FDFType Value)
{
    const FFloatType HighPart = Value.High;
    return (FFloatType)sign(HighPart);
}
// Absolute value: both parts are multiplied by the sign of the value (see DFSign).
FDFType DFAbs(FDFType Value)
{
    const FFloatType Factor = DFSign(Value);
    const FFloatType AbsHigh = Factor * Value.High;
    const FFloatType AbsLow = Factor * Value.Low;
    return DFConstructor(AbsHigh, AbsLow);
}
/**************/
/** ADDITION **/
/**************/
// Sum up two single precision values into one double precision value.
// The high part is the fp32 sum, the low part is the computed rounding error of that sum.
// [1] Algorithm 2
FDFType DFTwoSum(FFloatType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
    return DFPromote(Lhs + Rhs);
#else
    const FFloatType Sum = INVARIANT_ADD(Lhs, Rhs);
    // Recover the portions of Rhs and Lhs that actually made it into Sum
    const FFloatType RhsVirtual = INVARIANT_SUB(Sum, Lhs);
    const FFloatType LhsVirtual = INVARIANT_SUB(Sum, RhsVirtual);
    // Whatever did not make it into Sum is the error of each operand
    const FFloatType RhsError = INVARIANT_SUB(Rhs, RhsVirtual);
    const FFloatType LhsError = INVARIANT_SUB(Lhs, LhsVirtual);
    const FFloatType Error = INVARIANT_ADD(LhsError, RhsError);
    return DFConstructor(Sum, Error);
#endif
}
// An optimized version of DFTwoSum, valid under the assumption that
// a = 0 or b = 0, or e1 >= e2, where e1 and e2 are the exponents of a and b respectively.
// [1] Algorithm 1
// If this assumption is violated, the worst case precision is just regular fp32 precision
FDFType DFFastTwoSum(FFloatType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
    return DFPromote(Lhs + Rhs);
#else
    const FFloatType Sum = INVARIANT_ADD(Lhs, Rhs);
    const FFloatType RhsVirtual = INVARIANT_SUB(Sum, Lhs);
    const FFloatType Error = INVARIANT_SUB(Rhs, RhsVirtual);
    return DFConstructor(Sum, Error);
#endif
}
// Aliases for DFTwoSum: both the regular and the 'fast' fp32 + fp32 add forward to it.
FDFType DFAdd(FFloatType Lhs, FFloatType Rhs)
{
    return DFTwoSum(Lhs, Rhs);
}
FDFType DFFastAdd(FFloatType Lhs, FFloatType Rhs)
{
    return DFTwoSum(Lhs, Rhs);
}
// Double float + double float.
// [1] Algorithm 6
FDFType DFAdd(FDFType Lhs, FDFType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
    return DFPromote(Lhs.High + Rhs.High);
#else
    // Add the high parts and the low parts separately, keeping the rounding error of each sum
    const FDFType HighSum = DFTwoSum(Lhs.High, Rhs.High);
    const FDFType LowSum = DFTwoSum(Lhs.Low, Rhs.Low);
    // Fold the low sum into the result in two steps, rebalancing after each
    const FDFType Merged = DFFastTwoSum(HighSum.High, HighSum.Low + LowSum.High);
    return DFFastTwoSum(Merged.High, Merged.Low + LowSum.Low);
#endif
}
// Double float + single float.
// [1] Algorithm 4
FDFType DFAdd(FDFType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
    return DFPromote(Lhs.High + Rhs);
#else
    const FDFType HighSum = DFTwoSum(Lhs.High, Rhs);
    const FFloatType Tail = Lhs.Low + HighSum.Low;
    return DFFastTwoSum(HighSum.High, Tail);
#endif
}
// Single float + double float: forwards to the overload above with the operands swapped.
FDFType DFAdd(FFloatType Lhs, FDFType Rhs)
{
    return DFAdd(Rhs, Lhs);
}
// Double float + double float, returning a single precision result.
// Same steps as DFAdd(FDFType, FDFType), except the final rebalance is replaced by a plain add.
FFloatType DFAddDemote(FDFType Lhs, FDFType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
    return Lhs.High + Rhs.High;
#else
    // Add the high parts and the low parts separately, keeping the rounding error of each sum
    const FDFType HighSum = DFTwoSum(Lhs.High, Rhs.High);
    const FDFType LowSum = DFTwoSum(Lhs.Low, Rhs.Low);
    // Merge and rebalance once, then collapse to fp32
    const FDFType Merged = DFFastTwoSum(HighSum.High, HighSum.Low + LowSum.High);
    const FFloatType Tail = Merged.Low + LowSum.Low;
    return Merged.High + Tail;
#endif
}
// Double float + single float, returning a single precision result.
FFloatType DFAddDemote(FDFType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
    return Lhs.High + Rhs;
#else
    const FDFType HighSum = DFTwoSum(Lhs.High, Rhs);
    const FFloatType Tail = Lhs.Low + HighSum.Low;
    return HighSum.High + Tail;
#endif
}
// Single float + double float: forwards to the (FDFType, FFloatType) overload with the operands swapped.
FFloatType DFAddDemote(FFloatType Lhs, FDFType Rhs)
{
    return DFAddDemote(Rhs, Lhs);
}
// Single float + single float: plain fp32 add.
FFloatType DFAddDemote(FFloatType Lhs, FFloatType Rhs)
{
    const FFloatType Sum = Lhs + Rhs;
    return Sum;
}
// Cheaper double float + double float: the low parts are combined with plain fp32 adds
// and the result is rebalanced only once.
FDFType DFFastAdd(FDFType Lhs, FDFType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
    return DFPromote(Lhs.High + Rhs.High);
#else
    const FDFType HighSum = DFTwoSum(Lhs.High, Rhs.High);
    const FFloatType LowSum = Lhs.Low + Rhs.Low;
    const FFloatType Tail = HighSum.Low + LowSum;
    return DFFastTwoSum(HighSum.High, Tail);
#endif
}
// Cheaper double float + single float.
FDFType DFFastAdd(FDFType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
    return DFPromote(Lhs.High + Rhs);
#else
    const FDFType HighSum = DFTwoSum(Lhs.High, Rhs);
    const FFloatType Tail = HighSum.Low + Lhs.Low;
    return DFFastTwoSum(HighSum.High, Tail);
#endif
}
// Single float + double float: forwards to the overload above with the operands swapped.
FDFType DFFastAdd(FFloatType Lhs, FDFType Rhs)
{
    return DFFastAdd(Rhs, Lhs);
}
// Cheaper double float + double float, returning a single precision result.
FFloatType DFFastAddDemote(FDFType Lhs, FDFType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
    return Lhs.High + Rhs.High;
#else
    const FDFType HighSum = DFTwoSum(Lhs.High, Rhs.High);
    const FFloatType LowSum = Lhs.Low + Rhs.Low;
    const FFloatType Tail = HighSum.Low + LowSum;
    return HighSum.High + Tail;
#endif
}
// Double float + single float: forwards to the regular DFAddDemote.
FFloatType DFFastAddDemote(FDFType Lhs, FFloatType Rhs)
{
    return DFAddDemote(Lhs, Rhs);
}
// Single float + double float: forwards to the (FDFType, FFloatType) overload with the operands swapped.
FFloatType DFFastAddDemote(FFloatType Lhs, FDFType Rhs)
{
    return DFFastAddDemote(Rhs, Lhs);
}
// Single float + single float: plain fp32 add.
FFloatType DFFastAddDemote(FFloatType Lhs, FFloatType Rhs)
{
    const FFloatType Sum = Rhs + Lhs;
    return Sum;
}
/*****************/
/** SUBTRACTION **/
/*****************/
// Subtraction is implemented as DFAdd with the right-hand side negated.
FDFType DFSubtract(FFloatType Lhs, FFloatType Rhs)
{
    const FFloatType NegRhs = -Rhs;
    return DFAdd(Lhs, NegRhs);
}
FDFType DFSubtract(FDFType Lhs, FDFType Rhs)
{
    const FDFType NegRhs = DFNegate(Rhs);
    return DFAdd(Lhs, NegRhs);
}
FDFType DFSubtract(FDFType Lhs, FFloatType Rhs)
{
    const FFloatType NegRhs = -Rhs;
    return DFAdd(Lhs, NegRhs);
}
FDFType DFSubtract(FFloatType Lhs, FDFType Rhs)
{
    const FDFType NegRhs = DFNegate(Rhs);
    return DFAdd(Lhs, NegRhs);
}
// Subtraction with a single precision result.
// The fp32 - fp32 case is a plain subtract; the others are DFAddDemote with the right-hand side negated.
FFloatType DFSubtractDemote(FFloatType Lhs, FFloatType Rhs)
{
    const FFloatType Difference = Lhs - Rhs;
    return Difference;
}
FFloatType DFSubtractDemote(FDFType Lhs, FDFType Rhs)
{
    const FDFType NegRhs = DFNegate(Rhs);
    return DFAddDemote(Lhs, NegRhs);
}
FFloatType DFSubtractDemote(FDFType Lhs, FFloatType Rhs)
{
    const FFloatType NegRhs = -Rhs;
    return DFAddDemote(Lhs, NegRhs);
}
FFloatType DFSubtractDemote(FFloatType Lhs, FDFType Rhs)
{
    const FDFType NegRhs = DFNegate(Rhs);
    return DFAddDemote(Lhs, NegRhs);
}
// Fast subtraction is implemented as DFFastAdd with the right-hand side negated.
FDFType DFFastSubtract(FFloatType Lhs, FFloatType Rhs)
{
    const FFloatType NegRhs = -Rhs;
    return DFFastAdd(Lhs, NegRhs);
}
FDFType DFFastSubtract(FDFType Lhs, FDFType Rhs)
{
    const FDFType NegRhs = DFNegate(Rhs);
    return DFFastAdd(Lhs, NegRhs);
}
FDFType DFFastSubtract(FDFType Lhs, FFloatType Rhs)
{
    const FFloatType NegRhs = -Rhs;
    return DFFastAdd(Lhs, NegRhs);
}
FDFType DFFastSubtract(FFloatType Lhs, FDFType Rhs)
{
    const FDFType NegRhs = DFNegate(Rhs);
    return DFFastAdd(Lhs, NegRhs);
}
// Fast subtraction with a single precision result.
// The fp32 - fp32 case is a plain subtract; the others are DFFastAddDemote with the right-hand side negated.
FFloatType DFFastSubtractDemote(FFloatType Lhs, FFloatType Rhs)
{
    const FFloatType Difference = Lhs - Rhs;
    return Difference;
}
FFloatType DFFastSubtractDemote(FDFType Lhs, FDFType Rhs)
{
    const FDFType NegRhs = DFNegate(Rhs);
    return DFFastAddDemote(Lhs, NegRhs);
}
FFloatType DFFastSubtractDemote(FDFType Lhs, FFloatType Rhs)
{
    const FFloatType NegRhs = -Rhs;
    return DFFastAddDemote(Lhs, NegRhs);
}
FFloatType DFFastSubtractDemote(FFloatType Lhs, FDFType Rhs)
{
    const FDFType NegRhs = DFNegate(Rhs);
    return DFFastAddDemote(Lhs, NegRhs);
}
// Subtract using only 2 ops instead of ~20, and convert the result to fp32 (1 op)
// CAREFUL: the inputs must satisfy strict preconditions for the output to have any precision at all.
// Assuming you require a precision of 2^-4, the precondition for Lhs and Rhs is
// (abs(Lhs) < 2^20 && abs(Rhs) < 2^20) ||
// ( abs(Lhs) < 2^43 && abs(Rhs) < 2^43 && Sign(Lhs) == Sign(Rhs) && abs(trunc(log2(abs(Lhs))) - trunc(log2(abs(Rhs)))) < 1 )
// As the preconditions imply, this does not work for addition due to FP rounding.
FFloatType DFFastLocalSubtractDemote(FDFType Lhs, FDFType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
    return Lhs.High - Rhs.High;
#elif UE_DF_NO_FAST_MATH
    return DFSubtractDemote(Lhs, Rhs);
#else
    // Subtract the parts independently, then collapse to fp32
    const FFloatType LowDiff = INVARIANT_SUB(Lhs.Low, Rhs.Low);
    const FFloatType HighDiff = INVARIANT_SUB(Lhs.High, Rhs.High);
    return INVARIANT_ADD(HighDiff, LowDiff);
#endif
}
// Double float - single float variant of the fast local subtract.
// Rhs is subtracted from the high part, then the low part is added back in.
FFloatType DFFastLocalSubtractDemote(FDFType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
    return Lhs.High - Rhs;
#elif UE_DF_NO_FAST_MATH
    return DFSubtractDemote(Lhs, Rhs);
#else
    const FFloatType HighDiff = INVARIANT_SUB(Lhs.High, Rhs);
    return INVARIANT_ADD(HighDiff, Lhs.Low);
#endif
}
// Single float - double float variant: promotes Lhs and forwards to the (FDFType, FDFType) overload.
FFloatType DFFastLocalSubtractDemote(FFloatType Lhs, FDFType Rhs)
{
    const FDFType PromotedLhs = DFPromote(Lhs);
    return DFFastLocalSubtractDemote(PromotedLhs, Rhs);
}
/********************/
/** MULTIPLICATION **/
/********************/
// Product of two single precision values as a double float, without using FMA.
// Each operand is split into a high and a low piece, and the partial products are recombined.
// [4] Algorithm 'mul12' (Page 241)
FDFType DFTwoProductNoFMA(FFloatType Lhs, FFloatType Rhs)
{
    const FFloatType SplitFactor = 4097; // = 2^(t - t/2) + 1, where t is the number of mantissa bits

    // Split Lhs into LhsHi + LhsLo
    const FFloatType LhsScaled = INVARIANT_MUL(Lhs, SplitFactor);
    const FFloatType LhsHi = INVARIANT_ADD(INVARIANT_SUB(Lhs, LhsScaled), LhsScaled);
    const FFloatType LhsLo = INVARIANT_SUB(Lhs, LhsHi);

    // Split Rhs into RhsHi + RhsLo
    const FFloatType RhsScaled = INVARIANT_MUL(Rhs, SplitFactor);
    const FFloatType RhsHi = INVARIANT_ADD(INVARIANT_SUB(Rhs, RhsScaled), RhsScaled);
    const FFloatType RhsLo = INVARIANT_SUB(Rhs, RhsHi);

    // Partial products
    const FFloatType HiProduct = INVARIANT_MUL(LhsHi, RhsHi);
    const FFloatType Cross = INVARIANT_ADD(INVARIANT_MUL(LhsHi, RhsLo), INVARIANT_MUL(LhsLo, RhsHi));
    const FFloatType LoProduct = INVARIANT_MUL(LhsLo, RhsLo);

    // Recombine into the main sum and its residual
    const FFloatType Sum = INVARIANT_ADD(HiProduct, Cross);
    const FFloatType Residual = INVARIANT_ADD(INVARIANT_ADD(INVARIANT_SUB(HiProduct, Sum), Cross), LoProduct);
    return DFFastTwoSum(Sum, Residual);
}
// Multiply up two single precision scalars to one double precision scalar. (AKA Fast2Mult)
// The high part is the fp32 product; the low part is the rounding error of that product,
// obtained with a single FMA. Falls back to DFTwoProductNoFMA when FMA is unavailable.
// [1] Algorithm 3
FDFType DFTwoProduct(FFloatType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
    return DFPromote(Lhs * Rhs);
#elif !PLATFORM_SUPPORTS_FMA
    return DFTwoProductNoFMA(Lhs, Rhs);
#else
    // Use the INVARIANT_ form for the product as well, consistent with the other
    // error-free transformations in this file: the error term below is only meaningful
    // if P is exactly the rounded fp32 product and is not algebraically folded away.
    const FFloatType P = INVARIANT_MUL(Lhs, Rhs);
    const FFloatType E = INVARIANT_FMA(Lhs, Rhs, -P);
    return DFConstructor(P, E);
#endif
}
// Alias for DFTwoProduct
FDFType DFMultiply(FFloatType Lhs, FFloatType Rhs)
{
    return DFTwoProduct(Lhs, Rhs);
}
// Double float * double float.
// [1] Algorithm 11.
// If no FMA is available, mad() expansion makes this equivalent to [1] Algorithm 10
FDFType DFMultiply(FDFType Lhs, FDFType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
    return DFPromote(Lhs.High * Rhs.High);
#else
    const FDFType HighProduct = DFTwoProduct(Lhs.High, Rhs.High);
    // Cross terms High*Low + Low*High (the Low*Low term is not computed)
    const FFloatType CrossHighLow = INVARIANT_MUL(Lhs.High, Rhs.Low);
    const FFloatType Cross = INVARIANT_FMA(Lhs.Low, Rhs.High, CrossHighLow);
    const FFloatType Tail = INVARIANT_ADD(Cross, HighProduct.Low);
    return DFFastTwoSum(HighProduct.High, Tail);
#endif
}
// Double float * single float.
// [1] Algorithm 9
// If no FMA is available, mad() expansion makes this equivalent to [1] Algorithm 8
FDFType DFMultiply(FDFType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
    return DFPromote(Lhs.High * Rhs);
#else
    const FDFType HighProduct = DFTwoProduct(Lhs.High, Rhs);
    const FFloatType Tail = INVARIANT_FMA(Lhs.Low, Rhs, HighProduct.Low);
    return DFFastTwoSum(HighProduct.High, Tail);
#endif
}
// Single float * double float: forwards to the overload above with the operands swapped.
FDFType DFMultiply(FFloatType Lhs, FDFType Rhs)
{
    return DFMultiply(Rhs, Lhs);
}
// Double float * double float, returning a single precision result.
// Same steps as DFMultiply(FDFType, FDFType), with the final rebalance replaced by one add.
FFloatType DFMultiplyDemote(FDFType Lhs, FDFType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
    return Lhs.High * Rhs.High;
#else
    const FDFType HighProduct = DFTwoProduct(Lhs.High, Rhs.High);
    const FFloatType CrossHighLow = INVARIANT_MUL(Lhs.High, Rhs.Low);
    const FFloatType Cross = INVARIANT_FMA(Lhs.Low, Rhs.High, CrossHighLow);
    const FFloatType Tail = INVARIANT_ADD(Cross, HighProduct.Low);
    return INVARIANT_ADD(HighProduct.High, Tail);
#endif
}
// Double float * single float, returning a single precision result.
FFloatType DFMultiplyDemote(FDFType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
    return Lhs.High * Rhs;
#else
    const FDFType HighProduct = DFTwoProduct(Lhs.High, Rhs);
    const FFloatType Tail = INVARIANT_FMA(Lhs.Low, Rhs, HighProduct.Low);
    return INVARIANT_ADD(HighProduct.High, Tail);
#endif
}
// Single float * double float: forwards to the (FDFType, FFloatType) overload with the operands swapped.
FFloatType DFMultiplyDemote(FFloatType Lhs, FDFType Rhs)
{
    return DFMultiplyDemote(Rhs, Lhs);
}
// Single float * single float: plain fp32 multiply.
FFloatType DFMultiplyDemote(FFloatType Lhs, FFloatType Rhs)
{
    const FFloatType Product = Lhs * Rhs;
    return Product;
}
// Fast multiplication that assumes the factor is a power of two.
// Both parts are scaled independently; if the assumption holds, no precision is lost.
FDFType DFMultiplyByPow2(FDFType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
    return DFPromote(Lhs.High * Rhs);
#else
    const FFloatType ScaledHigh = Lhs.High * Rhs;
    const FFloatType ScaledLow = Lhs.Low * Rhs;
    return DFConstructor(ScaledHigh, ScaledLow);
#endif
}
// Square of a double float: V multiplied by itself.
FDFType DFSqr(FDFType V)
{
    return DFMultiply(V, V);
}
// Square of a single float, returned as a double float.
FDFType DFSqr(FFloatType V)
{
    return DFTwoProduct(V, V);
}
/**************/
/** DIVISION **/
/**************/
// Forward declaration: DFDivide falls back to DFFastDivide when FMA is not supported.
FDFType DFFastDivide(FDFType Lhs, FDFType Rhs);
// Double float / double float.
// Builds a double float reciprocal of Rhs and multiplies Lhs by it.
// [1] Algorithm 18
FDFType DFDivide(FDFType Lhs, FDFType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
    return DFPromote(Lhs.High / Rhs.High);
#elif !PLATFORM_SUPPORTS_FMA
    return DFFastDivide(Lhs, Rhs);
#else
    // fp32 estimate of 1 / Rhs
    const FFloatType RcpEstimate = INVARIANT_DIV(1.0, Rhs.High);
    // Residual 1 - Rhs * RcpEstimate, as a double float
    const FFloatType ResidualHigh = INVARIANT_FMA(-Rhs.High, RcpEstimate, (FFloatType)1);
    const FFloatType ResidualLow = INVARIANT_MUL(-Rhs.Low, RcpEstimate);
    const FDFType Residual = DFFastTwoSum(ResidualHigh, ResidualLow);
    // Refine the estimate with the residual
    const FDFType Correction = DFMultiply(Residual, RcpEstimate);
    const FDFType Reciprocal = DFAdd(Correction, RcpEstimate);
    return DFMultiply(Lhs, Reciprocal);
#endif
}
// Double float / single float.
// [1] Algorithm 15
FDFType DFDivide(FDFType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
    return DFPromote(Lhs.High / Rhs);
#else
    // fp32 quotient estimate
    const FFloatType QuotientHigh = INVARIANT_DIV(Lhs.High, Rhs);
    // Remainder Lhs - QuotientHigh * Rhs
    const FDFType Product = DFTwoProduct(QuotientHigh, Rhs);
    const FFloatType RemainderHigh = INVARIANT_SUB(Lhs.High, Product.High);
    const FFloatType RemainderTail = INVARIANT_SUB(RemainderHigh, Product.Low);
    const FFloatType Remainder = INVARIANT_ADD(RemainderTail, Lhs.Low);
    // Correction term from the remainder
    const FFloatType QuotientLow = INVARIANT_DIV(Remainder, Rhs);
    return DFFastTwoSum(QuotientHigh, QuotientLow);
#endif
}
// Single float / single float, returned as a double float.
// Promotes Lhs and forwards to the (FDFType, FFloatType) overload.
FDFType DFDivide(FFloatType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
    return DFPromote(Lhs / Rhs);
#else
    const FDFType PromotedLhs = DFPromote(Lhs);
    return DFDivide(PromotedLhs, Rhs);
#endif
}
// Single float / double float.
// Promotes Lhs and forwards to the (FDFType, FDFType) overload.
FDFType DFDivide(FFloatType Lhs, FDFType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
    return DFPromote(Lhs / Rhs.High);
#else
    const FDFType PromotedLhs = DFPromote(Lhs);
    return DFDivide(PromotedLhs, Rhs);
#endif
}
// Double float / double float.
// Less precise than 18, but almost half the ops
// [1] Algorithm 17
FDFType DFFastDivide(FDFType Lhs, FDFType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
    return DFPromote(Lhs.High / Rhs.High);
#elif UE_DF_NO_FAST_MATH && PLATFORM_SUPPORTS_FMA
    // Only defer to DFDivide when FMA is available: without FMA, DFDivide itself
    // falls back to this function, and the two would call each other recursively.
    return DFDivide(Lhs, Rhs);
#else
    // fp32 quotient estimate
    const FFloatType Th = INVARIANT_DIV(Lhs.High, Rhs.High);
    // Remainder Lhs - Rhs * Th, computed part by part
    const FDFType R = DFMultiply(Rhs, Th);
    const FFloatType Ph = INVARIANT_SUB(Lhs.High, R.High);
    const FFloatType Dl = INVARIANT_SUB(Lhs.Low, R.Low);
    const FFloatType D = INVARIANT_ADD(Ph, Dl);
    // Correction term from the remainder
    const FFloatType Tl = INVARIANT_DIV(D, Rhs.High);
    return DFFastTwoSum(Th, Tl);
#endif
}
// Fast double float / single float.
FDFType DFFastDivide(FDFType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
    return DFPromote(Lhs.High / Rhs);
#elif UE_DF_NO_FAST_MATH && PLATFORM_SUPPORTS_FMA
    return DFDivide(Lhs, Rhs);
#else
    // fp32 quotient estimate
    const FFloatType QuotientHigh = INVARIANT_DIV(Lhs.High, Rhs);
    // Remainder Lhs - Rhs * QuotientHigh, computed part by part
    const FDFType Product = DFTwoProduct(Rhs, QuotientHigh);
    const FFloatType HighDiff = INVARIANT_SUB(Lhs.High, Product.High);
    const FFloatType LowDiff = INVARIANT_SUB(Lhs.Low, Product.Low);
    const FFloatType Remainder = INVARIANT_ADD(HighDiff, LowDiff);
    // Correction term from the remainder
    const FFloatType QuotientLow = INVARIANT_DIV(Remainder, Rhs);
    return DFFastTwoSum(QuotientHigh, QuotientLow);
#endif
}
// Single float numerator: promote Lhs, then forward to the matching double float overload.
FDFType DFFastDivide(FFloatType Lhs, FDFType Rhs)
{
    const FDFType PromotedLhs = DFPromote(Lhs);
    return DFFastDivide(PromotedLhs, Rhs);
}
FDFType DFFastDivide(FFloatType Lhs, FFloatType Rhs)
{
    const FDFType PromotedLhs = DFPromote(Lhs);
    return DFFastDivide(PromotedLhs, Rhs);
}
// Fast double float / double float, returning a single precision result.
// Same steps as DFFastDivide(FDFType, FDFType), with the final rebalance replaced by a plain add.
FFloatType DFFastDivideDemote(FDFType Lhs, FDFType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
    return Lhs.High / Rhs.High;
#elif UE_DF_NO_FAST_MATH
    return DFDemote(DFDivide(Lhs, Rhs));
#else
    // fp32 quotient estimate
    const FFloatType QuotientHigh = INVARIANT_DIV(Lhs.High, Rhs.High);
    // Remainder Lhs - Rhs * QuotientHigh, computed part by part
    const FDFType Product = DFMultiply(Rhs, QuotientHigh);
    const FFloatType HighDiff = INVARIANT_SUB(Lhs.High, Product.High);
    const FFloatType LowDiff = INVARIANT_SUB(Lhs.Low, Product.Low);
    const FFloatType Remainder = INVARIANT_ADD(HighDiff, LowDiff);
    // Correction term from the remainder
    const FFloatType QuotientLow = INVARIANT_DIV(Remainder, Rhs.High);
    return QuotientHigh + QuotientLow;
#endif
}
// Fast double float / single float, returning a single precision result.
FFloatType DFFastDivideDemote(FDFType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
    return Lhs.High / Rhs;
#elif UE_DF_NO_FAST_MATH
    return DFDemote(DFDivide(Lhs, Rhs));
#else
    // fp32 quotient estimate
    const FFloatType QuotientHigh = INVARIANT_DIV(Lhs.High, Rhs);
    // Remainder Lhs - Rhs * QuotientHigh, computed part by part
    const FDFType Product = DFTwoProduct(Rhs, QuotientHigh);
    const FFloatType HighDiff = INVARIANT_SUB(Lhs.High, Product.High);
    const FFloatType LowDiff = INVARIANT_SUB(Lhs.Low, Product.Low);
    const FFloatType Remainder = INVARIANT_ADD(HighDiff, LowDiff);
    // Correction term from the remainder
    const FFloatType QuotientLow = INVARIANT_DIV(Remainder, Rhs);
    return QuotientHigh + QuotientLow;
#endif
}
// Single float / double float: promote Lhs, then forward to the (FDFType, FDFType) overload.
FFloatType DFFastDivideDemote(FFloatType Lhs, FDFType Rhs)
{
    const FDFType PromotedLhs = DFPromote(Lhs);
    return DFFastDivideDemote(PromotedLhs, Rhs);
}
// Single float / single float: plain fp32 divide.
FFloatType DFFastDivideDemote(FFloatType Lhs, FFloatType Rhs)
{
    const FFloatType Quotient = Lhs / Rhs;
    return Quotient;
}
// Fast division that assumes the divisor is a power of two.
// Both parts are divided independently; if the assumption holds, no precision is lost.
FDFType DFDivideByPow2(FDFType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
    return DFPromote(Lhs.High / Rhs);
#else
    const FFloatType ScaledHigh = Lhs.High / Rhs;
    const FFloatType ScaledLow = Lhs.Low / Rhs;
    return DFConstructor(ScaledHigh, ScaledLow);
#endif
}
// Reciprocal as a double float: 1 / V via DFFastDivide.
FDFType DFRcp(FDFType V)
{
    const FFloatType One = (FFloatType)1.0;
    return DFFastDivide(One, V);
}
// Reciprocal as a single float: rcp() of the high part only.
FFloatType DFRcpDemote(FDFType V)
{
    return rcp(V.High);
}
/****************/
/** COMPARISON **/
/****************/
// Approximate equality: true when the magnitude of (Lhs - Rhs), demoted to fp32,
// is below Threshold.
// The absolute value is required: comparing the signed difference directly would
// report any Lhs that is (much) smaller than Rhs as "approximately equal".
FBoolType DFEqualsApprox(FDFType Lhs, FDFType Rhs, float Threshold)
{
    return abs(DFSubtractDemote(Lhs, Rhs)) < Threshold;
}
FBoolType DFEqualsApprox(FDFType Lhs, FFloatType Rhs, float Threshold)
{
    return abs(DFSubtractDemote(Lhs, Rhs)) < Threshold;
}
FBoolType DFEqualsApprox(FFloatType Lhs, FDFType Rhs, float Threshold)
{
    return abs(DFSubtractDemote(Lhs, Rhs)) < Threshold;
}
// Exact equality: both the high and the low parts must match.
// With UE_DF_FORCE_FP32_OPS only the high parts are compared.
FBoolType DFEquals(FDFType Lhs, FDFType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
    return Lhs.High == Rhs.High;
#elif COMPILER_SUPPORTS_HLSL2021
    return and(Lhs.High == Rhs.High, Lhs.Low == Rhs.Low);
#else
    return Lhs.High == Rhs.High && Lhs.Low == Rhs.Low;
#endif
}
// Mixed overloads: promote the single float operand, then compare as double floats.
FBoolType DFEquals(FDFType Lhs, FFloatType Rhs)
{
    const FDFType PromotedRhs = DFPromote(Rhs);
    return DFEquals(Lhs, PromotedRhs);
}
FBoolType DFEquals(FFloatType Lhs, FDFType Rhs)
{
    const FDFType PromotedLhs = DFPromote(Lhs);
    return DFEquals(PromotedLhs, Rhs);
}
// Selects between Lhs and Rhs using select() on each part, with S as the condition.
// A single float operand contributes itself as the high part and zero as the low part.
FDFType DFSelect(FBoolType S, FDFType Lhs, FDFType Rhs)
{
    const FFloatType PickedHigh = select(S, Lhs.High, Rhs.High);
    const FFloatType PickedLow = select(S, Lhs.Low, Rhs.Low);
    return DFConstructor(PickedHigh, PickedLow);
}
FDFType DFSelect(FBoolType S, FDFType Lhs, FFloatType Rhs)
{
    const FFloatType PickedHigh = select(S, Lhs.High, Rhs);
    const FFloatType PickedLow = select(S, Lhs.Low, (FFloatType)0);
    return DFConstructor(PickedHigh, PickedLow);
}
FDFType DFSelect(FBoolType S, FFloatType Lhs, FDFType Rhs)
{
    const FFloatType PickedHigh = select(S, Lhs, Rhs.High);
    const FFloatType PickedLow = select(S, (FFloatType)0, Rhs.Low);
    return DFConstructor(PickedHigh, PickedLow);
}
// Lhs > Rhs: decided by the high parts, with the low parts breaking ties.
FBoolType DFGreater(FDFType Lhs, FDFType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
    return Lhs.High > Rhs.High;
#elif COMPILER_SUPPORTS_HLSL2021
    return or(Lhs.High > Rhs.High, and(Lhs.High == Rhs.High, Lhs.Low > Rhs.Low));
#else
    return Lhs.High > Rhs.High || (Lhs.High == Rhs.High && Lhs.Low > Rhs.Low);
#endif
}
// Lhs < Rhs: decided by the high parts, with the low parts breaking ties.
FBoolType DFLess(FDFType Lhs, FDFType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
    return Lhs.High < Rhs.High;
#elif COMPILER_SUPPORTS_HLSL2021
    return or(Lhs.High < Rhs.High, and(Lhs.High == Rhs.High, Lhs.Low < Rhs.Low));
#else
    return Lhs.High < Rhs.High || (Lhs.High == Rhs.High && Lhs.Low < Rhs.Low);
#endif
}
// Lhs > Rhs for a single float Rhs: when the high part equals Rhs,
// the sign of the low part decides.
FBoolType DFGreater(FDFType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
    return Lhs.High > Rhs;
#elif COMPILER_SUPPORTS_HLSL2021
    return or(Lhs.High > Rhs, and(Lhs.High == Rhs, Lhs.Low > 0.0));
#else
    return Lhs.High > Rhs || (Lhs.High == Rhs && Lhs.Low > 0.0);
#endif
}
// Lhs < Rhs for a single float Rhs: when the high part equals Rhs,
// the sign of the low part decides.
FBoolType DFLess(FDFType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
    return Lhs.High < Rhs;
#elif COMPILER_SUPPORTS_HLSL2021
    return or(Lhs.High < Rhs, and(Lhs.High == Rhs, Lhs.Low < 0.0));
#else
    return Lhs.High < Rhs || (Lhs.High == Rhs && Lhs.Low < 0.0);
#endif
}
// Single float on the left: expressed through the mirrored comparison with the operands swapped.
FBoolType DFGreater(FFloatType Lhs, FDFType Rhs)
{
    // Lhs > Rhs  <=>  Rhs < Lhs
    return DFLess(Rhs, Lhs);
}
FBoolType DFLess(FFloatType Lhs, FDFType Rhs)
{
    // Lhs < Rhs  <=>  Rhs > Lhs
    return DFGreater(Rhs, Lhs);
}
// Lhs >= Rhs, implemented as the negation of DFLess.
FBoolType DFGreaterEqual(FDFType Lhs, FDFType Rhs)
{
    return !DFLess(Lhs, Rhs);
}
FBoolType DFGreaterEqual(FDFType Lhs, FFloatType Rhs)
{
    return !DFLess(Lhs, Rhs);
}
FBoolType DFGreaterEqual(FFloatType Lhs, FDFType Rhs)
{
    return !DFLess(Lhs, Rhs);
}
// Lhs <= Rhs, implemented as the negation of DFGreater.
FBoolType DFLessEqual(FDFType Lhs, FFloatType Rhs)
{
    return !DFGreater(Lhs, Rhs);
}
FBoolType DFLessEqual(FDFType Lhs, FDFType Rhs)
{
    return !DFGreater(Lhs, Rhs);
}
FBoolType DFLessEqual(FFloatType Lhs, FDFType Rhs)
{
    return !DFGreater(Lhs, Rhs);
}
// Minimum: picks Lhs where Lhs < Rhs, otherwise Rhs. Single float operands are promoted first.
FDFType DFMin(FDFType Lhs, FDFType Rhs)
{
    const FBoolType LhsIsLess = DFLess(Lhs, Rhs);
    return DFSelect(LhsIsLess, Lhs, Rhs);
}
FDFType DFMin(FDFType Lhs, FFloatType Rhs)
{
    const FBoolType LhsIsLess = DFLess(Lhs, Rhs);
    return DFSelect(LhsIsLess, Lhs, DFPromote(Rhs));
}
FDFType DFMin(FFloatType Lhs, FDFType Rhs)
{
    const FBoolType LhsIsLess = DFLess(Lhs, Rhs);
    return DFSelect(LhsIsLess, DFPromote(Lhs), Rhs);
}
// Maximum: picks Rhs where Lhs < Rhs, otherwise Lhs. Single float operands are promoted first.
FDFType DFMax(FDFType Lhs, FDFType Rhs)
{
    const FBoolType LhsIsLess = DFLess(Lhs, Rhs);
    return DFSelect(LhsIsLess, Rhs, Lhs);
}
FDFType DFMax(FDFType Lhs, FFloatType Rhs)
{
    const FBoolType LhsIsLess = DFLess(Lhs, Rhs);
    return DFSelect(LhsIsLess, DFPromote(Rhs), Lhs);
}
FDFType DFMax(FFloatType Lhs, FDFType Rhs)
{
    const FBoolType LhsIsLess = DFLess(Lhs, Rhs);
    return DFSelect(LhsIsLess, Rhs, DFPromote(Lhs));
}
/**********/
/** MISC **/
/**********/
// Bundles a double float value with two single precision companion values.
// NOTE(review): Ddx/Ddy presumably hold the screen-space derivatives of Value
// (cf. DFDdxDemote/DFDdyDemote) - confirm against the users of this struct.
struct FDFTypeDeriv
{
FDFType Value;
FFloatType Ddx;
FFloatType Ddy;
};
// Square root of a double float, returned as a double float.
// [2] Algorithm 8
FDFType DFSqrt(FDFType V)
{
#if UE_DF_FORCE_FP32_OPS
    return DFPromote(sqrt(V.High));
#else
    // fp32 root of the high part
    const FFloatType RootHigh = sqrt(V.High);
    // Residual V.High - RootHigh^2. Note: precision loss here if FMA is not supported
    const FFloatType Residual = INVARIANT_FMA(-RootHigh, RootHigh, V.High);
    const FFloatType Numerator = INVARIANT_ADD(V.Low, Residual);
    // Correction term
    const FFloatType RootLow = INVARIANT_DIV(Numerator, (2.0 * RootHigh));
    return DFFastTwoSum(RootHigh, RootLow);
    // equivalent to (remark 3.5)
    //precise FFloatType Tl = Numerator / RootHigh;
    //precise FFloatType Zh = FMA((FFloatType)0.5, Tl, RootHigh);
    //precise FFloatType D = Zh - RootHigh;
    //precise FFloatType Zl = FMA((FFloatType)0.5, Tl, -D);
    //return DFConstructor(Zh, Zl);
#endif
}
// Square root of a double float, returned as a single float.
// [2] Algorithm 9
FFloatType DFSqrtDemote(FDFType V)
{
#if UE_DF_FORCE_FP32_OPS
    return sqrt(V.High);
#else
    // fp32 root of the high part
    const FFloatType RootHigh = sqrt(V.High);
    // Residual V.High - RootHigh^2. Note: precision loss here if FMA is not supported
    const FFloatType Residual = INVARIANT_FMA(-RootHigh, RootHigh, V.High);
    const FFloatType Numerator = INVARIANT_ADD(V.Low, Residual);
    // Correction term, folded in with a plain add
    const FFloatType RootLow = INVARIANT_DIV(Numerator, (2.0 * RootHigh));
    return RootHigh + RootLow;
#endif
}
// Reciprocal square root as a double float: 1 / DFSqrt(V).
FDFType DFRsqrt(FDFType V)
{
#if UE_DF_FORCE_FP32_OPS
    return DFPromote(rsqrt(V.High));
#else
    const FDFType Root = DFSqrt(V);
    return DFDivide((FFloatType)1.0, Root);
#endif
}
// Reciprocal square root as a single float: rsqrt() of the high part only.
FFloatType DFRsqrtDemote(FDFType V)
{
    return rsqrt(V.High);
}
// Sine of a double float, as a single float.
// Evaluates sincos on each part and combines them with the angle-sum identity.
FFloatType DFSin(FDFType V)
{
#if UE_DF_FORCE_FP32_OPS
    return sin(V.High);
#else
    FFloatType SinHigh, CosHigh;
    FFloatType SinLow, CosLow;
    sincos(V.High, SinHigh, CosHigh);
    sincos(V.Low, SinLow, CosLow);
    // sin(h + l) = sin(h)cos(l) + cos(h)sin(l)
    return SinHigh*CosLow + CosHigh*SinLow;
#endif
}
// Cosine of a double float, as a single float.
// Evaluates sincos on each part and combines them with the angle-sum identity.
FFloatType DFCos(FDFType V)
{
#if UE_DF_FORCE_FP32_OPS
    return cos(V.High);
#else
    FFloatType SinHigh, CosHigh;
    FFloatType SinLow, CosLow;
    sincos(V.High, SinHigh, CosHigh);
    sincos(V.Low, SinLow, CosLow);
    // cos(h + l) = cos(h)cos(l) - sin(h)sin(l)
    return CosHigh*CosLow - SinHigh*SinLow;
#endif
}
// Sine and cosine of a double float, written to the single float outputs Sin and Cos.
void DFSinCos(FDFType V, out FFloatType Sin, out FFloatType Cos)
{
#if UE_DF_FORCE_FP32_OPS
    sincos(V.High, Sin, Cos);
#else
    FFloatType SinHigh, CosHigh;
    FFloatType SinLow, CosLow;
    sincos(V.High, SinHigh, CosHigh);
    sincos(V.Low, SinLow, CosLow);
    // sin(h + l) = sin(h)cos(l) + cos(h)sin(l)
    Sin = FMA(SinHigh, CosLow, CosHigh*SinLow);
    // cos(h + l) = cos(h)cos(l) - sin(h)sin(l)
    Cos = FMA(CosHigh, CosLow, -SinHigh*SinLow);
#endif
}
// Tangent of a double float, as a single float.
// Evaluates tan on each part and combines them with the angle-sum identity.
FFloatType DFTan(FDFType V)
{
#if UE_DF_FORCE_FP32_OPS
    return tan(V.High);
#else
    const FFloatType TanHigh = tan(V.High);
    const FFloatType TanLow = tan(V.Low);
    // tan(h + l) = (tan(h) + tan(l)) / (1 - tan(h)tan(l))
    return (TanHigh + TanLow) / (1.0 - TanHigh * TanLow);
#endif
}
// Inverse trigonometric functions: the input is demoted to fp32 first,
// then the regular intrinsic is applied.
FFloatType DFAsin(FDFType V)
{
    const FFloatType Demoted = DFDemote(V);
    return asin(Demoted);
}
FFloatType DFAcos(FDFType V)
{
    const FFloatType Demoted = DFDemote(V);
    return acos(Demoted);
}
FFloatType DFAtan(FDFType V)
{
    const FFloatType Demoted = DFDemote(V);
    return atan(Demoted);
}
// Splits V into an integer part (written to Integer as a double float)
// and a fractional part (returned as a single float).
FFloatType DFModf(FDFType V, out FDFType Integer)
{
#if UE_DF_FORCE_FP32_OPS
    Integer.Low = 0;
    return modf(V.High, Integer.High);
#else
    // Split each part on its own
    FFloatType WholeHigh;
    const FFloatType PartialHigh = modf(V.High, WholeHigh);
    FFloatType WholeLow;
    const FFloatType PartialLow = modf(V.Low, WholeLow);
    // The two fractions may add up to a whole number of their own; split that off as well
    FFloatType WholeCarry;
    const FFloatType Fraction = modf(PartialHigh + PartialLow, WholeCarry);
    Integer = DFTwoSum(WholeHigh, WholeLow + WholeCarry);
    return Fraction;
#endif
}
// Splits V into an integer part (written to Integer as a single float)
// and a fractional part (returned as a single float).
FFloatType DFModfDemote(FDFType V, out FFloatType Integer)
{
#if UE_DF_FORCE_FP32_OPS
    return modf(V.High, Integer);
#else
    // Split each part on its own
    FFloatType WholeHigh;
    const FFloatType PartialHigh = modf(V.High, WholeHigh);
    FFloatType WholeLow;
    const FFloatType PartialLow = modf(V.Low, WholeLow);
    // The two fractions may add up to a whole number of their own; split that off as well
    FFloatType WholeCarry;
    const FFloatType Fraction = modf(PartialHigh + PartialLow, WholeCarry);
    Integer = (WholeHigh + WholeLow) + WholeCarry;
    return Fraction;
#endif
}
// Ceiling of a double float.
// Each part is split with modf; the ceiling of the summed fractions is added to the low integer part.
FDFType DFCeil(FDFType V)
{
#if UE_DF_FORCE_FP32_OPS
    return DFPromote(ceil(V.High));
#else
    FFloatType WholeHigh;
    const FFloatType PartialHigh = modf(V.High, WholeHigh);
    FFloatType WholeLow;
    const FFloatType PartialLow = modf(V.Low, WholeLow);
    const FFloatType Carry = ceil(PartialHigh + PartialLow);
    return DFTwoSum(WholeHigh, WholeLow + Carry);
#endif
}
// Floor of a double float.
// Each part is split with modf; the floor of the summed fractions is added to the low integer part.
FDFType DFFloor(FDFType V)
{
#if UE_DF_FORCE_FP32_OPS
    return DFPromote(floor(V.High));
#else
    FFloatType WholeHigh;
    const FFloatType PartialHigh = modf(V.High, WholeHigh);
    FFloatType WholeLow;
    const FFloatType PartialLow = modf(V.Low, WholeLow);
    const FFloatType Carry = floor(PartialHigh + PartialLow);
    return DFTwoSum(WholeHigh, WholeLow + Carry);
#endif
}
// Rounds a double float to the nearest integer.
// Works on the sign-multiplied value (floor of fraction + 0.5), then multiplies the sign back in.
FDFType DFRound(FDFType V)
{
#if UE_DF_FORCE_FP32_OPS
    return DFPromote(round(V.High));
#else
    const FFloatType Sign = DFSign(V);
    const FFloatType AbsHigh = Sign*V.High;
    const FFloatType AbsLow = Sign*V.Low;
    FFloatType WholeHigh;
    const FFloatType PartialHigh = modf(AbsHigh, WholeHigh);
    FFloatType WholeLow;
    const FFloatType PartialLow = modf(AbsLow, WholeLow);
    const FFloatType Carry = floor(PartialHigh + PartialLow + 0.5);
    const FFloatType RoundedLow = WholeLow + Carry;
    return DFTwoSum(Sign * WholeHigh, Sign * RoundedLow);
#endif
}
// Truncates a double float towards zero.
// Works on the sign-multiplied value (floor of the fraction sum), then multiplies the sign back in.
FDFType DFTrunc(FDFType V)
{
#if UE_DF_FORCE_FP32_OPS
    return DFPromote(trunc(V.High));
#else
    const FFloatType Sign = DFSign(V);
    const FFloatType AbsHigh = Sign*V.High;
    const FFloatType AbsLow = Sign*V.Low;
    FFloatType WholeHigh;
    const FFloatType PartialHigh = modf(AbsHigh, WholeHigh);
    FFloatType WholeLow;
    const FFloatType PartialLow = modf(AbsLow, WholeLow);
    const FFloatType Carry = floor(PartialHigh + PartialLow);
    const FFloatType TruncatedLow = WholeLow + Carry;
    return DFTwoSum(Sign * WholeHigh, Sign * TruncatedLow);
#endif
}
// Fractional part as a double float: V - DFFloor(V).
FDFType DFFrac(FDFType V)
{
#if UE_DF_FORCE_FP32_OPS
    return DFPromote(frac(V.High));
#else
    const FDFType Floored = DFFloor(V);
    return DFSubtract(V, Floored);
#endif
}
// Fractional part as a single float.
// Technically, this function has very low precision due to discontinuity+rounding
// (e.g. 0.99.. ~= 1.0 => frac(0.99..) = frac(1.0) = 0.0)
// but this is irrelevant in practical cases
// TODO: test negative values
FFloatType DFFracDemote(FDFType V)
{
#if UE_DF_FORCE_FP32_OPS
    return frac(V.High);
#else
    const FFloatType FracHigh = frac(V.High);
    const FFloatType FracLow = frac(V.Low);
    return frac(FracHigh + FracLow);
#endif
}
// Similar to HLSL fmod, this is equivalent to `lhs - rhs * trunc(lhs / rhs)`
// so fmod(-0.1, 1.0) returns -0.1, not 0.9 (as mod does in GLSL, which is the true mathematical modulo operator)
FDFType DFFmod(FDFType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
    return DFPromote(fmod(Lhs.High, Rhs));
#else
    const FDFType WholeQuotient = DFTrunc(DFDivide(Lhs, Rhs));
    const FDFType WholeMultiple = DFMultiply(WholeQuotient, Rhs);
    return DFSubtract(Lhs, WholeMultiple);
#endif
}
// Same computation as DFFmod, returning a single precision result.
FFloatType DFFmodDemote(FDFType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
    return fmod(Lhs.High, Rhs);
#else
    //DF_TODO: optimize. Is equal to frac(Lhs/Rhs)*Rhs?
    //not equal to fmod(fmod(high)+fmod(low)) due to how fmod handles negative values
    const FDFType WholeQuotient = DFTrunc(DFDivide(Lhs, Rhs));
    const FDFType WholeMultiple = DFMultiply(WholeQuotient, Rhs);
    return DFSubtractDemote(Lhs, WholeMultiple);
#endif
}
// Modulo variant built on DFDivideByPow2:
// computes frac(Lhs / Rhs) * Rhs and returns it as a single float.
FFloatType DFFmodByPow2Demote(FDFType Lhs, FFloatType Rhs)
{
#if UE_DF_FORCE_FP32_OPS
    return fmod(Lhs.High, Rhs);
#else
    const FDFType Quotient = DFDivideByPow2(Lhs, Rhs);
    const FFloatType Fraction = DFFracDemote(Quotient);
    return Fraction * Rhs;
#endif
}
// Linear interpolation between two double floats with a single float factor.
// The high and low parts are interpolated separately and then summed.
FDFType DFLerp(FDFType Lhs, FDFType Rhs, FFloatType S)
{
#if UE_DF_FORCE_FP32_OPS
    return DFPromote(lerp(Lhs.High, Rhs.High, S));
#else
    //DF_TODO: precision issues
    const FFloatType HighLerp = lerp(Lhs.High, Rhs.High, S);
    const FFloatType LowLerp = lerp(Lhs.Low, Rhs.Low, S);
    return DFTwoSum(HighLerp, LowLerp);
#endif
}
// Clamps a double float to [0, 1]: first against the upper bound, then against the lower bound.
FDFType DFSaturate(FDFType V)
{
#if UE_DF_FORCE_FP32_OPS
    return DFPromote(saturate(V.High));
#else
    const FDFType UpperClamped = DFSelect(DFLess(V, 1.0), V, 1.0);
    return DFSelect(DFLess(UpperClamped, 0.0), 0.0, UpperClamped);
#endif
}
// Single float variant: demote first, then saturate.
FFloatType DFSaturateDemote(FDFType V)
{
    const FFloatType Demoted = DFDemote(V);
    return saturate(Demoted);
}
// Smoothstep on double floats: T = saturate((S - Lhs) / (Rhs - Lhs)), result = T^2 * (3 - 2T).
FDFType DFSmoothStep(FDFType Lhs, FDFType Rhs, FDFType S)
{
#if UE_DF_FORCE_FP32_OPS
    return DFPromote(smoothstep(Lhs.High, Rhs.High, S.High));
#else
    const FDFType Offset = DFSubtract(S, Lhs);
    const FDFType Range = DFSubtract(Rhs, Lhs);
    const FDFType T = DFSaturate(DFDivide(Offset, Range));
    const FDFType TSquared = DFSqr(T);
    const FDFType Polynomial = DFSubtract(3.0f, DFMultiplyByPow2(T, 2.0f));
    return DFMultiply(TSquared, Polynomial);
#endif
}
// Smoothstep with a single float result: the interpolant is computed in double float,
// demoted and saturated, and the polynomial T^2 * (3 - 2T) is evaluated in fp32.
FFloatType DFSmoothStepDemote(FDFType Lhs, FDFType Rhs, FDFType S)
{
#if UE_DF_FORCE_FP32_OPS
    return smoothstep(Lhs.High, Rhs.High, S.High);
#else
    const FDFType Offset = DFSubtract(S, Lhs);
    const FDFType Range = DFSubtract(Rhs, Lhs);
    const FFloatType T = DFSaturateDemote(DFDivide(Offset, Range));
    return T*T*(3.0f - (2.0f*T));
#endif
}
// Step function: 1 where Rhs >= Lhs, otherwise 0.
FFloatType DFStep(FDFType Lhs, FDFType Rhs)
{
    const FBoolType Reached = DFGreaterEqual(Rhs, Lhs);
    return select(Reached, (FFloatType)1.0f, (FFloatType)0.0f);
}
FFloatType DFStep(FDFType Lhs, FFloatType Rhs)
{
    const FBoolType Reached = DFGreaterEqual(Rhs, Lhs);
    return select(Reached, (FFloatType)1.0f, (FFloatType)0.0f);
}
FFloatType DFStep(FFloatType Lhs, FDFType Rhs)
{
    const FBoolType Reached = DFGreaterEqual(Rhs, Lhs);
    return select(Reached, (FFloatType)1.0f, (FFloatType)0.0f);
}
// No ddxy inside ray tracing shaders
#if RAYHITGROUPSHADER || RAYMISSHADER || RAYCALLABLESHADER || USE_FORCE_TEXTURE_MIP
// Stub versions for configurations without ddx/ddy: every derivative is zero.
FDFType DFDdx(FDFType V)
{
    return (FDFType)0;
}
FDFType DFDdy(FDFType V)
{
    return (FDFType)0;
}
FFloatType DFDdxDemote(FDFType V)
{
    return (FFloatType)0;
}
FFloatType DFDdyDemote(FDFType V)
{
    return (FFloatType)0;
}
#else
// ddx of a double float: the derivatives of the high and low parts are summed into a double float.
FDFType DFDdx(FDFType V)
{
#if UE_DF_FORCE_FP32_OPS
    return DFPromote(ddx(V.High));
#else
    const FFloatType HighDeriv = ddx(V.High);
    const FFloatType LowDeriv = ddx(V.Low);
    return DFTwoSum(HighDeriv, LowDeriv);
#endif
}
// ddy of a double float: the derivatives of the high and low parts are summed into a double float.
FDFType DFDdy(FDFType V)
{
#if UE_DF_FORCE_FP32_OPS
    return DFPromote(ddy(V.High));
#else
    const FFloatType HighDeriv = ddy(V.High);
    const FFloatType LowDeriv = ddy(V.Low);
    return DFTwoSum(HighDeriv, LowDeriv);
#endif
}
// ddx of a double float as a single float: plain sum of the derivatives of both parts.
FFloatType DFDdxDemote(FDFType V)
{
#if UE_DF_FORCE_FP32_OPS
    return ddx(V.High);
#else
    const FFloatType HighDeriv = ddx(V.High);
    const FFloatType LowDeriv = ddx(V.Low);
    return HighDeriv + LowDeriv;
#endif
}
// ddy of a double float as a single float: plain sum of the derivatives of both parts.
FFloatType DFDdyDemote(FDFType V)
{
#if UE_DF_FORCE_FP32_OPS
    return ddy(V.High);
#else
    const FFloatType HighDeriv = ddy(V.High);
    const FFloatType LowDeriv = ddy(V.Low);
    return HighDeriv + LowDeriv;
#endif
}
#endif