// Copyright Epic Games, Inc. All Rights Reserved.

/*=============================================================================
	NFORRegression.ush
=============================================================================*/

// TODO: Improve matrix multiplication with Strassen algorithm.
// TODO: Improve the robustness of QR decomposition.
// TODO: Find best iteration number for Newton Schulz Iteration Method.
// TODO: Unroll with constant dimensions.
// TODO: Merge duplicate shader code.

#pragma once

#include "NFORRegressionCommon.ush"

// Dimension of the (square) system matrix. May be overridden before including this file.
#ifndef MATRIX_DIM
	#define MATRIX_DIM 8
#endif

// Number of Newton-Schulz refinement steps used by MatrixInverse.
#ifndef NUM_NEWTON_SCHULTZ_ITERATIONS
	#define NUM_NEWTON_SCHULTZ_ITERATIONS 3
#endif

// Element counts of the dense and packed-triangular storage. Arguments are parenthesized so an
// expression-valued MATRIX_DIM still expands correctly.
#define FULL_MATRIX_SIZE ((MATRIX_DIM)*(MATRIX_DIM))
#define SYMMETRIC_MATRIX_SIZE ((((MATRIX_DIM)+1)*(MATRIX_DIM))/2)
#define LOWER_TRIANGLE_MATRIX_SIZE SYMMETRIC_MATRIX_SIZE

#define QR_DECOMPOSITION_TYPE_DEFAULT 0
#define QR_DECOMPOSITION_TYPE_COLUMN_PIVOT 1
#define QR_DECOMPOSITION_TYPE QR_DECOMPOSITION_TYPE_DEFAULT

// Input buffers. Elements are consumed as scalars (promoted to FDFScalar) by the solvers below.
Buffer<float> A;
Buffer<float> B;
int2 ADim;
int2 BDim;
int NumOfElements;
int NumOfElementsPerRow;
float MinLambda;

// Symmetric / lower-triangular matrix in packed row-major storage:
// element (y, x) with x <= y lives at index x + y * (y + 1) / 2.
struct FLMatrix
{
	FDFScalar M[LOWER_TRIANGLE_MATRIX_SIZE];
};

// Maps an arbitrary (a, b) pair to int2(x, y) with x <= y, i.e. into the stored lower triangle.
#define LMatrixIndex2D(a,b) select((a)>(b), int2((b),(a)), int2((a),(b)))
// Same mapping, starting from a flat index into a dense MATRIX_DIM x MATRIX_DIM matrix.
#define LMatrixIndex1D(i) LMatrixIndex2D((i)%(MATRIX_DIM), (i)/(MATRIX_DIM))
// Accessor for element (y, x) of a packed lower-triangular matrix. Requires x <= y.
#define LMatrix(_L,y,x) (_L.M[(x) + (((y) +1) * (y)/2)])

// L^T(x,y) = L(y,x), upper matrix has x>y
// Maps (a, b) to int2(x, y) with x >= y, i.e. into the upper triangle of the transpose.
#define TLMatrixIndex2D(a,b) select((a)>(b), int2((a),(b)), int2((b),(a)))
#define TLMatrixIndex1D(i) TLMatrixIndex2D((i)%(MATRIX_DIM), (i)/(MATRIX_DIM))
// Accessor for element (y, x) of the transpose of a packed lower-triangular matrix. Requires x >= y.
#define TLMatrix(_L,y,x) LMatrix(_L,x,y)

// Fixed-size vector of MATRIX_DIM double-float scalars.
struct FNFORVector
{
	FDFScalar V[MATRIX_DIM];
};

// Dense MATRIX_DIM x MATRIX_DIM matrix in row-major storage.
struct FMatrix
{
	FDFScalar M[FULL_MATRIX_SIZE];
};

// Accessor for element (y, x) of a dense matrix.
#define MATRIX(_A,y,x) (_A.M[(y) * (MATRIX_DIM) + (x)])

// Negates every stored element of a packed lower-triangular matrix in place.
void LMatrixNegate(inout FLMatrix A)
{
	for (int y = 0; y < MATRIX_DIM; ++y)
	{
		for (int x = 0; x <= y; ++x)
		{
			LMatrix(A, y, x) = DFNegate(LMatrix(A, y, x));
		}
	}
}

// Assume A and B is symmetric and stored in FLMatrix.
// A@B != B@A even if A=A^T, B^T=B
// Dense product of two symmetric matrices that are both held in packed lower-triangular storage.
FMatrix LMatrixMultiply(in FLMatrix A, in FLMatrix B)
{
	FMatrix Product = (FMatrix)0;

	for (int Row = 0; Row < MATRIX_DIM; ++Row)
	{
		for (int Col = 0; Col < MATRIX_DIM; ++Col)
		{
			FDFScalar Accum = (FDFScalar)0;
			for (int Inner = 0; Inner < MATRIX_DIM; ++Inner)
			{
				// Fold (Row, Inner) and (Inner, Col) into the stored lower triangle.
				const int2 IndexA = LMatrixIndex2D(Row, Inner);
				const int2 IndexB = LMatrixIndex2D(Inner, Col);
				const FDFScalar Term = DFMultiply(LMatrix(A, IndexA.y, IndexA.x), LMatrix(B, IndexB.y, IndexB.x));
				Accum = DFAdd(Accum, Term);
			}
			MATRIX(Product, Row, Col) = Accum;
		}
	}

	return Product;
}

// Largest absolute value among the stored elements of A, evaluated after demoting to float.
float LMatrixMaxAbsValue(in FLMatrix A)
{
	float Largest = 0;

	for (int Index = 0; Index < SYMMETRIC_MATRIX_SIZE; ++Index)
	{
		const float Magnitude = abs(DFDemote(A.M[Index]));
		Largest = max(Largest, Magnitude);
	}

	return Largest;
}

// Assume A is symmetric, and B is not. However, A*B is symmetric
// Only the lower triangle of A*B is evaluated and stored.
FLMatrix ToLMatrixMultiply(in FLMatrix A, in FMatrix B)
{
	FLMatrix Product = (FLMatrix)0;

	for (int Row = 0; Row < MATRIX_DIM; ++Row)
	{
		for (int Col = 0; Col <= Row; ++Col)
		{
			FDFScalar Accum = (FDFScalar)0;
			for (int Inner = 0; Inner < MATRIX_DIM; ++Inner)
			{
				const int2 IndexA = LMatrixIndex2D(Row, Inner);
				const FDFScalar Term = DFMultiply(LMatrix(A, IndexA.y, IndexA.x), MATRIX(B, Inner, Col));
				Accum = DFAdd(Accum, Term);
			}
			LMatrix(Product, Row, Col) = Accum;
		}
	}

	return Product;
}

// Writes the dense identity matrix into M.
void Identity(out FMatrix M)
{
	M = (FMatrix)0;

	for (int Diag = 0; Diag < MATRIX_DIM; ++Diag)
	{
		M.M[Diag * MATRIX_DIM + Diag] = DFPromote(1.0f);
	}
}

// Euclidean (Frobenius) norm of the full symmetric matrix represented by A.
// Walks all FULL_MATRIX_SIZE dense cells, so each off-diagonal stored element is counted twice.
void LMatrixEuclideanNorm(in FLMatrix A, out float EuclideanNorm)
{
	float SumOfSquares = 0.0f;

	for (int Flat = 0; Flat < FULL_MATRIX_SIZE; ++Flat)
	{
		const int2 Cell = LMatrixIndex1D(Flat);
		SumOfSquares += Pow2(DFDemote(LMatrix(A, Cell.y, Cell.x)));
	}

	EuclideanNorm = sqrt(SumOfSquares);
}

// |AX-I|.
// Euclidean norm of AX-I
// A is expected to already hold the product AX; the identity is subtracted element-wise here.
void MatrixConvergence(in FMatrix A, out float ConvergenceCriteria)
{
	ConvergenceCriteria = 0.0f;
	for (int i = 0; i < MATRIX_DIM; ++i)
	{
		for (int j = 0; j < MATRIX_DIM; ++j)
		{
			ConvergenceCriteria += Pow2(DFDemote(MATRIX(A,i,j))-select(i==j, 1.0f, 0.0f));
		}
	}
	ConvergenceCriteria = sqrt(ConvergenceCriteria);
}

// Negates every element of a dense matrix in place.
void MatrixNegate(inout FMatrix Matrix)
{
	for (int i = 0; i < FULL_MATRIX_SIZE; ++i)
	{
		Matrix.M[i] = DFNegate(Matrix.M[i]);
	}
}

// Returns an element-wise copy of Matrix.
FMatrix Copy(in FMatrix Matrix)
{
	FMatrix Out;
	for (int i = 0; i < FULL_MATRIX_SIZE; ++i)
	{
		Out.M[i] = Matrix.M[i];
	}
	return Out;
}

// Dense matrix product A * B.
FMatrix MatrixMultiply(in FMatrix A, in FMatrix B)
{
	FMatrix Ret;
	for (int i = 0; i < MATRIX_DIM; ++i)
	{
		for (int k = 0; k < MATRIX_DIM; ++k)
		{
			FDFScalar Sum = (FDFScalar)0;
			for (int j = 0; j < MATRIX_DIM; ++j)
			{
				const FDFScalar DFMultiplyScalar = DFMultiply(A.M[i * MATRIX_DIM + j], B.M[j * MATRIX_DIM + k]);
				Sum = DFAdd(Sum, DFMultiplyScalar);
			}
			Ret.M[i * MATRIX_DIM + k] = Sum;
		}
	}
	return Ret;
}

// L2 norm of the first N entries of V.
FDFScalar GetNFORVectorNorm2(in FDFScalar V[MATRIX_DIM], int N)
{
	FDFScalar L2Norm = (FDFScalar)0;
	for (int i = 0; i < N; ++i)
	{
		L2Norm = DFAdd(L2Norm, DFMultiply(V[i], V[i]));
	}
	L2Norm = DFSqrt(L2Norm);
	return L2Norm;
}

// Swaps entries i and j of V.
void SwapIntVector(inout int V[MATRIX_DIM], int i, int j)
{
	int Tmp = V[i];
	V[i] = V[j];
	V[j] = Tmp;
}

// Swaps columns i and j of Matrix across all rows.
void SwampColumn(inout FMatrix Matrix, int i, int j)
{
	FDFScalar Tmp;
	for (int row = 0; row < MATRIX_DIM; ++row)
	{
		Tmp = Matrix.M[row * MATRIX_DIM + i];
		Matrix.M[row * MATRIX_DIM + i] = Matrix.M[row * MATRIX_DIM + j];
		Matrix.M[row * MATRIX_DIM + j] = Tmp;
	}
}

// Copies the trailing sub-matrix starting at (StartRowColumnIndex, StartRowColumnIndex) into the
// top-left corner of the result. Entries outside the copied block are zero.
FMatrix SelectSubMatrix(in FMatrix Matrix, int StartRowColumnIndex)
{
	// Zero-initialize so the returned matrix never carries uninitialized entries.
	FMatrix Ret = (FMatrix)0;
	for (int j = StartRowColumnIndex; j < MATRIX_DIM; ++j)
	{
		for (int k = StartRowColumnIndex; k < MATRIX_DIM; ++k)
		{
			Ret.M[(j - StartRowColumnIndex) * MATRIX_DIM + (k - StartRowColumnIndex)] = Matrix.M[j * MATRIX_DIM + k];
		}
	}
	return Ret;
}

// Finds the column with the largest L2 norm in the top-left N x N block of Matrix.
// The returned index is offset by (MATRIX_DIM - N), i.e. expressed in the coordinates of the
// matrix the N x N block was extracted from by SelectSubMatrix.
int GetPivot(in FMatrix Matrix, int N)
{
	FNFORVector Vector;
	FDFScalar MaxNorm = DFPromote(-1.0f);
	int MaxIndex = 0;

	for (int column = 0; column < N; ++column)
	{
		for (int row = 0; row < N; ++row)
		{
			Vector.V[row] = Matrix.M[row * MATRIX_DIM + column];
		}

		FDFScalar L2Norm = GetNFORVectorNorm2(Vector.V, N);
		if (DFGreater(L2Norm, MaxNorm))
		{
			MaxNorm = L2Norm;
			MaxIndex = column;
		}
	}

	return MaxIndex + (MATRIX_DIM - N);
}

// Builds the N x N Householder reflector H = I - 2 v v^T (v normalized) that maps the first N
// entries of A onto a multiple of the first basis vector. The reflector is written to the top-left
// N x N block of O; all other entries of O are zero.
void HouseHoldReflection(in FDFScalar A[MATRIX_DIM], out FMatrix O, int N)
{
	// An 'out' parameter has to be fully written, not only its leading N x N block.
	O = (FMatrix)0;

	FDFScalar V[MATRIX_DIM];
	for (int i = 0; i < MATRIX_DIM; ++i)
	{
		V[i] = (FDFScalar)0;
	}
	for (int i = 0; i < N; ++i)
	{
		V[i] = A[i];
	}

	FDFScalar L2Norm = GetNFORVectorNorm2(V, N);

	// v = a + sign(a0) * |a| * e0. A zero leading entry must count as positive: with sign(0) == 0
	// v would equal a and the reflector would map a to -a instead of zeroing its tail.
	if (DFEquals(A[0], 0.0f))
	{
		V[0] = DFAdd(V[0], L2Norm);
	}
	else
	{
		V[0] = DFAdd(V[0], DFMultiply(DFSign(A[0]), L2Norm));
	}

	L2Norm = GetNFORVectorNorm2(V, N);

	// An all-zero column needs no reflection. Return the identity rather than dividing by zero.
	if (DFEquals(L2Norm, 0.0f))
	{
		for (int i = 0; i < N; ++i)
		{
			O.M[i * MATRIX_DIM + i] = DFPromote(1.0f);
		}
		return;
	}

	FDFScalar InvL2Norm = DFDivide(1.0f, L2Norm);
	for (int i = 0; i < N; ++i)
	{
		V[i] = DFMultiply(V[i], InvL2Norm);
	}

	// I - 2 * v v^T
	for (int i = 0; i < N; ++i)
	{
		for (int j = 0; j < N; ++j)
		{
			O.M[i * MATRIX_DIM + j] = DFSubtract(select(i == j, 1.0f, 0.0f), DFMultiply(2.0f, DFMultiply(V[i], V[j])));
		}
	}
}

// Calculate AX=B with (Householder) QR decomposition and back substitution.
// TODO: Reduce the memory used in calculation.
// Q accumulates the product of the reflectors, R the progressively triangularized copy of A.
// With column pivoting enabled, PivotIndices must be initialized by the caller and is updated
// in place with every column swap (hence 'inout').
void QRDecomposition(in FMatrix A, out FMatrix Q, out FMatrix R
#if QR_DECOMPOSITION_TYPE == QR_DECOMPOSITION_TYPE_COLUMN_PIVOT
	, inout int PivotIndices[MATRIX_DIM]
#endif
)
{
	Identity(Q);
	R = Copy(A);

	FMatrix H;
	FMatrix Reflection;

	for (int i = 0; i < MATRIX_DIM; ++i)
	{
		// Select column with the largest norm as pivot.
#if QR_DECOMPOSITION_TYPE == QR_DECOMPOSITION_TYPE_COLUMN_PIVOT
		H = SelectSubMatrix(R, i);
		int Pivot = GetPivot(H, MATRIX_DIM - i);
		SwapIntVector(PivotIndices, i, Pivot);
		SwampColumn(R, i, Pivot);
#endif

		// Apply Householder reflection.
		Identity(H);

		// Column i of R from the diagonal downwards.
		FDFScalar Selection[MATRIX_DIM];
		for (int j = i; j < MATRIX_DIM; ++j)
		{
			Selection[j - i] = R.M[j * MATRIX_DIM + i];
		}

		HouseHoldReflection(Selection, Reflection, MATRIX_DIM - i);

		// Embed the reflector into the trailing block of an identity matrix.
		for (int j = i; j < MATRIX_DIM; ++j)
		{
			for (int k = i; k < MATRIX_DIM; ++k)
			{
				H.M[j * MATRIX_DIM + k] = Reflection.M[(j - i) * MATRIX_DIM + (k - i)];
			}
		}

		Q = MatrixMultiply(Q, H);
		R = MatrixMultiply(H, R);
	}
}

// Solves A X = B for three right-hand-side columns using QR decomposition and back substitution.
// A is read as a dense row-major MATRIX_DIM x MATRIX_DIM block starting at AOffset; B is read as
// MATRIX_DIM rows of 3 values starting at BOffset; the solution is written with the same layout
// and offset. ASize is not used.
// NOTE(review): 'Result' is neither a parameter nor declared in this file; presumably it is
// declared by the including shader or NFORRegressionCommon.ush - confirm.
void QR_Solve(int AOffset, int ASize, int BOffset)
{
	FMatrix NA;
	FMatrix NQ;
	FMatrix NR;

#if QR_DECOMPOSITION_TYPE == QR_DECOMPOSITION_TYPE_COLUMN_PIVOT
	int PivotIndices[MATRIX_DIM];
	for (int i = 0; i < MATRIX_DIM; ++i)
	{
		PivotIndices[i] = i;
	}
#endif

	// QR decomposition.
	for (int i = 0; i < FULL_MATRIX_SIZE; ++i)
	{
		NA.M[i] = DFPromote(A[AOffset + i]);
	}

	QRDecomposition(NA, NQ, NR
#if QR_DECOMPOSITION_TYPE == QR_DECOMPOSITION_TYPE_COLUMN_PIVOT
		, PivotIndices
#endif
	);

	// Q^T * B
	FDFScalar Y[MATRIX_DIM * 3];
	for (int i = 0; i < MATRIX_DIM; ++i)
	{
		for (int k = 0; k < 3; ++k)
		{
			FDFScalar Sum = (FDFScalar)0;
			for (int j = 0; j < MATRIX_DIM; ++j)
			{
				Sum = DFAdd(Sum, DFMultiply(NQ.M[j * MATRIX_DIM + i], B[BOffset + j * 3 + k]));
			}
			Y[i * 3 + k] = Sum;
		}
	}

	// Back substitution for each color.
	// RX = Y, Y = Q^T*B
	FDFScalar X[MATRIX_DIM * 3];
	for (int i = 0; i < (MATRIX_DIM * 3); ++i)
	{
		X[i] = DFPromote(0.0f);
	}

	for (int c = 0; c < 3; ++c)
	{
		for (int i = MATRIX_DIM - 1; i >= 0; --i)
		{
			FDFScalar Ri = NR.M[i * MATRIX_DIM + i];

			// A zero diagonal stops the substitution; the remaining unknowns of this color stay zero.
			if (DFEquals(Ri, 0.0f))
			{
				break;
			}

			FDFScalar Xc = Y[i * 3 + c];
			for (int j = i + 1; j < MATRIX_DIM; ++j)
			{
				Xc = DFSubtract(Xc, DFMultiply(NR.M[i * MATRIX_DIM + j], X[j * 3 + c]));
			}
			X[i * 3 + c] = DFDivide(Xc, Ri);
		}
	}

	// Copy to B matrix.
#if QR_DECOMPOSITION_TYPE == QR_DECOMPOSITION_TYPE_COLUMN_PIVOT
	// Undo the column permutation: row i of the permuted solution belongs to unknown PivotIndices[i].
	for (int i = 0; i < MATRIX_DIM; ++i)
	{
		for (int j = 0; j < 3; ++j)
		{
			Result[BOffset + PivotIndices[i] * 3 + j] = DFDemote(X[i * 3 + j]);
		}
	}
#else
	for (int i = 0; i < (MATRIX_DIM * 3); ++i)
	{
		Result[BOffset + i] = DFDemote(X[i]);
	}
#endif
}

// Cholesky decomposition of (AMatrix + lambda' * I) into the lower-triangular factor L, where
// lambda' = max(lambda * max|AMatrix|, MinLambda).
// Returns false if a non-positive pivot or a zero diagonal was encountered; L is not valid then.
bool CholeskeyDecomposition(inout FLMatrix L, FLMatrix AMatrix, float lambda)
{
	bool bComplete = true;
	L = (FLMatrix)0;

	// We need an adaptive lambda, otherwise we cannot find a one-size-fits-all value that makes
	// this decomposition always successful.
	// To achieve this, we scale lambda by the max absolute value of the matrix.
	lambda *= LMatrixMaxAbsValue(AMatrix);

	// In case lambda becomes too small, we use the min lambda to provide sufficient regularization.
	lambda = max(lambda, MinLambda);

	for (int i = 0; i < MATRIX_DIM; ++i)
	{
		for (int j = 0; j <= i; ++j)
		{
			FDFScalar Sum = (FDFScalar)0;
			for (int k = 0; k < j; ++k)
			{
				Sum = DFAdd(Sum, DFMultiply(LMatrix(L, i, k), LMatrix(L, j, k)));
			}

			if (i == j)
			{
				// Check Delta > 0
				FDFScalar Delta = DFAdd(LMatrix(AMatrix,i,i), DFSubtract(lambda, Sum));
				bComplete &= DFGreater(Delta, 0) ? true : false;
				LMatrix(L, i, j) = DFSqrt(Delta);
			}
			else
			{
				// Check Ljj != 0
				FDFScalar Ljj = LMatrix(L, j, j);
				bComplete &= DFEquals(Ljj, 0) ? false : true;
				LMatrix(L, i, j) = DFDivide(DFSubtract(LMatrix(AMatrix,i,j), Sum), Ljj);
			}
		}
	}

	return bComplete;
}

// Given the Cholesky factor L, returns (L L^T)^-1 in packed symmetric storage by solving
// L L^T X = I with a forward and a backward substitution.
FLMatrix CholeskyFactorInverse(FLMatrix L)
{
	FMatrix Y;

	//2. Forward substitution
	// LL^TX=B, LY = B, where B is identity matrix.
	for (int c = 0; c < MATRIX_DIM; ++c)
	{
		for (int i = 0; i < MATRIX_DIM; ++i)
		{
			FDFScalar Sum = (FDFScalar)0;
			for (int j = 0; j < i; ++j)
			{
				Sum = DFAdd(Sum, DFMultiply(MATRIX(Y,j,c), LMatrix(L, i, j)));
			}
			MATRIX(Y,i,c) = DFDivide(DFSubtract(lerp(0.0f, 1.0f, i==c), Sum), LMatrix(L, i, i));
		}
	}

	FLMatrix X = (FLMatrix)0;

	//3. Backward substitution.
	//L^TX = Y
	// Only rows i >= c are solved for each column c; the result is symmetric, so that is the
	// complete lower triangle.
	for (int c = 0; c < MATRIX_DIM; ++c)
	{
		for (int i = MATRIX_DIM - 1; i >= c; --i)
		{
			FDFScalar Sum = (FDFScalar)0;
			for (int j = i + 1; j < MATRIX_DIM; ++j)
			{
				int2 jc = TLMatrixIndex2D(j,c);
				Sum = DFAdd(Sum, DFMultiply(TLMatrix(X, jc.y, jc.x), TLMatrix(L, i, j)));
			}
			TLMatrix(X,c,i) = DFDivide(DFSubtract(MATRIX(Y,i,c), Sum), TLMatrix(L, i, i));
		}
	}

	return X;
}

// Solves (A + lambda * I) X = B for three right-hand-side columns with a Cholesky decomposition.
// A is read at AOffset (row-major, stride MATRIX_DIM; only the lower triangle is accessed), B at
// BOffset as MATRIX_DIM rows of 3 values. Result is only written when the solve succeeded.
// ASize is not used. Returns false on a failed decomposition or if any solution value is NaN.
bool CholeskeySolve(int AOffset, int ASize, int BOffset, float lambda, RWBuffer<float> Result)
{
	// Decompose the matrix to L. As the matrix A might not be positive definite,
	// we solve for (A + \lambda*I)X=B instead, where \lambda is a very small value.
	FLMatrix L = (FLMatrix)0;
	bool bComplete = true;

	for (int i = 0; i < MATRIX_DIM; ++i)
	{
		for (int j = 0; j <= i; ++j)
		{
			FDFScalar Sum = (FDFScalar)0;
			for (int k = 0; k < j; ++k)
			{
				Sum = DFAdd(Sum, DFMultiply(LMatrix(L, i, k),LMatrix(L, j, k)));
			}

			if (i == j)
			{
				// Check Delta > 0
				FDFScalar Delta = DFAdd(A[AOffset + i * MATRIX_DIM + i], DFSubtract(lambda, Sum));
				bComplete &= DFGreater(Delta, 0) ? true : false;
				LMatrix(L, i, j) = DFSqrt(Delta);
			}
			else
			{
				// Check Ljj != 0
				FDFScalar Ljj = LMatrix(L, j, j);
				bComplete &= DFEquals(Ljj, 0) ? false : true;
				LMatrix(L, i, j) = DFDivide(DFSubtract(A[AOffset + i * MATRIX_DIM + j], Sum), Ljj);
			}
		}
	}

	FDFScalar Y[MATRIX_DIM * 3];

	//2. Forward substitution
	// LL^TX=B, LY = B,
	for (int c = 0; c < 3; ++c)
	{
		for (int i = 0; i < MATRIX_DIM; ++i)
		{
			FDFScalar Sum = (FDFScalar)0;
			for (int j = 0; j < i; ++j)
			{
				Sum = DFAdd(Sum, DFMultiply(Y[j * 3 + c], LMatrix(L, i, j)));
			}
			Y[i * 3 + c] = DFDivide(DFSubtract(B[BOffset + i * 3 + c], Sum), LMatrix(L, i, i));
		}
	}

	FDFScalar X[MATRIX_DIM * 3];

	//3. Backward substitution.
	//L^TX = Y
	for (int c = 0; c < 3; ++c)
	{
		for (int i = MATRIX_DIM-1; i >= 0; --i)
		{
			FDFScalar Sum = (FDFScalar)0;
			for (int j = i + 1; j < MATRIX_DIM; ++j)
			{
				Sum = DFAdd(Sum, DFMultiply(X[j * 3 + c], TLMatrix(L, i, j)));
			}
			X[i * 3 + c] = DFDivide(DFSubtract(Y[i * 3 + c], Sum), TLMatrix(L, i, i));
		}
	}

	// Reject the whole solution if any component is NaN.
	for (int i = 0; i < (MATRIX_DIM * 3); ++i)
	{
		if (isnan(DFDemote(X[i])))
		{
			bComplete = false;
		}
	}

	if (bComplete)
	{
		for (int i = 0; i < (MATRIX_DIM * 3); ++i)
		{
			Result[BOffset + i] = DFDemote(X[i]);
		}
	}

	return bComplete;
}

// Produces the starting estimate of AMatrix^-1 for the Newton-Schulz iteration in L.
// Depending on NEWTON_INITIAL_GUESS_TYPE this is either the inverse obtained from a regularized
// Cholesky decomposition, or the identity scaled by 1 / |AMatrix| (Euclidean norm).
// Returns false when no usable guess could be produced.
bool InitializeGuess(inout FLMatrix L, FLMatrix AMatrix, float inLambda)
{
	bool bComplete = true;

#if NEWTON_INITIAL_GUESS_TYPE == INITIAL_GUESS_INVERSE_CHOLESKY_DECOMPOSITION
	bComplete = CholeskeyDecomposition(L, AMatrix, inLambda);
	if (bComplete)
	{
		L = CholeskyFactorInverse(L);
	}
#elif NEWTON_INITIAL_GUESS_TYPE == INITIAL_GUESS_EUCLIDEAN_NORM
	float EuclideanNorm = 0;
	LMatrixEuclideanNorm(AMatrix, EuclideanNorm);

	// Improve the initial guess with identity matrix scaled by Euclidean norm.
	L = (FLMatrix)0;
	if (EuclideanNorm > 0.0f)
	{
		for (int x = 0; x < MATRIX_DIM; ++x)
		{
			LMatrix(L, x, x) = DFPromote(1 / EuclideanNorm);
		}
	}
	else
	{
		// A zero (or NaN) norm cannot be inverted; report failure instead of dividing by zero.
		bComplete = false;
	}
#else
	#error not implemented
#endif

	return bComplete;
}

// Approximates the inverse of the symmetric matrix AMatrix with Newton-Schulz iterations
// X <- X (2I - A X), starting from InitializeGuess. bComplete receives the result of
// InitializeGuess; the returned matrix is only meaningful when it is true.
FLMatrix MatrixInverse(FLMatrix AMatrix, float inLambda, inout bool bComplete)
{
	// Zero-initialize: X is handed to an 'inout' parameter below.
	FLMatrix X = (FLMatrix)0;

	// Note that X is invertible in the initialization.
	bComplete = InitializeGuess(X, AMatrix, inLambda);

#define NUM_INVERSE_ITERATIONS_DOUBLE NUM_NEWTON_SCHULTZ_ITERATIONS

	FMatrix AX = (FMatrix)0;

#if LINEAR_SOLVER_TYPE == LINEAR_SOLVER_TYPE_NEWTON_CHOLESKY
	float MaxTolerance = 1000;
	float CurrentTolerance = 0.0f;
#endif

	for (int Step = 0; Step < NUM_INVERSE_ITERATIONS_DOUBLE; ++Step)
	{
		// A * X is not guaranteed to be symmetric
		AX = LMatrixMultiply(AMatrix, X);

		// The norm might increase when combining Cholesky and Newton while Cholesky is overfitting.
		// Should stop the application if norm increase is detected.
#if LINEAR_SOLVER_TYPE == LINEAR_SOLVER_TYPE_NEWTON_CHOLESKY && NEWTON_SCHULZ_EARLY_STOP
		MatrixConvergence(AX, CurrentTolerance);

		if (CurrentTolerance > MaxTolerance)
		{
			break;
		}
		MaxTolerance = min(CurrentTolerance, MaxTolerance);
#endif

		MatrixNegate(AX);

		// 2*I + (-AX)
		for (int x = 0; x < MATRIX_DIM; ++x)
		{
			MATRIX(AX, x, x) = DFAdd(2.0f, MATRIX(AX, x, x));
		}

		// X ( 2*I - AX) is guaranteed to be symmetric
		X = ToLMatrixMultiply(X, AX);
	}

	return X;
}

// Solves A X = B by multiplying B with an approximate inverse of A (see MatrixInverse).
// Only the lower triangle of A is read (row-major, stride MATRIX_DIM, starting at AOffset).
// B and Result are addressed as MATRIX_DIM rows of BDim.y values starting at BOffset.
// ASize is not used. Returns false, leaving Result untouched, if no inverse could be computed.
bool NewtonIterativeSolve(int AOffset, int ASize, int BOffset, float inLambda, RWBuffer<float> Result)
{
	FLMatrix AMatrix = (FLMatrix)0;
	for (int y = 0; y < MATRIX_DIM; ++y)
	{
		for (int x = 0; x <= y; ++x)
		{
			LMatrix(AMatrix, y, x) = DFPromote(A[AOffset + y * MATRIX_DIM + x]);
		}
	}

	bool bComplete = false;
	AMatrix = MatrixInverse(AMatrix, inLambda, bComplete);

	if (!bComplete)
	{
		return bComplete;
	}

	// Calculate approx(A^{-1}) * B.
	for (int i = 0; i < MATRIX_DIM; ++i)
	{
		for (int k = 0; k < BDim.y; ++k)
		{
			FDFScalar Sum = DFPromote(0.0f);
			for (int j = 0; j < MATRIX_DIM; ++j)
			{
				int2 ij = TLMatrixIndex2D(i,j);
				Sum = DFAdd(Sum, DFMultiply(TLMatrix(AMatrix, ij.y, ij.x), DFPromote(B[BOffset + j * BDim.y + k])));
			}
			Result[BOffset + i * BDim.y + k] = DFDemote(Sum);
		}
	}

	return bComplete;
}