Files
UnrealEngine/Engine/Plugins/Experimental/NFORDenoise/Shaders/NFORDenoise.usf
2025-05-18 13:04:45 +08:00

1719 lines
54 KiB
HLSL

// Copyright Epic Games, Inc. All Rights Reserved.
#include "/Engine/Public/Platform.ush"
#include "/Engine/Private/Common.ush"
#include "/Engine/Private/ScreenPass.ush"
#include "NFORRegressionCommon.ush"
// Number of channels carried by the source texture (1-4). Selects TPixelValue below.
#ifndef SOURCE_CHANNEL_COUNT
#define SOURCE_CHANNEL_COUNT 4
#endif

// Permutations that are not compute shaders still see the [numthreads(...)] attributes in this
// file, so THREAD_GROUP_SIZE is given a value for them here.
#if !COMPUTESHADER
#define THREAD_GROUP_SIZE 1
#endif

// Pixel value type matching SOURCE_CHANNEL_COUNT.
#if SOURCE_CHANNEL_COUNT == 1
#define TPixelValue float
#elif SOURCE_CHANNEL_COUNT == 2
#define TPixelValue float2
#elif SOURCE_CHANNEL_COUNT == 3
#define TPixelValue float3
#elif SOURCE_CHANNEL_COUNT == 4
#define TPixelValue float4
#else
// Fail with a clear message instead of an "undeclared TPixelValue" error further down.
#error SOURCE_CHANNEL_COUNT is not supported.
#endif
// How the variance (std) texture is interpreted by GetImageVariance() / AdjustFeatureRangeCS.
// IMAGE_VARIANCE_NORMAL is the spelling the #if checks in this file use; it used to be
// undefined and only worked because an undefined macro evaluates to 0 in #if.
#define IMAGE_VARIANCE_NORMAL 0
// Legacy misspelling kept so that any existing reference keeps resolving to the same value.
#define IMAGE_VARINCE_NORMAL IMAGE_VARIANCE_NORMAL
#define IMAGE_VARIANCE_GREYSCALE 1
#define IMAGE_VARIANCE_COLORED 2
#ifndef IMAGE_VARIANCE_TYPE
#define IMAGE_VARIANCE_TYPE IMAGE_VARIANCE_GREYSCALE
#endif
// Albedo pre-divide mode. In the code visible here only the DISABLED case is tested:
// GetNonLocalMeanWeight() then clamps weights to the smallest float16 value.
#define PRE_ALBEDO_DIVIDE_DISABLED 0
#define PRE_ALBEDO_DIVIDE_EACH 1
#define PRE_ALBEDO_DIVIDE_FINAL 2
#ifndef PRE_ALBEDO_DIVIDE
#define PRE_ALBEDO_DIVIDE PRE_ALBEDO_DIVIDE_DISABLED
#endif
// Extra dimension (0 or 1) added to XDim.y by GetModifiedXyYyDimension().
#ifndef APPEND_CONSTANT_DIMENSION_TO_X
#define APPEND_CONSTANT_DIMENSION_TO_X 0
#endif
// When 1, TargetImage/TargetVariance are separate resources; otherwise they alias Image/Variance.
#ifndef NONLOCALMEAN_SEPARATE_SOURCE
#define NONLOCALMEAN_SEPARATE_SOURCE 0
#endif
// Non-local mean weight model. NOTE: the selection is hard-coded (no #ifndef guard), and the
// macro names below contain historical misspellings (DISTACNE, SEPERABLE, SEPRERABLE) that are
// referenced as-is elsewhere in this file.
#define NONLOCALMEAN_PATCHONLY 0
#define NONLOCALMEAN_DISTACNE_PATCH 1
#define NONLOCALMEAN_WEIGHT_TYPE NONLOCALMEAN_PATCHONLY
// Direction of the separable box filter pass over the distance atlas.
#define NONLOCALMEAN_SEPERABLE_FILTER_HORIZONTAL 0
#define NONLOCALMEAN_SEPERABLE_FILTER_VERTICAL 1
#ifndef NONLOCALMEAN_SEPRERABLE_PASS
#define NONLOCALMEAN_SEPRERABLE_PASS NONLOCALMEAN_SEPERABLE_FILTER_HORIZONTAL
#endif
// Number of symmetric distance pairs packed into one atlas texel (one pair = float2, two = float4).
#define NONLOCALMEAN_ATLAS_ONE_SYMMETRIC_PAIR 0
#define NONLOCALMEAN_ATLAS_TWO_SYMMETRIC_PAIR 1
#ifndef NONLOCALMEAN_ATLAS_TYPE
#define NONLOCALMEAN_ATLAS_TYPE NONLOCALMEAN_ATLAS_TWO_SYMMETRIC_PAIR
#endif
#if NONLOCALMEAN_ATLAS_TYPE == NONLOCALMEAN_ATLAS_ONE_SYMMETRIC_PAIR
#define TAtlasValueType float2
#elif NONLOCALMEAN_ATLAS_TYPE == NONLOCALMEAN_ATLAS_TWO_SYMMETRIC_PAIR
#define TAtlasValueType float4
#else
#error NONLOCALMEAN_ATLAS_TYPE is not supported.
#endif
// When 1, the vertical separable pass shifts its output coordinates by PatchDistance and
// bounds-checks them before writing to the weight buffer.
#ifndef BUFFER_PASS_THROUGH
#define BUFFER_PASS_THROUGH 0
#endif
// Evaluates to 1 or 2, relying on the atlas type enum values being 0 and 1.
#define NONLOCALMEAN_ATLAS_SYMMETRIC_PAIR_COUNT (NONLOCALMEAN_ATLAS_TYPE+1)
// Memory layout of the non-local mean weight buffer (see GetWeightBufferIndex()).
// NONE makes NonLocalMeanFilteringCS compute weights on the fly instead of reading a buffer.
#define NLM_WEIGHTLAYOUT_NONE 0
#define NLM_WEIGHTLAYOUT_NUM_OF_WEIGHTS_PER_PIXELxWxH 1
#define NLM_WEIGHTLAYOUT_WxHxNUM_OF_WEIGHTS_PER_PIXEL 2
#define NLM_WEIGHTLAYOUT_4xWxHxNUM_OF_WEIGHTS_BY_4 3
#ifndef NLM_WEIGHTLAYOUT
#define NLM_WEIGHTLAYOUT NLM_WEIGHTLAYOUT_NONE
#endif
// Type returned by one weight-buffer read, and the number of weights that read contains.
#if NLM_WEIGHTLAYOUT == NLM_WEIGHTLAYOUT_4xWxHxNUM_OF_WEIGHTS_BY_4
#define TWeightReadType float4
#define WEIGHT_PIXEL_INCREMENT 4
#else
#define TWeightReadType float
#define WEIGHT_PIXEL_INCREMENT 1
#endif
// NOTE(review): NUM_FEATURE and SMALL_MATRIX_OPTIMIZE are not referenced in the part of the
// file reviewed here; presumably consumed by the solver code further down.
#ifndef NUM_FEATURE
#define NUM_FEATURE 0
#endif
#ifndef SMALL_MATRIX_OPTIMIZE
#define SMALL_MATRIX_OPTIMIZE 0
#endif
// Sampling steps for performance optimization.
#ifndef USE_SAMPLING_STEP
#define USE_SAMPLING_STEP 0
#endif
// Sampling pattern selection; hard-coded (no #ifndef guard).
#define SAMPLING_TYPE_INCREMENTAL 0
#define SAMPLING_TYPE_FOCUS_CURRENT_FRAME 1
#define SAMPLING_TYPE SAMPLING_TYPE_FOCUS_CURRENT_FRAME
// Matrix multiplication
#define WEIGHTED_MULTIPLICATION_QUADRATIC 0 //X^TWX
#define WEIGHTED_MULTIPLICATION_GENERALIZED 1 //X^TWY
#ifndef WEIGHTED_MULTIPLICATION_TYPE
#define WEIGHTED_MULTIPLICATION_TYPE WEIGHTED_MULTIPLICATION_QUADRATIC
#endif
// Linear equation system solver
#define INPUT_MATRIX_TYPE_SUCCESS 0
#define INPUT_MATRIX_TYPE_FAIL 1
#ifndef INPUT_MATRIX_TYPE
#define INPUT_MATRIX_TYPE INPUT_MATRIX_TYPE_SUCCESS
#endif
#define OUTPUT_MATRIX_TYPE_SUCCESS 0
#define OUTPUT_MATRIX_TYPE_FAIL 1
#ifndef OUTPUT_INDICES
#define OUTPUT_INDICES 0
#endif
// Reconstruction
#define RECONSTRUCTION_TYPE_SCATTER 0
#define RECONSTRUCTION_TYPE_GATHER 1
#ifndef RECONSTRUCTION_TYPE
#define RECONSTRUCTION_TYPE RECONSTRUCTION_TYPE_SCATTER
#endif
// Scale applied to radiance before processing, and its reciprocal for undoing it afterwards.
#define RADIANCE_PREPROCESS_SCALE_FACTOR 0.1f
// No trailing ';' in the macro body, so the macro can be used inside larger expressions
// (function arguments, parenthesised terms) and not only at the end of a statement.
#define RADIANCE_POSTPROCESS_INVERSE_SCALE_FACTOR (1/RADIANCE_PREPROCESS_SCALE_FACTOR)
//--------------------------------------------------------------------------------------------------
// Util and general texture operations
//--------------------------------------------------------------------------------------------------
// Operation performed by TextureOperationCS on RWTarget.
#define TEXTURE_OPS_MULTIPLY 0
#define TEXTURE_OPS_DIVIDE 1
#define TEXTURE_OPS_ADD_CONSTANT 2
#define TEXTURE_OPS_ACCUMULATE 3
#ifndef TEXTURE_OPS
#define TEXTURE_OPS TEXTURE_OPS_MULTIPLY
#endif
// Direction of CopyTextureSingleChannelCS: float4 -> single channel target, or
// single channel source -> one channel of a float4 target.
#define TEXTURE_COPY_TARGET_SINGLE_CHANNEL 0
#define TEXTURE_COPY_SOURCE_SINGLE_CHANNEL 1
#ifndef TEXTURE_COPY_TYPE
#define TEXTURE_COPY_TYPE TEXTURE_COPY_TARGET_SINGLE_CHANNEL
#endif
// When 1, TEXTURE_OPS_ADD_CONSTANT picks the ConstantValue component by the per-pixel Mask id.
#ifndef ACCUMULATE_BY_MASK
#define ACCUMULATE_BY_MASK 0
#endif
// Squared magnitude of a pixel value.
// Scalars are squared directly; 4-channel values only consider .rgb; 2- and 3-channel
// values use every component.
float Length2(TPixelValue Value)
{
#if SOURCE_CHANNEL_COUNT == 1
	const float SquaredLength = Value * Value;
#elif SOURCE_CHANNEL_COUNT == 4
	const float SquaredLength = length2(Value.rgb);
#else
	const float SquaredLength = length2(Value);
#endif
	return SquaredLength;
}
// Loads the texel at integer position P from mip 0 of inImage.
TPixelValue GetImageValue(int2 P, Texture2D<TPixelValue> inImage)
{
	const int3 TexelLocation = int3(P, 0);
	return inImage.Load(TexelLocation);
}
// Reflects P back into the texture:
//  - positions inside [0, inTextureSize - 1] are returned unchanged,
//  - negative positions are mirrored about texel 0,
//  - positions past the last texel are mirrored about the last texel.
// A single reflection is applied per side, so positions further out than roughly one
// texture size are not guaranteed to land inside the texture.
int2 GetMirroredPosition(int2 P, int2 inTextureSize)
{
	const int2 LastTexel = inTextureSize - 1;
	const int2 DistanceToLastTexel = abs(P - LastTexel);
	return abs(LastTexel - DistanceToLastTexel);
}
//--------------------------------------------------------------------------------------------------
// General texture operations
// Input texture read by LoadSource(), CopyTexturePS() and CopyTextureToBufferCS().
Texture2D<TPixelValue> Source;
// Per-pixel id used to pick a ConstantValue component when ACCUMULATE_BY_MASK is enabled.
Texture2D<uint> Mask;
// Texture modified in place by TextureOperationCS.
RWTexture2D<float4> RWTarget;
// Offsets added to the dispatch thread position to get the source / target texel.
int2 SourcePosition;
int2 TargetPosition;
// Extent of the processed area in texels (threads at or beyond it exit early).
int2 Size;
// OR-ed with the "source texel is non-zero" test in the multiply/divide operations.
int ForceOperation;
// Constant added by TEXTURE_OPS_ADD_CONSTANT.
float4 ConstantValue;
// Loads mip 0 of Source at Position and widens it to float4.
// Channels that Source does not have are left at 0.
float4 LoadSource(uint2 Position)
{
	const uint3 TexelLocation = uint3(Position, 0);
	float4 Widened = 0;
#if SOURCE_CHANNEL_COUNT == 1
	Widened.x = Source.Load(TexelLocation);
#elif SOURCE_CHANNEL_COUNT == 2
	Widened.xy = Source.Load(TexelLocation);
#elif SOURCE_CHANNEL_COUNT == 3
	Widened.xyz = Source.Load(TexelLocation);
#elif SOURCE_CHANNEL_COUNT == 4
	Widened = Source.Load(TexelLocation);
#endif
	return Widened;
}
// Applies TEXTURE_OPS (multiply / divide / add constant / accumulate) to RWTarget over a
// Size-sized area. One thread per texel.
//
// Positions are resolved with signed arithmetic: adding the int2 offsets to an unsigned thread
// id used to promote everything to unsigned, which made the max(..., 0) clamp and the
// ">= 0" guard below no-ops and let negative offsets wrap to huge coordinates.
[numthreads(THREAD_GROUP_SIZE, THREAD_GROUP_SIZE, 1)]
void TextureOperationCS(in const uint3 DispatchThreadID : SV_DispatchThreadID)
{
	if (any(DispatchThreadID.xy >= (uint2)Size))
	{
		return;
	}

	const int2 Position = int2(DispatchThreadID.xy);
	// Source reads are clamped to the top/left edge; target writes outside are skipped.
	const int2 ResolvedSourcePosition = max(SourcePosition + Position, 0);
	const int2 ResolvedTargetPosition = TargetPosition + Position;

	if (all(ResolvedTargetPosition >= 0))
	{
		const uint2 TargetTexel = (uint2)ResolvedTargetPosition;
#if TEXTURE_OPS == TEXTURE_OPS_MULTIPLY
		// Zero source channels leave the target untouched unless ForceOperation is set.
		float4 STexel = LoadSource((uint2)ResolvedSourcePosition);
		RWTarget[TargetTexel].rgb *= lerp(1.0f, STexel.rgb, (STexel.rgb != 0) | ForceOperation);
#elif TEXTURE_OPS == TEXTURE_OPS_DIVIDE
		// Zero source channels divide by 1 unless ForceOperation is set.
		float4 STexel = LoadSource((uint2)ResolvedSourcePosition);
		RWTarget[TargetTexel].rgb /= lerp(1.0f, STexel.rgb, (STexel.rgb != 0) | ForceOperation);
#elif TEXTURE_OPS == TEXTURE_OPS_ADD_CONSTANT
		float3 Value = 0;
	#if ACCUMULATE_BY_MASK
		uint MaskId = clamp(Mask[TargetTexel], 0, 3); // support up to 4 masked add.
		// The selected scalar component is broadcast to all three channels.
		Value = ConstantValue[MaskId];
	#else
		Value = ConstantValue.rgb;
	#endif
		RWTarget[TargetTexel].rgb += Value;
#elif TEXTURE_OPS == TEXTURE_OPS_ACCUMULATE
		float4 STexel = LoadSource((uint2)ResolvedSourcePosition);
		RWTarget[TargetTexel] += STexel;
#endif
	}
}
// Offsets added to the pixel / thread position by the texture copy shaders.
int2 SourceOffset;
int2 TargetOffset;
// Channel index read from (or written to) the float4 side of CopyTextureSingleChannelCS.
int Channel;
// Extent of the area copied by CopyTextureSingleChannelCS.
int2 CopySize;
// Texture size used for mirrored addressing (GetMirroredPosition) and by several kernels as
// their dispatch bound.
int2 TextureSize;
// Pixel shader that outputs Source at (pixel position + SourceOffset), mirroring positions
// that fall outside the texture.
// The position is kept signed so that a negative SourceOffset reaches GetMirroredPosition()
// as a negative coordinate instead of relying on unsigned wrap-around.
TPixelValue CopyTexturePS(float4 SvPosition : SV_POSITION) : SV_Target0
{
	const int2 Position = int2(SvPosition.xy) + SourceOffset;
	return Source.Load(int3(GetMirroredPosition(Position, TextureSize), 0));
}
// Resources of CopyTextureSingleChannelCS. Exactly one side is single-channel, depending on
// the copy direction selected by TEXTURE_COPY_TYPE.
#if TEXTURE_COPY_TYPE == TEXTURE_COPY_TARGET_SINGLE_CHANNEL
Texture2D<float4> CopySource;
RWTexture2D<float> RWCopyTarget;
#elif TEXTURE_COPY_TYPE == TEXTURE_COPY_SOURCE_SINGLE_CHANNEL
Texture2D<float> CopySource;
RWTexture2D<float4> RWCopyTarget;
#endif
// Copies one channel between CopySource and RWCopyTarget over a CopySize area:
//  - TEXTURE_COPY_TARGET_SINGLE_CHANNEL: CopySource[Channel] -> scalar target,
//  - TEXTURE_COPY_SOURCE_SINGLE_CHANNEL: scalar source -> RWCopyTarget[Channel].
// Source reads use mirrored addressing; writes to negative target positions are skipped.
//
// Positions are resolved in signed integers: with unsigned positions the ">= 0" guard was
// always true and negative offsets wrapped around.
[numthreads(THREAD_GROUP_SIZE, THREAD_GROUP_SIZE, 1)]
void CopyTextureSingleChannelCS(in const uint3 DispatchThreadID : SV_DispatchThreadID)
{
	if (any(DispatchThreadID.xy >= (uint2)CopySize))
	{
		return;
	}

	const int2 ThreadPosition = int2(DispatchThreadID.xy);
	const int2 ResolvedSourcePosition = ThreadPosition + SourceOffset;
	const int2 ResolvedTargetPosition = ThreadPosition + TargetOffset;

	if (all(ResolvedTargetPosition >= 0))
	{
		const int3 SourceLocation = int3(GetMirroredPosition(ResolvedSourcePosition, TextureSize), 0);
		const uint2 TargetTexel = (uint2)ResolvedTargetPosition;
#if TEXTURE_COPY_TYPE == TEXTURE_COPY_TARGET_SINGLE_CHANNEL
		RWCopyTarget[TargetTexel] = CopySource.Load(SourceLocation)[Channel];
#elif TEXTURE_COPY_TYPE == TEXTURE_COPY_SOURCE_SINGLE_CHANNEL
		RWCopyTarget[TargetTexel][Channel] = CopySource.Load(SourceLocation);
#endif
	}
}
//--------------------------------------------------------------------------------------------------
// Variance definition and functions
//
// Assumption: Variance texture is of type float4 and holds std instead
//
#define TImageVariance float4

// Returns the scalar variance at texel P of inVariance (mip 0).
// The texture stores standard deviations, so values are squared here.
//  - NORMAL / GREYSCALE: squares the channel selected by inVarianceChannelOffset.
//  - COLORED: squares .rgb and combines the channels with the weights
//    (0.09, 0.3481, 0.0121), i.e. the squares of (0.3, 0.59, 0.11), assuming the channels
//    are independent.
float GetImageVariance(int2 P, Texture2D inVariance, int inVarianceChannelOffset)
{
	const float4 StdTexel = inVariance.Load(int3(P, 0));
#if (IMAGE_VARIANCE_TYPE == IMAGE_VARIANCE_NORMAL) || (IMAGE_VARIANCE_TYPE == IMAGE_VARIANCE_GREYSCALE)
	const float SelectedStd = StdTexel[inVarianceChannelOffset];
	return Pow2(SelectedStd);
#elif IMAGE_VARIANCE_TYPE == IMAGE_VARIANCE_COLORED
	const float3 SquaredLuminanceWeights = float3(0.09, 0.3481, 0.0121);
	return dot(Pow2(StdTexel.rgb), SquaredLuminanceWeights);
#else
	#error IMAGE_VARIANCE_TYPE not supported
#endif
}
//------------------------------------------------------------------------------------------------------
// Radiance normalization
//------------------------------------------------------------------------------------------------------
// Inputs / output of ClassifyPreAlbedoDivideMaskIdCS.
Texture2D<float4> Normal;
Texture2D<float4> NormalVariance;
RWTexture2D<uint> RWMask;

// Writes a per-pixel mask id: 1 when Normal.rgb and NormalVariance.b are all exactly zero,
// 0 otherwise. One thread per texel of a TextureSize area.
[numthreads(THREAD_GROUP_SIZE, THREAD_GROUP_SIZE, 1)]
void ClassifyPreAlbedoDivideMaskIdCS(in const uint3 DispatchThreadID : SV_DispatchThreadID)
{
	const uint2 Position = DispatchThreadID.xy;
	if (any(Position >= (uint2)TextureSize))
	{
		return;
	}

	const uint3 TexelLocation = uint3(Position, 0);
	const float4 NormalValues = float4(Normal.Load(TexelLocation).rgb, NormalVariance.Load(TexelLocation).b);

	const bool bAllZero = all(NormalValues == 0);
	RWMask[Position] = bAllZero ? 1u : 0u;
}
// Inputs / output of NormalizeRadianceVarianceByAlbedoCS.
Texture2D<float4> Albedo;
RWTexture2D<float4> RWRadianceVariance;

// Divides the radiance std stored in RWRadianceVariance.r by the albedo luminance.
// Pixels whose albedo luminance is not positive are left unchanged (divided by 1).
[numthreads(THREAD_GROUP_SIZE, THREAD_GROUP_SIZE, 1)]
void NormalizeRadianceVarianceByAlbedoCS(in const uint3 DispatchThreadID : SV_DispatchThreadID)
{
	if (any(DispatchThreadID.xy >= (uint2)Size))
	{
		return;
	}

	const uint2 Position = DispatchThreadID.xy;
	const float AlbedoLum = Luminance(Albedo.Load(uint3(Position, 0)).rgb);

	// Explicit select instead of lerp(1, AlbedoLum, AlbedoLum > 0): when lerp is evaluated as
	// 1 + s * (AlbedoLum - 1), a very small luminance cancels to exactly 0 and the division
	// below yields inf. The select returns AlbedoLum exactly and also maps NaN to 1.
	const float Divisor = (AlbedoLum > 0) ? AlbedoLum : 1.0f;

	// Since r stores the std instead of variance, need to divide albedo instead of albedo^2.
	RWRadianceVariance[Position].r /= Divisor;
}
// Resources / parameters of AdjustFeatureRangeCS.
RWTexture2D<float4> RWImage;
RWTexture2D<float4> RWImageVariance;
// Upper bound for the feature magnitude.
float MaxValue;
// Channel of the variance (std) texture that belongs to the feature being processed.
int VarianceChannelOffset;

// Scales down features whose magnitude exceeds MaxValue so the magnitude becomes MaxValue,
// and scales the matching std channel by the same factor.
// The magnitude is length() for IMAGE_VARIANCE_NORMAL and Luminance() for
// IMAGE_VARIANCE_GREYSCALE; any other type uses -1, which never triggers scaling for a
// non-negative MaxValue.
[numthreads(THREAD_GROUP_SIZE, THREAD_GROUP_SIZE, 1)]
void AdjustFeatureRangeCS(in const uint3 DispatchThreadID : SV_DispatchThreadID)
{
	if (any(DispatchThreadID.xy >= (uint2)Size))
	{
		return;
	}

	const uint2 Position = DispatchThreadID.xy;
	const float3 Feature = RWImage[Position].xyz;

#if (IMAGE_VARIANCE_TYPE == IMAGE_VARIANCE_NORMAL)
	const float Value = length(Feature);
#elif (IMAGE_VARIANCE_TYPE == IMAGE_VARIANCE_GREYSCALE)
	const float Value = Luminance(Feature);
#else
	const float Value = -1.0f;
#endif

	// Explicit select instead of lerp(1, MaxValue / Value, Value > MaxValue): the lerp form
	// produces NaN when Value is 0 (0 * inf) and adds rounding error to the scale factor.
	const float ScaleFactor = (Value > MaxValue) ? (MaxValue / Value) : 1.0f;

	BRANCH
	if (ScaleFactor < 1.0f)
	{
		RWImage[Position].xyz *= ScaleFactor;
		RWImageVariance[Position][VarianceChannelOffset] *= ScaleFactor; // Scaling the std does not require power.
	}
}
//--------------------------------------------------------------------------------------------------
// Non-local mean definition and function
//--------------------------------------------------------------------------------------------------
// When NONLOCALMEAN_SEPARATE_SOURCE == 1, the NLM calculates the distance from source image to target image
// instead of on its own.
// Half width of the comparison patch: patches cover (2 * PatchSize + 1)^2 texels.
int PatchSize;
// Half width of the search window: (2 * PatchDistance + 1)^2 candidate offsets per pixel.
int PatchDistance;
// Bandwidth term; squared in the denominator of the per-pixel distance.
float Bandwidth;
// Dispatch parameters
// Index of the current dispatch; combined with DispatchTileCount to derive the offset index.
int DispatchId;
// Number of tiles per atlas row is DispatchTileSize.x (see GetAtlasPosition()).
int2 DispatchTileSize;
int DispatchTileCount;
// .xy is added to the thread id to get the image position in the distance-to-atlas kernel.
int4 SeparableFilteringRegion;
// Thread bound of the distance-to-atlas kernel; .xy is also the size of one atlas tile.
int3 DispatchRegionSize;
// Source image and its variance (std) texture.
Texture2D<TPixelValue> Image;
Texture2D<TImageVariance> Variance;
// Target image/variance: separate resources, or aliases of the source when the distance is
// computed within a single image.
#if NONLOCALMEAN_SEPARATE_SOURCE
Texture2D<TPixelValue> TargetImage;
Texture2D<TImageVariance> TargetVariance;
#else
#define TargetImage Image
#define TargetVariance Variance
#endif
// Pixel value together with its scalar variance, as sampled by GetImageContext().
struct FImageContext
{
// Pixel value loaded from the image.
TPixelValue Value;
// Variance from GetImageVariance(), multiplied by the caller's VarianceScale.
float Variance;
};
// Samples value and variance at Position, mirroring positions outside TextureSize.
// Mirroring is used because clamping to the border produces artifacts at the border.
// The variance channel is selected by the global VarianceChannelOffset and the result is
// multiplied by VarianceScale.
FImageContext GetImageContext(int2 Position, Texture2D<TPixelValue> inImage, Texture2D inVariance, float VarianceScale = 1.0f)
{
	const int2 MirroredPosition = GetMirroredPosition(Position, TextureSize);

	FImageContext Context = (FImageContext)0;
	Context.Value = GetImageValue(MirroredPosition, inImage);
	Context.Variance = GetImageVariance(MirroredPosition, inVariance, VarianceChannelOffset) * VarianceScale;
	return Context;
}
// Variance-compensated squared distance from P to Q, capped at 10000:
//   (|Cp - Cq|^2 - (VarP + min(VarP, VarQ))) / (Epsilon + Bandwidth^2 * (VarP + VarQ))
// The distance is asymmetric (d(P,Q) != d(Q,P)), so a weight cannot be shared between
// the two directions.
float GetSquaredDistance(TPixelValue Cp, float VarP, TPixelValue Cq, float VarQ)
{
	const float Epsilon = 1e-8f; // Small enough to preserve dark areas
	const float MaxDistance = 10000;

	const float Numerator = Length2(Cp - Cq) - (VarP + min(VarP, VarQ));
	const float Denominator = Epsilon + Pow2(Bandwidth) * (VarP + VarQ);
	return min(Numerator / Denominator, MaxDistance);
}
// Computes both directions of the asymmetric distance at once:
//   .x = GetSquaredDistance(P, Q), .y = GetSquaredDistance(Q, P).
// Both share the colour difference and the denominator; only the variance compensation
// term differs. Each component is capped at 10000.
float2 GetSymmetricSquaredDistance(TPixelValue Cp, float VarP, TPixelValue Cq, float VarQ)
{
	const float Epsilon = 1e-8f; // Small enough to preserve dark areas
	const float MaxDistance = 10000;

	const float SmallerVariance = min(VarP, VarQ);
	const float2 Compensation = float2((VarP + SmallerVariance), (VarQ + SmallerVariance));
	const float Denominator = Epsilon + Pow2(Bandwidth) * (VarP + VarQ);

	const float2 BothDistances = (Length2(Cp - Cq) - Compensation) / Denominator;
	return min(BothDistances, MaxDistance);
}
// Context overload: unpacks value and variance and forwards to the value-based overload.
float GetSquaredDistance(FImageContext PCtx, FImageContext QCtx)
{
	const float SquaredDistance = GetSquaredDistance(PCtx.Value, PCtx.Variance, QCtx.Value, QCtx.Variance);
	return SquaredDistance;
}
// Position overload: samples P from Image/Variance and Q from TargetImage/TargetVariance
// (with mirrored addressing) and returns the distance from P to Q.
float GetSquaredDistance(int2 P, int2 Q, float VarianceScale = 1.0f)
{
	const FImageContext SourceContext = GetImageContext(P, Image, Variance, VarianceScale);
	const FImageContext TargetContext = GetImageContext(Q, TargetImage, TargetVariance, VarianceScale);
	return GetSquaredDistance(SourceContext, TargetContext);
}
// Context overload: unpacks value and variance and forwards to the value-based overload.
float2 GetSymmetricSquaredDistance(FImageContext PCtx, FImageContext QCtx)
{
	const float2 BothDistances = GetSymmetricSquaredDistance(PCtx.Value, PCtx.Variance, QCtx.Value, QCtx.Variance);
	return BothDistances;
}
// Position overload: samples P from Image/Variance and Q from TargetImage/TargetVariance
// (with mirrored addressing) and returns both distance directions (.x = P->Q, .y = Q->P).
float2 GetSymmetricSquaredDistance(int2 P, int2 Q, float VarianceScale = 1.0f)
{
	const FImageContext SourceContext = GetImageContext(P, Image, Variance, VarianceScale);
	const FImageContext TargetContext = GetImageContext(Q, TargetImage, TargetVariance, VarianceScale);
	return GetSymmetricSquaredDistance(SourceContext, TargetContext);
}
// Mean per-pixel squared distance between the patch centred at P (source) and the patch
// centred at Q (target), clamped to be non-negative.
// Both patches span (2 * PatchSize + 1)^2 texels.
float GetPatchSquaredDistance(int2 P, int2 Q)
{
	const float VarianceScale = 1.0f;

	// TODO: z-order for performance improvement?
	float Accumulated = 0;
	for (int dy = -PatchSize; dy <= PatchSize; ++dy)
	{
		for (int dx = -PatchSize; dx <= PatchSize; ++dx)
		{
			const int2 PatchOffset = int2(dx, dy);
			Accumulated += GetSquaredDistance(P + PatchOffset, Q + PatchOffset, VarianceScale);
		}
	}

	// ||Pp - Pq||^2
	return max(0, Accumulated / Pow2(2 * PatchSize + 1));
}
// Converts already-filtered patch squared distances (one per atlas component) to weights:
// exp(-distance) for the patch-only model. P and Q are only needed by the
// distance-weighted model, which is not implemented for this overload.
// Without albedo pre-divide the weights are clamped to the smallest float16 value so they
// never reach zero.
TAtlasValueType GetNonLocalMeanWeight(int2 P, int2 Q, TAtlasValueType PatchPQSqaureDistance)
{
	TAtlasValueType Weights = (TAtlasValueType)0;

#if NONLOCALMEAN_WEIGHT_TYPE == NONLOCALMEAN_PATCHONLY
	Weights = FastExp(-PatchPQSqaureDistance);
#elif NONLOCALMEAN_WEIGHT_TYPE == NONLOCALMEAN_DISTACNE_PATCH
	#error not implemented
	const float SigmaS = 2.0f;//TODO: move to CVar
	Weights = FastExp(-length2(P - Q) / (2 * Pow2(SigmaS))) * FastExp(-PatchPQSqaureDistance);
#endif

#if PRE_ALBEDO_DIVIDE == PRE_ALBEDO_DIVIDE_DISABLED
	//without albedo divide, below is better when scattering results.
	const TAtlasValueType SmallestFloat16 = (TAtlasValueType)0.000000059604645f;
	Weights = max(SmallestFloat16, Weights);
#endif

	return Weights;
}
// Non-local mean weight between P (on the source image) and Q (on the target image).
// Patch-only model: exp(-patch distance). Distance model: additionally multiplied by a
// spatial Gaussian with SigmaS = 2.
// Without albedo pre-divide the weight is clamped to the smallest float16 value.
float GetNonLocalMeanWeight(int2 P, int2 Q)
{
	//TODO: add a box filter?
	const float PatchDistanceSquared = GetPatchSquaredDistance(P,Q);

	float Weight = 1.0f;
#if NONLOCALMEAN_WEIGHT_TYPE == NONLOCALMEAN_PATCHONLY
	Weight = FastExp(-PatchDistanceSquared);
#elif NONLOCALMEAN_WEIGHT_TYPE == NONLOCALMEAN_DISTACNE_PATCH
	const float SigmaS = 2.0f;//TODO: move to CVar
	Weight = FastExp(-length2(P - Q) / (2 * Pow2(SigmaS))) * FastExp(-PatchDistanceSquared);
#endif

#if PRE_ALBEDO_DIVIDE == PRE_ALBEDO_DIVIDE_DISABLED
	//without albedo divide, below is better when scattering results.
	const float SmallestFloat16 = 0.000000059604645f;
	Weight = max(SmallestFloat16, Weight);
#endif

	return Weight;
}
// Output of NonLocalMeanFilteringCS.
RWTexture2D<TPixelValue> DenoisedImage;
// Precomputed weights, read when NLM_WEIGHTLAYOUT is not NLM_WEIGHTLAYOUT_NONE.
Buffer<TWeightReadType> NonLocalMeanWeights;
// Number of leading channels replaced by the filtered result (multi-channel sources only).
int DenoisingChannelCount;
// Filtered region: .xy is the start position, .zw - .xy is the region size.
int4 FilteringRegion;
// Maps (pixel within region, weight index) to a scalar element index in the weight buffer
// for the active NLM_WEIGHTLAYOUT:
//  - NONE / NUM_OF_WEIGHTS_PER_PIXELxWxH: all weights of one pixel are contiguous.
//  - WxHxNUM_OF_WEIGHTS_PER_PIXEL: one W x H plane per weight index.
//  - 4xWxHxNUM_OF_WEIGHTS_BY_4: planes of 4 packed weights; the index is expressed in
//    floats, as if the buffer were accessed through Buffer<float>.
uint GetWeightBufferIndex(uint2 LocalPositionInRegion, uint WeightIndex, uint2 RegionSize)
{
	const uint PixelX = LocalPositionInRegion.x;
	const uint PixelY = LocalPositionInRegion.y;

#if (NLM_WEIGHTLAYOUT == NLM_WEIGHTLAYOUT_NONE) \
	|| (NLM_WEIGHTLAYOUT == NLM_WEIGHTLAYOUT_NUM_OF_WEIGHTS_PER_PIXELxWxH)
	const uint SearchWidth = PatchDistance * 2 + 1;
	const uint WeightsPerPixel = Pow2(SearchWidth);
	return WeightIndex + WeightsPerPixel * (PixelX + RegionSize.x * PixelY);
#elif NLM_WEIGHTLAYOUT == NLM_WEIGHTLAYOUT_WxHxNUM_OF_WEIGHTS_PER_PIXEL
	// Best if neighbour pixels query the same weight index.
	return PixelX + RegionSize.x * (PixelY + RegionSize.y * WeightIndex);
#elif NLM_WEIGHTLAYOUT == NLM_WEIGHTLAYOUT_4xWxHxNUM_OF_WEIGHTS_BY_4
	// Assume the buffer layout is 4xWxHx[N/4] and the buffer is accessed with Buffer<float>
	const uint PlaneIndex = WeightIndex / 4;
	const uint Lane = WeightIndex % 4;
	return Lane + 4 * (PixelX + RegionSize.x * (PixelY + RegionSize.y * PlaneIndex));
#else
	#error not implemented
#endif
}
// Non-local mean filter: for every pixel of FilteringRegion, writes to DenoisedImage the
// weighted average of TargetImage over the (2 * PatchDistance + 1)^2 search window.
// Weights are computed on the fly (NLM_WEIGHTLAYOUT_NONE) or read from NonLocalMeanWeights.
[numthreads(THREAD_GROUP_SIZE, THREAD_GROUP_SIZE, 1)]
void NonLocalMeanFilteringCS(in const uint3 DispatchThreadID : SV_DispatchThreadID)
{
int2 RegionSize = int2(FilteringRegion.z - FilteringRegion.x, FilteringRegion.w - FilteringRegion.y);
if (any(DispatchThreadID.xy >= RegionSize))
{
return;
}
const uint PatchWidth = PatchDistance * 2 + 1;
const uint NumOfWeightsPerPixel = Pow2(PatchWidth);
const uint2 Position = DispatchThreadID.xy + FilteringRegion.xy;
float TotalWeights = 0;
TPixelValue Result = 0;
// Each iteration consumes WEIGHT_PIXEL_INCREMENT consecutive weight indices.
for (int i = 0; i < NumOfWeightsPerPixel; i += WEIGHT_PIXEL_INCREMENT)
{
// Weight index -> offset inside the search window, centred on the current pixel.
int x = i % PatchWidth - PatchDistance;
int y = i / PatchWidth - PatchDistance;
int2 Q = Position + int2(x,y);
#if NLM_WEIGHTLAYOUT == NLM_WEIGHTLAYOUT_NONE
TWeightReadType Weight = GetNonLocalMeanWeight(Position, Q);
#else
uint WeightIndex = i;// (x + PatchDistance) + (y + PatchDistance) * PatchWidth;
// GetWeightBufferIndex() returns an index in floats; divide to index TWeightReadType elements.
uint BufferIndex = GetWeightBufferIndex(DispatchThreadID.xy, WeightIndex, RegionSize) / WEIGHT_PIXEL_INCREMENT;
TWeightReadType Weight = NonLocalMeanWeights[BufferIndex];
#endif
UNROLL
for (int j = 0; j < WEIGHT_PIXEL_INCREMENT; ++j)
{
#if WEIGHT_PIXEL_INCREMENT == 1
float LocalWeight = Weight;
#else
// Unpack component j of the packed read and recompute its window offset.
int CombinedIndex = i + j;
float LocalWeight = Weight[j];
x = CombinedIndex % PatchWidth - PatchDistance;
y = CombinedIndex / PatchWidth - PatchDistance;
Q = Position + int2(x,y);
// The last packed read may extend past the number of weights; skip the padding lanes.
// NOTE: this 'if' guards the braced block that follows the #endif.
if (CombinedIndex < NumOfWeightsPerPixel)
#endif
{
Result += LocalWeight * GetImageValue(GetMirroredPosition(Q, TextureSize), TargetImage);
TotalWeights += LocalWeight;
}
}
}
// Normalize; the lower bound avoids a division by zero when all weights vanish.
Result = Result / max(1e-6f, TotalWeights);
#if SOURCE_CHANNEL_COUNT == 1
DenoisedImage[Position] = Result;
#else
// Only the first DenoisingChannelCount channels take the filtered value; the remaining
// channels keep the unfiltered TargetImage value.
TPixelValue TargetValue = GetImageValue(GetMirroredPosition(Position, TextureSize), TargetImage);
for (int i = 0; i < DenoisingChannelCount; ++i)
{
TargetValue[i] = Result[i];
}
DenoisedImage[Position] = TargetValue;
#endif
}
// Calculate the non-local mean weights for Region
// Region processed by NonLocalMeanWeightsCS: .xy is the start, .zw - .xy is the size.
int4 Region;
// Output weight buffer, addressed according to NLM_WEIGHTLAYOUT.
RWBuffer<float> RWNonLocalMeanWeights;

// Computes, for every pixel of Region, the non-local mean weight to each of the
// (2 * PatchDistance + 1)^2 positions in its search window and stores them in
// RWNonLocalMeanWeights.
[numthreads(THREAD_GROUP_SIZE, THREAD_GROUP_SIZE, 1)]
void NonLocalMeanWeightsCS(in const uint3 DispatchThreadID : SV_DispatchThreadID)
{
	int2 RegionSize = int2(Region.z - Region.x, Region.w - Region.y);
	if (any(DispatchThreadID.xy >= RegionSize))
	{
		return;
	}

	const uint2 Position = DispatchThreadID.xy + Region.xy;
	const int SearchWidth = 2 * PatchDistance + 1;
	const int WeightsPerPixel = Pow2(SearchWidth);
	// First element of this pixel's weights in the "weights contiguous per pixel" layouts.
	const int FirstWeight = (DispatchThreadID.x + DispatchThreadID.y * RegionSize.x) * WeightsPerPixel;

	for (int WeightIndex = 0; WeightIndex < WeightsPerPixel; ++WeightIndex)
	{
		// Weight index -> offset inside the search window, centred on the current pixel.
		int2 WindowOffset = int2(WeightIndex % SearchWidth, WeightIndex / SearchWidth) - int2(PatchDistance, PatchDistance);
		int2 Q = Position + WindowOffset;

#if (NLM_WEIGHTLAYOUT == NLM_WEIGHTLAYOUT_NONE) \
	|| (NLM_WEIGHTLAYOUT == NLM_WEIGHTLAYOUT_NUM_OF_WEIGHTS_PER_PIXELxWxH)
		uint BufferIndex = FirstWeight + WeightIndex;
#else
		uint BufferIndex = GetWeightBufferIndex(DispatchThreadID.xy, WeightIndex, RegionSize);
#endif
		RWNonLocalMeanWeights[BufferIndex] = GetNonLocalMeanWeight(Position, Q);
	}
}
// Atlas receiving the per-pixel squared distances; one tile per dispatched z slice.
RWTexture2D<TAtlasValueType> RWNLMWeightAtlas;
// NOTE(review): not referenced in the part of the file reviewed here; presumably the atlas
// dimensions in texels — confirm against the C++ side.
int2 NLMWeightAtlasSize;
// Converts a 3D dispatch thread id to a texel position in the tiled atlas.
// DispatchThreadID.z selects the tile (row-major, DispatchTileSize.x tiles per row),
// DispatchThreadID.xy plus Offset is the position inside that tile.
uint2 GetAtlasPosition(uint3 DispatchThreadID, uint2 TileRegionSize,uint2 Offset = 0)
{
	const uint TileIndex = DispatchThreadID.z;
	const uint2 TileCoordinate = uint2(TileIndex % DispatchTileSize.x, TileIndex / DispatchTileSize.x);
	const uint2 TileOrigin = TileCoordinate * TileRegionSize;
	return TileOrigin + DispatchThreadID.xy + Offset;
}
// Writes per-pixel (unfiltered) squared distances to the atlas.
// Each z slice of the dispatch handles DISTANCE_QUERY_COUNT consecutive search offsets and
// packs the resulting distances into one TAtlasValueType texel of its atlas tile.
[numthreads(THREAD_GROUP_SIZE, THREAD_GROUP_SIZE, THREAD_GROUP_SIZE)]
void NonLocalMeanGetSqauredDistanceToAtlasCS(in const uint3 DispatchThreadID : SV_DispatchThreadID)
{
if (any(DispatchThreadID >= DispatchRegionSize))
{
return;
}
const float VarianceScale = 1.0f;
// Query the current position.
uint2 Position = DispatchThreadID.xy + SeparableFilteringRegion.xy;
FImageContext PContext = GetImageContext(Position, Image, Variance, VarianceScale);
// Calculate the squared distance. When source and target is different, we store the 2 consecutive
// distances instead of one pair.
// Separate source: one asymmetric distance per offset, so twice as many offsets fit in a texel.
// Same image: each offset yields a symmetric (P->Q, Q->P) pair occupying two components.
#define DISTANCE_QUERY_COUNT ((NONLOCALMEAN_SEPARATE_SOURCE+1)*NONLOCALMEAN_ATLAS_SYMMETRIC_PAIR_COUNT)
// Linear index of the first search offset handled by this z slice in this dispatch.
int BaseOffsetIndex = (DispatchId * DispatchTileCount + DispatchThreadID.z) * DISTANCE_QUERY_COUNT;
int PatchSearchWidth = PatchDistance * 2 + 1;
TAtlasValueType Distances = (TAtlasValueType)0;
FImageContext QContext;
UNROLL
for (int i = 0; i < DISTANCE_QUERY_COUNT; ++i)
{
int OffsetIndex = BaseOffsetIndex + i;
// Offset index -> offset inside the search window, centred on the current pixel.
int2 Offset = int2(OffsetIndex % PatchSearchWidth, OffsetIndex / PatchSearchWidth) - int2(PatchDistance, PatchDistance);
int2 Q = Position + Offset;
QContext = GetImageContext(Q, TargetImage, TargetVariance, VarianceScale);
#if NONLOCALMEAN_SEPARATE_SOURCE
float Distance = GetSquaredDistance(PContext, QContext);
Distances[i] = Distance;
#else
float2 Distance = GetSymmetricSquaredDistance(PContext, QContext);
Distances[i * 2 + 0] = Distance.x;
Distances[i * 2 + 1] = Distance.y;
#endif
}
uint2 AtlasPosition = GetAtlasPosition(DispatchThreadID, DispatchRegionSize.xy);
RWNLMWeightAtlas[AtlasPosition] = Distances;
}
// Output weight buffer written by the vertical pass.
RWBuffer<TAtlasValueType> RWNLMWeights;
// Atlas read by the current separable pass.
Texture2D<TAtlasValueType> NLMWeightAtlasSource;
// Atlas written by the horizontal pass.
RWTexture2D<TAtlasValueType> RWNLMWeightAtlasTarget;
// Thread bound of the separable filter dispatch.
int3 SeperableRegionSize;
// One pass of the separable box filter that averages per-pixel squared distances over the
// patch. The horizontal pass writes the row averages back to an atlas; the vertical pass
// averages those along y, converts the patch distance to a weight and stores it in
// RWNLMWeights.
[numthreads(THREAD_GROUP_SIZE, THREAD_GROUP_SIZE, THREAD_GROUP_SIZE)]
void NonLocalMeanSeperableFilterPatchSqauredDistanceCS(in const uint3 DispatchThreadID : SV_DispatchThreadID)
{
if (any(DispatchThreadID >= SeperableRegionSize))
{
return;
}
#if NONLOCALMEAN_SEPRERABLE_PASS == NONLOCALMEAN_SEPERABLE_FILTER_HORIZONTAL
int2 Direction = int2(1, 0);
uint2 Offset = 0;
#elif NONLOCALMEAN_SEPRERABLE_PASS == NONLOCALMEAN_SEPERABLE_FILTER_VERTICAL
int2 Direction = int2(0, 1);
uint2 Offset = uint2(PatchSize,PatchSize); // Don't need to run the vertical filter for the outside patch region.
#endif
uint2 AtlasPosition = GetAtlasPosition(DispatchThreadID, DispatchRegionSize.xy, Offset);
// Box filter of width (2 * PatchSize + 1) along Direction.
TAtlasValueType Sum = (TAtlasValueType)0;
for (int i = -PatchSize; i <= PatchSize; ++i)
{
int2 Pij = AtlasPosition + i * Direction;
Sum += NLMWeightAtlasSource[Pij];
}
Sum /= (2 * PatchSize + 1);
#if NONLOCALMEAN_SEPRERABLE_PASS == NONLOCALMEAN_SEPERABLE_FILTER_HORIZONTAL
RWNLMWeightAtlasTarget[AtlasPosition] = Sum;
#elif NONLOCALMEAN_SEPRERABLE_PASS == NONLOCALMEAN_SEPERABLE_FILTER_VERTICAL
// Q is a placeholder: the patch-only weight model does not use the positions.
uint2 Q = 0;
TAtlasValueType PatchSquareDistance = max(0, Sum);
TAtlasValueType PatchWeight0 = GetNonLocalMeanWeight(AtlasPosition, Q, PatchSquareDistance);
// Write to buffer with dimension layout XxYxNumB (each element is of size Count(TAtlasValueType,float)/2)
#if BUFFER_PASS_THROUGH
// x and y are unsigned: threads below PatchDistance wrap to large values and are expected to be
// rejected by the 'x < X' / 'y < Y' tests below (the '>= 0' tests are always true).
const uint x = DispatchThreadID.x - (PatchDistance);
const uint y = DispatchThreadID.y - (PatchDistance);
const uint X = SeperableRegionSize.x - 2 * PatchDistance;
const uint Y = SeperableRegionSize.y - 2 * PatchDistance;
#else
const uint x = DispatchThreadID.x;
const uint y = DispatchThreadID.y;
const uint X = SeperableRegionSize.x;
const uint Y = SeperableRegionSize.y;
#endif
const uint z = DispatchId * DispatchTileCount + DispatchThreadID.z;
const uint SaveIndex = x + X * (y + z * Y);
#if BUFFER_PASS_THROUGH
// Number of z slices that hold valid data: ceil(number of search offsets / 4).
const uint MaxZ = (Pow2(2 * PatchDistance + 1) - 1 + 4) / 4;
if (x>=0 && y >=0 && x < X && y < Y && z < MaxZ)
#endif
{
RWNLMWeights[SaveIndex] = PatchWeight0;
}
#endif
}
// Input of NonLocalMeanReshapeBufferCS: one float2 weight pair per element.
Buffer<float2> SourceBuffer;
// Output of NonLocalMeanReshapeBufferCS: scalar weights in the NLM_WEIGHTLAYOUT layout.
RWBuffer<float> RWTargetBuffer;
int4 SourceBufferDim;// B,X+2*pd, Y+2*pd, Wb
int3 TargetBufferDim;// W, X, Y
// Number of weight pairs processed per pixel (z extent of the reshape dispatch).
int HalfOffsetSearchCount;
// Reads the weight pair with pair index w at padded pixel (x, y) from SourceBuffer.
// SourceBufferDim is (B, X, Y, Wb); the element index is b + B * (x + X * (y + Y * wb))
// with b = w % B and wb = w / B. SourceBufferDim.w (Wb) is not needed for addressing.
float2 GetInputFromBuffer(uint x, uint y, uint w)
{
	const uint InnerCount = SourceBufferDim.x;
	const uint PaddedWidth = SourceBufferDim.y;
	const uint PaddedHeight = SourceBufferDim.z;

	const uint InnerIndex = w % InnerCount;
	const uint OuterIndex = w / InnerCount;

	const uint InputIndex = InnerIndex + InnerCount * (x + PaddedWidth * (y + PaddedHeight * OuterIndex));
	return SourceBuffer[InputIndex];
}
// Stores Weight for PixelPosition at search-window coordinate WeightPosition in
// RWTargetBuffer, using the active NLM_WEIGHTLAYOUT. TargetBufferDim is (W, X, Y).
void WriteWeight(uint2 PixelPosition, uint2 WeightPosition, float Weight)
{
	const uint SearchWidth = 2 * PatchDistance + 1;
	// Row-major index of the weight inside the search window.
	uint WeightIndex = WeightPosition.x + WeightPosition.y * SearchWidth;

#if (NLM_WEIGHTLAYOUT == NLM_WEIGHTLAYOUT_NONE) \
	|| (NLM_WEIGHTLAYOUT == NLM_WEIGHTLAYOUT_NUM_OF_WEIGHTS_PER_PIXELxWxH)
	// Weights of one pixel are contiguous: w + W * x + W * X * y.
	uint OutputIndex = WeightIndex + PixelPosition.x * TargetBufferDim.x + PixelPosition.y * TargetBufferDim.x * TargetBufferDim.y;
#elif NLM_WEIGHTLAYOUT == NLM_WEIGHTLAYOUT_WxHxNUM_OF_WEIGHTS_PER_PIXEL
	// One X x Y plane per weight index.
	uint OutputIndex = PixelPosition.x + TargetBufferDim.y * (PixelPosition.y + TargetBufferDim.z * WeightIndex);
#elif NLM_WEIGHTLAYOUT == NLM_WEIGHTLAYOUT_4xWxHxNUM_OF_WEIGHTS_BY_4
	// Planes of 4 packed weights.
	uint PlaneIndex = WeightIndex / 4;
	uint Lane = WeightIndex % 4;
	uint OutputIndex = Lane + 4 * (PixelPosition.x + TargetBufferDim.y * (PixelPosition.y + TargetBufferDim.z * PlaneIndex));
#endif

	RWTargetBuffer[OutputIndex] = Weight;
}
// Reshapes the float2 weight pairs in SourceBuffer (defined over the region padded by
// PatchDistance on each side) into per-pixel scalar weights in RWTargetBuffer.
// Thread (x, y, w) handles pair index w at padded pixel (x, y).
[numthreads(THREAD_GROUP_SIZE, THREAD_GROUP_SIZE, THREAD_GROUP_SIZE)]
void NonLocalMeanReshapeBufferCS(in const uint3 DispatchThreadID : SV_DispatchThreadID)
{
uint3 ReshapeDim = uint3(SourceBufferDim.y, SourceBufferDim.z, HalfOffsetSearchCount);
if (any(DispatchThreadID >= ReshapeDim.xyz))
{
return;
}
const uint x2pd = DispatchThreadID.x;
const uint y2pd = DispatchThreadID.y;
const uint w = DispatchThreadID.z;
// Get from input
// For seperate source
// both .x and .y stores Source(x,y)->Target(P)
// For same image
// .x stores (x,y)->P, weights
// .y stores P ->(x,y) weights
const float2 PatchWeights = GetInputFromBuffer(x2pd, y2pd, w);
const uint PatchWidth = 2 * PatchDistance + 1;
// Padded coordinate -> coordinate inside the unpadded target region.
int2 CoordinateXY2pd = int2(x2pd, y2pd);
int2 CoordinateXY = CoordinateXY2pd - PatchDistance;
int2 RegionSize = TargetBufferDim.yz;
#if NONLOCALMEAN_SEPARATE_SOURCE
// Each pair holds two consecutive weight indices (2w, 2w + 1) of the same pixel.
for (int i = 0; i < 2; ++i)
{
int WeightIndex = (2 * w + i);
if (WeightIndex < TargetBufferDim.x && all(CoordinateXY < RegionSize) && all(CoordinateXY >= 0))
{
int2 WeightCoordinateP = int2(WeightIndex % PatchWidth, WeightIndex / PatchWidth);
WriteWeight(CoordinateXY, WeightCoordinateP, PatchWeights[i]);
}
}
#else
// For a given w index in upper matrix, convert to (i,j), find the mapping to lower coordinate.
int2 WeightCoordinateP = int2 (w % PatchWidth, w / PatchWidth);
const int2 WeightCenterPoint = int2(PatchDistance, PatchDistance);
int2 DeltaXY = WeightCoordinateP - WeightCenterPoint;
// Q is P's window coordinate mirrored through the window centre (offset -DeltaXY).
int2 WeightCoordinateQ = WeightCenterPoint - DeltaXY;
int2 CoordinateP2pd = CoordinateXY2pd + DeltaXY;
// Save (x,y) -> P weights
if (all(CoordinateXY < RegionSize) && all(CoordinateXY >= 0))
{
WriteWeight(CoordinateXY, WeightCoordinateP, PatchWeights.x);
}
// Save P -> (x,y) weights if it's inside the region.
int2 CoordinateP = CoordinateP2pd - PatchDistance;
if (all(CoordinateP < RegionSize) && all (CoordinateP >= 0))
{
WriteWeight(CoordinateP, WeightCoordinateQ, PatchWeights.y);
}
#endif
}
//------------------------------------------------------------------------------------------------------
// Collaborative filtering
// 1. Tiling
//------------------------------------------------------------------------------------------------------
// Destination buffer and layout parameters of CopyTextureToBufferCS.
RWBuffer<float> Dest;
// First source channel to copy and number of channels copied.
int CopyChannelOffset;
int CopyChannelCount;
// Offset of the first written channel inside a pixel's slot, and the slot size in floats.
int BufferChannelOffset;
int BufferChannelSize;
// Copied region: .xy is the start, .zw - .xy is the size.
int4 CopyRegion;

// Copies CopyChannelCount channels of Source, starting at CopyChannelOffset, into Dest.
// Every pixel of CopyRegion owns BufferChannelSize consecutive floats; the channels are
// written starting at BufferChannelOffset inside that slot. Single-channel sources write
// their only value.
[numthreads(THREAD_GROUP_SIZE, THREAD_GROUP_SIZE, 1)]
void CopyTextureToBufferCS(in const uint3 DispatchThreadID : SV_DispatchThreadID)
{
	const uint2 RegionExtent = uint2(CopyRegion.z - CopyRegion.x, CopyRegion.w - CopyRegion.y);
	if (any(DispatchThreadID.xy >= RegionExtent))
	{
		return;
	}

	const uint2 TexelPosition = DispatchThreadID.xy + CopyRegion.xy;
	const int FirstElement = (DispatchThreadID.x + DispatchThreadID.y * RegionExtent.x) * BufferChannelSize + BufferChannelOffset;

	// If copy region is outside of the texture region, get a mirror.
	TPixelValue Texel = Source.Load(uint3(GetMirroredPosition(TexelPosition, TextureSize), 0));

#if SOURCE_CHANNEL_COUNT == 1
	Dest[FirstElement] = Texel;
#else
	for (int CopiedIndex = 0; CopiedIndex < CopyChannelCount; ++CopiedIndex)
	{
		Dest[FirstElement + CopiedIndex] = Texel[CopyChannelOffset + CopiedIndex];
	}
#endif
}
// Texture normalized in place by NormalizeTextureCS.
RWTexture2D<float4> RWSource;

// Divides every texel by its own w component. Texels with w == 0 are treated as w == 1,
// so they keep their xyz and end up with w = 1.
[numthreads(THREAD_GROUP_SIZE, THREAD_GROUP_SIZE, 1)]
void NormalizeTextureCS(in const uint3 DispatchThreadID : SV_DispatchThreadID)
{
	const uint2 Position = DispatchThreadID.xy;
	if (any(Position >= TextureSize))
	{
		return;
	}

	float4 Texel = RWSource[Position];
	Texel.w = (Texel.w == 0) ? 1 : Texel.w;
	RWSource[Position] = Texel / Texel.w;
}
//------------------------------------------------------------------------------------------------------
// 2. Weighted Least-square solver
// Inputs of the weighted multiplication: X and Y hold per-pixel data, W the per-pixel weights.
Buffer<float> X;
Buffer<TWeightReadType> W;
Buffer<float> Y;
// Output buffer written through WriteResult().
RWBuffer<float> Result;
// XDim.x is the inner-loop extent of the multiplication, XDim.y the number of values per
// pixel per temporal frame.
int2 XDim;
// NOTE(review): WDim is not referenced in the part of the file reviewed here.
int WDim;
int NumOfTemporalFrames;
int NumOfWeigthsPerPixelPerFrame;
// For the quadratic form X^T W X, Y is X itself, so YDim aliases XDim.
#if WEIGHTED_MULTIPLICATION_TYPE == WEIGHTED_MULTIPLICATION_GENERALIZED
int2 YDim;
#else
#define YDim XDim
#endif
// X, Y is stored per pixel, in each pixel, it stores the pixel footage for each temporal frame
// Pixel layout example with two data per temporal frame
// P1 ... Pn
// |F0 F1|F0 F1|F0 F1|
// T0 T1 T2
//
// W stores the weights of all pixels for one temporal frame, followed by the weights of all pixels for the next frame
// P1 ... Pn | P1 ... Pn|
// T0 T1 ...
// Reads element f of the per-pixel data of X at Position.
// Rows are TextureSize.x + 2 * PatchDistance pixels wide and every pixel stores
// XDim.y * NumOfTemporalFrames consecutive floats.
float GetX(uint2 Position, uint f)
{
	const uint RowPitch = TextureSize.x + 2 * PatchDistance;
	const uint FeaturesPerFrame = XDim.y;
	const uint ValuesPerPixel = FeaturesPerFrame * NumOfTemporalFrames;
	const uint PixelIndex = Position.y * RowPitch + Position.x;
	return X[PixelIndex * ValuesPerPixel + f];
}
// Reads value Db of temporal frame FrameIndex at Position; the frames of one pixel are
// stored back to back, XDim.y values each.
float GetX(uint2 Position, uint Db, uint FrameIndex)
{
	const uint FeaturesPerFrame = XDim.y;
	const uint FlatIndex = FrameIndex * FeaturesPerFrame + Db;
	return GetX(Position, FlatIndex);
}
// Reads element f of the per-pixel data of Y at Position. Uses the same layout as GetX():
// rows are TextureSize.x + 2 * PatchDistance pixels wide and every pixel stores
// YDim.y * NumOfTemporalFrames consecutive floats.
// For the quadratic form (X^T W X) Y is X, so the read is forwarded to GetX().
float GetY(int2 Position, int f)
{
#if WEIGHTED_MULTIPLICATION_TYPE == WEIGHTED_MULTIPLICATION_GENERALIZED
	// Integer arithmetic, matching GetX(): the index used to be computed in float, which
	// cannot represent every integer above 2^24 and so addressed the wrong element for
	// large images / many values per pixel.
	const uint Width = TextureSize.x + 2 * PatchDistance;
	const uint F = YDim.y;
	const uint Index = (Position.y * Width + Position.x) * (F * NumOfTemporalFrames) + f;
	return Y[Index];
#else
	return GetX(Position, f);
#endif
}
// Reads the weight WeightIndex of pixel Position for temporal frame FrameIndex from W.
// Frames are stored one after another; inside a frame the addressing follows
// NLM_WEIGHTLAYOUT. For the packed-by-4 layout the whole float4 containing the weight is
// returned and the caller selects the component (WeightIndex % 4).
TWeightReadType GetW(int2 Position, int WeightIndex, int FrameIndex)
{
	const int NumberOfPixels = TextureSize.x * TextureSize.y;
#if (NLM_WEIGHTLAYOUT == NLM_WEIGHTLAYOUT_NONE) \
	|| (NLM_WEIGHTLAYOUT == NLM_WEIGHTLAYOUT_NUM_OF_WEIGHTS_PER_PIXELxWxH)
	const int Index = NumOfWeigthsPerPixelPerFrame *(NumberOfPixels * FrameIndex + (Position.y * TextureSize.x + Position.x)) + WeightIndex;
#elif NLM_WEIGHTLAYOUT == NLM_WEIGHTLAYOUT_WxHxNUM_OF_WEIGHTS_PER_PIXEL
	const int Index = NumOfWeigthsPerPixelPerFrame * NumberOfPixels * FrameIndex
		+ Position.x + TextureSize.x * (Position.y + TextureSize.y * WeightIndex);
#elif NLM_WEIGHTLAYOUT == NLM_WEIGHTLAYOUT_4xWxHxNUM_OF_WEIGHTS_BY_4
	// float4 elements per frame: one plane of NumberOfPixels per group of 4 weights, i.e.
	// NumberOfPixels * ceil(N / 4). The ceil is evaluated first: multiplying by
	// NumberOfPixels before dividing overflows 32-bit int for large images and is not a
	// ceil when N + 3 is not a multiple of 4.
	const uint PlanesPerFrame = (NumOfWeigthsPerPixelPerFrame + 4 - 1) / 4;
	const uint FrameOffset = NumberOfPixels * PlanesPerFrame;
	const uint StartIndex = WeightIndex / 4; // return the weight stride instead of the specific weight
	const uint Index = FrameOffset * FrameIndex
		+ (Position.x + TextureSize.x * (Position.y + TextureSize.y * StartIndex));
#else
	#error not implemented
#endif
	return W[Index];
}
// Returns (X feature count, Y feature count) after the optional constant dimension is appended.
// X always receives the extra dimension; Y only receives it outside the GENERALIZED mode.
int2 GetModifiedXyYyDimension()
{
	int2 ModifiedDim;
	ModifiedDim.x = XDim.y + APPEND_CONSTANT_DIMENSION_TO_X;
#if WEIGHTED_MULTIPLICATION_TYPE == WEIGHTED_MULTIPLICATION_GENERALIZED
	ModifiedDim.y = YDim.y;
#else
	ModifiedDim.y = YDim.y + APPEND_CONSTANT_DIMENSION_TO_X;
#endif
	return ModifiedDim;
}
// Stores `Value` as element `f` of the result matrix that belongs to the pixel at `Position`.
// Every pixel owns (modified X dim * modified Y dim) consecutive floats in Result.
void WriteResult(int2 Position, int f, float Value)
{
	const int2 MatrixDim = GetModifiedXyYyDimension();
	const int ElementsPerPixel = MatrixDim.x * MatrixDim.y;
	const int PixelIndex = Position.y * TextureSize.x + Position.x;
	Result[PixelIndex * ElementsPerPixel + f] = Value;
}
// NOTE(review): presumably provides FDFScalar and the DF* helpers used by SolveMatrixMultiplicationPerMatrixItem - confirm.
#include "/Engine/Private/DoubleFloat.ush"
// Increment of the sampling loops in MatrixMultiplicationSmallMatrixOptimization when USE_SAMPLING_STEP is enabled.
int SamplingStep;
// Reference path: computes the weighted product for the pixel at `Position` one matrix entry at a time.
// For every entry (i, k) it sums x_i * sqrt(w) * y_k over all XDim.x observations using double-float
// accumulation, then writes the demoted result through WriteResult().
//   Position             - pixel whose patch is processed.
//   ModifiedXyYyDim      - (rows, columns) of the output matrix, see GetModifiedXyYyDimension().
//   PatchWidth           - side length of the patch, used to turn a flat patch index into a 2D offset.
//   SingleFrameDataCount - number of observations contributed by a single frame.
// The body compiles to nothing for the 4xWxHxNUM_OF_WEIGHTS_BY_4 weight layout.
void SolveMatrixMultiplicationPerMatrixItem(int2 Position, int2 ModifiedXyYyDim, int PatchWidth, int SingleFrameDataCount)
{
#if NLM_WEIGHTLAYOUT != NLM_WEIGHTLAYOUT_4xWxHxNUM_OF_WEIGHTS_BY_4
for (int i = 0; i < ModifiedXyYyDim.x; ++i)
{
#if WEIGHTED_MULTIPLICATION_TYPE == WEIGHTED_MULTIPLICATION_QUADRATIC
// Only entries with k >= i are computed; the mirrored entry is written below.
for (int k = i; k < ModifiedXyYyDim.y; ++k)
#else
for (int k = 0; k < ModifiedXyYyDim.y; ++k)
#endif
{
FDFScalar R = (FDFScalar)0;
for (int j = 0; j < XDim.x; ++j)
{
// Split the flat observation index into (frame, entry within the patch).
int IndexWithinFrame = j % SingleFrameDataCount;
int2 Offset = int2(IndexWithinFrame % PatchWidth, IndexWithinFrame / PatchWidth);
int frame_index = j / SingleFrameDataCount;
float w = GetW(Position, IndexWithinFrame, frame_index); // get w, cache ???, 19*19*T
int2 TargetPosition = Position + Offset;
#if APPEND_CONSTANT_DIMENSION_TO_X
// Indices past the stored features refer to the appended constant dimension, whose value is 1.
float x = 1.0f;
float y = 1.0f;
BRANCH
if (i < XDim.y)
{
x = GetX(TargetPosition, XDim.y * frame_index + i);
}
BRANCH
if (k < YDim.y)
{
y = GetY(TargetPosition, YDim.y * frame_index + k);
}
#else
float x = GetX(TargetPosition, XDim.y * frame_index + i); // get x from the ith row X^T
float y = GetY(TargetPosition, YDim.y * frame_index + k); // get y from the kth column of Y
#endif
// Accumulate x * sqrt(w) * y in double-float arithmetic.
R = DFAdd(R, DFMultiply(DFTwoSum(x, 0.0f), DFMultiply(DFTwoSum(sqrt(w), 0.0f), DFTwoSum(y, 0.0f))));
}
float RResolved = DFDemote(R);
WriteResult(Position, i * ModifiedXyYyDim.y + k, RResolved);
#if WEIGHTED_MULTIPLICATION_TYPE == WEIGHTED_MULTIPLICATION_QUADRATIC
// Mirror the value into the transposed entry (k, i).
WriteResult(Position, k * ModifiedXyYyDim.y + i, RResolved);
#endif
}
}
#endif // NLM_WEIGHTLAYOUT != NLM_WEIGHTLAYOUT_4xWxHxNUM_OF_WEIGHTS_BY_4
}
// Number of floats accumulated per pixel by MatrixMultiplicationSmallMatrixOptimization.
// With N = NUM_FEATURE + APPEND_CONSTANT_DIMENSION_TO_X:
//  - QUADRATIC: only entries with k >= i of the N x N product are stored, one slot per entry,
//    i.e. the upper triangle including the diagonal = N * (N + 1) / 2 slots.
//    (The former (N * N + 1) / 2 is smaller than that for every N >= 2.)
//  - GENERALIZED: N rows times the three components of FFeatureYCache.
#if WEIGHTED_MULTIPLICATION_TYPE == WEIGHTED_MULTIPLICATION_QUADRATIC
#define MATRIX_CACHE_SIZE (((NUM_FEATURE + APPEND_CONSTANT_DIMENSION_TO_X) * (NUM_FEATURE + APPEND_CONSTANT_DIMENSION_TO_X + 1)) / 2)
#elif WEIGHTED_MULTIPLICATION_TYPE == WEIGHTED_MULTIPLICATION_GENERALIZED
#define MATRIX_CACHE_SIZE ((NUM_FEATURE + APPEND_CONSTANT_DIMENSION_TO_X)*3)
#endif
// Per-thread accumulator for the output matrix of MatrixMultiplicationSmallMatrixOptimization.
// Entries are stored in the order in which that function's (i, k) loops visit them.
struct FMatrixCache
{
float Data[MATRIX_CACHE_SIZE];
};
// Local copy of the X features of one pixel for one frame, filled by FillFeatureXCache().
// Holds one extra slot when APPEND_CONSTANT_DIMENSION_TO_X is 1.
struct FFeatureXCache
{
float Data[NUM_FEATURE + APPEND_CONSTANT_DIMENSION_TO_X];
};
// Local copy of the Y values of one pixel for one frame, filled by FillFeatureYCache().
// QUADRATIC mode mirrors the X cache layout; every other mode stores three components.
struct FFeatureYCache
{
#if WEIGHTED_MULTIPLICATION_TYPE == WEIGHTED_MULTIPLICATION_QUADRATIC
float Data[NUM_FEATURE + APPEND_CONSTANT_DIMENSION_TO_X];
#else
float3 Data;
#endif
};
// Copies the XDim.y features of the pixel at `Position`, starting at flat feature index
// `FrameFeatureOffset`, into `FeatureCache`. When the constant dimension is appended,
// the last slot (ModifiedXyYyDim.x - 1) is set to 1.
void FillFeatureXCache(out FFeatureXCache FeatureCache,uint2 Position, int2 ModifiedXyYyDim, uint FrameFeatureOffset)
{
	for (int FeatureIndex = 0; FeatureIndex < XDim.y; ++FeatureIndex)
	{
		FeatureCache.Data[FeatureIndex] = GetX(Position, FrameFeatureOffset + FeatureIndex);
	}

#if APPEND_CONSTANT_DIMENSION_TO_X
	const int ConstantSlot = ModifiedXyYyDim.x - 1;
	FeatureCache.Data[ConstantSlot] = 1.0f;
#endif
}
// Copies the YDim.y values of the pixel at `Position`, starting at flat index
// `FrameFeatureOffset`, into `FeatureCache`. `ModifiedXyYyDim` is not used here.
void FillFeatureYCache(out FFeatureYCache FeatureCache, uint2 Position, int2 ModifiedXyYyDim, uint FrameFeatureOffset)
{
	for (int ComponentIndex = 0; ComponentIndex < YDim.y; ++ComponentIndex)
	{
		FeatureCache.Data[ComponentIndex] = GetY(Position, FrameFeatureOffset + ComponentIndex);
	}
}
// Frame that the SAMPLING_TYPE_FOCUS_CURRENT_FRAME sampling in MatrixMultiplicationSmallMatrixOptimization reads from.
int SourceFrameIndex;
// Optimized path: computes the whole weighted product for the pixel at `Position` in a single sweep over
// the observations. Each visited observation loads its features once into local caches and adds
// x_i * sqrt(w) * y_k to every entry of a local matrix cache; the cache is written to Result at the end.
//   Position             - pixel whose patch is processed.
//   ModifiedXyYyDim      - (rows, columns) of the output matrix, see GetModifiedXyYyDimension().
//   PatchWidth           - side length of the patch, used to turn a flat patch index into a 2D offset.
//   SingleFrameDataCount - number of observations contributed by a single frame.
// The preprocessor picks one of three outer loop shapes (incremental sampling, current-frame-focused
// sampling, or a full patch x frame sweep); they all share the loop body that starts at `int frame_index = t;`.
void MatrixMultiplicationSmallMatrixOptimization(int2 Position, int2 ModifiedXyYyDim, int PatchWidth, int SingleFrameDataCount)
{
FMatrixCache MatrixCache = (FMatrixCache)0;
int NumOfFrames = XDim.x / SingleFrameDataCount;
int Index = 0;
#if USE_SAMPLING_STEP && (NLM_WEIGHTLAYOUT != NLM_WEIGHTLAYOUT_4xWxHxNUM_OF_WEIGHTS_BY_4)
#if SAMPLING_TYPE == SAMPLING_TYPE_INCREMENTAL
//This sampling might lead to incorrect DOF when there is ambiguity using feature to predict
//non-DOF background in previous/future frames, and DOF foreground that is the current frame.
//we might need to apply regression for each frame instead of using combined frames.
// Visit every SamplingStep-th observation; the step maps to (patch entry = step / frames, frame = step % frames).
for (int step = 0; step < XDim.x; step+=SamplingStep)
{
int IndexWithinFrame = step / NumOfFrames;
int2 Offset = int2(IndexWithinFrame % PatchWidth, IndexWithinFrame / PatchWidth);
int2 TargetPosition = Position + Offset;
int t = step % NumOfFrames;
#elif SAMPLING_TYPE == SAMPLING_TYPE_FOCUS_CURRENT_FRAME
// focus sampling more to the current frames.
const int SourceFrame = SourceFrameIndex;
int PreviousSamplingFrame = NumOfFrames - 1;
int IndexWithinFrame = 0;
for (int step = 0; step < XDim.x && IndexWithinFrame < SingleFrameDataCount-1; step += SamplingStep)
{
int t = step % NumOfFrames;
// Both branches sample from SourceFrame; the patch entry only advances when the wrapped frame
// counter drops below the previously seen one.
if (t < PreviousSamplingFrame && NumOfFrames > 1)
{
PreviousSamplingFrame = t;
t = SourceFrame;
IndexWithinFrame += 1;
}
else
{
int Tmp = t;
t = SourceFrame;
PreviousSamplingFrame = Tmp;
}
int2 Offset = int2(IndexWithinFrame % PatchWidth, IndexWithinFrame / PatchWidth);
int2 TargetPosition = Position + Offset;
#endif
#else
// Full sweep: every patch entry (in steps of WEIGHT_PIXEL_INCREMENT) for every frame.
for (int j = 0; j < SingleFrameDataCount; j += WEIGHT_PIXEL_INCREMENT)
{
int IndexWithinFrame = j;
int2 Offset = int2(IndexWithinFrame % PatchWidth, IndexWithinFrame / PatchWidth);
int2 TargetPosition = Position + Offset;
for (int t = 0; t < NumOfFrames; ++t)
#endif
{
int frame_index = t;
TWeightReadType sqrtw = sqrt(GetW(Position, IndexWithinFrame, frame_index)); // get w.
UNROLL
for (int WeightStrideIndex = 0; WeightStrideIndex < WEIGHT_PIXEL_INCREMENT; ++WeightStrideIndex)
{
#if WEIGHT_PIXEL_INCREMENT == 1
float LocalWeight = sqrtw;
#else
// Several weights were fetched at once: resolve the patch entry of this component and skip
// components that fall past the end of the patch.
IndexWithinFrame = j + WeightStrideIndex;
Offset = int2(IndexWithinFrame % PatchWidth, IndexWithinFrame / PatchWidth);
TargetPosition = Position + Offset;
float LocalWeight = sqrtw[WeightStrideIndex];
if (IndexWithinFrame < SingleFrameDataCount)
#endif
{
#if WEIGHTED_MULTIPLICATION_TYPE == WEIGHTED_MULTIPLICATION_GENERALIZED
FFeatureYCache Y;
FillFeatureYCache(Y, TargetPosition, ModifiedXyYyDim, YDim.y * frame_index);
#endif
FFeatureXCache X;
FillFeatureXCache(X, TargetPosition, ModifiedXyYyDim, XDim.y * frame_index);
// Accumulate this observation into every cached matrix entry; Index walks the cache in (i, k) order.
Index = 0;
for (int i = 0; i < ModifiedXyYyDim.x; ++i)
{
#if WEIGHTED_MULTIPLICATION_TYPE == WEIGHTED_MULTIPLICATION_QUADRATIC
for (int k = i; k < ModifiedXyYyDim.y; ++k)
#else
for (int k = 0; k < ModifiedXyYyDim.y; ++k)
#endif
{
float x = X.Data[i];
#if WEIGHTED_MULTIPLICATION_TYPE == WEIGHTED_MULTIPLICATION_GENERALIZED
float y = Y.Data[k];
#else
float y = X.Data[k];
#endif
MatrixCache.Data[Index] += x * LocalWeight * y;
++Index;
}
}
}
}
}
}
// Flush the cache to Result using the same (i, k) visiting order as the accumulation above.
Index = 0;
for (int i = 0; i < ModifiedXyYyDim.x; ++i)
{
#if WEIGHTED_MULTIPLICATION_TYPE == WEIGHTED_MULTIPLICATION_QUADRATIC
for (int k = i; k < ModifiedXyYyDim.y; ++k)
#else
for (int k = 0; k < ModifiedXyYyDim.y; ++k)
#endif
{
float RResolved = MatrixCache.Data[Index++];
WriteResult(Position, i * ModifiedXyYyDim.y + k, RResolved);
#if (WEIGHTED_MULTIPLICATION_TYPE == WEIGHTED_MULTIPLICATION_QUADRATIC)
// Mirror the value into the transposed entry (k, i).
WriteResult(Position, k * ModifiedXyYyDim.y + i, RResolved);
#endif
}
}
}
// Compute entry point: one thread per pixel computes that pixel's X^T W Y (or X^T W X) product.
// Threads outside TextureSize exit immediately.
[numthreads(THREAD_GROUP_SIZE, THREAD_GROUP_SIZE, 1)]
void InPlaceBatchedMatrixMultiplicationCS(in const uint3 DispatchThreadID : SV_DispatchThreadID)
{
	const bool bOutsideTexture = any(DispatchThreadID.xy >= TextureSize);
	if (bOutsideTexture)
	{
		return;
	}

	const int2 PixelPosition = DispatchThreadID.xy;
	const int PatchSideLength = 2 * PatchDistance + 1;
	const int ObservationsPerFrame = Pow2(2 * PatchDistance + 1);//TODO

	//======================================================================================================
	// Per pixel, evaluate X^T W * Y or X^T W * X; asymptotic complexity O(n^3).
	// Original cost estimate:
	//   FxAxN = 21*N   for X^T W Y, where N = 19*19*T and T = 1,3,5
	//   FxNxN = 21*N^2 for X^T W X
	// The time spent on the quadratic form grows quadratically: 1, 9, 25.
	// Possible improvement:
	//   1. Following the "one in ten/twenty rules", fewer observations may be enough.
	//      E.g., if 7x3 weights have to be predicted, 19*19*5 observations could be reduced to
	//      19*19*(1~2) observations distributed over 5 frames.
	const int2 OutputMatrixDim = GetModifiedXyYyDimension();

#if SMALL_MATRIX_OPTIMIZE
	MatrixMultiplicationSmallMatrixOptimization(PixelPosition, OutputMatrixDim, PatchSideLength, ObservationsPerFrame);
#else
	SolveMatrixMultiplicationPerMatrixItem(PixelPosition, OutputMatrixDim, PatchSideLength, ObservationsPerFrame);
#endif
}
// Per-solver settings: iteration count and initial guess, defined before NFORRegression.ush is included.
#if LINEAR_SOLVER_TYPE == LINEAR_SOLVER_TYPE_NEWTON_SCHULZ
#define NUM_NEWTON_SCHULTZ_ITERATIONS 20
#define NEWTON_INITIAL_GUESS_TYPE INITIAL_GUESS_EUCLIDEAN_NORM
#elif LINEAR_SOLVER_TYPE == LINEAR_SOLVER_TYPE_NEWTON_CHOLESKY
// lambda = 2.5e-3, newton iteration count = 3, equivalent to 20 newton iterations.
#define NUM_NEWTON_SCHULTZ_ITERATIONS 3
#define NEWTON_INITIAL_GUESS_TYPE INITIAL_GUESS_INVERSE_CHOLESKY_DECOMPOSITION
#endif
#define MATRIX_DIM NUM_FEATURE
#include "NFORRegression.ush"
// Passed unchanged to the solver selected in LinearSolveEntry().
float Lambda;
// Slots indexed by OUTPUT_MATRIX_TYPE_SUCCESS / OUTPUT_MATRIX_TYPE_FAIL are atomic counters;
// matrix indices are stored from slot 2 onwards (see LinearSolveEntry()).
RWBuffer<uint> RWSuccessAndFailIndexBuffer;
// Solves one linear system with the solver selected by LINEAR_SOLVER_TYPE and, when OUTPUT_INDICES
// is enabled, records the matrix index in the list that matches the solver's outcome.
//   AOffset, ASize - offset and element count of the A matrix, forwarded to the solver.
//   BOffset        - offset of the B matrix, forwarded to the solver.
//   MatrixIndex    - index recorded in RWSuccessAndFailIndexBuffer.
// List layout: slots 0 and 1 are the two counters, followed by the indices. The list whose counter
// lives in slot 0 grows forwards from the start, the other grows backwards from element NumOfElements - 1.
void LinearSolveEntry(uint AOffset, uint ASize, uint BOffset, uint MatrixIndex)
{
#if (LINEAR_SOLVER_TYPE == LINEAR_SOLVER_TYPE_NEWTON_SCHULZ || \
	LINEAR_SOLVER_TYPE == LINEAR_SOLVER_TYPE_NEWTON_CHOLESKY)
	bool bComplete = NewtonIterativeSolve(AOffset, ASize, BOffset, Lambda, Result);
#elif LINEAR_SOLVER_TYPE == LINEAR_SOLVER_TYPE_CHOLESKY
	bool bComplete = CholeskeySolve(AOffset, ASize, BOffset, Lambda, Result);
#endif

#if OUTPUT_INDICES
	// Which of the two lists receives this matrix (false -> counter slot 0, true -> counter slot 1).
	const bool bUseBackList = bComplete ? OUTPUT_MATRIX_TYPE_SUCCESS : OUTPUT_MATRIX_TYPE_FAIL;
	const uint CounterSlot = bUseBackList ? 1u : 0u;

	uint IndexToStore = 0;
	InterlockedAdd(RWSuccessAndFailIndexBuffer[CounterSlot], 1, IndexToStore);

	// Integer select instead of lerp(): lerp() evaluates in float, which cannot represent
	// every integer above 2^24 and would corrupt the slot index for large element counts.
	if (bUseBackList)
	{
		IndexToStore = NumOfElements - IndexToStore - 1;
	}
	RWSuccessAndFailIndexBuffer[IndexToStore + 2] = MatrixIndex;
#endif
}
// Compute entry point: one thread per matrix. The 2D thread id is flattened with
// NumOfElementsPerRow; threads past the row end or past NumOfElements do nothing.
[numthreads(THREAD_GROUP_SIZE, THREAD_GROUP_SIZE, 1)]
void LinearSolverCS(in const uint3 DispatchThreadID : SV_DispatchThreadID)
{
	const int MatrixIndex = DispatchThreadID.x + DispatchThreadID.y * NumOfElementsPerRow;

	const bool bPastLastMatrix = MatrixIndex >= NumOfElements;
	const bool bPastRowEnd = DispatchThreadID.x >= NumOfElementsPerRow;
	if (bPastLastMatrix || bPastRowEnd)
	{
		return;
	}

	// Every matrix owns ADim.x * ADim.y floats of A and BDim.x * BDim.y floats of B.
	const int ASize = ADim.x * ADim.y;
	const int BSize = BDim.x * BDim.y;
	const int AOffset = MatrixIndex * ASize;
	const int BOffset = MatrixIndex * BSize;

	LinearSolveEntry(AOffset, ASize, BOffset, MatrixIndex);
}
// Receives the dispatch arguments written by LinearSolverBuildIndirectDispatchArgsCS.
RWBuffer<uint> RWIndirectDispatchArgsBuffer;
// Read-only view of the counter/index list; slot INPUT_MATRIX_TYPE holds the number of entries to process.
Buffer<uint> SuccessAndFailIndexBuffer;
// Single-thread pass that converts the number of recorded matrices of type INPUT_MATRIX_TYPE
// into indirect dispatch arguments: enough groups of THREAD_GROUP_SIZE^2 threads to cover
// them all, or an all-zero dispatch when there are none.
[numthreads(1, 1, 1)]
void LinearSolverBuildIndirectDispatchArgsCS(in const uint3 DispatchThreadID : SV_DispatchThreadID)
{
	const uint NumOfIndices = SuccessAndFailIndexBuffer[INPUT_MATRIX_TYPE];

	if (NumOfIndices == 0)
	{
		WriteDispatchIndirectArgs(RWIndirectDispatchArgsBuffer, 0, 0, 0, 0);
		return;
	}

	// Round up so a partially filled last group is still dispatched.
	const uint ThreadsPerGroup = (THREAD_GROUP_SIZE * THREAD_GROUP_SIZE);
	const uint NumOfGroupsX = (NumOfIndices + ThreadsPerGroup - 1) / ThreadsPerGroup;
	WriteDispatchIndirectArgs(RWIndirectDispatchArgsBuffer, 0, NumOfGroupsX, 1, 1);
}
// Number of recorded matrices of type INPUT_MATRIX_TYPE, loaded once per group by thread 0.
groupshared uint CompactNumOfElements;

// Compute entry point for the indirect solver pass: one thread per recorded matrix of type
// INPUT_MATRIX_TYPE. The matrix index is looked up in SuccessAndFailIndexBuffer (entries start
// at slot 2); type 0 is read forwards from the start, any other type backwards from element
// NumOfElements - 1, mirroring how LinearSolveEntry() stores them.
[numthreads(THREAD_GROUP_SIZE*THREAD_GROUP_SIZE, 1, 1)]
void LinearSolverIndirectCS(in const uint3 DispatchThreadID : SV_DispatchThreadID, uint GI : SV_GroupIndex)
{
	if (GI == 0)
	{
		CompactNumOfElements = SuccessAndFailIndexBuffer[INPUT_MATRIX_TYPE];
	}
	// Every thread reaches the barrier before any of them may exit.
	GroupMemoryBarrierWithGroupSync();

	const uint ElementIndex = DispatchThreadID.x;
	if (ElementIndex >= CompactNumOfElements)
	{
		return;
	}

	// Integer select instead of lerp(): lerp() evaluates in float, which cannot represent
	// every integer above 2^24 and would fetch the wrong entry for large element counts.
	const uint ResolvedElementIndex = (INPUT_MATRIX_TYPE != 0) ? (NumOfElements - 1 - ElementIndex) : ElementIndex;
	const uint MatrixIndex = SuccessAndFailIndexBuffer[2 + ResolvedElementIndex];

	const int BSize = BDim.x * BDim.y;
	const int BOffset = MatrixIndex * BSize;
	const int ASize = ADim.x * ADim.y;
	const int AOffset = MatrixIndex * ASize;
	LinearSolveEntry(AOffset, ASize, BOffset, MatrixIndex);
}
//===============================================================================================
// Reconstruct spatial temporal images
// Temporal frame whose weights and features ReconstructSpatialTemporalImageCS reads.
int FrameIndex;
float AlbedoOffset;
// Target of the GATHER reconstruction path.
RWTexture2D<float4> RWReconstruction;
// Fixed-point accumulation target of the SCATTER path when 64-bit image atomics are unavailable.
RWStructuredBuffer<uint4> RWReconstructBuffer;
// 64-bit accumulation target of the SCATTER path; each pixel occupies four consecutive texels in x.
RWTexture2D<UlongType> RWReconstructBuffer64;
// Reads element `f` of the BDim.x * BDim.y matrix that belongs to the pixel at `Position`.
float GetB(int2 Position, int f)
{
	const int ElementsPerPixel = (BDim.x*BDim.y);
	const int PixelIndex = Position.y * TextureSize.x + Position.x;
	return B[PixelIndex * ElementsPerPixel + f];
}
// Packs each non-negative component into 16.16 fixed point: integer part in the upper 16 bits,
// fraction in the lower 16 bits. Negative inputs are clamped to 0.
// The fraction is scaled by 65536 so that one unit equals 1/65536. That is the unit DecodeFloat4()
// divides by, and the unit implied when summed fractions carry into the integer bits during
// atomic accumulation. frac() is always < 1, so the scaled value still fits in 16 bits.
uint4 EncodeFloat4(float4 Value)
{
	const float4 Clamped = max(Value, 0.0f);
	const uint4 Fractions = uint4(frac(Clamped) * 65536.0f);
	const uint4 Integers = uint4(Clamped);
	return (Integers << 16) | (Fractions & 0xffff);
}
// Unpacks 16.16 fixed-point components: upper 16 bits are the integer part,
// lower 16 bits are the fraction in units of 1/65536.
float4 DecodeFloat4(uint4 Value)
{
	const float4 IntegerPart = float4(Value>>16);
	const float4 FractionPart = float4(Value & 0xffff)/65536;
	return IntegerPart + FractionPart;
}
// Atomically adds `Value` (clamped to >= 0) to the accumulation target at `Position`.
//   Position        - destination pixel.
//   Value           - four components to accumulate.
//   NumOfScattering - used on the 64-bit path to cap the integer part at 0xffffffff / NumOfScattering.
// With 64-bit image atomics each component is added as (integer part : 32-bit fraction) into its own
// texel of RWReconstructBuffer64; otherwise the components are packed by EncodeFloat4() and added to RWReconstructBuffer.
void SaveFloat4ToBufferOrTexture(uint2 Position, float4 Value, uint NumOfScattering)
{
Value = max(Value, 0);
#if COMPILER_SUPPORTS_UINT64_IMAGE_ATOMICS
float4 XBWFraction = frac(Value);
uint4 XBWInteger = Value - XBWFraction;
XBWInteger = min(XBWInteger, 0xffffffff / NumOfScattering); // Avoid overflow.
uint4 XBWFractionAsInteger = uint4(XBWFraction * 0xffffffff);
{
// One 64-bit texel per component: the pixel's R, G, B, A live at x * 4 + 0..3.
const UlongType R = PackUlongType(uint2(XBWFractionAsInteger.r, XBWInteger.r));
ImageInterlockedAddUInt64(RWReconstructBuffer64, uint2(Position.x * 4 + 0, Position.y), R);
const UlongType G = PackUlongType(uint2(XBWFractionAsInteger.g, XBWInteger.g));
ImageInterlockedAddUInt64(RWReconstructBuffer64, uint2(Position.x * 4 + 1, Position.y), G);
const UlongType B = PackUlongType(uint2(XBWFractionAsInteger.b, XBWInteger.b));
ImageInterlockedAddUInt64(RWReconstructBuffer64, uint2(Position.x * 4 + 2, Position.y), B);
const UlongType A = PackUlongType(uint2(XBWFractionAsInteger.a, XBWInteger.a));
ImageInterlockedAddUInt64(RWReconstructBuffer64, uint2(Position.x * 4 + 3, Position.y), A);
}
#else
// The fallback buffer is addressed with a row width of TextureSize.x + 2 * PatchDistance.
int IntermediateBufferIndex = Position.x + Position.y * (TextureSize.x + 2 * PatchDistance);
uint4 XBWEncoded = EncodeFloat4(Value);
// NOTE(review): OldValue only receives the previous values from the atomics and is never read afterwards.
float4 OldValue = 0;
InterlockedAdd(RWReconstructBuffer[IntermediateBufferIndex].x, XBWEncoded.x, OldValue.x);
InterlockedAdd(RWReconstructBuffer[IntermediateBufferIndex].y, XBWEncoded.y, OldValue.y);
InterlockedAdd(RWReconstructBuffer[IntermediateBufferIndex].z, XBWEncoded.z, OldValue.z);
InterlockedAdd(RWReconstructBuffer[IntermediateBufferIndex].w, XBWEncoded.w, OldValue.w);
#endif //COMPILER_SUPPORTS_UINT64_IMAGE_ATOMICS
}
// Compute entry point: one thread per pixel. The thread loads its pixel's regression weights from B,
// applies them to the features of every pixel in its patch for frame `FrameIndex`, and combines the
// results weighted by the per-patch-entry weights from GetW().
//  - RECONSTRUCTION_TYPE_SCATTER: each weighted result is atomically added at the patch pixel's position.
//  - RECONSTRUCTION_TYPE_GATHER: the weighted results are summed and added to RWReconstruction once.
[numthreads(THREAD_GROUP_SIZE, THREAD_GROUP_SIZE, 1)]
void ReconstructSpatialTemporalImageCS(in const uint3 DispatchThreadID : SV_DispatchThreadID)
{
if (any(DispatchThreadID.xy >= TextureSize))
{
return;
}
int2 Position = DispatchThreadID.xy;
const int NumOfFeatures = XDim.y;
const int PatchSideLength = 2 * PatchDistance + 1;
// Get the weights for all the pixel's patch's reconstruction
// NOTE(review): BWeightStride is not referenced again in this function.
const int BWeightStride = BDim.x * BDim.y;
float3 Weights[NUM_FEATURE]; // 6x3 matrix.
// checkf(NUM_FEATURE == XDim.y)
UNROLL
for (int i = 0; i < NUM_FEATURE; ++i)
{
// Row i of B holds the three output-channel weights of feature i.
float3 weight = 0;
weight.x = GetB(Position, i * BDim.y + 0);
weight.y = GetB(Position, i * BDim.y + 1);
weight.z = GetB(Position, i * BDim.y + 2);
Weights[i] = weight;
}
const int WeightStride = Pow2(PatchSideLength);
// NOTE(review): XStride is not referenced again in this function.
const int XStride = XDim.y * WeightStride;
float4 GatheredValue = 0;
#if PRE_ALBEDO_DIVIDE == PRE_ALBEDO_DIVIDE_EACH
float3 Albedo = 0;
#endif
// For center frame, reconstruct and scatter the result
for (int PixelIndex = 0; PixelIndex < WeightStride; PixelIndex += WEIGHT_PIXEL_INCREMENT)
{
int x = PixelIndex % PatchSideLength;
int y = PixelIndex / PatchSideLength;
TWeightReadType PixelWeights = GetW(Position, PixelIndex, FrameIndex);
UNROLL
for (int SubStrideIndex = 0; SubStrideIndex < WEIGHT_PIXEL_INCREMENT; ++SubStrideIndex)
{
#if WEIGHT_PIXEL_INCREMENT == 1
float LocalWeight = PixelWeights;
#else
// Several weights were fetched at once: resolve the patch entry of this component and skip
// components that fall past the end of the patch.
int ResolvedPixelIndex = PixelIndex + SubStrideIndex;
x = ResolvedPixelIndex % PatchSideLength;
y = ResolvedPixelIndex / PatchSideLength;
float LocalWeight = PixelWeights[SubStrideIndex];
if (ResolvedPixelIndex < WeightStride)
#endif
{
uint2 TargetPosition = Position + int2(x, y);
// Denoised = sum over features of feature value * regression weight.
float3 Denoised = 0;
for (int i = 0; i < NUM_FEATURE; ++i)
{
// Features with an index >= NumOfFeatures are treated as the constant 1.
float FeatureI = 1.0f;
if (i < NumOfFeatures)
{
FeatureI = GetX(TargetPosition, i, FrameIndex);
#if PRE_ALBEDO_DIVIDE == PRE_ALBEDO_DIVIDE_EACH
//Assume the first three channels are albedo. TODO: use passed in index
if (i < 3)
{
Albedo[i] = FeatureI;
}
#endif
}
Denoised += FeatureI * Weights[i];
}
#if RECONSTRUCTION_TYPE == RECONSTRUCTION_TYPE_SCATTER
#if PRE_ALBEDO_DIVIDE == PRE_ALBEDO_DIVIDE_EACH
Denoised *= RADIANCE_PREPROCESS_SCALE_FACTOR * Albedo;
#elif PRE_ALBEDO_DIVIDE == PRE_ALBEDO_DIVIDE_FINAL
Denoised *= RADIANCE_PREPROCESS_SCALE_FACTOR;
#endif
// Scatter the result instead of gathering for the current frame to get high frequency results (better reconstruction).
// rgb carries the weighted color, a carries the weight itself.
float4 XBW = float4(Denoised * LocalWeight, LocalWeight);
SaveFloat4ToBufferOrTexture(TargetPosition, XBW, WeightStride);
#elif RECONSTRUCTION_TYPE == RECONSTRUCTION_TYPE_GATHER
#if PRE_ALBEDO_DIVIDE == PRE_ALBEDO_DIVIDE_EACH
Denoised *= Albedo;
#endif
GatheredValue += float4(Denoised * LocalWeight, LocalWeight);
#endif
}
}
}
#if RECONSTRUCTION_TYPE == RECONSTRUCTION_TYPE_GATHER
// For other frames, after reconstruction, perform reconstruction with the weights.
// The gathered sum is written at the pixel position shifted by PatchDistance.
RWReconstruction[Position + PatchDistance] += GatheredValue;
#endif
}
// Packed accumulation buffer read by DecodeFloat4FromBufferOrTexture() when 64-bit image atomics are unavailable.
StructuredBuffer<uint4> StructuredBufferSource;
// 64-bit accumulation texture read by DecodeFloat4FromBufferOrTexture(); four texels in x per pixel.
Texture2D<UlongType> ReconstructBuffer64;
// Converts one 64-bit accumulator texel into a float: the .y half of the unpacked value is the
// integer part and the .x half is the fraction, scaled by 0xffffffff.
float DecodeUint64ToFloat(Texture2D<UlongType> inTexture, uint2 Position)
{
	const uint2 Unpacked = UnpackUlongType(inTexture[Position]);
	const float FractionPart = float(Unpacked.x) / 0xffffffff;
	return Unpacked.y + FractionPart;
}
// Reads back the four accumulated components of the pixel at `Position` from whichever target
// SaveFloat4ToBufferOrTexture() used, and returns 0 for pixels whose accumulated alpha is not positive.
float4 DecodeFloat4FromBufferOrTexture(uint2 Position)
{
float4 DecodedValue = 0;
#if COMPILER_SUPPORTS_UINT64_IMAGE_ATOMICS
// One 64-bit texel per component at x * 4 + 0..3.
float R = DecodeUint64ToFloat(ReconstructBuffer64, uint2(Position.x * 4 + 0, Position.y));
float G = DecodeUint64ToFloat(ReconstructBuffer64, uint2(Position.x * 4 + 1, Position.y));
float B = DecodeUint64ToFloat(ReconstructBuffer64, uint2(Position.x * 4 + 2, Position.y));
float A = DecodeUint64ToFloat(ReconstructBuffer64, uint2(Position.x * 4 + 3, Position.y));
DecodedValue = float4(R,G,B,A);
#if PRE_ALBEDO_DIVIDE != PRE_ALBEDO_DIVIDE_DISABLED
// Only the color channels are rescaled, and only on this 64-bit path.
DecodedValue.rgb *= RADIANCE_POSTPROCESS_INVERSE_SCALE_FACTOR;
#endif
#else
// NOTE(review): this path uses a row width of TextureSize.x, while SaveFloat4ToBufferOrTexture() uses
// TextureSize.x + 2 * PatchDistance - confirm that TextureSize is bound accordingly for this pass.
int BufferIndex = Position.x + Position.y * TextureSize.x;
DecodedValue = DecodeFloat4(StructuredBufferSource[BufferIndex]);
#endif
// There would be cases where a is 0 due to w being too small and encoded as 0.
// Without rejecting this sample, could see borders artifacts around each tile.
DecodedValue = lerp(0, DecodedValue, DecodedValue.a > 0);
return DecodedValue;
}
// Compute entry point: one thread per pixel decodes that pixel's accumulated value and adds it to RWTarget.
[numthreads(THREAD_GROUP_SIZE, THREAD_GROUP_SIZE, 1)]
void AccumulateBufferToTextureCS(in const uint3 DispatchThreadID : SV_DispatchThreadID)
{
	const bool bOutsideTexture = any(DispatchThreadID.xy >= TextureSize);
	if (bOutsideTexture)
	{
		return;
	}

	const uint2 PixelCoord = DispatchThreadID.xy;
	const float4 Decoded = DecodeFloat4FromBufferOrTexture(PixelCoord);
	RWTarget[PixelCoord] += Decoded;
}
//------------------------------------------------------------------------------------------------------
// Bandwidth selection
//------------------------------------------------------------------------------------------------------
// Filtered candidate image compared against Image in MSEEstimationCS.
Texture2D<TPixelValue> FilteredImage;
// Per-pixel error estimate written by MSEEstimationCS.
RWTexture2D<float> MSE;
// Compute entry point: per-pixel error estimate of the filtered image,
// |Filtered - Color|^2 minus the variance of the unfiltered color.
[numthreads(THREAD_GROUP_SIZE, THREAD_GROUP_SIZE, 1)]
void MSEEstimationCS(in const uint3 DispatchThreadID : SV_DispatchThreadID)
{
	const bool bOutsideTexture = any(DispatchThreadID.xy >= TextureSize);
	if (bOutsideTexture)
	{
		return;
	}

	const uint2 PixelCoord = DispatchThreadID.xy;
	const float ColorVariance = GetImageVariance(PixelCoord, Variance, VarianceChannelOffset);
	const TPixelValue Unfiltered = GetImageValue(PixelCoord, Image);
	const TPixelValue Filtered = GetImageValue(PixelCoord, FilteredImage);

	// E[(F-C)^2-Var_C] = Bias^2_F + Var_F - 2Cov(C,F)
	// Assumes low correlation between F and C, and a low bias of F.
	const float SquaredDifference = Length2(Filtered - Unfiltered);
	MSE[PixelCoord] = SquaredDifference - ColorVariance;
}
// The two error maps compared per pixel by GenerateSelectionMapCS.
Texture2D<float> FilteredMSEs_0;
Texture2D<float> FilteredMSEs_1;
// Output of GenerateSelectionMapCS: 0 where candidate 0 has the strictly lower error, 1 otherwise.
RWTexture2D<float> RWSelectionMap;
// Compute entry point: per pixel, writes 0 when candidate 0 has the strictly lower error and 1 otherwise.
[numthreads(THREAD_GROUP_SIZE, THREAD_GROUP_SIZE, 1)]
void GenerateSelectionMapCS(in const uint3 DispatchThreadID : SV_DispatchThreadID)
{
	const bool bOutsideTexture = any(DispatchThreadID.xy >= TextureSize);
	if (bOutsideTexture)
	{
		return;
	}

	const uint2 PixelCoord = DispatchThreadID.xy;
	const int3 LoadCoord = int3(PixelCoord, 0);
	const float Error0 = FilteredMSEs_0.Load(LoadCoord);
	const float Error1 = FilteredMSEs_1.Load(LoadCoord);

	float Selection = 1.0f;
	if (Error0 < Error1)
	{
		Selection = 0.0f;
	}
	RWSelectionMap[PixelCoord] = Selection;
}
// The two filtered candidates blended by CombineFilteredImageCS.
Texture2D<float4> FilteredImages_0;
Texture2D<float4> FilteredImages_1;
// Per-pixel blend factor between candidate 0 (factor 0) and candidate 1 (factor 1).
Texture2D<float> SelectionMap;
// Blended output of CombineFilteredImageCS.
RWTexture2D<float4> RWFilteredImage;
// Compute entry point: per pixel, linearly blends the two filtered candidates using the selection map value.
[numthreads(THREAD_GROUP_SIZE, THREAD_GROUP_SIZE, 1)]
void CombineFilteredImageCS(in const uint3 DispatchThreadID : SV_DispatchThreadID)
{
	const bool bOutsideTexture = any(DispatchThreadID.xy >= TextureSize);
	if (bOutsideTexture)
	{
		return;
	}

	const uint2 PixelCoord = DispatchThreadID.xy;
	const int3 LoadCoord = int3(PixelCoord, 0);
	const float4 Candidate0 = FilteredImages_0.Load(LoadCoord);
	const float4 Candidate1 = FilteredImages_1.Load(LoadCoord);
	const float BlendFactor = SelectionMap.Load(LoadCoord);

	RWFilteredImage[PixelCoord] = lerp(Candidate0, Candidate1, BlendFactor);
}