2833 lines
94 KiB
HLSL
2833 lines
94 KiB
HLSL
// Copyright Epic Games, Inc. All Rights Reserved.
|
|
|
|
#include "SSDSignalAccumulatorArray.ush"
|
|
#include "SSDSignalBufferEncoding.ush"
|
|
#include "../TextureSampling.ush"
|
|
#include "../MonteCarlo.ush"
|
|
|
|
|
|
//------------------------------------------------------- ENUMS
|
|
|
|
/** Modes selecting how the world distance used for bilateral rejection is computed. */

// Uses only the reference pixel's size and depth.
#define SIGNAL_WORLD_FREQUENCY_REF_METADATA_ONLY 0

// Uses only the neighbor sample's pixel size and depth.
#define SIGNAL_WORLD_FREQUENCY_SAMPLE_METADATA_ONLY 1

// Uses the smallest of the reference's and the sample's pixel size based radius.
#define SIGNAL_WORLD_FREQUENCY_MIN_METADATA 2

// Derived from the sample's hit distance and its scene metadata.
#define SIGNAL_WORLD_FREQUENCY_HIT_DISTANCE 3

// Uses the WorldBluringRadius that was precomputed and stored along with the sample.
#define SIGNAL_WORLD_FREQUENCY_PRECOMPUTED_BLURING_RADIUS 4

// Derived from the harmonic currently being processed.
#define SIGNAL_WORLD_FREQUENCY_HARMONIC 5
|
|
|
|
|
|
//------------------------------------------------------- CONFIG DISABLED DEFAULTS
|
|
|
|
// Accumulator VGPR compression is disabled unless the including shader selected a mode.
#if !defined(CONFIG_ACCUMULATOR_VGPR_COMPRESSION)
	#define CONFIG_ACCUMULATOR_VGPR_COMPRESSION ACCUMULATOR_COMPRESSION_DISABLED
#endif

// Wave broadcast is enabled whenever the platform reports support for it.
#define CONFIG_ENABLE_WAVE_BROADCAST (PLATFORM_SUPPORTS_WAVE_BROADCAST)

// Every kernel flavor below is compiled out (0) unless the including shader opted in beforehand.
#if !defined(COMPILE_BOX_KERNEL)
	#define COMPILE_BOX_KERNEL 0
#endif

#if !defined(COMPILE_STACKOWIAK_KERNEL)
	#define COMPILE_STACKOWIAK_KERNEL 0
#endif

#if !defined(COMPILE_DISK_KERNEL)
	#define COMPILE_DISK_KERNEL 0
#endif

#if !defined(COMPILE_DIRECTIONAL_KERNEL)
	#define COMPILE_DIRECTIONAL_KERNEL 0
#endif

#if !defined(COMPILE_RAW_EXPERIMENTAL_KERNEL)
	#define COMPILE_RAW_EXPERIMENTAL_KERNEL 0
#endif

// When non zero, the buffer -> accumulator color space transform is skipped at accumulation time.
#if !defined(FORCE_IDENTICAL_COLOR_SPACE)
	#define FORCE_IDENTICAL_COLOR_SPACE 0
#endif
|
|
|
|
//------------------------------------------------------- STRUCTURES
|
|
|
|
/**
 * Configuration of the spatial kernel.
 *
 * Members are grouped by how often they are expected to vary: compile time
 * settings, per wave values, then per lane values. Use CreateKernelConfig()
 * to get an instance with every member initialized.
 */
struct FSSDKernelConfig
{
	// --------------------------- Compile time.

	// Set of samples used by the kernel.
	uint SampleSet;

	// Selection of a sub set within the sample set.
	uint SampleSubSetId;

	// Layout of the signal buffer being accumulated.
	uint BufferLayout;

	// Number of multiplexed signals per signal domain.
	uint MultiplexedSignalsPerSignalDomain;

	// How the world distance for bilateral rejection is computed (SIGNAL_WORLD_FREQUENCY_*).
	uint BilateralDistanceComputation;

	// Number of rings of a disk kernel.
	uint RingCount;

	// How the world vector between the neighbor and the reference is computed (NEIGHBOR_TO_REF_*).
	uint NeighborToRefComputation;

	// Layout of CompressedRefSceneMetadata.
	uint RefSceneMetadataLayout;

	// Multiplier applied to the world bluring distance of the signal.
	float WorldBluringDistanceMultiplier;

	// Whether the kernel should UNROLL rather than LOOP. False by default so that
	// user code has to opt in explicitly when the shader byte code may become large.
	bool bUnroll;

	// Whether the center of the kernel is sampled.
	bool bSampleKernelCenter;

	// Whether the sampled metadata comes from the previous frame instead of the current one.
	bool bPreviousFrameMetadata;

	// Whether the reference metadata comes from the previous frame instead of the current one.
	bool bPreviousFrameRefMetadata;

	// Whether samples are accumulated starting from the furthest one.
	bool bDescOrder;

	// Whether a sample is normalized to 1 before accumulation.
	bool bNormalizeSample;

	// Whether the min of the inverse frequencies of a pair of samples is used for both.
	// [ Jimenez 2014, "Next Generation Post Processing in Call of Duty: Advanced Warfare" ]
	bool bMinSamplePairInvFrequency;

	// Whether the bilateral distance is maxed with RefBilateralDistance.
	bool bMaxWithRefBilateralDistance;

	// Whether the spherical harmonic of a sample is computed before accumulation.
	bool bComputeSampleColorSH;

	// Whether the sample UV is validated individually for each signal against PerSignalUVMinMax.
	bool bClampUVPerMultiplexedSignal;

	// Color space each signal is encoded with in the buffer.
	uint BufferColorSpace[SIGNAL_ARRAY_SIZE];

	// Color space each signal is accumulated in.
	uint AccumulatorColorSpace[SIGNAL_ARRAY_SIZE];

	// Bilateral rejection settings of each signal (BILATERAL_* flags, see SetBilateralPreset()).
	uint BilateralSettings[SIGNAL_ARRAY_SIZE];


	// --------------------------- Per wave.

	// Buffer size in xy, and its inverse in zw.
	float4 BufferSizeAndInvSize;

	// UV bounds (min in xy, max in zw) outside of which a sample is considered out of the viewport.
	float4 BufferBilinearUVMinMax;

	// Multiplier applied to the offset of the samples.
	float KernelSpreadFactor;

	// Period of the harmonic being sampled.
	float HarmonicPeriode;

	// UV bounds (min in xy, max in zw) of the buffer, for each signal.
	float4 PerSignalUVMinMax[SIGNAL_ARRAY_SIZE];


	// --------------------------- Per lane.

	// Radius used when doing variable box sampling.
	uint BoxKernelRadius;

	// Runtime number of samples.
	uint SampleCount;

	// Buffer UV of the center of the kernel.
	float2 BufferUV;

	// Compressed scene metadata of the reference, used for the bilateral term.
	FSSDCompressedSceneInfos CompressedRefSceneMetadata;

	// Buffer UV of the reference, used when decompressing its metadata.
	// Please try to keep this identical to BufferUV.
	float2 RefBufferUV;

	// Forces the first sample of the kernel to be accumulated.
	bool bForceKernelCenterAccumulation;

	// Forces every sample to be accumulated.
	bool bForceAllAccumulation;

	// Whether this pixel belongs to a dynamic object.
	bool bIsDynamicPixel;

	// Runtime selection of a track of samples.
	uint SampleTrackId;

	// Bilateral distance of the reference, for each signal.
	float RefBilateralDistance[SIGNAL_ARRAY_SIZE];

	// Uniform random values required by stochastic kernels.
	float Randoms[1];

	// Seed of the Hammersley sequence used by stochastic kernels.
	uint2 HammersleySeed;

	// Normalized pixel space direction of directional kernels.
	float2 MajorAxis;

	// Pixel radius along the major and the minor axis of directional kernels.
	float MajorPixelRadius;
	float MinorPixelRadius;

	#if DEBUG_OUTPUT
		// Debugging state, only available when DEBUG_OUTPUT is enabled.
		uint2 DebugPixelPosition;
		uint DebugEventCounter;
	#endif
};
|
|
|
|
/**
 * Creates a kernel configuration with every member set to its default value.
 *
 * @return A fully initialized FSSDKernelConfig: 1x1 sample set, uninitialized buffer
 *         layout, standard color spaces, no bilateral settings, every flag false.
 */
FSSDKernelConfig CreateKernelConfig()
{
	FSSDKernelConfig KernelConfig;

	// Compile time settings.
	KernelConfig.SampleSet = SAMPLE_SET_1X1;
	KernelConfig.SampleSubSetId = 0;
	KernelConfig.BufferLayout = SIGNAL_BUFFER_LAYOUT_UNINITIALIZED;
	KernelConfig.MultiplexedSignalsPerSignalDomain = SIGNAL_ARRAY_SIZE;
	KernelConfig.NeighborToRefComputation = NEIGHBOR_TO_REF_CACHE_WORLD_POSITION;
	KernelConfig.RefSceneMetadataLayout = METADATA_BUFFER_LAYOUT_DISABLED;
	KernelConfig.RingCount = 0;
	KernelConfig.WorldBluringDistanceMultiplier = 1.0;
	KernelConfig.bUnroll = false;
	KernelConfig.bSampleKernelCenter = false;
	KernelConfig.bPreviousFrameMetadata = false;
	KernelConfig.bPreviousFrameRefMetadata = false;
	KernelConfig.BilateralDistanceComputation = SIGNAL_WORLD_FREQUENCY_MIN_METADATA;
	KernelConfig.bDescOrder = false;
	KernelConfig.bNormalizeSample = false;
	KernelConfig.bMinSamplePairInvFrequency = false;
	KernelConfig.bMaxWithRefBilateralDistance = false;
	KernelConfig.bComputeSampleColorSH = false;
	KernelConfig.bClampUVPerMultiplexedSignal = false;

	{
		UNROLL_N(SIGNAL_ARRAY_SIZE)
		for (uint MultiplexId = 0; MultiplexId < SIGNAL_ARRAY_SIZE; MultiplexId++)
		{
			KernelConfig.BufferColorSpace[MultiplexId] = STANDARD_BUFFER_COLOR_SPACE;
			KernelConfig.AccumulatorColorSpace[MultiplexId] = STANDARD_BUFFER_COLOR_SPACE;
			KernelConfig.BilateralSettings[MultiplexId] = 0x0000;
		}
	}

	// SGPRs.
	KernelConfig.BufferSizeAndInvSize = float4(0, 0, 0, 0);
	KernelConfig.BufferBilinearUVMinMax = float4(0, 0, 0, 0);
	KernelConfig.KernelSpreadFactor = 1;
	KernelConfig.HarmonicPeriode = 1.0;

	{
		UNROLL_N(SIGNAL_ARRAY_SIZE)
		for (uint MultiplexId = 0; MultiplexId < SIGNAL_ARRAY_SIZE; MultiplexId++)
		{
			KernelConfig.PerSignalUVMinMax[MultiplexId] = 0.0;
		}
	}

	// VGPRs.
	KernelConfig.BoxKernelRadius = 1;
	KernelConfig.SampleCount = 1;
	KernelConfig.BufferUV = 0.0;
	KernelConfig.CompressedRefSceneMetadata = CreateCompressedSceneInfos();
	KernelConfig.RefBufferUV = 0.0;
	KernelConfig.bForceKernelCenterAccumulation = false;
	KernelConfig.bForceAllAccumulation = false;
	KernelConfig.bIsDynamicPixel = false;
	KernelConfig.SampleTrackId = 0;
	KernelConfig.MajorAxis = 0.0;
	KernelConfig.MajorPixelRadius = 0.0;
	KernelConfig.MinorPixelRadius = 0.0;
	KernelConfig.HammersleySeed = 0;

	{
		UNROLL_N(SIGNAL_ARRAY_SIZE)
		for (uint MultiplexId = 0; MultiplexId < SIGNAL_ARRAY_SIZE; MultiplexId++)
		{
			KernelConfig.RefBilateralDistance[MultiplexId] = 0.0;
		}
	}

	{
		// Randoms holds a single element, so the unroll hint matches the single iteration.
		UNROLL_N(1)
		for (uint RandomSignalId = 0; RandomSignalId < 1; RandomSignalId++)
		{
			KernelConfig.Randoms[RandomSignalId] = 0.0;
		}
	}

	#if DEBUG_OUTPUT
	{
		KernelConfig.DebugPixelPosition = 0;
		KernelConfig.DebugEventCounter = 0;
	}
	#endif

	return KernelConfig;
}
|
|
|
|
|
|
/**
 * Sets up KernelConfig.BilateralSettings from a bilateral preset.
 *
 * @param BilateralPresetId  One of the BILATERAL_PRESET_* handled below. Any other value leaves KernelConfig unchanged.
 * @param KernelConfig       Kernel configuration whose BilateralSettings entries get overwritten.
 */
void SetBilateralPreset(uint BilateralPresetId, inout FSSDKernelConfig KernelConfig)
{
	if (BilateralPresetId == BILATERAL_PRESET_MONOCHROMATIC_PENUMBRA)
	{
		// Reject on position and also on normal orientation, so that background and
		// foreground samples do not get merged, which would otherwise result in leaks.
		// NOTE(review): an older remark here stated that shadow masks are normal invariant and
		// should only reject on position; the settings below do include BILATERAL_NORMAL.
		const uint PenumbraSettings = BILATERAL_POSITION_BASED(5) | BILATERAL_NORMAL;

		UNROLL_N(SIGNAL_ARRAY_SIZE)
		for (uint SignalId = 0; SignalId < SIGNAL_ARRAY_SIZE; SignalId++)
		{
			KernelConfig.BilateralSettings[SignalId] = PenumbraSettings;
		}
	}
	else if (BilateralPresetId == BILATERAL_PRESET_POLYCHROMATIC_PENUMBRA)
	{
		// Signal 0 is diffuse.
		KernelConfig.BilateralSettings[0] = BILATERAL_POSITION_BASED(5) | BILATERAL_NORMAL;

		// Signal 1 is specular.
		#if SIGNAL_ARRAY_SIZE > 1
			KernelConfig.BilateralSettings[1] = BILATERAL_POSITION_BASED(5) | BILATERAL_TOKOYASHI;
		#endif
	}
	else if (BilateralPresetId == BILATERAL_PRESET_REFLECTIONS)
	{
		// Specular: can only be done using tokoyashi, because there is more than one sample at a time.
		const uint SpecularSettings = BILATERAL_TOKOYASHI;
		KernelConfig.BilateralSettings[0] = SpecularSettings;

		#if SIGNAL_ARRAY_SIZE > 1
			// Specular variance for sampling rejection pre convolution uses the same settings.
			KernelConfig.BilateralSettings[1] = SpecularSettings;
		#endif
	}
	else if (BilateralPresetId == BILATERAL_PRESET_REFLECTIONS_1SPP)
	{
		// Specular: a specular ratio estimator is used, so no need to reject based on the axis of the lobe.
		KernelConfig.BilateralSettings[0] = BILATERAL_TOKOYASHI_LOBE;
	}
	else if (BilateralPresetId == BILATERAL_PRESET_REFLECTIONS_TAA)
	{
		// Specular: can only be done using tokoyashi, because there is more than one sample at a time.
		const uint SpecularSettings = BILATERAL_POSITION_BASED(1) | BILATERAL_TOKOYASHI;
		KernelConfig.BilateralSettings[0] = SpecularSettings;

		#if SIGNAL_ARRAY_SIZE > 1
			// Specular variance for sampling rejection pre convolution uses the same settings.
			KernelConfig.BilateralSettings[1] = SpecularSettings;
		#endif
	}
	else if (BilateralPresetId == BILATERAL_PRESET_DIFFUSE)
	{
		// Diffuse stores no directionality: reject on world position and on the surface normal.
		const uint DiffuseSettings = BILATERAL_POSITION_BASED(2) | BILATERAL_NORMAL;
		KernelConfig.BilateralSettings[0] = DiffuseSettings;

		#if SIGNAL_ARRAY_SIZE > 1
			// Variance for sampling rejection pre convolution uses the same settings.
			KernelConfig.BilateralSettings[1] = DiffuseSettings;
		#endif
	}
	else if (BilateralPresetId == BILATERAL_PRESET_SPHERICAL_HARMONIC)
	{
		// Spherical harmonics encode directionality, so only reject based on world position.
		KernelConfig.BilateralSettings[0] = BILATERAL_POSITION_BASED(2);
	}
	else if (BilateralPresetId == BILATERAL_PRESET_PROBE_HIERARCHY)
	{
		// Diffuse & specular bilateral component.
		KernelConfig.BilateralSettings[0] = BILATERAL_POSITION_BASED(1) | BILATERAL_SHADING_MODEL;
	}
	else if (BilateralPresetId == BILATERAL_PRESET_AO)
	{
		// Same reasoning as diffuse: reject on world position and on the surface normal.
		const uint AOSettings = BILATERAL_POSITION_BASED(4) | BILATERAL_NORMAL;
		KernelConfig.BilateralSettings[0] = AOSettings;

		#if SIGNAL_ARRAY_SIZE > 1
			// Variance for sampling rejection pre convolution uses the same settings.
			KernelConfig.BilateralSettings[1] = AOSettings;
		#endif
	}
	else if (BilateralPresetId == BILATERAL_PRESET_AO_HISTORY)
	{
		// Same reasoning as diffuse: reject on world position and on the surface normal.
		const uint AOHistorySettings = BILATERAL_POSITION_BASED(2) | BILATERAL_NORMAL;
		KernelConfig.BilateralSettings[0] = AOHistorySettings;

		#if SIGNAL_ARRAY_SIZE > 1
			// Variance for sampling rejection pre convolution uses the same settings.
			KernelConfig.BilateralSettings[1] = AOHistorySettings;
		#endif
	}
}
|
|
|
|
|
|
//------------------------------------------------------- CONSTANT
|
|
|
|
// Wavelet filter weights 3/8, 1/4 and 1/16, written as exact decimals.
// Presumably indexed by the distance to the kernel center along one axis of a 5x5 kernel - TODO confirm against the users of this table.
static const float kWaveletFilterWeights5x5[] = { 0.375, 0.25, 0.0625 };
|
|
|
|
|
|
//------------------------------------------------------- REDERIVE INFORMATION FOR LOWER VGPR OCCUPANCY
|
|
|
|
/**
 * Deduces the buffer UV of the output pixel this kernel has been configured for.
 *
 * @param KernelConfig  Kernel configuration to derive the reference UV from.
 * @return KernelConfig.RefBufferUV when it can not be rederived (previous frame metadata, or hexaweb
 *         sample set), otherwise a UV recomputed from KernelConfig.BufferUV.
 */
ISOLATE
float2 ComputeRefBufferUV(FSSDKernelConfig KernelConfig)
{
	// Not derivable from BufferUV when sampling previous frame metadata (BufferUV is in the previous
	// frame basis), nor with the hexaweb sample set (a random offset is most certainly applied).
	const bool bNeedsStoredRefUV =
		KernelConfig.bPreviousFrameMetadata ||
		KernelConfig.SampleSet == SAMPLE_SET_HEXAWEB;

	if (bNeedsStoredRefUV)
	{
		return KernelConfig.RefBufferUV;
	}

	if (KernelConfig.SampleSet != SAMPLE_SET_STACKOWIAK_4_SETS)
	{
		return KernelConfig.BufferUV;
	}

	// Stackowiak: the two low bits of the track id select a half pixel offset on each axis.
	// Matches first line of kStackowiakSampleSet0.
	// TODO(Denoiser): could be optimised further by just setting sign bit on 0.5.
	const uint TrackId = KernelConfig.SampleTrackId;

	float2 HalfPixelOffset;
	HalfPixelOffset.x = (TrackId & 0x1) ? 0.5 : -0.5;
	HalfPixelOffset.y = (TrackId & 0x2) ? 0.5 : -0.5;

	return KernelConfig.BufferUV + HalfPixelOffset * KernelConfig.BufferSizeAndInvSize.zw;
}
|
|
|
|
/**
 * Uncompresses the scene metadata of the reference pixel.
 *
 * The reference metadata is kept compressed in the kernel configuration and only
 * uncompressed on demand, to keep a low VGPR pressure.
 *
 * @param KernelConfig  Kernel configuration holding CompressedRefSceneMetadata and its layout.
 * @return The uncompressed scene infos of the reference.
 */
ISOLATE
FSSDSampleSceneInfos UncompressRefSceneMetadata(FSSDKernelConfig KernelConfig)
{
	// Buffer UV of the reference pixel.
	const float2 ReferenceUV = ComputeRefBufferUV(KernelConfig);

	// Screen position of the reference, in the basis of the frame the metadata belongs to.
	float2 ReferenceScreenPos;
	if (!KernelConfig.bPreviousFrameMetadata) // TODO(Denoiser): should be bPreviousFrameRefMetadata instead?
	{
		ReferenceScreenPos = DenoiserBufferUVToScreenPosition(ReferenceUV);
	}
	else
	{
		ReferenceScreenPos = ReferenceUV * PrevSceneBufferUVToScreenPosition.xy + PrevSceneBufferUVToScreenPosition.zw;
	}

	return UncompressSampleSceneInfo(
		KernelConfig.RefSceneMetadataLayout,
		KernelConfig.bPreviousFrameRefMetadata,
		ReferenceScreenPos,
		KernelConfig.CompressedRefSceneMetadata);
}
|
|
|
|
/**
 * Uncompresses the scene metadata of a sample.
 *
 * @param KernelConfig                   Kernel configuration; bPreviousFrameMetadata selects the frame basis.
 * @param SampleBufferUV                 Buffer UV the sample has been fetched at.
 * @param CompressedSampleSceneMetadata  Compressed metadata of the sample, laid out as CONFIG_METADATA_BUFFER_LAYOUT.
 * @return The uncompressed scene infos of the sample.
 */
FSSDSampleSceneInfos UncompressSampleSceneMetadata(
	FSSDKernelConfig KernelConfig,
	float2 SampleBufferUV,
	FSSDCompressedSceneInfos CompressedSampleSceneMetadata)
{
	const bool bIsPreviousFrame = KernelConfig.bPreviousFrameMetadata;

	// Screen position of the sample, in the basis of the frame the metadata belongs to.
	float2 SampleScreenPos;
	if (!bIsPreviousFrame)
	{
		SampleScreenPos = DenoiserBufferUVToScreenPosition(SampleBufferUV);
	}
	else
	{
		SampleScreenPos = SampleBufferUV * PrevSceneBufferUVToScreenPosition.xy + PrevSceneBufferUVToScreenPosition.zw;
	}

	return UncompressSampleSceneInfo(
		CONFIG_METADATA_BUFFER_LAYOUT,
		bIsPreviousFrame,
		SampleScreenPos,
		CompressedSampleSceneMetadata);
}
|
|
|
|
/**
 * Computes the vector going from a neighbor sample to the reference.
 *
 * @param KernelConfig           Kernel configuration; NeighborToRefComputation selects the computation path.
 * @param RefSceneMetadata       Scene infos of the reference.
 * @param NeighborSceneMetadata  Scene infos of the neighbor sample.
 * @return Reference minus neighbor, transformed to view space when CONFIG_USE_VIEW_SPACE is
 *         enabled, otherwise to the translated world basis.
 */
float3 ComputeVectorFromNeighborToRef(
	FSSDKernelConfig KernelConfig,
	FSSDSampleSceneInfos RefSceneMetadata,
	FSSDSampleSceneInfos NeighborSceneMetadata)
{
	// Any mode other than NEIGHBOR_TO_REF_LOWEST_VGPR_PRESSURE (i.e. NEIGHBOR_TO_REF_CACHE_WORLD_POSITION)
	// directly subtracts the translated world positions.
	if (KernelConfig.NeighborToRefComputation != NEIGHBOR_TO_REF_LOWEST_VGPR_PRESSURE)
	{
		const float3 WorldDelta = GetTranslatedWorldPosition(RefSceneMetadata) - GetTranslatedWorldPosition(NeighborSceneMetadata);

		// TODO(Denoiser): GetViewPosition(RefSceneMetadata)
		#if CONFIG_USE_VIEW_SPACE
			return mul(float4(WorldDelta, 0), View.TranslatedWorldToView).xyz;
		#else
			return WorldDelta;
		#endif
	}

	// Lowest VGPR pressure: rebuild both positions from the screen position and depth only.
	const float RefDepth = GetWorldDepth(RefSceneMetadata);
	const float NeighborDepth = GetWorldDepth(NeighborSceneMetadata);

	const float3 RefClipPos = float3(
		GetScreenPositionForProjectionType(RefSceneMetadata.ScreenPosition, RefDepth),
		RefDepth);

	const float3 NeighborClipPos = float3(
		GetScreenPositionForProjectionType(NeighborSceneMetadata.ScreenPosition, NeighborDepth),
		NeighborDepth);

	const float4 ClipDelta = float4(RefClipPos - NeighborClipPos, 0);

	#if CONFIG_USE_VIEW_SPACE
		return mul(ClipDelta, GetScreenToViewDistanceMatrix()).xyz;
	#else
		return mul(ClipDelta, View.ScreenToTranslatedWorld).xyz;
	#endif
}
|
|
|
|
|
|
//------------------------------------------------------- SHARED SAMPLING
|
|
|
|
/**
 * Prepares a decoded signal sample for accumulation.
 *
 * @param KernelConfig         Kernel configuration providing the color spaces and bComputeSampleColorSH.
 * @param MultiplexId          Index of the multiplexed signal the sample belongs to.
 * @param SampleSceneMetadata  Scene infos of the sample.
 * @param Sample               Sample as decoded from the signal buffer.
 * @param SamplePixelCoord     Pixel coordinate of the sample.
 * @return The sample converted to the accumulator color space (unless FORCE_IDENTICAL_COLOR_SPACE),
 *         with its ColorSH filled in when compiled in and requested by the configuration.
 */
FSSDSignalSample TransformSignalSampleForAccumulation(
	FSSDKernelConfig KernelConfig,
	uint MultiplexId,
	FSSDSampleSceneInfos SampleSceneMetadata,
	FSSDSignalSample Sample,
	uint2 SamplePixelCoord)
{
	FSSDSignalSample TransformedSample = Sample;

	// Convert from the color space of the buffer to the one of the accumulator.
	#if !FORCE_IDENTICAL_COLOR_SPACE
	{
		// TODO(Denoiser): could pass down information that this sample may be normalized.
		const uint SrcBasis = KernelConfig.BufferColorSpace[MultiplexId];
		const uint DestBasis = KernelConfig.AccumulatorColorSpace[MultiplexId];

		TransformedSample = TransformSignal(TransformedSample, SrcBasis, DestBasis);
	}
	#endif

	// Compute the spherical harmonic of the sample, from the already transformed sample.
	#if COMPILE_SIGNAL_COLOR_SH && COMPILE_SIGNAL_COLOR
	if (KernelConfig.bComputeSampleColorSH)
	{
		TransformedSample.ColorSH = ComputeSampleColorSH(SampleSceneMetadata, TransformedSample, SamplePixelCoord);
	}
	#endif

	return TransformedSample;
}
|
|
|
|
/**
 * Computes the index of the signal in the batch, from the index of the multiplexed signal.
 * Meant to be resolved at compile time.
 *
 * @param KernelConfig       Kernel configuration providing MultiplexedSignalsPerSignalDomain.
 * @param SignalMultiplexId  Index of the multiplexed signal.
 * @return SignalMultiplexId divided (integer division) by the number of multiplexed signals per signal domain.
 */
uint ComputeSignalBatchIdFromSignalMultiplexId(FSSDKernelConfig KernelConfig, const uint SignalMultiplexId)
{
	const uint SignalsPerDomain = KernelConfig.MultiplexedSignalsPerSignalDomain;
	const uint BatchedSignalId = SignalMultiplexId / SignalsPerDomain;
	return BatchedSignalId;
}
|
|
|
|
/**
 * Returns whether a sample is outside the viewport.
 *
 * @param KernelConfig    Kernel configuration providing BufferBilinearUVMinMax (min in xy, max in zw).
 * @param SampleBufferUV  Buffer UV of the sample.
 * @return true when any component of the UV is below the min or above the max bound.
 */
bool IsOutsideViewport(FSSDKernelConfig KernelConfig, float2 SampleBufferUV)
{
	const float2 MinUV = KernelConfig.BufferBilinearUVMinMax.xy;
	const float2 MaxUV = KernelConfig.BufferBilinearUVMinMax.zw;

	const bool2 bBelowMin = SampleBufferUV < MinUV;
	const bool2 bAboveMax = SampleBufferUV > MaxUV;

	return any(or(bBelowMin, bAboveMax));
}
|
|
|
|
/**
 * Fetches the compressed scene metadata and the compressed multiplexed signals of one sample.
 *
 * @param KernelConfig                      Kernel configuration; bPreviousFrameMetadata selects which metadata is sampled.
 * @param SignalBuffer0..3                  Signal textures to fetch from.
 * @param SampleBufferUV                    Buffer UV of the sample.
 * @param OutCompressedSampleSceneMetadata  Receives the compressed scene metadata of the sample.
 * @param OutCompressedMultiplexedSamples   Receives the compressed multiplexed signals of the sample.
 */
void SampleMultiplexedSignals(
	FSSDKernelConfig KernelConfig,
	FSSDTexture2D SignalBuffer0,
	FSSDTexture2D SignalBuffer1,
	FSSDTexture2D SignalBuffer2,
	FSSDTexture2D SignalBuffer3,
	float2 SampleBufferUV,
	out FSSDCompressedSceneInfos OutCompressedSampleSceneMetadata,
	out FSSDCompressedMultiplexedSample OutCompressedMultiplexedSamples)
{
	// Both fetches below share the same pixel coordinate.
	const uint2 SamplePixelCoord = BufferUVToBufferPixelCoord(SampleBufferUV);

	// Scene metadata of the sample.
	OutCompressedSampleSceneMetadata = SampleCompressedSceneMetadata(
		KernelConfig.bPreviousFrameMetadata,
		SampleBufferUV,
		SamplePixelCoord);

	// Signals of the sample, fetched with the point clamped sampler.
	OutCompressedMultiplexedSamples = SampleCompressedMultiplexedSignals(
		SignalBuffer0, SignalBuffer1, SignalBuffer2, SignalBuffer3,
		GlobalPointClampedSampler,
		SampleBufferUV,
		SamplePixelCoord);
} // SampleMultiplexedSignals()
|
|
|
|
/**
 * Uncompresses the multiplexed signals of a sample for accumulation.
 *
 * @param KernelConfig                  Kernel configuration providing the buffer layout, bNormalizeSample and the per signal UV bounds.
 * @param SampleBufferUV                Buffer UV the sample has been fetched at.
 * @param CompressedMultiplexedSamples  Compressed signals, as returned by SampleMultiplexedSignals().
 * @param MultiplexedSamples            Receives the decoded samples. When bClampUVPerMultiplexedSignal is set, a signal
 *                                      whose UV bounds do not contain SampleBufferUV is replaced by a sample created from 0.0.
 * @param MultiplexedFrequencies        Receives the decoded frequencies.
 */
void UncompressMultiplexedSignals(
	FSSDKernelConfig KernelConfig,
	float2 SampleBufferUV,
	FSSDCompressedMultiplexedSample CompressedMultiplexedSamples,
	out FSSDSignalArray MultiplexedSamples,
	out FSSDSignalFrequencyArray MultiplexedFrequencies)
{
	// TODO(Denoiser): offer multiplier to apply to each signal during Decode, to save mul VALU.
	DecodeMultiplexedSignals(
		KernelConfig.BufferLayout,
		/* MultiplexedSampleId = */ 0,
		KernelConfig.bNormalizeSample,
		CompressedMultiplexedSamples,
		/* out */ MultiplexedSamples,
		/* out */ MultiplexedFrequencies);

	if (!KernelConfig.bClampUVPerMultiplexedSignal)
	{
		return;
	}

	UNROLL_N(SIGNAL_ARRAY_SIZE)
	for (uint SignalId = 0; SignalId < SIGNAL_ARRAY_SIZE; SignalId++)
	{
		const float2 SignalMinUV = KernelConfig.PerSignalUVMinMax[SignalId].xy;
		const float2 SignalMaxUV = KernelConfig.PerSignalUVMinMax[SignalId].zw;

		// The sample is invalid for this signal when clamping to the signal's bounds would move the UV.
		const float2 ClampedUV = clamp(SampleBufferUV, SignalMinUV, SignalMaxUV);

		if (any(SampleBufferUV != ClampedUV))
		{
			MultiplexedSamples.Array[SignalId] = CreateSignalSampleFromScalarValue(0.0);
		}
	} // for (uint SignalId = 0; SignalId < SIGNAL_ARRAY_SIZE; SignalId++)
}
|
|
|
|
/**
 * Accumulates the multiplexed samples of one fetched sample into the accumulators.
 *
 * For each multiplexed signal, computes the world distance used for bilateral rejection
 * according to KernelConfig.BilateralDistanceComputation, derives the bilateral weight,
 * and accumulates the sample weighted by KernelSampleWeight * BilateralWeight.
 *
 * @param KernelConfig              Kernel configuration.
 * @param UncompressedAccumulators  Accumulators written when CONFIG_ACCUMULATOR_VGPR_COMPRESSION is disabled.
 * @param CompressedAccumulators    Accumulators uncompressed, updated and recompressed when compression is enabled.
 * @param RefSceneMetadata          Scene infos of the reference.
 * @param SampleBufferUV            Buffer UV of the sample.
 * @param SampleSceneMetadata       Scene infos of the sample.
 * @param MultiplexedSamples        Decoded samples, one per multiplexed signal.
 * @param MultiplexedFrequencies    Decoded frequencies, one per multiplexed signal.
 * @param KernelSampleWeight        Weight of the sample given by the kernel.
 * @param bForceSample              Forces a final weight of 1 (as does KernelConfig.bForceAllAccumulation).
 * @param bIsOutsideFrustum         When true, nothing gets accumulated for this sample.
 */
void AccumulateSampledMultiplexedSignals(
	FSSDKernelConfig KernelConfig,
	inout FSSDSignalAccumulatorArray UncompressedAccumulators,
	inout FSSDCompressedSignalAccumulatorArray CompressedAccumulators,
	FSSDSampleSceneInfos RefSceneMetadata,
	float2 SampleBufferUV,
	FSSDSampleSceneInfos SampleSceneMetadata,
	FSSDSignalArray MultiplexedSamples,
	FSSDSignalFrequencyArray MultiplexedFrequencies,
	float KernelSampleWeight,
	const bool bForceSample,
	bool bIsOutsideFrustum)
{
	// Compute the bluring radius of the output pixel itself.
	float RefPixelWorldBluringRadius = ComputeWorldBluringRadiusCausedByPixelSize(RefSceneMetadata);

	#if CONFIG_ACCUMULATOR_VGPR_COMPRESSION != ACCUMULATOR_COMPRESSION_DISABLED
		FSSDSignalAccumulatorArray Accumulators = UncompressAccumulatorArray(CompressedAccumulators, CONFIG_ACCUMULATOR_VGPR_COMPRESSION);
	#endif

	// Compute the vector from neighbor to reference in the most optimal way.
	float3 NeighborToRefVector = ComputeVectorFromNeighborToRef(
		KernelConfig,
		RefSceneMetadata,
		SampleSceneMetadata);

	// Debugging aid, compiled out by the trailing "&& 0".
	#if DEBUG_OUTPUT && 0
	if (KernelConfig.DebugEventCounter)
	{
		float4 A = float4(
			RefSceneMetadata.WorldDepth,
			SampleSceneMetadata.WorldDepth,
			length(NeighborToRefVector) / RefPixelWorldBluringRadius,
			KernelConfig.bPreviousFrameMetadata);

		float4 B = float4(
			DenoiserBufferUVToScreenPosition(SampleBufferUV) * 0.5 + 0.5,
			0,
			0);

		float4 C = float4(
			100 * abs(RefSceneMetadata.WorldDepth - SampleSceneMetadata.WorldDepth),
			0,
			0,
			0);

		float4 D = float4(
			length(RefSceneMetadata.TranslatedWorldPosition - SampleSceneMetadata.TranslatedWorldPosition),
			abs(RefSceneMetadata.WorldDepth - SampleSceneMetadata.WorldDepth),
			length(RefSceneMetadata.ScreenPosition - SampleSceneMetadata.ScreenPosition),
			0);

		DebugOutput[KernelConfig.DebugPixelPosition] = A;

		KernelConfig.DebugEventCounter = 0;
	}
	#endif

	UNROLL_N(SIGNAL_ARRAY_SIZE)
	for (uint SignalMultiplexId = 0; SignalMultiplexId < SIGNAL_ARRAY_SIZE; SignalMultiplexId++)
	{
		// Compute at compile time the id of the signal being processed.
		const uint BatchedSignalId = ComputeSignalBatchIdFromSignalMultiplexId(KernelConfig, SignalMultiplexId);

		// Domain knowledge of the signal.
		FSSDSignalDomainKnowledge DomainKnowledge = GetSignalDomainKnowledge(BatchedSignalId);

		// TODO(Denoiser): direction of the ray should be cached by injest or output by RGS, otherwise ends up with VGPR pressure because of SampleBufferUV.
		uint2 NeighborPixelCoord = floor(SampleBufferUV * KernelConfig.BufferSizeAndInvSize.xy);

		// Fetch and pre process the sample for accumulation.
		FSSDSignalSample Sample = MultiplexedSamples.Array[SignalMultiplexId];
		Sample = TransformSignalSampleForAccumulation(KernelConfig, SignalMultiplexId, SampleSceneMetadata, Sample, NeighborPixelCoord);

		// Fetch sample's frequency for accumulation.
		FSSDSignalFrequency SampleFrequency = MultiplexedFrequencies.Array[SignalMultiplexId];

		// Compute the bluring radius of the sample's pixel itself.
		float SamplePixelWorldBluringRadius = ComputeWorldBluringRadiusCausedByPixelSize(SampleSceneMetadata);

		// Compute the bluring radius of the signal from ray hit distance and signal domain knowledge.
		float SignalConvolutionBluringRadius = GetSignalWorldBluringRadius(SampleFrequency, SampleSceneMetadata, DomainKnowledge);

		// But the signal's bluring radius might already be pre computed.
		if (KernelConfig.BilateralDistanceComputation == SIGNAL_WORLD_FREQUENCY_PRECOMPUTED_BLURING_RADIUS)
		{
			SignalConvolutionBluringRadius = SampleFrequency.WorldBluringRadius;
		}

		// Compute the final world distance to use for bilateral rejection.
		// Stays negative if BilateralDistanceComputation matches none of the modes below.
		float FinalWorldBluringDistance = -1;
		if (KernelConfig.BilateralDistanceComputation == SIGNAL_WORLD_FREQUENCY_REF_METADATA_ONLY)
		{
			FinalWorldBluringDistance = AmendWorldBluringRadiusCausedByPixelSize(
				RefPixelWorldBluringRadius);
		}
		else if (KernelConfig.BilateralDistanceComputation == SIGNAL_WORLD_FREQUENCY_SAMPLE_METADATA_ONLY)
		{
			FinalWorldBluringDistance = AmendWorldBluringRadiusCausedByPixelSize(
				SamplePixelWorldBluringRadius);
		}
		else if (KernelConfig.BilateralDistanceComputation == SIGNAL_WORLD_FREQUENCY_MIN_METADATA)
		{
			FinalWorldBluringDistance = AmendWorldBluringRadiusCausedByPixelSize(
				min(SamplePixelWorldBluringRadius, RefPixelWorldBluringRadius));
		}
		else if (
			KernelConfig.BilateralDistanceComputation == SIGNAL_WORLD_FREQUENCY_HIT_DISTANCE ||
			KernelConfig.BilateralDistanceComputation == SIGNAL_WORLD_FREQUENCY_PRECOMPUTED_BLURING_RADIUS)
		{
			FinalWorldBluringDistance = SignalConvolutionBluringRadius;
		}
		else if (KernelConfig.BilateralDistanceComputation == SIGNAL_WORLD_FREQUENCY_HARMONIC)
		{
			FinalWorldBluringDistance = AmendWorldBluringRadiusCausedByPixelSize(
				RefPixelWorldBluringRadius) * KernelConfig.HarmonicPeriode;
		}

		FinalWorldBluringDistance *= KernelConfig.WorldBluringDistanceMultiplier;

		// bMaxWithRefBilateralDistance asks for the distance to be *maxed* with the reference's
		// bilateral distance, i.e. the reference distance acts as a lower bound.
		if (KernelConfig.bMaxWithRefBilateralDistance)
		{
			FinalWorldBluringDistance = max(FinalWorldBluringDistance, KernelConfig.RefBilateralDistance[SignalMultiplexId]);
		}

		// Compute the weight to be applied to do bilateral rejection.
		float BilateralWeight = ComputeBilateralWeight(
			KernelConfig.BilateralSettings[SignalMultiplexId],
			FinalWorldBluringDistance,
			DomainKnowledge,
			RefSceneMetadata,
			SampleSceneMetadata,
			NeighborToRefVector);

		FSSDSampleAccumulationInfos SampleInfos;
		SampleInfos.Sample = Sample;
		SampleInfos.Frequency = SampleFrequency;
		SampleInfos.FinalWeight = KernelSampleWeight * BilateralWeight;
		SampleInfos.InvFrequency = SignalConvolutionBluringRadius;

		// Forced samples bypass both the kernel weight and the bilateral rejection.
		if (bForceSample || KernelConfig.bForceAllAccumulation)
		{
			SampleInfos.FinalWeight = 1;
		}

		// Only accumulate samples that carry data and that are inside the viewport.
		// TODO(Denoiser): bIsOutsideFrustum could afect number of samples for DRB.
		FLATTEN
		if (SampleInfos.Sample.SampleCount != 0 && !bIsOutsideFrustum)
		{
			#if CONFIG_ACCUMULATOR_VGPR_COMPRESSION == ACCUMULATOR_COMPRESSION_DISABLED
			{
				AccumulateSample(
					/* inout */ UncompressedAccumulators.Array[SignalMultiplexId],
					SampleInfos);
			}
			#else
			{
				AccumulateSample(
					/* inout */ Accumulators.Array[SignalMultiplexId],
					SampleInfos);
			}
			#endif
		}
	} // for (uint SignalMultiplexId = 0; SignalMultiplexId < SIGNAL_ARRAY_SIZE; SignalMultiplexId++)

	#if CONFIG_ACCUMULATOR_VGPR_COMPRESSION != ACCUMULATOR_COMPRESSION_DISABLED
		CompressedAccumulators = CompressAccumulatorArray(Accumulators, CONFIG_ACCUMULATOR_VGPR_COMPRESSION);
	#endif
} // AccumulateSampledMultiplexedSignals().
|
|
|
|
/**
 * Samples one location of the signal buffers and accumulates it to the accumulator array.
 *
 * Caution: you probably want to do these steps explicitly by hand instead, to help the shader
 * compiler with latency hiding.
 *
 * @param KernelConfig              Kernel configuration.
 * @param SignalBuffer0..3          Signal textures to fetch from.
 * @param UncompressedAccumulators  Accumulators used when accumulator compression is disabled.
 * @param CompressedAccumulators    Accumulators used when accumulator compression is enabled.
 * @param SampleBufferUV            Buffer UV to sample.
 * @param KernelSampleWeight        Weight of the sample given by the kernel.
 * @param bForceSample              Forces the sample to be accumulated with a weight of 1.
 */
void SampleAndAccumulateMultiplexedSignals(
	FSSDKernelConfig KernelConfig,
	FSSDTexture2D SignalBuffer0,
	FSSDTexture2D SignalBuffer1,
	FSSDTexture2D SignalBuffer2,
	FSSDTexture2D SignalBuffer3,
	inout FSSDSignalAccumulatorArray UncompressedAccumulators,
	inout FSSDCompressedSignalAccumulatorArray CompressedAccumulators,
	float2 SampleBufferUV,
	float KernelSampleWeight,
	const bool bForceSample)
{
	// Stores in SGPR whether this sample is outside the viewport, to avoid VGPR pressure to keep SampleBufferUV after texture fetches.
	const bool bSampleIsOutsideViewport = IsOutsideViewport(KernelConfig, SampleBufferUV);

	FSSDCompressedSceneInfos FetchedSceneMetadata;
	FSSDCompressedMultiplexedSample FetchedSignals;

	// Force all the signal texture fetch and metadata to overlap to minimize serial texture fetches.
	ISOLATE
	{
		SampleMultiplexedSignals(
			KernelConfig,
			SignalBuffer0, SignalBuffer1, SignalBuffer2, SignalBuffer3,
			SampleBufferUV,
			/* out */ FetchedSceneMetadata,
			/* out */ FetchedSignals);
	}

	// Accumulate the samples, giving full freedom for shader compiler scheduler to put instructions in most optimal way.
	{
		FSSDSignalArray DecodedSamples;
		FSSDSignalFrequencyArray DecodedFrequencies;
		UncompressMultiplexedSignals(
			KernelConfig,
			SampleBufferUV,
			FetchedSignals,
			/* out */ DecodedSamples,
			/* out */ DecodedFrequencies);

		FSSDSampleSceneInfos RefSceneInfos = UncompressRefSceneMetadata(KernelConfig);

		FSSDSampleSceneInfos SampleSceneInfos = UncompressSampleSceneMetadata(
			KernelConfig,
			SampleBufferUV,
			FetchedSceneMetadata);

		AccumulateSampledMultiplexedSignals(
			KernelConfig,
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators,
			RefSceneInfos,
			SampleBufferUV,
			SampleSceneInfos,
			DecodedSamples,
			DecodedFrequencies,
			KernelSampleWeight,
			bForceSample,
			bSampleIsOutsideViewport);
	}
} // SampleAndAccumulateMultiplexedSignals()
|
|
|
|
/**
 * Samples two locations and accumulates both of them into the accumulators.
 *
 * Both samples are fetched first, then uncompressed, then accumulated in order (index 0, then
 * index 1). bForceSample is always false for the pair.
 *
 * @param KernelConfig              Kernel configuration, forwarded to every helper.
 * @param SignalBuffer0..3          Signal textures handed to SampleMultiplexedSignals().
 * @param UncompressedAccumulators  Accumulators, updated in place.
 * @param CompressedAccumulators    Compressed accumulators, updated in place.
 * @param SampleBufferUV            Buffer UV of each of the two samples.
 * @param KernelSampleWeight        Kernel weight shared by both samples.
 */
void SampleAndAccumulateMultiplexedSignalsPair(
	FSSDKernelConfig KernelConfig,
	FSSDTexture2D SignalBuffer0,
	FSSDTexture2D SignalBuffer1,
	FSSDTexture2D SignalBuffer2,
	FSSDTexture2D SignalBuffer3,
	inout FSSDSignalAccumulatorArray UncompressedAccumulators,
	inout FSSDCompressedSignalAccumulatorArray CompressedAccumulators,
	float2 SampleBufferUV[2],
	float KernelSampleWeight)
{
	FSSDCompressedSceneInfos PackedSceneInfos[2];
	FSSDCompressedMultiplexedSample PackedSignals[2];
	bool bOutsideViewport[2];

	// Keep the signal and metadata fetches of both samples together so they can overlap
	// instead of being issued serially.
	ISOLATE
	{
		UNROLL_N(2)
		for (uint FetchIdx = 0; FetchIdx < 2; FetchIdx++)
		{
			// Evaluate the viewport test up front (intended to live in SGPR), so SampleBufferUV
			// does not need to be kept around in VGPR after the texture fetches just for it.
			bOutsideViewport[FetchIdx] = IsOutsideViewport(KernelConfig, SampleBufferUV[FetchIdx]);

			SampleMultiplexedSignals(
				KernelConfig,
				SignalBuffer0, SignalBuffer1, SignalBuffer2, SignalBuffer3,
				SampleBufferUV[FetchIdx],
				/* out */ PackedSceneInfos[FetchIdx],
				/* out */ PackedSignals[FetchIdx]);
		}
	}

	// From here on nothing is isolated: the compiler is free to schedule the accumulation.

	// Decode the multiplexed signals of both samples.
	FSSDSignalArray PairSamples[2];
	FSSDSignalFrequencyArray PairFrequencies[2];

	UNROLL_N(2)
	for (uint DecodeIdx = 0; DecodeIdx < 2; DecodeIdx++)
	{
		UncompressMultiplexedSignals(
			KernelConfig,
			SampleBufferUV[DecodeIdx],
			PackedSignals[DecodeIdx],
			/* out */ PairSamples[DecodeIdx],
			/* out */ PairFrequencies[DecodeIdx]);
	}

	// Optionally make both samples share the smallest WorldBluringRadius of the pair, per signal.
	if (KernelConfig.bMinSamplePairInvFrequency)
	{
		UNROLL_N(SIGNAL_ARRAY_SIZE)
		for (uint SignalIdx = 0; SignalIdx < SIGNAL_ARRAY_SIZE; SignalIdx++)
		{
			float SharedRadius = min(
				PairFrequencies[0].Array[SignalIdx].WorldBluringRadius,
				PairFrequencies[1].Array[SignalIdx].WorldBluringRadius);

			// Only override when the minimum is strictly positive.
			FLATTEN
			if (SharedRadius > 0)
			{
				PairFrequencies[0].Array[SignalIdx].WorldBluringRadius = SharedRadius;
				PairFrequencies[1].Array[SignalIdx].WorldBluringRadius = SharedRadius;
			}
		}
	}

	FSSDSampleSceneInfos RefSceneInfos = UncompressRefSceneMetadata(KernelConfig);

	UNROLL_N(2)
	for (uint AccumulateIdx = 0; AccumulateIdx < 2; AccumulateIdx++)
	{
		FSSDSampleSceneInfos SampleSceneInfos = UncompressSampleSceneMetadata(
			KernelConfig, SampleBufferUV[AccumulateIdx], PackedSceneInfos[AccumulateIdx]);

		AccumulateSampledMultiplexedSignals(
			KernelConfig,
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators,
			RefSceneInfos,
			SampleBufferUV[AccumulateIdx],
			SampleSceneInfos,
			PairSamples[AccumulateIdx],
			PairFrequencies[AccumulateIdx],
			KernelSampleWeight,
			/* bForceSample = */ false,
			bOutsideViewport[AccumulateIdx]);
	}
} // SampleAndAccumulateMultiplexedSignalsPair()
|
|
|
|
/**
 * Calls the per-signal StartAccumulatingCluster() on every entry of the accumulator array.
 *
 * When accumulator VGPR compression is disabled, UncompressedAccumulators is updated directly.
 * Otherwise CompressedAccumulators is uncompressed, updated and compressed back.
 *
 * @param KernelConfig              Kernel configuration, used to uncompress the reference scene metadata.
 * @param UncompressedAccumulators  Accumulators used when compression is disabled.
 * @param CompressedAccumulators    Accumulators used when compression is enabled.
 * @param ClusterInfo               Cluster description forwarded to each per-signal call.
 */
void StartAccumulatingCluster(
	FSSDKernelConfig KernelConfig,
	inout FSSDSignalAccumulatorArray UncompressedAccumulators,
	inout FSSDCompressedSignalAccumulatorArray CompressedAccumulators,
	FSSDSampleClusterInfo ClusterInfo)
{
	FSSDSampleSceneInfos RefSceneInfos = UncompressRefSceneMetadata(KernelConfig);

#if CONFIG_ACCUMULATOR_VGPR_COMPRESSION == ACCUMULATOR_COMPRESSION_DISABLED
	UNROLL_N(SIGNAL_ARRAY_SIZE)
	for (uint SignalIdx = 0; SignalIdx < SIGNAL_ARRAY_SIZE; SignalIdx++)
	{
		StartAccumulatingCluster(
			RefSceneInfos,
			/* inout */ UncompressedAccumulators.Array[SignalIdx],
			ClusterInfo);
	}
#else
	// Round trip through the uncompressed representation.
	FSSDSignalAccumulatorArray WorkingAccumulators = UncompressAccumulatorArray(CompressedAccumulators, CONFIG_ACCUMULATOR_VGPR_COMPRESSION);

	UNROLL_N(SIGNAL_ARRAY_SIZE)
	for (uint SignalIdx = 0; SignalIdx < SIGNAL_ARRAY_SIZE; SignalIdx++)
	{
		StartAccumulatingCluster(
			RefSceneInfos,
			/* inout */ WorkingAccumulators.Array[SignalIdx],
			ClusterInfo);
	}

	CompressedAccumulators = CompressAccumulatorArray(WorkingAccumulators, CONFIG_ACCUMULATOR_VGPR_COMPRESSION);
#endif
}
|
|
|
|
/**
 * Calls the per-signal DijestAccumulatedClusterSamples() on every entry of the accumulator array.
 *
 * When accumulator VGPR compression is disabled, UncompressedAccumulators is updated directly.
 * Otherwise CompressedAccumulators is uncompressed, updated and compressed back.
 *
 * @param UncompressedAccumulators  Accumulators used when compression is disabled.
 * @param CompressedAccumulators    Accumulators used when compression is enabled.
 * @param RingId                    Ring index forwarded to each per-signal call.
 * @param SampleCount               Sample count forwarded to each per-signal call.
 */
void DijestAccumulatedClusterSamples(
	inout FSSDSignalAccumulatorArray UncompressedAccumulators,
	inout FSSDCompressedSignalAccumulatorArray CompressedAccumulators,
	uint RingId, uint SampleCount)
{
#if CONFIG_ACCUMULATOR_VGPR_COMPRESSION == ACCUMULATOR_COMPRESSION_DISABLED
	UNROLL_N(SIGNAL_ARRAY_SIZE)
	for (uint SignalIdx = 0; SignalIdx < SIGNAL_ARRAY_SIZE; SignalIdx++)
	{
		DijestAccumulatedClusterSamples(
			/* inout */ UncompressedAccumulators.Array[SignalIdx],
			RingId, SampleCount);
	}
#else
	// Round trip through the uncompressed representation.
	FSSDSignalAccumulatorArray WorkingAccumulators = UncompressAccumulatorArray(CompressedAccumulators, CONFIG_ACCUMULATOR_VGPR_COMPRESSION);

	UNROLL_N(SIGNAL_ARRAY_SIZE)
	for (uint SignalIdx = 0; SignalIdx < SIGNAL_ARRAY_SIZE; SignalIdx++)
	{
		DijestAccumulatedClusterSamples(
			/* inout */ WorkingAccumulators.Array[SignalIdx],
			RingId, SampleCount);
	}

	CompressedAccumulators = CompressAccumulatorArray(WorkingAccumulators, CONFIG_ACCUMULATOR_VGPR_COMPRESSION);
#endif
}
|
|
|
|
/**
 * Accumulates the kernel center (KernelConfig.BufferUV) as a cluster made of that single sample.
 *
 * Sequence: start a cluster for ring 0, sample the center with a kernel weight of 1.0, then
 * digest the cluster with a sample count of 1. The center sample is forced when
 * KernelConfig.bForceKernelCenterAccumulation is set.
 *
 * @param KernelConfig              Kernel configuration.
 * @param SignalBuffer0..3          Signal textures to sample.
 * @param UncompressedAccumulators  Accumulators, updated in place.
 * @param CompressedAccumulators    Compressed accumulators, updated in place.
 */
void SampleAndAccumulateCenterSampleAsItsOwnCluster(
	FSSDKernelConfig KernelConfig,
	FSSDTexture2D SignalBuffer0,
	FSSDTexture2D SignalBuffer1,
	FSSDTexture2D SignalBuffer2,
	FSSDTexture2D SignalBuffer3,
	inout FSSDSignalAccumulatorArray UncompressedAccumulators,
	inout FSSDCompressedSignalAccumulatorArray CompressedAccumulators)
{
	// The center sample belongs to ring 0.
	const uint CenterRingId = 0;

	FSSDSampleClusterInfo CenterCluster;
	CenterCluster.OutterBoundaryRadius = (CenterRingId + 1) * KernelConfig.KernelSpreadFactor;

	// 1. Open the cluster.
	StartAccumulatingCluster(
		KernelConfig,
		/* inout */ UncompressedAccumulators,
		/* inout */ CompressedAccumulators,
		CenterCluster);

	// 2. Sample and accumulate the center itself.
	SampleAndAccumulateMultiplexedSignals(
		KernelConfig,
		SignalBuffer0, SignalBuffer1, SignalBuffer2, SignalBuffer3,
		/* inout */ UncompressedAccumulators,
		/* inout */ CompressedAccumulators,
		KernelConfig.BufferUV,
		/* KernelSampleWeight = */ 1.0,
		/* bForceSample = */ KernelConfig.bForceKernelCenterAccumulation);

	// 3. Close the cluster: exactly one sample was accumulated.
	DijestAccumulatedClusterSamples(
		/* inout */ UncompressedAccumulators,
		/* inout */ CompressedAccumulators,
		CenterRingId, /* SampleCount = */ 1);
}
|
|
|
|
|
|
//------------------------------------------------------- EASY CONVOLUTIONS
|
|
|
|
#if COMPILE_BOX_KERNEL
|
|
|
|
/**
 * Accumulates the 2x2 bilinear footprint around KernelConfig.BufferUV.
 *
 * Two modes:
 *  - Stochastic (SAMPLE_SET_2X2_STOCASTIC, or SAMPLE_SET_2X2_ADAPTIVE on a non dynamic pixel):
 *    a single tap of the footprint is selected using KernelConfig.Randoms[0] against the running
 *    sum of the bilinear weights, and accumulated with a weight of 1.0.
 *  - Otherwise: the four taps are accumulated, each with its bilinear weight.
 *
 * @param KernelConfig              Kernel configuration.
 * @param SignalBuffer0..3          Signal textures to sample.
 * @param UncompressedAccumulators  Accumulators, updated in place.
 * @param CompressedAccumulators    Compressed accumulators, updated in place.
 */
void AccumulateBilinear2x2Kernel(
	FSSDKernelConfig KernelConfig,
	FSSDTexture2D SignalBuffer0,
	FSSDTexture2D SignalBuffer1,
	FSSDTexture2D SignalBuffer2,
	FSSDTexture2D SignalBuffer3,
	inout FSSDSignalAccumulatorArray UncompressedAccumulators,
	inout FSSDCompressedSignalAccumulatorArray CompressedAccumulators)
{
	const float MipLevelPow2 = 1;

	FBilinearSampleInfos BilinearInfos = GetBilinearSampleLevelInfosEx(
		KernelConfig.BufferUV,
		KernelConfig.BufferSizeAndInvSize.xy,
		KernelConfig.BufferSizeAndInvSize.zw,
		MipLevelPow2, rcp(MipLevelPow2));

	// Always stochastic for the stochastic sample set; for the adaptive one, only when the pixel is not dynamic.
	const bool bPickSingleTap =
		KernelConfig.SampleSet == SAMPLE_SET_2X2_STOCASTIC ||
		(KernelConfig.SampleSet == SAMPLE_SET_2X2_ADAPTIVE && !KernelConfig.bIsDynamicPixel);

	float2 TapBufferUV[4];
	float TapWeight[4];

	FLATTEN
	if (bPickSingleTap)
	{
		float2 PickedOffset = 0;
		float CumulativeWeight = 0.0;

		UNROLL_N(4)
		for (uint PickIdx = 0; PickIdx < 4; PickIdx++)
		{
			// All entries are cleared; entry 0 is overwritten with the picked tap after the loop.
			TapWeight[PickIdx] = 0.0;
			TapBufferUV[PickIdx] = 0.0;

			// Keep moving to the next tap for as long as the random number is above the
			// weight accumulated so far.
			FLATTEN
			if (KernelConfig.Randoms[0] > CumulativeWeight)
			{
				PickedOffset = BilinearSamplingOffsets2x2[PickIdx];
			}

			CumulativeWeight += GetSampleWeight(BilinearInfos, PickIdx);
		}

		// TODO(Denoiser): could be more ALU efficient for this.
		// TODO(Denoiser): -0.5 full res pixel to ensure always select the mip, regardless of mantissa precision?
		TapBufferUV[0] = (BilinearInfos.TopLeftPixelCoord + (PickedOffset + 0.5)) * MipLevelPow2 * KernelConfig.BufferSizeAndInvSize.zw;
		TapWeight[0] = 1.0;
	}
	else
	{
		UNROLL_N(4)
		for (uint TapIdx = 0; TapIdx < 4; TapIdx++)
		{
			float2 TapOffset = BilinearSamplingOffsets2x2[TapIdx];

			TapWeight[TapIdx] = GetSampleWeight(BilinearInfos, TapIdx);

			// TODO(Denoiser): could be more ALU efficient for this.
			// TODO(Denoiser): -0.5 full res pixel to ensure always select the mip, regardless of mantissa precision?
			TapBufferUV[TapIdx] = (BilinearInfos.TopLeftPixelCoord + (TapOffset + 0.5)) * MipLevelPow2 * KernelConfig.BufferSizeAndInvSize.zw;
		}
	}

	// Entry 0 is accumulated in both modes.
	SampleAndAccumulateMultiplexedSignals(
		KernelConfig,
		SignalBuffer0, SignalBuffer1, SignalBuffer2, SignalBuffer3,
		/* inout */ UncompressedAccumulators,
		/* inout */ CompressedAccumulators,
		TapBufferUV[0],
		TapWeight[0],
		/* bForceSample = */ false);

	// The three remaining taps are only needed for the full bilinear footprint.
	BRANCH
	if (!bPickSingleTap)
	{
		UNROLL_N(3)
		for (uint RemainingTapIdx = 1; RemainingTapIdx < 4; RemainingTapIdx++)
		{
			SampleAndAccumulateMultiplexedSignals(
				KernelConfig,
				SignalBuffer0, SignalBuffer1, SignalBuffer2, SignalBuffer3,
				/* inout */ UncompressedAccumulators,
				/* inout */ CompressedAccumulators,
				TapBufferUV[RemainingTapIdx],
				TapWeight[RemainingTapIdx],
				/* bForceSample = */ false);
		}
	}
} // AccumulateBilinear2x2Kernel()
|
|
|
|
/**
 * Computes where to sample, and with which kernel weight, for the tap at integer offset (x, y)
 * of the square kernel.
 *
 * @param KernelConfig        Kernel configuration.
 * @param x, y                Integer tap offset relative to the kernel center.
 * @param OutSampleBufferUV   Buffer UV of the tap.
 * @param OutKernelWeight     1 unless the sample set is SAMPLE_SET_5X5_WAVELET.
 */
void ComputeSquareKernelTap(
	FSSDKernelConfig KernelConfig,
	int x, int y,
	out float2 OutSampleBufferUV,
	out float OutKernelWeight)
{
	float2 TapOffset = float2(x, y);
	if (KernelConfig.SampleSet == SAMPLE_SET_3X3_SOBEK2018)
	{
		// This sample set transforms the offset by a fixed 2x2 matrix.
		TapOffset = mul(float2x2(float2(2, -1), float2(1, 2)), TapOffset);
	}

	OutSampleBufferUV = KernelConfig.BufferUV + (TapOffset * KernelConfig.KernelSpreadFactor) * KernelConfig.BufferSizeAndInvSize.zw;

	OutKernelWeight = 1;
	if (KernelConfig.SampleSet == SAMPLE_SET_5X5_WAVELET)
	{
		// Separable wavelet weights, normalized so that the center tap has a weight of 1.
		OutKernelWeight =
			kWaveletFilterWeights5x5[abs(x)] *
			kWaveletFilterWeights5x5[abs(y)] *
			rcp(kWaveletFilterWeights5x5[0] * kWaveletFilterWeights5x5[0]);
	}
}

/**
 * Accumulates every tap of a (2 * Radius + 1)^2 square around KernelConfig.BufferUV.
 *
 * Radius is 2 for SAMPLE_SET_5X5_WAVELET, KernelConfig.BoxKernelRadius for SAMPLE_SET_NXN and
 * 1 otherwise. The center tap is skipped unless KernelConfig.bSampleKernelCenter, and is forced
 * when KernelConfig.bForceKernelCenterAccumulation. KernelConfig.bUnroll selects between fully
 * unrolled loops and real loops; both visit the same taps in the same order.
 *
 * @param KernelConfig              Kernel configuration.
 * @param SignalBuffer0..3          Signal textures to sample.
 * @param UncompressedAccumulators  Accumulators, updated in place.
 * @param CompressedAccumulators    Compressed accumulators, updated in place.
 */
void AccumulateSquareKernel(
	FSSDKernelConfig KernelConfig,
	FSSDTexture2D SignalBuffer0,
	FSSDTexture2D SignalBuffer1,
	FSSDTexture2D SignalBuffer2,
	FSSDTexture2D SignalBuffer3,
	inout FSSDSignalAccumulatorArray UncompressedAccumulators,
	inout FSSDCompressedSignalAccumulatorArray CompressedAccumulators)
{
	int KernelRadius = 1;
	if (KernelConfig.SampleSet == SAMPLE_SET_5X5_WAVELET)
	{
		KernelRadius = 2;
	}
	else if (KernelConfig.SampleSet == SAMPLE_SET_NXN)
	{
		KernelRadius = KernelConfig.BoxKernelRadius;
	}

	if (KernelConfig.bUnroll)
	{
		UNROLL for (int x = -KernelRadius; x <= KernelRadius; x++)
		{
			UNROLL for (int y = -KernelRadius; y <= KernelRadius; y++)
			{
				const bool bIsKernelCenterSample = x == 0 && y == 0;

				if (bIsKernelCenterSample && !KernelConfig.bSampleKernelCenter)
				{
					continue;
				}

				float2 TapBufferUV;
				float TapKernelWeight;
				ComputeSquareKernelTap(KernelConfig, x, y, /* out */ TapBufferUV, /* out */ TapKernelWeight);

				SampleAndAccumulateMultiplexedSignals(
					KernelConfig,
					SignalBuffer0, SignalBuffer1, SignalBuffer2, SignalBuffer3,
					/* inout */ UncompressedAccumulators,
					/* inout */ CompressedAccumulators,
					TapBufferUV,
					TapKernelWeight,
					/* bForceSample = */ bIsKernelCenterSample && KernelConfig.bForceKernelCenterAccumulation);
			}
		}
	}
	else
	{
		// TODO(Denoiser): latency hiding of this is terrible.
		LOOP for (int x = -KernelRadius; x <= KernelRadius; x++)
		{
			LOOP for (int y = -KernelRadius; y <= KernelRadius; y++)
			{
				const bool bIsKernelCenterSample = x == 0 && y == 0;

				if (bIsKernelCenterSample && !KernelConfig.bSampleKernelCenter)
				{
					continue;
				}

				float2 TapBufferUV;
				float TapKernelWeight;
				ComputeSquareKernelTap(KernelConfig, x, y, /* out */ TapBufferUV, /* out */ TapKernelWeight);

				SampleAndAccumulateMultiplexedSignals(
					KernelConfig,
					SignalBuffer0, SignalBuffer1, SignalBuffer2, SignalBuffer3,
					/* inout */ UncompressedAccumulators,
					/* inout */ CompressedAccumulators,
					TapBufferUV,
					TapKernelWeight,
					/* bForceSample = */ bIsKernelCenterSample && KernelConfig.bForceKernelCenterAccumulation);
			}
		}
	}
} // AccumulateSquareKernel()
|
|
|
|
#if CONFIG_ENABLE_WAVE_BROADCAST

/**
 * One step of the center broadcast: replaces the carried sample (UV, scene metadata, signals and
 * frequencies) with the result of a wave broadcast using BroadcastSettings, then accumulates it
 * when bAccumulate is set. The broadcast itself always happens, so the carried sample keeps
 * moving from one step to the next.
 */
void BroadcastAndAccumulateCarriedSample(
	FSSDKernelConfig KernelConfig,
	inout FSSDSignalAccumulatorArray UncompressedAccumulators,
	inout FSSDCompressedSignalAccumulatorArray CompressedAccumulators,
	FSSDSampleSceneInfos RefSceneMetadata,
	FWaveBroadcastSettings BroadcastSettings,
	bool bAccumulate,
	inout float2 CarriedBufferUV,
	inout FSSDSampleSceneInfos CarriedSceneMetadata,
	inout FSSDSignalArray CarriedSamples,
	inout FSSDSignalFrequencyArray CarriedFrequencies)
{
	CarriedBufferUV = WaveBroadcast(BroadcastSettings, CarriedBufferUV);
	CarriedSceneMetadata = WaveBroadcastSceneMetadata(BroadcastSettings, CarriedSceneMetadata);
	CarriedSamples = WaveBroadcastSignalArray(BroadcastSettings, CarriedSamples);
	CarriedFrequencies = WaveBroadcastSignalFrequenciesArray(BroadcastSettings, CarriedFrequencies);

	if (bAccumulate)
	{
		AccumulateSampledMultiplexedSignals(
			KernelConfig,
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators,
			RefSceneMetadata,
			CarriedBufferUV,
			CarriedSceneMetadata,
			CarriedSamples,
			CarriedFrequencies,
			/* KernelWeight = */ 1.0,
			/* bForceSample = */ false,
			/* bIsOutsideFrustum = */ false);
	}
}

#endif // CONFIG_ENABLE_WAVE_BROADCAST

/**
 * Shares the already sampled kernel center between lanes through wave broadcasts, and
 * accumulates the received samples instead of fetching them from the textures.
 *
 * Three broadcasts are chained: X (lane group size 2), Y (lane group size 16), then X again.
 * The samples received after the 1st and 3rd broadcasts are accumulated for SAMPLE_SET_3X3 and
 * SAMPLE_SET_3X3_PLUS, the one received after the 2nd for SAMPLE_SET_3X3 and SAMPLE_SET_3X3_CROSS.
 * NOTE(review): presumably these are the horizontal, diagonal and vertical neighbors within a
 * 2x2 pixel quad -- depends on the lane layout, confirm against the dispatch.
 *
 * Does nothing when CONFIG_ENABLE_WAVE_BROADCAST is disabled.
 *
 * @param KernelConfig                   Kernel configuration.
 * @param UncompressedAccumulators       Accumulators, updated in place.
 * @param CompressedAccumulators         Compressed accumulators, updated in place.
 * @param RefSceneMetadata               Scene metadata of the reference pixel.
 * @param SampleBufferUV                 Buffer UV of this lane's center sample.
 * @param SampleSceneMetadata            Scene metadata of this lane's center sample.
 * @param SampleMultiplexedSamples       Signals of this lane's center sample.
 * @param SampleMultiplexedFrequencies   Frequencies of this lane's center sample.
 */
void BroadcastAccumulateSquare3x3KernelCenter(
	FSSDKernelConfig KernelConfig,
	inout FSSDSignalAccumulatorArray UncompressedAccumulators,
	inout FSSDCompressedSignalAccumulatorArray CompressedAccumulators,
	FSSDSampleSceneInfos RefSceneMetadata,
	float2 SampleBufferUV,
	FSSDSampleSceneInfos SampleSceneMetadata,
	FSSDSignalArray SampleMultiplexedSamples,
	FSSDSignalFrequencyArray SampleMultiplexedFrequencies)
{
#if CONFIG_ENABLE_WAVE_BROADCAST
	const FWaveBroadcastSettings BroadcastSettingsX = InitWaveSwapWithinLaneGroup(/* LaneGroupSize = */ 2);
	const FWaveBroadcastSettings BroadcastSettingsY = InitWaveSwapWithinLaneGroup(/* LaneGroupSize = */ 16);

	const bool bIsFull3x3 = KernelConfig.SampleSet == SAMPLE_SET_3X3;
	const bool bAccumulateAfterX = bIsFull3x3 || KernelConfig.SampleSet == SAMPLE_SET_3X3_PLUS;
	const bool bAccumulateAfterY = bIsFull3x3 || KernelConfig.SampleSet == SAMPLE_SET_3X3_CROSS;

	// Broadcast X.
	BroadcastAndAccumulateCarriedSample(
		KernelConfig,
		/* inout */ UncompressedAccumulators,
		/* inout */ CompressedAccumulators,
		RefSceneMetadata,
		BroadcastSettingsX,
		bAccumulateAfterX,
		/* inout */ SampleBufferUV,
		/* inout */ SampleSceneMetadata,
		/* inout */ SampleMultiplexedSamples,
		/* inout */ SampleMultiplexedFrequencies);

	// Broadcast Y.
	BroadcastAndAccumulateCarriedSample(
		KernelConfig,
		/* inout */ UncompressedAccumulators,
		/* inout */ CompressedAccumulators,
		RefSceneMetadata,
		BroadcastSettingsY,
		bAccumulateAfterY,
		/* inout */ SampleBufferUV,
		/* inout */ SampleSceneMetadata,
		/* inout */ SampleMultiplexedSamples,
		/* inout */ SampleMultiplexedFrequencies);

	// Broadcast X again.
	BroadcastAndAccumulateCarriedSample(
		KernelConfig,
		/* inout */ UncompressedAccumulators,
		/* inout */ CompressedAccumulators,
		RefSceneMetadata,
		BroadcastSettingsX,
		bAccumulateAfterX,
		/* inout */ SampleBufferUV,
		/* inout */ SampleSceneMetadata,
		/* inout */ SampleMultiplexedSamples,
		/* inout */ SampleMultiplexedFrequencies);
#endif // CONFIG_ENABLE_WAVE_BROADCAST
} // BroadcastAccumulateSquare3x3KernelCenter()
|
|
|
|
/**
 * Accumulates a 3x3 neighborhood around KernelConfig.BufferUV, for the sample sets
 * SAMPLE_SET_3X3 (all neighbors), SAMPLE_SET_3X3_PLUS and SAMPLE_SET_3X3_CROSS.
 *
 * With CONFIG_ENABLE_WAVE_BROADCAST:
 *  - the center (when KernelConfig.bSampleKernelCenter) is sampled once, force accumulated, and
 *    handed to BroadcastAccumulateSquare3x3KernelCenter();
 *  - the remaining neighbors are fetched from the textures, with their offsets mirrored
 *    according to the parity of the output pixel coordinate.
 *
 * Without it:
 *  - the center is accumulated through SampleAndAccumulateCenterSampleAsItsOwnCluster();
 *  - neighbors are fetched as pairs of opposite offsets (+Offset and -Offset).
 *
 * @param KernelConfig              Kernel configuration.
 * @param SignalBuffer0..3          Signal textures to sample.
 * @param UncompressedAccumulators  Accumulators, updated in place.
 * @param CompressedAccumulators    Compressed accumulators, updated in place.
 */
void AccumulateSquare3x3Kernel(
	FSSDKernelConfig KernelConfig,
	FSSDTexture2D SignalBuffer0,
	FSSDTexture2D SignalBuffer1,
	FSSDTexture2D SignalBuffer2,
	FSSDTexture2D SignalBuffer3,
	inout FSSDSignalAccumulatorArray UncompressedAccumulators,
	inout FSSDCompressedSignalAccumulatorArray CompressedAccumulators)
{
#if CONFIG_ENABLE_WAVE_BROADCAST
	if (KernelConfig.bSampleKernelCenter)
	{
		float2 CenterBufferUV = KernelConfig.BufferUV;

		// TODO(Denoiser):
		const bool bCenterIsOutsideFrustum = false;

		FSSDCompressedSceneInfos PackedCenterSceneInfos;
		FSSDCompressedMultiplexedSample PackedCenterSignals;

		// Keep the signal and metadata fetches together so they can overlap instead of being
		// issued serially.
		ISOLATE
		{
			SampleMultiplexedSignals(
				KernelConfig,
				SignalBuffer0, SignalBuffer1, SignalBuffer2, SignalBuffer3,
				CenterBufferUV,
				/* out */ PackedCenterSceneInfos,
				/* out */ PackedCenterSignals);
		}

		FSSDSampleSceneInfos RefSceneInfos = UncompressRefSceneMetadata(KernelConfig);

		FSSDSignalArray CenterSamples;
		FSSDSignalFrequencyArray CenterFrequencies;
		UncompressMultiplexedSignals(
			KernelConfig,
			CenterBufferUV,
			PackedCenterSignals,
			/* out */ CenterSamples,
			/* out */ CenterFrequencies);

		FSSDSampleSceneInfos CenterSceneInfos = UncompressSampleSceneMetadata(
			KernelConfig, CenterBufferUV, PackedCenterSceneInfos);

		// This lane's own center is always force accumulated.
		AccumulateSampledMultiplexedSignals(
			KernelConfig,
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators,
			RefSceneInfos,
			CenterBufferUV,
			CenterSceneInfos,
			CenterSamples,
			CenterFrequencies,
			/* KernelWeight = */ 1.0,
			/* bForceSample = */ true,
			bCenterIsOutsideFrustum);

		// Exchange the center with other lanes rather than fetching it again.
		BroadcastAccumulateSquare3x3KernelCenter(
			KernelConfig,
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators,
			RefSceneInfos,
			CenterBufferUV,
			CenterSceneInfos,
			CenterSamples,
			CenterFrequencies);
	}

	// Store whether needs to flip offsets to have lowest VGPR pressure: odd pixel coordinates
	// mirror the offsets on the matching axis.
	const uint2 OutputPixelPosition = BufferUVToBufferPixelCoord(KernelConfig.RefBufferUV);
	const bool bFlipX = (OutputPixelPosition.x & 0x1) != 0;
	const bool bFlipY = (OutputPixelPosition.y & 0x1) != 0;

	// Single diagonal neighbor, only part of the full 3x3 and of the cross.
	if (KernelConfig.SampleSet == SAMPLE_SET_3X3 || KernelConfig.SampleSet == SAMPLE_SET_3X3_CROSS)
	{
		float2 DiagonalOffset = float2(bFlipX ? 1.0 : -1.0, bFlipY ? 1.0 : -1.0);

		float2 DiagonalBufferUV = KernelConfig.BufferUV + DiagonalOffset * KernelConfig.BufferSizeAndInvSize.zw;

		SampleAndAccumulateMultiplexedSignals(
			KernelConfig,
			SignalBuffer0, SignalBuffer1, SignalBuffer2, SignalBuffer3,
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators,
			DiagonalBufferUV,
			/* KernelWeight = */ 1.0,
			/* bForceSample = */ false);
	}

	// Unmirrored offsets, two per batch: batch 0 holds the axis aligned pair (plus), batch 1 the
	// diagonal pair (cross).
	static const float2 kBroadcastPairOffsets[4] = {
		float2(-1.0,  0.0),
		float2( 0.0, -1.0),
		float2(-1.0,  1.0),
		float2( 1.0, -1.0),
	};

	// The cross skips batch 0, the plus skips batch 1, the full 3x3 does both.
	const uint FirstPairBatchId = (KernelConfig.SampleSet == SAMPLE_SET_3X3_CROSS ? 1 : 0);
	const uint EndPairBatchId = (KernelConfig.SampleSet == SAMPLE_SET_3X3_PLUS ? 1 : 2);

	UNROLL
	for (uint PairBatchId = FirstPairBatchId; PairBatchId < EndPairBatchId; PairBatchId++)
	ISOLATE
	{
		const bool2 bFlip = bool2(bFlipX, bFlipY);

		float2 PairOffset0 = select(bFlip, -kBroadcastPairOffsets[PairBatchId * 2 + 0], kBroadcastPairOffsets[PairBatchId * 2 + 0]);
		float2 PairOffset1 = select(bFlip, -kBroadcastPairOffsets[PairBatchId * 2 + 1], kBroadcastPairOffsets[PairBatchId * 2 + 1]);

		float2 PairBufferUV[2];
		PairBufferUV[0] = KernelConfig.BufferUV + PairOffset0 * KernelConfig.BufferSizeAndInvSize.zw;
		PairBufferUV[1] = KernelConfig.BufferUV + PairOffset1 * KernelConfig.BufferSizeAndInvSize.zw;

		SampleAndAccumulateMultiplexedSignalsPair(
			KernelConfig,
			SignalBuffer0, SignalBuffer1, SignalBuffer2, SignalBuffer3,
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators,
			PairBufferUV,
			/* KernelWeight = */ 1.0);
	}
#else // !CONFIG_ENABLE_WAVE_BROADCAST
	if (KernelConfig.bSampleKernelCenter)
	{
		SampleAndAccumulateCenterSampleAsItsOwnCluster(
			KernelConfig,
			SignalBuffer0, SignalBuffer1, SignalBuffer2, SignalBuffer3,
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators);
	}

	// One offset per pair, the second sample of the pair being its opposite. Even entries are
	// axis aligned (plus), odd entries are diagonal (cross).
	static const float2 kOppositePairOffsets[4] = {
		float2(1.0, 0.0),
		float2(1.0, 1.0),
		float2(0.0, 1.0),
		float2(-1.0, 1.0),
	};

	// The cross starts on the first diagonal entry; anything but the full 3x3 steps over every
	// other entry.
	const uint FirstOffsetId = (KernelConfig.SampleSet == SAMPLE_SET_3X3_CROSS ? 1 : 0);
	const uint OffsetIdStride = (KernelConfig.SampleSet != SAMPLE_SET_3X3 ? 2 : 1);

	UNROLL
	for (uint OffsetId = FirstOffsetId; OffsetId < 4; OffsetId += OffsetIdStride)
	ISOLATE
	{
		float2 PairOffset = kOppositePairOffsets[OffsetId];

		float2 PairBufferUV[2];
		PairBufferUV[0] = KernelConfig.BufferUV + PairOffset * KernelConfig.BufferSizeAndInvSize.zw;
		PairBufferUV[1] = KernelConfig.BufferUV - PairOffset * KernelConfig.BufferSizeAndInvSize.zw;

		SampleAndAccumulateMultiplexedSignalsPair(
			KernelConfig,
			SignalBuffer0, SignalBuffer1, SignalBuffer2, SignalBuffer3,
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators,
			PairBufferUV,
			/* KernelWeight = */ 1.0);
	}
#endif // !CONFIG_ENABLE_WAVE_BROADCAST
} // AccumulateSquare3x3Kernel()
|
|
|
|
#endif // COMPILE_BOX_KERNEL
|
|
|
|
|
|
//------------------------------------------------------- STACKOWIAK 2018
|
|
|
|
#if COMPILE_STACKOWIAK_KERNEL
|
|
|
|
// First precomputed sample offset table of the Stackowiak 2018 kernel.
// Holds 56 * 4 float2 entries, written below as 56 rows of 4 entries. Every component is a
// half-integer, and the first row is the four (+-0.5, +-0.5) offsets.
// NOTE(review): presumably offsets in pixels relative to the kernel center, consumed by
// ConvolveStackowiakKernel() -- units and indexing scheme are not visible here, confirm there.
static const float2 kStackowiakSampleSet0[56 * 4] =
|
|
{
|
|
float2(-0.5, -0.5), float2(+0.5, -0.5), float2(-0.5, +0.5), float2(+0.5, +0.5),
|
|
float2(-1.5, +0.5), float2(-1.5, -0.5), float2(-0.5, +1.5), float2(+1.5, -0.5),
|
|
float2(+0.5, -1.5), float2(+2.5, -0.5), float2(+1.5, +0.5), float2(-0.5, -1.5),
|
|
float2(-1.5, -2.5), float2(-0.5, -2.5), float2(-1.5, -1.5), float2(-0.5, +2.5),
|
|
float2(-1.5, +1.5), float2(+1.5, -2.5), float2(-1.5, +2.5), float2(+1.5, +2.5),
|
|
float2(+0.5, -2.5), float2(-2.5, -0.5), float2(-2.5, -1.5), float2(-2.5, +0.5),
|
|
float2(+0.5, +1.5), float2(+0.5, +2.5), float2(-3.5, +0.5), float2(+0.5, +3.5),
|
|
float2(+1.5, -1.5), float2(+3.5, -0.5), float2(+2.5, +1.5), float2(+3.5, +0.5),
|
|
float2(+1.5, +1.5), float2(-2.5, +1.5), float2(-3.5, +2.5), float2(+3.5, +1.5),
|
|
float2(-3.5, -0.5), float2(-1.5, -3.5), float2(-2.5, -2.5), float2(-2.5, +2.5),
|
|
float2(+2.5, +0.5), float2(+2.5, +2.5), float2(+1.5, +3.5), float2(+3.5, -1.5),
|
|
float2(-3.5, -2.5), float2(+3.5, -2.5), float2(+2.5, -1.5), float2(+0.5, -3.5),
|
|
float2(-0.5, +3.5), float2(-0.5, -4.5), float2(-4.5, +0.5), float2(+4.5, +0.5),
|
|
float2(-4.5, -1.5), float2(-3.5, +1.5), float2(-0.5, -3.5), float2(+1.5, -3.5),
|
|
float2(+0.5, -4.5), float2(-1.5, +3.5), float2(+0.5, +4.5), float2(-3.5, -1.5),
|
|
float2(-4.5, +1.5), float2(+2.5, -4.5), float2(+2.5, -2.5), float2(-1.5, +4.5),
|
|
float2(-2.5, -4.5), float2(+4.5, -2.5), float2(+2.5, +3.5), float2(-3.5, +3.5),
|
|
float2(-2.5, +3.5), float2(+0.5, -5.5), float2(-4.5, +3.5), float2(-2.5, -3.5),
|
|
float2(-4.5, +2.5), float2(+3.5, +3.5), float2(+2.5, -3.5), float2(+4.5, +3.5),
|
|
float2(+3.5, -3.5), float2(+4.5, +2.5), float2(-5.5, +1.5), float2(-4.5, -0.5),
|
|
float2(+3.5, +2.5), float2(-0.5, +4.5), float2(-1.5, +5.5), float2(+1.5, +5.5),
|
|
float2(+4.5, -0.5), float2(+5.5, +0.5), float2(+4.5, +1.5), float2(-1.5, -4.5),
|
|
float2(-1.5, -5.5), float2(-4.5, -2.5), float2(-2.5, +5.5), float2(+2.5, +5.5),
|
|
float2(+1.5, +4.5), float2(+5.5, +1.5), float2(+1.5, -4.5), float2(-3.5, -3.5),
|
|
float2(+3.5, -4.5), float2(-3.5, -4.5), float2(+4.5, -1.5), float2(+4.5, -3.5),
|
|
float2(-3.5, -5.5), float2(-2.5, -5.5), float2(-4.5, -3.5), float2(+4.5, +4.5),
|
|
float2(-3.5, +4.5), float2(-2.5, +4.5), float2(-5.5, -2.5), float2(-5.5, +0.5),
|
|
float2(+2.5, -5.5), float2(+3.5, +4.5), float2(-0.5, -5.5), float2(-0.5, +6.5),
|
|
float2(+2.5, +4.5), float2(-5.5, -0.5), float2(-6.5, -1.5), float2(+1.5, -5.5),
|
|
float2(-6.5, -0.5), float2(+0.5, +5.5), float2(+1.5, +6.5), float2(+6.5, +1.5),
|
|
float2(-0.5, +5.5), float2(+6.5, -0.5), float2(-4.5, -4.5), float2(-5.5, +2.5),
|
|
float2(+5.5, -0.5), float2(-5.5, -1.5), float2(-6.5, +3.5), float2(-1.5, +6.5),
|
|
float2(-6.5, +0.5), float2(+4.5, -5.5), float2(-3.5, +6.5), float2(+6.5, -1.5),
|
|
float2(+0.5, -6.5), float2(-5.5, -3.5), float2(+5.5, -2.5), float2(+4.5, -4.5),
|
|
float2(+5.5, -1.5), float2(+3.5, -6.5), float2(+5.5, +3.5), float2(+3.5, -5.5),
|
|
float2(-5.5, -4.5), float2(+6.5, -3.5), float2(-0.5, -6.5), float2(+3.5, +6.5),
|
|
float2(-5.5, +3.5), float2(+0.5, +6.5), float2(+6.5, +0.5), float2(+6.5, -2.5),
|
|
float2(-6.5, -3.5), float2(-4.5, +4.5), float2(-7.5, -0.5), float2(+7.5, +0.5),
|
|
float2(+5.5, +2.5), float2(-0.5, -7.5), float2(+0.5, +7.5), float2(-4.5, +5.5),
|
|
float2(+3.5, +5.5), float2(-3.5, +5.5), float2(-4.5, -5.5), float2(+4.5, +6.5),
|
|
float2(+5.5, -4.5), float2(+4.5, +5.5), float2(-4.5, +6.5), float2(+6.5, +4.5),
|
|
float2(-7.5, +1.5), float2(-6.5, +1.5), float2(+5.5, -3.5), float2(-6.5, +2.5),
|
|
float2(-2.5, +6.5), float2(-1.5, -7.5), float2(+5.5, +4.5), float2(-1.5, -6.5),
|
|
float2(-3.5, -7.5), float2(+2.5, -7.5), float2(-7.5, +2.5), float2(-6.5, -2.5),
|
|
float2(-5.5, +5.5), float2(+2.5, +6.5), float2(-2.5, -6.5), float2(-7.5, +0.5),
|
|
float2(-0.5, +7.5), float2(+7.5, -2.5), float2(-2.5, +7.5), float2(+0.5, -7.5),
|
|
float2(-4.5, -7.5), float2(+7.5, +1.5), float2(+1.5, -6.5), float2(-6.5, +4.5),
|
|
float2(-1.5, +7.5), float2(-5.5, -5.5), float2(+6.5, +2.5), float2(-3.5, -6.5),
|
|
float2(+3.5, -7.5), float2(-5.5, +4.5), float2(+2.5, -6.5), float2(+1.5, -7.5),
|
|
float2(+6.5, +3.5), float2(+5.5, -6.5), float2(-6.5, +5.5), float2(+7.5, +4.5),
|
|
float2(+7.5, -1.5), float2(-7.5, -1.5), float2(+3.5, +7.5), float2(-5.5, +6.5),
|
|
float2(+1.5, +7.5), float2(+7.5, +3.5), float2(+7.5, -0.5), float2(-7.5, -2.5),
|
|
float2(+5.5, +5.5), float2(+6.5, +5.5), float2(+5.5, -5.5), float2(-2.5, -7.5),
|
|
float2(+2.5, +7.5), float2(-7.5, -3.5), float2(-7.5, -4.5), float2(-6.5, -4.5),
|
|
float2(+7.5, -3.5), float2(+5.5, +6.5), float2(-5.5, -6.5), float2(-4.5, -6.5),
|
|
float2(+7.5, +2.5), float2(-7.5, +3.5), float2(+4.5, -6.5), float2(+7.5, -4.5),
|
|
|
|
};
|
|
|
|
// Second precomputed sample offset table of the Stackowiak 2018 kernel.
// Same layout as kStackowiakSampleSet0: 56 * 4 float2 entries written as 56 rows of 4, with
// half-integer components and the same first row, but a different ordering afterwards.
// NOTE(review): what selects this table over kStackowiakSampleSet0 is not visible here --
// confirm in ConvolveStackowiakKernel().
static const float2 kStackowiakSampleSet1[56 * 4] =
|
|
{
|
|
float2(-0.5, -0.5), float2(+0.5, -0.5), float2(-0.5, +0.5), float2(+0.5, +0.5),
|
|
float2(+0.5, -1.5), float2(+1.5, -1.5), float2(-1.5, -0.5), float2(+1.5, +1.5),
|
|
float2(-0.5, -2.5), float2(-1.5, -1.5), float2(+0.5, +1.5), float2(-1.5, +0.5),
|
|
float2(+1.5, -0.5), float2(-0.5, +1.5), float2(-2.5, +0.5), float2(+0.5, +2.5),
|
|
float2(-2.5, -1.5), float2(+2.5, +0.5), float2(+1.5, +0.5), float2(-0.5, -1.5),
|
|
float2(-1.5, +1.5), float2(+2.5, -2.5), float2(-3.5, -0.5), float2(-1.5, +2.5),
|
|
float2(-2.5, +1.5), float2(-2.5, -0.5), float2(-1.5, -2.5), float2(+2.5, -1.5),
|
|
float2(-3.5, +0.5), float2(-0.5, -3.5), float2(-1.5, +3.5), float2(+0.5, -2.5),
|
|
float2(+1.5, +2.5), float2(-0.5, +2.5), float2(+0.5, +3.5), float2(+3.5, +0.5),
|
|
float2(+2.5, +1.5), float2(-2.5, -2.5), float2(+2.5, -0.5), float2(+3.5, -1.5),
|
|
float2(-0.5, +3.5), float2(+3.5, +1.5), float2(-3.5, +2.5), float2(+3.5, +2.5),
|
|
float2(+3.5, -0.5), float2(+0.5, -4.5), float2(-2.5, +3.5), float2(+0.5, -3.5),
|
|
float2(-1.5, -4.5), float2(+1.5, +3.5), float2(+1.5, -2.5), float2(-3.5, +1.5),
|
|
float2(+2.5, -3.5), float2(-2.5, -3.5), float2(+2.5, +2.5), float2(+1.5, +4.5),
|
|
float2(-4.5, -2.5), float2(-2.5, +2.5), float2(-4.5, +1.5), float2(+4.5, +1.5),
|
|
float2(-2.5, -4.5), float2(+3.5, -3.5), float2(-1.5, -3.5), float2(-3.5, -1.5),
|
|
float2(+1.5, -4.5), float2(+4.5, -2.5), float2(+1.5, -3.5), float2(-1.5, +4.5),
|
|
float2(-4.5, +2.5), float2(-4.5, -0.5), float2(+2.5, +4.5), float2(-4.5, +0.5),
|
|
float2(-3.5, -4.5), float2(+0.5, +4.5), float2(+3.5, -2.5), float2(-3.5, -2.5),
|
|
float2(-3.5, +3.5), float2(+3.5, +3.5), float2(+4.5, +0.5), float2(+0.5, +5.5),
|
|
float2(-0.5, +4.5), float2(+4.5, -3.5), float2(-1.5, +5.5), float2(-0.5, -4.5),
|
|
float2(+2.5, +3.5), float2(+4.5, +2.5), float2(-2.5, +5.5), float2(+2.5, -4.5),
|
|
float2(+4.5, -0.5), float2(+5.5, -0.5), float2(-4.5, +4.5), float2(+5.5, -1.5),
|
|
float2(-5.5, -1.5), float2(-4.5, -1.5), float2(+3.5, +4.5), float2(-3.5, -3.5),
|
|
float2(-5.5, +0.5), float2(+1.5, -5.5), float2(-5.5, -2.5), float2(-3.5, +4.5),
|
|
float2(+0.5, -5.5), float2(-2.5, -5.5), float2(+2.5, +5.5), float2(+4.5, +4.5),
|
|
float2(+4.5, -1.5), float2(-2.5, +4.5), float2(+4.5, +3.5), float2(+0.5, +6.5),
|
|
float2(-0.5, -6.5), float2(+5.5, +2.5), float2(-0.5, -5.5), float2(-5.5, -0.5),
|
|
float2(-6.5, -1.5), float2(-0.5, +5.5), float2(-0.5, +6.5), float2(+6.5, -0.5),
|
|
float2(+1.5, +5.5), float2(+1.5, -6.5), float2(+5.5, +0.5), float2(-5.5, +2.5),
|
|
float2(+5.5, +1.5), float2(-5.5, +1.5), float2(-6.5, -0.5), float2(-1.5, -5.5),
|
|
float2(-5.5, -4.5), float2(-4.5, +3.5), float2(-6.5, +1.5), float2(+2.5, -5.5),
|
|
float2(+3.5, -5.5), float2(-5.5, -3.5), float2(+1.5, +6.5), float2(+6.5, +2.5),
|
|
float2(+4.5, -4.5), float2(+3.5, -6.5), float2(-4.5, -4.5), float2(-4.5, -3.5),
|
|
float2(-6.5, +2.5), float2(+3.5, +5.5), float2(+3.5, -4.5), float2(+5.5, -3.5),
|
|
float2(-5.5, +4.5), float2(+6.5, -3.5), float2(-6.5, -2.5), float2(+5.5, +4.5),
|
|
float2(-1.5, +6.5), float2(-0.5, -7.5), float2(-6.5, +3.5), float2(-5.5, +3.5),
|
|
float2(-6.5, -4.5), float2(+7.5, -1.5), float2(-3.5, -5.5), float2(+3.5, +6.5),
|
|
float2(+5.5, +3.5), float2(+7.5, +0.5), float2(+5.5, -2.5), float2(-6.5, +0.5),
|
|
float2(-7.5, +1.5), float2(-3.5, -6.5), float2(+6.5, +0.5), float2(+7.5, +1.5),
|
|
float2(-2.5, -7.5), float2(-3.5, +5.5), float2(-7.5, -0.5), float2(-3.5, +6.5),
|
|
float2(-2.5, +6.5), float2(+4.5, -6.5), float2(-5.5, +5.5), float2(+4.5, -5.5),
|
|
float2(+6.5, -2.5), float2(+6.5, +3.5), float2(-1.5, -6.5), float2(-1.5, +7.5),
|
|
float2(+6.5, +1.5), float2(-5.5, -5.5), float2(+0.5, -6.5), float2(+7.5, +3.5),
|
|
float2(+2.5, +6.5), float2(-4.5, +5.5), float2(-6.5, -3.5), float2(-4.5, -5.5),
|
|
float2(-6.5, -5.5), float2(+5.5, -6.5), float2(-2.5, -6.5), float2(+5.5, -5.5),
|
|
float2(+4.5, +5.5), float2(-7.5, +0.5), float2(+6.5, -1.5), float2(+0.5, -7.5),
|
|
float2(+7.5, -0.5), float2(-3.5, -7.5), float2(+2.5, -6.5), float2(-3.5, +7.5),
|
|
float2(-4.5, -7.5), float2(-0.5, +7.5), float2(-6.5, +5.5), float2(+7.5, -3.5),
|
|
float2(-4.5, +6.5), float2(+1.5, +7.5), float2(+5.5, -4.5), float2(+7.5, +4.5),
|
|
float2(+0.5, +7.5), float2(+4.5, +6.5), float2(-4.5, +7.5), float2(-7.5, -1.5),
|
|
float2(+3.5, -7.5), float2(+7.5, -4.5), float2(+3.5, +7.5), float2(-1.5, -7.5),
|
|
float2(+6.5, -4.5), float2(-7.5, -3.5), float2(+6.5, +4.5), float2(+2.5, -7.5),
|
|
float2(+7.5, -2.5), float2(-7.5, +2.5), float2(+1.5, -7.5), float2(-5.5, +6.5),
|
|
float2(+5.5, +5.5), float2(-2.5, +7.5), float2(+7.5, +2.5), float2(-7.5, -2.5),
|
|
float2(+2.5, +7.5), float2(-6.5, +4.5), float2(+5.5, +6.5), float2(-4.5, -6.5),
|
|
|
|
};
|
|
|
|
// Number of sample sets. ConvolveStackowiakKernel() divides 8 by this value to get the number of
// stochastic samples per cluster.
// NOTE(review): presumably matches the 4 entries per row of the kStackowiakSampleSet* tables -- confirm.
static const uint kStackowiakSampleSetCount = 4;
|
|
// Number of samples in each set.
// NOTE(review): presumably matches the 56 rows of the kStackowiakSampleSet* tables, which are
// declared with a literal [56 * 4] size rather than with these constants -- confirm.
static const uint kStackowiakSampleCountPerSet = 56;
|
|
|
|
// Returns the buffer UV of the Stackowiak sample SampleId, read from the sample table selected by
// KernelConfig.SampleSubSetId at the track KernelConfig.SampleTrackId.
float2 ComputeStackowiakSampleBufferUV(FSSDKernelConfig KernelConfig, uint SampleId)
{
	const uint SampleTrackId = KernelConfig.SampleTrackId;
	const uint SampleIndex = kStackowiakSampleSetCount * SampleId + SampleTrackId;

	float2 SampleOffset = kStackowiakSampleSet0[SampleIndex];
	if (KernelConfig.SampleSubSetId == 1)
	{
		SampleOffset = kStackowiakSampleSet1[SampleIndex];
	}

	return KernelConfig.BufferUV + (SampleOffset * KernelConfig.KernelSpreadFactor) * KernelConfig.BufferSizeAndInvSize.zw;
}

/** Convolves the signal buffers with the Stackowiak sample tables.
 *
 * When KernelConfig.bDescOrder is set, the samples are walked from the highest sample id down to 0, and each ring of
 * samples is bracketed by StartAccumulatingCluster() / DijestAccumulatedClusterSamples(). Otherwise the kernel center
 * (when KernelConfig.bSampleKernelCenter is set) and sample 1 are accumulated first, and the remaining samples are
 * accumulated in pairs.
 */
void ConvolveStackowiakKernel(
	FSSDKernelConfig KernelConfig,
	FSSDTexture2D SignalBuffer0,
	FSSDTexture2D SignalBuffer1,
	FSSDTexture2D SignalBuffer2,
	FSSDTexture2D SignalBuffer3,
	inout FSSDSignalAccumulatorArray UncompressedAccumulators,
	inout FSSDCompressedSignalAccumulatorArray CompressedAccumulators)
{
	// Number of samples processed per batch, to improve latency hiding.
	const uint kSamplingBatchSize = 2;

	if (KernelConfig.bDescOrder)
	{
		// (SALU) Number of batches of samples to perform, rounded up.
		const uint BatchCountCount = (KernelConfig.SampleCount + (kSamplingBatchSize - 1)) / kSamplingBatchSize;

		// (SALU) Final number of samples, quantized to the sampling batch size.
		const uint SampleCount = BatchCountCount * kSamplingBatchSize;

		// Compile time number of samples between rings.
		const uint StocasticSamplesPerCluster = 8 / kStackowiakSampleSetCount;

		// Ring of the highest sample id, and sample id at which that ring must be digested.
		uint CurrentRingId = 0;
		uint NextClusterBoundary = 0;

		if (StocasticSamplesPerCluster == 2)
		{
			// With 2 samples per cluster step, ring R starts at the sample id 1 + R * (R - 1).
			uint un = SampleCount - 1;

			CurrentRingId = (uint(floor(sqrt(4 * un - 3))) + 1) / 2;

			NextClusterBoundary = 1 + CurrentRingId * (CurrentRingId - 1);
		}
		else
		{
			// TODO(Denoiser)
		}

		FSSDSampleClusterInfo ClusterInfo;
		ClusterInfo.OutterBoundaryRadius = (CurrentRingId + 1) * KernelConfig.KernelSpreadFactor;

		StartAccumulatingCluster(
			KernelConfig,
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators,
			ClusterInfo);

		// Processes the samples in batches so that the compiler can do latency hiding.
		LOOP
		for (uint BatchId = 0; BatchId < BatchCountCount; BatchId++)
		{
			UNROLL_N(2)
			for (uint SampleBatchId = 0; SampleBatchId < 2; SampleBatchId++)
			{
				// Sample ids are visited in descending order.
				uint SampleId = (BatchCountCount - BatchId) * kSamplingBatchSize - 1 - SampleBatchId;

				bool bIsKernelCenterSample = SampleId == 0 && (SampleBatchId == (kSamplingBatchSize - 1));

				float2 SampleBufferUV = ComputeStackowiakSampleBufferUV(KernelConfig, SampleId);

				SampleAndAccumulateMultiplexedSignals(
					KernelConfig,
					SignalBuffer0,
					SignalBuffer1,
					SignalBuffer2,
					SignalBuffer3,
					/* inout */ UncompressedAccumulators,
					/* inout */ CompressedAccumulators,
					SampleBufferUV,
					/* KernelWeight = */ 1.0,
					/* bForceSample = */ bIsKernelCenterSample && KernelConfig.bForceKernelCenterAccumulation);

				// Change of cluster. Can only happen on odd SampleId, meaning even SampleBatchId.
				BRANCH
				if (SampleId == NextClusterBoundary && (SampleBatchId % 2) == 0)
				{
					// Compute the number of samples that have been accumulated for this cluster.
					uint SampleCountForCluster = min(CurrentRingId * StocasticSamplesPerCluster, SampleCount - SampleId);

					// Digest all accumulators.
					DijestAccumulatedClusterSamples(
						/* inout */ UncompressedAccumulators,
						/* inout */ CompressedAccumulators,
						CurrentRingId, SampleCountForCluster);

					// Sample 0 is the only one left after sample 1: skip it when the center must not be sampled.
					BRANCH
					if (!KernelConfig.bSampleKernelCenter && SampleId == 1)
					{
						break;
					}

					// Change cluster index and boundary.
					CurrentRingId -= 1;
					NextClusterBoundary -= CurrentRingId * StocasticSamplesPerCluster;

					// Prepare the accumulators for the new cluster.
					ClusterInfo.OutterBoundaryRadius = (CurrentRingId + 1) * KernelConfig.KernelSpreadFactor;

					StartAccumulatingCluster(
						KernelConfig,
						/* inout */ UncompressedAccumulators,
						/* inout */ CompressedAccumulators,
						ClusterInfo);
				}
			} // for each sample of the batch
		} // for each batch

		// NextClusterBoundary is not capable to reach 0, therefore need to manually digest the center sample.
		if (KernelConfig.bSampleKernelCenter)
		{
			DijestAccumulatedClusterSamples(
				/* inout */ UncompressedAccumulators,
				/* inout */ CompressedAccumulators,
				/* RingId = */ 0, /* SampleCount = */ 1);
		}
	}
	else // if (!KernelConfig.bDescOrder)
	{
		if (KernelConfig.bSampleKernelCenter)
		{
			SampleAndAccumulateCenterSampleAsItsOwnCluster(
				KernelConfig,
				SignalBuffer0,
				SignalBuffer1,
				SignalBuffer2,
				SignalBuffer3,
				/* inout */ UncompressedAccumulators,
				/* inout */ CompressedAccumulators);
		}

		// Accumulate the second sample (sample id 1) to latency hide with the center sample.
		{
			float2 SampleBufferUV = ComputeStackowiakSampleBufferUV(KernelConfig, /* SampleId = */ 1);

			SampleAndAccumulateMultiplexedSignals(
				KernelConfig,
				SignalBuffer0,
				SignalBuffer1,
				SignalBuffer2,
				SignalBuffer3,
				/* inout */ UncompressedAccumulators,
				/* inout */ CompressedAccumulators,
				SampleBufferUV,
				/* KernelWeight = */ 1.0,
				/* bForceSample = */ false);
		}

		// (SALU) Number of batches of samples to perform.
		const uint BatchCountCount = (KernelConfig.SampleCount - 1) / kSamplingBatchSize;

		// Processes the samples in batches so that the compiler can do latency hiding.
		// TODO(Denoiser): kSamplingBatchSize for latency hiding
		LOOP
		for (uint BatchId = 0; BatchId < BatchCountCount; BatchId++)
		{
			float2 SampleBufferUV[2];

			UNROLL_N(2)
			for (uint SampleBatchId = 0; SampleBatchId < 2; SampleBatchId++)
			{
				// Sample ids 0 and 1 have been handled above, so the batches start at kSamplingBatchSize.
				uint SampleId = BatchId * kSamplingBatchSize + (SampleBatchId + kSamplingBatchSize);

				SampleBufferUV[SampleBatchId] = ComputeStackowiakSampleBufferUV(KernelConfig, SampleId);
			}

			SampleAndAccumulateMultiplexedSignalsPair(
				KernelConfig,
				SignalBuffer0,
				SignalBuffer1,
				SignalBuffer2,
				SignalBuffer3,
				/* inout */ UncompressedAccumulators,
				/* inout */ CompressedAccumulators,
				SampleBufferUV,
				/* KernelWeight = */ 1.0);
		} // for each batch
	} // if (!KernelConfig.bDescOrder)
} // ConvolveStackowiakKernel()
|
|
|
|
#endif // COMPILE_STACKOWIAK_KERNEL
|
|
|
|
|
|
//------------------------------------------------------- DISK
|
|
|
|
#if COMPILE_DISK_KERNEL
|
|
|
|
// Computes the position on the unit circle (radius = 1) of the sample RingSampleId of the ring RingId.
// RingSampleIteration is the number of angular steps that span half a turn (PI) of the ring.
float2 GetDiskSampleOnUnitCirle(uint RingId, uint RingSampleIteration, uint RingSampleId)
{
	// The shifts below are derived from RingId - 1. TODO(Denoiser).
	const uint RingIndex = RingId - 1;

	// Do not align the first sample of every ring on the X axis, to increase the minimal distance between all
	// samples, which reduces the variance left to clean by post filtering: odd ring indexes move by half a step.
	const float OddRingShift = (RingIndex % 2) * 0.5;

	// Extra shift that grows with the ring.
	const float PerRingShift = (RingIndex + 1) * 0.2;

	float SampleRingPos = RingSampleId;
	SampleRingPos += OddRingShift;
	SampleRingPos += PerRingShift;

	const float SampleAngle = PI * SampleRingPos / float(RingSampleIteration);

	return float2(cos(SampleAngle), sin(SampleAngle));
}
|
|
|
|
// Builds the matrix that moves a ring sample position to the next one: applied as mul(Position, Matrix), it rotates
// the position by PI / RingSampleIteration, from +X toward +Y.
float2x2 GetSampleRotationMatrix(uint RingSampleIteration)
{
	const float StepAngle = PI / float(RingSampleIteration);

	const float StepCos = cos(StepAngle);
	const float StepSin = sin(StepAngle);

	// First row is (cos, sin), second row is (-sin, cos).
	return float2x2(
		 StepCos, StepSin,
		-StepSin, StepCos);
}
|
|
|
|
// Returns the number of sample pairs to gather for the ring RingId of the given sample set.
uint GetRingSamplingPairCount(const uint SampleSet, uint RingId)
{
	// 4 pairs per ring id is carefully chosen to give the exact number of samples of a square shaped ring
	// (SquarePos); the hexaweb sample set uses 3 pairs per ring id instead.
	const uint PairCountPerRingId = (SampleSet == SAMPLE_SET_HEXAWEB) ? 3 : 4;

	return RingId * PairCountPerRingId;
}
|
|
|
|
// Returns the total number of samples of a disk kernel made of RingCount rings: the center sample, plus two samples
// for every pair of every ring. Must stay in sync with GetRingSamplingPairCount().
uint GetDiskKernelSampleCount(const uint SampleSet, uint RingCount)
{
	// Same pairs per ring id as GetRingSamplingPairCount().
	const uint PairCountPerRingId = (SampleSet == SAMPLE_SET_HEXAWEB) ? 3 : 4;

	return 1 + PairCountPerRingId * RingCount * (RingCount + 1);
}
|
|
|
|
// Transforms, at compile time, a 2 dimensional constant of a batch into the constant of one of its sample pairs, by
// using rotation invariance: pair 1 is pair 0 rotated by a quarter turn from +X toward +Y, (x, y) -> (-y, x).
// Any other BatchSampleId returns BatchConst unchanged.
float2 SampleConstFromBatchConst(const uint BatchSampleId, float2 BatchConst)
{
	float2 SampleConst = BatchConst;

	if (BatchSampleId == 1)
	{
		SampleConst = float2(-BatchConst.y, BatchConst.x);
	}

	return SampleConst;
}
|
|
|
|
|
|
|
|
// Samples all the pairs of the ring RingId of the disk kernel and accumulates them. The two samples of a pair are
// mirrored around KernelConfig.BufferUV.
void GatherRingSamples(
	FSSDKernelConfig KernelConfig,
	FSSDTexture2D SignalBuffer0,
	FSSDTexture2D SignalBuffer1,
	FSSDTexture2D SignalBuffer2,
	FSSDTexture2D SignalBuffer3,
	inout FSSDSignalAccumulatorArray UncompressedAccumulators,
	inout FSSDCompressedSignalAccumulatorArray CompressedAccumulators,
	const uint RingId)
{
	// Number of sample pairs on this ring.
	const uint PairCount = GetRingSamplingPairCount(KernelConfig.SampleSet, RingId);

	// Number of sample pairs to process per batch.
	// TODO(Denoiser): Could potentially do 4 using symmetries? Might be impracticable because of VGPR pressure.
	const uint PairsPerBatch = (KernelConfig.SampleSet == SAMPLE_SET_HEXAWEB) ? 1 : 2;

	// Number of batches to process.
	const uint BatchCount = PairCount / PairsPerBatch;

	// Distance of the ring from the center of the kernel, in sample count.
	const uint RingDistance = RingId;

	// Generated at compile time: rotation between two samples, and first sample location on the unit circle.
	const float2x2 StepRotation = GetSampleRotationMatrix(PairCount);
	const float2 FirstUnitPos = GetDiskSampleOnUnitCirle(RingId, PairCount, /* RingSampleId = */ 0);

	// Iterated state: position on the unit circle, and the same position scaled by the ring distance and
	// KernelConfig.KernelSpreadFactor.
	float2 UnitPos = FirstUnitPos;
	float2 PosOffset = (RingDistance * FirstUnitPos) * KernelConfig.KernelSpreadFactor;

	#if CONFIG_SGPR_HINT_OPTIMIZATION
	{
		UnitPos = ToScalarMemory(UnitPos);
		PosOffset = ToScalarMemory(PosOffset);
	}
	#endif

	// Loops through all the batches of samples to process.
	LOOP
	for (uint BatchId = 0; BatchId < BatchCount; BatchId++)
	{
		// Rotate the sample positions along the ring. This happens before sampling, so the first position that
		// gets sampled is one step after the first unit circle position.
		UnitPos = mul(UnitPos, StepRotation);
		PosOffset = mul(PosOffset, StepRotation);

		#if CONFIG_SGPR_HINT_OPTIMIZATION
		{
			UnitPos = ToScalarMemory(UnitPos);
			PosOffset = ToScalarMemory(PosOffset);
		}
		#endif

		// Sample in batches of multiple pairs to increase texture fetch concurrency, for better latency hiding.
		UNROLL
		for (uint PairId = 0; PairId < PairsPerBatch; PairId++)
		{
			const float2 PairPosOffset = SampleConstFromBatchConst(PairId, PosOffset);

			float2 PairUV[2];
			PairUV[0] = KernelConfig.BufferUV + PairPosOffset * KernelConfig.BufferSizeAndInvSize.zw;
			PairUV[1] = KernelConfig.BufferUV - PairPosOffset * KernelConfig.BufferSizeAndInvSize.zw;

			SampleAndAccumulateMultiplexedSignalsPair(
				KernelConfig,
				SignalBuffer0,
				SignalBuffer1,
				SignalBuffer2,
				SignalBuffer3,
				/* inout */ UncompressedAccumulators,
				/* inout */ CompressedAccumulators,
				PairUV,
				/* KernelWeight = */ 1.0);
		} // for each pair of the batch
	} // for each batch
}
|
|
|
|
/** Convolves the signal buffers with the disk kernel, one ring per cluster.
 *
 * Rings 1 to KernelConfig.RingCount are visited in ascending order, or in descending order when
 * KernelConfig.bDescOrder is set. When KernelConfig.bSampleKernelCenter is set, the kernel center is accumulated as
 * its own cluster on the inner side of that walk: before the rings when ascending, after them when descending.
 */
void ConvolveDiskKernel(
	FSSDKernelConfig KernelConfig,
	FSSDTexture2D SignalBuffer0,
	FSSDTexture2D SignalBuffer1,
	FSSDTexture2D SignalBuffer2,
	FSSDTexture2D SignalBuffer3,
	inout FSSDSignalAccumulatorArray UncompressedAccumulators,
	inout FSSDCompressedSignalAccumulatorArray CompressedAccumulators)
{
	const bool bDescending = KernelConfig.bDescOrder;

	// Ascending order: the center of the kernel comes first.
	if (KernelConfig.bSampleKernelCenter && !bDescending)
	{
		SampleAndAccumulateCenterSampleAsItsOwnCluster(
			KernelConfig,
			SignalBuffer0,
			SignalBuffer1,
			SignalBuffer2,
			SignalBuffer3,
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators);
	}

	// Ring ids go from RingCount down to 1 when descending (stepping by ~0u, that is -1 modulo 2^32), and from 1 up
	// to RingCount otherwise.
	const uint FirstRingId = (bDescending ? KernelConfig.RingCount : 1);
	const uint RingIdStep = (bDescending ? ~0u : 1);

	// Accumulate each ring. Use LOOP rather than UNROLL, because FXC is going through its pace otherwise.
	LOOP
	for (
		uint RingId = FirstRingId;
		(bDescending ? RingId > 0 : RingId <= KernelConfig.RingCount);
		RingId += RingIdStep)
	{
		const uint RingSamplePairCount = GetRingSamplingPairCount(KernelConfig.SampleSet, RingId);

		FSSDSampleClusterInfo ClusterInfo;
		ClusterInfo.OutterBoundaryRadius = (RingId + 1) * KernelConfig.KernelSpreadFactor;

		StartAccumulatingCluster(
			KernelConfig,
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators,
			ClusterInfo);

		GatherRingSamples(
			KernelConfig,
			SignalBuffer0,
			SignalBuffer1,
			SignalBuffer2,
			SignalBuffer3,
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators,
			RingId);

		// Every pair of the ring contributed two samples.
		DijestAccumulatedClusterSamples(
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators,
			RingId, RingSamplePairCount * 2);
	} // for each ring

	// Descending order: the center of the kernel comes last.
	if (KernelConfig.bSampleKernelCenter && bDescending)
	{
		SampleAndAccumulateCenterSampleAsItsOwnCluster(
			KernelConfig,
			SignalBuffer0,
			SignalBuffer1,
			SignalBuffer2,
			SignalBuffer3,
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators);
	}
}
|
|
|
|
#endif // COMPILE_DISK_KERNEL
|
|
|
|
|
|
//------------------------------------------------------- DIRECTIONAL KERNELS
|
|
|
|
#if COMPILE_DIRECTIONAL_KERNEL
|
|
|
|
/** Convolves the signal buffers with Hammersley distributed samples laid out along KernelConfig.MajorAxis.
 *
 * The random points are remapped to [-1, 1]^2 for SAMPLE_SET_DIRECTIONAL_RECT, and through
 * UniformSampleDiskConcentric() for any other sample set. They are then scaled by
 * KernelConfig.MajorPixelRadius along the major axis and by KernelConfig.MinorPixelRadius along its perpendicular.
 */
void ConvolveDirectionalRect(
	FSSDKernelConfig KernelConfig,
	FSSDTexture2D SignalBuffer0,
	FSSDTexture2D SignalBuffer1,
	FSSDTexture2D SignalBuffer2,
	FSSDTexture2D SignalBuffer3,
	inout FSSDSignalAccumulatorArray UncompressedAccumulators,
	inout FSSDCompressedSignalAccumulatorArray CompressedAccumulators)
{
	// Accumulate the center of the kernel.
	if (KernelConfig.bSampleKernelCenter)
	{
		SampleAndAccumulateCenterSampleAsItsOwnCluster(
			KernelConfig,
			SignalBuffer0,
			SignalBuffer1,
			SignalBuffer2,
			SignalBuffer3,
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators);
	}

	// Number of samples processed per batch, to improve latency hiding.
	const uint kSamplingBatchSize = 2;

	// Number of batches of samples to perform. It is not rounded up because the center of the kernel is sampled anyway.
	// TODO(Denoiser): store in a SGPR array instead to save 1 VGPR.
	const uint BatchCountCount = KernelConfig.SampleCount / kSamplingBatchSize;

	// Total number of samples drawn from the Hammersley sequence.
	const uint HammersleySampleCount = BatchCountCount * kSamplingBatchSize;

	// Axes of the kernel: the major axis and its perpendicular.
	const float2 MajorAxis = float2(KernelConfig.MajorAxis);
	const float2 MinorAxis = float2(-KernelConfig.MajorAxis.y, KernelConfig.MajorAxis.x);

	// Processes the samples in batches so that the compiler can do latency hiding.
	LOOP
	for (uint BatchId = 0; BatchId < BatchCountCount; BatchId++)
	{
		float2 SampleBufferUV[2];

		UNROLL_N(2)
		for (uint SampleBatchId = 0; SampleBatchId < 2; SampleBatchId++)
		{
			const uint SampleId = BatchId * kSamplingBatchSize + SampleBatchId;

			float2 E = Hammersley16(SampleId, HammersleySampleCount, KernelConfig.HammersleySeed);

			if (KernelConfig.SampleSet == SAMPLE_SET_DIRECTIONAL_RECT)
			{
				E = (E * 2.0 - 1.0);
			}
			else // if (KernelConfig.SampleSet == SAMPLE_SET_DIRECTIONAL_ELLIPSE)
			{
				E = UniformSampleDiskConcentric(E);
			}

			const float2 SampleOffset =
				MajorAxis * KernelConfig.MajorPixelRadius * E.x +
				MinorAxis * KernelConfig.MinorPixelRadius * E.y;

			SampleBufferUV[SampleBatchId] = KernelConfig.BufferUV + SampleOffset * KernelConfig.BufferSizeAndInvSize.zw;
		}

		SampleAndAccumulateMultiplexedSignalsPair(
			KernelConfig,
			SignalBuffer0,
			SignalBuffer1,
			SignalBuffer2,
			SignalBuffer3,
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators,
			SampleBufferUV,
			/* KernelWeight = */ 1.0);
	} // for each batch
}
|
|
|
|
#endif // COMPILE_DIRECTIONAL_KERNEL
|
|
|
|
//------------------------------------------------------- RAW EXPERIMENTAL KERNEL TO TRY
|
|
|
|
#if COMPILE_RAW_EXPERIMENTAL_KERNEL
|
|
|
|
#if 0
|
|
// Sample offsets, 8 per group, 16 groups; the first offset of every group is (0, 0).
// NOTE(review): presumably one group per pixel of a 4x4 tile, like SampleArray4x4x16; this table is compiled out.
static const float2 SampleArray4x4x8[128] = {
	// Group 0
	float2(+0.0, +0.0), float2(+1.0, +0.0), float2(+0.0, +1.0), float2(+2.0, +1.0),
	float2(-1.0, +2.0), float2(-1.0, +1.0), float2(+1.0, +2.0), float2(+1.0, +1.0),
	// Group 1
	float2(+0.0, +0.0), float2(-2.0, +0.0), float2(-1.0, +1.0), float2(+0.0, +2.0),
	float2(-1.0, +2.0), float2(-1.0, -1.0), float2(+0.0, +1.0), float2(-1.0, +0.0),
	// Group 2
	float2(+0.0, +0.0), float2(+1.0, -1.0), float2(+1.0, +2.0), float2(+0.0, +1.0),
	float2(+0.0, +2.0), float2(+0.0, -1.0), float2(-1.0, -1.0), float2(+2.0, +1.0),
	// Group 3
	float2(+0.0, +0.0), float2(-2.0, +0.0), float2(-1.0, -1.0), float2(+0.0, +2.0),
	float2(+1.0, -1.0), float2(+1.0, +2.0), float2(-1.0, +2.0), float2(-1.0, +0.0),
	// Group 4
	float2(+0.0, +0.0), float2(+0.0, +1.0), float2(+2.0, +0.0), float2(-1.0, -1.0),
	float2(+1.0, -1.0), float2(+0.0, -1.0), float2(-3.0, +0.0), float2(-1.0, +1.0),
	// Group 5
	float2(+0.0, +0.0), float2(+0.0, -1.0), float2(+0.0, +1.0), float2(-2.0, +0.0),
	float2(-1.0, -1.0), float2(+2.0, +1.0), float2(+1.0, +0.0), float2(+2.0, +0.0),
	// Group 6
	float2(+0.0, +0.0), float2(+0.0, +2.0), float2(+2.0, +2.0), float2(+1.0, +0.0),
	float2(-1.0, +1.0), float2(-2.0, +0.0), float2(+1.0, -1.0), float2(-1.0, +2.0),
	// Group 7
	float2(+0.0, +0.0), float2(-1.0, +0.0), float2(-2.0, +0.0), float2(-2.0, +1.0),
	float2(+1.0, +2.0), float2(-3.0, +0.0), float2(-2.0, +2.0), float2(+0.0, -2.0),
	// Group 8
	float2(+0.0, +0.0), float2(-1.0, +1.0), float2(+1.0, -1.0), float2(+0.0, -2.0),
	float2(-1.0, +0.0), float2(+1.0, -2.0), float2(+1.0, +0.0), float2(+2.0, +0.0),
	// Group 9
	float2(+0.0, +0.0), float2(-2.0, +0.0), float2(-1.0, +0.0), float2(-2.0, +2.0),
	float2(-1.0, +2.0), float2(+1.0, +0.0), float2(+0.0, +3.0), float2(+0.0, +2.0),
	// Group 10
	float2(+0.0, +0.0), float2(+0.0, +1.0), float2(+1.0, -2.0), float2(-1.0, +0.0),
	float2(+0.0, +2.0), float2(-1.0, +1.0), float2(+1.0, +1.0), float2(-2.0, +1.0),
	// Group 11
	float2(+0.0, +0.0), float2(-2.0, +0.0), float2(-1.0, +1.0), float2(-2.0, +2.0),
	float2(+1.0, -2.0), float2(-3.0, +0.0), float2(-1.0, +0.0), float2(+0.0, -2.0),
	// Group 12
	float2(+0.0, +0.0), float2(-1.0, +0.0), float2(+2.0, +0.0), float2(+1.0, +0.0),
	float2(-1.0, +1.0), float2(-2.0, -2.0), float2(+2.0, +1.0), float2(+0.0, -2.0),
	// Group 13
	float2(+0.0, +0.0), float2(-1.0, +0.0), float2(-2.0, -2.0), float2(-1.0, +2.0),
	float2(+1.0, +2.0), float2(+2.0, +0.0), float2(+1.0, +1.0), float2(+1.0, +0.0),
	// Group 14
	float2(+0.0, +0.0), float2(+0.0, -1.0), float2(+1.0, -1.0), float2(-1.0, +1.0),
	float2(+0.0, +1.0), float2(+1.0, +1.0), float2(-2.0, +0.0), float2(+2.0, -1.0),
	// Group 15
	float2(+0.0, +0.0), float2(+0.0, -2.0), float2(-1.0, -2.0), float2(-2.0, +0.0),
	float2(+1.0, -2.0), float2(-1.0, +0.0), float2(+1.0, +0.0), float2(-1.0, +1.0),
}; // SampleArray4x4x8
|
|
|
|
#else
|
|
|
|
// Sample offsets, 16 per pixel of a 4x4 tile. ConvolveRawExperimentalKernel() reads the entry
// SampleId + 16 * (PixelCoord.x + 4 * PixelCoord.y); the first offset of every pixel is (0, 0).
static const float2 SampleArray4x4x16[256] = {
	// Tile pixel (0, 0)
	float2(+0.0, +0.0), float2(-1.0, -1.0), float2(+1.0, -1.0), float2(-1.0, +0.0),
	float2(+0.0, -1.0), float2(+1.0, +0.0), float2(-2.0, +1.0), float2(+2.0, -1.0),
	float2(+1.0, +1.0), float2(-1.0, -2.0), float2(+2.0, +0.0), float2(+0.0, +1.0),
	float2(+0.0, +2.0), float2(+2.0, +2.0), float2(+2.0, -2.0), float2(+1.0, +2.0),
	// Tile pixel (1, 0)
	float2(+0.0, +0.0), float2(-1.0, +0.0), float2(+2.0, +3.0), float2(-1.0, -2.0),
	float2(+1.0, -2.0), float2(+0.0, -1.0), float2(+1.0, -1.0), float2(+1.0, +0.0),
	float2(+0.0, +1.0), float2(-2.0, -2.0), float2(-2.0, +1.0), float2(+0.0, +2.0),
	float2(-1.0, +1.0), float2(+1.0, +1.0), float2(-1.0, -1.0), float2(+2.0, -1.0),
	// Tile pixel (2, 0)
	float2(+0.0, +0.0), float2(+1.0, -1.0), float2(-1.0, +3.0), float2(-1.0, -2.0),
	float2(+0.0, -1.0), float2(+1.0, -2.0), float2(-1.0, +0.0), float2(-2.0, -1.0),
	float2(+2.0, +2.0), float2(-2.0, +1.0), float2(+0.0, +2.0), float2(+1.0, +1.0),
	float2(+1.0, +0.0), float2(+0.0, +1.0), float2(+2.0, +0.0), float2(-1.0, +1.0),
	// Tile pixel (3, 0)
	float2(+0.0, +0.0), float2(+0.0, -1.0), float2(+0.0, +1.0), float2(-2.0, +1.0),
	float2(+1.0, -2.0), float2(-3.0, +0.0), float2(+0.0, -2.0), float2(-2.0, +2.0),
	float2(-1.0, +2.0), float2(+1.0, -1.0), float2(+2.0, +3.0), float2(-2.0, +0.0),
	float2(+0.0, +2.0), float2(-1.0, +1.0), float2(+1.0, +1.0), float2(-1.0, -1.0),
	// Tile pixel (0, 1)
	float2(+0.0, +0.0), float2(-1.0, +0.0), float2(+2.0, -1.0), float2(-1.0, -1.0),
	float2(-1.0, +2.0), float2(+1.0, -1.0), float2(-3.0, +0.0), float2(+1.0, +2.0),
	float2(+2.0, -2.0), float2(+0.0, -1.0), float2(+0.0, +1.0), float2(+0.0, -2.0),
	float2(+1.0, +1.0), float2(+2.0, +0.0), float2(+2.0, +1.0), float2(-1.0, +1.0),
	// Tile pixel (1, 1)
	float2(+0.0, +0.0), float2(-2.0, +0.0), float2(+1.0, +0.0), float2(+0.0, -2.0),
	float2(+2.0, -1.0), float2(-2.0, +2.0), float2(-1.0, +0.0), float2(+2.0, +2.0),
	float2(+3.0, +1.0), float2(+0.0, +2.0), float2(+1.0, -1.0), float2(-1.0, +2.0),
	float2(+0.0, +1.0), float2(-1.0, -1.0), float2(+1.0, +1.0), float2(+1.0, +2.0),
	// Tile pixel (2, 1)
	float2(+0.0, +0.0), float2(+0.0, +1.0), float2(+2.0, +0.0), float2(+1.0, +2.0),
	float2(-1.0, +2.0), float2(-1.0, +0.0), float2(+0.0, +2.0), float2(+3.0, +1.0),
	float2(-1.0, -1.0), float2(-2.0, -3.0), float2(+2.0, +3.0), float2(+1.0, +0.0),
	float2(+2.0, +2.0), float2(+1.0, +1.0), float2(+1.0, -1.0), float2(+0.0, -1.0),
	// Tile pixel (3, 1)
	float2(+0.0, +0.0), float2(+1.0, -1.0), float2(-2.0, +1.0), float2(-1.0, +2.0),
	float2(-1.0, +0.0), float2(-3.0, +0.0), float2(-2.0, -1.0), float2(-1.0, +1.0),
	float2(+0.0, -1.0), float2(+0.0, +2.0), float2(+1.0, +1.0), float2(+0.0, +1.0),
	float2(-1.0, -1.0), float2(-2.0, +2.0), float2(-2.0, +0.0), float2(+1.0, +2.0),
	// Tile pixel (0, 2)
	float2(+0.0, +0.0), float2(+2.0, -1.0), float2(+1.0, -1.0), float2(+0.0, +2.0),
	float2(-1.0, -2.0), float2(+0.0, +1.0), float2(+1.0, +2.0), float2(-1.0, +1.0),
	float2(+2.0, +2.0), float2(-3.0, +0.0), float2(-2.0, +0.0), float2(-1.0, -1.0),
	float2(-1.0, +0.0), float2(-4.0, -1.0), float2(+2.0, +1.0), float2(+1.0, +1.0),
	// Tile pixel (1, 2)
	float2(+0.0, +0.0), float2(+1.0, +2.0), float2(+0.0, +1.0), float2(-1.0, +0.0),
	float2(+1.0, +0.0), float2(-1.0, +2.0), float2(+0.0, +3.0), float2(+2.0, -2.0),
	float2(-2.0, +1.0), float2(-1.0, +1.0), float2(+0.0, +2.0), float2(+1.0, -1.0),
	float2(-2.0, +0.0), float2(+1.0, +1.0), float2(-2.0, -1.0), float2(+0.0, -2.0),
	// Tile pixel (2, 2)
	float2(+0.0, +0.0), float2(-1.0, +1.0), float2(+1.0, -1.0), float2(-1.0, -2.0),
	float2(+0.0, -2.0), float2(+0.0, +1.0), float2(+1.0, +2.0), float2(+2.0, -1.0),
	float2(+1.0, +0.0), float2(+0.0, -1.0), float2(-1.0, -1.0), float2(-2.0, +0.0),
	float2(-2.0, +1.0), float2(-1.0, +0.0), float2(+1.0, +1.0), float2(-2.0, -2.0),
	// Tile pixel (3, 2)
	float2(+0.0, +0.0), float2(-1.0, -2.0), float2(-3.0, +0.0), float2(+1.0, -1.0),
	float2(+0.0, -2.0), float2(-2.0, +1.0), float2(-1.0, -1.0), float2(-1.0, +1.0),
	float2(+0.0, +1.0), float2(-2.0, +2.0), float2(+1.0, +1.0), float2(-2.0, -1.0),
	float2(-1.0, -4.0), float2(+1.0, -2.0), float2(+0.0, -1.0), float2(-2.0, +0.0),
	// Tile pixel (0, 3)
	float2(+0.0, +0.0), float2(+0.0, +1.0), float2(+2.0, +1.0), float2(+2.0, +0.0),
	float2(-1.0, +2.0), float2(-2.0, -2.0), float2(+0.0, +2.0), float2(+3.0, +1.0),
	float2(-3.0, +2.0), float2(-1.0, -1.0), float2(+1.0, +0.0), float2(-1.0, -2.0),
	float2(+1.0, -1.0), float2(+1.0, +1.0), float2(+0.0, -1.0), float2(-1.0, +0.0),
	// Tile pixel (1, 3)
	float2(+0.0, +0.0), float2(+1.0, -1.0), float2(-2.0, +1.0), float2(-1.0, -1.0),
	float2(+0.0, -2.0), float2(+0.0, +2.0), float2(+0.0, +3.0), float2(+1.0, +1.0),
	float2(+3.0, +1.0), float2(+0.0, +1.0), float2(+3.0, -2.0), float2(+2.0, +1.0),
	float2(+1.0, -3.0), float2(+2.0, -2.0), float2(+1.0, -2.0), float2(-2.0, +2.0),
	// Tile pixel (2, 3)
	float2(+0.0, +0.0), float2(+1.0, +0.0), float2(+1.0, -1.0), float2(+0.0, -1.0),
	float2(-1.0, +2.0), float2(-2.0, +1.0), float2(-1.0, +0.0), float2(-1.0, -1.0),
	float2(+0.0, +1.0), float2(+1.0, +1.0), float2(+2.0, +3.0), float2(+2.0, +0.0),
	float2(-1.0, +1.0), float2(+1.0, -2.0), float2(+0.0, -2.0), float2(-2.0, -2.0),
	// Tile pixel (3, 3)
	float2(+0.0, +0.0), float2(-2.0, +1.0), float2(+0.0, -1.0), float2(-1.0, +1.0),
	float2(+1.0, -3.0), float2(+0.0, +2.0), float2(+0.0, +3.0), float2(+0.0, +1.0),
	float2(-2.0, +0.0), float2(+1.0, -1.0), float2(-2.0, -1.0), float2(-2.0, -2.0),
	float2(-1.0, +0.0), float2(-1.0, -1.0), float2(+1.0, +0.0), float2(+1.0, -2.0),
}; // SampleArray4x4x16
|
|
|
|
|
|
#endif
|
|
|
|
/** Experimental kernel that convolves the signal buffers with the precomputed offsets of SampleArray4x4x16.
 *
 * Every pixel of a 4x4 tile owns 16 consecutive offsets of the table. When KernelConfig.bSampleKernelCenter is set,
 * the kernel center is accumulated as its own cluster, before the other samples or after them according to
 * KernelConfig.bDescOrder.
 */
void ConvolveRawExperimentalKernel(
	FSSDKernelConfig KernelConfig,
	FSSDTexture2D SignalBuffer0,
	FSSDTexture2D SignalBuffer1,
	FSSDTexture2D SignalBuffer2,
	FSSDTexture2D SignalBuffer3,
	inout FSSDSignalAccumulatorArray UncompressedAccumulators,
	inout FSSDCompressedSignalAccumulatorArray CompressedAccumulators)
{
	// Accumulate the center of the kernel.
	if (KernelConfig.bSampleKernelCenter && !KernelConfig.bDescOrder)
	{
		SampleAndAccumulateCenterSampleAsItsOwnCluster(
			KernelConfig,
			SignalBuffer0,
			SignalBuffer1,
			SignalBuffer2,
			SignalBuffer3,
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators);
	}

	const uint TileSize = 4;
	const uint SampleCount = 16;

	// Position of the pixel within its tile. KernelConfig.BufferUV is converted with the size held by the kernel
	// configuration, the same one whose inverse scales the sample offsets below, so that both stay consistent
	// when it differs from the view's buffer size.
	uint2 PixelCoord = uint2(KernelConfig.BufferUV * KernelConfig.BufferSizeAndInvSize.xy) % TileSize;

	// Index of the first offset owned by this pixel in the table.
	const uint FirstSampleIndex = SampleCount * (PixelCoord.x + TileSize * PixelCoord.y);

	// Starts at 1: the first offset of every pixel is (0, 0) in the table.
	LOOP
	for (uint SampleId = 1; SampleId < SampleCount; SampleId++)
	{
		uint MagicIndex = SampleId + FirstSampleIndex;

		float2 SampleOffset = SampleArray4x4x16[MagicIndex];
		float2 SampleBufferUV = KernelConfig.BufferUV + (SampleOffset * KernelConfig.KernelSpreadFactor) * KernelConfig.BufferSizeAndInvSize.zw;

		SampleAndAccumulateMultiplexedSignals(
			KernelConfig,
			SignalBuffer0,
			SignalBuffer1,
			SignalBuffer2,
			SignalBuffer3,
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators,
			SampleBufferUV,
			/* KernelWeight = */ 1.0,
			/* bForceSample = */ false);
	}

	// Accumulate the center of the kernel.
	if (KernelConfig.bSampleKernelCenter && KernelConfig.bDescOrder)
	{
		SampleAndAccumulateCenterSampleAsItsOwnCluster(
			KernelConfig,
			SignalBuffer0,
			SignalBuffer1,
			SignalBuffer2,
			SignalBuffer3,
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators);
	}
}
|
|
|
|
#endif // COMPILE_RAW_EXPERIMENTAL_KERNEL
|
|
|
|
|
|
//------------------------------------------------------- MAIN ENTRY POINTS
|
|
|
|
/** Accumulates the already fetched reference sample as the kernel center.
 *
 * Does nothing when KernelConfig.bSampleKernelCenter is true. Otherwise the reference sample is accumulated with
 * a kernel weight of 1.0 and forced sampling, and, when box kernels are compiled in and one of the 3x3 sample sets
 * is selected, BroadcastAccumulateSquare3x3KernelCenter() is invoked with the same reference data.
 *
 * RefSceneMetadata is intentionally expected to be uncompressed by the caller, so that the uncompression can be
 * shared with other parts of the shader that might need the uncompressed metadata anyway. It is passed twice to the
 * accumulation functions (presumably once as the reference metadata and once as the sample metadata).
 */
void AccumulateRefSampleAsKernelCenter(
	FSSDKernelConfig KernelConfig,
	inout FSSDSignalAccumulatorArray UncompressedAccumulators,
	inout FSSDCompressedSignalAccumulatorArray CompressedAccumulators,
	float2 RefBufferUV,
	FSSDSampleSceneInfos RefSceneMetadata,
	FSSDSignalArray RefMultiplexedSamples,
	FSSDSignalFrequencyArray RefMultiplexedFrequencies)
{
	// Nothing to do here when the kernel is configured to sample its center itself.
	if (KernelConfig.bSampleKernelCenter)
	{
		return;
	}

	AccumulateSampledMultiplexedSignals(
		KernelConfig,
		/* inout */ UncompressedAccumulators,
		/* inout */ CompressedAccumulators,
		RefSceneMetadata,
		RefBufferUV,
		RefSceneMetadata,
		RefMultiplexedSamples,
		RefMultiplexedFrequencies,
		/* KernelWeight = */ 1.0,
		/* bForceSample = */ true,
		/* bIsOutsideFrustum = */ false);

	// The empty sentinel branch gives the conditionally compiled `else if` below an `if` to chain onto.
	if (KernelConfig.SampleSet == 0xDEADDEAD)
	{
	}
	#if COMPILE_BOX_KERNEL
	else if (
		KernelConfig.SampleSet == SAMPLE_SET_3X3 ||
		KernelConfig.SampleSet == SAMPLE_SET_3X3_PLUS ||
		KernelConfig.SampleSet == SAMPLE_SET_3X3_CROSS)
	{
		BroadcastAccumulateSquare3x3KernelCenter(
			KernelConfig,
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators,
			RefSceneMetadata,
			RefBufferUV,
			RefSceneMetadata,
			RefMultiplexedSamples,
			RefMultiplexedFrequencies);
	}
	#endif // COMPILE_BOX_KERNEL
}
|
|
|
|
/** Accumulates the kernel selected by KernelConfig.SampleSet into the accumulators.
 *
 * Dispatches to the convolution function matching the sample set. Each kernel family is only compiled in when its
 * COMPILE_*_KERNEL define is non-zero; if the selected sample set matches none of the compiled branches, nothing is
 * accumulated.
 */
void AccumulateKernel(
	FSSDKernelConfig KernelConfig,
	FSSDTexture2D SignalBuffer0,
	FSSDTexture2D SignalBuffer1,
	FSSDTexture2D SignalBuffer2,
	FSSDTexture2D SignalBuffer3,
	inout FSSDSignalAccumulatorArray UncompressedAccumulators,
	inout FSSDCompressedSignalAccumulatorArray CompressedAccumulators)
{
	#if COMPILE_BOX_KERNEL
	// Sample set groups that share one box kernel implementation.
	const bool bIsBilinear2x2Set =
		KernelConfig.SampleSet == SAMPLE_SET_2X2_BILINEAR ||
		KernelConfig.SampleSet == SAMPLE_SET_2X2_STOCASTIC ||
		KernelConfig.SampleSet == SAMPLE_SET_2X2_ADAPTIVE;

	const bool bIsSquare3x3Set =
		KernelConfig.SampleSet == SAMPLE_SET_3X3 ||
		KernelConfig.SampleSet == SAMPLE_SET_3X3_PLUS ||
		KernelConfig.SampleSet == SAMPLE_SET_3X3_CROSS;

	const bool bIsGenericSquareSet =
		KernelConfig.SampleSet == SAMPLE_SET_3X3_SOBEK2018 ||
		KernelConfig.SampleSet == SAMPLE_SET_5X5_WAVELET ||
		KernelConfig.SampleSet == SAMPLE_SET_NXN;
	#endif // COMPILE_BOX_KERNEL

	// The empty sentinel branch gives every conditionally compiled `else if` below an `if` to chain onto,
	// whichever COMPILE_*_KERNEL defines are enabled.
	if (KernelConfig.SampleSet == 0xDEADDEAD)
	{
	}
	#if COMPILE_BOX_KERNEL
	// 1x1: only the kernel center, and only if the kernel is configured to sample it.
	else if (KernelConfig.SampleSet == SAMPLE_SET_1X1)
	{
		if (KernelConfig.bSampleKernelCenter)
		{
			SampleAndAccumulateCenterSampleAsItsOwnCluster(
				KernelConfig,
				SignalBuffer0, SignalBuffer1, SignalBuffer2, SignalBuffer3,
				/* inout */ UncompressedAccumulators,
				/* inout */ CompressedAccumulators);
		}
	}
	// 2x2 variants.
	else if (bIsBilinear2x2Set)
	{
		AccumulateBilinear2x2Kernel(
			KernelConfig,
			SignalBuffer0, SignalBuffer1, SignalBuffer2, SignalBuffer3,
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators);
	}
	// 3x3 variants.
	else if (bIsSquare3x3Set)
	{
		AccumulateSquare3x3Kernel(
			KernelConfig,
			SignalBuffer0, SignalBuffer1, SignalBuffer2, SignalBuffer3,
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators);
	}
	// Remaining square kernels.
	else if (bIsGenericSquareSet)
	{
		AccumulateSquareKernel(
			KernelConfig,
			SignalBuffer0, SignalBuffer1, SignalBuffer2, SignalBuffer3,
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators);
	}
	#endif // COMPILE_BOX_KERNEL
	#if COMPILE_STACKOWIAK_KERNEL
	else if (KernelConfig.SampleSet == SAMPLE_SET_STACKOWIAK_4_SETS)
	{
		ConvolveStackowiakKernel(
			KernelConfig,
			SignalBuffer0, SignalBuffer1, SignalBuffer2, SignalBuffer3,
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators);
	}
	#endif // COMPILE_STACKOWIAK_KERNEL
	#if COMPILE_DISK_KERNEL
	else if (KernelConfig.SampleSet == SAMPLE_SET_HEXAWEB)
	{
		ConvolveDiskKernel(
			KernelConfig,
			SignalBuffer0, SignalBuffer1, SignalBuffer2, SignalBuffer3,
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators);
	}
	#endif // COMPILE_DISK_KERNEL
	#if COMPILE_DIRECTIONAL_KERNEL
	// Both directional sample sets go through the same entry point.
	else if (
		KernelConfig.SampleSet == SAMPLE_SET_DIRECTIONAL_RECT ||
		KernelConfig.SampleSet == SAMPLE_SET_DIRECTIONAL_ELLIPSE)
	{
		ConvolveDirectionalRect(
			KernelConfig,
			SignalBuffer0, SignalBuffer1, SignalBuffer2, SignalBuffer3,
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators);
	}
	#endif // COMPILE_DIRECTIONAL_KERNEL
	#if COMPILE_RAW_EXPERIMENTAL_KERNEL
	else if (KernelConfig.SampleSet == SAMPLE_SET_RAW_EXPERIMENTAL_KERNEL)
	{
		ConvolveRawExperimentalKernel(
			KernelConfig,
			SignalBuffer0, SignalBuffer1, SignalBuffer2, SignalBuffer3,
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators);
	}
	#endif // COMPILE_RAW_EXPERIMENTAL_KERNEL
} // AccumulateKernel()
|