Files
UnrealEngine/Engine/Shaders/Private/ScreenSpaceDenoise/SSDSpatialAccumulation.usf
2025-05-18 13:04:45 +08:00

1309 lines
48 KiB
HLSL

// Copyright Epic Games, Inc. All Rights Reserved.
#include "SSDDefinitions.ush"
//------------------------------------------------------- ENUM VALUES
/** Different possible stages for spatial accumulation; DIM_STAGE is compared against these values.
 * NOTE(review): the original comment ended with a dangling "Matches" - presumably these values must
 * match an enum declared outside of this file; confirm and complete.
 */
#define STAGE_RECONSTRUCTION 0
#define STAGE_PRE_CONVOLUTION 1
#define STAGE_REJECTION_PRE_CONVOLUTION 2
#define STAGE_POST_FILTERING 3
#define STAGE_FINAL_OUTPUT 4
/** Policy used to change the size of the kernel (see CONFIG_SAMPLE_COUNT_POLICY). */
#define SAMPLE_COUNT_POLICY_DISABLED 0
#define SAMPLE_COUNT_POLICY_SAMPLE_ACCUMULATION_BASED 4
/** What signal should be output (see CONFIG_OUTPUT_MODE). */
// Only output the sum of the signal 0.
#define OUTPUT_MODE_SUM 0
// Only output the sum of the moments 1 & 2 of the signal 0.
#define OUTPUT_MODE_2MOMMENT_SUM 1
// Output the result of descending ring bucketing.
#define OUTPUT_MODE_DRB 2
//------------------------------------------------------- CONFIGS
// Edge length of the thread group: MainCS is dispatched as TILE_PIXEL_SIZE x TILE_PIXEL_SIZE threads.
#define TILE_PIXEL_SIZE 8
// Forward the DIM_* values (supplied from outside of this file) as CONFIG_* knobs.
#define CONFIG_SIGNAL_PROCESSING DIM_SIGNAL_PROCESSING
#define CONFIG_UPSCALE DIM_UPSCALE
#define CONFIG_SIGNAL_BATCH_SIZE DIM_SIGNAL_BATCH_SIZE
// Configure the pass for each individual signal.
#if CONFIG_SIGNAL_PROCESSING == SIGNAL_PROCESSING_SHADOW_VISIBILITY_MASK
#define MAX_SIGNAL_BATCH_SIZE CONFIG_SIGNAL_BATCH_SIZE
#define SIGNAL_ARRAY_SIZE CONFIG_SIGNAL_BATCH_SIZE
#define CONFIG_BILATERAL_PRESET BILATERAL_PRESET_MONOCHROMATIC_PENUMBRA
#define CONFIG_MULTIPLEXED_SIGNALS_PER_SIGNAL_DOMAIN 1
#if DIM_STAGE == STAGE_RECONSTRUCTION
// Input and output layout.
#define CONFIG_SIGNAL_INPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_PENUMBRA_INJESTION_NSPP
#define CONFIG_SIGNAL_OUTPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_PENUMBRA_HISTORY
#define CONFIG_SIGNAL_INPUT_TEXTURE_TYPE SIGNAL_TEXTURE_TYPE_UINT2
#define CONFIG_SIGNAL_OUTPUT_TEXTURE_TYPE SIGNAL_TEXTURE_TYPE_FLOAT4
// One input texture for every two batched signals (rounded up), one output texture per signal.
#define CONFIG_INPUT_TEXTURE_COUNT ((CONFIG_SIGNAL_BATCH_SIZE + 1) / 2)
#define CONFIG_OUTPUT_TEXTURE_COUNT CONFIG_SIGNAL_BATCH_SIZE
#define CONFIG_SAMPLE_SET SAMPLE_SET_STACKOWIAK_4_SETS
#define CONFIG_BILATERAL_DISTANCE_COMPUTATION SIGNAL_WORLD_FREQUENCY_PRECOMPUTED_BLURING_RADIUS
#define CONFIG_MAX_WITH_REF_DISTANCE 1
#define CONFIG_OUTPUT_MODE OUTPUT_MODE_DRB
// Per signal UV clamping is only enabled when more than one signal is batched.
#if DIM_SIGNAL_BATCH_SIZE > 1
#define CONFIG_CLAMP_UV_PER_SIGNAL 1
#endif
#elif DIM_STAGE == STAGE_PRE_CONVOLUTION
// Input and output layout.
#define CONFIG_SIGNAL_INPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_PENUMBRA_HISTORY
#define CONFIG_SIGNAL_OUTPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_PENUMBRA_HISTORY
#define CONFIG_SIGNAL_INPUT_TEXTURE_TYPE SIGNAL_TEXTURE_TYPE_FLOAT4
#define CONFIG_SIGNAL_OUTPUT_TEXTURE_TYPE SIGNAL_TEXTURE_TYPE_FLOAT4
#define CONFIG_INPUT_TEXTURE_COUNT CONFIG_SIGNAL_BATCH_SIZE
#define CONFIG_OUTPUT_TEXTURE_COUNT CONFIG_SIGNAL_BATCH_SIZE
#define CONFIG_SAMPLE_SET SAMPLE_SET_HEXAWEB
#define CONFIG_CUSTOM_SPREAD_FACTOR 1
#define CONFIG_BILATERAL_DISTANCE_COMPUTATION SIGNAL_WORLD_FREQUENCY_PRECOMPUTED_BLURING_RADIUS
#define CONFIG_MAX_WITH_REF_DISTANCE 1
#define CONFIG_OUTPUT_MODE OUTPUT_MODE_DRB
#elif DIM_STAGE == STAGE_REJECTION_PRE_CONVOLUTION
// NOTE(review): this branch defines neither CONFIG_SIGNAL_INPUT_LAYOUT nor the texture counts, so the
// "Missing CONFIG_INPUT_TEXTURE_COUNT" #error of the PARAMETERS section fires if this permutation is ever
// compiled - confirm this stage is intentionally unsupported for the shadow visibility mask.
#define CONFIG_SIGNAL_INPUT_TEXTURE_TYPE SIGNAL_TEXTURE_TYPE_FLOAT4
#define CONFIG_SIGNAL_OUTPUT_TEXTURE_TYPE SIGNAL_TEXTURE_TYPE_FLOAT4
#elif DIM_STAGE == STAGE_POST_FILTERING
// Input and output layout.
#define CONFIG_SIGNAL_INPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_PENUMBRA_HISTORY
#define CONFIG_SIGNAL_OUTPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_PENUMBRA_HISTORY
#define CONFIG_SIGNAL_INPUT_TEXTURE_TYPE SIGNAL_TEXTURE_TYPE_FLOAT4
#define CONFIG_SIGNAL_OUTPUT_TEXTURE_TYPE SIGNAL_TEXTURE_TYPE_FLOAT4
#define CONFIG_INPUT_TEXTURE_COUNT CONFIG_SIGNAL_BATCH_SIZE
#define CONFIG_OUTPUT_TEXTURE_COUNT CONFIG_SIGNAL_BATCH_SIZE
#define CONFIG_SAMPLE_SET SAMPLE_SET_STACKOWIAK_4_SETS
//#define CONFIG_SAMPLE_SUBSET 1
#define CONFIG_BILATERAL_DISTANCE_COMPUTATION SIGNAL_WORLD_FREQUENCY_PRECOMPUTED_BLURING_RADIUS
#define CONFIG_OUTPUT_MODE OUTPUT_MODE_DRB
#elif DIM_STAGE == STAGE_FINAL_OUTPUT
// Input layout only: no CONFIG_SIGNAL_OUTPUT_LAYOUT is defined for this stage.
#define CONFIG_SIGNAL_INPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_PENUMBRA_HISTORY
#define CONFIG_SIGNAL_INPUT_TEXTURE_TYPE SIGNAL_TEXTURE_TYPE_FLOAT4
#define CONFIG_SIGNAL_OUTPUT_TEXTURE_TYPE SIGNAL_TEXTURE_TYPE_FLOAT4
#define CONFIG_INPUT_TEXTURE_COUNT CONFIG_SIGNAL_BATCH_SIZE
#define CONFIG_OUTPUT_TEXTURE_COUNT CONFIG_SIGNAL_BATCH_SIZE
#define CONFIG_SAMPLE_SET SAMPLE_SET_1X1
#else
#error Unexpected stage.
#endif
// Compress the DRB accumulator to have lower VGPR footprint.
#if defined(CONFIG_OUTPUT_MODE) && CONFIG_OUTPUT_MODE == OUTPUT_MODE_DRB
// Currently disabled: looks like shader compilers completely give up.
// #define CONFIG_ACCUMULATOR_VGPR_COMPRESSION ACCUMULATOR_COMPRESSION_PENUMBRA_DRB
#endif
#elif CONFIG_SIGNAL_PROCESSING == SIGNAL_PROCESSING_POLYCHROMATIC_PENUMBRA_HARMONIC
// Denoise diffuse and specular harmonics at the same time.
#define MAX_SIGNAL_BATCH_SIZE 2
#define SIGNAL_ARRAY_SIZE 2
// Override the batch size coming from DIM_SIGNAL_BATCH_SIZE: this signal always processes two signals.
#undef CONFIG_SIGNAL_BATCH_SIZE
#define CONFIG_SIGNAL_BATCH_SIZE 2
// Each harmonic requires input and output RGB.
#define COMPILE_SIGNAL_COLOR_ARRAY 2
#define CONFIG_BILATERAL_PRESET BILATERAL_PRESET_POLYCHROMATIC_PENUMBRA
#define CONFIG_MULTIPLEXED_SIGNALS_PER_SIGNAL_DOMAIN 1
// Any world distance depends on the harmonic being processed.
#define CONFIG_BILATERAL_DISTANCE_COMPUTATION SIGNAL_WORLD_FREQUENCY_HARMONIC
#if DIM_STAGE == STAGE_RECONSTRUCTION
// Input and output layout.
#define CONFIG_SIGNAL_INPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_POLYCHROMATIC_PENUMBRA_HARMONIC_INPUT
#define CONFIG_SIGNAL_OUTPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_POLYCHROMATIC_PENUMBRA_HARMONIC_RECONSTRUCTION
#define CONFIG_INPUT_TEXTURE_COUNT 4
#define CONFIG_OUTPUT_TEXTURE_COUNT 4
#define CONFIG_SAMPLE_SET SAMPLE_SET_STACKOWIAK_4_SETS
//#define CONFIG_MAX_WITH_REF_DISTANCE 1
#else
#error Unexpected stage.
#endif
#elif CONFIG_SIGNAL_PROCESSING == SIGNAL_PROCESSING_REFLECTIONS
// Denoise only specular.
#define MAX_SIGNAL_BATCH_SIZE 1
#define SIGNAL_ARRAY_SIZE 1
#define COMPILE_SIGNAL_COLOR 1
#define CONFIG_BILATERAL_PRESET BILATERAL_PRESET_REFLECTIONS
#if DIM_STAGE == STAGE_RECONSTRUCTION || DIM_STAGE == STAGE_PRE_CONVOLUTION
// Same value as the definition just above (redundant but harmless redefinition).
#define SIGNAL_ARRAY_SIZE 1
// Input and output layout: only the input layout differs between the two supported stages.
#if DIM_STAGE == STAGE_RECONSTRUCTION
#define CONFIG_INPUT_TEXTURE_COUNT 2
#define CONFIG_SIGNAL_INPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_REFLECTIONS_INPUT
#else
#define CONFIG_INPUT_TEXTURE_COUNT 2
#define CONFIG_SIGNAL_INPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_REFLECTIONS_HISTORY
#endif
#define CONFIG_SIGNAL_OUTPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_REFLECTIONS_HISTORY
#define CONFIG_OUTPUT_TEXTURE_COUNT 2
#define CONFIG_SAMPLE_SET SAMPLE_SET_DIRECTIONAL_ELLIPSE
// Do color accumulation with Karis weighting to avoid flickering specular highlights revealing the kernel pattern.
// TODO(Denoiser): This is a bit aggressive.
#define CONFIG_ACCUMULATION_COLOR_SPACE (COLOR_SPACE_RGB | COLOR_SPACE_KARIS_WEIGHTING)
#else
#error Unexpected stage.
#endif
#elif CONFIG_SIGNAL_PROCESSING == SIGNAL_PROCESSING_AO
// Denoise only AO.
#define MAX_SIGNAL_BATCH_SIZE 1
#if DIM_STAGE == STAGE_RECONSTRUCTION
#define SIGNAL_ARRAY_SIZE 1
// Input and output layout.
#define CONFIG_SIGNAL_INPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_AO_INPUT
#define CONFIG_SIGNAL_OUTPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_AO_HISTORY
#define CONFIG_INPUT_TEXTURE_COUNT 2
#define CONFIG_OUTPUT_TEXTURE_COUNT 1
#define CONFIG_SAMPLE_SET SAMPLE_SET_STACKOWIAK_4_SETS
#define CONFIG_OUTPUT_MODE OUTPUT_MODE_DRB
#define CONFIG_BILATERAL_PRESET BILATERAL_PRESET_AO
#elif DIM_STAGE == STAGE_PRE_CONVOLUTION
#define SIGNAL_ARRAY_SIZE 1 // A single signal is processed in this stage.
// Input and output layout.
#define CONFIG_SIGNAL_INPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_AO_HISTORY
#define CONFIG_SIGNAL_OUTPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_AO_HISTORY
#define CONFIG_INPUT_TEXTURE_COUNT 1
#define CONFIG_OUTPUT_TEXTURE_COUNT 1
#define CONFIG_SAMPLE_SET SAMPLE_SET_HEXAWEB
#define CONFIG_CUSTOM_SPREAD_FACTOR 1
//#define CONFIG_MAX_WITH_REF_DISTANCE 1
#define CONFIG_OUTPUT_MODE OUTPUT_MODE_DRB
#define CONFIG_BILATERAL_PRESET BILATERAL_PRESET_AO
#elif DIM_STAGE == STAGE_REJECTION_PRE_CONVOLUTION
#define SIGNAL_ARRAY_SIZE 2 // First and second moment to measure variance in temporal accumulation.
// Input and output layout.
#define CONFIG_SIGNAL_INPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_AO_HISTORY
#define CONFIG_SIGNAL_OUTPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_AO_REJECTION
#define CONFIG_INPUT_TEXTURE_COUNT 1
#define CONFIG_OUTPUT_TEXTURE_COUNT 1
#define CONFIG_BILATERAL_PRESET BILATERAL_PRESET_AO_HISTORY
#elif DIM_STAGE == STAGE_POST_FILTERING
#define SIGNAL_ARRAY_SIZE 1
// Input and output layout.
#define CONFIG_SIGNAL_INPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_AO_HISTORY
#define CONFIG_SIGNAL_OUTPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_AO_HISTORY
#define CONFIG_INPUT_TEXTURE_COUNT 1
#define CONFIG_OUTPUT_TEXTURE_COUNT 1
#define CONFIG_SAMPLE_SET SAMPLE_SET_HEXAWEB
#define CONFIG_OUTPUT_MODE OUTPUT_MODE_DRB
#define CONFIG_CUSTOM_SPREAD_FACTOR 1
#define CONFIG_SAMPLE_COUNT_POLICY SAMPLE_COUNT_POLICY_SAMPLE_ACCUMULATION_BASED
#define CONFIG_BILATERAL_PRESET BILATERAL_PRESET_AO
#else
#error Unexpected stage.
#endif
#elif CONFIG_SIGNAL_PROCESSING == SIGNAL_PROCESSING_DIFFUSE_INDIRECT_AND_AO
// Denoise diffuse and AO, but AO is FSSDSignalSample::MissCount.
#define MAX_SIGNAL_BATCH_SIZE 1
#define SIGNAL_ARRAY_SIZE 1
#define COMPILE_SIGNAL_COLOR 1
#define CONFIG_BILATERAL_PRESET BILATERAL_PRESET_DIFFUSE
#define CONFIG_INPUT_TEXTURE_COUNT 2
#define CONFIG_OUTPUT_TEXTURE_COUNT 2
#if DIM_STAGE == STAGE_RECONSTRUCTION
// Input and output layout.
#define CONFIG_SIGNAL_INPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_DIFFUSE_INDIRECT_AND_AO_INPUT_NSPP
#define CONFIG_SIGNAL_OUTPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_DIFFUSE_INDIRECT_AND_AO_RECONSTRUCTION
#define CONFIG_SAMPLE_SET SAMPLE_SET_STACKOWIAK_4_SETS
#define CONFIG_BILATERAL_DISTANCE_COMPUTATION SIGNAL_WORLD_FREQUENCY_HIT_DISTANCE
#elif DIM_STAGE == STAGE_PRE_CONVOLUTION
// Input and output layout.
#define CONFIG_SIGNAL_INPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_DIFFUSE_INDIRECT_AND_AO_RECONSTRUCTION
#define CONFIG_SIGNAL_OUTPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_DIFFUSE_INDIRECT_AND_AO_RECONSTRUCTION
#define CONFIG_SAMPLE_SET SAMPLE_SET_HEXAWEB
#define CONFIG_CUSTOM_SPREAD_FACTOR 1
//#define CONFIG_MAX_WITH_REF_DISTANCE 1
#elif DIM_STAGE == STAGE_POST_FILTERING
// Input and output layout.
#define CONFIG_SIGNAL_INPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_DIFFUSE_INDIRECT_AND_AO_HISTORY
#define CONFIG_SIGNAL_OUTPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_DIFFUSE_INDIRECT_AND_AO_HISTORY
#define CONFIG_SAMPLE_SET SAMPLE_SET_STACKOWIAK_4_SETS
#define CONFIG_SAMPLE_SUBSET 1
#define CONFIG_SAMPLE_COUNT_POLICY SAMPLE_COUNT_POLICY_SAMPLE_ACCUMULATION_BASED
#else
#error Unexpected stage.
#endif
// The trailing "&& 0" makes this condition always false: this variant of the spherical harmonic
// configuration is never selected, the "&& 1" variant further down is the active one.
#elif CONFIG_SIGNAL_PROCESSING == SIGNAL_PROCESSING_DIFFUSE_SPHERICAL_HARMONIC && 0
#define MAX_SIGNAL_BATCH_SIZE 1
#define SIGNAL_ARRAY_SIZE 1
#define COMPILE_SIGNAL_COLOR_SH 1
// Given it's a spherical harmonic that stores directionality, only need position based rejection.
#define CONFIG_BILATERAL_PRESET BILATERAL_PRESET_SPHERICAL_HARMONIC
#define CONFIG_INPUT_TEXTURE_COUNT 4
#define CONFIG_OUTPUT_TEXTURE_COUNT 4
#define CONFIG_BILATERAL_DISTANCE_COMPUTATION SIGNAL_WORLD_FREQUENCY_MIN_METADATA
// Input and output layout.
#define CONFIG_SIGNAL_INPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_DIFFUSE_INDIRECT_HARMONIC
#define CONFIG_SIGNAL_INPUT_TEXTURE_TYPE SIGNAL_TEXTURE_TYPE_UINT2
#define CONFIG_SIGNAL_OUTPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_DIFFUSE_INDIRECT_HARMONIC
#define CONFIG_SIGNAL_OUTPUT_TEXTURE_TYPE SIGNAL_TEXTURE_TYPE_UINT2
// Spherical harmonics are a lot of data, need to shrink VGPR pressure to improve latency hiding when fetching the buffer.
// TODO(Denoiser): some shader compilers completely fall apart with the current implementation of
// CONFIG_SIGNAL_VGPR_COMPRESSION and actually drop in occupancy.
#define CONFIG_SIGNAL_VGPR_COMPRESSION SIGNAL_COMPRESSION_DIFFUSE_INDIRECT_HARMONIC
#if DIM_STAGE == STAGE_RECONSTRUCTION
#define CONFIG_SAMPLE_SET SAMPLE_SET_STACKOWIAK_4_SETS
#elif DIM_STAGE == STAGE_REJECTION_PRE_CONVOLUTION
#else
#error Unexpected stage.
#endif
// Active spherical harmonic configuration (condition ends with "&& 1").
#elif CONFIG_SIGNAL_PROCESSING == SIGNAL_PROCESSING_DIFFUSE_SPHERICAL_HARMONIC && 1
#define MAX_SIGNAL_BATCH_SIZE 1
#define SIGNAL_ARRAY_SIZE 1
#define COMPILE_SIGNAL_COLOR_SH 1
// Given it's a spherical harmonic that stores directionality, only need position based rejection.
#define CONFIG_BILATERAL_DISTANCE_COMPUTATION SIGNAL_WORLD_FREQUENCY_MIN_METADATA
#define CONFIG_BILATERAL_PRESET BILATERAL_PRESET_SPHERICAL_HARMONIC
//#define CONFIG_BILATERAL_PRESET BILATERAL_PRESET_DISABLED
#define CONFIG_BILATERAL_DISTANCE_MULTIPLIER 6.0
#define CONFIG_INPUT_TEXTURE_COUNT 2
#define CONFIG_OUTPUT_TEXTURE_COUNT 2
// Input and output layout.
#define CONFIG_SIGNAL_INPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_LUMEN_DIFFUSE_INPUT
#define CONFIG_SIGNAL_OUTPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_LUMEN_DIFFUSE_HISTORY
#if DIM_STAGE == STAGE_RECONSTRUCTION
//#define CONFIG_SAMPLE_SET SAMPLE_SET_STACKOWIAK_4_SETS
#define CONFIG_SAMPLE_SET SAMPLE_SET_RAW_EXPERIMENTAL_KERNEL
#else
#error Unexpected stage.
#endif
#elif CONFIG_SIGNAL_PROCESSING == SIGNAL_PROCESSING_SSGI
// Denoise diffuse and AO, but AO is FSSDSignalSample::MissCount.
#define MAX_SIGNAL_BATCH_SIZE 1
#define SIGNAL_ARRAY_SIZE 1
#define COMPILE_SIGNAL_COLOR 1
#define CONFIG_BILATERAL_PRESET BILATERAL_PRESET_DIFFUSE
//#define CONFIG_BILATERAL_PRESET BILATERAL_PRESET_DISABLED
// SSGI doesn't have any bilateral distance computed from hitT, so allow to blur spatially by about the size of the kernel.
#define CONFIG_BILATERAL_DISTANCE_MULTIPLIER 3.0
#define CONFIG_INPUT_TEXTURE_COUNT 2
#define CONFIG_OUTPUT_TEXTURE_COUNT 2
#if DIM_STAGE == STAGE_RECONSTRUCTION
// Input and output layout.
#define CONFIG_SIGNAL_INPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_SSGI_INPUT
//#define CONFIG_SIGNAL_INPUT_TEXTURE_TYPE SIGNAL_TEXTURE_TYPE_UINT2
#define CONFIG_SIGNAL_OUTPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_SSGI_HISTORY_R11G11B10
#define CONFIG_SAMPLE_SET SAMPLE_SET_STACKOWIAK_4_SETS
//#define CONFIG_SAMPLE_SET SAMPLE_SET_1X1
#else
#error Unexpected stage.
#endif
#elif CONFIG_SIGNAL_PROCESSING == SIGNAL_PROCESSING_DIFFUSE_PROBE_HIERARCHY
// Denoise diffuse and AO, but AO is FSSDSignalSample::MissCount.
// NOTE(review): this comment and the one on the distance multiplier below are identical to the SSGI
// branch and look copy-pasted - confirm they apply to the probe hierarchy signal.
#define MAX_SIGNAL_BATCH_SIZE 1
#define SIGNAL_ARRAY_SIZE 1
#define CONFIG_MULTIPLEXED_SIGNALS_PER_SIGNAL_DOMAIN 1
#define COMPILE_SIGNAL_COLOR_ARRAY 2
#define CONFIG_BILATERAL_PRESET BILATERAL_PRESET_PROBE_HIERARCHY
//#define CONFIG_BILATERAL_PRESET BILATERAL_PRESET_DISABLED
// SSGI doesn't have any bilateral distance computed from hitT, so allow to blur spatially by about the size of the kernel.
#define CONFIG_BILATERAL_DISTANCE_MULTIPLIER 3.0
#define CONFIG_INPUT_TEXTURE_COUNT 2
#define CONFIG_OUTPUT_TEXTURE_COUNT 2
#if DIM_STAGE == STAGE_RECONSTRUCTION
// Input and output layout.
#define CONFIG_SIGNAL_INPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_DIFFUSE_PROBE_HIERARCHY_INPUT
//#define CONFIG_SIGNAL_INPUT_TEXTURE_TYPE SIGNAL_TEXTURE_TYPE_UINT2
#define CONFIG_SIGNAL_OUTPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_DIFFUSE_PROBE_HIERARCHY_HISTORY
//#define CONFIG_SAMPLE_SET SAMPLE_SET_STACKOWIAK_4_SETS
#define CONFIG_SAMPLE_SET SAMPLE_SET_1X1
#else
#error Unexpected stage.
#endif
#else
#error Unknown signal processing.
#endif
// Configure the pass regardless of the signal.
#if DIM_STAGE == STAGE_REJECTION_PRE_CONVOLUTION
#define CONFIG_SAMPLE_SET SAMPLE_SET_3X3_PLUS
// Normalize the input, because we want to measure the spatial variance regardless of how many samples were used to reconstruct the signal.
#define CONFIG_NORMALIZE_INPUT 1
// Output the 2 moments because history rejection is variance based, and may flicker with moment 2 loss since the pre
// convolution will reduce the variance of moment 1.
#define CONFIG_OUTPUT_MODE OUTPUT_MODE_2MOMMENT_SUM
#endif
// No previous frame reprojection, save VGPR.
//#define CONFIG_NEIGHBOR_TO_REF_COMPUTATION NEIGHBOR_TO_REF_LOWEST_VGPR_PRESSURE
//------------------------------------------------------- CONFIG DISABLED DEFAULTS
// Every CONFIG_* knob that the per-signal configuration above may leave undefined gets an explicit
// default here, so that later #if tests never depend on the "undefined identifier evaluates to 0" rule.
/** Whether the UV should be clamped individually per texture. */
#ifndef CONFIG_CLAMP_UV_PER_SIGNAL
#define CONFIG_CLAMP_UV_PER_SIGNAL 0
#endif
/** Changes the logic controlling the number of samples to do. */
#ifndef CONFIG_SAMPLE_COUNT_POLICY
#define CONFIG_SAMPLE_COUNT_POLICY SAMPLE_COUNT_POLICY_DISABLED
#endif
/** Selects a subset of samples of a given CONFIG_SAMPLE_SET. */
#ifndef CONFIG_SAMPLE_SUBSET
#define CONFIG_SAMPLE_SUBSET 0
#endif
/** Whether the ray tracing input may need to be upscaled to the view's resolution. */
#ifndef CONFIG_UPSCALE
#define CONFIG_UPSCALE 0
#endif
/** Whether KernelSpreadFactor is provided as a shader parameter instead of being fixed to 1.
 * Only honoured when CONFIG_UPSCALE is disabled; when upscaling, the spread factor is UpscaleFactor.
 */
#ifndef CONFIG_CUSTOM_SPREAD_FACTOR
#define CONFIG_CUSTOM_SPREAD_FACTOR 0
#endif
/** Color space of the input signal. */
#ifndef CONFIG_INPUT_COLOR_SPACE
#define CONFIG_INPUT_COLOR_SPACE STANDARD_BUFFER_COLOR_SPACE
#endif
/** Color space to use for the accumulation. */
#ifndef CONFIG_ACCUMULATION_COLOR_SPACE
#define CONFIG_ACCUMULATION_COLOR_SPACE STANDARD_BUFFER_COLOR_SPACE
#endif
/** Color space to output in the signal. */
#ifndef CONFIG_OUTPUT_COLOR_SPACE
#define CONFIG_OUTPUT_COLOR_SPACE STANDARD_BUFFER_COLOR_SPACE
#endif
/** Removes the highest color. */
#ifndef CONFIG_REJECT_HIGHEST_COLOR
#define CONFIG_REJECT_HIGHEST_COLOR 0
#endif
/** Whether the input signal should be normalized. */
#ifndef CONFIG_NORMALIZE_INPUT
#define CONFIG_NORMALIZE_INPUT 0
#endif
/** The output mode that should be used. */
#ifndef CONFIG_OUTPUT_MODE
#define CONFIG_OUTPUT_MODE OUTPUT_MODE_SUM
#endif
/** The number of signals that should be processed per signal domain. */
#ifndef CONFIG_MULTIPLEXED_SIGNALS_PER_SIGNAL_DOMAIN
#define CONFIG_MULTIPLEXED_SIGNALS_PER_SIGNAL_DOMAIN SIGNAL_ARRAY_SIZE
#endif
/** Selects how the world distance should be computed for bilateral rejection. */
#ifndef CONFIG_BILATERAL_DISTANCE_COMPUTATION
#define CONFIG_BILATERAL_DISTANCE_COMPUTATION SIGNAL_WORLD_FREQUENCY_MIN_METADATA
#endif
/** Adds a multiplier on how the distance should be computed. */
#ifndef CONFIG_BILATERAL_DISTANCE_MULTIPLIER
#define CONFIG_BILATERAL_DISTANCE_MULTIPLIER 1.0
#endif
/** Whether the neighbor bilateral distance should be maxed with the reference one. */
#ifndef CONFIG_MAX_WITH_REF_DISTANCE
#define CONFIG_MAX_WITH_REF_DISTANCE 0
#endif
//------------------------------------------------------- COMPILATION CONFIGURATION
// Choose the kernel to compile: exactly one COMPILE_*_KERNEL macro gets defined, selected by CONFIG_SAMPLE_SET.
// Any sample set that is not listed explicitly falls back to the box kernel.
// NOTE(review): the COMPILE_* macros of this section are presumably consumed by the headers included
// right after it - confirm.
#if CONFIG_SAMPLE_SET == SAMPLE_SET_STACKOWIAK_4_SETS
#define COMPILE_STACKOWIAK_KERNEL 1
#elif CONFIG_SAMPLE_SET == SAMPLE_SET_HEXAWEB
#define COMPILE_DISK_KERNEL 1
#elif CONFIG_SAMPLE_SET == SAMPLE_SET_DIRECTIONAL_RECT || CONFIG_SAMPLE_SET == SAMPLE_SET_DIRECTIONAL_ELLIPSE
#define COMPILE_DIRECTIONAL_KERNEL 1
#elif CONFIG_SAMPLE_SET == SAMPLE_SET_RAW_EXPERIMENTAL_KERNEL
#define COMPILE_RAW_EXPERIMENTAL_KERNEL 1
#else
#define COMPILE_BOX_KERNEL 1
#endif
// Choose the accumulators to compile, selected by CONFIG_OUTPUT_MODE:
//  - OUTPUT_MODE_DRB:          DRB + min frequency accumulators.
//  - OUTPUT_MODE_2MOMMENT_SUM: moment 1 + moment 2 accumulators.
//  - OUTPUT_MODE_SUM:          moment 1 + min frequency accumulators.
#if CONFIG_OUTPUT_MODE == OUTPUT_MODE_DRB
#define COMPILE_DRB_ACCUMULATOR 1
#define COMPILE_MIN_FREQUENCY_ACCUMULATOR 1
#elif CONFIG_OUTPUT_MODE == OUTPUT_MODE_2MOMMENT_SUM
#define COMPILE_MOMENT1_ACCUMULATOR 1
#define COMPILE_MOMENT2_ACCUMULATOR 1
#elif CONFIG_OUTPUT_MODE == OUTPUT_MODE_SUM
#define COMPILE_MOMENT1_ACCUMULATOR 1
#define COMPILE_MIN_FREQUENCY_ACCUMULATOR 1
#else
#error Unknown output mode.
#endif
//------------------------------------------------------- INCLUDES
#include "SSDSignalFramework.ush"
#include "SSDSignalArray.ush"
#include "SSDSpatialKernel.ush"
//------------------------------------------------------- LATE CONFIG DEFAULTS
/** Choose how the reference metadata should be compressed.
 * Defaults to CONFIG_METADATA_BUFFER_LAYOUT, in which case MainCS forwards the already compressed
 * reference scene metadata to the kernel configuration instead of recompressing it.
 * NOTE(review): this default is set after the includes, presumably because CONFIG_METADATA_BUFFER_LAYOUT
 * is defined by one of them - confirm.
 */
#ifndef CONFIG_REF_METADATA_COMPRESSION
#define CONFIG_REF_METADATA_COMPRESSION CONFIG_METADATA_BUFFER_LAYOUT
#endif
//------------------------------------------------------- PARAMETERS
// Upper bound used by MainCS when clamping the number of samples of the kernel.
uint MaxSampleCount;
// Read by MainCS when setting up the directional kernel of the reflections signal.
uint PreviousCumulativeMaxSampleCount;
// Factor between dispatch thread ids and pixel coordinates (see GetPixelCoord()); also used as the
// kernel spread factor when CONFIG_UPSCALE is enabled.
uint UpscaleFactor;
// The spread factor is only a shader parameter when not upscaling and a custom spread factor is requested;
// in every other configuration MainCS declares a local KernelSpreadFactor instead.
#if !CONFIG_UPSCALE && CONFIG_CUSTOM_SPREAD_FACTOR
float KernelSpreadFactor;
#endif
// Forwarded as is to the HarmonicPeriode member of the kernel configuration.
float HarmonicPeriode;
// One UV min/max per batched signal; copied into the kernel configuration's PerSignalUVMinMax when
// CONFIG_CLAMP_UV_PER_SIGNAL is enabled.
float4 InputBufferUVMinMax[CONFIG_SIGNAL_BATCH_SIZE];
// Input signal textures. Slot 0 is always declared; slots 1..3 are only declared when
// CONFIG_INPUT_TEXTURE_COUNT is large enough, otherwise their names are aliased to slot 0 so that
// MainCS can reference all four slots regardless of the configuration.
#if !defined(CONFIG_INPUT_TEXTURE_COUNT)
#error Missing CONFIG_INPUT_TEXTURE_COUNT
#endif
FSSDTexture2D SignalInput_Textures_0;
FSSDTexture2D SignalInputUint_Textures_0;
#if CONFIG_INPUT_TEXTURE_COUNT > 1
FSSDTexture2D SignalInput_Textures_1;
FSSDTexture2D SignalInputUint_Textures_1;
#else
#define SignalInput_Textures_1 SignalInput_Textures_0
#define SignalInputUint_Textures_1 SignalInputUint_Textures_0
#endif
#if CONFIG_INPUT_TEXTURE_COUNT > 2
FSSDTexture2D SignalInput_Textures_2;
FSSDTexture2D SignalInputUint_Textures_2;
#else
#define SignalInput_Textures_2 SignalInput_Textures_0
#define SignalInputUint_Textures_2 SignalInputUint_Textures_0
#endif
#if CONFIG_INPUT_TEXTURE_COUNT > 3
FSSDTexture2D SignalInput_Textures_3;
FSSDTexture2D SignalInputUint_Textures_3;
#else
#define SignalInput_Textures_3 SignalInput_Textures_0
#define SignalInputUint_Textures_3 SignalInputUint_Textures_0
#endif
// Output signal UAVs, following the same scheme: slot 0 is always declared, the names of the slots
// beyond CONFIG_OUTPUT_TEXTURE_COUNT are aliased to slot 0.
#if !defined(CONFIG_OUTPUT_TEXTURE_COUNT)
#error Missing CONFIG_OUTPUT_TEXTURE_COUNT
#endif
FSSDRWTexture2D SignalOutput_UAVs_0;
#if CONFIG_OUTPUT_TEXTURE_COUNT > 1
FSSDRWTexture2D SignalOutput_UAVs_1;
#else
#define SignalOutput_UAVs_1 SignalOutput_UAVs_0
#endif
#if CONFIG_OUTPUT_TEXTURE_COUNT > 2
FSSDRWTexture2D SignalOutput_UAVs_2;
#else
#define SignalOutput_UAVs_2 SignalOutput_UAVs_0
#endif
#if CONFIG_OUTPUT_TEXTURE_COUNT > 3
FSSDRWTexture2D SignalOutput_UAVs_3;
#else
#define SignalOutput_UAVs_3 SignalOutput_UAVs_0
#endif
//------------------------------------------------------- FUNCTIONS
// TODO(Denoiser): duplicated with reflection code.
/** Maps a dispatch thread id to a pixel coordinate inside its UpscaleFactor x UpscaleFactor block.
 *
 * The sub-pixel picked inside the block is derived from View.StateFrameIndex, so it changes from
 * frame to frame. The bit masks used below assume UpscaleFactor is a power of two.
 *
 * @param DispatchThreadId Thread id to convert.
 * @return DispatchThreadId scaled by UpscaleFactor, offset by the sub-pixel selected for this frame.
 */
uint2 GetPixelCoord(uint2 DispatchThreadId)
{
	const uint SubPixelCount = UpscaleFactor * UpscaleFactor;

	// TODO(Denoiser): find a way to not interfere with TAA's jittering.
	const uint FrameSubPixelIndex = View.StateFrameIndex & (SubPixelCount - 1);

	// Split the linear sub-pixel index into its column (x) and row (y) inside the block.
	const uint2 SubPixelOffset = uint2(
		FrameSubPixelIndex & (UpscaleFactor - 1),
		FrameSubPixelIndex / UpscaleFactor);

	return DispatchThreadId * UpscaleFactor + SubPixelOffset;
}
//------------------------------------------------------- ENTRY POINTS
[numthreads(TILE_PIXEL_SIZE, TILE_PIXEL_SIZE, 1)]
void MainCS(
uint2 DispatchThreadId : SV_DispatchThreadID,
uint2 GroupId : SV_GroupID,
uint2 GroupThreadId : SV_GroupThreadID,
uint GroupThreadIndex : SV_GroupIndex)
{
#if CONFIG_SIGNAL_INPUT_TEXTURE_TYPE == SIGNAL_TEXTURE_TYPE_FLOAT4
Texture2D Signal_Textures_0 = SignalInput_Textures_0;
Texture2D Signal_Textures_1 = SignalInput_Textures_1;
Texture2D Signal_Textures_2 = SignalInput_Textures_2;
Texture2D Signal_Textures_3 = SignalInput_Textures_3;
#else
FSSDTexture2D Signal_Textures_0 = SignalInput_Textures_0;
FSSDTexture2D Signal_Textures_1 = SignalInput_Textures_1;
FSSDTexture2D Signal_Textures_2 = SignalInput_Textures_2;
FSSDTexture2D Signal_Textures_3 = SignalInput_Textures_3;
#endif
// Find out scene buffer UV.
float2 SceneBufferUV = DispatchThreadId * ThreadIdToBufferUV.xy + ThreadIdToBufferUV.zw;
if (true)
{
SceneBufferUV = clamp(SceneBufferUV, DenoiserBufferBilinearUVMinMax.xy, DenoiserBufferBilinearUVMinMax.zw);
}
// Read reference meta data.
FSSDCompressedSceneInfos CompressedRefSceneMetadata;
FSSDSampleSceneInfos RefSceneMetadata;
{
CompressedRefSceneMetadata = SampleCompressedSceneMetadata(
/* bPrevFrame = */ false,
SceneBufferUV, BufferUVToBufferPixelCoord(SceneBufferUV));
float2 ScreenPosition = DenoiserBufferUVToScreenPosition(SceneBufferUV);
RefSceneMetadata = UncompressSampleSceneInfo(
CONFIG_METADATA_BUFFER_LAYOUT, /* bPrevFrame = */ false,
ScreenPosition, CompressedRefSceneMetadata);
}
// Sample the reference sample.
#if !CONFIG_UPSCALE || 1
FSSDSignalArray RefSamples;
FSSDSignalFrequencyArray RefFrequencies;
SampleMultiplexedSignals(
Signal_Textures_0,
Signal_Textures_1,
Signal_Textures_2,
Signal_Textures_3,
GlobalPointClampedSampler,
CONFIG_SIGNAL_INPUT_LAYOUT,
/* MultiplexedSampleId = */ 0,
/* bNormalizeSample = */ CONFIG_NORMALIZE_INPUT != 0,
SceneBufferUV,
/* out */ RefSamples,
/* out */ RefFrequencies);
#if CONFIG_NORMALIZE_INPUT
FSSDSignalArray NormalizedRefSamples = RefSamples;
#else
// TODO(Denoiser): Decode twice instead.
FSSDSignalArray NormalizedRefSamples = NormalizeToOneSampleArray(RefSamples);
#endif
#endif
//DebugOutput[DispatchThreadId] = float4(GetWorldNormal(RefSceneMetadata)* 0.5 + 0.5, GetWorldDepth(RefSceneMetadata));
/** factor by witch should be spread out. */
#if CONFIG_UPSCALE
float KernelSpreadFactor = UpscaleFactor;
#elif !CONFIG_CUSTOM_SPREAD_FACTOR
const float KernelSpreadFactor = 1;
#endif
/** Find out the number of samples that should be done. */
float RequestedSampleCount = 1024;
#if CONFIG_SAMPLE_SET == SAMPLE_SET_NONE
RequestedSampleCount = 1;
#elif CONFIG_SAMPLE_COUNT_POLICY == SAMPLE_COUNT_POLICY_DISABLED
// NOP
#elif CONFIG_SAMPLE_COUNT_POLICY == SAMPLE_COUNT_POLICY_SAMPLE_ACCUMULATION_BASED
{
#if CONFIG_SIGNAL_BATCH_SIZE != 1
#error Unable to support more than one signal.
#endif
RequestedSampleCount = clamp(TARGETED_SAMPLE_COUNT / RefSamples.Array[0].SampleCount, 1, MaxSampleCount);
}
#else
#error Unknown policy to control the number of samples.
#endif
// Register renaming of members of FSSDKernelConfig to survive until the output to UAV
#if (CONFIG_SAMPLE_SET == SAMPLE_SET_STACKOWIAK_4_SETS) && CONFIG_VGPR_OPTIMIZATION
float2 KernelBufferUV;
uint SampleTrackId;
#endif
// Accumulate spatially the input.
FSSDSignalAccumulatorArray SignalAccumulators;
{
FSSDKernelConfig KernelConfig = CreateKernelConfig();
#if DEBUG_OUTPUT
{
KernelConfig.DebugPixelPosition = DispatchThreadId;
KernelConfig.DebugEventCounter = 0;
}
#endif
// Compile time.
KernelConfig.SampleSet = CONFIG_SAMPLE_SET;
KernelConfig.SampleSubSetId = CONFIG_SAMPLE_SUBSET;
KernelConfig.BufferLayout = CONFIG_SIGNAL_INPUT_LAYOUT;
KernelConfig.MultiplexedSignalsPerSignalDomain = CONFIG_MULTIPLEXED_SIGNALS_PER_SIGNAL_DOMAIN;
KernelConfig.NeighborToRefComputation = NEIGHBOR_TO_REF_LOWEST_VGPR_PRESSURE;
KernelConfig.bUnroll = CONFIG_SAMPLE_SET != SAMPLE_SET_STACKOWIAK_4_SETS;
KernelConfig.bDescOrder = CONFIG_OUTPUT_MODE == OUTPUT_MODE_DRB;
KernelConfig.BilateralDistanceComputation = CONFIG_BILATERAL_DISTANCE_COMPUTATION;
KernelConfig.WorldBluringDistanceMultiplier = CONFIG_BILATERAL_DISTANCE_MULTIPLIER;
KernelConfig.bNormalizeSample = CONFIG_NORMALIZE_INPUT != 0;
KernelConfig.bSampleKernelCenter = CONFIG_UPSCALE;
KernelConfig.bForceKernelCenterAccumulation = true;
KernelConfig.bClampUVPerMultiplexedSignal = CONFIG_CLAMP_UV_PER_SIGNAL != 0;
// Reconstruct the spherical harmonic when reconstructing from 1spp.
KernelConfig.bComputeSampleColorSH = DIM_STAGE == STAGE_RECONSTRUCTION && DIM_MULTI_SPP == 0;
{
UNROLL_N(SIGNAL_ARRAY_SIZE)
for (uint MultiplexId = 0; MultiplexId < SIGNAL_ARRAY_SIZE; MultiplexId++)
{
KernelConfig.BufferColorSpace[MultiplexId] = CONFIG_INPUT_COLOR_SPACE;
KernelConfig.AccumulatorColorSpace[MultiplexId] = CONFIG_ACCUMULATION_COLOR_SPACE;
}
}
SetBilateralPreset(CONFIG_BILATERAL_PRESET, /* inout */ KernelConfig);
// SGPRs
KernelConfig.BufferSizeAndInvSize = DenoiserBufferSizeAndInvSize;
KernelConfig.BufferBilinearUVMinMax = DenoiserBufferBilinearUVMinMax;
KernelConfig.KernelSpreadFactor = KernelSpreadFactor;
KernelConfig.HarmonicPeriode = HarmonicPeriode;
#if CONFIG_CLAMP_UV_PER_SIGNAL
{
UNROLL_N(CONFIG_SIGNAL_BATCH_SIZE)
for (uint BatchedSignalId = 0; BatchedSignalId < CONFIG_SIGNAL_BATCH_SIZE; BatchedSignalId++)
{
uint MultiplexId = BatchedSignalId / CONFIG_MULTIPLEXED_SIGNALS_PER_SIGNAL_DOMAIN;
KernelConfig.PerSignalUVMinMax[MultiplexId] = InputBufferUVMinMax[MultiplexId];
}
}
#endif
// VGPRs
KernelConfig.BufferUV = SceneBufferUV;
{
#if CONFIG_REF_METADATA_COMPRESSION == CONFIG_METADATA_BUFFER_LAYOUT
// Straight up plumb down the compress layout to save any VALU.
KernelConfig.CompressedRefSceneMetadata = CompressedRefSceneMetadata;
#else
// Recompress the reference scene metadata
KernelConfig.CompressedRefSceneMetadata = CompressSampleSceneInfo(CONFIG_REF_METADATA_COMPRESSION, RefSceneMetadata);
#endif
KernelConfig.RefBufferUV = SceneBufferUV;
KernelConfig.RefSceneMetadataLayout = CONFIG_REF_METADATA_COMPRESSION;
}
KernelConfig.HammersleySeed = Rand3DPCG16(int3(SceneBufferUV * BufferUVToOutputPixelPosition, View.StateFrameIndexMod8)).xy;
// Set up reference distance for all signals.
#if CONFIG_MAX_WITH_REF_DISTANCE
{
KernelConfig.bMaxWithRefBilateralDistance = true;
UNROLL_N(SIGNAL_ARRAY_SIZE)
for (uint MultiplexId = 0; MultiplexId < SIGNAL_ARRAY_SIZE; MultiplexId++)
{
if (KernelConfig.BilateralDistanceComputation == SIGNAL_WORLD_FREQUENCY_PRECOMPUTED_BLURING_RADIUS)
{
KernelConfig.RefBilateralDistance[MultiplexId] = RefFrequencies.Array[MultiplexId].WorldBluringRadius;
}
else if (KernelConfig.BilateralDistanceComputation == SIGNAL_WORLD_FREQUENCY_HIT_DISTANCE)
{
KernelConfig.RefBilateralDistance[MultiplexId] = RefFrequencies.Array[MultiplexId].ClosestHitDistance;
}
else
{
const uint BatchedSignalId = ComputeSignalBatchIdFromSignalMultiplexId(KernelConfig, MultiplexId);
FSSDSignalDomainKnowledge DomainKnowledge = GetSignalDomainKnowledge(BatchedSignalId);
KernelConfig.RefBilateralDistance[MultiplexId] = GetSignalWorldBluringRadius(RefFrequencies.Array[MultiplexId], RefSceneMetadata, DomainKnowledge);
}
}
}
#endif
// When doing history rejection preconvolution may have invalid ref sample, in witch case need to force take neighborhood to have a clamping box.
#if DIM_STAGE == STAGE_REJECTION_PRE_CONVOLUTION && CONFIG_UPSCALE
{
KernelConfig.bForceAllAccumulation = RefSamples.Array[0].SampleCount == 0;
KernelConfig.SampleSet = SAMPLE_SET_3X3_PLUS;
}
#endif
#if CONFIG_SAMPLE_SET == SAMPLE_SET_HEXAWEB
{
KernelConfig.RingCount = 1;
// TODO(Denoiser): could be improved.
//KernelConfig.bMinSamplePairInvFrequency = true;
float2 E = float2(
InterleavedGradientNoise(DispatchThreadId, 0),
InterleavedGradientNoise(DispatchThreadId, 1));
// Add a bit of jittering to hide low sample.
KernelConfig.bSampleKernelCenter = false;
KernelConfig.BufferUV += View.ViewSizeAndInvSize.zw * (E - 0.5) * (KernelConfig.KernelSpreadFactor);
}
#endif
FSSDSignalAccumulatorArray UncompressedAccumulators = CreateSignalAccumulatorArray();
// When not upscaling, manually force accumulate the sample of the kernel.
if (!KernelConfig.bSampleKernelCenter && !KernelConfig.bDescOrder)
{
UNROLL_N(SIGNAL_ARRAY_SIZE)
for (uint SignalMultiplexId = 0; SignalMultiplexId < SIGNAL_ARRAY_SIZE; SignalMultiplexId++)
{
const uint BatchedSignalId = ComputeSignalBatchIdFromSignalMultiplexId(KernelConfig, SignalMultiplexId);
FSSDSignalDomainKnowledge DomainKnowledge = GetSignalDomainKnowledge(BatchedSignalId);
uint2 RefPixelCoord = floor(KernelConfig.BufferUV * KernelConfig.BufferSizeAndInvSize.xy);
FSSDSignalSample CenterSample = TransformSignalSampleForAccumulation(
KernelConfig,
SignalMultiplexId,
RefSceneMetadata,
RefSamples.Array[SignalMultiplexId],
RefPixelCoord);
FSSDSampleAccumulationInfos SampleInfos;
SampleInfos.Sample = CenterSample;
SampleInfos.Frequency = RefFrequencies.Array[SignalMultiplexId];
SampleInfos.FinalWeight = 1.0;
SampleInfos.InvFrequency = GetSignalWorldBluringRadius(SampleInfos.Frequency, RefSceneMetadata, DomainKnowledge);
if (KernelConfig.BilateralDistanceComputation == SIGNAL_WORLD_FREQUENCY_PRECOMPUTED_BLURING_RADIUS)
{
SampleInfos.InvFrequency = SampleInfos.Frequency.WorldBluringRadius;
}
AccumulateSample(
/* inout */ UncompressedAccumulators.Array[SignalMultiplexId],
SampleInfos);
}
}
#if CONFIG_SAMPLE_SET == SAMPLE_SET_STACKOWIAK_4_SETS
{
KernelConfig.SampleCount = clamp(uint(RequestedSampleCount) / kStackowiakSampleSetCount, 1, MaxSampleCount);
#if CONFIG_UPSCALE
{
// TODO(Denoiser): could be optimised, but currently reusing same peace of code as reflection for maintainability.
uint2 RayDispatchThreadId = (DispatchThreadId - UpscaleFactor / 2) / UpscaleFactor;
uint2 ClosestRayPixelCoord = GetPixelCoord(RayDispatchThreadId);
uint RaySubPixelId = View.StateFrameIndex & (UpscaleFactor * UpscaleFactor - 1);
KernelConfig.BufferUV = ((ViewportMin + ClosestRayPixelCoord + (0.5 * KernelSpreadFactor + 0.5))) * KernelConfig.BufferSizeAndInvSize.zw;
// Sample the center of the kernel by comparing it against the RefSceneMetadata, since it may no match.
KernelConfig.bSampleKernelCenter = true;
// Id of the pixel in the quad.
KernelConfig.SampleTrackId = ((DispatchThreadId.x & 1) | ((DispatchThreadId.y & 1) << 1)) ^ 0x3;
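// To avoid precision problems when comparing potentially identical data, force the kernel center accumulation when this pixel's id in the quad matches the ray's sub-pixel id.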
KernelConfig.bForceKernelCenterAccumulation = RaySubPixelId == ((DispatchThreadId.x & 1) | ((DispatchThreadId.y & 1) << 1));
}
#else
{
// Put the kernel center at the center of the quad. Half pixel shift is done in the sample offsets.
KernelConfig.BufferUV = float2(DispatchThreadId | 1) * ThreadIdToBufferUV.xy + ThreadIdToBufferUV.zw;
// Id of the pixel in the quad. This is to match hard coded first samples of the sample set.
KernelConfig.SampleTrackId = ((DispatchThreadId.x & 1) | ((DispatchThreadId.y & 1) << 1));
}
#endif
#if CONFIG_VGPR_OPTIMIZATION
// Keep SampleTrackId & KernelBufferUV around for computation of the output pixel coordinate.
// Should be VGPR free given they are currently being used in the accumulation as well, which is the highest VGPR pressure of the shader.
// TODO(Denoiser): could save 1 VGPR by using 2 SGPR instead of SampleTrackId.
SampleTrackId = KernelConfig.SampleTrackId;
KernelBufferUV = KernelConfig.BufferUV;
#endif
}
#elif CONFIG_SAMPLE_SET == SAMPLE_SET_DIRECTIONAL_RECT || CONFIG_SAMPLE_SET == SAMPLE_SET_DIRECTIONAL_ELLIPSE
{
#if CONFIG_SIGNAL_PROCESSING == SIGNAL_PROCESSING_REFLECTIONS
{
const float TargetSamplePerPixel = 0.25;
const float MinimalPixelRadius = 0.5 * rsqrt(2.0);
// Project GGX lobe into screen space.
float2 NormalizedScreenMajorAxis;
float InifinityMajorViewportRadius;
float InifinityMinorViewportRadius;
ProjectSpecularLobeToScreenSpace(
RefSceneMetadata,
/* out */ NormalizedScreenMajorAxis,
/* out */ InifinityMajorViewportRadius,
/* out */ InifinityMinorViewportRadius);
float ConfusionFactor = saturate(RefFrequencies.Array[0].ConfusionFactor);
float AspectRatio = InifinityMinorViewportRadius / InifinityMajorViewportRadius;
float PreviousMaxPixelDiameter = sqrt(rcp(TargetSamplePerPixel) * PreviousCumulativeMaxSampleCount / AspectRatio);
float MaxPixelDiameter = sqrt(rcp(TargetSamplePerPixel) * MaxSampleCount * PreviousCumulativeMaxSampleCount / AspectRatio);
KernelConfig.MajorAxis = NormalizedScreenMajorAxis * float2(1, -1);
KernelConfig.MajorPixelRadius = InifinityMajorViewportRadius * ConfusionFactor * View.ViewSizeAndInvSize.x - PreviousMaxPixelDiameter;
float MaxPixelRadius = 0.5 * MaxPixelDiameter;
KernelConfig.MajorPixelRadius = clamp(KernelConfig.MajorPixelRadius, 0, MaxPixelRadius);
KernelConfig.MinorPixelRadius = AspectRatio * KernelConfig.MajorPixelRadius;
// *4 to multiply from radii area to diameters area.
float ConvolutionArea = 4.0 * max(KernelConfig.MajorPixelRadius, MinimalPixelRadius) * max(KernelConfig.MinorPixelRadius, MinimalPixelRadius);
KernelConfig.SampleCount = clamp(ConvolutionArea * TargetSamplePerPixel * rcp(PreviousCumulativeMaxSampleCount), 0, MaxSampleCount);
#if 0
{
DebugOutput[DispatchThreadId] = float4(
KernelConfig.SampleCount,
KernelConfig.MajorPixelRadius,
KernelConfig.MinorPixelRadius,
ConfusionFactor);
}
#elif 0
{
// DebugOutput[DispatchThreadId] = float4(
// KernelConfig.SampleCount,
// InifinityMajorViewportRadius,
// InifinityMinorViewportRadius,
// AspectRatio);
}
#elif 0
{
DebugOutput[DispatchThreadId] = float4(
//GetWorldNormal(RefSceneMetadata) * 0.5 + 0.5,
abs(GetTranslatedWorldPosition(RefSceneMetadata) * 0.001),
RefSceneMetadata.WorldDepth);
}
#endif
// DebugOutput[DispatchThreadId] = float4(
// KernelConfig.SampleCount,
// KernelConfig.MajorPixelRadius,
// KernelConfig.MinorPixelRadius,
// OutOfFocus);
}
#else
#error Directional rect sample set is not supported.
#endif
}
#endif // CONFIG_SAMPLE_SET == SAMPLE_SET_DIRECTIONAL_*
FSSDCompressedSignalAccumulatorArray CompressedAccumulators = CompressAccumulatorArray(UncompressedAccumulators, CONFIG_ACCUMULATOR_VGPR_COMPRESSION);
// Performance: skip pixels/regions where the center sample is invalid (SampleCount = 0) for virtual shadow maps
#if CONFIG_SIGNAL_PROCESSING == SIGNAL_PROCESSING_VIRTUAL_SHADOW_MAP_MASK
bool bRefHasSamples = false;
for (uint SignalMultiplexId = 0; SignalMultiplexId < SIGNAL_ARRAY_SIZE; SignalMultiplexId++)
{
bRefHasSamples = bRefHasSamples || (RefSamples.Array[SignalMultiplexId].SampleCount > 0);
}
BRANCH
if (bRefHasSamples)
#endif
{
AccumulateKernel(
KernelConfig,
Signal_Textures_0,
Signal_Textures_1,
Signal_Textures_2,
Signal_Textures_3,
/* inout */ UncompressedAccumulators,
/* inout */ CompressedAccumulators);
}
// When doing history rejection pre convolution, could still have no information found with the 3x3 + kernel,
// therefore dynamically complete to form an entire 3x3 convolution.
#if DIM_STAGE == STAGE_REJECTION_PRE_CONVOLUTION
{
BRANCH
if (KernelConfig.SampleSet == SAMPLE_SET_3X3_PLUS &&
KernelConfig.bForceAllAccumulation)
{
KernelConfig.SampleSet = SAMPLE_SET_3X3_CROSS;
KernelConfig.bSampleKernelCenter = false;
AccumulateKernel(
KernelConfig,
Signal_Textures_0,
Signal_Textures_1,
Signal_Textures_2,
Signal_Textures_3,
/* inout */ UncompressedAccumulators,
/* inout */ CompressedAccumulators);
}
}
#endif // DIM_STAGE == STAGE_REJECTION_PRE_CONVOLUTION
// Manually sample the center of the kernel after any accumulation when accumulating in descending order.
if (!KernelConfig.bSampleKernelCenter && KernelConfig.bDescOrder)
{
// Remove any jitter the kernel may have. Won't have any VGPR cost when there is no jittering, because KernelConfig.BufferUV == SceneBufferUV.
// TODO(Denoiser): This is costly for VGPR pressure if KernelConfig.BufferUV != SceneBufferUV.
KernelConfig.BufferUV = SceneBufferUV;
SampleAndAccumulateCenterSampleAsItsOwnCluster(
KernelConfig,
Signal_Textures_0,
Signal_Textures_1,
Signal_Textures_2,
Signal_Textures_3,
/* inout */ UncompressedAccumulators,
/* inout */ CompressedAccumulators);
}
#if CONFIG_ACCUMULATOR_VGPR_COMPRESSION == ACCUMULATOR_COMPRESSION_DISABLED
SignalAccumulators = UncompressedAccumulators;
#else
SignalAccumulators = UncompressAccumulatorArray(CompressedAccumulators, CONFIG_ACCUMULATOR_VGPR_COMPRESSION);
#endif
}
// Color processing of the signal to reduce highlight flickering.
#if CONFIG_ACCUMULATION_COLOR_SPACE != CONFIG_OUTPUT_COLOR_SPACE || CONFIG_REJECT_HIGHEST_COLOR
{
UNROLL_N(CONFIG_SIGNAL_BATCH_SIZE)
for (uint MultiplexId = 0; MultiplexId < CONFIG_SIGNAL_BATCH_SIZE; MultiplexId++)
{
UncompressSignalAccumulator(/* inout */ SignalAccumulators.Array[MultiplexId]);
#if CONFIG_REJECT_HIGHEST_COLOR
{
#if !COMPILE_SIGNAL_COLOR
#error Need to compile signal color.
#endif
if (Accumulator.Moment1[MultiplexId].SampleCount > 0)
{
const float MaxNeighborWeight = saturate(SignalAccumulators.Array[MultiplexId].Moment1.SampleCount * rcp(10) - 1);
SignalAccumulators.Array[MultiplexId].Moment1.SceneColor.rgb =
(SignalAccumulators.Array[MultiplexId].Moment1.SceneColor.rgb - MaxNeighbor.SceneColor.rgb * MaxNeighborWeight) *
(SignalAccumulators.Array[MultiplexId].Moment1.SampleCount / (SignalAccumulators.Array[MultiplexId].Moment1.SampleCount - MaxNeighborWeight));
}
}
#endif // CONFIG_REJECT_HIGHEST_COLOR
#if CONFIG_ACCUMULATION_COLOR_SPACE != CONFIG_OUTPUT_COLOR_SPACE
{
#if COMPILE_MOMENT1_ACCUMULATOR
SignalAccumulators.Array[MultiplexId].Moment1 = TransformSignal(
SignalAccumulators.Array[MultiplexId].Moment1,
/* SrcBasis = */ CONFIG_ACCUMULATION_COLOR_SPACE,
/* DestBasis = */ CONFIG_OUTPUT_COLOR_SPACE);
#endif
#if COMPILE_MOMENT2_ACCUMULATOR
SignalAccumulators.Array[MultiplexId].Moment2 = TransformSignal(
SignalAccumulators.Array[MultiplexId].Moment2,
/* SrcBasis = */ CONFIG_ACCUMULATION_COLOR_SPACE,
/* DestBasis = */ CONFIG_OUTPUT_COLOR_SPACE);
#endif
}
#endif // CONFIG_ACCUMULATION_COLOR_SPACE != CONFIG_OUTPUT_COLOR_SPACE
// TODO(Denoiser): it might be better to just uncompress before this for loop and remain uncompressed,
// so the color operation get done in practice during the output sample transcoding.
CompressSignalAccumulator(/* inout */ SignalAccumulators.Array[MultiplexId]);
}
}
#endif // CONFIG_ACCUMULATION_COLOR_SPACE != CONFIG_OUTPUT_COLOR_SPACE || CONFIG_REJECT_HIGHEST_COLOR
// Transcode the spatial accumulation into multiplexed signal according to different modes.
uint MultiplexCount = 1;
FSSDSignalArray OutputSamples = CreateSignalArrayFromScalarValue(0.0);
FSSDSignalFrequencyArray OutputFrequencies = CreateInvalidSignalFrequencyArray();
{
#if CONFIG_OUTPUT_MODE == OUTPUT_MODE_SUM
{
MultiplexCount = CONFIG_SIGNAL_BATCH_SIZE;
UNROLL_N(CONFIG_SIGNAL_BATCH_SIZE)
for (uint MultiplexId = 0; MultiplexId < CONFIG_SIGNAL_BATCH_SIZE; MultiplexId++)
{
UncompressSignalAccumulator(/* inout */ SignalAccumulators.Array[MultiplexId]);
OutputSamples.Array[MultiplexId] = SignalAccumulators.Array[MultiplexId].Moment1;
// Output the minimal inverse frequency as new world bluring radius for subsequent passes.
OutputFrequencies.Array[MultiplexId] = SignalAccumulators.Array[MultiplexId].MinFrequency;
}
}
#elif CONFIG_OUTPUT_MODE == OUTPUT_MODE_2MOMMENT_SUM
{
#if SIGNAL_ARRAY_SIZE != 2 * MAX_SIGNAL_BATCH_SIZE
#error Invalid signal array size.
#endif
MultiplexCount = 2 * CONFIG_SIGNAL_BATCH_SIZE;
UNROLL_N(CONFIG_SIGNAL_BATCH_SIZE)
for (uint BatchedSignalId = 0; BatchedSignalId < CONFIG_SIGNAL_BATCH_SIZE; BatchedSignalId++)
{
UncompressSignalAccumulator(/* inout */ SignalAccumulators.Array[BatchedSignalId * CONFIG_MULTIPLEXED_SIGNALS_PER_SIGNAL_DOMAIN + 0]);
OutputSamples.Array[BatchedSignalId * 2 + 0] = SignalAccumulators.Array[BatchedSignalId * CONFIG_MULTIPLEXED_SIGNALS_PER_SIGNAL_DOMAIN + 0].Moment1;
OutputSamples.Array[BatchedSignalId * 2 + 1] = SignalAccumulators.Array[BatchedSignalId * CONFIG_MULTIPLEXED_SIGNALS_PER_SIGNAL_DOMAIN + 0].Moment2;
}
}
#elif CONFIG_OUTPUT_MODE == OUTPUT_MODE_DRB
{
MultiplexCount = CONFIG_SIGNAL_BATCH_SIZE;
UNROLL_N(CONFIG_SIGNAL_BATCH_SIZE)
for (uint MultiplexId = 0; MultiplexId < CONFIG_SIGNAL_BATCH_SIZE; MultiplexId++)
{
UncompressSignalAccumulator(/* inout */ SignalAccumulators.Array[MultiplexId]);
OutputSamples.Array[MultiplexId] = SignalAccumulators.Array[MultiplexId].Previous;
// Output the minimal inverse frequency as new world bluring radius for subsequent passes.
OutputFrequencies.Array[MultiplexId] = SignalAccumulators.Array[MultiplexId].MinFrequency;
// No need to keep the VGPR pressure at this point for WorldBluringRadius, because no passes use it after.
if (DIM_STAGE == STAGE_POST_FILTERING && 0)
{
OutputFrequencies.Array[MultiplexId].WorldBluringRadius = 0;
}
}
}
#else
#error Unknown output mode.
#endif
}
// Clamp the number of samples recorded.
#if DIM_STAGE == STAGE_POST_FILTERING
{
UNROLL_N(CONFIG_SIGNAL_BATCH_SIZE)
for (uint MultiplexId = 0; MultiplexId < CONFIG_SIGNAL_BATCH_SIZE; MultiplexId++)
{
float CurrentSampleCount = RefSamples.Array[MultiplexId].SampleCount;
float NewSampleCount = min(CurrentSampleCount, TARGETED_SAMPLE_COUNT);
OutputSamples.Array[MultiplexId] = MulSignal(OutputSamples.Array[MultiplexId], CurrentSampleCount > 0 ? NewSampleCount / CurrentSampleCount : 0);
}
}
#endif // DIM_STAGE == STAGE_POST_FILTERING
#if CONFIG_SIGNAL_PROCESSING == SIGNAL_PROCESSING_DIFFUSE_SPHERICAL_HARMONIC && 0
DebugOutput[DispatchThreadId] = float4(
OutputSamples.Array[0].ColorSH.R.V.x,
OutputSamples.Array[0].ColorSH.G.V.x,
OutputSamples.Array[0].ColorSH.B.V.x,
OutputSamples.Array[0].SampleCount);
#endif
// TODO(Denoiser): LeaveRayCount = (LeaveRayCount - 1) * 9 / (9 - 2) post processing to reject when for history rejection
uint2 OutputPixelPostion;
#if CONFIG_VGPR_OPTIMIZATION && !CONFIG_UPSCALE // TODO(Denoiser)
{
// No need to keep DispatchThreadId, can recompute the output pixel position based on information stored in VGPRs for spatial kernel.
#if CONFIG_SAMPLE_SET == SAMPLE_SET_STACKOWIAK_4_SETS
#if CONFIG_UPSCALE
SampleTrackId ^= 0x3;
#endif
OutputPixelPostion = (uint2(KernelBufferUV * BufferUVToOutputPixelPosition) & ~0x1) | (uint2(SampleTrackId, SampleTrackId >> 1) & 0x1);
#else
OutputPixelPostion = BufferUVToBufferPixelCoord(SceneBufferUV);
#endif
}
#else
OutputPixelPostion = ViewportMin + DispatchThreadId;
#endif
#if DEBUG_OUTPUT && CONFIG_SIGNAL_PROCESSING == SIGNAL_PROCESSING_SHADOW_VISIBILITY_MASK && DIM_STAGE == STAGE_RECONSTRUCTION
DebugOutput[DispatchThreadId] = float4(OutputSamples.Array[0].SampleCount, 0, 0, 0);
#endif
BRANCH
if (all(OutputPixelPostion < ViewportMax))
{
// Output the multiplexed signal.
#if DIM_STAGE == STAGE_FINAL_OUTPUT && (CONFIG_SIGNAL_PROCESSING == SIGNAL_PROCESSING_SHADOW_VISIBILITY_MASK || CONFIG_SIGNAL_PROCESSING == SIGNAL_PROCESSING_VIRTUAL_SHADOW_MAP_MASK)
{
UNROLL
for (uint MultiplexId = 0; MultiplexId < MultiplexCount; MultiplexId++)
{
float Shadow = GetSamplePenumbraSafe(OutputSamples.Array[MultiplexId]);
const float ShadowFadeFraction = 1;
float SSSTransmission = (OutputSamples.Array[MultiplexId].SampleCount > 0 ? OutputSamples.Array[MultiplexId].TransmissionDistance / OutputSamples.Array[MultiplexId].SampleCount : OutputSamples.Array[MultiplexId].TransmissionDistance);
// 0 is shadowed, 1 is unshadowed
// RETURN_COLOR not needed unless writing to SceneColor;
float FadedShadow = lerp(1.0f, Shadow, ShadowFadeFraction);
float FadedSSSShadow = lerp(1.0f, SSSTransmission, ShadowFadeFraction);
// the channel assignment is documented in ShadowRendering.cpp (look for Light Attenuation channel assignment)
float4 OutColor;
if (GET_SCALAR_ARRAY_ELEMENT(LightType, MultiplexId) == LIGHT_TYPE_DIRECTIONAL)
{
OutColor = EncodeLightAttenuation(half4(FadedShadow, FadedSSSShadow, 1.0, FadedSSSShadow));
}
else
{
OutColor = EncodeLightAttenuation(half4(FadedShadow, FadedSSSShadow, FadedShadow, FadedSSSShadow));
}
if (MultiplexId == 0)
SignalOutput_UAVs_0[OutputPixelPostion] = OutColor;
if (MultiplexId == 1)
SignalOutput_UAVs_1[OutputPixelPostion] = OutColor;
if (MultiplexId == 2)
SignalOutput_UAVs_2[OutputPixelPostion] = OutColor;
if (MultiplexId == 3)
SignalOutput_UAVs_3[OutputPixelPostion] = OutColor;
}
}
#else
{
OutputMultiplexedSignal(
SignalOutput_UAVs_0,
SignalOutput_UAVs_1,
SignalOutput_UAVs_2,
SignalOutput_UAVs_3,
CONFIG_SIGNAL_OUTPUT_LAYOUT,
MultiplexCount,
OutputPixelPostion,
OutputSamples,
OutputFrequencies);
}
#endif
}
} // MainCS