// Copyright Epic Games, Inc. All Rights Reserved. #include "SSDDefinitions.ush" //------------------------------------------------------- ENUM VALUES /** Different possible stages for spatial accumulation. NOTE(review): the original comment trailed off after "Matches"; presumably these values mirror a stage enum declared elsewhere - confirm. */ #define STAGE_RECONSTRUCTION 0 #define STAGE_PRE_CONVOLUTION 1 #define STAGE_REJECTION_PRE_CONVOLUTION 2 #define STAGE_POST_FILTERING 3 #define STAGE_FINAL_OUTPUT 4 /** Policy to use to change the size of the kernel. */ #define SAMPLE_COUNT_POLICY_DISABLED 0 #define SAMPLE_COUNT_POLICY_SAMPLE_ACCUMULATION_BASED 4 /** What signal should be output. */ // Only output the sum of the signal 0. #define OUTPUT_MODE_SUM 0 // Only output the sum of the moments 1 & 2 of the signal 0. #define OUTPUT_MODE_2MOMMENT_SUM 1 // Output the result of descending ring bucketing. #define OUTPUT_MODE_DRB 2 //------------------------------------------------------- CONFIGS #define TILE_PIXEL_SIZE 8 #define CONFIG_SIGNAL_PROCESSING DIM_SIGNAL_PROCESSING #define CONFIG_UPSCALE DIM_UPSCALE #define CONFIG_SIGNAL_BATCH_SIZE DIM_SIGNAL_BATCH_SIZE // Configures all the passes for each individual signal. #if CONFIG_SIGNAL_PROCESSING == SIGNAL_PROCESSING_SHADOW_VISIBILITY_MASK #define MAX_SIGNAL_BATCH_SIZE CONFIG_SIGNAL_BATCH_SIZE #define SIGNAL_ARRAY_SIZE CONFIG_SIGNAL_BATCH_SIZE #define CONFIG_BILATERAL_PRESET BILATERAL_PRESET_MONOCHROMATIC_PENUMBRA #define CONFIG_MULTIPLEXED_SIGNALS_PER_SIGNAL_DOMAIN 1 #if DIM_STAGE == STAGE_RECONSTRUCTION // Input and output layout. 
#define CONFIG_SIGNAL_INPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_PENUMBRA_INJESTION_NSPP #define CONFIG_SIGNAL_OUTPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_PENUMBRA_HISTORY #define CONFIG_SIGNAL_INPUT_TEXTURE_TYPE SIGNAL_TEXTURE_TYPE_UINT2 #define CONFIG_SIGNAL_OUTPUT_TEXTURE_TYPE SIGNAL_TEXTURE_TYPE_FLOAT4 #define CONFIG_INPUT_TEXTURE_COUNT ((CONFIG_SIGNAL_BATCH_SIZE + 1) / 2) #define CONFIG_OUTPUT_TEXTURE_COUNT CONFIG_SIGNAL_BATCH_SIZE #define CONFIG_SAMPLE_SET SAMPLE_SET_STACKOWIAK_4_SETS #define CONFIG_BILATERAL_DISTANCE_COMPUTATION SIGNAL_WORLD_FREQUENCY_PRECOMPUTED_BLURING_RADIUS #define CONFIG_MAX_WITH_REF_DISTANCE 1 #define CONFIG_OUTPUT_MODE OUTPUT_MODE_DRB #if DIM_SIGNAL_BATCH_SIZE > 1 #define CONFIG_CLAMP_UV_PER_SIGNAL 1 #endif #elif DIM_STAGE == STAGE_PRE_CONVOLUTION // Input and output layout. #define CONFIG_SIGNAL_INPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_PENUMBRA_HISTORY #define CONFIG_SIGNAL_OUTPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_PENUMBRA_HISTORY #define CONFIG_SIGNAL_INPUT_TEXTURE_TYPE SIGNAL_TEXTURE_TYPE_FLOAT4 #define CONFIG_SIGNAL_OUTPUT_TEXTURE_TYPE SIGNAL_TEXTURE_TYPE_FLOAT4 #define CONFIG_INPUT_TEXTURE_COUNT CONFIG_SIGNAL_BATCH_SIZE #define CONFIG_OUTPUT_TEXTURE_COUNT CONFIG_SIGNAL_BATCH_SIZE #define CONFIG_SAMPLE_SET SAMPLE_SET_HEXAWEB #define CONFIG_CUSTOM_SPREAD_FACTOR 1 #define CONFIG_BILATERAL_DISTANCE_COMPUTATION SIGNAL_WORLD_FREQUENCY_PRECOMPUTED_BLURING_RADIUS #define CONFIG_MAX_WITH_REF_DISTANCE 1 #define CONFIG_OUTPUT_MODE OUTPUT_MODE_DRB #elif DIM_STAGE == STAGE_REJECTION_PRE_CONVOLUTION #define CONFIG_SIGNAL_INPUT_TEXTURE_TYPE SIGNAL_TEXTURE_TYPE_FLOAT4 #define CONFIG_SIGNAL_OUTPUT_TEXTURE_TYPE SIGNAL_TEXTURE_TYPE_FLOAT4 #elif DIM_STAGE == STAGE_POST_FILTERING // Input and output layout. 
#define CONFIG_SIGNAL_INPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_PENUMBRA_HISTORY #define CONFIG_SIGNAL_OUTPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_PENUMBRA_HISTORY #define CONFIG_SIGNAL_INPUT_TEXTURE_TYPE SIGNAL_TEXTURE_TYPE_FLOAT4 #define CONFIG_SIGNAL_OUTPUT_TEXTURE_TYPE SIGNAL_TEXTURE_TYPE_FLOAT4 #define CONFIG_INPUT_TEXTURE_COUNT CONFIG_SIGNAL_BATCH_SIZE #define CONFIG_OUTPUT_TEXTURE_COUNT CONFIG_SIGNAL_BATCH_SIZE #define CONFIG_SAMPLE_SET SAMPLE_SET_STACKOWIAK_4_SETS //#define CONFIG_SAMPLE_SUBSET 1 #define CONFIG_BILATERAL_DISTANCE_COMPUTATION SIGNAL_WORLD_FREQUENCY_PRECOMPUTED_BLURING_RADIUS #define CONFIG_OUTPUT_MODE OUTPUT_MODE_DRB #elif DIM_STAGE == STAGE_FINAL_OUTPUT #define CONFIG_SIGNAL_INPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_PENUMBRA_HISTORY #define CONFIG_SIGNAL_INPUT_TEXTURE_TYPE SIGNAL_TEXTURE_TYPE_FLOAT4 #define CONFIG_SIGNAL_OUTPUT_TEXTURE_TYPE SIGNAL_TEXTURE_TYPE_FLOAT4 #define CONFIG_INPUT_TEXTURE_COUNT CONFIG_SIGNAL_BATCH_SIZE #define CONFIG_OUTPUT_TEXTURE_COUNT CONFIG_SIGNAL_BATCH_SIZE #define CONFIG_SAMPLE_SET SAMPLE_SET_1X1 #else #error Unexpected stage. #endif // Compress the DRB accumulator to have lower VGPR footprint. #if defined(CONFIG_OUTPUT_MODE) && CONFIG_OUTPUT_MODE == OUTPUT_MODE_DRB // Looks like shader compilers completely give up. // #define CONFIG_ACCUMULATOR_VGPR_COMPRESSION ACCUMULATOR_COMPRESSION_PENUMBRA_DRB #endif #elif CONFIG_SIGNAL_PROCESSING == SIGNAL_PROCESSING_POLYCHROMATIC_PENUMBRA_HARMONIC // Denoise diffuse and specular harmonics at the same time. #define MAX_SIGNAL_BATCH_SIZE 2 #define SIGNAL_ARRAY_SIZE 2 #undef CONFIG_SIGNAL_BATCH_SIZE #define CONFIG_SIGNAL_BATCH_SIZE 2 // Each harmonic requires input and output RGB. #define COMPILE_SIGNAL_COLOR_ARRAY 2 #define CONFIG_BILATERAL_PRESET BILATERAL_PRESET_POLYCHROMATIC_PENUMBRA #define CONFIG_MULTIPLEXED_SIGNALS_PER_SIGNAL_DOMAIN 1 // Any world distance depends on the harmonic being processed. 
#define CONFIG_BILATERAL_DISTANCE_COMPUTATION SIGNAL_WORLD_FREQUENCY_HARMONIC #if DIM_STAGE == STAGE_RECONSTRUCTION // Input and output layout. #define CONFIG_SIGNAL_INPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_POLYCHROMATIC_PENUMBRA_HARMONIC_INPUT #define CONFIG_SIGNAL_OUTPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_POLYCHROMATIC_PENUMBRA_HARMONIC_RECONSTRUCTION #define CONFIG_INPUT_TEXTURE_COUNT 4 #define CONFIG_OUTPUT_TEXTURE_COUNT 4 #define CONFIG_SAMPLE_SET SAMPLE_SET_STACKOWIAK_4_SETS //#define CONFIG_MAX_WITH_REF_DISTANCE 1 #else #error Unexpected stage. #endif #elif CONFIG_SIGNAL_PROCESSING == SIGNAL_PROCESSING_REFLECTIONS // Denoise only specular. #define MAX_SIGNAL_BATCH_SIZE 1 #define SIGNAL_ARRAY_SIZE 1 #define COMPILE_SIGNAL_COLOR 1 #define CONFIG_BILATERAL_PRESET BILATERAL_PRESET_REFLECTIONS #if DIM_STAGE == STAGE_RECONSTRUCTION || DIM_STAGE == STAGE_PRE_CONVOLUTION #define SIGNAL_ARRAY_SIZE 1 // Input and output layout. #if DIM_STAGE == STAGE_RECONSTRUCTION #define CONFIG_INPUT_TEXTURE_COUNT 2 #define CONFIG_SIGNAL_INPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_REFLECTIONS_INPUT #else #define CONFIG_INPUT_TEXTURE_COUNT 2 #define CONFIG_SIGNAL_INPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_REFLECTIONS_HISTORY #endif #define CONFIG_SIGNAL_OUTPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_REFLECTIONS_HISTORY #define CONFIG_OUTPUT_TEXTURE_COUNT 2 #define CONFIG_SAMPLE_SET SAMPLE_SET_DIRECTIONAL_ELLIPSE // Do color accumulation with Karis weighting to avoid flickering specular highlights showing up the kernel pattern. // TODO(Denoiser): This is a bit aggressive. #define CONFIG_ACCUMULATION_COLOR_SPACE (COLOR_SPACE_RGB | COLOR_SPACE_KARIS_WEIGHTING) #else #error Unexpected stage. #endif #elif CONFIG_SIGNAL_PROCESSING == SIGNAL_PROCESSING_AO // Denoise only AO. #define MAX_SIGNAL_BATCH_SIZE 1 #if DIM_STAGE == STAGE_RECONSTRUCTION #define SIGNAL_ARRAY_SIZE 1 // Input and output layout. 
#define CONFIG_SIGNAL_INPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_AO_INPUT #define CONFIG_SIGNAL_OUTPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_AO_HISTORY #define CONFIG_INPUT_TEXTURE_COUNT 2 #define CONFIG_OUTPUT_TEXTURE_COUNT 1 #define CONFIG_SAMPLE_SET SAMPLE_SET_STACKOWIAK_4_SETS #define CONFIG_OUTPUT_MODE OUTPUT_MODE_DRB #define CONFIG_BILATERAL_PRESET BILATERAL_PRESET_AO #elif DIM_STAGE == STAGE_PRE_CONVOLUTION #define SIGNAL_ARRAY_SIZE 1 // first and second moment to measure variance in temporal accumulation. // Input and output layout. #define CONFIG_SIGNAL_INPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_AO_HISTORY #define CONFIG_SIGNAL_OUTPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_AO_HISTORY #define CONFIG_INPUT_TEXTURE_COUNT 1 #define CONFIG_OUTPUT_TEXTURE_COUNT 1 #define CONFIG_SAMPLE_SET SAMPLE_SET_HEXAWEB #define CONFIG_CUSTOM_SPREAD_FACTOR 1 //#define CONFIG_MAX_WITH_REF_DISTANCE 1 #define CONFIG_OUTPUT_MODE OUTPUT_MODE_DRB #define CONFIG_BILATERAL_PRESET BILATERAL_PRESET_AO #elif DIM_STAGE == STAGE_REJECTION_PRE_CONVOLUTION #define SIGNAL_ARRAY_SIZE 2 // first and second moment to measure variance in temporal accumulation. // Input and output layout. #define CONFIG_SIGNAL_INPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_AO_HISTORY #define CONFIG_SIGNAL_OUTPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_AO_REJECTION #define CONFIG_INPUT_TEXTURE_COUNT 1 #define CONFIG_OUTPUT_TEXTURE_COUNT 1 #define CONFIG_BILATERAL_PRESET BILATERAL_PRESET_AO_HISTORY #elif DIM_STAGE == STAGE_POST_FILTERING #define SIGNAL_ARRAY_SIZE 1 // Input and output layout. 
#define CONFIG_SIGNAL_INPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_AO_HISTORY #define CONFIG_SIGNAL_OUTPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_AO_HISTORY #define CONFIG_INPUT_TEXTURE_COUNT 1 #define CONFIG_OUTPUT_TEXTURE_COUNT 1 #define CONFIG_SAMPLE_SET SAMPLE_SET_HEXAWEB #define CONFIG_OUTPUT_MODE OUTPUT_MODE_DRB #define CONFIG_CUSTOM_SPREAD_FACTOR 1 #define CONFIG_SAMPLE_COUNT_POLICY SAMPLE_COUNT_POLICY_SAMPLE_ACCUMULATION_BASED #define CONFIG_BILATERAL_PRESET BILATERAL_PRESET_AO #else #error Unexpected stage. #endif #elif CONFIG_SIGNAL_PROCESSING == SIGNAL_PROCESSING_DIFFUSE_INDIRECT_AND_AO // Denoise diffuse and AO, but AO is FSSDSignalSample::MissCount. #define MAX_SIGNAL_BATCH_SIZE 1 #define SIGNAL_ARRAY_SIZE 1 #define COMPILE_SIGNAL_COLOR 1 #define CONFIG_BILATERAL_PRESET BILATERAL_PRESET_DIFFUSE #define CONFIG_INPUT_TEXTURE_COUNT 2 #define CONFIG_OUTPUT_TEXTURE_COUNT 2 #if DIM_STAGE == STAGE_RECONSTRUCTION // Input and output layout. #define CONFIG_SIGNAL_INPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_DIFFUSE_INDIRECT_AND_AO_INPUT_NSPP #define CONFIG_SIGNAL_OUTPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_DIFFUSE_INDIRECT_AND_AO_RECONSTRUCTION #define CONFIG_SAMPLE_SET SAMPLE_SET_STACKOWIAK_4_SETS #define CONFIG_BILATERAL_DISTANCE_COMPUTATION SIGNAL_WORLD_FREQUENCY_HIT_DISTANCE #elif DIM_STAGE == STAGE_PRE_CONVOLUTION // Input and output layout. #define CONFIG_SIGNAL_INPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_DIFFUSE_INDIRECT_AND_AO_RECONSTRUCTION #define CONFIG_SIGNAL_OUTPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_DIFFUSE_INDIRECT_AND_AO_RECONSTRUCTION #define CONFIG_SAMPLE_SET SAMPLE_SET_HEXAWEB #define CONFIG_CUSTOM_SPREAD_FACTOR 1 //#define CONFIG_MAX_WITH_REF_DISTANCE 1 #elif DIM_STAGE == STAGE_POST_FILTERING // Input and output layout. 
#define CONFIG_SIGNAL_INPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_DIFFUSE_INDIRECT_AND_AO_HISTORY #define CONFIG_SIGNAL_OUTPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_DIFFUSE_INDIRECT_AND_AO_HISTORY #define CONFIG_SAMPLE_SET SAMPLE_SET_STACKOWIAK_4_SETS #define CONFIG_SAMPLE_SUBSET 1 #define CONFIG_SAMPLE_COUNT_POLICY SAMPLE_COUNT_POLICY_SAMPLE_ACCUMULATION_BASED #else #error Unexpected stage. #endif #elif CONFIG_SIGNAL_PROCESSING == SIGNAL_PROCESSING_DIFFUSE_SPHERICAL_HARMONIC && 0 #define MAX_SIGNAL_BATCH_SIZE 1 #define SIGNAL_ARRAY_SIZE 1 #define COMPILE_SIGNAL_COLOR_SH 1 // Given it's a spherical harmonic that stores directionality, only need position based rejection. #define CONFIG_BILATERAL_PRESET BILATERAL_PRESET_SPHERICAL_HARMONIC #define CONFIG_INPUT_TEXTURE_COUNT 4 #define CONFIG_OUTPUT_TEXTURE_COUNT 4 #define CONFIG_BILATERAL_DISTANCE_COMPUTATION SIGNAL_WORLD_FREQUENCY_MIN_METADATA // Input and output layout. #define CONFIG_SIGNAL_INPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_DIFFUSE_INDIRECT_HARMONIC #define CONFIG_SIGNAL_INPUT_TEXTURE_TYPE SIGNAL_TEXTURE_TYPE_UINT2 #define CONFIG_SIGNAL_OUTPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_DIFFUSE_INDIRECT_HARMONIC #define CONFIG_SIGNAL_OUTPUT_TEXTURE_TYPE SIGNAL_TEXTURE_TYPE_UINT2 // Spherical harmonics are a lot of data, need to shrink VGPR pressure to improve latency hiding when fetching the buffer. // TODO(Denoiser): some shader compilers completely fall apart with the current implementation of // CONFIG_SIGNAL_VGPR_COMPRESSION and actually drop in occupancy. #define CONFIG_SIGNAL_VGPR_COMPRESSION SIGNAL_COMPRESSION_DIFFUSE_INDIRECT_HARMONIC #if DIM_STAGE == STAGE_RECONSTRUCTION #define CONFIG_SAMPLE_SET SAMPLE_SET_STACKOWIAK_4_SETS #elif DIM_STAGE == STAGE_REJECTION_PRE_CONVOLUTION #else #error Unexpected stage. 
#endif #elif CONFIG_SIGNAL_PROCESSING == SIGNAL_PROCESSING_DIFFUSE_SPHERICAL_HARMONIC && 1 #define MAX_SIGNAL_BATCH_SIZE 1 #define SIGNAL_ARRAY_SIZE 1 #define COMPILE_SIGNAL_COLOR_SH 1 // Given it's a spherical harmonic that stores directionality, only need position based rejection. #define CONFIG_BILATERAL_DISTANCE_COMPUTATION SIGNAL_WORLD_FREQUENCY_MIN_METADATA #define CONFIG_BILATERAL_PRESET BILATERAL_PRESET_SPHERICAL_HARMONIC //#define CONFIG_BILATERAL_PRESET BILATERAL_PRESET_DISABLED #define CONFIG_BILATERAL_DISTANCE_MULTIPLIER 6.0 #define CONFIG_INPUT_TEXTURE_COUNT 2 #define CONFIG_OUTPUT_TEXTURE_COUNT 2 // Input and output layout. #define CONFIG_SIGNAL_INPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_LUMEN_DIFFUSE_INPUT #define CONFIG_SIGNAL_OUTPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_LUMEN_DIFFUSE_HISTORY #if DIM_STAGE == STAGE_RECONSTRUCTION //#define CONFIG_SAMPLE_SET SAMPLE_SET_STACKOWIAK_4_SETS #define CONFIG_SAMPLE_SET SAMPLE_SET_RAW_EXPERIMENTAL_KERNEL #else #error Unexpected stage. #endif #elif CONFIG_SIGNAL_PROCESSING == SIGNAL_PROCESSING_SSGI // Denoise diffuse and AO, but AO is FSSDSignalSample::MissCount. #define MAX_SIGNAL_BATCH_SIZE 1 #define SIGNAL_ARRAY_SIZE 1 #define COMPILE_SIGNAL_COLOR 1 #define CONFIG_BILATERAL_PRESET BILATERAL_PRESET_DIFFUSE //#define CONFIG_BILATERAL_PRESET BILATERAL_PRESET_DISABLED // SSGI doesn't have any bilateral distance computed from hitT, so allow to blur spatially by about the size of the kernel. #define CONFIG_BILATERAL_DISTANCE_MULTIPLIER 3.0 #define CONFIG_INPUT_TEXTURE_COUNT 2 #define CONFIG_OUTPUT_TEXTURE_COUNT 2 #if DIM_STAGE == STAGE_RECONSTRUCTION // Input and output layout. #define CONFIG_SIGNAL_INPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_SSGI_INPUT //#define CONFIG_SIGNAL_INPUT_TEXTURE_TYPE SIGNAL_TEXTURE_TYPE_UINT2 #define CONFIG_SIGNAL_OUTPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_SSGI_HISTORY_R11G11B10 #define CONFIG_SAMPLE_SET SAMPLE_SET_STACKOWIAK_4_SETS //#define CONFIG_SAMPLE_SET SAMPLE_SET_1X1 #else #error Unexpected stage. 
#endif #elif CONFIG_SIGNAL_PROCESSING == SIGNAL_PROCESSING_DIFFUSE_PROBE_HIERARCHY // Denoise diffuse and AO, but AO is FSSDSignalSample::MissCount. #define MAX_SIGNAL_BATCH_SIZE 1 #define SIGNAL_ARRAY_SIZE 1 #define CONFIG_MULTIPLEXED_SIGNALS_PER_SIGNAL_DOMAIN 1 #define COMPILE_SIGNAL_COLOR_ARRAY 2 #define CONFIG_BILATERAL_PRESET BILATERAL_PRESET_PROBE_HIERARCHY //#define CONFIG_BILATERAL_PRESET BILATERAL_PRESET_DISABLED // SSGI doesn't have any bilateral distance computed from hitT, so allow to blur spatially by about the size of the kernel. #define CONFIG_BILATERAL_DISTANCE_MULTIPLIER 3.0 #define CONFIG_INPUT_TEXTURE_COUNT 2 #define CONFIG_OUTPUT_TEXTURE_COUNT 2 #if DIM_STAGE == STAGE_RECONSTRUCTION // Input and output layout. #define CONFIG_SIGNAL_INPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_DIFFUSE_PROBE_HIERARCHY_INPUT //#define CONFIG_SIGNAL_INPUT_TEXTURE_TYPE SIGNAL_TEXTURE_TYPE_UINT2 #define CONFIG_SIGNAL_OUTPUT_LAYOUT SIGNAL_BUFFER_LAYOUT_DIFFUSE_PROBE_HIERARCHY_HISTORY //#define CONFIG_SAMPLE_SET SAMPLE_SET_STACKOWIAK_4_SETS #define CONFIG_SAMPLE_SET SAMPLE_SET_1X1 #else #error Unexpected stage. #endif #else #error Unknown signal processing. #endif // Configures pass regardless of the signals. #if DIM_STAGE == STAGE_REJECTION_PRE_CONVOLUTION #define CONFIG_SAMPLE_SET SAMPLE_SET_3X3_PLUS // Normalize the input, because we want to measure the spatial variance regardless of how many samples were used to reconstruct the signal. #define CONFIG_NORMALIZE_INPUT 1 // Output the 2 moments because history rejection is variance based, and may flicker with moment 2 loss since the pre // convolution will reduce the variance of moment 1. #define CONFIG_OUTPUT_MODE OUTPUT_MODE_2MOMMENT_SUM #endif // No previous frame reprojection, save VGPR. //#define CONFIG_NEIGHBOR_TO_REF_COMPUTATION NEIGHBOR_TO_REF_LOWEST_VGPR_PRESSURE //------------------------------------------------------- CONFIG DISABLED DEFAULTS /** Whether the UV should be clamped individually per texture. 
*/ #ifndef CONFIG_CLAMP_UV_PER_SIGNAL #define CONFIG_CLAMP_UV_PER_SIGNAL 0 #endif /** Changes the logic controlling the number of samples to take. */ #ifndef CONFIG_SAMPLE_COUNT_POLICY #define CONFIG_SAMPLE_COUNT_POLICY SAMPLE_COUNT_POLICY_DISABLED #endif /** Selects a subset of samples of a given CONFIG_SAMPLE_SET */ #ifndef CONFIG_SAMPLE_SUBSET #define CONFIG_SAMPLE_SUBSET 0 #endif /** Whether the ray tracing input may need to be upscaled to the view's resolution. */ #ifndef CONFIG_UPSCALE #define CONFIG_UPSCALE 0 #endif /** Color space of the input signal. */ #ifndef CONFIG_INPUT_COLOR_SPACE #define CONFIG_INPUT_COLOR_SPACE STANDARD_BUFFER_COLOR_SPACE #endif /** Color space to use for the accumulation. */ #ifndef CONFIG_ACCUMULATION_COLOR_SPACE #define CONFIG_ACCUMULATION_COLOR_SPACE STANDARD_BUFFER_COLOR_SPACE #endif /** Color space to output in the signal. */ #ifndef CONFIG_OUTPUT_COLOR_SPACE #define CONFIG_OUTPUT_COLOR_SPACE STANDARD_BUFFER_COLOR_SPACE #endif /** Removes the highest color. */ #ifndef CONFIG_REJECT_HIGHEST_COLOR #define CONFIG_REJECT_HIGHEST_COLOR 0 #endif /** Whether the input signal should be normalized. */ #ifndef CONFIG_NORMALIZE_INPUT #define CONFIG_NORMALIZE_INPUT 0 #endif /** The output mode that should be used. */ #ifndef CONFIG_OUTPUT_MODE #define CONFIG_OUTPUT_MODE OUTPUT_MODE_SUM #endif /** The number of signals that should be processed per signal domain. */ #ifndef CONFIG_MULTIPLEXED_SIGNALS_PER_SIGNAL_DOMAIN #define CONFIG_MULTIPLEXED_SIGNALS_PER_SIGNAL_DOMAIN SIGNAL_ARRAY_SIZE #endif /** Selects how the world distance should be computed for bilateral rejection. */ #ifndef CONFIG_BILATERAL_DISTANCE_COMPUTATION #define CONFIG_BILATERAL_DISTANCE_COMPUTATION SIGNAL_WORLD_FREQUENCY_MIN_METADATA #endif /** Adds a multiplier on how the distance should be computed. */ #ifndef CONFIG_BILATERAL_DISTANCE_MULTIPLIER #define CONFIG_BILATERAL_DISTANCE_MULTIPLIER 1.0 #endif /** Whether neighbor bilateral distance should be maxed with reference one. 
*/ #ifndef CONFIG_MAX_WITH_REF_DISTANCE #define CONFIG_MAX_WITH_REF_DISTANCE 0 #endif //------------------------------------------------------- COMPILATION CONFIGURATION // Choose kernel to compile. #if CONFIG_SAMPLE_SET == SAMPLE_SET_STACKOWIAK_4_SETS #define COMPILE_STACKOWIAK_KERNEL 1 #elif CONFIG_SAMPLE_SET == SAMPLE_SET_HEXAWEB #define COMPILE_DISK_KERNEL 1 #elif CONFIG_SAMPLE_SET == SAMPLE_SET_DIRECTIONAL_RECT || CONFIG_SAMPLE_SET == SAMPLE_SET_DIRECTIONAL_ELLIPSE #define COMPILE_DIRECTIONAL_KERNEL 1 #elif CONFIG_SAMPLE_SET == SAMPLE_SET_RAW_EXPERIMENTAL_KERNEL #define COMPILE_RAW_EXPERIMENTAL_KERNEL 1 #else #define COMPILE_BOX_KERNEL 1 #endif // Choose accumulators to compile. #if CONFIG_OUTPUT_MODE == OUTPUT_MODE_DRB #define COMPILE_DRB_ACCUMULATOR 1 #define COMPILE_MIN_FREQUENCY_ACCUMULATOR 1 #elif CONFIG_OUTPUT_MODE == OUTPUT_MODE_2MOMMENT_SUM #define COMPILE_MOMENT1_ACCUMULATOR 1 #define COMPILE_MOMENT2_ACCUMULATOR 1 #elif CONFIG_OUTPUT_MODE == OUTPUT_MODE_SUM #define COMPILE_MOMENT1_ACCUMULATOR 1 #define COMPILE_MIN_FREQUENCY_ACCUMULATOR 1 #else #error Unknown output mode. #endif //------------------------------------------------------- INCLUDES #include "SSDSignalFramework.ush" #include "SSDSignalArray.ush" #include "SSDSpatialKernel.ush" //------------------------------------------------------- LATE CONFIG DEFAULTS /** Choose how the reference metadata should be compressed. 
*/ #ifndef CONFIG_REF_METADATA_COMPRESSION #define CONFIG_REF_METADATA_COMPRESSION CONFIG_METADATA_BUFFER_LAYOUT #endif //------------------------------------------------------- PARAMETERS uint MaxSampleCount; uint PreviousCumulativeMaxSampleCount; uint UpscaleFactor; #if !CONFIG_UPSCALE && CONFIG_CUSTOM_SPREAD_FACTOR float KernelSpreadFactor; #endif float HarmonicPeriode; float4 InputBufferUVMinMax[CONFIG_SIGNAL_BATCH_SIZE]; #if !defined(CONFIG_INPUT_TEXTURE_COUNT) #error Missing CONFIG_INPUT_TEXTURE_COUNT #endif FSSDTexture2D SignalInput_Textures_0; FSSDTexture2D SignalInputUint_Textures_0; #if CONFIG_INPUT_TEXTURE_COUNT > 1 FSSDTexture2D SignalInput_Textures_1; FSSDTexture2D SignalInputUint_Textures_1; #else #define SignalInput_Textures_1 SignalInput_Textures_0 #define SignalInputUint_Textures_1 SignalInputUint_Textures_0 #endif #if CONFIG_INPUT_TEXTURE_COUNT > 2 FSSDTexture2D SignalInput_Textures_2; FSSDTexture2D SignalInputUint_Textures_2; #else #define SignalInput_Textures_2 SignalInput_Textures_0 #define SignalInputUint_Textures_2 SignalInputUint_Textures_0 #endif #if CONFIG_INPUT_TEXTURE_COUNT > 3 FSSDTexture2D SignalInput_Textures_3; FSSDTexture2D SignalInputUint_Textures_3; #else #define SignalInput_Textures_3 SignalInput_Textures_0 #define SignalInputUint_Textures_3 SignalInputUint_Textures_0 #endif #if !defined(CONFIG_OUTPUT_TEXTURE_COUNT) #error Missing CONFIG_OUTPUT_TEXTURE_COUNT #endif FSSDRWTexture2D SignalOutput_UAVs_0; #if CONFIG_OUTPUT_TEXTURE_COUNT > 1 FSSDRWTexture2D SignalOutput_UAVs_1; #else #define SignalOutput_UAVs_1 SignalOutput_UAVs_0 #endif #if CONFIG_OUTPUT_TEXTURE_COUNT > 2 FSSDRWTexture2D SignalOutput_UAVs_2; #else #define SignalOutput_UAVs_2 SignalOutput_UAVs_0 #endif #if CONFIG_OUTPUT_TEXTURE_COUNT > 3 FSSDRWTexture2D SignalOutput_UAVs_3; #else #define SignalOutput_UAVs_3 SignalOutput_UAVs_0 #endif //------------------------------------------------------- FUNCTIONS // TODO(Denoiser): duplicated with reflection code. 
/**
 * Converts a thread id of the lower resolution dispatch into a pixel coordinate scaled by UpscaleFactor.
 *
 * Each thread id maps to an UpscaleFactor x UpscaleFactor footprint. The pixel picked inside that footprint
 * is derived from View.StateFrameIndex, so the selection changes with the frame index.
 * The bit masks below only address every pixel of the footprint when UpscaleFactor is a power of two.
 *
 * @param DispatchThreadId Thread id to convert.
 * @return DispatchThreadId * UpscaleFactor, offset by the sub pixel selected for the current frame index.
 */
uint2 GetPixelCoord(uint2 DispatchThreadId)
{
	// TODO(Denoiser): find a way to not interfer with TAA's jittering.
	const uint FootprintPixelCount = UpscaleFactor * UpscaleFactor;
	const uint FrameSubPixelId = View.StateFrameIndex & (FootprintPixelCount - 1);

	// Split the linear sub pixel id into its column (x) and row (y) inside the footprint.
	const uint2 SubPixelOffset = uint2(
		FrameSubPixelId & (UpscaleFactor - 1),
		FrameSubPixelId / UpscaleFactor);

	return DispatchThreadId * UpscaleFactor + SubPixelOffset;
}

//------------------------------------------------------- ENTRY POINTS [numthreads(TILE_PIXEL_SIZE, TILE_PIXEL_SIZE, 1)] void MainCS( uint2 DispatchThreadId : SV_DispatchThreadID, uint2 GroupId : SV_GroupID, uint2 GroupThreadId : SV_GroupThreadID, uint GroupThreadIndex : SV_GroupIndex) { #if CONFIG_SIGNAL_INPUT_TEXTURE_TYPE == SIGNAL_TEXTURE_TYPE_FLOAT4 Texture2D Signal_Textures_0 = SignalInput_Textures_0; Texture2D Signal_Textures_1 = SignalInput_Textures_1; Texture2D Signal_Textures_2 = SignalInput_Textures_2; Texture2D Signal_Textures_3 = SignalInput_Textures_3; #else FSSDTexture2D Signal_Textures_0 = SignalInput_Textures_0; FSSDTexture2D Signal_Textures_1 = SignalInput_Textures_1; FSSDTexture2D Signal_Textures_2 = SignalInput_Textures_2; FSSDTexture2D Signal_Textures_3 = SignalInput_Textures_3; #endif // Find out scene buffer UV. float2 SceneBufferUV = DispatchThreadId * ThreadIdToBufferUV.xy + ThreadIdToBufferUV.zw; if (true) { SceneBufferUV = clamp(SceneBufferUV, DenoiserBufferBilinearUVMinMax.xy, DenoiserBufferBilinearUVMinMax.zw); } // Read reference meta data. FSSDCompressedSceneInfos CompressedRefSceneMetadata; FSSDSampleSceneInfos RefSceneMetadata; { CompressedRefSceneMetadata = SampleCompressedSceneMetadata( /* bPrevFrame = */ false, SceneBufferUV, BufferUVToBufferPixelCoord(SceneBufferUV)); float2 ScreenPosition = DenoiserBufferUVToScreenPosition(SceneBufferUV); RefSceneMetadata = UncompressSampleSceneInfo( CONFIG_METADATA_BUFFER_LAYOUT, /* bPrevFrame = */ false, ScreenPosition, CompressedRefSceneMetadata); } // Sample the reference sample. 
#if !CONFIG_UPSCALE || 1 FSSDSignalArray RefSamples; FSSDSignalFrequencyArray RefFrequencies; SampleMultiplexedSignals( Signal_Textures_0, Signal_Textures_1, Signal_Textures_2, Signal_Textures_3, GlobalPointClampedSampler, CONFIG_SIGNAL_INPUT_LAYOUT, /* MultiplexedSampleId = */ 0, /* bNormalizeSample = */ CONFIG_NORMALIZE_INPUT != 0, SceneBufferUV, /* out */ RefSamples, /* out */ RefFrequencies); #if CONFIG_NORMALIZE_INPUT FSSDSignalArray NormalizedRefSamples = RefSamples; #else // TODO(Denoiser): Decode twice instead. FSSDSignalArray NormalizedRefSamples = NormalizeToOneSampleArray(RefSamples); #endif #endif //DebugOutput[DispatchThreadId] = float4(GetWorldNormal(RefSceneMetadata)* 0.5 + 0.5, GetWorldDepth(RefSceneMetadata)); /** factor by witch should be spread out. */ #if CONFIG_UPSCALE float KernelSpreadFactor = UpscaleFactor; #elif !CONFIG_CUSTOM_SPREAD_FACTOR const float KernelSpreadFactor = 1; #endif /** Find out the number of samples that should be done. */ float RequestedSampleCount = 1024; #if CONFIG_SAMPLE_SET == SAMPLE_SET_NONE RequestedSampleCount = 1; #elif CONFIG_SAMPLE_COUNT_POLICY == SAMPLE_COUNT_POLICY_DISABLED // NOP #elif CONFIG_SAMPLE_COUNT_POLICY == SAMPLE_COUNT_POLICY_SAMPLE_ACCUMULATION_BASED { #if CONFIG_SIGNAL_BATCH_SIZE != 1 #error Unable to support more than one signal. #endif RequestedSampleCount = clamp(TARGETED_SAMPLE_COUNT / RefSamples.Array[0].SampleCount, 1, MaxSampleCount); } #else #error Unknown policy to control the number of samples. #endif // Register renaming of members of FSSDKernelConfig to survive until the output to UAV #if (CONFIG_SAMPLE_SET == SAMPLE_SET_STACKOWIAK_4_SETS) && CONFIG_VGPR_OPTIMIZATION float2 KernelBufferUV; uint SampleTrackId; #endif // Accumulate spatially the input. FSSDSignalAccumulatorArray SignalAccumulators; { FSSDKernelConfig KernelConfig = CreateKernelConfig(); #if DEBUG_OUTPUT { KernelConfig.DebugPixelPosition = DispatchThreadId; KernelConfig.DebugEventCounter = 0; } #endif // Compile time. 
KernelConfig.SampleSet = CONFIG_SAMPLE_SET; KernelConfig.SampleSubSetId = CONFIG_SAMPLE_SUBSET; KernelConfig.BufferLayout = CONFIG_SIGNAL_INPUT_LAYOUT; KernelConfig.MultiplexedSignalsPerSignalDomain = CONFIG_MULTIPLEXED_SIGNALS_PER_SIGNAL_DOMAIN; KernelConfig.NeighborToRefComputation = NEIGHBOR_TO_REF_LOWEST_VGPR_PRESSURE; KernelConfig.bUnroll = CONFIG_SAMPLE_SET != SAMPLE_SET_STACKOWIAK_4_SETS; KernelConfig.bDescOrder = CONFIG_OUTPUT_MODE == OUTPUT_MODE_DRB; KernelConfig.BilateralDistanceComputation = CONFIG_BILATERAL_DISTANCE_COMPUTATION; KernelConfig.WorldBluringDistanceMultiplier = CONFIG_BILATERAL_DISTANCE_MULTIPLIER; KernelConfig.bNormalizeSample = CONFIG_NORMALIZE_INPUT != 0; KernelConfig.bSampleKernelCenter = CONFIG_UPSCALE; KernelConfig.bForceKernelCenterAccumulation = true; KernelConfig.bClampUVPerMultiplexedSignal = CONFIG_CLAMP_UV_PER_SIGNAL != 0; // Reconstruct the spherical harmonic when reconstructing from 1spp. KernelConfig.bComputeSampleColorSH = DIM_STAGE == STAGE_RECONSTRUCTION && DIM_MULTI_SPP == 0; { UNROLL_N(SIGNAL_ARRAY_SIZE) for (uint MultiplexId = 0; MultiplexId < SIGNAL_ARRAY_SIZE; MultiplexId++) { KernelConfig.BufferColorSpace[MultiplexId] = CONFIG_INPUT_COLOR_SPACE; KernelConfig.AccumulatorColorSpace[MultiplexId] = CONFIG_ACCUMULATION_COLOR_SPACE; } } SetBilateralPreset(CONFIG_BILATERAL_PRESET, /* inout */ KernelConfig); // SGPRs KernelConfig.BufferSizeAndInvSize = DenoiserBufferSizeAndInvSize; KernelConfig.BufferBilinearUVMinMax = DenoiserBufferBilinearUVMinMax; KernelConfig.KernelSpreadFactor = KernelSpreadFactor; KernelConfig.HarmonicPeriode = HarmonicPeriode; #if CONFIG_CLAMP_UV_PER_SIGNAL { UNROLL_N(CONFIG_SIGNAL_BATCH_SIZE) for (uint BatchedSignalId = 0; BatchedSignalId < CONFIG_SIGNAL_BATCH_SIZE; BatchedSignalId++) { uint MultiplexId = BatchedSignalId / CONFIG_MULTIPLEXED_SIGNALS_PER_SIGNAL_DOMAIN; KernelConfig.PerSignalUVMinMax[MultiplexId] = InputBufferUVMinMax[MultiplexId]; } } #endif // VGPRs KernelConfig.BufferUV = 
SceneBufferUV; { #if CONFIG_REF_METADATA_COMPRESSION == CONFIG_METADATA_BUFFER_LAYOUT // Straight up plumb down the compress layout to save any VALU. KernelConfig.CompressedRefSceneMetadata = CompressedRefSceneMetadata; #else // Recompress the reference scene metadata KernelConfig.CompressedRefSceneMetadata = CompressSampleSceneInfo(CONFIG_REF_METADATA_COMPRESSION, RefSceneMetadata); #endif KernelConfig.RefBufferUV = SceneBufferUV; KernelConfig.RefSceneMetadataLayout = CONFIG_REF_METADATA_COMPRESSION; } KernelConfig.HammersleySeed = Rand3DPCG16(int3(SceneBufferUV * BufferUVToOutputPixelPosition, View.StateFrameIndexMod8)).xy; // Set up reference distance for all signals. #if CONFIG_MAX_WITH_REF_DISTANCE { KernelConfig.bMaxWithRefBilateralDistance = true; UNROLL_N(SIGNAL_ARRAY_SIZE) for (uint MultiplexId = 0; MultiplexId < SIGNAL_ARRAY_SIZE; MultiplexId++) { if (KernelConfig.BilateralDistanceComputation == SIGNAL_WORLD_FREQUENCY_PRECOMPUTED_BLURING_RADIUS) { KernelConfig.RefBilateralDistance[MultiplexId] = RefFrequencies.Array[MultiplexId].WorldBluringRadius; } else if (KernelConfig.BilateralDistanceComputation == SIGNAL_WORLD_FREQUENCY_HIT_DISTANCE) { KernelConfig.RefBilateralDistance[MultiplexId] = RefFrequencies.Array[MultiplexId].ClosestHitDistance; } else { const uint BatchedSignalId = ComputeSignalBatchIdFromSignalMultiplexId(KernelConfig, MultiplexId); FSSDSignalDomainKnowledge DomainKnowledge = GetSignalDomainKnowledge(BatchedSignalId); KernelConfig.RefBilateralDistance[MultiplexId] = GetSignalWorldBluringRadius(RefFrequencies.Array[MultiplexId], RefSceneMetadata, DomainKnowledge); } } } #endif // When doing history rejection preconvolution may have invalid ref sample, in witch case need to force take neighborhood to have a clamping box. 
#if DIM_STAGE == STAGE_REJECTION_PRE_CONVOLUTION && CONFIG_UPSCALE { KernelConfig.bForceAllAccumulation = RefSamples.Array[0].SampleCount == 0; KernelConfig.SampleSet = SAMPLE_SET_3X3_PLUS; } #endif #if CONFIG_SAMPLE_SET == SAMPLE_SET_HEXAWEB { KernelConfig.RingCount = 1; // TODO(Denoiser): could be improved. //KernelConfig.bMinSamplePairInvFrequency = true; float2 E = float2( InterleavedGradientNoise(DispatchThreadId, 0), InterleavedGradientNoise(DispatchThreadId, 1)); // Add a bit of jittering to hide low sample. KernelConfig.bSampleKernelCenter = false; KernelConfig.BufferUV += View.ViewSizeAndInvSize.zw * (E - 0.5) * (KernelConfig.KernelSpreadFactor); } #endif FSSDSignalAccumulatorArray UncompressedAccumulators = CreateSignalAccumulatorArray(); // When not upscaling, manually force accumulate the sample of the kernel. if (!KernelConfig.bSampleKernelCenter && !KernelConfig.bDescOrder) { UNROLL_N(SIGNAL_ARRAY_SIZE) for (uint SignalMultiplexId = 0; SignalMultiplexId < SIGNAL_ARRAY_SIZE; SignalMultiplexId++) { const uint BatchedSignalId = ComputeSignalBatchIdFromSignalMultiplexId(KernelConfig, SignalMultiplexId); FSSDSignalDomainKnowledge DomainKnowledge = GetSignalDomainKnowledge(BatchedSignalId); uint2 RefPixelCoord = floor(KernelConfig.BufferUV * KernelConfig.BufferSizeAndInvSize.xy); FSSDSignalSample CenterSample = TransformSignalSampleForAccumulation( KernelConfig, SignalMultiplexId, RefSceneMetadata, RefSamples.Array[SignalMultiplexId], RefPixelCoord); FSSDSampleAccumulationInfos SampleInfos; SampleInfos.Sample = CenterSample; SampleInfos.Frequency = RefFrequencies.Array[SignalMultiplexId]; SampleInfos.FinalWeight = 1.0; SampleInfos.InvFrequency = GetSignalWorldBluringRadius(SampleInfos.Frequency, RefSceneMetadata, DomainKnowledge); if (KernelConfig.BilateralDistanceComputation == SIGNAL_WORLD_FREQUENCY_PRECOMPUTED_BLURING_RADIUS) { SampleInfos.InvFrequency = SampleInfos.Frequency.WorldBluringRadius; } AccumulateSample( /* inout */ 
UncompressedAccumulators.Array[SignalMultiplexId], SampleInfos); } } #if CONFIG_SAMPLE_SET == SAMPLE_SET_STACKOWIAK_4_SETS { KernelConfig.SampleCount = clamp(uint(RequestedSampleCount) / kStackowiakSampleSetCount, 1, MaxSampleCount); #if CONFIG_UPSCALE { // TODO(Denoiser): could be optimised, but currently reusing same peace of code as reflection for maintainability. uint2 RayDispatchThreadId = (DispatchThreadId - UpscaleFactor / 2) / UpscaleFactor; uint2 ClosestRayPixelCoord = GetPixelCoord(RayDispatchThreadId); uint RaySubPixelId = View.StateFrameIndex & (UpscaleFactor * UpscaleFactor - 1); KernelConfig.BufferUV = ((ViewportMin + ClosestRayPixelCoord + (0.5 * KernelSpreadFactor + 0.5))) * KernelConfig.BufferSizeAndInvSize.zw; // Sample the center of the kernel by comparing it against the RefSceneMetadata, since it may no match. KernelConfig.bSampleKernelCenter = true; // Id of the pixel in the quad. KernelConfig.SampleTrackId = ((DispatchThreadId.x & 1) | ((DispatchThreadId.y & 1) << 1)) ^ 0x3; // To avoid precision problem when comparing potentially identicall KernelConfig.bForceKernelCenterAccumulation = RaySubPixelId == ((DispatchThreadId.x & 1) | ((DispatchThreadId.y & 1) << 1)); } #else { // Put the kernel center at the center of the quad. Half pixel shift is done in the sample offsets. KernelConfig.BufferUV = float2(DispatchThreadId | 1) * ThreadIdToBufferUV.xy + ThreadIdToBufferUV.zw; // Id of the pixel in the quad. This is to match hard coded first samples of the sample set. KernelConfig.SampleTrackId = ((DispatchThreadId.x & 1) | ((DispatchThreadId.y & 1) << 1)); } #endif #if CONFIG_VGPR_OPTIMIZATION // Keek sample SampleTrackId & SceneBufferUV arround for computation of pixel output coordinate. // Should be VGPR free given it's curernt is being used in accumulation has well that is highest VGPR pressure of the shader. // TODO(Denoiser): could save 1 VGPR by using 2 SGPR instead of SampleTrackId. 
SampleTrackId = KernelConfig.SampleTrackId; KernelBufferUV = KernelConfig.BufferUV; #endif } #elif CONFIG_SAMPLE_SET == SAMPLE_SET_DIRECTIONAL_RECT || CONFIG_SAMPLE_SET == SAMPLE_SET_DIRECTIONAL_ELLIPSE { #if CONFIG_SIGNAL_PROCESSING == SIGNAL_PROCESSING_REFLECTIONS { const float TargetSamplePerPixel = 0.25; const float MinimalPixelRadius = 0.5 * rsqrt(2.0); // Project GGX lobe into screen space. float2 NormalizedScreenMajorAxis; float InifinityMajorViewportRadius; float InifinityMinorViewportRadius; ProjectSpecularLobeToScreenSpace( RefSceneMetadata, /* out */ NormalizedScreenMajorAxis, /* out */ InifinityMajorViewportRadius, /* out */ InifinityMinorViewportRadius); float ConfusionFactor = saturate(RefFrequencies.Array[0].ConfusionFactor); float AspectRatio = InifinityMinorViewportRadius / InifinityMajorViewportRadius; float PreviousMaxPixelDiameter = sqrt(rcp(TargetSamplePerPixel) * PreviousCumulativeMaxSampleCount / AspectRatio); float MaxPixelDiameter = sqrt(rcp(TargetSamplePerPixel) * MaxSampleCount * PreviousCumulativeMaxSampleCount / AspectRatio); KernelConfig.MajorAxis = NormalizedScreenMajorAxis * float2(1, -1); KernelConfig.MajorPixelRadius = InifinityMajorViewportRadius * ConfusionFactor * View.ViewSizeAndInvSize.x - PreviousMaxPixelDiameter; float MaxPixelRadius = 0.5 * MaxPixelDiameter; KernelConfig.MajorPixelRadius = clamp(KernelConfig.MajorPixelRadius, 0, MaxPixelRadius); KernelConfig.MinorPixelRadius = AspectRatio * KernelConfig.MajorPixelRadius; // *4 to multiply from radii area to diameters area. 
float ConvolutionArea = 4.0 * max(KernelConfig.MajorPixelRadius, MinimalPixelRadius) * max(KernelConfig.MinorPixelRadius, MinimalPixelRadius); KernelConfig.SampleCount = clamp(ConvolutionArea * TargetSamplePerPixel * rcp(PreviousCumulativeMaxSampleCount), 0, MaxSampleCount); #if 0 { DebugOutput[DispatchThreadId] = float4( KernelConfig.SampleCount, KernelConfig.MajorPixelRadius, KernelConfig.MinorPixelRadius, ConfusionFactor); } #elif 0 { // DebugOutput[DispatchThreadId] = float4( // KernelConfig.SampleCount, // InifinityMajorViewportRadius, // InifinityMinorViewportRadius, // AspectRatio); } #elif 0 { DebugOutput[DispatchThreadId] = float4( //GetWorldNormal(RefSceneMetadata) * 0.5 + 0.5, abs(GetTranslatedWorldPosition(RefSceneMetadata) * 0.001), RefSceneMetadata.WorldDepth); } #endif // DebugOutput[DispatchThreadId] = float4( // KernelConfig.SampleCount, // KernelConfig.MajorPixelRadius, // KernelConfig.MinorPixelRadius, // OutOfFocus); } #else #error Directional rect sample set is not supported. 
#endif } #endif // CONFIG_SAMPLE_SET == SAMPLE_SET_DIRECTIONAL_* FSSDCompressedSignalAccumulatorArray CompressedAccumulators = CompressAccumulatorArray(UncompressedAccumulators, CONFIG_ACCUMULATOR_VGPR_COMPRESSION); // Performance: skip pixels/regions where the center sample is invalid (SampleCount = 0) for virtual shadow maps #if CONFIG_SIGNAL_PROCESSING == SIGNAL_PROCESSING_VIRTUAL_SHADOW_MAP_MASK bool bRefHasSamples = false; for (uint SignalMultiplexId = 0; SignalMultiplexId < SIGNAL_ARRAY_SIZE; SignalMultiplexId++) { bRefHasSamples = bRefHasSamples || (RefSamples.Array[SignalMultiplexId].SampleCount > 0); } BRANCH if (bRefHasSamples) #endif { AccumulateKernel( KernelConfig, Signal_Textures_0, Signal_Textures_1, Signal_Textures_2, Signal_Textures_3, /* inout */ UncompressedAccumulators, /* inout */ CompressedAccumulators); } // When doing history rejection pre convolution, could still have no information found with the 3x3 + kernel, // therefore dynamically complete to form an entire 3x3 convolution. #if DIM_STAGE == STAGE_REJECTION_PRE_CONVOLUTION { BRANCH if (KernelConfig.SampleSet == SAMPLE_SET_3X3_PLUS && KernelConfig.bForceAllAccumulation) { KernelConfig.SampleSet = SAMPLE_SET_3X3_CROSS; KernelConfig.bSampleKernelCenter = false; AccumulateKernel( KernelConfig, Signal_Textures_0, Signal_Textures_1, Signal_Textures_2, Signal_Textures_3, /* inout */ UncompressedAccumulators, /* inout */ CompressedAccumulators); } } #endif // DIM_STAGE == STAGE_REJECTION_PRE_CONVOLUTION // Manually sample the center of the kernel after any accumulation when accumulating in descending order. if (!KernelConfig.bSampleKernelCenter && KernelConfig.bDescOrder) { // Remove any jitter the kernel may have. Won't have ant VGPR cost when no jittering, because KernelConfig.BufferUV == SceneBufferUV. // TODO(Denoiser): This is costly for VGPR pressure if using KernelConfig.BufferUV was != SceneBufferUV. 
KernelConfig.BufferUV = SceneBufferUV; SampleAndAccumulateCenterSampleAsItsOwnCluster( KernelConfig, Signal_Textures_0, Signal_Textures_1, Signal_Textures_2, Signal_Textures_3, /* inout */ UncompressedAccumulators, /* inout */ CompressedAccumulators); } #if CONFIG_ACCUMULATOR_VGPR_COMPRESSION == ACCUMULATOR_COMPRESSION_DISABLED SignalAccumulators = UncompressedAccumulators; #else SignalAccumulators = UncompressAccumulatorArray(CompressedAccumulators, CONFIG_ACCUMULATOR_VGPR_COMPRESSION); #endif } // Color processing of the signal to reduce highlight flickering. #if CONFIG_ACCUMULATION_COLOR_SPACE != CONFIG_OUTPUT_COLOR_SPACE || CONFIG_REJECT_HIGHEST_COLOR { UNROLL_N(CONFIG_SIGNAL_BATCH_SIZE) for (uint MultiplexId = 0; MultiplexId < CONFIG_SIGNAL_BATCH_SIZE; MultiplexId++) { UncompressSignalAccumulator(/* inout */ SignalAccumulators.Array[MultiplexId]); #if CONFIG_REJECT_HIGHEST_COLOR { #if !COMPILE_SIGNAL_COLOR #error Need to compile signal color. #endif if (Accumulator.Moment1[MultiplexId].SampleCount > 0) { const float MaxNeighborWeight = saturate(SignalAccumulators.Array[MultiplexId].Moment1.SampleCount * rcp(10) - 1); SignalAccumulators.Array[MultiplexId].Moment1.SceneColor.rgb = (SignalAccumulators.Array[MultiplexId].Moment1.SceneColor.rgb - MaxNeighbor.SceneColor.rgb * MaxNeighborWeight) * (SignalAccumulators.Array[MultiplexId].Moment1.SampleCount / (SignalAccumulators.Array[MultiplexId].Moment1.SampleCount - MaxNeighborWeight)); } } #endif // CONFIG_REJECT_HIGHEST_COLOR #if CONFIG_ACCUMULATION_COLOR_SPACE != CONFIG_OUTPUT_COLOR_SPACE { #if COMPILE_MOMENT1_ACCUMULATOR SignalAccumulators.Array[MultiplexId].Moment1 = TransformSignal( SignalAccumulators.Array[MultiplexId].Moment1, /* SrcBasis = */ CONFIG_ACCUMULATION_COLOR_SPACE, /* DestBasis = */ CONFIG_OUTPUT_COLOR_SPACE); #endif #if COMPILE_MOMENT2_ACCUMULATOR SignalAccumulators.Array[MultiplexId].Moment2 = TransformSignal( SignalAccumulators.Array[MultiplexId].Moment2, /* SrcBasis = */ 
CONFIG_ACCUMULATION_COLOR_SPACE, /* DestBasis = */ CONFIG_OUTPUT_COLOR_SPACE); #endif } #endif // CONFIG_ACCUMULATION_COLOR_SPACE != CONFIG_OUTPUT_COLOR_SPACE // TODO(Denoiser): it might be better to just uncompress before this for loop and remain uncompressed, // so the color operation get done in practice during the output sample transcoding. CompressSignalAccumulator(/* inout */ SignalAccumulators.Array[MultiplexId]); } } #endif // CONFIG_ACCUMULATION_COLOR_SPACE != CONFIG_OUTPUT_COLOR_SPACE || CONFIG_REJECT_HIGHEST_COLOR // Transcode the spatial accumulation into multiplexed signal according to different modes. uint MultiplexCount = 1; FSSDSignalArray OutputSamples = CreateSignalArrayFromScalarValue(0.0); FSSDSignalFrequencyArray OutputFrequencies = CreateInvalidSignalFrequencyArray(); { #if CONFIG_OUTPUT_MODE == OUTPUT_MODE_SUM { MultiplexCount = CONFIG_SIGNAL_BATCH_SIZE; UNROLL_N(CONFIG_SIGNAL_BATCH_SIZE) for (uint MultiplexId = 0; MultiplexId < CONFIG_SIGNAL_BATCH_SIZE; MultiplexId++) { UncompressSignalAccumulator(/* inout */ SignalAccumulators.Array[MultiplexId]); OutputSamples.Array[MultiplexId] = SignalAccumulators.Array[MultiplexId].Moment1; // Output the minimal inverse frequency as new world bluring radius for subsequent passes. OutputFrequencies.Array[MultiplexId] = SignalAccumulators.Array[MultiplexId].MinFrequency; } } #elif CONFIG_OUTPUT_MODE == OUTPUT_MODE_2MOMMENT_SUM { #if SIGNAL_ARRAY_SIZE != 2 * MAX_SIGNAL_BATCH_SIZE #error Invalid signal array size. 
#endif MultiplexCount = 2 * CONFIG_SIGNAL_BATCH_SIZE; UNROLL_N(CONFIG_SIGNAL_BATCH_SIZE) for (uint BatchedSignalId = 0; BatchedSignalId < CONFIG_SIGNAL_BATCH_SIZE; BatchedSignalId++) { UncompressSignalAccumulator(/* inout */ SignalAccumulators.Array[BatchedSignalId * CONFIG_MULTIPLEXED_SIGNALS_PER_SIGNAL_DOMAIN + 0]); OutputSamples.Array[BatchedSignalId * 2 + 0] = SignalAccumulators.Array[BatchedSignalId * CONFIG_MULTIPLEXED_SIGNALS_PER_SIGNAL_DOMAIN + 0].Moment1; OutputSamples.Array[BatchedSignalId * 2 + 1] = SignalAccumulators.Array[BatchedSignalId * CONFIG_MULTIPLEXED_SIGNALS_PER_SIGNAL_DOMAIN + 0].Moment2; } } #elif CONFIG_OUTPUT_MODE == OUTPUT_MODE_DRB { MultiplexCount = CONFIG_SIGNAL_BATCH_SIZE; UNROLL_N(CONFIG_SIGNAL_BATCH_SIZE) for (uint MultiplexId = 0; MultiplexId < CONFIG_SIGNAL_BATCH_SIZE; MultiplexId++) { UncompressSignalAccumulator(/* inout */ SignalAccumulators.Array[MultiplexId]); OutputSamples.Array[MultiplexId] = SignalAccumulators.Array[MultiplexId].Previous; // Output the minimal inverse frequency as new world bluring radius for subsequent passes. OutputFrequencies.Array[MultiplexId] = SignalAccumulators.Array[MultiplexId].MinFrequency; // No need to keep the VGPR pressure at this point for WorldBluringRadius, because no passes use it after. if (DIM_STAGE == STAGE_POST_FILTERING && 0) { OutputFrequencies.Array[MultiplexId].WorldBluringRadius = 0; } } } #else #error Unknown output mode. #endif } // Clamp the number of sample recorded. #if DIM_STAGE == STAGE_POST_FILTERING { UNROLL_N(CONFIG_SIGNAL_BATCH_SIZE) for (uint MultiplexId = 0; MultiplexId < CONFIG_SIGNAL_BATCH_SIZE; MultiplexId++) { float CurrentSampleCount = RefSamples.Array[MultiplexId].SampleCount; float NewSampleCount = min(CurrentSampleCount, TARGETED_SAMPLE_COUNT); OutputSamples.Array[MultiplexId] = MulSignal(OutputSamples.Array[MultiplexId], CurrentSampleCount > 0 ? 
NewSampleCount / CurrentSampleCount : 0); } } #endif // DIM_STAGE == STAGE_POST_FILTERING #if CONFIG_SIGNAL_PROCESSING == SIGNAL_PROCESSING_DIFFUSE_SPHERICAL_HARMONIC && 0 DebugOutput[DispatchThreadId] = float4( OutputSamples.Array[0].ColorSH.R.V.x, OutputSamples.Array[0].ColorSH.G.V.x, OutputSamples.Array[0].ColorSH.B.V.x, OutputSamples.Array[0].SampleCount); #endif // TODO(Denoiser): LeaveRayCount = (LeaveRayCount - 1) * 9 / (9 - 2) post processing to reject when for history rejection uint2 OutputPixelPostion; #if CONFIG_VGPR_OPTIMIZATION && !CONFIG_UPSCALE // TODO(Denoiser) { // No need to keep DispatchThreadId, can recompute the output pixel position based on information stored in VGPRs for spatial kernel. #if CONFIG_SAMPLE_SET == SAMPLE_SET_STACKOWIAK_4_SETS #if CONFIG_UPSCALE SampleTrackId ^= 0x3; #endif OutputPixelPostion = (uint2(KernelBufferUV * BufferUVToOutputPixelPosition) & ~0x1) | (uint2(SampleTrackId, SampleTrackId >> 1) & 0x1); #else OutputPixelPostion = BufferUVToBufferPixelCoord(SceneBufferUV); #endif } #else OutputPixelPostion = ViewportMin + DispatchThreadId; #endif #if DEBUG_OUTPUT && CONFIG_SIGNAL_PROCESSING == SIGNAL_PROCESSING_SHADOW_VISIBILITY_MASK && DIM_STAGE == STAGE_RECONSTRUCTION DebugOutput[DispatchThreadId] = float4(OutputSamples.Array[0].SampleCount, 0, 0, 0); #endif BRANCH if (all(OutputPixelPostion < ViewportMax)) { // Output the multiplexed signal. #if DIM_STAGE == STAGE_FINAL_OUTPUT && (CONFIG_SIGNAL_PROCESSING == SIGNAL_PROCESSING_SHADOW_VISIBILITY_MASK || CONFIG_SIGNAL_PROCESSING == SIGNAL_PROCESSING_VIRTUAL_SHADOW_MAP_MASK) { UNROLL for (uint MultiplexId = 0; MultiplexId < MultiplexCount; MultiplexId++) { float Shadow = GetSamplePenumbraSafe(OutputSamples.Array[MultiplexId]); const float ShadowFadeFraction = 1; float SSSTransmission = (OutputSamples.Array[MultiplexId].SampleCount > 0 ? 
OutputSamples.Array[MultiplexId].TransmissionDistance / OutputSamples.Array[MultiplexId].SampleCount : OutputSamples.Array[MultiplexId].TransmissionDistance); // 0 is shadowed, 1 is unshadowed // RETURN_COLOR not needed unless writing to SceneColor; float FadedShadow = lerp(1.0f, Shadow, ShadowFadeFraction); float FadedSSSShadow = lerp(1.0f, SSSTransmission, ShadowFadeFraction); // the channel assignment is documented in ShadowRendering.cpp (look for Light Attenuation channel assignment) float4 OutColor; if (GET_SCALAR_ARRAY_ELEMENT(LightType, MultiplexId) == LIGHT_TYPE_DIRECTIONAL) { OutColor = EncodeLightAttenuation(half4(FadedShadow, FadedSSSShadow, 1.0, FadedSSSShadow)); } else { OutColor = EncodeLightAttenuation(half4(FadedShadow, FadedSSSShadow, FadedShadow, FadedSSSShadow)); } if (MultiplexId == 0) SignalOutput_UAVs_0[OutputPixelPostion] = OutColor; if (MultiplexId == 1) SignalOutput_UAVs_1[OutputPixelPostion] = OutColor; if (MultiplexId == 2) SignalOutput_UAVs_2[OutputPixelPostion] = OutColor; if (MultiplexId == 3) SignalOutput_UAVs_3[OutputPixelPostion] = OutColor; } } #else { OutputMultiplexedSignal( SignalOutput_UAVs_0, SignalOutput_UAVs_1, SignalOutput_UAVs_2, SignalOutput_UAVs_3, CONFIG_SIGNAL_OUTPUT_LAYOUT, MultiplexCount, OutputPixelPostion, OutputSamples, OutputFrequencies); } #endif } } // MainCS