// Copyright Epic Games, Inc. All Rights Reserved.

#include "SSDSignalAccumulatorArray.ush"
#include "SSDSignalBufferEncoding.ush"
#include "../TextureSampling.ush"
#include "../MonteCarlo.ush"


//------------------------------------------------------- ENUMS

/** Enums to choose how to compute the world distance for bilateral rejection. */
// Only depends on the reference sample's pixel size and depth.
#define SIGNAL_WORLD_FREQUENCY_REF_METADATA_ONLY 0

// Only depends on the sample's pixel size and depth.
#define SIGNAL_WORLD_FREQUENCY_SAMPLE_METADATA_ONLY 1

// Is the smallest of the pixel-size-and-depth derived radii between reference and sample.
#define SIGNAL_WORLD_FREQUENCY_MIN_METADATA 2

// Depends only on the sample's hit distance and metadata.
#define SIGNAL_WORLD_FREQUENCY_HIT_DISTANCE 3

// Uses FSSDSignalSample::WorldBluringRadius precomputed in the sample.
#define SIGNAL_WORLD_FREQUENCY_PRECOMPUTED_BLURING_RADIUS 4

// Compute based on the harmonic being processed.
#define SIGNAL_WORLD_FREQUENCY_HARMONIC 5


//------------------------------------------------------- CONFIG DISABLED DEFAULTS

#ifndef CONFIG_ACCUMULATOR_VGPR_COMPRESSION
	#define CONFIG_ACCUMULATOR_VGPR_COMPRESSION ACCUMULATOR_COMPRESSION_DISABLED
#endif

#define CONFIG_ENABLE_WAVE_BROADCAST (PLATFORM_SUPPORTS_WAVE_BROADCAST)

#ifndef COMPILE_BOX_KERNEL
	#define COMPILE_BOX_KERNEL 0
#endif

#ifndef COMPILE_STACKOWIAK_KERNEL
	#define COMPILE_STACKOWIAK_KERNEL 0
#endif

#ifndef COMPILE_DISK_KERNEL
	#define COMPILE_DISK_KERNEL 0
#endif

#ifndef COMPILE_DIRECTIONAL_KERNEL
	#define COMPILE_DIRECTIONAL_KERNEL 0
#endif

#ifndef COMPILE_RAW_EXPERIMENTAL_KERNEL
	#define COMPILE_RAW_EXPERIMENTAL_KERNEL 0
#endif

#ifndef FORCE_IDENTICAL_COLOR_SPACE
	#define FORCE_IDENTICAL_COLOR_SPACE 0
#endif


//------------------------------------------------------- STRUCTURES

/** Configures the spatial kernel. */
struct FSSDKernelConfig
{
	// --------------------------- compile time.

	// Compile time set of sample to use.
	uint SampleSet;

	// Compile time selection of sample to use.
	uint SampleSubSetId;

	// Compile time layout of the buffer to accumulate.
	uint BufferLayout;

	// Compile time number of multiplexed signal per signal domain.
	uint MultiplexedSignalsPerSignalDomain;

	// Selects how the world distance should be computed for bilateral rejection at compile time
	// (one of the SIGNAL_WORLD_FREQUENCY_* values above).
	uint BilateralDistanceComputation;

	// Number of ring for a disk kernel.
	uint RingCount;

	/** Selects how the computation of world vector between the reference and neighbor should be computed. */
	uint NeighborToRefComputation;

	// Layout of RefSceneMetadata.
	uint RefSceneMetadataLayout;

	// Multiplier applied on the world bluring distance of the signal.
	float WorldBluringDistanceMultiplier;

	// Compile time configuration whether want do LOOP or UNROLL.
	// false by default to expose in user code when the shader byte code might potentially be big.
	bool bUnroll;

	// Compile time whether the center of the kernel sample is sampled.
	bool bSampleKernelCenter;

	// Compile time whether sampling previous frame or current frame metadata.
	bool bPreviousFrameMetadata;

	// Compile time whether reference metadata is current frame or previous frame.
	bool bPreviousFrameRefMetadata;

	// The sample should be accumulated starting from the furthest away.
	bool bDescOrder;

	// Whether a sample should be normalised to 1 before accumulation.
	bool bNormalizeSample;

	// Whether should min sample frequency of pair of samples.
	// [ Jimenez 2014, "Next Generation Post Processing in Call of Duty: Advanced Warfare" ]
	bool bMinSamplePairInvFrequency;

	// Whether the bilateral distance should be maxed with reference bilateral distance.
	bool bMaxWithRefBilateralDistance;

	// Whether the spherical harmonic of a sample should be computed before accumulation.
	bool bComputeSampleColorSH;

	// Whether should clamp the UV individually per signal.
	bool bClampUVPerMultiplexedSignal;

	// The color space that has been encoded in the buffer.
	uint BufferColorSpace[SIGNAL_ARRAY_SIZE];

	// The color space of the accumulation.
	uint AccumulatorColorSpace[SIGNAL_ARRAY_SIZE];

	// Per-multiplexed-signal bilateral rejection settings (BILATERAL_* flags).
	uint BilateralSettings[SIGNAL_ARRAY_SIZE];

	// --------------------------- Per wave.

	// Buffer size and inv size.
	float4 BufferSizeAndInvSize;

	// Min/max buffer UV used to reject samples outside the viewport.
	float4 BufferBilinearUVMinMax;

	// Multiplier on the sample's offset.
	float KernelSpreadFactor;

	// The period of the harmonic being sampled.
	float HarmonicPeriode;

	// Buffer's min and max UV, per texture.
	float4 PerSignalUVMinMax[SIGNAL_ARRAY_SIZE];

	// --------------------------- Per lane.

	// Number of samples should be done when doing variable box sampling.
	uint BoxKernelRadius;

	// Runtime number of samples.
	uint SampleCount;

	// Buffer coordinate of the center of the kernel.
	float2 BufferUV;

	// Metadata of the scene for the bilateral term.
	FSSDCompressedSceneInfos CompressedRefSceneMetadata;

	// Buffer coordinate of the reference used for decompression.
	// Please try to make this same as BufferUV.
	float2 RefBufferUV;

	// Runtime to force the first sample of the kernel to be accumulated.
	bool bForceKernelCenterAccumulation;

	// Runtime to force accumulating all sample.
	bool bForceAllAccumulation;

	// Runtime whether this pixel is dynamic object.
	bool bIsDynamicPixel;

	// Runtime selection of a track of sample.
	uint SampleTrackId;

	// Reference meta data.
	float RefBilateralDistance[SIGNAL_ARRAY_SIZE];

	// Uniform random values required for stochastic kernel.
	float Randoms[1];

	// Seed for Hammersley sequence used for stochastic kernel.
	uint2 HammersleySeed;

	// Normalized pixel space direction for directional kernels.
	float2 MajorAxis;

	// The pixel radius along the major and minor axes for directional kernels.
float MajorPixelRadius;
	float MinorPixelRadius;

#if DEBUG_OUTPUT
	uint2 DebugPixelPosition;
	uint DebugEventCounter;
#endif
};


/** Creates a kernel configuration with every field initialised to a neutral default. */
FSSDKernelConfig CreateKernelConfig()
{
	FSSDKernelConfig KernelConfig;

	// Compile time parameters.
	KernelConfig.SampleSet = SAMPLE_SET_1X1;
	KernelConfig.SampleSubSetId = 0;
	KernelConfig.BufferLayout = SIGNAL_BUFFER_LAYOUT_UNINITIALIZED;
	KernelConfig.MultiplexedSignalsPerSignalDomain = SIGNAL_ARRAY_SIZE;
	KernelConfig.NeighborToRefComputation = NEIGHBOR_TO_REF_CACHE_WORLD_POSITION;
	KernelConfig.RefSceneMetadataLayout = METADATA_BUFFER_LAYOUT_DISABLED;
	KernelConfig.RingCount = 0;
	KernelConfig.WorldBluringDistanceMultiplier = 1.0;
	KernelConfig.bUnroll = false;
	KernelConfig.bSampleKernelCenter = false;
	KernelConfig.bPreviousFrameMetadata = false;
	KernelConfig.bPreviousFrameRefMetadata = false;
	KernelConfig.BilateralDistanceComputation = SIGNAL_WORLD_FREQUENCY_MIN_METADATA;
	KernelConfig.bDescOrder = false;
	KernelConfig.bNormalizeSample = false;
	KernelConfig.bMinSamplePairInvFrequency = false;
	KernelConfig.bMaxWithRefBilateralDistance = false;
	KernelConfig.bComputeSampleColorSH = false;
	KernelConfig.bClampUVPerMultiplexedSignal = false;
	{
		UNROLL_N(SIGNAL_ARRAY_SIZE)
		for (uint MultiplexId = 0; MultiplexId < SIGNAL_ARRAY_SIZE; MultiplexId++)
		{
			KernelConfig.BufferColorSpace[MultiplexId] = STANDARD_BUFFER_COLOR_SPACE;
			KernelConfig.AccumulatorColorSpace[MultiplexId] = STANDARD_BUFFER_COLOR_SPACE;
			KernelConfig.BilateralSettings[MultiplexId] = 0x0000;
		}
	}

	// SGPRs.
	KernelConfig.BufferSizeAndInvSize = float4(0, 0, 0, 0);
	KernelConfig.BufferBilinearUVMinMax = float4(0, 0, 0, 0);
	KernelConfig.KernelSpreadFactor = 1;
	KernelConfig.HarmonicPeriode = 1.0;
	{
		UNROLL_N(SIGNAL_ARRAY_SIZE)
		for (uint MultiplexId = 0; MultiplexId < SIGNAL_ARRAY_SIZE; MultiplexId++)
		{
			KernelConfig.PerSignalUVMinMax[MultiplexId] = 0.0;
		}
	}

	// VGPRs.
KernelConfig.BoxKernelRadius = 1;
	KernelConfig.SampleCount = 1;
	KernelConfig.BufferUV = 0.0;
	KernelConfig.CompressedRefSceneMetadata = CreateCompressedSceneInfos();
	KernelConfig.RefBufferUV = 0.0;
	KernelConfig.bForceKernelCenterAccumulation = false;
	KernelConfig.bForceAllAccumulation = false;
	KernelConfig.bIsDynamicPixel = false;
	KernelConfig.SampleTrackId = 0;
	KernelConfig.MajorAxis = 0.0;
	KernelConfig.MajorPixelRadius = 0.0;
	KernelConfig.MinorPixelRadius = 0.0;
	KernelConfig.HammersleySeed = 0;
	{
		UNROLL_N(SIGNAL_ARRAY_SIZE)
		for (uint MultiplexId = 0; MultiplexId < SIGNAL_ARRAY_SIZE; MultiplexId++)
		{
			KernelConfig.RefBilateralDistance[MultiplexId] = 0.0;
		}
	}
	{
		// FIX: the unroll hint said UNROLL_N(2), but Randoms[] has exactly one element and the
		// loop's trip count is 1; keep the hint consistent with the actual iteration count.
		UNROLL_N(1)
		for (uint RandomSignalId = 0; RandomSignalId < 1; RandomSignalId++)
		{
			KernelConfig.Randoms[RandomSignalId] = 0.0;
		}
	}

#if DEBUG_OUTPUT
	{
		KernelConfig.DebugPixelPosition = 0;
		KernelConfig.DebugEventCounter = 0;
	}
#endif

	return KernelConfig;
}

/** Applies one of the BILATERAL_PRESET_* presets onto the kernel configuration's
 *  per-signal BilateralSettings.
 */
void SetBilateralPreset(uint BilateralPresetId, inout FSSDKernelConfig KernelConfig)
{
	if (BilateralPresetId == BILATERAL_PRESET_MONOCHROMATIC_PENUMBRA)
	{
		UNROLL_N(SIGNAL_ARRAY_SIZE)
		for (uint MultiplexId = 0; MultiplexId < SIGNAL_ARRAY_SIZE; MultiplexId++)
		{
			// Change the bilateral settings to use normal orientation in
			// order to not merge background / foreground sample, as otherwise this results into leaks.
			// Shadow masks are normal invariant, so only reject based on position.
			KernelConfig.BilateralSettings[MultiplexId] = BILATERAL_POSITION_BASED(5) | BILATERAL_NORMAL;
		}
	}
	else if (BilateralPresetId == BILATERAL_PRESET_POLYCHROMATIC_PENUMBRA)
	{
		// Diffuse.
		KernelConfig.BilateralSettings[0] = BILATERAL_POSITION_BASED(5) | BILATERAL_NORMAL;

		// Specular.
#if SIGNAL_ARRAY_SIZE > 1
		KernelConfig.BilateralSettings[1] = BILATERAL_POSITION_BASED(5) | BILATERAL_TOKOYASHI;
#endif
	}
	else if (BilateralPresetId == BILATERAL_PRESET_REFLECTIONS)
	{
		// Specular.
		// Can only be done using tokoyashi because have more than one sample at a time.
KernelConfig.BilateralSettings[0] = BILATERAL_TOKOYASHI;

#if SIGNAL_ARRAY_SIZE > 1
		// Specular variance for sampling rejection pre convolution.
		KernelConfig.BilateralSettings[1] = KernelConfig.BilateralSettings[0];
#endif
	}
	else if (BilateralPresetId == BILATERAL_PRESET_REFLECTIONS_1SPP)
	{
		// Specular.
		// Use specular ratio estimator, so no need to reject based on the axis of the lobe.
		KernelConfig.BilateralSettings[0] = BILATERAL_TOKOYASHI_LOBE;
	}
	else if (BilateralPresetId == BILATERAL_PRESET_REFLECTIONS_TAA)
	{
		// Specular.
		// Can only be done using tokoyashi because have more than one sample at a time.
		KernelConfig.BilateralSettings[0] = BILATERAL_POSITION_BASED(1) | BILATERAL_TOKOYASHI;

#if SIGNAL_ARRAY_SIZE > 1
		// Specular variance for sampling rejection pre convolution.
		KernelConfig.BilateralSettings[1] = KernelConfig.BilateralSettings[0];
#endif
	}
	else if (BilateralPresetId == BILATERAL_PRESET_DIFFUSE)
	{
		// Diffuse depends on world position, but also the normal of the surface given it doesn't store any directionality.
		KernelConfig.BilateralSettings[0] = BILATERAL_POSITION_BASED(2) | BILATERAL_NORMAL;

#if SIGNAL_ARRAY_SIZE > 1
		// Variance for sampling rejection pre convolution.
		KernelConfig.BilateralSettings[1] = KernelConfig.BilateralSettings[0];
#endif
	}
	else if (BilateralPresetId == BILATERAL_PRESET_SPHERICAL_HARMONIC)
	{
		// Spherical harmonic encode directionality, so only reject based on world position.
		KernelConfig.BilateralSettings[0] = BILATERAL_POSITION_BASED(2);
	}
	else if (BilateralPresetId == BILATERAL_PRESET_PROBE_HIERARCHY)
	{
		// Diffuse & specular bilateral component.
		KernelConfig.BilateralSettings[0] = BILATERAL_POSITION_BASED(1) | BILATERAL_SHADING_MODEL;
	}
	else if (BilateralPresetId == BILATERAL_PRESET_AO)
	{
		// Diffuse depends on world position, but also the normal of the surface given it doesn't store any directionality.
		KernelConfig.BilateralSettings[0] = BILATERAL_POSITION_BASED(4) | BILATERAL_NORMAL;

#if SIGNAL_ARRAY_SIZE > 1
		// Variance for sampling rejection pre convolution.
		KernelConfig.BilateralSettings[1] = KernelConfig.BilateralSettings[0];
#endif
	}
	else if (BilateralPresetId == BILATERAL_PRESET_AO_HISTORY)
	{
		// Diffuse depends on world position, but also the normal of the surface given it doesn't store any directionality.
		KernelConfig.BilateralSettings[0] = BILATERAL_POSITION_BASED(2) | BILATERAL_NORMAL;
		//KernelConfig.BilateralSettings[0] = BILATERAL_NORMAL;

#if SIGNAL_ARRAY_SIZE > 1
		// Variance for sampling rejection pre convolution.
		KernelConfig.BilateralSettings[1] = KernelConfig.BilateralSettings[0];
#endif
	}
}


//------------------------------------------------------- CONSTANT

static const float kWaveletFilterWeights5x5[] = { 3.0 / 8.0, 1.0 / 4.0, 1.0 / 16.0 };


//------------------------------------------------------- REDERIVE INFORMATION FOR LOWER VGPR OCCUPANCY

/** Deduce the buffer UV of the output pixel this kernel has been configured for. */
ISOLATE
float2 ComputeRefBufferUV(FSSDKernelConfig KernelConfig)
{
	if (KernelConfig.bPreviousFrameMetadata)
	{
		// Impossible to compute from BufferUV because it's in the previous frame basis.
		return KernelConfig.RefBufferUV;
	}
	else if (KernelConfig.SampleSet == SAMPLE_SET_HEXAWEB)
	{
		// Impossible to compute from BufferUV because of the random offset certainly needed when using this.
		return KernelConfig.RefBufferUV;
	}
	else if (KernelConfig.SampleSet == SAMPLE_SET_STACKOWIAK_4_SETS)
	{
		uint SampleTrackId = KernelConfig.SampleTrackId;

		// Matches first line of kStackowiakSampleSet0.
		// TODO(Denoiser): could be optimised further by just setting sign bit on 0.5.
		float2 SampleOffset = float2(
			SampleTrackId & 0x1 ? 0.5 : -0.5,
			SampleTrackId & 0x2 ?
0.5 : -0.5);

		return KernelConfig.BufferUV + SampleOffset * KernelConfig.BufferSizeAndInvSize.zw;
	}
	return KernelConfig.BufferUV;
}

/** Uncompress the reference scene metadata to keep a low VGPR pressure. */
ISOLATE
FSSDSampleSceneInfos UncompressRefSceneMetadata(FSSDKernelConfig KernelConfig)
{
	// Find out the buffer UV of the reference pixel.
	float2 RefBufferUV = ComputeRefBufferUV(KernelConfig);

	float2 ScreenPos;
	if (KernelConfig.bPreviousFrameMetadata) // TODO(Denoiser): should be bPreviousFrameRefMetadata instead?
	{
		ScreenPos = RefBufferUV * PrevSceneBufferUVToScreenPosition.xy + PrevSceneBufferUVToScreenPosition.zw;
	}
	else
	{
		ScreenPos = DenoiserBufferUVToScreenPosition(RefBufferUV);
	}

	// Uncompress the reference scene metadata to keep a low VGPR pressure.
	return UncompressSampleSceneInfo(
		KernelConfig.RefSceneMetadataLayout,
		KernelConfig.bPreviousFrameRefMetadata,
		ScreenPos,
		KernelConfig.CompressedRefSceneMetadata);
}

/** Uncompress the scene metadata of a sample. */
FSSDSampleSceneInfos UncompressSampleSceneMetadata(
	FSSDKernelConfig KernelConfig,
	float2 SampleBufferUV,
	FSSDCompressedSceneInfos CompressedSampleSceneMetadata)
{
	float2 ScreenPos;
	if (KernelConfig.bPreviousFrameMetadata)
	{
		ScreenPos = SampleBufferUV * PrevSceneBufferUVToScreenPosition.xy + PrevSceneBufferUVToScreenPosition.zw;
	}
	else
	{
		ScreenPos = DenoiserBufferUVToScreenPosition(SampleBufferUV);
	}

	return UncompressSampleSceneInfo(
		CONFIG_METADATA_BUFFER_LAYOUT,
		KernelConfig.bPreviousFrameMetadata,
		ScreenPos,
		CompressedSampleSceneMetadata);
}

/** Computes the vector from a neighbor sample towards the reference pixel, using the
 *  strategy selected by KernelConfig.NeighborToRefComputation.
 */
float3 ComputeVectorFromNeighborToRef(
	FSSDKernelConfig KernelConfig,
	FSSDSampleSceneInfos RefSceneMetadata,
	FSSDSampleSceneInfos NeighborSceneMetadata)
{
	float RefWorldDepth = GetWorldDepth(RefSceneMetadata);
	float NeighborWorldDepth = GetWorldDepth(NeighborSceneMetadata);

	if (KernelConfig.NeighborToRefComputation == NEIGHBOR_TO_REF_LOWEST_VGPR_PRESSURE)
	{
		// Recompute the screen position of the reference, from the most minimal VGPR footprint.
		float2 RefScreenPos = RefSceneMetadata.ScreenPosition;
		float3 RefClipPosition = float3(GetScreenPositionForProjectionType(RefScreenPos, RefWorldDepth), RefWorldDepth);

		float2 NeighborScreenPos = NeighborSceneMetadata.ScreenPosition;
		float3 NeighborClipPosition = float3(GetScreenPositionForProjectionType(NeighborScreenPos, NeighborWorldDepth), NeighborWorldDepth);

#if CONFIG_USE_VIEW_SPACE
		float3 NeighborToRefVector = mul(float4(RefClipPosition - NeighborClipPosition, 0), GetScreenToViewDistanceMatrix()).xyz;
#else
		float3 NeighborToRefVector = mul(float4(RefClipPosition - NeighborClipPosition, 0), View.ScreenToTranslatedWorld).xyz;
#endif

		return NeighborToRefVector;
	}
	else // if (KernelConfig.NeighborToRefComputation == NEIGHBOR_TO_REF_CACHE_WORLD_POSITION)
	{
		float3 NeighborToRefWorldVector = GetTranslatedWorldPosition(RefSceneMetadata) - GetTranslatedWorldPosition(NeighborSceneMetadata);
		// TODO(Denoiser): GetViewPosition(RefSceneMetadata)

#if CONFIG_USE_VIEW_SPACE
		return mul(float4(NeighborToRefWorldVector, 0), View.TranslatedWorldToView).xyz;
#endif
		return NeighborToRefWorldVector;
	}
}


//------------------------------------------------------- SHARED SAMPLING

/** Transforms a multiplexed signal sample into the accumulator's color space (and optionally
 *  computes its color spherical harmonic) before accumulation.
 */
FSSDSignalSample TransformSignalSampleForAccumulation(
	FSSDKernelConfig KernelConfig,
	uint MultiplexId,
	FSSDSampleSceneInfos SampleSceneMetadata,
	FSSDSignalSample Sample,
	uint2 SamplePixelCoord)
{
	// Transform the color space.
#if (!FORCE_IDENTICAL_COLOR_SPACE)
	// TODO(Denoiser): could pass down information that this sample may be normalized.
	Sample = TransformSignal(
		Sample,
		/* SrcBasis = */ KernelConfig.BufferColorSpace[MultiplexId],
		/* DestBasis = */ KernelConfig.AccumulatorColorSpace[MultiplexId]);
#endif

	// Compute the spherical harmonic of the sample.
#if COMPILE_SIGNAL_COLOR_SH && COMPILE_SIGNAL_COLOR
	if (KernelConfig.bComputeSampleColorSH)
	{
		Sample.ColorSH = ComputeSampleColorSH(SampleSceneMetadata, Sample, SamplePixelCoord);
	}
#endif

	return Sample;
}

/** Compute at compile time the index of the signal in the batch, from the index of the multiplexed signal. */
uint ComputeSignalBatchIdFromSignalMultiplexId(FSSDKernelConfig KernelConfig, const uint SignalMultiplexId)
{
	return SignalMultiplexId / KernelConfig.MultiplexedSignalsPerSignalDomain;
}

/** Returns whether this sample is outside the viewport. */
bool IsOutsideViewport(FSSDKernelConfig KernelConfig, float2 SampleBufferUV)
{
	return any(or(SampleBufferUV < KernelConfig.BufferBilinearUVMinMax.xy, SampleBufferUV > KernelConfig.BufferBilinearUVMinMax.zw));
}

/** Sample multiplexed samples and their metadata for kernel use. */
void SampleMultiplexedSignals(
	FSSDKernelConfig KernelConfig,
	FSSDTexture2D SignalBuffer0,
	FSSDTexture2D SignalBuffer1,
	FSSDTexture2D SignalBuffer2,
	FSSDTexture2D SignalBuffer3,
	float2 SampleBufferUV,
	out FSSDCompressedSceneInfos OutCompressedSampleSceneMetadata,
	out FSSDCompressedMultiplexedSample OutCompressedMultiplexedSamples)
{
	uint2 PixelCoord = BufferUVToBufferPixelCoord(SampleBufferUV);

	// Fetches the compressed scene metadata of the sample.
	OutCompressedSampleSceneMetadata = SampleCompressedSceneMetadata(
		KernelConfig.bPreviousFrameMetadata,
		SampleBufferUV, PixelCoord);

	// Fetches the signals sample.
	OutCompressedMultiplexedSamples = SampleCompressedMultiplexedSignals(
		SignalBuffer0,
		SignalBuffer1,
		SignalBuffer2,
		SignalBuffer3,
		GlobalPointClampedSampler,
		SampleBufferUV, PixelCoord);
} // SampleMultiplexedSignals()

/** Uncompress the multiplexed signal for accumulation, optionally zeroing out signals whose
 *  per-signal UV min/max rejects the sample.
 */
void UncompressMultiplexedSignals(
	FSSDKernelConfig KernelConfig,
	float2 SampleBufferUV,
	FSSDCompressedMultiplexedSample CompressedMultiplexedSamples,
	out FSSDSignalArray MultiplexedSamples,
	out FSSDSignalFrequencyArray MultiplexedFrequencies)
{
	// TODO(Denoiser): offer multiplier to apply to each signal during Decode, to save mul VALU.
	DecodeMultiplexedSignals(
		KernelConfig.BufferLayout,
		/* MultiplexedSampleId = */ 0,
		KernelConfig.bNormalizeSample,
		CompressedMultiplexedSamples,
		/* out */ MultiplexedSamples,
		/* out */ MultiplexedFrequencies);

	if (KernelConfig.bClampUVPerMultiplexedSignal)
	{
		UNROLL_N(SIGNAL_ARRAY_SIZE)
		for (uint SignalMultiplexId = 0; SignalMultiplexId < SIGNAL_ARRAY_SIZE; SignalMultiplexId++)
		{
			// The sample is invalid for this signal if its UV lands outside the signal's own UV bounds.
			bool bInvalidSample = any(SampleBufferUV != clamp(
				SampleBufferUV,
				KernelConfig.PerSignalUVMinMax[SignalMultiplexId].xy,
				KernelConfig.PerSignalUVMinMax[SignalMultiplexId].zw));

			if (bInvalidSample)
			{
				MultiplexedSamples.Array[SignalMultiplexId] = CreateSignalSampleFromScalarValue(0.0);
			}
		} // for (uint SignalMultiplexId = 0; SignalMultiplexId < SIGNAL_ARRAY_SIZE; SignalMultiplexId++)
	}
}

/** Accumulate multiplexed samples and their metadata to an accumulator. */
void AccumulateSampledMultiplexedSignals(
	FSSDKernelConfig KernelConfig,
	inout FSSDSignalAccumulatorArray UncompressedAccumulators,
	inout FSSDCompressedSignalAccumulatorArray CompressedAccumulators,
	FSSDSampleSceneInfos RefSceneMetadata,
	float2 SampleBufferUV,
	FSSDSampleSceneInfos SampleSceneMetadata,
	FSSDSignalArray MultiplexedSamples,
	FSSDSignalFrequencyArray MultiplexedFrequencies,
	float KernelSampleWeight,
	const bool bForceSample,
	bool bIsOutsideFrustum)
{
	// Compute the bluring radius of the output pixel itself.
	float RefPixelWorldBluringRadius = ComputeWorldBluringRadiusCausedByPixelSize(RefSceneMetadata);

#if CONFIG_ACCUMULATOR_VGPR_COMPRESSION != ACCUMULATOR_COMPRESSION_DISABLED
	FSSDSignalAccumulatorArray Accumulators = UncompressAccumulatorArray(CompressedAccumulators, CONFIG_ACCUMULATOR_VGPR_COMPRESSION);
#endif

	// Compute the vector from neighbor to reference in the most optimal way.
	float3 NeighborToRefVector = ComputeVectorFromNeighborToRef(
		KernelConfig,
		RefSceneMetadata,
		SampleSceneMetadata);

#if DEBUG_OUTPUT && 0
	if (KernelConfig.DebugEventCounter)
	{
		float4 A = float4(
			RefSceneMetadata.WorldDepth,
			SampleSceneMetadata.WorldDepth,
			length(NeighborToRefVector) / RefPixelWorldBluringRadius,
			KernelConfig.bPreviousFrameMetadata);
		float4 B = float4(
			DenoiserBufferUVToScreenPosition(SampleBufferUV) * 0.5 + 0.5,
			0, 0);
		float4 C = float4(
			100 * abs(RefSceneMetadata.WorldDepth - SampleSceneMetadata.WorldDepth),
			0, 0, 0);
		float4 D = float4(
			length(RefSceneMetadata.TranslatedWorldPosition - SampleSceneMetadata.TranslatedWorldPosition),
			abs(RefSceneMetadata.WorldDepth - SampleSceneMetadata.WorldDepth),
			length(RefSceneMetadata.ScreenPosition - SampleSceneMetadata.ScreenPosition),
			0);

		DebugOutput[KernelConfig.DebugPixelPosition] = A;
		KernelConfig.DebugEventCounter = 0;
	}
#endif

	UNROLL_N(SIGNAL_ARRAY_SIZE)
	for (uint SignalMultiplexId = 0; SignalMultiplexId < SIGNAL_ARRAY_SIZE; SignalMultiplexId++)
	{
		// Compute at compile time the id of the signal being processed.
		const uint BatchedSignalId = ComputeSignalBatchIdFromSignalMultiplexId(KernelConfig, SignalMultiplexId);

		// Domain knowledge of the signal.
		FSSDSignalDomainKnowledge DomainKnowledge = GetSignalDomainKnowledge(BatchedSignalId);

		// TODO(Denoiser): direction of the ray should be cached by ingest or output by RGS, otherwise ends up with VGPR pressure because of SampleBufferUV.
		uint2 NeighborPixelCoord = floor(SampleBufferUV * KernelConfig.BufferSizeAndInvSize.xy);

		// Fetch and pre process the sample for accumulation.
		FSSDSignalSample Sample = MultiplexedSamples.Array[SignalMultiplexId];
		Sample = TransformSignalSampleForAccumulation(KernelConfig, SignalMultiplexId, SampleSceneMetadata, Sample, NeighborPixelCoord);

		// Fetch sample's frequency for accumulation.
		FSSDSignalFrequency SampleFrequency = MultiplexedFrequencies.Array[SignalMultiplexId];

		// Compute the bluring radius of the pixel itself.
		float SamplePixelWorldBluringRadius = ComputeWorldBluringRadiusCausedByPixelSize(SampleSceneMetadata);

		// Compute the bluring radius of the signal from ray hit distance and signal domain knowledge.
		float SignalConvolutionBluringRadius = GetSignalWorldBluringRadius(SampleFrequency, SampleSceneMetadata, DomainKnowledge);

		// But the signal's bluring radius might already be pre computed.
		if (KernelConfig.BilateralDistanceComputation == SIGNAL_WORLD_FREQUENCY_PRECOMPUTED_BLURING_RADIUS)
		{
			SignalConvolutionBluringRadius = SampleFrequency.WorldBluringRadius;
		}

		// Compute the final world distance to use for bilateral rejection,
		// according to the compile-time selected SIGNAL_WORLD_FREQUENCY_* strategy.
		float FinalWorldBluringDistance = -1;
		if (KernelConfig.BilateralDistanceComputation == SIGNAL_WORLD_FREQUENCY_REF_METADATA_ONLY)
		{
			FinalWorldBluringDistance = AmendWorldBluringRadiusCausedByPixelSize(
				RefPixelWorldBluringRadius);
		}
		else if (KernelConfig.BilateralDistanceComputation == SIGNAL_WORLD_FREQUENCY_SAMPLE_METADATA_ONLY)
		{
			FinalWorldBluringDistance = AmendWorldBluringRadiusCausedByPixelSize(
				SamplePixelWorldBluringRadius);
		}
		else if (KernelConfig.BilateralDistanceComputation == SIGNAL_WORLD_FREQUENCY_MIN_METADATA)
		{
			FinalWorldBluringDistance = AmendWorldBluringRadiusCausedByPixelSize(
				min(SamplePixelWorldBluringRadius, RefPixelWorldBluringRadius));
		}
		else if (
			KernelConfig.BilateralDistanceComputation == SIGNAL_WORLD_FREQUENCY_HIT_DISTANCE ||
			KernelConfig.BilateralDistanceComputation == SIGNAL_WORLD_FREQUENCY_PRECOMPUTED_BLURING_RADIUS)
		{
			FinalWorldBluringDistance = SignalConvolutionBluringRadius;
		}
		else if (KernelConfig.BilateralDistanceComputation == SIGNAL_WORLD_FREQUENCY_HARMONIC)
		{
			FinalWorldBluringDistance = AmendWorldBluringRadiusCausedByPixelSize(
				RefPixelWorldBluringRadius) * KernelConfig.HarmonicPeriode;
		}

		FinalWorldBluringDistance *= KernelConfig.WorldBluringDistanceMultiplier;

		if (KernelConfig.bMaxWithRefBilateralDistance)
		{
			// NOTE(review): despite the flag's name, this takes the min() with the reference
			// bilateral distance — confirm whether min or max is the intended behavior.
			FinalWorldBluringDistance = min(FinalWorldBluringDistance, KernelConfig.RefBilateralDistance[SignalMultiplexId]);
		}

		// Compute the weight to be applied to do bilateral rejection.
		float BilateralWeight = ComputeBilateralWeight(
			KernelConfig.BilateralSettings[SignalMultiplexId],
			FinalWorldBluringDistance,
			DomainKnowledge,
			RefSceneMetadata,
			SampleSceneMetadata,
			NeighborToRefVector);

		FSSDSampleAccumulationInfos SampleInfos;
		SampleInfos.Sample = Sample;
		SampleInfos.Frequency = SampleFrequency;
		SampleInfos.FinalWeight = KernelSampleWeight * BilateralWeight;
		SampleInfos.InvFrequency = SignalConvolutionBluringRadius;

		if (bForceSample || KernelConfig.bForceAllAccumulation)
		{
			SampleInfos.FinalWeight = 1;
		}

		// TODO(Denoiser): bIsOutsideFrustum could affect number of samples for DRB.
		FLATTEN
		if (SampleInfos.Sample.SampleCount != 0 && !bIsOutsideFrustum)
		{
#if CONFIG_ACCUMULATOR_VGPR_COMPRESSION == ACCUMULATOR_COMPRESSION_DISABLED
			{
				AccumulateSample(
					/* inout */ UncompressedAccumulators.Array[SignalMultiplexId],
					SampleInfos);
			}
#else
			{
				AccumulateSample(
					/* inout */ Accumulators.Array[SignalMultiplexId],
					SampleInfos);
			}
#endif
		}
	} // for (uint SignalMultiplexId = 0; SignalMultiplexId < SIGNAL_ARRAY_SIZE; SignalMultiplexId++)

#if CONFIG_ACCUMULATOR_VGPR_COMPRESSION != ACCUMULATOR_COMPRESSION_DISABLED
	CompressedAccumulators = CompressAccumulatorArray(Accumulators, CONFIG_ACCUMULATOR_VGPR_COMPRESSION);
#endif
} // AccumulateSampledMultiplexedSignals().

/** Sample and accumulate to accumulator array.
 *
 * Caution: you probably want to explicitly do this manually to help the shader compiler to do latency hiding.
 */
void SampleAndAccumulateMultiplexedSignals(
	FSSDKernelConfig KernelConfig,
	FSSDTexture2D SignalBuffer0,
	FSSDTexture2D SignalBuffer1,
	FSSDTexture2D SignalBuffer2,
	FSSDTexture2D SignalBuffer3,
	inout FSSDSignalAccumulatorArray UncompressedAccumulators,
	inout FSSDCompressedSignalAccumulatorArray CompressedAccumulators,
	float2 SampleBufferUV,
	float KernelSampleWeight,
	const bool bForceSample)
{
	// Stores in SGPR whether this sample is outside the viewport, to avoid VGPR pressure to keep SampleBufferUV after texture fetches.
	bool bIsOutsideFrustum = IsOutsideViewport(KernelConfig, SampleBufferUV);

	FSSDCompressedSceneInfos CompressedSampleSceneMetadata;
	FSSDCompressedMultiplexedSample CompressedMultiplexedSamples;

	// Force all the signal texture fetch and metadata to overlap to minimize serial texture fetches.
	ISOLATE
	{
		SampleMultiplexedSignals(
			KernelConfig,
			SignalBuffer0,
			SignalBuffer1,
			SignalBuffer2,
			SignalBuffer3,
			SampleBufferUV,
			/* out */ CompressedSampleSceneMetadata,
			/* out */ CompressedMultiplexedSamples);
	}

	// Accumulate the samples, giving full freedom for shader compiler scheduler to put instructions in most optimal way.
	{
		FSSDSignalArray MultiplexedSamples;
		FSSDSignalFrequencyArray MultiplexedFrequencies;
		UncompressMultiplexedSignals(
			KernelConfig,
			SampleBufferUV,
			CompressedMultiplexedSamples,
			/* out */ MultiplexedSamples,
			/* out */ MultiplexedFrequencies);

		FSSDSampleSceneInfos RefSceneMetadata = UncompressRefSceneMetadata(KernelConfig);
		FSSDSampleSceneInfos SampleSceneMetadata = UncompressSampleSceneMetadata(
			KernelConfig,
			SampleBufferUV,
			CompressedSampleSceneMetadata);

		AccumulateSampledMultiplexedSignals(
			KernelConfig,
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators,
			RefSceneMetadata,
			SampleBufferUV,
			SampleSceneMetadata,
			MultiplexedSamples,
			MultiplexedFrequencies,
			KernelSampleWeight,
			bForceSample,
			bIsOutsideFrustum);
	}
} // SampleAndAccumulateMultiplexedSignals()

/** Samples and accumulates a pair of taps at once, overlapping their texture fetches
 *  to hide latency before accumulating both.
 */
void SampleAndAccumulateMultiplexedSignalsPair(
	FSSDKernelConfig KernelConfig,
	FSSDTexture2D SignalBuffer0,
	FSSDTexture2D SignalBuffer1,
	FSSDTexture2D SignalBuffer2,
	FSSDTexture2D SignalBuffer3,
	inout FSSDSignalAccumulatorArray UncompressedAccumulators,
	inout FSSDCompressedSignalAccumulatorArray CompressedAccumulators,
	float2 SampleBufferUV[2],
	float KernelSampleWeight)
{
	FSSDCompressedSceneInfos CompressedSampleSceneMetadata[2];
	FSSDCompressedMultiplexedSample CompressedMultiplexedSamples[2];
	bool bIsOutsideFrustum[2];

	// Force all the signal texture fetch and metadata to overlap to minimize serial texture fetches.
	ISOLATE
	{
		UNROLL_N(2)
		for (uint PairFetchId = 0; PairFetchId < 2; PairFetchId++)
		{
			// Stores in SGPR whether this sample is outside the viewport, to avoid VGPR pressure to
			// avoid keeping SampleBufferUV after texture fetches.
			bIsOutsideFrustum[PairFetchId] = IsOutsideViewport(KernelConfig, SampleBufferUV[PairFetchId]);

			SampleMultiplexedSignals(
				KernelConfig,
				SignalBuffer0,
				SignalBuffer1,
				SignalBuffer2,
				SignalBuffer3,
				SampleBufferUV[PairFetchId],
				/* out */ CompressedSampleSceneMetadata[PairFetchId],
				/* out */ CompressedMultiplexedSamples[PairFetchId]);
		}
	}

	// Accumulate the samples, giving full freedom for shader compiler scheduler to put instructions in most optimal way.
	{
		// Uncompress the multiplexed signal.
		FSSDSignalArray MultiplexedSamples[2];
		FSSDSignalFrequencyArray MultiplexedFrequencies[2];
		UNROLL_N(2)
		for (uint PairUncompressId = 0; PairUncompressId < 2; PairUncompressId++)
		{
			UncompressMultiplexedSignals(
				KernelConfig,
				SampleBufferUV[PairUncompressId],
				CompressedMultiplexedSamples[PairUncompressId],
				/* out */ MultiplexedSamples[PairUncompressId],
				/* out */ MultiplexedFrequencies[PairUncompressId]);
		}

		// Take the min inverse frequency per signal if desired.
if (KernelConfig.bMinSamplePairInvFrequency) { UNROLL_N(SIGNAL_ARRAY_SIZE) for (uint SignalMultiplexId = 0; SignalMultiplexId < SIGNAL_ARRAY_SIZE; SignalMultiplexId++) { float MinInvFrequency = min( MultiplexedFrequencies[0].Array[SignalMultiplexId].WorldBluringRadius, MultiplexedFrequencies[1].Array[SignalMultiplexId].WorldBluringRadius); FLATTEN if (MinInvFrequency > 0) { MultiplexedFrequencies[0].Array[SignalMultiplexId].WorldBluringRadius = MinInvFrequency; MultiplexedFrequencies[1].Array[SignalMultiplexId].WorldBluringRadius = MinInvFrequency; } } } FSSDSampleSceneInfos RefSceneMetadata = UncompressRefSceneMetadata(KernelConfig); UNROLL_N(2) for (uint PairAccumulateId = 0; PairAccumulateId < 2; PairAccumulateId++) { FSSDSampleSceneInfos SampleSceneMetadata = UncompressSampleSceneMetadata( KernelConfig, SampleBufferUV[PairAccumulateId], CompressedSampleSceneMetadata[PairAccumulateId]); AccumulateSampledMultiplexedSignals( KernelConfig, /* inout */ UncompressedAccumulators, /* inout */ CompressedAccumulators, RefSceneMetadata, SampleBufferUV[PairAccumulateId], SampleSceneMetadata, MultiplexedSamples[PairAccumulateId], MultiplexedFrequencies[PairAccumulateId], KernelSampleWeight, /* bForceSample = */ false, bIsOutsideFrustum[PairAccumulateId]); } } } // SampleAndAccumulateMultiplexedSignalsPair() void StartAccumulatingCluster( FSSDKernelConfig KernelConfig, inout FSSDSignalAccumulatorArray UncompressedAccumulators, inout FSSDCompressedSignalAccumulatorArray CompressedAccumulators, FSSDSampleClusterInfo ClusterInfo) { FSSDSampleSceneInfos RefSceneMetadata = UncompressRefSceneMetadata(KernelConfig); #if CONFIG_ACCUMULATOR_VGPR_COMPRESSION != ACCUMULATOR_COMPRESSION_DISABLED FSSDSignalAccumulatorArray Accumulators = UncompressAccumulatorArray(CompressedAccumulators, CONFIG_ACCUMULATOR_VGPR_COMPRESSION); #endif UNROLL_N(SIGNAL_ARRAY_SIZE) for (uint SignalMultiplexId = 0; SignalMultiplexId < SIGNAL_ARRAY_SIZE; SignalMultiplexId++) { #if 
CONFIG_ACCUMULATOR_VGPR_COMPRESSION == ACCUMULATOR_COMPRESSION_DISABLED { StartAccumulatingCluster( RefSceneMetadata, /* inout */ UncompressedAccumulators.Array[SignalMultiplexId], ClusterInfo); } #else { StartAccumulatingCluster( RefSceneMetadata, /* inout */ Accumulators.Array[SignalMultiplexId], ClusterInfo); } #endif } #if CONFIG_ACCUMULATOR_VGPR_COMPRESSION != ACCUMULATOR_COMPRESSION_DISABLED CompressedAccumulators = CompressAccumulatorArray(Accumulators, CONFIG_ACCUMULATOR_VGPR_COMPRESSION); #endif } void DijestAccumulatedClusterSamples( inout FSSDSignalAccumulatorArray UncompressedAccumulators, inout FSSDCompressedSignalAccumulatorArray CompressedAccumulators, uint RingId, uint SampleCount) { #if CONFIG_ACCUMULATOR_VGPR_COMPRESSION != ACCUMULATOR_COMPRESSION_DISABLED FSSDSignalAccumulatorArray Accumulators = UncompressAccumulatorArray(CompressedAccumulators, CONFIG_ACCUMULATOR_VGPR_COMPRESSION); #endif UNROLL_N(SIGNAL_ARRAY_SIZE) for (uint SignalMultiplexId = 0; SignalMultiplexId < SIGNAL_ARRAY_SIZE; SignalMultiplexId++) { #if CONFIG_ACCUMULATOR_VGPR_COMPRESSION == ACCUMULATOR_COMPRESSION_DISABLED { DijestAccumulatedClusterSamples( /* inout */ UncompressedAccumulators.Array[SignalMultiplexId], RingId, SampleCount); } #else { DijestAccumulatedClusterSamples( /* inout */ Accumulators.Array[SignalMultiplexId], RingId, SampleCount); } #endif } #if CONFIG_ACCUMULATOR_VGPR_COMPRESSION != ACCUMULATOR_COMPRESSION_DISABLED CompressedAccumulators = CompressAccumulatorArray(Accumulators, CONFIG_ACCUMULATOR_VGPR_COMPRESSION); #endif } void SampleAndAccumulateCenterSampleAsItsOwnCluster( FSSDKernelConfig KernelConfig, FSSDTexture2D SignalBuffer0, FSSDTexture2D SignalBuffer1, FSSDTexture2D SignalBuffer2, FSSDTexture2D SignalBuffer3, inout FSSDSignalAccumulatorArray UncompressedAccumulators, inout FSSDCompressedSignalAccumulatorArray CompressedAccumulators) { const uint RingId = 0; FSSDSampleClusterInfo ClusterInfo; ClusterInfo.OutterBoundaryRadius = (RingId + 1) * 
KernelConfig.KernelSpreadFactor; StartAccumulatingCluster( KernelConfig, /* inout */ UncompressedAccumulators, /* inout */ CompressedAccumulators, ClusterInfo); SampleAndAccumulateMultiplexedSignals( KernelConfig, SignalBuffer0, SignalBuffer1, SignalBuffer2, SignalBuffer3, /* inout */ UncompressedAccumulators, /* inout */ CompressedAccumulators, KernelConfig.BufferUV, /* KernelSampleWeight = */ 1.0, /* bForceSample = */ KernelConfig.bForceKernelCenterAccumulation); DijestAccumulatedClusterSamples( /* inout */ UncompressedAccumulators, /* inout */ CompressedAccumulators, RingId, /* SampleCount = */ 1); } //------------------------------------------------------- EASY CONVOLUTIONS #if COMPILE_BOX_KERNEL void AccumulateBilinear2x2Kernel( FSSDKernelConfig KernelConfig, FSSDTexture2D SignalBuffer0, FSSDTexture2D SignalBuffer1, FSSDTexture2D SignalBuffer2, FSSDTexture2D SignalBuffer3, inout FSSDSignalAccumulatorArray UncompressedAccumulators, inout FSSDCompressedSignalAccumulatorArray CompressedAccumulators) { const float MipLevelPow2 = 1; FBilinearSampleInfos BilinearInfos = GetBilinearSampleLevelInfosEx( KernelConfig.BufferUV, KernelConfig.BufferSizeAndInvSize.xy, KernelConfig.BufferSizeAndInvSize.zw, MipLevelPow2, rcp(MipLevelPow2)); bool bUseStocasticBilinear = false; if (KernelConfig.SampleSet == SAMPLE_SET_2X2_STOCASTIC) { bUseStocasticBilinear = true; } else if (KernelConfig.SampleSet == SAMPLE_SET_2X2_ADAPTIVE) { bUseStocasticBilinear = !KernelConfig.bIsDynamicPixel; } float2 SampleBufferUVArray[4]; float BilinearWeightArray[4]; FLATTEN if (bUseStocasticBilinear) { float2 SampleOffset = 0; float WeigthAccumulation = 0.0; UNROLL_N(4) for (uint i = 0; i < 4; i++) { FLATTEN if (KernelConfig.Randoms[0] > WeigthAccumulation) SampleOffset = BilinearSamplingOffsets2x2[i]; WeigthAccumulation += GetSampleWeight(BilinearInfos, i); BilinearWeightArray[i] = 0.0; SampleBufferUVArray[i] = 0.0; } // TODO(Denoiser): could be more ALU efficient for this. 
// TODO(Denoiser): -0.5 full res pixel to ensure always select the mip, regardless of mantissa precision?
		SampleBufferUVArray[0] = (BilinearInfos.TopLeftPixelCoord + (SampleOffset + 0.5)) * MipLevelPow2 * KernelConfig.BufferSizeAndInvSize.zw;
		// Stochastic selection already accounts for the bilinear weight, so the chosen texel gets full weight.
		BilinearWeightArray[0] = 1.0;
	}
	else
	{
		// Deterministic path: accumulate all 4 texels with their bilinear weights.
		UNROLL_N(4)
		for (uint i = 0; i < 4; i++)
		{
			float2 SampleOffset = BilinearSamplingOffsets2x2[i];

			// TODO(Denoiser): could be more ALU efficient for this.
			// TODO(Denoiser): -0.5 full res pixel to ensure always select the mip, regardless of mantissa precision?
			SampleBufferUVArray[i] = (BilinearInfos.TopLeftPixelCoord + (SampleOffset + 0.5)) * MipLevelPow2 * KernelConfig.BufferSizeAndInvSize.zw;
			BilinearWeightArray[i] = GetSampleWeight(BilinearInfos, i);
		}
	}

	// First texel is always accumulated (both paths write slot 0).
	{
		SampleAndAccumulateMultiplexedSignals(
			KernelConfig,
			SignalBuffer0,
			SignalBuffer1,
			SignalBuffer2,
			SignalBuffer3,
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators,
			SampleBufferUVArray[0],
			BilinearWeightArray[0],
			/* bForceSample = */ false);
	}

	// The remaining 3 texels only exist on the deterministic path.
	BRANCH
	if (!bUseStocasticBilinear)
	{
		UNROLL_N(3)
		for (uint i = 1; i < 4; i++)
		{
			SampleAndAccumulateMultiplexedSignals(
				KernelConfig,
				SignalBuffer0,
				SignalBuffer1,
				SignalBuffer2,
				SignalBuffer3,
				/* inout */ UncompressedAccumulators,
				/* inout */ CompressedAccumulators,
				SampleBufferUVArray[i],
				BilinearWeightArray[i],
				/* bForceSample = */ false);
		}
	}
} // AccumulateBilinear2x2Kernel()

/**
 * Accumulates a square (2*R+1)^2 kernel around KernelConfig.BufferUV.
 * Radius depends on the sample set: 1 by default, 2 for the 5x5 wavelet, or
 * KernelConfig.BoxKernelRadius for SAMPLE_SET_NXN.
 */
void AccumulateSquareKernel(
	FSSDKernelConfig KernelConfig,
	FSSDTexture2D SignalBuffer0,
	FSSDTexture2D SignalBuffer1,
	FSSDTexture2D SignalBuffer2,
	FSSDTexture2D SignalBuffer3,
	inout FSSDSignalAccumulatorArray UncompressedAccumulators,
	inout FSSDCompressedSignalAccumulatorArray CompressedAccumulators)
{
	int KernelRadius = 1;
	if (KernelConfig.SampleSet == SAMPLE_SET_5X5_WAVELET)
	{
		KernelRadius = 2;
	}
	else if (KernelConfig.SampleSet == SAMPLE_SET_NXN)
	{
		KernelRadius = KernelConfig.BoxKernelRadius;
	}

	// Two identical loop bodies: one fully unrolled (compile-time radius), one dynamic.
	if (KernelConfig.bUnroll)
	{
		UNROLL
		for (int x = -KernelRadius; x <= KernelRadius; x++)
		{
			UNROLL
			for (int
y = -KernelRadius; y <= KernelRadius; y++)
			{
				const bool bIsKernelCenterSample = x == 0 && y == 0;

				// Optionally skip the kernel's own center texel.
				if (bIsKernelCenterSample && !KernelConfig.bSampleKernelCenter)
					continue;

				float2 SampleOffset = float2(x, y);
				if (KernelConfig.SampleSet == SAMPLE_SET_3X3_SOBEK2018)
				{
					// Shear the 3x3 footprint with a constant 2x2 matrix to decorrelate neighbor reuse.
					SampleOffset = mul(float2x2(float2(2, -1), float2(1, 2)), SampleOffset);
				}

				float2 SampleBufferUV = KernelConfig.BufferUV + (SampleOffset * KernelConfig.KernelSpreadFactor) * KernelConfig.BufferSizeAndInvSize.zw;

				float KernelWeight = 1;
				if (KernelConfig.SampleSet == SAMPLE_SET_5X5_WAVELET)
				{
					// Separable wavelet weights, normalized so the corner weight is 1.
					KernelWeight = kWaveletFilterWeights5x5[abs(x)] * kWaveletFilterWeights5x5[abs(y)] * rcp(kWaveletFilterWeights5x5[0] * kWaveletFilterWeights5x5[0]);
				}

				SampleAndAccumulateMultiplexedSignals(
					KernelConfig,
					SignalBuffer0,
					SignalBuffer1,
					SignalBuffer2,
					SignalBuffer3,
					/* inout */ UncompressedAccumulators,
					/* inout */ CompressedAccumulators,
					SampleBufferUV,
					KernelWeight,
					/* bForceSample = */ bIsKernelCenterSample && KernelConfig.bForceKernelCenterAccumulation);
			}
		}
	}
	else
	{
		// TODO(Denoiser): latency hiding of this is terrible.
LOOP
		for (int x = -KernelRadius; x <= KernelRadius; x++)
		{
			LOOP
			for (int y = -KernelRadius; y <= KernelRadius; y++)
			{
				const bool bIsKernelCenterSample = x == 0 && y == 0;

				// Optionally skip the kernel's own center texel.
				if (bIsKernelCenterSample && !KernelConfig.bSampleKernelCenter)
					continue;

				float2 SampleOffset = float2(x, y);
				if (KernelConfig.SampleSet == SAMPLE_SET_3X3_SOBEK2018)
				{
					// Shear the 3x3 footprint with a constant 2x2 matrix to decorrelate neighbor reuse.
					SampleOffset = mul(float2x2(float2(2, -1), float2(1, 2)), SampleOffset);
				}

				float2 SampleBufferUV = KernelConfig.BufferUV + (SampleOffset * KernelConfig.KernelSpreadFactor) * KernelConfig.BufferSizeAndInvSize.zw;

				float KernelWeight = 1;
				if (KernelConfig.SampleSet == SAMPLE_SET_5X5_WAVELET)
				{
					// Separable wavelet weights, normalized so the corner weight is 1.
					KernelWeight = kWaveletFilterWeights5x5[abs(x)] * kWaveletFilterWeights5x5[abs(y)] * rcp(kWaveletFilterWeights5x5[0] * kWaveletFilterWeights5x5[0]);
				}

				SampleAndAccumulateMultiplexedSignals(
					KernelConfig,
					SignalBuffer0,
					SignalBuffer1,
					SignalBuffer2,
					SignalBuffer3,
					/* inout */ UncompressedAccumulators,
					/* inout */ CompressedAccumulators,
					SampleBufferUV,
					KernelWeight,
					/* bForceSample = */ bIsKernelCenterSample && KernelConfig.bForceKernelCenterAccumulation);
			}
		}
	}
} // AccumulateSquareKernel()

/**
 * Accumulates the already-fetched kernel-center sample of neighboring lanes by swapping it
 * across the wave, instead of re-fetching it from memory. Requires wave broadcast support;
 * compiles to an empty body otherwise.
 */
void BroadcastAccumulateSquare3x3KernelCenter(
	FSSDKernelConfig KernelConfig,
	inout FSSDSignalAccumulatorArray UncompressedAccumulators,
	inout FSSDCompressedSignalAccumulatorArray CompressedAccumulators,
	FSSDSampleSceneInfos RefSceneMetadata,
	float2 SampleBufferUV,
	FSSDSampleSceneInfos SampleSceneMetadata,
	FSSDSignalArray SampleMultiplexedSamples,
	FSSDSignalFrequencyArray SampleMultiplexedFrequencies)
#if CONFIG_ENABLE_WAVE_BROADCAST
{
	// NOTE(review): lane-group sizes 2 and 16 presumably map to horizontal / vertical neighbors
	// of the wave's pixel tiling — confirm against the dispatch's lane layout.
	const FWaveBroadcastSettings BroadcastSettingsX = InitWaveSwapWithinLaneGroup(/* LaneGroupSize = */ 2);
	const FWaveBroadcastSettings BroadcastSettingsY = InitWaveSwapWithinLaneGroup(/* LaneGroupSize = */ 16);

	// Broadcast X.
SampleBufferUV = WaveBroadcast(BroadcastSettingsX, SampleBufferUV);
	SampleSceneMetadata = WaveBroadcastSceneMetadata(BroadcastSettingsX, SampleSceneMetadata);
	SampleMultiplexedSamples = WaveBroadcastSignalArray(BroadcastSettingsX, SampleMultiplexedSamples);
	SampleMultiplexedFrequencies = WaveBroadcastSignalFrequenciesArray(BroadcastSettingsX, SampleMultiplexedFrequencies);

	// The X-swapped neighbor contributes to the 3x3 box and the "plus" pattern.
	if (KernelConfig.SampleSet == SAMPLE_SET_3X3 || KernelConfig.SampleSet == SAMPLE_SET_3X3_PLUS)
	{
		AccumulateSampledMultiplexedSignals(
			KernelConfig,
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators,
			RefSceneMetadata,
			SampleBufferUV,
			SampleSceneMetadata,
			SampleMultiplexedSamples,
			SampleMultiplexedFrequencies,
			/* KernelWeight = */ 1.0,
			/* bForceSample = */ false,
			/* bIsOutsideFrustum = */ false);
	}

	// Broadcast Y.
	SampleBufferUV = WaveBroadcast(BroadcastSettingsY, SampleBufferUV);
	SampleSceneMetadata = WaveBroadcastSceneMetadata(BroadcastSettingsY, SampleSceneMetadata);
	SampleMultiplexedSamples = WaveBroadcastSignalArray(BroadcastSettingsY, SampleMultiplexedSamples);
	SampleMultiplexedFrequencies = WaveBroadcastSignalFrequenciesArray(BroadcastSettingsY, SampleMultiplexedFrequencies);

	// After X then Y swaps, this is the diagonal neighbor: 3x3 box and "cross" pattern.
	if (KernelConfig.SampleSet == SAMPLE_SET_3X3 || KernelConfig.SampleSet == SAMPLE_SET_3X3_CROSS)
	{
		AccumulateSampledMultiplexedSignals(
			KernelConfig,
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators,
			RefSceneMetadata,
			SampleBufferUV,
			SampleSceneMetadata,
			SampleMultiplexedSamples,
			SampleMultiplexedFrequencies,
			/* KernelWeight = */ 1.0,
			/* bForceSample = */ false,
			/* bIsOutsideFrustum = */ false);
	}

	// Broadcast X Again.
SampleBufferUV = WaveBroadcast(BroadcastSettingsX, SampleBufferUV);
	SampleSceneMetadata = WaveBroadcastSceneMetadata(BroadcastSettingsX, SampleSceneMetadata);
	SampleMultiplexedSamples = WaveBroadcastSignalArray(BroadcastSettingsX, SampleMultiplexedSamples);
	SampleMultiplexedFrequencies = WaveBroadcastSignalFrequenciesArray(BroadcastSettingsX, SampleMultiplexedFrequencies);

	// Undoing the X swap after the Y swap yields the vertical neighbor: 3x3 box and "plus" pattern.
	if (KernelConfig.SampleSet == SAMPLE_SET_3X3 || KernelConfig.SampleSet == SAMPLE_SET_3X3_PLUS)
	{
		AccumulateSampledMultiplexedSignals(
			KernelConfig,
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators,
			RefSceneMetadata,
			SampleBufferUV,
			SampleSceneMetadata,
			SampleMultiplexedSamples,
			SampleMultiplexedFrequencies,
			/* KernelWeight = */ 1.0,
			/* bForceSample = */ false,
			/* bIsOutsideFrustum = */ false);
	}
} // BroadcastAccumulateSquare3x3KernelCenter()
#else
{
	// No wave broadcast support: callers fall back to per-texel fetches.
}
#endif

/**
 * Accumulates a 3x3 (or 3x3 plus / cross subset) kernel around KernelConfig.BufferUV.
 * With wave broadcast support, the center texel of neighboring lanes is reused via lane swaps
 * to save texture fetches; otherwise every texel is fetched explicitly in mirrored pairs.
 */
void AccumulateSquare3x3Kernel(
	FSSDKernelConfig KernelConfig,
	FSSDTexture2D SignalBuffer0,
	FSSDTexture2D SignalBuffer1,
	FSSDTexture2D SignalBuffer2,
	FSSDTexture2D SignalBuffer3,
	inout FSSDSignalAccumulatorArray UncompressedAccumulators,
	inout FSSDCompressedSignalAccumulatorArray CompressedAccumulators)
#if CONFIG_ENABLE_WAVE_BROADCAST
{
	if (KernelConfig.bSampleKernelCenter)
	{
		float2 SampleBufferUV = KernelConfig.BufferUV;

		// TODO(Denoiser):
		const bool bIsOutsideFrustum = false;

		FSSDCompressedSceneInfos CompressedSampleSceneMetadata;
		FSSDCompressedMultiplexedSample CompressedMultiplexedSamples;

		// Force all the signal texture fetch and metadata to overlap to minimize serial texture fetches.
ISOLATE
		{
			SampleMultiplexedSignals(
				KernelConfig,
				SignalBuffer0,
				SignalBuffer1,
				SignalBuffer2,
				SignalBuffer3,
				SampleBufferUV,
				/* out */ CompressedSampleSceneMetadata,
				/* out */ CompressedMultiplexedSamples);
		}

		FSSDSampleSceneInfos RefSceneMetadata = UncompressRefSceneMetadata(KernelConfig);

		// Decode the fetched signals and scene metadata for the center texel.
		FSSDSignalArray MultiplexedSamples;
		FSSDSignalFrequencyArray MultiplexedFrequencies;
		UncompressMultiplexedSignals(
			KernelConfig,
			SampleBufferUV,
			CompressedMultiplexedSamples,
			/* out */ MultiplexedSamples,
			/* out */ MultiplexedFrequencies);

		FSSDSampleSceneInfos SampleSceneMetadata = UncompressSampleSceneMetadata(
			KernelConfig,
			SampleBufferUV,
			CompressedSampleSceneMetadata);

		// Accumulate this lane's own center sample (always forced in).
		AccumulateSampledMultiplexedSignals(
			KernelConfig,
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators,
			RefSceneMetadata,
			SampleBufferUV,
			SampleSceneMetadata,
			MultiplexedSamples,
			MultiplexedFrequencies,
			/* KernelWeight = */ 1.0,
			/* bForceSample = */ true,
			bIsOutsideFrustum);

		// Reuse the neighboring lanes' center samples through wave swaps (no extra fetches).
		BroadcastAccumulateSquare3x3KernelCenter(
			KernelConfig,
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators,
			RefSceneMetadata,
			SampleBufferUV,
			SampleSceneMetadata,
			MultiplexedSamples,
			MultiplexedFrequencies);
	}

	// Store whether needs to flip offsets to have lowest VGPR pressure.
	uint2 OutputPixelPostion = BufferUVToBufferPixelCoord(KernelConfig.RefBufferUV);
	bool bFlipX = (OutputPixelPostion.x & 0x1) != 0;
	bool bFlipY = (OutputPixelPostion.y & 0x1) != 0;

	// The corner not covered by the broadcasts: only needed for box and cross patterns.
	if (KernelConfig.SampleSet == SAMPLE_SET_3X3 || KernelConfig.SampleSet == SAMPLE_SET_3X3_CROSS)
	{
		float2 SampleOffset = float2(bFlipX ? 1.0 : -1.0, bFlipY ?
1.0 : -1.0);
		float2 SampleBufferUV = KernelConfig.BufferUV + SampleOffset * KernelConfig.BufferSizeAndInvSize.zw;

		SampleAndAccumulateMultiplexedSignals(
			KernelConfig,
			SignalBuffer0,
			SignalBuffer1,
			SignalBuffer2,
			SignalBuffer3,
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators,
			SampleBufferUV,
			/* KernelWeight = */ 1.0,
			/* bForceSample = */ false);
	}

	// Remaining offsets, fetched in mirrored pairs: batch 0 = axis neighbors, batch 1 = diagonals.
	static const float2 SampleOffsetArray[4] = {
		float2(-1.0, 0.0),
		float2( 0.0, -1.0),
		float2(-1.0, 1.0),
		float2( 1.0, -1.0),
	};

	UNROLL
	for (
		uint BatchId = (KernelConfig.SampleSet == SAMPLE_SET_3X3_CROSS ? 1 : 0);
		BatchId < (KernelConfig.SampleSet == SAMPLE_SET_3X3_PLUS ? 1 : 2);
		BatchId++)
	ISOLATE
	{
		// Mirror the offsets per checkerboard parity to minimize VGPR pressure (see bFlipX/bFlipY above).
		float2 SampleOffset0 = select(bool2(bFlipX, bFlipY), -SampleOffsetArray[BatchId * 2 + 0], SampleOffsetArray[BatchId * 2 + 0]);
		float2 SampleOffset1 = select(bool2(bFlipX, bFlipY), -SampleOffsetArray[BatchId * 2 + 1], SampleOffsetArray[BatchId * 2 + 1]);

		float2 SampleBufferUV[2];
		SampleBufferUV[0] = KernelConfig.BufferUV + SampleOffset0 * KernelConfig.BufferSizeAndInvSize.zw;
		SampleBufferUV[1] = KernelConfig.BufferUV + SampleOffset1 * KernelConfig.BufferSizeAndInvSize.zw;

		SampleAndAccumulateMultiplexedSignalsPair(
			KernelConfig,
			SignalBuffer0,
			SignalBuffer1,
			SignalBuffer2,
			SignalBuffer3,
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators,
			SampleBufferUV,
			/* KernelWeight = */ 1.0);
	}
} // AccumulateSquare3x3Kernel()
#else // !CONFIG_ENABLE_WAVE_BROADCAST
{
	if (KernelConfig.bSampleKernelCenter)
	{
		SampleAndAccumulateCenterSampleAsItsOwnCluster(
			KernelConfig,
			SignalBuffer0,
			SignalBuffer1,
			SignalBuffer2,
			SignalBuffer3,
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators);
	}

	// Half of the 3x3 ring; the other half is reached by mirroring each offset around the center.
	static const float2 SampleOffsetArray[4] = {
		float2(1.0, 0.0),
		float2(1.0, 1.0),
		float2(0.0, 1.0),
		float2(-1.0, 1.0),
	};

	// Full 3x3 walks all 4 pairs; plus/cross variants step by 2 to take every other pair.
	UNROLL
	for (
		uint BatchId = (KernelConfig.SampleSet == SAMPLE_SET_3X3_CROSS ? 1 : 0);
		BatchId < 4;
		BatchId += (KernelConfig.SampleSet != SAMPLE_SET_3X3 ?
2 : 1))
	ISOLATE
	{
		float2 SampleOffset = SampleOffsetArray[BatchId];

		// Each batch fetches the offset and its mirror around the kernel center.
		float2 SampleBufferUV[2];
		SampleBufferUV[0] = KernelConfig.BufferUV + SampleOffset * KernelConfig.BufferSizeAndInvSize.zw;
		SampleBufferUV[1] = KernelConfig.BufferUV - SampleOffset * KernelConfig.BufferSizeAndInvSize.zw;

		SampleAndAccumulateMultiplexedSignalsPair(
			KernelConfig,
			SignalBuffer0,
			SignalBuffer1,
			SignalBuffer2,
			SignalBuffer3,
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators,
			SampleBufferUV,
			/* KernelWeight = */ 1.0);
	}
} // AccumulateSquare3x3Kernel()
#endif // !CONFIG_ENABLE_WAVE_BROADCAST

#endif // COMPILE_BOX_KERNEL


//------------------------------------------------------- STACKOWIAK 2018

#if COMPILE_STACKOWIAK_KERNEL

// Precomputed texel offsets (in pixels from the kernel center) of the Stackowiak kernel:
// 4 interleaved sample tracks of 56 samples each. A given pixel walks only one track,
// indexed as [kStackowiakSampleSetCount * SampleId + SampleTrackId].
static const float2 kStackowiakSampleSet0[56 * 4] = {
	float2(-0.5, -0.5), float2(+0.5, -0.5), float2(-0.5, +0.5), float2(+0.5, +0.5), float2(-1.5, +0.5), float2(-1.5, -0.5), float2(-0.5, +1.5), float2(+1.5, -0.5), float2(+0.5, -1.5), float2(+2.5, -0.5), float2(+1.5, +0.5), float2(-0.5, -1.5), float2(-1.5, -2.5), float2(-0.5, -2.5), float2(-1.5, -1.5), float2(-0.5, +2.5), float2(-1.5, +1.5), float2(+1.5, -2.5), float2(-1.5, +2.5), float2(+1.5, +2.5), float2(+0.5, -2.5), float2(-2.5, -0.5), float2(-2.5, -1.5), float2(-2.5, +0.5), float2(+0.5, +1.5), float2(+0.5, +2.5), float2(-3.5, +0.5), float2(+0.5, +3.5), float2(+1.5, -1.5), float2(+3.5, -0.5), float2(+2.5, +1.5), float2(+3.5, +0.5), float2(+1.5, +1.5), float2(-2.5, +1.5), float2(-3.5, +2.5), float2(+3.5, +1.5), float2(-3.5, -0.5), float2(-1.5, -3.5), float2(-2.5, -2.5), float2(-2.5, +2.5), float2(+2.5, +0.5), float2(+2.5, +2.5), float2(+1.5, +3.5), float2(+3.5, -1.5), float2(-3.5, -2.5), float2(+3.5, -2.5), float2(+2.5, -1.5), float2(+0.5, -3.5), float2(-0.5, +3.5), float2(-0.5, -4.5), float2(-4.5, +0.5), float2(+4.5, +0.5), float2(-4.5, -1.5), float2(-3.5, +1.5), float2(-0.5, -3.5), float2(+1.5, -3.5), float2(+0.5, -4.5), float2(-1.5, +3.5), float2(+0.5, +4.5), float2(-3.5, -1.5), float2(-4.5,
	+1.5), float2(+2.5, -4.5), float2(+2.5, -2.5), float2(-1.5, +4.5), float2(-2.5, -4.5), float2(+4.5, -2.5), float2(+2.5, +3.5), float2(-3.5, +3.5), float2(-2.5, +3.5), float2(+0.5, -5.5), float2(-4.5, +3.5), float2(-2.5, -3.5), float2(-4.5, +2.5), float2(+3.5, +3.5), float2(+2.5, -3.5), float2(+4.5, +3.5), float2(+3.5, -3.5), float2(+4.5, +2.5), float2(-5.5, +1.5), float2(-4.5, -0.5), float2(+3.5, +2.5), float2(-0.5, +4.5), float2(-1.5, +5.5), float2(+1.5, +5.5), float2(+4.5, -0.5), float2(+5.5, +0.5), float2(+4.5, +1.5), float2(-1.5, -4.5), float2(-1.5, -5.5), float2(-4.5, -2.5), float2(-2.5, +5.5), float2(+2.5, +5.5), float2(+1.5, +4.5), float2(+5.5, +1.5), float2(+1.5, -4.5), float2(-3.5, -3.5), float2(+3.5, -4.5), float2(-3.5, -4.5), float2(+4.5, -1.5), float2(+4.5, -3.5), float2(-3.5, -5.5), float2(-2.5, -5.5), float2(-4.5, -3.5), float2(+4.5, +4.5), float2(-3.5, +4.5), float2(-2.5, +4.5), float2(-5.5, -2.5), float2(-5.5, +0.5), float2(+2.5, -5.5), float2(+3.5, +4.5), float2(-0.5, -5.5), float2(-0.5, +6.5), float2(+2.5, +4.5), float2(-5.5, -0.5), float2(-6.5, -1.5), float2(+1.5, -5.5), float2(-6.5, -0.5), float2(+0.5, +5.5), float2(+1.5, +6.5), float2(+6.5, +1.5), float2(-0.5, +5.5), float2(+6.5, -0.5), float2(-4.5, -4.5), float2(-5.5, +2.5), float2(+5.5, -0.5), float2(-5.5, -1.5), float2(-6.5, +3.5), float2(-1.5, +6.5), float2(-6.5, +0.5), float2(+4.5, -5.5), float2(-3.5, +6.5), float2(+6.5, -1.5), float2(+0.5, -6.5), float2(-5.5, -3.5), float2(+5.5, -2.5), float2(+4.5, -4.5), float2(+5.5, -1.5), float2(+3.5, -6.5), float2(+5.5, +3.5), float2(+3.5, -5.5), float2(-5.5, -4.5), float2(+6.5, -3.5), float2(-0.5, -6.5), float2(+3.5, +6.5), float2(-5.5, +3.5), float2(+0.5, +6.5), float2(+6.5, +0.5), float2(+6.5, -2.5), float2(-6.5, -3.5), float2(-4.5, +4.5), float2(-7.5, -0.5), float2(+7.5, +0.5), float2(+5.5, +2.5), float2(-0.5, -7.5), float2(+0.5, +7.5), float2(-4.5, +5.5), float2(+3.5, +5.5), float2(-3.5, +5.5), float2(-4.5, -5.5), float2(+4.5, +6.5), float2(+5.5,
	-4.5), float2(+4.5, +5.5), float2(-4.5, +6.5), float2(+6.5, +4.5), float2(-7.5, +1.5), float2(-6.5, +1.5), float2(+5.5, -3.5), float2(-6.5, +2.5), float2(-2.5, +6.5), float2(-1.5, -7.5), float2(+5.5, +4.5), float2(-1.5, -6.5), float2(-3.5, -7.5), float2(+2.5, -7.5), float2(-7.5, +2.5), float2(-6.5, -2.5), float2(-5.5, +5.5), float2(+2.5, +6.5), float2(-2.5, -6.5), float2(-7.5, +0.5), float2(-0.5, +7.5), float2(+7.5, -2.5), float2(-2.5, +7.5), float2(+0.5, -7.5), float2(-4.5, -7.5), float2(+7.5, +1.5), float2(+1.5, -6.5), float2(-6.5, +4.5), float2(-1.5, +7.5), float2(-5.5, -5.5), float2(+6.5, +2.5), float2(-3.5, -6.5), float2(+3.5, -7.5), float2(-5.5, +4.5), float2(+2.5, -6.5), float2(+1.5, -7.5), float2(+6.5, +3.5), float2(+5.5, -6.5), float2(-6.5, +5.5), float2(+7.5, +4.5), float2(+7.5, -1.5), float2(-7.5, -1.5), float2(+3.5, +7.5), float2(-5.5, +6.5), float2(+1.5, +7.5), float2(+7.5, +3.5), float2(+7.5, -0.5), float2(-7.5, -2.5), float2(+5.5, +5.5), float2(+6.5, +5.5), float2(+5.5, -5.5), float2(-2.5, -7.5), float2(+2.5, +7.5), float2(-7.5, -3.5), float2(-7.5, -4.5), float2(-6.5, -4.5), float2(+7.5, -3.5), float2(+5.5, +6.5), float2(-5.5, -6.5), float2(-4.5, -6.5), float2(+7.5, +2.5), float2(-7.5, +3.5), float2(+4.5, -6.5), float2(+7.5, -4.5),
};

// Second interleaved offset table (same layout as set 0), selected by KernelConfig.SampleSubSetId == 1.
static const float2 kStackowiakSampleSet1[56 * 4] = {
	float2(-0.5, -0.5), float2(+0.5, -0.5), float2(-0.5, +0.5), float2(+0.5, +0.5), float2(+0.5, -1.5), float2(+1.5, -1.5), float2(-1.5, -0.5), float2(+1.5, +1.5), float2(-0.5, -2.5), float2(-1.5, -1.5), float2(+0.5, +1.5), float2(-1.5, +0.5), float2(+1.5, -0.5), float2(-0.5, +1.5), float2(-2.5, +0.5), float2(+0.5, +2.5), float2(-2.5, -1.5), float2(+2.5, +0.5), float2(+1.5, +0.5), float2(-0.5, -1.5), float2(-1.5, +1.5), float2(+2.5, -2.5), float2(-3.5, -0.5), float2(-1.5, +2.5), float2(-2.5, +1.5), float2(-2.5, -0.5), float2(-1.5, -2.5), float2(+2.5, -1.5), float2(-3.5, +0.5), float2(-0.5, -3.5), float2(-1.5, +3.5), float2(+0.5, -2.5), float2(+1.5, +2.5), float2(-0.5,
	+2.5), float2(+0.5, +3.5), float2(+3.5, +0.5), float2(+2.5, +1.5), float2(-2.5, -2.5), float2(+2.5, -0.5), float2(+3.5, -1.5), float2(-0.5, +3.5), float2(+3.5, +1.5), float2(-3.5, +2.5), float2(+3.5, +2.5), float2(+3.5, -0.5), float2(+0.5, -4.5), float2(-2.5, +3.5), float2(+0.5, -3.5), float2(-1.5, -4.5), float2(+1.5, +3.5), float2(+1.5, -2.5), float2(-3.5, +1.5), float2(+2.5, -3.5), float2(-2.5, -3.5), float2(+2.5, +2.5), float2(+1.5, +4.5), float2(-4.5, -2.5), float2(-2.5, +2.5), float2(-4.5, +1.5), float2(+4.5, +1.5), float2(-2.5, -4.5), float2(+3.5, -3.5), float2(-1.5, -3.5), float2(-3.5, -1.5), float2(+1.5, -4.5), float2(+4.5, -2.5), float2(+1.5, -3.5), float2(-1.5, +4.5), float2(-4.5, +2.5), float2(-4.5, -0.5), float2(+2.5, +4.5), float2(-4.5, +0.5), float2(-3.5, -4.5), float2(+0.5, +4.5), float2(+3.5, -2.5), float2(-3.5, -2.5), float2(-3.5, +3.5), float2(+3.5, +3.5), float2(+4.5, +0.5), float2(+0.5, +5.5), float2(-0.5, +4.5), float2(+4.5, -3.5), float2(-1.5, +5.5), float2(-0.5, -4.5), float2(+2.5, +3.5), float2(+4.5, +2.5), float2(-2.5, +5.5), float2(+2.5, -4.5), float2(+4.5, -0.5), float2(+5.5, -0.5), float2(-4.5, +4.5), float2(+5.5, -1.5), float2(-5.5, -1.5), float2(-4.5, -1.5), float2(+3.5, +4.5), float2(-3.5, -3.5), float2(-5.5, +0.5), float2(+1.5, -5.5), float2(-5.5, -2.5), float2(-3.5, +4.5), float2(+0.5, -5.5), float2(-2.5, -5.5), float2(+2.5, +5.5), float2(+4.5, +4.5), float2(+4.5, -1.5), float2(-2.5, +4.5), float2(+4.5, +3.5), float2(+0.5, +6.5), float2(-0.5, -6.5), float2(+5.5, +2.5), float2(-0.5, -5.5), float2(-5.5, -0.5), float2(-6.5, -1.5), float2(-0.5, +5.5), float2(-0.5, +6.5), float2(+6.5, -0.5), float2(+1.5, +5.5), float2(+1.5, -6.5), float2(+5.5, +0.5), float2(-5.5, +2.5), float2(+5.5, +1.5), float2(-5.5, +1.5), float2(-6.5, -0.5), float2(-1.5, -5.5), float2(-5.5, -4.5), float2(-4.5, +3.5), float2(-6.5, +1.5), float2(+2.5, -5.5), float2(+3.5, -5.5), float2(-5.5, -3.5), float2(+1.5, +6.5), float2(+6.5, +2.5), float2(+4.5, -4.5), float2(+3.5,
	-6.5), float2(-4.5, -4.5), float2(-4.5, -3.5), float2(-6.5, +2.5), float2(+3.5, +5.5), float2(+3.5, -4.5), float2(+5.5, -3.5), float2(-5.5, +4.5), float2(+6.5, -3.5), float2(-6.5, -2.5), float2(+5.5, +4.5), float2(-1.5, +6.5), float2(-0.5, -7.5), float2(-6.5, +3.5), float2(-5.5, +3.5), float2(-6.5, -4.5), float2(+7.5, -1.5), float2(-3.5, -5.5), float2(+3.5, +6.5), float2(+5.5, +3.5), float2(+7.5, +0.5), float2(+5.5, -2.5), float2(-6.5, +0.5), float2(-7.5, +1.5), float2(-3.5, -6.5), float2(+6.5, +0.5), float2(+7.5, +1.5), float2(-2.5, -7.5), float2(-3.5, +5.5), float2(-7.5, -0.5), float2(-3.5, +6.5), float2(-2.5, +6.5), float2(+4.5, -6.5), float2(-5.5, +5.5), float2(+4.5, -5.5), float2(+6.5, -2.5), float2(+6.5, +3.5), float2(-1.5, -6.5), float2(-1.5, +7.5), float2(+6.5, +1.5), float2(-5.5, -5.5), float2(+0.5, -6.5), float2(+7.5, +3.5), float2(+2.5, +6.5), float2(-4.5, +5.5), float2(-6.5, -3.5), float2(-4.5, -5.5), float2(-6.5, -5.5), float2(+5.5, -6.5), float2(-2.5, -6.5), float2(+5.5, -5.5), float2(+4.5, +5.5), float2(-7.5, +0.5), float2(+6.5, -1.5), float2(+0.5, -7.5), float2(+7.5, -0.5), float2(-3.5, -7.5), float2(+2.5, -6.5), float2(-3.5, +7.5), float2(-4.5, -7.5), float2(-0.5, +7.5), float2(-6.5, +5.5), float2(+7.5, -3.5), float2(-4.5, +6.5), float2(+1.5, +7.5), float2(+5.5, -4.5), float2(+7.5, +4.5), float2(+0.5, +7.5), float2(+4.5, +6.5), float2(-4.5, +7.5), float2(-7.5, -1.5), float2(+3.5, -7.5), float2(+7.5, -4.5), float2(+3.5, +7.5), float2(-1.5, -7.5), float2(+6.5, -4.5), float2(-7.5, -3.5), float2(+6.5, +4.5), float2(+2.5, -7.5), float2(+7.5, -2.5), float2(-7.5, +2.5), float2(+1.5, -7.5), float2(-5.5, +6.5), float2(+5.5, +5.5), float2(-2.5, +7.5), float2(+7.5, +2.5), float2(-7.5, -2.5), float2(+2.5, +7.5), float2(-6.5, +4.5), float2(+5.5, +6.5), float2(-4.5, -6.5),
};

// Number of interleaved sample tracks in each table above.
static const uint kStackowiakSampleSetCount = 4;

// Number of samples on each track.
static const uint kStackowiakSampleCountPerSet = 56;

/** Convolves the multiplexed signals with the Stackowiak sample distribution. */
void ConvolveStackowiakKernel(
	FSSDKernelConfig KernelConfig,
	FSSDTexture2D
SignalBuffer0,
	FSSDTexture2D SignalBuffer1,
	FSSDTexture2D SignalBuffer2,
	FSSDTexture2D SignalBuffer3,
	inout FSSDSignalAccumulatorArray UncompressedAccumulators,
	inout FSSDCompressedSignalAccumulatorArray CompressedAccumulators)
{
	// Number of samples fetched per batch, to improve latency hiding.
	const uint kSamplingBatchSize = 2;

	if (KernelConfig.bDescOrder)
	{
		// (SALU) Number of batches of samples to perform.
		const uint BatchCountCount = (KernelConfig.SampleCount + (kSamplingBatchSize - 1)) / kSamplingBatchSize;

		// (SALU) Final number of samples, quantized to the sampling batch size.
		const uint SampleCount = BatchCountCount * kSamplingBatchSize;

		// Compile time number of samples between rings.
		const uint StocasticSamplesPerCluster = 8 / kStackowiakSampleSetCount;

		// Compute the first index at which digestion must happen.
		uint CurrentRingId = 0;
		uint NextClusterBoundary = 0;
		if (StocasticSamplesPerCluster == 2)
		{
			// Invert the boundary formula NextClusterBoundary = 1 + R * (R - 1) for the outermost ring
			// that still contains SampleCount - 1.
			uint un = SampleCount - 1;
			CurrentRingId = (uint(floor(sqrt(4 * un - 3))) + 1) / 2;
			NextClusterBoundary = 1 + CurrentRingId * (CurrentRingId - 1);
		}
		else
		{
			// TODO(Denoiser)
		}

		FSSDSampleClusterInfo ClusterInfo;
		ClusterInfo.OutterBoundaryRadius = (CurrentRingId + 1) * KernelConfig.KernelSpreadFactor;

		StartAccumulatingCluster(
			KernelConfig,
			/* inout */ UncompressedAccumulators,
			/* inout */ CompressedAccumulators,
			ClusterInfo);

		// Processes the samples in batches so that the compiler can do latency hiding.
LOOP
		for (uint BatchId = 0; BatchId < BatchCountCount; BatchId++)
		{
			UNROLL_N(2)
			for (uint SampleBatchId = 0; SampleBatchId < 2; SampleBatchId++)
			{
				// Walk the kernel from the outermost sample towards the center (descending order).
				uint SampleId = (BatchCountCount - BatchId) * kSamplingBatchSize - 1 - SampleBatchId;
				bool bIsKernelCenterSample = SampleId == 0 && (SampleBatchId == (kSamplingBatchSize - 1));

				uint SampleTrackId = KernelConfig.SampleTrackId;

				// Select the offset from the chosen interleaved table for this pixel's track.
				float2 SampleOffset = kStackowiakSampleSet0[kStackowiakSampleSetCount * SampleId + SampleTrackId];
				if (KernelConfig.SampleSubSetId == 1)
				{
					SampleOffset = kStackowiakSampleSet1[kStackowiakSampleSetCount * SampleId + SampleTrackId];
				}

				float2 SampleBufferUV = KernelConfig.BufferUV + (SampleOffset * KernelConfig.KernelSpreadFactor) * KernelConfig.BufferSizeAndInvSize.zw;

				// NOTE(review): KernelWeight is unused; the literal 1.0 is passed below instead.
				float KernelWeight = 1;

				SampleAndAccumulateMultiplexedSignals(
					KernelConfig,
					SignalBuffer0,
					SignalBuffer1,
					SignalBuffer2,
					SignalBuffer3,
					/* inout */ UncompressedAccumulators,
					/* inout */ CompressedAccumulators,
					SampleBufferUV,
					/* KernelWeight = */ 1.0,
					/* bForceSample = */ bIsKernelCenterSample && KernelConfig.bForceKernelCenterAccumulation);

				// Change of cluster. Can only happen on odd SampleId, meaning even SampleBatchId.
				BRANCH
				if (SampleId == NextClusterBoundary && (SampleBatchId % 2) == 0)
				{
					// Compute the number of samples that have been accumulated for this cluster.
					uint SampleCountForCluster = min(CurrentRingId * StocasticSamplesPerCluster, SampleCount - SampleId);

					// Digest all accumulators.
					DijestAccumulatedClusterSamples(
						/* inout */ UncompressedAccumulators,
						/* inout */ CompressedAccumulators,
						CurrentRingId,
						SampleCountForCluster);

					// Done early if the kernel center is not meant to be sampled.
					BRANCH
					if (!KernelConfig.bSampleKernelCenter && SampleId == 1)
					{
						break;
					}

					// Change cluster index and boundary.
					CurrentRingId -= 1;
					NextClusterBoundary -= CurrentRingId * StocasticSamplesPerCluster;

					FSSDSampleClusterInfo ClusterInfo;
					ClusterInfo.OutterBoundaryRadius = (CurrentRingId + 1) * KernelConfig.KernelSpreadFactor;

					// Prepare the accumulators for the new cluster.
StartAccumulatingCluster(
						KernelConfig,
						/* inout */ UncompressedAccumulators,
						/* inout */ CompressedAccumulators,
						ClusterInfo);
				}
			} // for (uint SampleBatchId = 0; SampleBatchId < kSamplingBatchSize; SampleBatchId++)
		} // for (uint BatchId = 0; BatchId < BatchCountCount; BatchId++)

		// NextClusterBoundary is not capable of reaching 0, therefore the center sample must be digested manually.
		if (KernelConfig.bSampleKernelCenter)
		{
			DijestAccumulatedClusterSamples(
				/* inout */ UncompressedAccumulators,
				/* inout */ CompressedAccumulators,
				/* RingId = */ 0,
				/* SampleCount = */ 1);
		}
	}
	else // if (!KernelConfig.bDescOrder)
	{
		if (KernelConfig.bSampleKernelCenter)
		{
			SampleAndAccumulateCenterSampleAsItsOwnCluster(
				KernelConfig,
				SignalBuffer0,
				SignalBuffer1,
				SignalBuffer2,
				SignalBuffer3,
				/* inout */ UncompressedAccumulators,
				/* inout */ CompressedAccumulators);
		}

		// Accumulate the second sample to latency hide with the center sample.
		{
			uint SampleTrackId = KernelConfig.SampleTrackId;
			uint SampleId = 1;

			float2 SampleOffset = kStackowiakSampleSet0[kStackowiakSampleSetCount * SampleId + SampleTrackId];
			if (KernelConfig.SampleSubSetId == 1)
			{
				SampleOffset = kStackowiakSampleSet1[kStackowiakSampleSetCount * SampleId + SampleTrackId];
			}

			float2 SampleBufferUV = KernelConfig.BufferUV + (SampleOffset * KernelConfig.KernelSpreadFactor) * KernelConfig.BufferSizeAndInvSize.zw;

			SampleAndAccumulateMultiplexedSignals(
				KernelConfig,
				SignalBuffer0,
				SignalBuffer1,
				SignalBuffer2,
				SignalBuffer3,
				/* inout */ UncompressedAccumulators,
				/* inout */ CompressedAccumulators,
				SampleBufferUV,
				/* KernelWeight = */ 1.0,
				/* bForceSample = */ false);
		}

		// (SALU) Number of batches of samples to perform (samples 0 and 1 already taken above).
		const uint BatchCountCount = (KernelConfig.SampleCount - 1) / kSamplingBatchSize;

		// Processes the samples in batches so that the compiler can do latency hiding.
// TODO(Denoiser): kSamplingBatchSize for latency hiding
		LOOP
		for (uint BatchId = 0; BatchId < BatchCountCount; BatchId++)
		{
			float2 SampleBufferUV[2];

			uint SampleTrackId = KernelConfig.SampleTrackId;

			UNROLL_N(2)
			for (uint SampleBatchId = 0; SampleBatchId < 2; SampleBatchId++)
			{
				// Samples 0 and 1 were consumed above, so the batches start at SampleId == kSamplingBatchSize.
				uint SampleId = BatchId * kSamplingBatchSize + (SampleBatchId + kSamplingBatchSize);

				float2 SampleOffset = kStackowiakSampleSet0[kStackowiakSampleSetCount * SampleId + SampleTrackId];
				if (KernelConfig.SampleSubSetId == 1)
				{
					SampleOffset = kStackowiakSampleSet1[kStackowiakSampleSetCount * SampleId + SampleTrackId];
				}

				SampleBufferUV[SampleBatchId] = KernelConfig.BufferUV + (SampleOffset * KernelConfig.KernelSpreadFactor) * KernelConfig.BufferSizeAndInvSize.zw;
			}

			SampleAndAccumulateMultiplexedSignalsPair(
				KernelConfig,
				SignalBuffer0,
				SignalBuffer1,
				SignalBuffer2,
				SignalBuffer3,
				/* inout */ UncompressedAccumulators,
				/* inout */ CompressedAccumulators,
				SampleBufferUV,
				/* KernelWeight = */ 1.0);
		} // for (uint BatchId = 0; BatchId < BatchCountCount; BatchId++)
	} // if (!KernelConfig.bDescOrder)
} // ConvolveStackowiakKernel()

#endif // COMPILE_STACKOWIAK_KERNEL


//------------------------------------------------------- DISK

#if COMPILE_DISK_KERNEL

// Returns the position of the sample on the unit circle (radius = 1) for a given ring.
// (Note: "Cirle" spelling is the established public name; kept for compatibility with callers.)
float2 GetDiskSampleOnUnitCirle(uint RingId, uint RingSampleIteration, uint RingSampleId)
{
	// Callers pass RingId >= 1; shift to a 0-based ring index.
	RingId -= 1;

	// TODO(Denoiser).
	float SampleRingPos = RingSampleId;

	// Do not align all j == 0 samples of the different rings on the X axis, to increase the minimal
	// distance between all samples; that reduces the variance left to clean by post filtering.
	#if 1
		// Half-sample stagger on every other ring (RingId - 2 * (RingId / 2) == RingId % 2).
		SampleRingPos += (RingId - 2 * (RingId / 2)) * 0.5;
	#endif
	#if 1
		// Additional per-ring phase shift.
		SampleRingPos += (RingId + 1) * 0.2;
	#endif

	// PI (not 2*PI): only half the ring is generated; the other half comes from mirrored pairs.
	float SampleAngle = PI * SampleRingPos / float(RingSampleIteration);

	return float2(cos(SampleAngle), sin(SampleAngle));
}

// Returns the rotation matrix to use between samples of the ring.
// Builds the constant 2x2 rotation applied between consecutive sample pairs of a ring.
float2x2 GetSampleRotationMatrix(uint RingSampleIteration)
{
	const float RotationAngle = PI / float(RingSampleIteration);
	const float CosAngle = cos(RotationAngle);
	const float SinAngle = sin(RotationAngle);
	return float2x2(
		float2( CosAngle, SinAngle),
		float2(-SinAngle, CosAngle));
}

// Returns the total number of sampling iterations (sample pairs) for a given ring id.
uint GetRingSamplingPairCount(const uint SampleSet, uint RingId)
{
	// Hexaweb rings carry 3 pairs per ring; otherwise 4, carefully chosen to have the exact
	// number of samples of a square shaped ring (SquarePos).
	return RingId * ((SampleSet == SAMPLE_SET_HEXAWEB) ? 3 : 4);
}

// Returns the total number of samples of the kernel (center sample included).
uint GetDiskKernelSampleCount(const uint SampleSet, uint RingCount)
{
	// 1 center sample + 2 * sum over rings of GetRingSamplingPairCount(), which telescopes
	// to PairsPerRingUnit * RingCount * (RingCount + 1).
	const uint PairsPerRingUnit = (SampleSet == SAMPLE_SET_HEXAWEB) ? 3 : 4;
	return 1 + PairsPerRingUnit * RingCount * (RingCount + 1);
}

// Transforms at compile time a 2 dimensional batch's constant into a sample pair constant,
// by using 90 degree rotation invariance.
float2 SampleConstFromBatchConst(const uint BatchSampleId, float2 BatchConst)
{
	/**
	 *           Y
	 *           ^
	 *           |
	 *      1    |
	 *           |
	 *           |      0
	 *           |
	 * - - - - - O - - - - > X
	 */
	return (BatchSampleId == 1) ? float2(-BatchConst.y, BatchConst.x) : BatchConst;
}

// Gathers one ring of the disk kernel into the accumulators.
void GatherRingSamples(
	FSSDKernelConfig KernelConfig,
	FSSDTexture2D SignalBuffer0,
	FSSDTexture2D SignalBuffer1,
	FSSDTexture2D SignalBuffer2,
	FSSDTexture2D SignalBuffer3,
	inout FSSDSignalAccumulatorArray UncompressedAccumulators,
	inout FSSDCompressedSignalAccumulatorArray CompressedAccumulators,
	const uint RingId)
{
	// Number of sample pairs for this ring.
	const uint RingSamplePairCount = GetRingSamplingPairCount(KernelConfig.SampleSet, RingId);

	// Number of sample pairs to process per batch.
	// TODO(Denoiser): Could potentially do 4 using symmetries? Might be impractical because of VGPR pressure.
	const uint SamplePairBatchSize = (KernelConfig.SampleSet == SAMPLE_SET_HEXAWEB) ? 1 : 2;

	// Number of batches to process.
const uint BatchCount = RingSamplePairCount / SamplePairBatchSize;

// Distance of the ring from the center of the kernel in sample count.
const uint RingDistance = uint(RingId + 0);

// Generate at compile time sample rotation matrix.
const float2x2 SampleRotationMatrix = GetSampleRotationMatrix(RingSamplePairCount);

// Generates at compile time first sample location on circle (radius = 1).
const float2 FirstCircleUnitPos = GetDiskSampleOnUnitCirle(RingId, RingSamplePairCount, /* BatchId = */ 0);

// Position of the first sample on circle with radius according to KernelRadius.
float2 FirstCircleSamplePosOffset = (RingDistance * FirstCircleUnitPos) * KernelConfig.KernelSpreadFactor;

// Setup iteratable SGPR: these two are mutated once per batch by the rotation below.
float2 CurrentCircleUnitPos = FirstCircleUnitPos;
float2 CurrentCircleSamplePosOffset = FirstCircleSamplePosOffset;

#if CONFIG_SGPR_HINT_OPTIMIZATION
{
    CurrentCircleUnitPos = ToScalarMemory(CurrentCircleUnitPos);
    CurrentCircleSamplePosOffset = ToScalarMemory(CurrentCircleSamplePosOffset);
}
#endif

// Loops through all batch of samples to process.
LOOP
for (uint BatchId = 0; BatchId < BatchCount; BatchId++)
{
    // Rotate the samples position along the ring.
    // NOTE: the rotation is applied before the first fetch, so the first accumulated
    // sample is already one rotation step away from FirstCircleUnitPos.
    CurrentCircleUnitPos = mul(CurrentCircleUnitPos, SampleRotationMatrix);
    CurrentCircleSamplePosOffset = mul(CurrentCircleSamplePosOffset, SampleRotationMatrix);

    #if CONFIG_SGPR_HINT_OPTIMIZATION
    {
        CurrentCircleUnitPos = ToScalarMemory(CurrentCircleUnitPos);
        CurrentCircleSamplePosOffset = ToScalarMemory(CurrentCircleSamplePosOffset);
    }
    #endif

    // Sample in batch of multiple pair to increase texture fetch concurency, to have better
    // lattency hidding.
UNROLL
for (uint BatchSampleId = 0; BatchSampleId < SamplePairBatchSize; BatchSampleId++)
{
    // Derive this pair's offset from the batch constant by compile-time rotation invariance.
    float2 CircleSamplePosOffset = SampleConstFromBatchConst(BatchSampleId, CurrentCircleSamplePosOffset);

    // The two samples of a pair are point symmetric around the kernel center.
    float2 SampleUVPair[2];
    SampleUVPair[0] = KernelConfig.BufferUV + CircleSamplePosOffset * KernelConfig.BufferSizeAndInvSize.zw;
    SampleUVPair[1] = KernelConfig.BufferUV - CircleSamplePosOffset * KernelConfig.BufferSizeAndInvSize.zw;

    SampleAndAccumulateMultiplexedSignalsPair(
        KernelConfig,
        SignalBuffer0,
        SignalBuffer1,
        SignalBuffer2,
        SignalBuffer3,
        /* inout */ UncompressedAccumulators,
        /* inout */ CompressedAccumulators,
        SampleUVPair,
        /* KernelWeight = */ 1.0);
} // for (uint BatchSampleId = 0; BatchSampleId < SamplePairBatchSize; BatchSampleId++)
} // for (uint BatchId = 0; BatchId < BatchCount; BatchId++)
} // GatherRingSamples()

// Convolves with a disk shaped kernel made of concentric rings, one cluster per ring.
void ConvolveDiskKernel(
    FSSDKernelConfig KernelConfig,
    FSSDTexture2D SignalBuffer0,
    FSSDTexture2D SignalBuffer1,
    FSSDTexture2D SignalBuffer2,
    FSSDTexture2D SignalBuffer3,
    inout FSSDSignalAccumulatorArray UncompressedAccumulators,
    inout FSSDCompressedSignalAccumulatorArray CompressedAccumulators)
{
    // Accumulate the center of the kernel (first when accumulating in ascending order).
    if (KernelConfig.bSampleKernelCenter && !KernelConfig.bDescOrder)
    {
        SampleAndAccumulateCenterSampleAsItsOwnCluster(
            KernelConfig,
            SignalBuffer0,
            SignalBuffer1,
            SignalBuffer2,
            SignalBuffer3,
            /* inout */ UncompressedAccumulators,
            /* inout */ CompressedAccumulators);
    }

    // Accumulate each ring. Use LOOP, because FXC is going through its pace otherwise.
    // Ring iteration direction depends on bDescOrder: RingCount..1 or 1..RingCount.
    #if 1
        LOOP
    #else
        UNROLL
    #endif
    for (
        uint RingId = (KernelConfig.bDescOrder ? KernelConfig.RingCount : 1);
        (KernelConfig.bDescOrder ? RingId > 0 : RingId <= KernelConfig.RingCount);
        RingId += (KernelConfig.bDescOrder ?
            // Descending: += ~0u is -= 1 in uint arithmetic.
            ~0u : 1))
{
    const uint RingSamplePairCount = GetRingSamplingPairCount(KernelConfig.SampleSet, RingId);

    FSSDSampleClusterInfo ClusterInfo;
    ClusterInfo.OutterBoundaryRadius = (RingId + 1) * KernelConfig.KernelSpreadFactor;

    StartAccumulatingCluster(
        KernelConfig,
        /* inout */ UncompressedAccumulators,
        /* inout */ CompressedAccumulators,
        ClusterInfo);

    GatherRingSamples(
        KernelConfig,
        SignalBuffer0,
        SignalBuffer1,
        SignalBuffer2,
        SignalBuffer3,
        /* inout */ UncompressedAccumulators,
        /* inout */ CompressedAccumulators,
        RingId);

    // Each ring contributed RingSamplePairCount pairs, i.e. * 2 samples.
    DijestAccumulatedClusterSamples(
        /* inout */ UncompressedAccumulators,
        /* inout */ CompressedAccumulators,
        RingId,
        RingSamplePairCount * 2);
} // for (uint RingId = 0; RingId < KernelConfig.RingCount; RingId++)

// Accumulate the center of the kernel (last when accumulating in descending order).
if (KernelConfig.bSampleKernelCenter && KernelConfig.bDescOrder)
{
    SampleAndAccumulateCenterSampleAsItsOwnCluster(
        KernelConfig,
        SignalBuffer0,
        SignalBuffer1,
        SignalBuffer2,
        SignalBuffer3,
        /* inout */ UncompressedAccumulators,
        /* inout */ CompressedAccumulators);
}
} // ConvolveDiskKernel()

#endif // COMPILE_DISK_KERNEL


//------------------------------------------------------- DIRECTIONAL KERNELS

#if COMPILE_DIRECTIONAL_KERNEL

// Convolves with an oriented rect or ellipse footprint aligned on KernelConfig.MajorAxis.
void ConvolveDirectionalRect(
    FSSDKernelConfig KernelConfig,
    FSSDTexture2D SignalBuffer0,
    FSSDTexture2D SignalBuffer1,
    FSSDTexture2D SignalBuffer2,
    FSSDTexture2D SignalBuffer3,
    inout FSSDSignalAccumulatorArray UncompressedAccumulators,
    inout FSSDCompressedSignalAccumulatorArray CompressedAccumulators)
{
    // Accumulate the center of the kernel.
    if (KernelConfig.bSampleKernelCenter)
    {
        SampleAndAccumulateCenterSampleAsItsOwnCluster(
            KernelConfig,
            SignalBuffer0,
            SignalBuffer1,
            SignalBuffer2,
            SignalBuffer3,
            /* inout */ UncompressedAccumulators,
            /* inout */ CompressedAccumulators);
    }

    // Number of batch size done at same time, to improve lattency hidding.
    const uint kSamplingBatchSize = 2;

    // Number of batch of samples to perform. It's not round up because also sampling the center of kernel anyway.
// TODO(Denoiser): store in a SGPR array instead to save 1 VGPR.
const uint BatchCountCount = KernelConfig.SampleCount / kSamplingBatchSize;

// Processes the samples in batches so that the compiler can do lattency hidding.
LOOP
for (uint BatchId = 0; BatchId < BatchCountCount; BatchId++)
{
    float2 SampleBufferUV[2];

    UNROLL_N(2)
    for (uint SampleBatchId = 0; SampleBatchId < 2; SampleBatchId++)
    {
        uint SampleId = BatchId * kSamplingBatchSize + SampleBatchId;

        // Low discrepancy point in [0,1]^2 over the total kernel sample count.
        float2 E = Hammersley16(SampleId, BatchCountCount * kSamplingBatchSize, KernelConfig.HammersleySeed);

        if (KernelConfig.SampleSet == SAMPLE_SET_DIRECTIONAL_RECT)
        {
            // Remap to [-1,1]^2 to cover the whole rect.
            E = (E * 2.0 - 1.0);
        }
        else // if (KernelConfig.SampleSet == SAMPLE_SET_DIRECTIONAL_ELLIPSE)
        {
            E = UniformSampleDiskConcentric(E);
        }

        // Transform the unit sample into the (major, minor) axis basis; the minor axis is
        // the major axis rotated by +90 degrees.
        float2 SampleOffset =
            float2(KernelConfig.MajorAxis) * KernelConfig.MajorPixelRadius * E.x +
            float2(-KernelConfig.MajorAxis.y, KernelConfig.MajorAxis.x) * KernelConfig.MinorPixelRadius * E.y;

        SampleBufferUV[SampleBatchId] = KernelConfig.BufferUV + SampleOffset * KernelConfig.BufferSizeAndInvSize.zw;
    }

    SampleAndAccumulateMultiplexedSignalsPair(
        KernelConfig,
        SignalBuffer0,
        SignalBuffer1,
        SignalBuffer2,
        SignalBuffer3,
        /* inout */ UncompressedAccumulators,
        /* inout */ CompressedAccumulators,
        SampleBufferUV,
        /* KernelWeight = */ 1.0);
} // for (uint BatchId = 0; BatchId < BatchCountCount; BatchId++)
}

#endif // COMPILE_DIRECTIONAL_KERNEL


//------------------------------------------------------- RAW EXPERIMENTAL KERNEL TO TRY

#if COMPILE_RAW_EXPERIMENTAL_KERNEL

#if 0

// 8 sample offsets (in pixels) per pixel of a 4x4 tile; the first entry of each group is (0, 0).
static const float2 SampleArray4x4x8[128] = {
float2(0.000000, 0.000000), float2(1.000000, 0.000000), float2(0.000000, 1.000000), float2(2.000000, 1.000000), float2(-1.000000, 2.000000), float2(-1.000000, 1.000000), float2(1.000000, 2.000000), float2(1.000000, 1.000000), float2(0.000000, 0.000000), float2(-2.000000, 0.000000), float2(-1.000000, 1.000000), float2(0.000000, 2.000000), float2(-1.000000, 2.000000), float2(-1.000000, -1.000000), float2(0.000000, 1.000000),
// (continuation of SampleArray4x4x8 — offsets in pixels)
float2(-1.000000, 0.000000), float2(0.000000, 0.000000), float2(1.000000, -1.000000), float2(1.000000, 2.000000), float2(0.000000, 1.000000), float2(0.000000, 2.000000), float2(0.000000, -1.000000), float2(-1.000000, -1.000000), float2(2.000000, 1.000000), float2(0.000000, 0.000000), float2(-2.000000, 0.000000), float2(-1.000000, -1.000000), float2(0.000000, 2.000000), float2(1.000000, -1.000000), float2(1.000000, 2.000000), float2(-1.000000, 2.000000), float2(-1.000000, 0.000000), float2(0.000000, 0.000000), float2(0.000000, 1.000000), float2(2.000000, 0.000000), float2(-1.000000, -1.000000), float2(1.000000, -1.000000), float2(0.000000, -1.000000), float2(-3.000000, 0.000000), float2(-1.000000, 1.000000), float2(0.000000, 0.000000), float2(0.000000, -1.000000), float2(0.000000, 1.000000), float2(-2.000000, 0.000000), float2(-1.000000, -1.000000), float2(2.000000, 1.000000), float2(1.000000, 0.000000), float2(2.000000, 0.000000), float2(0.000000, 0.000000), float2(0.000000, 2.000000), float2(2.000000, 2.000000), float2(1.000000, 0.000000), float2(-1.000000, 1.000000), float2(-2.000000, 0.000000), float2(1.000000, -1.000000), float2(-1.000000, 2.000000), float2(0.000000, 0.000000), float2(-1.000000, 0.000000), float2(-2.000000, 0.000000), float2(-2.000000, 1.000000), float2(1.000000, 2.000000), float2(-3.000000, 0.000000), float2(-2.000000, 2.000000), float2(0.000000, -2.000000), float2(0.000000, 0.000000), float2(-1.000000, 1.000000), float2(1.000000, -1.000000), float2(0.000000, -2.000000), float2(-1.000000, 0.000000), float2(1.000000, -2.000000), float2(1.000000, 0.000000), float2(2.000000, 0.000000), float2(0.000000, 0.000000), float2(-2.000000, 0.000000), float2(-1.000000, 0.000000), float2(-2.000000, 2.000000), float2(-1.000000, 2.000000), float2(1.000000, 0.000000), float2(0.000000, 3.000000), float2(0.000000, 2.000000), float2(0.000000, 0.000000), float2(0.000000, 1.000000), float2(1.000000, -2.000000), float2(-1.000000, 0.000000), float2(0.000000,
2.000000), float2(-1.000000, 1.000000), float2(1.000000, 1.000000), float2(-2.000000, 1.000000), float2(0.000000, 0.000000), float2(-2.000000, 0.000000), float2(-1.000000, 1.000000), float2(-2.000000, 2.000000), float2(1.000000, -2.000000), float2(-3.000000, 0.000000), float2(-1.000000, 0.000000), float2(0.000000, -2.000000), float2(0.000000, 0.000000), float2(-1.000000, 0.000000), float2(2.000000, 0.000000), float2(1.000000, 0.000000), float2(-1.000000, 1.000000), float2(-2.000000, -2.000000), float2(2.000000, 1.000000), float2(0.000000, -2.000000), float2(0.000000, 0.000000), float2(-1.000000, 0.000000), float2(-2.000000, -2.000000), float2(-1.000000, 2.000000), float2(1.000000, 2.000000), float2(2.000000, 0.000000), float2(1.000000, 1.000000), float2(1.000000, 0.000000), float2(0.000000, 0.000000), float2(0.000000, -1.000000), float2(1.000000, -1.000000), float2(-1.000000, 1.000000), float2(0.000000, 1.000000), float2(1.000000, 1.000000), float2(-2.000000, 0.000000), float2(2.000000, -1.000000), float2(0.000000, 0.000000), float2(0.000000, -2.000000), float2(-1.000000, -2.000000), float2(-2.000000, 0.000000), float2(1.000000, -2.000000), float2(-1.000000, 0.000000), float2(1.000000, 0.000000), float2(-1.000000, 1.000000),
}; // SampleArray4x4x8

#else

// 16 sample offsets (in pixels) per pixel of a 4x4 tile; the first entry of each group is (0, 0).
static const float2 SampleArray4x4x16[256] = {
float2(0.000000, 0.000000), float2(-1.000000, -1.000000), float2(1.000000, -1.000000), float2(-1.000000, 0.000000), float2(0.000000, -1.000000), float2(1.000000, 0.000000), float2(-2.000000, 1.000000), float2(2.000000, -1.000000), float2(1.000000, 1.000000), float2(-1.000000, -2.000000), float2(2.000000, 0.000000), float2(0.000000, 1.000000), float2(0.000000, 2.000000), float2(2.000000, 2.000000), float2(2.000000, -2.000000), float2(1.000000, 2.000000), float2(0.000000, 0.000000), float2(-1.000000, 0.000000), float2(2.000000, 3.000000), float2(-1.000000, -2.000000), float2(1.000000, -2.000000), float2(0.000000, -1.000000), float2(1.000000, -1.000000), float2(1.000000,
0.000000), float2(0.000000, 1.000000), float2(-2.000000, -2.000000), float2(-2.000000, 1.000000), float2(0.000000, 2.000000), float2(-1.000000, 1.000000), float2(1.000000, 1.000000), float2(-1.000000, -1.000000), float2(2.000000, -1.000000), float2(0.000000, 0.000000), float2(1.000000, -1.000000), float2(-1.000000, 3.000000), float2(-1.000000, -2.000000), float2(0.000000, -1.000000), float2(1.000000, -2.000000), float2(-1.000000, 0.000000), float2(-2.000000, -1.000000), float2(2.000000, 2.000000), float2(-2.000000, 1.000000), float2(0.000000, 2.000000), float2(1.000000, 1.000000), float2(1.000000, 0.000000), float2(0.000000, 1.000000), float2(2.000000, 0.000000), float2(-1.000000, 1.000000), float2(0.000000, 0.000000), float2(0.000000, -1.000000), float2(0.000000, 1.000000), float2(-2.000000, 1.000000), float2(1.000000, -2.000000), float2(-3.000000, 0.000000), float2(0.000000, -2.000000), float2(-2.000000, 2.000000), float2(-1.000000, 2.000000), float2(1.000000, -1.000000), float2(2.000000, 3.000000), float2(-2.000000, 0.000000), float2(0.000000, 2.000000), float2(-1.000000, 1.000000), float2(1.000000, 1.000000), float2(-1.000000, -1.000000), float2(0.000000, 0.000000), float2(-1.000000, 0.000000), float2(2.000000, -1.000000), float2(-1.000000, -1.000000), float2(-1.000000, 2.000000), float2(1.000000, -1.000000), float2(-3.000000, 0.000000), float2(1.000000, 2.000000), float2(2.000000, -2.000000), float2(0.000000, -1.000000), float2(0.000000, 1.000000), float2(0.000000, -2.000000), float2(1.000000, 1.000000), float2(2.000000, 0.000000), float2(2.000000, 1.000000), float2(-1.000000, 1.000000), float2(0.000000, 0.000000), float2(-2.000000, 0.000000), float2(1.000000, 0.000000), float2(0.000000, -2.000000), float2(2.000000, -1.000000), float2(-2.000000, 2.000000), float2(-1.000000, 0.000000), float2(2.000000, 2.000000), float2(3.000000, 1.000000), float2(0.000000, 2.000000), float2(1.000000, -1.000000), float2(-1.000000, 2.000000), float2(0.000000, 1.000000),
float2(-1.000000, -1.000000), float2(1.000000, 1.000000), float2(1.000000, 2.000000), float2(0.000000, 0.000000), float2(0.000000, 1.000000), float2(2.000000, 0.000000), float2(1.000000, 2.000000), float2(-1.000000, 2.000000), float2(-1.000000, 0.000000), float2(0.000000, 2.000000), float2(3.000000, 1.000000), float2(-1.000000, -1.000000), float2(-2.000000, -3.000000), float2(2.000000, 3.000000), float2(1.000000, 0.000000), float2(2.000000, 2.000000), float2(1.000000, 1.000000), float2(1.000000, -1.000000), float2(0.000000, -1.000000), float2(0.000000, 0.000000), float2(1.000000, -1.000000), float2(-2.000000, 1.000000), float2(-1.000000, 2.000000), float2(-1.000000, 0.000000), float2(-3.000000, 0.000000), float2(-2.000000, -1.000000), float2(-1.000000, 1.000000), float2(0.000000, -1.000000), float2(0.000000, 2.000000), float2(1.000000, 1.000000), float2(0.000000, 1.000000), float2(-1.000000, -1.000000), float2(-2.000000, 2.000000), float2(-2.000000, 0.000000), float2(1.000000, 2.000000), float2(0.000000, 0.000000), float2(2.000000, -1.000000), float2(1.000000, -1.000000), float2(0.000000, 2.000000), float2(-1.000000, -2.000000), float2(0.000000, 1.000000), float2(1.000000, 2.000000), float2(-1.000000, 1.000000), float2(2.000000, 2.000000), float2(-3.000000, 0.000000), float2(-2.000000, 0.000000), float2(-1.000000, -1.000000), float2(-1.000000, 0.000000), float2(-4.000000, -1.000000), float2(2.000000, 1.000000), float2(1.000000, 1.000000), float2(0.000000, 0.000000), float2(1.000000, 2.000000), float2(0.000000, 1.000000), float2(-1.000000, 0.000000), float2(1.000000, 0.000000), float2(-1.000000, 2.000000), float2(0.000000, 3.000000), float2(2.000000, -2.000000), float2(-2.000000, 1.000000), float2(-1.000000, 1.000000), float2(0.000000, 2.000000), float2(1.000000, -1.000000), float2(-2.000000, 0.000000), float2(1.000000, 1.000000), float2(-2.000000, -1.000000), float2(0.000000, -2.000000), float2(0.000000, 0.000000), float2(-1.000000, 1.000000), float2(1.000000,
-1.000000), float2(-1.000000, -2.000000), float2(0.000000, -2.000000), float2(0.000000, 1.000000), float2(1.000000, 2.000000), float2(2.000000, -1.000000), float2(1.000000, 0.000000), float2(0.000000, -1.000000), float2(-1.000000, -1.000000), float2(-2.000000, 0.000000), float2(-2.000000, 1.000000), float2(-1.000000, 0.000000), float2(1.000000, 1.000000), float2(-2.000000, -2.000000), float2(0.000000, 0.000000), float2(-1.000000, -2.000000), float2(-3.000000, 0.000000), float2(1.000000, -1.000000), float2(0.000000, -2.000000), float2(-2.000000, 1.000000), float2(-1.000000, -1.000000), float2(-1.000000, 1.000000), float2(0.000000, 1.000000), float2(-2.000000, 2.000000), float2(1.000000, 1.000000), float2(-2.000000, -1.000000), float2(-1.000000, -4.000000), float2(1.000000, -2.000000), float2(0.000000, -1.000000), float2(-2.000000, 0.000000), float2(0.000000, 0.000000), float2(0.000000, 1.000000), float2(2.000000, 1.000000), float2(2.000000, 0.000000), float2(-1.000000, 2.000000), float2(-2.000000, -2.000000), float2(0.000000, 2.000000), float2(3.000000, 1.000000), float2(-3.000000, 2.000000), float2(-1.000000, -1.000000), float2(1.000000, 0.000000), float2(-1.000000, -2.000000), float2(1.000000, -1.000000), float2(1.000000, 1.000000), float2(0.000000, -1.000000), float2(-1.000000, 0.000000), float2(0.000000, 0.000000), float2(1.000000, -1.000000), float2(-2.000000, 1.000000), float2(-1.000000, -1.000000), float2(0.000000, -2.000000), float2(0.000000, 2.000000), float2(0.000000, 3.000000), float2(1.000000, 1.000000), float2(3.000000, 1.000000), float2(0.000000, 1.000000), float2(3.000000, -2.000000), float2(2.000000, 1.000000), float2(1.000000, -3.000000), float2(2.000000, -2.000000), float2(1.000000, -2.000000), float2(-2.000000, 2.000000), float2(0.000000, 0.000000), float2(1.000000, 0.000000), float2(1.000000, -1.000000), float2(0.000000, -1.000000), float2(-1.000000, 2.000000), float2(-2.000000, 1.000000), float2(-1.000000, 0.000000), float2(-1.000000,
-1.000000), float2(0.000000, 1.000000), float2(1.000000, 1.000000), float2(2.000000, 3.000000), float2(2.000000, 0.000000), float2(-1.000000, 1.000000), float2(1.000000, -2.000000), float2(0.000000, -2.000000), float2(-2.000000, -2.000000), float2(0.000000, 0.000000), float2(-2.000000, 1.000000), float2(0.000000, -1.000000), float2(-1.000000, 1.000000), float2(1.000000, -3.000000), float2(0.000000, 2.000000), float2(0.000000, 3.000000), float2(0.000000, 1.000000), float2(-2.000000, 0.000000), float2(1.000000, -1.000000), float2(-2.000000, -1.000000), float2(-2.000000, -2.000000), float2(-1.000000, 0.000000), float2(-1.000000, -1.000000), float2(1.000000, 0.000000), float2(1.000000, -2.000000),
}; // SampleArray4x4x16

#endif

// Experimental convolution using a per-pixel table of sample offsets (4x4 pixel tile,
// 16 offsets per pixel, see SampleArray4x4x16).
void ConvolveRawExperimentalKernel(
    FSSDKernelConfig KernelConfig,
    FSSDTexture2D SignalBuffer0,
    FSSDTexture2D SignalBuffer1,
    FSSDTexture2D SignalBuffer2,
    FSSDTexture2D SignalBuffer3,
    inout FSSDSignalAccumulatorArray UncompressedAccumulators,
    inout FSSDCompressedSignalAccumulatorArray CompressedAccumulators)
{
    // Accumulate the center of the kernel.
if (KernelConfig.bSampleKernelCenter && !KernelConfig.bDescOrder)
{
    SampleAndAccumulateCenterSampleAsItsOwnCluster(
        KernelConfig,
        SignalBuffer0,
        SignalBuffer1,
        SignalBuffer2,
        SignalBuffer3,
        /* inout */ UncompressedAccumulators,
        /* inout */ CompressedAccumulators);
}

const uint TileSize = 4;
const uint SampleCount = 16;

// Position of this pixel within its 4x4 tile, selecting the 16-entry sub-array of offsets.
uint2 PixelCoord = uint2(KernelConfig.BufferUV * View.BufferSizeAndInvSize.xy) % TileSize;

// SampleId starts at 1: entry 0 of each sub-array looks like the kernel center (0, 0),
// which is handled by the bSampleKernelCenter paths above/below.
LOOP
for (uint SampleId = 1; SampleId < SampleCount; SampleId++)
{
    uint MagicIndex = SampleId + SampleCount * (PixelCoord.x + TileSize * PixelCoord.y);
    float2 SampleOffset = SampleArray4x4x16[MagicIndex];
    float2 SampleBufferUV = KernelConfig.BufferUV + (SampleOffset * KernelConfig.KernelSpreadFactor) * KernelConfig.BufferSizeAndInvSize.zw;

    SampleAndAccumulateMultiplexedSignals(
        KernelConfig,
        SignalBuffer0,
        SignalBuffer1,
        SignalBuffer2,
        SignalBuffer3,
        /* inout */ UncompressedAccumulators,
        /* inout */ CompressedAccumulators,
        SampleBufferUV,
        /* KernelWeight = */ 1.0,
        /* bForceSample = */ false);
}

// Accumulate the center of the kernel.
if (KernelConfig.bSampleKernelCenter && KernelConfig.bDescOrder)
{
    SampleAndAccumulateCenterSampleAsItsOwnCluster(
        KernelConfig,
        SignalBuffer0,
        SignalBuffer1,
        SignalBuffer2,
        SignalBuffer3,
        /* inout */ UncompressedAccumulators,
        /* inout */ CompressedAccumulators);
}
}

#endif // COMPILE_RAW_EXPERIMENTAL_KERNEL


//------------------------------------------------------- MAIN ENTRY POINTS

/** Accumulate the center of the kernel when KernelConfig.bSampleKernelCenter == false.
 *
 * RefSceneMetadata and SampleSceneMetadata needs to be uncompressed upfront intentionally to share the uncompression with other
 * part of the shader that might have required uncompression anyway.
 */
void AccumulateRefSampleAsKernelCenter(
    FSSDKernelConfig KernelConfig,
    inout FSSDSignalAccumulatorArray UncompressedAccumulators,
    inout FSSDCompressedSignalAccumulatorArray CompressedAccumulators,
    float2 RefBufferUV,
    FSSDSampleSceneInfos RefSceneMetadata,
    FSSDSignalArray RefMultiplexedSamples,
    FSSDSignalFrequencyArray RefMultiplexedFrequencies)
{
    if (!KernelConfig.bSampleKernelCenter)
    {
        // The reference sample is both the sample and its own reference metadata here,
        // and is force-accumulated regardless of bilateral rejection.
        AccumulateSampledMultiplexedSignals(
            KernelConfig,
            /* inout */ UncompressedAccumulators,
            /* inout */ CompressedAccumulators,
            RefSceneMetadata,
            RefBufferUV,
            RefSceneMetadata,
            RefMultiplexedSamples,
            RefMultiplexedFrequencies,
            /* KernelWeight = */ 1.0,
            /* bForceSample = */ true,
            /* bIsOutsideFrustum = */ false);

        if (KernelConfig.SampleSet == 0xDEADDEAD)
        {
        }
        #if COMPILE_BOX_KERNEL
        else if (KernelConfig.SampleSet == SAMPLE_SET_3X3 || KernelConfig.SampleSet == SAMPLE_SET_3X3_PLUS || KernelConfig.SampleSet == SAMPLE_SET_3X3_CROSS)
        {
            BroadcastAccumulateSquare3x3KernelCenter(
                KernelConfig,
                /* inout */ UncompressedAccumulators,
                /* inout */ CompressedAccumulators,
                RefSceneMetadata,
                RefBufferUV,
                RefSceneMetadata,
                RefMultiplexedSamples,
                RefMultiplexedFrequencies);
        }
        #endif
    }
}

// Dispatches the convolution to the kernel implementation selected by KernelConfig.SampleSet.
void AccumulateKernel(
    FSSDKernelConfig KernelConfig,
    FSSDTexture2D SignalBuffer0,
    FSSDTexture2D SignalBuffer1,
    FSSDTexture2D SignalBuffer2,
    FSSDTexture2D SignalBuffer3,
    inout FSSDSignalAccumulatorArray UncompressedAccumulators,
    inout FSSDCompressedSignalAccumulatorArray CompressedAccumulators)
{
    // 0xDEADDEAD sentinel compare (presumably never true) so each kernel branch below can
    // be an else-if individually guarded by its own COMPILE_* preprocessor switch.
    if (KernelConfig.SampleSet == 0xDEADDEAD)
    {
    }
    #if COMPILE_BOX_KERNEL
    else if (KernelConfig.SampleSet == SAMPLE_SET_1X1)
    {
        if (KernelConfig.bSampleKernelCenter)
        {
            SampleAndAccumulateCenterSampleAsItsOwnCluster(
                KernelConfig,
                SignalBuffer0,
                SignalBuffer1,
                SignalBuffer2,
                SignalBuffer3,
                /* inout */ UncompressedAccumulators,
                /* inout */ CompressedAccumulators);
        }
    }
    else if (
        KernelConfig.SampleSet == SAMPLE_SET_2X2_BILINEAR ||
        KernelConfig.SampleSet == SAMPLE_SET_2X2_STOCASTIC ||
        KernelConfig.SampleSet == SAMPLE_SET_2X2_ADAPTIVE)
    {
        AccumulateBilinear2x2Kernel(
            KernelConfig,
            SignalBuffer0,
            SignalBuffer1,
            SignalBuffer2,
            SignalBuffer3,
            /* inout */ UncompressedAccumulators,
            /* inout */ CompressedAccumulators);
    }
    else if (
        KernelConfig.SampleSet == SAMPLE_SET_3X3 ||
        KernelConfig.SampleSet == SAMPLE_SET_3X3_PLUS ||
        KernelConfig.SampleSet == SAMPLE_SET_3X3_CROSS)
    {
        AccumulateSquare3x3Kernel(
            KernelConfig,
            SignalBuffer0,
            SignalBuffer1,
            SignalBuffer2,
            SignalBuffer3,
            /* inout */ UncompressedAccumulators,
            /* inout */ CompressedAccumulators);
    }
    else if (
        KernelConfig.SampleSet == SAMPLE_SET_3X3_SOBEK2018 ||
        KernelConfig.SampleSet == SAMPLE_SET_5X5_WAVELET ||
        KernelConfig.SampleSet == SAMPLE_SET_NXN)
    {
        AccumulateSquareKernel(
            KernelConfig,
            SignalBuffer0,
            SignalBuffer1,
            SignalBuffer2,
            SignalBuffer3,
            /* inout */ UncompressedAccumulators,
            /* inout */ CompressedAccumulators);
    }
    #endif // COMPILE_BOX_KERNEL
    #if COMPILE_STACKOWIAK_KERNEL
    else if (KernelConfig.SampleSet == SAMPLE_SET_STACKOWIAK_4_SETS)
    {
        ConvolveStackowiakKernel(
            KernelConfig,
            SignalBuffer0,
            SignalBuffer1,
            SignalBuffer2,
            SignalBuffer3,
            /* inout */ UncompressedAccumulators,
            /* inout */ CompressedAccumulators);
    }
    #endif // COMPILE_STACKOWIAK_KERNEL
    #if COMPILE_DISK_KERNEL
    else if (KernelConfig.SampleSet == SAMPLE_SET_HEXAWEB)
    {
        ConvolveDiskKernel(
            KernelConfig,
            SignalBuffer0,
            SignalBuffer1,
            SignalBuffer2,
            SignalBuffer3,
            /* inout */ UncompressedAccumulators,
            /* inout */ CompressedAccumulators);
    }
    #endif // COMPILE_DISK_KERNEL
    #if COMPILE_DIRECTIONAL_KERNEL
    else if (KernelConfig.SampleSet == SAMPLE_SET_DIRECTIONAL_RECT || KernelConfig.SampleSet == SAMPLE_SET_DIRECTIONAL_ELLIPSE)
    {
        ConvolveDirectionalRect(
            KernelConfig,
            SignalBuffer0,
            SignalBuffer1,
            SignalBuffer2,
            SignalBuffer3,
            /* inout */ UncompressedAccumulators,
            /* inout */ CompressedAccumulators);
    }
    #endif // COMPILE_DIRECTIONAL_KERNEL
    #if COMPILE_RAW_EXPERIMENTAL_KERNEL
    else if (KernelConfig.SampleSet == SAMPLE_SET_RAW_EXPERIMENTAL_KERNEL)
    {
        ConvolveRawExperimentalKernel(
            KernelConfig,
SignalBuffer0,
            SignalBuffer1,
            SignalBuffer2,
            SignalBuffer3,
            /* inout */ UncompressedAccumulators,
            /* inout */ CompressedAccumulators);
    }
    #endif
} // AccumulateKernel()