// Copyright Epic Games, Inc. All Rights Reserved.

/*=============================================================================
    PostprocessAmbientOcclusion.usf: To generate ambient occlusion as a postprocess
=============================================================================*/

#include "Common.ush"
#include "ScreenPass.ush"
#include "PostProcessCommon.ush"
#include "DeferredShadingCommon.ush"
#include "Substrate/Substrate.ush"

// set by C++:
//
// 0:very low / 1:low / 2:medium / 3:high / 4:very high (see the SHADER_QUALITY #if chain below)
// SHADER_QUALITY
//
// 0:no / 1:yes
// USE_AO_SETUP_AS_INPUT
//
// 0:no / 1:yes
// USE_UPSAMPLE

#define GTAO_THICKNESS_HEURISTIC 1

// 0: AABB Clipping / 1: Clipping based on first order moment
#define GTAO_VARIANCE_CLIPPING 1

// 0: classic with weighted sample, 1: don't normalize and adjust the formula to be simpler and faster - can look better and is cheaper (Alchemy like?)
#define OPTIMIZATION_O1 1

// 1:lowest quality, 2:medium , 3:high, more doesn't give too much (maybe HZB mip computations should be adjusted)
//#define SAMPLE_STEPS 3

// 0:off / 1:show samples on the right side of the screen
#define DEBUG_LOOKUPS 0

// 0:off / 1:take into account scene normals in the computations
#define USE_NORMALS 1

// useful to remove high frequency dither pattern, not that needed with more sample
// 0:off (fast but dither pattern with low sample count), 1:non normal aware (half res look), 2:normal aware (slower), 3:normal and depth aware (slowest, doesn't add much)
//#define QUAD_MESSAGE_PASSING_BLUR 2

// ambient occlusion
// AO_SAMPLE_QUALITY = 0 : no AO sampling, only upsampling
// AO_SAMPLE_QUALITY = 1 : no dither/per pixel randomization
// AO_SAMPLE_QUALITY = 2 : efficient high frequency 4x4 pattern without jitter for TemporalAA
// AO_SAMPLE_QUALITY = 3 : efficient high frequency 4x4 pattern with jitter for TemporalAA

// SHADER_QUALITY 0-4
// Each quality level selects the sample set (USE_SAMPLESET), the number of steps per
// sample direction (SAMPLE_STEPS) and the quad message passing blur mode.
#if SHADER_QUALITY == 0
    // very low
    #define USE_SAMPLESET 1
    #define SAMPLE_STEPS 1
    #define QUAD_MESSAGE_PASSING_BLUR 0
#elif SHADER_QUALITY == 1
    // low
    #define USE_SAMPLESET 1
    #define SAMPLE_STEPS 1
    #define QUAD_MESSAGE_PASSING_BLUR 2
#elif SHADER_QUALITY == 2
    // medium
    #define USE_SAMPLESET 1
    #define SAMPLE_STEPS 2
    #define QUAD_MESSAGE_PASSING_BLUR 2
#elif SHADER_QUALITY == 3
    // high
    #define USE_SAMPLESET 1
    #define SAMPLE_STEPS 3
    #define QUAD_MESSAGE_PASSING_BLUR 0
#else // SHADER_QUALITY == 4
    // very high
    #define USE_SAMPLESET 3
    #define SAMPLE_STEPS 3
    #define QUAD_MESSAGE_PASSING_BLUR 0
#endif

// Expand the blur mode into the two weights the quad message passing code tests for.
#if QUAD_MESSAGE_PASSING_BLUR == 0
    #define QUAD_MESSAGE_PASSING_NORMAL 0
    #define QUAD_MESSAGE_PASSING_DEPTH 0
#elif QUAD_MESSAGE_PASSING_BLUR == 1
    #define QUAD_MESSAGE_PASSING_NORMAL 0
    #define QUAD_MESSAGE_PASSING_DEPTH 0
#elif QUAD_MESSAGE_PASSING_BLUR == 2
    #define QUAD_MESSAGE_PASSING_NORMAL 1
    #define QUAD_MESSAGE_PASSING_DEPTH 0
#elif QUAD_MESSAGE_PASSING_BLUR == 3
    #define QUAD_MESSAGE_PASSING_NORMAL 1
    #define QUAD_MESSAGE_PASSING_DEPTH 1
#endif

// 0:4 samples, 1:9 samples (only really noticeable with dither usage ??)
//#define AO_UPSAMPLE_QUALITY

// Note: both branches currently select AO_SAMPLE_QUALITY 3; only the setup-as-input
// branch overrides USE_SAMPLESET (forcing it to 3) and raises AO_UPSAMPLE_QUALITY.
#if USE_AO_SETUP_AS_INPUT == 1
    // lower resolution
    #define AO_SAMPLE_QUALITY 3
    #undef USE_SAMPLESET
    #define USE_SAMPLESET 3
    #define AO_UPSAMPLE_QUALITY 1
#else
    // full resolution is expensive, do lower quality
    #define AO_SAMPLE_QUALITY 3
    #define AO_UPSAMPLE_QUALITY 0
#endif

// 0: 1 point (for testing)
// 1: 3 points
// 2: more evenly spread (5 points - slightly faster, stronger effect, better with multiple levels?)
// 3: near the surface very large, softly fading out (6 points)
// OcclusionSamplesOffsets holds the 2D sample directions (scaled by their distance) used by the
// SSAO loop in MainPSandCS; SAMPLESET_ARRAY_SIZE is the matching element count.
#if USE_SAMPLESET == 0
    #define SAMPLESET_ARRAY_SIZE 1
    static const float2 OcclusionSamplesOffsets[SAMPLESET_ARRAY_SIZE]=
    {
        // one sample, for testing
        float2(0.500, 0.500),
    };
#elif USE_SAMPLESET == 1
    #define SAMPLESET_ARRAY_SIZE 3
    static const float2 OcclusionSamplesOffsets[SAMPLESET_ARRAY_SIZE]=
    {
        // 3 points distributed on the unit disc, spiral order and distance
        float2(0, -1.0f) * 0.43f,
        float2(0.58f, 0.814f) * 0.7f,
        float2(-0.58f, 0.814f)
    };
#elif USE_SAMPLESET == 2
    #define SAMPLESET_ARRAY_SIZE 5
    static const float2 OcclusionSamplesOffsets[SAMPLESET_ARRAY_SIZE]=
    {
        // 5 points distributed on a ring
        float2(0.156434, 0.987688),
        float2(0.987688, 0.156434)*0.9,
        float2(0.453990, -0.891007)*0.8,
        float2(-0.707107, -0.707107)*0.7,
        float2(-0.891006, 0.453991)*0.65,
    };
#else // USE_SAMPLESET == 3
    #define SAMPLESET_ARRAY_SIZE 6
    static const float2 OcclusionSamplesOffsets[SAMPLESET_ARRAY_SIZE]=
    {
        // 6 points distributed on the unit disc, spiral order and distance
        float2(0.000, 0.200),
        float2(0.325, 0.101),
        float2(0.272, -0.396),
        float2(-0.385, -0.488),
        float2(-0.711, 0.274),
        float2(0.060, 0.900)
    };
#endif // USE_SAMPLESET

// -----------------------------------------------------------------------------------------------------------------------------

// To be included after defines
#include "PostProcessAmbientOcclusionCommon.ush"

// downsample the input of the ambient occlusion pass for better performance, can take input from setup or another downsample pass
//
// Reads four taps placed +-0.5 * InputExtentInverse around the incoming UV. For every tap it fetches an
// encoded normal (Normal * 0.5 + 0.5) and the scene depth.
// Output:
//   OutColor0.rgb - depth-similarity weighted average of the encoded tap normals (0 when USE_NORMALS is off)
//   OutColor0.a   - largest tap depth divided by Constant_Float16F_Scale
void MainSetupPS(in noperspective float4 UVAndScreenPos : TEXCOORD0, in FStereoPSInput StereoInput, float4 SvPosition : SV_POSITION, out float4 OutColor0 : SV_Target0)
{
    StereoSetupPS(StereoInput);

    float2 ViewPortSize = AOViewport_ViewportSize;
    float2 InUV = UVAndScreenPos.xy;

    // can be optimized
    // Only the taps with a positive offset component are clamped against the upper bilinear UV bound;
    // UV[0] (negative offset on both axes) is used unclamped.
    float2 UV[4];
    UV[0] = InUV + float2(-0.5f, -0.5f) * InputExtentInverse;
    UV[1] = min(InUV + float2( 0.5f, -0.5f) * InputExtentInverse, View.BufferBilinearUVMinMax.zw);
    UV[2] = min(InUV + float2(-0.5f, 0.5f) * InputExtentInverse, View.BufferBilinearUVMinMax.zw);
    UV[3] = min(InUV + float2( 0.5f, 0.5f) * InputExtentInverse, View.BufferBilinearUVMinMax.zw);

    // .rgb: encoded normal, .a: scene depth
    float4 Samples[4];

    UNROLL for(uint i = 0; i < 4; ++i)
    {
#if COMPUTE_SHADER || FORWARD_SHADING
        // Async compute and forward shading don't have access to the gbuffer.
        Samples[i].rgb = normalize(ReconstructNormalFromDepthBuffer(float4(UV[i] * ViewPortSize, SvPosition.zw))) * 0.5f + 0.5f;
#else
    #if SUBTRATE_GBUFFER_FORMAT==1
        // Substrate path: load the top layer texel at the (clamped) pixel coordinate of the tap.
        const FSubstrateTopLayerData TopLayerData = SubstrateUnpackTopLayerData(Substrate.TopLayerTexture.Load(uint3(clamp(UV[i] * View.BufferSizeAndInvSize.xy, 0.0f, View.BufferSizeAndInvSize.xy - 1.0f), 0)));
        Samples[i].rgb = TopLayerData.WorldNormal * 0.5f + 0.5f;
    #else
        Samples[i].rgb = GetGBufferData(UV[i], true).WorldNormal * 0.5f + 0.5f;
    #endif
#endif
        Samples[i].a = CalcSceneDepth(UV[i]);
    }

    float MaxZ = max( max(Samples[0].a, Samples[1].a), max(Samples[2].a, Samples[3].a));

    float4 AvgColor = 0.0f;
    if (USE_NORMALS)
    {
        // small start value keeps the division by AvgColor.w below away from 0
        AvgColor = 0.0001f;

        {
            UNROLL for(uint i = 0; i < 4; ++i)
            {
                // weight each tap by how similar its depth is to the farthest tap
                AvgColor += float4(Samples[i].rgb, 1) * ComputeDepthSimilarity(Samples[i].a, MaxZ, ThresholdInverse);
            }
            AvgColor.rgb /= AvgColor.w;
        }
    }

    OutColor0 = float4(AvgColor.rgb, MaxZ / Constant_Float16F_Scale);
}

// the main pixel shader that computes ambient occlusion
// (shared by the pixel shader entry point MainPS and the compute entry point MainCS)
void MainPSandCS(in float4 UVAndScreenPos, float4 SvPosition, out float4 OutColor)
{
    OutColor = 0;

    // the following constants as set up on C++ side
    float AmbientOcclusionPower = ScreenSpaceAOParams[0].x;
    float Ratio = ScreenSpaceAOParams[1].w;
    float AORadiusInShader = ScreenSpaceAOParams[1].z;
    float InvAmbientOcclusionDistance = ScreenSpaceAOParams[0].z;
    float AmbientOcclusionIntensity = ScreenSpaceAOParams[0].w;
    float2 ViewportUVToRandomUV = ScreenSpaceAOParams[1].xy;
    float AmbientOcclusionBias = ScreenSpaceAOParams[0].y;
    float ScaleFactor = ScreenSpaceAOParams[2].x;
    float ScaleRadiusInWorldSpace = 
ScreenSpaceAOParams[2].z;

    float2 UV = UVAndScreenPos.xy;
    float2 ScreenPos = UVAndScreenPos.zw;

    float InvTanHalfFov = ScreenSpaceAOParams[3].w;
    float3 FovFix = float3(InvTanHalfFov, Ratio * InvTanHalfFov, 1);
    float3 InvFovFix = 1.0f / FovFix;

    float SceneDepth = GetDepthFromAOInput(UV);
    float3 WorldNormal = GetWorldSpaceNormalFromAOInput(UV, SvPosition);

    // can be NaN if WorldNormal=0,0,0 which happens when !USE_NORMALS
    float3 ViewSpaceNormal = normalize(mul(WorldNormal, (float3x3)View.TranslatedWorldToView));

    float3 ViewSpacePosition = ReconstructCSPos(SceneDepth, ScreenPos);

    // ScaleRadiusInWorldSpace blends between a radius proportional to SceneDepth (0) and a fixed radius (1)
    float ActualAORadius = AORadiusInShader * lerp(SceneDepth, 1, ScaleRadiusInWorldSpace);

    // Add bias after fixup (causes minor banding - not needed with larger radius)
    if (USE_NORMALS)
    {
        ViewSpacePosition += AmbientOcclusionBias * SceneDepth * ScaleFactor * (ViewSpaceNormal * FovFix);
    }

    // .x: accumulated (weighted) visibility, .y: accumulated weight; starts non-zero so the final division is safe
    float2 WeightAccumulator = 0.0001f;

    // if the feature is enabled and right side of screen
    const bool bDebugLookups = DEBUG_LOOKUPS && ViewSpacePosition.x > 0;

#if AO_SAMPLE_QUALITY != 0
    // (AO_SAMPLE_QUALITY == 0 would mean: no SSAO in this pass, only upsampling)

    // NOTE: each branch below opens a scope '{' that is closed right before the matching
    // "#endif // #if AO_SAMPLE_QUALITY == 0" further down.
#if AO_SAMPLE_QUALITY == 1
    // no 4x4 randomization
    float2 RandomVec = float2(0, 1) * ActualAORadius;
    {
#elif AO_SAMPLE_QUALITY == 2
    // extract one of 16 base vectors (rotation and scale) from a texture that repeats 4x4
    float2 RandomVec = (Texture2DSample(RandomNormalTexture, RandomNormalTextureSampler, UV * ViewportUVToRandomUV).rg * 2 - 1) * ActualAORadius;
    {
#else // AO_SAMPLE_QUALITY == 3
    // extract one of 16 base vectors (rotation and scale) from a texture that repeats 4x4, changing over time if TemporalAA is enabled

    // jitter each frame a bit to get higher quality over multiple frames (only if TemporalAA is enabled), can cause ghosting effects
    const float2 TemporalOffset = ScreenSpaceAOParams[3].xy;
    float2 RandomVec = (Texture2DSample(RandomNormalTexture, RandomNormalTextureSampler, TemporalOffset + UV * ViewportUVToRandomUV).rg * 2 - 1) * ActualAORadius;
    {
#endif // AO_SAMPLE_QUALITY ==

        if(bDebugLookups && ViewSpacePosition.y > 0)
        {
            // top sample are not per pixel rotated
            RandomVec = float2(0, 1) * ActualAORadius;
        }

        float2 FovFixXY = FovFix.xy * (1.0f / ViewSpacePosition.z);
        // .xy: rotated/scaled X axis, .zw: the perpendicular Y axis, both projected to screen space
        float4 RandomBase = float4(RandomVec, -RandomVec.y, RandomVec.x) * float4(FovFixXY, FovFixXY);
        float2 ScreenSpacePos = ViewSpacePosition.xy / ViewSpacePosition.z;

        // to debug the input depth
        // OutColor = GetDepthForSSAO(ScreenSpacePos, 0); return;

        // to debug the reconstructed normal
        // OutColor = ReconstructedViewSpaceNormal.z; return;

        // .x means for very anisotropic viewports we scale by x
        float InvHaloSize = 1.0f / (ActualAORadius * FovFixXY.x * 2);

        float3 ScaledViewSpaceNormal = ViewSpaceNormal;

#if OPTIMIZATION_O1
        ScaledViewSpaceNormal *= 0.08f * lerp(SceneDepth, 1000, ScaleRadiusInWorldSpace);
#endif

        // one iteration per sample direction of the selected sample set
        UNROLL for(int i = 0; i < SAMPLESET_ARRAY_SIZE; ++i)
        {
            // -1..1
            float2 UnrotatedRandom = OcclusionSamplesOffsets[i].xy;

            // rotate the sample direction into the per pixel random basis
            float2 LocalRandom = (UnrotatedRandom.x * RandomBase.xy + UnrotatedRandom.y * RandomBase.zw);

            if (bDebugLookups)
            {
                // debug view: accumulate a mask for both mirrored sample positions of every step
                UNROLL for(uint step = 0; step < SAMPLE_STEPS; ++step)
                {
                    float Scale = (step + 1) / (float)SAMPLE_STEPS;
                    float MipLevel = ComputeMipLevel(i, step);

                    float2 ScaledLocalRandom = Scale * LocalRandom;

                    WeightAccumulator += float2(ComputeSampleDebugMask(ScreenSpacePos + ScaledLocalRandom, MipLevel), 1.0f);
                    WeightAccumulator += float2(ComputeSampleDebugMask(ScreenSpacePos - ScaledLocalRandom, MipLevel), 1.0f);
                }
            }
            else if (USE_NORMALS)
            {
                // .xy: running maximum of the two per-step sample values, .z: accumulated weight
                float3 LocalAccumulator = 0;

                UNROLL for(uint step = 0; step < SAMPLE_STEPS; ++step)
                {
                    // constant at run time
                    float Scale = (step + 1) / (float)SAMPLE_STEPS;
                    // constant at run time (higher is better for texture cache / performance, lower is better quality
                    float MipLevel = ComputeMipLevel(i, step);

                    float3 StepSample = WedgeWithNormal(ScreenSpacePos, Scale * LocalRandom, InvFovFix, ViewSpacePosition, ScaledViewSpaceNormal, InvHaloSize, MipLevel);

                    // combine horizon samples
                    LocalAccumulator = lerp(LocalAccumulator, float3(max(LocalAccumulator.xy, StepSample.xy), 1), StepSample.z);
                }

                // Square(): the area scales quadratic with the angle - it gets a bit darker
                WeightAccumulator += float2(Square(1 - LocalAccumulator.x) * LocalAccumulator.z, LocalAccumulator.z);
                WeightAccumulator += float2(Square(1 - LocalAccumulator.y) * LocalAccumulator.z, LocalAccumulator.z);
                // cheaper? Could move 1 - out
                // WeightAccumulator += float2(1 - LocalAccumulator.x, LocalAccumulator.y);
            }
            else // Case with no normals
            {
                // .x: running maximum of the per-step sample value, .y: accumulated weight
                float2 LocalAccumulator = 0;

                UNROLL for(uint step = 0; step < SAMPLE_STEPS; ++step)
                {
                    // constant at run time
                    float Scale = (step + 1) / (float)SAMPLE_STEPS;
                    // constant at run time (higher is better for texture cache / performance, lower is better quality
                    float MipLevel = ComputeMipLevel(i, step);

                    float2 StepSample = WedgeNoNormal(ScreenSpacePos, Scale * LocalRandom, InvFovFix, ViewSpacePosition, InvHaloSize, MipLevel);

                    // combine horizon samples
                    LocalAccumulator = lerp(LocalAccumulator, float2(max(LocalAccumulator.x, StepSample.x), 1), StepSample.y);
                }

                // Square(): the area scales quadratic with the angle - it gets a bit darker
                WeightAccumulator += float2(Square(1 - LocalAccumulator.x) * LocalAccumulator.y, LocalAccumulator.y);
            }
        }
    }

#endif // #if AO_SAMPLE_QUALITY == 0

    // normalize: accumulated visibility / accumulated weight
    OutColor.r = WeightAccumulator.x / WeightAccumulator.y;
    OutColor.gb = float2(0, 0);

    if(!bDebugLookups)
    {
#if COMPUTE_SHADER || FORWARD_SHADING
        // In compute, Input1 and Input2 are not necessarily valid.
        float4 Filtered = 1;
#else
        float4 Filtered = ComputeUpsampleContribution(SceneDepth, UV, WorldNormal);
#endif
        // recombined result from multiple resolutions
        OutColor.r = lerp(OutColor.r, Filtered.r, ComputeLerpFactor());
    }

#if !USE_AO_SETUP_AS_INPUT
    if(!bDebugLookups)
    {
        // full res

        // soft fade out AO in the distance
        {
            float Mul = ScreenSpaceAOParams[4].x;
            float Add = ScreenSpaceAOParams[4].y;
            OutColor.r = lerp(OutColor.r, 1, saturate(SceneDepth * Mul + Add));
        }

        // user adjust AO
        // abs() to prevent shader warning
        OutColor.r = 1 - (1 - pow(abs(OutColor.r), AmbientOcclusionPower)) * AmbientOcclusionIntensity;

        // we output in a single alpha channel
        OutColor = OutColor.r;
    }
    else
    {
        OutColor.r = pow(1 - OutColor.r, 16); // constant is tweaked with radius and sample count
    }
#endif

    // we don't support ddx_fine() for SM4
#if !COMPUTE_SHADER && QUAD_MESSAGE_PASSING_BLUR > 0 && FEATURE_LEVEL >= FEATURE_LEVEL_SM5
    {
        // Quad message passing blur: blend this pixel's AO with values derived from the fine
        // screen-space derivatives (presumably the horizontal and vertical neighbours of the 2x2 quad).

        // .x: AO output, .y:SceneDepth .zw:view space normal
        float4 CenterPixel = float4(OutColor.r, SceneDepth, normalize(ViewSpaceNormal).xy);

        float4 dX = ddx_fine(CenterPixel);
        float4 dY = ddy_fine(CenterPixel);

        // 0 or 1 per axis: position of this pixel inside its 2x2 block
        int2 Mod = (uint2)(SvPosition.xy) % 2;

        float4 PixA = CenterPixel;
        float4 PixB = CenterPixel - dX * (Mod.x * 2 - 1);
        float4 PixC = CenterPixel - dY * (Mod.y * 2 - 1);

        float WeightA = 1.0f;
        float WeightB = 1.0f;
        float WeightC = 1.0f;

#if QUAD_MESSAGE_PASSING_NORMAL
        // reduce the neighbour weights where the normals differ
        const float NormalTweak = 4.0f;
        float3 NormalA = ReconstructNormal(PixA.zw);
        float3 NormalB = ReconstructNormal(PixB.zw);
        float3 NormalC = ReconstructNormal(PixC.zw);
        WeightB *= saturate(pow(saturate(dot(NormalA, NormalB)), NormalTweak));
        WeightC *= saturate(pow(saturate(dot(NormalA, NormalC)), NormalTweak));
#endif

#if QUAD_MESSAGE_PASSING_DEPTH
        // reduce the neighbour weights where the relative depth differs
        const float DepthTweak = 1;
        float InvDepth = 1.0f / PixA.y;
        WeightB *= 1 - saturate(abs(1 - PixB.y * InvDepth) * DepthTweak);
        WeightC *= 1 - saturate(abs(1 - PixC.y * InvDepth) * DepthTweak);
#endif

        // + 1.0f to avoid div by 0
        // (WeightA is always 1.0f, so the sum can never be 0)
        float InvWeightABC = 1.0f / (WeightA + WeightB + WeightC);

        WeightA *= InvWeightABC;
        WeightB *= InvWeightABC;
        WeightC *= InvWeightABC;

        OutColor = WeightA * PixA.x + WeightB * PixB.x + WeightC * PixC.x;
        // visualize where we don't want to fade
        // OutColor = (WeightA - 0.333f) / 0.666f;
    }
#endif
}

// Pixel shader entry point; forwards to the shared implementation. StereoInput is not used here.
void MainPS(in noperspective float4 UVAndScreenPos : TEXCOORD0, in FStereoPSInput StereoInput, float4 SvPosition : SV_POSITION, out float4 OutColor : SV_Target0)
{
    MainPSandCS(UVAndScreenPos, SvPosition, OutColor);
}

#if COMPUTE_SHADER

/** Output target. In compute, this is a single value buffer. */
// NOTE(review): the element type of this RWTexture2D is not visible in this view (it may have been
// lost when the text was extracted) - confirm against the original file.
RWTexture2D OutTexture;

// Compute shader entry point: one thread per output pixel. Rebuilds SvPosition / BufferUV for the
// pixel, skips the AO evaluation where the distance fade has fully faded the effect out, and writes
// the red channel of the result to OutTexture.
[numthreads(THREADGROUP_SIZEX, THREADGROUP_SIZEY, 1)]
void MainCS(
    uint2 GroupId : SV_GroupID,
    uint2 DispatchThreadId : SV_DispatchThreadID,
    uint2 GroupThreadId : SV_GroupThreadID)
{
    float ScaleFactor = ScreenSpaceAOParams[2].x;

    int2 PixelPos = DispatchThreadId + AOViewport_ViewportMin;
    float2 PixelCenter = (float2)PixelPos + float2(0.5, 0.5);

    // todo: move to a function
    float4 SvPosition = float4(PixelCenter, 0, 0) * ScaleFactor;
    float2 BufferUV = SvPositionToBufferUV(SvPosition);

    SvPosition.z = LookupDeviceZ(BufferUV);

    // todo: investigate
    // SvPosition.w = ConvertFromDeviceZ(SvPosition.z);
    SvPosition.w = 1;

    // default: unoccluded
    float4 OutColor = 1;

    // Test for early exit with out of depth bound.
    // (same Mul/Add fade terms MainPSandCS uses for its distance fade; >= 1 means fully faded)
    float SceneDepth = ConvertFromDeviceZ(SvPosition.z);
    float FadeMul = ScreenSpaceAOParams[4].x;
    float FadeAdd = ScreenSpaceAOParams[4].y;

    BRANCH
    if (SceneDepth * FadeMul + FadeAdd < 1)
    {
        MainPSandCS(float4(BufferUV, SvPositionToScreenPosition(SvPosition).xy), SvPosition, OutColor);
    }

    // Here we could optimize for coalescing writes but that might not be the performance bottleneck.
    // We should rather optimize for best texture cache performance.
    // http://on-demand.gputechconf.com/gtc/2010/presentations/S12312-DirectCompute-Pre-Conference-Tutorial.pdf
    OutTexture[PixelPos] = OutColor.r;
}

SCREEN_PASS_TEXTURE_VIEWPORT(SSAOSmoothOutputViewport)
FScreenTransform SSAOSmoothOutputToInput;

Texture2D SSAOSmoothInputTexture;
SamplerState SSAOSmoothInputSampler;

// NOTE(review): element type not visible in this view (possibly lost in extraction) - confirm.
RWTexture2D SSAOSmoothOutputTexture;

// Smoothing pass over the SSAO result: one thread per output pixel, threads outside the output
// viewport exit early. Averages four taps of the input taken at texel offsets (0,0), (2,0), (0,2), (2,2).
[numthreads(THREADGROUP_SIZEX, THREADGROUP_SIZEY, 1)]
void MainSSAOSmoothCS(
    uint2 GroupId : SV_GroupID,
    uint2 DispatchThreadId : SV_DispatchThreadID,
    uint2 GroupThreadId : SV_GroupThreadID)
{
    BRANCH
    if (any(DispatchThreadId >= (uint2)SSAOSmoothOutputViewport_ViewportSize))
    {
        return;
    }

    uint2 DestPixelPos = SSAOSmoothOutputViewport_ViewportMin + DispatchThreadId;
    // no half pixel offset is added here, so the UV addresses the pixel's corner rather than its centre
    float2 DestUV = DestPixelPos * SSAOSmoothOutputViewport_ExtentInverse;
    float2 SampleUV = ApplyScreenTransform(DestUV, SSAOSmoothOutputToInput);

    // Use a 4x4 box filter because the random texture is tiled 4x4
    // (presumably each tap is a bilinear fetch covering 2x2 texels - the sampler state is bound
    // outside this file, TODO confirm)
    float Result;
    Result = SSAOSmoothInputTexture.SampleLevel(SSAOSmoothInputSampler, SampleUV, 0).r;
    Result += SSAOSmoothInputTexture.SampleLevel(SSAOSmoothInputSampler, SampleUV, 0, int2(2, 0)).r;
    Result += SSAOSmoothInputTexture.SampleLevel(SSAOSmoothInputSampler, SampleUV, 0, int2(0, 2)).r;
    Result += SSAOSmoothInputTexture.SampleLevel(SSAOSmoothInputSampler, SampleUV, 0, int2(2, 2)).r;

    SSAOSmoothOutputTexture[DestPixelPos] = Result * 0.25;
}

#endif

// GTAO quality presets: number of taps per search direction, HZB mip bias and the maximum
// search radius in pixels.
#if SHADER_QUALITY == 0
    // very low
    #define GTAO_NUMTAPS 4
    #define GTAO_BIASMIPLEVEL 2
    #define GTAO_MAX_PIXEL_SCREEN_RADIUS 256.0f
#elif SHADER_QUALITY == 1
    // low
    #define GTAO_NUMTAPS 6
    #define GTAO_BIASMIPLEVEL 1
    #define GTAO_MAX_PIXEL_SCREEN_RADIUS 256.0f
#elif SHADER_QUALITY == 2
    // medium
    #define GTAO_NUMTAPS 8
    #define GTAO_BIASMIPLEVEL 0
    #define GTAO_MAX_PIXEL_SCREEN_RADIUS 256.0f
#elif SHADER_QUALITY == 3
    // high
    #define GTAO_NUMTAPS 12
    #define GTAO_BIASMIPLEVEL 0
    #define GTAO_MAX_PIXEL_SCREEN_RADIUS 256.0f
#else // SHADER_QUALITY == 4
    // very high
    #define GTAO_NUMTAPS 20
    #define GTAO_BIASMIPLEVEL 0
    #define GTAO_MAX_PIXEL_SCREEN_RADIUS 256.0f
#endif

float4 GTAOParams[5];
// [0] - { cos(TemporalAngle), sin(TemporalAngle), TemporalOffset, FrameTemporalOffset}
// [1] - { FrameNumber, Thicknessblend, unused, unused}
// [2] - { TargetSizeX, TargetSizeY, 1.0/TargetSizeX, 1.0f/TargetSizeY}
// [3] - { FallOffStart, FallOffEnd, FalloffScale, FalloffBias}
// [4] - { Temporal Blend Weight, Angles, SinDeltaAngle, CosDeltaAngle}

#define PI_HALF (PI*0.5)

#if COMPUTE_SHADER
// NOTE(review): element types of these RWTexture2D declarations are not visible in this view
// (possibly lost in extraction) - confirm against the original file.
RWTexture2D HorizonOutTexture;
RWTexture2D DepthOutTexture;
RWTexture2D VelocityOutTexture;
RWTexture2D DepthsTexture;
#endif

Texture2D HistoryTexture;
SamplerState HistoryTextureSampler;
float2 HistoryTextureSize;
float2 HistoryTexturePixelSize;

Texture2D ZCurrTexture;
SamplerState ZCurrTextureSampler;

float4 PrevScreenPositionScaleBias;

// Clamps Scale to the range [2, 8].
float ClampScale(float Scale)
{
    return clamp(Scale, 2.0, 8.0);
}

// Returns the normalized view space normal at UV.
// With USE_NORMALBUFFER the world normal is read from the Substrate top layer / GBuffer and rotated
// into view space; otherwise it is derived from the depth buffer around UV, using ViewSpacePosMid
// as the view space position of the centre pixel.
float3 GetNormal(float2 UV, float3 ViewSpacePosMid)
{
    float3 ViewSpaceNormal;

#if USE_NORMALBUFFER

    #if SUBTRATE_GBUFFER_FORMAT==1
    const FSubstrateTopLayerData TopLayerData = SubstrateUnpackTopLayerData(Substrate.TopLayerTexture.Load(uint3(clamp(UV * View.BufferSizeAndInvSize.xy, 0.0f, View.BufferSizeAndInvSize.xy-1.0f), 0)));
    float3 WorldNormal = TopLayerData.WorldNormal;
    #else
    // Get the normal from the normal buffer
    float3 WorldNormal = GetGBufferData(UV, false).WorldNormal;
    #endif
    ViewSpaceNormal = normalize(mul(WorldNormal, (float3x3)View.TranslatedWorldToView));

#else
    // Get the normal derived from the depth buffer
    // (centre depth plus its four direct neighbours, one texel apart)
    float2 DeltaUV = View.BufferSizeAndInvSize.zw;
    float DeviceZ = Texture2DSampleLevel(SceneTexturesStruct.SceneDepthTexture, SceneTexturesStruct_SceneDepthTextureSampler, UV,0).r;
    float DeviceZLeft = Texture2DSampleLevel(SceneTexturesStruct.SceneDepthTexture, SceneTexturesStruct_SceneDepthTextureSampler, UV + float2(-DeltaUV.x, 0.0f),0).r;
    float DeviceZTop = Texture2DSampleLevel(SceneTexturesStruct.SceneDepthTexture, SceneTexturesStruct_SceneDepthTextureSampler, UV + float2( 0.0f , -DeltaUV.y),0).r;
    float DeviceZRight = Texture2DSampleLevel(SceneTexturesStruct.SceneDepthTexture, SceneTexturesStruct_SceneDepthTextureSampler, UV + float2( DeltaUV.x, 0.0f),0).r;
    float DeviceZBottom = Texture2DSampleLevel(SceneTexturesStruct.SceneDepthTexture, SceneTexturesStruct_SceneDepthTextureSampler, UV + float2( 0.0f , DeltaUV.y),0).r;

    // per axis depth delta chosen by TakeSmallerAbsDelta (defined outside this file; by its name it
    // picks the delta with the smaller magnitude - TODO confirm)
    float DeviceZDdx = TakeSmallerAbsDelta(DeviceZLeft, DeviceZ, DeviceZRight);
    float DeviceZDdy = TakeSmallerAbsDelta(DeviceZTop, DeviceZ, DeviceZBottom);

    float ZRight = ConvertFromDeviceZ(DeviceZ + DeviceZDdx);
    float ZDown = ConvertFromDeviceZ(DeviceZ + DeviceZDdy);

    // view space edges towards the right and lower neighbour; their cross product is the normal
    float3 Right = ScreenToViewPos(UV+ float2( DeltaUV.x, 0.0f) , ZRight)-ViewSpacePosMid;
    float3 Down = ScreenToViewPos(UV+ float2( 0.0f, DeltaUV.y) , ZDown) -ViewSpacePosMid;

    ViewSpaceNormal = normalize(cross(Right, Down));
#endif

    return ViewSpaceNormal;
}

// Samples the scene depth texture at ScreenUV and returns
// 1 / (DeviceZ * InvDeviceZToWorldZTransform[2] - InvDeviceZToWorldZTransform[3]).
float GetLinearDepthProj(float2 ScreenUV)
{
    float DeviceZ = Texture2DSampleLevel(SceneTexturesStruct.SceneDepthTexture, SceneTexturesStruct_SceneDepthTextureSampler, ScreenUV, 0).r;
    return 1.0f / (DeviceZ * View.InvDeviceZToWorldZTransform[2] - View.InvDeviceZToWorldZTransform[3]);
}

// Horizon search in both directions (+ScreenDir / -ScreenDir) around BaseUV.
// Returns the two horizon angles (acosFast of the best cosine found per direction).
// NOTE(review): the text between the loop header "for(uint i=0; i" and "BestAng.x ) ? Ang : ..." is
// missing in this view (the loop condition, the UV2 / SceneDepths setup and most of the positive
// direction appear to have been lost in extraction). The damaged line is kept verbatim below;
// restore it from the original file.
float2 SearchForLargestAngleDual(uint NumSteps, float2 BaseUV, float2 ScreenDir, float SearchRadius, float InitialOffset, float3 ViewPos, float3 ViewDir,float AttenFactor)
{
    float SceneDepth, LenSq, OOLen, Ang, FallOff;
    float3 V;
    float2 SceneDepths =0;

    // best (largest) cosine found so far per direction, -1 = nothing found
    float2 BestAng = float2(-1,-1);
    float Thickness = GTAOParams[1].y;

    for(uint i=0; i BestAng.x ) ? Ang : lerp( Ang, BestAng.x, Thickness );

        // Negative Direction
        V = ScreenToViewPos(UV2.zw, SceneDepths.y) - ViewPos;
        LenSq = dot(V,V);
        OOLen = rsqrt(LenSq + 0.0001);
        // cosine between the direction to the sample and the view direction
        Ang = dot(V,ViewDir) * OOLen;

        // distance falloff: samples further away fall back to the current best value
        FallOff = saturate(LenSq * AttenFactor);
        Ang = lerp(Ang, BestAng.y, FallOff);
        // thickness heuristic: a lower sample pulls the best value down by (1 - Thickness)
        BestAng.y = ( Ang > BestAng.y ) ? 
Ang : lerp( Ang, BestAng.y, Thickness );
    }

    // convert the best cosines into angles (clamped into acos' valid input range)
    BestAng.x = acosFast(clamp(BestAng.x, -1.0, 1.0));
    BestAng.y = acosFast(clamp(BestAng.y, -1.0, 1.0));

    return BestAng;
}

// HZB variant of the dual horizon search: depths are fetched with GetHZBDepth at a mip level that
// grows with the step index, and samples whose distance falloff reaches 1 are skipped.
// Returns the two horizon angles (acosFast of the best cosine found per direction).
// NOTE(review): the text between the loop header "for(uint i=1; i" and "3) MipLevel+=2;" is missing
// in this view (the loop condition, the UV2 setup and the start of the mip selection appear to have
// been lost in extraction). The damaged line is kept verbatim below; restore it from the original file.
float2 SearchForLargestAngleDual_HZB(uint NumSteps, float2 BaseUV, float2 ScreenDir, float SearchRadius, float InitialOffset, float3 ViewPos, float3 ViewDir, float AttenFactor)
{
    float SceneDepth, LenSq, OOLen, Ang, FallOff;
    float3 V;
    float2 SceneDepths =0;
    float MipLevel = 0 ;

    // best (largest) cosine found so far per direction, -1 = nothing found
    float2 BestAng = float2(-1,-1);
    float Thickness = GTAOParams[1].y;

    UNROLL
    for(uint i=1; i3) MipLevel+=2;

            SceneDepths.x = GetHZBDepth(UV2.xy,MipLevel);
            SceneDepths.y = GetHZBDepth(UV2.zw,MipLevel);
        }

        // Positive Direction
        V = ScreenToViewPos(UV2.xy, SceneDepths.x) - ViewPos;
        LenSq = dot(V,V);
        OOLen = rsqrtFast(LenSq + 0.0001);
        FallOff = saturate(LenSq * AttenFactor);
        // a falloff of 1 would lerp fully back to the current best value, so the sample can be skipped
        if(FallOff < 1.0)
        {
            // cosine between the direction to the sample and the view direction
            Ang = dot(V,ViewDir) * OOLen;
            Ang = lerp(Ang, BestAng.x, FallOff);
            // thickness heuristic: a lower sample pulls the best value down by (1 - Thickness)
            BestAng.x = ( Ang > BestAng.x ) ? Ang : lerp( Ang, BestAng.x, Thickness );
        }

        // Negative Direction
        V = ScreenToViewPos(UV2.zw, SceneDepths.y) - ViewPos;
        LenSq = dot(V,V);
        OOLen = rsqrtFast(LenSq + 0.0001);
        FallOff = saturate(LenSq * AttenFactor);
        if(FallOff < 1.0)
        {
            Ang = dot(V,ViewDir) * OOLen;
            Ang = lerp(Ang, BestAng.y, FallOff);
            BestAng.y = ( Ang > BestAng.y ) ? 
Ang : lerp( Ang, BestAng.y, Thickness );
        }
    }

    BestAng.x = acosFast(clamp(BestAng.x, -1.0, 1.0));
    BestAng.y = acosFast(clamp(BestAng.y, -1.0, 1.0));

    return BestAng;
}

// Inner integral of the horizon based AO for one search slice.
//   Angles          - the two horizon angles found by the search (.x: first direction, .y: opposite one)
//   ScreenDir       - 2D search direction of the slice
//   ViewDir         - view space direction used together with ScreenDir to span the slice plane
//   ViewSpaceNormal - view space surface normal, projected onto the slice plane here
// UV and SceneDepth are accepted but not used by this function.
// Returns the visibility term of the slice (not yet multiplied by the 2/PI factor the callers apply).
float ComputeInnerIntegral(float2 UV, float2 Angles, float2 ScreenDir, float3 ViewDir, float3 ViewSpaceNormal, float SceneDepth)
{
    // Build the slice plane from the search axis and the view direction, then drop the part of the
    // normal that points out of that plane.
    const float3 SliceNormal = normalize(cross(float3(ScreenDir.xy,0) ,ViewDir));
    const float3 SliceTangent = cross(ViewDir, SliceNormal);
    const float3 NormalInSlice = ViewSpaceNormal - SliceNormal * dot(ViewSpaceNormal, SliceNormal);

    // the small epsilon keeps the reciprocal finite when the normal is perpendicular to the slice
    const float NormalInSliceLen = length(NormalInSlice) + 0.000001f;
    const float InvNormalInSliceLen = 1.0f / (NormalInSliceLen);

    // Gamma: signed angle of the projected normal relative to the view direction inside the slice
    const float CosTangent = dot(NormalInSlice, SliceTangent) * InvNormalInSliceLen;
    const float Gamma = acosFast(CosTangent) - PI_HALF;
    const float CosGamma = dot(NormalInSlice, ViewDir) * InvNormalInSliceLen;
    // sin(acos(c) - PI/2) == -c, so this is 2 * sin(Gamma)
    const float TwoSinGamma = CosTangent * -2.0f;

    // clamp both horizons to the hemisphere around the projected normal
    const float HorizonA = Gamma + max(-Angles.x - Gamma, -(PI_HALF) );
    const float HorizonB = Gamma + min( Angles.y - Gamma, (PI_HALF) );

    // per-horizon arc contribution
    const float ArcA = (HorizonA * TwoSinGamma + CosGamma - cos((2.0 * HorizonA) - Gamma));
    const float ArcB = (HorizonB * TwoSinGamma + CosGamma - cos((2.0 * HorizonB) - Gamma));

    return ( (NormalInSliceLen) * 0.25 * ( ArcA + ArcB ));
}

// Hash of a 2D position into [0, 1): frac(52.9829189 * frac(x * 0.06711056 + y * 0.00583715)).
float InterleavedGradientNoise( float2 iPos )
{
    const float PlaneHash = frac( (iPos.x * 0.06711056) + (iPos.y*0.00583715));
    return frac( 52.9829189f * PlaneHash );
}

// Per pixel random values for a pixel position:
//   .x - interleaved gradient noise of the position (with y mirrored as 4096 - y)
//   .y - step offset, one of 0, 0.25, 0.5, 0.75, taken from the low two bits of (y - x)
float2 GetRandomAngleOffset(uint2 iPos )
{
    const uint2 MirroredPos = uint2(iPos.x, 4096-iPos.y);

    const float Noise = InterleavedGradientNoise(float2(MirroredPos));
    const float StepOffset = (1.0/4.0) * (( MirroredPos.y - MirroredPos.x)&3);

    return float2(Noise, StepOffset);
}

// Per pixel random search direction (.xy) and step offset (.z) for the GTAO passes.
float3 GetRandomVector(uint2 iPos )
{
    iPos.y = 16384-iPos.y;

    float3 RandomVec = float3(0,0,0);
    float3 RandomTexVec = float3(0,0,0);
    float ScaleOffset;

    float TemporalCos = GTAOParams[0].x;
    float TemporalSin = GTAOParams[0].y;

    float GradientNoise = InterleavedGradientNoise(float2(iPos));

    // direction on the half circle selected by the noise value
    RandomTexVec.x = cos((GradientNoise*PI) );
    RandomTexVec.y = sin((GradientNoise*PI) );

    ScaleOffset = (1.0/4.0) * (( iPos.y - iPos.x) & 3);
//  ScaleOffset = (1.0/5.0) * (( iPos.y - iPos.x) % 5);

    // rotate the per pixel direction by the temporal angle (cos / sin supplied in GTAOParams[0].xy)
    RandomVec.x = dot(RandomTexVec.xy, float2(TemporalCos, -TemporalSin ));
    RandomVec.y = dot(RandomTexVec.xy, float2(TemporalSin, TemporalCos ));
    // step offset in [0, 1): spatial offset plus the temporal offset, wrapped
    RandomVec.z = frac(ScaleOffset + GTAOParams[0].z);

    return RandomVec;
}

/*
*
* HORIZON SEARCH AND INNER INTEGRATE COMBINED
*
*/
// Single pass GTAO: runs the HZB horizon search and the inner integral for NumAngles search
// directions and averages the result.
//   UV       - buffer UV of the pixel
//   iPos     - integer pixel position, used to pick the per pixel random direction and offset
//   OutColor - resulting AO value; 1 when the pixel is further away than ScreenSpaceAOParams[4].w
void GTAOCombinedPSandCS(in float2 UV, in uint2 iPos, out float OutColor)
{
    OutColor = 0;

    // Offset by a fraction of a pixel to ensure we don't hit between pixels when running at half res
    float2 QuarterOffset = AOSceneViewport_ExtentInverse * 0.125;
    float2 TexUV = UV + QuarterOffset;
    UV += QuarterOffset;

    float DeviceZ = LookupDeviceZ(TexUV );
    float SceneDepth = ConvertFromDeviceZ(DeviceZ);

    // early out beyond the depth limit: no occlusion
    if(SceneDepth > ScreenSpaceAOParams[4].w)
    {
        OutColor = 1;
        return;
    }

    float3 ViewSpacePos = ScreenToViewPos(TexUV,SceneDepth);
    float3 ViewSpaceNormal = GetNormal(TexUV, ViewSpacePos);
    float3 ViewDir = normalize(-ViewSpacePos.xyz);

    const float WorldRadius = GTAOParams[3].y;

    float InvTanHalfFov = ScreenSpaceAOParams[3].w;
    float FOVScale = AOSceneViewport_Extent.y * InvTanHalfFov; // TODO

    // Get Radius in ScreenSpace (in pixels)
    // (limited to GTAO_MAX_PIXEL_SCREEN_RADIUS, but never below one pixel per tap)
    float WorldRadiusAdj = WorldRadius * FOVScale;
    float PixelRadius = max( min( WorldRadiusAdj / ViewSpacePos.z, GTAO_MAX_PIXEL_SCREEN_RADIUS ), (float) GTAO_NUMTAPS );
    float StepRadius = PixelRadius / ( (float) GTAO_NUMTAPS + 1 );
    float AttenFactor = 2.0 / (WorldRadius * WorldRadius);

    // Get the randomized Direction to sample and the step offset
    float3 RandomAndOffset = GetRandomVector(iPos);
    float2 RandomVec = RandomAndOffset.xy;
    float Offset = RandomAndOffset.z;

    float Sum=0.0;

    uint NumAngles = (uint) GTAOParams[4].y;
    float SinDeltaAngle = GTAOParams[4].z;
    float CosDeltaAngle = GTAOParams[4].w;

    float2 ScreenDir = float2(RandomVec.x, RandomVec.y);

    for(uint Angle =0; Angle < NumAngles; Angle++)
    {
        // the search works in UV space, so the direction is scaled by the inverse buffer size
        float2 Angles = SearchForLargestAngleDual_HZB(GTAO_NUMTAPS, TexUV, ScreenDir* View.BufferSizeAndInvSize.zw, StepRadius, Offset, ViewSpacePos, ViewDir, AttenFactor);

        Sum += ComputeInnerIntegral(TexUV, Angles, ScreenDir, ViewDir, ViewSpaceNormal, SceneDepth);

        // Rotate for the next angle
        float2 TempScreenDir = ScreenDir.xy;
        ScreenDir.x = (TempScreenDir.x * CosDeltaAngle) + (TempScreenDir.y * -SinDeltaAngle);
        ScreenDir.y = (TempScreenDir.x * SinDeltaAngle) + (TempScreenDir.y * CosDeltaAngle);
        // use a different step offset for every direction
        Offset = frac(Offset + 0.617);
    }

    float AO = Sum;

    // average over the directions
    // NOTE(review): divides by NumAngles - assumes GTAOParams[4].y is at least 1, confirm on the C++ side
    AO = AO / ((float)NumAngles);
    AO *= 2.0/PI;

    // Fade out based on user defined distance
    float Mul = ScreenSpaceAOParams[4].x;
    float Add = ScreenSpaceAOParams[4].y;
    AO = lerp(AO, 1, saturate(SceneDepth * Mul + Add));

    OutColor = AO ;
    return;
}

// Pixel shader entry point of the combined GTAO pass; derives the integer pixel position from the UV.
void GTAOCombinedPS(in float4 UVAndScreenPos : TEXCOORD0, out float OutColor : SV_Target0)
{
    int2 iPos = int2( UVAndScreenPos.xy * AOViewport_Extent );
    GTAOCombinedPSandCS(UVAndScreenPos.xy, iPos, OutColor);
}

#if COMPUTE_SHADER

// Compute shader entry point of the combined GTAO pass: one thread per pixel, result written to OutTexture.
[numthreads(THREADGROUP_SIZEX, THREADGROUP_SIZEY, 1)]
void GTAOCombinedCS(
    uint2 GroupId : SV_GroupID,
    uint2 DispatchThreadId : SV_DispatchThreadID,
    uint2 GroupThreadId : SV_GroupThreadID)
{
    float OutColor = 0;

    int2 PixelPos = DispatchThreadId + AOViewport_ViewportMin;
    float2 PixelCenter = (float2)PixelPos + float2(0.5, 0.5);
    // GTAOParams[2].zw holds 1 / target size, turning the pixel centre into a UV
    float2 BufferUV = PixelCenter.xy * GTAOParams[2].zw;

    GTAOCombinedPSandCS(BufferUV, PixelPos, OutColor);

    OutTexture[PixelPos] = OutColor;
}
#endif

/*
*
* INNER INTEGRATE
*
*/
Texture2D HorizonsTexture;
SamplerState HorizonsTextureSampler;

// Second half of the two pass GTAO: reads the horizon angles written by the horizon search pass
// and evaluates the inner integral for them. Returns the AO value (1 beyond the depth limit).
float GTAOInnerIntegratePSandCS(in float2 UV, in uint2 iPos)
{
    float2 QuarterOffset = AOSceneViewport_ExtentInverse * 0.125;
    UV += QuarterOffset;

    // Read the angles buffer
    float SceneDepth = GetDepthFromAOInput(UV);
    if(SceneDepth > ScreenSpaceAOParams[4].w)
    {
        return 1;
    }

    float4 Angles = Texture2DSample(HorizonsTexture, HorizonsTextureSampler, UV);

    // Angles computed from previous pass
    // (stored divided by PI by the horizon search pass)
    Angles = Angles * PI;

    // Get Angle
    float2 RandomVec = GetRandomVector(iPos).xy;
    float2 ScreenDir = 
float2(RandomVec.x, RandomVec.y);

    // ViewspacePos and Normal
    float3 ViewSpacePos = ScreenToViewPos(UV, SceneDepth);

#if SUBTRATE_GBUFFER_FORMAT==1
    // NOTE(review): this loads the top layer at iPos, while GetNormal() addresses the same texture with
    // UV * View.BufferSizeAndInvSize.xy - confirm both agree when the AO pass runs at reduced resolution.
    const FSubstrateTopLayerData TopLayerData = SubstrateUnpackTopLayerData(Substrate.TopLayerTexture.Load(uint3(iPos, 0)));
    float3 WorldNormal = TopLayerData.WorldNormal;
#else
    float3 WorldNormal = GetGBufferData(UV, false).WorldNormal;
#endif
    float3 ViewSpaceNormal = normalize(mul(WorldNormal, (float3x3)View.TranslatedWorldToView));
    float3 ViewDir = -normalize(ViewSpacePos.xyz);

    // TODO - This is a function of UV only.
    float AO = ComputeInnerIntegral(UV, Angles.xy, ScreenDir, ViewDir, ViewSpaceNormal, SceneDepth);

    uint NumAngles = (uint) GTAOParams[4].y;
    if(NumAngles>1)
    {
        // second direction: the first one rotated by 90 degrees; average both slices
        ScreenDir.xy = float2(-ScreenDir.y, ScreenDir.x);
        AO += ComputeInnerIntegral(UV, Angles.zw, ScreenDir, ViewDir, ViewSpaceNormal, SceneDepth);
        AO *=0.5;
    }

    AO *= 2.0/PI;

    // Fade out based on user defined distance
    float Mul = ScreenSpaceAOParams[4].x;
    float Add = ScreenSpaceAOParams[4].y;
    AO = lerp(AO, 1, saturate(SceneDepth * Mul + Add));

    return AO ;
}

// Pixel shader entry point of the inner integrate pass; the scalar AO is replicated to all channels.
void GTAOInnerIntegratePS(in noperspective float4 UVAndScreenPos : TEXCOORD0, out float4 OutColor : SV_Target0)
{
    int2 iPos = int2( UVAndScreenPos.xy * AOViewport_Extent );
    float AO = GTAOInnerIntegratePSandCS(UVAndScreenPos.xy, iPos);
    OutColor = AO;
}

#if COMPUTE_SHADER

// Compute shader entry point of the inner integrate pass: one thread per pixel, AO written to OutTexture.
[numthreads(THREADGROUP_SIZEX, THREADGROUP_SIZEY, 1)]
void GTAOInnerIntegrateCS(
    uint2 GroupId : SV_GroupID,
    uint2 DispatchThreadId : SV_DispatchThreadID,
    uint2 GroupThreadId : SV_GroupThreadID)
{
    int2 PixelPos = DispatchThreadId + AOViewport_ViewportMin;
    float2 PixelCenter = (float2)PixelPos + float2(0.5, 0.5);
    // GTAOParams[2].zw holds 1 / target size, turning the pixel centre into a UV
    float2 BufferUV = PixelCenter.xy * GTAOParams[2].zw;

    float AO = GTAOInnerIntegratePSandCS(BufferUV,PixelPos);
    OutTexture[PixelPos] = AO;
}
#endif

/*
*
* HORIZON SEARCH ONLY
*
*/
// First half of the two pass GTAO: searches the horizons for up to two directions.
//   UV   - buffer UV of the pixel
//   iPos - integer pixel position, used to pick the per pixel random direction and offset
// Returns the horizon angles divided by PI: .xy for the first direction, .zw for the second one
// (.zw stays 0 when only one angle is requested). Returns 0 beyond the depth limit.
float4 HorizonSearchPSandCS(in float2 UV, in uint2 iPos)
{
    float4 OutHorizons = 0;

    // Offset by a fraction of a pixel to ensure we don't hit between pixels when running at half res
    float2 QuarterOffset = AOSceneViewport_ExtentInverse * 0.125;
    UV = UV + QuarterOffset;

    float DeviceZ = LookupDeviceZ(UV );
    float SceneDepth = ConvertFromDeviceZ(DeviceZ);

    // early out beyond the depth limit (OutHorizons is still 0 here)
    if(SceneDepth > ScreenSpaceAOParams[4].w)
    {
        return OutHorizons;
    }

    // The surface normal is not needed for the search itself, it is only consumed by the
    // inner integrate pass, so it is not fetched here.
    float3 ViewSpacePos = ScreenToViewPos(UV,SceneDepth);
    float3 ViewDir = normalize(-ViewSpacePos.xyz);

    const float WorldRadius = GTAOParams[3].y;
    float InvTanHalfFov = ScreenSpaceAOParams[3].w;
    float FOVScale = AOSceneViewport_Extent.y * InvTanHalfFov;

    // Get Radius in ScreenSpace (in pixels)
    // (limited to GTAO_MAX_PIXEL_SCREEN_RADIUS, but never below one pixel per tap)
    float WorldRadiusAdj = WorldRadius * FOVScale;
    float PixelRadius = max( min( WorldRadiusAdj / ViewSpacePos.z, GTAO_MAX_PIXEL_SCREEN_RADIUS ), (float) GTAO_NUMTAPS );
    float StepRadius = PixelRadius / ( (float) GTAO_NUMTAPS + 1 );
    float AttenFactor = 2.0 / (WorldRadius * WorldRadius);

    // Get the randomized Direction to sample and the step offset
    float3 RandomAndOffset = GetRandomVector(iPos);
    float2 RandomVec = RandomAndOffset.xy;
    float Offset = RandomAndOffset.z;

    uint NumAngles = (uint) GTAOParams[4].y;
    float SinDeltaAngle = GTAOParams[4].z;
    float CosDeltaAngle = GTAOParams[4].w;

    float2 ScreenDir = float2(RandomVec.x, RandomVec.y);

    // First Angle
    // (the search works in UV space, so the direction is scaled by the inverse buffer size)
    float2 Angles = SearchForLargestAngleDual_HZB(GTAO_NUMTAPS, UV, ScreenDir* View.BufferSizeAndInvSize.zw, StepRadius, Offset, ViewSpacePos, ViewDir, AttenFactor);
    Angles /= PI;

    float2 Angles2=0;
    if(NumAngles>1)
    {
        // Rotate for the next angle
        float2 TempScreenDir = ScreenDir.xy;
        ScreenDir.x = (TempScreenDir.x * CosDeltaAngle) + (TempScreenDir.y * -SinDeltaAngle);
        ScreenDir.y = (TempScreenDir.x * SinDeltaAngle) + (TempScreenDir.y * CosDeltaAngle);

        // unlike the combined pass, the same step offset is reused for the second direction
        Angles2 = SearchForLargestAngleDual_HZB(GTAO_NUMTAPS, UV, ScreenDir* View.BufferSizeAndInvSize.zw, StepRadius, Offset, ViewSpacePos, ViewDir, AttenFactor);
        Angles2 /= PI;
    }

    OutHorizons.xy = Angles;
    OutHorizons.zw = Angles2;

    return OutHorizons;
}

// Pixel shader entry point of the horizon search pass. SvPosition is not used here.
void HorizonSearchPS(in noperspective float4 UVAndScreenPos : TEXCOORD0, float4 SvPosition : SV_POSITION, out float4 OutColor : SV_Target0)
{
    int2 iPos = int2( UVAndScreenPos.xy * AOViewport_Extent );
    OutColor = HorizonSearchPSandCS(UVAndScreenPos.xy, iPos);
}

#if COMPUTE_SHADER

// Compute shader entry point of the horizon search pass: one thread per pixel, horizons written to HorizonOutTexture.
[numthreads(THREADGROUP_SIZEX, THREADGROUP_SIZEY, 1)]
void HorizonSearchCS(
    uint2 GroupId : SV_GroupID,
    uint2 DispatchThreadId : SV_DispatchThreadID,
    uint2 GroupThreadId : SV_GroupThreadID)
{
    int2 PixelPos = DispatchThreadId + AOViewport_ViewportMin;
    float2 PixelCenter = (float2)PixelPos + float2(0.5, 0.5);
    // GTAOParams[2].zw holds 1 / target size, turning the pixel centre into a UV
    float2 BufferUV = PixelCenter.xy * GTAOParams[2].zw;

    float4 Horizons = HorizonSearchPSandCS(BufferUV,PixelPos);
    HorizonOutTexture[PixelPos] = Horizons;
}
#endif

/*
*
* TEMPORAL FILTER
*
*/
Texture2D SceneVelocityTexture;
SamplerState SceneVelocityTextureSampler;

float4 BlendParams;

// Reprojects a pixel into the previous frame.
//   UV    - buffer UV of the pixel in the current frame
//   Depth - depth value placed in the z component of the clip space position
// Returns .xy: UV in the previous frame, .z: previous frame clip space z / w.
float3 ReprojectPos(float2 UV, float Depth)
{
    // Given a UV reproject where this was in the previous frame

    // Camera motion for pixel (in ScreenPos space).
float2 ThisScreen = (UV.xy - View.ScreenPositionScaleBias.wz) / View.ScreenPositionScaleBias.xy; float4 ThisClip = float4( ThisScreen, Depth, 1 ); float4 PrevClip = mul( ThisClip, View.ClipToPrevClip ); float2 PrevScreen = PrevClip.xy / PrevClip.w; float4 EncodedVelocity = Texture2DSampleLevel(SceneVelocityTexture, SceneVelocityTextureSampler, UV,0); if( EncodedVelocity.x > 0.0 ) { PrevScreen = ThisClip.xy - DecodeVelocityFromTexture(EncodedVelocity).xy; } float2 PrevUV = PrevScreen.xy * PrevScreenPositionScaleBias.xy + PrevScreenPositionScaleBias.zw; return float3(PrevUV, PrevClip.z/ PrevClip.w); } float ReadHistoryClamp(float2 UV, float MinAO, float MaxAO) { float BilinearWeights[4]; float2 PixUV = (UV * HistoryTextureSize)-0.5; float2 FloorUV = floor(PixUV); float2 FracUV = (PixUV - FloorUV); UV = (FloorUV * HistoryTexturePixelSize) + (HistoryTexturePixelSize*0.5); BilinearWeights[0] = (1.0 - FracUV.x) * ( 1.0 - FracUV.y); BilinearWeights[1] = ( FracUV.x) * ( 1.0 - FracUV.y); BilinearWeights[2] = (1.0 - FracUV.x) * ( FracUV.y); BilinearWeights[3] = ( FracUV.x) * ( FracUV.y); // Read the 4 previous depths and History float HistoryAO[4]; float2 dUV = HistoryTexturePixelSize; // TODO - Use GatherR when available HistoryAO[0] = Texture2DSample(HistoryTexture, HistoryTextureSampler, UV + float2( 0, 0)).r; HistoryAO[1] = Texture2DSample(HistoryTexture, HistoryTextureSampler, UV + float2(dUV.x, 0)).r; HistoryAO[2] = Texture2DSample(HistoryTexture, HistoryTextureSampler, UV + float2( 0, dUV.y)).r; HistoryAO[3] = Texture2DSample(HistoryTexture, HistoryTextureSampler, UV + float2(dUV.x, dUV.y)).r; float VisHistory = 0; for(int i=0; i<4; i++) { HistoryAO[i] = clamp(HistoryAO[i], MinAO, MaxAO); VisHistory += BilinearWeights[i] * HistoryAO[i]; } return VisHistory; } Texture2D GTAOTemporalInput; SamplerState GTAOTemporalSampler; float2 GTAOTemporalInputPixelSize; void NeighbourhoodClamp(float2 UV, float BaseAO, inout float MinAO, inout float MaxAO) { float2 dUV = 
GTAOTemporalInputPixelSize * 1.5; #define NumSamples 4 float AONeighbours[NumSamples]; AONeighbours[0] = Texture2DSample(GTAOTemporalInput, GTAOTemporalSampler, UV + float2(-dUV.x,-dUV.y) ).r; AONeighbours[1] = Texture2DSample(GTAOTemporalInput, GTAOTemporalSampler, UV + float2(-dUV.x, dUV.y) ).r; AONeighbours[2] = Texture2DSample(GTAOTemporalInput, GTAOTemporalSampler, UV + float2( dUV.x,-dUV.y) ).r; AONeighbours[3] = Texture2DSample(GTAOTemporalInput, GTAOTemporalSampler, UV + float2( dUV.x, dUV.y) ).r; #if GTAO_VARIANCE_CLIPPING float AOAverage = 0; float AOSquared = 0; for(int i=0; i<4; i++) { AOAverage += AONeighbours[i]; AOSquared += AONeighbours[i]*AONeighbours[i]; } float Mu = AOAverage / NumSamples; float Sigma = sqrt(AOSquared / NumSamples - (Mu*Mu)); MinAO = max( Mu - Sigma * 0.8, 0.0 ); MaxAO = min( Mu + Sigma * 0.8, 1.0 ); #else MinAO = min(BaseAO, min(min(AONeighbours[0], AONeighbours[1]), min(AONeighbours[2], AONeighbours[3]))); MaxAO = max(BaseAO, max(max(AONeighbours[0], AONeighbours[1]), max(AONeighbours[2], AONeighbours[3]))); #endif } float CompareVeloc(float2 V1, float2 V2) { float2 V12 = V1-V2; return 1-saturate( abs(V12.x + V12.y) * 100); } void GTAOTemporalFilterPSAndCS(float2 UV, inout float OutAO) { float BlendWeight = GTAOParams[4].x; float2 QuarterOffset = AOSceneViewport_ExtentInverse * 0.125; UV = UV + QuarterOffset; // Latest AO value float NewAO = Texture2DSample(GTAOTemporalInput, GTAOTemporalSampler, UV).r; // Current depth of the rendered Scene float CurrDepthDeviceZ = Texture2DSample(ZCurrTexture, ZCurrTextureSampler, UV).r; float CurrDepth = ConvertFromDeviceZ( CurrDepthDeviceZ); // Previous UV value float3 PrevUVDepth = ReprojectPos( UV, CurrDepthDeviceZ); float CurrDepthReproject = ConvertFromDeviceZ(PrevUVDepth.z); float2 PrevUV = PrevUVDepth.xy; float2 PixVelocity = UV - PrevUV; float VelocityMag = saturate(length(PixVelocity)*100); // Compare velocities float2 DestVeloc=0; { float DestDeviceZ = Texture2DSample(ZCurrTexture, 
ZCurrTextureSampler, PrevUVDepth.xy).r; float3 Reproj = ReprojectPos( PrevUVDepth.xy, DestDeviceZ); DestVeloc = PrevUVDepth.xy - Reproj.xy; } float VelocCompare = CompareVeloc(PixVelocity, DestVeloc); // Get an acceptable range of values we care about from the current AO float RangeVal = lerp(0.1, 0.00, VelocityMag); float MinAO = saturate(NewAO - RangeVal); float MaxAO = saturate(NewAO + RangeVal); // Simple history value float HistoryPrevUV = ReadHistoryClamp(PrevUV, MinAO, MaxAO); float HistoryThisUV = Texture2DSample(HistoryTexture, HistoryTextureSampler, UV ).r; HistoryThisUV = clamp(HistoryThisUV, MinAO, MaxAO); float HistoryAO = HistoryPrevUV; HistoryAO = lerp(HistoryThisUV, HistoryPrevUV, VelocCompare); OutAO = lerp(HistoryAO, NewAO, BlendWeight); } void GTAOTemporalFilterPS(in noperspective float4 UVAndScreenPos : TEXCOORD0, float4 SvPosition : SV_POSITION, out float4 OutColor : SV_Target0) { float OutAO = 0; GTAOTemporalFilterPSAndCS(UVAndScreenPos.xy, OutAO); OutColor = OutAO; } #if COMPUTE_SHADER [numthreads(THREADGROUP_SIZEX, THREADGROUP_SIZEY, 1)] void GTAOTemporalFilterCS( uint2 GroupId : SV_GroupID, uint2 DispatchThreadId : SV_DispatchThreadID, uint2 GroupThreadId : SV_GroupThreadID) { float OutColor = 0; int2 PixelPos = DispatchThreadId + AOViewport_ViewportMin; float2 PixelCenter = (float2)PixelPos + float2(0.5, 0.5); float2 BufferUV = PixelCenter.xy * AOViewport_ExtentInverse; GTAOTemporalFilterPSAndCS(BufferUV, OutColor); OutTexture[PixelPos] = OutColor; } #endif /* * UPSAMPLE FILTER * */ Texture2D GTAOUpsampleTexture; SamplerState GTAOUpsampleSampler; float2 GTAOUpsamplePixelSize; float GTAOUpsamplePSAndCS(float2 UV) { float2 Offset = GTAOUpsamplePixelSize * 0.25; float AOC = saturate( Texture2DSample(GTAOUpsampleTexture, GTAOUpsampleSampler, UV ) .r); float AO0 = saturate( Texture2DSample(GTAOUpsampleTexture, GTAOUpsampleSampler, UV + float2(-Offset.x, -Offset.y)) .r); float AO1 = saturate( Texture2DSample(GTAOUpsampleTexture, 
GTAOUpsampleSampler, UV + float2(Offset.x, -Offset.y)) .r); float AO2 = saturate( Texture2DSample(GTAOUpsampleTexture, GTAOUpsampleSampler, UV + float2(-Offset.x, Offset.y)) .r); float AO3 = saturate( Texture2DSample(GTAOUpsampleTexture, GTAOUpsampleSampler, UV + float2(Offset.x, Offset.y)) .r); float AO = min(min(AO0, AO1), min(AO2, AO3)); return AO; } void GTAOUpsamplePS(in noperspective float4 UVAndScreenPos : TEXCOORD0, out float4 OutColor : SV_Target0) { OutColor = GTAOUpsamplePSAndCS(UVAndScreenPos.xy); } #if COMPUTE_SHADER Texture2D GTAOSpatialFilterTexture; Texture2D GTAOSpatialFilterDepthTexture; uint2 GTAOSpatialFilterExtents; float4 GTAOSpatialFilterParams; float4 GTAOSpatialFilterWidth; // The 5x5 filter works on a threadgroup of size 16x8 (128 pixels) // We need to read in the 16x8 and a 2 pixel border around. So this is 20x12 (240 pixels) // Each thread reads in 2 pixels each // We make the array 32 wide so it plays better with bank conflicts #define LDS_WIDTH 20 groupshared float AOData[ LDS_WIDTH*12]; groupshared float ZData[ LDS_WIDTH*12]; int GetLDSLocation(int x, int y) { x+=2; y+=2; return ((y*LDS_WIDTH) + x) ; } float GetAOLin(int loc) { return AOData [loc]; } float GetZLin(int loc) { return ZData[loc]; } float GetAO(int x, int y) { x+=2; y+=2; return AOData [(y*LDS_WIDTH) + x]; } float GetZ(int x, int y) { x+=2; y+=2; return ZData [(y*LDS_WIDTH) + x]; } [numthreads(16, 8, 1)] void GTAOSpatialFilterCS( int GroupIndex: SV_GroupIndex, uint2 GroupId : SV_GroupID, uint2 DispatchThreadId : SV_DispatchThreadID, uint2 GroupThreadId : SV_GroupThreadID) { int2 GTId = int2(GroupThreadId); // Position on the screen We care about int2 PixelPos = DispatchThreadId + AOViewport_ViewportMin; // Firstly get the origin in the screen of the 16x8 inner box int2 FullGroupOrigin = int2(GroupId.x * 16, GroupId.y * 8) + AOViewport_ViewportMin; int2 FullGroupOriginM2 = FullGroupOrigin.xy - int2(2,2); uint pixIdx = (GroupIndex*2); float DownsampleFactor = 
GTAOSpatialFilterParams.x; // Downsampled version. Note that the Z is double the res of the Z if(pixIdx < (20*12) ) { uint XPos = pixIdx%20; uint YPos = pixIdx/20; int LDSPos = (YPos*LDS_WIDTH) + XPos; int2 ReadXYAO = FullGroupOriginM2 + int2(XPos,YPos); int2 ReadXYZ = ReadXYAO*DownsampleFactor; float AO = GTAOSpatialFilterTexture.Load(int3(ReadXYAO, 0)).r; float Z = GTAOSpatialFilterDepthTexture.Load( int3(ReadXYZ, 0)).r; AOData[ LDSPos ] = AO; ZData[ LDSPos ] = Z; // Next pixel LDSPos++; ReadXYAO.x +=1; ReadXYZ.x +=DownsampleFactor; AO = GTAOSpatialFilterTexture.Load(int3(ReadXYAO, 0)).r; Z = GTAOSpatialFilterDepthTexture.Load( int3(ReadXYZ, 0)).r; AOData[ LDSPos ] = AO; ZData[ LDSPos ] = Z; } GroupMemoryBarrierWithGroupSync(); // Get the differences in Z at this pixel. This is needed for the bilateral filter float ThisZ = GetZ(GTId.x, GTId.y); float ThisZLin =ConvertFromDeviceZ( ThisZ); float2 ZDiff; int FilterMin = int(GTAOSpatialFilterWidth.x); int FilterMax = int(GTAOSpatialFilterWidth.y); int LDSBase = GetLDSLocation(GTId.x + FilterMin, GTId.y + FilterMin); //Get X Delta int LDSCentre = GetLDSLocation(GTId.x , GTId.y); { float XM2Z = GetZLin(LDSCentre-2); float XM1Z = GetZLin(LDSCentre-1); float XP1Z = GetZLin(LDSCentre+1); float XP2Z = GetZLin(LDSCentre+2); // Get extrapolated point either side float C1 = abs((XM1Z + (XM1Z - XM2Z)) - ThisZ); float C2 = abs((XP1Z + (XP1Z - XP2Z)) - ThisZ); if(C1 < C2) { ZDiff.x = XM1Z - XM2Z; } else { ZDiff.x = XP2Z - XP1Z; } } //Get Y Delta { float YM2Z = GetZLin(LDSCentre-(2*LDS_WIDTH)); float YM1Z = GetZLin(LDSCentre-(1*LDS_WIDTH)); float YP1Z = GetZLin(LDSCentre+(1*LDS_WIDTH)); float YP2Z = GetZLin(LDSCentre+(2*LDS_WIDTH)); // Get extrapolated point either side float C1 = abs((YM1Z + (YM1Z - YM2Z)) - ThisZ); float C2 = abs((YP1Z + (YP1Z - YP2Z)) - ThisZ); if(C1 < C2) { ZDiff.y = YM1Z - YM2Z; } else { ZDiff.y = YP2Z - YP1Z; } } // Do the blur float SumAO = 0; float SumWeight = 0; int x,y; // Get the Z Value to compare 
against float DepthBase = ThisZ +(ZDiff.x*FilterMin) + (ZDiff.y*FilterMin); float SimpleBlur=0.0; for(y=FilterMin; y<=FilterMax; y++) { float PlaneZ = DepthBase; int LDSLineBase = LDSBase; LDSBase += LDS_WIDTH; for(x=FilterMin; x<=FilterMax; x++) { float Sample_AO = GetAOLin(LDSLineBase); float SampleZ = GetZLin( LDSLineBase); LDSLineBase++; // Get the bilateral weight. This is a function of the difference in height between the plane equation and the base depth // Compare the Z at this sample with the gradients float SampleZDiff = abs(PlaneZ - SampleZ); const float SpatialFilterWeight = 20000; float Weight = 1.0f - saturate(SampleZDiff*SpatialFilterWeight ); SumAO += Sample_AO * Weight; //SimpleBlur += Sample_AO; SumWeight += Weight; PlaneZ += ZDiff.x; } DepthBase += ZDiff.y; } SumAO /=SumWeight; SumAO *= (PI/2.0) ; // user adjust AO float AmbientOcclusionIntensity = ScreenSpaceAOParams[0].w; float AmbientOcclusionPower = ScreenSpaceAOParams[0].x*0.5; SumAO = 1 - (1 - pow(abs(SumAO), AmbientOcclusionPower)) * AmbientOcclusionIntensity; OutTexture[PixelPos] = SumAO; } #endif float2 SpatialDiff; // Single axis blur filter for Pixel Shaders void GTAOSpatialFilterPS(float4 UVAndScreenPos : TEXCOORD0, float4 SvPosition : SV_POSITION, out float4 OutColor : SV_Target0) { float2 UV = UVAndScreenPos.xy; // Do a 3 pixel wide spatial filter float OutAO = 0; float2 Offset = PostprocessInput0Size.zw; float2 Offset2 = Offset*2; // Get Depth and AO at this pixel float AO_C = Texture2DSample(PostprocessInput0, PostprocessInput0Sampler, UV).r; float Z_C = ConvertFromDeviceZ( Texture2DSample(ZCurrTexture, ZCurrTextureSampler, UV).r); float AO_M1 = Texture2DSample(PostprocessInput0, PostprocessInput0Sampler, UV - Offset).r; float Z_M1 = ConvertFromDeviceZ( Texture2DSample(ZCurrTexture, ZCurrTextureSampler, UV- Offset).r); float AO_P1 = Texture2DSample(PostprocessInput0, PostprocessInput0Sampler, UV + Offset).r; float Z_P1 = ConvertFromDeviceZ( Texture2DSample(ZCurrTexture, 
ZCurrTextureSampler, UV + Offset).r); float AO_M2 = Texture2DSample(PostprocessInput0, PostprocessInput0Sampler, UV - Offset2).r; float Z_M2 = ConvertFromDeviceZ( Texture2DSample(ZCurrTexture, ZCurrTextureSampler, UV- Offset).r); float AO_P2 = Texture2DSample(PostprocessInput0, PostprocessInput0Sampler, UV + Offset2).r; float Z_P2 = ConvertFromDeviceZ( Texture2DSample(ZCurrTexture, ZCurrTextureSampler, UV + Offset).r); float DiffZ = min( abs(Z_C - Z_M1), abs(Z_C - Z_P1) ); const float SpatialFilterWeight = 1000; float SampleZDiff=0; float Weight=0; // Blend the values float SumWeight = 1.0; float TotalAO = AO_C; // Minus 2 SampleZDiff = abs(Z_C - Z_M2); SampleZDiff -= DiffZ*2; Weight = 1.0f - saturate(SampleZDiff*SpatialFilterWeight ); TotalAO += AO_M2 * Weight; SumWeight += Weight; // Minus 2 SampleZDiff = abs(Z_C - Z_M1); SampleZDiff -= DiffZ; Weight = 1.0f - saturate(SampleZDiff*SpatialFilterWeight ); TotalAO += AO_M1 * Weight; SumWeight += Weight; // Plus 2 SampleZDiff = abs(Z_C - Z_P2); SampleZDiff -= DiffZ*2; Weight = 1.0f - saturate(SampleZDiff*SpatialFilterWeight ); TotalAO += AO_P2 * Weight; SumWeight += Weight; // Plus 1 SampleZDiff = abs(Z_C - Z_P1); SampleZDiff -= DiffZ; Weight = 1.0f - saturate(SampleZDiff*SpatialFilterWeight ); TotalAO += AO_P1 * Weight; SumWeight += Weight; TotalAO /= SumWeight; // Blend them together based on depth OutColor = AO_C; } #if COMPUTE_SHADER #define UPSAMPLE_LDS_WIDTH 20 groupshared float FullZData[ (16+1) * UPSAMPLE_LDS_WIDTH ]; groupshared float LowAOData[ 9*10]; float GetBlendAO(float AO1, float AO2, float Z1, float Z2, float ZMid) { float dZ = Z2 - Z1; float Epsilon = 0.00001f; if(abs(dZ) < Epsilon) { return (AO1+AO2) * 0.5; } float Ratio = saturate((ZMid -Z1) * (1.0/dZ) ); return (AO1 * (1.0-Ratio)) + ( AO2 * Ratio); } [numthreads(8, 8, 1)] void SmartUpsample( int GroupIndex: SV_GroupIndex, uint2 GroupId : SV_GroupID, uint2 DispatchThreadId : SV_DispatchThreadID, uint2 GroupThreadId : SV_GroupThreadID) { #if 0 int2 
GTId = int2(GroupThreadId); // Get the pixel Pos of the final position int2 PixelPos = DispatchThreadId*2 + ScreenSpaceAOParams[5].zw; // Each thread will compute 4 output colours . We need a 1 pixel border around the depth buffer so each thread will read in 5 pixel into the 17x17 buffer int2 FullGroupOrigin = int2(GroupId.x * THREADGROUP_SIZEX, GroupId.y * THREADGROUP_SIZEY) + ScreenSpaceAOParams[5].zw; uint2 TileOrigin = GroupId.xy *16; // Read in 4 pixels uint2 PixelPosInTile = GroupThreadId.xy *2; uint FullZLDSOffset = (PixelPosInTile.y *UPSAMPLE_LDS_WIDTH) + PixelPosInTile.x; SetAOVal(PostprocessInput0.Load(int3(FullGroupOriginM2 + int2(XPos,YPos), 0)).r, XPos, YPos); SetZVal( ZReadTexture.Load( int3(FullGroupOriginM2 + int2(XPos,YPos), 0)).r, XPos, YPos); float BotLeftZ = ( ZReadTexture.Load( int3(TileOrigin + PixelPosInTile +uint2(1,0), 0)).r ); SetAOVal(PostprocessInput0.Load(int3(FullGroupOriginM2 + int2(XPos,YPos), 0)).r, XPos, YPos); SetZVal( ZReadTexture.Load( int3(FullGroupOriginM2 + int2(XPos,YPos), 0)).r, XPos, YPos); FullZData[FullZLDSOffset] = TopLeftZ ; FullZData[FullZLDSOffset+1] = TopRightZ ; FullZData[FullZLDSOffset+UPSAMPLE_LDS_WIDTH] = BotLeftZ ; FullZData[FullZLDSOffset+UPSAMPLE_LDS_WIDTH+1] = BotRightZ ; // The final pixel needs to be the border (17+16 == 33 of them) uint2 BorderXY; uint BorderLDSOffset =0; if(GroupIndex < 17) if (any(DispatchThreadId >= (uint2)SpatialFilterParams.zw)) { BorderXY = uint2(16,GroupIndex); BorderLDSOffset = 16 + (GroupIndex*UPSAMPLE_LDS_WIDTH); } else { BorderXY = uint2(GroupIndex-17, 16); BorderLDSOffset = (GroupIndex-17) + (UPSAMPLE_LDS_WIDTH*16); } if(GroupIndex < 33) { FullZData[BorderLDSOffset] = ( ZReadTexture.Load( int3(TileOrigin + BorderXY, 0)).r ); } GroupMemoryBarrierWithGroupSync(); // Now read in the Color data which is 1/4 res uint2 LowTileOrigin = GroupId.xy *8; uint2 LowPixelPosInTile = GroupThreadId.xy; uint LowAOLDSOffset = (LowPixelPosInTile.y *9) + LowPixelPosInTile.x; float ThisAO = 
PostprocessInput0.Load(int3(LowTileOrigin + LowPixelPosInTile , 0)).r; LowAOData[LowAOLDSOffset] = ThisAO; // Read in the border if(GroupIndex < 9) { BorderXY = uint2(8,GroupIndex); BorderLDSOffset = 8 + (GroupIndex*9); } else { BorderXY = uint2(GroupIndex-9, 8); BorderLDSOffset = (GroupIndex-9) + (9*8); } GroupMemoryBarrierWithGroupSync(); if(GroupIndex < 17) LowAOData[BorderLDSOffset] = PostprocessInput0.Load(int3(LowTileOrigin + BorderXY , 0)).r; GroupMemoryBarrierWithGroupSync(); // All Data read we can now Process the 4 AO Values float FinalAO_TL; float FinalAO_TR; float FinalAO_BL; float FinalAO_BR; // Top Left - Easy this is the same as the low res colour read in FinalAO_TL = ThisAO; // Top Right - This is a weighted blend of the Top Left and the pixel to the right float Right_AO = LowAOData[LowAOLDSOffset+1]; float Ext_Z = FullZData[FullZLDSOffset+2]; FinalAO_TR = GetBlendAO(ThisAO, Right_AO, TopLeftZ, Ext_Z, TopRightZ); // Bottom Left - This is a weighted blend of the Top Left and the pixel below float Bottom_AO = LowAOData[LowAOLDSOffset+9]; Ext_Z = FullZData[FullZLDSOffset+(2*UPSAMPLE_LDS_WIDTH)]; FinalAO_BL = GetBlendAO(ThisAO, Bottom_AO, TopLeftZ, Ext_Z, BotLeftZ); // Bottom Right - This is a weighted blend of the Top Left and the pixel to the bottom right float BotRight_AO = LowAOData[LowAOLDSOffset+9+1]; Ext_Z = FullZData[FullZLDSOffset+(2*UPSAMPLE_LDS_WIDTH)+2]; FinalAO_BR = GetBlendAO(ThisAO, BotRight_AO, TopLeftZ, Ext_Z, BotRightZ); OutTexture[PixelPos + uint2(0,0) ] = FinalAO_TL ; OutTexture[PixelPos + uint2(1,0) ] = FinalAO_TR ; OutTexture[PixelPos + uint2(0,1) ] = FinalAO_BL ; OutTexture[PixelPos + uint2(1,1) ] = FinalAO_BR ; #endif } #endif