Files
UnrealEngine/Engine/Shaders/Private/PostProcessAmbientOcclusion.usf
2025-05-18 13:04:45 +08:00

1813 lines
55 KiB
HLSL

// Copyright Epic Games, Inc. All Rights Reserved.
/*=============================================================================
PostprocessAmbientOcclusion.usf: To generate ambient occlusion as a postprocess
=============================================================================*/
#include "Common.ush"
#include "ScreenPass.ush"
#include "PostProcessCommon.ush"
#include "DeferredShadingCommon.ush"
#include "Substrate/Substrate.ush"
// set by C++:
//
// 0:low / 1: medium / 2:high / 4:very high
// SHADER_QUALITY
//
// 0:no / 1:yes
// USE_AO_SETUP_AS_INPUT
//
// 0:no / 1:yes
// USE_UPSAMPLE
#define GTAO_THICKNESS_HEURISTIC 1
// 0: AABB Clipping / 1: Clipping based on first order moment
#define GTAO_VARIANCE_CLIPPING 1
// 0: classic with weighted sample, 1: don't normalize and adjust the formula to be simpler and faster - can look better and is cheaper (Alchemy like?)
#define OPTIMIZATION_O1 1
// 1:lowest quality, 2:medium, 3:high, more doesn't give too much (maybe HZB mip computations should be adjusted)
//#define SAMPLE_STEPS 3
// 0:off / 1:show samples on the right side of the screen
#define DEBUG_LOOKUPS 0
// 0:off / 1:take into account scene normals in the computations
#define USE_NORMALS 1
// useful to remove the high frequency dither pattern, not as needed with higher sample counts
// 0:off (fast but dither pattern with low sample count), 1:non normal aware (half res look), 2:normal aware (slower), 3:normal and depth aware (slowest, doesn't add much)
//#define QUAD_MESSAGE_PASSING_BLUR 2
// ambient occlusion
// AO_SAMPLE_QUALITY = 0 : no AO sampling, only upsampling
// AO_SAMPLE_QUALITY = 1 : no dither/per pixel randomization
// AO_SAMPLE_QUALITY = 2 : efficient high frequency 4x4 pattern without jitter for TemporalAA
// AO_SAMPLE_QUALITY = 3 : efficient high frequency 4x4 pattern with jitter for TemporalAA
// SHADER_QUALITY 0-4
#if SHADER_QUALITY == 0
// very low
#define USE_SAMPLESET 1
#define SAMPLE_STEPS 1
#define QUAD_MESSAGE_PASSING_BLUR 0
#elif SHADER_QUALITY == 1
// low
#define USE_SAMPLESET 1
#define SAMPLE_STEPS 1
#define QUAD_MESSAGE_PASSING_BLUR 2
#elif SHADER_QUALITY == 2
// medium
#define USE_SAMPLESET 1
#define SAMPLE_STEPS 2
#define QUAD_MESSAGE_PASSING_BLUR 2
#elif SHADER_QUALITY == 3
// high
#define USE_SAMPLESET 1
#define SAMPLE_STEPS 3
#define QUAD_MESSAGE_PASSING_BLUR 0
#else // SHADER_QUALITY == 4
// very high
#define USE_SAMPLESET 3
#define SAMPLE_STEPS 3
#define QUAD_MESSAGE_PASSING_BLUR 0
#endif
#if QUAD_MESSAGE_PASSING_BLUR == 0
#define QUAD_MESSAGE_PASSING_NORMAL 0
#define QUAD_MESSAGE_PASSING_DEPTH 0
#elif QUAD_MESSAGE_PASSING_BLUR == 1
#define QUAD_MESSAGE_PASSING_NORMAL 0
#define QUAD_MESSAGE_PASSING_DEPTH 0
#elif QUAD_MESSAGE_PASSING_BLUR == 2
#define QUAD_MESSAGE_PASSING_NORMAL 1
#define QUAD_MESSAGE_PASSING_DEPTH 0
#elif QUAD_MESSAGE_PASSING_BLUR == 3
#define QUAD_MESSAGE_PASSING_NORMAL 1
#define QUAD_MESSAGE_PASSING_DEPTH 1
#endif
// 0:4 samples, 1:9 samples (only really noticeable with dither usage ??)
//#define AO_UPSAMPLE_QUALITY
#if USE_AO_SETUP_AS_INPUT == 1
// lower resolution
#define AO_SAMPLE_QUALITY 3
#undef USE_SAMPLESET
#define USE_SAMPLESET 3
#define AO_UPSAMPLE_QUALITY 1
#else
// full resolution is expensive, do lower quality
#define AO_SAMPLE_QUALITY 3
#define AO_UPSAMPLE_QUALITY 0
#endif
// 0: 1 point (for testing)
// 1: 3 points
// 2: more evenly spread (5 points - slightly faster, stronger effect, better with multiple levels?)
// 3: near the surface very large, softly fading out (6 points)
#if USE_SAMPLESET == 0
#define SAMPLESET_ARRAY_SIZE 1
static const float2 OcclusionSamplesOffsets[SAMPLESET_ARRAY_SIZE]=
{
// one sample, for testing
float2(0.500, 0.500),
};
#elif USE_SAMPLESET == 1
#define SAMPLESET_ARRAY_SIZE 3
static const float2 OcclusionSamplesOffsets[SAMPLESET_ARRAY_SIZE]=
{
// 3 points distributed on the unit disc, spiral order and distance
float2(0, -1.0f) * 0.43f,
float2(0.58f, 0.814f) * 0.7f,
float2(-0.58f, 0.814f)
};
#elif USE_SAMPLESET == 2
#define SAMPLESET_ARRAY_SIZE 5
static const float2 OcclusionSamplesOffsets[SAMPLESET_ARRAY_SIZE]=
{
// 5 points distributed on a ring
float2(0.156434, 0.987688),
float2(0.987688, 0.156434)*0.9,
float2(0.453990, -0.891007)*0.8,
float2(-0.707107, -0.707107)*0.7,
float2(-0.891006, 0.453991)*0.65,
};
#else // USE_SAMPLESET == 3
#define SAMPLESET_ARRAY_SIZE 6
static const float2 OcclusionSamplesOffsets[SAMPLESET_ARRAY_SIZE]=
{
// 6 points distributed on the unit disc, spiral order and distance
float2(0.000, 0.200),
float2(0.325, 0.101),
float2(0.272, -0.396),
float2(-0.385, -0.488),
float2(-0.711, 0.274),
float2(0.060, 0.900)
};
#endif // USE_SAMPLESET
// -----------------------------------------------------------------------------------------------------------------------------
// To be included after defines
#include "PostProcessAmbientOcclusionCommon.ush"
// downsample the input of the ambient occlusion pass for better performance, can take input from setup or another downsample pass
// Downsamples the AO input: averages the 2x2 neighborhood's world-space normals
// (depth-weighted) into .rgb (encoded *0.5+0.5) and stores the neighborhood's
// max scene depth in .a (scaled down by Constant_Float16F_Scale to fit FP16).
void MainSetupPS(in noperspective float4 UVAndScreenPos : TEXCOORD0, in FStereoPSInput StereoInput, float4 SvPosition : SV_POSITION, out float4 OutColor0 : SV_Target0)
{
StereoSetupPS(StereoInput);
float2 ViewPortSize = AOViewport_ViewportSize;
float2 InUV = UVAndScreenPos.xy;
// can be optimized
// Four taps in a 2x2 footprint around the output pixel, half a texel apart.
// NOTE(review): UV[1..3] are clamped to the bilinear max bound but UV[0] is not
// clamped to the min bound (View.BufferBilinearUVMinMax.xy) - presumably safe
// when the viewport starts at the buffer origin; confirm for constrained views.
float2 UV[4];
UV[0] = InUV + float2(-0.5f, -0.5f) * InputExtentInverse;
UV[1] = min(InUV + float2( 0.5f, -0.5f) * InputExtentInverse, View.BufferBilinearUVMinMax.zw);
UV[2] = min(InUV + float2(-0.5f, 0.5f) * InputExtentInverse, View.BufferBilinearUVMinMax.zw);
UV[3] = min(InUV + float2( 0.5f, 0.5f) * InputExtentInverse, View.BufferBilinearUVMinMax.zw);
// .rgb = encoded world normal, .a = scene depth for each tap.
float4 Samples[4];
UNROLL for(uint i = 0; i < 4; ++i)
{
#if COMPUTE_SHADER || FORWARD_SHADING
// Async compute and forward shading don't have access to the gbuffer.
Samples[i].rgb = normalize(ReconstructNormalFromDepthBuffer(float4(UV[i] * ViewPortSize, SvPosition.zw))) * 0.5f + 0.5f;
#else
#if SUBTRATE_GBUFFER_FORMAT==1
// Substrate path: normal comes from the packed top-layer texture.
const FSubstrateTopLayerData TopLayerData = SubstrateUnpackTopLayerData(Substrate.TopLayerTexture.Load(uint3(clamp(UV[i] * View.BufferSizeAndInvSize.xy, 0.0f, View.BufferSizeAndInvSize.xy - 1.0f), 0)));
Samples[i].rgb = TopLayerData.WorldNormal * 0.5f + 0.5f;
#else
Samples[i].rgb = GetGBufferData(UV[i], true).WorldNormal * 0.5f + 0.5f;
#endif
#endif
Samples[i].a = CalcSceneDepth(UV[i]);
}
float MaxZ = max( max(Samples[0].a, Samples[1].a), max(Samples[2].a, Samples[3].a));
float4 AvgColor = 0.0f;
if (USE_NORMALS)
{
// Small epsilon so the weight sum can never be zero.
AvgColor = 0.0001f;
{
// Weight each normal by how close its depth is to the neighborhood max,
// so foreground/background normals don't bleed into each other.
UNROLL for(uint i = 0; i < 4; ++i)
{
AvgColor += float4(Samples[i].rgb, 1) * ComputeDepthSimilarity(Samples[i].a, MaxZ, ThresholdInverse);
}
AvgColor.rgb /= AvgColor.w;
}
}
OutColor0 = float4(AvgColor.rgb, MaxZ / Constant_Float16F_Scale);
}
// the main pixel shader that computes ambient occlusion
// The main SSAO body shared by the pixel and compute shader entry points.
// UVAndScreenPos: .xy = buffer UV, .zw = screen position; SvPosition = pixel position.
// OutColor.r receives AO (1 = unoccluded); in the full-res pass the result is
// replicated to all channels for single-alpha output.
void MainPSandCS(in float4 UVAndScreenPos, float4 SvPosition, out float4 OutColor)
{
OutColor = 0;
// the following constants as set up on C++ side
float AmbientOcclusionPower = ScreenSpaceAOParams[0].x;
float Ratio = ScreenSpaceAOParams[1].w;
float AORadiusInShader = ScreenSpaceAOParams[1].z;
float InvAmbientOcclusionDistance = ScreenSpaceAOParams[0].z;
float AmbientOcclusionIntensity = ScreenSpaceAOParams[0].w;
float2 ViewportUVToRandomUV = ScreenSpaceAOParams[1].xy;
float AmbientOcclusionBias = ScreenSpaceAOParams[0].y;
float ScaleFactor = ScreenSpaceAOParams[2].x;
float ScaleRadiusInWorldSpace = ScreenSpaceAOParams[2].z;
float2 UV = UVAndScreenPos.xy;
float2 ScreenPos = UVAndScreenPos.zw;
float InvTanHalfFov = ScreenSpaceAOParams[3].w;
// FOV/aspect correction so the sampling disc stays circular in view space.
float3 FovFix = float3(InvTanHalfFov, Ratio * InvTanHalfFov, 1);
float3 InvFovFix = 1.0f / FovFix;
float SceneDepth = GetDepthFromAOInput(UV);
float3 WorldNormal = GetWorldSpaceNormalFromAOInput(UV, SvPosition);
// can be NaN if WorldNormal=0,0,0 which happens when !USE_NORMALS
float3 ViewSpaceNormal = normalize(mul(WorldNormal, (float3x3)View.TranslatedWorldToView));
float3 ViewSpacePosition = ReconstructCSPos(SceneDepth, ScreenPos);
// Radius either scales with depth (constant screen size) or is fixed in world space.
float ActualAORadius = AORadiusInShader * lerp(SceneDepth, 1, ScaleRadiusInWorldSpace);
// Add bias after fixup (causes minor banding - not needed with larger radius)
if (USE_NORMALS)
{
ViewSpacePosition += AmbientOcclusionBias * SceneDepth * ScaleFactor * (ViewSpaceNormal * FovFix);
}
// .x accumulates occlusion, .y accumulates weights; epsilon avoids div by zero.
float2 WeightAccumulator = 0.0001f;
// if the feature is enabled and right side of screen
const bool bDebugLookups = DEBUG_LOOKUPS && ViewSpacePosition.x > 0;
#if AO_SAMPLE_QUALITY != 0
// AO_SAMPLE_QUALITY == 0 would mean: no SSAO in this pass, only upsampling
#if AO_SAMPLE_QUALITY == 1
// no 4x4 randomization
float2 RandomVec = float2(0, 1) * ActualAORadius;
{
#elif AO_SAMPLE_QUALITY == 2
// extract one of 16 base vectors (rotation and scale) from a texture that repeats 4x4
float2 RandomVec = (Texture2DSample(RandomNormalTexture, RandomNormalTextureSampler, UV * ViewportUVToRandomUV).rg * 2 - 1) * ActualAORadius;
{
#else // AO_SAMPLE_QUALITY == 3
// extract one of 16 base vectors (rotation and scale) from a texture that repeats 4x4, changing over time if TemporalAA is enabled
// jitter each frame a bit to get higher quality over multiple frames (only if TemporalAA is enabled), can cause ghosting effects
const float2 TemporalOffset = ScreenSpaceAOParams[3].xy;
float2 RandomVec = (Texture2DSample(RandomNormalTexture, RandomNormalTextureSampler, TemporalOffset + UV * ViewportUVToRandomUV).rg * 2 - 1) * ActualAORadius;
{
#endif // AO_SAMPLE_QUALITY ==
if(bDebugLookups && ViewSpacePosition.y > 0)
{
// top sample are not per pixel rotated
RandomVec = float2(0, 1) * ActualAORadius;
}
float2 FovFixXY = FovFix.xy * (1.0f / ViewSpacePosition.z);
// 2x2 rotation matrix (as float4) built from the random vector, FOV-corrected.
float4 RandomBase = float4(RandomVec, -RandomVec.y, RandomVec.x) * float4(FovFixXY, FovFixXY);
float2 ScreenSpacePos = ViewSpacePosition.xy / ViewSpacePosition.z;
// to debug the input depth
// OutColor = GetDepthForSSAO(ScreenSpacePos, 0); return;
// to debug the reconstructed normal
// OutColor = ReconstructedViewSpaceNormal.z; return;
// .x means for very anisotropic viewports we scale by x
float InvHaloSize = 1.0f / (ActualAORadius * FovFixXY.x * 2);
float3 ScaledViewSpaceNormal = ViewSpaceNormal;
#if OPTIMIZATION_O1
ScaledViewSpaceNormal *= 0.08f * lerp(SceneDepth, 1000, ScaleRadiusInWorldSpace);
#endif
// Accumulate occlusion from each paired sample of the chosen sample set.
UNROLL for(int i = 0; i < SAMPLESET_ARRAY_SIZE; ++i)
{
// -1..1
float2 UnrotatedRandom = OcclusionSamplesOffsets[i].xy;
float2 LocalRandom = (UnrotatedRandom.x * RandomBase.xy + UnrotatedRandom.y * RandomBase.zw);
if (bDebugLookups)
{
UNROLL for(uint step = 0; step < SAMPLE_STEPS; ++step)
{
float Scale = (step + 1) / (float)SAMPLE_STEPS;
float MipLevel = ComputeMipLevel(i, step);
float2 ScaledLocalRandom = Scale * LocalRandom;
WeightAccumulator += float2(ComputeSampleDebugMask(ScreenSpacePos + ScaledLocalRandom, MipLevel), 1.0f);
WeightAccumulator += float2(ComputeSampleDebugMask(ScreenSpacePos - ScaledLocalRandom, MipLevel), 1.0f);
}
}
else if (USE_NORMALS)
{
float3 LocalAccumulator = 0;
UNROLL for(uint step = 0; step < SAMPLE_STEPS; ++step)
{
// constant at run time
float Scale = (step + 1) / (float)SAMPLE_STEPS;
// constant at run time (higher is better for texture cache / performance, lower is better quality)
float MipLevel = ComputeMipLevel(i, step);
float3 StepSample = WedgeWithNormal(ScreenSpacePos, Scale * LocalRandom, InvFovFix, ViewSpacePosition, ScaledViewSpaceNormal, InvHaloSize, MipLevel);
// combine horizon samples
LocalAccumulator = lerp(LocalAccumulator, float3(max(LocalAccumulator.xy, StepSample.xy), 1), StepSample.z);
}
// Square(): the area scales quadratic with the angle - it gets a bit darker
WeightAccumulator += float2(Square(1 - LocalAccumulator.x) * LocalAccumulator.z, LocalAccumulator.z);
WeightAccumulator += float2(Square(1 - LocalAccumulator.y) * LocalAccumulator.z, LocalAccumulator.z);
// cheaper? Could move 1 - out
// WeightAccumulator += float2(1 - LocalAccumulator.x, LocalAccumulator.y);
}
else // Case with no normals
{
float2 LocalAccumulator = 0;
UNROLL for(uint step = 0; step < SAMPLE_STEPS; ++step)
{
// constant at run time
float Scale = (step + 1) / (float)SAMPLE_STEPS;
// constant at run time (higher is better for texture cache / performance, lower is better quality)
float MipLevel = ComputeMipLevel(i, step);
float2 StepSample = WedgeNoNormal(ScreenSpacePos, Scale * LocalRandom, InvFovFix, ViewSpacePosition, InvHaloSize, MipLevel);
// combine horizon samples
LocalAccumulator = lerp(LocalAccumulator, float2(max(LocalAccumulator.x, StepSample.x), 1), StepSample.y);
}
// Square(): the area scales quadratic with the angle - it gets a bit darker
WeightAccumulator += float2(Square(1 - LocalAccumulator.x) * LocalAccumulator.y, LocalAccumulator.y);
}
}
}
#endif // #if AO_SAMPLE_QUALITY != 0
// Normalize occlusion by accumulated weight.
OutColor.r = WeightAccumulator.x / WeightAccumulator.y;
OutColor.gb = float2(0, 0);
if(!bDebugLookups)
{
#if COMPUTE_SHADER || FORWARD_SHADING
// In compute, Input1 and Input2 are not necessarily valid.
float4 Filtered = 1;
#else
float4 Filtered = ComputeUpsampleContribution(SceneDepth, UV, WorldNormal);
#endif
// recombined result from multiple resolutions
OutColor.r = lerp(OutColor.r, Filtered.r, ComputeLerpFactor());
}
#if !USE_AO_SETUP_AS_INPUT
if(!bDebugLookups)
{
// full res
// soft fade out AO in the distance
{
float Mul = ScreenSpaceAOParams[4].x;
float Add = ScreenSpaceAOParams[4].y;
OutColor.r = lerp(OutColor.r, 1, saturate(SceneDepth * Mul + Add));
}
// user adjust AO
// abs() to prevent shader warning
OutColor.r = 1 - (1 - pow(abs(OutColor.r), AmbientOcclusionPower)) * AmbientOcclusionIntensity;
// we output in a single alpha channel
OutColor = OutColor.r;
}
else
{
OutColor.r = pow(1 - OutColor.r, 16); // constant is tweaked with radius and sample count
}
#endif
// we don't support ddx_fine() for SM4
#if !COMPUTE_SHADER && QUAD_MESSAGE_PASSING_BLUR > 0 && FEATURE_LEVEL >= FEATURE_LEVEL_SM5
{
// Quad message passing: share the AO result across the 2x2 pixel quad via
// fine derivatives, weighted by normal/depth similarity, to reduce dither.
// .x: AO output, .y:SceneDepth .zw:view space normal
float4 CenterPixel = float4(OutColor.r, SceneDepth, normalize(ViewSpaceNormal).xy);
float4 dX = ddx_fine(CenterPixel);
float4 dY = ddy_fine(CenterPixel);
int2 Mod = (uint2)(SvPosition.xy) % 2;
float4 PixA = CenterPixel;
float4 PixB = CenterPixel - dX * (Mod.x * 2 - 1);
float4 PixC = CenterPixel - dY * (Mod.y * 2 - 1);
float WeightA = 1.0f;
float WeightB = 1.0f;
float WeightC = 1.0f;
#if QUAD_MESSAGE_PASSING_NORMAL
const float NormalTweak = 4.0f;
float3 NormalA = ReconstructNormal(PixA.zw);
float3 NormalB = ReconstructNormal(PixB.zw);
float3 NormalC = ReconstructNormal(PixC.zw);
WeightB *= saturate(pow(saturate(dot(NormalA, NormalB)), NormalTweak));
WeightC *= saturate(pow(saturate(dot(NormalA, NormalC)), NormalTweak));
#endif
#if QUAD_MESSAGE_PASSING_DEPTH
const float DepthTweak = 1;
float InvDepth = 1.0f / PixA.y;
WeightB *= 1 - saturate(abs(1 - PixB.y * InvDepth) * DepthTweak);
WeightC *= 1 - saturate(abs(1 - PixC.y * InvDepth) * DepthTweak);
#endif
// WeightA stays 1.0f so the sum can never be 0
float InvWeightABC = 1.0f / (WeightA + WeightB + WeightC);
WeightA *= InvWeightABC;
WeightB *= InvWeightABC;
WeightC *= InvWeightABC;
OutColor = WeightA * PixA.x + WeightB * PixB.x + WeightC * PixC.x;
// visualize where we don't want to fade
// OutColor = (WeightA - 0.333f) / 0.666f;
}
#endif
}
// Pixel shader entry point: forwards to the shared PS/CS implementation.
void MainPS(in noperspective float4 UVAndScreenPos : TEXCOORD0, in FStereoPSInput StereoInput, float4 SvPosition : SV_POSITION, out float4 OutColor : SV_Target0)
{
MainPSandCS(UVAndScreenPos, SvPosition, OutColor);
}
#if COMPUTE_SHADER
/** Output target. In compute, this is a single value buffer. */
RWTexture2D<float> OutTexture;
[numthreads(THREADGROUP_SIZEX, THREADGROUP_SIZEY, 1)]
// Compute shader entry point: reconstructs SvPosition/UV for the thread's pixel,
// early-outs when the pixel is beyond the AO fade distance, otherwise runs the
// shared SSAO body and writes the result into OutTexture.
void MainCS(
uint2 GroupId : SV_GroupID,
uint2 DispatchThreadId : SV_DispatchThreadID,
uint2 GroupThreadId : SV_GroupThreadID)
{
float ScaleFactor = ScreenSpaceAOParams[2].x;
int2 PixelPos = DispatchThreadId + AOViewport_ViewportMin;
float2 PixelCenter = (float2)PixelPos + float2(0.5, 0.5);
// todo: move to a function
float4 SvPosition = float4(PixelCenter, 0, 0) * ScaleFactor;
float2 BufferUV = SvPositionToBufferUV(SvPosition);
SvPosition.z = LookupDeviceZ(BufferUV);
// todo: investigate
// SvPosition.w = ConvertFromDeviceZ(SvPosition.z);
SvPosition.w = 1;
// Default 1 (unoccluded) for the early-out path.
float4 OutColor = 1;
// Test for early exit with out of depth bound.
float SceneDepth = ConvertFromDeviceZ(SvPosition.z);
float FadeMul = ScreenSpaceAOParams[4].x;
float FadeAdd = ScreenSpaceAOParams[4].y;
BRANCH
if (SceneDepth * FadeMul + FadeAdd < 1)
{
MainPSandCS(float4(BufferUV, SvPositionToScreenPosition(SvPosition).xy), SvPosition, OutColor);
}
// Here we could optimize for coalescing writes but that might not be the performance bottleneck.
// We should rather optimize for best texture cache performance.
// http://on-demand.gputechconf.com/gtc/2010/presentations/S12312-DirectCompute-Pre-Conference-Tutorial.pdf
OutTexture[PixelPos] = OutColor.r;
}
SCREEN_PASS_TEXTURE_VIEWPORT(SSAOSmoothOutputViewport)
FScreenTransform SSAOSmoothOutputToInput;
Texture2D SSAOSmoothInputTexture;
SamplerState SSAOSmoothInputSampler;
RWTexture2D<float> SSAOSmoothOutputTexture;
[numthreads(THREADGROUP_SIZEX, THREADGROUP_SIZEY, 1)]
// Smooths the SSAO result to hide the tiled 4x4 random pattern.
// Averages four taps at offsets (0,0),(2,0),(0,2),(2,2); assuming
// SSAOSmoothInputSampler is bilinear each tap averages a 2x2 footprint, so the
// four taps approximate a 4x4 box filter (TODO confirm sampler filter mode).
void MainSSAOSmoothCS(
uint2 GroupId : SV_GroupID,
uint2 DispatchThreadId : SV_DispatchThreadID,
uint2 GroupThreadId : SV_GroupThreadID)
{
BRANCH
if (any(DispatchThreadId >= (uint2)SSAOSmoothOutputViewport_ViewportSize))
{
return;
}
uint2 DestPixelPos = SSAOSmoothOutputViewport_ViewportMin + DispatchThreadId;
float2 DestUV = DestPixelPos * SSAOSmoothOutputViewport_ExtentInverse;
float2 SampleUV = ApplyScreenTransform(DestUV, SSAOSmoothOutputToInput);
// Use a 4x4 box filter because the random texture is tiled 4x4
float Result;
Result = SSAOSmoothInputTexture.SampleLevel(SSAOSmoothInputSampler, SampleUV, 0).r;
Result += SSAOSmoothInputTexture.SampleLevel(SSAOSmoothInputSampler, SampleUV, 0, int2(2, 0)).r;
Result += SSAOSmoothInputTexture.SampleLevel(SSAOSmoothInputSampler, SampleUV, 0, int2(0, 2)).r;
Result += SSAOSmoothInputTexture.SampleLevel(SSAOSmoothInputSampler, SampleUV, 0, int2(2, 2)).r;
SSAOSmoothOutputTexture[DestPixelPos] = Result * 0.25;
}
#endif
#if SHADER_QUALITY == 0
// very low
#define GTAO_NUMTAPS 4
#define GTAO_BIASMIPLEVEL 2
#define GTAO_MAX_PIXEL_SCREEN_RADIUS 256.0f
#elif SHADER_QUALITY == 1
// low
#define GTAO_NUMTAPS 6
#define GTAO_BIASMIPLEVEL 1
#define GTAO_MAX_PIXEL_SCREEN_RADIUS 256.0f
#elif SHADER_QUALITY == 2
// medium
#define GTAO_NUMTAPS 8
#define GTAO_BIASMIPLEVEL 0
#define GTAO_MAX_PIXEL_SCREEN_RADIUS 256.0f
#elif SHADER_QUALITY == 3
// high
#define GTAO_NUMTAPS 12
#define GTAO_BIASMIPLEVEL 0
#define GTAO_MAX_PIXEL_SCREEN_RADIUS 256.0f
#else // SHADER_QUALITY == 4
// very high
#define GTAO_NUMTAPS 20
#define GTAO_BIASMIPLEVEL 0
#define GTAO_MAX_PIXEL_SCREEN_RADIUS 256.0f
#endif
float4 GTAOParams[5];
// [0] - { cos(TemporalAngle), sin(TemporalAngle), TemporalOffset, FrameTemporalOffset}
// [1] - { FrameNumber, Thicknessblend, unused, unused}
// [2] - { TargetSizeX, TargetSizeY, 1.0/TargetSizeX, 1.0f/TargetSizeY}
// [3] - { FallOffStart, FallOffEnd, FalloffScale, FalloffBias}
// [4] - { Temporal Blend Weight, Angles, SinDeltaAngle, CosDeltaAngle}
#define PI_HALF (PI*0.5)
#if COMPUTE_SHADER
RWTexture2D<float4> HorizonOutTexture;
RWTexture2D<float> DepthOutTexture;
RWTexture2D<float2> VelocityOutTexture;
RWTexture2D<float> DepthsTexture;
#endif
Texture2D HistoryTexture;
SamplerState HistoryTextureSampler;
float2 HistoryTextureSize;
float2 HistoryTexturePixelSize;
Texture2D ZCurrTexture;
SamplerState ZCurrTextureSampler;
float4 PrevScreenPositionScaleBias;
// Restricts a scale value to the supported [2, 8] range.
float ClampScale(float Scale)
{
float Lower = max(Scale, 2.0);
return min(Lower, 8.0);
}
// Returns the view-space normal at UV. With USE_NORMALBUFFER the normal is read
// from the GBuffer (or Substrate top layer) and transformed to view space;
// otherwise it is reconstructed from depth-buffer differences around the pixel.
// ViewSpacePosMid is the view-space position of the center pixel (only used by
// the depth-derived path).
float3 GetNormal(float2 UV, float3 ViewSpacePosMid)
{
float3 ViewSpaceNormal;
#if USE_NORMALBUFFER
#if SUBTRATE_GBUFFER_FORMAT==1
const FSubstrateTopLayerData TopLayerData = SubstrateUnpackTopLayerData(Substrate.TopLayerTexture.Load(uint3(clamp(UV * View.BufferSizeAndInvSize.xy, 0.0f, View.BufferSizeAndInvSize.xy-1.0f), 0)));
float3 WorldNormal = TopLayerData.WorldNormal;
#else
// Get the normal from the normal buffer
float3 WorldNormal = GetGBufferData(UV, false).WorldNormal;
#endif
ViewSpaceNormal = normalize(mul(WorldNormal, (float3x3)View.TranslatedWorldToView));
#else
// Get the normal derived from the depth buffer
float2 DeltaUV = View.BufferSizeAndInvSize.zw;
float DeviceZ = Texture2DSampleLevel(SceneTexturesStruct.SceneDepthTexture, SceneTexturesStruct_SceneDepthTextureSampler, UV,0).r;
float DeviceZLeft = Texture2DSampleLevel(SceneTexturesStruct.SceneDepthTexture, SceneTexturesStruct_SceneDepthTextureSampler, UV + float2(-DeltaUV.x, 0.0f),0).r;
float DeviceZTop = Texture2DSampleLevel(SceneTexturesStruct.SceneDepthTexture, SceneTexturesStruct_SceneDepthTextureSampler, UV + float2( 0.0f , -DeltaUV.y),0).r;
float DeviceZRight = Texture2DSampleLevel(SceneTexturesStruct.SceneDepthTexture, SceneTexturesStruct_SceneDepthTextureSampler, UV + float2( DeltaUV.x, 0.0f),0).r;
float DeviceZBottom = Texture2DSampleLevel(SceneTexturesStruct.SceneDepthTexture, SceneTexturesStruct_SceneDepthTextureSampler, UV + float2( 0.0f , DeltaUV.y),0).r;
// Pick the smaller-magnitude one-sided difference on each axis to avoid
// smearing normals across depth discontinuities.
float DeviceZDdx = TakeSmallerAbsDelta(DeviceZLeft, DeviceZ, DeviceZRight);
float DeviceZDdy = TakeSmallerAbsDelta(DeviceZTop, DeviceZ, DeviceZBottom);
float ZRight = ConvertFromDeviceZ(DeviceZ + DeviceZDdx);
float ZDown = ConvertFromDeviceZ(DeviceZ + DeviceZDdy);
// Cross the two view-space tangent vectors to get the surface normal.
float3 Right = ScreenToViewPos(UV+ float2( DeltaUV.x, 0.0f) , ZRight)-ViewSpacePosMid;
float3 Down = ScreenToViewPos(UV+ float2( 0.0f, DeltaUV.y) , ZDown) -ViewSpacePosMid;
ViewSpaceNormal = normalize(cross(Right, Down));
#endif
return ViewSpaceNormal;
}
// Converts the device Z at ScreenUV to linear depth using the
// InvDeviceZToWorldZTransform coefficients (reciprocal of the projected form).
float GetLinearDepthProj(float2 ScreenUV)
{
float DeviceZ = Texture2DSampleLevel(SceneTexturesStruct.SceneDepthTexture, SceneTexturesStruct_SceneDepthTextureSampler, ScreenUV, 0).r;
return 1.0f / (DeviceZ * View.InvDeviceZToWorldZTransform[2] - View.InvDeviceZToWorldZTransform[3]);
}
// GTAO horizon search: marches NumSteps taps along +ScreenDir and -ScreenDir
// from BaseUV, tracking the largest horizon cosine in each direction.
// Returns the two horizon angles (acos of the best cosines), one per direction.
// SearchRadius is in UV units per step; InitialOffset jitters the first tap;
// AttenFactor fades out distant samples; GTAOParams[1].y (Thickness) softens
// how quickly a previously-found horizon decays (thin-object heuristic).
float2 SearchForLargestAngleDual(uint NumSteps, float2 BaseUV, float2 ScreenDir, float SearchRadius, float InitialOffset, float3 ViewPos, float3 ViewDir,float AttenFactor)
{
float SceneDepth, LenSq, OOLen, Ang, FallOff;
float3 V;
float2 SceneDepths =0;
// Best horizon cosine so far for the positive (.x) and negative (.y) direction.
float2 BestAng = float2(-1,-1);
float Thickness = GTAOParams[1].y;
for(uint i=0; i<NumSteps; i++)
{
float fi = (float) i;
// Step at least one pixel per tap so taps never alias to the same texel.
float2 UVOffset = ScreenDir * max( SearchRadius * (fi + InitialOffset), (fi+1) );
UVOffset.y *= -1;
// .xy = positive direction tap, .zw = negative direction tap.
float4 UV2 = BaseUV.xyxy + float4( UVOffset.xy, -UVOffset.xy );
// Positive Direction
SceneDepths.x = ConvertFromDeviceZ(LookupDeviceZ(UV2.xy));
SceneDepths.y = ConvertFromDeviceZ(LookupDeviceZ(UV2.zw));
V = ScreenToViewPos(UV2.xy, SceneDepths.x) - ViewPos;
LenSq = dot(V,V);
OOLen = rsqrt(LenSq + 0.0001);
// Cosine of the angle between the sample vector and the view direction.
Ang = dot(V,ViewDir) * OOLen;
// Distance falloff blends the sample toward the current best.
FallOff = saturate(LenSq * AttenFactor);
Ang = lerp(Ang, BestAng.x, FallOff);
// Keep the higher horizon, or decay toward it at the Thickness rate.
BestAng.x = ( Ang > BestAng.x ) ? Ang : lerp( Ang, BestAng.x, Thickness );
// Negative Direction
V = ScreenToViewPos(UV2.zw, SceneDepths.y) - ViewPos;
LenSq = dot(V,V);
OOLen = rsqrt(LenSq + 0.0001);
Ang = dot(V,ViewDir) * OOLen;
FallOff = saturate(LenSq * AttenFactor);
Ang = lerp(Ang, BestAng.y, FallOff);
BestAng.y = ( Ang > BestAng.y ) ? Ang : lerp( Ang, BestAng.y, Thickness );
}
// Convert best cosines to angles.
BestAng.x = acosFast(clamp(BestAng.x, -1.0, 1.0));
BestAng.y = acosFast(clamp(BestAng.y, -1.0, 1.0));
return BestAng;
}
// HZB variant of the GTAO horizon search: identical marching scheme to
// SearchForLargestAngleDual, but distant taps read depth from increasingly
// coarse HZB mips for better texture-cache behavior, and fully-faded samples
// (FallOff >= 1) are skipped entirely.
float2 SearchForLargestAngleDual_HZB(uint NumSteps, float2 BaseUV, float2 ScreenDir, float SearchRadius, float InitialOffset, float3 ViewPos, float3 ViewDir, float AttenFactor)
{
float SceneDepth, LenSq, OOLen, Ang, FallOff;
float3 V;
float2 SceneDepths =0;
float MipLevel = 0 ;
float2 BestAng = float2(-1,-1);
float Thickness = GTAOParams[1].y;
UNROLL
for(uint i=1; i<NumSteps+1; i++)
{
float fi = (float) i;
// Step at least one pixel per tap so taps never alias to the same texel.
float2 UVOffset = ScreenDir * max( SearchRadius * (fi + InitialOffset), (fi+1) );
UVOffset.y *= -1;
// .xy = positive direction tap, .zw = negative direction tap.
float4 UV2 = BaseUV.xyxy + float4( UVOffset.xy, -UVOffset.xy );
// NOTE(review): this branch is unreachable - the loop starts at i=1, so
// (i==0) is never true and every tap goes through the HZB path below.
// Possibly intentional after a loop-bound change; confirm against upstream.
if( (i==0) && (GTAO_BIASMIPLEVEL==0))
{
SceneDepths.x = ConvertFromDeviceZ(LookupDeviceZ(UV2.xy));
SceneDepths.y = ConvertFromDeviceZ(LookupDeviceZ(UV2.zw));
}
else
{
// Coarser HZB mips the further we march from the center pixel.
MipLevel = GTAO_BIASMIPLEVEL;
if(i==2)
MipLevel++;
if(i>3)
MipLevel+=2;
SceneDepths.x = GetHZBDepth(UV2.xy,MipLevel);
SceneDepths.y = GetHZBDepth(UV2.zw,MipLevel);
}
// Positive Direction
V = ScreenToViewPos(UV2.xy, SceneDepths.x) - ViewPos;
LenSq = dot(V,V);
OOLen = rsqrtFast(LenSq + 0.0001);
FallOff = saturate(LenSq * AttenFactor);
// Skip samples that the distance falloff would fully discard.
if(FallOff < 1.0)
{
Ang = dot(V,ViewDir) * OOLen;
Ang = lerp(Ang, BestAng.x, FallOff);
// Keep the higher horizon, or decay toward it at the Thickness rate.
BestAng.x = ( Ang > BestAng.x ) ? Ang : lerp( Ang, BestAng.x, Thickness );
}
// Negative Direction
V = ScreenToViewPos(UV2.zw, SceneDepths.y) - ViewPos;
LenSq = dot(V,V);
OOLen = rsqrtFast(LenSq + 0.0001);
FallOff = saturate(LenSq * AttenFactor);
if(FallOff < 1.0)
{
Ang = dot(V,ViewDir) * OOLen;
Ang = lerp(Ang, BestAng.y, FallOff);
BestAng.y = ( Ang > BestAng.y ) ? Ang : lerp( Ang, BestAng.y, Thickness );
}
}
// Convert best cosines to angles.
BestAng.x = acosFast(clamp(BestAng.x, -1.0, 1.0));
BestAng.y = acosFast(clamp(BestAng.y, -1.0, 1.0));
return BestAng;
}
float ComputeInnerIntegral(float2 UV, float2 Angles, float2 ScreenDir, float3 ViewDir, float3 ViewSpaceNormal, float SceneDepth)
{
// Given the angles found in the search plane we need to project the View Space Normal onto the plane defined by the search axis and the View Direction and perform the inner integrate
float3 PlaneNormal = normalize(cross(float3(ScreenDir.xy,0) ,ViewDir));
float3 Perp = cross(ViewDir, PlaneNormal);
float3 ProjNormal = ViewSpaceNormal - PlaneNormal * dot(ViewSpaceNormal, PlaneNormal);
float LenProjNormal = length(ProjNormal) + 0.000001f;
float RecipMag = 1.0f / (LenProjNormal);
float CosAng = dot(ProjNormal, Perp) * RecipMag;
float Gamma = acosFast(CosAng) - PI_HALF;
float CosGamma = dot(ProjNormal, ViewDir) * RecipMag;
float SinGamma = CosAng * -2.0f;
// clamp to normal hemisphere
Angles.x = Gamma + max(-Angles.x - Gamma, -(PI_HALF) );
Angles.y = Gamma + min( Angles.y - Gamma, (PI_HALF) );
float AO = ( (LenProjNormal) * 0.25 *
( (Angles.x * SinGamma + CosGamma - cos((2.0 * Angles.x) - Gamma)) +
(Angles.y * SinGamma + CosGamma - cos((2.0 * Angles.y) - Gamma)) ));
return AO;
}
// Interleaved gradient noise (magic-constant hash of the pixel position).
// Returns a per-pixel pseudo-random value in [0,1).
float InterleavedGradientNoise( float2 iPos )
{
float2 MagicXY = float2(0.06711056, 0.00583715);
float Hash = frac(dot(iPos, MagicXY));
return frac(52.9829189f * Hash);
}
// Per-pixel randomization for the horizon search:
// .x = rotation angle in [0,1) from gradient noise,
// .y = step offset cycling through {0, 0.25, 0.5, 0.75} in a 4-pixel pattern.
float2 GetRandomAngleOffset(uint2 iPos )
{
// Mirror Y to decorrelate from other per-pixel patterns.
iPos.y = 4096 - iPos.y;
float NoiseAngle = InterleavedGradientNoise(float2(iPos));
uint PatternIndex = (iPos.y - iPos.x) & 3;
float StepOffset = (1.0/4.0) * PatternIndex;
return float2(NoiseAngle, StepOffset);
}
// Per-pixel, per-frame randomization for GTAO:
// .xy = unit direction from gradient noise, rotated by the temporal angle
//       (GTAOParams[0].xy = cos/sin) so the pattern changes each frame,
// .z  = step offset in {0, 0.25, 0.5, 0.75} jittered by GTAOParams[0].z.
float3 GetRandomVector(uint2 iPos )
{
// Mirror Y to decorrelate from other per-pixel patterns.
iPos.y = 16384-iPos.y;
float TemporalCos = GTAOParams[0].x;
float TemporalSin = GTAOParams[0].y;
// Base per-pixel direction on the unit circle.
float GradientNoise = InterleavedGradientNoise(float2(iPos));
float2 BaseDir;
BaseDir.x = cos(GradientNoise*PI);
BaseDir.y = sin(GradientNoise*PI);
// 2x2 rotation of the base direction by the temporal angle.
float3 Result;
Result.x = dot(BaseDir, float2(TemporalCos, -TemporalSin));
Result.y = dot(BaseDir, float2(TemporalSin, TemporalCos));
// Step offset: 4-pixel spatial pattern plus per-frame jitter.
// ScaleOffset = (1.0/5.0) * (( iPos.y - iPos.x) % 5);
float ScaleOffset = (1.0/4.0) * ((iPos.y - iPos.x) & 3);
Result.z = frac(ScaleOffset + GTAOParams[0].z);
return Result;
}
/*
*
* HORIZON SEARCH AND INNER INTEGRATE COMBINED
*
*/
// GTAO: horizon search and inner integral combined into a single pass.
// UV       - buffer UV of the pixel being shaded.
// iPos     - integer pixel position, used for per-pixel randomization.
// OutColor - resulting ambient occlusion (1 = unoccluded / beyond fade range).
// Fix: removed the dead `UV += QuarterOffset;` statement - UV was never read
// after TexUV was derived from it, so the write had no effect.
void GTAOCombinedPSandCS(in float2 UV, in uint2 iPos, out float OutColor)
{
OutColor = 0;
// Offset by a fraction of a pixel to ensure we don't hit between pixels when running at half res
float2 QuarterOffset = AOSceneViewport_ExtentInverse * 0.125;
float2 TexUV = UV + QuarterOffset;
float DeviceZ = LookupDeviceZ(TexUV );
float SceneDepth = ConvertFromDeviceZ(DeviceZ);
// Early out beyond the user-defined fade-out distance.
if(SceneDepth > ScreenSpaceAOParams[4].w)
{
OutColor = 1;
return;
}
float3 ViewSpacePos = ScreenToViewPos(TexUV,SceneDepth);
float3 ViewSpaceNormal = GetNormal(TexUV, ViewSpacePos);
float3 ViewDir = normalize(-ViewSpacePos.xyz);
const float WorldRadius = GTAOParams[3].y;
float InvTanHalfFov = ScreenSpaceAOParams[3].w;
float FOVScale = AOSceneViewport_Extent.y * InvTanHalfFov; // TODO
// Get Radius in ScreenSpace (in pixels)
float WorldRadiusAdj = WorldRadius * FOVScale;
// Clamp the pixel radius so we never march further than the cap and never
// step less than one pixel per tap.
float PixelRadius = max( min( WorldRadiusAdj / ViewSpacePos.z, GTAO_MAX_PIXEL_SCREEN_RADIUS ), (float) GTAO_NUMTAPS );
float StepRadius = PixelRadius / ( (float) GTAO_NUMTAPS + 1 );
float AttenFactor = 2.0 / (WorldRadius * WorldRadius);
// Get the randomized Direction to sample and the step offset
float3 RandomAndOffset = GetRandomVector(iPos);
float2 RandomVec = RandomAndOffset.xy;
float Offset = RandomAndOffset.z;
float Sum=0.0;
// Rotate the slice direction per iteration by the precomputed delta angle.
uint NumAngles = (uint) GTAOParams[4].y;
float SinDeltaAngle = GTAOParams[4].z;
float CosDeltaAngle = GTAOParams[4].w;
float2 ScreenDir = float2(RandomVec.x, RandomVec.y);
for(uint Angle =0; Angle < NumAngles; Angle++)
{
float2 Angles = SearchForLargestAngleDual_HZB(GTAO_NUMTAPS, TexUV, ScreenDir* View.BufferSizeAndInvSize.zw, StepRadius,
Offset, ViewSpacePos, ViewDir, AttenFactor);
Sum += ComputeInnerIntegral(TexUV, Angles, ScreenDir, ViewDir, ViewSpaceNormal, SceneDepth);
// Rotate for the next angle
float2 TempScreenDir = ScreenDir.xy;
ScreenDir.x = (TempScreenDir.x * CosDeltaAngle) + (TempScreenDir.y * -SinDeltaAngle);
ScreenDir.y = (TempScreenDir.x * SinDeltaAngle) + (TempScreenDir.y * CosDeltaAngle);
Offset = frac(Offset + 0.617);
}
// Average over the slices and normalize by 2/PI per the GTAO formulation.
float AO = Sum;
AO = AO / ((float)NumAngles);
AO *= 2.0/PI;
// Fade out based on user defined distance
float Mul = ScreenSpaceAOParams[4].x;
float Add = ScreenSpaceAOParams[4].y;
AO = lerp(AO, 1, saturate(SceneDepth * Mul + Add));
OutColor = AO ;
return;
}
// Pixel shader entry point for the combined GTAO pass: derives the integer
// pixel position from the interpolated UV and forwards to the shared body.
void GTAOCombinedPS(in float4 UVAndScreenPos : TEXCOORD0, out float OutColor : SV_Target0)
{
int2 iPos = int2( UVAndScreenPos.xy * AOViewport_Extent );
GTAOCombinedPSandCS(UVAndScreenPos.xy, iPos, OutColor);
}
#if COMPUTE_SHADER
[numthreads(THREADGROUP_SIZEX, THREADGROUP_SIZEY, 1)]
// Compute shader entry point for the combined GTAO pass: reconstructs the
// buffer UV from the thread's pixel and writes the AO result to OutTexture.
void GTAOCombinedCS(
uint2 GroupId : SV_GroupID,
uint2 DispatchThreadId : SV_DispatchThreadID,
uint2 GroupThreadId : SV_GroupThreadID)
{
float OutColor = 0;
int2 PixelPos = DispatchThreadId + AOViewport_ViewportMin;
float2 PixelCenter = (float2)PixelPos + float2(0.5, 0.5);
// GTAOParams[2].zw = 1/TargetSize.
float2 BufferUV = PixelCenter.xy * GTAOParams[2].zw;
GTAOCombinedPSandCS(BufferUV, PixelPos, OutColor);
OutTexture[PixelPos] = OutColor;
}
#endif
/*
*
* INNER INTEGRATE
*
*/
Texture2D HorizonsTexture;
SamplerState HorizonsTextureSampler;
// GTAO inner-integrate pass (split variant): reads the horizon angles produced
// by a previous horizon-search pass from HorizonsTexture and evaluates the
// inner integral for one or two slices. Returns AO (1 = unoccluded).
float GTAOInnerIntegratePSandCS(in float2 UV, in uint2 iPos)
{
// Offset by a fraction of a pixel so half-res sampling hits texel centers.
float2 QuarterOffset = AOSceneViewport_ExtentInverse * 0.125;
UV += QuarterOffset;
// Read the angles buffer
float SceneDepth = GetDepthFromAOInput(UV);
// Early out beyond the user-defined fade-out distance.
if(SceneDepth > ScreenSpaceAOParams[4].w)
{
return 1;
}
float4 Angles = Texture2DSample(HorizonsTexture, HorizonsTextureSampler, UV); // Angles computed from previous pass
// Stored angles are normalized; rescale to radians.
Angles = Angles * PI;
// Get Angle
// Must match the per-pixel randomization used by the horizon-search pass.
float2 RandomVec = GetRandomVector(iPos).xy;
float2 ScreenDir = float2(RandomVec.x, RandomVec.y);
// ViewspacePos and Normal
float3 ViewSpacePos = ScreenToViewPos(UV, SceneDepth);
#if SUBTRATE_GBUFFER_FORMAT==1
const FSubstrateTopLayerData TopLayerData = SubstrateUnpackTopLayerData(Substrate.TopLayerTexture.Load(uint3(iPos, 0)));
float3 WorldNormal = TopLayerData.WorldNormal;
#else
float3 WorldNormal = GetGBufferData(UV, false).WorldNormal;
#endif
float3 ViewSpaceNormal = normalize(mul(WorldNormal, (float3x3)View.TranslatedWorldToView));
float3 ViewDir = -normalize(ViewSpacePos.xyz); // TODO - This is a function of UV only.
float AO = ComputeInnerIntegral(UV, Angles.xy, ScreenDir, ViewDir, ViewSpaceNormal, SceneDepth);
uint NumAngles = (uint) GTAOParams[4].y;
if(NumAngles>1)
{
// Second slice is rotated 90 degrees; average the two results.
ScreenDir.xy = float2(-ScreenDir.y, ScreenDir.x);
AO += ComputeInnerIntegral(UV, Angles.zw, ScreenDir, ViewDir, ViewSpaceNormal, SceneDepth);
AO *=0.5;
}
// Normalize by 2/PI per the GTAO formulation.
AO *= 2.0/PI;
// Fade out based on user defined distance
float Mul = ScreenSpaceAOParams[4].x;
float Add = ScreenSpaceAOParams[4].y;
AO = lerp(AO, 1, saturate(SceneDepth * Mul + Add));
return AO ;
}
// Pixel shader entry point for the inner-integrate pass: derives the integer
// pixel position from the interpolated UV and forwards to the shared body.
void GTAOInnerIntegratePS(in noperspective float4 UVAndScreenPos : TEXCOORD0, out float4 OutColor : SV_Target0)
{
int2 iPos = int2( UVAndScreenPos.xy * AOViewport_Extent );
float AO = GTAOInnerIntegratePSandCS(UVAndScreenPos.xy, iPos);
OutColor = AO;
}
#if COMPUTE_SHADER
[numthreads(THREADGROUP_SIZEX, THREADGROUP_SIZEY, 1)]
void GTAOInnerIntegrateCS(
	uint2 GroupId : SV_GroupID,
	uint2 DispatchThreadId : SV_DispatchThreadID,
	uint2 GroupThreadId : SV_GroupThreadID)
{
	// Compute-shader entry for the inner-integrate pass: map the thread id to a
	// viewport pixel, build the pixel-centre buffer UV and store the AO term.
	// (Removed an unused 'float OutColor = 0;' local.)
	int2 PixelPos = DispatchThreadId + AOViewport_ViewportMin;
	float2 PixelCenter = (float2)PixelPos + float2(0.5, 0.5);
	float2 BufferUV = PixelCenter.xy * GTAOParams[2].zw;	// GTAOParams[2].zw converts pixel centre to buffer UV
	float AO = GTAOInnerIntegratePSandCS(BufferUV, PixelPos);
	OutTexture[PixelPos] = AO;
}
#endif
/*
*
* HORIZON SEARCH ONLY
*
*/
// Shared PS/CS body for the GTAO horizon-search pass.
// Marches the HZB along one or two screen-space directions per pixel and
// records the largest horizon angles on both sides of each direction.
//  UV   - buffer UV of the pixel (before the half-res snapping offset).
//  iPos - integer pixel position, used to fetch the per-pixel random vector.
// Returns angles packed /PI: xy = first direction, zw = optional second.
// (Removed an unused 'float Sum = 0.0;' local.)
float4 HorizonSearchPSandCS(in float2 UV, in uint2 iPos)
{
	float4 OutHorizons = 0;

	// Offset by a fraction of a pixel to ensure we don't hit between pixels when running at half res
	float2 QuarterOffset = AOSceneViewport_ExtentInverse * 0.125;
	UV = UV + QuarterOffset;

	float DeviceZ = LookupDeviceZ(UV );
	float SceneDepth = ConvertFromDeviceZ(DeviceZ);

	// Past the AO fade-out distance there is nothing to search.
	if(SceneDepth > ScreenSpaceAOParams[4].w)
	{
		OutHorizons = 0;
		return OutHorizons;
	}

	float3 ViewSpacePos = ScreenToViewPos(UV,SceneDepth);
	float3 ViewSpaceNormal = GetNormal(UV, ViewSpacePos);
	float3 ViewDir = normalize(-ViewSpacePos.xyz);

	// World-space AO radius converted to a screen-space pixel radius via the FOV.
	const float WorldRadius = GTAOParams[3].y;
	float InvTanHalfFov = ScreenSpaceAOParams[3].w;
	float FOVScale = AOSceneViewport_Extent.y * InvTanHalfFov;

	// Get Radius in ScreenSpace (in pixels); clamp between one pixel per tap and
	// the maximum screen radius to bound the cost of the march.
	float WorldRadiusAdj = WorldRadius * FOVScale;
	float PixelRadius = max( min( WorldRadiusAdj / ViewSpacePos.z, GTAO_MAX_PIXEL_SCREEN_RADIUS ), (float) GTAO_NUMTAPS );
	float StepRadius = PixelRadius / ( (float) GTAO_NUMTAPS + 1 );
	float AttenFactor = 2.0 / (WorldRadius * WorldRadius);

	// Get the randomized Direction to sample and the step offset
	float3 RandomAndOffset = GetRandomVector(iPos);
	float2 RandomVec = RandomAndOffset.xy;
	float Offset = RandomAndOffset.z;

	// GTAOParams[4]: y = number of directions, zw = sin/cos of the inter-direction rotation.
	uint NumAngles = (uint) GTAOParams[4].y;
	float SinDeltaAngle = GTAOParams[4].z;
	float CosDeltaAngle = GTAOParams[4].w;

	float2 ScreenDir = float2(RandomVec.x, RandomVec.y);

	// First Angle
	float2 Angles = SearchForLargestAngleDual_HZB(GTAO_NUMTAPS, UV, ScreenDir* View.BufferSizeAndInvSize.zw, StepRadius,
		Offset, ViewSpacePos, ViewDir, AttenFactor);
	// Store /PI so the angles fit a normalized render target.
	Angles /= PI;

	float2 Angles2=0;
	if(NumAngles>1)
	{
		// Rotate for the next angle
		float2 TempScreenDir = ScreenDir.xy;
		ScreenDir.x = (TempScreenDir.x * CosDeltaAngle) + (TempScreenDir.y * -SinDeltaAngle);
		ScreenDir.y = (TempScreenDir.x * SinDeltaAngle) + (TempScreenDir.y * CosDeltaAngle);

		Angles2 = SearchForLargestAngleDual_HZB(GTAO_NUMTAPS, UV, ScreenDir* View.BufferSizeAndInvSize.zw, StepRadius,
			Offset, ViewSpacePos, ViewDir, AttenFactor);
		Angles2 /= PI;
	}

	OutHorizons.xy = Angles;
	OutHorizons.zw = Angles2;
	return OutHorizons;
}
// Pixel-shader entry: convert the interpolated UV to an integer pixel
// coordinate and run the shared horizon-search routine.
void HorizonSearchPS(in noperspective float4 UVAndScreenPos : TEXCOORD0, float4 SvPosition : SV_POSITION, out float4 OutColor : SV_Target0)
{
	const float2 UV = UVAndScreenPos.xy;
	const int2 PixelCoord = int2(UV * AOViewport_Extent);
	OutColor = HorizonSearchPSandCS(UV, PixelCoord);
}
#if COMPUTE_SHADER
[numthreads(THREADGROUP_SIZEX, THREADGROUP_SIZEY, 1)]
void HorizonSearchCS(
	uint2 GroupId : SV_GroupID,
	uint2 DispatchThreadId : SV_DispatchThreadID,
	uint2 GroupThreadId : SV_GroupThreadID)
{
	// Compute-shader entry for the horizon search: map the thread id to a
	// viewport pixel, build the pixel-centre buffer UV and write the packed
	// horizon angles. (Removed an unused 'float2 OutColor = 0;' local.)
	int2 PixelPos = DispatchThreadId + AOViewport_ViewportMin;
	float2 PixelCenter = (float2)PixelPos + float2(0.5, 0.5);
	float2 BufferUV = PixelCenter.xy * GTAOParams[2].zw;	// GTAOParams[2].zw converts pixel centre to buffer UV
	float4 Horizons = HorizonSearchPSandCS(BufferUV,PixelPos);
	HorizonOutTexture[PixelPos] = Horizons;
}
#endif
/*
*
* TEMPORAL FILTER
*
*/
// Per-pixel velocity buffer used for history reprojection in the temporal filter.
Texture2D SceneVelocityTexture;
SamplerState SceneVelocityTextureSampler;
// NOTE(review): BlendParams is not referenced anywhere in this section --
// presumably consumed by code outside this chunk or legacy; confirm before removing.
float4 BlendParams;
// Reproject the current-frame (UV, device depth) into the previous frame.
// Default path: transform through View.ClipToPrevClip (camera motion only).
// If the velocity buffer holds a valid sample (x > 0), the per-object
// velocity overrides the camera-only estimate.
// Returns float3(PrevUV, previous-frame device Z).
float3 ReprojectPos(float2 UV, float Depth)
{
	const float2 CurrScreen = (UV.xy - View.ScreenPositionScaleBias.wz) / View.ScreenPositionScaleBias.xy;
	const float4 CurrClip = float4(CurrScreen, Depth, 1);
	const float4 PrevClip = mul(CurrClip, View.ClipToPrevClip);

	float2 PrevScreen = PrevClip.xy / PrevClip.w;

	const float4 EncodedVelocity = Texture2DSampleLevel(SceneVelocityTexture, SceneVelocityTextureSampler, UV, 0);
	if (EncodedVelocity.x > 0.0)
	{
		PrevScreen = CurrClip.xy - DecodeVelocityFromTexture(EncodedVelocity).xy;
	}

	const float2 PrevUV = PrevScreen * PrevScreenPositionScaleBias.xy + PrevScreenPositionScaleBias.zw;
	return float3(PrevUV, PrevClip.z / PrevClip.w);
}
// Manual bilinear fetch of the AO history buffer where each of the four taps
// is clamped to [MinAO, MaxAO] BEFORE the weights are applied, rejecting
// stale history per-texel rather than after filtering.
float ReadHistoryClamp(float2 UV, float MinAO, float MaxAO)
{
	// Locate the top-left texel of the 2x2 bilinear footprint and the
	// fractional position inside it.
	const float2 PixUV = (UV * HistoryTextureSize) - 0.5;
	const float2 BaseTexel = floor(PixUV);
	const float2 Frac = PixUV - BaseTexel;

	// Snap to the centre of the top-left texel.
	const float2 BaseUV = (BaseTexel * HistoryTexturePixelSize) + (HistoryTexturePixelSize * 0.5);
	const float2 dUV = HistoryTexturePixelSize;

	float Weights[4];
	Weights[0] = (1.0 - Frac.x) * (1.0 - Frac.y);
	Weights[1] = Frac.x * (1.0 - Frac.y);
	Weights[2] = (1.0 - Frac.x) * Frac.y;
	Weights[3] = Frac.x * Frac.y;

	// TODO - Use GatherR when available
	float Taps[4];
	Taps[0] = Texture2DSample(HistoryTexture, HistoryTextureSampler, BaseUV).r;
	Taps[1] = Texture2DSample(HistoryTexture, HistoryTextureSampler, BaseUV + float2(dUV.x, 0)).r;
	Taps[2] = Texture2DSample(HistoryTexture, HistoryTextureSampler, BaseUV + float2(0, dUV.y)).r;
	Taps[3] = Texture2DSample(HistoryTexture, HistoryTextureSampler, BaseUV + float2(dUV.x, dUV.y)).r;

	float VisHistory = 0;
	for (int i = 0; i < 4; i++)
	{
		VisHistory += Weights[i] * clamp(Taps[i], MinAO, MaxAO);
	}
	return VisHistory;
}
// Freshly computed AO for the current frame (input to the temporal filter)
// and its texel size, set from C++.
Texture2D GTAOTemporalInput;
SamplerState GTAOTemporalSampler;
float2 GTAOTemporalInputPixelSize;
// Compute a neighbourhood-based [MinAO, MaxAO] clamp range from the four
// diagonal taps 1.5 texels away. With GTAO_VARIANCE_CLIPPING the range is
// mean +/- 0.8 sigma of the neighbourhood; otherwise it is the plain
// min/max of the neighbours and the centre value.
void NeighbourhoodClamp(float2 UV, float BaseAO, inout float MinAO, inout float MaxAO)
{
	const float2 dUV = GTAOTemporalInputPixelSize * 1.5;

#define NumSamples 4
	float Neighbours[NumSamples];
	Neighbours[0] = Texture2DSample(GTAOTemporalInput, GTAOTemporalSampler, UV + float2(-dUV.x, -dUV.y)).r;
	Neighbours[1] = Texture2DSample(GTAOTemporalInput, GTAOTemporalSampler, UV + float2(-dUV.x,  dUV.y)).r;
	Neighbours[2] = Texture2DSample(GTAOTemporalInput, GTAOTemporalSampler, UV + float2( dUV.x, -dUV.y)).r;
	Neighbours[3] = Texture2DSample(GTAOTemporalInput, GTAOTemporalSampler, UV + float2( dUV.x,  dUV.y)).r;

#if GTAO_VARIANCE_CLIPPING
	// First and second moments of the neighbourhood.
	float Sum = 0;
	float SumSq = 0;
	for (int i = 0; i < NumSamples; i++)
	{
		Sum += Neighbours[i];
		SumSq += Neighbours[i] * Neighbours[i];
	}
	const float Mu = Sum / NumSamples;
	const float Sigma = sqrt(SumSq / NumSamples - (Mu * Mu));
	MinAO = max(Mu - Sigma * 0.8, 0.0);
	MaxAO = min(Mu + Sigma * 0.8, 1.0);
#else
	MinAO = min(BaseAO, min(min(Neighbours[0], Neighbours[1]), min(Neighbours[2], Neighbours[3])));
	MaxAO = max(BaseAO, max(max(Neighbours[0], Neighbours[1]), max(Neighbours[2], Neighbours[3])));
#endif
}
// Similarity of two velocities in [0,1]: 1 when identical, falling to 0 as
// the summed component difference grows (the *100 scale makes even small
// mismatches count as full disagreement).
float CompareVeloc(float2 V1, float2 V2)
{
	const float2 Delta = V1 - V2;
	return 1 - saturate(abs(Delta.x + Delta.y) * 100);
}
// Shared PS/CS body of the GTAO temporal filter: reproject the AO history,
// clamp it to a velocity-dependent range around the fresh AO value, and blend.
// (Removed dead locals 'CurrDepth' and 'CurrDepthReproject', and a dead store
// to 'HistoryAO' that was immediately overwritten by the lerp below.)
void GTAOTemporalFilterPSAndCS(float2 UV, inout float OutAO)
{
	float BlendWeight = GTAOParams[4].x;

	// Offset by a fraction of a pixel so half-res sampling doesn't land between texels.
	float2 QuarterOffset = AOSceneViewport_ExtentInverse * 0.125;
	UV = UV + QuarterOffset;

	// Latest AO value
	float NewAO = Texture2DSample(GTAOTemporalInput, GTAOTemporalSampler, UV).r;

	// Current depth of the rendered Scene
	float CurrDepthDeviceZ = Texture2DSample(ZCurrTexture, ZCurrTextureSampler, UV).r;

	// Where this pixel was last frame
	float3 PrevUVDepth = ReprojectPos( UV, CurrDepthDeviceZ);
	float2 PrevUV = PrevUVDepth.xy;
	float2 PixVelocity = UV - PrevUV;
	float VelocityMag = saturate(length(PixVelocity)*100);

	// Velocity at the reprojected location; a mismatch with this pixel's velocity
	// indicates disocclusion, so we fall back to the un-reprojected history below.
	float2 DestVeloc=0;
	{
		float DestDeviceZ = Texture2DSample(ZCurrTexture, ZCurrTextureSampler, PrevUVDepth.xy).r;
		float3 Reproj = ReprojectPos( PrevUVDepth.xy, DestDeviceZ);
		DestVeloc = PrevUVDepth.xy - Reproj.xy;
	}
	float VelocCompare = CompareVeloc(PixVelocity, DestVeloc);

	// Acceptable range around the new AO; shrinks to zero as velocity grows.
	float RangeVal = lerp(0.1, 0.00, VelocityMag);
	float MinAO = saturate(NewAO - RangeVal);
	float MaxAO = saturate(NewAO + RangeVal);

	// Clamped history, both at the reprojected UV and at the current UV.
	float HistoryPrevUV = ReadHistoryClamp(PrevUV, MinAO, MaxAO);
	float HistoryThisUV = Texture2DSample(HistoryTexture, HistoryTextureSampler, UV ).r;
	HistoryThisUV = clamp(HistoryThisUV, MinAO, MaxAO);

	// Trust the reprojected history only where the two velocities agree.
	float HistoryAO = lerp(HistoryThisUV, HistoryPrevUV, VelocCompare);

	OutAO = lerp(HistoryAO, NewAO, BlendWeight);
}
// Pixel-shader wrapper around the shared temporal filter body.
void GTAOTemporalFilterPS(in noperspective float4 UVAndScreenPos : TEXCOORD0, float4 SvPosition : SV_POSITION, out float4 OutColor : SV_Target0)
{
	float FilteredAO = 0;
	GTAOTemporalFilterPSAndCS(UVAndScreenPos.xy, FilteredAO);
	OutColor = FilteredAO;
}
#if COMPUTE_SHADER
[numthreads(THREADGROUP_SIZEX, THREADGROUP_SIZEY, 1)]
void GTAOTemporalFilterCS(
	uint2 GroupId : SV_GroupID,
	uint2 DispatchThreadId : SV_DispatchThreadID,
	uint2 GroupThreadId : SV_GroupThreadID)
{
	// Compute-shader wrapper: map the thread id to a viewport pixel, build the
	// pixel-centre UV and run the shared temporal filter.
	const int2 PixelPos = DispatchThreadId + AOViewport_ViewportMin;
	const float2 BufferUV = ((float2)PixelPos + float2(0.5, 0.5)) * AOViewport_ExtentInverse;

	float FilteredAO = 0;
	GTAOTemporalFilterPSAndCS(BufferUV, FilteredAO);
	OutTexture[PixelPos] = FilteredAO;
}
#endif
/*
* UPSAMPLE FILTER
*
*/
// Low-res AO input to the upsample pass and its texel size, set from C++.
Texture2D GTAOUpsampleTexture;
SamplerState GTAOUpsampleSampler;
float2 GTAOUpsamplePixelSize;
// Upsample the AO buffer by taking the minimum of four quarter-texel diagonal
// taps; using min() rather than an average keeps contact darkening crisp.
// (Removed a centre-texel sample ('AOC') that was fetched but never used --
// it never contributed to the result and cost a texture read per pixel.)
float GTAOUpsamplePSAndCS(float2 UV)
{
	float2 Offset = GTAOUpsamplePixelSize * 0.25;

	float AO0 = saturate( Texture2DSample(GTAOUpsampleTexture, GTAOUpsampleSampler, UV + float2(-Offset.x, -Offset.y)) .r);
	float AO1 = saturate( Texture2DSample(GTAOUpsampleTexture, GTAOUpsampleSampler, UV + float2(Offset.x, -Offset.y)) .r);
	float AO2 = saturate( Texture2DSample(GTAOUpsampleTexture, GTAOUpsampleSampler, UV + float2(-Offset.x, Offset.y)) .r);
	float AO3 = saturate( Texture2DSample(GTAOUpsampleTexture, GTAOUpsampleSampler, UV + float2(Offset.x, Offset.y)) .r);

	float AO = min(min(AO0, AO1), min(AO2, AO3));
	return AO;
}
// Pixel-shader wrapper for the shared upsample routine.
void GTAOUpsamplePS(in noperspective float4 UVAndScreenPos : TEXCOORD0, out float4 OutColor : SV_Target0)
{
	const float AO = GTAOUpsamplePSAndCS(UVAndScreenPos.xy);
	OutColor = AO;
}
#if COMPUTE_SHADER
// Inputs to the compute-shader spatial filter: AO term, depth, and filter
// extents/width parameters set from C++.
Texture2D GTAOSpatialFilterTexture;
Texture2D GTAOSpatialFilterDepthTexture;
uint2 GTAOSpatialFilterExtents;
float4 GTAOSpatialFilterParams;
float4 GTAOSpatialFilterWidth;
// The 5x5 filter works on a threadgroup of size 16x8 (128 pixels)
// We need to read in the 16x8 and a 2 pixel border around. So this is 20x12 (240 pixels)
// Each thread reads in 2 pixels each
// We make the array 32 wide so it plays better with bank conflicts
// NOTE(review): the comment above says "32 wide" but LDS_WIDTH is 20 -- the
// padding was presumably dropped at some point; confirm which is intended.
#define LDS_WIDTH 20
groupshared float AOData[ LDS_WIDTH*12];
groupshared float ZData[ LDS_WIDTH*12];
// Map a coordinate relative to the 16x8 inner tile into the 20x12 LDS array,
// which carries a 2-pixel border on each side.
int GetLDSLocation(int x, int y)
{
	return ((y + 2) * LDS_WIDTH) + (x + 2);
}
// Fetch AO from LDS at a precomputed linear index (see GetLDSLocation).
float GetAOLin(int loc)
{
	return AOData[loc];
}
// Fetch depth from LDS at a precomputed linear index (see GetLDSLocation).
float GetZLin(int loc)
{
	return ZData[loc];
}
// AO lookup using tile-relative coordinates; the 2-pixel border offset is
// applied here.
float GetAO(int x, int y)
{
	return AOData[((y + 2) * LDS_WIDTH) + (x + 2)];
}
// Depth lookup using tile-relative coordinates; the 2-pixel border offset is
// applied here.
float GetZ(int x, int y)
{
	return ZData[((y + 2) * LDS_WIDTH) + (x + 2)];
}
[numthreads(16, 8, 1)]
void GTAOSpatialFilterCS(
	int GroupIndex: SV_GroupIndex,
	uint2 GroupId : SV_GroupID,
	uint2 DispatchThreadId : SV_DispatchThreadID,
	uint2 GroupThreadId : SV_GroupThreadID)
{
	// Depth-aware (bilateral) spatial blur of the AO term, run per 16x8 tile.
	// The tile plus a 2-pixel border is staged in LDS, local depth gradients
	// are estimated per pixel, and taps are weighted by how far their depth is
	// from the locally fitted depth plane.
	// (Removed unused locals 'ThisZLin' and 'SimpleBlur' and the commented-out
	// accumulation that used them.)
	int2 GTId = int2(GroupThreadId);

	// Position on the screen We care about
	int2 PixelPos = DispatchThreadId + AOViewport_ViewportMin;

	// Firstly get the origin in the screen of the 16x8 inner box, then back up
	// two pixels to cover the border.
	int2 FullGroupOrigin = int2(GroupId.x * 16, GroupId.y * 8) + AOViewport_ViewportMin;
	int2 FullGroupOriginM2 = FullGroupOrigin.xy - int2(2,2);

	uint pixIdx = (GroupIndex*2);
	float DownsampleFactor = GTAOSpatialFilterParams.x;

	// Cooperative load: 20x12 = 240 texels for 128 threads, two adjacent texels
	// per thread. Note the depth texture may be higher-res than the AO, hence
	// the DownsampleFactor scaling of the depth read coordinate.
	if(pixIdx < (20*12) )
	{
		uint XPos = pixIdx%20;
		uint YPos = pixIdx/20;
		int LDSPos = (YPos*LDS_WIDTH) + XPos;
		int2 ReadXYAO = FullGroupOriginM2 + int2(XPos,YPos);
		int2 ReadXYZ = ReadXYAO*DownsampleFactor;

		float AO = GTAOSpatialFilterTexture.Load(int3(ReadXYAO, 0)).r;
		float Z = GTAOSpatialFilterDepthTexture.Load( int3(ReadXYZ, 0)).r;
		AOData[ LDSPos ] = AO;
		ZData[ LDSPos ] = Z;

		// Next pixel (immediately to the right)
		LDSPos++;
		ReadXYAO.x +=1;
		ReadXYZ.x +=DownsampleFactor;
		AO = GTAOSpatialFilterTexture.Load(int3(ReadXYAO, 0)).r;
		Z = GTAOSpatialFilterDepthTexture.Load( int3(ReadXYZ, 0)).r;
		AOData[ LDSPos ] = AO;
		ZData[ LDSPos ] = Z;
	}
	GroupMemoryBarrierWithGroupSync();

	// Get the differences in Z at this pixel. This is needed for the bilateral filter
	float ThisZ = GetZ(GTId.x, GTId.y);
	float2 ZDiff;

	int FilterMin = int(GTAOSpatialFilterWidth.x);
	int FilterMax = int(GTAOSpatialFilterWidth.y);
	int LDSBase = GetLDSLocation(GTId.x + FilterMin, GTId.y + FilterMin);
	int LDSCentre = GetLDSLocation(GTId.x , GTId.y);

	// Get X Delta: extrapolate towards the centre from either side and keep the
	// gradient of whichever side predicts the centre depth better, so we don't
	// build a plane across a depth discontinuity.
	{
		float XM2Z = GetZLin(LDSCentre-2);
		float XM1Z = GetZLin(LDSCentre-1);
		float XP1Z = GetZLin(LDSCentre+1);
		float XP2Z = GetZLin(LDSCentre+2);

		// Get extrapolated point either side
		float C1 = abs((XM1Z + (XM1Z - XM2Z)) - ThisZ);
		float C2 = abs((XP1Z + (XP1Z - XP2Z)) - ThisZ);
		if(C1 < C2)
		{
			ZDiff.x = XM1Z - XM2Z;
		}
		else
		{
			ZDiff.x = XP2Z - XP1Z;
		}
	}
	// Get Y Delta (same scheme as X)
	{
		float YM2Z = GetZLin(LDSCentre-(2*LDS_WIDTH));
		float YM1Z = GetZLin(LDSCentre-(1*LDS_WIDTH));
		float YP1Z = GetZLin(LDSCentre+(1*LDS_WIDTH));
		float YP2Z = GetZLin(LDSCentre+(2*LDS_WIDTH));

		// Get extrapolated point either side
		float C1 = abs((YM1Z + (YM1Z - YM2Z)) - ThisZ);
		float C2 = abs((YP1Z + (YP1Z - YP2Z)) - ThisZ);
		if(C1 < C2)
		{
			ZDiff.y = YM1Z - YM2Z;
		}
		else
		{
			ZDiff.y = YP2Z - YP1Z;
		}
	}

	// Do the blur
	float SumAO = 0;
	float SumWeight = 0;
	int x,y;

	// Plane-predicted depth at the top-left corner of the filter window.
	float DepthBase = ThisZ +(ZDiff.x*FilterMin) + (ZDiff.y*FilterMin);
	for(y=FilterMin; y<=FilterMax; y++)
	{
		float PlaneZ = DepthBase;
		int LDSLineBase = LDSBase;
		LDSBase += LDS_WIDTH;
		for(x=FilterMin; x<=FilterMax; x++)
		{
			float Sample_AO = GetAOLin(LDSLineBase);
			float SampleZ = GetZLin( LDSLineBase);
			LDSLineBase++;

			// Get the bilateral weight. This is a function of the difference in height
			// between the plane equation and the sample depth.
			float SampleZDiff = abs(PlaneZ - SampleZ);
			const float SpatialFilterWeight = 20000;
			float Weight = 1.0f - saturate(SampleZDiff*SpatialFilterWeight );

			SumAO += Sample_AO * Weight;
			SumWeight += Weight;

			PlaneZ += ZDiff.x;
		}
		DepthBase += ZDiff.y;
	}
	SumAO /=SumWeight;
	SumAO *= (PI/2.0) ;

	// user adjust AO (intensity/power remap driven from C++)
	float AmbientOcclusionIntensity = ScreenSpaceAOParams[0].w;
	float AmbientOcclusionPower = ScreenSpaceAOParams[0].x*0.5;

	SumAO = 1 - (1 - pow(abs(SumAO), AmbientOcclusionPower)) * AmbientOcclusionIntensity;
	OutTexture[PixelPos] = SumAO;
}
#endif
// NOTE(review): SpatialDiff is not referenced in this section --
// GTAOSpatialFilterPS below derives its axis step from PostprocessInput0Size
// instead; confirm intended usage before removing.
float2 SpatialDiff;
// Single axis blur filter for Pixel Shaders.
// 5-tap depth-aware blur along one axis: taps whose depth deviates from the
// centre by more than the local depth slope get their weight reduced.
// BUGFIXES in this revision:
//  - The +/-2 depth taps previously re-sampled the +/-1 depth locations
//    (copy-paste), so the far taps were weighted against the wrong depth.
//  - The function computed TotalAO and then discarded it, outputting the
//    unfiltered centre tap AO_C; it now outputs the filtered result.
void GTAOSpatialFilterPS(float4 UVAndScreenPos : TEXCOORD0,
	float4 SvPosition : SV_POSITION,
	out float4 OutColor : SV_Target0)
{
	float2 UV = UVAndScreenPos.xy;

	float2 Offset = PostprocessInput0Size.zw;
	float2 Offset2 = Offset*2;

	// Centre tap.
	float AO_C = Texture2DSample(PostprocessInput0, PostprocessInput0Sampler, UV).r;
	float Z_C = ConvertFromDeviceZ( Texture2DSample(ZCurrTexture, ZCurrTextureSampler, UV).r);

	// +/-1 and +/-2 texel taps along the filter axis.
	float AO_M1 = Texture2DSample(PostprocessInput0, PostprocessInput0Sampler, UV - Offset).r;
	float Z_M1 = ConvertFromDeviceZ( Texture2DSample(ZCurrTexture, ZCurrTextureSampler, UV - Offset).r);
	float AO_P1 = Texture2DSample(PostprocessInput0, PostprocessInput0Sampler, UV + Offset).r;
	float Z_P1 = ConvertFromDeviceZ( Texture2DSample(ZCurrTexture, ZCurrTextureSampler, UV + Offset).r);
	float AO_M2 = Texture2DSample(PostprocessInput0, PostprocessInput0Sampler, UV - Offset2).r;
	float Z_M2 = ConvertFromDeviceZ( Texture2DSample(ZCurrTexture, ZCurrTextureSampler, UV - Offset2).r);
	float AO_P2 = Texture2DSample(PostprocessInput0, PostprocessInput0Sampler, UV + Offset2).r;
	float Z_P2 = ConvertFromDeviceZ( Texture2DSample(ZCurrTexture, ZCurrTextureSampler, UV + Offset2).r);

	// Local depth slope; differences beyond this are treated as discontinuities.
	float DiffZ = min( abs(Z_C - Z_M1), abs(Z_C - Z_P1) );

	const float SpatialFilterWeight = 1000;
	float SampleZDiff=0;
	float Weight=0;

	// Blend the values; the centre tap always has full weight.
	float SumWeight = 1.0;
	float TotalAO = AO_C;

	// Minus 2
	SampleZDiff = abs(Z_C - Z_M2) - DiffZ*2;
	Weight = 1.0f - saturate(SampleZDiff*SpatialFilterWeight );
	TotalAO += AO_M2 * Weight;
	SumWeight += Weight;

	// Minus 1
	SampleZDiff = abs(Z_C - Z_M1) - DiffZ;
	Weight = 1.0f - saturate(SampleZDiff*SpatialFilterWeight );
	TotalAO += AO_M1 * Weight;
	SumWeight += Weight;

	// Plus 2
	SampleZDiff = abs(Z_C - Z_P2) - DiffZ*2;
	Weight = 1.0f - saturate(SampleZDiff*SpatialFilterWeight );
	TotalAO += AO_P2 * Weight;
	SumWeight += Weight;

	// Plus 1
	SampleZDiff = abs(Z_C - Z_P1) - DiffZ;
	Weight = 1.0f - saturate(SampleZDiff*SpatialFilterWeight );
	TotalAO += AO_P1 * Weight;
	SumWeight += Weight;

	TotalAO /= SumWeight;

	// Output the depth-weighted blend.
	OutColor = TotalAO;
}
#if COMPUTE_SHADER
// LDS for the SmartUpsample kernel (whose body is currently compiled out):
// full-res depth tile plus border, and quarter-res AO tile plus border.
#define UPSAMPLE_LDS_WIDTH 20
groupshared float FullZData[ (16+1) * UPSAMPLE_LDS_WIDTH ];
groupshared float LowAOData[ 9*10];
// Interpolate between two AO samples according to where ZMid sits between
// their depths. Near-equal depths would make the ratio unstable, so fall
// back to a plain average in that case.
float GetBlendAO(float AO1, float AO2, float Z1, float Z2, float ZMid)
{
	const float dZ = Z2 - Z1;
	const float Epsilon = 0.00001f;
	if (abs(dZ) < Epsilon)
	{
		return (AO1 + AO2) * 0.5;
	}
	const float Ratio = saturate((ZMid - Z1) * (1.0 / dZ));
	return AO1 * (1.0 - Ratio) + AO2 * Ratio;
}
// NOTE(review): the entire body of this kernel is compiled out with '#if 0'
// and is an unfinished work-in-progress -- it references identifiers that are
// never defined (FullGroupOriginM2, XPos, YPos, TopLeftZ, TopRightZ,
// BotRightZ, SetAOVal, SetZVal, ZReadTexture, SpatialFilterParams), so it
// cannot be enabled as-is. The dispatch currently produces no writes. Kept
// byte-identical; either finish or delete it.
[numthreads(8, 8, 1)]
void SmartUpsample(
int GroupIndex: SV_GroupIndex,
uint2 GroupId : SV_GroupID,
uint2 DispatchThreadId : SV_DispatchThreadID,
uint2 GroupThreadId : SV_GroupThreadID)
{
#if 0
int2 GTId = int2(GroupThreadId);
// Get the pixel Pos of the final position
int2 PixelPos = DispatchThreadId*2 + ScreenSpaceAOParams[5].zw;
// Each thread will compute 4 output colours . We need a 1 pixel border around the depth buffer so each thread will read in 5 pixel into the 17x17 buffer
int2 FullGroupOrigin = int2(GroupId.x * THREADGROUP_SIZEX, GroupId.y * THREADGROUP_SIZEY) + ScreenSpaceAOParams[5].zw;
uint2 TileOrigin = GroupId.xy *16;
// Read in 4 pixels
uint2 PixelPosInTile = GroupThreadId.xy *2;
uint FullZLDSOffset = (PixelPosInTile.y *UPSAMPLE_LDS_WIDTH) + PixelPosInTile.x;
SetAOVal(PostprocessInput0.Load(int3(FullGroupOriginM2 + int2(XPos,YPos), 0)).r, XPos, YPos);
SetZVal( ZReadTexture.Load( int3(FullGroupOriginM2 + int2(XPos,YPos), 0)).r, XPos, YPos);
float BotLeftZ = ( ZReadTexture.Load( int3(TileOrigin + PixelPosInTile +uint2(1,0), 0)).r );
SetAOVal(PostprocessInput0.Load(int3(FullGroupOriginM2 + int2(XPos,YPos), 0)).r, XPos, YPos);
SetZVal( ZReadTexture.Load( int3(FullGroupOriginM2 + int2(XPos,YPos), 0)).r, XPos, YPos);
FullZData[FullZLDSOffset] = TopLeftZ ;
FullZData[FullZLDSOffset+1] = TopRightZ ;
FullZData[FullZLDSOffset+UPSAMPLE_LDS_WIDTH] = BotLeftZ ;
FullZData[FullZLDSOffset+UPSAMPLE_LDS_WIDTH+1] = BotRightZ ;
// The final pixel needs to be the border (17+16 == 33 of them)
uint2 BorderXY;
uint BorderLDSOffset =0;
if(GroupIndex < 17)
if (any(DispatchThreadId >= (uint2)SpatialFilterParams.zw))
{
BorderXY = uint2(16,GroupIndex);
BorderLDSOffset = 16 + (GroupIndex*UPSAMPLE_LDS_WIDTH);
}
else
{
BorderXY = uint2(GroupIndex-17, 16);
BorderLDSOffset = (GroupIndex-17) + (UPSAMPLE_LDS_WIDTH*16);
}
if(GroupIndex < 33)
{
FullZData[BorderLDSOffset] = ( ZReadTexture.Load( int3(TileOrigin + BorderXY, 0)).r );
}
GroupMemoryBarrierWithGroupSync();
// Now read in the Color data which is 1/4 res
uint2 LowTileOrigin = GroupId.xy *8;
uint2 LowPixelPosInTile = GroupThreadId.xy;
uint LowAOLDSOffset = (LowPixelPosInTile.y *9) + LowPixelPosInTile.x;
float ThisAO = PostprocessInput0.Load(int3(LowTileOrigin + LowPixelPosInTile , 0)).r;
LowAOData[LowAOLDSOffset] = ThisAO;
// Read in the border
if(GroupIndex < 9)
{
BorderXY = uint2(8,GroupIndex);
BorderLDSOffset = 8 + (GroupIndex*9);
}
else
{
BorderXY = uint2(GroupIndex-9, 8);
BorderLDSOffset = (GroupIndex-9) + (9*8);
}
GroupMemoryBarrierWithGroupSync();
if(GroupIndex < 17)
LowAOData[BorderLDSOffset] = PostprocessInput0.Load(int3(LowTileOrigin + BorderXY , 0)).r;
GroupMemoryBarrierWithGroupSync();
// All Data read we can now Process the 4 AO Values
float FinalAO_TL;
float FinalAO_TR;
float FinalAO_BL;
float FinalAO_BR;
// Top Left - Easy this is the same as the low res colour read in
FinalAO_TL = ThisAO;
// Top Right - This is a weighted blend of the Top Left and the pixel to the right
float Right_AO = LowAOData[LowAOLDSOffset+1];
float Ext_Z = FullZData[FullZLDSOffset+2];
FinalAO_TR = GetBlendAO(ThisAO, Right_AO, TopLeftZ, Ext_Z, TopRightZ);
// Bottom Left - This is a weighted blend of the Top Left and the pixel below
float Bottom_AO = LowAOData[LowAOLDSOffset+9];
Ext_Z = FullZData[FullZLDSOffset+(2*UPSAMPLE_LDS_WIDTH)];
FinalAO_BL = GetBlendAO(ThisAO, Bottom_AO, TopLeftZ, Ext_Z, BotLeftZ);
// Bottom Right - This is a weighted blend of the Top Left and the pixel to the bottom right
float BotRight_AO = LowAOData[LowAOLDSOffset+9+1];
Ext_Z = FullZData[FullZLDSOffset+(2*UPSAMPLE_LDS_WIDTH)+2];
FinalAO_BR = GetBlendAO(ThisAO, BotRight_AO, TopLeftZ, Ext_Z, BotRightZ);
OutTexture[PixelPos + uint2(0,0) ] = FinalAO_TL ;
OutTexture[PixelPos + uint2(1,0) ] = FinalAO_TR ;
OutTexture[PixelPos + uint2(0,1) ] = FinalAO_BL ;
OutTexture[PixelPos + uint2(1,1) ] = FinalAO_BR ;
#endif
}
#endif