UnrealEngine/Engine/Shaders/Private/VariableRateShading/VRSShadingRateCalculate.usf

// Copyright Epic Games, Inc. All Rights Reserved.

#include "VRSShadingRateCommon.ush"
#include "../Common.ush"
#include "../ColorSpace.ush"

// View rectangle in luminance texture pixels: xy is added to the thread's pixel coordinate when addressing
// LuminanceTexture, and zw - xy is treated as the dimensions of the view.
float4 ViewRect;
// Absolute Sobel response above which the 2x1 (Sobel X) or 1x2 (Sobel Y) shading rate is ruled out.
float EdgeThreshold;
// Same test as EdgeThreshold, but the result is recorded in the bits shifted up by CONSERVATIVE_SHADING_RATE_SHIFT.
float ConservativeEdgeThreshold;

// Single channel luminance input that the Sobel filter runs over.
Texture2D<float> LuminanceTexture;
// Written once per thread group (indexed by GroupId) when OUTPUT_HARDWARE_IMAGE is set.
RWTexture2D<uint> HardwareShadingRateImage;
// Written once per 2x2 pixel quad when OUTPUT_SOFTWARE_IMAGE is set.
RWTexture2D<uint> SoftwareShadingRateImage;

// Per-group copy of the tile's luminance, indexed [y][x]. Only filled when OUTPUT_SOFTWARE_IMAGE is off.
groupshared half LumaMatrix[HARDWARE_TILE_SIZE][HARDWARE_TILE_SIZE];
// Per-group accumulator: starts as the full rate mask and is AND-ed with the rates each thread supports.
groupshared uint TileSupportedShadingRate;

// Width and height of the Sobel kernels.
#define SOBEL_DIMENSION 3
// Gates the QuadReadAcross* reduction used when building the software image.
#define USE_WAVE_OP COMPILER_SUPPORTS_WAVE_BIT_ORAND

// Runs a 3x3 Sobel filter over the luminance texture to find, per pixel, which coarse shading rates would cross an edge,
// then reduces the per-pixel results into the software image (one texel per 2x2 quad) and/or the hardware image
// (one texel per thread group), depending on OUTPUT_SOFTWARE_IMAGE / OUTPUT_HARDWARE_IMAGE.
//
// DispatchThreadId	- pixel coordinate relative to the ViewRect origin
// GroupThreadId	- pixel coordinate inside the HARDWARE_TILE_SIZE x HARDWARE_TILE_SIZE group
// GroupId			- group coordinate, also used as the texel written in HardwareShadingRateImage
void CalculateShadingRateImageInner(
	uint2 DispatchThreadId,
	uint2 GroupThreadId,
	uint2 GroupId)
{
	// Regular rate bits in the low part, conservative rate bits shifted up by CONSERVATIVE_SHADING_RATE_SHIFT
	const uint VRSBitMask = D3D12_SHADING_RATE_2X2 | (D3D12_SHADING_RATE_2X2 << CONSERVATIVE_SHADING_RATE_SHIFT);
	const uint SobelWidth = SOBEL_DIMENSION / 2;

	uint2 LuminanceTextureCoord = uint2(int(ViewRect.x + DispatchThreadId.x), int(ViewRect.y + DispatchThreadId.y));

#if !OUTPUT_SOFTWARE_IMAGE
	// HW only mode: stage this thread's luminance in groupshared memory, the Sobel taps below read from there
	LumaMatrix[GroupThreadId.y][GroupThreadId.x] = LuminanceTexture[LuminanceTextureCoord];
#endif

	// Every thread stores the same value; the barrier makes it (and LumaMatrix) visible to the whole group
	TileSupportedShadingRate = VRSBitMask;
	GroupMemoryBarrierWithGroupSync();

	// Exclude pixels outside our ViewRect
	float2 Dimensions = ViewRect.zw - ViewRect.xy;
	bool InvalidThread = (
#if !OUTPUT_SOFTWARE_IMAGE
		// When in HW only mode, we only read from the groupshared LumaMatrix, and so skip sobel operations on pixels at the edge of a group
		GroupThreadId.x == HARDWARE_TILE_SIZE - 1 ||
		GroupThreadId.y == HARDWARE_TILE_SIZE - 1 ||
		GroupThreadId.x == 0 ||
		GroupThreadId.y == 0 ||
#endif
		(float)DispatchThreadId.x >= Dimensions.x - 1 ||
		(float)DispatchThreadId.y >= Dimensions.y - 1);

	// If Sobel X is over a certain threshold, we rule out 2x1 rates. If Sobel Y is over that threshold, we rule out 1x2 rates.
	// Later, taking the inverse will give us the roughest supported rate for the current lane.
	uint UnsupportedShadingRate = 0x0;

	if(!InvalidThread)
	{
		float SobelXSum = 0.0;
		const float SobelX[SOBEL_DIMENSION][SOBEL_DIMENSION] =
		{
			{-1.0, 0.0, 1.0},
			{-2.0, 0.0, 2.0},
			{-1.0, 0.0, 1.0}
		};

		float SobelYSum = 0.0;
		const float SobelY[SOBEL_DIMENSION][SOBEL_DIMENSION] =
		{
			{-1.0, -2.0, -1.0},
			{ 0.0,  0.0,  0.0},
			{ 1.0,  2.0,  1.0}
		};

#if OUTPUT_SOFTWARE_IMAGE
		// The right/bottom taps are kept in range by InvalidThread above, but the first column/row of the view would
		// otherwise sample ViewRect.xy - 1: an out of bounds read when the view starts at 0 (the unsigned coordinate
		// wraps around), or texels belonging to a neighbouring view. Clamp those taps to the view's top-left corner.
		const int2 ViewRectMin = int2(ViewRect.xy);
#endif

		for (int x = 0; x < SOBEL_DIMENSION; x++)
		{
			for (int y = 0; y < SOBEL_DIMENSION; y++)
			{
#if OUTPUT_SOFTWARE_IMAGE
				int2 LumaCoord = max(int2(LuminanceTextureCoord) + int2(x - int(SobelWidth), y - int(SobelWidth)), ViewRectMin);
				float LumaValue = LuminanceTexture[LumaCoord];
#else
				// Group edge threads were rejected above, so these taps always stay inside LumaMatrix
				uint2 LumaCoord = GroupThreadId + int2(x - SobelWidth, y - SobelWidth);
				float LumaValue = LumaMatrix[LumaCoord.y][LumaCoord.x];
#endif
				SobelXSum += SobelX[y][x] * LumaValue;
				SobelYSum += SobelY[y][x] * LumaValue;
			}
		}

		if (abs(SobelXSum) > EdgeThreshold)
		{
			UnsupportedShadingRate |= D3D12_SHADING_RATE_2X1;
		}

		if (abs(SobelXSum) > ConservativeEdgeThreshold)
		{
			UnsupportedShadingRate |= (D3D12_SHADING_RATE_2X1 << CONSERVATIVE_SHADING_RATE_SHIFT);
		}

		if (abs(SobelYSum) > EdgeThreshold)
		{
			UnsupportedShadingRate |= D3D12_SHADING_RATE_1X2;
		}

		if (abs(SobelYSum) > ConservativeEdgeThreshold)
		{
			UnsupportedShadingRate |= (D3D12_SHADING_RATE_1X2 << CONSERVATIVE_SHADING_RATE_SHIFT);
		}
	}

#if OUTPUT_SOFTWARE_IMAGE
	uint QuadSupportedShadingRate = ~UnsupportedShadingRate;

#if USE_WAVE_OP
	// Combine the four lanes of the quad so that the written rate is supported by every pixel in it
	QuadSupportedShadingRate &= QuadReadAcrossX(QuadSupportedShadingRate);
	QuadSupportedShadingRate &= QuadReadAcrossY(QuadSupportedShadingRate);
#endif

	// Only the lane at the even/even corner of each quad writes the quad's texel
	if (GroupThreadId.x % 2 == 0 && GroupThreadId.y % 2 == 0)
	{
		// Software VRS always uses a 2x2 tile size
		SoftwareShadingRateImage[DispatchThreadId.xy >> 1] = VRSBitMask & QuadSupportedShadingRate;
	}
#endif

#if OUTPUT_HARDWARE_IMAGE
	// Tile size may exceed wave size (and always will for 16x16 tiles), so WaveActiveBitAnd is insufficient
	uint Unused;
	InterlockedAnd(TileSupportedShadingRate, ~UnsupportedShadingRate, Unused);
	GroupMemoryBarrierWithGroupSync();

	// A single thread per group writes the reduced result
	if (GroupThreadId.x == 1 && GroupThreadId.y == 1)
	{
		// Generation is always done in one pass for a view family, so no need to account for stereo offsets
		HardwareShadingRateImage[GroupId.xy] = VRSBitMask & TileSupportedShadingRate;
	}
#endif
}


// Converts a linear lane index within the thread group into a 2D coordinate inside the tile, arranged so that
// every run of four consecutive lanes lands on one 2x2 pixel quad.
uint2 LinearToSwizzled(uint LinearIndex)
{
	// Same idea as pixel shader quads: building 2x2 CS quads from the thread index lets us use the 4 lane cross bar.
	// [0 1][2 3] -> [0 1][4 5]
	// [4 5][6 7] -> [2 3][6 7]

	// Lane order inside any single quad:
	//	0	1
	//	2	3

	// Quad order for an 8x8 group (each entry is one four-lane quad):
	//	0	2	4	6
	//	1	3	5	7
	//	8	10	12	14
	//	9	11	13	15

	// Quad order for a 16x16 group (rectangular):
	//	0	8	16	24	32	40	48	56
	//	1	9	17	25	33	41	49	57
	//	2	10	18	26	34	42	50	58
	//	3	11	19	27	35	43	51	59
	//	4	12	20	28	36	44	52	60
	//	5	13	21	29	37	45	53	61
	//	6	14	22	30	38	46	54	62
	//	7	15	23	31	39	47	55	63

	// In both layouts bit 0 of the lane index is kept as bit 0 of X
	const uint LowestBitMask = BitFieldMaskU32(1u, 0u);

	uint2 Swizzled;

#if HARDWARE_TILE_SIZE == 8
	// Quads are laid out in squares (step in Y, then X, then X, then Y)
	// Lane index bits:	5-4-3-2-1-0
	// X is built from:	4-3-0
	// Y is built from:	5-2-1
	const uint UpperXBits = BitFieldExtractU32(LinearIndex, 3u, 2u);
	Swizzled.x = BitFieldInsertU32(LowestBitMask, LinearIndex, UpperXBits);

	const uint TwoLowBitsMask = BitFieldMaskU32(2u, 0u);
	const uint LowerYBits = BitFieldExtractU32(LinearIndex, 2u, 1u);
	const uint UpperYBits = BitFieldExtractU32(LinearIndex, 3u, 3u);
	Swizzled.y = BitFieldInsertU32(TwoLowBitsMask, LowerYBits, UpperYBits);
#else
	// Quads run down a column first, then move right
	// Lane index bits:	7-6-5-4-3-2-1-0
	// X is built from:	7-6-5-0
	// Y is built from:	4-3-2-1
	const uint UpperXBits = BitFieldExtractU32(LinearIndex, 4u, 4u);
	Swizzled.x = BitFieldInsertU32(LowestBitMask, LinearIndex, UpperXBits);
	Swizzled.y = BitFieldExtractU32(LinearIndex, 4u, 1u);
#endif

	return Swizzled;
}

#define CALCULATE_SHADING_RATE_IMAGE_THREADS	(HARDWARE_TILE_SIZE * HARDWARE_TILE_SIZE)

// Compute entry point. The group is launched as a flat line of HARDWARE_TILE_SIZE * HARDWARE_TILE_SIZE threads and
// each lane is then assigned a pixel of the tile through LinearToSwizzled.
[numthreads(CALCULATE_SHADING_RATE_IMAGE_THREADS, 1, 1)]
void CalculateShadingRateImage(
	uint3 DispatchThreadId : SV_DispatchThreadID,
	uint3 GroupThreadId : SV_GroupThreadID,
	uint3 GroupId : SV_GroupID,
	uint GroupIndex : SV_GroupIndex)
{
	// The QuadReadAcross functions need 4 adjacent lanes (e.g. 0,1,2,3) to cover one pixel quad of the luma texture,
	// so the 2D position is derived from the flat group index rather than taken from the system values.
	const uint2 PixelInTile = LinearToSwizzled(GroupIndex);

	// First pixel covered by this group, then this lane's pixel within it
	const uint2 TileOrigin = GroupId.xy * uint2(HARDWARE_TILE_SIZE, HARDWARE_TILE_SIZE);
	const uint2 PixelInView = TileOrigin + PixelInTile;

	CalculateShadingRateImageInner(PixelInView, PixelInTile, GroupId.xy);
}