Files
UnrealEngine/Engine/Shaders/Private/VariableRateShading/VRSShadingRateCalculate.usf
2025-05-18 13:04:45 +08:00

206 lines
6.3 KiB
HLSL

// Copyright Epic Games, Inc. All Rights Reserved.
#include "VRSShadingRateCommon.ush"
#include "../Common.ush"
#include "../ColorSpace.ush"
// View rectangle in luminance-texture pixels. In this file, xy is added to the thread coordinate
// to address LuminanceTexture, and (zw - xy) is used as the width/height of the region to process.
float4 ViewRect;
// Sobel response magnitude above which a coarse rate is ruled out in the base shading-rate bits.
float EdgeThreshold;
// Same as EdgeThreshold, but applied to the bits shifted up by CONSERVATIVE_SHADING_RATE_SHIFT.
float ConservativeEdgeThreshold;
// Single-channel luminance input that the Sobel kernels are evaluated on.
Texture2D<float> LuminanceTexture;
// Output written once per thread group (indexed by group ID) when OUTPUT_HARDWARE_IMAGE is set.
RWTexture2D<uint> HardwareShadingRateImage;
// Output written once per 2x2 pixel quad (indexed by pixel coordinate >> 1) when OUTPUT_SOFTWARE_IMAGE is set.
RWTexture2D<uint> SoftwareShadingRateImage;
// Per-group copy of the luminance values, indexed [y][x] by in-group coordinate.
// Only filled and read when OUTPUT_SOFTWARE_IMAGE is not set.
groupshared half LumaMatrix[HARDWARE_TILE_SIZE][HARDWARE_TILE_SIZE];
// Per-group accumulator: starts as the full rate mask and is AND-ed with every thread's supported rates.
groupshared uint TileSupportedShadingRate;
// Width and height of the square Sobel kernels.
#define SOBEL_DIMENSION 3
// Gates the QuadReadAcrossX/Y reduction used for the software image.
#define USE_WAVE_OP COMPILER_SUPPORTS_WAVE_BIT_ORAND
// Evaluates 3x3 Sobel X/Y filters on the luminance around one pixel, derives which coarse shading
// rates that pixel cannot tolerate, and reduces the result into the software (per 2x2 quad) and/or
// hardware (per thread group) shading rate images, depending on the OUTPUT_* permutation defines.
//
// DispatchThreadId - pixel coordinate relative to ViewRect.xy (ViewRect.xy is added to address the luminance texture)
// GroupThreadId    - pixel coordinate inside the thread group; indexes LumaMatrix
// GroupId          - thread group coordinate; used as the texel coordinate in HardwareShadingRateImage
void CalculateShadingRateImageInner(
uint2 DispatchThreadId,
uint2 GroupThreadId,
uint2 GroupId)
{
// Coarsest rate in both the base bits and the conservative bits; also used to mask the final outputs.
const uint VRSBitMask = D3D12_SHADING_RATE_2X2 | (D3D12_SHADING_RATE_2X2 << CONSERVATIVE_SHADING_RATE_SHIFT);
// Kernel radius: integer division of the kernel dimension (3 / 2 == 1).
const uint SobelWidth = SOBEL_DIMENSION / 2;
uint2 LuminanceTextureCoord = uint2(int(ViewRect.x + DispatchThreadId.x), int(ViewRect.y + DispatchThreadId.y));
#if !OUTPUT_SOFTWARE_IMAGE
// Hardware-only mode: stage this thread's luminance in groupshared memory so neighbours can read it.
LumaMatrix[GroupThreadId.y][GroupThreadId.x] = LuminanceTexture[LuminanceTextureCoord];
#endif
// Every thread stores the same initial value; the barrier below orders this store (and the
// LumaMatrix store above) before any thread reads or atomically updates the shared data.
TileSupportedShadingRate = VRSBitMask;
GroupMemoryBarrierWithGroupSync();
// Exclude pixels outside our ViewRect
float2 Dimensions = ViewRect.zw - ViewRect.xy;
bool InvalidThread = (
#if !OUTPUT_SOFTWARE_IMAGE
// When in HW only mode, we only read from the groupshared LumaMatrix, and so skip sobel operations on pixels at the edge of a group
GroupThreadId.x == HARDWARE_TILE_SIZE - 1 ||
GroupThreadId.y == HARDWARE_TILE_SIZE - 1 ||
GroupThreadId.x == 0 ||
GroupThreadId.y == 0 ||
#endif
// Also skip the last column/row of the view, and everything beyond it.
// NOTE(review): when OUTPUT_SOFTWARE_IMAGE is set, threads with DispatchThreadId.x == 0 or .y == 0 are
// not excluded, so the kernel below samples one texel to the left of / above ViewRect - confirm this is intended.
(float)DispatchThreadId.x >= Dimensions.x - 1 ||
(float)DispatchThreadId.y >= Dimensions.y - 1);
// If Sobel X is over a certain threshold, we rule out 2x1 rates. If Sobel Y is over that threshold, we rule out 1x2 rates.
// Later, taking the inverse will give us the roughest supported rate for the current lane.
// Invalid threads keep 0x0 here, i.e. they never restrict the reduced rate.
uint UnsupportedShadingRate = 0x0;
if(!InvalidThread)
{
// Horizontal gradient kernel, indexed [row][column].
float SobelXSum = 0.0;
float SobelX[SOBEL_DIMENSION][SOBEL_DIMENSION] =
{
{-1.0, 0.0, 1.0},
{-2.0, 0.0, 2.0},
{-1.0, 0.0, 1.0}
};
// Vertical gradient kernel, indexed [row][column].
float SobelYSum = 0.0;
float SobelY[SOBEL_DIMENSION][SOBEL_DIMENSION] =
{
{-1.0, -2.0, -1.0},
{ 0.0, 0.0, 0.0},
{ 1.0, 2.0, 1.0}
};
// Accumulate both filter responses over the 3x3 neighbourhood centred on this pixel.
for (int x = 0; x < SOBEL_DIMENSION; x++)
{
for (int y = 0; y < SOBEL_DIMENSION; y++)
{
#if OUTPUT_SOFTWARE_IMAGE
// Software image requested: sample neighbours straight from the luminance texture.
uint2 LumaCoord = LuminanceTextureCoord + int2(x - SobelWidth, y - SobelWidth);
float LumaValue = LuminanceTexture[LumaCoord];
#else
// Hardware-only mode: sample neighbours from the groupshared copy. Group-edge threads were
// flagged invalid above, so the +/-1 offsets stay inside LumaMatrix.
uint2 LumaCoord = GroupThreadId + int2(x - SobelWidth, y - SobelWidth);
float LumaValue = LumaMatrix[LumaCoord.y][LumaCoord.x];
#endif
SobelXSum += SobelX[y][x] * LumaValue;
SobelYSum += SobelY[y][x] * LumaValue;
}
}
// Each response is tested against both thresholds; the conservative result is stored in the
// bits shifted up by CONSERVATIVE_SHADING_RATE_SHIFT.
if (abs(SobelXSum) > EdgeThreshold)
{
UnsupportedShadingRate |= D3D12_SHADING_RATE_2X1;
}
if (abs(SobelXSum) > ConservativeEdgeThreshold)
{
UnsupportedShadingRate |= (D3D12_SHADING_RATE_2X1 << CONSERVATIVE_SHADING_RATE_SHIFT);
}
if (abs(SobelYSum) > EdgeThreshold)
{
UnsupportedShadingRate |= D3D12_SHADING_RATE_1X2;
}
if (abs(SobelYSum) > ConservativeEdgeThreshold)
{
UnsupportedShadingRate |= (D3D12_SHADING_RATE_1X2 << CONSERVATIVE_SHADING_RATE_SHIFT);
}
}
#if OUTPUT_SOFTWARE_IMAGE
uint QuadSupportedShadingRate = ~UnsupportedShadingRate;
#if USE_WAVE_OP
// AND with the horizontal neighbour, then with the vertical neighbour, so every lane of the
// quad ends up holding the rates supported by all four of its pixels.
QuadSupportedShadingRate &= QuadReadAcrossX(QuadSupportedShadingRate);
QuadSupportedShadingRate &= QuadReadAcrossY(QuadSupportedShadingRate);
#endif
// NOTE(review): without USE_WAVE_OP no quad reduction happens, so only the top-left pixel's
// result is written for each 2x2 quad - confirm this fallback is acceptable.
if (GroupThreadId.x % 2 == 0 && GroupThreadId.y % 2 == 0)
{
// Software VRS always uses a 2x2 tile size
SoftwareShadingRateImage[DispatchThreadId.xy >> 1] = VRSBitMask & QuadSupportedShadingRate;
}
#endif
#if OUTPUT_HARDWARE_IMAGE
// Tile size may exceed wave size (and always will for 16x16 tiles), so WaveActiveBitAnd is insufficient
// The previous value returned by the atomic is not needed.
uint Unused;
InterlockedAnd(TileSupportedShadingRate, ~UnsupportedShadingRate, Unused);
// Wait for every thread's contribution before the tile result is read back.
GroupMemoryBarrierWithGroupSync();
// A single thread per group writes the reduced rate for the whole tile.
if (GroupThreadId.x == 1 && GroupThreadId.y == 1)
{
// Generation is always done in one pass for a view family, so no need to account for stereo offsets
HardwareShadingRateImage[GroupId.xy] = VRSBitMask & TileSupportedShadingRate;
}
#endif
}
// Maps a linear in-group thread index to a 2D in-group pixel coordinate such that every run of
// four consecutive indices covers one 2x2 pixel quad.
uint2 LinearToSwizzled(uint LinearIndex)
{
    // Mirrors how pixel shaders form 2x2 quads, so the 4 lane cross bar can be used from compute.
    // [0 1][2 3] -> [0 1][4 5]
    // [4 5][6 7] -> [2 3][6 7]
    //
    // Lane order inside every quad:
    //  0 1
    //  2 3
    //
    // Quad order for an 8x8 group (one number per four-lane quad):
    //  0  2  4  6
    //  1  3  5  7
    //  8 10 12 14
    //  9 11 13 15
    //
    // Quad order for a 16x16 group (rectangular):
    //  0  8 16 24 32 40 48 56
    //  1  9 17 25 33 41 49 57
    //  2 10 18 26 34 42 50 58
    //  3 11 19 27 35 43 51 59
    //  4 12 20 28 36 44 52 60
    //  5 13 21 29 37 45 53 61
    //  6 14 22 30 38 46 54 62
    //  7 15 23 31 39 47 55 63

    // Mask covering bit 0 only; in both layouts bit 0 of the linear index is bit 0 of the column.
    const uint LowBitMask = BitFieldMaskU32(1u, 0u);

#if HARDWARE_TILE_SIZE == 8
    // Quads are laid out in squares (step in Y, then X, then X, then Y).
    // Linear index bits: 5-4-3-2-1-0
    //   column = 4-3-0
    //   row    = 5-2-1
    const uint ColumnSource = BitFieldExtractU32(LinearIndex, 3u, 2u);
    const uint RowLowBits = BitFieldExtractU32(LinearIndex, 2u, 1u);
    const uint RowSource = BitFieldExtractU32(LinearIndex, 3u, 3u);
    const uint Row = BitFieldInsertU32(BitFieldMaskU32(2u, 0u), RowLowBits, RowSource);
#else
    // Quads run down a column first, then move right.
    // Linear index bits: 7-6-5-4-3-2-1-0
    //   column = 7-6-5-0
    //   row    = 4-3-2-1
    const uint ColumnSource = BitFieldExtractU32(LinearIndex, 4u, 4u);
    const uint Row = BitFieldExtractU32(LinearIndex, 4u, 1u);
#endif

    // Both layouts build the column the same way: bit 0 from the linear index, the rest from ColumnSource.
    const uint Column = BitFieldInsertU32(LowBitMask, LinearIndex, ColumnSource);

    return uint2(Column, Row);
}
#define CALCULATE_SHADING_RATE_IMAGE_THREADS (HARDWARE_TILE_SIZE * HARDWARE_TILE_SIZE)

// Compute entry point. The group is launched as a flat list of HARDWARE_TILE_SIZE^2 threads; the flat
// index is swizzled into a 2D tile coordinate before the per-pixel work is run.
// DispatchThreadId and GroupThreadId are part of the signature but are not read; the remapped
// equivalents derived from GroupIndex and GroupId are used instead.
[numthreads(CALCULATE_SHADING_RATE_IMAGE_THREADS, 1, 1)]
void CalculateShadingRateImage(
    uint3 DispatchThreadId : SV_DispatchThreadID,
    uint3 GroupThreadId : SV_GroupThreadID,
    uint3 GroupId : SV_GroupID,
    uint GroupIndex : SV_GroupIndex)
{
    // Swizzle so that lanes 0-3, 4-7, ... each land on one 2x2 pixel quad of the luma texture.
    // The QuadReadAcross functions rely on this arrangement.
    const uint2 PixelInTile = LinearToSwizzled(GroupIndex);

    // Origin of this group's tile plus the swizzled offset gives the pixel handled by this thread.
    const uint2 TileOrigin = GroupId.xy * uint2(HARDWARE_TILE_SIZE, HARDWARE_TILE_SIZE);
    const uint2 PixelInView = TileOrigin + PixelInTile;

    CalculateShadingRateImageInner(PixelInView, PixelInTile, GroupId.xy);
}