// Copyright Epic Games, Inc. All Rights Reserved.

#pragma once

#include "Common.ush"

#ifdef OVERRIDE_HTILELOOKUP_USH
#include "/Platform/Private/HTileLookup.ush"
#else
// Fallback used when no platform override is supplied: always reports offset 0.
uint ComputeTileOffset(uint2 PixelPos, uint PixelsWide, uint PlatformConfig)
{
	return 0;
}
#endif

#ifndef PLATFORM_SUPPORTS_HTILE_LOOKUP
#define PLATFORM_SUPPORTS_HTILE_LOOKUP 0
#endif

#if PLATFORM_SUPPORTS_HTILE_LOOKUP

// Optimized version for no hi-stencil, just min and max depth.
// The returned word carries MinZ in bits 4:17 and MaxZ in bits 18:31 (the layout read back by
// DecodeTileMinMax when HiStencil is false) with the four low ZMask bits all set.
uint EncodeTileMinMaxDepth(float MinDepth, float MaxDepth)
{
	// Convert min and max depth to UNORM14.
#if COMPILER_SUPPORTS_PACK_INTRINSICS
	// NOTE(review): presumably packs the first argument into the low 16 bits and the second into
	// the high 16 bits; the biases make the later truncation to 14 bits conservative - confirm.
	const uint HTileValue = PackFloat32ToUNorm16(MinDepth - 0.5 / 65535.0, MaxDepth + 3.5 / 65535.0);
#else
	const float MaxDepthRange = float((1 << 14) - 1);
	const uint ZMin = uint(floor(MinDepth * MaxDepthRange));
	const uint ZMax = uint( ceil(MaxDepth * MaxDepthRange));
	// Pre-position both values for the merge below: max already sits in 18:31, while min is
	// parked two bits low (2:15) so that the "<< 2" moves it into 4:17.
	const uint HTileValue = (ZMin << 2) | (ZMax << 18);
#endif

	// Shift up min by 2 bits, then set all four low bits.
	// Bits 18:31 are taken from HTileValue, everything else from HTileValue << 2.
	return BitFieldInsertU32(BitFieldMaskU32(14, 18), HTileValue, HTileValue << 2) | 0xF /* expanded tile ZMask */;
}

// Encodes |ZBase - ZOther| as the 6 bit delta code that DecodeDepthDelta expands.
// Deltas below 16 are stored verbatim; larger deltas keep only the bits directly below the
// leading one (three bits for the 0b010/0b011 codes, two bits for the 0b1xxx codes).
uint EncodeZRangeDelta(uint ZBase, uint ZOther)
{
	uint Delta = abs(int(ZBase) - int(ZOther));

	if (Delta < 16u)
	{
		return Delta;
	}

	// Leading zero count relative to a 14 bit value (32 - 14 = 18).
	const uint LeadingZeros = (31u - firstbithigh(Delta)) - 18u;

	if (LeadingZeros >= 8)
	{
		// Three-bit-delta codes.
		return ((11u - LeadingZeros) << 3u) | ((Delta >> (10u - LeadingZeros)) & 7u);
	}
	else
	{
		// Two-bit-delta codes.
		return ((15u - LeadingZeros) << 2u) | ((Delta >> (11u - LeadingZeros)) & 3u);
	}
}

// Builds a tile word in the hi-stencil layout: a 14 bit Z base, a 6 bit Z delta code and the
// caller-provided stencil fields. bZCompareGE selects ZMax as the base, otherwise ZMin is used.
uint EncodeTileStencilZBaseDelta(float MinDepth, float MaxDepth, uint XX, uint SMem, uint SR0, uint SR1, bool bZCompareGE)
{
	const float MaxDepthRange = float((1 << 14) - 1);

	const uint ZMask = 0xF; // Expanded
	const uint ZMin = uint(floor(MinDepth * MaxDepthRange));
	const uint ZMax = uint( ceil(MaxDepth * MaxDepthRange));

	// Assumes reverse Z
	const uint ZBase = select(bZCompareGE, ZMax, ZMin);
	const uint ZDelta = EncodeZRangeDelta(ZMin, ZMax);

	uint HTileValue = 0;
	HTileValue |= ZMask;			// 0:3
	HTileValue |= (SR0 << 4);		// 4:5
	HTileValue |= (SR1 << 6);		// 6:7
	HTileValue |= (SMem << 8);		// 8:9
	HTileValue |= (XX << 10);		// 10:11
	HTileValue |= (ZDelta << 12);	// 12:17
	HTileValue |= (ZBase << 18);	// 18:31

	return HTileValue;
}

// Decodes 6 bit depth delta coding for hi-stencil.
uint DecodeDepthDelta(uint DeltaZ)
{
	// 000DDD -> 00000000000DDD
	// 001DDD -> 00000000001DDD
	// 010DDD -> 0000000001DDD1
	// 011DDD -> 000000001DDD11
	// 1000DD -> 00000001DD1111
	// 1001DD -> 0000001DD11111
	// 1010DD -> 000001DD111111
	// 1011DD -> 00001DD1111111
	// 1100DD -> 0001DD11111111
	// 1101DD -> 001DD111111111
	// 1110DD -> 01DD1111111111
	// 1111DD -> 1DD11111111111

	// Bit 5 of the code selects the two-bit-delta rows of the table above.
	const bool TwoBitDelta = BitFieldExtractU32(DeltaZ, 1, 5);
	const uint DeltaBits = TwoBitDelta ? 2 : 3;
	const uint Delta = BitFieldExtractU32(DeltaZ, DeltaBits, 0);
	const uint Code = BitFieldExtractU32(DeltaZ, 3, DeltaBits);

	// Bit position of the leading one in the decoded value.
	const uint LeadingOne = Code + (TwoBitDelta ? 6 : 2);

	// Start from all ones up to and including the leading one, then overwrite the delta bits.
	const uint Ones = (1u << (LeadingOne + 1)) - 1;
	const uint DeltaStart = (LeadingOne >= DeltaBits) ? (LeadingOne - DeltaBits) : 0;
	const uint Mask = BitFieldMaskU32(DeltaBits, DeltaStart);

	return BitFieldInsertU32(Mask, Delta << DeltaStart, Ones);
}

// Decodes the hi-z information in the tile as a 14 bit integer min and max.
// Returns uint2(MinZ, MaxZ).
uint2 DecodeTileMinMax(uint HTileValue, bool HiStencil, bool CompareMinZ)
{
	uint MinZ;
	uint MaxZ;

	if (HiStencil)
	{
		const uint ZBase = BitFieldExtractU32(HTileValue, 14, 18);	// 14 bit fixed point.
		const uint ZDelta = BitFieldExtractU32(HTileValue, 6, 12);	// 6 bit delta coding.

		// Base is closest to near plane; delta is towards far plane.
		// NOTE(review): the decoded delta has its low bits filled with ones, and neither the
		// subtraction nor the addition is clamped to the 14 bit range - confirm callers tolerate that.
		MinZ = CompareMinZ ? (ZBase - DecodeDepthDelta(ZDelta)) : ZBase;
		MaxZ = CompareMinZ ? ZBase : (ZBase + DecodeDepthDelta(ZDelta));
	}
	else
	{
		// Both values are 14 bit fixed point.
		MinZ = BitFieldExtractU32(HTileValue, 14, 4);
		MaxZ = BitFieldExtractU32(HTileValue, 14, 18);
	}

	return uint2(MinZ, MaxZ);
}

// Decodes the hi-stencil information in the tile as a pair of 2 bit values.
uint2 ExtractTileHiStencil(uint HTileValue, bool HiStencil)
{
	// Without hi-stencil both entries read back as zero.
	uint2 StencilPair = uint2(0, 0);

	if (HiStencil)
	{
		StencilPair.x = BitFieldExtractU32(HTileValue, 2, 4);
		StencilPair.y = BitFieldExtractU32(HTileValue, 2, 6);
	}

	return StencilPair;
}

// Maps a ZMask value to a plane count: values 9 and 10 are bumped by one,
// values of 11 and above by two, everything else is returned unchanged.
uint GetZPlaneCount(uint ZMask)
{
	if (ZMask >= 11u)
	{
		return ZMask + 2u;
	}

	if (ZMask >= 9u)
	{
		return ZMask + 1u;
	}

	return ZMask;
}

// Note: We don't use MSAA depth, this assumes single sample count
// Packs the layout derived from ZMask into one uint:
//   bits 0:4   plane count
//   bits 5:7   plane-index bit count
//   bit  16    set when planes plus plane mask fit in 32 bytes
//   bits 17:23 plane mask offset, in dwords
//   bits 24:31 total dword count
uint PackZLayout(uint ZMask)
{
	const uint PlaneCount = GetZPlaneCount(ZMask);

	// Plane data: 12 bytes per plane.
	const uint PlaneBytes = PlaneCount * 12u;
	const uint PlaneBytesPadded = Padding(PlaneBytes, 32u);

	// Plane mask: 64 indices of CeilLog2(PlaneCount) bits each.
	const uint IndexBits = CeilLog2(PlaneCount);
	const uint MaskBytes = 64u * IndexBits / 8u;
	const uint MaskBytesPadded = Padding(MaskBytes, 32u);

	const uint SeparatelyPaddedBytes = PlaneBytesPadded + MaskBytesPadded;
	const uint JointlyPaddedBytes = Padding(PlaneBytes + MaskBytes, 32u);

	const uint MaskBase = select(JointlyPaddedBytes < SeparatelyPaddedBytes, PlaneBytes, PlaneBytesPadded);
	const uint MaskAlignment = select(JointlyPaddedBytes > 256u, 256u, 8u);
	const uint MaskOffsetBytes = Padding(MaskBase, MaskAlignment);

	const uint TotalWords = (MaskOffsetBytes + MaskBytes) >> 2u;

	uint Packed = PlaneCount;
	Packed |= IndexBits << 5;
	Packed |= select(JointlyPaddedBytes <= 32, 1u, 0u) << 16;
	Packed |= (MaskOffsetBytes >> 2u) << 17u;
	Packed |= TotalWords << 24u;
	return Packed;
}

// Unpacked form of the value produced by PackZLayout.
struct ZLayout
{
	uint ZPlaneCount;
	uint ZPlaneWordCount;
	uint PMaskOffset;
	uint IndexBitCount;
	uint Is32BytePadded;
};

// Looks up the packed layout for ZMask (expected to be 0..15) and splits it into its fields.
ZLayout UnpackZLayout(uint ZMask)
{
	const uint ZLayoutTable[16u] =
	{
		PackZLayout(0x0), PackZLayout(0x1), PackZLayout(0x2), PackZLayout(0x3),
		PackZLayout(0x4), PackZLayout(0x5), PackZLayout(0x6), PackZLayout(0x7),
		PackZLayout(0x8), PackZLayout(0x9), PackZLayout(0xA), PackZLayout(0xB),
		PackZLayout(0xC), PackZLayout(0xD), PackZLayout(0xE), PackZLayout(0xF),
	};

	const uint Packed = ZLayoutTable[ZMask];

	ZLayout Result;
	Result.ZPlaneCount = BitFieldExtractU32(Packed, 5u, 0u);
	Result.IndexBitCount = BitFieldExtractU32(Packed, 3u, 5u);
	// The flag PackZLayout writes at bit 16 is the top bit of this 4 bit field, so the value is
	// either 0 or 8; DecompressDepthValue ANDs it with PixelPos.x to obtain a dword offset.
	Result.Is32BytePadded = BitFieldExtractU32(Packed, 4u, 13u);
	Result.PMaskOffset = BitFieldExtractU32(Packed, 7u, 17u);
	Result.ZPlaneWordCount = BitFieldExtractU32(Packed, 8u, 24u);
	return Result;
}

// 112 dword max ever read, but can be offset by 8 words
groupshared uint CompressedDepthData[128];

// Reconstructs one depth value from the plane data previously written to CompressedDepthData.
// ZMask, ClearValue and PlatformConfig are accepted but not referenced in this body.
float DecompressDepthValue_Internal(ZLayout Layout, uint ZOffset, uint ZMask, uint2 PixelPos, uint ThreadIndex, float ClearValue, uint PlatformConfig)
{
	// Wait until the whole group has filled CompressedDepthData.
	GroupMemoryBarrierWithGroupSync();

	// With more than one plane, each thread reads its own plane index from the plane mask.
	uint PlaneIndex = 0;

	BRANCH
	if (Layout.ZPlaneCount > 1) // scalarized
	{
		const uint IndexBitPos = MulU24(ThreadIndex, Layout.IndexBitCount);
		const uint MaskWord = ZOffset + Layout.PMaskOffset + (IndexBitPos >> 5);

		// The index may straddle two dwords, so both are fetched and aligned before extraction.
		PlaneIndex = BitAlignU32(CompressedDepthData[MaskWord + 1], CompressedDepthData[MaskWord], IndexBitPos);
		PlaneIndex = BitFieldExtractU32(PlaneIndex, Layout.IndexBitCount, 0);
	}

	// Each plane occupies three consecutive dwords.
	uint PlaneBase = (PlaneIndex * 3) + ZOffset;
	uint3 Plane = uint3(
		CompressedDepthData[PlaneBase + 0u],
		CompressedDepthData[PlaneBase + 1u],
		CompressedDepthData[PlaneBase + 2u]
	);

	int Exponent = BitFieldExtractU32(Plane.y, 8u, 24u);
	int ZCenter = BitFieldExtractI32(Plane.z, 31u, 0u) << 1;

	// Position inside the 8x8 tile, remapped from 0..7 to the odd values -7..7.
	int2 TilePos = PixelPos & 7;
	TilePos = TilePos * 2 - 7;

	// Accumulate the slope terms; the running value is shifted up by four bits before the third term.
	int Accum = 0;
	Accum = MadI24(Plane.x >> 4, TilePos.x, Accum);
	Accum = MadI24(Plane.y, TilePos.y, Accum);
	Accum = MadI24(Plane.x & 0xF, TilePos.x, Accum << 4);
	Accum = MadI24(Plane.x >> 28, TilePos.y, Accum);

	// A non-positive result decodes to zero.
	if (Accum <= -ZCenter)
	{
		return 0.0f;
	}

	Accum += ZCenter;

	// Renormalize so the leading one becomes the implicit float bit, adjusting the exponent to match.
	uint Magnitude = Accum;
	int Shift = firstbithigh(Magnitude) - 23;
	Exponent += Shift;

	const bool bHandleDenorms = true;
	const bool bFlushDenorms = true;

	if (bHandleDenorms && Exponent < 1)
	{
		return select(bFlushDenorms, 0.0f, asfloat((Magnitude << (8 - Shift)) >> (9 - Exponent)));
	}

	// Assemble exponent and mantissa, clamping the result to 1.0.
	return min(1.0f, asfloat((Exponent << 23) | ((Magnitude << (9 - Shift)) >> 9)));
}

// Returns the depth for PixelPos given the tile's ZMask: the clear value for clear tiles, the raw
// texel for expanded tiles, otherwise the value rebuilt from the tile's compressed plane data.
float DecompressDepthValue(Texture2D DepthBuffer, uint ZMask, uint2 PixelPos, uint ThreadIndex, float ClearValue, uint PlatformConfig)
{
	if (IsZTileClear(ZMask))
	{
		return ClearValue;
	}

	if (ZMask == HTILE_ZMASK_EXPANDED)
	{
		// Uncompressed, fetch raw depth
		return DepthBuffer[PixelPos];
	}

	const ZLayout TileLayout = UnpackZLayout(ZMask);
	const uint WordOffset = select(Is32BytePaddedTileConfig(PlatformConfig), TileLayout.Is32BytePadded & PixelPos.x, 0u);
	const uint WordsToLoad = WordOffset + TileLayout.ZPlaneWordCount;

	// Only the threads covering the compressed payload copy a dword into group shared memory.
	if (ThreadIndex < WordsToLoad)
	{
		CompressedDepthData[ThreadIndex] = asuint(DepthBuffer[PixelPos]);
	}

	return DecompressDepthValue_Internal(TileLayout, WordOffset, ZMask, PixelPos, ThreadIndex, ClearValue, PlatformConfig);
}

// Same as the Texture2D overload, reading from a RWTexture2D instead.
// NOTE(review): no element type is visible on the texture parameter here - confirm the declaration.
float DecompressDepthValue(RWTexture2D DepthBuffer, uint ZMask, uint2 PixelPos, uint ThreadIndex, float ClearValue, uint PlatformConfig)
{
	if (IsZTileClear(ZMask))
	{
		return ClearValue;
	}

	if (ZMask == HTILE_ZMASK_EXPANDED)
	{
		// Uncompressed, fetch raw depth
		return DepthBuffer[PixelPos];
	}

	const ZLayout TileLayout = UnpackZLayout(ZMask);
	const uint WordOffset = select(Is32BytePaddedTileConfig(PlatformConfig), TileLayout.Is32BytePadded & PixelPos.x, 0u);
	const uint WordsToLoad = WordOffset + TileLayout.ZPlaneWordCount;

	// Only the threads covering the compressed payload copy a dword into group shared memory.
	if (ThreadIndex < WordsToLoad)
	{
		CompressedDepthData[ThreadIndex] = asuint(DepthBuffer[PixelPos]);
	}

	return DecompressDepthValue_Internal(TileLayout, WordOffset, ZMask, PixelPos, ThreadIndex, ClearValue, PlatformConfig);
}

// Returns the stencil value for PixelPos according to the tile's SMem state.
uint DecompressStencilValue(Texture2D StencilBuffer, uint SMem, uint2 PixelPos, uint ClearValue)
{
	// SMem
	// 0 = Clear
	// 1 = Single
	// 2 = Clear and Expanded
	// 3 = Expanded

	// Both clear states have the low bit unset.
	if ((SMem & 0x1) == 0)
	{
		return ClearValue;
	}

	// Single: read the texel at the 8-aligned position. Expanded: read the pixel itself.
	uint2 FetchPos = PixelPos;
	if (SMem == 1)
	{
		FetchPos = PixelPos & (uint2)~7u;
	}

	return StencilBuffer[FetchPos];
}

// Same as the Texture2D overload, reading from a RWTexture2D instead.
// NOTE(review): no element type is visible on the texture parameter here - confirm the declaration.
uint DecompressStencilValue(RWTexture2D StencilBuffer, uint SMem, uint2 PixelPos, uint ClearValue)
{
	// SMem
	// 0 = Clear
	// 1 = Single
	// 2 = Clear and Expanded
	// 3 = Expanded

	// Both clear states have the low bit unset.
	if ((SMem & 0x1) == 0)
	{
		return ClearValue;
	}

	// Single: read the texel at the 8-aligned position. Expanded: read the pixel itself.
	uint2 FetchPos = PixelPos;
	if (SMem == 1)
	{
		FetchPos = PixelPos & (uint2)~7u;
	}

	return StencilBuffer[FetchPos];
}

// De-interleaves the low six bits of Element: even bits (0, 2, 4) form x, odd bits (1, 3, 5) form y.
uint2 PixelFromThreadIndex(uint Element)
{
	const uint X = ((Element >> 0u) & 1u) | ((Element >> 1u) & 2u) | ((Element >> 2u) & 4u);
	const uint Y = ((Element >> 1u) & 1u) | ((Element >> 2u) & 2u) | ((Element >> 3u) & 4u);
	return uint2(X, Y);
}

// Converts a thread index into a pixel position within the tile.
uint2 SwizzleThreadIndex(uint ThreadIndex)
{
#if 1
	// At least currently, it seems to be faster to just compute the swizzling directly instead of using the LUT.
	return PixelFromThreadIndex(ThreadIndex);
#else
	const uint2 LookupTable[64] =
	{
		PixelFromThreadIndex(8 * 0 + 0), PixelFromThreadIndex(8 * 0 + 1), PixelFromThreadIndex(8 * 0 + 2), PixelFromThreadIndex(8 * 0 + 3),
		PixelFromThreadIndex(8 * 0 + 4), PixelFromThreadIndex(8 * 0 + 5), PixelFromThreadIndex(8 * 0 + 6), PixelFromThreadIndex(8 * 0 + 7),
		PixelFromThreadIndex(8 * 1 + 0), PixelFromThreadIndex(8 * 1 + 1), PixelFromThreadIndex(8 * 1 + 2), PixelFromThreadIndex(8 * 1 + 3),
		PixelFromThreadIndex(8 * 1 + 4), PixelFromThreadIndex(8 * 1 + 5), PixelFromThreadIndex(8 * 1 + 6), PixelFromThreadIndex(8 * 1 + 7),
		PixelFromThreadIndex(8 * 2 + 0), PixelFromThreadIndex(8 * 2 + 1), PixelFromThreadIndex(8 * 2 + 2), PixelFromThreadIndex(8 * 2 + 3),
		PixelFromThreadIndex(8 * 2 + 4), PixelFromThreadIndex(8 * 2 + 5), PixelFromThreadIndex(8 * 2 + 6), PixelFromThreadIndex(8 * 2 + 7),
		PixelFromThreadIndex(8 * 3 + 0), PixelFromThreadIndex(8 * 3 + 1), PixelFromThreadIndex(8 * 3 + 2), PixelFromThreadIndex(8 * 3 + 3),
		PixelFromThreadIndex(8 * 3 + 4), PixelFromThreadIndex(8 * 3 + 5), PixelFromThreadIndex(8 * 3 + 6), PixelFromThreadIndex(8 * 3 + 7),
		PixelFromThreadIndex(8 * 4 + 0), PixelFromThreadIndex(8 * 4 + 1), PixelFromThreadIndex(8 * 4 + 2), PixelFromThreadIndex(8 * 4 + 3),
		PixelFromThreadIndex(8 * 4 + 4), PixelFromThreadIndex(8 * 4 + 5), PixelFromThreadIndex(8 * 4 + 6), PixelFromThreadIndex(8 * 4 + 7),
		PixelFromThreadIndex(8 * 5 + 0), PixelFromThreadIndex(8 * 5 + 1), PixelFromThreadIndex(8 * 5 + 2), PixelFromThreadIndex(8 * 5 + 3),
		PixelFromThreadIndex(8 * 5 + 4), PixelFromThreadIndex(8 * 5 + 5), PixelFromThreadIndex(8 * 5 + 6), PixelFromThreadIndex(8 * 5 + 7),
		PixelFromThreadIndex(8 * 6 + 0), PixelFromThreadIndex(8 * 6 + 1), PixelFromThreadIndex(8 * 6 + 2), PixelFromThreadIndex(8 * 6 + 3),
		PixelFromThreadIndex(8 * 6 + 4), PixelFromThreadIndex(8 * 6 + 5), PixelFromThreadIndex(8 * 6 + 6), PixelFromThreadIndex(8 * 6 + 7),
		PixelFromThreadIndex(8 * 7 + 0), PixelFromThreadIndex(8 * 7 + 1), PixelFromThreadIndex(8 * 7 + 2), PixelFromThreadIndex(8 * 7 + 3),
		PixelFromThreadIndex(8 * 7 + 4), PixelFromThreadIndex(8 * 7 + 5), PixelFromThreadIndex(8 * 7 + 6), PixelFromThreadIndex(8 * 7 + 7),
	};

	return LookupTable[ThreadIndex];
#endif
}

#endif // PLATFORM_SUPPORTS_HTILE_LOOKUP