Files
UnrealEngine/Engine/Shaders/Private/OIT/OITSorting.usf
2025-05-18 13:04:45 +08:00

481 lines
14 KiB
HLSL

// Copyright Epic Games, Inc. All Rights Reserved.
#pragma once
// RECOMPILE HASH
#pragma message("UESHADERMETADATA_VERSION 78C3EBBB-59EA-459D-A2D4-67170905D7FC")
#include "../Common.ush"
///////////////////////////////////////////////////////////////////////////////////////////////////
// Source index-buffer topologies understood by GetTriangleIndices().
#define PRIMITIVE_TRILIST 0
#define PRIMITIVE_TRISTRIP 1
// Selects how a (slice index, offset) pair is stored per primitive:
// 1 = packed into a single uint (see PackSliceAndOffset), otherwise stored as a uint2.
#define ENCODING_TYPE 1
// Number of sorting slices. It is also used as the thread-group size of the scan and
// allocate passes. There is no default: it must be defined when compiling this file.
#ifndef SORTING_SLICE_COUNT
#error SORTING_SLICE_COUNT needs to be defined
#endif
// Values compared against the SortType shader parameter.
#define BACK_TO_FRONT 0
#define FRONT_TO_BACK 1
// Storage type of one (slice index, offset) entry, chosen by ENCODING_TYPE.
#if ENCODING_TYPE == 1
#define FPackedSliceAndOffset uint
#else
#define FPackedSliceAndOffset uint2
#endif
#if PERMUTATION_DEBUG || SHADER_DEBUG
#include "../ShaderPrint.ush"
#include "../ColorMap.ush"
#endif
///////////////////////////////////////////////////////////////////////////////////////////////////
// Positions, within an index buffer, of the three indices that form one triangle.
// Produced by GetTriangleIndices() and range-checked by IsValid().
struct FTriIndex
{
uint I0;
uint I1;
uint I2;
};
// Returns the index-buffer positions of the three indices of primitive InPrimitiveId.
//  - PRIMITIVE_TRILIST : three consecutive positions starting at InFirstIndex + 3 * InPrimitiveId.
//  - PRIMITIVE_TRISTRIP: three consecutive positions starting at InPrimitiveId, shifted by 2
//    when InFirstIndex >= 3 and InPrimitiveId >= 1.
//    NOTE(review): unlike the list path, InFirstIndex is not added as an offset here - confirm this is intended.
// Any other primitive type yields an all-zero FTriIndex.
FTriIndex GetTriangleIndices(uint InPrimitiveId, uint InFirstIndex, uint InPrimitiveType)
{
	FTriIndex Result = (FTriIndex)0;

	const bool bIsList  = (InPrimitiveType == PRIMITIVE_TRILIST);
	const bool bIsStrip = (InPrimitiveType == PRIMITIVE_TRISTRIP);
	if (bIsList || bIsStrip)
	{
		uint Start = 0;
		if (bIsList)
		{
			Start = InFirstIndex + InPrimitiveId * 3;
		}
		else
		{
			const bool bShift = (InFirstIndex >= 3) && (InPrimitiveId >= 1);
			Start = bShift ? InPrimitiveId + 2 : InPrimitiveId;
		}

		Result.I0 = Start;
		Result.I1 = Start + 1;
		Result.I2 = Start + 2;
	}
	return Result;
}
// True when all three index positions of InTri are strictly below InNumIndices.
bool IsValid(FTriIndex InTri, uint InNumIndices)
{
	// All three are in range exactly when the largest one is.
	const uint LargestIndex = max(InTri.I0, max(InTri.I1, InTri.I2));
	return LargestIndex < InNumIndices;
}
// The packed encoding keeps the slice index on 8 bits, so at most 256 slices can be represented.
#if SORTING_SLICE_COUNT > 256
#error Slice index is encoded onto 8bits and no longer fit into stored data
#endif
// Encodes a slice index and an offset into one FPackedSliceAndOffset.
//  - ENCODING_TYPE == 1: a single uint, slice index in bits [24..31] and offset in bits [0..23].
//    Bits of either input that do not fit are masked off.
//  - otherwise         : uint2(InSliceIndex, InLocalOffset), no truncation.
// The preprocessor test matches the one that selects the FPackedSliceAndOffset type, so the
// returned value always has the declared return type.
FPackedSliceAndOffset PackSliceAndOffset(uint InSliceIndex, uint InLocalOffset)
{
#if ENCODING_TYPE == 1
	return (InLocalOffset & 0xFFFFFF) | ((InSliceIndex & 0xFF) << 24);
#else
	return uint2(InSliceIndex, InLocalOffset);
#endif
}
// Decodes a value produced by PackSliceAndOffset().
//  - ENCODING_TYPE == 1: slice index is read from bits [24..31], offset from bits [0..23].
//  - otherwise         : slice index is In.x, offset is In.y.
// Both output parameters are always overwritten.
// The preprocessor test matches the one that selects the FPackedSliceAndOffset type, so the
// decoding always agrees with the type of In.
void UnpackSliceAndOffset(FPackedSliceAndOffset In, inout uint OutSliceIndex, inout uint OutLocalOffset)
{
#if ENCODING_TYPE == 1
	OutSliceIndex  = (In >> 24) & 0xFF;
	OutLocalOffset = In & 0xFFFFFF;
#else
	OutSliceIndex  = In.x;
	OutLocalOffset = In.y;
#endif
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Debug
// One debug record describing a sorted draw.
// AddDebugInfo() stores every field except Index (7 uints per record);
// GetDebugInfo() sets Index to the position of the record it was asked for.
struct FOITDebugData
{
uint Index;
uint PrimitiveCount;
uint Type;
uint SizeInBytes;
uint FirstIndex;
uint BaseVertexIndex;
uint MinVertexIndex;
uint MaxVertexIndex;
};
// Appends one record to the debug buffer.
// Element 0 of the buffer is the record counter; record N occupies the 7 uints starting
// at element 1 + 7 * N. In.Index is not stored.
void AddDebugInfo(RWBuffer<uint> InDebugData, FOITDebugData In)
{
	// Atomically reserve the next record slot.
	uint RecordIndex = 0;
	InterlockedAdd(InDebugData[0], 1, RecordIndex);

	uint Cursor = 1 + RecordIndex * 7u;
	InDebugData[Cursor++] = In.PrimitiveCount;
	InDebugData[Cursor++] = In.Type;
	InDebugData[Cursor++] = In.SizeInBytes;
	InDebugData[Cursor++] = In.FirstIndex;
	InDebugData[Cursor++] = In.BaseVertexIndex;
	InDebugData[Cursor++] = In.MinVertexIndex;
	InDebugData[Cursor]   = In.MaxVertexIndex;
}
// Returns the record counter, which lives in element 0 of the debug buffer.
uint GetDebugInfoCount(Buffer<uint> InDebugData)
{
	const uint RecordCount = InDebugData[0];
	return RecordCount;
}
// Reads record 'Index' from the debug buffer (layout: counter in element 0, then 7 uints per record).
// Returns an all-zero record when Index is not below the stored record counter.
FOITDebugData GetDebugInfo(Buffer<uint> InDebugData, uint Index)
{
	FOITDebugData Record = (FOITDebugData)0;

	const uint RecordCount = InDebugData[0];
	if (Index >= RecordCount)
	{
		return Record;
	}

	const uint Base = 1 + Index * 7u;
	Record.Index           = Index;
	Record.PrimitiveCount  = InDebugData[Base + 0];
	Record.Type            = InDebugData[Base + 1];
	Record.SizeInBytes     = InDebugData[Base + 2];
	Record.FirstIndex      = InDebugData[Base + 3];
	Record.BaseVertexIndex = InDebugData[Base + 4];
	Record.MinVertexIndex  = InDebugData[Base + 5];
	Record.MaxVertexIndex  = InDebugData[Base + 6];
	return Record;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
#if SHADER_SCAN
// Parameters of the scan pass.
// The following bounds/matrix are only read by the debug drawing code of the pass.
#if PERMUTATION_DEBUG
float4x4 ViewToWorld;
float3 WorldBound_Min;
float3 WorldBound_Max;
float3 ViewBound_Min;
float3 ViewBound_Max;
#endif
// Only used by the debug triangle drawing in this pass.
float4x4 LocalToWorld; // GPU scene buffer instead? How?
// Transforms the triangle center to view space before slicing.
float4x4 LocalToView;
// PRIMITIVE_TRILIST or PRIMITIVE_TRISTRIP, forwarded to GetTriangleIndices().
uint SourcePrimitiveType;
// Threads whose primitive id is >= NumPrimitives do not classify a triangle.
uint NumPrimitives;
// Not referenced by the scan entry point.
uint NumIndices;
uint SourceFirstIndex;
// Added to every index fetched from IndexBuffer.
uint SourceBaseVertexIndex;
// Fetched vertex indices are clamped to [SourceMinVertexIndex, SourceMaxVertexIndex].
uint SourceMinVertexIndex;
uint SourceMaxVertexIndex;
// Compared against BACK_TO_FRONT to pick the slice ordering.
uint SortType;
// Only forwarded to the debug record.
uint SortedIndexBufferSizeInByte;
// View-space Z range used to normalize the triangle depth into [0,1].
float ViewBoundMaxZ;
float ViewBoundMinZ;
// Vertex positions stored as individual floats: 3 consecutive floats per vertex.
Buffer<float> PositionBuffer;
Buffer<uint> IndexBuffer;
// Not referenced by the scan entry point.
RWBuffer<uint> OutIndexBuffer;
// Per-slice primitive counters, accumulated with atomics across thread groups.
RWBuffer<uint> OutSliceCounterBuffer;
// Per-primitive packed (slice index, offset within the slice).
RWBuffer<FPackedSliceAndOffset> OutPrimitiveSliceBuffer;
RWBuffer<uint> OutDebugData;
// Per-group, per-slice primitive count.
groupshared uint s_SliceIndices[SORTING_SLICE_COUNT];
// Per-group, per-slice value returned by the atomic add on OutSliceCounterBuffer.
groupshared uint s_SliceOffsets[SORTING_SLICE_COUNT];
// Scan pass: one thread per primitive, SORTING_SLICE_COUNT threads per group.
// Each valid thread classifies its triangle into a depth slice based on the view-space Z of
// the triangle center, then the group reserves room in the global per-slice counters so that
// every primitive ends up with a unique offset inside its slice. The packed
// (slice index, offset) pair is written to OutPrimitiveSliceBuffer[PrimitiveId].
// The three group barriers are executed by every thread of the group; they must stay outside
// of the per-thread conditionals.
[numthreads(SORTING_SLICE_COUNT, 1, 1)]
void MainCS(uint3 DispatchThreadId : SV_DispatchThreadID, uint GroupThreadId : SV_GroupIndex)
{
	uint PrimitiveId = DispatchThreadId.x;
	uint LocalOffset = 0;
	uint SliceIndex = 0;

	// 0. Clear local allocation (shared-memory)
	if (GroupThreadId < SORTING_SLICE_COUNT)
	{
		s_SliceIndices[GroupThreadId] = 0;
	}
	GroupMemoryBarrierWithGroupSync();

	// 1. Compute slice index, and local allocation (shared-memory)
	const bool bIsValid = PrimitiveId < NumPrimitives;
	if (bIsValid)
	{
		const FTriIndex Tri = GetTriangleIndices(PrimitiveId, SourceFirstIndex, SourcePrimitiveType);

		// Vertex index -> first float of the vertex position (3 floats per vertex)
		const uint I0 = 3 * clamp(IndexBuffer[Tri.I0] + SourceBaseVertexIndex, SourceMinVertexIndex, SourceMaxVertexIndex);
		const uint I1 = 3 * clamp(IndexBuffer[Tri.I1] + SourceBaseVertexIndex, SourceMinVertexIndex, SourceMaxVertexIndex);
		const uint I2 = 3 * clamp(IndexBuffer[Tri.I2] + SourceBaseVertexIndex, SourceMinVertexIndex, SourceMaxVertexIndex);

		// Position buffer store data as a float, not float3
		const float3 P0 = float3(PositionBuffer[I0], PositionBuffer[I0+1], PositionBuffer[I0+2]);
		const float3 P1 = float3(PositionBuffer[I1], PositionBuffer[I1+1], PositionBuffer[I1+2]);
		const float3 P2 = float3(PositionBuffer[I2], PositionBuffer[I2+1], PositionBuffer[I2+2]);

		// Compute triangle center in view space
		const float3 LocalP = (P0 + P1 + P2) / 3;
		const float3 ViewP = mul(float4(LocalP, 1), LocalToView).xyz;

		// Normalized depth within the view bound, mapped to a slice. The final clamp keeps the
		// index in range, including when the normalized depth is exactly 1.
		const float fSliceIndex = saturate((ViewP.z - ViewBoundMinZ) / (ViewBoundMaxZ - ViewBoundMinZ));
		SliceIndex = SORTING_SLICE_COUNT * (SortType == BACK_TO_FRONT ? (1.f - fSliceIndex) : fSliceIndex);
		SliceIndex = clamp(SliceIndex, 0, SORTING_SLICE_COUNT - 1);

	#if PERMUTATION_DEBUG
		{
			FShaderPrintContext Ctx = InitShaderPrintContext();
			const float4 TriColor = float4(ColorMapViridis(SliceIndex / float(SORTING_SLICE_COUNT - 1)), 1);
			const float3 WP0 = mul(float4(P0, 1), LocalToWorld).xyz;
			const float3 WP1 = mul(float4(P1, 1), LocalToWorld).xyz;
			const float3 WP2 = mul(float4(P2, 1), LocalToWorld).xyz;
			AddLineTriangleWS(Ctx, WP0, WP1, WP2, TriColor);
		}
	#endif

		// LocalOffset receives the counter value before the add, i.e. this primitive's rank
		// among the primitives of the same slice within this group.
		InterlockedAdd(s_SliceIndices[SliceIndex], 1, LocalOffset);
	}

#if PERMUTATION_DEBUG
	if (all(DispatchThreadId == 0))
	{
		AddAABBWS(WorldBound_Min, WorldBound_Max, ColorYellow);
		AddOBBWS(ViewBound_Min, ViewBound_Max, ColorCyan, ViewToWorld);

		// Zero-initialize so that no field (Index in particular) is passed uninitialized.
		FOITDebugData DebugData = (FOITDebugData)0;
		DebugData.PrimitiveCount = NumPrimitives;
		DebugData.Type = SourcePrimitiveType;
		DebugData.SizeInBytes = SortedIndexBufferSizeInByte;
		DebugData.FirstIndex = SourceFirstIndex;
		DebugData.BaseVertexIndex = SourceBaseVertexIndex;
		DebugData.MinVertexIndex = SourceMinVertexIndex;
		DebugData.MaxVertexIndex = SourceMaxVertexIndex;
		AddDebugInfo(OutDebugData, DebugData);
	}
#endif
	GroupMemoryBarrierWithGroupSync();

	// 2. Global allocation (global-memory)
	// Each thread handles one slice: it adds the group's count to the global counter and keeps
	// the previous counter value as the group's base offset for that slice. Slices with no
	// primitive in this group are skipped; their s_SliceOffsets entry is never read below.
	if (GroupThreadId < SORTING_SLICE_COUNT)
	{
		if (s_SliceIndices[GroupThreadId] > 0)
		{
			InterlockedAdd(OutSliceCounterBuffer[GroupThreadId], s_SliceIndices[GroupThreadId], s_SliceOffsets[GroupThreadId]);
		}
	}
	GroupMemoryBarrierWithGroupSync();

	// 3. Write slice index & relative offset per primitive
	if (bIsValid)
	{
		// Write slice index and the primitive offset so that we don't need to atomic ops during the final write-out pass
		const uint GlobalRelativeOffset = s_SliceOffsets[SliceIndex] + LocalOffset;
		OutPrimitiveSliceBuffer[PrimitiveId] = PackSliceAndOffset(SliceIndex, GlobalRelativeOffset);
	}
}
#endif // SHADER_SCAN
///////////////////////////////////////////////////////////////////////////////////////////////////
#if SHADER_ALLOCATE
// Allocate pass resources.
// Input: one counter per slice.
Buffer<uint> SliceCounterBuffer;
// Output: one offset per slice (sum of the counters of all preceding slices).
RWBuffer<uint> SliceOffsetsBuffer;
// Group-local copy of the counters and of the computed offsets.
groupshared uint s_Counters[SORTING_SLICE_COUNT];
groupshared uint s_Offsets[SORTING_SLICE_COUNT];
// Allocate pass: turns the per-slice counters into per-slice start offsets (exclusive prefix sum).
// Each thread with an in-range id loads one counter into shared memory, the thread with id 0
// accumulates the running sum serially, then each in-range thread writes its offset out.
[numthreads(SORTING_SLICE_COUNT, 1, 1)]
void MainCS(uint3 DispatchThreadId : SV_DispatchThreadID)
{
	const uint Slice = DispatchThreadId.x;
	const bool bInRange = Slice < SORTING_SLICE_COUNT;

	// Stage the counters in shared memory
	if (bInRange)
	{
		s_Counters[Slice] = SliceCounterBuffer[Slice];
	}
	GroupMemoryBarrierWithGroupSync();

	// TODO: parallel prefix sum
	if (Slice == 0)
	{
		uint RunningTotal = 0;
		uint It = 0;
		while (It < SORTING_SLICE_COUNT)
		{
			s_Offsets[It] = RunningTotal;
			RunningTotal += s_Counters[It];
			++It;
		}
	}
	GroupMemoryBarrierWithGroupSync();

	// Publish the offsets
	if (bInRange)
	{
		SliceOffsetsBuffer[Slice] = s_Offsets[Slice];
	}
}
#endif // SHADER_ALLOCATE
///////////////////////////////////////////////////////////////////////////////////////////////////
#if SHADER_WRITE
// Write pass parameters.
// Topology of the source index buffer (PRIMITIVE_TRILIST or PRIMITIVE_TRISTRIP).
uint SourcePrimitiveType;
// Threads whose primitive id is >= NumPrimitives exit immediately.
uint NumPrimitives;
// Upper bound (exclusive) for the destination index positions; triangles that do not fit are skipped.
uint NumIndices;
uint SrcFirstIndex;
uint DstFirstIndex;
// Not referenced by the write entry point.
uint SortType;
// Per-slice offset, read with the primitive's slice index.
Buffer<uint> SliceOffsetsBuffer;
// Per-primitive packed (slice index, offset within the slice).
Buffer<FPackedSliceAndOffset> PrimitiveSliceBuffer;
Buffer<uint> IndexBuffer;
// Destination index buffer, always written as a triangle list.
RWBuffer<uint> OutIndexBuffer;
// Write pass: one thread per source primitive.
// The destination primitive slot is the primitive's slice offset plus its offset inside the
// slice; the three source indices are copied to that slot of the output index buffer.
// Primitives whose destination positions do not all fall below NumIndices are dropped.
[numthreads(256, 1, 1)]
void MainCS(uint3 DispatchThreadId : SV_DispatchThreadID)
{
	const uint SrcPrimitiveId = DispatchThreadId.x;
	if (SrcPrimitiveId < NumPrimitives)
	{
		uint Slice = 0;
		uint OffsetInSlice = 0;
		UnpackSliceAndOffset(PrimitiveSliceBuffer[SrcPrimitiveId], Slice, OffsetInSlice);

		// Slice start + offset inside the slice. An out-of-range slice index contributes no start offset.
		uint DstPrimitiveId = OffsetInSlice;
		if (Slice < SORTING_SLICE_COUNT)
		{
			DstPrimitiveId += SliceOffsetsBuffer[Slice];
		}

		const FTriIndex Src = GetTriangleIndices(SrcPrimitiveId, SrcFirstIndex, SourcePrimitiveType);
		// Always write out triangle list, since sorting break strip topology
		const FTriIndex Dst = GetTriangleIndices(DstPrimitiveId, DstFirstIndex, PRIMITIVE_TRILIST);

		if (IsValid(Dst, NumIndices))
		{
			OutIndexBuffer[Dst.I0] = IndexBuffer[Src.I0];
			OutIndexBuffer[Dst.I1] = IndexBuffer[Src.I1];
			OutIndexBuffer[Dst.I2] = IndexBuffer[Src.I2];
		}
	}
}
#endif // SHADER_WRITE
///////////////////////////////////////////////////////////////////////////////////////////////////
#if SHADER_DEBUG
// Debug pass parameters: statistics printed as-is by the debug entry point.
// VisibleInstances is also the number of rows printed in the detailed table.
uint VisibleInstances;
uint VisiblePrimitives;
uint VisibleIndexSizeInBytes;
uint AllocatedBuffers;
uint AllocatedIndexSizeInBytes;
uint UnusedBuffers;
uint UnusedIndexSizeInBytes;
// Printed after the "/" next to AllocatedBuffers.
uint TotalEntries;
// Records written by AddDebugInfo(), read back through GetDebugInfo().
Buffer<uint> DebugData;
// Prints a byte size with a unit chosen by magnitude (decimal units):
//  - below 1000        : the raw value, followed by " bytes "
//  - below 1000000     : the value divided by 1000, followed by " Kbytes "
//  - 1000000 and above : the value divided by 1000000, followed by " Mbytes "
void PrintSize(inout FShaderPrintContext Context, uint SizeInBytes, FFontColor Color)
{
	if (SizeInBytes >= 1000000)
	{
		Print(Context, SizeInBytes / 1000000.f, Color, 5, 2);
		Print(Context, TEXT(" Mbytes "), Color);
	}
	else if (SizeInBytes >= 1000)
	{
		Print(Context, SizeInBytes / 1000.f, Color, 5, 2);
		Print(Context, TEXT(" Kbytes "), Color);
	}
	else
	{
		Print(Context, SizeInBytes, Color, 5, 2);
		Print(Context, TEXT(" bytes "), Color);
	}
}
// Debug pass: a single thread prints the sorting statistics on screen, in three sections:
// visible totals, memory totals, then one table row per visible instance read from DebugData.
[numthreads(1, 1, 1)]
void MainCS(uint DispatchThreadId : SV_DispatchThreadID)
{
	FShaderPrintContext Context = InitShaderPrintContext(true, uint2(50, 50));

	// Section 1: visible totals
	Print(Context, TEXT("Visible"), FontOrange);
	Newline(Context);

	Print(Context, TEXT("#Sort slices : "), FontWhite);
	Print(Context, uint(SORTING_SLICE_COUNT), FontSilver);
	Newline(Context);

	Print(Context, TEXT("Instances : "), FontWhite);
	Print(Context, VisibleInstances, FontSilver);
	Newline(Context);

	Print(Context, TEXT("Primitives : "), FontWhite);
	Print(Context, VisiblePrimitives, FontSilver);
	Newline(Context);

	Print(Context, TEXT("Size : "), FontYellow);
	PrintSize(Context, VisibleIndexSizeInBytes, FontYellow);
	Newline(Context);

	Newline(Context);

	// Section 2: memory totals
	Print(Context, TEXT("Memory"), FontOrange);
	Newline(Context);

	Print(Context, TEXT("Alloc. Buffers : "), FontWhite);
	Print(Context, AllocatedBuffers, FontSilver, 3, 0);
	Print(Context, TEXT("/"), FontSilver);
	Print(Context, TotalEntries, FontSilver, 3, 0);
	Newline(Context);

	Print(Context, TEXT("Alloc. Size : "), FontWhite);
	PrintSize(Context, AllocatedIndexSizeInBytes, FontSilver);
	Newline(Context);

	Print(Context, TEXT("Unused Buffers : "), FontWhite);
	Print(Context, UnusedBuffers, FontSilver);
	Newline(Context);

	Print(Context, TEXT("Unused Size : "), FontWhite);
	PrintSize(Context, UnusedIndexSizeInBytes, FontSilver);
	Newline(Context);

	Print(Context, TEXT("Total Size : "), FontYellow);
	PrintSize(Context, AllocatedIndexSizeInBytes + UnusedIndexSizeInBytes, FontYellow);
	Newline(Context);

	Newline(Context);

	// Section 3: per-instance table (header, then one row per visible instance)
	Print(Context, TEXT("Visible (details)"), FontSilver);
	Newline(Context);

	Print(Context, TEXT("Index "));
	Print(Context, TEXT("Primitive "));
	Print(Context, TEXT("Type "));
	Print(Context, TEXT("Size "));
	Print(Context, TEXT("FirstIdx "));
	Print(Context, TEXT("BaseIdx "));
	Print(Context, TEXT("MinVtxIdx "));
	Print(Context, TEXT("MaxVtxIdx "));
	Print(Context, TEXT("#Vertex "));
	Newline(Context);

	for (uint RowIt = 0; RowIt < VisibleInstances; ++RowIt)
	{
		// Rows beyond the recorded count come back as all-zero records.
		const FOITDebugData Row = GetDebugInfo(DebugData, RowIt);
		const uint VertexCount = Row.MaxVertexIndex - Row.MinVertexIndex + 1;

		Print(Context, Row.Index, FontWhite);
		Print(Context, Row.PrimitiveCount, FontSilver);
		if (Row.Type != 0)
		{
			Print(Context, TEXT("Strip "), FontSilver);
		}
		else
		{
			Print(Context, TEXT("List "), FontSilver);
		}
		PrintSize(Context, Row.SizeInBytes, FontYellow);
		Print(Context, Row.FirstIndex, FontSilver);
		Print(Context, Row.BaseVertexIndex, FontSilver);
		Print(Context, Row.MinVertexIndex, FontSilver);
		Print(Context, Row.MaxVertexIndex, FontSilver);
		Print(Context, VertexCount, FontSilver);
		Newline(Context);
	}
}
#endif // SHADER_DEBUG