2098 lines
67 KiB
HLSL
2098 lines
67 KiB
HLSL
// Copyright Epic Games, Inc. All Rights Reserved.
|
|
|
|
#define HAIR_STRANDS_PARAMETERS 0
|
|
|
|
#include "../Common.ush"
|
|
#include "../WaveOpUtil.ush"
|
|
#include "HairStrandsClusterCommon.ush"
|
|
#include "HairStrandsVertexFactoryCommon.ush"
|
|
#include "HairStrandsVisibilityCommon.ush"
|
|
#include "../ColorMap.ush"
|
|
|
|
#if PERMUTATION_DEBUG
|
|
#include "../ShaderPrint.ush"
|
|
#endif
|
|
|
|
////////////////////////////////////////////////////////////
|
|
// Pack/unpack helpers
|
|
// Packs a 2D tile coordinate into one uint: x in bits [0..15], y in bits [16..31].
// Each component is truncated to its low 16 bits.
uint PackTileCoord(uint2 In)
{
	const uint2 Masked = In & 0xffff;
	return Masked.x | (Masked.y << 16);
}
|
|
|
|
// Inverse of PackTileCoord: x comes from bits [0..15], y from bits [16..31].
uint2 UnpackTileCoord(uint In)
{
	uint2 Out;
	Out.x = In & 0xffff;
	Out.y = (In >> 16) & 0xffff;
	return Out;
}
|
|
|
|
// Reinterprets the bits of a float depth as a uint (no numeric conversion),
// so the value can be stored in uint resources and used with integer atomics.
uint PackDepth(float In)
{
	const uint Bits = asuint(In);
	return Bits;
}
|
|
|
|
// Inverse of PackDepth: reinterprets uint bits as a float depth (no numeric conversion).
float UnpackDepth(uint In)
{
	const float Depth = asfloat(In);
	return Depth;
}
|
|
|
|
// Min/max depth pair. Stored in a single uint through PackDepthRange()/UnpackDepthRange().
struct FDepthRange
|
|
{
|
|
float MinZ; // Smallest depth value of the range
|
|
float MaxZ; // Largest depth value of the range
|
|
};
|
|
|
|
// Packs a depth range into a single uint with PackFloat2ToUInt (MinZ first, MaxZ second).
// NOTE(review): PackFloat2ToUInt is defined outside this file; presumably it stores two 16-bit floats, so precision is reduced - confirm.
uint PackDepthRange(FDepthRange In)
{
	const float MinZ = In.MinZ;
	const float MaxZ = In.MaxZ;
	return PackFloat2ToUInt(MinZ, MaxZ);
}
|
|
|
|
// Inverse of PackDepthRange: the first unpacked component is MinZ, the second is MaxZ.
FDepthRange UnpackDepthRange(uint In)
{
	const float2 MinMax = UnpackFloat2FromUInt(In);

	FDepthRange Range;
	Range.MinZ = MinMax.x;
	Range.MaxZ = MinMax.y;
	return Range;
}
|
|
|
|
// Packs a work item (first tile index + tile count) into one uint:
// tile index in the low 16 bits, tile count in the high 16 bits.
// The index is not masked, so it is expected to fit in 16 bits.
uint PackWork(uint InTileIndex, uint InTileCount)
{
	const uint CountBits = InTileCount << 16;
	return CountBits | InTileIndex;
}
|
|
|
|
// Inverse of PackWork: returns (tile index, tile count).
uint2 UnpackWork(uint In)
{
	uint2 Out;
	Out.x = In & 0xFFFF; // Tile index (low 16 bits)
	Out.y = In >> 16;    // Tile count (high 16 bits)
	return Out;
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
// Tile Helpers
|
|
|
|
// Maximum number of iterations a rasterizer can perform. This guards against any kind of infinite loop.
|
|
#define MAX_WORK_COUNT 4096
|
|
|
|
// Bin tiles: BIN_TILE_SIZE x BIN_TILE_SIZE pixels, one thread per pixel (BIN_THREAD_COUNT = 32 * 32).
// BIN_TILE_SIZE_AS_SHIFT is log2(BIN_TILE_SIZE).
#define BIN_TILE_SIZE 32
|
|
#define BIN_RCP_TILE_SIZE (1.f / BIN_TILE_SIZE)
|
|
#define BIN_TILE_SIZE_AS_SHIFT 5
|
|
#define BIN_THREAD_COUNT 1024
|
|
|
|
// Raster tiles: RASTER_TILE_SIZE x RASTER_TILE_SIZE pixels, one thread per pixel (RASTER_THREAD_COUNT = 8 * 8).
// RASTER_TILE_SIZE_AS_SHIFT is log2(RASTER_TILE_SIZE).
#define RASTER_TILE_SIZE 8
|
|
#define RASTER_RCP_TILE_SIZE (1.f / RASTER_TILE_SIZE)
|
|
#define RASTER_TILE_SIZE_AS_SHIFT 3
|
|
#define RASTER_THREAD_COUNT 64
|
|
|
|
// Converts a linear (row-major) index inside a square tile into a 2D coordinate:
// x = In % InTileSize, y = In / InTileSize.
// InRcpTileSize is kept for interface compatibility only; the integer path below does not use it
// (the float-reciprocal variant was compiled out).
uint2 LinearTo2D_Common(uint In, uint InTileSize, float InRcpTileSize)
{
	return uint2(In % InTileSize, In / InTileSize);
}
|
|
|
|
// Linear thread index -> 2D pixel offset inside a bin tile (BIN_TILE_SIZE wide).
uint2 LinearTo2D_Bin(uint In)
{
	return LinearTo2D_Common(In, BIN_TILE_SIZE, BIN_RCP_TILE_SIZE);
}
|
|
// Linear thread index -> 2D pixel offset inside a raster tile (RASTER_TILE_SIZE wide).
uint2 LinearTo2D_Raster(uint In)
{
	return LinearTo2D_Common(In, RASTER_TILE_SIZE, RASTER_RCP_TILE_SIZE);
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
// DDA helper
|
|
|
|
// TODO: Without setting a limit, we are sometimes caught in an infinite loop even though this should not happen.
// Upper bound on the number of DDA steps taken per segment by the traversal loops in BinningCS.
|
|
#define DDA_MAX_ITERATIONS 256
|
|
|
|
// State of a 2D DDA (grid traversal) along a ray. Created by DDACreateContext(), stepped by DDAAdvance().
struct FDDAContext
|
|
{
|
|
float2 Coord; // Current cell coordinate (starts at floor(RayStart))
|
|
float2 DeltaDist; // Per-axis ray distance between two consecutive cell boundaries (abs(1 / RayDir))
|
|
float2 Step; // Per-axis cell increment (sign(RayDir))
|
|
float2 SideDist; // Per-axis ray distance to the next cell boundary
|
|
};
|
|
|
|
// Builds the DDA traversal state for a 2D ray on a grid of unit cells.
//  RayStart : ray origin, in cell units.
//  RayDir   : ray direction (callers in this file pass a normalized direction).
// Returns a context positioned on the cell containing RayStart, ready to be stepped with DDAAdvance().
//
// A direction component that is exactly zero means the ray never crosses a cell boundary on that axis.
// Dividing by it produces inf/NaN side distances; the comparison in DDAAdvance() can then keep selecting
// an axis whose Step is 0, so the traversal stalls on the start cell until the caller's iteration limit
// is hit (axis-aligned segments). Such an axis is given a huge, constant side distance instead, so it is
// never selected. Non-zero components are computed exactly as before.
FDDAContext DDACreateContext(float2 RayStart, float2 RayDir)
{
	const float NeverCrossed = 3.402823466e+38f; // Largest finite float

	const bool2 bAxisMoves = bool2(RayDir.x != 0.0f, RayDir.y != 0.0f);
	const float2 SafeRayDir = float2(bAxisMoves.x ? RayDir.x : 1.0f, bAxisMoves.y ? RayDir.y : 1.0f);
	const float2 RayDirRcp = 1.0f / SafeRayDir;

	FDDAContext Context;
	Context.Coord = floor(RayStart);
	Context.Step = sign(RayDir);

	// Distance along the ray between two boundaries, and to the first boundary, per axis.
	const float2 DeltaDist = abs(RayDirRcp);
	const float2 SideDist = (Context.Coord - RayStart + max(Context.Step, 0.0f)) * RayDirRcp;

	// Static axes: the side distance stays at NeverCrossed (a zero delta keeps it there even if it were added).
	Context.DeltaDist = float2(bAxisMoves.x ? DeltaDist.x : 0.0f, bAxisMoves.y ? DeltaDist.y : 0.0f);
	Context.SideDist = float2(bAxisMoves.x ? SideDist.x : NeverCrossed, bAxisMoves.y ? SideDist.y : NeverCrossed);

	return Context;
}
|
|
|
|
// Moves the traversal to the next cell: steps along the axis whose next cell boundary is the closest
// (X when SideDist.x < SideDist.y, Y otherwise) and pushes that axis' boundary distance forward.
void DDAAdvance(inout FDDAContext Context)
{
	const bool bStepAlongX = Context.SideDist.x < Context.SideDist.y;
	if (bStepAlongX)
	{
		Context.Coord.x += Context.Step.x;
		Context.SideDist.x += Context.DeltaDist.x;
	}
	else
	{
		Context.Coord.y += Context.Step.y;
		Context.SideDist.y += Context.DeltaDist.y;
	}
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
// Visibility Tile data
|
|
/*
// Use an untyped buffer for segment tiles to reduce VGPR usage - VT_SIZE uints (20 bytes) per tile
struct FVisTile
{
	uint PrimOffset;
	uint PrimCount;
	uint TileCoord;
	uint MinWriteIndex;
	uint MinMaxDepth;
};
*/
|
|
// Index (in uints) of each entry inside a visibility tile record; VT_SIZE is the number of entries per record.
// See LoadOutVisTileData/StoreOutVisTileData/LoadVisTileData for the byte addressing.
#define VT_PrimOffset 0
|
|
#define VT_PrimCount 1
|
|
#define VT_Coord 2
|
|
#define VT_MinWriteIndex 3
|
|
#define VT_MinMaxDepth 4
|
|
#define VT_SIZE 5
|
|
|
|
// Visibility tile data are stored as:
|
|
// ________________________________________________________________________________________________________________________________________________________________________________________________________________
|
|
// || Tile 0 || Tile 1 || Tile 2 ||
|
|
// ||____________________________________________________________________||____________________________________________________________________||____________________________________________________________________||
|
|
// || | | | | || | | | | || | | | | ||
|
|
// || PrimOffset | PrimCount | (Tile)Coord | MinWriteIndex | MinMaxDepth || PrimOffset | PrimCount | (Tile)Coord | MinWriteIndex | MinMaxDepth || PrimOffset | PrimCount | (Tile)Coord | MinWriteIndex | MinMaxDepth ||
|
|
|
|
// Packs a bin tile coordinate into 16 bits: x in bits [0..7], y in bits [8..15].
// Each component is truncated to its low 8 bits.
uint PackVisTileCoord(uint2 In)
{
	const uint2 Masked = In & 0xff;
	return Masked.x | (Masked.y << 8);
}
|
|
|
|
// Inverse of PackVisTileCoord: x comes from bits [0..7], y from bits [8..15].
uint2 UnpackVisTileCoord(uint In)
{
	uint2 Out;
	Out.x = In & 0xff;
	Out.y = (In >> 8) & 0xff;
	return Out;
}
|
|
|
|
// Reads entry VTEntry (one of the VT_* indices) of visibility tile Index from a writable tile buffer.
uint LoadOutVisTileData(RWByteAddressBuffer OutBuffer, uint Index, uint VTEntry)
{
	// A tile record is VT_SIZE entries, each entry is 4 bytes
	const uint ByteOffset = (Index * VT_SIZE + VTEntry) * 4;
	return OutBuffer.Load(ByteOffset);
}
|
|
|
|
// Writes Value into entry VTEntry (one of the VT_* indices) of visibility tile Index.
void StoreOutVisTileData(RWByteAddressBuffer OutBuffer, uint Index, uint VTEntry, uint Value)
{
	// A tile record is VT_SIZE entries, each entry is 4 bytes
	const uint ByteOffset = (Index * VT_SIZE + VTEntry) * 4;
	OutBuffer.Store(ByteOffset, Value);
}
|
|
|
|
// Reads entry VTEntry (one of the VT_* indices) of visibility tile Index from a read-only tile buffer.
uint LoadVisTileData(ByteAddressBuffer InBuffer, uint Index, uint VTEntry)
{
	// A tile record is VT_SIZE entries, each entry is 4 bytes
	const uint ByteOffset = (Index * VT_SIZE + VTEntry) * 4;
	return InBuffer.Load(ByteOffset);
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
// Misc.
|
|
// Returns Color with its alpha replaced by 0.5.
float4 Transparent(float4 Color)
{
	Color.w = 0.5f;
	return Color;
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
// Common parameters
|
|
|
|
int2 BinTileRes; // Used by BinningCS as the upper clamp bound of bin-tile-space coordinates (presumably the bin tile count along X/Y)
|
|
int2 RasterTileRes; // NOTE(review): presumably the raster tile count along X/Y - not used in the code visible here
|
|
|
|
uint NumBinners; // Number of binning workgroups: BinningCS splits the segment batches between them and gives each its own binning grid layers
|
|
float RcpNumBinners; // NOTE(review): presumably 1 / NumBinners - confirm against the CPU side
|
|
|
|
uint NumRasterizers; // NOTE(review): presumably the number of rasterizer workgroups - not used in the code visible here
|
|
float RcpNumRasterizers; // NOTE(review): presumably 1 / NumRasterizers - confirm against the CPU side
|
|
|
|
int2 OutputResolution; // Output size in pixels: scales UVs in NDCToPixelCoord and bounds pixel reads in PrepareDepthGridCS
|
|
float2 OutputResolutionf; // NOTE(review): presumably OutputResolution as floats - confirm
|
|
float RadiusAtDepth1; // NOTE(review): not used in the code visible here
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
// Control points
|
|
Buffer<float4> ControlPoints; // Packed control points: xyz = position, w = radius/UCoord/type bits (see UnpackHairControlPoint)
|
|
StructuredBuffer<uint> ControlPointCount; // Element 0 holds the control point count (see GetControlPointCount)
|
|
uint MaxControlPointCount; // NOTE(review): presumably the capacity of ControlPoints - not used in the code visible here
|
|
|
|
// Returns the number of control points, read from the first element of ControlPointCount.
uint GetControlPointCount()
{
	const uint Count = ControlPointCount[0];
	return Count;
}
|
|
|
|
// Custom encoding for the forward rasterizer: positions are stored as 32-bit floats. This is temporary.
// The w component carries raw bits:
//   [ 0..15] radius as a 16-bit float
//   [16..23] U coordinate as an 8-bit unorm
//   [24..25] control point type
FHairControlPoint UnpackHairControlPoint(uint InPrimId)
{
	const float4 Packed = ControlPoints[InPrimId];
	const uint Bits = asuint(Packed.w);

	FHairControlPoint Out;
	Out.Position = Packed.xyz;
	Out.WorldRadius = f16tof32(Bits & 0xFFFF);
	Out.UCoord = ((Bits >> 16) & 0xFF) * (1.f / 255.f);
	Out.Type = (Bits >> 24) & 0x3;
	return Out;
}
|
|
|
|
#if SHADER_RASTERCOMPUTE_BINNING || SHADER_RASTERCOMPUTE_COMPACTION || SHADER_RASTERCOMPUTE_RASTER || SHADER_RASTERCOMPUTE_DEPTH_GRID || SHADER_RASTERCOMPUTE_DEBUG
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
|
|
// Converts a homogeneous clip-space position into (pixel x, pixel y, NDC depth):
// perspective divide, NDC -> UV with the view's ScreenPositionScaleBias, then UV -> pixels with OutputResolution.
float3 NDCToPixelCoord(float4 InDC)
{
	const float3 NDC = InDC.xyz / InDC.w;

	const float2 UVScale = ResolvedView.ScreenPositionScaleBias.xy;
	const float2 UVBias = ResolvedView.ScreenPositionScaleBias.wz;
	const float2 UV = NDC.xy * UVScale + UVBias;

	float3 Out;
	Out.xy = UV * OutputResolution;
	Out.z = NDC.z;
	return Out;
}
|
|
|
|
// Fetches control point PrimId and outputs its homogeneous clip-space position and its type.
void CalcHomogenousPos(in uint PrimId, out float4 HP, out uint Type)
{
	const FHairControlPoint CP = UnpackHairControlPoint(PrimId);
	Type = CP.Type;

	// CP.Position is actually a world position
	const float4 WorldPos = float4(CP.Position, 1.0f);
	HP = mul(WorldPos, DFHackToFloat(PrimaryView.WorldToClip)); // TODO move this at least into translated world space
}
|
|
|
|
// Fetches control point PrimId and outputs its homogeneous clip-space position, world radius and type.
void CalcHomogenousPosAndRad(in uint PrimId, out float4 HP, out float Rad, out uint Type)
{
	const FHairControlPoint CP = UnpackHairControlPoint(PrimId);
	Rad = CP.WorldRadius;
	Type = CP.Type;

	// CP.Position is actually a world position
	const float4 WorldPos = float4(CP.Position, 1.0f);
	HP = mul(WorldPos, DFHackToFloat(PrimaryView.WorldToClip));
}
|
|
|
|
// Projects the center of pixel Coord onto segment [P0, P1] and returns the interpolation factor
// between P0 (0) and P1 (1), clamped to [0, 1]:
//   Alpha = dot(P - P0, P1 - P0) / dot(P1 - P0, P1 - P0)
// SegmentLenSqRcp is the precomputed reciprocal of the squared segment length.
float ComputeLerpAlpha(int2 Coord, float2 P0, float2 P1, float SegmentLenSqRcp)
{
	const float2 PixelCenter = Coord + 0.5f;
	const float2 ToPixel = PixelCenter - P0;
	const float2 Segment = P1 - P0;
	return saturate(dot(ToPixel, Segment) * SegmentLenSqRcp);
}
|
|
|
|
// Interpolates the radius between both segment end points with perspective correction.
// RcpW0/RcpW1 are the reciprocals of the end points' w, so the perspective-correct factor
//   (Alpha / w1) / ((1 - Alpha) / w0 + Alpha / w1)
// simplifies to (Alpha * RcpW1) / lerp(RcpW0, RcpW1, Alpha).
float ComputePerspectiveCorrectRadius(float Rad0, float Rad1, float Alpha, float RcpW0, float RcpW1)
{
	const float RcpW = lerp(RcpW0, RcpW1, Alpha);
	const float CorrectedAlpha = (Alpha * RcpW1) / RcpW;

	// Divide by W to make thickness dependent on screen space depth? This division was kept from the previous line rasterization algorithm.
	return lerp(Rad0, Rad1, CorrectedAlpha) * RcpW;
}
|
|
|
|
// Line clipping based on "CLIPPING USING HOMOGENEOUS COORDINATES" by Blinn et al.
// Clips segment [P0, P1], given in homogeneous clip space, against the six planes
//   w+x >= 0, w-x >= 0, w+y >= 0, w-y >= 0, z >= 0, w-z >= 0.
// Returns true and replaces P0/P1 with the clipped end points when part of the segment is visible.
// Returns false and leaves P0/P1 untouched when the segment is entirely clipped: both points behind the
// camera, both points outside the same plane, or the remaining parametric interval being empty
// (e.g. a segment that passes outside a corner of the frustum).
bool BlinnLineClipping(inout float4 P0, inout float4 P1)
{
	// Visible parametric interval along P0 -> P1: T.x = entry, T.y = exit
	float2 T = float2(0.0f, 1.0f);
	bool bIsRemoved = P0.w < 0.0f && P1.w < 0.0f; // Both points behind near plane

	bool bSign = false;

	UNROLL
	for (uint PlaneIdx = 0; PlaneIdx < 6; ++PlaneIdx)
	{
		// Compute boundary coordinates of both points (w+x, w-x, w+y, w-y, z, w-z)
		bSign = !bSign;
		const uint CompIdx = PlaneIdx / 2;
		const float Sign = bSign ? 1.0f : -1.0f;
		const float WFactor = PlaneIdx != 4 ? 1.0f : 0.0f;
		const float2 BC = WFactor * float2(P0.w, P1.w) + Sign * float2(P0[CompIdx], P1[CompIdx]);

		float Num = BC.x;
		float Denom = BC.x - BC.y;
		bIsRemoved = bIsRemoved || (BC.x < 0.0f && BC.y < 0.0f); // Both points outside the plane
		float Alpha = Num / Denom;

		// If the denominator is negative, P0 has a smaller boundary coordinate than P1, so we can assume
		// that P1 is inside the plane (or bIsRemoved is true), so we need to update the alpha for P0.
		// The reverse is true if the denominator is positive.
		if (Denom < 0.0f)
		{
			T.x = max(T.x, Alpha);
		}
		else
		{
			T.y = min(T.y, Alpha);
		}
	}

	// The segment enters the clip volume after it has left it: nothing is visible. Without this test
	// the end points would be interpolated to positions outside the clip volume.
	bIsRemoved = bIsRemoved || (T.x > T.y);

	if (!bIsRemoved)
	{
		const float4 P0Clipped = lerp(P0, P1, T.x);
		const float4 P1Clipped = lerp(P0, P1, T.y);
		P0 = P0Clipped;
		P1 = P1Clipped;
	}

	return !bIsRemoved;
}
|
|
|
|
// Intersects the 2D segment [P0.xy, P1.xy] with an axis-aligned box (slab test).
//  T        : parametric interval of the segment inside the box (0 = P0, 1 = P1); (0, 1) when both points are inside.
//  bClipped : x/y are set when P0/P1 respectively lies outside the box and has to be moved onto it;
//             both are set when the segment misses the box.
// Returns false when the segment does not touch the box, true otherwise.
bool ClipRaySegment(float2 AABBMin, float2 AABBMax, float4 P0, float4 P1, out float2 T, out bool2 bClipped)
{
	T = float2(0.0f, 1.0f);
	bClipped = false;

	const bool bP0Outside = any(P0.xy < AABBMin) || any(P0.xy > AABBMax);
	const bool bP1Outside = any(P1.xy < AABBMin) || any(P1.xy > AABBMax);
	if (!(bP0Outside || bP1Outside))
	{
		// Fully inside, nothing to clip
		return true;
	}

	// Parametric distances to the min/max slabs of each axis
	const float2 SegmentDir = P1.xy - P0.xy;
	const float2 InvSegmentDir = 1.0f / SegmentDir;
	const float2 TSlabMin = (AABBMin - P0.xy) * InvSegmentDir;
	const float2 TSlabMax = (AABBMax - P0.xy) * InvSegmentDir;

	const float2 TNear = min(TSlabMin, TSlabMax);
	const float2 TFar = max(TSlabMin, TSlabMax);
	T.x = max(TNear.x, TNear.y);
	T.y = min(TFar.x, TFar.y);

	// Ray intersects the AABB but the segment is completely outside or no intersection at all.
	const bool bMissed = (T.y < 0.0f) || (T.x > T.y) || (T.x > 1.f);
	if (bMissed)
	{
		bClipped = true;
		return false;
	}

	bClipped.x = bP0Outside && T.x > 0.0f && T.x < 1.0f;
	bClipped.y = bP1Outside && T.y > 0.0f && T.y < 1.0f;
	return true;
}
|
|
|
|
// Clips the segment [P0, P1] against an axis-aligned 2D box and updates the end points in place.
// For every end point that lies outside the box and is clipped, its position and radius are
// interpolated along the original segment, its Alpha is set to the clip parameter and the matching
// bClipped component is raised. T receives the parametric interval computed by the slab test.
// Returns false (nothing modified besides T/bClipped) when the segment does not touch the box.
bool ClipRaySegment(float2 AABBMin, float2 AABBMax, inout float4 P0, inout float4 P1, inout float Rad0, inout float Rad1, inout float Alpha0, inout float Alpha1, out bool2 bClipped, out float2 T)
{
	const bool bIsValid = ClipRaySegment(AABBMin, AABBMax, P0, P1, T, bClipped);
	if (!bIsValid)
	{
		return false;
	}

	const bool bP0Outside = any(P0.xy < AABBMin) || any(P0.xy > AABBMax);
	const bool bP1Outside = any(P1.xy < AABBMin) || any(P1.xy > AABBMax);
	const bool bClipP0 = bP0Outside && T.x > 0.0f && T.x < 1.0f;
	const bool bClipP1 = bP1Outside && T.y > 0.0f && T.y < 1.0f;

	// Both interpolations must be done along the unclipped segment
	const float4 SrcP0 = P0;
	const float4 SrcP1 = P1;
	const float SrcRad0 = Rad0;
	const float SrcRad1 = Rad1;

	if (bClipP0)
	{
		Alpha0 = T.x;
		P0 = lerp(SrcP0, SrcP1, T.x);
		Rad0 = lerp(SrcRad0, SrcRad1, T.x);
		bClipped.x = true;
	}
	if (bClipP1)
	{
		Alpha1 = T.y;
		P1 = lerp(SrcP0, SrcP1, T.y);
		Rad1 = lerp(SrcRad0, SrcRad1, T.y);
		bClipped.y = true;
	}

	return true;
}
|
|
|
|
#endif // Common rasterizer helper functions & parameters
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
|
|
#if SHADER_RASTERCOMPUTE_DEPTH_GRID
|
|
|
|
Texture2D<float> SceneDepthTexture; // Scene depth, read once per pixel by PrepareDepthGridCS
|
|
RWTexture2D<uint> OutVisTileDepthGrid; // One texel per bin tile: packed (PackDepth) furthest scene depth of the tile
|
|
groupshared uint group_FurthestDepth; // (4 bytes)
|
|
|
|
// Builds the per-bin-tile depth grid: one group per bin tile, one thread per pixel of the tile.
// Every thread loads the scene depth of its pixel; the minimum of the packed depths (the furthest depth,
// since depth is inverse-Z) is written to OutVisTileDepthGrid at the group's tile coordinate.
[numthreads(BIN_THREAD_COUNT, 1, 1)]
void PrepareDepthGridCS(uint DispatchThreadID : SV_DispatchThreadID, uint GroupThreadID : SV_GroupThreadID, uint2 GroupID : SV_GroupID)
{
	const bool bIsFirstThread = (GroupThreadID == 0);

	if (bIsFirstThread)
	{
		group_FurthestDepth = 0xFFFFFFFF; // Inverse-Z
	}

	GroupMemoryBarrierWithGroupSync();

	// Assumes a workgroup size of 1024 threads to cover a maximum tile size of 32x32.
	if (GroupThreadID < BIN_THREAD_COUNT)
	{
		const uint2 TileOrigin = GroupID * BIN_TILE_SIZE;
		const uint2 PixelCoord = LinearTo2D_Bin(GroupThreadID) + TileOrigin;

		// Pixels of border tiles can fall outside of the output
		const bool bInsideOutput = all(PixelCoord < (uint2)OutputResolution);
		if (bInsideOutput)
		{
			const float Depth = SceneDepthTexture.Load(uint3(PixelCoord, 0));

			// Compute furthest depth inside this tile
			WaveInterlockedMin(group_FurthestDepth, PackDepth(Depth)); // Inverse-Z
		}
	}

	GroupMemoryBarrierWithGroupSync();

	if (bIsFirstThread)
	{
		OutVisTileDepthGrid[GroupID] = group_FurthestDepth;
	}
}
|
|
|
|
#endif //SHADER_RASTERCOMPUTE_DEPTH_GRID
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
|
|
// When enabled, BinningCS also tracks the min/max segment depth of every bin tile and stores it in the tile's VT_MinMaxDepth entry.
#define BIN_MINMAX 1
|
|
|
|
#if SHADER_RASTERCOMPUTE_BINNING
|
|
|
|
// Per bin tile counters, 3 layers per binner workgroup (see the layer indices computed in BinningCS):
// segment count, second-pass segment count, and tile allocation info (previous tile << 16 | current tile).
RWTexture2DArray<uint> OutVisTileBinningGrid;
|
|
#if BIN_MINMAX
|
|
RWTexture2DArray<uint> OutVisTileBinningGridMinZ; // Per bin tile / per binner min packed segment depth
|
|
RWTexture2DArray<uint> OutVisTileBinningGridMaxZ; // Per bin tile / per binner max packed segment depth
|
|
#endif
|
|
RWBuffer<uint> OutVisTilePrims; // Binned PrimIDs, 1024 slots per visibility tile
|
|
RWBuffer<uint> OutVisTilePrimDepths; // Packed max depth of each binned segment, same indexing as OutVisTilePrims
|
|
RWBuffer<uint> OutVisTileArgs; // Element 0: number of allocated visibility tiles
|
|
RWByteAddressBuffer OutVisTileData; // Visibility tile records (VT_SIZE uints each)
|
|
Texture2D<uint> VisTileDepthGrid; // Per bin tile packed furthest scene depth, used to reject occluded segments
|
|
ByteAddressBuffer IndirectPrimIDCount; // NOTE(review): not used in the code visible here
|
|
|
|
groupshared uint group_LoopNum; // Number of batches each binner workgroup loops over
|
|
groupshared uint group_VerticesNum; // Total number of control points
|
|
groupshared uint group_BatchNum; // Total number of 1024-control-point batches
|
|
|
|
// Capacity of the per-iteration list of tiles waiting for an allocation
#define TILES_TO_ALLOCATE_MAX 1024
|
|
|
|
groupshared uint group_TilesToAllocate[TILES_TO_ALLOCATE_MAX]; // Packed (PackVisTileCoord) coordinates of the tiles to allocate
|
|
groupshared uint group_TilesToAllocateCount; // Number of allocation requests; can exceed TILES_TO_ALLOCATE_MAX
|
|
|
|
// The total number of line segments (ControlPointCount) is divided up equally between N binners - each binner = a workgroup which loops through the designated set segments in batches of 1024
|
|
// NB there is still potential to use LDS to prevent/reduce contention when adding segments to the binning grid and this may improve perf
|
|
|
|
// Bins hair segments into screen-space bin tiles.
// The control points are split into batches of 1024 (one control point/segment per thread); every binner
// workgroup loops over group_LoopNum consecutive batches. For every batch:
//   1. project the segment (PrimID, PrimID + 1) and clip it against the view frustum
//   2. reset the tile allocation request list
//   3. walk the bin tiles covered by the segment (DDA), count the segment in each tile which is not occluded,
//      and request a new visibility tile each time a tile's count starts a new block of 1024 segments
//   4. allocate the requested visibility tiles
//   5. walk the tiles again and write the PrimID/depth into the visibility tile owning the write slot
// Each workgroup owns three layers of OutVisTileBinningGrid: segment count, second-pass segment count and
// allocation info (previous tile << 16 | current tile).
[numthreads(1024, 1, 1)]
void BinningCS(uint GroupThreadID : SV_GroupThreadID, uint GroupID : SV_GroupID)
{
	ResolvedView = ResolveView();

	// Thread 0 publishes the work split for the whole group
	if (GroupThreadID == 0)
	{
		group_VerticesNum = GetControlPointCount();
		group_BatchNum = DivideAndRoundUp(group_VerticesNum, 1024);
		group_LoopNum = DivideAndRoundUp(group_BatchNum, NumBinners);
	}
	GroupMemoryBarrierWithGroupSync();

#if PERMUTATION_DEBUG
	const bool bDebugEnabled = false && GroupID == 0 && GroupThreadID <= 64;
	FShaderPrintContext Ctx = InitShaderPrintContext(bDebugEnabled, uint2(250, 50));
#endif

	const uint SegmentCountLayerIdx = GroupID; // Stores number of segments per tile per workgroup.
	const uint TmpSegmentCountLayerIdx = SegmentCountLayerIdx + NumBinners; // Also stores number of segments per tile per workgroup. Used as second counter for this two pass algorithm.
	const uint TileAllocInfoLayerIdx = SegmentCountLayerIdx + NumBinners * 2; // Stores per tile per workgroup allocation info.

	LOOP for (uint LoopIndex = 0; LoopIndex < group_LoopNum; LoopIndex++)
	{
		const uint BatchIndex = LoopIndex + (GroupID * group_LoopNum);
		const uint PrimID = BatchIndex * 1024 + GroupThreadID;
		bool bSegValid = (BatchIndex < group_BatchNum) && (PrimID < group_VerticesNum);

		uint MaxZ = 0;
		uint MinZ = 0xFFFFFFFF;
		float2 TileCoord0F = 0.0f;
		float2 TileCoord1F = 0.0f;

#if PERMUTATION_DEBUG
		FHairControlPoint CP0;
		FHairControlPoint CP1;
#endif

		// 1. Project segment end points and clip them to the screen
		float4 H0 = 0.0f;
		float4 H1 = 0.0f;
		uint Type = -1;
		if (bSegValid)
		{
			// The last control point of a curve does not start a segment
			CalcHomogenousPos(PrimID, H0, Type);
			bSegValid = !(Type == HAIR_CONTROLPOINT_END);
		}
		if (bSegValid)
		{
			CalcHomogenousPos(PrimID + 1, H1, Type);

			// Do clipping in homogenous coordinates
			bSegValid = BlinnLineClipping(H0, H1);
		}
		if (bSegValid)
		{
			// Pixel coordinates -> bin tile coordinates
			float3 SP0 = NDCToPixelCoord(H0);
			float3 SP1 = NDCToPixelCoord(H1);
			SP0.xy *= BIN_RCP_TILE_SIZE;
			SP1.xy *= BIN_RCP_TILE_SIZE;

			// For peace of mind, make sure these are actually clamped to a valid range.
			SP0 = clamp(SP0, 0.0f, float3(BinTileRes-0.01f, 1.0f));
			SP1 = clamp(SP1, 0.0f, float3(BinTileRes-0.01f, 1.0f));

			MaxZ = PackDepth(max(SP0.z, SP1.z));
			MinZ = PackDepth(min(SP0.z, SP1.z));

			TileCoord0F = SP0.xy;
			TileCoord1F = SP1.xy;

#if PERMUTATION_DEBUG
			if (bDebugEnabled && 0)
			{
				CP0 = UnpackHairControlPoint(PrimID);
				CP1 = UnpackHairControlPoint(PrimID +1);

				AddLineWS(Ctx, CP0.Position, CP1.Position, ColorRed);
			}
#endif
		}

		// 2. Reset allocation counter
		if (GroupThreadID == 0)
		{
			group_TilesToAllocateCount = 0;
		}
		GroupMemoryBarrierWithGroupSync();

		// 3. Increment per workgroup per tile counters and add tiles to be allocated
		if (bSegValid)
		{
			FDDAContext DDAContext = DDACreateContext(TileCoord0F, normalize(TileCoord1F - TileCoord0F));
			const int2 EndCoord = (int2)floor(TileCoord1F);

			for (int DDAIt = 0; DDAIt < DDA_MAX_ITERATIONS; ++DDAIt)
			{
				const int2 TileCoord = (int2)floor(DDAContext.Coord);

				uint DebugInsertMode = 0;
				BRANCH
				if (MaxZ > VisTileDepthGrid[TileCoord]) // Inverse-Z
				{
					const uint3 CountTexel = uint3(TileCoord, SegmentCountLayerIdx);

					uint OldTileSegmentCount;
					InterlockedAdd(OutVisTileBinningGrid[CountTexel], 1, OldTileSegmentCount);
					DebugInsertMode = 1;

					// Min/Max
#if BIN_MINMAX
					InterlockedMin(OutVisTileBinningGridMinZ[CountTexel], MinZ);
					InterlockedMax(OutVisTileBinningGridMaxZ[CountTexel], MaxZ);
#endif

					// The segment opening a new block of 1024 slots requests the visibility tile backing it
					BRANCH
					if ((OldTileSegmentCount % 1024) == 0)
					{
						uint WritePos;
						InterlockedAdd(group_TilesToAllocateCount, 1, WritePos);
						if (WritePos < TILES_TO_ALLOCATE_MAX)
						{
							group_TilesToAllocate[WritePos] = PackVisTileCoord(TileCoord);
							DebugInsertMode = 2;
						}
					}
				}

#if PERMUTATION_DEBUG
				if (bDebugEnabled)
				{
					//CP0 = UnpackHairControlPoint(PrimID);
					//CP1 = UnpackHairControlPoint(PrimID +1);
					//AddLineWS(Ctx, CP0.Position, CP1.Position, ColorRed);

					float4 DebugColor = ColorRed;
					if (DebugInsertMode == 1) DebugColor = ColorGreen;
					if (DebugInsertMode == 2) DebugColor = ColorYellow;
					AddQuadSS(Ctx, TileCoord * BIN_TILE_SIZE, TileCoord * BIN_TILE_SIZE + BIN_TILE_SIZE, DebugColor);
				}
#endif

				if (all(TileCoord == EndCoord))
				{
					break;
				}

				DDAAdvance(DDAContext);
			}
		}

		GroupMemoryBarrierWithGroupSync();

		// 4. Allocate tiles
		const uint TilesToAllocateCount = min(TILES_TO_ALLOCATE_MAX, group_TilesToAllocateCount);
		for (uint TileIdx = GroupThreadID; TileIdx < TilesToAllocateCount; TileIdx += 1024)
		{
			const uint PackedTileCoord = group_TilesToAllocate[TileIdx];
			const uint2 TileCoord = UnpackVisTileCoord(PackedTileCoord);

			const uint TotalNewWriteCount = OutVisTileBinningGrid[uint3(TileCoord, SegmentCountLayerIdx)];
			const uint TotalOldWriteCount = OutVisTileBinningGrid[uint3(TileCoord, TmpSegmentCountLayerIdx)];

#if BIN_MINMAX
			FDepthRange TileDepthRange;
			TileDepthRange.MinZ = UnpackDepth(OutVisTileBinningGridMinZ[uint3(TileCoord, SegmentCountLayerIdx)]);
			TileDepthRange.MaxZ = UnpackDepth(OutVisTileBinningGridMaxZ[uint3(TileCoord, SegmentCountLayerIdx)]);
#endif

			uint NewTile;
			WaveInterlockedAddScalar_(OutVisTileArgs[0], 1, NewTile);

			StoreOutVisTileData(OutVisTileData, NewTile, VT_Coord, PackedTileCoord);
			// First write slot owned by the new tile, i.e. the start of the 1024-slot block holding the last
			// counted segment; step 5 compares against it to decide which tile to write to.
			// The block is derived from the last slot (count - 1): with the count itself, a count landing
			// exactly on a multiple of 1024 would point one block too far and no segment would ever be
			// written to the new tile. The count is >= 1 here since a request implies a counted segment.
			StoreOutVisTileData(OutVisTileData, NewTile, VT_MinWriteIndex, (TotalNewWriteCount - 1u) & ~1023u);
			// Min/Max depth
#if BIN_MINMAX
			StoreOutVisTileData(OutVisTileData, NewTile, VT_MinMaxDepth, PackDepthRange(TileDepthRange));
#endif

			const uint PrevTile = (OutVisTileBinningGrid[uint3(TileCoord, TileAllocInfoLayerIdx)] & 0xffff);

			// Segments were already written at this coordinate: the tile used so far ends up full
			if (TotalOldWriteCount > 0)
			{
				StoreOutVisTileData(OutVisTileData, PrevTile, VT_PrimCount, 1024);
			}

			OutVisTileBinningGrid[uint3(TileCoord, TileAllocInfoLayerIdx)] = (PrevTile << 16) | (NewTile & 0xffff);
		}

		GroupMemoryBarrierWithGroupSync();

		// 5. Write PrimID to tiles
		if (bSegValid)
		{
			FDDAContext DDAContext = DDACreateContext(TileCoord0F, normalize(TileCoord1F - TileCoord0F));
			const int2 EndCoord = (int2)floor(TileCoord1F);

			for (int DDAIt = 0; DDAIt < DDA_MAX_ITERATIONS; ++DDAIt)
			{
				const int2 TileCoord = (int2)floor(DDAContext.Coord);

				BRANCH
				if (MaxZ > VisTileDepthGrid[TileCoord]) // Inverse-Z
				{
					const uint PackedTiles = OutVisTileBinningGrid[uint3(TileCoord, TileAllocInfoLayerIdx)];
					const uint CurTile = (PackedTiles & 0xffff);
					const uint PrevTile = ((PackedTiles >> 16) & 0xffff);

					// Currently we need this to get our write position, but maybe there is a cheaper way to keep track of that?
					uint OldTileSegmentCount;
					InterlockedAdd(OutVisTileBinningGrid[uint3(TileCoord, TmpSegmentCountLayerIdx)], 1, OldTileSegmentCount);

					const bool bWriteToCurTile = OldTileSegmentCount >= LoadOutVisTileData(OutVisTileData, CurTile, VT_MinWriteIndex);
					const uint LocalWritePos = OldTileSegmentCount % 1024;
					const uint WritePos = (bWriteToCurTile ? CurTile : PrevTile) * 1024 + LocalWritePos;

					OutVisTilePrims[WritePos] = PrimID;
					OutVisTilePrimDepths[WritePos] = MaxZ; // Inverse-Z
					BRANCH
					if (bWriteToCurTile)
					{
						// The thread writing the last counted segment publishes the fill count of the current tile
						if ((OldTileSegmentCount + 1) == OutVisTileBinningGrid[uint3(TileCoord, SegmentCountLayerIdx)])
						{
							// LocalWritePos + 1 is in [1, 1024]: a tile whose last slot is written is full,
							// whichever multiple of 1024 the running count has reached.
							StoreOutVisTileData(OutVisTileData, CurTile, VT_PrimCount, LocalWritePos + 1);
						}
					}
				}

				if (all(TileCoord == EndCoord))
				{
					break;
				}

				DDAAdvance(DDAContext);
			}
		}
	}
}
|
|
#endif //SHADER_RASTERCOMPUTE_BINNING
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
|
|
#if SHADER_RASTERCOMPUTE_COMPACTION
|
|
|
|
ByteAddressBuffer InData; // Input visibility tile records (VT_SIZE uints each)
|
|
Buffer<uint> InPrims; // Input PrimIDs, 1024 slots per input tile
|
|
Buffer<uint> InDepths; // Input packed segment depths, same indexing as InPrims
|
|
Buffer<uint> InArgs; // Element 0: number of input tiles
|
|
RWByteAddressBuffer OutData; // Compacted visibility tile records
|
|
RWBuffer<uint> OutPrims; // Compacted, depth-bucketed PrimIDs, 1024 slots per compacted tile
|
|
RWBuffer<uint> OutArgs; // Element 0: number of compacted tiles allocated so far
|
|
RWStructuredBuffer<uint> OutWork; // Offset & Count
|
|
RWStructuredBuffer<uint> OutDataCount; // NOTE(review): not written by the code visible here
|
|
RWStructuredBuffer<uint> OutWorkCount; // Element 0: number of entries in OutWork
|
|
|
|
groupshared uint group_TotalPrimCount; // Total primitive count at the group's tile coordinate
|
|
groupshared uint group_PrimWriteOffset; // First OutPrims slot of the group's compacted tiles
|
|
groupshared uint group_NumTiles; // Number of input tiles matching the group's coordinate (can exceed the list capacity)
|
|
groupshared uint group_TilesToCompact[1024]; // Indices of the matching input tiles
|
|
groupshared uint group_MaxLDSTileIdx; // Largest input tile index stored in group_TilesToCompact
|
|
groupshared uint group_MinZ; // Packed (PackDepth) min depth over the matching tiles
|
|
groupshared uint group_MaxZ; // Packed (PackDepth) max depth over the matching tiles
|
|
|
|
// Number of depth buckets used to order primitives by depth
#define COMPACTION_DEPTH_BUCKET 1024
|
|
groupshared uint s_BinOffset[COMPACTION_DEPTH_BUCKET]; // Exclusive prefix sum of the bucket counts
|
|
groupshared uint s_BinCount[COMPACTION_DEPTH_BUCKET]; // Per bucket primitive count, reused as insertion counter
|
|
|
|
// Maps a depth to a depth bucket in [0, COMPACTION_DEPTH_BUCKET - 1], relative to the group's
// [group_MinZ, group_MaxZ] depth range. The index is reversed so that larger depth values
// (closer, since depth is inverse-Z) land in lower buckets.
uint GetDepthBinIndex(float InDepth)
{
	// Inverse-Z
	const float MinDepth = UnpackDepth(group_MinZ);
	const float MaxDepth = UnpackDepth(group_MaxZ);
	const float InvDepthExtent = 1.f / max(MaxDepth - MinDepth, 1e-5f);
	const float NormalizedDepth = saturate((InDepth - MinDepth) * InvDepthExtent);
	const uint DepthIt = clamp(NormalizedDepth * COMPACTION_DEPTH_BUCKET, 0, COMPACTION_DEPTH_BUCKET - 1);
	return (COMPACTION_DEPTH_BUCKET - 1) - DepthIt;
}
|
|
|
|
// Launch based on CPU BinTileResX x BinTileResY
|
|
// 1 group per screen-tile, 1 threads per bin-tile matching the screen-tile coord
|
|
// There can be/are several bins for the same screen area
|
|
[numthreads(1024, 1, 1)]
|
|
void CompactionCS(uint DispatchThreadID : SV_DispatchThreadID, uint GroupThreadID : SV_GroupThreadID, uint2 GroupID : SV_GroupID)
|
|
{
|
|
if (GroupThreadID == 0)
|
|
{
|
|
group_TotalPrimCount = 0;
|
|
group_NumTiles = 0;
|
|
group_MaxLDSTileIdx = 0;
|
|
group_MinZ = 0xFFFFFFFF;
|
|
group_MaxZ = 0;
|
|
}
|
|
if (GroupThreadID < COMPACTION_DEPTH_BUCKET)
|
|
{
|
|
s_BinCount[GroupThreadID] = 0;
|
|
}
|
|
|
|
GroupMemoryBarrierWithGroupSync();
|
|
|
|
const uint NumTiles = InArgs[0];
|
|
const uint PackedCoord = PackVisTileCoord(GroupID); // All thread will process the same tile
|
|
|
|
#if PERMUTATION_DEBUG
|
|
const uint2 TileCoord = UnpackVisTileCoord(PackedCoord);
|
|
const bool bDebugEnabled = false && all(TileCoord == uint2(ShaderPrintData.CursorCoord / float(BIN_TILE_SIZE)));
|
|
FShaderPrintContext Ctx = InitShaderPrintContext(bDebugEnabled, uint2(750, 50));
|
|
#endif
|
|
|
|
// 1. Compute total number of primitives at this tile coordinate
|
|
uint LocalPrimCount = 0;
|
|
{
|
|
for (uint TileIdx = GroupThreadID; TileIdx < NumTiles; TileIdx += 1024)
|
|
{
|
|
const uint TilePackedCoord = LoadVisTileData(InData, TileIdx, VT_Coord);
|
|
if (PackedCoord == TilePackedCoord)
|
|
{
|
|
LocalPrimCount += LoadVisTileData(InData, TileIdx, VT_PrimCount);
|
|
|
|
const FDepthRange LocalDepthRange = UnpackDepthRange(LoadVisTileData(InData, TileIdx, VT_MinMaxDepth));
|
|
InterlockedMin(group_MinZ, PackDepth(LocalDepthRange.MinZ));
|
|
InterlockedMax(group_MaxZ, PackDepth(LocalDepthRange.MaxZ));
|
|
|
|
uint WritePos;
|
|
WaveInterlockedAddScalar_(group_NumTiles, 1, WritePos);
|
|
if (WritePos < 1024)
|
|
{
|
|
group_TilesToCompact[WritePos] = TileIdx;
|
|
WaveInterlockedMax(group_MaxLDSTileIdx, TileIdx);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
GroupMemoryBarrierWithGroupSync();
|
|
|
|
if (LocalPrimCount > 0)
|
|
{
|
|
WaveInterlockedAdd(group_TotalPrimCount, LocalPrimCount);
|
|
}
|
|
GroupMemoryBarrierWithGroupSync();
|
|
|
|
const uint TotalPrimCount = group_TotalPrimCount;
|
|
|
|
if (TotalPrimCount == 0)
|
|
{
|
|
return;
|
|
}
|
|
|
|
// 2. Allocate space
|
|
if (GroupThreadID == 0)
|
|
{
|
|
uint NumTilesToAllocate = DivideAndRoundUp(TotalPrimCount, 1024);
|
|
|
|
uint FirstCompactedTile;
|
|
InterlockedAdd(OutArgs[0], NumTilesToAllocate, FirstCompactedTile);
|
|
|
|
uint WorkIndex;
|
|
InterlockedAdd(OutWorkCount[0], 1, WorkIndex);
|
|
OutWork[WorkIndex] = PackWork(FirstCompactedTile, NumTilesToAllocate);
|
|
|
|
group_PrimWriteOffset = FirstCompactedTile * 1024;
|
|
|
|
// Initialize new tiles
|
|
for (uint TileIdx = 0; TileIdx < NumTilesToAllocate; ++TileIdx)
|
|
{
|
|
const uint CompactedTile = FirstCompactedTile + TileIdx;
|
|
|
|
const uint PrimCount = min(TotalPrimCount - TileIdx * 1024, 1024);
|
|
StoreOutVisTileData(OutData, CompactedTile, VT_PrimCount, PrimCount);
|
|
StoreOutVisTileData(OutData, CompactedTile, VT_Coord, PackedCoord);
|
|
|
|
FDepthRange DepthRange;
|
|
DepthRange.MinZ = group_MinZ;
|
|
DepthRange.MaxZ = group_MaxZ;
|
|
StoreOutVisTileData(OutData, CompactedTile, VT_MinMaxDepth, PackDepthRange(DepthRange));
|
|
}
|
|
}
|
|
GroupMemoryBarrierWithGroupSync();
|
|
|
|
|
|
#if PERMUTATION_DEBUG
|
|
if (bDebugEnabled)
|
|
{
|
|
float4 DebugColor = ColorRed;
|
|
if (GroupThreadID == 0)
|
|
{
|
|
AddQuadSS(Ctx, TileCoord * BIN_TILE_SIZE, TileCoord * BIN_TILE_SIZE + BIN_TILE_SIZE, DebugColor);
|
|
Print(Ctx, TEXT("TileCoord :"), FontWhite); Print(Ctx, TileCoord, FontWhite); Newline(Ctx);
|
|
Print(Ctx, TEXT("TotalPrimCount :"), FontWhite); Print(Ctx, TotalPrimCount, FontWhite); Newline(Ctx);
|
|
Print(Ctx, TEXT("group_NumTiles :"), FontWhite); Print(Ctx, group_NumTiles, FontWhite); Newline(Ctx);
|
|
Print(Ctx, TEXT("group_MinZ :"), FontWhite); Print(Ctx, UnpackDepth(group_MinZ), FontWhite); Newline(Ctx);
|
|
Print(Ctx, TEXT("group_MaxZ :"), FontWhite); Print(Ctx, UnpackDepth(group_MaxZ), FontWhite); Newline(Ctx);
|
|
}
|
|
}
|
|
#endif
|
|
|
|
// 3. Copy PrimIDs to compacted memory
|
|
{
|
|
const uint NumInputTiles = min(group_NumTiles, 1024);
|
|
|
|
{
|
|
// 3.1 First process the LDS list of tiles
|
|
{
|
|
for (uint LDSIdx = 0; LDSIdx < NumInputTiles; ++LDSIdx)
|
|
{
|
|
const uint TileIdx = group_TilesToCompact[LDSIdx];
|
|
|
|
const uint TilePrimOffset = TileIdx * 1024;
|
|
const uint TilePrimCount = LoadVisTileData(InData, TileIdx, VT_PrimCount);
|
|
|
|
if (GroupThreadID < TilePrimCount)
|
|
{
|
|
const float Depth = UnpackDepth(InDepths[TilePrimOffset + GroupThreadID]);
|
|
const uint BinIndex = GetDepthBinIndex(Depth);
|
|
InterlockedAdd(s_BinCount[BinIndex], 1);
|
|
|
|
#if PERMUTATION_DEBUG
|
|
if (0 && GroupThreadID == 0)
|
|
{
|
|
Print(Ctx, TEXT("Depth0 :"), FontWhite); Print(Ctx, Depth, FontWhite);
|
|
Print(Ctx, TEXT(" - BinIndex : - "), FontWhite); Print(Ctx, BinIndex, FontWhite);
|
|
Print(Ctx, TEXT(" - BinMinZ : - "), FontWhite); Print(Ctx, UnpackDepth(group_MinZ), FontWhite);
|
|
Print(Ctx, TEXT(" - BinMaxZ : - "), FontWhite); Print(Ctx, UnpackDepth(group_MaxZ), FontWhite);
|
|
Newline(Ctx);
|
|
}
|
|
#endif
|
|
}
|
|
}
|
|
}
|
|
|
|
GroupMemoryBarrierWithGroupSync();
|
|
|
|
// 3.2 Prefix sum of bin count
|
|
if (GroupThreadID == 0)
|
|
{
|
|
uint GlobalOffset = 0;
|
|
for (uint It=0; It < COMPACTION_DEPTH_BUCKET;++It)
|
|
{
|
|
s_BinOffset[It] = GlobalOffset;
|
|
GlobalOffset += s_BinCount[It];
|
|
}
|
|
|
|
#if PERMUTATION_DEBUG
|
|
for (uint It2 = 0; It2 < COMPACTION_DEPTH_BUCKET; ++It2)
|
|
{
|
|
if (s_BinCount[It2] > 0)
|
|
Print(Ctx, TEXT("x"), FontWhite);
|
|
else
|
|
Print(Ctx, TEXT("."), FontWhite);
|
|
|
|
if (It2 != 0 && (It2 % 32) == 0)
|
|
Newline(Ctx);
|
|
}
|
|
Newline(Ctx);
|
|
#endif
|
|
}
|
|
|
|
GroupMemoryBarrierWithGroupSync();
|
|
|
|
// 3.3 Clear insertion counter
|
|
if (GroupThreadID < COMPACTION_DEPTH_BUCKET)
|
|
{
|
|
s_BinCount[GroupThreadID] = 0;
|
|
}
|
|
|
|
GroupMemoryBarrierWithGroupSync();
|
|
|
|
// 3.4 Insert primitive into bins
|
|
{
|
|
uint CurrentWriteOffset = group_PrimWriteOffset;
|
|
for (uint LDSIdx = 0; LDSIdx < NumInputTiles; ++LDSIdx)
|
|
{
|
|
const uint TileIdx = group_TilesToCompact[LDSIdx];
|
|
|
|
const uint TilePrimOffset = TileIdx * 1024;
|
|
const uint TilePrimCount = LoadVisTileData(InData, TileIdx, VT_PrimCount);
|
|
|
|
if (GroupThreadID < TilePrimCount)
|
|
{
|
|
const float Depth = UnpackDepth(InDepths[TilePrimOffset + GroupThreadID]);
|
|
const uint BinIndex = GetDepthBinIndex(Depth);
|
|
const uint GlobalOffset = s_BinOffset[BinIndex];
|
|
uint LocalOffset = 0;
|
|
InterlockedAdd(s_BinCount[BinIndex], 1, LocalOffset);
|
|
const uint WriteIndex = group_PrimWriteOffset + GlobalOffset + LocalOffset;
|
|
OutPrims[WriteIndex] = InPrims[TilePrimOffset + GroupThreadID];
|
|
}
|
|
|
|
CurrentWriteOffset += TilePrimCount;
|
|
}
|
|
}
|
|
|
|
// 3.5 Check any remaining tiles (Unlikely?)
|
|
//if (group_NumTiles > 1024)
|
|
//{
|
|
// for (uint TileIdx = group_MaxLDSTileIdx + 1; TileIdx < NumTiles; ++TileIdx)
|
|
// {
|
|
// const uint TilePackedCoord = LoadVisTileData(InData, TileIdx, VT_Coord);
|
|
// if (PackedCoord == TilePackedCoord)
|
|
// {
|
|
// const uint TilePrimOffset = TileIdx * 1024;
|
|
// const uint TilePrimCount = LoadVisTileData(InData, TileIdx, VT_PrimCount);
|
|
//
|
|
// if (GroupThreadID < TilePrimCount)
|
|
// {
|
|
// OutPrims[CurrentWriteOffset + GroupThreadID] = InPrims[TilePrimOffset + GroupThreadID];
|
|
// }
|
|
//
|
|
// CurrentWriteOffset += TilePrimCount;
|
|
// }
|
|
// }
|
|
//}
|
|
}
|
|
}
|
|
}
|
|
|
|
#endif // SHADER_RASTERCOMPUTE_COMPACTION
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
|
|
// Number of depth buckets used by the raster pass when sorting segments by depth
// (sizes s_SegmentsCount / s_SegmentsAlloc and bounds GetDepthBinIndex()).
#define RASTER_DEPTH_BUCKET 64
|
|
// Number of segments rasterized per inner loop iteration (1 thread = 1 segment);
// also the first dimension of s_Data / s_Color / s_Velocity.
#define RASTER_SEGMENT_COUNT 32
|
|
// Stride, in primitives, between two consecutive bin tiles in VisTilePrims
// (PrimOffset = BinTileIndex * SEGMENT_COUNT_PER_GROUP).
#define SEGMENT_COUNT_PER_GROUP 1024
|
|
// Sentinel primitive ID marking a segment slot as empty / rejected by clipping.
#define INVALID_PRIM_ID 0xFFFFFFFF
|
|
// Sentinel for 'no velocity written yet'; compared against Thread_Velocity.x in RasterCS.
#define INVALID_VELOCITY -1e8f
|
|
|
|
// For editing convenience: when no other raster-compute permutation is selected,
// default to compiling the raster pass.
|
|
#if !SHADER_RASTERCOMPUTE_DEBUG && !SHADER_RASTERCOMPUTE_COMPACTION && !SHADER_RASTERCOMPUTE_BINNING && !SHADER_RASTERCOMPUTE_RASTER_MULTI_SAMPLE && !SHADER_RASTERCOMPUTE_DEPTH_GRID
|
|
#define SHADER_RASTERCOMPUTE_RASTER 1
|
|
#endif
|
|
|
|
#if SHADER_RASTERCOMPUTE_RASTER
|
|
|
|
// Primitive (segment) IDs, SEGMENT_COUNT_PER_GROUP entries per bin tile.
Buffer<uint> VisTilePrims;
|
|
// Packed work items (see UnpackWork): x = first bin tile index, y = bin tile count.
StructuredBuffer<uint> VisTileWork;
|
|
// [0] = number of valid entries in VisTileWork.
StructuredBuffer<uint> VisTileWorkCount;
|
|
// [0] = total number of bin tiles.
Buffer<uint> VisTileArgs;
|
|
// Per bin tile data, read through LoadVisTileData (VT_PrimCount, VT_Coord).
ByteAddressBuffer VisTileData;
|
|
// Scene color, read-modify-written with the composited hair color.
RWTexture2D<float4> OutSceneColorTexture;
|
|
// Scene velocity, overwritten when a valid hair velocity was rasterized.
RWTexture2D<float4> OutSceneVelocityTexture;
|
|
// [0] = global work counter, atomically incremented by each group to fetch its next work item.
RWStructuredBuffer<uint> RWWorkCounter;
|
|
|
|
#if PERMUTATION_DEBUG
|
|
// Debug only: incremented once per written pixel.
RWTexture2D<uint> OutHairCountTexture_ForDebug;
|
|
// Debug only: incremented once per written pixel, indexed by raster tile coord.
RWTexture2D<uint> OutHairPixelCountPerTile_ForDebug;
|
|
#endif
|
|
|
|
// NOTE(review): not referenced by the raster pass in this section - confirm it is still needed.
int2 SampleLightingViewportResolution;
|
|
// Per-control-point lighting, addressed with GetHairSampleCoord().
Texture2D<float4> SampleLightingTexture;
|
|
// Per-control-point encoded velocity (see LoadSampleVelocity).
Buffer<float4> SampleVelocityBuffer;
|
|
|
|
// Opaque scene device depth, used to reject hair samples behind opaque geometry.
Texture2D<float> SceneDepthTexture;
|
|
|
|
// Segment records: x = packed depth (PackDepth), y = primitive ID (INVALID_PRIM_ID when rejected)
|
|
groupshared uint2 s_Segments[1024];
|
|
// Same records as s_Segments, reordered by depth bucket.
groupshared uint2 s_Segments_Sorted[1024];
|
|
// Min/max packed depth over the valid segments of the current bin tile.
groupshared uint s_Segments_Min;
|
|
groupshared uint s_Segments_Max;
|
|
// Number of valid (non-rejected) segments in s_Segments_Sorted.
groupshared uint s_Segments_ValidCount;
|
|
// Per depth bucket: segment count, then (after the prefix sum) the bucket start offset.
groupshared uint s_SegmentsCount[RASTER_DEPTH_BUCKET];
|
|
// Per depth bucket: allocation cursor used when inserting segments into their bucket.
groupshared uint s_SegmentsAlloc[RASTER_DEPTH_BUCKET];
|
|
|
|
// Alternative flat layout, kept for reference:
//groupshared uint s_Mask[RASTER_THREAD_COUNT];
|
|
// Per-pixel bitmask of the segments (one bit per rasterizing thread) covering that pixel.
groupshared uint s_Mask[8][8];
|
|
// 64-bit mask (x = pixels 0..31, y = pixels 32..63) of pixels whose coverage passed the opacity threshold.
groupshared uint2 s_OpaqueMask;
|
|
// Per segment, per sweep position: packed sample data (PackSampleData).
groupshared uint s_Data[RASTER_SEGMENT_COUNT][8];
|
|
// Per segment, per sweep position: packed color (PackColorData).
groupshared uint s_Color[RASTER_SEGMENT_COUNT][8];
|
|
// Per segment, per sweep position: packed velocity (PackVelocityData).
groupshared uint2 s_Velocity[RASTER_SEGMENT_COUNT][8];
|
|
// One bit per segment: set when the segment is 'steep' (its samples are indexed by Y rather than X).
groupshared uint s_bDataOrder;
|
|
// Work ID fetched from RWWorkCounter for the current work item.
groupshared uint s_WorkID;
|
|
|
|
// First bin tile index and bin tile count of the current work item.
groupshared uint s_BinTileOffset;
|
|
groupshared uint s_BinTileCount;
|
|
|
|
#if PERMUTATION_DEBUG
|
|
// Debug only: per-thread accumulated coverage.
groupshared float s_Coverage[RASTER_THREAD_COUNT];
|
|
#endif
|
|
|
|
// Resets the segment bitmask of the intra-tile pixel 'In' (s_Mask is indexed [x][y]).
void ClearMask(uint2 In)
{
	const uint PixelX = In.x;
	const uint PixelY = In.y;
	s_Mask[PixelX][PixelY] = 0;
}
|
|
|
|
// Returns the segment bitmask stored for the intra-tile pixel 'In' (s_Mask is indexed [x][y]).
uint ReadMask(uint2 In)
{
	const uint PixelMask = s_Mask[In.x][In.y];
	return PixelMask;
}
|
|
|
|
// Atomically ORs 'InValue' into the segment bitmask of the intra-tile pixel 'In'.
// The atomic is required because several threads can touch the same pixel.
void WriteMask(uint2 In, uint InValue)
{
	const uint PixelX = In.x;
	const uint PixelY = In.y;
	InterlockedOr(s_Mask[PixelX][PixelY], InValue);
}
|
|
|
|
#if PERMUTATION_DEBUG
|
|
// Debug: prints the 8x8 raster tile as a grid, one row per line.
// A green 'x' marks a pixel touched by at least one segment, a red '.' an empty pixel.
void PrintCoverage(inout FShaderPrintContext Ctx)
{
	Print(Ctx, TEXT("Coverage"), FontWhite);
	Newline(Ctx);

	for (uint Row = 0; Row < RASTER_TILE_SIZE; ++Row)
	{
		for (uint Col = 0; Col < RASTER_TILE_SIZE; ++Col)
		{
			const bool bEmpty = ReadMask(uint2(Col, Row)) == 0;
			if (bEmpty)
			{
				Print(Ctx, TEXT(". "), FontRed);
			}
			else
			{
				Print(Ctx, TEXT("x "), FontGreen);
			}
		}
		Newline(Ctx);
	}
}
|
|
|
|
// Debug: prints the RASTER_SEGMENT_COUNT low bits of 'InMask', bit 0 first.
// A green 'x' is a set bit, a red '.' a cleared bit.
void PrintPixelMask(inout FShaderPrintContext Ctx, uint InMask)
{
	for (uint BitIndex = 0; BitIndex < RASTER_SEGMENT_COUNT; ++BitIndex)
	{
		const bool bCleared = ((InMask >> BitIndex) & 1u) == 0u;
		if (bCleared)
		{
			Print(Ctx, TEXT("."), FontRed);
		}
		else
		{
			Print(Ctx, TEXT("x"), FontGreen);
		}
	}
}
|
|
|
|
// Debug: prints the 64-bit opaque mask as an 8x8 grid, one row per line.
// Pixel (x,y) maps to bit (x + y * RASTER_TILE_SIZE): bits 0..31 live in .x, bits 32..63 in .y.
void PrintOpaqueMask(inout FShaderPrintContext Ctx, uint2 InOpaqueMask)
{
	for (uint Row = 0; Row < RASTER_TILE_SIZE; ++Row)
	{
		for (uint Col = 0; Col < RASTER_TILE_SIZE; ++Col)
		{
			const uint PixelIndex = Col + Row * RASTER_TILE_SIZE;
			const bool bLowWord   = PixelIndex < 32;
			const uint MaskWord   = bLowWord ? InOpaqueMask.x : InOpaqueMask.y;
			const uint BitInWord  = bLowWord ? PixelIndex : (PixelIndex - 32);
			const bool bOpaque    = ((1u << BitInWord) & MaskWord) != 0u;

			if (bOpaque)
			{
				Print(Ctx, TEXT("x "), FontGreen);
			}
			else
			{
				Print(Ctx, TEXT(". "), FontRed);
			}
		}

		// End of row
		Newline(Ctx);
	}
}
|
|
#endif
|
|
|
|
// Rasterized sample of a segment within a raster tile.
// Packed into a single uint by PackSampleData() / unpacked by UnpackSampleData().
struct FSampleData
|
|
{
|
|
uint2 Coord; // 2x3bits - Pixel coord within the 8x8 tile (bits 26..28 = x, bits 29..31 = y)
|
|
float Depth; // 16 bits - Stored as a half float (bits 0..15)
|
|
float Coverage; // 10 bits - Saturated and quantized to [0..1023] (bits 16..25)
|
|
};
|
|
|
|
// Packs a sample into 32 bits:
//  [ 0..15] depth as half float
//  [16..25] coverage, saturated and quantized on 10 bits
//  [26..28] intra-tile X coord
//  [29..31] intra-tile Y coord
uint PackSampleData(FSampleData In)
{
	const uint DepthBits    = f32tof16(In.Depth);
	const uint CoverageBits = uint(saturate(In.Coverage) * 0x3FF) << 16u;
	const uint CoordXBits   = (In.Coord.x & 0x7) << 26u;
	const uint CoordYBits   = (In.Coord.y & 0x7) << 29u;
	return DepthBits | CoverageBits | CoordXBits | CoordYBits;
}
|
|
|
|
// Inverse of PackSampleData(): extracts the intra-tile coord, the coverage (rescaled to [0..1])
// and the half-float depth from a packed sample.
FSampleData UnpackSampleData(uint In)
{
	const uint DepthBits    = In & 0xFFFF;
	const uint CoverageBits = (In >> 16) & 0x3FF;
	const uint CoordXBits   = (In >> 26) & 0x7;
	const uint CoordYBits   = (In >> 29) & 0x7;

	FSampleData Result;
	Result.Coord    = uint2(CoordXBits, CoordYBits);
	Result.Coverage = CoverageBits * (1.f / 1023.f);
	Result.Depth    = f16tof32(DepthBits);
	return Result;
}
|
|
|
|
// Packs a sample color into a single uint (R11G11B10 float encoding).
uint PackColorData(float3 In)
{
	const uint PackedColor = PackR11G11B10F(In);
	return PackedColor;
}
|
|
|
|
// Inverse of PackColorData(): decodes a R11G11B10 float encoded color.
float3 UnpackColorData(uint In)
{
	const float3 Color = UnpackR11G11B10F(In);
	return Color;
}
|
|
|
|
// Packs a 4-component velocity into two uints: .x holds In.xy, .y holds In.zw.
uint2 PackVelocityData(float4 In)
{
	uint2 Packed;
	Packed.x = PackFloat2ToUInt(In.xy);
	Packed.y = PackFloat2ToUInt(In.zw);
	return Packed;
}
|
|
|
|
// Inverse of PackVelocityData(): In.x expands to the xy components, In.y to the zw components.
float4 UnpackVelocityData(uint2 In)
{
	const float2 VelocityXY = UnpackFloat2FromUInt(In.x);
	const float2 VelocityZW = UnpackFloat2FromUInt(In.y);
	return float4(VelocityXY, VelocityZW);
}
|
|
|
|
// Fetches the lighting (RGB) of control point 'InPrimId' from SampleLightingTexture.
// The texel location is derived from the primitive ID and the sample texture resolution.
float3 LoadSampleColor(uint InPrimId, uint2 InSampleResolution)
{
	const uint2 TexelCoord = GetHairSampleCoord(InPrimId, InSampleResolution);
	const float4 Texel = SampleLightingTexture.Load(uint3(TexelCoord, 0));
	return Texel.xyz;
}
|
|
|
|
// Fetches the velocity of control point 'InPrimId'.
// The returned value is still *encoded*: use DecodeVelocityFromTexture(...) to get the actual velocity.
float4 LoadSampleVelocity(uint InPrimId)
{
	const float4 EncodedVelocity = SampleVelocityBuffer[InPrimId];
	return EncodedVelocity;
}
|
|
|
|
// Maps a segment depth onto a depth bucket index in [0, RASTER_DEPTH_BUCKET-1].
//  InDepth        : depth of the segment (same space as the values packed into s_Segments_Min/Max)
//  InvDepthExtent : reciprocal of the (max - min) depth extent of the current tile
// Depth is inverse-Z, so the bucket order is flipped: the largest depth value lands in bucket 0,
// which makes bucket order match front-to-back order.
uint GetDepthBinIndex(float InDepth, float InvDepthExtent)
{
	// Only the min bound is needed here; the max bound is already folded into InvDepthExtent.
	const float MinDepth = UnpackDepth(s_Segments_Min);
	const uint DepthIt = clamp(saturate((InDepth - MinDepth) * InvDepthExtent) * RASTER_DEPTH_BUCKET, 0, RASTER_DEPTH_BUCKET - 1);

	// Inverse-Z
	return (RASTER_DEPTH_BUCKET-1) - DepthIt;
}
|
|
|
|
#if PERMUTATION_DEBUG
|
|
// Debug: offsets a shader print context horizontally by 'InPixelCountX' pixels.
// The pixel count is converted to the context's normalized units using the shader print resolution.
void ShiftX(inout FShaderPrintContext Out, uint InPixelCountX)
{
	const float NormalizedOffset = float(InPixelCountX) / float(ShaderPrintData.Resolution.x);
	Out.Pos.x      += NormalizedOffset;
	Out.StartPos.x += NormalizedOffset;
}
|
|
|
|
// Debug: offsets a shader print context vertically by 'InPixelCountY' pixels.
// The pixel count is converted to the context's normalized units using the shader print resolution.
void ShiftY(inout FShaderPrintContext Out, uint InPixelCountY)
{
	const float NormalizedOffset = float(InPixelCountY) / float(ShaderPrintData.Resolution.y);
	Out.Pos.y      += NormalizedOffset;
	Out.StartPos.y += NormalizedOffset;
}
|
|
#endif
|
|
|
|
// Hair strands software rasterizer (persistent-thread style).
//
// Each group of 8x8 threads repeatedly fetches a work ID from RWWorkCounter. A work ID selects
// one 8x8 raster tile (the quadrant, WorkID % 16) within one 32x32 bin entry of VisTileWork
// (WorkID / 16). For that raster tile, the group walks the bin tiles of the work item front to back and:
//   1. loads and clips all segments of the bin tile against the raster tile, tracking min/max depth,
//   2. counts the segments per depth bucket and prefix-sums the counts,
//   3. scatters the valid segments into their depth bucket (s_Segments_Sorted),
//   4. rasterizes the sorted segments 32 at a time (1 thread = 1 segment), then composites the samples
//      of each pixel (1 thread = 1 pixel) until the whole tile is opaque,
//   5. blends the accumulated color/velocity into the scene textures.
//
// Synchronization rule used throughout: a groupshared value is written by a single thread (or through
// atomics), and a group barrier separates every write from the reads of other threads. The group is
// 64 threads, which can span several hardware waves, so nothing here relies on lock-step execution.
//
//  GroupThread1D : linear thread index within the group [0..63]
//  GroupThread2D : 2D thread index within the group, i.e. the pixel handled within the raster tile
//  GroupID       : rasterizer (group) ID; not referenced by the body
[numthreads(RASTER_TILE_SIZE, RASTER_TILE_SIZE, 1)]
void RasterCS(
	uint GroupThread1D : SV_GroupIndex,			/* 64 */
	uint2 GroupThread2D : SV_GroupThreadID,		/* 8x8 */
	uint GroupID : SV_GroupID)					/* Rasterizer ID */
{
	ResolvedView = ResolveView();
	const uint FetchWorkCount = VisTileWorkCount[0];
	const uint BinTileNum = VisTileArgs[0];

	// Note: s_BinTileOffset/s_BinTileCount are intentionally NOT cleared here by all threads.
	// Thread 0 always writes them (step 0.1) before the first barrier; an unsynchronized clear
	// issued by another wave could land after that write and discard the fetched work item.

	// These are global Color/Coverage for the final pixel handled by this thread
	float3 Thread_Color = 0;
	float  Thread_Coverage = 0;
	uint   Thread_Complete = 0;
	float4 Thread_Velocity = INVALID_VELOCITY;
	uint   Thread_LoopCountToFullCoverage = 0;
	uint2  Thread_PixelCoord = 0;

	const uint2 SampleLightingEffectiveResolution = GetHairSampleResolution(ControlPointCount[0]);

	LOOP
	for (uint WorkIndex = 0; WorkIndex < MAX_WORK_COUNT; WorkIndex++)
	{
	#if PERMUTATION_DEBUG
		FShaderPrintContext GlobalCtx = InitShaderPrintContext(false, uint2(0, 0));
	#endif

		// 0.1 Fetch work item
		if (GroupThread1D == 0)
		{
			InterlockedAdd(RWWorkCounter[0], 1, s_WorkID);
		}

		if (GroupThread1D == 0)
		{
			const uint FetchWorkIndex = s_WorkID / 16; // 1 x Bin32x32 -> 16 x Raster8x8

			uint2 Work = 0;
			if (FetchWorkIndex < FetchWorkCount)
			{
				Work = UnpackWork(VisTileWork[FetchWorkIndex]);
			}

			s_BinTileOffset = Work.x;
			s_BinTileCount = Work.y;
			s_OpaqueMask = 0;
		}

		GroupMemoryBarrierWithGroupSync();

		// 0.3 If we start a new screen tile, clear out final output
		{
			Thread_Color = 0;
			Thread_Coverage = 0;
			Thread_Complete = 0;
			Thread_Velocity = INVALID_VELOCITY;
			Thread_LoopCountToFullCoverage = 0;
			Thread_PixelCoord = 0;
		}

		// Early out if we are done (group-uniform: s_WorkID is read after the barrier)
		{
			const uint FetchWorkIndex = s_WorkID / 16; // 1 x Bin32x32 -> 16 x Raster8x8
			if (FetchWorkIndex >= FetchWorkCount)
			{
				return;
			}
		}

		// Iterate over all bins for the current raster/screen tile
		const uint FrontToBackCount = min(s_BinTileCount, 64);
		uint ExitFrontToBackIndex = FrontToBackCount;
		for (uint FrontToBackIndex = 0; FrontToBackIndex < FrontToBackCount; ++FrontToBackIndex)
		{
			// 0.2 Reset all LDS variables
			{
				if (GroupThread1D == 0)
				{
					s_Segments_Min = PackDepth(1e8);
					s_Segments_Max = 0;
					s_Segments_ValidCount = 0;
					s_bDataOrder = 0;
				}
				if (GroupThread1D < RASTER_DEPTH_BUCKET)
				{
					s_SegmentsCount[GroupThread1D] = 0;
					s_SegmentsAlloc[GroupThread1D] = 0;
				}
				ClearMask(GroupThread2D);

				// 64 threads clear the 32x8 s_Data array: each thread clears 4 entries
				if (GroupThread1D < 32)
				{
					s_Data[GroupThread1D][0] = 0;
					s_Data[GroupThread1D][1] = 0;
					s_Data[GroupThread1D][2] = 0;
					s_Data[GroupThread1D][3] = 0;
				}
				else
				{
					s_Data[GroupThread1D-32][4] = 0;
					s_Data[GroupThread1D-32][5] = 0;
					s_Data[GroupThread1D-32][6] = 0;
					s_Data[GroupThread1D-32][7] = 0;
				}

			#if PERMUTATION_DEBUG
				s_Coverage[GroupThread1D] = 0;
			#endif
			}
			GroupMemoryBarrierWithGroupSync();

			// 0.4 Early out when running out of valid tiles
			const uint BinTileIndex = s_BinTileOffset + FrontToBackIndex;
			const bool bTileValid = (BinTileIndex < BinTileNum);
			if (!bTileValid)
			{
				return;
			}

			const uint  PrimOffset   = BinTileIndex * SEGMENT_COUNT_PER_GROUP;
			const uint  PrimCount    = LoadVisTileData(VisTileData, BinTileIndex, VT_PrimCount);
			const uint2 BinTileCoord = UnpackVisTileCoord(LoadVisTileData(VisTileData, BinTileIndex, VT_Coord));
			const uint2 BinTileMin   = BinTileCoord * BIN_TILE_SIZE;
			const uint2 BinTileMax   = BinTileMin + BIN_TILE_SIZE;

			const uint  QuadrantIndex   = s_WorkID % 16; // 1 x Bin32x32 -> 16 x Raster8x8
			const uint2 QuadrantCoord   = LinearTo2D_Common(QuadrantIndex, 4, 1.f / 4.f);
			const uint2 RasterTileCoord = BinTileCoord * 4 + QuadrantCoord;
			const uint2 RasterTileMin   = RasterTileCoord * RASTER_TILE_SIZE;
			const uint2 RasterTileMax   = RasterTileMin + RASTER_TILE_SIZE;

			const uint LoopCount64 = DivideAndRoundUp(PrimCount, RASTER_THREAD_COUNT);

			Thread_PixelCoord = RasterTileMin + GroupThread2D;

			// All pixels of the raster tile are opaque: remaining (farther) bins can't contribute
			if (all(s_OpaqueMask == 0xFFFFFFFF))
			{
				ExitFrontToBackIndex = FrontToBackIndex;
				break;
			}

			// For debug only
		#if PERMUTATION_DEBUG
			const bool bDebugEnableAll = FrontToBackIndex == 0 && all(RasterTileCoord == uint2(ShaderPrintData.CursorCoord / float(RASTER_TILE_SIZE)));
			const bool bDebugEnable = FrontToBackIndex == 0 && all(RasterTileCoord == uint2(ShaderPrintData.CursorCoord / float(RASTER_TILE_SIZE))) && GroupThread1D == 0;
			FShaderPrintContext Ctx = InitShaderPrintContext(bDebugEnable, uint2(350, 50));
			const FFontColor FontLegend = FontWhite;
			const FFontColor FontValue = FontOrange;

			FShaderPrintContext CtxAll = InitShaderPrintContext(bDebugEnableAll, uint2(750, 50));
		#endif

		#if PERMUTATION_DEBUG
			if (bDebugEnable)
			{
				Print(Ctx, TEXT("Work - Index :"), FontLegend); Print(Ctx, FrontToBackIndex, FontValue); Newline(Ctx);
				Print(Ctx, TEXT("Work - Offset:"), FontLegend); Print(Ctx, s_BinTileOffset, FontValue); Newline(Ctx);
				Print(Ctx, TEXT("Work - Count :"), FontLegend); Print(Ctx, s_BinTileCount, FontValue); Newline(Ctx);

				//AddFilledQuadSS(BinTileMin, BinTileMax, Transparent(ColorLightGreen));
				//AddFilledQuadSS(RasterTileMin, RasterTileMax, Transparent(ColorLightGreen));
				AddQuadSS(BinTileMin, BinTileMax, Transparent(ColorLightGreen));
				AddQuadSS(RasterTileMin, RasterTileMax, Transparent(ColorYellow));
			}
		#endif

			// 1. Load all the segments and compute min/max bound (move this during the compaction)
			{
				LOOP
				for (uint LoopIndex = 0; LoopIndex < LoopCount64; ++LoopIndex)
				{
					const uint Prim = GroupThread1D + LoopIndex * RASTER_THREAD_COUNT;

					// Need to reset s_Segments and s_Segments_Sorted, as they are used for tracking valid segments
					// NOTE(review): assumes PrimCount <= 1024 so that Prim stays within the 1024-entry arrays - confirm with the binning pass.
					s_Segments[Prim] = uint2(0, INVALID_PRIM_ID);
					s_Segments_Sorted[Prim] = uint2(0, INVALID_PRIM_ID);

					if (Prim < PrimCount)
					{
						uint PrimID = VisTilePrims[PrimOffset + Prim];

						uint TypeDummy = 0;
						float4 SP0 = 0;
						float4 SP1 = 0;
						float Rad0 = 0;
						float Rad1 = 0;
						CalcHomogenousPosAndRad(PrimID, SP0, Rad0, TypeDummy);
						CalcHomogenousPosAndRad(PrimID+1, SP1, Rad1, TypeDummy);

						float Alpha0 = 0;
						float Alpha1 = 1;
						// Clipping
						{
							SP0 = float4(NDCToPixelCoord(SP0), 1.0f / SP0.w);
							SP1 = float4(NDCToPixelCoord(SP1), 1.0f / SP1.w);

							// Clip against tile
							bool2 bClipped = false;
							bool bIsValidSegment = false;
							float2 T = 0;
							bIsValidSegment = ClipRaySegment(RasterTileMin - 0.5f, RasterTileMax + 0.5f, SP0, SP1, Rad0, Rad1, Alpha0, Alpha1, bClipped, T);

							PrimID = bIsValidSegment ? PrimID : INVALID_PRIM_ID;
						}

						const uint uDepth = PackDepth(max(SP0.z, SP1.z)); // Inverse Z
						s_Segments[Prim] = uint2(uDepth, PrimID);
						if (PrimID != INVALID_PRIM_ID)
						{
							InterlockedMin(s_Segments_Min, uDepth);
							InterlockedMax(s_Segments_Max, uDepth);
						}

					#if 0 && PERMUTATION_DEBUG
						if (bDebugEnableAll && PrimID != INVALID_PRIM_ID)
						{
							const float3 P0 = UnpackHairControlPoint(PrimID).Position;
							const float3 P1 = UnpackHairControlPoint(PrimID+1).Position;

							const float4 Color0 = float4(LoadSampleColor(PrimID, SampleLightingEffectiveResolution), 1);
							const float4 Color1 = float4(LoadSampleColor(PrimID+1, SampleLightingEffectiveResolution), 1);

							const float4 LineColor = float4(ColorMapTurbo(GroupThread1D / float(RASTER_THREAD_COUNT)), 1);
							AddLineWS(P0, P1, LineColor);
							//AddLineWS(P0, P1, Color0, Color1);
						}
					#endif
					}
				}
			}
			GroupMemoryBarrierWithGroupSync();
			const float InvDepthExtent = 1.f / max(0.0001f, UnpackDepth(s_Segments_Max)- UnpackDepth(s_Segments_Min));

			// 2. Compute the count of depth bucket
			{
				LOOP
				for (uint LoopIndex = 0; LoopIndex < LoopCount64; LoopIndex++)
				{
					const uint Prim = GroupThread1D + LoopIndex * RASTER_THREAD_COUNT;
					if (Prim < PrimCount)
					{
						const bool bIsValid = s_Segments[Prim].y != INVALID_PRIM_ID;
						if (bIsValid)
						{
							const float Depth = UnpackDepth(s_Segments[Prim].x);
							const uint DepthIt = GetDepthBinIndex(Depth, InvDepthExtent);
							InterlockedAdd(s_SegmentsCount[DepthIt], 1);
						}
					}
				}
			}
			GroupMemoryBarrierWithGroupSync();

			// Exclusive prefix sum of the bucket counts (replace this with parallel version).
			// Only thread 0 touches s_Segments_ValidCount here: it was already cleared in step 0.2, and an
			// unguarded clear by every thread would race with the final write below (a late wave could
			// overwrite the valid count with 0).
			if (GroupThread1D == 0)
			{
				uint Acc = 0;
				for (uint It = 0; It < RASTER_DEPTH_BUCKET; It++)
				{
					const uint Next = s_SegmentsCount[It];
					s_SegmentsCount[It] = Acc;
					Acc += Next;

				#if 1 && PERMUTATION_DEBUG
					if (bDebugEnable)
					{
						if (Next > 0) { Print(Ctx, TEXT("x"), FontValue); }
						else { Print(Ctx, TEXT("."), FontValue); }
						if (It == RASTER_DEPTH_BUCKET-1) { Newline(Ctx); }
					}
				#endif
				}
				s_Segments_ValidCount = Acc;
			}
			GroupMemoryBarrierWithGroupSync();

			// 3. Insert the segment into the right bucket
			{
				LOOP
				for (uint LoopIndex = 0; LoopIndex < LoopCount64; LoopIndex++)
				{
					const uint Prim = GroupThread1D + LoopIndex * RASTER_THREAD_COUNT;
					if (Prim < PrimCount)
					{
						const uint2 Segment = s_Segments[Prim];
						if (Segment.y != INVALID_PRIM_ID)
						{
							const float Depth = UnpackDepth(Segment.x);
							const uint DepthIt = GetDepthBinIndex(Depth, InvDepthExtent);

							// Bucket start (prefix sum) + slot allocated within the bucket
							uint AllocOffset = 0;
							InterlockedAdd(s_SegmentsAlloc[DepthIt], 1, AllocOffset);
							const uint NewIndex = AllocOffset + s_SegmentsCount[DepthIt];
							s_Segments_Sorted[NewIndex] = Segment;
						}
					}
				}
			}
			GroupMemoryBarrierWithGroupSync();

			const uint LoopCount32 = DivideAndRoundUp(s_Segments_ValidCount, RASTER_SEGMENT_COUNT);
			const uint LoopCount32_All = DivideAndRoundUp(PrimCount, RASTER_SEGMENT_COUNT);

			// DEBUG
		#if PERMUTATION_DEBUG
			if (bDebugEnable)
			{
				const uint2 OutCoord = GroupThread2D + RasterTileMin;

				Print(Ctx, TEXT("Work ID :"), FontLegend); Print(Ctx, s_WorkID, FontValue); Newline(Ctx);
				Print(Ctx, TEXT("Prim Count :"), FontLegend); Print(Ctx, s_Segments_ValidCount, FontValue); Print(Ctx, TEXT(" / "), FontLegend); Print(Ctx, PrimCount, FontValue); Newline(Ctx);
				Print(Ctx, TEXT("Loop 32 Count:"), FontLegend); Print(Ctx, LoopCount32, FontValue); Print(Ctx, TEXT(" / "), FontLegend); Print(Ctx, LoopCount32_All, FontValue); Newline(Ctx);
				Print(Ctx, TEXT("Loop 64 Count:"), FontLegend); Print(Ctx, LoopCount64, FontValue); Newline(Ctx);
				Print(Ctx, TEXT("Min Depth :"), FontLegend); Print(Ctx, UnpackDepth(s_Segments_Min), FontValue); Newline(Ctx);
				Print(Ctx, TEXT("Max Depth :"), FontLegend); Print(Ctx, UnpackDepth(s_Segments_Max), FontValue); Newline(Ctx);
				Print(Ctx, TEXT("Rad.at Depth1:"), FontLegend); Print(Ctx, RadiusAtDepth1, FontValue); Newline(Ctx);
				Newline(Ctx);

				Print(Ctx, TEXT("Work - Index :"), FontLegend); Print(Ctx, WorkIndex, FontValue, 2, 0);
				Print(Ctx, TEXT(" - Offset:"), FontLegend); Print(Ctx, s_BinTileOffset, FontValue, 5, 0);
				Print(Ctx, TEXT(" - Count :"), FontLegend); Print(Ctx, s_BinTileCount, FontValue, 3, 0);
				Newline(Ctx);
			}
		#endif

			// No valid segment for this bin tile (group-uniform: the count is read after a barrier)
			if (LoopCount32 == 0)
			{
				continue;
			}

			// 4. Raster segments
			{
				// 4.1 Loop over all segments within the tile, and rasterize 32 of them at each loop
				LOOP
				for (uint LoopIndex = 0; LoopIndex < LoopCount32; LoopIndex++)
				{
					// 4.0 Reset
					if (GroupThread1D < RASTER_SEGMENT_COUNT)
					{
						for (int J = 0; J < RASTER_TILE_SIZE; ++J)
						{
							s_Data[GroupThread1D][J] = 0;
							s_Color[GroupThread1D][J] = 0;
							s_Velocity[GroupThread1D][J] = 0;
						}
					}
					s_Mask[GroupThread2D.x][GroupThread2D.y] = 0;
					s_bDataOrder = 0;

					GroupMemoryBarrierWithGroupSync();

					// If raster tile is fully covered, exit
					if (all(s_OpaqueMask == 0xFFFFFFFF))
					{
						break;
					}

					// 4.1 Raster segment (1 thread = 1 segment)
					// Half of the thread are doing nothing, we could raster the two half of the segments (one per each thread)
					const uint Prim = GroupThread1D + LoopIndex * RASTER_SEGMENT_COUNT;
					if (GroupThread1D < RASTER_SEGMENT_COUNT && Prim < s_Segments_ValidCount)
					{
						const uint PrimID = s_Segments_Sorted[Prim].y;
						uint TypeDummy = 0;
						float4 SP0 = 0;
						float4 SP1 = 0;
						float Rad0 = 0;
						float Rad1 = 0;
						CalcHomogenousPosAndRad(PrimID, SP0, Rad0, TypeDummy);
						CalcHomogenousPosAndRad(PrimID + 1, SP1, Rad1, TypeDummy);

						float Alpha0=0;
						float Alpha1=1;

						bool2 bClipped = false;
						bool bIsSegmentValid = false;
						float2 T = 0;
						// 4.1.1 Clipping
						{
							SP0 = float4(NDCToPixelCoord(SP0), 1.0f / SP0.w);
							SP1 = float4(NDCToPixelCoord(SP1), 1.0f / SP1.w);

							// Clip against tile
							bIsSegmentValid = ClipRaySegment(RasterTileMin - 0.5f, RasterTileMax + 0.5f, SP0, SP1, Rad0, Rad1, Alpha0, Alpha1, bClipped, T);
						}

						// DEBUG (Write coord)
					#if 0 && PERMUTATION_DEBUG
						if (bDebugEnableAll)
						{
							ShiftY(CtxAll, 500 + GroupThread1D * 150);

							Print(CtxAll, TEXT("Thrd 1D:"), FontYellow);
							Print(CtxAll, GroupThread1D, FontWhite);
							Newline(CtxAll);

							Print(CtxAll, TEXT("ID :"), FontYellow);
							Print(CtxAll, PrimID, FontWhite);
							Newline(CtxAll);

							Print(CtxAll, TEXT("Valid :"), FontYellow);
							PrintBool(CtxAll, bIsSegmentValid);
							Newline(CtxAll);

							Print(CtxAll, TEXT("Clip X :"), FontYellow);
							PrintBool(CtxAll, bClipped.x);
							Newline(CtxAll);

							Print(CtxAll, TEXT("Clip Y :"), FontYellow);
							PrintBool(CtxAll, bClipped.y);
							Newline(CtxAll);

							Print(CtxAll, TEXT("X0 :"), FontYellow);
							Print(CtxAll, SP0, FontWhite);
							Newline(CtxAll);

							Print(CtxAll, TEXT("X1 :"), FontYellow);
							Print(CtxAll, SP1, FontWhite);
							Newline(CtxAll);

							if (GroupThread1D == 0)
								AddLineSS(SP0, SP1, ColorPurple);
						}
					#endif

						// 4.1.2 Rasterize segment (1 thread = 1 segments)
						if (bIsSegmentValid)
						{
							// The segment is swept along its major axis: Y when steep, X otherwise
							const float SegmentLenSqRcp = 1.0f / dot(SP1.xy - SP0.xy, SP1.xy - SP0.xy);
							const bool bIsSteep = abs(SP1.x - SP0.x) < abs(SP1.y - SP0.y);
							const float X0 = bIsSteep ? min(SP0.y, SP1.y) : min(SP0.x, SP1.x);
							const float X1 = bIsSteep ? max(SP0.y, SP1.y) : max(SP0.x, SP1.x);
							const float RcpNumSteps = 1.0f / (X1 - X0);
							const int NumSteps = (int)(ceil(X1) - floor(X0));

							// Record the sweep axis so that the composition pass (4.2) indexes the samples the same way
							if (bIsSteep)
							{
								InterlockedOr(s_bDataOrder, 1u << GroupThread1D);
							}

							// DEBUG (Write coord)
						#if 0 && PERMUTATION_DEBUG
							//if (GroupThread1D == 0 && bDebugEnable)
							if (bDebugEnableAll)
							{
								ShiftY(CtxAll, GroupThread1D * 150);

								Print(CtxAll, TEXT("NumSteps:"), FontYellow);
								Print(CtxAll, NumSteps, FontWhite);
								Newline(CtxAll);

								Print(CtxAll, TEXT("X0 :"), FontYellow);
								Print(CtxAll, X0, FontWhite);
								Newline(CtxAll);

								Print(CtxAll, TEXT("X1 :"), FontYellow);
								Print(CtxAll, X1, FontWhite);
								Newline(CtxAll);

								Print(CtxAll, TEXT("T :"), FontYellow);
								Print(CtxAll, T, FontWhite);
								Newline(CtxAll);

								Print(CtxAll, TEXT("bValid :"), FontYellow);
								PrintBool(CtxAll, bIsSegmentValid);
								Newline(CtxAll);

								Print(CtxAll, TEXT("bClip X :"), FontYellow);
								PrintBool(CtxAll, bClipped.x);
								Newline(CtxAll);

								Print(CtxAll, TEXT("bClip Y :"), FontYellow);
								PrintBool(CtxAll, bClipped.y);
								Newline(CtxAll);

								AddLineSS(SP0, SP1, ColorRed);
							}
						#endif

							// DEBUG (Legend)
						#if 0 && PERMUTATION_DEBUG
							if (bDebugEnable)
							{
								const float3 Color0 = LoadSampleColor(PrimID, SampleLightingEffectiveResolution);
								const float3 Color1 = LoadSampleColor(PrimID + 1, SampleLightingEffectiveResolution);
								Print(Ctx, TEXT("Color0 : "), FontWhite); Print(Ctx, Color0); Newline(Ctx);
								Print(Ctx, TEXT("Color1 : "), FontWhite); Print(Ctx, Color1); Newline(Ctx);
								Print(Ctx, TEXT(" Alpha A0 A1 AColor Color Coord Coverage"), FontWhite);
								Newline(Ctx);
							}
						#endif
							LOOP
							for (int J = 0; J < NumSteps; ++J)
							//for (int J = 0; J < RASTER_TILE_SIZE; ++J)
							{
								const float AlphaSP = saturate(J * RcpNumSteps);
								const float4 SP = lerp(SP0, SP1, AlphaSP);

								int2 Coords = SP.xy;
								const int2 IntraTileCoord = Coords - RasterTileMin;

								// Data needs to be stored in sweeping order
								FSampleData Sample = (FSampleData)0;
								if (all(IntraTileCoord >= 0) && all(IntraTileCoord < RASTER_TILE_SIZE))
								{
									const float Alpha = ComputeLerpAlpha(Coords, SP0.xy, SP1.xy, SegmentLenSqRcp);
									const float Depth = lerp(SP0.z, SP1.z, Alpha);
									const float OpaqueDepth = SceneDepthTexture.Load(uint3(Coords, 0));
									// Inverse Z: larger depth is closer, i.e. the sample is in front of the opaque scene
									if (Depth > OpaqueDepth)
									{
										const float AlphaColor = lerp(Alpha0, Alpha1, Alpha);
										const float3 Color0 = LoadSampleColor(PrimID, SampleLightingEffectiveResolution);
										const float3 Color1 = LoadSampleColor(PrimID+1, SampleLightingEffectiveResolution);
										const float3 Color = lerp(Color0, Color1, AlphaColor);

										const float4 Velocity0 = LoadSampleVelocity(PrimID);
										const float4 Velocity1 = LoadSampleVelocity(PrimID+1);
										const float4 Velocity = lerp(Velocity0, Velocity1, AlphaColor);

										// Fill in sample data
										Sample.Depth = Depth;
										Sample.Coord = IntraTileCoord;

										// Compute coverage
										// Minimal radius to snap the strand to a sample/pixel center (to avoid aliasing)
										const float SceneDistance = ConvertFromDeviceZ(Depth);
										const float MinHairRadius = ConvertGivenDepthRadiusForProjectionType(RadiusAtDepth1, SceneDistance);
										const float HairRadius = lerp(Rad0, Rad1, AlphaColor);
										Sample.Coverage = saturate(HairRadius / MinHairRadius);

										// Write data
										WriteMask(IntraTileCoord, 1u << GroupThread1D);

										const uint JCoord = bIsSteep ? IntraTileCoord.y : IntraTileCoord.x;
										s_Color[GroupThread1D][JCoord] = PackColorData(Color);
										s_Data[GroupThread1D][JCoord] = PackSampleData(Sample);
										s_Velocity[GroupThread1D][JCoord] = PackVelocityData(Velocity);

										// DEBUG (Write coord)
									#if 0 && PERMUTATION_DEBUG
										if (bDebugEnable)
										{
											Print(Ctx, J, FontYellow, 2, 0);
											Print(Ctx, Alpha, FontWhite);
											Print(Ctx, AlphaColor, FontBlue);
											Print(Ctx, HairRadius, FontRed);
											Print(Ctx, MinHairRadius, FontGreen);
											Print(Ctx, Sample.Coverage, FontBlue);
											Newline(Ctx);
										}
									#endif
									}
								}
							}
						} // if (bIsSegmentValid)
					}
					GroupMemoryBarrierWithGroupSync();

					// DEBUG (Coverage mask)
				#if 0 && PERMUTATION_DEBUG
					if (bDebugEnable)
					{
						PrintCoverage(Ctx);
					}
				#endif

					// 4.2 Combine all samples within the same pixel (1 thread = 1 pixel)
					{
						const uint ValidSegments = ReadMask(GroupThread2D);

						// Change this loop into to bit logic?
						{
							for (uint SegmentIt=0; SegmentIt < RASTER_SEGMENT_COUNT; ++SegmentIt)
							{
								const uint SegmentBit = 1u << SegmentIt;
								const bool bIsValid = (ValidSegments & SegmentBit) != 0;
								const bool bIsSteep = (s_bDataOrder & SegmentBit) != 0;
								if (bIsValid)
								{
									const uint J = bIsSteep ? GroupThread2D.y : GroupThread2D.x;

									const FSampleData Sample = UnpackSampleData(s_Data[SegmentIt][J]);
									const float3 Color = UnpackColorData(s_Color[SegmentIt][J]);
									const float4 Velocity = UnpackVelocityData(s_Velocity[SegmentIt][J]);

									// Front-to-back 'over' compositing
									const float AccTransmittance = saturate(1.f-Thread_Coverage);
									Thread_Coverage += AccTransmittance * Sample.Coverage;
									Thread_Color += AccTransmittance * Sample.Coverage * Color;

									// Use the closest valid segment as output velocity
									if (Thread_Velocity.x <= INVALID_VELOCITY)
									{
										Thread_Velocity = Velocity;
									}
								}
							}
						}

						const float CoverageThreshold = 0.95f;
						if (Thread_Complete == 0 && Thread_Coverage > CoverageThreshold)
						{
							// Mark pixel as fully covered
							if (GroupThread1D >= 32)
							{
								InterlockedOr(s_OpaqueMask.y, 1u << (GroupThread1D - 32));
							}
							else
							{
								InterlockedOr(s_OpaqueMask.x, 1u << GroupThread1D);
							}
							Thread_Complete = 1;
						}

					#if PERMUTATION_DEBUG
						if (Thread_LoopCountToFullCoverage == 0 && Thread_Coverage > CoverageThreshold)
						{
							Thread_LoopCountToFullCoverage = LoopIndex;
						}
					#endif
					}

					// Required: the next iteration's reset (4.0) overwrites s_Data/s_Color/s_Velocity/s_bDataOrder,
					// which threads of another wave may still be reading in 4.2 above. All threads reach this
					// point (the only 'break' of this loop is group-uniform and located before 4.1).
					GroupMemoryBarrierWithGroupSync();

				} // for (uint LoopIndex =...) ...

			#if PERMUTATION_DEBUG
				s_Coverage[GroupThread1D] = Thread_Coverage; // For sync s_Coverage
				GlobalCtx = Ctx;
			#endif

				GroupMemoryBarrierWithGroupSync();

			} // 4. Raster

		} // for (... FrontToBackIndex ...)

		// 5. Write final color
		const bool bWriteOut = Thread_Coverage > 0;
		if (bWriteOut)
		{
			const uint2 PixelCoord = Thread_PixelCoord;
			const float3 SourceColor = OutSceneColorTexture[PixelCoord].xyz;
			OutSceneColorTexture[PixelCoord] = float4(SourceColor * (1-Thread_Coverage) + Thread_Color, 1);
			if (Thread_Velocity.x > INVALID_VELOCITY)
			{
				OutSceneVelocityTexture[PixelCoord] = Thread_Velocity;
			}

			// For debug purpose only
		#if PERMUTATION_DEBUG
			{
				const uint2 RasterTileCoord = PixelCoord / RASTER_TILE_SIZE;
				InterlockedAdd(OutHairCountTexture_ForDebug[PixelCoord], 1);
				InterlockedAdd(OutHairPixelCountPerTile_ForDebug[RasterTileCoord], 1);

				const FFontColor FontLegend = FontWhite;
				const FFontColor FontValue = FontOrange;
				Print(GlobalCtx, TEXT("Exit :"), FontLegend);
				Print(GlobalCtx, ExitFrontToBackIndex, FontValue, 3, 0);
				Print(GlobalCtx, TEXT(" / "), FontLegend);
				Print(GlobalCtx, FrontToBackCount, FontValue, 3, 0);
				Newline(GlobalCtx);

				PrintOpaqueMask(GlobalCtx, s_OpaqueMask);
			}
		#endif
		}

		// Required: thread 0 overwrites s_WorkID/s_BinTileOffset/s_BinTileCount/s_OpaqueMask when fetching the
		// next work item, while threads of another wave may still be reading them (e.g. on the opaque-mask
		// 'break' path above, which has no barrier between the read and the loop exit). Every thread that
		// has not returned reaches this point, as all 'return' statements above are group-uniform.
		GroupMemoryBarrierWithGroupSync();

	} // for ( ... Work item ... )
}
|
|
|
|
#endif //SHADER_RASTERCOMPUTE_RASTER
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
|
|
#if SHADER_RASTERCOMPUTE_DEBUG
|
|
|
|
#include "../ShaderPrint.ush"
|
|
|
|
// Inputs of the raster-compute debug pass (SHADER_RASTERCOMPUTE_DEBUG).
// VisTileArgs[0] is read as the number of visibility tiles: it bounds the loop in
// GetTileTotalSegment and is printed as the "Alloc. Tile" statistic in MainCS.
Buffer<uint> VisTileArgs;
|
|
// Per-tile records, read through LoadVisTileData() with the VT_Coord and VT_PrimCount fields.
ByteAddressBuffer VisTileData;
|
|
|
|
// CompactedVisTileArgs[0] is printed as the "Alloc. Compacted Tile" statistic in MainCS.
Buffer<uint> CompactedVisTileArgs;
|
|
// NOTE(review): no read of this buffer is visible in this debug section - confirm it is still needed.
ByteAddressBuffer CompactedVisTileData;
|
|
|
|
// Sampled at the cursor pixel and printed as "Hair Count".
// Presumably filled by the raster pass (OutHairCountTexture_ForDebug) - TODO confirm.
Texture2D<uint> HairCountTexture_ForDebug;
|
|
// Sampled at the cursor's raster tile and printed as "Hair #Pixel in Tile".
// Presumably filled by the raster pass (OutHairPixelCountPerTile_ForDebug) - TODO confirm.
Texture2D<uint> HairPixelCountPerTile_ForDebug;
|
|
// Printed as "Instance Count".
uint InstanceCount;
|
|
// Denominator of the "Alloc. Tile" usage ratio (also printed next to VisTileArgs[0]).
uint CPUAllocatedTileCount;
|
|
// Denominator of the "Alloc. Compacted Tile" usage ratio (also printed next to CompactedVisTileArgs[0]).
uint CPUAllocatedCompactedTileCount;
|
|
|
|
// Accumulates the primitive count (VT_PrimCount) of every visibility tile whose
// coordinate (VT_Coord) equals TileCoord, and returns the sum.
//
// TileCoord     : tile coordinate to match against each visibility tile.
// bPrintDetails : when true, each matching visibility tile is also drawn as a cell
//                 (filled quad + yellow outline + its own primitive count). Cells are
//                 laid out left to right starting at the screen origin.
// InTileSize    : tile size used to derive the on-screen cell size (1.5x, truncated)
//                 and the text offset inside a cell (half a tile).
uint GetTileTotalSegment(uint2 TileCoord, bool bPrintDetails, uint InTileSize)
{
	// Cells are drawn larger than the actual tile.
	const float DetailScale = 1.5f;
	const uint CellSize = InTileSize * DetailScale;

	// Position of the printed value relative to the cell's min corner.
	const uint2 TextOffset = InTileSize >> 1;

	const uint VisTileCount = VisTileArgs[0];

	uint2 CellCoord = uint2(0, 0);
	uint SegmentSum = 0;
	for (uint VisTileIt = 0; VisTileIt < VisTileCount; ++VisTileIt)
	{
		// Skip visibility tiles that belong to another tile coordinate.
		const uint PackedCoord = LoadVisTileData(VisTileData, VisTileIt, VT_Coord);
		const uint2 CurrCoord = UnpackVisTileCoord(PackedCoord);
		if (any(CurrCoord != TileCoord))
		{
			continue;
		}

		const uint SegmentCount = LoadVisTileData(VisTileData, VisTileIt, VT_PrimCount);
		SegmentSum += SegmentCount;

		if (!bPrintDetails)
		{
			continue;
		}

		// Green when the visibility tile holds at least one primitive, red when it is empty.
		const uint2 CellMin = CellCoord * CellSize;
		const uint2 CellMax = (CellCoord + 1) * CellSize;
		AddFilledQuadSS(CellMin, CellMax, SegmentCount > 0 ? Transparent(ColorLightGreen) : Transparent(ColorLightRed));
		AddQuadSS(CellMin, CellMax, ColorYellow);

		FShaderPrintContext Context = InitShaderPrintContext(true, CellMin + TextOffset);
		Print(Context, SegmentCount, FontWhite);

		// The next matching visibility tile is drawn in the next cell to the right.
		++CellCoord.x;
	}

	return SegmentSum;
}
|
|
|
|
// Draws one tile as a screen-space overlay.
//
// TileCoord     : tile coordinate; the quad spans [TileCoord, TileCoord + 1) * InTileSize.
// TotalSegments : value deciding the fill colour (green when > 0, red otherwise) and
//                 the number printed when bPrintText is set.
// bPrintText    : when true, also prints TotalSegments below the tile origin
//                 (1.5 tiles down) and adds a yellow outline.
// InTileSize    : tile size used to scale the tile coordinate.
void PrintTile(uint2 TileCoord, uint TotalSegments, bool bPrintText, uint InTileSize)
{
	const uint2 MinCorner = TileCoord * InTileSize;
	const uint2 MaxCorner = (TileCoord + 1) * InTileSize;

	AddFilledQuadSS(MinCorner, MaxCorner, TotalSegments > 0 ? Transparent(ColorLightGreen) : Transparent(ColorLightRed));

	// The fill is always drawn; the text and outline are optional.
	if (!bPrintText)
	{
		return;
	}

	FShaderPrintContext Context = InitShaderPrintContext(true, MinCorner + uint2(0, InTileSize * 1.5f));
	Print(Context, TotalSegments, FontWhite);

	AddQuadSS(MinCorner, MaxCorner, ColorYellow);
}
|
|
|
|
// Debug entry point of the raster-compute pipeline.
// Runs in 8x8 thread groups, but only the thread with ThreadId == (0,0,0) emits
// the statistics text (instance/segment counts, binning and raster configuration,
// tile allocation usage and, when a cursor is available, per-pixel/per-tile counters).
// The per-tile overlay at the end is currently compiled out (#if 0).
[numthreads(8, 8, 1)]
void MainCS(uint3 ThreadId : SV_DispatchThreadID)
{
	// True only for thread 0 and only when both cursor coordinates are non-negative
	// (presumably a negative coordinate means "no cursor" - TODO confirm).
	const bool bIsCursorPixel = (all(ThreadId == 0) && all(ShaderPrintData.CursorCoord >= 0));

	// Info/Stats
	if (all(ThreadId == 0))
	{
		FFontColor FontValue = FontOrange;
		FFontColor FontTitle = FontYellow;
		FFontColor FontLegend = FontWhite;
		FShaderPrintContext Context = InitShaderPrintContext(true, uint2(50, 110));
		Print(Context, TEXT("Raster compute "), FontTitle); Newline(Context);
		Print(Context, TEXT("Instance Count : "), FontLegend); Print(Context, InstanceCount, FontValue, 3, 0); Newline(Context);
		Print(Context, TEXT("Total segments Count : "), FontLegend); Print(Context, GetControlPointCount(), FontValue); Newline(Context);
		Print(Context, TEXT("Max. segments Count : "), FontLegend); Print(Context, MaxControlPointCount, FontValue); Newline(Context);
		Newline(Context);

		Print(Context, TEXT("Configuration "), FontTitle); Newline(Context);
		Print(Context, TEXT("Output Resolution : "), FontLegend); Print(Context, OutputResolution.x, FontValue, 4, 0); Print(Context, TEXT("x"), FontSilver); Print(Context, OutputResolution.y, FontValue, 4, 0); Newline(Context);
		Newline(Context);

		Print(Context, TEXT("Bin Tile Size : "), FontLegend); Print(Context, uint(BIN_TILE_SIZE), FontValue, 2, 0); Newline(Context);
		Print(Context, TEXT("Bin Tile Res : "), FontLegend); Print(Context, BinTileRes.x, FontValue, 3, 0); Print(Context, TEXT("x"), FontSilver); Print(Context, BinTileRes.y, FontValue, 3, 0); Newline(Context);
		Print(Context, TEXT("Num Binners : "), FontLegend); Print(Context, NumBinners, FontValue); Newline(Context);
		Newline(Context);

		Print(Context, TEXT("Raster Tile Size : "), FontLegend); Print(Context, uint(RASTER_TILE_SIZE), FontValue, 2, 0); Newline(Context);
		Print(Context, TEXT("Raster Tile Res : "), FontLegend); Print(Context, RasterTileRes.x, FontValue, 3, 0); Print(Context, TEXT("x"), FontSilver); Print(Context, RasterTileRes.y, FontValue, 3, 0); Newline(Context);
		Print(Context, TEXT("Num Rasterizers : "), FontLegend); Print(Context, NumRasterizers, FontValue); Newline(Context);
		Newline(Context);

		// Tile counters are read once and reused for both the colour ramp and the printed value.
		const uint UsedTileCount = VisTileArgs[0];
		const uint UsedCompactedTileCount = CompactedVisTileArgs[0];

		// Usage ratio drives the text colour. The denominator is clamped to at least 1 so that
		// a zero CPU-side budget does not produce 0/0 (NaN) or x/0 (inf) as colour-map input.
		const float TileUsage = UsedTileCount / float(max(CPUAllocatedTileCount, 1u));
		const float CompactedTileUsage = UsedCompactedTileCount / float(max(CPUAllocatedCompactedTileCount, 1u));
		const FFontColor AllocColor = InitFontColor(ColorMapTurbo(TileUsage));
		const FFontColor AllocCompactedColor = InitFontColor(ColorMapTurbo(CompactedTileUsage));

		Print(Context, TEXT("Alloc. Tile : "), FontLegend); Print(Context, UsedTileCount, AllocColor, 6, 0); Print(Context, TEXT(" / "), FontLegend); Print(Context, CPUAllocatedTileCount, FontValue,6,0); Newline(Context);
		Print(Context, TEXT("Alloc. Compacted Tile: "), FontLegend); Print(Context, UsedCompactedTileCount, AllocCompactedColor, 6, 0); Print(Context, TEXT(" / "), FontLegend); Print(Context, CPUAllocatedCompactedTileCount, FontValue, 6, 0); Newline(Context);
		Print(Context, TEXT("Rasterizer Max Work : "), FontLegend); Print(Context, NumRasterizers * MAX_WORK_COUNT, FontValue); Newline(Context);
		Newline(Context);

		Newline(Context);

		// Per-pixel / per-raster-tile counters under the cursor.
		if (bIsCursorPixel)
		{
			const uint2 PixelCoord = ShaderPrintData.CursorCoord;
			const uint2 RasterTileCoord = uint2(ShaderPrintData.CursorCoord) >> RASTER_TILE_SIZE_AS_SHIFT;

			const uint HairCount = HairCountTexture_ForDebug.Load(uint3(PixelCoord, 0));
			const uint RasterizedPixels = HairPixelCountPerTile_ForDebug.Load(uint3(RasterTileCoord, 0));

			Print(Context, TEXT("Hair Count : "), FontLegend); Print(Context, HairCount, FontValue); Newline(Context);
			Print(Context, TEXT("Hair #Pixel in Tile : "), FontLegend); Print(Context, RasterizedPixels, FontValue); Newline(Context);
			Newline(Context);
		}
	}
#if 0
	// Cursor info
	if (bIsCursorPixel)
	{
		const uint2 PixelCoord = ShaderPrintData.CursorCoord;
		const uint2 BinTileCoord = uint2(ShaderPrintData.CursorCoord) >> BIN_TILE_SIZE_AS_SHIFT;
		if (all(BinTileCoord < BinTileRes))
		{
			const uint TotalSegments = GetTileTotalSegment(BinTileCoord, true, BIN_TILE_SIZE);
			PrintTile(BinTileCoord, TotalSegments, true, BIN_TILE_SIZE);
		}
	}

	// All tile
	{
		const uint2 BinTileCoord = ThreadId.xy;
		if (all(BinTileCoord < BinTileRes))
		{
			const uint TotalSegments = GetTileTotalSegment(BinTileCoord, false, BIN_TILE_SIZE);
			if (TotalSegments)
			{
				PrintTile(BinTileCoord, TotalSegments, false, BIN_TILE_SIZE);
			}
		}
	}
#endif
}
|
|
#endif //SHADER_RASTERCOMPUTE_DEBUG
|
|
|