1587 lines
50 KiB
HLSL
1587 lines
50 KiB
HLSL
// Copyright Epic Games, Inc. All Rights Reserved.
|
|
|
|
#define HAIR_STRANDS_PARAMETERS 1
|
|
|
|
#include "../Common.ush"
|
|
#include "../WaveOpUtil.ush"
|
|
#include "HairStrandsClusterCommon.ush"
|
|
#include "HairStrandsVertexFactoryCommon.ush"
|
|
#include "HairStrandsVisibilityCommon.ush"
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
// Common parameters
|
|
|
|
// Tile dimensions. Presumably TileSize == 1 << TileSizeAsShift — TODO confirm against the C++ side.
// Float reciprocals are provided to avoid integer divisions on the GPU.
uint TileSizeAsShift;
uint TileSize;
float RcpTileSize;
uint SqrTileSize;       // TileSize * TileSize (pixels per tile)
uint HalfTileSize;
float RcpHalfTileSize;
uint SqrHalfTileSize;
int2 TileRes;           // Tile-grid resolution (number of tiles per axis)

// Workgroup counts for the binning and rasterization passes, with precomputed reciprocals.
uint NumBinners;
float RcpNumBinners;
uint NumRasterizers;
float RcpNumRasterizers;

uint MaxRasterCount;
uint FrameIdMod8;           // Frame index modulo 8
uint ResolutionMultiplier;
int2 OutputResolution;      // Output resolution in pixels
float2 OutputResolutionf;   // Float copy of OutputResolution
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
|
|
#if SHADER_RASTERCOMPUTE_BINNING || SHADER_RASTERCOMPUTE_COMPACTION || SHADER_RASTERCOMPUTE_RASTER || SHADER_RASTERCOMPUTE_RASTER_MULTI_SAMPLE || SHADER_RASTERCOMPUTE_DEPTH_GRID
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
|
|
/*
|
|
// use untyped buffer for segment tiles to reduce VGPR usage - 16 bytes
|
|
struct FVisTile
|
|
{
|
|
uint PrimOffset;
|
|
uint PrimCount;
|
|
uint TileCoord;
|
|
uint MinDepth;
|
|
};
|
|
*/
|
|
// Size in uints of one FVisTile record (see the commented-out struct above).
#define VT_SIZE 4

// Field offsets (in uints) within an FVisTile record.
#define VT_PrimOffset 0     // Offset of the tile's primitives in the prim buffer
#define VT_PrimCount 1      // Number of primitives stored in this tile
#define VT_Coord 2          // Packed tile coordinate (see PackVisTileCoord)
#define VT_MinWriteIndex 3  // First global write index belonging to this tile (binning pass)
|
|
|
|
// Packs a 2D tile coordinate into a single uint: X in bits [0,7], Y in bits [8,15].
uint PackVisTileCoord(uint2 Coord)
{
	const uint PackedX = Coord.x & 0xffu;
	const uint PackedY = (Coord.y & 0xffu) << 8u;
	return PackedX | PackedY;
}
|
|
|
|
// Inverse of PackVisTileCoord: extracts X from bits [0,7] and Y from bits [8,15].
uint2 UnpackVisTileCoord(uint Packed)
{
	const uint X = Packed & 0xffu;
	const uint Y = (Packed >> 8u) & 0xffu;
	return uint2(X, Y);
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
|
|
// Identifiers of the hair macro group / material currently being rasterized.
uint MacroGroupId;
uint HairMaterialId;

// Scene depth used for conservative culling and per-pixel depth testing.
Texture2D<float> SceneDepthTexture;

uint VertexCount;     // Total number of hair control points to process
float CoverageScale;  // Global scale applied to accumulated hair coverage
|
|
|
|
// Converts a homogeneous clip-space position to pixel coordinates (xy) plus NDC depth (z).
// Note: the bias intentionally reads .wz (UE convention for ScreenPositionScaleBias).
float3 NDCToPixelCoord(float4 InDC)
{
	const float3 ProjectedNDC = InDC.xyz / InDC.w;
	const float2 ScreenUV = ProjectedNDC.xy * ResolvedView.ScreenPositionScaleBias.xy + ResolvedView.ScreenPositionScaleBias.wz;
	const float2 PixelCoord = ScreenUV * OutputResolution;
	return float3(PixelCoord, ProjectedNDC.z);
}
|
|
|
|
// Fetches a hair control point and projects it to homogeneous clip space.
// InPointIndex - index of the control point in the position buffer.
// PBO          - hair instance position offset added to the stored position.
// HP           - output clip-space (homogeneous) position.
// Type         - output control point type (e.g. HAIR_CONTROLPOINT_END).
void CalcHomogenousPos(in uint InPointIndex, in float3 PBO, out float4 HP, out uint Type)
{
	const FHairControlPoint ControlPoint = ReadHairControlPoint(
		HairStrandsVF_PositionBuffer,
		InPointIndex,
		PBO,
		HairStrandsVF_Radius,
		HairStrandsVF_RootScale,
		HairStrandsVF_TipScale);

	// Local -> world -> clip.
	const float3 WorldPos = mul(float4(ControlPoint.Position, 1.0f), HairStrandsVF_LocalToWorldPrimitiveTransform).xyz;
	HP = mul(float4(WorldPos, 1.0f), DFHackToFloat(PrimaryView.WorldToClip));
	Type = ControlPoint.Type;
}
|
|
|
|
// Fetches a hair control point, projects it to homogeneous clip space, and
// returns its radius scaled by an empirical factor.
// InPointIndex - index of the control point in the position buffer.
// PBO          - hair instance position offset added to the stored position.
// HP           - output clip-space (homogeneous) position.
// Rad          - output radius (world radius times an empirical constant).
// Type         - output control point type (e.g. HAIR_CONTROLPOINT_END).
void CalcHomogenousPosAndRad(in uint InPointIndex, in float3 PBO, out float4 HP, out float Rad, out uint Type)
{
	const FHairControlPoint ControlPoint = ReadHairControlPoint(
		HairStrandsVF_PositionBuffer,
		InPointIndex,
		PBO,
		HairStrandsVF_Radius,
		HairStrandsVF_RootScale,
		HairStrandsVF_TipScale);

	// Local -> world -> clip.
	const float3 WorldPos = mul(float4(ControlPoint.Position, 1.0f), HairStrandsVF_LocalToWorldPrimitiveTransform).xyz;
	HP = mul(float4(WorldPos, 1.0f), DFHackToFloat(PrimaryView.WorldToClip));
	Rad = ControlPoint.WorldRadius * 2000.0; // OutputResolutionf.x; //TODO: figure this out correctly?
	Type = ControlPoint.Type;
}
|
|
|
|
// Projects the pixel center onto the segment P0->P1 and returns the clamped
// parametric position along the segment (0 at P0, 1 at P1).
// Simplification of:
//   A = P - P0, B = P1 - P0, Alpha = dot(A, B) / dot(B, B)
// with the squared-length reciprocal precomputed by the caller.
float ComputeLerpAlpha(int2 Coord, float2 P0, float2 P1, float SegmentLenSqRcp)
{
	const float2 PixelCenter = Coord + 0.5f;
	return saturate(dot(PixelCenter - P0, P1 - P0) * SegmentLenSqRcp);
}
|
|
|
|
// Interpolates the segment radius at parametric position Alpha with perspective
// correction. RcpW0/RcpW1 hold the reciprocal clip-space w of each endpoint, so
// the weight simplifies from:
//   (Alpha / w1) / ((1 - Alpha) / w0 + Alpha / w1)
float ComputePerspectiveCorrectRadius(float Rad0, float Rad1, float Alpha, float RcpW0, float RcpW1)
{
	const float InterpolatedRcpW = lerp(RcpW0, RcpW1, Alpha);
	const float PerspectiveWeight = (Alpha * RcpW1) / InterpolatedRcpW;

	// Divide by W to make thickness dependent on screen space depth? This
	// division was kept from the previous line rasterization algorithm.
	return lerp(Rad0, Rad1, PerspectiveWeight) * InterpolatedRcpW;
}
|
|
|
|
// Line clipping based on "CLIPPING USING HOMOGENEOUS COORDINATES" by Blinn et al.
// Clips the homogeneous segment P0->P1 against all six frustum planes in place.
// Returns false when the whole segment is rejected (P0/P1 are then left untouched).
bool BlinnLineClipping(inout float4 P0, inout float4 P1)
{
	// Parametric [entry, exit] interval of the visible part of the segment.
	float2 T = float2(0.0f, 1.0f);
	bool bIsRemoved = P0.w < 0.0f && P1.w < 0.0f; // Both points behind near plane

	bool bSign = false;

	UNROLL
	for (uint PlaneIdx = 0; PlaneIdx < 6; ++PlaneIdx)
	{
		// Compute boundary coordinates of both points (w+x, w-x, w+y, w-y, z, w-z)
		// bSign alternates the component sign per plane; WFactor drops the w term
		// for plane index 4 (the z >= 0 near boundary).
		bSign = !bSign;
		const uint CompIdx = PlaneIdx / 2;
		const float Sign = bSign ? 1.0f : -1.0f;
		const float WFactor = PlaneIdx != 4 ? 1.0f : 0.0f;
		const float2 BC = WFactor * float2(P0.w, P1.w) + Sign * float2(P0[CompIdx], P1[CompIdx]);

		float Num = BC.x;
		float Denom = BC.x - BC.y;
		bIsRemoved = bIsRemoved || (BC.x < 0.0f && BC.y < 0.0f); // Both points outside the plane
		float Alpha = Num / Denom;

		// If the denominator is negative, P0 has a smaller boundary coordinate than P1, so we can assume
		// that P1 is inside the plane (or bIsRemoved is true), so we need to update the alpha for P0.
		// The reverse is true if the denominator is positive.
		if (Denom < 0.0f)
		{
			T.x = max(T.x, Alpha);
		}
		else
		{
			T.y = min(T.y, Alpha);
		}
	}

	if (!bIsRemoved)
	{
		// Both lerps read the ORIGINAL endpoints, so compute them before assigning.
		const float4 P0Clipped = lerp(P0, P1, T.x);
		const float4 P1Clipped = lerp(P0, P1, T.y);
		P0 = P0Clipped;
		P1 = P1Clipped;
	}

	return !bIsRemoved;
}
|
|
|
|
// Clips the 2D segment P0->P1 (xy components) against an AABB using the slab method.
// Returns false when the segment lies entirely outside the box.
// T        - output [entry, exit] parameters along the segment (0 = P0, 1 = P1).
// bClipped - per-endpoint flags set when that endpoint would need to be clipped
//            (both set on full rejection).
bool ClipRaySegment(float2 AABBMin, float2 AABBMax, float4 P0, float4 P1, out float2 T, out bool2 bClipped)
{
	bClipped = false;
	T = float2(0.0f, 1.0f);
	const bool bP0Outside = any(P0.xy < AABBMin) || any(P0.xy > AABBMax);
	const bool bP1Outside = any(P1.xy < AABBMin) || any(P1.xy > AABBMax);
	// Early out: both endpoints already inside the box.
	if (!bP0Outside && !bP1Outside)
	{
		return true;
	}

	const float2 Origin = P0.xy;
	const float2 Dir = P1.xy - P0.xy;
	// NOTE(review): an axis-aligned segment yields +/-inf components here; the
	// min/max below then relies on IEEE semantics — confirm acceptable on all targets.
	const float2 RcpDir = 1.0f / Dir;

	// Per-axis slab intersection parameters.
	const float2 T0 = (AABBMin - Origin) * RcpDir;
	const float2 T1 = (AABBMax - Origin) * RcpDir;

	T.x = max(min(T0.x, T1.x), min(T0.y, T1.y));
	T.y = min(max(T0.x, T1.x), max(T0.y, T1.y));

	// Ray intersects the AABB but the segment is completely outside or no intersection at all.
	if (T.y < 0.0f || T.x > T.y)
	{
		bClipped = true;
		return false;
	}

	if (bP0Outside && T.x > 0.0f && T.x < 1.0f)
	{
		bClipped.x = true;
	}
	if (bP1Outside && T.y > 0.0f && T.y < 1.0f)
	{
		bClipped.y = true;
	}

	return true;
}
|
|
|
|
// Overload that additionally clips the endpoints and radii in place.
// Endpoints/radii are only modified when the segment intersects the AABB;
// bClipped reports which endpoints were moved.
bool ClipRaySegment(float2 AABBMin, float2 AABBMax, inout float4 P0, inout float4 P1, inout float Rad0, inout float Rad1, out bool2 bClipped)
{
	float2 T;
	const bool bIsValid = ClipRaySegment(AABBMin, AABBMax, P0, P1, T, bClipped);

	if (bIsValid)
	{
		const bool bStartOutside = any(P0.xy < AABBMin) || any(P0.xy > AABBMax);
		const bool bEndOutside = any(P1.xy < AABBMin) || any(P1.xy > AABBMax);

		// Both clipped values are derived from the ORIGINAL endpoints, so stage
		// them in temporaries before writing either endpoint back.
		float4 ClippedP0 = P0;
		float4 ClippedP1 = P1;
		float ClippedRad0 = Rad0;
		float ClippedRad1 = Rad1;

		if (bStartOutside && T.x > 0.0f && T.x < 1.0f)
		{
			ClippedP0 = lerp(P0, P1, T.x);
			ClippedRad0 = lerp(Rad0, Rad1, T.x);
			bClipped.x = true;
		}
		if (bEndOutside && T.y > 0.0f && T.y < 1.0f)
		{
			ClippedP1 = lerp(P0, P1, T.y);
			ClippedRad1 = lerp(Rad0, Rad1, T.y);
			bClipped.y = true;
		}

		P0 = ClippedP0;
		P1 = ClippedP1;
		Rad0 = ClippedRad0;
		Rad1 = ClippedRad1;
	}

	return bIsValid;
}
|
|
|
|
#endif // Common rasetrizer helper function & parameters
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
|
|
#if SHADER_RASTERCOMPUTE_DEPTH_GRID
|
|
|
|
RWTexture2D<uint> OutVisTileDepthGrid;      // Per-tile furthest packed scene depth
RWTexture2DArray<uint> OutDepthCovTexture;  // Per-sample packed depth + coverage
uint NumSamples;                            // Samples per pixel in OutDepthCovTexture

groupshared uint group_FurthestDepth; // (4 bytes)

// One workgroup per tile: seeds the (multisampled) hair depth target with scene
// depth and records the tile's furthest depth for later conservative culling.
[numthreads(1024, 1, 1)]
void PrepareDepthGridCS(uint DispatchThreadID : SV_DispatchThreadID, uint GroupThreadID : SV_GroupThreadID, uint2 GroupID : SV_GroupID)
{
	if (GroupThreadID == 0)
	{
		group_FurthestDepth = 0xFFFFFFFF;
	}

	GroupMemoryBarrierWithGroupSync();

	// Assumes a workgroup size of 1024 threads to cover a maximum tile size of 32x32.
	if (GroupThreadID < SqrTileSize)
	{
		uint2 Coord;

		// Fast divide/modulo via the float reciprocal (truncated on uint assignment):
		// y = GroupThreadID / TileSize, x = GroupThreadID % TileSize.
		Coord.y = (GroupThreadID + 0.5f) * RcpTileSize;
		Coord.x = GroupThreadID - (Coord.y * TileSize);

		Coord += GroupID * TileSize;

		if (all(Coord < (uint2)OutputResolution))
		{
			const float Depth = SceneDepthTexture.Load(uint3(Coord, 0));
			const uint PackedDepth = PackHairVisDepthCoverage(Depth, 1.0);

			// Compute furthest depth inside this tile
			// (min of the packed value — presumably reverse-Z, so smaller packed
			// value = further; verify against PackHairVisDepthCoverage)
			WaveInterlockedMin(group_FurthestDepth, PackedDepth);

			// Copy scene depth to (multisampled) hair depth output texture
			for (uint SampleIdx = 0; SampleIdx < NumSamples; ++SampleIdx)
			{
				InterlockedMax(OutDepthCovTexture[uint3(Coord, SampleIdx)], PackedDepth);
			}
		}
	}

	GroupMemoryBarrierWithGroupSync();

	if (GroupThreadID == 0)
	{
		OutVisTileDepthGrid[GroupID] = group_FurthestDepth;
	}
}
|
|
|
|
#endif //SHADER_RASTERCOMPUTE_DEPTH_GRID
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
|
|
#if SHADER_RASTERCOMPUTE_BINNING
|
|
|
|
#ifndef PERMUTATION_INDIRECT_PRIM_IDS
|
|
#define PERMUTATION_INDIRECT_PRIM_IDS 0
|
|
#endif
|
|
|
|
RWTexture2DArray<uint> OutVisTileBinningGrid;  // Per-tile counters / alloc info; layered per binner (see *LayerIdx in BinningCS)
RWBuffer<uint> OutVisTilePrims;                // Output prim ID pages (1024 entries per tile)
RWBuffer<uint> OutVisTileArgs;                 // [0] = number of allocated tiles
RWByteAddressBuffer OutVisTileData;            // FVisTile records (see VT_* offsets)
Texture2D<uint> VisTileDepthGrid;              // Furthest scene depth per tile (from PrepareDepthGridCS)
ByteAddressBuffer IndirectPrimIDCount;         // [0] = number of entries in IndirectPrimIDs
Buffer<uint> IndirectPrimIDs;                  // Explicit prim ID list (PERMUTATION_INDIRECT_PRIM_IDS)

// TODO: Without setting a limit, we are sometimes caught in an infinite loop even though this should not happen.
#define DDA_MAX_ITERATIONS 256
|
|
|
|
// State for a 2D DDA walk across the tile grid (one cell advanced per DDAAdvance call).
struct FDDAContext
{
	float2 Coord;      // Current cell coordinate (floored position)
	float2 DeltaDist;  // Ray-parameter distance between successive crossings, per axis
	float2 Step;       // Step direction per axis (sign of the ray direction)
	float2 SideDist;   // Ray-parameter distance to the next crossing, per axis
};
|
|
|
|
// Initializes a 2D DDA traversal starting at RayStart and marching along RayDir.
FDDAContext DDACreateContext(float2 RayStart, float2 RayDir)
{
	const float2 InvDir = 1.0f / RayDir;

	FDDAContext Ctx;
	Ctx.Coord = floor(RayStart);
	Ctx.DeltaDist = abs(InvDir);
	Ctx.Step = sign(RayDir);
	// Distance to the first cell boundary on each axis (boundary at Coord or Coord + 1,
	// depending on the step direction).
	Ctx.SideDist = (Ctx.Coord - RayStart + max(Ctx.Step, 0.0f)) * InvDir;
	return Ctx;
}
|
|
|
|
// Steps the DDA into the next cell along whichever axis crosses a boundary first.
void DDAAdvance(inout FDDAContext Context)
{
	const bool bStepAlongX = Context.SideDist.x < Context.SideDist.y;
	if (bStepAlongX)
	{
		Context.SideDist.x += Context.DeltaDist.x;
		Context.Coord.x += Context.Step.x;
	}
	else
	{
		Context.SideDist.y += Context.DeltaDist.y;
		Context.Coord.y += Context.Step.y;
	}
}
|
|
|
|
// Loads one field of the FVisTile record at 'index' (VT_SIZE uints per record,
// 4 bytes per uint; 'offset' is one of the VT_* field offsets).
uint LoadOutVisTileData(uint index, uint offset)
{
	const uint ByteAddress = (index * VT_SIZE + offset) * 4;
	return OutVisTileData.Load(ByteAddress);
}
|
|
|
|
// Stores one field of the FVisTile record at 'index' (VT_SIZE uints per record,
// 4 bytes per uint; 'offset' is one of the VT_* field offsets).
void StoreOutVisTileData(uint index, uint offset, uint value)
{
	const uint ByteAddress = (index * VT_SIZE + offset) * 4;
	OutVisTileData.Store(ByteAddress, value);
}
|
|
|
|
groupshared uint group_LoopNum;      // Batches each binner workgroup processes
groupshared uint group_VerticesNum;  // Total number of vertices / prim IDs to bin
groupshared uint group_BatchNum;     // Total number of 1024-wide batches

// Scratch list of tiles that need a new 1024-entry prim page allocated this batch.
#define TILES_TO_ALLOCATE_MAX 1024
groupshared uint group_TilesToAllocate[TILES_TO_ALLOCATE_MAX];
groupshared uint group_TilesToAllocateCount;

// The total number of line segments (VertexCount) is divided up equally between N binners - each binner = a workgroup which loops through the designated set segments in batches of 1024
// NB there is still potential to use LDS to prevent/reduce contention when adding segments to the binning grid and this may improve perf

[numthreads(1024, 1, 1)]
void BinningCS(uint DispatchThreadID : SV_DispatchThreadID, uint GroupThreadID : SV_GroupThreadID, uint GroupID : SV_GroupID)
{
	ResolvedView = ResolveView();
	// Thread 0 determines how many segments/batches this pass must process.
	if (GroupThreadID == 0)
	{
#if PERMUTATION_INDIRECT_PRIM_IDS
		group_VerticesNum = IndirectPrimIDCount.Load(0);
#else // PERMUTATION_INDIRECT_PRIM_IDS
#if PERMUTATION_CULLING
		group_VerticesNum = HairStrandsVF_bCullingEnable ? HairStrandsVF_CullingIndirectBuffer[3] : VertexCount;
#else // PERMUTATION_CULLING
		group_VerticesNum = VertexCount;
#endif //PERMUTATION_CULLING
#endif // PERMUTATION_INDIRECT_PRIM_IDS

		group_BatchNum = (group_VerticesNum + 1023) / 1024;
		// Ceil-divide BatchNum by NumBinners via the float reciprocal.
		// NOTE(review): the float multiply truncates on assignment; verify the
		// reciprocal rounding can never round the result down by one.
		group_LoopNum = (group_BatchNum + (NumBinners - 1)) * RcpNumBinners;
	}

	GroupMemoryBarrierWithGroupSync();

	LOOP for (uint LoopIndex = 0; LoopIndex < group_LoopNum; LoopIndex++)
	{
		const uint BatchIndex = LoopIndex + (GroupID * group_LoopNum);
		bool bSegValid = (BatchIndex < group_BatchNum);

		// Resolve this thread's prim ID according to the active permutation.
#if PERMUTATION_INDIRECT_PRIM_IDS
		uint PrimID = 0;
		const uint PrimIDIndex = BatchIndex * 1024 + GroupThreadID;
		bSegValid = bSegValid && (PrimIDIndex < group_VerticesNum);
		if (bSegValid)
		{
			PrimID = IndirectPrimIDs[PrimIDIndex];
		}
#else // PERMUTATION_INDIRECT_PRIM_IDS
#if PERMUTATION_CULLING
		uint PrimID = BatchIndex * 1024 + GroupThreadID;
		bSegValid = bSegValid && (PrimID < group_VerticesNum);

		if (bSegValid && HairStrandsVF_bCullingEnable)
		{
			const uint FetchIndex0 = PrimID;
			const uint FetchIndex1 = min(FetchIndex0 + 1, group_VerticesNum - 1);

			const uint VertexIndex0 = HairStrandsVF_CullingIndexBuffer[FetchIndex0];
			const uint VertexIndex1 = HairStrandsVF_CullingIndexBuffer[FetchIndex1];

			// Only consecutive surviving vertices form a real segment.
			if (VertexIndex1 != VertexIndex0 + 1)
			{
				bSegValid = false;
			}
			else
			{
				PrimID = VertexIndex0;
			}
		}
#else // PERMUTATION_CULLING
		const uint PrimID = BatchIndex * 1024 + GroupThreadID;
		bSegValid = bSegValid && (PrimID < VertexCount);
#endif // PERMUTATION_CULLING
#endif // PERMUTATION_INDIRECT_PRIM_IDS

		const uint SegmentCountLayerIdx = GroupID; // Stores number of segments per tile per workgroup.
		const uint TmpSegmentCountLayerIdx = SegmentCountLayerIdx + NumBinners; // Also stores number of segments per tile per workgroup. Used as second counter for this two pass algorithm.
		const uint TileAllocInfoLayerIdx = SegmentCountLayerIdx + NumBinners * 2; // Stores per tile per workgroup allocation info.

		uint NearestDepth = 0;
		float2 TileCoord0F = 0.0f;
		float2 TileCoord1F = 0.0f;

		// Project segment end points and clip them to the screen
		if (bSegValid)
		{
			const float3 InstancePositionOffset = HairStrandsVF_GetHairInstancePositionOffset();
			float4 H0 = 0.0f;
			float4 H1 = 0.0f;
			uint Type = -1;
			CalcHomogenousPos(PrimID, InstancePositionOffset, H0, Type);

			// A strand's last control point does not start a segment.
			bool bIsEndCV = (Type == HAIR_CONTROLPOINT_END);
			bSegValid = !bIsEndCV;

			if (bSegValid)
			{
				CalcHomogenousPos(PrimID + 1, InstancePositionOffset, H1, Type);

				// Do clipping in homogenous coordinates
				bSegValid = BlinnLineClipping(H0, H1);

				if (bSegValid)
				{
					// Convert to tile-grid space.
					float3 SP0 = NDCToPixelCoord(H0);
					float3 SP1 = NDCToPixelCoord(H1);
					SP0.xy *= RcpTileSize;
					SP1.xy *= RcpTileSize;

					// For peace of mind, make sure these are actually clamped to a valid range.
					SP0 = clamp(SP0, 0.0f, float3(TileRes, 1.0f));
					SP1 = clamp(SP1, 0.0f, float3(TileRes, 1.0f));

					NearestDepth = PackHairVisDepthCoverage(max(SP0.z, SP1.z), 1.0f);
					TileCoord0F = SP0.xy;
					TileCoord1F = SP1.xy;
				}
			}
		}

		if (GroupThreadID == 0)
		{
			group_TilesToAllocateCount = 0;
		}

		GroupMemoryBarrierWithGroupSync();

		// Increment per workgroup per tile counters and add tiles to be allocated
		// Pass 1: walk the segment across the tile grid, counting segments per tile.
		if (bSegValid)
		{
			FDDAContext DDAContext = DDACreateContext(TileCoord0F, normalize(TileCoord1F - TileCoord0F));
			const int2 EndCoord = (int2)floor(TileCoord1F);

			for (int DDAIt = 0; DDAIt < DDA_MAX_ITERATIONS; ++DDAIt)
			{
				const int2 TileCoord = (int2)floor(DDAContext.Coord);

				// Skip tiles entirely occluded by scene depth.
				BRANCH
				if (NearestDepth > VisTileDepthGrid[TileCoord])
				{
					uint OldTileSegmentCount;
					InterlockedAdd(OutVisTileBinningGrid[uint3(TileCoord, SegmentCountLayerIdx)], 1, OldTileSegmentCount);

					// Every 1024th segment starts a new page: queue this tile for allocation.
					BRANCH
					if ((OldTileSegmentCount % 1024) == 0)
					{
						uint WritePos;
						InterlockedAdd(group_TilesToAllocateCount, 1, WritePos);
						if (WritePos < TILES_TO_ALLOCATE_MAX)
						{
							group_TilesToAllocate[WritePos] = PackVisTileCoord(TileCoord);
						}
					}
				}

				if (all(TileCoord == EndCoord))
				{
					break;
				}

				DDAAdvance(DDAContext);
			}
		}

		GroupMemoryBarrierWithGroupSync();

		// Allocate tiles
		const uint TilesToAllocateCount = min(TILES_TO_ALLOCATE_MAX, group_TilesToAllocateCount);
		for (uint TileIdx = GroupThreadID; TileIdx < TilesToAllocateCount; TileIdx += 1024)
		{
			const uint PackedTileCoord = group_TilesToAllocate[TileIdx];
			const uint2 TileCoord = UnpackVisTileCoord(PackedTileCoord);

			const uint TotalNewWriteCount = OutVisTileBinningGrid[uint3(TileCoord, SegmentCountLayerIdx)];
			const uint TotalOldWriteCount = OutVisTileBinningGrid[uint3(TileCoord, TmpSegmentCountLayerIdx)];

			uint NewTile;
			WaveInterlockedAddScalar_(OutVisTileArgs[0], 1, NewTile);

			StoreOutVisTileData(NewTile, VT_Coord, PackedTileCoord);
			// Round down the count to the start of the tile and later compare against this to decide which tile to write to.
			StoreOutVisTileData(NewTile, VT_MinWriteIndex, TotalNewWriteCount & ~1023u);

			const uint PrevTile = (OutVisTileBinningGrid[uint3(TileCoord, TileAllocInfoLayerIdx)] & 0xffff);

			// The previous page received writes in an earlier batch, so it is full now.
			if (TotalOldWriteCount > 0)
			{
				StoreOutVisTileData(PrevTile, VT_PrimCount, 1024);
			}

			// Pack (previous page, new page) indices into the alloc-info layer.
			OutVisTileBinningGrid[uint3(TileCoord, TileAllocInfoLayerIdx)] = (PrevTile << 16) | (NewTile & 0xffff);
		}

		GroupMemoryBarrierWithGroupSync();

		// Write PrimID to tiles
		// Pass 2: re-walk the same tiles and write this segment's prim ID into the
		// current or previous page depending on the write index.
		if (bSegValid)
		{
			FDDAContext DDAContext = DDACreateContext(TileCoord0F, normalize(TileCoord1F - TileCoord0F));
			const int2 EndCoord = (int2)floor(TileCoord1F);

			for (int DDAIt = 0; DDAIt < DDA_MAX_ITERATIONS; ++DDAIt)
			{
				const int2 TileCoord = (int2)floor(DDAContext.Coord);

				BRANCH
				if (NearestDepth > VisTileDepthGrid[TileCoord])
				{
					const uint PackedTiles = OutVisTileBinningGrid[uint3(TileCoord, TileAllocInfoLayerIdx)];
					const uint CurTile = (PackedTiles & 0xffff);
					const uint PrevTile = ((PackedTiles >> 16) & 0xffff);

					// Currently we need this to get our write position, but maybe there is a cheaper way to keep track of that?
					uint OldTileSegmentCount;
					InterlockedAdd(OutVisTileBinningGrid[uint3(TileCoord, TmpSegmentCountLayerIdx)], 1, OldTileSegmentCount);

					const bool bWriteToCurTile = OldTileSegmentCount >= LoadOutVisTileData(CurTile, VT_MinWriteIndex);
					const uint LocalWritePos = OldTileSegmentCount % 1024;
					const uint WritePos = (bWriteToCurTile ? CurTile : PrevTile) * 1024 + LocalWritePos;

					OutVisTilePrims[WritePos] = PrimID;

					BRANCH
					if (bWriteToCurTile)
					{
						// The final write into the newest page records its (possibly partial) prim count.
						if ((OldTileSegmentCount + 1) == OutVisTileBinningGrid[uint3(TileCoord, SegmentCountLayerIdx)])
						{
							StoreOutVisTileData(CurTile, VT_PrimCount, (OldTileSegmentCount == 1023) ? 1024 : ((OldTileSegmentCount + 1) % 1024));
						}
					}
				}

				if (all(TileCoord == EndCoord))
				{
					break;
				}

				DDAAdvance(DDAContext);
			}
		}
	}
}
|
|
#endif //SHADER_RASTERCOMPUTE_BINNING
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
|
|
#if SHADER_RASTERCOMPUTE_COMPACTION
|
|
|
|
// Inputs produced by the binning pass.
ByteAddressBuffer VisTileData;  // FVisTile records (see VT_* offsets)
Buffer<uint> VisTilePrims;      // Prim ID pages (1024 entries per tile)
Buffer<uint> VisTileArgs;       // [0] = number of tiles

// Compacted outputs: densely packed pages per tile coordinate.
RWByteAddressBuffer OutCompactedVisTileData;
RWBuffer<uint> OutCompactedVisTilePrims;
RWBuffer<uint> OutCompactedVisTileArgs;
|
|
|
// Loads one field of the FVisTile record at 'index' (VT_SIZE uints per record,
// 4 bytes per uint; 'offset' is one of the VT_* field offsets).
uint LoadVisTileData(uint index, uint offset)
{
	const uint ByteAddress = (index * VT_SIZE + offset) * 4;
	return VisTileData.Load(ByteAddress);
}
|
|
|
|
// Stores one field of the compacted FVisTile record at 'index' (VT_SIZE uints
// per record, 4 bytes per uint; 'offset' is one of the VT_* field offsets).
void StoreCompactedVisTileData(uint index, uint offset, uint value)
{
	const uint ByteAddress = (index * VT_SIZE + offset) * 4;
	OutCompactedVisTileData.Store(ByteAddress, value);
}
|
|
|
|
groupshared uint group_TotalPrimCount;        // Total prims across all tiles at this coordinate
groupshared uint group_PrimWriteOffset;       // First compacted output slot for this coordinate
groupshared uint group_NumTiles;              // Number of source tiles matching this coordinate
groupshared uint group_TilesToCompact[1024];  // LDS cache of matching source tile indices
groupshared uint group_MaxLDSTileIdx;         // Highest tile index cached in LDS (for overflow rescan)

// One workgroup per tile coordinate: gathers all (partially filled) source tiles
// at that coordinate and copies their prim IDs into densely packed output tiles.
[numthreads(1024, 1, 1)]
void CompactionCS(uint DispatchThreadID : SV_DispatchThreadID, uint GroupThreadID : SV_GroupThreadID, uint2 GroupID : SV_GroupID)
{
	if (GroupThreadID == 0)
	{
		group_TotalPrimCount = 0;
		group_NumTiles = 0;
		group_MaxLDSTileIdx = 0;
	}

	GroupMemoryBarrierWithGroupSync();

	const uint NumTiles = VisTileArgs[0];
	const uint PackedCoord = PackVisTileCoord(GroupID);

	// Compute total number of primitives at this tile coordinate
	uint LocalPrimCount = 0;
	{
		// All 1024 threads scan the tile list in parallel, caching matches in LDS.
		for (uint TileIdx = GroupThreadID; TileIdx < NumTiles; TileIdx += 1024)
		{
			const uint TilePackedCoord = LoadVisTileData(TileIdx, VT_Coord);
			if (PackedCoord == TilePackedCoord)
			{
				LocalPrimCount += LoadVisTileData(TileIdx, VT_PrimCount);

				uint WritePos;
				WaveInterlockedAddScalar_(group_NumTiles, 1, WritePos);
				if (WritePos < 1024)
				{
					group_TilesToCompact[WritePos] = TileIdx;
					WaveInterlockedMax(group_MaxLDSTileIdx, TileIdx);
				}
			}
		}
	}

	GroupMemoryBarrierWithGroupSync();

	// Reduce the per-thread counts into the workgroup total.
	if (LocalPrimCount > 0)
	{
		WaveInterlockedAdd(group_TotalPrimCount, LocalPrimCount);
	}

	GroupMemoryBarrierWithGroupSync();

	const uint TotalPrimCount = group_TotalPrimCount;

	// Nothing binned at this coordinate.
	if (TotalPrimCount == 0)
	{
		return;
	}

	// Allocate space
	if (GroupThreadID == 0)
	{
		const uint NumTilesToAllocate = (TotalPrimCount + 1023) / 1024;

		uint FirstCompactedTile;
		InterlockedAdd(OutCompactedVisTileArgs[0], NumTilesToAllocate, FirstCompactedTile);

		group_PrimWriteOffset = FirstCompactedTile * 1024;

		// Initialize new tiles
		for (uint TileIdx = 0; TileIdx < NumTilesToAllocate; ++TileIdx)
		{
			const uint CompactedTile = FirstCompactedTile + TileIdx;

			// All pages are full except possibly the last, which holds the remainder.
			const uint PrimCount = min(TotalPrimCount - TileIdx * 1024, 1024);
			StoreCompactedVisTileData(CompactedTile, VT_PrimCount, PrimCount);
			StoreCompactedVisTileData(CompactedTile, VT_Coord, PackedCoord);
		}
	}

	GroupMemoryBarrierWithGroupSync();

	// Copy PrimIDs to compacted memory
	{
		uint CurrentWriteOffset = group_PrimWriteOffset;

		// First process the LDS list of tiles
		const uint NumInputTiles = min(group_NumTiles, 1024);
		for (uint LDSIdx = 0; LDSIdx < NumInputTiles; ++LDSIdx)
		{
			const uint TileIdx = group_TilesToCompact[LDSIdx];

			const uint TilePrimOffset = TileIdx * 1024;
			const uint TilePrimCount = LoadVisTileData(TileIdx, VT_PrimCount);

			// All 1024 threads cooperate: one prim per thread.
			if (GroupThreadID < TilePrimCount)
			{
				OutCompactedVisTilePrims[CurrentWriteOffset + GroupThreadID] = VisTilePrims[TilePrimOffset + GroupThreadID];
			}

			CurrentWriteOffset += TilePrimCount;
		}

		// Check any remaning tiles
		// (the LDS list overflowed: rescan tiles above the highest cached index).
		if (group_NumTiles > 1024)
		{
			for (uint TileIdx = group_MaxLDSTileIdx + 1; TileIdx < NumTiles; ++TileIdx)
			{
				const uint TilePackedCoord = LoadVisTileData(TileIdx, VT_Coord);
				if (PackedCoord == TilePackedCoord)
				{
					const uint TilePrimOffset = TileIdx * 1024;
					const uint TilePrimCount = LoadVisTileData(TileIdx, VT_PrimCount);

					if (GroupThreadID < TilePrimCount)
					{
						OutCompactedVisTilePrims[CurrentWriteOffset + GroupThreadID] = VisTilePrims[TilePrimOffset + GroupThreadID];
					}

					CurrentWriteOffset += TilePrimCount;
				}
			}
		}
	}
}
|
|
|
|
#endif // SHADER_RASTERCOMPUTE_COMPACTION
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
|
|
#if SHADER_RASTERCOMPUTE_RASTER
|
|
|
|
// Wave size
#if PERMUTATION_GROUP_SIZE == 64
#define WAVE_SIZE 32
#elif PERMUTATION_GROUP_SIZE == 32
#define WAVE_SIZE 32
#else
#error Unknown group size
#endif

// Simple rasterization algorithm that lerps between line endpoints. Is currently more robust than the Wu algorithm
// and optionally supports anti-aliasing similar to the Wu algorithm.
#define RASTER_LINEAR 0
// Implementation of Wu's line rasterization algorithm. Currently this implementation has tile shaped artifacts when the line segment is
// clipped against the tile which is why we use the simple linear algorithm at the moment.
#define RASTER_WU 1
// Set to 1 to enable writing to two pixels straddling the line segment when using the linear rasterization algorithm.
#define ENABLE_RASTER_LINEAR_AA 0

#define RASTER_ALGO RASTER_LINEAR

// Inputs produced by the binning/compaction passes.
Buffer<uint> VisTilePrims;      // Prim ID pages (1024 entries per tile)
Buffer<uint> VisTileArgs;       // [0] = total number of tiles
ByteAddressBuffer VisTileData;  // FVisTile records (see VT_* offsets)

// Outputs.
RWTexture2DArray<uint> OutHairCountTexture;
RWTexture2DArray<uint> OutDepthCovTexture;  // Packed depth + coverage per sample
RWTexture2DArray<uint> OutPrimMatTexture;   // Packed control point + material ID
|
|
|
|
// Loads one field of the FVisTile record at 'index' (VT_SIZE uints per record,
// 4 bytes per uint; 'offset' is one of the VT_* field offsets).
uint LoadVisTileData(uint index, uint offset)
{
	const uint ByteAddress = (index * VT_SIZE + offset) * 4;
	return VisTileData.Load(ByteAddress);
}
|
|
|
|
// Per-tile LDS working set: x = packed hair depth/coverage, y = prim/material ID,
// z = fixed-point hair count, w = packed scene depth.
groupshared uint4 group_SubTile[1024]; //(32 x 32 x 4 x 4 bytes = 16k bytes)

groupshared float3 group_PositionOffset;  // Hair instance position offset
groupshared float group_ooTileLODScale;   // NOTE(review): appears unused in the visible code — confirm

groupshared uint group_LoopNum;  // Tiles processed per rasterizer workgroup
groupshared uint group_TileNum;  // Total tile count (from VisTileArgs)

groupshared uint group_ThreadsPerSeg;  // Threads cooperating on one segment

#define GS_SEGS 320 //this number is limited by group shared memory

// Cached per-segment data for the first GS_SEGS segments of the current tile.
groupshared float4 group_SP0[GS_SEGS];
groupshared float4 group_SP1[GS_SEGS];
groupshared float group_Rad0[GS_SEGS];
groupshared float group_Rad1[GS_SEGS];
groupshared uint group_PrimMatID[GS_SEGS];

groupshared uint group_TileIndex;  // Tile currently being rasterized
|
|
|
|
// Rasterizes one pixel of a segment into the LDS sub-tile.
// Coords        - absolute pixel coordinate.
// P0/P1         - screen-space endpoints; .w holds 1/clip-w (set by the caller).
// PackedTileMin - tile origin packed as (x in low 16 bits, y in high 16 bits).
void PlotInternal(int2 Coords, float AntiAliasingFactor, float4 P0, float4 P1, float Rad0, float Rad1, float SegmentLenSqRcp, uint PackedTileMin, uint PrimMatID)
{
	const int2 IntraTileCoord = Coords - int2(((PackedTileMin >> 0) & 0xffff), ((PackedTileMin >> 16) & 0xffff));

	// Discard pixels outside this workgroup's tile.
	if (all(IntraTileCoord >= 0) && all(IntraTileCoord < int2(TileSize,TileSize)))
	{
		const float Alpha = ComputeLerpAlpha(Coords, P0.xy, P1.xy, SegmentLenSqRcp);
		const float Depth = lerp(P0.z, P1.z, Alpha);
		const uint PackedDepthCov = PackHairVisDepthCoverage(Depth, 1.0f);
		const uint LinearIndex = IntraTileCoord.x + IntraTileCoord.y * TileSize;

		// Write Depth + PrimMatID if depth test against hair depths is passed
		// NOTE(review): the .y write is not atomic with the InterlockedMax on .x,
		// so concurrent winners can interleave — confirm this approximation is intended.
		uint OldValue;
		InterlockedMax(group_SubTile[LinearIndex].x, PackedDepthCov, OldValue);
		if (PackedDepthCov > OldValue)
		{
			group_SubTile[LinearIndex].y = PrimMatID;
		}

		// Add hair count if depth test against scene depth is passed
		if (PackedDepthCov > group_SubTile[LinearIndex].w)
		{
			const float Rad = ComputePerspectiveCorrectRadius(Rad0, Rad1, Alpha, P0.w, P1.w);
			// Coverage accumulated as fixed point (x1000); radius clamped to half a pixel.
			InterlockedAdd(group_SubTile[LinearIndex].z, min(Rad, 0.5f) * 2.0f * 1000.0f * CoverageScale * AntiAliasingFactor);
		}
	}
}
|
|
|
|
// Wu-style anti-aliased plot: writes the two pixels straddling the line,
// splitting the coverage by FracY. Coordinates are swizzled when the line is
// steep (|dy| > |dx|), since the caller iterates along the major axis.
void Plot(int2 Coord, float FracY, float AntiAliasingFactor, bool bIsSteep, float4 P0, float4 P1, float Rad0, float Rad1, float SegmentLenSqRcp, uint PackedTileMin, uint PrimMatID)
{
	// Lower pixel receives the complement of the fractional coverage.
	const float LowerCoverage = AntiAliasingFactor * (1.0f - FracY);
	PlotInternal(bIsSteep ? Coord.yx : Coord.xy, LowerCoverage, P0, P1, Rad0, Rad1, SegmentLenSqRcp, PackedTileMin, PrimMatID);

	// Upper pixel receives the remaining coverage.
	const float UpperCoverage = AntiAliasingFactor * FracY;
	Coord.y += 1;
	PlotInternal(bIsSteep ? Coord.yx : Coord.xy, UpperCoverage, P0, P1, Rad0, Rad1, SegmentLenSqRcp, PackedTileMin, PrimMatID);
}
|
|
|
|
[numthreads(1024, 1, 1)]
|
|
void RasterCS(uint DispatchThreadID : SV_DispatchThreadID, uint GroupThreadID : SV_GroupThreadID, uint GroupID : SV_GroupID)
|
|
{
|
|
ResolvedView = ResolveView();
|
|
|
|
if (GroupThreadID == 0)
|
|
{
|
|
group_TileNum = VisTileArgs[0];
|
|
group_LoopNum = (float(group_TileNum) + float(NumRasterizers - 1)) * RcpNumRasterizers;
|
|
|
|
group_PositionOffset = HairStrandsVF_GetHairInstancePositionOffset();
|
|
|
|
/* no longer in use - keep for ref? Moving these values to group shared memory did seem to reduce VGPRs - more experimentation needed
|
|
group_RadScale = (((HairStrandsVF_TipScale - HairStrandsVF_RootScale) * HairStrandsVF_Radius * OutputResolutionf.x) / 63.0) / 255.0;
|
|
group_RadOffset = (HairStrandsVF_RootScale * HairStrandsVF_Radius * OutputResolutionf.x)/63.0;
|
|
*/
|
|
}
|
|
|
|
GroupMemoryBarrierWithGroupSync();
|
|
|
|
LOOP
|
|
for (uint LoopIndex = 0; LoopIndex < group_LoopNum; LoopIndex++)
|
|
{
|
|
if (GroupThreadID == 0)
|
|
{
|
|
group_TileIndex = LoopIndex + (GroupID * group_LoopNum);
|
|
}
|
|
|
|
GroupMemoryBarrierWithGroupSync();
|
|
|
|
bool bTileValid = (group_TileIndex < group_TileNum);
|
|
|
|
uint PrimOffset = group_TileIndex * 1024;
|
|
uint PrimCount = LoadVisTileData(group_TileIndex, VT_PrimCount);
|
|
|
|
uint PackedCoord = LoadVisTileData(group_TileIndex, VT_Coord);
|
|
uint2 SubTileMin = UnpackVisTileCoord(PackedCoord) * TileSize;
|
|
|
|
uint PackedTileMin = ((SubTileMin.x & 0xffff) << 0) | ((SubTileMin.y & 0xffff) << 16);
|
|
|
|
if (GroupThreadID == 0)
|
|
{
|
|
group_ThreadsPerSeg = 1;
|
|
|
|
if (PrimCount <= 512)
|
|
group_ThreadsPerSeg = 2;
|
|
if (PrimCount <= 341)
|
|
group_ThreadsPerSeg = 3;
|
|
if (PrimCount <= 256)
|
|
group_ThreadsPerSeg = 4;
|
|
if (PrimCount <= 204)
|
|
group_ThreadsPerSeg = 5;
|
|
if (PrimCount <= 170)
|
|
group_ThreadsPerSeg = 6;
|
|
if (PrimCount <= 146)
|
|
group_ThreadsPerSeg = 7;
|
|
if (PrimCount <= 128)
|
|
group_ThreadsPerSeg = 8;
|
|
if (PrimCount <= 64)
|
|
group_ThreadsPerSeg = 16;
|
|
if (PrimCount <= 32)
|
|
group_ThreadsPerSeg = 32;
|
|
}
|
|
|
|
GroupMemoryBarrierWithGroupSync();
|
|
|
|
bool bThreadValid = (bTileValid && (GroupThreadID < (PrimCount * group_ThreadsPerSeg)));
|
|
|
|
uint WaveCount = ((PrimCount * group_ThreadsPerSeg) + (WAVE_SIZE - 1) ) / WAVE_SIZE;
|
|
uint WaveThreadCount = WaveCount * WAVE_SIZE;
|
|
|
|
bool bWaveThreadValid = (bTileValid && (GroupThreadID < WaveThreadCount));
|
|
|
|
bool bUseGroupSPs = (bThreadValid && (GroupThreadID < (min(PrimCount, GS_SEGS) * group_ThreadsPerSeg)));
|
|
|
|
bool bGenGroupSPs = (bThreadValid && (GroupThreadID < (min(PrimCount, GS_SEGS))));
|
|
|
|
if (bGenGroupSPs)
|
|
{
|
|
uint Prim = GroupThreadID;
|
|
uint PrimID = VisTilePrims[PrimOffset + Prim];
|
|
|
|
group_PrimMatID[Prim] = PackHairVisControlPointMaterialId(PrimID, HairMaterialId);
|
|
|
|
uint TypeDummy;
|
|
CalcHomogenousPosAndRad(PrimID, group_PositionOffset, group_SP0[Prim], group_Rad0[Prim], TypeDummy);
|
|
CalcHomogenousPosAndRad(PrimID+1, group_PositionOffset, group_SP1[Prim], group_Rad1[Prim], TypeDummy);
|
|
}
|
|
|
|
if (bWaveThreadValid)
|
|
{
|
|
for (uint LinearIndex = GroupThreadID; LinearIndex < SqrTileSize; LinearIndex += WaveThreadCount)
|
|
{
|
|
uint2 Coord;
|
|
|
|
Coord.y = (float(LinearIndex) + 0.5f) * RcpTileSize;
|
|
Coord.x = LinearIndex - (Coord.y * TileSize);
|
|
|
|
Coord += uint2(((PackedTileMin >> 0) & 0xffff), ((PackedTileMin >> 16) & 0xffff));
|
|
|
|
group_SubTile[LinearIndex].x = OutDepthCovTexture[uint3(Coord, 0)];
|
|
group_SubTile[LinearIndex].y = GetInvalidHairControlPointId();
|
|
group_SubTile[LinearIndex].z = 0;
|
|
group_SubTile[LinearIndex].w = PackHairVisDepthCoverage(SceneDepthTexture.Load(uint3(Coord, 0)), 1.0f);
|
|
}
|
|
}
|
|
|
|
GroupMemoryBarrierWithGroupSync();
|
|
|
|
if (bThreadValid)
|
|
{
|
|
uint Prim = uint((float(GroupThreadID) + 0.5f) / float(group_ThreadsPerSeg));
|
|
uint PModTPS = GroupThreadID - (Prim * group_ThreadsPerSeg);
|
|
|
|
uint PrimMatID;
|
|
float4 SP0;
|
|
float4 SP1;
|
|
float Rad0;
|
|
float Rad1;
|
|
|
|
if (bUseGroupSPs)
|
|
{
|
|
PrimMatID = group_PrimMatID[Prim];
|
|
SP0 = group_SP0[Prim];
|
|
SP1 = group_SP1[Prim];
|
|
Rad0 = group_Rad0[Prim];
|
|
Rad1 = group_Rad1[Prim];
|
|
}
|
|
else
|
|
{
|
|
uint PrimID = VisTilePrims[PrimOffset + Prim];
|
|
PrimMatID = PackHairVisControlPointMaterialId(PrimID, HairMaterialId);
|
|
|
|
uint TypeDummy;
|
|
CalcHomogenousPosAndRad(PrimID, group_PositionOffset, SP0, Rad0, TypeDummy);
|
|
CalcHomogenousPosAndRad(PrimID + 1, group_PositionOffset, SP1, Rad1, TypeDummy);
|
|
}
|
|
|
|
// Clipping
|
|
{
|
|
SP0 = float4(NDCToPixelCoord(SP0), 1.0f / SP0.w);
|
|
SP1 = float4(NDCToPixelCoord(SP1), 1.0f / SP1.w);
|
|
|
|
// Clip against tile
|
|
const float2 TileMin = float2(((PackedTileMin >> 0) & 0xffff), ((PackedTileMin >> 16) & 0xffff));
|
|
const float2 TileMax = TileMin + TileSize;
|
|
bool2 bClipped = false;
|
|
ClipRaySegment(TileMin - 0.5f, TileMax + 0.5f, SP0, SP1, Rad0, Rad1, bClipped);
|
|
}
|
|
|
|
const float SegmentLenSqRcp = 1.0f / dot(SP1.xy - SP0.xy, SP1.xy - SP0.xy);
|
|
|
|
#if RASTER_ALGO == RASTER_LINEAR
|
|
|
|
const bool bIsSteep = abs(SP1.x - SP0.x) < abs(SP1.y - SP0.y);
|
|
const float X0 = bIsSteep ? min(SP0.y, SP1.y) : min(SP0.x, SP1.x);
|
|
const float X1 = bIsSteep ? max(SP0.y, SP1.y) : max(SP0.x, SP1.x);
|
|
const int NumSteps = (int)(ceil(X1) - floor(X0));
|
|
const float RcpNumSteps = 1.0f / (X1 - X0);
|
|
|
|
LOOP
|
|
for (int J = PModTPS; J < NumSteps; J += group_ThreadsPerSeg)
|
|
{
|
|
const float Alpha = saturate(J * RcpNumSteps);
|
|
const float4 SP = lerp(SP0, SP1, Alpha);
|
|
|
|
const float AntiAliasingFactor = 1.0f;
|
|
#if !ENABLE_RASTER_LINEAR_AA
|
|
PlotInternal(SP.xy, AntiAliasingFactor, SP0, SP1, Rad0, Rad1, SegmentLenSqRcp, PackedTileMin, PrimMatID);
|
|
#else
|
|
const float2 Coord = (bIsSteep ? SP.yx : SP.xy) - 0.5f;
|
|
const float FracY = frac(Coord.y);
|
|
Plot(Coord, FracY, AntiAliasingFactor, bIsSteep, SP0, SP1, Rad0, Rad1, SegmentLenSqRcp, PackedTileMin, PrimMatID);
|
|
#endif // !ENABLE_RASTER_LINEAR_AA
|
|
}
|
|
#elif RASTER_ALGO == RASTER_WU
|
|
// Wu's line algorithm. Currently this has some weird artifacts when clipping to tiles.
|
|
// TODO: Remove this entirely or fix the artifacts.
|
|
{
|
|
const bool bIsSteep = abs(SP1.y - SP0.y) > abs(SP1.x - SP0.x);
|
|
|
|
if (bIsSteep)
|
|
{
|
|
SP0.xy = SP0.yx;
|
|
SP1.xy = SP1.yx;
|
|
}
|
|
if (SP0.x > SP1.x)
|
|
{
|
|
float4 Tmp = SP0;
|
|
SP0 = SP1;
|
|
SP1 = Tmp;
|
|
}
|
|
|
|
const float2 D = SP1.xy - SP0.xy;
|
|
const float Gradient = abs(D.x) < 1e-5f ? 1.0f : D.y / D.x;
|
|
|
|
float DeltaY = 0.0f;
|
|
|
|
// First endpoint
|
|
int2 Px0;
|
|
{
|
|
const float2 SP0Int = SP0.xy - 0.5f; // transform to integer grid with pixel centers at 0.0 instead of 0.5.
|
|
float2 End;
|
|
End.x = floor(SP0Int.x);
|
|
End.y = SP0Int.y + Gradient * (End.x - SP0Int.x);
|
|
|
|
const float GapX = 1.0f;// 1.0f - frac(SP0Int.x + 0.5f);
|
|
|
|
Px0 = int2(End.x, floor(End.y));
|
|
|
|
if (PModTPS == 0)
|
|
{
|
|
const float FracY = frac(End.y);
|
|
Plot(Px0, FracY, GapX, bIsSteep, SP0, SP1, Rad0, Rad1, SegmentLenSqRcp, PackedTileMin, PrimMatID);
|
|
}
|
|
|
|
DeltaY = End.y + Gradient; // First y-intersection for the main loop
|
|
}
|
|
|
|
// Second endpoint
|
|
int2 Px1;
|
|
{
|
|
const float2 SP1Int = SP1.xy - 0.5f; // transform to integer grid with pixel centers at 0.0 instead of 0.5.
|
|
float2 End;
|
|
End.x = floor(SP1Int.x);
|
|
End.y = SP1Int.y + Gradient * (End.x - SP1Int.x);
|
|
const float GapX = 1.0f;// frac(SP1Int.x + 0.5f);
|
|
|
|
Px1 = float2(End.x, floor(End.y));
|
|
|
|
if (PModTPS == 0)
|
|
{
|
|
const float FracY = frac(End.y);
|
|
Plot(Px1, FracY, GapX, bIsSteep, SP0, SP1, Rad0, Rad1, SegmentLenSqRcp, PackedTileMin, PrimMatID);
|
|
}
|
|
}
|
|
|
|
// Main loop
|
|
const int XBegin = Px0.x + 1 + PModTPS;
|
|
const int XEnd = Px1.x;
|
|
DeltaY += PModTPS * Gradient;
|
|
for (int X = XBegin; X < XEnd; X += group_ThreadsPerSeg)
|
|
{
|
|
const int2 Coord = int2(X, floor(DeltaY));
|
|
const float FracY = frac(DeltaY);
|
|
Plot(Coord, FracY, 1.0f, bIsSteep, SP0, SP1, Rad0, Rad1, SegmentLenSqRcp, PackedTileMin, PrimMatID);
|
|
DeltaY += group_ThreadsPerSeg * Gradient;
|
|
}
|
|
}
|
|
#endif // RASTER_ALGO == RASTER_LINEAR
|
|
}
|
|
|
|
GroupMemoryBarrierWithGroupSync();
|
|
|
|
if (bWaveThreadValid)
|
|
{
|
|
for (uint LinearIndex = GroupThreadID; LinearIndex < SqrTileSize; LinearIndex += WaveThreadCount)
|
|
{
|
|
uint2 Coord;
|
|
|
|
Coord.y = (float(LinearIndex) + 0.5f) * RcpTileSize;
|
|
Coord.x = LinearIndex - (Coord.y * TileSize);
|
|
|
|
Coord += uint2(((PackedTileMin >> 0) & 0xffff), ((PackedTileMin >> 16) & 0xffff));
|
|
|
|
if (group_SubTile[LinearIndex].y != GetInvalidHairControlPointId())
|
|
{
|
|
uint oldValue;
|
|
InterlockedMax(OutDepthCovTexture[uint3(Coord, 0)], group_SubTile[LinearIndex].x, oldValue);
|
|
if (group_SubTile[LinearIndex].x > oldValue)
|
|
{
|
|
OutPrimMatTexture[uint3(Coord, 0)] = group_SubTile[LinearIndex].y;
|
|
}
|
|
}
|
|
InterlockedAdd(OutHairCountTexture[uint3(Coord, 0)], group_SubTile[LinearIndex].z);
|
|
}
|
|
}
|
|
|
|
GroupMemoryBarrierWithGroupSync();
|
|
}
|
|
}
|
|
|
|
#endif //SHADER_RASTERCOMPUTE_RASTER
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
|
|
#if SHADER_RASTERCOMPUTE_RASTER_MULTI_SAMPLE
|
|
|
|
#ifndef PERMUTATION_MULTI_SAMPLE_COUNT
|
|
#define PERMUTATION_MULTI_SAMPLE_COUNT 1
|
|
#endif
|
|
|
|
// Wave size
|
|
#if PERMUTATION_GROUP_SIZE == 64
|
|
#define WAVE_SIZE 32
|
|
#elif PERMUTATION_GROUP_SIZE == 32
|
|
#define WAVE_SIZE 32
|
|
#else
|
|
#error Unknown group size
|
|
#endif
|
|
|
|
Buffer<uint> VisTilePrims;
|
|
Buffer<uint> VisTileArgs;
|
|
ByteAddressBuffer VisTileData;
|
|
RWTexture2D<uint> OutHairCountTexture;
|
|
RWTexture2DArray<uint> OutDepthCovTexture;
|
|
RWTexture2DArray<uint> OutPrimMatTexture;
|
|
|
|
// Fetches one field of a packed tile record from the untyped VisTileData buffer.
// 'index' selects the tile record (VT_SIZE uints per record); 'offset' selects
// the field within it (VT_PrimOffset / VT_PrimCount / VT_Coord / VT_MinWriteIndex).
uint LoadVisTileData(uint index, uint offset)
{
	// ByteAddressBuffer is addressed in bytes: 4 bytes per uint field.
	const uint ByteAddress = 4u * ((index * VT_SIZE) + offset);
	return VisTileData.Load(ByteAddress);
}
|
|
|
|
groupshared uint group_SubTileSceneDepth[256]; // (16 x 16 x 4 bytes = 1k bytes)
|
|
groupshared uint group_SubTileHairCount[256]; // (16 x 16 x 4 bytes = 1k bytes)
|
|
groupshared uint group_SubTileHairDepth[PERMUTATION_MULTI_SAMPLE_COUNT][256]; // (16 x 16 x 4 bytes = 1k bytes) per sample
|
|
groupshared uint group_SubTilePrimMatID[PERMUTATION_MULTI_SAMPLE_COUNT][256]; // (16 x 16 x 4 bytes = 1k bytes) per sample
|
|
|
|
groupshared float3 group_PositionOffset;
|
|
|
|
groupshared uint group_LoopNum;
|
|
groupshared uint group_TileNum;
|
|
|
|
// Returns the distance from point P1 to the infinite line through P2 and P3.
// RcpLineSegLength must be 1 / distance(P2, P3).
//
// Derivation: the distance is the height h of the triangle (P1, P2, P3) over
// base P2..P3. Area A = 0.5 * h * b, so h = 2A / b. The 2D determinant below
// evaluates to exactly 2A, hence h = |det| * (1 / b).
float GetDistanceToLine(float2 P1, float2 P2, float2 P3, float RcpLineSegLength)
{
	const float TwiceArea = abs(P1.x * (P2.y - P3.y) + P2.x * (P3.y - P1.y) + P3.x * (P1.y - P2.y));
	return TwiceArea * RcpLineSegLength;
}
|
|
|
|
// Computes a per-sample coverage bitmask for the pixel at PixelCoord against the
// line segment P0..P1 (both in pixel space). Bit i is set when sample i's
// subpixel position lies within LineThickness of the segment's carrier line.
// The subpixel offsets match the D3D standard MSAA sample patterns for 1x/2x/4x/8x.
//
// NOTE(review): GetDistanceToLine measures distance to the *infinite* line, not
// the clipped segment, so samples just beyond the endpoints can be flagged as
// covered — presumably acceptable since callers clip to the sub-tile first.
uint GetCoverageMask(int2 PixelCoord, float2 P0, float2 P1)
{
	// Thinner acceptance band at higher sample counts so total coverage stays comparable.
	const float LineThickness = 1.0f / PERMUTATION_MULTI_SAMPLE_COUNT; // In pixel units
	uint Mask = 0;

	// Set origin to PixelCoord so sample positions below are in [0,1)^2 pixel-local space.
	P0 -= PixelCoord;
	P1 -= PixelCoord;

	const float RcpLineSegLength = 1.0f / distance(P0, P1);

#if PERMUTATION_MULTI_SAMPLE_COUNT == 1
	Mask |= (GetDistanceToLine(float2(0.5f, 0.5f), P0, P1, RcpLineSegLength) <= LineThickness) ? (1u << 0u) : 0;
#elif PERMUTATION_MULTI_SAMPLE_COUNT == 2
	Mask |= (GetDistanceToLine(float2(0.75f, 0.75f), P0, P1, RcpLineSegLength) <= LineThickness) ? (1u << 0u) : 0;
	Mask |= (GetDistanceToLine(float2(0.25f, 0.25f), P0, P1, RcpLineSegLength) <= LineThickness) ? (1u << 1u) : 0;
#elif PERMUTATION_MULTI_SAMPLE_COUNT == 4
	Mask |= (GetDistanceToLine(float2(0.375f, 0.125f), P0, P1, RcpLineSegLength) <= LineThickness) ? (1u << 0u) : 0;
	Mask |= (GetDistanceToLine(float2(0.875f, 0.375f), P0, P1, RcpLineSegLength) <= LineThickness) ? (1u << 1u) : 0;
	Mask |= (GetDistanceToLine(float2(0.125f, 0.625f), P0, P1, RcpLineSegLength) <= LineThickness) ? (1u << 2u) : 0;
	Mask |= (GetDistanceToLine(float2(0.625f, 0.875f), P0, P1, RcpLineSegLength) <= LineThickness) ? (1u << 3u) : 0;
#elif PERMUTATION_MULTI_SAMPLE_COUNT == 8
	Mask |= (GetDistanceToLine(float2(0.5625f, 0.3125f), P0, P1, RcpLineSegLength) <= LineThickness) ? (1u << 0u) : 0;
	Mask |= (GetDistanceToLine(float2(0.4375f, 0.6875f), P0, P1, RcpLineSegLength) <= LineThickness) ? (1u << 1u) : 0;
	Mask |= (GetDistanceToLine(float2(0.8125f, 0.5625f), P0, P1, RcpLineSegLength) <= LineThickness) ? (1u << 2u) : 0;
	Mask |= (GetDistanceToLine(float2(0.3125f, 0.1875f), P0, P1, RcpLineSegLength) <= LineThickness) ? (1u << 3u) : 0;
	Mask |= (GetDistanceToLine(float2(0.1875f, 0.8125f), P0, P1, RcpLineSegLength) <= LineThickness) ? (1u << 4u) : 0;
	Mask |= (GetDistanceToLine(float2(0.0625f, 0.4375f), P0, P1, RcpLineSegLength) <= LineThickness) ? (1u << 5u) : 0;
	Mask |= (GetDistanceToLine(float2(0.6875f, 0.9375f), P0, P1, RcpLineSegLength) <= LineThickness) ? (1u << 6u) : 0;
	Mask |= (GetDistanceToLine(float2(0.9375f, 0.0625f), P0, P1, RcpLineSegLength) <= LineThickness) ? (1u << 7u) : 0;
#else
	#error Unsupported PERMUTATION_MULTI_SAMPLE_COUNT! Must be 1, 2, 4 or 8!
#endif

	return Mask;
}
|
|
|
|
// Rasterizes one pixel footprint of segment P0..P1 into the 16x16 sub-tile LDS
// buffers (multi-sample variant).
//
// Coord             - full-resolution pixel coordinate.
// P0/P1             - endpoints: xy = pixel coord, z = device depth, w = 1/clip.w
//                     (packed by the caller after NDCToPixelCoord).
// Rad0/Rad1         - strand radius at each endpoint.
// SegmentLenSqRcp   - 1 / squared 2D length of the segment.
// PackedHalfTileMin - sub-tile min corner packed as (x & 0xffff) | (y << 16).
// PrimMatID         - packed control-point + material id stored on depth-test pass.
void Plot(int2 Coord, float4 P0, float4 P1, float Rad0, float Rad1, float SegmentLenSqRcp, uint PackedHalfTileMin, uint PrimMatID)
{
	// Pixel position relative to the sub-tile origin; reject pixels outside the 16x16 window.
	const int2 IntraTileCoord = Coord - int2(((PackedHalfTileMin >> 0) & 0xffff), ((PackedHalfTileMin >> 16) & 0xffff));

	if (all(IntraTileCoord >= 0) && all(IntraTileCoord < int2(HalfTileSize,HalfTileSize)))
	{
		// Parametric position of this pixel along the segment (see ComputeLerpAlpha),
		// used to interpolate depth and radius.
		const float Alpha = ComputeLerpAlpha(Coord, P0.xy, P1.xy, SegmentLenSqRcp);
		const float Depth = lerp(P0.z, P1.z, Alpha);
		const uint PackedDepthCov = PackHairVisDepthCoverage(Depth, 1.0f);
		// Row-major index into the 16x16 sub-tile LDS arrays.
		const uint LinearIndex = IntraTileCoord.x + IntraTileCoord.y * HalfTileSize;

		// Test against scene depth. Throughout this kernel a greater packed value
		// wins the depth test (see PackHairVisDepthCoverage / the InterlockedMax
		// below) — presumably reverse-Z style packing.
		if (PackedDepthCov > group_SubTileSceneDepth[LinearIndex])
		{
			// Radius at this pixel, perspective-corrected using the endpoint 1/w values,
			// then converted to a fixed-point hair count (coverage clamped to one pixel).
			const float Rad = ComputePerspectiveCorrectRadius(Rad0, Rad1, Alpha, P0.w, P1.w);
			const uint HairCount = min(Rad, 0.5f) * 2.0f * 1000.0f * CoverageScale;

			const uint CoverageMask = GetCoverageMask(Coord, P0.xy, P1.xy);

			// Accumulate hair count once per pixel when any sample is covered.
			if (CoverageMask)
			{
				InterlockedAdd(group_SubTileHairCount[LinearIndex], HairCount);
			}

			UNROLL for (uint SampleIdx = 0; SampleIdx < PERMUTATION_MULTI_SAMPLE_COUNT; ++SampleIdx)
			{
				if (CoverageMask & (1u << SampleIdx))
				{
					// Write Depth + PrimMatID if depth test against hair depths is passed.
					// NOTE(review): the PrimMatID store is not atomic with the
					// InterlockedMax, so concurrent winners can leave the id of a
					// slightly further fragment — appears to be an accepted benign race.
					uint OldValue;
					InterlockedMax(group_SubTileHairDepth[SampleIdx][LinearIndex], PackedDepthCov, OldValue);
					if (PackedDepthCov > OldValue)
					{
						group_SubTilePrimMatID[SampleIdx][LinearIndex] = PrimMatID;
					}
				}
			}
		}
	}
}
|
|
|
|
// Multi-sample software rasterization of hair segments. Each 1024-thread group
// consumes ceil(TileNum / NumRasterizers) 32x32 tiles; every tile is processed
// as four 16x16 sub-tiles to keep LDS usage bounded. Per sub-tile the kernel:
// (1) loads scene depth + current hair depth into LDS, (2) rasterizes all of
// the tile's segments into LDS via Plot(), (3) merges LDS back into the global
// depth/material/count textures with atomics.
[numthreads(1024, 1, 1)]
void RasterMultiSampleCS(uint DispatchThreadID : SV_DispatchThreadID, uint GroupThreadID : SV_GroupThreadID, uint GroupID : SV_GroupID)
{
	ResolvedView = ResolveView();

	// Thread 0 publishes group-uniform values through LDS.
	if (GroupThreadID == 0)
	{
		group_TileNum = VisTileArgs[0];
		// Ceil-divide TileNum by NumRasterizers using float math (implicit float->uint truncation).
		group_LoopNum = (float(group_TileNum) + float(NumRasterizers - 1)) * RcpNumRasterizers;

		group_PositionOffset = HairStrandsVF_GetHairInstancePositionOffset();
	}

	GroupMemoryBarrierWithGroupSync();

	LOOP for (uint LoopIndex = 0; LoopIndex < group_LoopNum; LoopIndex++)
	{
		// Tiles are statically partitioned: group G owns tiles [G*LoopNum, (G+1)*LoopNum).
		const uint TileIndex = LoopIndex + (GroupID * group_LoopNum);

		// TileIndex is group-uniform, so all threads return together (barriers below stay safe).
		if (TileIndex >= group_TileNum)
		{
			return;
		}

		// Each tile record owns a fixed 1024-entry slice of the primitive list.
		const uint PrimOffset = TileIndex * 1024;
		const uint PrimCount = LoadVisTileData(TileIndex, VT_PrimCount);

		const uint PackedCoord = LoadVisTileData(TileIndex, VT_Coord);
		const uint2 TileMin = UnpackVisTileCoord(PackedCoord) * TileSize;

		// NOTE(review): PackedTileMin appears unused in this kernel — the sub-tile
		// loop below repacks PackedSubTileMin per 16x16 sub-tile instead.
		const uint PackedTileMin = ((TileMin.x & 0xffff) << 0) | ((TileMin.y & 0xffff) << 16);

		// Distribute the 1024 threads across the tile's segments:
		// ThreadsPerSeg ~= 1024 / PrimCount (stepped ladder, capped at 32).
		uint ThreadsPerSeg = 1;

		if (PrimCount <= 512)
			ThreadsPerSeg = 2;
		if (PrimCount <= 341)
			ThreadsPerSeg = 3;
		if (PrimCount <= 256)
			ThreadsPerSeg = 4;
		if (PrimCount <= 204)
			ThreadsPerSeg = 5;
		if (PrimCount <= 170)
			ThreadsPerSeg = 6;
		if (PrimCount <= 146)
			ThreadsPerSeg = 7;
		if (PrimCount <= 128)
			ThreadsPerSeg = 8;
		if (PrimCount <= 64)
			ThreadsPerSeg = 16;
		if (PrimCount <= 32)
			ThreadsPerSeg = 32;

		const bool bThreadValid = (GroupThreadID < (PrimCount * ThreadsPerSeg));
		// Integer division done in float; +0.5 biases away from truncation error at exact multiples.
		const uint Prim = uint((float(GroupThreadID) + 0.5f) / float(ThreadsPerSeg));
		// This thread's lane index within its segment (GroupThreadID mod ThreadsPerSeg).
		const uint PModTPS = GroupThreadID - (Prim * ThreadsPerSeg);

		float4 SP0 = 0;
		float4 SP1 = 0;
		float Rad0 = 0;
		float Rad1 = 0;
		bool bIsEndPoint = false;
		uint PrimMatID = ~0;

		if (bThreadValid)
		{
			const uint PrimID = VisTilePrims[PrimOffset + Prim];
			PrimMatID = PackHairVisControlPointMaterialId(PrimID, HairMaterialId);

			// A segment spans control points PrimID and PrimID+1. 'Type' keeps the
			// second call's value, flagging the last segment of a strand.
			uint Type;
			CalcHomogenousPosAndRad(PrimID, group_PositionOffset, SP0, Rad0, Type);
			CalcHomogenousPosAndRad(PrimID + 1, group_PositionOffset, SP1, Rad1, Type);
			bIsEndPoint = (Type == HAIR_CONTROLPOINT_END);

			// Repack endpoints as: xy = pixel coord, z = device depth, w = 1/clip.w.
			SP0 = float4(NDCToPixelCoord(SP0), 1.0f / SP0.w);
			SP1 = float4(NDCToPixelCoord(SP1), 1.0f / SP1.w);
		}

		// Split 32x32 tile into 4 16x16 tiles that are processed one after another.
		// This is to reduce LDS memory pressure.
		UNROLL for (uint SubTileIdx = 0; SubTileIdx < 4; ++SubTileIdx)
		{
			// Sub-tile order: top-left, top-right, bottom-left, bottom-right.
			const uint2 SubTileMin = TileMin + uint2((SubTileIdx == 0 || SubTileIdx == 2) ? 0 : HalfTileSize, SubTileIdx < 2 ? 0 : HalfTileSize);
			const uint2 SubTileMax = SubTileMin + HalfTileSize;
			const uint PackedSubTileMin = ((SubTileMin.x & 0xFFFF) << 0u) | ((SubTileMin.y & 0xFFFF) << 16u);

			// Initialize LDS: one thread per sub-tile pixel (first SqrHalfTileSize threads).
			if (GroupThreadID < SqrHalfTileSize)
			{
				// Decode thread index -> 2D pixel coord (float trick = divide/modulo by HalfTileSize).
				uint2 Coord;
				Coord.y = (float(GroupThreadID) + 0.5f) * RcpHalfTileSize;
				Coord.x = GroupThreadID - (Coord.y * HalfTileSize);
				Coord += uint2(((PackedSubTileMin >> 0u) & 0xFFFF), ((PackedSubTileMin >> 16u) & 0xFFFF));

				group_SubTileSceneDepth[GroupThreadID] = PackHairVisDepthCoverage(SceneDepthTexture.Load(uint3(Coord, 0)), 1.0f);
				group_SubTileHairCount[GroupThreadID] = 0;

				// Seed hair depth from the previously written output so the depth test
				// composes across dispatches; PrimMatID starts invalid so only pixels
				// touched this pass are written back.
				UNROLL for (uint SampleIdx = 0; SampleIdx < PERMUTATION_MULTI_SAMPLE_COUNT; ++SampleIdx)
				{
					const uint HairDepth = OutDepthCovTexture[uint3(Coord, SampleIdx)];
					group_SubTileHairDepth[SampleIdx][GroupThreadID] = HairDepth;
					group_SubTilePrimMatID[SampleIdx][GroupThreadID] = GetInvalidHairControlPointId();
				}
			}

			GroupMemoryBarrierWithGroupSync();

			// Rasterize to LDS
			if (bThreadValid)
			{
				const uint2 SubTileMin = uint2(((PackedSubTileMin >> 0u) & 0xFFFF), ((PackedSubTileMin >> 16u) & 0xFFFF));
				const uint2 SubTileMax = SubTileMin + HalfTileSize;
				// Clip the segment to the sub-tile (inflated by half a pixel);
				// T receives the parametric [enter, exit] range, bClipped whether each end moved.
				bool2 bClipped;
				float2 T;
				const bool bVisible = ClipRaySegment(SubTileMin - 0.5f, SubTileMax + 0.5f, SP0, SP1, T, bClipped);
				T = saturate(T);

				if (bVisible)
				{
					const float2 SP0Clipped = lerp(SP0, SP1, T.x).xy;
					const float2 SP1Clipped = lerp(SP0, SP1, T.y).xy;
					// Step along the dominant axis, one pixel per step.
					const bool bIsSteep = abs(SP1.x - SP0.x) < abs(SP1.y - SP0.y);
					const float X0 = bIsSteep ? min(SP0Clipped.y, SP1Clipped.y) : min(SP0Clipped.x, SP1Clipped.x);
					const float X1 = bIsSteep ? max(SP0Clipped.y, SP1Clipped.y) : max(SP0Clipped.x, SP1Clipped.x);
					const int NumSteps = (int)(ceil(X1) - floor(X0));
					const float RcpNumSteps = 1.0f / (X1 - X0);

					const float SegmentLenSqRcp = 1.0f / dot(SP1.xy - SP0.xy, SP1.xy - SP0.xy);

					// Skip the last step when the segment continues (not clipped at the far
					// end, not a strand endpoint) — presumably to avoid double-plotting the
					// control point shared with the next segment.
					const int End = !bClipped.y && !bIsEndPoint ? (NumSteps - 1) : NumSteps;
					// Lanes of one segment interleave steps: lane k does steps k, k+TPS, ...
					LOOP for (int J = PModTPS; J < End; J += ThreadsPerSeg)
					{
						const float Alpha = lerp(T.x, T.y, saturate(J * RcpNumSteps));
						const float2 SP = lerp(SP0.xy, SP1.xy, Alpha);
						// Note: SP (float2) truncates to Plot's int2 Coord parameter.
						Plot(SP, SP0, SP1, Rad0, Rad1, SegmentLenSqRcp, PackedSubTileMin, PrimMatID);
					}
				}
			}

			GroupMemoryBarrierWithGroupSync();

			// Write out to global memory
			if (GroupThreadID < SqrHalfTileSize)
			{
				uint2 Coord;
				Coord.y = (float(GroupThreadID) + 0.5f) * RcpHalfTileSize;
				Coord.x = GroupThreadID - (Coord.y * HalfTileSize);
				Coord += uint2(((PackedSubTileMin >> 0u) & 0xFFFF), ((PackedSubTileMin >> 16u) & 0xFFFF));

				const uint HairCount = group_SubTileHairCount[GroupThreadID];
				if (HairCount != 0)
				{
					InterlockedAdd(OutHairCountTexture[Coord], HairCount);
				}

				UNROLL for (uint SampleIdx = 0; SampleIdx < PERMUTATION_MULTI_SAMPLE_COUNT; ++SampleIdx)
				{
					const uint3 SampleCoord = uint3(Coord, SampleIdx);

					// Only pixels rasterized this pass carry a valid PrimMatID.
					const uint PrimMatID = group_SubTilePrimMatID[SampleIdx][GroupThreadID];
					if (PrimMatID != GetInvalidHairControlPointId())
					{
						const uint HairDepth = group_SubTileHairDepth[SampleIdx][GroupThreadID];

						// Same greater-wins depth merge as in Plot(), now against the
						// global texture (other groups may write the same pixels).
						uint OldValue;
						InterlockedMax(OutDepthCovTexture[SampleCoord], HairDepth, OldValue);
						if (HairDepth > OldValue)
						{
							OutPrimMatTexture[SampleCoord] = PrimMatID;
						}
					}
				}
			}

			// Ensure all LDS reads complete before the next sub-tile re-initializes it.
			GroupMemoryBarrierWithGroupSync();
		}
	}
}
|
|
|
|
#endif //SHADER_RASTERCOMPUTE_RASTER_MULTI_SAMPLE
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
|
|
#if SHADER_RASTERCOMPUTE_DEBUG
|
|
|
|
#include "../ShaderPrint.ush"
|
|
|
|
Texture2D<uint> VisTileDepthGrid;
|
|
Texture2DArray<uint> VisTileBinningGrid;
|
|
Buffer<uint> VisTileArgs;
|
|
uint MacroGroupId;
|
|
uint PrimitiveInfoIndex;
|
|
uint TotalPrimitiveInfoCount;
|
|
|
|
#define TilePrintOffset (TileSize >> 1)
|
|
|
|
// Returns the input color with its alpha forced to 0.5 (translucent debug overlay).
float4 Transparent(float4 Color)
{
	float4 Out = Color;
	Out.w = 0.5f;
	return Out;
}
|
|
|
|
// Sums the per-binner segment counts stored in VisTileBinningGrid for one screen
// tile. When bPrintDetails is set, also draws an inlined debug grid (one cell per
// bin: colored quad + count) anchored at the top-left of the screen.
uint GetTileTotalSegment(uint2 TileCoord, bool bPrintDetails)
{
	const float TileDisplayScale = 1.5f;
	const uint DisplayTileSize = TileSize * TileDisplayScale;

	uint2 DebugCellCoord = uint2(0, 0);
	uint TotalSegments = 0;

	const uint BinCount = NumBinners;// * 2; // Each binner fill in 2 bins, see binning algo.
	for (uint BinIndex = 0; BinIndex < BinCount; ++BinIndex)
	{
		const uint SegmentsInBin = VisTileBinningGrid.Load(uint4(TileCoord, BinIndex, 0));
		TotalSegments += SegmentsInBin;

		if (bPrintDetails)
		{
			const uint2 CellMin = DebugCellCoord * DisplayTileSize;
			const uint2 CellMax = (DebugCellCoord + 1) * DisplayTileSize;
			const float4 FillColor = SegmentsInBin > 0 ? Transparent(ColorLightGreen) : Transparent(ColorLightRed);

			AddFilledQuadSS(CellMin, CellMax, FillColor);
			AddQuadSS(CellMin, CellMax, ColorYellow);

			FShaderPrintContext Context = InitShaderPrintContext(true, CellMin + TilePrintOffset);
			Print(Context, SegmentsInBin, FontWhite);

			// Advance to the next debug cell; wrap onto a second row after the
			// last binner so the details span two lines.
			++DebugCellCoord.x;
			if (BinIndex == NumBinners-1)
			{
				DebugCellCoord.x = 0;
				++DebugCellCoord.y;
			}
		}
	}
	return TotalSegments;
}
|
|
|
|
// Draws a debug overlay for one screen tile: a translucent fill whose color
// encodes whether the tile holds any segments and, when bPrintText is set,
// the segment count plus a yellow outline.
void PrintTile(uint2 TileCoord, uint TotalSegments, bool bPrintText)
{
	const uint2 TileMinPx = TileCoord * TileSize;
	const uint2 TileMaxPx = (TileCoord + 1) * TileSize;
	const float4 FillColor = TotalSegments > 0 ? Transparent(ColorLightGreen) : Transparent(ColorLightRed);

	AddFilledQuadSS(TileMinPx, TileMaxPx, FillColor);

	if (bPrintText)
	{
		FShaderPrintContext Context = InitShaderPrintContext(true, TileMinPx + uint2(0, TileSize * 1.5f));
		Print(Context, TotalSegments, FontWhite);

		AddQuadSS(TileMinPx, TileMaxPx, ColorYellow);
	}
}
|
|
|
|
// Debug visualization pass for the compute rasterizer. Prints global stats
// once, a per-bin breakdown for the tile under the mouse cursor, and a
// translucent overlay on every tile that received segments. Each thread
// handles one tile (8x8 threads per group over the tile grid).
[numthreads(8, 8, 1)]
void MainCS(uint3 ThreadId : SV_DispatchThreadID)
{
	// Info/Stats — emitted by a single thread to avoid duplicate print output.
	if (all(ThreadId == 0))
	{
		FShaderPrintContext Context = InitShaderPrintContext(true, uint2(50, 110));
		Print(Context, TEXT("Raster compute "), FontYellow); Newline(Context);
		Print(Context, TEXT("Macro Group Id : "), FontSilver); Print(Context, MacroGroupId, FontWhite); Newline(Context);
		Print(Context, TEXT("Primitive Info : "), FontSilver); Print(Context, PrimitiveInfoIndex, FontWhite, 2, 0); Print(Context, TEXT("/"), FontSilver); Print(Context, TotalPrimitiveInfoCount, FontWhite, 2, 0); Newline(Context);
		Newline(Context);

		Print(Context, TEXT("Configuration "), FontYellow); Newline(Context);
		Print(Context, TEXT("Output Resolution : "), FontSilver); Print(Context, OutputResolution, FontWhite); Newline(Context);
		Print(Context, TEXT("Resolution Multiplier: "), FontSilver); Print(Context, ResolutionMultiplier, FontWhite); Newline(Context);
		Newline(Context);

		Print(Context, TEXT("Tile Size : "), FontSilver); Print(Context, TileSize, FontWhite); Newline(Context);
		Print(Context, TEXT("Tile Res : "), FontSilver); Print(Context, TileRes.x, FontWhite, 2, 0); Print(Context, TEXT("x"), FontSilver); Print(Context, TileRes.y, FontWhite, 2, 0); Newline(Context);
		Newline(Context);

		Print(Context, TEXT("Num Binners : "), FontSilver); Print(Context, NumBinners, FontWhite); Newline(Context);
		Print(Context, TEXT("Num Rasterizers : "), FontSilver); Print(Context, NumRasterizers, FontWhite); Newline(Context);
		Print(Context, TEXT("Max Raster Count : "), FontSilver); Print(Context, MaxRasterCount, FontWhite); Newline(Context);
		Newline(Context);

		// VisTileArgs[0] holds the allocated tile count (same slot read by the raster kernels).
		Print(Context, TEXT("Allocated Tile Count : "), FontSilver); Print(Context, VisTileArgs[0], FontWhite); Newline(Context);

	}

	// Cursor info — detailed per-bin breakdown for the tile under the mouse
	// (only when the cursor is on screen; negative coords mean no cursor).
	if (all(ThreadId == 0) && all(ShaderPrintData.CursorCoord >= 0))
	{
		const uint2 PixelCoord = ShaderPrintData.CursorCoord;
		const uint2 TileCoord = PixelCoord >> TileSizeAsShift;

		const uint TotalSegments = GetTileTotalSegment(TileCoord, true);
		PrintTile(TileCoord, TotalSegments, true);
	}

	// All tile — one tile per thread; overlay only tiles that received segments.
	{
		const uint2 TileCoord = ThreadId.xy;
		const uint TotalSegments = GetTileTotalSegment(TileCoord, false);
		if (TotalSegments)
		{
			PrintTile(TileCoord, TotalSegments, false);
		}
	}
}
|
|
#endif //SHADER_RASTERCOMPUTE_DEBUG
|
|
|