// Copyright Epic Games, Inc. All Rights Reserved.
#pragma once
#include "../Common.ush"
#include "../WaveOpUtil.ush"
#include "../ComputeShaderUtils.ush"
#if COMPILER_SUPPORTS_WAVE_VOTE
// TODO: disabled until we also verify that the wave width == thread group size (or implement a workaround for other wave sizes)
#define USE_WAVE_BIT_PREFIX_WORK_DISTRIBUTION 0
#endif
// Whether to use a special case where all items are expected to have a single instance
#ifndef LOAD_BALANCER_SINGLE_INSTANCE_MODE
#define LOAD_BALANCER_SINGLE_INSTANCE_MODE 0
#endif
struct FPackedInstanceBatch
{
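// packed FirstItem : NumItems, using the same (32 - NUM_INSTANCES_ITEM_BITS) : NUM_INSTANCES_ITEM_BITS split as the per-item data below (see UnpackBatch)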
uint FirstItem_NumItems;
};
struct FPackedInstanceBatchItem
{
// packed (32 - NUM_INSTANCES_ITEM_BITS) : NUM_INSTANCES_ITEM_BITS - one bit more than log2(ThreadGroupSize) is needed since a single item may carry a full ThreadGroupSize of work
uint InstanceDataOffset_NumInstances;
// packed (32 - PREFIX_BITS) : PREFIX_BITS
uint Payload_BatchPrefixOffset;
};
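// One batch is consumed per thread group; each batch references a contiguous
// range of items in ItemBuffer (see InstanceCullingLoadBalancer_Setup below).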
StructuredBuffer<FPackedInstanceBatch> BatchBuffer;
StructuredBuffer<FPackedInstanceBatchItem> ItemBuffer;
uint NumBatches;
uint NumItems;
uint NumGroupsPerBatch;
struct FInstanceBatch
{
uint FirstItem;
uint NumItems;
};
FInstanceBatch UnpackBatch(FPackedInstanceBatch PackedBatch, uint ItemDataOffset)
{
FInstanceBatch Result;
Result.FirstItem = ItemDataOffset + (PackedBatch.FirstItem_NumItems >> NUM_INSTANCES_ITEM_BITS);
Result.NumItems = PackedBatch.FirstItem_NumItems & NUM_INSTANCES_ITEM_MASK;
return Result;
}
struct FInstanceBatchItem
{
uint InstanceDataOffset;
uint NumInstances;
uint Payload;
uint BatchPrefixOffset;
};
FInstanceBatchItem UnpackItem(FPackedInstanceBatchItem PackedItem)
{
FInstanceBatchItem Result;
Result.InstanceDataOffset = PackedItem.InstanceDataOffset_NumInstances >> NUM_INSTANCES_ITEM_BITS;
Result.NumInstances = PackedItem.InstanceDataOffset_NumInstances & NUM_INSTANCES_ITEM_MASK;
Result.Payload = PackedItem.Payload_BatchPrefixOffset >> PREFIX_BITS;
Result.BatchPrefixOffset = PackedItem.Payload_BatchPrefixOffset & PREFIX_BIT_MASK;
return Result;
}
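// Worked unpack example (purely illustrative; the real bit widths come from
// NUM_INSTANCES_ITEM_BITS / PREFIX_BITS): assuming NUM_INSTANCES_ITEM_BITS == 6,
// InstanceDataOffset_NumInstances == 0x18A decodes to InstanceDataOffset == 6
// (0x18A >> 6) and NumInstances == 10 (0x18A & 0x3F).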
#if USE_WAVE_BIT_PREFIX_WORK_DISTRIBUTION
groupshared uint WorkBoundary[NUM_THREADS_PER_GROUP];
#else // !USE_WAVE_BIT_PREFIX_WORK_DISTRIBUTION
groupshared uint ItemIndex[NUM_THREADS_PER_GROUP];
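// One step of an inclusive prefix-max scan (Hillis-Steele style) over ItemIndex[]
// in group-shared memory; each call doubles the distance covered by the scan.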
void PrefixMaxStep(uint ElementIndex, uint StepSize, inout uint CurrentValue)
{
if (ElementIndex >= StepSize)
{
CurrentValue = max(CurrentValue, ItemIndex[ElementIndex - StepSize]);
}
GroupMemoryBarrierWithGroupSync();
if (ElementIndex >= StepSize)
{
ItemIndex[ElementIndex] = CurrentValue;
}
GroupMemoryBarrierWithGroupSync();
}
uint PrefixMaxLastStep(uint ElementIndex, inout uint CurrentValue)
{
GroupMemoryBarrierWithGroupSync();
uint StepSize = NUM_THREADS_PER_GROUP / 2U;
if (ElementIndex >= StepSize)
{
uint NewValue = max(CurrentValue, ItemIndex[ElementIndex - StepSize]);
CurrentValue = NewValue;
}
// No sync needed since we don't look at ItemIndex[] again
return CurrentValue;
}
uint PrefixMax(uint ElementIndex)
{
GroupMemoryBarrierWithGroupSync();
uint CurrentValue = ItemIndex[ElementIndex];
GroupMemoryBarrierWithGroupSync();
// Note: manually unrolled, as UNROLL seems unable to handle the StepSize *= 2 increment, at least on some platforms
//for (uint StepSize = 1U; StepSize < NUM_THREADS_PER_GROUP / 2U; StepSize *= 2u)
//{
// PrefixMaxStep(ElementIndex, StepSize, CurrentValue);
//}
PrefixMaxStep(ElementIndex, 1U, CurrentValue);
PrefixMaxStep(ElementIndex, 2U, CurrentValue);
PrefixMaxStep(ElementIndex, 4U, CurrentValue);
PrefixMaxStep(ElementIndex, 8U, CurrentValue);
#if NUM_THREADS_PER_GROUP > 32u
PrefixMaxStep(ElementIndex, 16U, CurrentValue);
#endif
#if NUM_THREADS_PER_GROUP > 64u
PrefixMaxStep(ElementIndex, 32U, CurrentValue);
#endif
#if NUM_THREADS_PER_GROUP > 128u
PrefixMaxStep(ElementIndex, 64U, CurrentValue);
#endif
#if NUM_THREADS_PER_GROUP > 256u
PrefixMaxStep(ElementIndex, 128U, CurrentValue);
#endif
return PrefixMaxLastStep(ElementIndex, CurrentValue);
}
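// Worked example (8 threads; 3 items with 3/1/4 instances, i.e. BatchPrefixOffsets 0/3/4):
//   ItemIndex after the scatter in InstanceCullingLoadBalancer_Setup: [0, 0, 0, 1, 2, 0, 0, 0]
//   Result of PrefixMax:                                              [0, 0, 0, 1, 2, 2, 2, 2]
// i.e. each thread ends up with the index of the item it helps expand.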
#endif // USE_WAVE_BIT_PREFIX_WORK_DISTRIBUTION
groupshared FInstanceBatchItem Items[NUM_THREADS_PER_GROUP];
struct FInstanceWorkSetup
{
bool bValid;
FInstanceBatchItem Item;
// This thread's index within the expansion of the item, in the range [0, Item.NumInstances); may be negative for threads with no work at the end of a batch
int LocalItemIndex;
uint WorkItemIndex;
uint GroupIndexPerBatch; // in [0, NumGroupsPerBatch) when multiple groups per batch are requested
};
// Set bSupportMultipleBatches to true if this kernel is dispatched with more than 1 group per batch.
FInstanceWorkSetup InstanceCullingLoadBalancer_Setup(uint3 GroupId, uint GroupThreadIndex, uint ItemDataOffset, bool bSupportMultipleBatches = false)
{
FInstanceWorkSetup Setup = (FInstanceWorkSetup)0;
uint BatchIndex = GetUnWrappedDispatchGroupId(GroupId);
// Split the dispatch group id into a batch index and a per-batch group index when multiple groups per batch are requested
if (bSupportMultipleBatches)
{
Setup.GroupIndexPerBatch = BatchIndex % NumGroupsPerBatch;
BatchIndex = BatchIndex / NumGroupsPerBatch;
}
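// e.g., with NumGroupsPerBatch == 2, dispatch groups 0 and 1 both process batch 0 (as GroupIndexPerBatch 0 and 1), groups 2 and 3 process batch 1, and so on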
if (BatchIndex >= NumBatches)
{
return Setup;
}
// Load work description for whole work group
FInstanceBatch GroupBatch = UnpackBatch(BatchBuffer[BatchIndex], ItemDataOffset);
#if LOAD_BALANCER_SINGLE_INSTANCE_MODE
// Special case where all items are expected to have a single instance
if (GroupBatch.NumItems > GroupThreadIndex)
{
Setup.Item = UnpackItem(ItemBuffer[GroupBatch.FirstItem + GroupThreadIndex]);
Setup.bValid = true;
}
#else
// Special case where we have a 1:1 item-to-thread mapping, so no prefix sum is needed
// TODO: this case could also store less data per item
if (GroupBatch.NumItems == NUM_THREADS_PER_GROUP)
{
FInstanceBatchItem WorkItem = UnpackItem(ItemBuffer[GroupBatch.FirstItem + GroupThreadIndex]);
checkSlow(WorkItem.BatchPrefixOffset == GroupThreadIndex);
Setup.Item = WorkItem;
Setup.bValid = true;
return Setup;
}
// Load the work items (the item count here is guaranteed to be less than the group size)
#if USE_WAVE_BIT_PREFIX_WORK_DISTRIBUTION
WorkBoundary[GroupThreadIndex] = 0;
#else
ItemIndex[GroupThreadIndex] = 0U;
#endif
GroupMemoryBarrierWithGroupSync();
if (GroupThreadIndex < GroupBatch.NumItems)
{
FInstanceBatchItem WorkItem = UnpackItem(ItemBuffer[GroupBatch.FirstItem + GroupThreadIndex]);
#if USE_WAVE_BIT_PREFIX_WORK_DISTRIBUTION
// Mark last instance as boundary
WorkBoundary[(WorkItem.BatchPrefixOffset + WorkItem.NumInstances) - 1U] = 1;
#else
ItemIndex[WorkItem.BatchPrefixOffset] = GroupThreadIndex;
#endif
Items[GroupThreadIndex] = WorkItem;
}
GroupMemoryBarrierWithGroupSync();
uint WorkItemIndex = 0U;
if (GroupBatch.NumItems < 4U)
{
// Distribute using a linear search (no syncs required, unlike the prefix max op)
for (int Index = int(GroupBatch.NumItems) - 1; Index >= 0; --Index)
{
// TODO: pack the offset data into a single dword and store it in the group batch? 6x5 bits would fit and be far more efficient than all the shared-memory mumbo jumbo
FInstanceBatchItem WorkItem = Items[Index];
if (GroupThreadIndex >= WorkItem.BatchPrefixOffset)
{
WorkItemIndex = uint(Index);
break;
}
}
}
else
{
#if USE_WAVE_BIT_PREFIX_WORK_DISTRIBUTION
// Ballot-based boundary search for when wave ops are available.
bool bIsBoundary = WorkBoundary[GroupThreadIndex];
WorkItemIndex = WavePrefixCountBits(bIsBoundary);
#else
// Distribute the index to the other threads that should work on the same item
WorkItemIndex = PrefixMax(GroupThreadIndex);
#endif
}
Setup.WorkItemIndex = WorkItemIndex;
Setup.Item = Items[WorkItemIndex];
// This thread's index within the expansion of the item, in the range [0, Item.NumInstances); may be negative for threads with no work at the end of a batch
Setup.LocalItemIndex = int(GroupThreadIndex - Setup.Item.BatchPrefixOffset);
Setup.bValid = Setup.LocalItemIndex >= 0 && Setup.LocalItemIndex < int(Setup.Item.NumInstances);
#endif //LOAD_BALANCER_SINGLE_INSTANCE_MODE
return Setup;
}
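// Example kernel using the load balancer (an illustrative sketch only: the
// numthreads layout matches what this header assumes, but ProcessInstance()
// and the ItemDataOffset of 0 are hypothetical, not part of this header):
//
// [numthreads(NUM_THREADS_PER_GROUP, 1, 1)]
// void ExampleMainCS(uint3 GroupId : SV_GroupID, uint GroupThreadIndex : SV_GroupIndex)
// {
//     FInstanceWorkSetup Setup = InstanceCullingLoadBalancer_Setup(GroupId, GroupThreadIndex, 0U);
//     if (!Setup.bValid)
//     {
//         return;
//     }
//     // Each valid thread expands exactly one instance of its item
//     uint InstanceId = Setup.Item.InstanceDataOffset + uint(Setup.LocalItemIndex);
//     ProcessInstance(InstanceId, Setup.Item.Payload);
// }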
uint InstanceCullingLoadBalancer_GetNumBatches()
{
return NumBatches;
}