// Copyright Epic Games, Inc. All Rights Reserved.

#pragma once

#include "../Common.ush"
#include "../WaveOpUtil.ush"
#include "../ComputeShaderUtils.ush"

#if COMPILER_SUPPORTS_WAVE_VOTE
// TODO: Need to also check that wave width == thread group size (or implement workaround for other wave sizes)
#define USE_WAVE_BIT_PREFIX_WORK_DISTRIBUTION 0
#endif

// Whether to use a special case where all items are expected to have a single instance
#ifndef LOAD_BALANCER_SINGLE_INSTANCE_MODE
#define LOAD_BALANCER_SINGLE_INSTANCE_MODE 0
#endif
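
// A client that knows every item carries exactly one instance can
// #define LOAD_BALANCER_SINGLE_INSTANCE_MODE 1 before including this header to opt in.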

struct FPackedInstanceBatch
{
	uint FirstItem_NumItems;
};

struct FPackedInstanceBatchItem
{
	// Packed as [32 - NUM_INSTANCES_ITEM_BITS : NUM_INSTANCES_ITEM_BITS] (offset : count); NumInstances needs one extra bit for the case where a single item has ThreadGroupSize work to do
	uint InstanceDataOffset_NumInstances;
	// Packed as [32 - PREFIX_BITS : PREFIX_BITS] (payload : batch prefix offset)
	uint Payload_BatchPrefixOffset;
};
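
// Illustration of the packing above (hypothetical numbers; NUM_INSTANCES_ITEM_BITS and
// PREFIX_BITS are defined elsewhere): if NUM_INSTANCES_ITEM_BITS were 12, then
// InstanceDataOffset = 5 with NumInstances = 3 would pack as
//   InstanceDataOffset_NumInstances = (5 << 12) | 3 = 0x5003
// and unpack again via the shift/mask pairs in UnpackItem() below.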
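
// Input streams and counts for the load balancer, bound by the dispatching host code.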
StructuredBuffer<FPackedInstanceBatch> BatchBuffer;
StructuredBuffer<FPackedInstanceBatchItem> ItemBuffer;
uint NumBatches;
uint NumItems;
uint NumGroupsPerBatch;

struct FInstanceBatch
{
	uint FirstItem;
	uint NumItems;
};

FInstanceBatch UnpackBatch(FPackedInstanceBatch PackedBatch, uint ItemDataOffset)
{
	FInstanceBatch Result;
	Result.FirstItem = ItemDataOffset + (PackedBatch.FirstItem_NumItems >> NUM_INSTANCES_ITEM_BITS);
	Result.NumItems = PackedBatch.FirstItem_NumItems & NUM_INSTANCES_ITEM_MASK;

	return Result;
}

struct FInstanceBatchItem
{
	uint InstanceDataOffset;
	uint NumInstances;
	uint Payload;
	uint BatchPrefixOffset;
};

FInstanceBatchItem UnpackItem(FPackedInstanceBatchItem PackedItem)
{
	FInstanceBatchItem Result;

	Result.InstanceDataOffset = PackedItem.InstanceDataOffset_NumInstances >> NUM_INSTANCES_ITEM_BITS;
	Result.NumInstances = PackedItem.InstanceDataOffset_NumInstances & NUM_INSTANCES_ITEM_MASK;
	Result.Payload = PackedItem.Payload_BatchPrefixOffset >> PREFIX_BITS;
	Result.BatchPrefixOffset = PackedItem.Payload_BatchPrefixOffset & PREFIX_BIT_MASK;

	return Result;
}

#if USE_WAVE_BIT_PREFIX_WORK_DISTRIBUTION

groupshared uint WorkBoundary[NUM_THREADS_PER_GROUP];

#else // !USE_WAVE_BIT_PREFIX_WORK_DISTRIBUTION

groupshared uint ItemIndex[NUM_THREADS_PER_GROUP];
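
// One step of the inclusive max-scan over ItemIndex[] in group shared memory. The first
// barrier separates the read phase from the write phase, so no slot is overwritten while
// another thread may still be reading it; the second publishes the results for the next step.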
void PrefixMaxStep(uint ElementIndex, uint StepSize, inout uint CurrentValue)
{
	if (ElementIndex >= StepSize)
	{
		CurrentValue = max(CurrentValue, ItemIndex[ElementIndex - StepSize]);
	}
	GroupMemoryBarrierWithGroupSync();

	if (ElementIndex >= StepSize)
	{
		ItemIndex[ElementIndex] = CurrentValue;
	}
	GroupMemoryBarrierWithGroupSync();
}

uint PrefixMaxLastStep(uint ElementIndex, inout uint CurrentValue)
{
	GroupMemoryBarrierWithGroupSync();

	uint StepSize = NUM_THREADS_PER_GROUP / 2U;

	if (ElementIndex >= StepSize)
	{
		uint NewValue = max(CurrentValue, ItemIndex[ElementIndex - StepSize]);
		CurrentValue = NewValue;
	}

	// No sync needed since we don't look at ItemIndex[] again
	return CurrentValue;
}

uint PrefixMax(uint ElementIndex)
{
	GroupMemoryBarrierWithGroupSync();

	uint CurrentValue = ItemIndex[ElementIndex];
	GroupMemoryBarrierWithGroupSync();

	// Note: manually unrolled since UNROLL does not seem to handle *= 2, at least on some platforms
	//for (uint StepSize = 1U; StepSize < NUM_THREADS_PER_GROUP / 2U; StepSize *= 2u)
	//{
	//	PrefixMaxStep(ElementIndex, StepSize, CurrentValue);
	//}
	PrefixMaxStep(ElementIndex, 1U, CurrentValue);
	PrefixMaxStep(ElementIndex, 2U, CurrentValue);
	PrefixMaxStep(ElementIndex, 4U, CurrentValue);
	PrefixMaxStep(ElementIndex, 8U, CurrentValue);
#if NUM_THREADS_PER_GROUP > 32u
	PrefixMaxStep(ElementIndex, 16U, CurrentValue);
#endif
#if NUM_THREADS_PER_GROUP > 64u
	PrefixMaxStep(ElementIndex, 32U, CurrentValue);
#endif
#if NUM_THREADS_PER_GROUP > 128u
	PrefixMaxStep(ElementIndex, 64U, CurrentValue);
#endif
#if NUM_THREADS_PER_GROUP > 256u
	PrefixMaxStep(ElementIndex, 128U, CurrentValue);
#endif

	return PrefixMaxLastStep(ElementIndex, CurrentValue);
}
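
// Worked example (illustrative, with a group of 8 threads): three items with
// BatchPrefixOffsets 0, 3 and 5 seed ItemIndex[] as [0,0,0,1,0,2,0,0]; the inclusive
// max-scan turns this into [0,0,0,1,1,2,2,2], i.e. each thread ends up holding the
// index of the item whose instance range covers its slot.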

#endif // USE_WAVE_BIT_PREFIX_WORK_DISTRIBUTION

groupshared FInstanceBatchItem Items[NUM_THREADS_PER_GROUP];

struct FInstanceWorkSetup
{
	bool bValid;
	FInstanceBatchItem Item;
	// Index of this thread within the expansion of the item, in [0, Item.NumInstances); may be negative for threads with no work at the end of the batch
	int LocalItemIndex;
	uint WorkItemIndex;
	uint GroupIndexPerBatch; // in [0, NumGroupsPerBatch) if multiple groups per batch are requested
};

// Set bSupportMultipleBatches to true if this kernel is dispatched with more than 1 group per batch.
FInstanceWorkSetup InstanceCullingLoadBalancer_Setup(uint3 GroupId, uint GroupThreadIndex, uint ItemDataOffset, bool bSupportMultipleBatches = false)
{
	FInstanceWorkSetup Setup = (FInstanceWorkSetup)0;

	uint BatchIndex = GetUnWrappedDispatchGroupId(GroupId);
	// Interleave the dispatched groups over the batches when multiple groups per batch are requested
	if (bSupportMultipleBatches)
	{
		Setup.GroupIndexPerBatch = BatchIndex % NumGroupsPerBatch;
		BatchIndex = BatchIndex / NumGroupsPerBatch;
	}

	if (BatchIndex >= NumBatches)
	{
		return Setup;
	}

	// Load the work description for the whole work group
	FInstanceBatch GroupBatch = UnpackBatch(BatchBuffer[BatchIndex], ItemDataOffset);

#if LOAD_BALANCER_SINGLE_INSTANCE_MODE
	// Special case where all items are expected to have a single instance
	if (GroupBatch.NumItems > GroupThreadIndex)
	{
		Setup.Item = UnpackItem(ItemBuffer[GroupBatch.FirstItem + GroupThreadIndex]);
		Setup.bValid = true;
	}
#else
	// Special case with a 1:1 item-to-thread mapping: no prefix scan or work distribution is needed
	// TODO: this case can also store less data per item
	if (GroupBatch.NumItems == NUM_THREADS_PER_GROUP)
	{
		checkSlow(UnpackItem(ItemBuffer[GroupBatch.FirstItem + GroupThreadIndex]).BatchPrefixOffset == GroupThreadIndex);

		Setup.Item = UnpackItem(ItemBuffer[GroupBatch.FirstItem + GroupThreadIndex]);
		Setup.bValid = true;

		return Setup;
	}

	// Load the work items (guaranteed to number fewer than the group size)
#if USE_WAVE_BIT_PREFIX_WORK_DISTRIBUTION
	WorkBoundary[GroupThreadIndex] = 0;
#else
	ItemIndex[GroupThreadIndex] = 0U;
#endif

	GroupMemoryBarrierWithGroupSync();

	if (GroupThreadIndex < GroupBatch.NumItems)
	{
		FInstanceBatchItem WorkItem = UnpackItem(ItemBuffer[GroupBatch.FirstItem + GroupThreadIndex]);
#if USE_WAVE_BIT_PREFIX_WORK_DISTRIBUTION
		// Mark the last instance of the item as a boundary
		WorkBoundary[(WorkItem.BatchPrefixOffset + WorkItem.NumInstances) - 1U] = 1;
#else
		ItemIndex[WorkItem.BatchPrefixOffset] = GroupThreadIndex;
#endif
		Items[GroupThreadIndex] = WorkItem;
	}
	GroupMemoryBarrierWithGroupSync();
	uint WorkItemIndex = 0U;
	if (GroupBatch.NumItems < 4U)
	{
		// Distribute using a linear search (no syncs required, unlike the prefix max op)
		for (int Index = int(GroupBatch.NumItems) - 1; Index >= 0; --Index)
		{
			// TODO: pack the offset data into a single dword and store it in the group batch? 6x5 bits can fit and would be far more efficient than all the shared-memory machinery
			FInstanceBatchItem WorkItem = Items[Index];
			if (GroupThreadIndex >= WorkItem.BatchPrefixOffset)
			{
				WorkItemIndex = uint(Index);
				break;
			}
		}
	}
	else
	{
#if USE_WAVE_BIT_PREFIX_WORK_DISTRIBUTION
		// Ballot-based boundary search for when wave ops are available: counting the boundary bits set by lower lanes yields the index of the item this thread belongs to
		bool bIsBoundary = WorkBoundary[GroupThreadIndex];
		WorkItemIndex = WavePrefixCountBits(bIsBoundary);
#else
		// Distribute the item index to the other threads that should work on the same item
		WorkItemIndex = PrefixMax(GroupThreadIndex);
#endif
	}

	Setup.WorkItemIndex = WorkItemIndex;
	Setup.Item = Items[WorkItemIndex];
	// Index of this thread within the expansion of the item, in [0, Item.NumInstances); may be negative for threads with no work at the end of the batch
	Setup.LocalItemIndex = int(GroupThreadIndex - Setup.Item.BatchPrefixOffset);
	Setup.bValid = Setup.LocalItemIndex >= 0 && Setup.LocalItemIndex < int(Setup.Item.NumInstances);
#endif //LOAD_BALANCER_SINGLE_INSTANCE_MODE

	return Setup;
}
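
// Example usage (an illustrative sketch; the kernel name and the per-instance
// processing are hypothetical, not part of this header):
//
//	[numthreads(NUM_THREADS_PER_GROUP, 1, 1)]
//	void ExampleExpansionCS(uint3 GroupId : SV_GroupID, uint GroupThreadIndex : SV_GroupIndex)
//	{
//		FInstanceWorkSetup Setup = InstanceCullingLoadBalancer_Setup(GroupId, GroupThreadIndex, 0U);
//		if (!Setup.bValid)
//		{
//			return;
//		}
//		// Each thread handles one instance of its work item.
//		uint InstanceIndex = Setup.Item.InstanceDataOffset + uint(Setup.LocalItemIndex);
//		// ... fetch the instance data at InstanceIndex and use Setup.Item.Payload ...
//	}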

uint InstanceCullingLoadBalancer_GetNumBatches()
{
	return NumBatches;
}