// Copyright Epic Games, Inc. All Rights Reserved.

#pragma once

#include "../Common.ush"
#include "../WaveOpUtil.ush"
#include "../ComputeShaderUtils.ush"

#if COMPILER_SUPPORTS_WAVE_VOTE
// TODO: Need to also check that wave width == thread group size (or implement workaround for other wave sizes)
#define USE_WAVE_BIT_PREFIX_WORK_DISTRIBUTION 0
#endif

// Whether to use a special case where all items are expected to have a single instance
#ifndef LOAD_BALANCER_SINGLE_INSTANCE_MODE
#define LOAD_BALANCER_SINGLE_INSTANCE_MODE 0
#endif

struct FPackedInstanceBatch
{
	uint FirstItem_NumItems;
};

struct FPackedInstanceBatchItem
{
	// Packed 32-NumInstancesItemBits:NumInstancesItemBits - need one more bit for the case where one item has ThreadGroupSize work to do
	uint InstanceDataOffset_NumInstances;
	// Packed 32-PrefixBits:PrefixBits
	uint Payload_BatchPrefixOffset;
};

StructuredBuffer<FPackedInstanceBatch> BatchBuffer;
StructuredBuffer<FPackedInstanceBatchItem> ItemBuffer;
uint NumBatches;
uint NumItems;
uint NumGroupsPerBatch;

struct FInstanceBatch
{
	uint FirstItem;
	uint NumItems;
};

FInstanceBatch UnpackBatch(FPackedInstanceBatch PackedBatch, uint ItemDataOffset)
{
	FInstanceBatch Result;
	Result.FirstItem = ItemDataOffset + (PackedBatch.FirstItem_NumItems >> NUM_INSTANCES_ITEM_BITS);
	Result.NumItems = PackedBatch.FirstItem_NumItems & NUM_INSTANCES_ITEM_MASK;
	return Result;
}

struct FInstanceBatchItem
{
	uint InstanceDataOffset;
	uint NumInstances;
	uint Payload;
	uint BatchPrefixOffset;
};

FInstanceBatchItem UnpackItem(FPackedInstanceBatchItem PackedItem)
{
	FInstanceBatchItem Result;
	Result.InstanceDataOffset = PackedItem.InstanceDataOffset_NumInstances >> NUM_INSTANCES_ITEM_BITS;
	Result.NumInstances = PackedItem.InstanceDataOffset_NumInstances & NUM_INSTANCES_ITEM_MASK;
	Result.Payload = PackedItem.Payload_BatchPrefixOffset >> PREFIX_BITS;
	Result.BatchPrefixOffset = PackedItem.Payload_BatchPrefixOffset & PREFIX_BIT_MASK;
	return Result;
}

#if USE_WAVE_BIT_PREFIX_WORK_DISTRIBUTION

groupshared uint WorkBoundary[NUM_THREADS_PER_GROUP];

#else // !USE_WAVE_BIT_PREFIX_WORK_DISTRIBUTION

groupshared uint ItemIndex[NUM_THREADS_PER_GROUP];

void PrefixMaxStep(uint ElementIndex, uint StepSize, inout uint CurrentValue)
{
	if (ElementIndex >= StepSize)
	{
		CurrentValue = max(CurrentValue, ItemIndex[ElementIndex - StepSize]);
	}
	GroupMemoryBarrierWithGroupSync();

	if (ElementIndex >= StepSize)
	{
		ItemIndex[ElementIndex] = CurrentValue;
	}
	GroupMemoryBarrierWithGroupSync();
}

uint PrefixMaxLastStep(uint ElementIndex, inout uint CurrentValue)
{
	GroupMemoryBarrierWithGroupSync();

	uint StepSize = NUM_THREADS_PER_GROUP / 2U;
	if (ElementIndex >= StepSize)
	{
		uint NewValue = max(CurrentValue, ItemIndex[ElementIndex - StepSize]);
		CurrentValue = NewValue;
	}
	// No sync needed since we don't look at ItemIndex[] again
	return CurrentValue;
}

uint PrefixMax(uint ElementIndex)
{
	GroupMemoryBarrierWithGroupSync();

	uint CurrentValue = ItemIndex[ElementIndex];
	GroupMemoryBarrierWithGroupSync();

	// Note: manually unrolled as UNROLL seems to not figure out *= 2, at least on some platforms
	//for (uint StepSize = 1U; StepSize < NUM_THREADS_PER_GROUP / 2U; StepSize *= 2u)
	//{
	//	PrefixMaxStep(ElementIndex, StepSize, CurrentValue);
	//}
	PrefixMaxStep(ElementIndex, 1U, CurrentValue);
	PrefixMaxStep(ElementIndex, 2U, CurrentValue);
	PrefixMaxStep(ElementIndex, 4U, CurrentValue);
	PrefixMaxStep(ElementIndex, 8U, CurrentValue);
#if NUM_THREADS_PER_GROUP > 32u
	PrefixMaxStep(ElementIndex, 16U, CurrentValue);
#endif
#if NUM_THREADS_PER_GROUP > 64u
	PrefixMaxStep(ElementIndex, 32U, CurrentValue);
#endif
#if NUM_THREADS_PER_GROUP > 128u
	PrefixMaxStep(ElementIndex, 64U, CurrentValue);
#endif
#if NUM_THREADS_PER_GROUP > 256u
	PrefixMaxStep(ElementIndex, 128U, CurrentValue);
#endif
	return PrefixMaxLastStep(ElementIndex, CurrentValue);
}

#endif // USE_WAVE_BIT_PREFIX_WORK_DISTRIBUTION
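
// Illustrative note (added comment, not part of the original shader) for the non-wave path above: assuming a
// hypothetical 8-thread group and a batch of three items with 3, 2 and 3 instances (BatchPrefixOffsets 0, 3 and 5),
// the scatter ItemIndex[WorkItem.BatchPrefixOffset] = GroupThreadIndex performed by InstanceCullingLoadBalancer_Setup
// below produces
//   ItemIndex = [0, 0, 0, 1, 0, 2, 0, 0]
// and the inclusive max-scan in PrefixMax turns that into
//   ItemIndex = [0, 0, 0, 1, 1, 2, 2, 2]
// i.e. each thread ends up with the index of the work item whose instances it should expand.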

groupshared FInstanceBatchItem Items[NUM_THREADS_PER_GROUP];

struct FInstanceWorkSetup
{
	bool bValid;
	FInstanceBatchItem Item;
	// Index of this thread within the expansion of the item, in the range [0, Item.NumInstances); may be negative when there is no work left at the end of the batch
	int LocalItemIndex;
	uint WorkItemIndex;
	uint GroupIndexPerBatch; // in [0, NumGroupsPerBatch) when multiple groups per batch are requested
};

// Set bSupportMultipleBatches to true if this kernel is dispatched with more than 1 group per batch.
FInstanceWorkSetup InstanceCullingLoadBalancer_Setup(uint3 GroupId, uint GroupThreadIndex, uint ItemDataOffset, bool bSupportMultipleBatches = false)
{
	FInstanceWorkSetup Setup = (FInstanceWorkSetup)0;

	uint BatchIndex = GetUnWrappedDispatchGroupId(GroupId);

	// Interleave the groups within each batch if more than one group per batch is requested
	if (bSupportMultipleBatches)
	{
		Setup.GroupIndexPerBatch = BatchIndex % NumGroupsPerBatch;
		BatchIndex = BatchIndex / NumGroupsPerBatch;
	}

	if (BatchIndex >= NumBatches)
	{
		return Setup;
	}

	// Load the work description for the whole work group
	FInstanceBatch GroupBatch = UnpackBatch(BatchBuffer[BatchIndex], ItemDataOffset);

#if LOAD_BALANCER_SINGLE_INSTANCE_MODE
	// Special case where all items are expected to have a single instance
	if (GroupBatch.NumItems > GroupThreadIndex)
	{
		Setup.Item = UnpackItem(ItemBuffer[GroupBatch.FirstItem + GroupThreadIndex]);
		Setup.bValid = true;
	}
#else
	// Special case where we have a 1:1 item/thread mapping - no need to do prefix sums
	// TODO: this case can also store less data per item
	if (GroupBatch.NumItems == NUM_THREADS_PER_GROUP)
	{
		checkSlow(UnpackItem(ItemBuffer[GroupBatch.FirstItem + GroupThreadIndex]).BatchPrefixOffset == GroupThreadIndex);
		Setup.Item = UnpackItem(ItemBuffer[GroupBatch.FirstItem + GroupThreadIndex]);
		Setup.bValid = true;
		return Setup;
	}

	// Load work items (guaranteed to be fewer than the group size)
#if USE_WAVE_BIT_PREFIX_WORK_DISTRIBUTION
	WorkBoundary[GroupThreadIndex] = 0;
#else
	ItemIndex[GroupThreadIndex] = 0U;
#endif
	GroupMemoryBarrierWithGroupSync();

	if (GroupThreadIndex < GroupBatch.NumItems)
	{
		FInstanceBatchItem WorkItem = UnpackItem(ItemBuffer[GroupBatch.FirstItem + GroupThreadIndex]);
#if USE_WAVE_BIT_PREFIX_WORK_DISTRIBUTION
		// Mark last instance as boundary
		WorkBoundary[(WorkItem.BatchPrefixOffset + WorkItem.NumInstances) - 1U] = 1;
#else
		ItemIndex[WorkItem.BatchPrefixOffset] = GroupThreadIndex;
#endif
		Items[GroupThreadIndex] = WorkItem;
	}
	GroupMemoryBarrierWithGroupSync();

	uint WorkItemIndex = 0U;
	if (GroupBatch.NumItems < 4U)
	{
		// Distribute using a linear search (no syncs required, unlike the prefix max op)
		for (int Index = int(GroupBatch.NumItems) - 1; Index >= 0; --Index)
		{
			// TODO: pack the offset data into a single dword and store in the group batch? 6x5 bits can fit and would be far more efficient than all the shared memory mumbo jumbo
			FInstanceBatchItem WorkItem = Items[Index];
			if (GroupThreadIndex >= WorkItem.BatchPrefixOffset)
			{
				WorkItemIndex = uint(Index);
				break;
			}
		}
	}
	else
	{
#if USE_WAVE_BIT_PREFIX_WORK_DISTRIBUTION
		// Ballot based boundary search for when wave ops are available.
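		// Illustrative note (added comment, not part of the original shader): with the same hypothetical layout as
		// above (items of 3, 2 and 3 instances in an 8-wide group), the boundary marks give
		//   WorkBoundary = [0, 0, 1, 0, 1, 0, 0, 1]
		// and WavePrefixCountBits(bIsBoundary) counts the set bits in lanes strictly below the current lane,
		// which again yields the per-thread item indices [0, 0, 0, 1, 1, 2, 2, 2]. This relies on the wave width
		// matching the thread group size, as noted in the TODO at the top of the file.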
		bool bIsBoundary = WorkBoundary[GroupThreadIndex];
		WorkItemIndex = WavePrefixCountBits(bIsBoundary);
#else
		// Distribute the index to the other threads that should work on the same item
		WorkItemIndex = PrefixMax(GroupThreadIndex);
#endif
	}

	Setup.WorkItemIndex = WorkItemIndex;
	Setup.Item = Items[WorkItemIndex];
	// Index of this thread within the expansion of the item, in the range [0, Item.NumInstances); may be negative when there is no work left at the end of the batch
	Setup.LocalItemIndex = int(GroupThreadIndex - Setup.Item.BatchPrefixOffset);
	Setup.bValid = Setup.LocalItemIndex >= 0 && Setup.LocalItemIndex < int(Setup.Item.NumInstances);

#endif // LOAD_BALANCER_SINGLE_INSTANCE_MODE

	return Setup;
}

uint InstanceCullingLoadBalancer_GetNumBatches()
{
	return NumBatches;
}
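
// Illustrative usage sketch (added; not part of the original shader, and compiled out). A consuming kernel is
// typically dispatched with one thread group per batch, calls InstanceCullingLoadBalancer_Setup once per thread and
// then processes instance Item.InstanceDataOffset + LocalItemIndex. The kernel name and OutputBuffer below are
// hypothetical and only show the calling pattern.
#if 0
RWStructuredBuffer<uint> OutputBuffer;

[numthreads(NUM_THREADS_PER_GROUP, 1, 1)]
void ExampleExpandInstancesCS(uint3 GroupId : SV_GroupID, uint GroupThreadIndex : SV_GroupIndex)
{
	FInstanceWorkSetup WorkSetup = InstanceCullingLoadBalancer_Setup(GroupId, GroupThreadIndex, 0U);
	if (!WorkSetup.bValid)
	{
		return;
	}
	// One thread per expanded instance; Payload carries whatever per-item data was packed on the CPU side.
	const uint InstanceDataIndex = WorkSetup.Item.InstanceDataOffset + uint(WorkSetup.LocalItemIndex);
	OutputBuffer[InstanceDataIndex] = WorkSetup.Item.Payload;
}
#endif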