// Copyright Epic Games, Inc. All Rights Reserved.

#include "GraphLoadBalance.h"

#include "Algo/MinElement.h"
#include "Algo/Sort.h"
#include "Algo/Transform.h"
#include "GraphReachability.h"
#include "Math/UnrealMathUtility.h"
#include "Misc/ScopeExit.h"
#include "Packing.h"
#include "SortSpecialCases.h"

namespace Algo::Graph
{

/** Implements ConstructLoadBalance. */
class FLoadBalanceBuilder
{
public:
	FLoadBalanceBuilder(TConstArrayView<TConstArrayView<FVertex>> InGraph, int32 InNumBuckets,
		TConstArrayView<TConstArrayView<FVertex>> InTransposeGraph,
		TConstArrayView<TConstArrayView<FVertex>> InReachabilityGraph,
		TArray<TArray<FVertex>>& OutAssignments, TArray<TArray<FVertex>>& OutRootAssignments);

	void LoadBalance();

private:
	/**
	 * Roots and vertices in a cluster of vertices, which is a transitively-closed set of vertices that should be
	 * assigned to a bucket as a group.
	 *
	 * Initially every cluster is one of the disjoint subgraphs of the input graph, but we might need to split some of
	 * those clusters to balance the buckets.
	 *
	 * Also holds intermediate data used by PackClusters.
	 */
	struct FCluster
	{
		TArray<FVertex> Roots;
		TArray<FVertex> Vertices;
		int32 SplitGroup = INDEX_NONE;
	};

	/** An output bucket of vertices, represented as a collection of clusters. */
	struct FBucket
	{
		TArray<int32> ClusterIndices;
		int32 Size = 0;

		void CalculateSize(TConstArrayView<FCluster> Clusters);
	};

	/** Helper enum for SplitCluster. Types of estimations for the results of merging a root into a cluster. */
	enum class EEstimationType
	{
		Exact,
		TightUpperBound,
		LooseUpperBoundUniformIncrease,
	};

	/**
	 * Helper struct for SplitCluster. Records metrics about the results of a merge between a root's reachable vertices
	 * and a cluster's accumulated vertices. Calculating the merge is too expensive, and even updating estimates for
	 * the merge results for every root after every decision is too expensive, so this struct supports multiple levels
	 * of estimation.
	 */
	struct FRootAndClusterMergeData
	{
		/** The root being merged. Note the cluster being merged is implicit - the owner of this mergedata. */
		FVertex Root = Algo::Graph::InvalidVertex;

		/**
		 * Upper bound for how many vertices will reduce when merging this cluster.
		 * The definition of a vertex reducing is that it exists in both the cluster and the root and will therefore
		 * not cause an increase in cluster size when the two are merged.
		 * This field is an exact value rather than an estimate if bExact is true. It can be modified upwards based
		 * on changes to the size of the cluster.
		 */
		int32 ReductionUpperBound = 0;

		/** Number of vertices in the root being merged. Copied here for spatial locality of the data when sorting. */
		int32 RootSize = 0;

		/** The size the cluster had the last time we estimated or modified ReductionUpperBound. */
		int32 ClusterSizeWhenEstimated = 0;

		/**
		 * The value of ReductionUpperBound the last time we calculated reduction exactly. Used in combination with
		 * the cluster's RootReductions list to recalculate the exact reduction without needing to compare vertices.
		 */
		int32 PreviousExactEstimate = 0;

		/** Whether ReductionUpperBound is known to be exact or is an upper-bound estimate. */
		bool bExact = false;

		/**
		 * Return whether A is a worse merge (i.e. lower reduction value) than B. We sort merges from worst to best
		 * so we can pop the best merge off the back of a TArray without any shifts.
		 */
		static bool IsWorse(FRootAndClusterMergeData& A, FRootAndClusterMergeData& B);
	};

	/** Helper struct for SplitCluster. Data about one of the clusters we are building up by merging root vertices. */
	struct FMergeCluster
	{
		/** Roots that have been merged into the cluster. */
		TArray<FVertex> Roots;

		/** NumVertices-length bitarray specifying which vertices are in the cluster. */
		TBitArray<> VertexInCluster;

		/**
		 * NumVertices-length integer array specifying for each root vertex how many vertices reachable from that root
		 * have been added onto VertexInCluster since the last time RootReductions were consumed. This allows us to
		 * recalculate the number of reductions that will occur when merging the root into the cluster.
		 */
		TArray<int32> RootReductions;

		/**
		 * An array of the mergedatas for merging every remaining root into the cluster.
		 * The array is sorted from worst to best estimated merge results. After each new root is committed we pop the
		 * root's mergedata off the end, and do some work to update the estimates for the remaining roots and restore
		 * the sortedness of the array.
		 */
		TArray<FRootAndClusterMergeData> MergeDatas;

		/** Number of true bits in VertexInCluster, aka the number of vertices in the cluster. */
		int32 VertexInClusterCount = 0;
	};

	/**
	 * Helper struct for SplitCluster. Metrics about the best merge for each cluster. We sort a list of these structs
	 * to find the best merge across all clusters.
	 */
	struct FClusterMergeSelectionData
	{
		/** The root being merged. */
		FVertex Root;

		/** The cluster being merged. */
		FMergeCluster* Cluster;

		/** The exact reduction value - @see FRootAndClusterMergeData for definition of reduction. */
		int32 Reduction;

		/** The post-merge size of the cluster. */
		int32 NewSize;

		/** The spread between the new max-sized cluster and the new min-sized cluster if this merge is selected. */
		int32 Spread;

		/** True iff the new spread is over the maximum allowed and the importance of the overage is not suppressed. */
		bool bOverSpread;

		/** Return whether A is a better merge to select than B. */
		static bool IsBetter(const FClusterMergeSelectionData& A, const FClusterMergeSelectionData& B);
	};

	void FindDisjointSubgraphs(TArray<FCluster>& OutDisjointSubgraphs);
	void FindRoots(TArrayView<FCluster> InOutDisjointSubgraphs);
	void PackClusters(TArray<FCluster>&& Clusters, TArray<FCluster>& OutBuckets);
	/** Merge the clusters listed in each bucket into a single merged cluster for the bucket. */
	void CreateBucketClusters(TConstArrayView<FBucket> Buckets, TConstArrayView<FCluster> Clusters,
		TArray<FCluster>& OutBuckets);
	TArray<FCluster> SplitCluster(FCluster&& InCluster, int32 SplitSize, int32 DesiredSpread);
	/** After growing the cluster, update estimates in the cluster's sorted list of MergeDatas and resort the list. */
	void UpdateClusterMergeDatas(FMergeCluster& Cluster, TBitArray<>& RootAssigned);
	/**
	 * Merges the vertices of the given cluster and root, optionally stores the merged results in cluster, and returns
	 * the number of vertices from root that reduced (were already present in cluster).
	 */
	int32 CalculateMergeResults(FMergeCluster& Cluster, FVertex Root, bool bWriteResultsToCluster);
	/**
	 * Update the estimate for the given mergedata in the given cluster, using the given estimatetype, after growing
	 * the cluster. Report whether the update changed the ReductionUpperBound.
	 */
	void UpdateMergeDataEstimate(FMergeCluster& Cluster, FRootAndClusterMergeData& MergeData,
		EEstimationType EstimationType, bool& bOutEstimateWasModified);

	TConstArrayView<TConstArrayView<FVertex>> Graph;
	TConstArrayView<TConstArrayView<FVertex>> TransposeGraph;
	TConstArrayView<TConstArrayView<FVertex>> ReachabilityGraph;

	/**
	 * Only valid during SplitCluster. List for each vertex of the cluster's roots that have the vertex in their
	 * reachability graph.
	 */
	TArray<TConstArrayView<FVertex>> ReachableByRootGraph;
	TArray64<FVertex> ReachableByRootGraphBuffer;

	/**
	 * Only valid during SplitCluster. Due to memory constraints not all roots can be reported in ReachableByRootGraph.
	 * This NumVertices-length array records whether the given vertex is a root and is in that graph.
	 */
	TBitArray<> RootInReachableByRootSet;

	TBitArray<> VisitedScratch;
	TArray<FVertex> StackScratch;
	TArray<FRootAndClusterMergeData> UpdateMergeScratch;

	TArray<TArray<FVertex>>& Assignments;
	TArray<TArray<FVertex>>& RootAssignments;
	int32 NumBuckets;
	int32 NumVertices;
};

void ConstructLoadBalance(TConstArrayView<TConstArrayView<FVertex>> Graph, int32 NumBuckets,
	TArray<TArray<FVertex>>& OutAssignments)
{
	FLoadBalanceContext Context;
	Context.Graph = Graph;
	Context.NumBuckets = NumBuckets;
	Context.OutAssignments = &OutAssignments;
	ConstructLoadBalance(Context);
}

void ConstructLoadBalance(FLoadBalanceContext& Context)
{
	check(Context.OutAssignments != nullptr);
	check(Context.NumBuckets > 0);

	TArray64<FVertex> ReachabilityGraphEdgesBuffer;
	TArray<TConstArrayView<FVertex>> ReachabilityGraphBuffer;
	TConstArrayView<TConstArrayView<FVertex>> ReachabilityGraph = Context.ReachabilityGraph;
	TArray64<FVertex> TransposeGraphEdgesBuffer;
	TArray<TConstArrayView<FVertex>> TransposeGraphBuffer;
	TConstArrayView<TConstArrayView<FVertex>> TransposeGraph = Context.TransposeGraph;
	TArray<TArray<FVertex>> RootAssignmentsBuffer;
	TArray<TArray<FVertex>>* RootAssignments = Context.OutRootAssignments;

	int32 NumVertices = Context.Graph.Num();
	if (ReachabilityGraph.Num() != NumVertices)
	{
		ConstructReachabilityGraph(Context.Graph, ReachabilityGraphEdgesBuffer, ReachabilityGraphBuffer);
		ReachabilityGraph = ReachabilityGraphBuffer;
	}
	if (TransposeGraph.Num() != NumVertices)
	{
		ConstructTransposeGraph(Context.Graph, TransposeGraphEdgesBuffer, TransposeGraphBuffer);
		TransposeGraph = TransposeGraphBuffer;
	}
	if (RootAssignments == nullptr)
	{
		RootAssignments = &RootAssignmentsBuffer;
	}

	FLoadBalanceBuilder Builder(Context.Graph, Context.NumBuckets, TransposeGraph, ReachabilityGraph,
		*Context.OutAssignments, *RootAssignments);
	Builder.LoadBalance();
}

FLoadBalanceBuilder::FLoadBalanceBuilder(TConstArrayView<TConstArrayView<FVertex>> InGraph, int32 InNumBuckets,
	TConstArrayView<TConstArrayView<FVertex>> InTransposeGraph,
	TConstArrayView<TConstArrayView<FVertex>> InReachabilityGraph,
	TArray<TArray<FVertex>>& OutAssignments, TArray<TArray<FVertex>>& OutRootAssignments)
	: Graph(InGraph)
	, TransposeGraph(InTransposeGraph)
	, ReachabilityGraph(InReachabilityGraph)
	, Assignments(OutAssignments)
	, RootAssignments(OutRootAssignments)
	, NumBuckets(InNumBuckets)
	, NumVertices(InGraph.Num())
{
}

void FLoadBalanceBuilder::LoadBalance()
{
	TArray<FCluster> DisjointSubgraphs;
	FindDisjointSubgraphs(DisjointSubgraphs);
	FindRoots(DisjointSubgraphs);

	TArray<FCluster> Buckets;
	PackClusters(MoveTemp(DisjointSubgraphs), Buckets);

	Assignments.SetNum(NumBuckets, EAllowShrinking::No);
	RootAssignments.SetNum(NumBuckets, EAllowShrinking::No);
	for (int32 BucketIndex = 0; BucketIndex < NumBuckets; ++BucketIndex)
	{
		Assignments[BucketIndex] = MoveTemp(Buckets[BucketIndex].Vertices);
		RootAssignments[BucketIndex] = MoveTemp(Buckets[BucketIndex].Roots);
	}
}

void FLoadBalanceBuilder::FindDisjointSubgraphs(TArray<FCluster>& OutDisjointSubgraphs)
{
	TArray<FCluster>& Subgraphs = OutDisjointSubgraphs;
	Subgraphs.Reset();

	TBitArray<>& Visited = VisitedScratch;
	TArray<FVertex>& Stack = StackScratch;
	Visited.Init(false, NumVertices);
	Stack.Reset(NumVertices);

	// While there are unvisited vertices, graph-search from an arbitrary vertex on the union of Graph and
	// TransposeGraph. All vertices found in that search are a maximal subgraph and one of our disjoint subgraphs.
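	// Illustrative sketch (example vertices are hypothetical): for edges {0 -> 1, 2 -> 3}, the union of Graph and
	// TransposeGraph connects {0, 1} and {2, 3}, so this search produces two subgraphs, each of which becomes an
	// initial cluster for packing.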
	for (FVertex RootVertex = 0; RootVertex < NumVertices; ++RootVertex)
	{
		if (Visited[RootVertex])
		{
			continue;
		}
		check(Stack.IsEmpty());

		FCluster& Subgraph = Subgraphs.Emplace_GetRef();
		Visited[RootVertex] = true;
		Subgraph.Vertices.Add(RootVertex);
		Stack.Add(RootVertex);
		while (!Stack.IsEmpty())
		{
			FVertex Vertex = Stack.Pop(EAllowShrinking::No);
			for (TConstArrayView<FVertex> Edges : { Graph[Vertex], TransposeGraph[Vertex] })
			{
				for (FVertex Edge : Edges)
				{
					if (!Visited[Edge])
					{
						Visited[Edge] = true;
						Subgraph.Vertices.Add(Edge);
						Stack.Add(Edge);
					}
				}
			}
		}
	}

	// Sort the vertices in each Subgraph back into Root to Leaf order
	for (FCluster& Subgraph : Subgraphs)
	{
		Algo::Sort(Subgraph.Vertices);
	}
}

void FLoadBalanceBuilder::FindRoots(TArrayView<FCluster> InOutDisjointSubgraphs)
{
	TBitArray<>& Visited = VisitedScratch;
	// Since the subgraphs are disjoint, we can use the same Visited set for all of them without clearing it
	// between subgraphs, and we can read reachability from the reachability graph for the entire graph.
	Visited.Init(false, NumVertices);
	int32 NumMarkedVertices = 0;

	for (FCluster& Subgraph : InOutDisjointSubgraphs)
	{
		// Roots include all the vertices of the subgraph that do not have any referencers, but they can also include
		// vertices in a cycle, so to find them we have to iteratively subtract reachable vertices from remaining.
		int32 NumSubgraphVertices = Subgraph.Vertices.Num();
		Subgraph.Roots.Reset(NumSubgraphVertices);

		// The vertices are sorted in RootToLeaf order. Iterating from 0 to N-1 and ignoring any reachables from
		// previous roots will mean that each new vertex is a root.
		for (FVertex Root : Subgraph.Vertices)
		{
			if (Visited[Root])
			{
				continue;
			}
			Subgraph.Roots.Add(Root);
			for (FVertex Reachable : ReachabilityGraph[Root])
			{
				if (!Visited[Reachable])
				{
					Visited[Reachable] = true;
					++NumMarkedVertices;
				}
			}
		}
	}
	check(NumMarkedVertices == NumVertices); // The subgraphs should span the graph
}

void FLoadBalanceBuilder::FBucket::CalculateSize(TConstArrayView<FCluster> Clusters)
{
	Size = 0;
	for (int32 ClusterIndex : ClusterIndices)
	{
		Size += Clusters[ClusterIndex].Vertices.Num();
	}
}

void FLoadBalanceBuilder::PackClusters(TArray<FCluster>&& InDisjointSubgraphs, TArray<FCluster>& OutBuckets)
{
	// Our clusters initially are the disjoint subgraphs, but we might split some of the subgraphs into
	// strongly-related but not disjoint Clusters. So for the rest of the algorithm we assume they are generalized
	// clusters and might overlap.
	TArray<FCluster>& Clusters = InDisjointSubgraphs;

	TArray<FBucket> Buckets; // Buckets that we populate by calling ScheduleValues
	Buckets.SetNum(NumBuckets);
	if (NumBuckets < 2)
	{
		Buckets[0].ClusterIndices = Algo::RangeArray<TArray<int32>>(0, Clusters.Num());
		Buckets[0].CalculateSize(Clusters);
		CreateBucketClusters(Buckets, Clusters, OutBuckets);
		return;
	}

	// MaxSpreadToNumVerticesRatio is a tuning variable used to specify how hard we want to look for an optimal
	// solution. When the spread is less than this fraction of NumVertices, we stop looking.
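	// For example, with NumVertices == 1000 and the 0.1 ratio below, the loop accepts any packing whose largest and
	// smallest buckets differ by at most 100 vertices (illustrative numbers, not a recommendation).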
	constexpr double MaxSpreadToNumVerticesRatio = .1;
	int32 MaxSpread = MaxSpreadToNumVerticesRatio * NumVertices;

	TArray<TArray<int32>> BucketsClusterIndices; // Output from Algo::ScheduleValues
	TArray<TArray<int32>> PackExclusionGroups; // Input to Algo::ScheduleValues
	TArray<int32> ClusterCosts; // Input to Algo::ScheduleValues
	Algo::Transform(Clusters, ClusterCosts, [](const FCluster& Cluster) { return Cluster.Vertices.Num(); });

	int32 SplitAttempts = 0;
	for (;;)
	{
		Algo::ScheduleValues(ClusterCosts, NumBuckets, PackExclusionGroups, BucketsClusterIndices);
		for (int32 BucketIndex = 0; BucketIndex < NumBuckets; ++BucketIndex)
		{
			FBucket& Bucket = Buckets[BucketIndex];
			Bucket.ClusterIndices = MoveTemp(BucketsClusterIndices[BucketIndex]);
			Bucket.CalculateSize(Clusters);
		}
		Algo::Sort(Buckets, [](const FBucket& A, const FBucket& B) { return A.Size > B.Size; });

		FBucket& BiggestBucket = Buckets[0];
		int32 Spread = BiggestBucket.Size - Buckets.Last().Size;
		if (Spread <= MaxSpread)
		{
			break;
		}
		if (SplitAttempts >= NumBuckets)
		{
			// Tried too many times; we should only need to split at most one cluster per bucket
			break;
		}

		int32 SplitSize = NumBuckets;
		// We can only split clusters up into roots. Splitting within a root does not help minimize the results because
		// adding the vertex that is not the root to a bucket makes that bucket overlap the bucket containing the root
		// of the vertex without adding any new vertices to the vertices spanned by the pair of buckets.

		// From the clusters assigned to the biggest bucket, pick the smallest one that has >= SplitSize roots.
		// If none have >= SplitSize roots, pick the smallest one with the largest number of roots and clamp the
		// SplitSize to number of roots.
		int32 IndexToSplit = *Algo::MinElement(BiggestBucket.ClusterIndices,
			[&Clusters, SplitSize](int32 A, int32 B)
			{
				FCluster& ClusterA = Clusters[A];
				FCluster& ClusterB = Clusters[B];
				if ((ClusterA.SplitGroup == INDEX_NONE) != (ClusterB.SplitGroup == INDEX_NONE))
				{
					// Clusters resulting from a split are not allowed to be split a second time, because their
					// fragments would reduce when merged into a single cluster in a bucket and the bucket size would
					// not match the size we expected when packing. Push all clusters with a SplitGroup to the back
					// of the sort.
					return ClusterA.SplitGroup == INDEX_NONE;
				}
				bool bAHasEnoughRoots = ClusterA.Roots.Num() >= SplitSize;
				if (bAHasEnoughRoots != (ClusterB.Roots.Num() >= SplitSize))
				{
					return bAHasEnoughRoots;
				}
				if (!bAHasEnoughRoots && ClusterA.Roots.Num() != ClusterB.Roots.Num())
				{
					return ClusterA.Roots.Num() < ClusterB.Roots.Num();
				}
				return ClusterA.Vertices.Num() < ClusterB.Vertices.Num();
			});

		FCluster& ClusterToSplit = Clusters[IndexToSplit];
		if (ClusterToSplit.SplitGroup != INDEX_NONE)
		{
			// All elements were invalid; there is nothing we can split in the biggest bucket, which means there is
			// nothing further we can split to reduce the spread
			break;
		}
		SplitSize = FMath::Min(SplitSize, ClusterToSplit.Roots.Num());
		if (SplitSize < 2)
		{
			// All elements in the biggest bucket had only a single root and are unsplittable; there is nothing further
			// we can split to reduce the spread
			break;
		}

		int32 NewBiggestBucketSize = BiggestBucket.Size - ClusterToSplit.Vertices.Num();
		int32 NewMaximum = FMath::Max(NewBiggestBucketSize, Buckets[1].Size);
		int32 NewMinimum = FMath::Min(NewBiggestBucketSize, Buckets.Last().Size);
		int32 DesiredSpread = NewMaximum - NewMinimum + MaxSpread;
		TArray<FCluster> NewClusters = SplitCluster(MoveTemp(ClusterToSplit), SplitSize, DesiredSpread);

		// Remove the SplitCluster from wherever it is in the middle of Clusters, and shift down by one all of our
		// indices in data that persists between loops that were pointing to clusters at a higher index.
		Clusters.RemoveAt(IndexToSplit);
		ClusterCosts.RemoveAt(IndexToSplit);
		for (TArray<int32>& ExclusionGroup : PackExclusionGroups)
		{
			for (int32& ClusterIndex : ExclusionGroup)
			{
				// We should not have tried to split any cluster in an exclusiongroup
				check(ClusterIndex != IndexToSplit);
				if (ClusterIndex > IndexToSplit)
				{
					--ClusterIndex;
				}
			}
		}

		// SplitClusters from the same original Cluster are not allowed to be assigned to the same
		// bucket, because they would merge and reduce the total size of that bucket and make our
		// packing not balanced, so we create an exclusiongroup for each split.
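		// For example (illustrative numbers): if a 100-vertex cluster is split into fragments of 60 and 55 vertices
		// that share 15 vertices, assigning both fragments to one bucket would contribute only 100 unique vertices
		// rather than the 115 the packer assumed.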
		TArray<int32>& ExclusionGroup = PackExclusionGroups.Emplace_GetRef();
		check(NewClusters.Num() == SplitSize);
		for (FCluster& Cluster : NewClusters)
		{
			ClusterCosts.Add(Cluster.Vertices.Num());
			int32 ClusterIndex = Clusters.Num();
			Cluster.SplitGroup = SplitAttempts;
			ExclusionGroup.Add(ClusterIndex);
			Clusters.Add(MoveTemp(Cluster));
		}
		++SplitAttempts;
	}

	CreateBucketClusters(Buckets, Clusters, OutBuckets);
}

void FLoadBalanceBuilder::CreateBucketClusters(TConstArrayView<FBucket> Buckets, TConstArrayView<FCluster> Clusters,
	TArray<FCluster>& OutBuckets)
{
	OutBuckets.SetNum(NumBuckets, EAllowShrinking::No);
	for (int32 BucketIndex = 0; BucketIndex < NumBuckets; ++BucketIndex)
	{
		const FBucket& Bucket = Buckets[BucketIndex];
		FCluster& OutBucket = OutBuckets[BucketIndex];

		int32 NumRoots = 0;
		for (int32 ClusterIndex : Bucket.ClusterIndices)
		{
			NumRoots += Clusters[ClusterIndex].Roots.Num();
		}
		OutBucket.Vertices.Reset(Bucket.Size);
		OutBucket.Roots.Reset(NumRoots);
		for (int32 ClusterIndex : Bucket.ClusterIndices)
		{
			const FCluster& Cluster = Clusters[ClusterIndex];
			OutBucket.Vertices.Append(Cluster.Vertices);
			OutBucket.Roots.Append(Cluster.Roots);
		}
		Algo::Sort(OutBucket.Vertices);
		OutBucket.Vertices.SetNum(Algo::Unique(OutBucket.Vertices), EAllowShrinking::No);
		Algo::Sort(OutBucket.Roots);
		OutBucket.Roots.SetNum(Algo::Unique(OutBucket.Roots), EAllowShrinking::No);
	}
}

template <typename RangeType, typename ProjectedElementType, typename MinimumsRangeType, typename ProjectionType>
void GetMaxAndMins(RangeType&& Range, ProjectedElementType& OutMaximum, MinimumsRangeType&& InOutMinimums,
	ProjectionType Proj)
{
	int32 NumMinimums = GetNum(InOutMinimums);
	int32 NumRange = GetNum(Range);
	check(NumRange >= NumMinimums && NumMinimums > 0);

	OutMaximum = Proj(Range[0]);
	InOutMinimums[0] = OutMaximum;
	int32 Index;
	for (Index = 1; Index < NumMinimums; ++Index)
	{
		ProjectedElementType Element = Proj(Range[Index]);
		InOutMinimums[Index] = Element;
		OutMaximum = FMath::Max(OutMaximum, Element);
	}
	Algo::Sort(InOutMinimums);
	for (; Index < NumRange; ++Index)
	{
		ProjectedElementType Element = Proj(Range[Index]);
		if (Element < InOutMinimums.Last())
		{
			InOutMinimums.Last() = Element;
			Algo::RestoreSort(InOutMinimums, NumMinimums - 1, TLess<ProjectedElementType>());
		}
		OutMaximum = FMath::Max(OutMaximum, Element);
	}
}

TArray<FLoadBalanceBuilder::FCluster> FLoadBalanceBuilder::SplitCluster(FCluster&& InCluster, int32 SplitSize,
	int32 DesiredSpread)
{
	check(SplitSize >= 2);

	// Create the ReachedByRoot graph edges for all vertices, so we know which other roots to update when the
	// vertices of a root are merged into one of the output clusters. There might be a large number of roots
	// and so the ReachedByRoot graph might be too large to fit in memory; restrict it in size to 1000*NumVertices.
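	// For example (illustrative numbers): with 1,000,000 vertices the cap below allows up to 10^9 transposed
	// reachability edges; any roots that do not fit are left out of RootsInGraph and fall back to the slower
	// CalculateMergeResults path in UpdateMergeDataEstimate.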
	constexpr int32 ReachableByRootEdgesPerVertexLimit = 1000;
	ReachableByRootGraph.Reset();
	ReachableByRootGraphBuffer.Reset();
	TArray<FVertex> RootsInGraph;
	ConstructPartialTransposeGraph(ReachabilityGraph, InCluster.Roots,
		ReachableByRootEdgesPerVertexLimit * NumVertices, ReachableByRootGraphBuffer, ReachableByRootGraph,
		RootsInGraph);
	RootInReachableByRootSet.Init(false, NumVertices);
	for (FVertex Root : RootsInGraph)
	{
		RootInReachableByRootSet[Root] = true;
	}
	ON_SCOPE_EXIT
	{
		ReachableByRootGraph.Empty();
		ReachableByRootGraphBuffer.Empty();
		RootInReachableByRootSet.Empty();
		UpdateMergeScratch.Empty();
	};

	// Create the RootAssigned bit array to specify whether each root is assigned
	TBitArray<> RootAssigned;
	RootAssigned.SetNumUninitialized(NumVertices);
	RootAssigned.SetRange(0, NumVertices, false);
	int32 NumRemainingRoots = InCluster.Roots.Num();

	// Create one MergeCluster for each of the SplitSize output clusters
	int32 NumClusters = SplitSize;
	TArray<FMergeCluster> Clusters;
	Clusters.Reserve(NumClusters);
	for (int32 ClusterIndex = 0; ClusterIndex < NumClusters; ++ClusterIndex)
	{
		FMergeCluster& Cluster = Clusters.Emplace_GetRef();
		Cluster.VertexInCluster.Init(false, NumVertices);
		Cluster.RootReductions.SetNumZeroed(NumVertices);
	}

	// Assign the biggest root to output cluster 0. It's better for the algorithm to know where it's headed so it can
	// prefer to merge in roots that reduce well with that biggest root. We cannot seed the other buckets however,
	// because we don't know which other roots will end up NOT being assigned to the bucket with the biggest root.
	{
		FVertex BestRoot = *Algo::MinElement(InCluster.Roots,
			[this](FVertex A, FVertex B) { return ReachabilityGraph[A].Num() > ReachabilityGraph[B].Num(); });
		check(!RootAssigned[BestRoot]);
		RootAssigned[BestRoot] = true;
		--NumRemainingRoots;
		Clusters[0].Roots.Add(BestRoot);
		CalculateMergeResults(Clusters[0], BestRoot, true /* bWriteResultsToCluster */);
	}

	// Initialize the MergeDatas in each cluster; for better performance do this after assigning the initial seed.
	for (FMergeCluster& Cluster : Clusters)
	{
		TArray<FRootAndClusterMergeData>& MergeDatas = Cluster.MergeDatas;
		MergeDatas.Reserve(NumRemainingRoots);
		for (FVertex Root : InCluster.Roots)
		{
			if (!RootAssigned[Root])
			{
				FRootAndClusterMergeData& MergeData = MergeDatas.Emplace_GetRef();
				MergeData.Root = Root;
				MergeData.RootSize = ReachabilityGraph[Root].Num();
				bool bModified;
				UpdateMergeDataEstimate(Cluster, MergeData, EEstimationType::TightUpperBound, bModified);
			}
		}
		Algo::Sort(MergeDatas, FRootAndClusterMergeData::IsWorse);
	}

	// The main loop: on each iteration of the loop find the best root to move into the best cluster,
	// where best is defined by maximizing reduction and minimizing spread. Assign that best root to the best cluster.
	int32 BalancedAmountPerCluster = InCluster.Vertices.Num() / NumClusters;
	TArray<FClusterMergeSelectionData> SelectionDatas;
	while (NumRemainingRoots > 0)
	{
		for (FMergeCluster& Cluster : Clusters)
		{
			// We need to update every Cluster's MergeDatas on every loop, rather than just the one that won the last
			// loop, in case the best root in a non-winning cluster was the same as the winner's.
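			// If the previous winner's root was also this cluster's best candidate, the update below pops it and
			// re-establishes an exact reduction value for this cluster's new best candidate.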
			UpdateClusterMergeDatas(Cluster, RootAssigned);
		}

		int32 MaxClusterSize;
		int32 MinClusterSizes[2];
		GetMaxAndMins(Clusters, MaxClusterSize, TArrayView<int32>(MinClusterSizes),
			[&Clusters](const FMergeCluster& Cluster) { return Cluster.VertexInClusterCount; });

		SelectionDatas.Reset();
		for (FMergeCluster& Cluster : Clusters)
		{
			FRootAndClusterMergeData& MergeData = Cluster.MergeDatas.Last();
			FClusterMergeSelectionData& SelectionData = SelectionDatas.Emplace_GetRef();
			SelectionData.Root = MergeData.Root;
			SelectionData.Cluster = &Cluster;
			int32 OldSize = Cluster.VertexInClusterCount;
			check(MergeData.bExact && MergeData.ClusterSizeWhenEstimated == OldSize);
			SelectionData.Reduction = MergeData.ReductionUpperBound;
			SelectionData.NewSize = OldSize + MergeData.RootSize - SelectionData.Reduction;

			int32 NewMaximum = FMath::Max(MaxClusterSize, SelectionData.NewSize);
			int32 NewMinimum;
			if (OldSize == MinClusterSizes[0])
			{
				NewMinimum = FMath::Min(SelectionData.NewSize, MinClusterSizes[1]);
			}
			else
			{
				NewMinimum = MinClusterSizes[0];
			}
			SelectionData.Spread = NewMaximum - NewMinimum;
			bool bCausedSpread = NewMaximum > MaxClusterSize;
			bool bSpreadIsAProblem = NewMaximum > BalancedAmountPerCluster;
			bool bSpreadIsOver = SelectionData.Spread > DesiredSpread;
			SelectionData.bOverSpread = bCausedSpread && bSpreadIsAProblem && bSpreadIsOver;
		}

		FClusterMergeSelectionData* BestData = Algo::MinElement(SelectionDatas, FClusterMergeSelectionData::IsBetter);
		FMergeCluster& BestCluster = *BestData->Cluster;
		int32 Reduction = CalculateMergeResults(BestCluster, BestData->Root, true /* bWriteResultsToCluster */);
		check(Reduction == BestData->Reduction);
		BestCluster.Roots.Add(BestData->Root);
		check(!RootAssigned[BestData->Root]);
		RootAssigned[BestData->Root] = true;
		--NumRemainingRoots;
	}

	TArray<FCluster> OutClusters;
	OutClusters.Reserve(Clusters.Num());
	for (FMergeCluster& Cluster : Clusters)
	{
		FCluster& OutCluster = OutClusters.Emplace_GetRef();
		OutCluster.Vertices.Empty(Cluster.VertexInClusterCount);
		for (FVertex Vertex = 0; Vertex < NumVertices; ++Vertex)
		{
			if (Cluster.VertexInCluster[Vertex])
			{
				OutCluster.Vertices.Add(Vertex);
			}
		}
		OutCluster.Roots = MoveTemp(Cluster.Roots);
	}
	return OutClusters;
}

void FLoadBalanceBuilder::UpdateClusterMergeDatas(FMergeCluster& Cluster, TBitArray<>& RootAssigned)
{
	TArray<FRootAndClusterMergeData>& MergeDatas = Cluster.MergeDatas;
	check(!MergeDatas.IsEmpty());
	int32 OriginalMergeDataNum = MergeDatas.Num();
	int32 RemovedMergedDataNum = 0;

	// Pop mergedatas for completed roots off the back of the list as we encounter them.
	// Set BestMergeData to the first mergedata found for a remaining root.
	FRootAndClusterMergeData BestMergeData = MergeDatas.Pop(EAllowShrinking::No);
	while (RootAssigned[BestMergeData.Root])
	{
		++RemovedMergedDataNum;
		check(!MergeDatas.IsEmpty()); // We only update when there are remaining roots, so it should never be empty
		BestMergeData = MergeDatas.Pop(EAllowShrinking::No);
	}

	// Update the mergedata to get the exact value; usually this will be less than the upperbound and we need to
	// compare exact values when choosing the best. The mergedata at the back of the list is the estimated best and
	// is likely to be the actual best, so it's a good first mergedata to make exact.
	bool bModified;
	UpdateMergeDataEstimate(Cluster, BestMergeData, EEstimationType::Exact, bModified);

	// Collect all the mergedatas from the back of the list until we reach a point in the list where the
	// (uniform-upperbound) estimates for all remaining mergedatas are worse than the (tighter-bound) estimate for
	// the worst merge we've found so far.
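	// For example (illustrative numbers): if the worst reduction collected so far is 10 and the loose
	// uniform-increase upper bound at the back of the remaining list is 8, then every remaining entry's exact
	// reduction is at most 8, so none of them can be the best merge and the scan can stop.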
	UpdateMergeScratch.Reset(MergeDatas.Num());
	TOptional<FRootAndClusterMergeData> WorstMergeData;
	for (;;)
	{
		// Continue to pop off mergedatas for completed roots as we find them.
		while (!MergeDatas.IsEmpty() && RootAssigned[MergeDatas.Last().Root])
		{
			++RemovedMergedDataNum;
			MergeDatas.Pop(EAllowShrinking::No);
		}
		if (MergeDatas.IsEmpty())
		{
			// This happens e.g. when only one root remains
			break;
		}

		// Compare the new end of the list. Update its estimate using LooseUpperBoundUniformIncrease.
		// (1) This is the most conservative estimate, so if it's worse than any of the estimates or exact values we've
		// found so far, we know its tighter estimate will also be worse.
		// (2) This is a uniform estimate: it raises all up-to-date estimate types of all roots by the same amount.
		// So since the roots from this point and earlier in the list were previously sorted by their estimate,
		// the sort order will be unchanged even if we bring them all up-to-date with LooseUpperBoundUniformIncrease
		// estimation, and further, their non-up-to-date value will be <= their up-to-date value.
		//
		// These two conditions are sufficient to guarantee that we don't have to inspect mergedatas earlier in the
		// list to correctly find the best merge or to resort the list once the end of the list has a worse estimate
		// than our current worst.
		{
			FRootAndClusterMergeData& CompareMergeData = MergeDatas.Last();
			FRootAndClusterMergeData* WorstMergeDataPtr =
				WorstMergeData.IsSet() ? &WorstMergeData.GetValue() : &BestMergeData;
			UpdateMergeDataEstimate(Cluster, CompareMergeData, EEstimationType::LooseUpperBoundUniformIncrease,
				bModified);
			if (!FRootAndClusterMergeData::IsWorse(*WorstMergeDataPtr, CompareMergeData))
			{
				break;
			}
		}

		// NewMergeData's most-conservative estimate is as good or better than the worst estimate we've found
		// so far. Sort it into the (partially sorted) list of mergedatas we're collecting: either it's the best,
		// or the worst, or it's in the otherwise-unsorted middle.
		// Sorting the middle at the end is faster than sorting it as we go because it avoids shifts.
		// NewMergeData doesn't get to count itself as the best unless its estimate has been made exact, but don't
		// spend time making the estimate exact unless we have to.
		FRootAndClusterMergeData NewMergeData = MergeDatas.Pop(EAllowShrinking::No);
		bool bIsNewBest = false;
		bool bIsNewWorst = false;
		// If WorstMergeData is not set then we already compared NewMergeData to BestMergeData above and do not
		// need to redo the comparison
		if (!WorstMergeData.IsSet() || FRootAndClusterMergeData::IsWorse(BestMergeData, NewMergeData))
		{
			// Change to the tighter estimate and recompare
			UpdateMergeDataEstimate(Cluster, NewMergeData, EEstimationType::TightUpperBound, bModified);
			// If the NewMergeData's value was not modified when we changed to a tighter estimate, then we
			// know it is still better than the BestMergeData and we do not need to compare again
			if (!bModified || FRootAndClusterMergeData::IsWorse(BestMergeData, NewMergeData))
			{
				// Change to the tightest estimate (the exact value) and recompare
				UpdateMergeDataEstimate(Cluster, NewMergeData, EEstimationType::Exact, bModified);
				bIsNewBest = !bModified || FRootAndClusterMergeData::IsWorse(BestMergeData, NewMergeData);
			}

			// NewMergeData was previously better than the worst mergedata, but we changed it to a TightUpperBound
			// or Exact value so it might be worse now. It can't be worse than the worst if it's better than the best.
			bIsNewWorst = !bIsNewBest &&
				(!WorstMergeData.IsSet() || FRootAndClusterMergeData::IsWorse(NewMergeData, *WorstMergeData));
		}

		if (bIsNewBest)
		{
			if (!WorstMergeData.IsSet())
			{
				WorstMergeData.Emplace(MoveTemp(BestMergeData));
			}
			else
			{
				UpdateMergeScratch.Add(MoveTemp(BestMergeData));
			}
			BestMergeData = MoveTemp(NewMergeData);
		}
		else if (bIsNewWorst)
		{
			if (WorstMergeData.IsSet())
			{
				UpdateMergeScratch.Add(MoveTemp(*WorstMergeData));
			}
			WorstMergeData.Emplace(MoveTemp(NewMergeData));
		}
		else
		{
			UpdateMergeScratch.Add(MoveTemp(NewMergeData));
		}
	}

	// Push all the mergedatas we pulled off the list back onto the list, sorted from worst to best
	if (WorstMergeData.IsSet())
	{
		MergeDatas.Add(*WorstMergeData);
	}
	if (!UpdateMergeScratch.IsEmpty())
	{
		Algo::Sort(UpdateMergeScratch, FRootAndClusterMergeData::IsWorse);
		for (FRootAndClusterMergeData& Updated : UpdateMergeScratch)
		{
			MergeDatas.Add(MoveTemp(Updated));
		}
		UpdateMergeScratch.Reset();
	}
	MergeDatas.Add(MoveTemp(BestMergeData));
	check(MergeDatas.Num() == OriginalMergeDataNum - RemovedMergedDataNum);
}

int32 FLoadBalanceBuilder::CalculateMergeResults(FMergeCluster& Cluster, FVertex Root, bool bWriteResultsToCluster)
{
	int32 Reduction = 0;
	if (bWriteResultsToCluster)
	{
		TArray<int32>& RootReductions = Cluster.RootReductions;
		for (FVertex Reachable : ReachabilityGraph[Root])
		{
			if (!Cluster.VertexInCluster[Reachable])
			{
				Cluster.VertexInCluster[Reachable] = true;
				// When we add a vertex to the cluster, we need to inform every remaining root in the cluster that has
				// that vertex in its reachability set that the number of reductions it will have when merged into the
				// cluster has increased by one.
				for (FVertex ReachedByRoot : ReachableByRootGraph[Reachable])
				{
					++RootReductions[ReachedByRoot];
				}
			}
			else
			{
				++Reduction;
			}
		}
		Cluster.VertexInClusterCount += ReachabilityGraph[Root].Num() - Reduction;
	}
	else
	{
		for (FVertex Reachable : ReachabilityGraph[Root])
		{
			Reduction += Cluster.VertexInCluster[Reachable] ? 1 : 0;
		}
	}
	return Reduction;
}

void FLoadBalanceBuilder::UpdateMergeDataEstimate(FMergeCluster& Cluster, FRootAndClusterMergeData& MergeData,
	EEstimationType EstimationType, bool& bOutEstimateWasModified)
{
	int32 ClusterSize = Cluster.VertexInClusterCount;
	MergeData.bExact = MergeData.bExact & (ClusterSize == MergeData.ClusterSizeWhenEstimated);
	if (MergeData.bExact)
	{
		bOutEstimateWasModified = false;
		return;
	}

	int32 PreviousReductionUpperBound = MergeData.ReductionUpperBound;
	if (EstimationType == EEstimationType::Exact ||
		(EstimationType != EEstimationType::LooseUpperBoundUniformIncrease && RootInReachableByRootSet[MergeData.Root]))
	{
		if (RootInReachableByRootSet[MergeData.Root])
		{
			// We have data for every vertex about whether it is reachable by the root, and we use that data
			// every time we merge a vertex into a cluster, to increment the reduction count of every root
			// that includes the vertex.
			// This allows this function to do a cheap update of the exact value of the root's reduction:
			// we just consume the recorded delta reduction value of this root and add it to the reduction it
			// had the last time we updated it.
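			// For example (illustrative numbers): if PreviousExactEstimate was 5 and three of this root's reachable
			// vertices have been added to the cluster since that calculation, RootReductions[MergeData.Root] holds 3
			// and the new exact reduction is 8.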
			TArray<int32>& RootReductions = Cluster.RootReductions;
			MergeData.ReductionUpperBound = MergeData.PreviousExactEstimate + RootReductions[MergeData.Root];
			RootReductions[MergeData.Root] = 0;
		}
		else
		{
			// We don't have the reachedby data for this root, so we have to do the slow calculation of the
			// merge results
			MergeData.ReductionUpperBound =
				CalculateMergeResults(Cluster, MergeData.Root, false /* bWriteResultsToCluster */);
		}
		MergeData.PreviousExactEstimate = MergeData.ReductionUpperBound;
		MergeData.bExact = true;
	}
	else
	{
		int32 LooseEstimate = MergeData.ReductionUpperBound + ClusterSize - MergeData.ClusterSizeWhenEstimated;
		if (EstimationType == EEstimationType::TightUpperBound)
		{
			// TightUpperBound assumes all new vertices in the cluster will reduce with this one,
			// but notes that the number of reductions is <= size of this root
			MergeData.ReductionUpperBound = FMath::Min(LooseEstimate, MergeData.RootSize);
		}
		else
		{
			check(EstimationType == EEstimationType::LooseUpperBoundUniformIncrease);
			// LooseUpperBoundUniformIncrease is the same as TightUpperBound, but without the clamping.
			// It has two properties useful for our sorted array:
			// 1) UpperBoundUniformIncrease(CurrentCluster) >= UpperBoundUniformIncrease(PreviousSmallerCluster)
			//    for all MergeDatas and all growth from PreviousSmallerCluster to CurrentCluster
			// 2) UpperBoundUniformIncrease(CurrentCluster, MergeDataA)
			//        - UpperBoundUniformIncrease(PreviousSmallerCluster, MergeDataA) ==
			//    UpperBoundUniformIncrease(CurrentCluster, MergeDataB)
			//        - UpperBoundUniformIncrease(PreviousSmallerCluster, MergeDataB)
			//    for all MergeDataA, MergeDataB
			// These two properties allow us to know we do not need to investigate MergeDatas with an
			// UpperBoundUniformIncrease estimate worse than our current worst estimate.
			MergeData.ReductionUpperBound = LooseEstimate;
		}
	}
	MergeData.ClusterSizeWhenEstimated = ClusterSize;
	bOutEstimateWasModified = MergeData.ReductionUpperBound != PreviousReductionUpperBound;
}

bool FLoadBalanceBuilder::FRootAndClusterMergeData::IsWorse(FRootAndClusterMergeData& A, FRootAndClusterMergeData& B)
{
	if (A.ReductionUpperBound != B.ReductionUpperBound)
	{
		// First maximize reduction, aka maximize sharing
		return A.ReductionUpperBound < B.ReductionUpperBound;
	}
	if (A.RootSize != B.RootSize)
	{
		// If the reduction is the same, prefer to merge the smaller root; if we can reduce
		// a smaller root by the same amount as a larger root, we should take the smaller root because it
		// adds fewer unreduced vertices
		return A.RootSize > B.RootSize;
	}
	// Prefer earlier root values if all else is equal.
	// TODO: This improves the splitting, I'm not sure why.
	return A.Root > B.Root;
}

bool FLoadBalanceBuilder::FClusterMergeSelectionData::IsBetter(const FClusterMergeSelectionData& A,
	const FClusterMergeSelectionData& B)
{
	if (A.bOverSpread != B.bOverSpread)
	{
		// If a merge goes over the spread, don't use it unless all merges go over the spread
		return !A.bOverSpread;
	}
	if (A.bOverSpread && A.Spread != B.Spread)
	{
		// If all merges go over the spread, pick the one that minimizes the resultant spread
		return A.Spread < B.Spread;
	}
	if (A.Reduction != B.Reduction)
	{
		// When spread is not a factor, we want to maximize reduction, aka maximize sharing
		return A.Reduction > B.Reduction;
	}
	// If the reduction is the same, prefer the merge with the smaller final size; if we can reduce
	// a smaller root by the same amount as a larger root, we should take the smaller root because it
	// adds fewer unreduced vertices
	return A.NewSize < B.NewSize;
}

} // namespace Algo::Graph