Files
UnrealEngine/Engine/Source/Editor/UnrealEd/Private/Cooker/CookDirector.cpp
2025-05-18 13:04:45 +08:00

2415 lines
79 KiB
C++

// Copyright Epic Games, Inc. All Rights Reserved.
#include "CookDirector.h"
#include "Async/Fundamental/Scheduler.h"
#include "CompactBinaryTCP.h"
#include "Cooker/CookGenerationHelper.h"
#include "Cooker/CookPackageData.h"
#include "Cooker/CookPlatformManager.h"
#include "Cooker/CookWorkerServer.h"
#include "CookOnTheSide/CookOnTheFlyServer.h"
#include "CoreGlobals.h"
#include "GenericPlatform/GenericPlatformOutputDevices.h"
#include "LoadBalanceCookBurden.h"
#include "HAL/PlatformMisc.h"
#include "HAL/PlatformTime.h"
#include "HAL/RunnableThread.h"
#include "Math/NumericLimits.h"
#include "Misc/CommandLine.h"
#include "Misc/ConfigCacheIni.h"
#include "Misc/PathViews.h"
#include "PackageTracker.h"
#include "Serialization/CompactBinary.h"
#include "Serialization/CompactBinaryWriter.h"
#include "ShaderCompiler.h"
#include "Sockets.h"
#include "SocketSubsystem.h"
#include "String/ParseTokens.h"
#include "UnrealEdMisc.h"
extern CORE_API int32 GNumForegroundWorkers; // TaskGraph.cpp
LLM_DEFINE_TAG(Cooker_MPCook);
namespace UE::Cook
{
// NOTE(review): not referenced in the visible part of this file. Presumably the minimum number of
// assignments a worker must hold before the retraction logic considers taking packages from it -- confirm
// at the use site.
constexpr int32 RetractionMinimumNumAssignments = 100;
/** Per-CookWorker profiling counters that are accumulated on the Director. */
struct FCookWorkerProfileData
{
	/** Total time, in seconds, accumulated across ticks in which the worker was idle both before and now. */
	float IdleTimeSeconds = 0.f;
	/** Idle state recorded by the most recent UpdateIdle call; workers start out idle. */
	bool bIsIdle = true;

	/**
	 * Record the worker's idle state for this tick.
	 * DeltaTime is only added to IdleTimeSeconds when the worker was already idle on the previous
	 * update and is still idle now.
	 */
	void UpdateIdle(bool bInIsIdle, float DeltaTime)
	{
		const bool bWasIdle = bIsIdle;
		bIsIdle = bInIsIdle;
		if (bWasIdle && bInIsIdle)
		{
			IdleTimeSeconds += DeltaTime;
		}
	}
};
/**
* A class that has an instance active while we need to handle retraction of assigned results from a CookWorker.
* Keeps track of the expected message coming back from the remote worker, prevents repeatedly sending messages, gives
* a warning if the remote worker does not respond.
*/
class FCookDirector::FRetractionHandler
{
public:
/** Bind the handler to the director that owns it; the reference is stored in the Director member. */
FRetractionHandler(FCookDirector& InDirector);
/** Initialize to search idle and busy workers to send a RetractionRequestMessage. */
void Initialize();
/** Initialize to handle an unexpected RetractionResultsMessage. */
void InitializeForResultsMessage(const FWorkerId& FromWorker);
/**
* Tick the retraction state machine. FCookDirector::TickFromSchedulerThread calls this while holding
* CommunicationLock, passing the connection/idle/assignment summary it computed for the current tick.
*/
void TickFromSchedulerThread(bool bAllWorkersConnected, bool bAnyIdle, int32 BusiestNumAssignments);
/** Hook called by the director when a retraction message comes in. */
void HandleRetractionMessage(FMPCollectorServerMessageContext& Context, bool bReadSuccessful,
FRetractionResultsMessage&& Message);
private:
/** States of the retraction state machine; Count is the number of real states, not a state itself. */
enum class ERetractionState : uint8
{
Idle,
WantToRetract,
WaitingForResponse,
Count,
};
/** Outcome of an attempt to reassign retracted packages. */
enum class ERetractionResult : uint8
{
NoneAvailable,
Retracted
};
private:
/** Try to select a worker for retraction */
ERetractionState TickWantToRetract(bool& bOutAnyIdle, int32& OutBusiestNumAssignments);
/** Tick the asynchronous wait for the message to come in, and synchronously handle it when it does. */
ERetractionState TickWaitingForResponse();
/**
* Pick workers to give the retracted packages to, and assign those packages to the worker
* in the local and remote state.
*/
ERetractionResult ReassignPackages(const FWorkerId& WorkerId, TConstArrayView<FPackageData*> Packages);
/** Pick workers to give the retracted packages to. */
TArray<FWorkerId> CalculateWorkersToSplitOver(int32 NumPackages, const FWorkerId& FromWorker,
TConstArrayView<TRefCountPtr<FCookWorkerServer>> LocalRemoteWorkers);
/** Move to NewState; bOutHadStateChange is an output flag (presumably set when the state changed -- confirm in the definition). */
void SetRetractionState(ERetractionState NewState, bool& bOutHadStateChange);
/** NOTE(review): definition not visible here; presumably consults WorkersUnavailableForRetract -- confirm. */
bool IsAvailableForRetraction(const FWorkerId& WorkerId);
private:
/** The director that owns this handler. */
FCookDirector& Director;
/** NOTE(review): presumably the worker we sent a retraction request to and expect a reply from -- confirm. */
FWorkerId ExpectedWorker;
/** Package names keyed by worker. NOTE(review): presumably filled from incoming retraction results -- confirm. */
TMap<FWorkerId, TArray<FName>> PackagesToRetract;
/** Per-worker integer bookkeeping used to exclude workers from retraction; meaning of the value is not visible here. */
TMap<FWorkerId, int32> WorkersUnavailableForRetract;
/** NOTE(review): presumably the worker whose results message is pending handling -- confirm. */
FWorkerId WorkerWithResults;
/** Timestamps in seconds; used for the "worker does not respond" warning described in the class comment (presumed). */
double MessageSentTimeSeconds = 0.;
double LastWarnTimeSeconds = 0.;
/** Current state of the retraction state machine; starts Idle. */
ERetractionState RetractionState = ERetractionState::Idle;
};
/**
* Construct the director for a multiprocess cook.
* Parses the commandline configuration, verifies that sockets are available, and registers the built-in
* message collectors. On a configuration or socket failure an error is logged, bMultiprocessAvailable is
* set to false and construction returns early (no collectors are registered in that case).
*
* @param InCOTFS The cook server that owns this director.
* @param CookProcessCount Total number of cook processes including the director; must be > 1.
* @param bInCookProcessCountSetByCommandLine Stored for later use; not read in this constructor.
*/
FCookDirector::FCookDirector(UCookOnTheFlyServer& InCOTFS, int32 CookProcessCount,
bool bInCookProcessCountSetByCommandLine)
: RunnableShunt(*this)
, COTFS(InCOTFS)
, bCookProcessCountSetByCommandLine(bInCookProcessCountSetByCommandLine)
{
check(CookProcessCount > 1);
// MAX_flt marks the stall timers as "not currently stalled / never warned".
WorkersStalledStartTimeSeconds = MAX_flt;
WorkersStalledWarnTimeSeconds = MAX_flt;
ShutdownEvent->Reset();
LocalWorkerProfileData = MakeUnique<FCookWorkerProfileData>();
RetractionHandler = MakeUnique<FRetractionHandler>(*this);
bool bConfigValid;
ParseConfig(CookProcessCount, bConfigValid);
if (!bConfigValid)
{
UE_LOG(LogCook, Error,
TEXT("CookDirector initialization failure: config settings are invalid for multiprocess. CookMultiprocess is disabled and the cooker is running as a single process."));
bMultiprocessAvailable = false;
return;
}
ISocketSubsystem* SocketSubsystem = ISocketSubsystem::Get();
if (!SocketSubsystem)
{
UE_LOG(LogCook, Error,
TEXT("CookDirector initialization failure: platform does not support network sockets. CookMultiprocess is disabled and the cooker is running as a single process."));
bMultiprocessAvailable = false;
return;
}
bMultiprocessAvailable = true;
// Register the collectors that handle messages arriving from CookWorkers.
Register(new FLogMessagesMessageHandler(*COTFS.LogHandler));
Register(new TMPCollectorServerMessageCallback<FRetractionResultsMessage>([this]
(FMPCollectorServerMessageContext& Context, bool bReadSuccessful, FRetractionResultsMessage&& Message)
{
// Called from inside CommunicationLock
RetractionHandler->HandleRetractionMessage(Context, bReadSuccessful, MoveTemp(Message));
}));
Register(new TMPCollectorServerMessageCallback<FHeartbeatMessage>([this]
(FMPCollectorServerMessageContext& Context, bool bReadSuccessful, FHeartbeatMessage&& Message)
{
HandleHeartbeatMessage(Context, bReadSuccessful, MoveTemp(Message));
}));
Register(new FAssetRegistryMPCollector(COTFS));
Register(new FPackageWriterMPCollector(COTFS));
// Baseline for the DeltaTime computed in TickFromSchedulerThread.
LastTickTimeSeconds = FPlatformTime::Seconds();
#if ENABLE_COOK_STATS
// Removed again in the destructor via RemoveAll(this).
FCookStatsManager::CookStatsCallbacks.AddRaw(this, &FCookDirector::LogCookStats);
#endif
}
/**
* Report whether construction succeeded in enabling multiprocess cooking.
* The constructor sets the flag to false when the config is invalid or the platform has no socket subsystem.
*/
bool FCookDirector::IsMultiprocessAvailable() const
{
return bMultiprocessAvailable;
}
/**
 * Read the multiprocess cook settings from the process commandline into member variables.
 *
 * @param CookProcessCount Total number of cook processes including the director; the number of requested
 *        CookWorkers is one less than this.
 * @param bOutValid Set to false if the settings can not be used for a multiprocess cook
 *        (currently: -MultiprocessId was specified for the director), true otherwise.
 */
void FCookDirector::ParseConfig(int32 CookProcessCount, bool& bOutValid)
{
	bOutValid = true;
	const TCHAR* CmdLine = FCommandLine::Get();

	// CookWorkerCount
	RequestedCookWorkerCount = CookProcessCount - 1;
	check(RequestedCookWorkerCount > 0);

	// CookDirectorListenPort: the default is kept when the switch is absent
	WorkerConnectPort = Sockets::COOKDIRECTOR_DEFAULT_REQUEST_CONNECTION_PORT;
	FParse::Value(CmdLine, TEXT("-CookDirectorListenPort="), WorkerConnectPort);

	// ShowCookWorker: "-ShowCookWorker=<mode>" selects a mode; a bare "-ShowCookWorker" means SeparateWindows
	FString ShowWorkerText;
	if (!FParse::Value(CmdLine, TEXT("-ShowCookWorker="), ShowWorkerText)
		&& FParse::Param(CmdLine, TEXT("ShowCookWorker")))
	{
		ShowWorkerText = TEXT("SeparateWindows");
	}
	if (ShowWorkerText == TEXT("SeparateLogs"))
	{
		ShowWorkerOption = EShowWorker::SeparateLogs;
	}
	else if (ShowWorkerText == TEXT("SeparateWindows"))
	{
		ShowWorkerOption = EShowWorker::SeparateWindows;
	}
	else
	{
		// CombinedLogs is both an explicit choice and the fallback; only warn for unrecognized text
		if (!ShowWorkerText.IsEmpty() && ShowWorkerText != TEXT("CombinedLogs"))
		{
			UE_LOG(LogCook, Warning, TEXT("Invalid selection \"%s\" for -ShowCookWorker."), *ShowWorkerText);
		}
		ShowWorkerOption = EShowWorker::CombinedLogs;
	}

	// LoadBalanceAlgorithm: CookBurden unless a valid alternative is given
	LoadBalanceAlgorithm = ELoadBalanceAlgorithm::CookBurden;
	FString LoadBalanceText;
	if (FParse::Value(CmdLine, TEXT("-CookLoadBalance="), LoadBalanceText))
	{
		if (LoadBalanceText == TEXT("Striped"))
		{
			LoadBalanceAlgorithm = ELoadBalanceAlgorithm::Striped;
		}
		else if (LoadBalanceText == TEXT("CookBurden"))
		{
			LoadBalanceAlgorithm = ELoadBalanceAlgorithm::CookBurden;
		}
		else
		{
			UE_LOG(LogCook, Warning, TEXT("Invalid selection \"%s\" for -CookLoadBalance."), *LoadBalanceText);
		}
	}

	bAllowLocalCooks = !FParse::Param(CmdLine, TEXT("CookForceRemote"));

	// The director hands out MultiprocessIds itself, so it must not have been given one
	if (UE::GetMultiprocessId() != 0)
	{
		bOutValid = false;
		UE_LOG(LogCook, Error,
			TEXT("CookMultiprocess is incompatible with -MultiprocessId on the CookDirector's commandline. The CookDirector needs to be able to specify all MultiprocessIds."));
	}
}
/**
 * Stop the communication thread, abort every remote worker, and return the packages that were assigned
 * to those workers to a state the local cooker can continue from.
 */
FCookDirector::~FCookDirector()
{
	StopCommunicationThread();
#if ENABLE_COOK_STATS
	FCookStatsManager::CookStatsCallbacks.RemoveAll(this);
#endif

	// Collect the packages that were still assigned to any worker while aborting the workers
	TSet<FPackageData*> PackagesToRestore;
	for (TPair<int32, TRefCountPtr<FCookWorkerServer>>& WorkerPair : RemoteWorkers)
	{
		WorkerPair.Value->AbortWorker(PackagesToRestore, ECookDirectorThread::SchedulerThread, HeartbeatNumber);
	}

	for (FPackageData* Package : PackagesToRestore)
	{
		// Packages that were assigned to workers should be in an AssignedToWorker state and
		// therefore should be InProgress.
		check(Package->IsInProgress());
		Package->SetWorkerAssignment(FWorkerId::Invalid(), ESendFlags::QueueNone);

		EPackageState RestoredState = EPackageState::Request;
		if (Package->IsInStateProperty(EPackageStateProperty::Saving))
		{
			RestoredState = EPackageState::SaveActive;
		}
		Package->SendToState(RestoredState, ESendFlags::QueueAddAndRemove, EStateChangeReason::CookerShutdown);
	}

	RemoteWorkers.Empty();
	RemoteWorkerProfileDatas.Empty();
	PendingConnections.Empty();
	Sockets::CloseSocket(WorkerConnectSocket);
}
/**
 * Start the thread that runs RunCommunicationThread via RunnableShunt.
 * Does nothing if the thread already exists or the platform does not support multithreading.
 */
void FCookDirector::LaunchCommunicationThread()
{
	if (CommunicationThread || !FPlatformProcess::SupportsMultithreading())
	{
		return;
	}
	CommunicationThread = FRunnableThread::Create(&RunnableShunt, TEXT("FCookDirector"), 0, TPri_Normal);
}
/**
* Signal the communication thread to exit, wait for it to finish, and destroy it.
* Safe to call when no thread was launched; ShutdownEvent is reset afterwards in either case.
*/
void FCookDirector::StopCommunicationThread()
{
// RunCommunicationThread leaves its loop when it observes this event.
ShutdownEvent->Trigger();
if (CommunicationThread)
{
CommunicationThread->WaitForCompletion();
delete CommunicationThread;
CommunicationThread = nullptr;
}
// Reset so that a thread launched later does not see the stale trigger.
ShutdownEvent->Reset();
}
/**
 * Body of the communication thread: calls TickCommunication roughly once per TickPeriod until
 * ShutdownEvent is triggered.
 *
 * ShutdownEvent is consulted after every tick, including ticks that overran the period, so that a
 * sequence of slow ticks can not prevent StopCommunicationThread from completing.
 *
 * @return Always 0.
 */
uint32 FCookDirector::RunCommunicationThread()
{
	constexpr double TickPeriod = 1.;
	constexpr double MinSleepTime = 0.001;
	for (;;)
	{
		const double StartTime = FPlatformTime::Seconds();
		TickCommunication(ECookDirectorThread::CommunicateThread);

		const double RemainingDuration = StartTime + TickPeriod - FPlatformTime::Seconds();
		// When less than MinSleepTime of the period remains, wait 0ms: that polls the event without
		// sleeping, so a pending shutdown is still noticed before the next tick starts.
		const uint32 WaitTimeMilliseconds = RemainingDuration > MinSleepTime
			? static_cast<uint32>(RemainingDuration * 1000)
			: 0u;
		if (ShutdownEvent->Wait(WaitTimeMilliseconds))
		{
			break;
		}
	}
	return 0;
}
/** Thread entry point: forwards to the owning director's RunCommunicationThread and returns its result. */
uint32 FCookDirector::FRunnableShunt::Run()
{
return Director.RunCommunicationThread();
}
/** Request that the thread exit by triggering the director's ShutdownEvent, which RunCommunicationThread waits on. */
void FCookDirector::FRunnableShunt::Stop()
{
Director.ShutdownEvent->Trigger();
}
/**
* Record the begin-cook context for the session and initialize the CookWorkers.
* @param InBeginContext Context of the cook that is starting; stored into BeginCookContext.
*/
void FCookDirector::StartCook(const FBeginCookContext& InBeginContext)
{
BeginCookContext.Set(InBeginContext);
// We launch the CookWorkers during StartCook, so that their startup time overlaps with
// work the Director has to do during the first tick, for PumpRequests and AssignRequests.
// InitializeWorkers is called with CommunicationLock held, as at its other call sites in this file.
FScopeLock CommunicationScopeLock(&CommunicationLock);
InitializeWorkers();
}
/**
 * Assign each package in Requests to the local worker or one of the current remote workers.
 *
 * @param Requests Packages to assign.
 * @param OutAssignments Receives one WorkerId per element of Requests.
 * @param RequestGraph Dependency information forwarded to the load balancer.
 */
void FCookDirector::AssignRequests(TArrayView<FPackageData*> Requests, TArray<FWorkerId>& OutAssignments,
	TMap<FPackageData*, TArray<FPackageData*>>&& RequestGraph)
{
	{
		FScopeLock CommunicationScopeLock(&CommunicationLock);
		InitializeWorkers();
	}
	TArray<TRefCountPtr<FCookWorkerServer>> WorkerSnapshot = CopyRemoteWorkers();

	// Candidate workers: optionally the local worker, followed by every remote worker in the snapshot
	TArray<FWorkerId> CandidateIds;
	CandidateIds.Reserve(WorkerSnapshot.Num() + 1);
	if (bAllowLocalCooks)
	{
		CandidateIds.Add(FWorkerId::Local());
	}
	for (const TRefCountPtr<FCookWorkerServer>& Worker : WorkerSnapshot)
	{
		CandidateIds.Add(Worker->GetWorkerId());
	}

	AssignRequests(MoveTemp(CandidateIds), WorkerSnapshot, Requests, OutAssignments, MoveTemp(RequestGraph),
		true /* bInitialAssignment */);

	// Check for a race condition with the communication thread; if a Server was aborted after we took the
	// snapshot above but before we assigned packages to it, we need to abort those assignments now.
	TSet<FPackageData*> OrphanedPackages;
	for (TRefCountPtr<FCookWorkerServer>& Worker : WorkerSnapshot)
	{
		if (!Worker->IsShuttingDown())
		{
			continue;
		}
		Worker->AbortAllAssignments(OrphanedPackages, ECookDirectorThread::SchedulerThread, HeartbeatNumber);
	}
	if (OrphanedPackages.IsEmpty())
	{
		return;
	}

	// Defer the reassign because our caller has not yet put the packages into the assigned state.
	FScopeLock CommunicationScopeLock(&CommunicationLock);
	DeferredPackagesToReassign.Append(OrphanedPackages.Array());
}
/**
 * Gather the extra per-package data that is sent along with package assignments.
 * An entry is created for a package only if it has a GenerationHelper or if at least one registered
 * collector produced messages for it.
 *
 * @param Requests Packages that are about to be assigned.
 * @return Map from package to its extra data; packages with nothing to send have no entry.
 */
TMap<FPackageData*, FAssignPackageExtraData> FCookDirector::GetAssignPackageExtraDatas(
	TConstArrayView<FPackageData*> Requests) const
{
	TConstArrayView<const ITargetPlatform*> Platforms = COTFS.PlatformManager->GetSessionPlatforms();
	FMPCollectorServerTickPackageContext TickContext;
	TickContext.Platforms = Platforms;

	TMap<FPackageData*, FAssignPackageExtraData> ExtraDatas;
	for (FPackageData* Package : Requests)
	{
		// Allocated in ExtraDatas on first need, so packages without extra data get no entry
		FAssignPackageExtraData* PackageExtra = nullptr;

		TRefCountPtr<FGenerationHelper> Helper = Package->GetGenerationHelper();
		if (Helper)
		{
			PackageExtra = &ExtraDatas.FindOrAdd(Package);
			PackageExtra->GeneratorPerPlatformPreviousGeneratedPackages.Reserve(Platforms.Num());
			for (const ITargetPlatform* Platform : Platforms)
			{
				TMap<FName, FAssetPackageData>& PlatformPackages
					= PackageExtra->GeneratorPerPlatformPreviousGeneratedPackages.FindOrAdd(Platform);
				PlatformPackages = Helper->GetPreviousGeneratedPackages(Platform);
			}
		}

		// Give every collector a chance to attach messages for this package
		TickContext.PackageName = Package->GetPackageName();
		for (const TPair<FGuid, TRefCountPtr<IMPCollector>>& CollectorEntry : Collectors)
		{
			IMPCollector* Collector = CollectorEntry.Value.GetReference();
			Collector->ServerTickPackage(TickContext);
			if (TickContext.Messages.IsEmpty())
			{
				continue;
			}

			if (!PackageExtra)
			{
				PackageExtra = &ExtraDatas.FindOrAdd(Package);
			}
			FGuid MessageType = Collector->GetMessageType();
			for (FCbObject& Message : TickContext.Messages)
			{
				PackageExtra->PerPackageCollectorMessages.Add({ MessageType, MoveTemp(Message) });
			}
			// Clear the shared context so the next collector starts with no messages
			TickContext.Messages.Reset();
		}
	}
	return ExtraDatas;
}
/**
 * Collect the generator packages of the given requests.
 * For each request that names a parent generator, the generator's FPackageData is looked up by name and
 * added to the result if it exists; each generator appears at most once.
 */
TArray<FPackageData*> FCookDirector::GetInfoPackagesForRequests(TConstArrayView<FPackageData*> Requests) const
{
	TSet<FPackageData*> Generators;
	for (FPackageData* Package : Requests)
	{
		FName ParentName = Package->GetParentGenerator();
		if (ParentName.IsNone())
		{
			continue;
		}
		if (FPackageData* Generator = COTFS.PackageDatas->FindPackageDataByPackageName(ParentName))
		{
			Generators.Add(Generator);
		}
	}
	return Generators.Array();
}
/**
* Assign Requests among the given workers and queue the remote assignments on the matching
* FCookWorkerServers.
*
* With a single worker, every request goes to that worker unless the package is constrained to a
* different worker, in which case it is marked unassignable (FWorkerId::Invalid). With multiple workers
* the load balancer produces the initial split, which is then overridden per package by the worker
* constraint, blocking urgency, or the generated-package split mode.
*
* @param InWorkers Candidate workers (local and/or remote); must not be empty. Sorted in place.
* @param InRemoteWorkers Servers for the remote ids in InWorkers; each remote id must have an entry.
* @param Requests Packages to assign.
* @param OutAssignments Receives one WorkerId per request, index-aligned with Requests.
* @param RequestGraph Dependency information forwarded to LoadBalance.
* @param bInitialAssignment When true, generated packages may be routed relative to the worker that
*        saved their generator.
*/
void FCookDirector::AssignRequests(TArray<FWorkerId>&& InWorkers,
TArray<TRefCountPtr<FCookWorkerServer>>& InRemoteWorkers,
TArrayView<FPackageData*> Requests, TArray<FWorkerId>& OutAssignments,
TMap<FPackageData*, TArray<FPackageData*>>&& RequestGraph, bool bInitialAssignment)
{
check(InWorkers.Num() > 0);
// Single-worker fast path: no load balancing is needed.
if (InWorkers.Num() <= 1)
{
FWorkerId WorkerId = InWorkers[0];
OutAssignments.SetNum(Requests.Num());
TArray<UE::Cook::FPackageData*> RemovedRequests;
for (int32 RequestIndex = 0; RequestIndex < Requests.Num(); ++RequestIndex)
{
FPackageData* Request = Requests[RequestIndex];
FWorkerId& Assignment = OutAssignments[RequestIndex];
FWorkerId WorkerIdConstraint = Request->GetWorkerAssignmentConstraint();
// A package constrained to some other worker can not be given to the only worker we have.
if (WorkerIdConstraint.IsValid() && WorkerIdConstraint != WorkerId)
{
UE_LOG(LogCook, Warning,
TEXT("Package %s can only be cooked by a now-disconnected CookWorker. The package can not be cooked."),
*Request->GetPackageName().ToString());
Assignment = FWorkerId::Invalid();
RemovedRequests.Add(Request);
}
else
{
Assignment = WorkerId;
}
}
if (!WorkerId.IsLocal())
{
TRefCountPtr<FCookWorkerServer>* RemoteWorker = InRemoteWorkers.FindByPredicate(
[&WorkerId](const TRefCountPtr<FCookWorkerServer>& X) { return X->GetWorkerId() == WorkerId; });
check(RemoteWorker);
// Send all requests unless some were rejected above, in which case send only the remainder.
TArrayView<FPackageData*> RequestsToSend = Requests;
TArray<FPackageData*> RequestBuffer;
if (!RemovedRequests.IsEmpty())
{
TSet<FPackageData*> RequestSet;
for (FPackageData* Request : Requests)
{
RequestSet.Add(Request);
}
for (FPackageData* Remove : RemovedRequests)
{
RequestSet.Remove(Remove);
}
RequestBuffer = RequestSet.Array();
RequestsToSend = RequestBuffer;
}
(*RemoteWorker)->AppendAssignments(RequestsToSend, GetAssignPackageExtraDatas(RequestsToSend),
GetInfoPackagesForRequests(RequestsToSend), ECookDirectorThread::SchedulerThread);
}
// NOTE(review): this early return skips the bIsFirstAssignment = false at the end of the function.
return;
}
InWorkers.Sort();
// Call the LoadBalancing algorithm to split the requests among the LocalWorker and RemoteWorkers
LoadBalance(InWorkers, Requests, MoveTemp(RequestGraph), OutAssignments);
// After the sort the last element is used as the highest remote index
// (presumes FWorkerId ordering puts the local worker first -- confirm in FWorkerId).
int32 MaxRemoteIndex = InWorkers.Last().IsLocal() ? -1 : InWorkers.Last().GetRemoteIndex();
// Split the output array of WorkerId assignments into a batch for each of the RemoteWorkers
TArray<TArray<FPackageData*>> RemoteBatches; // Indexed by WorkerId.GetRemoteIndex()
TArray<bool> RemoteIndexIsValid; // Indexed by WorkerId.GetRemoteIndex()
RemoteBatches.SetNum(MaxRemoteIndex+1);
RemoteIndexIsValid.Init(false, MaxRemoteIndex+1);
for (FWorkerId WorkerId : InWorkers)
{
if (!WorkerId.IsLocal())
{
RemoteIndexIsValid[WorkerId.GetRemoteIndex()] = true;
}
}
// Debug function for generated packages. We want to be able to force their assignment to all,some,none
// on the same worker that saved their generator.
// Round-robins through InWorkers, stepping once more if the next candidate is WorkerNotToUse.
int32 MPCookGeneratorNextWorker = -1;
auto GetNextWorkerForGeneratedPackage = [&MPCookGeneratorNextWorker, &InWorkers](FWorkerId WorkerNotToUse)
{
MPCookGeneratorNextWorker = (MPCookGeneratorNextWorker + 1) % InWorkers.Num();
if (InWorkers[MPCookGeneratorNextWorker] == WorkerNotToUse)
{
MPCookGeneratorNextWorker = (MPCookGeneratorNextWorker + 1) % InWorkers.Num();
}
return InWorkers[MPCookGeneratorNextWorker];
};
for (int32 RequestIndex = 0; RequestIndex < Requests.Num(); ++RequestIndex)
{
// Reference into OutAssignments: the overrides below rewrite the load balancer's choice in place.
FWorkerId& WorkerId = OutAssignments[RequestIndex];
// Override the loadbalancer's assignment if the Package has a WorkerAssignmentConstraint
// This allows us to guarantee that generated packages will be cooked on the worker that cooked
// their generator package
FPackageData* RequestPackage = Requests[RequestIndex];
FWorkerId WorkerIdConstraint = RequestPackage->GetWorkerAssignmentConstraint();
if (WorkerIdConstraint.IsValid())
{
WorkerId = WorkerIdConstraint;
}
// Override the loadbalancer's assignment to force it local if the Package is blocking urgency
else if (RequestPackage->GetUrgency() == EUrgency::Blocking)
{
WorkerId = FWorkerId::Local();
}
else if (RequestPackage->IsGenerated() && bInitialAssignment)
{
TRefCountPtr<FGenerationHelper> GenerationHelper = RequestPackage->GetParentGenerationHelper();
if (GenerationHelper)
{
FWorkerId GeneratorWorker = GenerationHelper->GetWorkerIdThatSavedGenerator();
if (GeneratorWorker.IsValid())
{
switch (COTFS.MPCookGeneratorSplit)
{
default: // fallthrough
case EMPCookGeneratorSplit::AnyWorker:
// If we're allowed to put them on any worker, put them on the GeneratorWorker anyway
// for the initial assignment since it already has the GenerationHelper created
WorkerId = GeneratorWorker;
break;
case EMPCookGeneratorSplit::AllOnSameWorker:
WorkerId = GeneratorWorker;
break;
case EMPCookGeneratorSplit::SomeOnSameWorker:
// Alternate: even assignment indices stay on the generator's worker, odd ones go elsewhere.
if ((GenerationHelper->GetMPCookNextAssignmentIndex()++) % 2 == 0)
{
WorkerId = GeneratorWorker;
}
else
{
WorkerId = GetNextWorkerForGeneratedPackage(GeneratorWorker);
}
break;
case EMPCookGeneratorSplit::NoneOnSameWorker:
WorkerId = GetNextWorkerForGeneratedPackage(GeneratorWorker);
break;
}
}
}
}
if (!WorkerId.IsLocal())
{
uint8 RemoteIndex = WorkerId.GetRemoteIndex();
// The chosen remote worker (e.g. from a constraint) may not be among the current workers.
if (RemoteIndex >= RemoteBatches.Num() || !RemoteIndexIsValid[RemoteIndex])
{
UE_LOG(LogCook, Error,
TEXT("Package %s can only be cooked by a now-disconnected CookWorker. The package can not be cooked."),
*Requests[RequestIndex]->GetPackageName().ToString());
WorkerId = FWorkerId::Invalid();
continue;
}
TArray<FPackageData*>& RemoteBatch = RemoteBatches[RemoteIndex];
if (RemoteBatch.Num() == 0)
{
// First package for this worker: reserve twice the even share to limit reallocations.
RemoteBatch.Reserve(2 * Requests.Num() / (InWorkers.Num()));
}
RemoteBatch.Add(Requests[RequestIndex]);
}
}
// Assign each batch to the FCookWorkerServer in RemoteWorkers;
// the CookWorkerServer's tick will handle sending the message to the remote process
for (FWorkerId WorkerId : InWorkers)
{
if (!WorkerId.IsLocal())
{
TRefCountPtr<FCookWorkerServer>* RemoteWorker = InRemoteWorkers.FindByPredicate(
[&WorkerId](const TRefCountPtr<FCookWorkerServer>& X) { return X->GetWorkerId() == WorkerId; });
check(RemoteWorker);
TArray<FPackageData*>& RemoteBatch = RemoteBatches[WorkerId.GetRemoteIndex()];
(*RemoteWorker)->AppendAssignments(RemoteBatch,
GetAssignPackageExtraDatas(RemoteBatch), GetInfoPackagesForRequests(RemoteBatch), ECookDirectorThread::SchedulerThread);
}
}
bIsFirstAssignment = false;
}
/**
 * Take a snapshot of the current remote workers under CommunicationLock.
 * The returned array holds its own references, so it can be iterated after the lock is released.
 */
TArray<TRefCountPtr<FCookWorkerServer>> FCookDirector::CopyRemoteWorkers() const
{
	FScopeLock CommunicationScopeLock(&CommunicationLock);

	TArray<TRefCountPtr<FCookWorkerServer>> Snapshot;
	Snapshot.Reserve(RemoteWorkers.Num());
	for (const TPair<int32, TRefCountPtr<FCookWorkerServer>>& WorkerPair : RemoteWorkers)
	{
		Snapshot.Add(WorkerPair.Value);
	}
	return Snapshot;
}
/**
 * Abort the assignment of PackageData on the remote worker it is currently assigned to.
 * Does nothing if the package is not assigned to a remote worker or that worker is no longer in
 * RemoteWorkers. On success a heartbeat is forced on the next tick.
 */
void FCookDirector::RemoveFromWorker(FPackageData& PackageData)
{
	const FWorkerId AssignedWorker = PackageData.GetWorkerAssignment();
	if (!AssignedWorker.IsRemote())
	{
		return;
	}

	// Take our own reference under the lock so the server stays alive after the lock is released
	TRefCountPtr<FCookWorkerServer> Owner;
	{
		FScopeLock CommunicationScopeLock(&CommunicationLock);
		const TRefCountPtr<FCookWorkerServer>* Found = FindRemoteWorkerInLock(AssignedWorker);
		if (Found == nullptr)
		{
			return;
		}
		Owner = *Found;
	}

	Owner->AbortAssignment(PackageData, ECookDirectorThread::SchedulerThread, HeartbeatNumber);
	bForceNextHeartbeat = true;
}
void FCookDirector::BroadcastMessage(const IMPCollectorMessage& Message, ECookBroadcastTiming Timing)
{
BroadcastMessage(MarshalToCompactBinaryTCP(Message), Timing);
}
/**
 * Deliver a copy of an already-marshalled message to every remote worker.
 *
 * While messages are being received (bReceivingMessages) the broadcast is queued instead and sent later
 * by TickFromSchedulerThread.
 *
 * @param Message The marshalled message; each worker gets its own copy.
 * @param Timing Immediate sends right away to connected workers and queues for the others;
 *        AfterAssignPackages always queues on the worker.
 */
void FCookDirector::BroadcastMessage(UE::CompactBinaryTCP::FMarshalledMessage&& Message, ECookBroadcastTiming Timing)
{
	if (bReceivingMessages)
	{
		QueuedBroadcasts.Emplace(MoveTemp(Message), Timing);
		return;
	}

	{
		FScopeLock CommunicationScopeLock(&CommunicationLock);
		InitializeWorkers();
	}

	TArray<TRefCountPtr<FCookWorkerServer>> WorkerSnapshot = CopyRemoteWorkers();
	for (TRefCountPtr<FCookWorkerServer>& Worker : WorkerSnapshot)
	{
		UE::CompactBinaryTCP::FMarshalledMessage WorkerCopy(Message);
		if (Timing == ECookBroadcastTiming::Immediate)
		{
			if (Worker->IsConnected())
			{
				Worker->SendMessage(MoveTemp(WorkerCopy), ECookDirectorThread::SchedulerThread);
			}
			else
			{
				// Not connected yet: queue it on the worker instead
				Worker->AppendMessage(MoveTemp(WorkerCopy), ECookDirectorThread::SchedulerThread);
			}
		}
		else if (Timing == ECookBroadcastTiming::AfterAssignPackages)
		{
			Worker->AppendMessage(MoveTemp(WorkerCopy), ECookDirectorThread::SchedulerThread);
		}
		else
		{
			checkNoEntry();
		}
	}
}
/**
* Create a fence that can later be tested with IsBroadcastFencePassed.
* @return The value of HeartbeatNumber read before the forced tick; this is the fence value.
*/
int32 FCookDirector::InsertBroadcastFence()
{
int32 LocalHeartbeatNumber = HeartbeatNumber;
// We need to Tick, and have the Tick function send and update the heartbeat number, so that any preceding
// messages get sent before the heartbeat.
bForceNextHeartbeat = true;
TickFromSchedulerThread();
return LocalHeartbeatNumber;
}
/**
 * Test whether every remote worker has reported a heartbeat number of at least Fence.
 *
 * @param Fence Fence value, e.g. one returned by InsertBroadcastFence.
 * @param OutPendingWorkers Optional; the ids of workers that have not yet reached the fence are appended.
 * @return True if no remote worker is still behind the fence.
 */
bool FCookDirector::IsBroadcastFencePassed(int32 Fence, TArray<FWorkerId>* OutPendingWorkers)
{
	FScopeLock CommunicationScopeLock(&CommunicationLock);

	bool bAllPassed = true;
	for (TPair<int32, TRefCountPtr<FCookWorkerServer>>& WorkerPair : RemoteWorkers)
	{
		FCookWorkerServer* Worker = WorkerPair.Value.GetReference();
		if (!(Worker->GetLastReceivedHeartbeatNumber() < Fence))
		{
			continue;
		}
		bAllPassed = false;
		if (OutPendingWorkers)
		{
			OutPendingWorkers->Add(Worker->GetWorkerId());
		}
	}
	return bAllPassed;
}
/**
* Per-tick update run on the scheduler thread.
* Ticks communication inline when there is no communication thread, updates idle profiling, sends
* heartbeats when due, reassigns deferred packages, ticks the retraction handler, processes received
* worker messages, flushes broadcasts queued during message handling, and updates the stalled state.
*/
void FCookDirector::TickFromSchedulerThread()
{
double CurrentTime = FPlatformTime::Seconds();
// Without a dedicated communication thread, the scheduler thread has to pump communication itself.
if (!CommunicationThread)
{
TickCommunication(ECookDirectorThread::SchedulerThread);
}
int32 BusiestNumAssignments = 0;
bool bLocalWorkerIdle = true;
bool bAnyIdle = false;
float DeltaTime = static_cast<float>(CurrentTime - LastTickTimeSeconds);
LastTickTimeSeconds = CurrentTime;
if (bAllowLocalCooks)
{
BusiestNumAssignments = COTFS.NumMultiprocessLocalWorkerAssignments();
bLocalWorkerIdle = BusiestNumAssignments == 0;
bAnyIdle = bLocalWorkerIdle;
LocalWorkerProfileData->UpdateIdle(bLocalWorkerIdle, DeltaTime);
}
bool bSendHeartbeat;
int32 LocalHeartbeatNumber;
TickHeartbeat(bForceNextHeartbeat, CurrentTime, bSendHeartbeat, LocalHeartbeatNumber);
// The force request has been handed to TickHeartbeat; clear it for the next tick.
bForceNextHeartbeat = false;
// Holds references so the workers can be processed after CommunicationLock is released.
TArray<TRefCountPtr<FCookWorkerServer>, TInlineAllocator<16>> WorkersWithMessage;
bool bAllWorkersConnected = false;
if (bWorkersInitialized)
{
FScopeLock CommunicationScopeLock(&CommunicationLock);
bAllWorkersConnected = true;
for (TPair<int32, TRefCountPtr<FCookWorkerServer>>& Pair : RemoteWorkers)
{
FCookWorkerServer* RemoteWorker = Pair.Value.GetReference();
FCookWorkerProfileData& ProfileData = RemoteWorkerProfileDatas[RemoteWorker->GetProfileId()];
bAllWorkersConnected &= RemoteWorker->IsConnected();
int32 NumAssignments = RemoteWorker->NumAssignments();
BusiestNumAssignments = FMath::Max(NumAssignments, BusiestNumAssignments);
bool bWorkerIdle = NumAssignments == 0;
bAnyIdle |= bWorkerIdle;
ProfileData.UpdateIdle(bWorkerIdle, DeltaTime);
if (RemoteWorker->HasMessages())
{
WorkersWithMessage.Add(RemoteWorker);
}
if (bSendHeartbeat)
{
RemoteWorker->SignalHeartbeat(ECookDirectorThread::SchedulerThread, LocalHeartbeatNumber);
}
}
// Workers that are shutting down can still have messages to process.
// Entries with a null Value are skipped here.
for (TPair<FCookWorkerServer*, TRefCountPtr<FCookWorkerServer>>& Pair : ShuttingDownWorkers)
{
if (Pair.Value && Pair.Value->HasMessages())
{
WorkersWithMessage.Add(Pair.Value);
}
}
ReassignAbortedPackages(DeferredPackagesToReassign);
RetractionHandler->TickFromSchedulerThread(bAllWorkersConnected, bAnyIdle, BusiestNumAssignments);
}
bool bIsStalled = bLocalWorkerIdle && WorkersWithMessage.IsEmpty();
if (bIsStalled)
{
// We are only stalled if we have no local work to do and there is a remote worker with assigned work. If no
// worker has assigned work then we are done with the cook rather than stalled.
bool bRemoteWorkerHasWork = !COTFS.PackageDatas->GetAssignedToWorkerSet().IsEmpty();
if (!bRemoteWorkerHasWork)
{
// Do the slow check to see whether we have a package in the SaveStalledAssignedToWorker state only if
// we have to
for (FPackageData* PackageData : COTFS.PackageDatas->GetSaveStalledSet())
{
if (PackageData->IsInStateProperty(EPackageStateProperty::AssignedToWorkerProperty))
{
bRemoteWorkerHasWork = true;
break;
}
}
}
bIsStalled = bIsStalled && bRemoteWorkerHasWork;
}
{
// While this flag is set, BroadcastMessage queues into QueuedBroadcasts instead of sending.
bReceivingMessages = true;
for (TRefCountPtr<FCookWorkerServer>& Worker : WorkersWithMessage)
{
Worker->HandleReceiveMessages(ECookDirectorThread::SchedulerThread);
}
bReceivingMessages = false;
}
// Send the broadcasts that were queued while messages were being received.
for (TPair<UE::CompactBinaryTCP::FMarshalledMessage, ECookBroadcastTiming>& Pair : QueuedBroadcasts)
{
BroadcastMessage(MoveTemp(Pair.Key), Pair.Value);
}
QueuedBroadcasts.Empty();
// Release the references to the workers handled this tick.
WorkersWithMessage.Empty();
SetWorkersStalled(bIsStalled);
// Same value as assigned near the top of the function; no later timestamp is taken.
LastTickTimeSeconds = CurrentTime;
}
/** Log the director's display diagnostics; currently only the per-worker remaining package counts. */
void FCookDirector::UpdateDisplayDiagnostics() const
{
DisplayRemainingPackages();
}
void FCookDirector::DisplayRemainingPackages() const
{
constexpr int32 DisplayWidth = 16;
UE_LOG(LogCook, Display, TEXT("\t%s: %d packages remain."), *GetDisplayName(FWorkerId::Local(), DisplayWidth),
COTFS.NumMultiprocessLocalWorkerAssignments());
FScopeLock CommunicationScopeLock(&CommunicationLock);
for (const TPair<int32, TRefCountPtr<FCookWorkerServer>>& Pair : RemoteWorkers)
{
FCookWorkerServer* RemoteWorker = Pair.Value;
UE_LOG(LogCook, Display, TEXT("\t%s: %d packages remain."),
*GetDisplayName(*RemoteWorker, DisplayWidth), RemoteWorker->NumAssignments());
}
}
/**
 * Build the display name "CookWorker <id>" for a worker id.
 *
 * The id part is "None" for an invalid id, "Local" for the local worker, the worker's profile id for a
 * remote worker that is active or shutting down, and "Unknown (WorkerId N)" for a remote id with no
 * known server.
 *
 * @param WorkerId The worker to name.
 * @param PreferredWidth Desired total width; the id part is left-padded so that prefix + id reaches it.
 * @return The padded display name.
 */
FString FCookDirector::GetDisplayName(const FWorkerId& WorkerId, int32 PreferredWidth) const
{
	FString Result;
	if (WorkerId.IsInvalid())
	{
		Result = TEXTVIEW("None");
	}
	else if (WorkerId.IsLocal())
	{
		Result = TEXTVIEW("Local");
	}
	else
	{
		// Copy the refcounted pointer while holding the lock. Keeping only a pointer into the container's
		// storage would be unsafe once the lock is released, because another thread can then add to or
		// remove from RemoteWorkers/ShuttingDownWorkers.
		TRefCountPtr<FCookWorkerServer> RemoteWorker;
		{
			FScopeLock CommunicationScopeLock(&CommunicationLock);
			if (const TRefCountPtr<FCookWorkerServer>* ActiveWorker = FindRemoteWorkerInLock(WorkerId))
			{
				RemoteWorker = *ActiveWorker;
			}
			else
			{
				// Not an active worker; it may be one that is in the process of shutting down
				for (const TPair<FCookWorkerServer*, TRefCountPtr<FCookWorkerServer>>& Pair : ShuttingDownWorkers)
				{
					if (Pair.Value && Pair.Value->GetWorkerId() == WorkerId)
					{
						RemoteWorker = Pair.Value;
						break;
					}
				}
			}
		}
		if (RemoteWorker)
		{
			Result = FString::Printf(TEXT("%d"), RemoteWorker->GetProfileId());
		}
		else
		{
			Result = FString::Printf(TEXT("Unknown (WorkerId %d)"), WorkerId.GetRemoteIndex());
		}
	}
	constexpr FStringView Prefix(TEXTVIEW("CookWorker "));
	Result = FString(Prefix) + Result.LeftPad(PreferredWidth-Prefix.Len());
	return Result;
}
/**
 * Build the display name "CookWorker <profile id>" for a remote worker, left-padding the profile id so
 * that the whole name reaches PreferredWidth characters.
 */
FString FCookDirector::GetDisplayName(const FCookWorkerServer& RemoteWorker, int32 PreferredWidth) const
{
	constexpr FStringView Prefix(TEXTVIEW("CookWorker "));
	const FString ProfileIdText = FString::Printf(TEXT("%d"), RemoteWorker.GetProfileId());
	return FString(Prefix) + ProfileIdText.LeftPad(PreferredWidth - Prefix.Len());
}
/**
 * Look up the server for a remote worker id in RemoteWorkers.
 * As the name indicates, callers are expected to hold CommunicationLock; the returned pointer refers to
 * storage inside RemoteWorkers.
 *
 * @return Pointer to the map's entry, or nullptr if WorkerId is not remote or has no entry.
 */
const TRefCountPtr<FCookWorkerServer>* FCookDirector::FindRemoteWorkerInLock(const FWorkerId& WorkerId) const
{
	return WorkerId.IsRemote() ? RemoteWorkers.Find(WorkerId.GetRemoteIndex()) : nullptr;
}
/**
 * Pump communication for all workers: accept pending connections, tick every remote worker, record
 * workers that started shutting down, and tick shutdowns if there are any.
 * Also refreshes bWorkersActive: true while any remote worker exists, or otherwise while any
 * shutting-down worker is still flushing.
 *
 * @param TickThread Identifies the thread this tick is running on.
 */
void FCookDirector::TickCommunication(ECookDirectorThread TickThread)
{
	TickWorkerConnects(TickThread);

	bool bAnyShuttingDown = false;
	TArray<TRefCountPtr<FCookWorkerServer>, TInlineAllocator<16>> WorkerSnapshot;
	{
		FScopeLock CommunicationScopeLock(&CommunicationLock);
		for (const TPair<int32, TRefCountPtr<FCookWorkerServer>>& WorkerPair : RemoteWorkers)
		{
			WorkerSnapshot.Add(WorkerPair.Value);
		}

		bool bAnyActive = !RemoteWorkers.IsEmpty();
		if (!bAnyActive)
		{
			// No live workers remain; we are still active while a shutting-down worker is flushing
			for (const TPair<FCookWorkerServer*, TRefCountPtr<FCookWorkerServer>>& ShutdownPair : ShuttingDownWorkers)
			{
				FCookWorkerServer* ShutdownWorker = ShutdownPair.Key;
				check(ShutdownWorker->IsShuttingDown());
				bAnyActive = bAnyActive || ShutdownWorker->IsFlushingBeforeShutdown();
			}
		}
		bWorkersActive = bAnyActive;
		bAnyShuttingDown = !ShuttingDownWorkers.IsEmpty();
	}

	// Tick outside the lock, using the references held by the snapshot
	for (TRefCountPtr<FCookWorkerServer>& Worker : WorkerSnapshot)
	{
		Worker->TickCommunication(TickThread);
		if (!Worker->IsShuttingDown())
		{
			continue;
		}

		FScopeLock CommunicationScopeLock(&CommunicationLock);
		// Only the key is added here; the value is left unset
		TRefCountPtr<FCookWorkerServer>& ShutdownSlot = ShuttingDownWorkers.FindOrAdd(Worker.GetReference());
		check(!ShutdownSlot); // We should not be able to send the same pointer into ShuttingDown twice
		bAnyShuttingDown = true;
	}

	if (bAnyShuttingDown)
	{
		TickWorkerShutdowns(TickThread);
	}
}
/**
* Drive the end-of-cook handshake with the remote workers; intended to be called repeatedly.
*
* Sequence, while the cook-complete signal has not yet been sent:
*  1. Once no worker has assignments and no final fence exists, force a heartbeat and record its number
*     in FinalIdleHeartbeatFence.
*  2. On later calls, wait until every worker has reported a heartbeat number >= that fence.
*  3. Then signal cook-complete to every worker and set bCookCompleteSent.
*
* @param bCompleted Set to true when bWorkersActive is false, i.e. no worker is active any more.
*/
void FCookDirector::PumpCookComplete(bool& bCompleted)
{
{
FScopeLock CommunicationScopeLock(&CommunicationLock);
if (!bCookCompleteSent)
{
bool bAllIdle = true;
for (TPair<int32, TRefCountPtr<FCookWorkerServer>>& Pair : RemoteWorkers)
{
FCookWorkerServer& RemoteWorker = *Pair.Value;
if (RemoteWorker.NumAssignments() > 0)
{
bAllIdle = false;
break;
}
}
// -1 means the final fence has not been created yet.
if (bAllIdle && FinalIdleHeartbeatFence == -1)
{
bool bSendHeartbeat;
double CurrentTime = FPlatformTime::Seconds();
// FinalIdleHeartbeatFence is passed as TickHeartbeat's heartbeat-number output.
TickHeartbeat(true /* bForceHeartbeat */, CurrentTime, bSendHeartbeat, FinalIdleHeartbeatFence);
for (TPair<int32, TRefCountPtr<FCookWorkerServer>>& Pair : RemoteWorkers)
{
FCookWorkerServer& RemoteWorker = *Pair.Value;
RemoteWorker.SignalHeartbeat(ECookDirectorThread::SchedulerThread, FinalIdleHeartbeatFence);
}
// The fence was only just sent; do not signal completion until a later call sees it acknowledged.
bAllIdle = false;
}
else
{
// Any worker still behind the fence blocks completion and marks the workers as stalled.
for (TPair<int32, TRefCountPtr<FCookWorkerServer>>& Pair : RemoteWorkers)
{
FCookWorkerServer& RemoteWorker = *Pair.Value;
if (RemoteWorker.GetLastReceivedHeartbeatNumber() < FinalIdleHeartbeatFence)
{
bAllIdle = false;
SetWorkersStalled(true);
break;
}
}
}
if (bAllIdle)
{
for (TPair<int32, TRefCountPtr<FCookWorkerServer>>& Pair : RemoteWorkers)
{
FCookWorkerServer& RemoteWorker = *Pair.Value;
RemoteWorker.SignalCookComplete(ECookDirectorThread::SchedulerThread);
check(RemoteWorker.IsShuttingDown());
}
bCookCompleteSent = true;
}
}
bCompleted = !bWorkersActive;
}
// Tick after releasing the lock scope above so that signalled messages and shutdowns make progress.
TickFromSchedulerThread();
}
/**
* End the current cook session: stop the communication thread, abort every worker, block until all
* worker shutdowns have completed, and reset the director's session state so a new session can start.
*/
void FCookDirector::ShutdownCookSession()
{
StopCommunicationThread();
// Cancel any inprogress workers and move them to the Shutdown list
for (;;)
{
TRefCountPtr<FCookWorkerServer> RemoteWorker;
{
FScopeLock CommunicationScopeLock(&CommunicationLock);
if (RemoteWorkers.IsEmpty())
{
break;
}
// Take the first remaining worker; the loop relies on AbortWorker removing it from RemoteWorkers
// (NOTE(review): AbortWorker's definition is not visible here -- confirm).
RemoteWorker = TMap<int32, TRefCountPtr<FCookWorkerServer>>::TIterator(RemoteWorkers).Value();
}
AbortWorker(RemoteWorker->GetWorkerId(), ECookDirectorThread::SchedulerThread);
}
// Immediately shutdown any gracefully shutting down workers
TArray<TRefCountPtr<FCookWorkerServer>, TInlineAllocator<16>> WorkersNeedingAbort;
{
FScopeLock CommunicationScopeLock(&CommunicationLock);
for (TPair<FCookWorkerServer*, TRefCountPtr<FCookWorkerServer>>& Pair : ShuttingDownWorkers)
{
// The Value was set by AbortWorker for any new entries, and all old entries guarantee the value is set
check(Pair.Value);
if (Pair.Key->IsFlushingBeforeShutdown())
{
WorkersNeedingAbort.Add(Pair.Value);
}
}
}
// Abort outside the lock; the packages reported by AbortWorker are not used here.
for (TRefCountPtr<FCookWorkerServer>& RemoteWorker : WorkersNeedingAbort)
{
TSet<FPackageData*> UnusedPendingPackages;
RemoteWorker->AbortWorker(UnusedPendingPackages, ECookDirectorThread::SchedulerThread, HeartbeatNumber);
}
// Wait for all the shutdowns to complete
for (;;)
{
TickWorkerShutdowns(ECookDirectorThread::SchedulerThread);
{
FScopeLock CommunicationScopeLock(&CommunicationLock);
if (ShuttingDownWorkers.IsEmpty())
{
break;
}
}
// Poll every 10ms until ShuttingDownWorkers has drained.
constexpr float SleepSeconds = 0.010f;
FPlatformProcess::Sleep(SleepSeconds);
}
// Kill any connections that had just been made and not yet assigned to a Server
PendingConnections.Reset();
// Restore the FCookDirector to its original state so that it is ready for a new session
bWorkersInitialized = false;
bIsFirstAssignment = true;
bCookCompleteSent = false;
bWorkersActive = false;
HeartbeatNumber = 0;
NextHeartbeatTimeSeconds = 0.;
FinalIdleHeartbeatFence = -1;
}
/**
 * Adds Collector to the Collectors map, keyed by its message type. If a collector is already registered for that
 * message type, an error is logged and the existing registration is kept.
 */
void FCookDirector::Register(IMPCollector* Collector)
{
	TRefCountPtr<IMPCollector>& Slot = Collectors.FindOrAdd(Collector->GetMessageType());
	if (!Slot)
	{
		Slot = Collector;
		return;
	}

	UE_LOG(LogCook, Error,
		TEXT("Duplicate IMPCollectors registered. Guid: %s, Existing: %s, Registering: %s. Keeping the Existing."),
		*Collector->GetMessageType().ToString(), Slot->GetDebugName(), Collector->GetDebugName());
}
/**
 * Removes the collector registered for Collector's message type. If the registered collector turns out to be a
 * different object, an error is logged and the registered collector is put back.
 */
void FCookDirector::Unregister(IMPCollector* Collector)
{
	TRefCountPtr<IMPCollector> Removed;
	Collectors.RemoveAndCopyValue(Collector->GetMessageType(), Removed);
	if (!Removed || Removed.GetReference() == Collector)
	{
		// Nothing was registered, or we removed exactly the collector that was requested
		return;
	}

	UE_LOG(LogCook, Error,
		TEXT("Duplicate IMPCollector during Unregister. Guid: %s, Existing: %s, Unregistering: %s. Ignoring the Unregister."),
		*Collector->GetMessageType().ToString(), Removed->GetDebugName(), Collector->GetDebugName());
	Collectors.Add(Collector->GetMessageType(), MoveTemp(Removed));
}
/**
 * Records whether the director is stalled waiting on remote CookWorkers. On a change of state the stall timers
 * are started or cleared; while the stalled state persists a message is logged each time the warn time elapses.
 */
void FCookDirector::SetWorkersStalled(bool bInWorkersStalled)
{
	if (bInWorkersStalled == bWorkersStalled)
	{
		// No state change; only the periodic message while stalled remains to be handled
		if (!bWorkersStalled)
		{
			return;
		}
		const double Now = FPlatformTime::Seconds();
		if (Now >= WorkersStalledWarnTimeSeconds)
		{
			UE_LOG(LogCook, Display,
				TEXT("Cooker has been blocked with no results from remote CookWorkers for %.0f seconds."),
				(float)(Now - WorkersStalledStartTimeSeconds));
			WorkersStalledWarnTimeSeconds = Now + GCookProgressWarnBusyTime;
		}
		return;
	}

	bWorkersStalled = bInWorkersStalled;
	if (!bWorkersStalled)
	{
		WorkersStalledStartTimeSeconds = MAX_flt;
		WorkersStalledWarnTimeSeconds = MAX_flt;
		return;
	}

	const double Now = FPlatformTime::Seconds();
	WorkersStalledStartTimeSeconds = Now;
	WorkersStalledWarnTimeSeconds = Now + GCookProgressWarnBusyTime;
}
/**
 * Decides whether a heartbeat should be sent on this tick.
 *
 * @param bForceHeartbeat If true, a heartbeat is sent regardless of the timer.
 * @param CurrentTimeSeconds The current time, used to evaluate and reschedule the heartbeat timer.
 * @param bOutSendHeartbeat Set to true if the caller should send a heartbeat.
 * @param OutHeartbeatNumber Receives the value HeartbeatNumber held on entry, before any increment.
 */
void FCookDirector::TickHeartbeat(bool bForceHeartbeat, double CurrentTimeSeconds, bool& bOutSendHeartbeat,
	int32& OutHeartbeatNumber)
{
	constexpr float HeartbeatPeriodSeconds = 30.f;

	OutHeartbeatNumber = HeartbeatNumber;

	// A timer value of 0 means no heartbeat has been scheduled yet
	const bool bTimerUnset = NextHeartbeatTimeSeconds == 0.;
	bOutSendHeartbeat = bForceHeartbeat || (!bTimerUnset && CurrentTimeSeconds >= NextHeartbeatTimeSeconds);

	if (bOutSendHeartbeat)
	{
		checkf(HeartbeatNumber < MAX_int32, TEXT("Overflow"));
		HeartbeatNumber++;
		NextHeartbeatTimeSeconds = CurrentTimeSeconds + HeartbeatPeriodSeconds;
	}
	else if (bTimerUnset)
	{
		// First unforced tick: schedule the first heartbeat one period from now
		NextHeartbeatTimeSeconds = CurrentTimeSeconds + HeartbeatPeriodSeconds;
	}
}
void FCookDirector::ResetFinalIdleHeartbeatFence()
{
FinalIdleHeartbeatFence = -1;
}
/**
 * Handles a heartbeat message received from a CookWorker by recording its heartbeat number on the sending
 * worker's server. A message that failed to read is logged as an error and otherwise ignored.
 */
void FCookDirector::HandleHeartbeatMessage(FMPCollectorServerMessageContext& Context, bool bReadSuccessful,
	FHeartbeatMessage&& Message)
{
	if (bReadSuccessful)
	{
		Context.GetCookWorkerServer()->SetLastReceivedHeartbeatNumberInLock(Message.HeartbeatNumber);
		return;
	}

	UE_LOG(LogCook, Error, TEXT("Corrupt HeartbeatMessage received from CookWorker %d. It will be ignored."),
		Context.GetProfileId());
}
/** Move constructor: takes Other's buffer and exchanges socket pointers with Other. */
FCookDirector::FPendingConnection::FPendingConnection(FPendingConnection&& Other)
{
	Buffer = MoveTemp(Other.Buffer);
	Swap(Socket, Other.Socket);
}
/** Passes the socket still held by this connection to Sockets::CloseSocket. */
FCookDirector::FPendingConnection::~FPendingConnection()
{
	Sockets::CloseSocket(Socket);
}
/** Returns the held socket and clears the member, so the destructor no longer passes it to CloseSocket. */
FSocket* FCookDirector::FPendingConnection::DetachSocket()
{
	FSocket* Detached = nullptr;
	Swap(Detached, Socket);
	return Detached;
}
/** Writes this message to Writer as the field "RemoteIndex" followed by its value. */
void FWorkerConnectMessage::Write(FCbWriter& Writer) const
{
	Writer << "RemoteIndex";
	Writer << RemoteIndex;
}
/**
 * Reads RemoteIndex from Object, using -1 as the fallback passed to AsInt32.
 *
 * @return true if the resulting RemoteIndex is non-negative.
 */
bool FWorkerConnectMessage::TryRead(FCbObjectView Object)
{
	constexpr int32 InvalidRemoteIndex = -1;
	RemoteIndex = Object["RemoteIndex"].AsInt32(InvalidRemoteIndex);
	return RemoteIndex > InvalidRemoteIndex;
}
// Message type guid for FWorkerConnectMessage; TickWorkerConnects compares the first message received on a pending connection against it.
FGuid FWorkerConnectMessage::MessageType(TEXT("302096E887DA48F7B079FAFAD0EE5695"));
bool FCookDirector::TryCreateWorkerConnectSocket()
{
if (WorkerConnectSocket)
{
return true;
}
ISocketSubsystem* SocketSubsystem = ISocketSubsystem::Get();
if (!SocketSubsystem)
{
// Error was already logged in the constructor
return false;
}
FString ErrorReason;
TSharedPtr<FInternetAddr> ListenAddr;
WorkerConnectSocket = Sockets::CreateListenSocket(WorkerConnectPort, ListenAddr, WorkerConnectAuthority,
TEXT("FCookDirector-WorkerConnect"), ErrorReason);
if (!WorkerConnectSocket)
{
UE_LOG(LogCook, Error,
TEXT("CookDirector could not create listen socket, CookWorkers will be disabled. Reason: %s."),
*ErrorReason);
return false;
}
return true;
}
/**
 * One-time setup of the remote workers for the session: decides whether multiprocess cooking applies, reduces
 * this process's machine resources, creates the listen socket and one FCookWorkerServer per requested worker,
 * and launches the communication thread. If setup returns before completing, bWorkersActive is set to false.
 */
void FCookDirector::InitializeWorkers()
{
	if (bWorkersInitialized)
	{
		return;
	}
	bWorkersInitialized = true;
	const int32 InitialRequestCount = COTFS.InitialRequestCount;

	check(!CommunicationThread);
	check(RemoteWorkers.IsEmpty());

	// Any return before the end of the function leaves bSucceeded false, which turns the workers off
	bool bSucceeded = false;
	ON_SCOPE_EXIT
	{
		if (!bSucceeded)
		{
			bWorkersActive = false;
		}
	};

	constexpr int32 MinRequestsForMPCookWithCookSinglePackage = 100;
	UE::Cook::FCookByTheBookOptions& Options = *COTFS.CookByTheBookOptions;
	const bool bSinglePackageCook = Options.bSkipHardReferences
		|| (Options.bSkipSoftReferences && InitialRequestCount < MinRequestsForMPCookWithCookSinglePackage);
	if (bSinglePackageCook && !bCookProcessCountSetByCommandLine)
	{
		UE_LOG(LogCook, Display,
			TEXT("CookDirector initialization skipped: -CookSinglePackage was requested. CookMultiprocess is disabled and the cooker is running as a single process."));
		return;
	}
	if (Options.bCookList)
	{
		UE_LOG(LogCook, Display,
			TEXT("CookDirector initialization skipped: -CookList was requested. CookMultiprocess is disabled and the cooker is running as a single process."));
		return;
	}

	UE_LOG(LogCook, Display, TEXT("CookProcessCount=%d. CookMultiprocess is enabled with 1 CookDirector and %d %s."),
		RequestedCookWorkerCount + 1, RequestedCookWorkerCount,
		RequestedCookWorkerCount > 1 ? TEXT("CookWorkers") : TEXT("CookWorker"));

	ActivateMachineResourceReduction();

	if (!TryCreateWorkerConnectSocket())
	{
		return;
	}

	// Create one server per requested worker, each with a new profile data entry
	RemoteWorkers.Reserve(RequestedCookWorkerCount);
	for (int32 RemoteIndex = 0; RemoteIndex < RequestedCookWorkerCount; ++RemoteIndex)
	{
		const int32 ProfileId = RemoteWorkerProfileDatas.Num();
		RemoteWorkerProfileDatas.Emplace();
		RemoteWorkers.Add(RemoteIndex, new FCookWorkerServer(*this, ProfileId,
			FWorkerId::FromRemoteIndex(RemoteIndex)));
	}
	bWorkersActive = true;

	ConstructReadonlyThreadVariables();
	ShutdownEvent->Reset();
	LaunchCommunicationThread();
	bSucceeded = true;
}
void FCookDirector::RecreateWorkers()
{
// TODO: Finish implementing the recreation of workers that have crashed
// Find any unused RemoteIndex less than the maximum used RemoteIndex
FScopeLock CommunicationScopeLock(&CommunicationLock);
if (RemoteWorkers.Num() >= RequestedCookWorkerCount || !WorkerConnectSocket)
{
return;
}
TArray<uint8> UnusedRemoteIndexes;
RemoteWorkers.KeySort(TLess<>());
uint8 NextPossiblyOpenIndex = 0;
for (TPair<int32, TRefCountPtr<FCookWorkerServer>>& Pair : RemoteWorkers)
{
check(NextPossiblyOpenIndex <= Pair.Key);
while (NextPossiblyOpenIndex != Pair.Key)
{
UnusedRemoteIndexes.Add(NextPossiblyOpenIndex++);
}
}
// Add RemoteWorkers, pulling the RemoteIndex id from the UnusedRemoteIndexes if any exist
// otherwise use the next integer because all indexes up to RemoteWorkers.Num() are in use.
while (RemoteWorkers.Num() < RequestedCookWorkerCount)
{
uint8 RemoteIndex;
if (UnusedRemoteIndexes.Num())
{
RemoteIndex = UnusedRemoteIndexes[0];
UnusedRemoteIndexes.RemoveAtSwap(0);
}
else
{
RemoteIndex = RemoteWorkers.Num();
}
int32 ProfileId = RemoteWorkerProfileDatas.Num();
RemoteWorkerProfileDatas.Emplace();
RemoteWorkers.Add(RemoteIndex, new FCookWorkerServer(*this, ProfileId,
FWorkerId::FromRemoteIndex(RemoteIndex)));
bWorkersActive = true;
}
}
void FCookDirector::ActivateMachineResourceReduction()
{
if (bHasReducedMachineResources)
{
return;
}
bHasReducedMachineResources = true;
// When running a multiprocess cook, we remove the Memory triggers and trigger GC based solely on PressureLevel.
// But keep the Soft GC settings
PRAGMA_DISABLE_DEPRECATION_WARNINGS
COTFS.MemoryMaxUsedPhysical = 0;
COTFS.MemoryMaxUsedVirtual = 0;
PRAGMA_ENABLE_DEPRECATION_WARNINGS
COTFS.MemoryMinFreeVirtual = 0;
COTFS.MemoryMinFreePhysical = 0;
COTFS.MemoryTriggerGCAtPressureLevel = FGenericPlatformMemoryStats::EMemoryPressureStatus::Critical;
UE_LOG(LogCook, Display, TEXT("CookMultiprocess changed CookSettings for Memory:%s"),
*COTFS.GetCookSettingsForMemoryLogText());
// Set CoreLimit for updating workerthreads in this process and passing to the commandline for workers
int32 NumProcesses = RequestedCookWorkerCount + 1;
int32 NumberOfCores = FPlatformMisc::NumberOfCores();
int32 HyperThreadCount = FPlatformMisc::NumberOfCoresIncludingHyperthreads();
int32 NumberOfHyperThreadsPerCore = HyperThreadCount / NumberOfCores;
CoreLimit = FMath::Max(NumberOfCores / NumProcesses, 1);
const TCHAR* CommandLine = FCommandLine::Get();
float CoreOversubscription = 1.0f;
if (FParse::Value(CommandLine, TEXT("-MPCookCoreSubscription="), CoreOversubscription))
{
CoreLimit = FMath::Clamp(static_cast<int32>(CoreLimit*CoreOversubscription), 1, NumberOfCores);
}
int32 CoreIncludingHyperthreadsLimit = CoreLimit * NumberOfHyperThreadsPerCore;
int32 NumberOfWorkers = FMath::Max(CoreLimit - 1, 1) * NumberOfHyperThreadsPerCore;
// Update the number of Cores and WorkerThreads for this process
check(IsInGameThread());
int32 NumBackgroundWorkers = FMath::Max(1,
NumberOfWorkers - FMath::Min<int32>(GNumForegroundWorkers, NumberOfWorkers));
int32 NumForegroundWorkers = FMath::Max(1, NumberOfWorkers - NumBackgroundWorkers);
LowLevelTasks::FScheduler::Get().RestartWorkers(NumForegroundWorkers, NumBackgroundWorkers);
// Update the number of ShaderCompilerWorkers that can be launched
GShaderCompilingManager->OnMachineResourcesChanged(CoreLimit, CoreIncludingHyperthreadsLimit);
UE_LOG(LogCook, Display, TEXT("CookMultiprocess changed number of cores from %d to %d."),
NumberOfCores, CoreLimit);
UE_LOG(LogCook, Display, TEXT("CookMultiprocess changed number of hyperthreads from %d to %d."),
HyperThreadCount, CoreIncludingHyperthreadsLimit);
}
/**
 * Accepts sockets waiting on the worker listen socket and, for each pending connection that has sent its first
 * packet, hands the socket to the FCookWorkerServer named by the FWorkerConnectMessage in that packet.
 * Connections that fail to read, send a different first message, or name an unknown or in-use RemoteIndex are
 * logged and dropped.
 *
 * @param TickThread The thread this tick is running on; forwarded to TryHandleConnectMessage.
 */
void FCookDirector::TickWorkerConnects(ECookDirectorThread TickThread)
{
	using namespace UE::CompactBinaryTCP;
	if (!WorkerConnectSocket)
	{
		return;
	}

	// Accept every connection currently waiting on the listen socket
	bool bReadReady;
	while (WorkerConnectSocket->HasPendingConnection(bReadReady) && bReadReady)
	{
		FSocket* WorkerSocket = WorkerConnectSocket->Accept(TEXT("Client Connection"));
		if (!WorkerSocket)
		{
			UE_LOG(LogCook, Warning, TEXT("Pending connection failed to create a ClientSocket."));
		}
		else
		{
			WorkerSocket->SetNonBlocking(true);
			PendingConnections.Add(FPendingConnection(WorkerSocket));
		}
	}

	for (TArray<FPendingConnection>::TIterator Iter(PendingConnections); Iter; ++Iter)
	{
		FPendingConnection& Conn = *Iter;
		TArray<FMarshalledMessage> Messages;
		EConnectionStatus Status;
		Status = TryReadPacket(Conn.Socket, Conn.Buffer, Messages);
		if (Status != EConnectionStatus::Okay)
		{
			UE_LOG(LogCook, Warning,
				TEXT("Pending connection failed before sending a WorkerPacket: %s"), DescribeStatus(Status));
			Iter.RemoveCurrent();
			// Conn no longer refers to this connection once it has been removed. Move on to the next connection
			// rather than falling through, which would use the stale reference and call RemoveCurrent again.
			continue;
		}
		if (Messages.Num() == 0)
		{
			// Nothing received yet; leave the connection pending
			continue;
		}

		// Take the connection out of the pending list. From here on LocalConn closes the socket when it goes
		// out of scope, unless the socket is detached below.
		FPendingConnection LocalConn(MoveTemp(Conn));
		Iter.RemoveCurrent();
		if (Messages[0].MessageType != FWorkerConnectMessage::MessageType)
		{
			UE_LOG(LogCook, Warning,
				TEXT("Pending connection sent a different message before sending a connection message. MessageType: %s. Connection will be ignored."),
				*Messages[0].MessageType.ToString());
			continue;
		}
		FWorkerConnectMessage Message;
		if (!Message.TryRead(MoveTemp(Messages[0].Object)))
		{
			UE_LOG(LogCook, Warning,
				TEXT("Pending connection sent an invalid Connection Message. Connection will be ignored."));
			continue;
		}

		TRefCountPtr<FCookWorkerServer> RemoteWorker;
		{
			FScopeLock CommunicationScopeLock(&CommunicationLock);
			TRefCountPtr<FCookWorkerServer>* RemoteWorkerPtr;
			RemoteWorkerPtr = RemoteWorkers.Find(Message.RemoteIndex);
			if (!RemoteWorkerPtr)
			{
				// Build the list of indexes that would have been accepted, for the warning message
				TStringBuilder<256> ValidIndexes;
				if (RemoteWorkers.Num())
				{
					RemoteWorkers.KeySort(TLess<>());
					for (TPair<int32, TRefCountPtr<FCookWorkerServer>>& Pair : RemoteWorkers)
					{
						ValidIndexes.Appendf(TEXT("%d,"), Pair.Key);
					}
					ValidIndexes.RemoveSuffix(1); // Remove the terminating comma
				}
				UE_LOG(LogCook, Warning,
					TEXT("Pending connection sent a Connection Message with invalid RemoteIndex %d. ValidIndexes = {%s}. Connection will be ignored."),
					Message.RemoteIndex, *ValidIndexes);
				continue;
			}
			RemoteWorker = *RemoteWorkerPtr;
		}

		// Detach the socket from LocalConn; if the worker rejects the connection the socket is closed here.
		FSocket* LocalSocket = LocalConn.DetachSocket();
		Messages.RemoveAt(0);
		if (!RemoteWorker->TryHandleConnectMessage(Message, LocalSocket, MoveTemp(Messages), TickThread))
		{
			UE_LOG(LogCook, Warning,
				TEXT("Pending connection sent a Connection Message with an already in-use RemoteIndex. Connection will be ignored."));
			Sockets::CloseSocket(LocalSocket);
			continue;
		}
	}
}
/**
 * Ticks every worker in ShuttingDownWorkers and removes those whose shutdown has completed. Entries whose Value
 * is not yet assigned are first passed through AbortWorker.
 *
 * @param TickThread The thread this tick is running on; forwarded to AbortWorker and TickCommunication.
 */
void FCookDirector::TickWorkerShutdowns(ECookDirectorThread TickThread)
{
	// Snapshot the shutting-down workers under the lock, separating entries whose Value has not been assigned yet
	TArray<TRefCountPtr<FCookWorkerServer>> UnabortedWorkers;
	TArray<TRefCountPtr<FCookWorkerServer>, TInlineAllocator<16>> WorkersToTick;
	{
		FScopeLock RemoteWorkersScopeLock(&CommunicationLock);
		for (TPair<FCookWorkerServer*, TRefCountPtr<FCookWorkerServer>>& Pair : ShuttingDownWorkers)
		{
			if (Pair.Value)
			{
				WorkersToTick.Add(Pair.Value);
			}
			else
			{
				UnabortedWorkers.Emplace(Pair.Key);
			}
		}
	}

	// Abort the workers that had no Value assigned, then tick them along with the rest
	for (FCookWorkerServer* Unaborted : UnabortedWorkers)
	{
		AbortWorker(Unaborted->GetWorkerId(), TickThread);
		{
			FScopeLock CommunicationScopeLock(&CommunicationLock);
			check(ShuttingDownWorkers.FindOrAdd(Unaborted).IsValid()); // Abort worker should have set the value
		}
		WorkersToTick.Emplace(Unaborted);
	}

	TArray<TRefCountPtr<FCookWorkerServer>, TInlineAllocator<16>> FinishedWorkers;
	for (TRefCountPtr<FCookWorkerServer>& Worker : WorkersToTick)
	{
		Worker->TickCommunication(TickThread);
		if (Worker->IsShutdownComplete())
		{
			FinishedWorkers.Add(Worker);
		}
	}
	WorkersToTick.Empty();

	if (!FinishedWorkers.IsEmpty())
	{
		FScopeLock CommunicationScopeLock(&CommunicationLock);
		for (TRefCountPtr<FCookWorkerServer>& Finished : FinishedWorkers)
		{
			ShuttingDownWorkers.Remove(Finished.GetReference());
		}
	}
	// The references held in FinishedWorkers are released here, after the lock scope above has ended
	FinishedWorkers.Empty();
}
/**
 * Builds the log filename for a CookWorker from the director's absolute log filename by inserting
 * "_Worker<ProfileId>" between the base filename and the extension.
 *
 * @param ProfileId The worker's profile id, written into the filename.
 * @return The worker's log filename, including the path of the director's log.
 */
FString FCookDirector::GetWorkerLogFileName(int32 ProfileId)
{
	FString DirectorLogFileName = FGenericPlatformOutputDevices::GetAbsoluteLogFilename();
	FStringView BaseFileName = FPathViews::GetBaseFilenameWithPath(DirectorLogFileName);
	FStringView Extension = FPathViews::GetExtension(DirectorLogFileName, true /* bIncludeDot */);
	// Both views are printed with %.*s so that each view's length bounds how many characters are read; a width
	// specifier (%*s) would only pad and would not limit the read.
	return FString::Printf(TEXT("%.*s_Worker%d%.*s"), BaseFileName.Len(), BaseFileName.GetData(), ProfileId,
		Extension.Len(), Extension.GetData());
}
/**
 * Builds the commandline used to launch a CookWorker process. The director's own commandline is copied, with
 * director-only and per-process switches removed, and the worker-specific switches are added.
 *
 * @param WorkerId Identifies the worker; its multiprocess id is passed as -MultiprocessId.
 * @param ProfileId The worker's profile id; passed as -CookProfileId and used in the log and trace filenames.
 * @return The joined commandline, with any token containing whitespace (and no quote) wrapped in quotes.
 */
FString FCookDirector::GetWorkerCommandLine(FWorkerId WorkerId, int32 ProfileId)
{
	const TCHAR* CommandLine = FCommandLine::Get();

	const TCHAR* ProjectName = FApp::GetProjectName();
	checkf(ProjectName && ProjectName[0], TEXT("Expected UnrealEditor to be running with a non-empty project name"));

	// Note that we need to handle quoted strings for e.g. a projectfile with spaces in it; FParse::Token
	// does handle them
	FString Token;
	TArray<FString> Tokens;
	bool bAssetRegistryCacheWritingDisabled = false;
	while (FParse::Token(CommandLine, Token, false /* bUseEscape */))
	{
		if (Token.IsEmpty())
		{
			continue;
		}
		// Switches dropped from the worker's commandline; several of them are added back below with
		// worker-specific values
		if (Token.StartsWith(TEXT("-run=")) ||
			Token == TEXT("-CookOnTheFly") ||
			Token == TEXT("-CookWorker") ||
			Token.StartsWith(TEXT("-CookCultures")) ||
			Token.StartsWith(TEXT("-CookDirectorHost=")) ||
			Token.StartsWith(TEXT("-MultiprocessId=")) ||
			Token.StartsWith(TEXT("-CookProfileId=")) ||
			Token.StartsWith(TEXT("-ShowCookWorker")) ||
			Token.StartsWith(TEXT("-CoreLimit")) ||
			Token.StartsWith(TEXT("-PhysicalCoreLimit")) ||
			Token.StartsWith(TEXT("-MPCookCoreSubscription")) ||
			Token.StartsWith(TEXT("-CookProcessCount=")) ||
			Token.StartsWith(TEXT("-abslog=")) ||
			Token.StartsWith(TEXT("-unattended"))
			)
		{
			continue;
		}
		else if (Token.StartsWith(TEXT("-tracefile=")))
		{
			// Give the worker its own trace file by inserting _Worker<ProfileId> before the extension.
			// If no filename can be parsed, the token is passed through unchanged below.
			FString TraceFile;
			if (FParse::Value(*Token, TEXT("-tracefile="), TraceFile) && !TraceFile.IsEmpty())
			{
				FStringView BaseFilenameWithPath = FPathViews::GetBaseFilenameWithPath(TraceFile);
				FStringView Extension = FPathViews::GetExtension(TraceFile, true /* bIncludeDot */);
				Tokens.Add(FString::Printf(TEXT("-tracefile=\"%.*s_Worker%d%.*s\""),
					BaseFilenameWithPath.Len(), BaseFilenameWithPath.GetData(),
					ProfileId,
					Extension.Len(), Extension.GetData()));
				continue;
			}
		}
		else if (Token == TEXT("-NoAssetRegistryCacheWrite"))
		{
			// If cache writing has been disabled, we must also disable reading of the discovery cache on workers.
			// Otherwise workers could be looking at old discovery results that no longer match what's on disk.
			Tokens.Add(TEXT("-NoAssetRegistryDiscoveryCache"));
			bAssetRegistryCacheWritingDisabled = true;
		}
		Tokens.Add(MoveTemp(Token));
	}

	// Make sure the first token identifies the project. Tokens can be empty if every token on the director's
	// commandline was filtered out above; in that case the project file path is inserted as well, which also
	// guarantees the fixed-position inserts below have a first element to follow.
	const bool bFirstTokenIsProject = !Tokens.IsEmpty()
		&& (Tokens[0] == ProjectName || Tokens[0].EndsWith(TEXT(".uproject"), ESearchCase::IgnoreCase));
	if (!bFirstTokenIsProject)
	{
		FString ProjectFilePath = FPaths::GetProjectFilePath();
		if (Tokens.IsEmpty() || !FPaths::IsSamePath(Tokens[0], ProjectFilePath))
		{
			Tokens.Insert(ProjectFilePath, 0);
		}
	}
	Tokens.Insert(TEXT("-run=cook"), 1);
	Tokens.Insert(TEXT("-cookworker"), 2);
	Tokens.Insert(FString::Printf(TEXT("-CookProfileId=%d"), ProfileId), 3);
	Tokens.Insert(FString::Printf(TEXT("-MultiprocessId=%d"), WorkerId.GetMultiprocessId()), 4);
	// This should have been constructed in TryCreateWorkerConnectSocket before any CookWorkerServers could exist to
	// call GetWorkerCommandLine
	check(!WorkerConnectAuthority.IsEmpty());
	Tokens.Add(FString::Printf(TEXT("-CookDirectorHost=%s"), *WorkerConnectAuthority));
	Tokens.Add(TEXT("-unattended"));
	Tokens.Add(FString::Printf(TEXT("-abslog=%s"), *GetWorkerLogFileName(ProfileId)));

	IAssetRegistry& AssetRegistry = IAssetRegistry::GetChecked();
	if (AssetRegistry.HasSerializedDiscoveryCache())
	{
		UE_CLOG(bAssetRegistryCacheWritingDisabled, LogCook, Warning, TEXT("Invalid configuration: The Cook Director has written an AssetRegistry discovery cache when explicitly told not to via -NoAssetRegistryCacheWrite"));
		if (!bAssetRegistryCacheWritingDisabled)
		{
			// If the director was successful in writing a discovery cache, let workers re-use the director's cache
			Tokens.Add(TEXT("-AssetRegistryCacheSkipInvalidate"));
		}
	}

	if (CoreLimit > 0)
	{
		Tokens.Add(FString::Printf(TEXT("-PhysicalCoreLimit=%d"), CoreLimit));
	}

	// We are joining the tokens back into a commandline string; wrap tokens with whitespace in quotes
	for (FString& IterToken : Tokens)
	{
		int32 IndexOfWhitespace = UE::String::FindFirstOfAnyChar(IterToken, { ' ', '\r', '\n' });
		if (IndexOfWhitespace != INDEX_NONE)
		{
			// Tokens that already contain a quote are left as they are
			int32 IndexOfQuote;
			if (!IterToken.FindChar('\"', IndexOfQuote))
			{
				IterToken = FString::Printf(TEXT("\"%s\""), *IterToken);
			}
		}
	}
	return FString::Join(Tokens, TEXT(" "));
}
bool FDirectorConnectionInfo::TryParseCommandLine()
{
if (!FParse::Value(FCommandLine::Get(), TEXT("-CookDirectorHost="), HostURI))
{
UE_LOG(LogCook, Error, TEXT("CookWorker startup failed: no CookDirector specified on commandline."));
return false;
}
uint32 MultiprocessId = UE::GetMultiprocessId();
if (MultiprocessId == 0 && !FParse::Value(FCommandLine::Get(), TEXT("-MultiprocessId="), MultiprocessId))
{
UE_LOG(LogCook, Error, TEXT("CookWorker startup failed: no MultiprocessId specified on commandline."));
return false;
}
if (MultiprocessId < 1 || 257 <= MultiprocessId)
{
UE_LOG(LogCook, Error,
TEXT("CookWorker startup failed: commandline had invalid -MultiprocessId=%d; MultiprocessId must be in the range [1, 256]."),
MultiprocessId);
return false;
}
RemoteIndex = static_cast<int32>(MultiprocessId - 1);
return true;
}
/**
 * Assigns Requests to workers using the algorithm selected by LoadBalanceAlgorithm.
 *
 * @param SortedWorkers The workers that can receive assignments.
 * @param Requests The packages to assign.
 * @param RequestGraph Forwarded by move to the selected algorithm.
 * @param OutAssignments Reset with capacity for Requests.Num() and then filled in by the selected algorithm.
 */
void FCookDirector::LoadBalance(TConstArrayView<FWorkerId> SortedWorkers, TArrayView<FPackageData*> Requests,
	TMap<FPackageData*, TArray<FPackageData*>>&& RequestGraph, TArray<FWorkerId>& OutAssignments)
{
	OutAssignments.Reset(Requests.Num());
	const bool bLogResults = bIsFirstAssignment;

	if (LoadBalanceAlgorithm == ELoadBalanceAlgorithm::Striped)
	{
		LoadBalanceStriped(SortedWorkers, Requests, MoveTemp(RequestGraph), OutAssignments, bLogResults);
		return;
	}
	if (LoadBalanceAlgorithm != ELoadBalanceAlgorithm::CookBurden)
	{
		// Unknown algorithm value; report it and fall back to CookBurden
		checkNoEntry();
	}
	LoadBalanceCookBurden(SortedWorkers, Requests, MoveTemp(RequestGraph), OutAssignments, bLogResults);
}
/**
 * Removes a remote worker from RemoteWorkers, aborts its assignments and the worker itself, and records it in
 * ShuttingDownWorkers. The packages it held are reassigned immediately when called from the scheduler thread,
 * and otherwise appended to DeferredPackagesToReassign. Does nothing if the worker is not in RemoteWorkers.
 *
 * @param WorkerId The remote worker to abort; must not be the local worker.
 * @param TickThread The thread making the call.
 */
void FCookDirector::AbortWorker(FWorkerId WorkerId, ECookDirectorThread TickThread)
{
	check(!WorkerId.IsLocal());
	const int32 RemoteIndex = WorkerId.GetRemoteIndex();
	const bool bOnSchedulerThread = TickThread == ECookDirectorThread::SchedulerThread;

	TRefCountPtr<FCookWorkerServer> AbortedWorker;
	{
		FScopeLock RemoteWorkersScopeLock(&CommunicationLock);
		RemoteWorkers.RemoveAndCopyValue(RemoteIndex, AbortedWorker);
		if (!AbortedWorker)
		{
			return;
		}
	}

	TSet<FPackageData*> PackagesToReassignSet;
	AbortedWorker->AbortAllAssignments(PackagesToReassignSet, TickThread, HeartbeatNumber);
	if (!AbortedWorker->IsShuttingDown())
	{
		AbortedWorker->AbortWorker(PackagesToReassignSet, TickThread, HeartbeatNumber);
	}

	TArray<FPackageData*> PackagesToReassign = PackagesToReassignSet.Array();
	if (bOnSchedulerThread)
	{
		ReassignAbortedPackages(PackagesToReassign);
	}

	{
		FScopeLock RemoteWorkersScopeLock(&CommunicationLock);
		TRefCountPtr<FCookWorkerServer>& ShutdownSlot = ShuttingDownWorkers.FindOrAdd(AbortedWorker.GetReference());
		// We should not be able to abort a worker twice because we removed it from RemoteWorkers above
		check(!ShutdownSlot);
		ShutdownSlot = MoveTemp(AbortedWorker);
		if (!bOnSchedulerThread)
		{
			// Not on the scheduler thread, so the packages were not reassigned above; queue them instead
			DeferredPackagesToReassign.Append(PackagesToReassign);
		}
	}
}
/**
 * Clears the worker assignment on each package and sends it to SaveActive if it has the Saving state property,
 * or to Request otherwise.
 *
 * @param PackagesToReassign The packages to reassign; emptied before returning.
 */
void FCookDirector::ReassignAbortedPackages(TArray<FPackageData*>& PackagesToReassign)
{
	for (FPackageData* Package : PackagesToReassign)
	{
		// Packages that were assigned to a worker should be in the AssignedToWorker state and therefore in progress.
		check(Package->IsInProgress());
		Package->SetWorkerAssignment(FWorkerId::Invalid());

		EPackageState TargetState = EPackageState::Request;
		if (Package->IsInStateProperty(EPackageStateProperty::Saving))
		{
			TargetState = EPackageState::SaveActive;
		}
		Package->SendToState(TargetState, ESendFlags::QueueAddAndRemove, EStateChangeReason::ReassignAbortedPackages);
	}
	PackagesToReassign.Empty();
}
/**
 * Initializes the values that InitializeWorkers sets up before it launches the communication thread: the
 * commandlet executable path and the InitialConfigMessage, including the startup messages produced by each
 * registered collector.
 */
void FCookDirector::ConstructReadonlyThreadVariables()
{
	IsCookIgnoreTimeouts(); // The global variables are read-only; call them now to initialize them
	CommandletExecutablePath = FUnrealEdMisc::Get().GetProjectEditorBinaryPath();

	InitialConfigMessage = MakeUnique<FInitialConfigMessage>();
	FMPCollectorServerTickContext StartupContext(FMPCollectorServerTickContext::EServerEventType::WorkerStartup);
	StartupContext.Platforms = COTFS.PlatformManager->GetSessionPlatforms();
	InitialConfigMessage->ReadFromLocal(COTFS, StartupContext.Platforms,
		*COTFS.CookByTheBookOptions, *COTFS.CookOnTheFlyOptions, BeginCookContext);

	// Tick each collector with the startup context and move whatever messages it produced into the config message
	for (const TPair<FGuid, TRefCountPtr<IMPCollector>>& CollectorPair : Collectors)
	{
		IMPCollector* Collector = CollectorPair.Value.GetReference();
		Collector->ServerTick(StartupContext);
		if (StartupContext.Messages.IsEmpty())
		{
			continue;
		}

		const FGuid MessageType = Collector->GetMessageType();
		for (FCbObject& MessageObject : StartupContext.Messages)
		{
			InitialConfigMessage->AddMessage({ MessageType, MoveTemp(MessageObject) });
		}
		StartupContext.Messages.Reset();
	}
}
/** Returns the InitialConfigMessage that ConstructReadonlyThreadVariables created. */
const FInitialConfigMessage& FCookDirector::GetInitialConfigMessage()
{
	const FInitialConfigMessage& ConfigMessage = *InitialConfigMessage;
	return ConfigMessage;
}
/**
 * Collects the values needed to launch a CookWorker process: the show-worker option, the commandlet executable
 * path, and the worker's commandline.
 */
FCookDirector::FLaunchInfo FCookDirector::GetLaunchInfo(FWorkerId WorkerId, int32 ProfileId)
{
	FLaunchInfo LaunchInfo;
	LaunchInfo.ShowWorkerOption = GetShowWorkerOption();
	LaunchInfo.CommandletExecutable = CommandletExecutablePath;
	LaunchInfo.WorkerCommandLine = GetWorkerCommandLine(WorkerId, ProfileId);
	return LaunchInfo;
}
#if ENABLE_COOK_STATS
/**
 * Reports the idle time of the local worker and of every remote worker profile under the "CookDirector" stat
 * name, each formatted as seconds with one decimal place.
 */
void FCookDirector::LogCookStats(FCookStatsManager::AddStatFuncRef AddStat)
{
	TArray<FCookStatsManager::StringKeyValue> Stats;
	Stats.Emplace(TEXT("LocalWorker IdleTime"),
		FString::Printf(TEXT("%.1fs"), LocalWorkerProfileData->IdleTimeSeconds));

	const int32 NumRemoteProfiles = RemoteWorkerProfileDatas.Num();
	for (int32 ProfileId = 0; ProfileId < NumRemoteProfiles; ++ProfileId)
	{
		const float IdleTimeSeconds = RemoteWorkerProfileDatas[ProfileId].IdleTimeSeconds;
		Stats.Emplace(FString::Printf(TEXT("CookWorker %d IdleTime"), ProfileId),
			FString::Printf(TEXT("%.1fs"), IdleTimeSeconds));
	}
	AddStat(TEXT("CookDirector"), Stats);
}
#endif
/** Stores a reference to the director that this handler works on behalf of. */
FCookDirector::FRetractionHandler::FRetractionHandler(FCookDirector& InDirector)
	: Director(InDirector)
{
}
/**
 * Looks for idle workers and for the busiest worker available for retraction, and starts a retraction from the
 * busiest worker when there is an idle worker and the busiest has at least RetractionMinimumNumAssignments.
 *
 * @param bOutAnyIdle Set to whether any worker has zero assignments.
 * @param OutBusiestNumAssignments Set to the assignment count of the busiest worker available for retraction.
 * @return WaitingForResponse if a retraction was started, WantToRetract if none is currently possible.
 */
FCookDirector::FRetractionHandler::ERetractionState FCookDirector::FRetractionHandler::TickWantToRetract(
	bool& bOutAnyIdle, int32& OutBusiestNumAssignments)
{
	FWorkerId BusiestWorker;
	TArray<FWorkerId> IdleWorkers;
	int32 BusiestNumAssignments = 0;

	// Records one worker's load: it becomes the busiest candidate if it has more assignments than the current
	// candidate and is available for retraction, and it is listed as idle if it has no assignments.
	auto ConsiderWorker = [this, &BusiestWorker, &IdleWorkers, &BusiestNumAssignments]
	(const FWorkerId& WorkerId, int32 NumAssignments)
	{
		if (NumAssignments > BusiestNumAssignments && IsAvailableForRetraction(WorkerId))
		{
			BusiestWorker = WorkerId;
			BusiestNumAssignments = NumAssignments;
		}
		if (NumAssignments == 0)
		{
			IdleWorkers.Add(WorkerId);
		}
	};

	if (Director.bAllowLocalCooks)
	{
		ConsiderWorker(FWorkerId::Local(), Director.COTFS.NumMultiprocessLocalWorkerAssignments());
	}
	TArray<TRefCountPtr<FCookWorkerServer>> LocalRemoteWorkers = Director.CopyRemoteWorkers();
	for (const TRefCountPtr<FCookWorkerServer>& RemoteWorker : LocalRemoteWorkers)
	{
		const int32 NumAssignments = RemoteWorker->NumAssignments();
		ConsiderWorker(RemoteWorker->GetWorkerId(), NumAssignments);
	}

	bOutAnyIdle = !IdleWorkers.IsEmpty();
	OutBusiestNumAssignments = BusiestNumAssignments;
	if (IdleWorkers.IsEmpty() || BusiestNumAssignments < RetractionMinimumNumAssignments)
	{
		// Worker loads changed after the point where we decided to initialize the RetractionHandler,
		// or all workers with packages assigned are unavailable for retraction, so retraction is not
		// currently possible. Try again later.
		return ERetractionState::WantToRetract;
	}
	check(!BusiestWorker.IsInvalid());

	// Plan to divide the assignments evenly between all idle workers and the one busiest worker. This means
	// retracting all but 1/(N+1) packages from the busiest worker.
	const int32 NumIdle = IdleWorkers.Num();
	const int32 NumAssignmentsToRetract = (BusiestNumAssignments * NumIdle) / (NumIdle + 1);

	TStringBuilder<256> IdleWorkerListText;
	for (FWorkerId& IdleWorker : IdleWorkers)
	{
		IdleWorkerListText << Director.GetDisplayName(IdleWorker) << TEXT(", ");
	}
	IdleWorkerListText.RemoveSuffix(2); // Remove the trailing ", "
	UE_LOG(LogCook, Display,
		TEXT("Idle CookWorkers: { %s }. Retracting %d packages from %s to distribute to the idle CookWorkers."),
		*IdleWorkerListText, NumAssignmentsToRetract, *Director.GetDisplayName(BusiestWorker));
	Director.DisplayRemainingPackages();

	if (!BusiestWorker.IsLocal())
	{
		// Remote worker: send the request and record when it was sent
		TRefCountPtr<FCookWorkerServer>* RemoteWorker = LocalRemoteWorkers.FindByPredicate(
			[&BusiestWorker](const TRefCountPtr<FCookWorkerServer>& X) { return X->GetWorkerId() == BusiestWorker; });
		check(RemoteWorker);
		FRetractionRequestMessage Message;
		Message.RequestedCount = NumAssignmentsToRetract;
		(*RemoteWorker)->SendMessage(Message, ECookDirectorThread::SchedulerThread);
		ExpectedWorker = BusiestWorker;
		MessageSentTimeSeconds = FPlatformTime::Seconds();
		LastWarnTimeSeconds = MessageSentTimeSeconds;
	}
	else
	{
		// Local worker: the packages to retract are collected right away, so the results are recorded here
		ExpectedWorker = FWorkerId::Local();
		WorkerWithResults = ExpectedWorker;
		TArray<FName> LocalPackagesToRetract;
		Director.COTFS.GetPackagesToRetract(NumAssignmentsToRetract, LocalPackagesToRetract);
		PackagesToRetract.FindOrAdd(ExpectedWorker).Append(MoveTemp(LocalPackagesToRetract));
	}
	return ERetractionState::WaitingForResponse;
}
/** Records FromWorker as the ExpectedWorker. */
void FCookDirector::FRetractionHandler::InitializeForResultsMessage(const FWorkerId& FromWorker)
{
	ExpectedWorker = FromWorker;
}
void FCookDirector::FRetractionHandler::TickFromSchedulerThread(bool bAllWorkersConnected, bool bAnyIdle,
int32 BusiestNumAssignments)
{
bool bHadStateChange;
int32 NumTransitions = 0;
constexpr int32 MaxNumTransitions = static_cast<int32>(ERetractionState::Count);
do
{
bHadStateChange = false;
switch (RetractionState)
{
case ERetractionState::Idle:
{
if (!bAnyIdle || !bAllWorkersConnected || BusiestNumAssignments <= RetractionMinimumNumAssignments)
{
break;
}
SetRetractionState(ERetractionState::WantToRetract, bHadStateChange);
break;
}
case ERetractionState::WantToRetract:
{
if (!bAnyIdle || !bAllWorkersConnected || BusiestNumAssignments <= RetractionMinimumNumAssignments)
{
SetRetractionState(ERetractionState::Idle, bHadStateChange);
break;
}
ERetractionState NewState = TickWantToRetract(bAnyIdle, BusiestNumAssignments);
SetRetractionState(NewState, bHadStateChange);
break;
}
case ERetractionState::WaitingForResponse:
{
ERetractionState NewState = TickWaitingForResponse();
SetRetractionState(NewState, bHadStateChange);
break;
}
default:
checkNoEntry();
break;
}
} while (bHadStateChange
&& RetractionState != ERetractionState::Idle
&& ++NumTransitions <= MaxNumTransitions);
}
/**
 * Switches to NewState. Entering Idle also empties WorkersUnavailableForRetract.
 *
 * @param NewState The state to move to.
 * @param bOutHadStateChange Set to true if NewState differs from the current state, false otherwise.
 */
void FCookDirector::FRetractionHandler::SetRetractionState(ERetractionState NewState, bool &bOutHadStateChange)
{
	bOutHadStateChange = RetractionState != NewState;
	if (!bOutHadStateChange)
	{
		return;
	}

	RetractionState = NewState;
	if (RetractionState == ERetractionState::Idle)
	{
		WorkersUnavailableForRetract.Empty();
	}
}
bool FCookDirector::FRetractionHandler::IsAvailableForRetraction(const FWorkerId& WorkerId)
{
// Called from inside CommunicationLock
int32* AssignedPackagesFence = WorkersUnavailableForRetract.Find(WorkerId);
if (!AssignedPackagesFence)
{
return true;
}
int32 CurrentFenceMarker;
if (WorkerId.IsLocal())
{
CurrentFenceMarker = Director.COTFS.PackageDatas->GetMonitor().GetMPCookAssignedFenceMarker();
}
else
{
const TRefCountPtr<FCookWorkerServer>* RemoteWorker = Director.FindRemoteWorkerInLock(WorkerId);
if (!RemoteWorker)
{
WorkersUnavailableForRetract.Remove(WorkerId);
return true;
}
CurrentFenceMarker = (*RemoteWorker)->GetPackagesAssignedFenceMarker();
}
if (*AssignedPackagesFence == CurrentFenceMarker)
{
// FenceMarker has not changed since we recorded the worker as unavailable for retraction at that fence marker
// The worker is still unavailable for retraction
return false;
}
WorkersUnavailableForRetract.Remove(WorkerId);
return true;
}
/**
 * Tick function for the WaitingForResponse state.
 *
 * - If the wait was cancelled (ExpectedWorker is invalid), returns Idle.
 * - If no results message has arrived yet, keeps waiting; at most once every 60 seconds it either gives up
 *   (the expected remote worker can no longer be found) or logs a reminder that the worker has not responded.
 * - Once results have arrived, marks the returned packages as aborted on the workers that sent them, reassigns
 *   them, records the sending worker as unavailable for further retraction if nothing could be reassigned,
 *   clears the waiting state and returns WantToRetract.
 *
 * @return The retraction state the handler should be in after this tick.
 */
FCookDirector::FRetractionHandler::ERetractionState FCookDirector::FRetractionHandler::TickWaitingForResponse()
{
	// Called from inside CommunicationLock

	// An invalid ExpectedWorker means we decided to cancel.
	if (ExpectedWorker.IsInvalid())
	{
		checkf(PackagesToRetract.IsEmpty(), TEXT("We should not have any packages when we cancelled."));
		return ERetractionState::Idle;
	}

	// No worker has delivered results yet.
	if (WorkerWithResults.IsInvalid())
	{
		constexpr float SecondsBetweenWarnings = 60.f;
		const double NowSeconds = FPlatformTime::Seconds();
		const float SecondsSinceLastWarning = static_cast<float>(NowSeconds - LastWarnTimeSeconds);
		if (SecondsSinceLastWarning < SecondsBetweenWarnings)
		{
			return ERetractionState::WaitingForResponse;
		}

		check(ExpectedWorker.IsRemote());
		if (Director.FindRemoteWorkerInLock(ExpectedWorker) == nullptr)
		{
			// The CookWorker aborted and we already reassigned all of its packages; stop waiting for a
			// retraction message from it.
			check(PackagesToRetract.IsEmpty()); // Otherwise WorkerWithResults would have been set
			ExpectedWorker = FWorkerId::Invalid();
			return ERetractionState::Idle;
		}

		UE_CLOG(!IsCookIgnoreTimeouts(), LogCook, Display,
			TEXT("%s has not responded to a RetractionRequest message for %.1f seconds. Continuing to wait..."),
			*Director.GetDisplayName(ExpectedWorker), static_cast<float>(NowSeconds - MessageSentTimeSeconds));
		LastWarnTimeSeconds = NowSeconds;
		return ERetractionState::WaitingForResponse;
	}

	// Results have arrived: resolve the returned package names to FPackageData and gather them across every
	// CookWorker that sent a message.
	TArray<FPackageData*> PackageDatasToReassign;
	for (const TPair<FWorkerId, TArray<FName>>& WorkerAndNames : PackagesToRetract)
	{
		const FWorkerId& SenderId = WorkerAndNames.Key;
		const TArray<FName>& ReturnedNames = WorkerAndNames.Value;

		TRefCountPtr<FCookWorkerServer> SenderServer;
		if (SenderId.IsRemote())
		{
			if (const TRefCountPtr<FCookWorkerServer>* FoundServer = Director.FindRemoteWorkerInLock(SenderId))
			{
				SenderServer = *FoundServer;
			}
		}

		// Names without a matching FPackageData are dropped.
		TArray<FPackageData*> ReturnedPackageDatas;
		ReturnedPackageDatas.Reserve(ReturnedNames.Num());
		for (FName ReturnedName : ReturnedNames)
		{
			if (FPackageData* Found = Director.COTFS.PackageDatas->FindPackageDataByPackageName(ReturnedName))
			{
				ReturnedPackageDatas.Add(Found);
			}
		}

		if (SenderServer)
		{
			// The worker(s) that sent the retraction message aborted all of the packages, so mark locally
			// that they have been aborted
			SenderServer->AbortAssignments(ReturnedPackageDatas, ECookDirectorThread::SchedulerThread,
				Director.HeartbeatNumber, ENotifyRemote::LocalOnly);
		}
		PackageDatasToReassign.Append(MoveTemp(ReturnedPackageDatas));
	}

	// Reassign the packages. If none could be reassigned, remember the worker's current fence marker so that
	// it is treated as unavailable for retraction until that marker changes.
	if (ReassignPackages(WorkerWithResults, PackageDatasToReassign) == ERetractionResult::NoneAvailable)
	{
		TOptional<int32> FenceAtFailure;
		if (!WorkerWithResults.IsLocal())
		{
			if (const TRefCountPtr<FCookWorkerServer>* Server = Director.FindRemoteWorkerInLock(WorkerWithResults))
			{
				FenceAtFailure.Emplace((*Server)->GetPackagesAssignedFenceMarker());
			}
		}
		else
		{
			FenceAtFailure.Emplace(Director.COTFS.PackageDatas->GetMonitor().GetMPCookAssignedFenceMarker());
		}

		if (FenceAtFailure.IsSet())
		{
			WorkersUnavailableForRetract.Add(WorkerWithResults, *FenceAtFailure);
		}
	}

	// Mark that we are no longer waiting
	ExpectedWorker = FWorkerId::Invalid();
	WorkerWithResults = FWorkerId::Invalid();
	PackagesToRetract.Empty();

	// Return to WantToRetract state; that state will handle returning to idle if the retraction was sufficient
	return ERetractionState::WantToRetract;
}
/**
 * Record the packages returned by a CookWorker in a RetractionResultsMessage.
 *
 * A message that failed to read is logged as an error and ignored. A message that arrives while the handler is
 * not in the WaitingForResponse state is logged as a warning; the handler then calls InitializeForResultsMessage
 * for the sender and switches to WaitingForResponse. If results from a worker are already pending, an error is
 * logged and the new results are merged with the pending ones.
 *
 * @param Context Identifies the sending worker (profile id for logging, worker id for bookkeeping).
 * @param bReadSuccessful False if the message could not be read; the message is then ignored.
 * @param Message The received results. Its ReturnedPackages array is moved out of, so it is left empty.
 */
void FCookDirector::FRetractionHandler::HandleRetractionMessage(FMPCollectorServerMessageContext& Context,
	bool bReadSuccessful, FRetractionResultsMessage&& Message)
{
	// Called from inside CommunicationLock
	if (!bReadSuccessful)
	{
		UE_LOG(LogCook, Error,
			TEXT("Corrupt RetractionResultsMessage received from CookWorker %d. It will be ignored and packages may fail to cook."),
			Context.GetProfileId());
		return;
	}
	if (RetractionState != ERetractionState::WaitingForResponse)
	{
		UE_LOG(LogCook, Warning, TEXT("Retractionmessage received from CookWorker %d when we were not expecting one."),
			Context.GetProfileId());
		InitializeForResultsMessage(Context.GetWorkerId());
		bool bUnusedHadStateChange;
		SetRetractionState(ERetractionState::WaitingForResponse, bUnusedHadStateChange);
	}
	UE_CLOG(WorkerWithResults.IsValid(), LogCook, Error,
		TEXT("Unexpectedly received RetractionResults message from multiple CookWorkers. Merging the results."));
	WorkerWithResults = Context.GetWorkerId();
	// Message is an rvalue reference that we own; move the names instead of copying them.
	PackagesToRetract.FindOrAdd(WorkerWithResults).Append(MoveTemp(Message.ReturnedPackages));
}
/**
 * Redistribute packages that were retracted from FromWorker.
 *
 * Packages that are in the Idle state, and packages whose worker-assignment constraint names a remote worker
 * that is not in the current copy of the remote workers, are skipped. Every remaining package has its worker
 * assignment cleared and is then handled in one of two ways:
 *  - If CalculateWorkersToSplitOver returns no workers, the packages are sent back to the Request state and
 *    added to the request queue's restarted requests (unless unstalling put them back into SaveActive).
 *  - Otherwise they are assigned through Director.AssignRequests across the selected workers plus any workers
 *    required by a constraint, and each package is moved to the state matching its new assignment.
 *
 * @param FromWorker The worker the packages were retracted from; used in log messages and excluded from the
 *                   candidates by CalculateWorkersToSplitOver.
 * @param Packages The retracted packages to consider for reassignment.
 * @return NoneAvailable if no package was eligible for reassignment, otherwise Retracted.
 */
FCookDirector::FRetractionHandler::ERetractionResult
FCookDirector::FRetractionHandler::ReassignPackages(const FWorkerId& FromWorker,
TConstArrayView<FPackageData*> Packages)
{
TArray<TRefCountPtr<FCookWorkerServer>> LocalRemoteWorkers = Director.CopyRemoteWorkers();
TArray<FWorkerId> WorkersRequiredByConstraint;
TArray<FPackageData*> AssignmentPackages;
// Filter the retracted packages down to the ones that can be reassigned.
for (FPackageData* PackageData : Packages)
{
EPackageState State = PackageData->GetState();
if (State == EPackageState::Idle)
{
// Idle packages are not reassigned.
continue;
}
FWorkerId WorkerConstraint = PackageData->GetWorkerAssignmentConstraint();
if (WorkerConstraint.IsValid())
{
// Skip the package if it is constrained to a remote worker that is not in LocalRemoteWorkers.
if (!WorkerConstraint.IsLocal() && !LocalRemoteWorkers.FindByPredicate(
[&WorkerConstraint](const TRefCountPtr<FCookWorkerServer>& X)
{
return X->GetWorkerId() == WorkerConstraint;
}))
{
continue;
}
// Remember the constrained worker; it is added to the list of workers to split over further below.
WorkersRequiredByConstraint.AddUnique(WorkerConstraint);
}
AssignmentPackages.Add(PackageData);
// The package is no longer assigned to the worker, and the worker already knows it, so remove its assignment.
// We have to remove the assignment so that we can reassign it down below; it is invalid to call
// SetWorkerAssignment to a new worker without clearing the old worker first.
PackageData->SetWorkerAssignment(FWorkerId::Invalid());
}
if (AssignmentPackages.IsEmpty())
{
UE_LOG(LogCook, Display,
TEXT("Retraction results message received from %s; no packages were available for retraction."),
*Director.GetDisplayName(FromWorker));
Director.DisplayRemainingPackages();
return ERetractionResult::NoneAvailable;
}
TArray<FWorkerId> WorkersToSplitOver = CalculateWorkersToSplitOver(AssignmentPackages.Num(), FromWorker,
LocalRemoteWorkers);
if (WorkersToSplitOver.IsEmpty())
{
// Send the packages back to the Director for reassignment
TPackageDataMap<ESuppressCookReason>& RestartedRequests = Director.COTFS.PackageDatas->GetRequestQueue().GetRestartedRequests();
for (FPackageData* PackageData : AssignmentPackages)
{
// UnStall the package to handle the case where it was previously in the save state on the LocalWorker
// and was retracted to a remote worker, so we stalled it, but now we've pulled it back to the LocalWorker.
PackageData->UnStall(ESendFlags::QueueAddAndRemove);
// If the packagedata was unstalled back into the Save State, then keep it there, but otherwise it is in
// the AssignedToWorker state so kick it back to the Request state and record it as a restarted request.
if (PackageData->GetState() != EPackageState::SaveActive)
{
PackageData->SendToState(EPackageState::Request, ESendFlags::QueueRemove, EStateChangeReason::Retraction);
RestartedRequests.Add(PackageData, ESuppressCookReason::RetractedByCookDirector);
}
}
// MPCOOKTODO: Add a method to PumpRequests long enough to assign the packages
UE_LOG(LogCook, Display,
TEXT("%d packages retracted from %s. No workers are currently idle so the packages were assigned evenly to all CookWorkers."),
AssignmentPackages.Num(), *Director.GetDisplayName(FromWorker));
Director.DisplayRemainingPackages();
return ERetractionResult::Retracted;
}
// Workers named by a package's assignment constraint must be among the assignment targets.
for (const FWorkerId& WorkerId : WorkersRequiredByConstraint)
{
WorkersToSplitOver.AddUnique(WorkerId);
}
// Build a comma-separated list of the target workers' display names for the log message.
TStringBuilder<256> WorkerListText;
for (const FWorkerId& WorkerId : WorkersToSplitOver)
{
WorkerListText << Director.GetDisplayName(WorkerId) << TEXT(", ");
}
// Drop the trailing ", " separator.
WorkerListText.RemoveSuffix(2);
UE_LOG(LogCook, Display, TEXT("%d packages retracted from %s and distributed to idle workers { %s }."),
AssignmentPackages.Num(), *Director.GetDisplayName(FromWorker), *WorkerListText);
// An empty request graph is passed to AssignRequests.
TMap<FPackageData*, TArray<FPackageData*>> RequestGraph;
// The loop below reads Assignments[Index] as the worker chosen for AssignmentPackages[Index].
TArray<FWorkerId> Assignments;
Director.AssignRequests(MoveTemp(WorkersToSplitOver), LocalRemoteWorkers, AssignmentPackages, Assignments,
MoveTemp(RequestGraph), false /* bInitialAssignment */);
FRequestQueue& RequestQueue = Director.COTFS.PackageDatas->GetRequestQueue();
bool bAssignedToLocal = false;
for (int32 Index = 0; Index < AssignmentPackages.Num(); ++Index)
{
FPackageData* PackageData = AssignmentPackages[Index];
FWorkerId Assignment = Assignments[Index];
if (Assignment.IsInvalid())
{
// No worker was chosen for this package; demote it to idle with an assignment-error reason.
Director.COTFS.DemoteToIdle(*PackageData, ESendFlags::QueueAdd,
ESuppressCookReason::MultiprocessAssignmentError);
}
else if (Assignment.IsLocal())
{
// UnStall the package to handle the case where it was previously in the save state on the LocalWorker
// and was retracted to a remote worker, so we stalled it, but now we've pulled it back to the LocalWorker.
PackageData->UnStall(ESendFlags::QueueAddAndRemove);
// If the packagedata was unstalled back into the Save State, then keep it there, but otherwise it is in
// the AssignedToWorker state so kick it back to readyrequest.
if (PackageData->GetState() != EPackageState::SaveActive)
{
PackageData->SendToState(EPackageState::Request, ESendFlags::QueueRemove, EStateChangeReason::Retraction);
RequestQueue.AddReadyRequest(PackageData);
}
bAssignedToLocal = true;
}
else
{
// Assigned to a remote worker. Ask the package's generation helper (its own, or else its parent's)
// whether the package should be stalled rather than demoted.
TRefCountPtr<FGenerationHelper> GenerationHelper = PackageData->GetGenerationHelper();
if (!GenerationHelper)
{
GenerationHelper = PackageData->GetParentGenerationHelper();
}
bool bShouldStall = false;
if (GenerationHelper)
{
bShouldStall = GenerationHelper->ShouldRetractionStallRatherThanDemote(*PackageData);
}
if (bShouldStall)
{
PackageData->Stall(EPackageState::SaveStalledAssignedToWorker, ESendFlags::QueueAddAndRemove);
// ShouldRetractionStallRatherThanDemote should not return true unless the Stall will succeed
check(PackageData->GetState() == EPackageState::SaveStalledAssignedToWorker);
UE_LOG(LogCook, Display, TEXT("Retracting generated package %s; it will remain in memory on the director's worker until the generator finishes saving."),
*WriteToString<256>(PackageData->GetPackageName()));
}
else
{
PackageData->SendToState(EPackageState::AssignedToWorker, ESendFlags::QueueAddAndRemove, EStateChangeReason::Retraction);
}
// Record the new owner; the previous assignment was cleared in the filtering loop above.
PackageData->SetWorkerAssignment(Assignment);
}
}
Director.DisplayRemainingPackages();
if (bAssignedToLocal)
{
// Clear the SoftGC diagnostic ExpectedNeverLoadPackages because we have new assigned packages
// that we didn't consider during SoftGC
Director.COTFS.PackageTracker->ClearExpectedNeverLoadPackages();
}
return ERetractionResult::Retracted;
}
/**
 * Choose which workers should receive packages that were retracted from FromWorker.
 *
 * Candidates are every remote worker in LocalRemoteWorkers other than FromWorker, plus the local worker when
 * local cooks are allowed and FromWorker is not the local worker. Candidates are ordered by their current
 * number of assignments, and the function considers splitting the packages amongst the 1 least-loaded,
 * 2 least-loaded, ... n least-loaded of them. It picks the group size whose estimated post-split maximum
 * (assignment count of the most loaded member of the group plus NumPackages / group size) is smallest;
 * the smallest group size wins ties.
 *
 * Example: splitting 500 over loads 0, 1000, 1000, 1000 gives everything to the first worker, whereas
 * splitting 500 over loads 0, 100, 1000, 1000 splits amongst the first two.
 *
 * @param NumPackages Number of packages that are to be redistributed.
 * @param FromWorker The worker the packages were retracted from; never included in the result.
 * @param LocalRemoteWorkers The remote workers to consider.
 * @return The chosen workers, least-loaded first, or an empty array if there are no candidates.
 */
TArray<FWorkerId> FCookDirector::FRetractionHandler::CalculateWorkersToSplitOver(int32 NumPackages,
	const FWorkerId& FromWorker, TConstArrayView<TRefCountPtr<FCookWorkerServer>> LocalRemoteWorkers)
{
	// Collect (worker, current assignment count) for every candidate.
	TArray<TPair<FWorkerId, int32>> CandidateLoads;
	if (Director.bAllowLocalCooks && FromWorker != FWorkerId::Local())
	{
		CandidateLoads.Emplace(FWorkerId::Local(), Director.COTFS.NumMultiprocessLocalWorkerAssignments());
	}
	for (const TRefCountPtr<FCookWorkerServer>& Server : LocalRemoteWorkers)
	{
		if (FromWorker == Server->GetWorkerId())
		{
			continue;
		}
		CandidateLoads.Emplace(Server->GetWorkerId(), Server->NumAssignments());
	}
	if (CandidateLoads.Num() == 0)
	{
		return TArray<FWorkerId>();
	}

	// Least-loaded candidates first.
	CandidateLoads.Sort([](const TPair<FWorkerId, int32>& A, const TPair<FWorkerId, int32>& B)
		{
			return A.Value < B.Value;
		});

	// Start with "give everything to the least-loaded candidate" and only switch to a larger group when it
	// yields a strictly lower post-split maximum.
	int32 BestGroupSize = 1;
	int32 BestPostSplitLoad = CandidateLoads[0].Value + NumPackages;
	for (int32 GroupSize = 2; GroupSize <= CandidateLoads.Num(); ++GroupSize)
	{
		const int32 HighestLoadInGroup = CandidateLoads[GroupSize - 1].Value;
		const int32 PostSplitLoad = HighestLoadInGroup + NumPackages / GroupSize;
		if (PostSplitLoad < BestPostSplitLoad)
		{
			BestGroupSize = GroupSize;
			BestPostSplitLoad = PostSplitLoad;
		}
	}
	check(BestGroupSize > 0);

	// Return the ids of the first BestGroupSize candidates.
	TArray<FWorkerId> Chosen;
	Chosen.Reserve(BestGroupSize);
	for (int32 Index = 0; Index < BestGroupSize; ++Index)
	{
		Chosen.Add(CandidateLoads[Index].Key);
	}
	return Chosen;
}
/** Serialize the message: writes RequestedCount under the field name "RequestedCount". */
void FRetractionRequestMessage::Write(FCbWriter& Writer) const
{
	Writer << "RequestedCount";
	Writer << RequestedCount;
}
/**
 * Deserialize the message from the given object.
 *
 * @param Object The object holding the "RequestedCount" field.
 * @return The result of loading the "RequestedCount" field into RequestedCount.
 */
bool FRetractionRequestMessage::TryRead(FCbObjectView Object)
{
	const bool bLoadedCount = LoadFromCompactBinary(Object["RequestedCount"], RequestedCount);
	return bLoadedCount;
}
/**
 * Guid identifying FRetractionRequestMessage.
 * NOTE(review): presumably this is how the receiver recognizes the message type, so it must not change
 * between director and worker builds - confirm against the message dispatch code.
 */
FGuid FRetractionRequestMessage::MessageType(TEXT("7109E168E8A8405BA65F9E1E82571D1A"));
/** Serialize the message: writes ReturnedPackages under the field name "ReturnedPackages". */
void FRetractionResultsMessage::Write(FCbWriter& Writer) const
{
	Writer << "ReturnedPackages";
	Writer << ReturnedPackages;
}
/**
 * Deserialize the message from the given object.
 *
 * @param Object The object holding the "ReturnedPackages" field.
 * @return The result of loading the "ReturnedPackages" field into ReturnedPackages.
 */
bool FRetractionResultsMessage::TryRead(FCbObjectView Object)
{
	const bool bLoadedPackages = LoadFromCompactBinary(Object["ReturnedPackages"], ReturnedPackages);
	return bLoadedPackages;
}
/**
 * Guid identifying FRetractionResultsMessage.
 * NOTE(review): presumably this is how the receiver recognizes the message type, so it must not change
 * between director and worker builds - confirm against the message dispatch code.
 */
FGuid FRetractionResultsMessage::MessageType(TEXT("CBFB840A4FB94903A757C490514A4B86"));
/** Serialize the message: writes Event as a uint8 under the field name "T". */
void FDirectorEventMessage::Write(FCbWriter& Writer) const
{
	// The enum is written as a single byte, so it must fit in one.
	static_assert(sizeof(EDirectorEvent) <= sizeof(uint8), "We are storing it in a uint8");
	Writer << "T";
	Writer << static_cast<uint8>(Event);
}
/**
 * Deserialize the message from the given object.
 *
 * Reads the "T" field as a uint8, using EDirectorEvent::Count as the default value. Event is only assigned
 * when the value read is below EDirectorEvent::Count.
 *
 * @param Object The object holding the "T" field.
 * @return true if a valid event value was read and stored in Event, false otherwise (Event is left untouched).
 */
bool FDirectorEventMessage::TryRead(FCbObjectView Object)
{
	const uint8 RawEvent = Object["T"].AsUInt8(static_cast<uint8>(EDirectorEvent::Count));
	if (RawEvent >= static_cast<uint8>(EDirectorEvent::Count))
	{
		// Out of range, or the default value was returned.
		return false;
	}
	Event = static_cast<EDirectorEvent>(RawEvent);
	return true;
}
/**
 * Guid identifying FDirectorEventMessage.
 * NOTE(review): presumably this is how the receiver recognizes the message type, so it must not change
 * between director and worker builds - confirm against the message dispatch code.
 */
FGuid FDirectorEventMessage::MessageType(TEXT("58A03A38FEE045B08331BB8457AEBE35"));
}