// Copyright Epic Games, Inc. All Rights Reserved. #include "CookDirector.h" #include "Async/Fundamental/Scheduler.h" #include "CompactBinaryTCP.h" #include "Cooker/CookGenerationHelper.h" #include "Cooker/CookPackageData.h" #include "Cooker/CookPlatformManager.h" #include "Cooker/CookWorkerServer.h" #include "CookOnTheSide/CookOnTheFlyServer.h" #include "CoreGlobals.h" #include "GenericPlatform/GenericPlatformOutputDevices.h" #include "LoadBalanceCookBurden.h" #include "HAL/PlatformMisc.h" #include "HAL/PlatformTime.h" #include "HAL/RunnableThread.h" #include "Math/NumericLimits.h" #include "Misc/CommandLine.h" #include "Misc/ConfigCacheIni.h" #include "Misc/PathViews.h" #include "PackageTracker.h" #include "Serialization/CompactBinary.h" #include "Serialization/CompactBinaryWriter.h" #include "ShaderCompiler.h" #include "Sockets.h" #include "SocketSubsystem.h" #include "String/ParseTokens.h" #include "UnrealEdMisc.h" extern CORE_API int32 GNumForegroundWorkers; // TaskGraph.cpp LLM_DEFINE_TAG(Cooker_MPCook); namespace UE::Cook { constexpr int32 RetractionMinimumNumAssignments = 100; /** Profile data for each CookWorker that needs to be collected on the Director. */ struct FCookWorkerProfileData { float IdleTimeSeconds = 0.f; bool bIsIdle = true; void UpdateIdle(bool bInIsIdle, float DeltaTime) { if (bInIsIdle) { if (bIsIdle) { IdleTimeSeconds += DeltaTime; } } bIsIdle = bInIsIdle; } }; /** * A class that has an instance active while we need to handle retraction of assigned results from a CookWorker. * Keeps track of the expected message coming back from the remote worker, prevents repeatedly sending messages, gives * a warning if the remote worker does not respond. */ class FCookDirector::FRetractionHandler { public: FRetractionHandler(FCookDirector& InDirector); /** Initialize to search idle and busy workers to send a RetractionRequestMessage. */ void Initialize(); /** Initialize to handle an unexpected RetractionResultsMessage. 
*/ void InitializeForResultsMessage(const FWorkerId& FromWorker); void TickFromSchedulerThread(bool bAllWorkersConnected, bool bAnyIdle, int32 BusiestNumAssignments); /** Hook called by the director when a retraction message comes in. */ void HandleRetractionMessage(FMPCollectorServerMessageContext& Context, bool bReadSuccessful, FRetractionResultsMessage&& Message); private: enum class ERetractionState : uint8 { Idle, WantToRetract, WaitingForResponse, Count, }; enum class ERetractionResult : uint8 { NoneAvailable, Retracted }; private: /** Try to select a worker for retraction */ ERetractionState TickWantToRetract(bool& bOutAnyIdle, int32& OutBusiestNumAssignments); /** Tick the asynchronous wait for the message to come in, and synchronously handle it when it does. */ ERetractionState TickWaitingForResponse(); /** * Pick workers to give the retracted packages to, and assign those packages to the worker * in the local and remote state. */ ERetractionResult ReassignPackages(const FWorkerId& WorkerId, TConstArrayView Packages); /** Pick workers to give the retracted packages to. 
*/ TArray CalculateWorkersToSplitOver(int32 NumPackages, const FWorkerId& FromWorker, TConstArrayView> LocalRemoteWorkers); void SetRetractionState(ERetractionState NewState, bool& bOutHadStateChange); bool IsAvailableForRetraction(const FWorkerId& WorkerId); private: FCookDirector& Director; FWorkerId ExpectedWorker; TMap> PackagesToRetract; TMap WorkersUnavailableForRetract; FWorkerId WorkerWithResults; double MessageSentTimeSeconds = 0.; double LastWarnTimeSeconds = 0.; ERetractionState RetractionState = ERetractionState::Idle; }; FCookDirector::FCookDirector(UCookOnTheFlyServer& InCOTFS, int32 CookProcessCount, bool bInCookProcessCountSetByCommandLine) : RunnableShunt(*this) , COTFS(InCOTFS) , bCookProcessCountSetByCommandLine(bInCookProcessCountSetByCommandLine) { check(CookProcessCount > 1); WorkersStalledStartTimeSeconds = MAX_flt; WorkersStalledWarnTimeSeconds = MAX_flt; ShutdownEvent->Reset(); LocalWorkerProfileData = MakeUnique(); RetractionHandler = MakeUnique(*this); bool bConfigValid; ParseConfig(CookProcessCount, bConfigValid); if (!bConfigValid) { UE_LOG(LogCook, Error, TEXT("CookDirector initialization failure: config settings are invalid for multiprocess. CookMultiprocess is disabled and the cooker is running as a single process.")); bMultiprocessAvailable = false; return; } ISocketSubsystem* SocketSubsystem = ISocketSubsystem::Get(); if (!SocketSubsystem) { UE_LOG(LogCook, Error, TEXT("CookDirector initialization failure: platform does not support network sockets. 
CookMultiprocess is disabled and the cooker is running as a single process.")); bMultiprocessAvailable = false; return; } bMultiprocessAvailable = true; Register(new FLogMessagesMessageHandler(*COTFS.LogHandler)); Register(new TMPCollectorServerMessageCallback([this] (FMPCollectorServerMessageContext& Context, bool bReadSuccessful, FRetractionResultsMessage&& Message) { // Called from inside CommunicationLock RetractionHandler->HandleRetractionMessage(Context, bReadSuccessful, MoveTemp(Message)); })); Register(new TMPCollectorServerMessageCallback([this] (FMPCollectorServerMessageContext& Context, bool bReadSuccessful, FHeartbeatMessage&& Message) { HandleHeartbeatMessage(Context, bReadSuccessful, MoveTemp(Message)); })); Register(new FAssetRegistryMPCollector(COTFS)); Register(new FPackageWriterMPCollector(COTFS)); LastTickTimeSeconds = FPlatformTime::Seconds(); #if ENABLE_COOK_STATS FCookStatsManager::CookStatsCallbacks.AddRaw(this, &FCookDirector::LogCookStats); #endif } bool FCookDirector::IsMultiprocessAvailable() const { return bMultiprocessAvailable; } void FCookDirector::ParseConfig(int32 CookProcessCount, bool& bOutValid) { bOutValid = true; const TCHAR* CommandLine = FCommandLine::Get(); FString Text; // CookWorkerCount RequestedCookWorkerCount = CookProcessCount - 1; check(RequestedCookWorkerCount > 0); // CookDirectorListenPort WorkerConnectPort = Sockets::COOKDIRECTOR_DEFAULT_REQUEST_CONNECTION_PORT; FParse::Value(CommandLine, TEXT("-CookDirectorListenPort="), WorkerConnectPort); // ShowCookWorker if (!FParse::Value(CommandLine, TEXT("-ShowCookWorker="), Text)) { if (FParse::Param(CommandLine, TEXT("ShowCookWorker"))) { Text = TEXT("SeparateWindows"); } } if (Text == TEXT("CombinedLogs")) { ShowWorkerOption = EShowWorker::CombinedLogs; } else if (Text == TEXT("SeparateLogs")) { ShowWorkerOption = EShowWorker::SeparateLogs; } else if (Text == TEXT("SeparateWindows")) { ShowWorkerOption = EShowWorker::SeparateWindows; } else { if (!Text.IsEmpty()) { 
UE_LOG(LogCook, Warning, TEXT("Invalid selection \"%s\" for -ShowCookWorker."), *Text); } ShowWorkerOption = EShowWorker::CombinedLogs; } // LoadBalanceAlgorithm LoadBalanceAlgorithm = ELoadBalanceAlgorithm::CookBurden; if (FParse::Value(CommandLine, TEXT("-CookLoadBalance="), Text)) { if (Text == TEXT("Striped")) { LoadBalanceAlgorithm = ELoadBalanceAlgorithm::Striped; } else if (Text == TEXT("CookBurden")) { LoadBalanceAlgorithm = ELoadBalanceAlgorithm::CookBurden; } else { UE_LOG(LogCook, Warning, TEXT("Invalid selection \"%s\" for -CookLoadBalance."), *Text); } } bAllowLocalCooks = !FParse::Param(CommandLine, TEXT("CookForceRemote")); int32 MultiprocessId = UE::GetMultiprocessId(); if (MultiprocessId != 0) { bOutValid = false; UE_LOG(LogCook, Error, TEXT("CookMultiprocess is incompatible with -MultiprocessId on the CookDirector's commandline. The CookDirector needs to be able to specify all MultiprocessIds.")); } } FCookDirector::~FCookDirector() { StopCommunicationThread(); #if ENABLE_COOK_STATS FCookStatsManager::CookStatsCallbacks.RemoveAll(this); #endif TSet AbortedAssignments; for (TPair>& Pair : RemoteWorkers) { Pair.Value->AbortWorker(AbortedAssignments, ECookDirectorThread::SchedulerThread, HeartbeatNumber); } for (FPackageData* PackageData : AbortedAssignments) { // Packages that were assigned to workers should be in an AssignedToWorker state and // therefore should be InProgress. check(PackageData->IsInProgress()); PackageData->SetWorkerAssignment(FWorkerId::Invalid(), ESendFlags::QueueNone); EPackageState NewState = PackageData->IsInStateProperty(EPackageStateProperty::Saving) ? 
EPackageState::SaveActive : EPackageState::Request; PackageData->SendToState(NewState, ESendFlags::QueueAddAndRemove, EStateChangeReason::CookerShutdown); } RemoteWorkers.Empty(); RemoteWorkerProfileDatas.Empty(); PendingConnections.Empty(); Sockets::CloseSocket(WorkerConnectSocket); } void FCookDirector::LaunchCommunicationThread() { if (!CommunicationThread && FPlatformProcess::SupportsMultithreading()) { CommunicationThread = FRunnableThread::Create(&RunnableShunt, TEXT("FCookDirector"), 0, TPri_Normal); } } void FCookDirector::StopCommunicationThread() { ShutdownEvent->Trigger(); if (CommunicationThread) { CommunicationThread->WaitForCompletion(); delete CommunicationThread; CommunicationThread = nullptr; } ShutdownEvent->Reset(); } uint32 FCookDirector::RunCommunicationThread() { constexpr float TickPeriod = 1.f; constexpr float MinSleepTime = 0.001f; for (;;) { double StartTime = FPlatformTime::Seconds(); TickCommunication(ECookDirectorThread::CommunicateThread); double CurrentTime = FPlatformTime::Seconds(); float RemainingDuration = StartTime + TickPeriod - CurrentTime; if (RemainingDuration > .001f) { uint32 WaitTimeMilliseconds = static_cast(RemainingDuration * 1000); if (ShutdownEvent->Wait(WaitTimeMilliseconds)) { break; } } } return 0; } uint32 FCookDirector::FRunnableShunt::Run() { return Director.RunCommunicationThread(); } void FCookDirector::FRunnableShunt::Stop() { Director.ShutdownEvent->Trigger(); } void FCookDirector::StartCook(const FBeginCookContext& InBeginContext) { BeginCookContext.Set(InBeginContext); // We launch the CookWorkers during StartCook, so that their startup time overlaps with // work the Director has to do during the first tick, for PumpRequests and AssignRequests. 
FScopeLock CommunicationScopeLock(&CommunicationLock); InitializeWorkers(); } void FCookDirector::AssignRequests(TArrayView Requests, TArray& OutAssignments, TMap>&& RequestGraph) { TArray WorkerIds; TArray> LocalRemoteWorkers; { FScopeLock CommunicationScopeLock(&CommunicationLock); InitializeWorkers(); } LocalRemoteWorkers = CopyRemoteWorkers();; WorkerIds.Reserve(LocalRemoteWorkers.Num() + 1); if (bAllowLocalCooks) { WorkerIds.Add(FWorkerId::Local()); } for (const TRefCountPtr& RemoteWorker : LocalRemoteWorkers) { WorkerIds.Add(RemoteWorker->GetWorkerId()); } AssignRequests(MoveTemp(WorkerIds), LocalRemoteWorkers, Requests, OutAssignments, MoveTemp(RequestGraph), true /* bInitialAssignment */); // Check for a race condition with the communication thread; if a Server was aborted after we CopyRemoteWorkers // above but before we assigned packages to it, we need to abort those assigments now. TSet PackagesToReassignSet; for (TRefCountPtr& RemoteWorker : LocalRemoteWorkers) { if (RemoteWorker->IsShuttingDown()) { RemoteWorker->AbortAllAssignments(PackagesToReassignSet, ECookDirectorThread::SchedulerThread, HeartbeatNumber); } } if (!PackagesToReassignSet.IsEmpty()) { // Defer the reassign because our caller has not yet put the packages into the assigned state. 
FScopeLock CommunicationScopeLock(&CommunicationLock); DeferredPackagesToReassign.Append(PackagesToReassignSet.Array()); } } TMap FCookDirector::GetAssignPackageExtraDatas( TConstArrayView Requests) const { TConstArrayView SessionPlatforms = COTFS.PlatformManager->GetSessionPlatforms(); FMPCollectorServerTickPackageContext Context; Context.Platforms = SessionPlatforms; TMap Results; for (FPackageData* Request : Requests) { FAssignPackageExtraData* ExtraData = nullptr; auto GetOrAllocateExtraData = [&Results, &ExtraData](FPackageData* InRequest) { if (!ExtraData) { ExtraData = &Results.FindOrAdd(InRequest); } return ExtraData; }; TRefCountPtr GenerationHelper = Request->GetGenerationHelper(); if (GenerationHelper) { ExtraData = GetOrAllocateExtraData(Request); ExtraData->GeneratorPerPlatformPreviousGeneratedPackages.Reserve(SessionPlatforms.Num()); for (const ITargetPlatform* TargetPlatform : SessionPlatforms) { TMap& PreviousGeneratedPackages = ExtraData->GeneratorPerPlatformPreviousGeneratedPackages.FindOrAdd(TargetPlatform); PreviousGeneratedPackages = GenerationHelper->GetPreviousGeneratedPackages(TargetPlatform); } } Context.PackageName = Request->GetPackageName(); for (const TPair>& CollectorPair : Collectors) { IMPCollector* Collector = CollectorPair.Value.GetReference(); Collector->ServerTickPackage(Context); if (!Context.Messages.IsEmpty()) { ExtraData = GetOrAllocateExtraData(Request); FGuid MessageType = Collector->GetMessageType(); for (FCbObject& Object : Context.Messages) { ExtraData->PerPackageCollectorMessages.Add({ MessageType, MoveTemp(Object) }); } Context.Messages.Reset(); } } } return Results; } TArray FCookDirector::GetInfoPackagesForRequests(TConstArrayView Requests) const { TSet InfoPackages; for (FPackageData* Request : Requests) { if (!Request->GetParentGenerator().IsNone()) { FPackageData* Generator = COTFS.PackageDatas->FindPackageDataByPackageName(Request->GetParentGenerator()); if (Generator) { InfoPackages.Add(Generator); } } } return 
InfoPackages.Array(); } void FCookDirector::AssignRequests(TArray&& InWorkers, TArray>& InRemoteWorkers, TArrayView Requests, TArray& OutAssignments, TMap>&& RequestGraph, bool bInitialAssignment) { check(InWorkers.Num() > 0); if (InWorkers.Num() <= 1) { FWorkerId WorkerId = InWorkers[0]; OutAssignments.SetNum(Requests.Num()); TArray RemovedRequests; for (int32 RequestIndex = 0; RequestIndex < Requests.Num(); ++RequestIndex) { FPackageData* Request = Requests[RequestIndex]; FWorkerId& Assignment = OutAssignments[RequestIndex]; FWorkerId WorkerIdConstraint = Request->GetWorkerAssignmentConstraint(); if (WorkerIdConstraint.IsValid() && WorkerIdConstraint != WorkerId) { UE_LOG(LogCook, Warning, TEXT("Package %s can only be cooked by a now-disconnected CookWorker. The package can not be cooked."), *Request->GetPackageName().ToString()); Assignment = FWorkerId::Invalid(); RemovedRequests.Add(Request); } else { Assignment = WorkerId; } } if (!WorkerId.IsLocal()) { TRefCountPtr* RemoteWorker = InRemoteWorkers.FindByPredicate( [&WorkerId](const TRefCountPtr& X) { return X->GetWorkerId() == WorkerId; }); check(RemoteWorker); TArrayView RequestsToSend = Requests; TArray RequestBuffer; if (!RemovedRequests.IsEmpty()) { TSet RequestSet; for (FPackageData* Request : Requests) { RequestSet.Add(Request); } for (FPackageData* Remove : RemovedRequests) { RequestSet.Remove(Remove); } RequestBuffer = RequestSet.Array(); RequestsToSend = RequestBuffer; } (*RemoteWorker)->AppendAssignments(RequestsToSend, GetAssignPackageExtraDatas(RequestsToSend), GetInfoPackagesForRequests(RequestsToSend), ECookDirectorThread::SchedulerThread); } return; } InWorkers.Sort(); // Call the LoadBalancing algorithm to split the requests among the LocalWorker and RemoteWorkers LoadBalance(InWorkers, Requests, MoveTemp(RequestGraph), OutAssignments); int32 MaxRemoteIndex = InWorkers.Last().IsLocal() ? 
-1 : InWorkers.Last().GetRemoteIndex(); // Split the output array of WorkerId assignments into a batch for each of the RemoteWorkers TArray> RemoteBatches; // Indexed by WorkerId.GetRemoteIndex() TArray RemoteIndexIsValid; // Indexed by WorkerId.GetRemoteIndex() RemoteBatches.SetNum(MaxRemoteIndex+1); RemoteIndexIsValid.Init(false, MaxRemoteIndex+1); for (FWorkerId WorkerId : InWorkers) { if (!WorkerId.IsLocal()) { RemoteIndexIsValid[WorkerId.GetRemoteIndex()] = true; } } // Debug function for generated packages. We want to be able to force their assignment to all,some,none // on the same worker that saved their generator. int32 MPCookGeneratorNextWorker = -1; auto GetNextWorkerForGeneratedPackage = [&MPCookGeneratorNextWorker, &InWorkers](FWorkerId WorkerNotToUse) { MPCookGeneratorNextWorker = (MPCookGeneratorNextWorker + 1) % InWorkers.Num(); if (InWorkers[MPCookGeneratorNextWorker] == WorkerNotToUse) { MPCookGeneratorNextWorker = (MPCookGeneratorNextWorker + 1) % InWorkers.Num(); } return InWorkers[MPCookGeneratorNextWorker]; }; for (int32 RequestIndex = 0; RequestIndex < Requests.Num(); ++RequestIndex) { FWorkerId& WorkerId = OutAssignments[RequestIndex]; // Override the loadbalancer's assignment if the Package has a WorkerAssignmentConstraint // This allows us to guarantee that generated packages will be cooked on the worker that cooked // their generator package FPackageData* RequestPackage = Requests[RequestIndex]; FWorkerId WorkerIdConstraint = RequestPackage->GetWorkerAssignmentConstraint(); if (WorkerIdConstraint.IsValid()) { WorkerId = WorkerIdConstraint; } // Override the loadbalancer's assignment to force it local if the Package is blocking urgency else if (RequestPackage->GetUrgency() == EUrgency::Blocking) { WorkerId = FWorkerId::Local(); } else if (RequestPackage->IsGenerated() && bInitialAssignment) { TRefCountPtr GenerationHelper = RequestPackage->GetParentGenerationHelper(); if (GenerationHelper) { FWorkerId GeneratorWorker = 
GenerationHelper->GetWorkerIdThatSavedGenerator(); if (GeneratorWorker.IsValid()) { switch (COTFS.MPCookGeneratorSplit) { default: // fallthrough case EMPCookGeneratorSplit::AnyWorker: // If we're allowed to put them on any worker, put them on the GeneratorWorker anyway // for the initial assignment since it already has the GenerationHelper created WorkerId = GeneratorWorker; break; case EMPCookGeneratorSplit::AllOnSameWorker: WorkerId = GeneratorWorker; break; case EMPCookGeneratorSplit::SomeOnSameWorker: if ((GenerationHelper->GetMPCookNextAssignmentIndex()++) % 2 == 0) { WorkerId = GeneratorWorker; } else { WorkerId = GetNextWorkerForGeneratedPackage(GeneratorWorker); } break; case EMPCookGeneratorSplit::NoneOnSameWorker: WorkerId = GetNextWorkerForGeneratedPackage(GeneratorWorker); break; } } } } if (!WorkerId.IsLocal()) { uint8 RemoteIndex = WorkerId.GetRemoteIndex(); if (RemoteIndex >= RemoteBatches.Num() || !RemoteIndexIsValid[RemoteIndex]) { UE_LOG(LogCook, Error, TEXT("Package %s can only be cooked by a now-disconnected CookWorker. 
The package can not be cooked."), *Requests[RequestIndex]->GetPackageName().ToString()); WorkerId = FWorkerId::Invalid(); continue; } TArray& RemoteBatch = RemoteBatches[RemoteIndex]; if (RemoteBatch.Num() == 0) { RemoteBatch.Reserve(2 * Requests.Num() / (InWorkers.Num())); } RemoteBatch.Add(Requests[RequestIndex]); } } // Assign each batch to the FCookWorkerServer in RemoteWorkers; // the CookWorkerServer's tick will handle sending the message to the remote process for (FWorkerId WorkerId : InWorkers) { if (!WorkerId.IsLocal()) { TRefCountPtr* RemoteWorker = InRemoteWorkers.FindByPredicate( [&WorkerId](const TRefCountPtr& X) { return X->GetWorkerId() == WorkerId; }); check(RemoteWorker); TArray& RemoteBatch = RemoteBatches[WorkerId.GetRemoteIndex()]; (*RemoteWorker)->AppendAssignments(RemoteBatch, GetAssignPackageExtraDatas(RemoteBatch), GetInfoPackagesForRequests(RemoteBatch), ECookDirectorThread::SchedulerThread); } } bIsFirstAssignment = false; } TArray> FCookDirector::CopyRemoteWorkers() const { TArray> LocalRemoteWorkers; FScopeLock CommunicationScopeLock(&CommunicationLock); LocalRemoteWorkers.Reset(RemoteWorkers.Num()); for (const TPair>& Pair : RemoteWorkers) { LocalRemoteWorkers.Add(Pair.Value); } return LocalRemoteWorkers; } void FCookDirector::RemoveFromWorker(FPackageData& PackageData) { FWorkerId WorkerId = PackageData.GetWorkerAssignment(); if (!WorkerId.IsRemote()) { return; } TRefCountPtr OwningWorker; { FScopeLock CommunicationScopeLock(&CommunicationLock); const TRefCountPtr* RemoteWorkerPtr = FindRemoteWorkerInLock(WorkerId); if (!RemoteWorkerPtr) { return; } OwningWorker = *RemoteWorkerPtr; } OwningWorker->AbortAssignment(PackageData, ECookDirectorThread::SchedulerThread, HeartbeatNumber); bForceNextHeartbeat = true; } void FCookDirector::BroadcastMessage(const IMPCollectorMessage& Message, ECookBroadcastTiming Timing) { BroadcastMessage(MarshalToCompactBinaryTCP(Message), Timing); } void 
FCookDirector::BroadcastMessage(UE::CompactBinaryTCP::FMarshalledMessage&& Message, ECookBroadcastTiming Timing)
{
	// Send (or queue for sending) a copy of Message to every current remote worker.
	//
	// While TickFromSchedulerThread is dispatching received worker messages (bReceivingMessages), the broadcast
	// is stored in QueuedBroadcasts instead; TickFromSchedulerThread replays that queue after dispatching.
	if (bReceivingMessages)
	{
		QueuedBroadcasts.Emplace(MoveTemp(Message), Timing);
		return;
	}

	{
		FScopeLock CommunicationScopeLock(&CommunicationLock);
		InitializeWorkers();
	}

	// Iterate over a snapshot of the workers rather than RemoteWorkers itself; CommunicationLock is not held
	// during the sends below.
	TArray<TRefCountPtr<FCookWorkerServer>> LocalRemoteWorkers = CopyRemoteWorkers();
	for (TRefCountPtr<FCookWorkerServer>& RemoteWorker : LocalRemoteWorkers)
	{
		// Each worker consumes its own copy of the marshalled message.
		UE::CompactBinaryTCP::FMarshalledMessage CopyMessage(Message);
		switch (Timing)
		{
		case ECookBroadcastTiming::Immediate:
		{
			// Send right away when the worker is connected, otherwise append the message to the worker.
			if (RemoteWorker->IsConnected())
			{
				RemoteWorker->SendMessage(MoveTemp(CopyMessage), ECookDirectorThread::SchedulerThread);
			}
			else
			{
				RemoteWorker->AppendMessage(MoveTemp(CopyMessage), ECookDirectorThread::SchedulerThread);
			}
			break;
		}
		case ECookBroadcastTiming::AfterAssignPackages:
		{
			RemoteWorker->AppendMessage(MoveTemp(CopyMessage), ECookDirectorThread::SchedulerThread);
			break;
		}
		default:
			checkNoEntry();
			break;
		}
	}
}

int32 FCookDirector::InsertBroadcastFence()
{
	int32 LocalHeartbeatNumber = HeartbeatNumber;

	// We need to Tick, and have the Tick function send and update the heartbeat number, so that any preceding
	// messages get sent before the heartbeat.
bForceNextHeartbeat = true; TickFromSchedulerThread(); return LocalHeartbeatNumber; } bool FCookDirector::IsBroadcastFencePassed(int32 Fence, TArray* OutPendingWorkers) { bool bHasPendingWorker = false; FScopeLock CommunicationScopeLock(&CommunicationLock); for (TPair>& Pair : RemoteWorkers) { FCookWorkerServer* RemoteWorker = Pair.Value.GetReference(); if (RemoteWorker->GetLastReceivedHeartbeatNumber() < Fence) { bHasPendingWorker = true; if (OutPendingWorkers) { OutPendingWorkers->Add(RemoteWorker->GetWorkerId()); } } } return !bHasPendingWorker; } void FCookDirector::TickFromSchedulerThread() { double CurrentTime = FPlatformTime::Seconds(); if (!CommunicationThread) { TickCommunication(ECookDirectorThread::SchedulerThread); } int32 BusiestNumAssignments = 0; bool bLocalWorkerIdle = true; bool bAnyIdle = false; float DeltaTime = static_cast(CurrentTime - LastTickTimeSeconds); LastTickTimeSeconds = CurrentTime; if (bAllowLocalCooks) { BusiestNumAssignments = COTFS.NumMultiprocessLocalWorkerAssignments(); bLocalWorkerIdle = BusiestNumAssignments == 0; bAnyIdle = bLocalWorkerIdle; LocalWorkerProfileData->UpdateIdle(bLocalWorkerIdle, DeltaTime); } bool bSendHeartbeat; int32 LocalHeartbeatNumber; TickHeartbeat(bForceNextHeartbeat, CurrentTime, bSendHeartbeat, LocalHeartbeatNumber); bForceNextHeartbeat = false; TArray, TInlineAllocator<16>> WorkersWithMessage; bool bAllWorkersConnected = false; if (bWorkersInitialized) { FScopeLock CommunicationScopeLock(&CommunicationLock); bAllWorkersConnected = true; for (TPair>& Pair : RemoteWorkers) { FCookWorkerServer* RemoteWorker = Pair.Value.GetReference(); FCookWorkerProfileData& ProfileData = RemoteWorkerProfileDatas[RemoteWorker->GetProfileId()]; bAllWorkersConnected &= RemoteWorker->IsConnected(); int32 NumAssignments = RemoteWorker->NumAssignments(); BusiestNumAssignments = FMath::Max(NumAssignments, BusiestNumAssignments); bool bWorkerIdle = NumAssignments == 0; bAnyIdle |= bWorkerIdle; 
ProfileData.UpdateIdle(bWorkerIdle, DeltaTime); if (RemoteWorker->HasMessages()) { WorkersWithMessage.Add(RemoteWorker); } if (bSendHeartbeat) { RemoteWorker->SignalHeartbeat(ECookDirectorThread::SchedulerThread, LocalHeartbeatNumber); } } for (TPair>& Pair : ShuttingDownWorkers) { if (Pair.Value && Pair.Value->HasMessages()) { WorkersWithMessage.Add(Pair.Value); } } ReassignAbortedPackages(DeferredPackagesToReassign); RetractionHandler->TickFromSchedulerThread(bAllWorkersConnected, bAnyIdle, BusiestNumAssignments); } bool bIsStalled = bLocalWorkerIdle && WorkersWithMessage.IsEmpty(); if (bIsStalled) { // We are only stalled if we have no local work to do and there is a remote worker with assigned work. If no // worker has assigned work then we are done with the cook rather than stalled. bool bRemoteWorkerHasWork = !COTFS.PackageDatas->GetAssignedToWorkerSet().IsEmpty(); if (!bRemoteWorkerHasWork) { // Do the slow check to see whether we have a package in the SaveStalledAssignedToWorker state only if // we have to for (FPackageData* PackageData : COTFS.PackageDatas->GetSaveStalledSet()) { if (PackageData->IsInStateProperty(EPackageStateProperty::AssignedToWorkerProperty)) { bRemoteWorkerHasWork = true; break; } } } bIsStalled = bIsStalled && bRemoteWorkerHasWork; } { bReceivingMessages = true; for (TRefCountPtr& Worker : WorkersWithMessage) { Worker->HandleReceiveMessages(ECookDirectorThread::SchedulerThread); } bReceivingMessages = false; } for (TPair& Pair : QueuedBroadcasts) { BroadcastMessage(MoveTemp(Pair.Key), Pair.Value); } QueuedBroadcasts.Empty(); // Process any queued messages WorkersWithMessage.Empty(); SetWorkersStalled(bIsStalled); LastTickTimeSeconds = CurrentTime; } void FCookDirector::UpdateDisplayDiagnostics() const { DisplayRemainingPackages(); } void FCookDirector::DisplayRemainingPackages() const { constexpr int32 DisplayWidth = 16; UE_LOG(LogCook, Display, TEXT("\t%s: %d packages remain."), *GetDisplayName(FWorkerId::Local(), DisplayWidth), 
COTFS.NumMultiprocessLocalWorkerAssignments()); FScopeLock CommunicationScopeLock(&CommunicationLock); for (const TPair>& Pair : RemoteWorkers) { FCookWorkerServer* RemoteWorker = Pair.Value; UE_LOG(LogCook, Display, TEXT("\t%s: %d packages remain."), *GetDisplayName(*RemoteWorker, DisplayWidth), RemoteWorker->NumAssignments()); } } FString FCookDirector::GetDisplayName(const FWorkerId& WorkerId, int32 PreferredWidth) const { FString Result; if (WorkerId.IsInvalid()) { Result = TEXTVIEW("None"); } else if (WorkerId.IsLocal()) { Result = TEXTVIEW("Local"); } else { const TRefCountPtr* RemoteWorker = nullptr; { FScopeLock CommunicationScopeLock(&CommunicationLock); RemoteWorker = FindRemoteWorkerInLock(WorkerId); if (!RemoteWorker) { for (const TPair>& Pair : ShuttingDownWorkers) { if (Pair.Value && Pair.Value->GetWorkerId() == WorkerId) { RemoteWorker = &Pair.Value; break; } } } } if (RemoteWorker) { Result = FString::Printf(TEXT("%d"), (*RemoteWorker)->GetProfileId()); } else { Result = FString::Printf(TEXT("Unknown (WorkerId %d)"), WorkerId.GetRemoteIndex()); } } constexpr FStringView Prefix(TEXTVIEW("CookWorker ")); Result = FString(Prefix) + Result.LeftPad(PreferredWidth-Prefix.Len()); return Result; } FString FCookDirector::GetDisplayName(const FCookWorkerServer& RemoteWorker, int32 PreferredWidth) const { FString Result = FString::Printf(TEXT("%d"), RemoteWorker.GetProfileId()); constexpr FStringView Prefix(TEXTVIEW("CookWorker ")); Result = FString(Prefix) + Result.LeftPad(PreferredWidth - Prefix.Len()); return Result; } const TRefCountPtr* FCookDirector::FindRemoteWorkerInLock(const FWorkerId& WorkerId) const { if (!WorkerId.IsRemote()) { return nullptr; } return RemoteWorkers.Find(WorkerId.GetRemoteIndex()); } void FCookDirector::TickCommunication(ECookDirectorThread TickThread) { bool bHasShutdownWorkers = false; TickWorkerConnects(TickThread); TArray, TInlineAllocator<16>> LocalRemoteWorkers; { FScopeLock CommunicationScopeLock(&CommunicationLock); for 
(const TPair>& Pair: RemoteWorkers) { LocalRemoteWorkers.Add(Pair.Value); } if (!RemoteWorkers.IsEmpty()) { bWorkersActive = true; } else { bWorkersActive = false; for (const TPair >& Pair : ShuttingDownWorkers) { FCookWorkerServer* RemoteWorker = Pair.Key; check(RemoteWorker->IsShuttingDown()); bWorkersActive = bWorkersActive || RemoteWorker->IsFlushingBeforeShutdown(); } } bHasShutdownWorkers = !ShuttingDownWorkers.IsEmpty(); } for (TRefCountPtr& RemoteWorker: LocalRemoteWorkers) { RemoteWorker->TickCommunication(TickThread); if (RemoteWorker->IsShuttingDown()) { FScopeLock CommunicationScopeLock(&CommunicationLock); TRefCountPtr& Existing = ShuttingDownWorkers.FindOrAdd(RemoteWorker.GetReference()); check(!Existing); // We should not be able to send the same pointer into ShuttingDown twice bHasShutdownWorkers = true; } } if (bHasShutdownWorkers) { TickWorkerShutdowns(TickThread); } } void FCookDirector::PumpCookComplete(bool& bCompleted) { { FScopeLock CommunicationScopeLock(&CommunicationLock); if (!bCookCompleteSent) { bool bAllIdle = true; for (TPair>& Pair : RemoteWorkers) { FCookWorkerServer& RemoteWorker = *Pair.Value; if (RemoteWorker.NumAssignments() > 0) { bAllIdle = false; break; } } if (bAllIdle && FinalIdleHeartbeatFence == -1) { bool bSendHeartbeat; double CurrentTime = FPlatformTime::Seconds(); TickHeartbeat(true /* bForceHeartbeat */, CurrentTime, bSendHeartbeat, FinalIdleHeartbeatFence); for (TPair>& Pair : RemoteWorkers) { FCookWorkerServer& RemoteWorker = *Pair.Value; RemoteWorker.SignalHeartbeat(ECookDirectorThread::SchedulerThread, FinalIdleHeartbeatFence); } bAllIdle = false; } else { for (TPair>& Pair : RemoteWorkers) { FCookWorkerServer& RemoteWorker = *Pair.Value; if (RemoteWorker.GetLastReceivedHeartbeatNumber() < FinalIdleHeartbeatFence) { bAllIdle = false; SetWorkersStalled(true); break; } } } if (bAllIdle) { for (TPair>& Pair : RemoteWorkers) { FCookWorkerServer& RemoteWorker = *Pair.Value; 
RemoteWorker.SignalCookComplete(ECookDirectorThread::SchedulerThread); check(RemoteWorker.IsShuttingDown()); } bCookCompleteSent = true; } } bCompleted = !bWorkersActive; } TickFromSchedulerThread(); } void FCookDirector::ShutdownCookSession() { StopCommunicationThread(); // Cancel any inprogress workers and move them to the Shutdown list for (;;) { TRefCountPtr RemoteWorker; { FScopeLock CommunicationScopeLock(&CommunicationLock); if (RemoteWorkers.IsEmpty()) { break; } RemoteWorker = TMap>::TIterator(RemoteWorkers).Value(); } AbortWorker(RemoteWorker->GetWorkerId(), ECookDirectorThread::SchedulerThread); } // Immediately shutdown any gracefully shutting down workers TArray, TInlineAllocator<16>> WorkersNeedingAbort; { FScopeLock CommunicationScopeLock(&CommunicationLock); for (TPair>& Pair : ShuttingDownWorkers) { // The Value was set by AbortWorker for any new entries, and all old entries guarantee the value is set check(Pair.Value); if (Pair.Key->IsFlushingBeforeShutdown()) { WorkersNeedingAbort.Add(Pair.Value); } } } for (TRefCountPtr& RemoteWorker : WorkersNeedingAbort) { TSet UnusedPendingPackages; RemoteWorker->AbortWorker(UnusedPendingPackages, ECookDirectorThread::SchedulerThread, HeartbeatNumber); } // Wait for all the shutdowns to complete for (;;) { TickWorkerShutdowns(ECookDirectorThread::SchedulerThread); { FScopeLock CommunicationScopeLock(&CommunicationLock); if (ShuttingDownWorkers.IsEmpty()) { break; } } constexpr float SleepSeconds = 0.010f; FPlatformProcess::Sleep(SleepSeconds); } // Kill any connections that had just been made and not yet assigned to a Server PendingConnections.Reset(); // Restore the FCookDirector to its original state so that it is ready for a new session bWorkersInitialized = false; bIsFirstAssignment = true; bCookCompleteSent = false; bWorkersActive = false; HeartbeatNumber = 0; NextHeartbeatTimeSeconds = 0.; FinalIdleHeartbeatFence = -1; } void FCookDirector::Register(IMPCollector* Collector) { TRefCountPtr& Existing = 
Collectors.FindOrAdd(Collector->GetMessageType()); if (Existing) { UE_LOG(LogCook, Error, TEXT("Duplicate IMPCollectors registered. Guid: %s, Existing: %s, Registering: %s. Keeping the Existing."), *Collector->GetMessageType().ToString(), Existing->GetDebugName(), Collector->GetDebugName()); return; } Existing = Collector; } void FCookDirector::Unregister(IMPCollector* Collector) { TRefCountPtr Existing; Collectors.RemoveAndCopyValue(Collector->GetMessageType(), Existing); if (Existing && Existing.GetReference() != Collector) { UE_LOG(LogCook, Error, TEXT("Duplicate IMPCollector during Unregister. Guid: %s, Existing: %s, Unregistering: %s. Ignoring the Unregister."), *Collector->GetMessageType().ToString(), Existing->GetDebugName(), Collector->GetDebugName()); Collectors.Add(Collector->GetMessageType(), MoveTemp(Existing)); } } void FCookDirector::SetWorkersStalled(bool bInWorkersStalled) { if (bInWorkersStalled != bWorkersStalled) { bWorkersStalled = bInWorkersStalled; if (bWorkersStalled) { const double CurrentTime = FPlatformTime::Seconds(); WorkersStalledStartTimeSeconds = CurrentTime; WorkersStalledWarnTimeSeconds = CurrentTime + GCookProgressWarnBusyTime; } else { WorkersStalledStartTimeSeconds = MAX_flt; WorkersStalledWarnTimeSeconds = MAX_flt; } } else if (bWorkersStalled) { const double CurrentTime = FPlatformTime::Seconds(); if (CurrentTime >= WorkersStalledWarnTimeSeconds) { UE_LOG(LogCook, Display, TEXT("Cooker has been blocked with no results from remote CookWorkers for %.0f seconds."), (float)(CurrentTime - WorkersStalledStartTimeSeconds)); WorkersStalledWarnTimeSeconds = CurrentTime + GCookProgressWarnBusyTime; } } } void FCookDirector::TickHeartbeat(bool bForceHeartbeat, double CurrentTimeSeconds, bool& bOutSendHeartbeat, int32& OutHeartbeatNumber) { constexpr float HeartbeatPeriodSeconds = 30.f; bOutSendHeartbeat = false; OutHeartbeatNumber = HeartbeatNumber; if (bForceHeartbeat) { bOutSendHeartbeat = true; } else if (NextHeartbeatTimeSeconds == 
0.) { NextHeartbeatTimeSeconds = CurrentTimeSeconds + HeartbeatPeriodSeconds; } else if (CurrentTimeSeconds >= NextHeartbeatTimeSeconds) { bOutSendHeartbeat = true; } if (bOutSendHeartbeat) { checkf(HeartbeatNumber < MAX_int32, TEXT("Overflow")); HeartbeatNumber++; NextHeartbeatTimeSeconds = CurrentTimeSeconds + HeartbeatPeriodSeconds; } } void FCookDirector::ResetFinalIdleHeartbeatFence() { FinalIdleHeartbeatFence = -1; } void FCookDirector::HandleHeartbeatMessage(FMPCollectorServerMessageContext& Context, bool bReadSuccessful, FHeartbeatMessage&& Message) { if (!bReadSuccessful) { UE_LOG(LogCook, Error, TEXT("Corrupt HeartbeatMessage received from CookWorker %d. It will be ignored."), Context.GetProfileId()); return; } Context.GetCookWorkerServer()->SetLastReceivedHeartbeatNumberInLock(Message.HeartbeatNumber); } FCookDirector::FPendingConnection::FPendingConnection(FPendingConnection&& Other) { Swap(Socket, Other.Socket); Buffer = MoveTemp(Other.Buffer); } FCookDirector::FPendingConnection::~FPendingConnection() { Sockets::CloseSocket(Socket); } FSocket* FCookDirector::FPendingConnection::DetachSocket() { FSocket* Result = Socket; Socket = nullptr; return Result; } void FWorkerConnectMessage::Write(FCbWriter& Writer) const { Writer << "RemoteIndex" << RemoteIndex; } bool FWorkerConnectMessage::TryRead(FCbObjectView Object) { RemoteIndex = Object["RemoteIndex"].AsInt32(-1); return RemoteIndex >= 0; } FGuid FWorkerConnectMessage::MessageType(TEXT("302096E887DA48F7B079FAFAD0EE5695")); bool FCookDirector::TryCreateWorkerConnectSocket() { if (WorkerConnectSocket) { return true; } ISocketSubsystem* SocketSubsystem = ISocketSubsystem::Get(); if (!SocketSubsystem) { // Error was already logged in the constructor return false; } FString ErrorReason; TSharedPtr ListenAddr; WorkerConnectSocket = Sockets::CreateListenSocket(WorkerConnectPort, ListenAddr, WorkerConnectAuthority, TEXT("FCookDirector-WorkerConnect"), ErrorReason); if (!WorkerConnectSocket) { UE_LOG(LogCook, 
Error, TEXT("CookDirector could not create listen socket, CookWorkers will be disabled. Reason: %s."), *ErrorReason); return false; } return true; } void FCookDirector::InitializeWorkers() { if (bWorkersInitialized) { return; } int32 InitialRequestCount = COTFS.InitialRequestCount; bWorkersInitialized = true; check(!CommunicationThread); check(RemoteWorkers.IsEmpty()); bool bSucceeded = false; ON_SCOPE_EXIT { if (!bSucceeded) { bWorkersActive = false; } }; constexpr int32 MinRequestsForMPCookWithCookSinglePackage = 100; UE::Cook::FCookByTheBookOptions& Options = *COTFS.CookByTheBookOptions; if (!bCookProcessCountSetByCommandLine && (Options.bSkipHardReferences || (Options.bSkipSoftReferences && InitialRequestCount < MinRequestsForMPCookWithCookSinglePackage))) { UE_LOG(LogCook, Display, TEXT("CookDirector initialization skipped: -CookSinglePackage was requested. CookMultiprocess is disabled and the cooker is running as a single process.")); return; } if (Options.bCookList) { UE_LOG(LogCook, Display, TEXT("CookDirector initialization skipped: -CookList was requested. CookMultiprocess is disabled and the cooker is running as a single process.")); return; } UE_LOG(LogCook, Display, TEXT("CookProcessCount=%d. CookMultiprocess is enabled with 1 CookDirector and %d %s."), RequestedCookWorkerCount + 1, RequestedCookWorkerCount, RequestedCookWorkerCount > 1 ? 
TEXT("CookWorkers") : TEXT("CookWorker")); ActivateMachineResourceReduction(); if (!TryCreateWorkerConnectSocket()) { return; } RemoteWorkers.Reserve(RequestedCookWorkerCount); for (int32 RemoteIndex = 0; RemoteIndex < RequestedCookWorkerCount; ++RemoteIndex) { int32 ProfileId = RemoteWorkerProfileDatas.Num(); RemoteWorkerProfileDatas.Emplace(); RemoteWorkers.Add(RemoteIndex, new FCookWorkerServer(*this, ProfileId, FWorkerId::FromRemoteIndex(RemoteIndex))); } bWorkersActive = true; ConstructReadonlyThreadVariables(); ShutdownEvent->Reset(); LaunchCommunicationThread(); bSucceeded = true; } void FCookDirector::RecreateWorkers() { // TODO: Finish implementing the recreation of workers that have crashed // Find any unused RemoteIndex less than the maximum used RemoteIndex FScopeLock CommunicationScopeLock(&CommunicationLock); if (RemoteWorkers.Num() >= RequestedCookWorkerCount || !WorkerConnectSocket) { return; } TArray UnusedRemoteIndexes; RemoteWorkers.KeySort(TLess<>()); uint8 NextPossiblyOpenIndex = 0; for (TPair>& Pair : RemoteWorkers) { check(NextPossiblyOpenIndex <= Pair.Key); while (NextPossiblyOpenIndex != Pair.Key) { UnusedRemoteIndexes.Add(NextPossiblyOpenIndex++); } } // Add RemoteWorkers, pulling the RemoteIndex id from the UnusedRemoteIndexes if any exist // otherwise use the next integer because all indexes up to RemoteWorkers.Num() are in use. 
while (RemoteWorkers.Num() < RequestedCookWorkerCount) { uint8 RemoteIndex; if (UnusedRemoteIndexes.Num()) { RemoteIndex = UnusedRemoteIndexes[0]; UnusedRemoteIndexes.RemoveAtSwap(0); } else { RemoteIndex = RemoteWorkers.Num(); } int32 ProfileId = RemoteWorkerProfileDatas.Num(); RemoteWorkerProfileDatas.Emplace(); RemoteWorkers.Add(RemoteIndex, new FCookWorkerServer(*this, ProfileId, FWorkerId::FromRemoteIndex(RemoteIndex))); bWorkersActive = true; } } void FCookDirector::ActivateMachineResourceReduction() { if (bHasReducedMachineResources) { return; } bHasReducedMachineResources = true; // When running a multiprocess cook, we remove the Memory triggers and trigger GC based solely on PressureLevel. // But keep the Soft GC settings PRAGMA_DISABLE_DEPRECATION_WARNINGS COTFS.MemoryMaxUsedPhysical = 0; COTFS.MemoryMaxUsedVirtual = 0; PRAGMA_ENABLE_DEPRECATION_WARNINGS COTFS.MemoryMinFreeVirtual = 0; COTFS.MemoryMinFreePhysical = 0; COTFS.MemoryTriggerGCAtPressureLevel = FGenericPlatformMemoryStats::EMemoryPressureStatus::Critical; UE_LOG(LogCook, Display, TEXT("CookMultiprocess changed CookSettings for Memory:%s"), *COTFS.GetCookSettingsForMemoryLogText()); // Set CoreLimit for updating workerthreads in this process and passing to the commandline for workers int32 NumProcesses = RequestedCookWorkerCount + 1; int32 NumberOfCores = FPlatformMisc::NumberOfCores(); int32 HyperThreadCount = FPlatformMisc::NumberOfCoresIncludingHyperthreads(); int32 NumberOfHyperThreadsPerCore = HyperThreadCount / NumberOfCores; CoreLimit = FMath::Max(NumberOfCores / NumProcesses, 1); const TCHAR* CommandLine = FCommandLine::Get(); float CoreOversubscription = 1.0f; if (FParse::Value(CommandLine, TEXT("-MPCookCoreSubscription="), CoreOversubscription)) { CoreLimit = FMath::Clamp(static_cast(CoreLimit*CoreOversubscription), 1, NumberOfCores); } int32 CoreIncludingHyperthreadsLimit = CoreLimit * NumberOfHyperThreadsPerCore; int32 NumberOfWorkers = FMath::Max(CoreLimit - 1, 1) * 
NumberOfHyperThreadsPerCore; // Update the number of Cores and WorkerThreads for this process check(IsInGameThread()); int32 NumBackgroundWorkers = FMath::Max(1, NumberOfWorkers - FMath::Min(GNumForegroundWorkers, NumberOfWorkers)); int32 NumForegroundWorkers = FMath::Max(1, NumberOfWorkers - NumBackgroundWorkers); LowLevelTasks::FScheduler::Get().RestartWorkers(NumForegroundWorkers, NumBackgroundWorkers); // Update the number of ShaderCompilerWorkers that can be launched GShaderCompilingManager->OnMachineResourcesChanged(CoreLimit, CoreIncludingHyperthreadsLimit); UE_LOG(LogCook, Display, TEXT("CookMultiprocess changed number of cores from %d to %d."), NumberOfCores, CoreLimit); UE_LOG(LogCook, Display, TEXT("CookMultiprocess changed number of hyperthreads from %d to %d."), HyperThreadCount, CoreIncludingHyperthreadsLimit); } void FCookDirector::TickWorkerConnects(ECookDirectorThread TickThread) { using namespace UE::CompactBinaryTCP; if (!WorkerConnectSocket) { return; } bool bReadReady; while (WorkerConnectSocket->HasPendingConnection(bReadReady) && bReadReady) { FSocket* WorkerSocket = WorkerConnectSocket->Accept(TEXT("Client Connection")); if (!WorkerSocket) { UE_LOG(LogCook, Warning, TEXT("Pending connection failed to create a ClientSocket.")); } else { WorkerSocket->SetNonBlocking(true); PendingConnections.Add(FPendingConnection(WorkerSocket)); } } for (TArray::TIterator Iter(PendingConnections); Iter; ++Iter) { FPendingConnection& Conn = *Iter; TArray Messages; EConnectionStatus Status; Status = TryReadPacket(Conn.Socket, Conn.Buffer, Messages); if (Status != EConnectionStatus::Okay) { UE_LOG(LogCook, Warning, TEXT("Pending connection failed before sending a WorkerPacket: %s"), DescribeStatus(Status)); Iter.RemoveCurrent(); } if (Messages.Num() == 0) { continue; } FPendingConnection LocalConn(MoveTemp(Conn)); Iter.RemoveCurrent(); if (Messages[0].MessageType != FWorkerConnectMessage::MessageType) { UE_LOG(LogCook, Warning, TEXT("Pending connection sent a 
different message before sending a connection message. MessageType: %s. Connection will be ignored."), *Messages[0].MessageType.ToString()); continue; } FWorkerConnectMessage Message; if (!Message.TryRead(MoveTemp(Messages[0].Object))) { UE_LOG(LogCook, Warning, TEXT("Pending connection sent an invalid Connection Message. Connection will be ignored.")); continue; } TRefCountPtr RemoteWorker; { FScopeLock CommunicationScopeLock(&CommunicationLock); TRefCountPtr* RemoteWorkerPtr; RemoteWorkerPtr = RemoteWorkers.Find(Message.RemoteIndex); if (!RemoteWorkerPtr) { TStringBuilder<256> ValidIndexes; if (RemoteWorkers.Num()) { RemoteWorkers.KeySort(TLess<>()); for (TPair>& Pair : RemoteWorkers) { ValidIndexes.Appendf(TEXT("%d,"), Pair.Key); } ValidIndexes.RemoveSuffix(1); // Remove the terminating comma } UE_LOG(LogCook, Warning, TEXT("Pending connection sent a Connection Message with invalid RemoteIndex %d. ValidIndexes = {%s}. Connection will be ignored."), Message.RemoteIndex, *ValidIndexes); continue; } RemoteWorker = *RemoteWorkerPtr; } FSocket* LocalSocket = LocalConn.DetachSocket(); Messages.RemoveAt(0); if (!RemoteWorker->TryHandleConnectMessage(Message, LocalSocket, MoveTemp(Messages), TickThread)) { UE_LOG(LogCook, Warning, TEXT("Pending connection sent a Connection Message with an already in-use RemoteIndex. 
Connection will be ignored.")); Sockets::CloseSocket(LocalSocket); continue; } } } void FCookDirector::TickWorkerShutdowns(ECookDirectorThread TickThread) { // Move any newly shutting down workers from RemoteWorkers TArray> NewShutdowns; TArray, TInlineAllocator<16>> LocalRemoteWorkers; { FScopeLock RemoteWorkersScopeLock(&CommunicationLock); for (TPair>& Pair : ShuttingDownWorkers) { if (!Pair.Value) { NewShutdowns.Emplace(Pair.Key); } else { LocalRemoteWorkers.Add(Pair.Value); } } } if (!NewShutdowns.IsEmpty()) { for (FCookWorkerServer* NewShutdown : NewShutdowns) { AbortWorker(NewShutdown->GetWorkerId(), TickThread); { FScopeLock CommunicationScopeLock(&CommunicationLock); check(ShuttingDownWorkers.FindOrAdd(NewShutdown).IsValid()); // Abort worker should have set the value } LocalRemoteWorkers.Emplace(NewShutdown); } } TArray, TInlineAllocator<16>> CompletedWorkers; for (TRefCountPtr& RemoteWorker : LocalRemoteWorkers) { RemoteWorker->TickCommunication(TickThread); if (RemoteWorker->IsShutdownComplete()) { CompletedWorkers.Add(RemoteWorker); } } LocalRemoteWorkers.Empty(); if (!CompletedWorkers.IsEmpty()) { FScopeLock CommunicationScopeLock(&CommunicationLock); for (TRefCountPtr& CompletedWorker : CompletedWorkers) { ShuttingDownWorkers.Remove(CompletedWorker.GetReference()); } } CompletedWorkers.Empty(); } FString FCookDirector::GetWorkerLogFileName(int32 ProfileId) { FString DirectorLogFileName = FGenericPlatformOutputDevices::GetAbsoluteLogFilename(); FStringView BaseFileName = FPathViews::GetBaseFilenameWithPath(DirectorLogFileName); FStringView Extension = FPathViews::GetExtension(DirectorLogFileName, true /* bIncludeDot */); return FString::Printf(TEXT("%.*s_Worker%d%*s"), BaseFileName.Len(), BaseFileName.GetData(), ProfileId, Extension.Len(), Extension.GetData()); } FString FCookDirector::GetWorkerCommandLine(FWorkerId WorkerId, int32 ProfileId) { const TCHAR* CommandLine = FCommandLine::Get(); const TCHAR* ProjectName = FApp::GetProjectName(); 
checkf(ProjectName && ProjectName[0], TEXT("Expected UnrealEditor to be running with a non-empty project name")); // Note that we need to handle quoted strings for e.g. a projectfile with spaces in it; FParse::Token // does handle them FString Token; TArray Tokens; bool bAssetRegistryCacheWritingDisabled = false; while (FParse::Token(CommandLine, Token, false /* bUseEscape */)) { if (Token.IsEmpty()) { continue; } if (Token.StartsWith(TEXT("-run=")) || Token == TEXT("-CookOnTheFly") || Token == TEXT("-CookWorker") || Token.StartsWith(TEXT("-CookCultures")) || Token.StartsWith(TEXT("-CookDirectorHost=")) || Token.StartsWith(TEXT("-MultiprocessId=")) || Token.StartsWith(TEXT("-CookProfileId=")) || Token.StartsWith(TEXT("-ShowCookWorker")) || Token.StartsWith(TEXT("-CoreLimit")) || Token.StartsWith(TEXT("-PhysicalCoreLimit")) || Token.StartsWith(TEXT("-MPCookCoreSubscription")) || Token.StartsWith(TEXT("-CookProcessCount=")) || Token.StartsWith(TEXT("-abslog=")) || Token.StartsWith(TEXT("-unattended")) ) { continue; } else if (Token.StartsWith(TEXT("-tracefile="))) { FString TraceFile; FString TokenString(Token); if (FParse::Value(*TokenString, TEXT("-tracefile="), TraceFile) && !TraceFile.IsEmpty()) { FStringView BaseFilenameWithPath = FPathViews::GetBaseFilenameWithPath(TraceFile); FStringView Extension = FPathViews::GetExtension(TraceFile, true /* bIncludeDot */); Tokens.Add(FString::Printf(TEXT("-tracefile=\"%.*s_Worker%d%.*s\""), BaseFilenameWithPath.Len(), BaseFilenameWithPath.GetData(), ProfileId, Extension.Len(), Extension.GetData())); continue; } } else if (Token == TEXT("-NoAssetRegistryCacheWrite")) { // If cache writing has been disabled, we must also disable reading of the discovery cache on workers. // Otherwise workers could be looking at old discovery results that no longer match what's on disk. 
Tokens.Add(TEXT("-NoAssetRegistryDiscoveryCache")); bAssetRegistryCacheWritingDisabled = true; } Tokens.Add(MoveTemp(Token)); } if (Tokens[0] != ProjectName && !Tokens[0].EndsWith(TEXT(".uproject"), ESearchCase::IgnoreCase)) { FString ProjectFilePath = FPaths::GetProjectFilePath(); if (!FPaths::IsSamePath(Tokens[0], ProjectFilePath)) { Tokens.Insert(ProjectFilePath, 0); } } Tokens.Insert(TEXT("-run=cook"), 1); Tokens.Insert(TEXT("-cookworker"), 2); Tokens.Insert(FString::Printf(TEXT("-CookProfileId=%d"), ProfileId), 3); Tokens.Insert(FString::Printf(TEXT("-MultiprocessId=%d"), WorkerId.GetMultiprocessId()), 4); // This should have been constructed in TryCreateWorkerConnectSocket before any CookWorkerServers could exist to // call GetWorkerCommandLine check(!WorkerConnectAuthority.IsEmpty()); Tokens.Add(FString::Printf(TEXT("-CookDirectorHost=%s"), *WorkerConnectAuthority)); Tokens.Add(TEXT("-unattended")); Tokens.Add(FString::Printf(TEXT("-abslog=%s"), *GetWorkerLogFileName(ProfileId))); IAssetRegistry& AssetRegistry = IAssetRegistry::GetChecked(); if (AssetRegistry.HasSerializedDiscoveryCache()) { UE_CLOG(bAssetRegistryCacheWritingDisabled, LogCook, Warning, TEXT("Invalid configuration: The Cook Director has written an AssetRegistry discovery cache when explicitly told not to via -NoAssetRegistryCacheWrite")); if (!bAssetRegistryCacheWritingDisabled) { // If the director was successful in writing a discovery cache, let workers re-use the director's cache Tokens.Add(TEXT("-AssetRegistryCacheSkipInvalidate")); } } if (CoreLimit > 0) { Tokens.Add(FString::Printf(TEXT("-PhysicalCoreLimit=%d"), CoreLimit)); } // We are joining the tokens back into a commandline string; wrap tokens with whitespace in quotes for (FString& IterToken : Tokens) { int32 IndexOfWhitespace = UE::String::FindFirstOfAnyChar(IterToken, { ' ', '\r', '\n' }); if (IndexOfWhitespace != INDEX_NONE) { int32 IndexOfQuote; if (!IterToken.FindChar('\"', IndexOfQuote)) { IterToken = 
FString::Printf(TEXT("\"%s\""), *IterToken); } } } return FString::Join(Tokens, TEXT(" ")); } bool FDirectorConnectionInfo::TryParseCommandLine() { if (!FParse::Value(FCommandLine::Get(), TEXT("-CookDirectorHost="), HostURI)) { UE_LOG(LogCook, Error, TEXT("CookWorker startup failed: no CookDirector specified on commandline.")); return false; } uint32 MultiprocessId = UE::GetMultiprocessId(); if (MultiprocessId == 0 && !FParse::Value(FCommandLine::Get(), TEXT("-MultiprocessId="), MultiprocessId)) { UE_LOG(LogCook, Error, TEXT("CookWorker startup failed: no MultiprocessId specified on commandline.")); return false; } if (MultiprocessId < 1 || 257 <= MultiprocessId) { UE_LOG(LogCook, Error, TEXT("CookWorker startup failed: commandline had invalid -MultiprocessId=%d; MultiprocessId must be in the range [1, 256]."), MultiprocessId); return false; } RemoteIndex = static_cast(MultiprocessId - 1); return true; } void FCookDirector::LoadBalance(TConstArrayView SortedWorkers, TArrayView Requests, TMap>&& RequestGraph, TArray& OutAssignments) { OutAssignments.Reset(Requests.Num()); bool bLogResults = bIsFirstAssignment; switch (LoadBalanceAlgorithm) { case ELoadBalanceAlgorithm::Striped: return LoadBalanceStriped(SortedWorkers, Requests, MoveTemp(RequestGraph), OutAssignments, bLogResults); case ELoadBalanceAlgorithm::CookBurden: return LoadBalanceCookBurden(SortedWorkers, Requests, MoveTemp(RequestGraph), OutAssignments, bLogResults); } checkNoEntry(); return LoadBalanceCookBurden(SortedWorkers, Requests, MoveTemp(RequestGraph), OutAssignments, bLogResults); } void FCookDirector::AbortWorker(FWorkerId WorkerId, ECookDirectorThread TickThread) { check(!WorkerId.IsLocal()); int32 Index = WorkerId.GetRemoteIndex(); TRefCountPtr RemoteWorker; { FScopeLock RemoteWorkersScopeLock(&CommunicationLock); RemoteWorkers.RemoveAndCopyValue(Index, RemoteWorker); if (!RemoteWorker) { return; } } TSet PackagesToReassignSet; RemoteWorker->AbortAllAssignments(PackagesToReassignSet, 
TickThread, HeartbeatNumber); if (!RemoteWorker->IsShuttingDown()) { RemoteWorker->AbortWorker(PackagesToReassignSet, TickThread, HeartbeatNumber); } TArray PackagesToReassign = PackagesToReassignSet.Array(); if (TickThread == ECookDirectorThread::SchedulerThread) { ReassignAbortedPackages(PackagesToReassign); } { FScopeLock RemoteWorkersScopeLock(&CommunicationLock); TRefCountPtr& Existing = ShuttingDownWorkers.FindOrAdd(RemoteWorker.GetReference()); // We should not be able to abort a worker twice because we removed it from RemoteWorkers above check(!Existing); Existing = MoveTemp(RemoteWorker); if (TickThread != ECookDirectorThread::SchedulerThread) { DeferredPackagesToReassign.Append(PackagesToReassign); } } } void FCookDirector::ReassignAbortedPackages(TArray& PackagesToReassign) { for (FPackageData* PackageData : PackagesToReassign) { // Packages that were assigned to a worker should be in the AssignedToWorker state and therefore in progress. check(PackageData->IsInProgress()); PackageData->SetWorkerAssignment(FWorkerId::Invalid()); EPackageState NewState = PackageData->IsInStateProperty(EPackageStateProperty::Saving) ? 
EPackageState::SaveActive : EPackageState::Request; PackageData->SendToState(NewState, ESendFlags::QueueAddAndRemove, EStateChangeReason::ReassignAbortedPackages); } PackagesToReassign.Empty(); } void FCookDirector::ConstructReadonlyThreadVariables() { IsCookIgnoreTimeouts(); // The global variables are read-only; call them now to initialize them CommandletExecutablePath = FUnrealEdMisc::Get().GetProjectEditorBinaryPath(); InitialConfigMessage = MakeUnique(); FMPCollectorServerTickContext StartupContext(FMPCollectorServerTickContext::EServerEventType::WorkerStartup); StartupContext.Platforms = COTFS.PlatformManager->GetSessionPlatforms(); InitialConfigMessage->ReadFromLocal(COTFS, StartupContext.Platforms, *COTFS.CookByTheBookOptions, *COTFS.CookOnTheFlyOptions, BeginCookContext); for (const TPair>& CollectorPair : Collectors) { IMPCollector* Collector = CollectorPair.Value.GetReference(); Collector->ServerTick(StartupContext); if (!StartupContext.Messages.IsEmpty()) { FGuid MessageType = Collector->GetMessageType(); for (FCbObject& Object : StartupContext.Messages) { InitialConfigMessage->AddMessage({ MessageType, MoveTemp(Object) }); } StartupContext.Messages.Reset(); } } } const FInitialConfigMessage& FCookDirector::GetInitialConfigMessage() { return *InitialConfigMessage; } FCookDirector::FLaunchInfo FCookDirector::GetLaunchInfo(FWorkerId WorkerId, int32 ProfileId) { FLaunchInfo Info; Info.ShowWorkerOption = GetShowWorkerOption(); Info.CommandletExecutable = CommandletExecutablePath; Info.WorkerCommandLine = GetWorkerCommandLine(WorkerId, ProfileId); return Info; } #if ENABLE_COOK_STATS void FCookDirector::LogCookStats(FCookStatsManager::AddStatFuncRef AddStat) { auto IdleTimeToString = [](float IdleTime) { return FString::Printf(TEXT("%.1fs"), IdleTime); }; TArray Stats; Stats.Emplace(TEXT("LocalWorker IdleTime"), IdleTimeToString(LocalWorkerProfileData->IdleTimeSeconds)); for (int32 ProfileId = 0; ProfileId < RemoteWorkerProfileDatas.Num(); ++ProfileId) { 
FCookWorkerProfileData& ProfileData = RemoteWorkerProfileDatas[ProfileId]; Stats.Emplace(FString::Printf(TEXT("CookWorker %d IdleTime"), ProfileId), IdleTimeToString(ProfileData.IdleTimeSeconds)); } AddStat(TEXT("CookDirector"), Stats); } #endif FCookDirector::FRetractionHandler::FRetractionHandler(FCookDirector& InDirector) : Director(InDirector) { } FCookDirector::FRetractionHandler::ERetractionState FCookDirector::FRetractionHandler::TickWantToRetract( bool& bOutAnyIdle, int32& OutBusiestNumAssignments) { FWorkerId BusiestWorker; TArray IdleWorkers; int32 BusiestNumAssignments = 0; if (Director.bAllowLocalCooks) { int32 NumAssignments = Director.COTFS.NumMultiprocessLocalWorkerAssignments(); if (NumAssignments > BusiestNumAssignments) { if (IsAvailableForRetraction(FWorkerId::Local())) { BusiestWorker = FWorkerId::Local(); BusiestNumAssignments = NumAssignments; } } if (NumAssignments == 0) { IdleWorkers.Add(FWorkerId::Local()); } } TArray> LocalRemoteWorkers = Director.CopyRemoteWorkers(); for (const TRefCountPtr& RemoteWorker : LocalRemoteWorkers) { int32 NumAssignments = RemoteWorker->NumAssignments(); if (NumAssignments > BusiestNumAssignments) { if (IsAvailableForRetraction(RemoteWorker->GetWorkerId())) { BusiestWorker = RemoteWorker->GetWorkerId(); BusiestNumAssignments = NumAssignments; } } if (NumAssignments == 0) { IdleWorkers.Add(RemoteWorker->GetWorkerId()); } } bOutAnyIdle = !IdleWorkers.IsEmpty(); OutBusiestNumAssignments = BusiestNumAssignments; if (IdleWorkers.IsEmpty() || BusiestNumAssignments < RetractionMinimumNumAssignments) { // Worker loads changed after the point where we decided to initialize the RetractionHandler, // or all workers with packages assigned are unavailable for retraction, so retraction is not // currently possible. Try again later. return ERetractionState::WantToRetract; } check(!BusiestWorker.IsInvalid()); // Plan to divide the assignments evenly between all idle workers and the one busiest worker. 
This means // retracting all but 1/(N+1) packages from the busiest worker. int32 NumAssignmentsToRetract = (BusiestNumAssignments * IdleWorkers.Num()) / (IdleWorkers.Num() + 1); TStringBuilder<256> IdleWorkerListText; for (FWorkerId& WorkerId : IdleWorkers) { IdleWorkerListText << Director.GetDisplayName(WorkerId) << TEXT(", "); } IdleWorkerListText.RemoveSuffix(2); UE_LOG(LogCook, Display, TEXT("Idle CookWorkers: { %s }. Retracting %d packages from %s to distribute to the idle CookWorkers."), *IdleWorkerListText, NumAssignmentsToRetract, *Director.GetDisplayName(BusiestWorker)); Director.DisplayRemainingPackages(); if (BusiestWorker.IsLocal()) { ExpectedWorker = FWorkerId::Local(); WorkerWithResults = ExpectedWorker; TArray LocalPackagesToRetract; Director.COTFS.GetPackagesToRetract(NumAssignmentsToRetract, LocalPackagesToRetract); PackagesToRetract.FindOrAdd(ExpectedWorker).Append(MoveTemp(LocalPackagesToRetract)); } else { TRefCountPtr* RemoteWorker = LocalRemoteWorkers.FindByPredicate( [&BusiestWorker](const TRefCountPtr& X) { return X->GetWorkerId() == BusiestWorker; }); check(RemoteWorker); FRetractionRequestMessage Message; Message.RequestedCount = NumAssignmentsToRetract; (*RemoteWorker)->SendMessage(Message, ECookDirectorThread::SchedulerThread); ExpectedWorker = BusiestWorker; MessageSentTimeSeconds = FPlatformTime::Seconds(); LastWarnTimeSeconds = MessageSentTimeSeconds; } return ERetractionState::WaitingForResponse; } void FCookDirector::FRetractionHandler::InitializeForResultsMessage(const FWorkerId& FromWorker) { ExpectedWorker = FromWorker; } void FCookDirector::FRetractionHandler::TickFromSchedulerThread(bool bAllWorkersConnected, bool bAnyIdle, int32 BusiestNumAssignments) { bool bHadStateChange; int32 NumTransitions = 0; constexpr int32 MaxNumTransitions = static_cast(ERetractionState::Count); do { bHadStateChange = false; switch (RetractionState) { case ERetractionState::Idle: { if (!bAnyIdle || !bAllWorkersConnected || BusiestNumAssignments <= 
RetractionMinimumNumAssignments) { break; } SetRetractionState(ERetractionState::WantToRetract, bHadStateChange); break; } case ERetractionState::WantToRetract: { if (!bAnyIdle || !bAllWorkersConnected || BusiestNumAssignments <= RetractionMinimumNumAssignments) { SetRetractionState(ERetractionState::Idle, bHadStateChange); break; } ERetractionState NewState = TickWantToRetract(bAnyIdle, BusiestNumAssignments); SetRetractionState(NewState, bHadStateChange); break; } case ERetractionState::WaitingForResponse: { ERetractionState NewState = TickWaitingForResponse(); SetRetractionState(NewState, bHadStateChange); break; } default: checkNoEntry(); break; } } while (bHadStateChange && RetractionState != ERetractionState::Idle && ++NumTransitions <= MaxNumTransitions); } void FCookDirector::FRetractionHandler::SetRetractionState(ERetractionState NewState, bool &bOutHadStateChange) { if (RetractionState == NewState) { bOutHadStateChange = false; return; } bOutHadStateChange = true; RetractionState = NewState; if (NewState == ERetractionState::Idle) { WorkersUnavailableForRetract.Empty(); } } bool FCookDirector::FRetractionHandler::IsAvailableForRetraction(const FWorkerId& WorkerId) { // Called from inside CommunicationLock int32* AssignedPackagesFence = WorkersUnavailableForRetract.Find(WorkerId); if (!AssignedPackagesFence) { return true; } int32 CurrentFenceMarker; if (WorkerId.IsLocal()) { CurrentFenceMarker = Director.COTFS.PackageDatas->GetMonitor().GetMPCookAssignedFenceMarker(); } else { const TRefCountPtr* RemoteWorker = Director.FindRemoteWorkerInLock(WorkerId); if (!RemoteWorker) { WorkersUnavailableForRetract.Remove(WorkerId); return true; } CurrentFenceMarker = (*RemoteWorker)->GetPackagesAssignedFenceMarker(); } if (*AssignedPackagesFence == CurrentFenceMarker) { // FenceMarker has not changed since we recorded the worker as unavailable for retraction at that fence marker // The worker is still unavailable for retraction return false; } 
WorkersUnavailableForRetract.Remove(WorkerId); return true; } FCookDirector::FRetractionHandler::ERetractionState FCookDirector::FRetractionHandler::TickWaitingForResponse() { // Called from inside CommunicationLock if (ExpectedWorker.IsInvalid()) { // We decided to cancel checkf(PackagesToRetract.IsEmpty(), TEXT("We should not have any packages when we cancelled.")); return ERetractionState::Idle; } if (WorkerWithResults.IsInvalid()) { double CurrentTime = FPlatformTime::Seconds(); constexpr float WarnDuration = 60.f; if (static_cast(CurrentTime - LastWarnTimeSeconds) < WarnDuration) { return ERetractionState::WaitingForResponse; } check(ExpectedWorker.IsRemote()); { const TRefCountPtr* RemoteWorkerPtr = Director.FindRemoteWorkerInLock(ExpectedWorker); if (!RemoteWorkerPtr) { // The CookWorker aborted and we already reassigned all of its packages; stop waiting for a retraction // message from it. check(PackagesToRetract.IsEmpty()); // Otherwise WorkerWithResults would have been set ExpectedWorker = FWorkerId::Invalid(); return ERetractionState::Idle; } } UE_CLOG(!IsCookIgnoreTimeouts(), LogCook, Display, TEXT("%s has not responded to a RetractionRequest message for %.1f seconds. Continuing to wait..."), *Director.GetDisplayName(ExpectedWorker), static_cast(CurrentTime - MessageSentTimeSeconds)); LastWarnTimeSeconds = CurrentTime; return ERetractionState::WaitingForResponse; } // Convert names to packagedatas and collect results from all CookWorkers who sent a message. 
TArray PackageDatasToReassign; for (const TPair>& Pair : PackagesToRetract) { TRefCountPtr RemoteWorker; if (Pair.Key.IsRemote()) { const TRefCountPtr* FoundRemoteWorker = Director.FindRemoteWorkerInLock(Pair.Key); if (FoundRemoteWorker) { RemoteWorker = *FoundRemoteWorker; } } TArray WorkerPackageDatas; WorkerPackageDatas.Reserve(Pair.Value.Num()); for (FName PackageName : Pair.Value) { FPackageData* PackageData = Director.COTFS.PackageDatas->FindPackageDataByPackageName(PackageName); if (PackageData) { WorkerPackageDatas.Add(PackageData); } } if (RemoteWorker) { // The worker(s) that sent the retraction message aborted all of the packages, so mark locally that they // have been aborted RemoteWorker->AbortAssignments(WorkerPackageDatas, ECookDirectorThread::SchedulerThread, Director.HeartbeatNumber, ENotifyRemote::LocalOnly); } PackageDatasToReassign.Append(MoveTemp(WorkerPackageDatas)); } // Reassign the packages ERetractionResult Result = ReassignPackages(WorkerWithResults, PackageDatasToReassign); if (Result == ERetractionResult::NoneAvailable) { TOptional AssignedPackagesFence; if (WorkerWithResults.IsLocal()) { AssignedPackagesFence.Emplace(Director.COTFS.PackageDatas->GetMonitor().GetMPCookAssignedFenceMarker()); } else { const TRefCountPtr* RemoteWorker = Director.FindRemoteWorkerInLock(WorkerWithResults); if (RemoteWorker) { AssignedPackagesFence.Emplace((*RemoteWorker)->GetPackagesAssignedFenceMarker()); } } if (AssignedPackagesFence.IsSet()) { WorkersUnavailableForRetract.Add(WorkerWithResults, *AssignedPackagesFence); } } // Mark that we are no longer waiting ExpectedWorker = FWorkerId::Invalid(); WorkerWithResults = FWorkerId::Invalid(); PackagesToRetract.Empty(); // Return to WantToRetract state; that state will handle returning to idle if the retraction was sufficient return ERetractionState::WantToRetract; } void FCookDirector::FRetractionHandler::HandleRetractionMessage(FMPCollectorServerMessageContext& Context, bool bReadSuccessful, 
FRetractionResultsMessage&& Message) { // Called from inside CommunicationLock if (!bReadSuccessful) { UE_LOG(LogCook, Error, TEXT("Corrupt RetractionResultsMessage received from CookWorker %d. It will be ignored and packages may fail to cook."), Context.GetProfileId()); return; } if (RetractionState != ERetractionState::WaitingForResponse) { UE_LOG(LogCook, Warning, TEXT("Retractionmessage received from CookWorker %d when we were not expecting one."), Context.GetProfileId()); InitializeForResultsMessage(Context.GetWorkerId()); bool bUnusedHadStateChange; SetRetractionState(ERetractionState::WaitingForResponse, bUnusedHadStateChange); } UE_CLOG(WorkerWithResults.IsValid(), LogCook, Error, TEXT("Unexpectedly received RetractionResults message from multiple CookWorkers. Merging the results.")); WorkerWithResults = Context.GetWorkerId(); PackagesToRetract.FindOrAdd(WorkerWithResults).Append(Message.ReturnedPackages); } FCookDirector::FRetractionHandler::ERetractionResult FCookDirector::FRetractionHandler::ReassignPackages(const FWorkerId& FromWorker, TConstArrayView Packages) { TArray> LocalRemoteWorkers = Director.CopyRemoteWorkers(); TArray WorkersRequiredByConstraint; TArray AssignmentPackages; for (FPackageData* PackageData : Packages) { EPackageState State = PackageData->GetState(); if (State == EPackageState::Idle) { continue; } FWorkerId WorkerConstraint = PackageData->GetWorkerAssignmentConstraint(); if (WorkerConstraint.IsValid()) { if (!WorkerConstraint.IsLocal() && !LocalRemoteWorkers.FindByPredicate( [&WorkerConstraint](const TRefCountPtr& X) { return X->GetWorkerId() == WorkerConstraint; })) { continue; } WorkersRequiredByConstraint.AddUnique(WorkerConstraint); } AssignmentPackages.Add(PackageData); // The package is no longer assigned to the worker, and the worker already knows it, so remove its assignment. 
// We have to remove the assignment so that we can reassign it down below; it is invalid to call // SetWorkerAssignment to a new worker without clearing the old worker first. PackageData->SetWorkerAssignment(FWorkerId::Invalid()); } if (AssignmentPackages.IsEmpty()) { UE_LOG(LogCook, Display, TEXT("Retraction results message received from %s; no packages were available for retraction."), *Director.GetDisplayName(FromWorker)); Director.DisplayRemainingPackages(); return ERetractionResult::NoneAvailable; } TArray WorkersToSplitOver = CalculateWorkersToSplitOver(AssignmentPackages.Num(), FromWorker, LocalRemoteWorkers); if (WorkersToSplitOver.IsEmpty()) { // Send the packages back to the Director for reassignment TPackageDataMap& RestartedRequests = Director.COTFS.PackageDatas->GetRequestQueue().GetRestartedRequests(); for (FPackageData* PackageData : AssignmentPackages) { // UnStall the package to handle the case where it was previously in the save state on the LocalWorker // and was retracted to a remote worker, so we stalled it, but now we've pulled it back to the LocalWorker. PackageData->UnStall(ESendFlags::QueueAddAndRemove); // If the packagedata was unstalled back into the Save State, then keep it there, but otherwise it is in // the AssignedToWorker state so kick it back to readyrequest. if (PackageData->GetState() != EPackageState::SaveActive) { PackageData->SendToState(EPackageState::Request, ESendFlags::QueueRemove, EStateChangeReason::Retraction); RestartedRequests.Add(PackageData, ESuppressCookReason::RetractedByCookDirector); } } // MPCOOKTODO: Add a method to PumpRequests long enough to assign the packages UE_LOG(LogCook, Display, TEXT("%d packages retracted from %s. 
No workers are currently idle so the packages were assigned evenly to all CookWorkers."), AssignmentPackages.Num(), *Director.GetDisplayName(FromWorker)); Director.DisplayRemainingPackages(); return ERetractionResult::Retracted; } for (const FWorkerId& WorkerId : WorkersRequiredByConstraint) { WorkersToSplitOver.AddUnique(WorkerId); } TStringBuilder<256> WorkerListText; for (const FWorkerId& WorkerId : WorkersToSplitOver) { WorkerListText << Director.GetDisplayName(WorkerId) << TEXT(", "); } WorkerListText.RemoveSuffix(2); UE_LOG(LogCook, Display, TEXT("%d packages retracted from %s and distributed to idle workers { %s }."), AssignmentPackages.Num(), *Director.GetDisplayName(FromWorker), *WorkerListText); TMap> RequestGraph; TArray Assignments; Director.AssignRequests(MoveTemp(WorkersToSplitOver), LocalRemoteWorkers, AssignmentPackages, Assignments, MoveTemp(RequestGraph), false /* bInitialAssignment */); FRequestQueue& RequestQueue = Director.COTFS.PackageDatas->GetRequestQueue(); bool bAssignedToLocal = false; for (int32 Index = 0; Index < AssignmentPackages.Num(); ++Index) { FPackageData* PackageData = AssignmentPackages[Index]; FWorkerId Assignment = Assignments[Index]; if (Assignment.IsInvalid()) { Director.COTFS.DemoteToIdle(*PackageData, ESendFlags::QueueAdd, ESuppressCookReason::MultiprocessAssignmentError); } else if (Assignment.IsLocal()) { // UnStall the package to handle the case where it was previously in the save state on the LocalWorker // and was retracted to a remote worker, so we stalled it, but now we've pulled it back to the LocalWorker. PackageData->UnStall(ESendFlags::QueueAddAndRemove); // If the packagedata was unstalled back into the Save State, then keep it there, but otherwise it is in // the AssignedToWorker state so kick it back to readyrequest. 
if (PackageData->GetState() != EPackageState::SaveActive) { PackageData->SendToState(EPackageState::Request, ESendFlags::QueueRemove, EStateChangeReason::Retraction); RequestQueue.AddReadyRequest(PackageData); } bAssignedToLocal = true; } else { TRefCountPtr GenerationHelper = PackageData->GetGenerationHelper(); if (!GenerationHelper) { GenerationHelper = PackageData->GetParentGenerationHelper(); } bool bShouldStall = false; if (GenerationHelper) { bShouldStall = GenerationHelper->ShouldRetractionStallRatherThanDemote(*PackageData); } if (bShouldStall) { PackageData->Stall(EPackageState::SaveStalledAssignedToWorker, ESendFlags::QueueAddAndRemove); // ShouldRetractionStallRatherThanDemote should not return true unless the Stall will succeed check(PackageData->GetState() == EPackageState::SaveStalledAssignedToWorker); UE_LOG(LogCook, Display, TEXT("Retracting generated package %s; it will remain in memory on the director's worker until the generator finishes saving."), *WriteToString<256>(PackageData->GetPackageName())); } else { PackageData->SendToState(EPackageState::AssignedToWorker, ESendFlags::QueueAddAndRemove, EStateChangeReason::Retraction); } PackageData->SetWorkerAssignment(Assignment); } } Director.DisplayRemainingPackages(); if (bAssignedToLocal) { // Clear the SoftGC diagnostic ExpectedNeverLoadPackages because we have new assigned packages // that we didn't consider during SoftGC Director.COTFS.PackageTracker->ClearExpectedNeverLoadPackages(); } return ERetractionResult::Retracted; } TArray FCookDirector::FRetractionHandler::CalculateWorkersToSplitOver(int32 NumPackages, const FWorkerId& FromWorker, TConstArrayView> LocalRemoteWorkers) { TArray> WorkerNumPackages; if (FromWorker != FWorkerId::Local() && Director.bAllowLocalCooks) { WorkerNumPackages.Emplace(FWorkerId::Local(), Director.COTFS.NumMultiprocessLocalWorkerAssignments()); } for (const TRefCountPtr& RemoteWorker : LocalRemoteWorkers) { if (FromWorker != RemoteWorker->GetWorkerId()) { 
WorkerNumPackages.Emplace(RemoteWorker->GetWorkerId(), RemoteWorker->NumAssignments()); } } if (WorkerNumPackages.Num() == 0) { return TArray(); } WorkerNumPackages.Sort([](const TPair& A, const TPair& B) { return A.Value < B.Value; }); // Consider splitting the packages amonst the 1 lowest, 2 lowest, ... n lowest (not including the FromWorker) // Pick the value to split over based on whichever split group results in the lowest post split maximum // So splitting 500 over 0,1000,1000,1000 -> would give them all to the first, but splitting 500 over // 0, 100, 1000, 1000 would split them amongst the first two. int32 BestNumToSplitOver = 0; int32 BestPostSplitValue = 0; for (int32 NumToSplitOver = 1; NumToSplitOver <= WorkerNumPackages.Num(); ++NumToSplitOver) { int32 PostSplitValue = WorkerNumPackages[NumToSplitOver - 1].Value + NumPackages / NumToSplitOver; if (BestNumToSplitOver == 0 || PostSplitValue < BestPostSplitValue) { BestNumToSplitOver = NumToSplitOver; BestPostSplitValue = PostSplitValue; } } check(BestNumToSplitOver > 0); TArray Results; Results.Reserve(BestNumToSplitOver); for (const TPair& Pair : TArrayView>(WorkerNumPackages).Left(BestNumToSplitOver)) { Results.Add(Pair.Key); } return Results; } void FRetractionRequestMessage::Write(FCbWriter& Writer) const { Writer << "RequestedCount" << RequestedCount; } bool FRetractionRequestMessage::TryRead(FCbObjectView Object) { return LoadFromCompactBinary(Object["RequestedCount"], RequestedCount); } FGuid FRetractionRequestMessage::MessageType(TEXT("7109E168E8A8405BA65F9E1E82571D1A")); void FRetractionResultsMessage::Write(FCbWriter& Writer) const { Writer << "ReturnedPackages" << ReturnedPackages; } bool FRetractionResultsMessage::TryRead(FCbObjectView Object) { return LoadFromCompactBinary(Object["ReturnedPackages"], ReturnedPackages); } FGuid FRetractionResultsMessage::MessageType(TEXT("CBFB840A4FB94903A757C490514A4B86")); void FDirectorEventMessage::Write(FCbWriter& Writer) const { 
static_assert(sizeof(EDirectorEvent) <= sizeof(uint8), "We are storing it in a uint8"); Writer << "T" << (uint8)Event; } bool FDirectorEventMessage::TryRead(FCbObjectView Object) { bool bOk = true; uint8 EventInt = Object["T"].AsUInt8(static_cast(EDirectorEvent::Count)); if (EventInt < static_cast(EDirectorEvent::Count)) { Event = static_cast(EventInt); } else { bOk = false; } return bOk; } FGuid FDirectorEventMessage::MessageType(TEXT("58A03A38FEE045B08331BB8457AEBE35")); }