// Copyright Epic Games, Inc. All Rights Reserved. #pragma once #include "Containers/Array.h" #include "Containers/ArrayView.h" #include "Containers/Map.h" #include "Containers/UnrealString.h" #include "Cooker/CompactBinaryTCP.h" #include "Cooker/CookSockets.h" #include "Cooker/CookTypes.h" #include "Cooker/MPCollector.h" #include "HAL/CriticalSection.h" #include "HAL/Event.h" #include "HAL/LowLevelMemTracker.h" #include "HAL/Runnable.h" #include "Memory/SharedBuffer.h" #include "Misc/Guid.h" #include "Misc/ScopeLock.h" #include "ProfilingDebugging/CookStats.h" #include "Templates/RefCounting.h" #include "Templates/UniquePtr.h" #include class FCbObject; class FCbWriter; class FRunnableThread; class UCookOnTheFlyServer; namespace UE::Cook { class FCookWorkerServer; } namespace UE::Cook { struct FAssignPackageExtraData; } namespace UE::Cook { struct FCookWorkerProfileData; } namespace UE::Cook { struct FDirectorEventMessage; } namespace UE::Cook { struct FGeneratorEventMessage; } namespace UE::Cook { struct FGenerationHelper; } namespace UE::Cook { struct FHeartbeatMessage; } namespace UE::Cook { struct FInitialConfigMessage; } namespace UE::Cook { struct FPackageData; } namespace UE::Cook { struct FRetractionResultsMessage; } namespace UE::Cook { struct FWorkerId; } LLM_DECLARE_TAG(Cooker_MPCook); namespace UE::Cook { /** * The categories of thread that can pump the communication with CookWorkers. The communication can be pumped either from * the cooker's scheduler thread (aka Unreal's game thread or main thread) or from a worker thread. */ enum class ECookDirectorThread : uint8 { SchedulerThread, CommunicateThread, Invalid, }; /** * Timings for messages sent to CookWorker or CookDirector through the public Broadcast functions. * The timing indicates at which point in the network tick the message should be sent. 
*/ enum class ECookBroadcastTiming : uint8 { Immediate, AfterAssignPackages, }; /** * Helper for CookOnTheFlyServer that sends requests to CookWorker processes for load/save and merges * their replies into the local process's cook results. */ class FCookDirector { public: FCookDirector(UCookOnTheFlyServer& InCOTFS, int32 CookProcessCount, bool bInCookProcessCountSetByCommandLine); ~FCookDirector(); bool IsMultiprocessAvailable() const; void StartCook(const FBeginCookContext& Context); /** * Assign the given requests out to CookWorkers (or keep on local COTFS), return the list of assignments. * Input requests have been sorted by leaf to root load order. */ void AssignRequests(TArrayView Requests, TArray& OutAssignments, TMap>&& RequestGraph); /** Notify the CookWorker that owns the cook of the package that the Director wants to take it back. */ void RemoveFromWorker(FPackageData& PackageData); /** * Send a message to all CookWorkers to match a state change on the Director. * The message will be sent at the time indicated by the Timing argument; either Immediate, before any other messages * from the director that are triggered after the call, or at the specified later time. */ void BroadcastMessage(const IMPCollectorMessage& Message, ECookBroadcastTiming Timing = ECookBroadcastTiming::Immediate); void BroadcastMessage(UE::CompactBinaryTCP::FMarshalledMessage&& Message, ECookBroadcastTiming Timing = ECookBroadcastTiming::Immediate); /** * Increment the current heartbeat number, send a heartbeat message to all CookWorkers, and return the number in * the broadcast. This number can then be queried for whether the CookWorkers have acknowledged it in a round trip * message. */ int32 InsertBroadcastFence(); /** Report whether all CookWorkers have acknowledged a previously inserted fence. */ bool IsBroadcastFencePassed(int32 Fence, TArray* OutPendingWorkers); /** Periodic tick function. Sends/Receives messages to CookWorkers. 
*/ void TickFromSchedulerThread(); /** Periodic display function, called from CookOnTheFlyServer.UpdateDisplay. */ void UpdateDisplayDiagnostics() const; /** * Called when the COTFS Server has detected all packages are complete. Tells the CookWorkers to flush messages * and exit. */ void PumpCookComplete(bool& bOutCompleted); /** * Called when a session ends. The Director blocks on shutdown of all CookWorkers and returns state to before * session started. */ void ShutdownCookSession(); /** Enum specifying how CookWorker log output should be shown. */ enum EShowWorker { CombinedLogs, SeparateLogs, SeparateWindows, // Implies SeparateLogs as well }; EShowWorker GetShowWorkerOption() const { return ShowWorkerOption; } /** Register a Collector to receive messages of its MessageType from CookWorkers. */ void Register(IMPCollector* Collector); /** Unregister a Collector that was registered. */ void Unregister(IMPCollector* Collector); /** Data used by a CookWorkerServer to launch the remote process. */ struct FLaunchInfo { EShowWorker ShowWorkerOption; FString CommandletExecutable; FString WorkerCommandLine; }; FLaunchInfo GetLaunchInfo(FWorkerId WorkerId, int32 ProfileId); FString GetDisplayName(const FWorkerId& WorkerId, int32 PreferredWidth = -1) const; /** The message CookWorkerServer sends to the remote process once it is ready to connect. */ const FInitialConfigMessage& GetInitialConfigMessage(); private: enum class ELoadBalanceAlgorithm { Striped, CookBurden, }; /** CookWorker connections that have not yet identified which CookWorker they are. 
*/ struct FPendingConnection { explicit FPendingConnection(FSocket* InSocket = nullptr) :Socket(InSocket) { } FPendingConnection(FPendingConnection&& Other); FPendingConnection(const FPendingConnection& Other) = delete; ~FPendingConnection(); FSocket* DetachSocket(); FSocket* Socket = nullptr; UE::CompactBinaryTCP::FReceiveBuffer Buffer; }; /** Struct that implements the FRunnable interface and forwards it to named functions on this FCookDirector. */ struct FRunnableShunt : public FRunnable { FRunnableShunt(FCookDirector& InDirector) : Director(InDirector) {} virtual uint32 Run() override; virtual void Stop() override; FCookDirector& Director; }; class FRetractionHandler; private: /** Helper for constructor parsing. */ void ParseConfig(int32 CookProcessCount, bool& bOutValid); /** Initialization helper: create the listen socket. */ bool TryCreateWorkerConnectSocket(); /** * Construct CookWorkerServers and communication thread if not yet constructed. The CookWorkerServers are * constructed to Uninitialized; the worker process is created asynchronously during TickCommunication. */ void InitializeWorkers(); /** * Copy to snapshot variables the data required on the communication thread that can only be read from the * scheduler thread. */ void ConstructReadonlyThreadVariables(); /** Construct CookWorkerServers if necessary to replace workers that have crashed. */ void RecreateWorkers(); /** Reduce memory settings, CPU settings, and anything else that needs to be shared with CookWorkers. */ void ActivateMachineResourceReduction(); /** Start the communication thread if not already started. */ void LaunchCommunicationThread(); /** Signal the communication thread to stop, wait for it to finish, and deallocate it. */ void StopCommunicationThread(); /** Entry point for the communication thread. 
*/ uint32 RunCommunicationThread(); /** * Execute a single frame of communication with CookWorkers: send/receive to all CookWorkers, * including connecting, ongoing communication, and shutting down. */ void TickCommunication(ECookDirectorThread TickThread); /** Tick helper: tick any workers that have not yet finished initialization. */ void TickWorkerConnects(ECookDirectorThread TickThread); /** Tick helper: tick any workers that are shutting down. */ void TickWorkerShutdowns(ECookDirectorThread TickThread); /** The LogPath a worker process writes to. */ FString GetWorkerLogFileName(int32 ProfileId); /** Get the commandline to launch a worker process with. */ FString GetWorkerCommandLine(FWorkerId WorkerId, int32 ProfileId); /** Calls the configured LoadBalanceAlgorithm. Input Requests have been sorted by leaf to root load order. */ void LoadBalance(TConstArrayView SortedWorkers, TArrayView Requests, TMap>&& RequestGraph, TArray& OutAssignments); /** Report whether it is time for a heartbeat message and update the timer data. */ void TickHeartbeat(bool bForceHeartbeat, double CurrentTimeSeconds, bool& bOutSendHeartbeat, int32& OutHeartbeatNumber); /** Reset the IdleHeartbeatFence when new idle-breaking data comes in. */ void ResetFinalIdleHeartbeatFence(); /** Log the occurrence of a heartbeat message from a CookWorker. */ void HandleHeartbeatMessage(FMPCollectorServerMessageContext& Context, bool bReadSuccessful, FHeartbeatMessage&& Message); /** Move the given worker from active workers to the list of workers shutting down. */ void AbortWorker(FWorkerId WorkerId, ECookDirectorThread TickThread); /** Send the given packages from an aborted worker back to the CookOnTheFlyServer for reassignment. */ void ReassignAbortedPackages(TArray& PackagesToReassign); /** * Periodically update whether (1) local server is done and (2) no results from cookworkers have come in. * Send warning when it goes on too long. 
*/ void SetWorkersStalled(bool bInWorkersStalled); /** Callback for CookStats system to log our stats. */ #if ENABLE_COOK_STATS void LogCookStats(FCookStatsManager::AddStatFuncRef AddStat); #endif void AssignRequests(TArray&& InWorkers, TArray>& InRemoteWorkers, TArrayView Requests, TArray& OutAssignments, TMap>&& RequestGraph, bool bInitialAssignment); TArray> CopyRemoteWorkers() const; void DisplayRemainingPackages() const; FString GetDisplayName(const FCookWorkerServer& RemoteWorker, int32 PreferredWidth=-1) const; const TRefCountPtr* FindRemoteWorkerInLock(const FWorkerId& WorkerId) const; TMap GetAssignPackageExtraDatas( TConstArrayView Requests) const; TArray GetInfoPackagesForRequests(TConstArrayView Requests) const; private: // Synchronization primitives that can be used from any thread mutable FCriticalSection CommunicationLock; FEventRef ShutdownEvent {EEventMode::ManualReset}; // Data only accessible from the SchedulerThread FRunnableShunt RunnableShunt; FRunnableThread* CommunicationThread = nullptr; TArray RemoteWorkerProfileDatas; TArray PendingConnections; TUniquePtr LocalWorkerProfileData; TArray> QueuedBroadcasts; UCookOnTheFlyServer& COTFS; double WorkersStalledStartTimeSeconds = 0.; double WorkersStalledWarnTimeSeconds = 0.; double LastTickTimeSeconds = 0.; double NextHeartbeatTimeSeconds = 0.; int32 HeartbeatNumber = 0; int32 FinalIdleHeartbeatFence = -1; bool bWorkersInitialized = false; bool bHasReducedMachineResources = false; bool bIsFirstAssignment = true; bool bCookCompleteSent = false; bool bWorkersStalled = false; bool bMultiprocessAvailable = false; bool bReceivingMessages = false; bool bForceNextHeartbeat = false; bool bCookProcessCountSetByCommandLine = false; // Data that is read-only while the CommunicationThread is active and is readable from any thread FBeginCookContextForWorker BeginCookContext; TMap> Collectors; TUniquePtr InitialConfigMessage; FString WorkerConnectAuthority; FString CommandletExecutablePath; int32 
RequestedCookWorkerCount = 0; int32 WorkerConnectPort = 0; int32 CoreLimit = 0; EShowWorker ShowWorkerOption = EShowWorker::CombinedLogs; ELoadBalanceAlgorithm LoadBalanceAlgorithm = ELoadBalanceAlgorithm::CookBurden; /** Whether the director is allowed to cook any packages. True by default, false by commandline parameter. */ bool bAllowLocalCooks = true; // Data only accessible from the CommunicationThread (or if the CommunicationThread is inactive) FSocket* WorkerConnectSocket = nullptr; // Data shared between SchedulerThread and CommunicationThread that can only be accessed inside CommunicationLock TMap> RemoteWorkers; TMap> ShuttingDownWorkers; TArray DeferredPackagesToReassign; TUniquePtr RetractionHandler; bool bWorkersActive = false; friend class UE::Cook::FCookWorkerServer; }; /** Parameters parsed from commandline for how a CookWorker connects to the CookDirector. */ struct FDirectorConnectionInfo { bool TryParseCommandLine(); FString HostURI; int32 RemoteIndex = 0; }; /** Message sent from a CookWorker to the Director to report that it is ready for setup messages and cooking. */ struct FWorkerConnectMessage : public IMPCollectorMessage { public: virtual void Write(FCbWriter& Writer) const override; virtual bool TryRead(FCbObjectView Object) override; virtual FGuid GetMessageType() const override { return MessageType; } virtual const TCHAR* GetDebugName() const override { return TEXT("WorkerConnectMessage"); } public: int32 RemoteIndex = 0; static FGuid MessageType; }; /** * Message sent from CookDirector to a CookWorker to cancel some of its assigned packages and return them * for dispatch to idle workers. 
*/ struct FRetractionRequestMessage : public IMPCollectorMessage { virtual void Write(FCbWriter& Writer) const override; virtual bool TryRead(FCbObjectView Object) override; virtual FGuid GetMessageType() const override { return MessageType; } virtual const TCHAR* GetDebugName() const override { return TEXT("RetractionRequestMessage"); } public: int32 RequestedCount = 0; static FGuid MessageType; }; /** * Message sent from CookWorker to CookDirector identifying which assigned packages it chose to satisfy * a RetractionRequest. */ struct FRetractionResultsMessage : public IMPCollectorMessage { virtual void Write(FCbWriter& Writer) const override; virtual bool TryRead(FCbObjectView Object) override; virtual FGuid GetMessageType() const override { return MessageType; } virtual const TCHAR* GetDebugName() const override { return TEXT("RetractionResultsMessage"); } public: TArray ReturnedPackages; static FGuid MessageType; }; /** Director status Events that can be broadcast from CookDirector to CookWorkers. */ enum class EDirectorEvent : uint8 { KickBuildDependencies, Count, }; /** Message sent from CookDirector to CookWorkers to notify them of an EDirectorEvent. */ struct FDirectorEventMessage : public IMPCollectorMessage { explicit FDirectorEventMessage(EDirectorEvent InEvent = EDirectorEvent::Count) : Event(InEvent) { } virtual void Write(FCbWriter& Writer) const override; virtual bool TryRead(FCbObjectView Object) override; virtual FGuid GetMessageType() const override { return MessageType; } virtual const TCHAR* GetDebugName() const override { return TEXT("DirectorEventMessage"); } public: EDirectorEvent Event; static FGuid MessageType; }; } // namespace UE::Cook