// Copyright Epic Games, Inc. All Rights Reserved.

/*=============================================================================
	GPUProfiler.cpp: Hierarchical GPU Profiler Implementation.
=============================================================================*/

#include "GPUProfiler.h"
#include "Async/TaskGraphInterfaces.h"
#include "Misc/WildcardString.h"
#include "Misc/CommandLine.h"
#include "RHI.h"
#include "GpuProfilerTrace.h"
#include "Containers/AnsiString.h"
#include "Stats/StatsData.h"

#if !UE_BUILD_SHIPPING
#include "VisualizerEvents.h"
#include "ProfileVisualizerModule.h"
#include "Modules/ModuleManager.h"
#endif

#define LOCTEXT_NAMESPACE "GpuProfiler"

enum class EGPUProfileSortMode
{
	Chronological,
	TimeElapsed,
	NumPrims,
	NumVerts,
	Max
};

static TAutoConsoleVariable<int32> GCVarProfileGPU_Sort(
	TEXT("r.ProfileGPU.Sort"),
	0,
	TEXT("Sorts the TTY Dump independently at each level of the tree in various modes.\n")
	TEXT("0 : Chronological\n")
	TEXT("1 : By time elapsed\n")
	TEXT("2 : By number of prims\n")
	TEXT("3 : By number of verts\n"),
	ECVF_Default);

static TAutoConsoleVariable<FString> GCVarProfileGPU_Root(
	TEXT("r.ProfileGPU.Root"),
	TEXT("*"),
	TEXT("Allows filtering the tree when using ProfileGPU; the pattern match is case sensitive."),
	ECVF_Default);

static TAutoConsoleVariable<float> GCVarProfileGPU_ThresholdPercent(
	TEXT("r.ProfileGPU.ThresholdPercent"),
	0.0f,
	TEXT("Percent of the total execution duration the event needs to be larger than to be printed."),
	ECVF_Default);

static TAutoConsoleVariable<bool> GCVarProfileGPU_UnicodeOutput(
	TEXT("r.ProfileGPU.UnicodeOutput"),
	true,
	TEXT("When enabled, the output results will be formatted in a unicode table."),
	ECVF_Default);

static TAutoConsoleVariable<bool> GCVarProfileGPU_ShowLeafEvents(
	TEXT("r.ProfileGPU.ShowLeafEvents"),
	true,
	TEXT("Allows profileGPU to display event-only leaf nodes with no draws associated."),
	ECVF_Default);

static TAutoConsoleVariable<int32> CVarGPUCsvStatsEnabled(
	TEXT("r.GPUCsvStatsEnabled"),
	0,
	TEXT("Enables or disables GPU stat recording to CSVs"));

#if (RHI_NEW_GPU_PROFILER == 0)

static TAutoConsoleVariable<FString> GProfileGPUPatternCVar(
	TEXT("r.ProfileGPU.Pattern"),
	TEXT("*"),
	TEXT("Allows filtering the entries when using ProfileGPU; the pattern match is case sensitive.\n")
	TEXT("'*' can be used at the end to get all entries starting with the string.\n")
	TEXT(" '*' without any leading characters disables the pattern matching and uses a time threshold instead (default).\n")
	TEXT("'?' matches any single character.\n")
	TEXT("e.g. AmbientOcclusionSetup, AmbientOcclusion*, Ambient???lusion*, *"),
	ECVF_Default);

static TAutoConsoleVariable<int32> GProfileShowEventHistogram(
	TEXT("r.ProfileGPU.ShowEventHistogram"),
	0,
	TEXT("Whether the event histogram should be shown."),
	ECVF_Default);

TAutoConsoleVariable<int32> GProfileGPUTransitions(
	TEXT("r.ProfileGPU.ShowTransitions"),
	0,
	TEXT("Allows profileGPU to display resource transition events."),
	ECVF_Default);

// Should we print a summary at the end?
static TAutoConsoleVariable<int32> GProfilePrintAssetSummary(
	TEXT("r.ProfileGPU.PrintAssetSummary"),
	0,
	TEXT("Should we print a summary split by asset (r.ShowMaterialDrawEvents is strongly recommended as well).\n"),
	ECVF_Default);
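
// Example (illustrative only): a typical asset-summary capture using the cvars above might
// be driven from the console as follows, assuming the standard ProfileGPU console command:
//
//   r.ShowMaterialDrawEvents 1
//   r.ProfileGPU.PrintAssetSummary 1
//   r.ProfileGPU.AssetSummaryCallOuts "LOD,HeroName"
//   ProfileGPU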
// Should we print a summary at the end?
static TAutoConsoleVariable<FString> GProfileAssetSummaryCallOuts(
	TEXT("r.ProfileGPU.AssetSummaryCallOuts"),
	TEXT(""),
	TEXT("Comma separated list of substrings that deserve special mention in the final summary (e.g., \"LOD,HeroName\"\n")
	TEXT("r.ProfileGPU.PrintAssetSummary must be true to enable this feature"),
	ECVF_Default);

static TAutoConsoleVariable<int32> GSaveScreenshotAfterProfilingGPUCVar(
	TEXT("r.ProfileGPU.Screenshot"),
	1,
	TEXT("Whether a screenshot should be taken when profiling the GPU. 0:off, 1:on (default)"),
	ECVF_RenderThreadSafe);

static TAutoConsoleVariable<int32> GShowProfilerAfterProfilingGPUCVar(
	TEXT("r.ProfileGPU.ShowUI"),
	1,
	TEXT("Whether the user interface profiler should be displayed after profiling the GPU.\n")
	TEXT("The results will always go to the log/console\n")
	TEXT("0:off, 1:on (default)"),
	ECVF_RenderThreadSafe);

static TAutoConsoleVariable<float> GGPUHitchThresholdCVar(
	TEXT("RHI.GPUHitchThreshold"),
	100.0f,
	TEXT("Threshold for detecting hitches on the GPU (in milliseconds)."));

static TAutoConsoleVariable<int32> CVarGPUCrashDataCollectionEnable(
	TEXT("r.gpucrash.collectionenable"),
	1,
	TEXT("Stores GPU crash data from scoped events when an applicable crash debugging system is available."),
	ECVF_RenderThreadSafe);

static TAutoConsoleVariable<int32> CVarGPUCrashDataDepth(
	TEXT("r.gpucrash.datadepth"),
	-1,
	TEXT("Limits the amount of marker scope depth we record for GPU crash debugging to the given scope depth."),
	ECVF_RenderThreadSafe);

namespace RHIConfig
{
	bool ShouldSaveScreenshotAfterProfilingGPU()
	{
		return GSaveScreenshotAfterProfilingGPUCVar.GetValueOnAnyThread() != 0;
	}

	bool ShouldShowProfilerAfterProfilingGPU()
	{
		return GShowProfilerAfterProfilingGPUCVar.GetValueOnAnyThread() != 0;
	}

	float GetGPUHitchThreshold()
	{
		return GGPUHitchThresholdCVar.GetValueOnAnyThread() * 0.001f;
	}
}

/** Recursively generates a histogram of nodes and stores their timing in TimingResult.
*/ static void GatherStatsEventNode(FGPUProfilerEventNode* Node, int32 Depth, TMap& EventHistogram) { if (Node->NumDraws > 0 || Node->NumDispatches > 0 || Node->Children.Num() > 0) { Node->TimingResult = Node->GetTiming() * 1000.0f; Node->NumTotalDraws = Node->NumDraws; Node->NumTotalDispatches = Node->NumDispatches; Node->NumTotalPrimitives = Node->NumPrimitives; Node->NumTotalVertices = Node->NumVertices; FGPUProfilerEventNode* Parent = Node->Parent; while (Parent) { Parent->NumTotalDraws += Node->NumDraws; Parent->NumTotalDispatches += Node->NumDispatches; Parent->NumTotalPrimitives += Node->NumPrimitives; Parent->NumTotalVertices += Node->NumVertices; Parent = Parent->Parent; } for (int32 ChildIndex = 0; ChildIndex < Node->Children.Num(); ChildIndex++) { // Traverse children GatherStatsEventNode(Node->Children[ChildIndex], Depth + 1, EventHistogram); } FGPUProfilerEventNodeStats* FoundHistogramBucket = EventHistogram.Find(Node->Name); if (FoundHistogramBucket) { FoundHistogramBucket->NumDraws += Node->NumTotalDraws; FoundHistogramBucket->NumPrimitives += Node->NumTotalPrimitives; FoundHistogramBucket->NumVertices += Node->NumTotalVertices; FoundHistogramBucket->TimingResult += Node->TimingResult; FoundHistogramBucket->NumEvents++; } else { FGPUProfilerEventNodeStats NewNodeStats; NewNodeStats.NumDraws = Node->NumTotalDraws; NewNodeStats.NumPrimitives = Node->NumTotalPrimitives; NewNodeStats.NumVertices = Node->NumTotalVertices; NewNodeStats.TimingResult = Node->TimingResult; NewNodeStats.NumEvents = 1; EventHistogram.Add(Node->Name, NewNodeStats); } } } struct FGPUProfileInfoPair { int64 Triangles; int32 DrawCalls; FGPUProfileInfoPair() : Triangles(0) , DrawCalls(0) { } void AddDraw(int64 InTriangleCount) { Triangles += InTriangleCount; ++DrawCalls; } }; struct FGPUProfileStatSummary { TMap TrianglesPerMaterial; TMap TrianglesPerMesh; TMap TrianglesPerNonMesh; int32 TotalNumNodes; int32 TotalNumDraws; bool bGatherSummaryStats; bool bDumpEventLeafNodes; FGPUProfileStatSummary() : TotalNumNodes(0) , TotalNumDraws(0) , bGatherSummaryStats(false) , bDumpEventLeafNodes(false) { bDumpEventLeafNodes = GCVarProfileGPU_ShowLeafEvents.GetValueOnRenderThread() != 0; bGatherSummaryStats = GProfilePrintAssetSummary.GetValueOnRenderThread() != 0; } void ProcessMatch(FGPUProfilerEventNode* Node) { if (bGatherSummaryStats && (Node->NumTotalPrimitives > 0) && (Node->NumTotalVertices > 0) && (Node->Children.Num() == 0)) { FString MaterialPart; FString AssetPart; if (Node->Name.Split(TEXT(" "), &MaterialPart, &AssetPart, ESearchCase::CaseSensitive)) { TrianglesPerMaterial.FindOrAdd(MaterialPart).AddDraw(Node->NumTotalPrimitives); TrianglesPerMesh.FindOrAdd(AssetPart).AddDraw(Node->NumTotalPrimitives); } else { TrianglesPerNonMesh.FindOrAdd(Node->Name).AddDraw(Node->NumTotalPrimitives); } } } void PrintSummary() { UE_LOG(LogRHI, Log, TEXT("Total Nodes %u Draws %u"), TotalNumNodes, TotalNumDraws); UE_LOG(LogRHI, Log, TEXT("")); UE_LOG(LogRHI, Log, TEXT("")); if (bGatherSummaryStats) { // Sort the lists and print them out TrianglesPerMesh.ValueSort([](const FGPUProfileInfoPair& A, const FGPUProfileInfoPair& B){ return A.Triangles > B.Triangles; }); UE_LOG(LogRHI, Log, TEXT("")); UE_LOG(LogRHI, Log, TEXT("MeshList,TriangleCount,DrawCallCount")); for (auto& Pair : TrianglesPerMesh) { UE_LOG(LogRHI, Log, TEXT("%s,%" INT64_FMT ",%d"), *Pair.Key, Pair.Value.Triangles, Pair.Value.DrawCalls); } TrianglesPerMaterial.ValueSort([](const FGPUProfileInfoPair& A, const FGPUProfileInfoPair& B){ return A.Triangles > 
B.Triangles; }); UE_LOG(LogRHI, Log, TEXT("")); UE_LOG(LogRHI, Log, TEXT("MaterialList,TriangleCount,DrawCallCount")); for (auto& Pair : TrianglesPerMaterial) { UE_LOG(LogRHI, Log, TEXT("%s,%" INT64_FMT ",%d"), *Pair.Key, Pair.Value.Triangles, Pair.Value.DrawCalls); } TrianglesPerNonMesh.ValueSort([](const FGPUProfileInfoPair& A, const FGPUProfileInfoPair& B){ return A.Triangles > B.Triangles; }); UE_LOG(LogRHI, Log, TEXT("")); UE_LOG(LogRHI, Log, TEXT("MiscList,TriangleCount,DrawCallCount")); for (auto& Pair : TrianglesPerNonMesh) { UE_LOG(LogRHI, Log, TEXT("%s,%" INT64_FMT ",%d"), *Pair.Key, Pair.Value.Triangles, Pair.Value.DrawCalls); } // See if we want to call out any particularly interesting matches TArray InterestingSubstrings; GProfileAssetSummaryCallOuts.GetValueOnRenderThread().ParseIntoArray(InterestingSubstrings, TEXT(","), true); if (InterestingSubstrings.Num() > 0) { UE_LOG(LogRHI, Log, TEXT("")); UE_LOG(LogRHI, Log, TEXT("Information about specified mesh substring matches (r.ProfileGPU.AssetSummaryCallOuts)")); for (const FString& InterestingSubstring : InterestingSubstrings) { int32 InterestingNumDraws = 0; int64 InterestingNumTriangles = 0; for (auto& Pair : TrianglesPerMesh) { if (Pair.Key.Contains(InterestingSubstring)) { InterestingNumDraws += Pair.Value.DrawCalls; InterestingNumTriangles += Pair.Value.Triangles; } } UE_LOG(LogRHI, Log, TEXT("Matching '%s': %d draw calls, with %" INT64_FMT " tris (%.2f M)"), *InterestingSubstring, InterestingNumDraws, InterestingNumTriangles, InterestingNumTriangles * 1e-6); } UE_LOG(LogRHI, Log, TEXT("")); } } } }; /** Recursively dumps stats for each node with a depth first traversal. */ static void DumpStatsEventNode(FGPUProfilerEventNode* Node, float RootResult, int32 Depth, const FWildcardString& WildcardFilter, bool bParentMatchedFilter, float& ReportedTiming, FGPUProfileStatSummary& Summary) { Summary.TotalNumNodes++; ReportedTiming = 0; if (Node->NumDraws > 0 || Node->NumDispatches > 0 || Node->Children.Num() > 0 || Summary.bDumpEventLeafNodes) { Summary.TotalNumDraws += Node->NumDraws; // Percent that this node was of the total frame time const float Percent = Node->TimingResult * 100.0f / (RootResult * 1000.0f); const float PercentThreshold = GCVarProfileGPU_ThresholdPercent.GetValueOnRenderThread(); const int32 EffectiveDepth = FMath::Max(Depth - 1, 0); const bool bDisplayEvent = (bParentMatchedFilter || WildcardFilter.IsMatch(Node->Name)) && (Percent > PercentThreshold || Summary.bDumpEventLeafNodes); if (bDisplayEvent) { FString NodeStats = TEXT(""); if (Node->NumTotalDraws > 0) { NodeStats = FString::Printf(TEXT("%u %s %u prims %u verts "), Node->NumTotalDraws, Node->NumTotalDraws == 1 ? TEXT("draw") : TEXT("draws"), Node->NumTotalPrimitives, Node->NumTotalVertices); } if (Node->NumTotalDispatches > 0) { NodeStats += FString::Printf(TEXT("%u %s"), Node->NumTotalDispatches, Node->NumTotalDispatches == 1 ? 
TEXT("dispatch") : TEXT("dispatches")); // Cumulative group stats are not meaningful, only include dispatch stats if there was one in the current node if (Node->GroupCount.X > 0 && Node->NumDispatches == 1) { NodeStats += FString::Printf(TEXT(" %u"), Node->GroupCount.X); if (Node->GroupCount.Y > 1) { NodeStats += FString::Printf(TEXT("x%u"), Node->GroupCount.Y); } if (Node->GroupCount.Z > 1) { NodeStats += FString::Printf(TEXT("x%u"), Node->GroupCount.Z); } NodeStats += TEXT(" groups"); } } // Print information about this node, padded to its depth in the tree UE_LOG(LogRHI, Log, TEXT("%s%4.1f%%%5.2fms %s %s"), *FString(TEXT("")).LeftPad(EffectiveDepth * 3), Percent, Node->TimingResult, *Node->Name, *NodeStats ); ReportedTiming = Node->TimingResult; Summary.ProcessMatch(Node); } struct FCompareGPUProfileNode { EGPUProfileSortMode SortMode; FCompareGPUProfileNode(EGPUProfileSortMode InSortMode) : SortMode(InSortMode) {} FORCEINLINE bool operator()(const FGPUProfilerEventNode* A, const FGPUProfilerEventNode* B) const { switch (SortMode) { case EGPUProfileSortMode::NumPrims: return B->NumTotalPrimitives < A->NumTotalPrimitives; case EGPUProfileSortMode::NumVerts: return B->NumTotalVertices < A->NumTotalVertices; case EGPUProfileSortMode::TimeElapsed: default: return B->TimingResult < A->TimingResult; } } }; EGPUProfileSortMode SortMode = (EGPUProfileSortMode)FMath::Clamp(GCVarProfileGPU_Sort.GetValueOnRenderThread(), 0, ((int32)EGPUProfileSortMode::Max - 1)); if (SortMode != EGPUProfileSortMode::Chronological) { Node->Children.Sort(FCompareGPUProfileNode(SortMode)); } float TotalChildTime = 0; uint32 TotalChildDraws = 0; for (int32 ChildIndex = 0; ChildIndex < Node->Children.Num(); ChildIndex++) { FGPUProfilerEventNode* ChildNode = Node->Children[ChildIndex]; // Traverse children const int32 PrevNumDraws = Summary.TotalNumDraws; float ChildReportedTiming = 0; DumpStatsEventNode(Node->Children[ChildIndex], RootResult, Depth + 1, WildcardFilter, bDisplayEvent, ChildReportedTiming, Summary); const int32 NumChildDraws = Summary.TotalNumDraws - PrevNumDraws; TotalChildTime += ChildReportedTiming; TotalChildDraws += NumChildDraws; } const float UnaccountedTime = FMath::Max(Node->TimingResult - TotalChildTime, 0.0f); const float UnaccountedPercent = UnaccountedTime * 100.0f / (RootResult * 1000.0f); // Add an 'Other Children' node if necessary to show time spent in the current node that is not in any of its children if (bDisplayEvent && Node->Children.Num() > 0 && TotalChildDraws > 0 && (UnaccountedPercent > 2.0f || UnaccountedTime > .2f)) { UE_LOG(LogRHI, Log, TEXT("%s%4.1f%%%5.2fms Other Children"), *FString(TEXT("")).LeftPad((EffectiveDepth + 1) * 3), UnaccountedPercent, UnaccountedTime); } } } #if !UE_BUILD_SHIPPING /** * Converts GPU profile data to Visualizer data * * @param InProfileData GPU profile data * @param OutVisualizerData Visualizer data */ static TSharedPtr< FVisualizerEvent > CreateVisualizerDataRecursively( const TRefCountPtr< class FGPUProfilerEventNode >& InNode, TSharedPtr< FVisualizerEvent > InParentEvent, const double InStartTimeMs, const double InTotalTimeMs ) { TSharedPtr< FVisualizerEvent > VisualizerEvent( new FVisualizerEvent( InStartTimeMs / InTotalTimeMs, InNode->TimingResult / InTotalTimeMs, InNode->TimingResult, 0, InNode->Name ) ); VisualizerEvent->ParentEvent = InParentEvent; double ChildStartTimeMs = InStartTimeMs; for( int32 ChildIndex = 0; ChildIndex < InNode->Children.Num(); ChildIndex++ ) { TRefCountPtr< FGPUProfilerEventNode > ChildNode = InNode->Children[ 
ChildIndex ]; TSharedPtr< FVisualizerEvent > ChildEvent = CreateVisualizerDataRecursively( ChildNode, VisualizerEvent, ChildStartTimeMs, InTotalTimeMs ); VisualizerEvent->Children.Add( ChildEvent ); ChildStartTimeMs += ChildNode->TimingResult; } return VisualizerEvent; } /** * Converts GPU profile data to Visualizer data * * @param InProfileData GPU profile data * @param OutVisualizerData Visualizer data */ static TSharedPtr< FVisualizerEvent > CreateVisualizerData( const TArray >& InProfileData ) { // Calculate total time first double TotalTimeMs = 0.0; for( int32 Index = 0; Index < InProfileData.Num(); ++Index ) { TotalTimeMs += InProfileData[ Index ]->TimingResult; } // Assumption: InProfileData contains only one (root) element. Otherwise an extra FVisualizerEvent root event is required. TSharedPtr< FVisualizerEvent > DummyRoot; // Recursively create visualizer event data. TSharedPtr< FVisualizerEvent > StatEvents( CreateVisualizerDataRecursively( InProfileData[0], DummyRoot, 0.0, TotalTimeMs ) ); return StatEvents; } #endif void FGPUProfilerEventNodeFrame::DumpEventTree() { if (EventTree.Num() > 0) { float RootResult = GetRootTimingResults(); FString ConfigString; if (GCVarProfileGPU_Root.GetValueOnRenderThread() != TEXT("*")) { ConfigString += FString::Printf(TEXT("Root filter: %s "), *GCVarProfileGPU_Root.GetValueOnRenderThread()); } if (GCVarProfileGPU_ThresholdPercent.GetValueOnRenderThread() > 0.0f) { ConfigString += FString::Printf(TEXT("Threshold: %.2f%% "), GCVarProfileGPU_ThresholdPercent.GetValueOnRenderThread()); } if (ConfigString.Len() > 0) { ConfigString = FString(TEXT(", ")) + ConfigString; } UE_LOG(LogRHI, Log, TEXT("Perf marker hierarchy, total GPU time %.2fms%s"), RootResult * 1000.0f, *ConfigString); UE_LOG(LogRHI, Log, TEXT("")); // Display a warning if this is a GPU profile and the GPU was profiled with v-sync enabled FText VsyncEnabledWarningText = FText::GetEmpty(); static IConsoleVariable* CVSyncVar = IConsoleManager::Get().FindConsoleVariable(TEXT("r.VSync")); if (CVSyncVar->GetInt() != 0 && !PlatformDisablesVSync()) { VsyncEnabledWarningText = LOCTEXT("GpuProfileVsyncEnabledWarning", "WARNING: This GPU profile was captured with v-sync enabled. V-sync wait time may show up in any bucket, and as a result the data in this profile may be skewed. Please profile with v-sync disabled to obtain the most accurate data."); UE_LOG(LogRHI, Log, TEXT("%s"), *(VsyncEnabledWarningText.ToString())); } LogDisjointQuery(); TMap EventHistogram; for (int32 BaseNodeIndex = 0; BaseNodeIndex < EventTree.Num(); BaseNodeIndex++) { GatherStatsEventNode(EventTree[BaseNodeIndex], 0, EventHistogram); } FString RootWildcardString = GCVarProfileGPU_Root.GetValueOnRenderThread(); FWildcardString RootWildcard(RootWildcardString); FGPUProfileStatSummary Summary; for (int32 BaseNodeIndex = 0; BaseNodeIndex < EventTree.Num(); BaseNodeIndex++) { float Unused = 0; DumpStatsEventNode(EventTree[BaseNodeIndex], RootResult, 0, RootWildcard, false, Unused, /*inout*/ Summary); } Summary.PrintSummary(); const bool bShowHistogram = GProfileShowEventHistogram.GetValueOnRenderThread() != 0; if (RootWildcardString == TEXT("*") && bShowHistogram) { struct FNodeStatsCompare { /** Sorts nodes by descending durations. 
*/ FORCEINLINE bool operator()(const FGPUProfilerEventNodeStats& A, const FGPUProfilerEventNodeStats& B) const { return B.TimingResult < A.TimingResult; } }; // Sort descending based on node duration EventHistogram.ValueSort( FNodeStatsCompare() ); // Log stats about the node histogram UE_LOG(LogRHI, Log, TEXT("Node histogram %u buckets"), EventHistogram.Num()); // bad: reading on render thread but we don't support ECVF_RenderThreadSafe on strings yet // It's very unlikely to cause a problem as the cvar is only changes by the user. FString WildcardString = GProfileGPUPatternCVar.GetValueOnRenderThread(); FGPUProfilerEventNodeStats Sum; const float ThresholdInMS = 5.0f; if(WildcardString == FString(TEXT("*"))) { // disable Wildcard functionality WildcardString.Empty(); } if(WildcardString.IsEmpty()) { UE_LOG(LogRHI, Log, TEXT(" r.ProfileGPU.Pattern = '*' (using threshold of %g ms)"), ThresholdInMS); } else { UE_LOG(LogRHI, Log, TEXT(" r.ProfileGPU.Pattern = '%s' (not using time threshold)"), *WildcardString); } FWildcardString Wildcard(WildcardString); int32 NumNotShown = 0; for (TMap::TIterator It(EventHistogram); It; ++It) { const FGPUProfilerEventNodeStats& NodeStats = It.Value(); bool bDump = NodeStats.TimingResult > RootResult * ThresholdInMS; if(!Wildcard.IsEmpty()) { // if a Wildcard string was specified, we want to always dump all entries bDump = Wildcard.IsMatch(*It.Key()); } if (bDump) { UE_LOG(LogRHI, Log, TEXT(" %.2fms %s Events %u Draws %u"), NodeStats.TimingResult, *It.Key(), NodeStats.NumEvents, NodeStats.NumDraws); Sum += NodeStats; } else { NumNotShown++; } } UE_LOG(LogRHI, Log, TEXT(" Total %.2fms Events %u Draws %u, %u buckets not shown"), Sum.TimingResult, Sum.NumEvents, Sum.NumDraws, NumNotShown); } #if !UE_BUILD_SHIPPING // Create and display profile visualizer data if (RHIConfig::ShouldShowProfilerAfterProfilingGPU()) { // execute on main thread { struct FDisplayProfilerVisualizer { void Thread( TSharedPtr InVisualizerData, const FText InVsyncEnabledWarningText ) { static FName ProfileVisualizerModule(TEXT("ProfileVisualizer")); if (FModuleManager::Get().IsModuleLoaded(ProfileVisualizerModule)) { IProfileVisualizerModule& ProfileVisualizer = FModuleManager::GetModuleChecked(ProfileVisualizerModule); // Display a warning if this is a GPU profile and the GPU was profiled with v-sync enabled (otherwise InVsyncEnabledWarningText is empty) ProfileVisualizer.DisplayProfileVisualizer( InVisualizerData, TEXT("GPU"), InVsyncEnabledWarningText, FLinearColor::Red ); } } } DisplayProfilerVisualizer; TSharedPtr VisualizerData = CreateVisualizerData( EventTree ); DECLARE_CYCLE_STAT(TEXT("FSimpleDelegateGraphTask.DisplayProfilerVisualizer"), STAT_FSimpleDelegateGraphTask_DisplayProfilerVisualizer, STATGROUP_TaskGraphTasks); FSimpleDelegateGraphTask::CreateAndDispatchWhenReady( FSimpleDelegateGraphTask::FDelegate::CreateRaw(&DisplayProfilerVisualizer, &FDisplayProfilerVisualizer::Thread, VisualizerData, VsyncEnabledWarningText), GET_STATID(STAT_FSimpleDelegateGraphTask_DisplayProfilerVisualizer), nullptr, ENamedThreads::GameThread ); } } #endif } } void FGPUProfiler::PushEvent(const TCHAR* Name, FColor Color) { if (bTrackingEvents) { check(StackDepth >= 0); StackDepth++; check(IsInRenderingThread() || IsInRHIThread()); if (CurrentEventNode) { // Add to the current node's children CurrentEventNode->Children.Add(CreateEventNode(Name, CurrentEventNode)); CurrentEventNode = CurrentEventNode->Children.Last(); } else { // Add a new root node to the tree 
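// (There is no open event at this point, so the new node has no parent; it is appended to
// CurrentEventNodeFrame->EventTree, and DumpEventTree() later walks each of these roots.)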
CurrentEventNodeFrame->EventTree.Add(CreateEventNode(Name, NULL)); CurrentEventNode = CurrentEventNodeFrame->EventTree.Last(); } check(CurrentEventNode); // Start timing the current node CurrentEventNode->StartTiming(); } } void FGPUProfiler::PopEvent() { if (bTrackingEvents) { check(StackDepth >= 1); StackDepth--; check(CurrentEventNode && (IsInRenderingThread() || IsInRHIThread())); // Stop timing the current node and move one level up the tree CurrentEventNode->StopTiming(); CurrentEventNode = CurrentEventNode->Parent; } } /** Whether GPU timing measurements are supported by the driver. */ bool FGPUTiming::GIsSupported = false; /** Frequency for the timing values, in number of ticks per seconds, or 0 if the feature isn't supported. */ TStaticArray FGPUTiming::GTimingFrequency(InPlace, 0); /** * Two timestamps performed on GPU and CPU at nearly the same time. * This can be used to visualize GPU and CPU timing events on the same timeline. */ TStaticArray FGPUTiming::GCalibrationTimestamp; /** Whether the static variables have been initialized. */ bool FGPUTiming::GAreGlobalsInitialized = false; #else namespace UE::RHI::GPUProfiler { RHI_API FRHIOnProfileGPU OnProfileGPU; TLockFreePointerListUnordered FEventStream::FChunk::MemoryPool; static TArray& GetSinks() { static TArray Sinks; return Sinks; } FEventSink::FEventSink() { GetSinks().Add(this); } FEventSink::~FEventSink() { GetSinks().RemoveSingle(this); } void ProcessEvents(TArrayView EventStreams) { TRACE_CPUPROFILER_EVENT_SCOPE(UE::RHI::GPUProfiler::ProcessEvents); TArray> SharedStreams; SharedStreams.Reserve(EventStreams.Num()); for (FEventStream& Stream : EventStreams) { if (!Stream.IsEmpty()) { SharedStreams.Emplace(MakeShared(MoveTemp(Stream))); } } if (SharedStreams.Num()) { for (FEventSink* Sink : GetSinks()) { Sink->ProcessStreams(SharedStreams); } } } void InitializeQueues(TConstArrayView Queues) { for (FEventSink* Sink : GetSinks()) { Sink->InitializeQueues(Queues); } } #if WITH_PROFILEGPU template struct TUnicodeHorizontalBar { TCHAR Text[Width + 1]; // 0 <= Value <= 1 TUnicodeHorizontalBar(double Value) { TCHAR* Output = Text; int32 Solid, Partial, Blank; { double Integer; double Remainder = FMath::Modf(FMath::Clamp(Value, 0.0, 1.0) * Width, &Integer); Solid = (int32)Integer; Partial = (int32)FMath::Floor(Remainder * 8); Blank = (Width - Solid - (Partial > 0 ? 
1 : 0)); } // Solid characters for (int32 Index = 0; Index < Solid; ++Index) { *Output++ = TEXT('█'); } // Partially filled character if (Partial > 0) { static constexpr TCHAR const Data[] = TEXT("▏▎▍▌▋▊▉"); *Output++ = Data[Partial - 1]; } // Blank Characters to pad out the width for (int32 Index = 0; Index < Blank; ++Index) { *Output++ = TEXT(' '); } *Output++ = 0; check(uintptr_t(Output) == (uintptr_t(Text) + sizeof(Text))); } }; struct FNode { FString Name; FNode* Parent = nullptr; FNode* Next = nullptr; TArray Children; struct FStats { uint32 NumDraws = 0; uint32 NumDispatches = 0; uint32 NumPrimitives = 0; uint32 NumVertices = 0; uint64 BusyCycles = 0; uint64 IdleCycles = 0; uint64 WaitCycles = 0; double GetBusyMilliseconds() const { return FPlatformTime::ToMilliseconds64(BusyCycles); } bool HasWork() const { return NumDraws > 0 || NumDispatches > 0; } FStats& operator += (FStats const& Stats) { NumDraws += Stats.NumDraws; NumDispatches += Stats.NumDispatches; NumPrimitives += Stats.NumPrimitives; NumVertices += Stats.NumVertices; BusyCycles += Stats.BusyCycles; IdleCycles += Stats.IdleCycles; WaitCycles += Stats.WaitCycles; return *this; } FStats& operator += (FEvent::FStats const& Stats) { NumDraws += Stats.NumDraws; NumDispatches += Stats.NumDispatches; NumPrimitives += Stats.NumPrimitives; NumVertices += Stats.NumVertices; return *this; } void Accumulate(uint64 Busy, uint64 Wait, uint64 Idle) { BusyCycles += Busy; IdleCycles += Idle; WaitCycles += Wait; } }; // Exclusive stats for this node FStats Exclusive; // Sum of stats including all children FStats Inclusive; FNode(FString&& Name) : Name(MoveTemp(Name)) {} }; struct FTable { bool const bUnicodeOutput; FTable() : bUnicodeOutput(GCVarProfileGPU_UnicodeOutput.GetValueOnAnyThread()) {} enum class EColumn : uint32 { Exclusive_NumDraws, Exclusive_NumDispatches, Exclusive_NumPrimitives, Exclusive_NumVertices, Exclusive_Percent, Exclusive_Time, Inclusive_NumDraws, Inclusive_NumDispatches, Inclusive_NumPrimitives, Inclusive_NumVertices, Inclusive_Percent, Inclusive_Time, Events, Num }; uint32 GetColumnMinimumWidth(EColumn Column) const { switch (Column) { case EColumn::Events: return 6; } return 0; } TCHAR const* GetColumnHeader(EColumn Column) const { switch (Column) { case EColumn::Exclusive_NumDraws: case EColumn::Inclusive_NumDraws: return TEXT("Draws"); case EColumn::Exclusive_NumDispatches: case EColumn::Inclusive_NumDispatches: return TEXT("Dsptch"); case EColumn::Exclusive_NumPrimitives: case EColumn::Inclusive_NumPrimitives: return TEXT("Prim"); case EColumn::Exclusive_NumVertices: case EColumn::Inclusive_NumVertices: return TEXT("Vert"); case EColumn::Exclusive_Percent: case EColumn::Inclusive_Percent: return TEXT("Percent"); case EColumn::Exclusive_Time: case EColumn::Inclusive_Time: return TEXT("Time"); } return TEXT(""); } uint32 GetColumnGroup(EColumn Column) const { switch (Column) { case EColumn::Exclusive_NumDraws: case EColumn::Exclusive_NumDispatches: case EColumn::Exclusive_NumPrimitives: case EColumn::Exclusive_NumVertices: case EColumn::Exclusive_Percent: case EColumn::Exclusive_Time: return 0; case EColumn::Inclusive_NumDraws: case EColumn::Inclusive_NumDispatches: case EColumn::Inclusive_NumPrimitives: case EColumn::Inclusive_NumVertices: case EColumn::Inclusive_Percent: case EColumn::Inclusive_Time: return 1; default: case EColumn::Events: return 2; } } TCHAR const* GetGroupName(uint32 GroupIndex) const { switch (GroupIndex) { case 0: return TEXT("Exclusive"); case 1: return TEXT("Inclusive"); case 2: return 
TEXT("Events"); } return TEXT(""); } uint32 NumRows = 0; TStaticArray, uint32(EColumn::Num)> Columns { InPlace }; TArray RowBreaks; FString& Col(EColumn Column) { return Columns[uint32(Column)].Emplace_GetRef(); } bool HasRows() const { return NumRows > 0; } void AddRow(FNode* Root, FNode::FStats const& Inclusive, FNode::FStats const& Exclusive, FString const& Name, uint32 Level) { double ExclusivePercent = double(Exclusive.BusyCycles) / Root->Inclusive.BusyCycles; double InclusivePercent = double(Inclusive.BusyCycles) / Root->Inclusive.BusyCycles; static constexpr uint32 BarWidth = 8; TUnicodeHorizontalBar ExclusiveBar = ExclusivePercent; TUnicodeHorizontalBar InclusiveBar = InclusivePercent; static constexpr TCHAR const BarSeparator[] = TEXT(" ┊ "); Col(EColumn::Exclusive_NumDraws ) = FString::Printf(TEXT("%d"), Exclusive.NumDraws); Col(EColumn::Exclusive_NumDispatches) = FString::Printf(TEXT("%d"), Exclusive.NumDispatches); Col(EColumn::Exclusive_NumPrimitives) = FString::Printf(TEXT("%d"), Exclusive.NumPrimitives); Col(EColumn::Exclusive_NumVertices ) = FString::Printf(TEXT("%d"), Exclusive.NumVertices); Col(EColumn::Exclusive_Percent ) = FString::Printf(TEXT("%.1f%%%s%s"), ExclusivePercent * 100.0, bUnicodeOutput ? BarSeparator : TEXT(""), bUnicodeOutput ? ExclusiveBar.Text : TEXT("")); Col(EColumn::Exclusive_Time ) = FString::Printf(TEXT("%.3f ms"), FPlatformTime::ToMilliseconds64(Exclusive.BusyCycles)); Col(EColumn::Inclusive_NumDraws ) = FString::Printf(TEXT("%d"), Inclusive.NumDraws); Col(EColumn::Inclusive_NumDispatches) = FString::Printf(TEXT("%d"), Inclusive.NumDispatches); Col(EColumn::Inclusive_NumPrimitives) = FString::Printf(TEXT("%d"), Inclusive.NumPrimitives); Col(EColumn::Inclusive_NumVertices ) = FString::Printf(TEXT("%d"), Inclusive.NumVertices); Col(EColumn::Inclusive_Percent ) = FString::Printf(TEXT("%.1f%%%s%s"), InclusivePercent * 100.0, bUnicodeOutput ? BarSeparator : TEXT(""), bUnicodeOutput ? InclusiveBar.Text : TEXT("")); Col(EColumn::Inclusive_Time ) = FString::Printf(TEXT("%.3f ms"), FPlatformTime::ToMilliseconds64(Inclusive.BusyCycles)); static constexpr uint32 SpacesPerIndent = 3; Col(EColumn::Events) = FString::Printf(TEXT("%*s"), Name.Len() + (Level * SpacesPerIndent), *Name); // Insert a horizontal rule before each root level row. 
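// (RowBreaks runs parallel to the table rows; ToStringInner() emits a divider row before
// every subsequent row whose entry is true.)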
RowBreaks.Add(Level == 0); NumRows++; } struct FChars { TCHAR const* Left; TCHAR const* GroupSeparator; TCHAR const* LastGroupSeparator; TCHAR const* Right; TCHAR const* CellSeparator; }; struct FFormat { TCHAR const* LineMajor; TCHAR const* LineMinor; FChars const TopRow; FChars const GroupNameRow; FChars const GroupBorderRow; FChars const ValueRow; FChars const DividorRow; FChars const BottomRow; }; FString ToString() const { if (bUnicodeOutput) { static constexpr FFormat Unicode = { .LineMajor = TEXT("━"), .LineMinor = TEXT("─"), // Left GrpSep LastGrp Right CellSep .TopRow { TEXT(" ┏"), TEXT("┳"), TEXT("┳"), TEXT("┓"), TEXT(" ") }, .GroupNameRow { TEXT(" ┃"), TEXT("┃"), TEXT("┃"), TEXT("┃"), TEXT(" ") }, .GroupBorderRow{ TEXT(" ┠"), TEXT("╂"), TEXT("┨"), TEXT("┃"), TEXT("┬") }, .ValueRow { TEXT(" ┃"), TEXT("┃"), TEXT("┃"), TEXT("┃"), TEXT("│") }, .DividorRow { TEXT(" ┠"), TEXT("╂"), TEXT("╂"), TEXT("┨"), TEXT("┼") }, .BottomRow { TEXT(" ┗"), TEXT("┻"), TEXT("┻"), TEXT("┛"), TEXT("┷") }, }; return ToStringInner(Unicode); } else { static constexpr FFormat Ascii = { .LineMajor = TEXT("-"), .LineMinor = TEXT("-"), // Left GrpSep LastGrp Right CellSep .TopRow { TEXT(" +"), TEXT("+"), TEXT("+"), TEXT("+"), TEXT(" ") }, .GroupNameRow { TEXT(" |"), TEXT("|"), TEXT("|"), TEXT("|"), TEXT(" ") }, .GroupBorderRow{ TEXT(" +"), TEXT("+"), TEXT("+"), TEXT("|"), TEXT("+") }, .ValueRow { TEXT(" |"), TEXT("|"), TEXT("|"), TEXT("|"), TEXT("|") }, .DividorRow { TEXT(" +"), TEXT("+"), TEXT("+"), TEXT("+"), TEXT("+") }, .BottomRow { TEXT(" +"), TEXT("+"), TEXT("+"), TEXT("+"), TEXT("+") }, }; return ToStringInner(Ascii); } } FString ToStringInner(FFormat const& Format) const { struct FGroup { uint32 Index, Width; }; struct FColumn { uint32 Index, Width; }; static constexpr uint32 NumGroups = 3; static constexpr uint32 CellPadding = 1; // Auto-size column widths to their contents TStaticArray ColumnWidths{ InPlace, 0 }; for (uint32 ColumnIndex = 0; ColumnIndex < uint32(EColumn::Num); ++ColumnIndex) { if (Columns[ColumnIndex].Num() == 0) continue; check(Columns[ColumnIndex].Num() == NumRows); int32& Width = ColumnWidths[ColumnIndex]; // Auto-size column width Width = GetColumnMinimumWidth(EColumn(ColumnIndex)); Width = FMath::Max(Width, FCString::Strlen(GetColumnHeader(EColumn(ColumnIndex)))); for (FString const& Cell : Columns[ColumnIndex]) { Width = FMath::Max(Width, Cell.Len()); } } FString Result; auto EmitGroupRow = [&](FChars const& Chars, TUniqueFunction GroupCallback) { uint32 const CellSeparatorLength = FCString::Strlen(Chars.CellSeparator); Result += Chars.Left; uint32 GroupWidth = 0; uint32 GroupIndex = 0; for (uint32 ColumnIndex = 0; ColumnIndex < uint32(EColumn::Num); ++ColumnIndex) { if (Columns[ColumnIndex].Num() == 0) continue; GroupWidth += ColumnWidths[ColumnIndex] + CellPadding * 2; GroupIndex = GetColumnGroup(EColumn(ColumnIndex)); if (GroupIndex != GetColumnGroup(EColumn(ColumnIndex + 1))) { // Group Change GroupCallback({ GroupIndex, GroupWidth }); // Add the group separator character Result += GroupIndex < NumGroups - 2 ? Chars.GroupSeparator : Chars.LastGroupSeparator; GroupWidth = 0; } else if (ColumnIndex < uint32(EColumn::Num) - 1) { // Same group. 
Count the (missing) cell division GroupWidth += CellSeparatorLength; } } // Emit final group GroupCallback({ GroupIndex, GroupWidth }); // Close the row Result += Chars.Right; Result += TEXT("\n"); }; auto EmitValueRow = [&](FChars const& Chars, TUniqueFunction CellCallback) { Result += Chars.Left; for (uint32 ColumnIndex = 0; ColumnIndex < uint32(EColumn::Num); ++ColumnIndex) { if (Columns[ColumnIndex].Num() == 0) continue; CellCallback({ ColumnIndex, ColumnWidths[ColumnIndex] + (CellPadding * 2) }); if (ColumnIndex < uint32(EColumn::Num) - 1) { uint32 GroupIndex = GetColumnGroup(EColumn(ColumnIndex)); if (GroupIndex != GetColumnGroup(EColumn(ColumnIndex + 1))) { // Group change, add the group separator Result += GroupIndex < NumGroups - 2 ? Chars.GroupSeparator : Chars.LastGroupSeparator; } else { // Same group, add the cell separator Result += Chars.CellSeparator; } } } // Close the row Result += Chars.Right; Result += TEXT("\n"); }; auto AlignCenter = [&](TCHAR const* Str, uint32 Width) { int32 PaddingLeft = FMath::Max(0, int32(Width) - FCString::Strlen(Str)); int32 PaddingRight = (PaddingLeft / 2) + (PaddingLeft & 1); PaddingLeft /= 2; Result += FString::Printf(TEXT("%*s%s%*s"), PaddingLeft, TEXT(""), Str, PaddingRight, TEXT("")); }; // Top Border EmitGroupRow(Format.TopRow, [&](FGroup Group) { while (Group.Width--) { Result += Format.LineMajor; } }); // Exclusive / Inclusive Group Row EmitGroupRow(Format.GroupNameRow, [&](FGroup Group) { TCHAR const* Str = Group.Index != GetColumnGroup(EColumn::Events) ? GetGroupName(Group.Index) : TEXT(""); AlignCenter(Str, Group.Width); }); // Events Group Row EmitValueRow(Format.GroupBorderRow, [&](FColumn Column) { if (Column.Index == uint32(EColumn::Events)) { AlignCenter(GetGroupName(GetColumnGroup(EColumn::Events)), Column.Width); } else { while (Column.Width--) { Result += Format.LineMinor; } } }); // Header Row EmitValueRow(Format.ValueRow, [&](FColumn Column) { AlignCenter(GetColumnHeader(EColumn(Column.Index)), Column.Width); }); // Header Border Row EmitValueRow(Format.DividorRow, [&](FColumn Column) { while (Column.Width--) { Result += Format.LineMinor; } }); // Value rows for (uint32 RowIndex = 0; RowIndex < NumRows; ++RowIndex) { if (RowIndex > 0 && RowBreaks[RowIndex]) { // Add a horizontal rule EmitValueRow(Format.DividorRow, [&](FColumn Column) { while (Column.Width--) { Result += Format.LineMinor; } }); } EmitValueRow(Format.ValueRow, [&](FColumn Column) { int32 Width = Column.Width - (CellPadding * 2); if (EColumn(Column.Index) == EColumn::Events) { Width = -Width; // Align left } FString const& Cell = Columns[Column.Index][RowIndex]; Result += FString::Printf(TEXT("%*s%*s%*s") , CellPadding, TEXT("") , Width, *Cell , CellPadding, TEXT("")); }); } // Bottom Border EmitValueRow(Format.BottomRow, [&](FColumn Column) { while (Column.Width--) { Result += Format.LineMajor; } }); return Result; } }; #endif #if HAS_GPU_STATS // Per queue GPU stats // Total busy time on the current queue. 
StatName == "Unaccounted" is used by the Csv profiler static FGPUStat GPUStat_Total(TEXT("Unaccounted"), TEXT("Queue Total")); #endif #if STATS TCHAR const* FGPUStat::GetTypeString(EType Type) { switch (Type) { default: checkNoEntry(); [[fallthrough]]; case EType::Busy: return TEXT("Busy"); case EType::Wait: return TEXT("Wait"); case EType::Idle: return TEXT("Idle"); } } FString FGPUStat::GetIDString(FQueue Queue, bool bFriendly) { if (bFriendly) { return FString::Printf(TEXT("GPU %d %s Queue %d") , Queue.GPU , Queue.GetTypeString() , Queue.Index ); } else { return FString::Printf(TEXT("GPU%d_%s%d") , Queue.GPU , Queue.GetTypeString() , Queue.Index ); } } FGPUStat::FStatInstance::FInner& FGPUStat::GetStatInstance(FQueue Queue, EType Type) { FStatInstance& Instance = Instances.FindOrAdd(Queue); switch (Type) { default: checkNoEntry(); [[fallthrough]]; case EType::Busy: return Instance.Busy; break; case EType::Wait: return Instance.Wait; break; case EType::Idle: return Instance.Idle; break; } } TMap> FGPUStat::FStatCategory::Categories; FGPUStat::FStatCategory::FStatCategory(FQueue Queue) : GroupName(FString::Printf(TEXT("STATGROUP_%s"), *GetIDString(Queue, false))) , GroupDesc(FString::Printf(TEXT("%s Timing"), *GetIDString(Queue, true))) {} TStatId FGPUStat::GetStatId(FQueue Queue, EType Type) { FStatInstance::FInner& Instance = GetStatInstance(Queue, Type); if (!Instance.Stat) { TUniquePtr& Category = FStatCategory::Categories.FindOrAdd(Queue); if (!Category) { Category = MakeUnique(Queue); } // Encode the stat type in the FName number Instance.StatName = FName(*FString::Printf(TEXT("STAT_%s_%s"), *GetIDString(Queue, false), DisplayName), int32(Type)); Instance.Stat = MakeUnique( Instance.StatName, DisplayName, *Category->GroupName, FStatNameAndInfo::GpuStatCategory, *Category->GroupDesc, true, // IsDefaultEnabled true, // IsClearEveryFrame EStatDataType::ST_double, false, // IsCycleStat false, // SortByName FPlatformMemory::MCR_Invalid ); } return Instance.Stat->GetStatId(); } #endif // Handles computing the "stat unit" GPU time, "stat gpu" stats, and "profilegpu". struct FGPUProfilerSink_StatSystem final : public FEventSink { class FTimestampStream { private: TArray Values; public: struct FState { FTimestampStream const& Stream; int32 TimestampIndex = 0; uint64 BusyCycles = 0; FState(FTimestampStream const& Stream) : Stream(Stream) {} uint64 GetCurrentTimestamp (uint64 Anchor) const { return Stream.Values[TimestampIndex] - Anchor; } uint64 GetPreviousTimestamp(uint64 Anchor) const { return Stream.Values[TimestampIndex - 1] - Anchor; } bool HasMoreTimestamps() const { return TimestampIndex < Stream.Values.Num(); } bool IsStartingWork () const { return (TimestampIndex & 0x01) == 0x00; } void AdvanceTimestamp () { TimestampIndex++; } }; void AddTimestamp(uint64 Value, bool bBegin) { if (bBegin) { if (!Values.IsEmpty() && Value <= Values.Last()) { // // The Begin TOP event is sooner than the last End BOP event. // The markers overlap, and the GPU was not idle. // // Remove the previous End event, and discard this Begin event. // Values.RemoveAt(Values.Num() - 1, EAllowShrinking::No); } else { // GPU was idle. Keep this timestamp. Values.Add(Value); } } else { Values.Add(Value); } } static uint64 ComputeUnion(TArrayView Streams) { // The total number of cycles where at least one GPU pipe was busy. 
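// This is a merge-sweep over the per-queue timestamp streams: timestamps are visited in
// ascending order, a begin-work mark increments BusyPipes and an end-work mark decrements it,
// and time is only accumulated into the union while at least one pipe is busy.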
uint64 UnionBusyCycles = 0; uint64 LastMinCycles = 0; int32 BusyPipes = 0; bool bFirst = true; uint64 Anchor = 0; // @todo - handle possible timestamp wraparound // Process the time ranges from each pipe. while (true) { // Find the next minimum timestamp FTimestampStream::FState* NextMin = nullptr; for (auto& Current : Streams) { if (Current.HasMoreTimestamps() && (!NextMin || Current.GetCurrentTimestamp(Anchor) < NextMin->GetCurrentTimestamp(Anchor))) { NextMin = &Current; } } if (!NextMin) break; // No more timestamps to process if (!bFirst) { if (BusyPipes > 0 && NextMin->GetCurrentTimestamp(Anchor) > LastMinCycles) { // Accumulate the union busy time across all pipes UnionBusyCycles += NextMin->GetCurrentTimestamp(Anchor) - LastMinCycles; } if (!NextMin->IsStartingWork()) { // Accumulate the busy time for this pipe specifically. NextMin->BusyCycles += NextMin->GetCurrentTimestamp(Anchor) - NextMin->GetPreviousTimestamp(Anchor); } } LastMinCycles = NextMin->GetCurrentTimestamp(Anchor); BusyPipes += NextMin->IsStartingWork() ? 1 : -1; check(BusyPipes >= 0); NextMin->AdvanceTimestamp(); bFirst = false; } check(BusyPipes == 0); return UnionBusyCycles; } }; struct FStatState { struct { uint64 BusyCycles = 0; uint64 IdleCycles = 0; uint64 WaitCycles = 0; void Accumulate(uint64 Busy, uint64 Wait, uint64 Idle) { BusyCycles += Busy; IdleCycles += Idle; WaitCycles += Wait; } } Exclusive, Inclusive; FStatState() = default; FStatState(FStatState const&) = default; FStatState(FStatState&& Other) : FStatState(Other) { Other.Exclusive = {}; Other.Inclusive = {}; } #if HAS_GPU_STATS void EmitResults(FQueue Queue, FGPUStat& GPUStat #if STATS , FEndOfPipeStats* Stats #endif #if CSV_PROFILER_STATS , FCsvProfiler* CsvProfiler #endif ) const { #if STATS Stats->AddMessage(GPUStat.GetStatId(Queue, FGPUStat::EType::Busy).GetName(), EStatOperation::Set, FPlatformTime::ToMilliseconds64(Inclusive.BusyCycles)); Stats->AddMessage(GPUStat.GetStatId(Queue, FGPUStat::EType::Idle).GetName(), EStatOperation::Set, FPlatformTime::ToMilliseconds64(Inclusive.IdleCycles)); Stats->AddMessage(GPUStat.GetStatId(Queue, FGPUStat::EType::Wait).GetName(), EStatOperation::Set, FPlatformTime::ToMilliseconds64(Inclusive.WaitCycles)); #endif #if CSV_PROFILER_STATS if (CsvProfiler && Queue.Type == FQueue::EType::Graphics && Queue.Index == 0) { if (!GPUStat.CsvStat.IsSet()) { static TArray> CsvGPUCategories; if (!CsvGPUCategories.IsValidIndex(Queue.GPU)) { CsvGPUCategories.SetNum(Queue.GPU + 1); } TUniquePtr& Category = CsvGPUCategories[Queue.GPU]; if (!Category) { Category = Queue.GPU > 0 ? 
MakeUnique(*FString::Printf(TEXT("GPU%d"), Queue.GPU + 1), true) : MakeUnique(TEXT("GPU"), true); } GPUStat.CsvStat.Emplace(GPUStat.StatName, Category->Index); } uint64 TotalCycles = Exclusive.BusyCycles + Exclusive.WaitCycles; CsvProfiler->RecordEndOfPipeCustomStat(GPUStat.CsvStat->Name, GPUStat.CsvStat->CategoryIndex, FPlatformTime::ToMilliseconds64(TotalCycles), ECsvCustomStatOp::Set); } #endif } #endif }; struct FQueueTimestamps { FTimestampStream Queue; FStatState WholeQueueStat; uint64 CPUFrameBoundary = 0; // Used to override the GPU time calculation for this queue, if an FFrameTime event is in the stream TOptional TotalBusyCycles; #if WITH_RHI_BREADCRUMBS TMap Stats; #endif }; struct FResolvedWait { uint64 GPUTimestampTOP = 0; uint64 CPUTimestamp = 0; }; struct FResolvedSignal { uint64 GPUTimestampBOP = 0; uint64 Value = 0; }; struct FFrameState : TMap { #if STATS TOptional StatsFrame; #endif }; struct FQueueState { FQueue const Queue; TSpscQueue PendingStreams; // Array of fence signal history. Events are kept until all queues have processed events // later than the CPU timestamps of these signals. The old events are then trimmed. TArray Signals; // The value of the latest signaled fence on this queue. FResolvedSignal MaxSignal; // The GPU timestamp of the last event processed. uint64 LastGPUCycles = 0; FQueueTimestamps Timestamps; bool bBusy = false; bool bWasTraced = false; #if WITH_RHI_BREADCRUMBS TMap ActiveStats; TArray ActiveStatsStack; FRHIBreadcrumbNode* Breadcrumb = nullptr; #endif #if WITH_PROFILEGPU struct { TArray> Nodes; FNode* Current = nullptr; FNode* Prev = nullptr; FNode* First = nullptr; bool bProfileFrame = false; void PushNode(FString&& Name) { FNode* Parent = Current; Current = Nodes.Emplace_GetRef(MakeUnique(MoveTemp(Name))).Get(); Current->Parent = Parent; if (!First) { First = Current; } if (Parent) { Parent->Children.Add(Current); } if (Prev) { Prev->Next = Current; } Prev = Current; } void PopNode() { check(Current && Current->Parent); Current = Current->Parent; } void LogTree(FQueueState const& QueueState, uint32 FrameNumber) const { FTable Table; EGPUProfileSortMode SortMode = (EGPUProfileSortMode)FMath::Clamp(GCVarProfileGPU_Sort.GetValueOnAnyThread(), 0, ((int32)EGPUProfileSortMode::Max - 1)); FWildcardString RootWildcard(GCVarProfileGPU_Root.GetValueOnAnyThread()); const bool bShowEmptyNodes = GCVarProfileGPU_ShowLeafEvents.GetValueOnAnyThread(); const double PercentThreshold = FMath::Clamp(GCVarProfileGPU_ThresholdPercent.GetValueOnAnyThread(), 0.0f, 100.0f); if (SortMode != EGPUProfileSortMode::Chronological) { for (FNode* Node = First; Node; Node = Node->Next) { Node->Children.Sort([SortMode](FNode const& A, FNode const& B) { switch (SortMode) { default: case EGPUProfileSortMode::TimeElapsed: return B.Inclusive.BusyCycles < A.Inclusive.BusyCycles; case EGPUProfileSortMode::NumPrims : return B.Inclusive.NumPrimitives < A.Inclusive.NumPrimitives; case EGPUProfileSortMode::NumVerts : return B.Inclusive.NumVertices < A.Inclusive.NumVertices; } }); } } auto Recurse = [&](auto& Recurse, FNode* Root, FNode* CurrentNode, bool bParentMatchedFilter, int32 Level) -> bool { // Percent that this node was of the total frame time const double Percent = Root ? 
(CurrentNode->Inclusive.GetBusyMilliseconds() / Root->Inclusive.GetBusyMilliseconds()) * 100.0 : 100.0; // Filter nodes according to cvar settings const bool bAboveThreshold = Percent >= PercentThreshold; const bool bNameMatches = bParentMatchedFilter || RootWildcard.IsMatch(CurrentNode->Name); const bool bHasWork = bShowEmptyNodes || CurrentNode->Inclusive.HasWork(); const bool bDisplayEvent = bNameMatches && bHasWork && bAboveThreshold; if (bDisplayEvent) { if (Root == nullptr) { Root = CurrentNode; } Table.AddRow( Root, CurrentNode->Inclusive, CurrentNode->Exclusive, CurrentNode->Name, Level ); } FNode::FStats OtherChildrenInclusive; FNode::FStats OtherChildrenExclusive; uint32 NumHiddenChildren = 0; for (FNode* Child : CurrentNode->Children) { bool bChildShown = Recurse(Recurse, Root, Child, bDisplayEvent, bDisplayEvent ? Level + 1 : Level); if (!bChildShown) { OtherChildrenInclusive += Child->Inclusive; OtherChildrenExclusive += Child->Exclusive; NumHiddenChildren++; } } if (bDisplayEvent && NumHiddenChildren > 0) { // Don't show the "other children" node if their total inclusive time is still below the percent threshold if ((double(OtherChildrenInclusive.BusyCycles) / Root->Inclusive.BusyCycles) >= PercentThreshold) { Table.AddRow( Root, OtherChildrenInclusive, OtherChildrenExclusive, FString::Printf(TEXT("%d Other %s"), NumHiddenChildren, NumHiddenChildren >= 2 ? TEXT("Children") : TEXT("Child")), Level + 1 ); } } return bDisplayEvent; }; // Skip building the table if there was no useful work if (First && First->Inclusive.BusyCycles > 0) { Recurse(Recurse, nullptr, First, false, 0); } FString Final = FString::Printf( TEXT("\n") TEXT("GPU Profile for Frame %d - GPU %d - %s %d\n") TEXT("\n") TEXT(" - %-30s: %.2fms\n") TEXT(" - %-30s: \"%s\"\n") TEXT(" - %-30s: %.2f%%\n") TEXT(" - %-30s: %s\n") TEXT("\n") TEXT("%s") , FrameNumber , QueueState.Queue.GPU , QueueState.Queue.GetTypeString() , QueueState.Queue.Index , TEXT("Frame Time") , First ? First->Inclusive.GetBusyMilliseconds() : 0.0 , *IConsoleManager::Get().FindConsoleObjectName(GCVarProfileGPU_Root.AsVariable()) , *RootWildcard , *IConsoleManager::Get().FindConsoleObjectName(GCVarProfileGPU_ThresholdPercent.AsVariable()) , PercentThreshold , *IConsoleManager::Get().FindConsoleObjectName(GCVarProfileGPU_ShowLeafEvents.AsVariable()) , bShowEmptyNodes ? TEXT("true") : TEXT("false") , Table.HasRows() ? *Table.ToString() : TEXT(" No recorded work for this queue.\n") ); TArray Lines; Final.ParseIntoArrayLines(Lines, false); for (FString const& Line : Lines) { UE_LOG(LogRHI, Display, TEXT("%s"), *Line); } } } Profile; #endif FQueueState(FQueue const& Queue) : Queue(Queue) {} void ResolveSignal(FEvent::FSignalFence const& Event) { FResolvedSignal& Result = Signals.Emplace_GetRef(); // // Take the max between the previous GPU EndWork event and the CPU timestamp. The signal cannot have happened on the GPU until the CPU has submitted the command to the driver. // // An example would be a GPU queue that completes work and goes idle at time T. Later, the CPU issues a Signal without other prior work at time T + 100ms. // The fence signal cannot have happened until time T + 100ms because the CPU hadn't instructed the GPU to do so until then. // LastGPUCycles would still be set to time T, since that was the time of the preceeding EndWork event. 
// Result.GPUTimestampBOP = FMath::Max(LastGPUCycles, Event.CPUTimestamp); Result.Value = Event.Value; FGpuProfilerTrace::SignalFence(Queue.Value, Result.GPUTimestampBOP, Event.Value); // // Fences signals *MUST* be sequential, to remove ambiguity caused by trimming the Signals array. // // To explain why, assume non-sequential signals are allowed, and consider the following example events on an arbitrary queue: // // [Signal 2] // -- Frame Boundary -- // [Signal 4] // // Assume, after trimming events earlier than the frame boundary, that only [Signal 4] remains in the Signals array. // Then, some other queue attempts to [Wait 3]. We need to compute when [Wait 3] is resolved with only the information about [Signal 4]. // // Given that fences resolve waits as soon as the signalled value is >= the wait value, we could assume the fence was resolved at [Signal 4]. // However, we don't know if the fence was already signalled to value 3 before the frame boundary and the trimming. // // Without this information, it is ambiguous whether [Wait 3] is already resolved by a [Signal 3] before the frame boundary that is no longer // in the Signals array, or won't be resolved until [Signal 4]. We could have had this sequence of events: // // [Signal 2] // [Signal 3] // -- Frame Boundary -- // [Signal 4] // // Requiring that fences are always signalled in sequential order solves this. // If the awaited value is less than the first Signal, the fence has already been signalled before the frame boundary. // checkf(Result.Value == MaxSignal.Value + 1, TEXT("Fence signals must be sequential. Result.Value: %llu, MaxSignal.Value + 1: %llu"), Result.Value, (MaxSignal.Value + 1)); // Signals should always advance in time checkf(Result.GPUTimestampBOP >= MaxSignal.GPUTimestampBOP, TEXT("Signals should always advance in time. Result.GPUTimestampBOP: %llu, MaxSignal.GPUTimestampBOP: %llu"), Result.GPUTimestampBOP, MaxSignal.GPUTimestampBOP); MaxSignal = Result; } void AccumulateTime(uint64 Busy, uint64 Wait, uint64 Idle) { #if WITH_RHI_BREADCRUMBS // Apply the timings to all active stats for (auto const& [Stat, RefCount] : ActiveStats) { FStatState& State = Timestamps.Stats.FindChecked(Stat); State.Inclusive.Accumulate(Busy, Wait, Idle); if (ActiveStatsStack.Num() > 0 && ActiveStatsStack.Last() == Stat) { State.Exclusive.Accumulate(Busy, Wait, Idle); } } if (ActiveStatsStack.Num() == 0) #endif { Timestamps.WholeQueueStat.Exclusive.Accumulate(Busy, Wait, Idle); } Timestamps.WholeQueueStat.Inclusive.Accumulate(Busy, Wait, Idle); #if WITH_PROFILEGPU for (FNode* Node = Profile.Current; Node; Node = Node->Parent) { Node->Inclusive.Accumulate(Busy, Wait, Idle); if (Node == Profile.Current) { Node->Exclusive.Accumulate(Busy, Wait, Idle); } } #endif } void BeginWork(FEvent::FBeginWork const& Event) { Timestamps.Queue.AddTimestamp(Event.GPUTimestampTOP, true); uint64 Idle = Event.CPUTimestamp > LastGPUCycles ? Event.CPUTimestamp - LastGPUCycles : 0; AccumulateTime(0, 0, Idle); FGpuProfilerTrace::BeginWork(Queue.Value, Event.GPUTimestampTOP, Event.CPUTimestamp); LastGPUCycles = FMath::Max(LastGPUCycles, Event.GPUTimestampTOP); } void EndWork(FEvent::FEndWork const& Event) { Timestamps.Queue.AddTimestamp(Event.GPUTimestampBOP, false); uint64 Busy = Event.GPUTimestampBOP > LastGPUCycles ? 
Event.GPUTimestampBOP - LastGPUCycles : 0; AccumulateTime(Busy, 0, 0); FGpuProfilerTrace::EndWork(Queue.Value, Event.GPUTimestampBOP); LastGPUCycles = FMath::Max(LastGPUCycles, Event.GPUTimestampBOP); } #if WITH_RHI_BREADCRUMBS void BeginBreadcrumb(FEvent::FBeginBreadcrumb const& Event) { uint64 Busy = Event.GPUTimestampTOP > LastGPUCycles ? Event.GPUTimestampTOP - LastGPUCycles : 0; AccumulateTime(Busy, 0, 0); LastGPUCycles = FMath::Max(LastGPUCycles, Event.GPUTimestampTOP); FRHIBreadcrumbData_Stats const& Stat = Event.Breadcrumb->Data; if (Stat.ShouldComputeStat()) { // Disregard the stat if it is nested within itself (i.e. its already in the ActiveStats map with a non-zero ref count). // Only the outermost stat will count the busy time, otherwise we'd be double-counting the nested time. int32 RefCount = ActiveStats.FindOrAdd(Stat)++; if (RefCount == 0) { Timestamps.Stats.FindOrAdd(Stat); } ActiveStatsStack.Add(Stat); } Breadcrumb = Event.Breadcrumb; Breadcrumb->TraceBeginGPU(Queue.Value, Event.GPUTimestampTOP); #if WITH_PROFILEGPU if (Profile.bProfileFrame) { FRHIBreadcrumb::FBuffer Buffer; const TCHAR* Name = Event.Breadcrumb->GetTCHAR(Buffer); // Push a new node Profile.PushNode(Name); } #endif } void EndBreadcrumb(FEvent::FEndBreadcrumb const& Event) { uint64 Busy = Event.GPUTimestampBOP > LastGPUCycles ? Event.GPUTimestampBOP - LastGPUCycles : 0; AccumulateTime(Busy, 0, 0); LastGPUCycles = FMath::Max(LastGPUCycles, Event.GPUTimestampBOP); FRHIBreadcrumbData_Stats const& Stat = Event.Breadcrumb->Data; if (Stat.ShouldComputeStat()) { // Pop the stat when the refcount hits zero. int32 RefCount = --ActiveStats.FindChecked(Stat); if (RefCount == 0) { ActiveStats.FindAndRemoveChecked(Stat); } check(ActiveStatsStack.Last() == Stat); ActiveStatsStack.RemoveAt(ActiveStatsStack.Num() - 1, EAllowShrinking::No); } Breadcrumb->TraceEndGPU(Queue.Value, Event.GPUTimestampBOP); Breadcrumb = Event.Breadcrumb->GetParent(); #if WITH_PROFILEGPU if (Profile.bProfileFrame) { Profile.PopNode(); } #endif } #endif void Stats(FEvent::FStats const& Event) { #if WITH_PROFILEGPU if (Profile.Current) { Profile.Current->Exclusive += Event; for (FNode* Node = Profile.Current; Node; Node = Node->Parent) { Node->Inclusive += Event; } } #endif FGpuProfilerTrace::Stats(Queue.Value, Event.NumDraws, Event.NumPrimitives); } void Wait(FResolvedWait const& ResolvedWait, const FEvent::FWaitFence& WaitFence) { // Time the queue was idle between the last EndWork event, and the Wait command being submitted to the GPU driver. uint64 Idle = ResolvedWait.CPUTimestamp > LastGPUCycles ? ResolvedWait.CPUTimestamp - LastGPUCycles : 0; uint64 WaitStart = FMath::Max(ResolvedWait.CPUTimestamp, LastGPUCycles); FGpuProfilerTrace::WaitFence(Queue.Value, ResolvedWait.GPUTimestampTOP, WaitFence.Queue.Value, WaitFence.Value); // Time the queue spent waiting for the fence to signal on another queue. uint64 Wait = 0; if (ResolvedWait.GPUTimestampTOP > WaitStart) { Wait = ResolvedWait.GPUTimestampTOP - WaitStart; FGpuProfilerTrace::TraceWait(Queue.Value, WaitStart, ResolvedWait.GPUTimestampTOP); } // Bring the last GPU busy end time forwards to where the wait is resolved. LastGPUCycles = ResolvedWait.GPUTimestampTOP; AccumulateTime(0, Wait, Idle); } void TrimSignals(uint64 CPUTimestamp) { // Remove all signals that occured on the GPU timeline before this frame boundary on the CPU. 
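// Signals are appended in ascending GPUTimestampBOP order (enforced by the checks in
// ResolveSignal), so a lower-bound binary search locates the first signal to keep.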
int32 Index = Algo::LowerBoundBy(Signals, CPUTimestamp, [](FResolvedSignal const& Signal) { return Signal.GPUTimestampBOP; }); if (Index >= 0) { Signals.RemoveAt(0, Index, EAllowShrinking::No); } } void FrameTime(uint64 TotalGPUTime) { Timestamps.TotalBusyCycles = TotalGPUTime; } void FrameBoundary(FEvent::FFrameBoundary const& Event, FFrameState& FrameState, uint32 FrameNumber) { check(!bBusy); Timestamps.CPUFrameBoundary = Event.CPUTimestamp; FGpuProfilerTrace::FrameBoundary(Queue.Value, Event.FrameNumber); #if WITH_PROFILEGPU if (Profile.bProfileFrame) { Profile.LogTree(*this, Event.FrameNumber); Profile = {}; } #endif FrameState.Emplace(Queue, MoveTemp(Timestamps)); #if WITH_RHI_BREADCRUMBS // Reinsert timestamp streams for the current active stats on // this queue, since these got moved into the frame state. for (auto& [Stat, RefCount] : ActiveStats) { Timestamps.Stats.FindOrAdd(Stat); } #endif #if WITH_PROFILEGPU if (FrameNumber == Event.FrameNumber + 1) { Profile.bProfileFrame = true; // Build the node tree Profile.PushNode(TEXT("")); #if WITH_RHI_BREADCRUMBS auto Recurse = [&](auto& Recurse, FRHIBreadcrumbNode* Current) -> void { if (!Current) { return; } Recurse(Recurse, Current->GetParent()); FRHIBreadcrumb::FBuffer Buffer; Profile.PushNode(Current->GetTCHAR(Buffer)); }; Recurse(Recurse, Event.Breadcrumb); #endif // WITH_RHI_BREADCRUMBS } #endif } }; std::atomic bTriggerProfile{ false }; uint32 ProfileFrameNumber = 0; uint32 MaxFrameNumber = 0; TMap Frames; TMap> QueueStates; // Attempts to retrieve the CPU and GPU timestamps of when a fence wait is resolved by a signal on another queue. TOptional ResolveWait(FQueueState& LocalQueue, FEvent::FWaitFence const& WaitFenceEvent) { FQueueState const& RemoteQueue = static_cast(*QueueStates.FindChecked(WaitFenceEvent.Queue)); if (RemoteQueue.MaxSignal.Value < WaitFenceEvent.Value) { // Fence has not yet been signalled on the remote queue return {}; } else { // Fence has been signalled, but it may be in the future. FResolvedWait Result; Result.CPUTimestamp = WaitFenceEvent.CPUTimestamp; // // The wait cannot be resolved any earlier than: // // 1) The wait command was issued to the driver (WaitFenceEvent.CPUTimestamp) // 2) The GPU completed prior work on this queue (LocalQueue.LastGPUCycles) // Result.GPUTimestampTOP = FMath::Max(WaitFenceEvent.CPUTimestamp, LocalQueue.LastGPUCycles); // // 3) The wait maybe be further delayed by the remote queue the GPU is awaiting. // int32 Index = Algo::LowerBoundBy(RemoteQueue.Signals, WaitFenceEvent.Value, [](FResolvedSignal const& Signal) { return Signal.Value; }); if (RemoteQueue.Signals.IsValidIndex(Index)) { FResolvedSignal const& Signal = RemoteQueue.Signals[Index]; // // Only consider this signal's timestamp if the fence was not already signalled at the previous frame boundary. // See comment in ResolveSignal() for details. 
				//
				if (!(Index == 0 && WaitFenceEvent.Value < Signal.Value))
				{
					Result.GPUTimestampTOP = FMath::Max(Result.GPUTimestampTOP, Signal.GPUTimestampBOP);
				}
			}

			return Result;
		}
	}

	void InitializeQueues(TConstArrayView<FQueue> Queues) override
	{
		FGpuProfilerTrace::Initialize();

		for (FQueue Queue : Queues)
		{
			TUniquePtr<FQueueState>& Ptr = QueueStates.FindOrAdd(Queue);
			if (!Ptr.IsValid())
			{
				Ptr = MakeUnique<FQueueState>(Queue);
			}
		}
	}

	bool ProcessQueue(FQueueState& QueueState, FIterator& Iterator)
	{
		if (FGpuProfilerTrace::IsAvailable() && !QueueState.bWasTraced)
		{
			FGpuProfilerTrace::InitializeQueue(QueueState.Queue.Value, QueueState.Queue.GetTypeString());
			QueueState.bWasTraced = true;
		}

		while (FEvent const* Event = Iterator.Peek())
		{
			switch (Event->GetType())
			{
			case FEvent::EType::BeginWork:
			{
				check(!QueueState.bBusy);
				QueueState.bBusy = true;

				QueueState.BeginWork(Event->Value.Get<FEvent::FBeginWork>());
			}
			break;

			case FEvent::EType::EndWork:
			{
				check(QueueState.bBusy);
				QueueState.bBusy = false;

				QueueState.EndWork(Event->Value.Get<FEvent::FEndWork>());
			}
			break;

#if WITH_RHI_BREADCRUMBS
			case FEvent::EType::BeginBreadcrumb:
			{
				check(QueueState.bBusy);
				QueueState.BeginBreadcrumb(Event->Value.Get<FEvent::FBeginBreadcrumb>());
			}
			break;

			case FEvent::EType::EndBreadcrumb:
			{
				check(QueueState.bBusy);
				QueueState.EndBreadcrumb(Event->Value.Get<FEvent::FEndBreadcrumb>());
			}
			break;
#endif // WITH_RHI_BREADCRUMBS

#if WITH_PROFILEGPU
			case FEvent::EType::Stats:
			{
				check(QueueState.bBusy);
				QueueState.Stats(Event->Value.Get<FEvent::FStats>());
			}
			break;
#endif // WITH_PROFILEGPU

			case FEvent::EType::SignalFence:
			{
				check(!QueueState.bBusy);
				QueueState.ResolveSignal(Event->Value.Get<FEvent::FSignalFence>());
			}
			break;

			case FEvent::EType::WaitFence:
			{
				check(!QueueState.bBusy);

				const FEvent::FWaitFence& WaitFence = Event->Value.Get<FEvent::FWaitFence>();

				TOptional<FResolvedWait> ResolvedWait = ResolveWait(QueueState, Event->Value.Get<FEvent::FWaitFence>());
				if (!ResolvedWait.IsSet())
				{
					// Unresolved fence; pause processing on this queue.
					return false;
				}

				QueueState.Wait(*ResolvedWait, WaitFence);
			}
			break;

			case FEvent::EType::FrameTime:
			{
				const FEvent::FFrameTime& FrameTime = Event->Value.Get<FEvent::FFrameTime>();
				QueueState.FrameTime(FrameTime.TotalGPUTime);
			}
			break;

			case FEvent::EType::FrameBoundary:
			{
				FEvent::FFrameBoundary const& FrameBoundary = Event->Value.Get<FEvent::FFrameBoundary>();
				FFrameState& FrameState = Frames.FindOrAdd(FrameBoundary.FrameNumber);

#if STATS
				FrameState.StatsFrame = FrameBoundary.bStatsFrameSet ? FrameBoundary.StatsFrame : TOptional<int64>();
#endif

#if WITH_PROFILEGPU
				// Latch the index of the next frame to profile.
				MaxFrameNumber = FMath::Max(FrameBoundary.FrameNumber, MaxFrameNumber);
				if (bTriggerProfile.exchange(false))
				{
					ProfileFrameNumber = MaxFrameNumber + 1;
				}
#endif // WITH_PROFILEGPU

				QueueState.FrameBoundary(FrameBoundary, FrameState, ProfileFrameNumber);

				if (FrameState.Num() == QueueStates.Num())
				{
					// Trim the Signals array in each queue, up to the lowest frame boundary CPU timestamp.
					{
						uint64 MinFrameBoundary = TNumericLimits<uint64>::Max();
						for (auto& [Queue, QueueTimestamps] : FrameState)
						{
							MinFrameBoundary = FMath::Min(MinFrameBoundary, QueueTimestamps.CPUFrameBoundary);
						}

						for (auto& [Queue, LocalQueueState] : QueueStates)
						{
							LocalQueueState.Get()->TrimSignals(MinFrameBoundary);
						}
					}

					// All registered queues have reported their frame boundary event.
					// We have a full set of data to compute the total frame GPU stats.
					ProcessFrame(FrameState);
					Frames.Remove(FrameBoundary.FrameNumber);
				}
			}
			break;
			}

			Iterator.Pop();
		}

		return true;
	}

	void ProcessFrame(FFrameState& FrameState)
	{
#if STATS
		FEndOfPipeStats* Stats = FEndOfPipeStats::Get();
		if (FrameState.StatsFrame.IsSet())
		{
			Stats->AddMessage(FStatConstants::AdvanceFrame.GetEncodedName(), EStatOperation::AdvanceFrameEventEndOfPipe, *FrameState.StatsFrame);
		}
#endif

#if CSV_PROFILER_STATS
		const bool bCsvStatsEnabled = !!CVarGPUCsvStatsEnabled.GetValueOnAnyThread();
		FCsvProfiler* CsvProfiler = FCsvProfiler::Get();
		CsvProfiler->BeginFrameEOP();
#else
		const bool bCsvStatsEnabled = false;
#endif

		TOptional<uint64> MaxQueueBusyCycles;

		for (auto const& [Queue, QueueTimestamps] : FrameState)
		{
#if WITH_RHI_BREADCRUMBS && HAS_GPU_STATS
			// Compute the individual GPU stats.
			for (auto const& [Stat, StatState] : QueueTimestamps.Stats)
			{
				StatState.EmitResults(Queue, *Stat.GPUStat
#if STATS
					, Stats
#endif
#if CSV_PROFILER_STATS
					, bCsvStatsEnabled ? CsvProfiler : nullptr
#endif
				);
			}
#endif // WITH_RHI_BREADCRUMBS && HAS_GPU_STATS

			// Set the whole-frame per-queue stat.
#if HAS_GPU_STATS
			QueueTimestamps.WholeQueueStat.EmitResults(Queue, GPUStat_Total
#if STATS
				, Stats
#endif
#if CSV_PROFILER_STATS
				, bCsvStatsEnabled ? CsvProfiler : nullptr
#endif
			);
#endif

			if (QueueTimestamps.TotalBusyCycles.IsSet())
			{
				uint64 CurrentMax = MaxQueueBusyCycles ? *MaxQueueBusyCycles : 0;
				MaxQueueBusyCycles = FMath::Max(CurrentMax, *QueueTimestamps.TotalBusyCycles);
			}
		}

		if (MaxQueueBusyCycles.IsSet())
		{
			// Set the total GPU time stat according to the value directly provided by the platform RHI.
			GRHIGPUFrameTimeHistory.PushFrameCycles(1.0 / FPlatformTime::GetSecondsPerCycle64(), *MaxQueueBusyCycles);
		}
		else
		{
			// Compute the whole-frame total GPU time as the union of the busy ranges of all queues.
			TArray<FTimestampStream const*> StreamPointers;
			for (auto const& [Queue, State] : FrameState)
			{
				StreamPointers.Emplace(&State.Queue);
			}
			uint64 WholeFrameUnion = FTimestampStream::ComputeUnion(StreamPointers);

			// Update the global GPU frame time stats.
			GRHIGPUFrameTimeHistory.PushFrameCycles(1.0 / FPlatformTime::GetSecondsPerCycle64(), WholeFrameUnion);
		}

		// @todo set global csv GPU time
		//RHISetGPUStatTotals(bCsvStatsEnabled, FPlatformTime::ToMilliseconds64(WholeFrameUnion));

#if STATS
		Stats->Flush();
#endif
	}

	void ProcessAllQueues()
	{
		// Process each queue as far as possible.
		bool bProgress;
		do
		{
			bProgress = false;

			for (auto& [Queue, QueueState] : QueueStates)
			{
				while (FIterator* Iterator = QueueState->PendingStreams.Peek())
				{
					FEvent const* Start = Iterator->Peek();
					bool bPaused = !ProcessQueue(*QueueState.Get(), *Iterator);
					FEvent const* End = Iterator->Peek();

					bProgress |= End != Start;

					if (bPaused)
					{
						// The queue was paused by a Wait event.
						check(End);
						break;
					}

					if (!End)
					{
						// This stream has been fully processed.
						QueueState->PendingStreams.Dequeue();
					}
				}
			}
		} while (bProgress);
	}

	void ProcessStreams(TConstArrayView<TSharedRef<FEventStream>> EventStreams) override
	{
		for (TSharedRef<FEventStream> const& Stream : EventStreams)
		{
			FQueueState& QueueState = *QueueStates.FindChecked(Stream->Queue);
			QueueState.PendingStreams.Enqueue(FIterator(Stream));
		}

		ProcessAllQueues();
	}

} GGPUProfilerSink_StatSystem;

#if WITH_PROFILEGPU

static FAutoConsoleCommand GCommand_ProfileGPU(
	TEXT("ProfileGPU"),
	TEXT("Captures statistics about a frame of GPU work and prints the results to the log."),
	FConsoleCommandWithArgsDelegate::CreateLambda([](const TArray<FString>& Args)
	{
		GGPUProfilerSink_StatSystem.bTriggerProfile = true;

		if (OnProfileGPU.IsBound())
		{
			OnProfileGPU.Broadcast();
		}
	}));

#endif // WITH_PROFILEGPU

}

#endif // RHI_NEW_GPU_PROFILER

RHI_API FRHIGPUFrameTimeHistory GRHIGPUFrameTimeHistory;

FRHIGPUFrameTimeHistory::EResult FRHIGPUFrameTimeHistory::FState::PopFrameCycles(uint64& OutCycles64)
{
	return GRHIGPUFrameTimeHistory.PopFrameCycles(*this, OutCycles64);
}

FRHIGPUFrameTimeHistory::EResult FRHIGPUFrameTimeHistory::PopFrameCycles(FState& State, uint64& OutCycles64)
{
	FScopeLock Lock(&CS);

	if (State.NextIndex == NextIndex)
	{
		// The caller has already consumed every frame recorded so far.
		OutCycles64 = 0;
		return EResult::Empty;
	}
	else
	{
		uint64 MinHistoryIndex = NextIndex >= MaxLength ? NextIndex - MaxLength : 0;
		if (State.NextIndex < MinHistoryIndex)
		{
			// The caller fell more than MaxLength frames behind; skip forward to the oldest retained entry.
			State.NextIndex = MinHistoryIndex;
			OutCycles64 = History[State.NextIndex++ % MaxLength];
			return EResult::Disjoint;
		}
		else
		{
			OutCycles64 = History[State.NextIndex++ % MaxLength];
			return EResult::Ok;
		}
	}
}

void FRHIGPUFrameTimeHistory::PushFrameCycles(double GPUFrequency, uint64 GPUCycles)
{
	double Seconds = double(GPUCycles) / GPUFrequency;

	double Cycles32 = Seconds / FPlatformTime::GetSecondsPerCycle();
	double Cycles64 = Seconds / FPlatformTime::GetSecondsPerCycle64();

	{
		FScopeLock Lock(&CS);
		History[NextIndex++ % MaxLength] = uint64(Cycles64);
	}

	PRAGMA_DISABLE_DEPRECATION_WARNINGS
	FPlatformAtomics::InterlockedExchange(reinterpret_cast<volatile int32*>(&GGPUFrameTime), int32(Cycles32));
	PRAGMA_ENABLE_DEPRECATION_WARNINGS
}

RHI_API uint32 RHIGetGPUFrameCycles(uint32 GPUIndex)
{
	PRAGMA_DISABLE_DEPRECATION_WARNINGS
	return (uint32)FPlatformAtomics::AtomicRead(reinterpret_cast<volatile const int32*>(&GGPUFrameTime));
	PRAGMA_ENABLE_DEPRECATION_WARNINGS
}

#undef LOCTEXT_NAMESPACE
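
// Illustrative usage sketch (an assumption for documentation purposes, not part of the engine
// source): a caller that wants the per-frame GPU times recorded by PushFrameCycles above can
// drain them through an FRHIGPUFrameTimeHistory::FState cursor that it owns. The function name
// PollGPUFrameTimes and the surrounding structure are hypothetical; PopFrameCycles, EResult and
// FPlatformTime::ToMilliseconds64 are the APIs referenced in this file.
//
//	static FRHIGPUFrameTimeHistory::FState GFrameTimeCursor;
//
//	void PollGPUFrameTimes()
//	{
//		uint64 Cycles64 = 0;
//		for (;;)
//		{
//			FRHIGPUFrameTimeHistory::EResult Result = GFrameTimeCursor.PopFrameCycles(Cycles64);
//			if (Result == FRHIGPUFrameTimeHistory::EResult::Empty)
//			{
//				break; // No new frames since the last poll.
//			}
//
//			if (Result == FRHIGPUFrameTimeHistory::EResult::Disjoint)
//			{
//				// The cursor fell more than MaxLength frames behind and older entries were overwritten.
//			}
//
//			const double GPUFrameMs = FPlatformTime::ToMilliseconds64(Cycles64);
//			// ... consume GPUFrameMs ...
//		}
//	}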