UnrealEngine/Engine/Source/Runtime/RHI/Private/GPUProfiler.cpp

// Copyright Epic Games, Inc. All Rights Reserved.
/*=============================================================================
GPUProfiler.cpp: Hierarchical GPU Profiler Implementation.
=============================================================================*/
#include "GPUProfiler.h"
#include "Async/TaskGraphInterfaces.h"
#include "Misc/WildcardString.h"
#include "Misc/CommandLine.h"
#include "RHI.h"
#include "GpuProfilerTrace.h"
#include "Containers/AnsiString.h"
#include "Stats/StatsData.h"
#if !UE_BUILD_SHIPPING
#include "VisualizerEvents.h"
#include "ProfileVisualizerModule.h"
#include "Modules/ModuleManager.h"
#endif
#define LOCTEXT_NAMESPACE "GpuProfiler"
enum class EGPUProfileSortMode
{
Chronological,
TimeElapsed,
NumPrims,
NumVerts,
Max
};
static TAutoConsoleVariable<int32> GCVarProfileGPU_Sort(
TEXT("r.ProfileGPU.Sort"),
0,
TEXT("Sorts the TTY Dump independently at each level of the tree in various modes.\n")
TEXT("0 : Chronological\n")
TEXT("1 : By time elapsed\n")
TEXT("2 : By number of prims\n")
TEXT("3 : By number of verts\n"),
ECVF_Default);
static TAutoConsoleVariable<FString> GCVarProfileGPU_Root(
TEXT("r.ProfileGPU.Root"),
TEXT("*"),
TEXT("Allows to filter the tree when using ProfileGPU, the pattern match is case sensitive."),
ECVF_Default);
static TAutoConsoleVariable<float> GCVarProfileGPU_ThresholdPercent(
TEXT("r.ProfileGPU.ThresholdPercent"),
0.0f,
TEXT("Percent of the total execution duration the event needs to be larger than to be printed."),
ECVF_Default);
static TAutoConsoleVariable<bool> GCVarProfileGPU_UnicodeOutput(
TEXT("r.ProfileGPU.UnicodeOutput"),
true,
TEXT("When enabled, the output results will be formatted in a unicode table."),
ECVF_Default);
static TAutoConsoleVariable<bool> GCVarProfileGPU_ShowLeafEvents(
TEXT("r.ProfileGPU.ShowLeafEvents"),
true,
TEXT("Allows profileGPU to display event-only leaf nodes with no draws associated."),
ECVF_Default);
static TAutoConsoleVariable<int> CVarGPUCsvStatsEnabled(
TEXT("r.GPUCsvStatsEnabled"),
0,
TEXT("Enables or disables GPU stat recording to CSVs"));
#if (RHI_NEW_GPU_PROFILER == 0)
static TAutoConsoleVariable<FString> GProfileGPUPatternCVar(
TEXT("r.ProfileGPU.Pattern"),
TEXT("*"),
TEXT("Allows to filter the entries when using ProfileGPU, the pattern match is case sensitive.\n")
TEXT("'*' can be used in the end to get all entries starting with the string.\n")
TEXT(" '*' without any leading characters disables the pattern matching and uses a time threshold instead (default).\n")
TEXT("'?' allows to ignore one character.\n")
TEXT("e.g. AmbientOcclusionSetup, AmbientOcclusion*, Ambient???lusion*, *"),
ECVF_Default);
static TAutoConsoleVariable<int32> GProfileShowEventHistogram(
TEXT("r.ProfileGPU.ShowEventHistogram"),
0,
TEXT("Whether the event histogram should be shown."),
ECVF_Default);
TAutoConsoleVariable<int32> GProfileGPUTransitions(
TEXT("r.ProfileGPU.ShowTransitions"),
0,
TEXT("Allows profileGPU to display resource transition events."),
ECVF_Default);
// Should we print a summary at the end?
static TAutoConsoleVariable<int32> GProfilePrintAssetSummary(
TEXT("r.ProfileGPU.PrintAssetSummary"),
0,
TEXT("Should we print a summary split by asset (r.ShowMaterialDrawEvents is strongly recommended as well).\n"),
ECVF_Default);
// Substrings that deserve a special call-out in the asset summary
static TAutoConsoleVariable<FString> GProfileAssetSummaryCallOuts(
TEXT("r.ProfileGPU.AssetSummaryCallOuts"),
TEXT(""),
TEXT("Comma separated list of substrings that deserve special mention in the final summary (e.g., \"LOD,HeroName\"\n")
TEXT("r.ProfileGPU.PrintAssetSummary must be true to enable this feature"),
ECVF_Default);
static TAutoConsoleVariable<int32> GSaveScreenshotAfterProfilingGPUCVar(
TEXT("r.ProfileGPU.Screenshot"),
1,
TEXT("Whether a screenshot should be taken when profiling the GPU. 0:off, 1:on (default)"),
ECVF_RenderThreadSafe);
static TAutoConsoleVariable<int32> GShowProfilerAfterProfilingGPUCVar(
TEXT("r.ProfileGPU.ShowUI"),
1,
TEXT("Whether the user interface profiler should be displayed after profiling the GPU.\n")
TEXT("The results will always go to the log/console\n")
TEXT("0:off, 1:on (default)"),
ECVF_RenderThreadSafe);
static TAutoConsoleVariable<float> GGPUHitchThresholdCVar(
TEXT("RHI.GPUHitchThreshold"),
100.0f,
TEXT("Threshold for detecting hitches on the GPU (in milliseconds).")
);
static TAutoConsoleVariable<int32> CVarGPUCrashDataCollectionEnable(
TEXT("r.gpucrash.collectionenable"),
1,
TEXT("Stores GPU crash data from scoped events when a applicable crash debugging system is available."),
ECVF_RenderThreadSafe);
static TAutoConsoleVariable<int32> CVarGPUCrashDataDepth(
TEXT("r.gpucrash.datadepth"),
-1,
TEXT("Limits the amount of marker scope depth we record for GPU crash debugging to the given scope depth."),
ECVF_RenderThreadSafe);
namespace RHIConfig
{
bool ShouldSaveScreenshotAfterProfilingGPU()
{
return GSaveScreenshotAfterProfilingGPUCVar.GetValueOnAnyThread() != 0;
}
bool ShouldShowProfilerAfterProfilingGPU()
{
return GShowProfilerAfterProfilingGPUCVar.GetValueOnAnyThread() != 0;
}
float GetGPUHitchThreshold()
{
return GGPUHitchThresholdCVar.GetValueOnAnyThread() * 0.001f;
}
}
/** Recursively generates a histogram of nodes and stores their timing in TimingResult. */
static void GatherStatsEventNode(FGPUProfilerEventNode* Node, int32 Depth, TMap<FString, FGPUProfilerEventNodeStats>& EventHistogram)
{
if (Node->NumDraws > 0 || Node->NumDispatches > 0 || Node->Children.Num() > 0)
{
Node->TimingResult = Node->GetTiming() * 1000.0f;
Node->NumTotalDraws = Node->NumDraws;
Node->NumTotalDispatches = Node->NumDispatches;
Node->NumTotalPrimitives = Node->NumPrimitives;
Node->NumTotalVertices = Node->NumVertices;
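// Propagate this node's own counts up the ancestor chain so each parent accumulates inclusive totals.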
FGPUProfilerEventNode* Parent = Node->Parent;
while (Parent)
{
Parent->NumTotalDraws += Node->NumDraws;
Parent->NumTotalDispatches += Node->NumDispatches;
Parent->NumTotalPrimitives += Node->NumPrimitives;
Parent->NumTotalVertices += Node->NumVertices;
Parent = Parent->Parent;
}
for (int32 ChildIndex = 0; ChildIndex < Node->Children.Num(); ChildIndex++)
{
// Traverse children
GatherStatsEventNode(Node->Children[ChildIndex], Depth + 1, EventHistogram);
}
FGPUProfilerEventNodeStats* FoundHistogramBucket = EventHistogram.Find(Node->Name);
if (FoundHistogramBucket)
{
FoundHistogramBucket->NumDraws += Node->NumTotalDraws;
FoundHistogramBucket->NumPrimitives += Node->NumTotalPrimitives;
FoundHistogramBucket->NumVertices += Node->NumTotalVertices;
FoundHistogramBucket->TimingResult += Node->TimingResult;
FoundHistogramBucket->NumEvents++;
}
else
{
FGPUProfilerEventNodeStats NewNodeStats;
NewNodeStats.NumDraws = Node->NumTotalDraws;
NewNodeStats.NumPrimitives = Node->NumTotalPrimitives;
NewNodeStats.NumVertices = Node->NumTotalVertices;
NewNodeStats.TimingResult = Node->TimingResult;
NewNodeStats.NumEvents = 1;
EventHistogram.Add(Node->Name, NewNodeStats);
}
}
}
struct FGPUProfileInfoPair
{
int64 Triangles;
int32 DrawCalls;
FGPUProfileInfoPair()
: Triangles(0)
, DrawCalls(0)
{
}
void AddDraw(int64 InTriangleCount)
{
Triangles += InTriangleCount;
++DrawCalls;
}
};
struct FGPUProfileStatSummary
{
TMap<FString, FGPUProfileInfoPair> TrianglesPerMaterial;
TMap<FString, FGPUProfileInfoPair> TrianglesPerMesh;
TMap<FString, FGPUProfileInfoPair> TrianglesPerNonMesh;
int32 TotalNumNodes;
int32 TotalNumDraws;
bool bGatherSummaryStats;
bool bDumpEventLeafNodes;
FGPUProfileStatSummary()
: TotalNumNodes(0)
, TotalNumDraws(0)
, bGatherSummaryStats(false)
, bDumpEventLeafNodes(false)
{
bDumpEventLeafNodes = GCVarProfileGPU_ShowLeafEvents.GetValueOnRenderThread() != 0;
bGatherSummaryStats = GProfilePrintAssetSummary.GetValueOnRenderThread() != 0;
}
void ProcessMatch(FGPUProfilerEventNode* Node)
{
if (bGatherSummaryStats && (Node->NumTotalPrimitives > 0) && (Node->NumTotalVertices > 0) && (Node->Children.Num() == 0))
{
FString MaterialPart;
FString AssetPart;
if (Node->Name.Split(TEXT(" "), &MaterialPart, &AssetPart, ESearchCase::CaseSensitive))
{
TrianglesPerMaterial.FindOrAdd(MaterialPart).AddDraw(Node->NumTotalPrimitives);
TrianglesPerMesh.FindOrAdd(AssetPart).AddDraw(Node->NumTotalPrimitives);
}
else
{
TrianglesPerNonMesh.FindOrAdd(Node->Name).AddDraw(Node->NumTotalPrimitives);
}
}
}
void PrintSummary()
{
UE_LOG(LogRHI, Log, TEXT("Total Nodes %u Draws %u"), TotalNumNodes, TotalNumDraws);
UE_LOG(LogRHI, Log, TEXT(""));
UE_LOG(LogRHI, Log, TEXT(""));
if (bGatherSummaryStats)
{
// Sort the lists and print them out
TrianglesPerMesh.ValueSort([](const FGPUProfileInfoPair& A, const FGPUProfileInfoPair& B){ return A.Triangles > B.Triangles; });
UE_LOG(LogRHI, Log, TEXT(""));
UE_LOG(LogRHI, Log, TEXT("MeshList,TriangleCount,DrawCallCount"));
for (auto& Pair : TrianglesPerMesh)
{
UE_LOG(LogRHI, Log, TEXT("%s,%" INT64_FMT ",%d"), *Pair.Key, Pair.Value.Triangles, Pair.Value.DrawCalls);
}
TrianglesPerMaterial.ValueSort([](const FGPUProfileInfoPair& A, const FGPUProfileInfoPair& B){ return A.Triangles > B.Triangles; });
UE_LOG(LogRHI, Log, TEXT(""));
UE_LOG(LogRHI, Log, TEXT("MaterialList,TriangleCount,DrawCallCount"));
for (auto& Pair : TrianglesPerMaterial)
{
UE_LOG(LogRHI, Log, TEXT("%s,%" INT64_FMT ",%d"), *Pair.Key, Pair.Value.Triangles, Pair.Value.DrawCalls);
}
TrianglesPerNonMesh.ValueSort([](const FGPUProfileInfoPair& A, const FGPUProfileInfoPair& B){ return A.Triangles > B.Triangles; });
UE_LOG(LogRHI, Log, TEXT(""));
UE_LOG(LogRHI, Log, TEXT("MiscList,TriangleCount,DrawCallCount"));
for (auto& Pair : TrianglesPerNonMesh)
{
UE_LOG(LogRHI, Log, TEXT("%s,%" INT64_FMT ",%d"), *Pair.Key, Pair.Value.Triangles, Pair.Value.DrawCalls);
}
// See if we want to call out any particularly interesting matches
TArray<FString> InterestingSubstrings;
GProfileAssetSummaryCallOuts.GetValueOnRenderThread().ParseIntoArray(InterestingSubstrings, TEXT(","), true);
if (InterestingSubstrings.Num() > 0)
{
UE_LOG(LogRHI, Log, TEXT(""));
UE_LOG(LogRHI, Log, TEXT("Information about specified mesh substring matches (r.ProfileGPU.AssetSummaryCallOuts)"));
for (const FString& InterestingSubstring : InterestingSubstrings)
{
int32 InterestingNumDraws = 0;
int64 InterestingNumTriangles = 0;
for (auto& Pair : TrianglesPerMesh)
{
if (Pair.Key.Contains(InterestingSubstring))
{
InterestingNumDraws += Pair.Value.DrawCalls;
InterestingNumTriangles += Pair.Value.Triangles;
}
}
UE_LOG(LogRHI, Log, TEXT("Matching '%s': %d draw calls, with %" INT64_FMT " tris (%.2f M)"), *InterestingSubstring, InterestingNumDraws, InterestingNumTriangles, InterestingNumTriangles * 1e-6);
}
UE_LOG(LogRHI, Log, TEXT(""));
}
}
}
};
/** Recursively dumps stats for each node with a depth-first traversal. */
static void DumpStatsEventNode(FGPUProfilerEventNode* Node, float RootResult, int32 Depth, const FWildcardString& WildcardFilter, bool bParentMatchedFilter, float& ReportedTiming, FGPUProfileStatSummary& Summary)
{
Summary.TotalNumNodes++;
ReportedTiming = 0;
if (Node->NumDraws > 0 || Node->NumDispatches > 0 || Node->Children.Num() > 0 || Summary.bDumpEventLeafNodes)
{
Summary.TotalNumDraws += Node->NumDraws;
// Percent that this node was of the total frame time
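// Node->TimingResult is in milliseconds while RootResult is in seconds, hence the * 1000.0f below.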
const float Percent = Node->TimingResult * 100.0f / (RootResult * 1000.0f);
const float PercentThreshold = GCVarProfileGPU_ThresholdPercent.GetValueOnRenderThread();
const int32 EffectiveDepth = FMath::Max(Depth - 1, 0);
const bool bDisplayEvent = (bParentMatchedFilter || WildcardFilter.IsMatch(Node->Name)) && (Percent > PercentThreshold || Summary.bDumpEventLeafNodes);
if (bDisplayEvent)
{
FString NodeStats = TEXT("");
if (Node->NumTotalDraws > 0)
{
NodeStats = FString::Printf(TEXT("%u %s %u prims %u verts "), Node->NumTotalDraws, Node->NumTotalDraws == 1 ? TEXT("draw") : TEXT("draws"), Node->NumTotalPrimitives, Node->NumTotalVertices);
}
if (Node->NumTotalDispatches > 0)
{
NodeStats += FString::Printf(TEXT("%u %s"), Node->NumTotalDispatches, Node->NumTotalDispatches == 1 ? TEXT("dispatch") : TEXT("dispatches"));
// Cumulative group counts are not meaningful; only include them if the current node contains exactly one dispatch
if (Node->GroupCount.X > 0 && Node->NumDispatches == 1)
{
NodeStats += FString::Printf(TEXT(" %u"), Node->GroupCount.X);
if (Node->GroupCount.Y > 1)
{
NodeStats += FString::Printf(TEXT("x%u"), Node->GroupCount.Y);
}
if (Node->GroupCount.Z > 1)
{
NodeStats += FString::Printf(TEXT("x%u"), Node->GroupCount.Z);
}
NodeStats += TEXT(" groups");
}
}
// Print information about this node, padded to its depth in the tree
UE_LOG(LogRHI, Log, TEXT("%s%4.1f%%%5.2fms %s %s"),
*FString(TEXT("")).LeftPad(EffectiveDepth * 3),
Percent,
Node->TimingResult,
*Node->Name,
*NodeStats
);
ReportedTiming = Node->TimingResult;
Summary.ProcessMatch(Node);
}
struct FCompareGPUProfileNode
{
EGPUProfileSortMode SortMode;
FCompareGPUProfileNode(EGPUProfileSortMode InSortMode)
: SortMode(InSortMode)
{}
FORCEINLINE bool operator()(const FGPUProfilerEventNode* A, const FGPUProfilerEventNode* B) const
{
switch (SortMode)
{
case EGPUProfileSortMode::NumPrims:
return B->NumTotalPrimitives < A->NumTotalPrimitives;
case EGPUProfileSortMode::NumVerts:
return B->NumTotalVertices < A->NumTotalVertices;
case EGPUProfileSortMode::TimeElapsed:
default:
return B->TimingResult < A->TimingResult;
}
}
};
EGPUProfileSortMode SortMode = (EGPUProfileSortMode)FMath::Clamp(GCVarProfileGPU_Sort.GetValueOnRenderThread(), 0, ((int32)EGPUProfileSortMode::Max - 1));
if (SortMode != EGPUProfileSortMode::Chronological)
{
Node->Children.Sort(FCompareGPUProfileNode(SortMode));
}
float TotalChildTime = 0;
uint32 TotalChildDraws = 0;
for (int32 ChildIndex = 0; ChildIndex < Node->Children.Num(); ChildIndex++)
{
FGPUProfilerEventNode* ChildNode = Node->Children[ChildIndex];
// Traverse children
const int32 PrevNumDraws = Summary.TotalNumDraws;
float ChildReportedTiming = 0;
DumpStatsEventNode(Node->Children[ChildIndex], RootResult, Depth + 1, WildcardFilter, bDisplayEvent, ChildReportedTiming, Summary);
const int32 NumChildDraws = Summary.TotalNumDraws - PrevNumDraws;
TotalChildTime += ChildReportedTiming;
TotalChildDraws += NumChildDraws;
}
const float UnaccountedTime = FMath::Max(Node->TimingResult - TotalChildTime, 0.0f);
const float UnaccountedPercent = UnaccountedTime * 100.0f / (RootResult * 1000.0f);
// Add an 'Other Children' node if necessary to show time spent in the current node that is not in any of its children
if (bDisplayEvent && Node->Children.Num() > 0 && TotalChildDraws > 0 && (UnaccountedPercent > 2.0f || UnaccountedTime > .2f))
{
UE_LOG(LogRHI, Log, TEXT("%s%4.1f%%%5.2fms Other Children"),
*FString(TEXT("")).LeftPad((EffectiveDepth + 1) * 3),
UnaccountedPercent,
UnaccountedTime);
}
}
}
#if !UE_BUILD_SHIPPING
/**
* Converts GPU profile data to Visualizer data
*
* @param InProfileData GPU profile data
* @param OutVisualizerData Visualizer data
*/
static TSharedPtr< FVisualizerEvent > CreateVisualizerDataRecursively( const TRefCountPtr< class FGPUProfilerEventNode >& InNode, TSharedPtr< FVisualizerEvent > InParentEvent, const double InStartTimeMs, const double InTotalTimeMs )
{
TSharedPtr< FVisualizerEvent > VisualizerEvent( new FVisualizerEvent( InStartTimeMs / InTotalTimeMs, InNode->TimingResult / InTotalTimeMs, InNode->TimingResult, 0, InNode->Name ) );
VisualizerEvent->ParentEvent = InParentEvent;
double ChildStartTimeMs = InStartTimeMs;
for( int32 ChildIndex = 0; ChildIndex < InNode->Children.Num(); ChildIndex++ )
{
TRefCountPtr< FGPUProfilerEventNode > ChildNode = InNode->Children[ ChildIndex ];
TSharedPtr< FVisualizerEvent > ChildEvent = CreateVisualizerDataRecursively( ChildNode, VisualizerEvent, ChildStartTimeMs, InTotalTimeMs );
VisualizerEvent->Children.Add( ChildEvent );
ChildStartTimeMs += ChildNode->TimingResult;
}
return VisualizerEvent;
}
/**
* Converts GPU profile data to Visualizer data
*
* @param InProfileData GPU profile data
* @param OutVisualizerData Visualizer data
*/
static TSharedPtr< FVisualizerEvent > CreateVisualizerData( const TArray<TRefCountPtr<class FGPUProfilerEventNode> >& InProfileData )
{
// Calculate total time first
double TotalTimeMs = 0.0;
for( int32 Index = 0; Index < InProfileData.Num(); ++Index )
{
TotalTimeMs += InProfileData[ Index ]->TimingResult;
}
// Assumption: InProfileData contains only one (root) element. Otherwise an extra FVisualizerEvent root event is required.
TSharedPtr< FVisualizerEvent > DummyRoot;
// Recursively create visualizer event data.
TSharedPtr< FVisualizerEvent > StatEvents( CreateVisualizerDataRecursively( InProfileData[0], DummyRoot, 0.0, TotalTimeMs ) );
return StatEvents;
}
#endif
void FGPUProfilerEventNodeFrame::DumpEventTree()
{
if (EventTree.Num() > 0)
{
float RootResult = GetRootTimingResults();
FString ConfigString;
if (GCVarProfileGPU_Root.GetValueOnRenderThread() != TEXT("*"))
{
ConfigString += FString::Printf(TEXT("Root filter: %s "), *GCVarProfileGPU_Root.GetValueOnRenderThread());
}
if (GCVarProfileGPU_ThresholdPercent.GetValueOnRenderThread() > 0.0f)
{
ConfigString += FString::Printf(TEXT("Threshold: %.2f%% "), GCVarProfileGPU_ThresholdPercent.GetValueOnRenderThread());
}
if (ConfigString.Len() > 0)
{
ConfigString = FString(TEXT(", ")) + ConfigString;
}
UE_LOG(LogRHI, Log, TEXT("Perf marker hierarchy, total GPU time %.2fms%s"), RootResult * 1000.0f, *ConfigString);
UE_LOG(LogRHI, Log, TEXT(""));
// Display a warning if this is a GPU profile and the GPU was profiled with v-sync enabled
FText VsyncEnabledWarningText = FText::GetEmpty();
static IConsoleVariable* CVSyncVar = IConsoleManager::Get().FindConsoleVariable(TEXT("r.VSync"));
if (CVSyncVar->GetInt() != 0 && !PlatformDisablesVSync())
{
VsyncEnabledWarningText = LOCTEXT("GpuProfileVsyncEnabledWarning", "WARNING: This GPU profile was captured with v-sync enabled. V-sync wait time may show up in any bucket, and as a result the data in this profile may be skewed. Please profile with v-sync disabled to obtain the most accurate data.");
UE_LOG(LogRHI, Log, TEXT("%s"), *(VsyncEnabledWarningText.ToString()));
}
LogDisjointQuery();
TMap<FString, FGPUProfilerEventNodeStats> EventHistogram;
for (int32 BaseNodeIndex = 0; BaseNodeIndex < EventTree.Num(); BaseNodeIndex++)
{
GatherStatsEventNode(EventTree[BaseNodeIndex], 0, EventHistogram);
}
FString RootWildcardString = GCVarProfileGPU_Root.GetValueOnRenderThread();
FWildcardString RootWildcard(RootWildcardString);
FGPUProfileStatSummary Summary;
for (int32 BaseNodeIndex = 0; BaseNodeIndex < EventTree.Num(); BaseNodeIndex++)
{
float Unused = 0;
DumpStatsEventNode(EventTree[BaseNodeIndex], RootResult, 0, RootWildcard, false, Unused, /*inout*/ Summary);
}
Summary.PrintSummary();
const bool bShowHistogram = GProfileShowEventHistogram.GetValueOnRenderThread() != 0;
if (RootWildcardString == TEXT("*") && bShowHistogram)
{
struct FNodeStatsCompare
{
/** Sorts nodes by descending durations. */
FORCEINLINE bool operator()(const FGPUProfilerEventNodeStats& A, const FGPUProfilerEventNodeStats& B) const
{
return B.TimingResult < A.TimingResult;
}
};
// Sort descending based on node duration
EventHistogram.ValueSort( FNodeStatsCompare() );
// Log stats about the node histogram
UE_LOG(LogRHI, Log, TEXT("Node histogram %u buckets"), EventHistogram.Num());
// bad: reading on render thread but we don't support ECVF_RenderThreadSafe on strings yet
// It's very unlikely to cause a problem as the cvar is only changed by the user.
FString WildcardString = GProfileGPUPatternCVar.GetValueOnRenderThread();
FGPUProfilerEventNodeStats Sum;
const float ThresholdInMS = 5.0f;
if(WildcardString == FString(TEXT("*")))
{
// disable Wildcard functionality
WildcardString.Empty();
}
if(WildcardString.IsEmpty())
{
UE_LOG(LogRHI, Log, TEXT(" r.ProfileGPU.Pattern = '*' (using threshold of %g ms)"), ThresholdInMS);
}
else
{
UE_LOG(LogRHI, Log, TEXT(" r.ProfileGPU.Pattern = '%s' (not using time threshold)"), *WildcardString);
}
FWildcardString Wildcard(WildcardString);
int32 NumNotShown = 0;
for (TMap<FString, FGPUProfilerEventNodeStats>::TIterator It(EventHistogram); It; ++It)
{
const FGPUProfilerEventNodeStats& NodeStats = It.Value();
bool bDump = NodeStats.TimingResult > RootResult * ThresholdInMS;
if(!Wildcard.IsEmpty())
{
// if a Wildcard string was specified, we want to always dump all entries
bDump = Wildcard.IsMatch(*It.Key());
}
if (bDump)
{
UE_LOG(LogRHI, Log, TEXT(" %.2fms %s Events %u Draws %u"), NodeStats.TimingResult, *It.Key(), NodeStats.NumEvents, NodeStats.NumDraws);
Sum += NodeStats;
}
else
{
NumNotShown++;
}
}
UE_LOG(LogRHI, Log, TEXT(" Total %.2fms Events %u Draws %u, %u buckets not shown"),
Sum.TimingResult, Sum.NumEvents, Sum.NumDraws, NumNotShown);
}
#if !UE_BUILD_SHIPPING
// Create and display profile visualizer data
if (RHIConfig::ShouldShowProfilerAfterProfilingGPU())
{
// execute on main thread
{
struct FDisplayProfilerVisualizer
{
void Thread( TSharedPtr<FVisualizerEvent> InVisualizerData, const FText InVsyncEnabledWarningText )
{
static FName ProfileVisualizerModule(TEXT("ProfileVisualizer"));
if (FModuleManager::Get().IsModuleLoaded(ProfileVisualizerModule))
{
IProfileVisualizerModule& ProfileVisualizer = FModuleManager::GetModuleChecked<IProfileVisualizerModule>(ProfileVisualizerModule);
// Display a warning if this is a GPU profile and the GPU was profiled with v-sync enabled (otherwise InVsyncEnabledWarningText is empty)
ProfileVisualizer.DisplayProfileVisualizer( InVisualizerData, TEXT("GPU"), InVsyncEnabledWarningText, FLinearColor::Red );
}
}
} DisplayProfilerVisualizer;
TSharedPtr<FVisualizerEvent> VisualizerData = CreateVisualizerData( EventTree );
DECLARE_CYCLE_STAT(TEXT("FSimpleDelegateGraphTask.DisplayProfilerVisualizer"),
STAT_FSimpleDelegateGraphTask_DisplayProfilerVisualizer,
STATGROUP_TaskGraphTasks);
FSimpleDelegateGraphTask::CreateAndDispatchWhenReady(
FSimpleDelegateGraphTask::FDelegate::CreateRaw(&DisplayProfilerVisualizer, &FDisplayProfilerVisualizer::Thread, VisualizerData, VsyncEnabledWarningText),
GET_STATID(STAT_FSimpleDelegateGraphTask_DisplayProfilerVisualizer), nullptr, ENamedThreads::GameThread
);
}
}
#endif
}
}
void FGPUProfiler::PushEvent(const TCHAR* Name, FColor Color)
{
if (bTrackingEvents)
{
check(StackDepth >= 0);
StackDepth++;
check(IsInRenderingThread() || IsInRHIThread());
if (CurrentEventNode)
{
// Add to the current node's children
CurrentEventNode->Children.Add(CreateEventNode(Name, CurrentEventNode));
CurrentEventNode = CurrentEventNode->Children.Last();
}
else
{
// Add a new root node to the tree
CurrentEventNodeFrame->EventTree.Add(CreateEventNode(Name, NULL));
CurrentEventNode = CurrentEventNodeFrame->EventTree.Last();
}
check(CurrentEventNode);
// Start timing the current node
CurrentEventNode->StartTiming();
}
}
void FGPUProfiler::PopEvent()
{
if (bTrackingEvents)
{
check(StackDepth >= 1);
StackDepth--;
check(CurrentEventNode && (IsInRenderingThread() || IsInRHIThread()));
// Stop timing the current node and move one level up the tree
CurrentEventNode->StopTiming();
CurrentEventNode = CurrentEventNode->Parent;
}
}
/** Whether GPU timing measurements are supported by the driver. */
bool FGPUTiming::GIsSupported = false;
/** Frequency for the timing values, in number of ticks per seconds, or 0 if the feature isn't supported. */
TStaticArray<uint64, MAX_NUM_GPUS> FGPUTiming::GTimingFrequency(InPlace, 0);
/**
* Two timestamps performed on GPU and CPU at nearly the same time.
* This can be used to visualize GPU and CPU timing events on the same timeline.
*/
TStaticArray<FGPUTimingCalibrationTimestamp, MAX_NUM_GPUS> FGPUTiming::GCalibrationTimestamp;
/** Whether the static variables have been initialized. */
bool FGPUTiming::GAreGlobalsInitialized = false;
#else
namespace UE::RHI::GPUProfiler
{
RHI_API FRHIOnProfileGPU OnProfileGPU;
TLockFreePointerListUnordered<void, PLATFORM_CACHE_LINE_SIZE> FEventStream::FChunk::MemoryPool;
static TArray<FEventSink*>& GetSinks()
{
static TArray<FEventSink*> Sinks;
return Sinks;
}
FEventSink::FEventSink()
{
GetSinks().Add(this);
}
FEventSink::~FEventSink()
{
GetSinks().RemoveSingle(this);
}
void ProcessEvents(TArrayView<FEventStream> EventStreams)
{
TRACE_CPUPROFILER_EVENT_SCOPE(UE::RHI::GPUProfiler::ProcessEvents);
TArray<TSharedRef<FEventStream>> SharedStreams;
SharedStreams.Reserve(EventStreams.Num());
for (FEventStream& Stream : EventStreams)
{
if (!Stream.IsEmpty())
{
SharedStreams.Emplace(MakeShared<FEventStream>(MoveTemp(Stream)));
}
}
if (SharedStreams.Num())
{
for (FEventSink* Sink : GetSinks())
{
Sink->ProcessStreams(SharedStreams);
}
}
}
void InitializeQueues(TConstArrayView<FQueue> Queues)
{
for (FEventSink* Sink : GetSinks())
{
Sink->InitializeQueues(Queues);
}
}
#if WITH_PROFILEGPU
template <uint32 Width>
struct TUnicodeHorizontalBar
{
TCHAR Text[Width + 1];
// 0 <= Value <= 1
TUnicodeHorizontalBar(double Value)
{
TCHAR* Output = Text;
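// Split the bar into fully filled cells (Solid), at most one partially filled cell measured in eighths (Partial), and trailing blanks.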
int32 Solid, Partial, Blank;
{
double Integer;
double Remainder = FMath::Modf(FMath::Clamp(Value, 0.0, 1.0) * Width, &Integer);
Solid = (int32)Integer;
Partial = (int32)FMath::Floor(Remainder * 8);
Blank = (Width - Solid - (Partial > 0 ? 1 : 0));
}
// Solid characters
for (int32 Index = 0; Index < Solid; ++Index)
{
*Output++ = TEXT('█');
}
// Partially filled character
if (Partial > 0)
{
static constexpr TCHAR const Data[] = TEXT("▏▎▍▌▋▊▉");
*Output++ = Data[Partial - 1];
}
// Blank Characters to pad out the width
for (int32 Index = 0; Index < Blank; ++Index)
{
*Output++ = TEXT(' ');
}
*Output++ = 0;
check(uintptr_t(Output) == (uintptr_t(Text) + sizeof(Text)));
}
};
struct FNode
{
FString Name;
FNode* Parent = nullptr;
FNode* Next = nullptr;
TArray<FNode*> Children;
struct FStats
{
uint32 NumDraws = 0;
uint32 NumDispatches = 0;
uint32 NumPrimitives = 0;
uint32 NumVertices = 0;
uint64 BusyCycles = 0;
uint64 IdleCycles = 0;
uint64 WaitCycles = 0;
double GetBusyMilliseconds() const
{
return FPlatformTime::ToMilliseconds64(BusyCycles);
}
bool HasWork() const
{
return NumDraws > 0 || NumDispatches > 0;
}
FStats& operator += (FStats const& Stats)
{
NumDraws += Stats.NumDraws;
NumDispatches += Stats.NumDispatches;
NumPrimitives += Stats.NumPrimitives;
NumVertices += Stats.NumVertices;
BusyCycles += Stats.BusyCycles;
IdleCycles += Stats.IdleCycles;
WaitCycles += Stats.WaitCycles;
return *this;
}
FStats& operator += (FEvent::FStats const& Stats)
{
NumDraws += Stats.NumDraws;
NumDispatches += Stats.NumDispatches;
NumPrimitives += Stats.NumPrimitives;
NumVertices += Stats.NumVertices;
return *this;
}
void Accumulate(uint64 Busy, uint64 Wait, uint64 Idle)
{
BusyCycles += Busy;
IdleCycles += Idle;
WaitCycles += Wait;
}
};
// Exclusive stats for this node
FStats Exclusive;
// Sum of stats including all children
FStats Inclusive;
FNode(FString&& Name)
: Name(MoveTemp(Name))
{}
};
struct FTable
{
bool const bUnicodeOutput;
FTable()
: bUnicodeOutput(GCVarProfileGPU_UnicodeOutput.GetValueOnAnyThread())
{}
enum class EColumn : uint32
{
Exclusive_NumDraws,
Exclusive_NumDispatches,
Exclusive_NumPrimitives,
Exclusive_NumVertices,
Exclusive_Percent,
Exclusive_Time,
Inclusive_NumDraws,
Inclusive_NumDispatches,
Inclusive_NumPrimitives,
Inclusive_NumVertices,
Inclusive_Percent,
Inclusive_Time,
Events,
Num
};
uint32 GetColumnMinimumWidth(EColumn Column) const
{
switch (Column)
{
case EColumn::Events:
return 6;
}
return 0;
}
TCHAR const* GetColumnHeader(EColumn Column) const
{
switch (Column)
{
case EColumn::Exclusive_NumDraws:
case EColumn::Inclusive_NumDraws:
return TEXT("Draws");
case EColumn::Exclusive_NumDispatches:
case EColumn::Inclusive_NumDispatches:
return TEXT("Dsptch");
case EColumn::Exclusive_NumPrimitives:
case EColumn::Inclusive_NumPrimitives:
return TEXT("Prim");
case EColumn::Exclusive_NumVertices:
case EColumn::Inclusive_NumVertices:
return TEXT("Vert");
case EColumn::Exclusive_Percent:
case EColumn::Inclusive_Percent:
return TEXT("Percent");
case EColumn::Exclusive_Time:
case EColumn::Inclusive_Time:
return TEXT("Time");
}
return TEXT("");
}
uint32 GetColumnGroup(EColumn Column) const
{
switch (Column)
{
case EColumn::Exclusive_NumDraws:
case EColumn::Exclusive_NumDispatches:
case EColumn::Exclusive_NumPrimitives:
case EColumn::Exclusive_NumVertices:
case EColumn::Exclusive_Percent:
case EColumn::Exclusive_Time:
return 0;
case EColumn::Inclusive_NumDraws:
case EColumn::Inclusive_NumDispatches:
case EColumn::Inclusive_NumPrimitives:
case EColumn::Inclusive_NumVertices:
case EColumn::Inclusive_Percent:
case EColumn::Inclusive_Time:
return 1;
default:
case EColumn::Events:
return 2;
}
}
TCHAR const* GetGroupName(uint32 GroupIndex) const
{
switch (GroupIndex)
{
case 0: return TEXT("Exclusive");
case 1: return TEXT("Inclusive");
case 2: return TEXT("Events");
}
return TEXT("");
}
uint32 NumRows = 0;
TStaticArray<TArray<FString>, uint32(EColumn::Num)> Columns { InPlace };
TArray<bool> RowBreaks;
FString& Col(EColumn Column)
{
return Columns[uint32(Column)].Emplace_GetRef();
}
bool HasRows() const
{
return NumRows > 0;
}
void AddRow(FNode* Root, FNode::FStats const& Inclusive, FNode::FStats const& Exclusive, FString const& Name, uint32 Level)
{
double ExclusivePercent = double(Exclusive.BusyCycles) / Root->Inclusive.BusyCycles;
double InclusivePercent = double(Inclusive.BusyCycles) / Root->Inclusive.BusyCycles;
static constexpr uint32 BarWidth = 8;
TUnicodeHorizontalBar<BarWidth> ExclusiveBar = ExclusivePercent;
TUnicodeHorizontalBar<BarWidth> InclusiveBar = InclusivePercent;
static constexpr TCHAR const BarSeparator[] = TEXT("");
Col(EColumn::Exclusive_NumDraws ) = FString::Printf(TEXT("%d"), Exclusive.NumDraws);
Col(EColumn::Exclusive_NumDispatches) = FString::Printf(TEXT("%d"), Exclusive.NumDispatches);
Col(EColumn::Exclusive_NumPrimitives) = FString::Printf(TEXT("%d"), Exclusive.NumPrimitives);
Col(EColumn::Exclusive_NumVertices ) = FString::Printf(TEXT("%d"), Exclusive.NumVertices);
Col(EColumn::Exclusive_Percent ) = FString::Printf(TEXT("%.1f%%%s%s"), ExclusivePercent * 100.0, bUnicodeOutput ? BarSeparator : TEXT(""), bUnicodeOutput ? ExclusiveBar.Text : TEXT(""));
Col(EColumn::Exclusive_Time ) = FString::Printf(TEXT("%.3f ms"), FPlatformTime::ToMilliseconds64(Exclusive.BusyCycles));
Col(EColumn::Inclusive_NumDraws ) = FString::Printf(TEXT("%d"), Inclusive.NumDraws);
Col(EColumn::Inclusive_NumDispatches) = FString::Printf(TEXT("%d"), Inclusive.NumDispatches);
Col(EColumn::Inclusive_NumPrimitives) = FString::Printf(TEXT("%d"), Inclusive.NumPrimitives);
Col(EColumn::Inclusive_NumVertices ) = FString::Printf(TEXT("%d"), Inclusive.NumVertices);
Col(EColumn::Inclusive_Percent ) = FString::Printf(TEXT("%.1f%%%s%s"), InclusivePercent * 100.0, bUnicodeOutput ? BarSeparator : TEXT(""), bUnicodeOutput ? InclusiveBar.Text : TEXT(""));
Col(EColumn::Inclusive_Time ) = FString::Printf(TEXT("%.3f ms"), FPlatformTime::ToMilliseconds64(Inclusive.BusyCycles));
static constexpr uint32 SpacesPerIndent = 3;
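// A positive %*s field width left-pads the name with spaces, producing the per-level tree indentation.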
Col(EColumn::Events) = FString::Printf(TEXT("%*s"), Name.Len() + (Level * SpacesPerIndent), *Name);
// Insert a horizontal rule before each root level row.
RowBreaks.Add(Level == 0);
NumRows++;
}
struct FChars
{
TCHAR const* Left;
TCHAR const* GroupSeparator;
TCHAR const* LastGroupSeparator;
TCHAR const* Right;
TCHAR const* CellSeparator;
};
struct FFormat
{
TCHAR const* LineMajor;
TCHAR const* LineMinor;
FChars const TopRow;
FChars const GroupNameRow;
FChars const GroupBorderRow;
FChars const ValueRow;
FChars const DividorRow;
FChars const BottomRow;
};
FString ToString() const
{
if (bUnicodeOutput)
{
static constexpr FFormat Unicode =
{
.LineMajor = TEXT(""),
.LineMinor = TEXT(""),
// Left GrpSep LastGrp Right CellSep
.TopRow { TEXT(""), TEXT(""), TEXT(""), TEXT(""), TEXT(" ") },
.GroupNameRow { TEXT(""), TEXT(""), TEXT(""), TEXT(""), TEXT(" ") },
.GroupBorderRow{ TEXT(""), TEXT(""), TEXT(""), TEXT(""), TEXT("") },
.ValueRow { TEXT(""), TEXT(""), TEXT(""), TEXT(""), TEXT("") },
.DividorRow { TEXT(""), TEXT(""), TEXT(""), TEXT(""), TEXT("") },
.BottomRow { TEXT(""), TEXT(""), TEXT(""), TEXT(""), TEXT("") },
};
return ToStringInner(Unicode);
}
else
{
static constexpr FFormat Ascii =
{
.LineMajor = TEXT("-"),
.LineMinor = TEXT("-"),
// Left GrpSep LastGrp Right CellSep
.TopRow { TEXT(" +"), TEXT("+"), TEXT("+"), TEXT("+"), TEXT(" ") },
.GroupNameRow { TEXT(" |"), TEXT("|"), TEXT("|"), TEXT("|"), TEXT(" ") },
.GroupBorderRow{ TEXT(" +"), TEXT("+"), TEXT("+"), TEXT("|"), TEXT("+") },
.ValueRow { TEXT(" |"), TEXT("|"), TEXT("|"), TEXT("|"), TEXT("|") },
.DividorRow { TEXT(" +"), TEXT("+"), TEXT("+"), TEXT("+"), TEXT("+") },
.BottomRow { TEXT(" +"), TEXT("+"), TEXT("+"), TEXT("+"), TEXT("+") },
};
return ToStringInner(Ascii);
}
}
FString ToStringInner(FFormat const& Format) const
{
struct FGroup { uint32 Index, Width; };
struct FColumn { uint32 Index, Width; };
static constexpr uint32 NumGroups = 3;
static constexpr uint32 CellPadding = 1;
// Auto-size column widths to their contents
TStaticArray<int32, uint32(EColumn::Num)> ColumnWidths{ InPlace, 0 };
for (uint32 ColumnIndex = 0; ColumnIndex < uint32(EColumn::Num); ++ColumnIndex)
{
if (Columns[ColumnIndex].Num() == 0)
continue;
check(Columns[ColumnIndex].Num() == NumRows);
int32& Width = ColumnWidths[ColumnIndex];
// Auto-size column width
Width = GetColumnMinimumWidth(EColumn(ColumnIndex));
Width = FMath::Max(Width, FCString::Strlen(GetColumnHeader(EColumn(ColumnIndex))));
for (FString const& Cell : Columns[ColumnIndex])
{
Width = FMath::Max(Width, Cell.Len());
}
}
FString Result;
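// EmitGroupRow accumulates the widths of visible columns per column group and invokes the callback once per group.
// EmitValueRow invokes the callback once per visible column, inserting group or cell separators between columns.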
auto EmitGroupRow = [&](FChars const& Chars, TUniqueFunction<void(FGroup)> GroupCallback)
{
uint32 const CellSeparatorLength = FCString::Strlen(Chars.CellSeparator);
Result += Chars.Left;
uint32 GroupWidth = 0;
uint32 GroupIndex = 0;
for (uint32 ColumnIndex = 0; ColumnIndex < uint32(EColumn::Num); ++ColumnIndex)
{
if (Columns[ColumnIndex].Num() == 0)
continue;
GroupWidth += ColumnWidths[ColumnIndex] + CellPadding * 2;
GroupIndex = GetColumnGroup(EColumn(ColumnIndex));
if (GroupIndex != GetColumnGroup(EColumn(ColumnIndex + 1)))
{
// Group Change
GroupCallback({ GroupIndex, GroupWidth });
// Add the group separator character
Result += GroupIndex < NumGroups - 2
? Chars.GroupSeparator
: Chars.LastGroupSeparator;
GroupWidth = 0;
}
else if (ColumnIndex < uint32(EColumn::Num) - 1)
{
// Same group. Count the (missing) cell division
GroupWidth += CellSeparatorLength;
}
}
// Emit final group
GroupCallback({ GroupIndex, GroupWidth });
// Close the row
Result += Chars.Right;
Result += TEXT("\n");
};
auto EmitValueRow = [&](FChars const& Chars, TUniqueFunction<void(FColumn)> CellCallback)
{
Result += Chars.Left;
for (uint32 ColumnIndex = 0; ColumnIndex < uint32(EColumn::Num); ++ColumnIndex)
{
if (Columns[ColumnIndex].Num() == 0)
continue;
CellCallback({ ColumnIndex, ColumnWidths[ColumnIndex] + (CellPadding * 2) });
if (ColumnIndex < uint32(EColumn::Num) - 1)
{
uint32 GroupIndex = GetColumnGroup(EColumn(ColumnIndex));
if (GroupIndex != GetColumnGroup(EColumn(ColumnIndex + 1)))
{
// Group change, add the group separator
Result += GroupIndex < NumGroups - 2
? Chars.GroupSeparator
: Chars.LastGroupSeparator;
}
else
{
// Same group, add the cell separator
Result += Chars.CellSeparator;
}
}
}
// Close the row
Result += Chars.Right;
Result += TEXT("\n");
};
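// Centers Str within Width columns; when the padding is odd, the extra space goes on the right.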
auto AlignCenter = [&](TCHAR const* Str, uint32 Width)
{
int32 PaddingLeft = FMath::Max(0, int32(Width) - FCString::Strlen(Str));
int32 PaddingRight = (PaddingLeft / 2) + (PaddingLeft & 1);
PaddingLeft /= 2;
Result += FString::Printf(TEXT("%*s%s%*s"), PaddingLeft, TEXT(""), Str, PaddingRight, TEXT(""));
};
// Top Border
EmitGroupRow(Format.TopRow, [&](FGroup Group)
{
while (Group.Width--)
{
Result += Format.LineMajor;
}
});
// Exclusive / Inclusive Group Row
EmitGroupRow(Format.GroupNameRow, [&](FGroup Group)
{
TCHAR const* Str = Group.Index != GetColumnGroup(EColumn::Events)
? GetGroupName(Group.Index)
: TEXT("");
AlignCenter(Str, Group.Width);
});
// Events Group Row
EmitValueRow(Format.GroupBorderRow, [&](FColumn Column)
{
if (Column.Index == uint32(EColumn::Events))
{
AlignCenter(GetGroupName(GetColumnGroup(EColumn::Events)), Column.Width);
}
else
{
while (Column.Width--)
{
Result += Format.LineMinor;
}
}
});
// Header Row
EmitValueRow(Format.ValueRow, [&](FColumn Column)
{
AlignCenter(GetColumnHeader(EColumn(Column.Index)), Column.Width);
});
// Header Border Row
EmitValueRow(Format.DividorRow, [&](FColumn Column)
{
while (Column.Width--)
{
Result += Format.LineMinor;
}
});
// Value rows
for (uint32 RowIndex = 0; RowIndex < NumRows; ++RowIndex)
{
if (RowIndex > 0 && RowBreaks[RowIndex])
{
// Add a horizontal rule
EmitValueRow(Format.DividorRow, [&](FColumn Column)
{
while (Column.Width--)
{
Result += Format.LineMinor;
}
});
}
EmitValueRow(Format.ValueRow, [&](FColumn Column)
{
int32 Width = Column.Width - (CellPadding * 2);
if (EColumn(Column.Index) == EColumn::Events)
{
Width = -Width; // Align left
}
FString const& Cell = Columns[Column.Index][RowIndex];
Result += FString::Printf(TEXT("%*s%*s%*s")
, CellPadding, TEXT("")
, Width, *Cell
, CellPadding, TEXT(""));
});
}
// Bottom Border
EmitValueRow(Format.BottomRow, [&](FColumn Column)
{
while (Column.Width--)
{
Result += Format.LineMajor;
}
});
return Result;
}
};
#endif
#if HAS_GPU_STATS
// Per queue GPU stats
// Total busy time on the current queue. StatName == "Unaccounted" is used by the Csv profiler
static FGPUStat GPUStat_Total(TEXT("Unaccounted"), TEXT("Queue Total"));
#endif
#if STATS
TCHAR const* FGPUStat::GetTypeString(EType Type)
{
switch (Type)
{
default: checkNoEntry(); [[fallthrough]];
case EType::Busy: return TEXT("Busy");
case EType::Wait: return TEXT("Wait");
case EType::Idle: return TEXT("Idle");
}
}
FString FGPUStat::GetIDString(FQueue Queue, bool bFriendly)
{
if (bFriendly)
{
return FString::Printf(TEXT("GPU %d %s Queue %d")
, Queue.GPU
, Queue.GetTypeString()
, Queue.Index
);
}
else
{
return FString::Printf(TEXT("GPU%d_%s%d")
, Queue.GPU
, Queue.GetTypeString()
, Queue.Index
);
}
}
FGPUStat::FStatInstance::FInner& FGPUStat::GetStatInstance(FQueue Queue, EType Type)
{
FStatInstance& Instance = Instances.FindOrAdd(Queue);
switch (Type)
{
default: checkNoEntry(); [[fallthrough]];
case EType::Busy: return Instance.Busy;
case EType::Wait: return Instance.Wait;
case EType::Idle: return Instance.Idle;
}
}
TMap<FQueue, TUniquePtr<FGPUStat::FStatCategory>> FGPUStat::FStatCategory::Categories;
FGPUStat::FStatCategory::FStatCategory(FQueue Queue)
: GroupName(FString::Printf(TEXT("STATGROUP_%s"), *GetIDString(Queue, false)))
, GroupDesc(FString::Printf(TEXT("%s Timing"), *GetIDString(Queue, true)))
{}
TStatId FGPUStat::GetStatId(FQueue Queue, EType Type)
{
FStatInstance::FInner& Instance = GetStatInstance(Queue, Type);
if (!Instance.Stat)
{
TUniquePtr<FStatCategory>& Category = FStatCategory::Categories.FindOrAdd(Queue);
if (!Category)
{
Category = MakeUnique<FStatCategory>(Queue);
}
// Encode the stat type in the FName number
Instance.StatName = FName(*FString::Printf(TEXT("STAT_%s_%s"), *GetIDString(Queue, false), DisplayName), int32(Type));
Instance.Stat = MakeUnique<FDynamicStat>(
Instance.StatName,
DisplayName,
*Category->GroupName,
FStatNameAndInfo::GpuStatCategory,
*Category->GroupDesc,
true, // IsDefaultEnabled
true, // IsClearEveryFrame
EStatDataType::ST_double,
false, // IsCycleStat
false, // SortByName
FPlatformMemory::MCR_Invalid
);
}
return Instance.Stat->GetStatId();
}
#endif
// Handles computing the "stat unit" GPU time, "stat gpu" stats, and "profilegpu".
struct FGPUProfilerSink_StatSystem final : public FEventSink
{
class FTimestampStream
{
private:
TArray<uint64> Values;
public:
struct FState
{
FTimestampStream const& Stream;
int32 TimestampIndex = 0;
uint64 BusyCycles = 0;
FState(FTimestampStream const& Stream)
: Stream(Stream)
{}
uint64 GetCurrentTimestamp (uint64 Anchor) const { return Stream.Values[TimestampIndex] - Anchor; }
uint64 GetPreviousTimestamp(uint64 Anchor) const { return Stream.Values[TimestampIndex - 1] - Anchor; }
bool HasMoreTimestamps() const { return TimestampIndex < Stream.Values.Num(); }
bool IsStartingWork () const { return (TimestampIndex & 0x01) == 0x00; }
void AdvanceTimestamp () { TimestampIndex++; }
};
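// Appends a begin (bBegin == true) or end timestamp. A begin that does not advance past the previous end means
// the GPU never went idle, so the adjacent busy ranges are merged by dropping both markers.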
void AddTimestamp(uint64 Value, bool bBegin)
{
if (bBegin)
{
if (!Values.IsEmpty() && Value <= Values.Last())
{
//
// The Begin TOP event is sooner than the last End BOP event.
// The markers overlap, and the GPU was not idle.
//
// Remove the previous End event, and discard this Begin event.
//
Values.RemoveAt(Values.Num() - 1, EAllowShrinking::No);
}
else
{
// GPU was idle. Keep this timestamp.
Values.Add(Value);
}
}
else
{
Values.Add(Value);
}
}
static uint64 ComputeUnion(TArrayView<FTimestampStream::FState> Streams)
{
// The total number of cycles where at least one GPU pipe was busy.
uint64 UnionBusyCycles = 0;
uint64 LastMinCycles = 0;
int32 BusyPipes = 0;
bool bFirst = true;
uint64 Anchor = 0; // @todo - handle possible timestamp wraparound
// Process the time ranges from each pipe.
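// Sweep all streams in timestamp order. Even indices open a busy range and odd indices close one, so BusyPipes
// tracks how many pipes are busy at any moment and only the union of the ranges is accumulated.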
while (true)
{
// Find the next minimum timestamp
FTimestampStream::FState* NextMin = nullptr;
for (auto& Current : Streams)
{
if (Current.HasMoreTimestamps() && (!NextMin || Current.GetCurrentTimestamp(Anchor) < NextMin->GetCurrentTimestamp(Anchor)))
{
NextMin = &Current;
}
}
if (!NextMin)
break; // No more timestamps to process
if (!bFirst)
{
if (BusyPipes > 0 && NextMin->GetCurrentTimestamp(Anchor) > LastMinCycles)
{
// Accumulate the union busy time across all pipes
UnionBusyCycles += NextMin->GetCurrentTimestamp(Anchor) - LastMinCycles;
}
if (!NextMin->IsStartingWork())
{
// Accumulate the busy time for this pipe specifically.
NextMin->BusyCycles += NextMin->GetCurrentTimestamp(Anchor) - NextMin->GetPreviousTimestamp(Anchor);
}
}
LastMinCycles = NextMin->GetCurrentTimestamp(Anchor);
BusyPipes += NextMin->IsStartingWork() ? 1 : -1;
check(BusyPipes >= 0);
NextMin->AdvanceTimestamp();
bFirst = false;
}
check(BusyPipes == 0);
return UnionBusyCycles;
}
};
struct FStatState
{
struct
{
uint64 BusyCycles = 0;
uint64 IdleCycles = 0;
uint64 WaitCycles = 0;
void Accumulate(uint64 Busy, uint64 Wait, uint64 Idle)
{
BusyCycles += Busy;
IdleCycles += Idle;
WaitCycles += Wait;
}
} Exclusive, Inclusive;
FStatState() = default;
FStatState(FStatState const&) = default;
FStatState(FStatState&& Other)
: FStatState(Other)
{
Other.Exclusive = {};
Other.Inclusive = {};
}
#if HAS_GPU_STATS
void EmitResults(FQueue Queue, FGPUStat& GPUStat
#if STATS
, FEndOfPipeStats* Stats
#endif
#if CSV_PROFILER_STATS
, FCsvProfiler* CsvProfiler
#endif
) const
{
#if STATS
Stats->AddMessage(GPUStat.GetStatId(Queue, FGPUStat::EType::Busy).GetName(), EStatOperation::Set, FPlatformTime::ToMilliseconds64(Inclusive.BusyCycles));
Stats->AddMessage(GPUStat.GetStatId(Queue, FGPUStat::EType::Idle).GetName(), EStatOperation::Set, FPlatformTime::ToMilliseconds64(Inclusive.IdleCycles));
Stats->AddMessage(GPUStat.GetStatId(Queue, FGPUStat::EType::Wait).GetName(), EStatOperation::Set, FPlatformTime::ToMilliseconds64(Inclusive.WaitCycles));
#endif
#if CSV_PROFILER_STATS
if (CsvProfiler && Queue.Type == FQueue::EType::Graphics && Queue.Index == 0)
{
if (!GPUStat.CsvStat.IsSet())
{
static TArray<TUniquePtr<FCsvCategory>> CsvGPUCategories;
if (!CsvGPUCategories.IsValidIndex(Queue.GPU))
{
CsvGPUCategories.SetNum(Queue.GPU + 1);
}
TUniquePtr<FCsvCategory>& Category = CsvGPUCategories[Queue.GPU];
if (!Category)
{
Category = Queue.GPU > 0
? MakeUnique<FCsvCategory>(*FString::Printf(TEXT("GPU%d"), Queue.GPU + 1), true)
: MakeUnique<FCsvCategory>(TEXT("GPU"), true);
}
GPUStat.CsvStat.Emplace(GPUStat.StatName, Category->Index);
}
uint64 TotalCycles = Exclusive.BusyCycles + Exclusive.WaitCycles;
CsvProfiler->RecordEndOfPipeCustomStat(GPUStat.CsvStat->Name, GPUStat.CsvStat->CategoryIndex, FPlatformTime::ToMilliseconds64(TotalCycles), ECsvCustomStatOp::Set);
}
#endif
}
#endif
};
struct FQueueTimestamps
{
FTimestampStream Queue;
FStatState WholeQueueStat;
uint64 CPUFrameBoundary = 0;
// Used to override the GPU time calculation for this queue, if an FFrameTime event is in the stream
TOptional<uint64> TotalBusyCycles;
#if WITH_RHI_BREADCRUMBS
TMap<FRHIBreadcrumbData_Stats, FStatState> Stats;
#endif
};
struct FResolvedWait
{
uint64 GPUTimestampTOP = 0;
uint64 CPUTimestamp = 0;
};
struct FResolvedSignal
{
uint64 GPUTimestampBOP = 0;
uint64 Value = 0;
};
struct FFrameState : TMap<FQueue, FQueueTimestamps>
{
#if STATS
TOptional<int64> StatsFrame;
#endif
};
struct FQueueState
{
FQueue const Queue;
TSpscQueue<FEventSink::FIterator> PendingStreams;
// Array of fence signal history. Events are kept until all queues have processed events
// later than the CPU timestamps of these signals. The old events are then trimmed.
TArray<FResolvedSignal> Signals;
// The value of the latest signaled fence on this queue.
FResolvedSignal MaxSignal;
// The GPU timestamp of the last event processed.
uint64 LastGPUCycles = 0;
FQueueTimestamps Timestamps;
bool bBusy = false;
bool bWasTraced = false;
#if WITH_RHI_BREADCRUMBS
TMap<FRHIBreadcrumbData_Stats, int32> ActiveStats;
TArray<FRHIBreadcrumbData_Stats> ActiveStatsStack;
FRHIBreadcrumbNode* Breadcrumb = nullptr;
#endif
#if WITH_PROFILEGPU
struct
{
TArray<TUniquePtr<FNode>> Nodes;
FNode* Current = nullptr;
FNode* Prev = nullptr;
FNode* First = nullptr;
bool bProfileFrame = false;
void PushNode(FString&& Name)
{
FNode* Parent = Current;
Current = Nodes.Emplace_GetRef(MakeUnique<FNode>(MoveTemp(Name))).Get();
Current->Parent = Parent;
if (!First)
{
First = Current;
}
if (Parent)
{
Parent->Children.Add(Current);
}
if (Prev)
{
Prev->Next = Current;
}
Prev = Current;
}
void PopNode()
{
check(Current && Current->Parent);
Current = Current->Parent;
}
void LogTree(FQueueState const& QueueState, uint32 FrameNumber) const
{
FTable Table;
EGPUProfileSortMode SortMode = (EGPUProfileSortMode)FMath::Clamp(GCVarProfileGPU_Sort.GetValueOnAnyThread(), 0, ((int32)EGPUProfileSortMode::Max - 1));
FWildcardString RootWildcard(GCVarProfileGPU_Root.GetValueOnAnyThread());
const bool bShowEmptyNodes = GCVarProfileGPU_ShowLeafEvents.GetValueOnAnyThread();
const double PercentThreshold = FMath::Clamp(GCVarProfileGPU_ThresholdPercent.GetValueOnAnyThread(), 0.0f, 100.0f);
if (SortMode != EGPUProfileSortMode::Chronological)
{
for (FNode* Node = First; Node; Node = Node->Next)
{
Node->Children.Sort([SortMode](FNode const& A, FNode const& B)
{
switch (SortMode)
{
default:
case EGPUProfileSortMode::TimeElapsed: return B.Inclusive.BusyCycles < A.Inclusive.BusyCycles;
case EGPUProfileSortMode::NumPrims : return B.Inclusive.NumPrimitives < A.Inclusive.NumPrimitives;
case EGPUProfileSortMode::NumVerts : return B.Inclusive.NumVertices < A.Inclusive.NumVertices;
}
});
}
}
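// Depth-first walk that emits one table row per visible node and returns whether the node was displayed,
// allowing hidden children to be folded into a single "Other" row by the caller.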
auto Recurse = [&](auto& Recurse, FNode* Root, FNode* CurrentNode, bool bParentMatchedFilter, int32 Level) -> bool
{
// Percent that this node was of the total frame time
const double Percent = Root ? (CurrentNode->Inclusive.GetBusyMilliseconds() / Root->Inclusive.GetBusyMilliseconds()) * 100.0 : 100.0;
// Filter nodes according to cvar settings
const bool bAboveThreshold = Percent >= PercentThreshold;
const bool bNameMatches = bParentMatchedFilter || RootWildcard.IsMatch(CurrentNode->Name);
const bool bHasWork = bShowEmptyNodes || CurrentNode->Inclusive.HasWork();
const bool bDisplayEvent = bNameMatches && bHasWork && bAboveThreshold;
if (bDisplayEvent)
{
if (Root == nullptr)
{
Root = CurrentNode;
}
Table.AddRow(
Root,
CurrentNode->Inclusive,
CurrentNode->Exclusive,
CurrentNode->Name,
Level
);
}
FNode::FStats OtherChildrenInclusive;
FNode::FStats OtherChildrenExclusive;
uint32 NumHiddenChildren = 0;
for (FNode* Child : CurrentNode->Children)
{
bool bChildShown = Recurse(Recurse, Root, Child, bDisplayEvent, bDisplayEvent ? Level + 1 : Level);
if (!bChildShown)
{
OtherChildrenInclusive += Child->Inclusive;
OtherChildrenExclusive += Child->Exclusive;
NumHiddenChildren++;
}
}
if (bDisplayEvent && NumHiddenChildren > 0)
{
// Don't show the "other children" node if their total inclusive time is still below the percent threshold
if ((double(OtherChildrenInclusive.BusyCycles) / Root->Inclusive.BusyCycles) >= PercentThreshold)
{
Table.AddRow(
Root,
OtherChildrenInclusive,
OtherChildrenExclusive,
FString::Printf(TEXT("%d Other %s"), NumHiddenChildren, NumHiddenChildren >= 2 ? TEXT("Children") : TEXT("Child")),
Level + 1
);
}
}
return bDisplayEvent;
};
// Skip building the table if there was no useful work
if (First && First->Inclusive.BusyCycles > 0)
{
Recurse(Recurse, nullptr, First, false, 0);
}
FString Final = FString::Printf(
TEXT("\n")
TEXT("GPU Profile for Frame %d - GPU %d - %s %d\n")
TEXT("\n")
TEXT(" - %-30s: %.2fms\n")
TEXT(" - %-30s: \"%s\"\n")
TEXT(" - %-30s: %.2f%%\n")
TEXT(" - %-30s: %s\n")
TEXT("\n")
TEXT("%s")
, FrameNumber
, QueueState.Queue.GPU
, QueueState.Queue.GetTypeString()
, QueueState.Queue.Index
, TEXT("Frame Time")
, First ? First->Inclusive.GetBusyMilliseconds() : 0.0
, *IConsoleManager::Get().FindConsoleObjectName(GCVarProfileGPU_Root.AsVariable())
, *RootWildcard
, *IConsoleManager::Get().FindConsoleObjectName(GCVarProfileGPU_ThresholdPercent.AsVariable())
, PercentThreshold
, *IConsoleManager::Get().FindConsoleObjectName(GCVarProfileGPU_ShowLeafEvents.AsVariable())
, bShowEmptyNodes ? TEXT("true") : TEXT("false")
, Table.HasRows() ? *Table.ToString() : TEXT(" No recorded work for this queue.\n")
);
TArray<FString> Lines;
Final.ParseIntoArrayLines(Lines, false);
for (FString const& Line : Lines)
{
UE_LOG(LogRHI, Display, TEXT("%s"), *Line);
}
}
} Profile;
#endif
FQueueState(FQueue const& Queue)
: Queue(Queue)
{}
void ResolveSignal(FEvent::FSignalFence const& Event)
{
FResolvedSignal& Result = Signals.Emplace_GetRef();
//
// Take the max between the previous GPU EndWork event and the CPU timestamp. The signal cannot have happened on the GPU until the CPU has submitted the command to the driver.
//
// An example would be a GPU queue that completes work and goes idle at time T. Later, the CPU issues a Signal without other prior work at time T + 100ms.
// The fence signal cannot have happened until time T + 100ms because the CPU hadn't instructed the GPU to do so until then.
// LastGPUCycles would still be set to time T, since that was the time of the preceding EndWork event.
//
Result.GPUTimestampBOP = FMath::Max(LastGPUCycles, Event.CPUTimestamp);
Result.Value = Event.Value;
FGpuProfilerTrace::SignalFence(Queue.Value, Result.GPUTimestampBOP, Event.Value);
//
// Fence signals *MUST* be sequential, to remove ambiguity caused by trimming the Signals array.
//
// To explain why, assume non-sequential signals are allowed, and consider the following example events on an arbitrary queue:
//
// [Signal 2]
// -- Frame Boundary --
// [Signal 4]
//
// Assume, after trimming events earlier than the frame boundary, that only [Signal 4] remains in the Signals array.
// Then, some other queue attempts to [Wait 3]. We need to compute when [Wait 3] is resolved with only the information about [Signal 4].
//
// Given that fences resolve waits as soon as the signalled value is >= the wait value, we could assume the fence was resolved at [Signal 4].
// However, we don't know if the fence was already signalled to value 3 before the frame boundary and the trimming.
//
// Without this information, it is ambiguous whether [Wait 3] is already resolved by a [Signal 3] before the frame boundary that is no longer
// in the Signals array, or won't be resolved until [Signal 4]. We could have had this sequence of events:
//
// [Signal 2]
// [Signal 3]
// -- Frame Boundary --
// [Signal 4]
//
// Requiring that fences are always signalled in sequential order solves this.
// If the awaited value is less than the first Signal, the fence has already been signalled before the frame boundary.
//
checkf(Result.Value == MaxSignal.Value + 1, TEXT("Fence signals must be sequential. Result.Value: %llu, MaxSignal.Value + 1: %llu"), Result.Value, (MaxSignal.Value + 1));
// Signals should always advance in time
checkf(Result.GPUTimestampBOP >= MaxSignal.GPUTimestampBOP, TEXT("Signals should always advance in time. Result.GPUTimestampBOP: %llu, MaxSignal.GPUTimestampBOP: %llu"), Result.GPUTimestampBOP, MaxSignal.GPUTimestampBOP);
MaxSignal = Result;
}
void AccumulateTime(uint64 Busy, uint64 Wait, uint64 Idle)
{
#if WITH_RHI_BREADCRUMBS
// Apply the timings to all active stats
for (auto const& [Stat, RefCount] : ActiveStats)
{
FStatState& State = Timestamps.Stats.FindChecked(Stat);
State.Inclusive.Accumulate(Busy, Wait, Idle);
if (ActiveStatsStack.Num() > 0 && ActiveStatsStack.Last() == Stat)
{
State.Exclusive.Accumulate(Busy, Wait, Idle);
}
}
if (ActiveStatsStack.Num() == 0)
#endif
{
Timestamps.WholeQueueStat.Exclusive.Accumulate(Busy, Wait, Idle);
}
Timestamps.WholeQueueStat.Inclusive.Accumulate(Busy, Wait, Idle);
#if WITH_PROFILEGPU
for (FNode* Node = Profile.Current; Node; Node = Node->Parent)
{
Node->Inclusive.Accumulate(Busy, Wait, Idle);
if (Node == Profile.Current)
{
Node->Exclusive.Accumulate(Busy, Wait, Idle);
}
}
#endif
}
void BeginWork(FEvent::FBeginWork const& Event)
{
Timestamps.Queue.AddTimestamp(Event.GPUTimestampTOP, true);
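// Any gap between the last recorded GPU activity and the CPU submitting this work counts as idle time.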
uint64 Idle = Event.CPUTimestamp > LastGPUCycles
? Event.CPUTimestamp - LastGPUCycles
: 0;
AccumulateTime(0, 0, Idle);
FGpuProfilerTrace::BeginWork(Queue.Value, Event.GPUTimestampTOP, Event.CPUTimestamp);
LastGPUCycles = FMath::Max(LastGPUCycles, Event.GPUTimestampTOP);
}
void EndWork(FEvent::FEndWork const& Event)
{
Timestamps.Queue.AddTimestamp(Event.GPUTimestampBOP, false);
uint64 Busy = Event.GPUTimestampBOP > LastGPUCycles
? Event.GPUTimestampBOP - LastGPUCycles
: 0;
AccumulateTime(Busy, 0, 0);
FGpuProfilerTrace::EndWork(Queue.Value, Event.GPUTimestampBOP);
LastGPUCycles = FMath::Max(LastGPUCycles, Event.GPUTimestampBOP);
}
#if WITH_RHI_BREADCRUMBS
void BeginBreadcrumb(FEvent::FBeginBreadcrumb const& Event)
{
uint64 Busy = Event.GPUTimestampTOP > LastGPUCycles
? Event.GPUTimestampTOP - LastGPUCycles
: 0;
AccumulateTime(Busy, 0, 0);
LastGPUCycles = FMath::Max(LastGPUCycles, Event.GPUTimestampTOP);
FRHIBreadcrumbData_Stats const& Stat = Event.Breadcrumb->Data;
if (Stat.ShouldComputeStat())
{
// Disregard the stat if it is nested within itself (i.e. it's already in the ActiveStats map with a non-zero ref count).
// Only the outermost stat counts the busy time; otherwise we'd be double-counting the nested time.
int32 RefCount = ActiveStats.FindOrAdd(Stat)++;
if (RefCount == 0)
{
Timestamps.Stats.FindOrAdd(Stat);
}
ActiveStatsStack.Add(Stat);
}
Breadcrumb = Event.Breadcrumb;
Breadcrumb->TraceBeginGPU(Queue.Value, Event.GPUTimestampTOP);
#if WITH_PROFILEGPU
if (Profile.bProfileFrame)
{
FRHIBreadcrumb::FBuffer Buffer;
const TCHAR* Name = Event.Breadcrumb->GetTCHAR(Buffer);
// Push a new node
Profile.PushNode(Name);
}
#endif
}
void EndBreadcrumb(FEvent::FEndBreadcrumb const& Event)
{
uint64 Busy = Event.GPUTimestampBOP > LastGPUCycles
? Event.GPUTimestampBOP - LastGPUCycles
: 0;
AccumulateTime(Busy, 0, 0);
LastGPUCycles = FMath::Max(LastGPUCycles, Event.GPUTimestampBOP);
FRHIBreadcrumbData_Stats const& Stat = Event.Breadcrumb->Data;
if (Stat.ShouldComputeStat())
{
// Pop the stat when the refcount hits zero.
int32 RefCount = --ActiveStats.FindChecked(Stat);
if (RefCount == 0)
{
ActiveStats.FindAndRemoveChecked(Stat);
}
check(ActiveStatsStack.Last() == Stat);
ActiveStatsStack.RemoveAt(ActiveStatsStack.Num() - 1, EAllowShrinking::No);
}
Breadcrumb->TraceEndGPU(Queue.Value, Event.GPUTimestampBOP);
Breadcrumb = Event.Breadcrumb->GetParent();
#if WITH_PROFILEGPU
if (Profile.bProfileFrame)
{
Profile.PopNode();
}
#endif
}
#endif
void Stats(FEvent::FStats const& Event)
{
#if WITH_PROFILEGPU
if (Profile.Current)
{
Profile.Current->Exclusive += Event;
for (FNode* Node = Profile.Current; Node; Node = Node->Parent)
{
Node->Inclusive += Event;
}
}
#endif
FGpuProfilerTrace::Stats(Queue.Value, Event.NumDraws, Event.NumPrimitives);
}
void Wait(FResolvedWait const& ResolvedWait, const FEvent::FWaitFence& WaitFence)
{
// Time the queue was idle between the last EndWork event and the Wait command being submitted to the GPU driver.
uint64 Idle = ResolvedWait.CPUTimestamp > LastGPUCycles
? ResolvedWait.CPUTimestamp - LastGPUCycles
: 0;
uint64 WaitStart = FMath::Max(ResolvedWait.CPUTimestamp, LastGPUCycles);
FGpuProfilerTrace::WaitFence(Queue.Value, ResolvedWait.GPUTimestampTOP, WaitFence.Queue.Value, WaitFence.Value);
// Time the queue spent waiting for the fence to signal on another queue.
uint64 Wait = 0;
if (ResolvedWait.GPUTimestampTOP > WaitStart)
{
Wait = ResolvedWait.GPUTimestampTOP - WaitStart;
FGpuProfilerTrace::TraceWait(Queue.Value, WaitStart, ResolvedWait.GPUTimestampTOP);
}
// Bring the last GPU busy end time forwards to where the wait is resolved.
LastGPUCycles = ResolvedWait.GPUTimestampTOP;
AccumulateTime(0, Wait, Idle);
}
void TrimSignals(uint64 CPUTimestamp)
{
// Remove all signals that occurred on the GPU timeline before this frame boundary on the CPU.
int32 Index = Algo::LowerBoundBy(Signals, CPUTimestamp, [](FResolvedSignal const& Signal) { return Signal.GPUTimestampBOP; });
if (Index >= 0)
{
Signals.RemoveAt(0, Index, EAllowShrinking::No);
}
}
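// Records the whole-frame busy time reported directly by the platform RHI for this queue.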
void FrameTime(uint64 TotalGPUTime)
{
Timestamps.TotalBusyCycles = TotalGPUTime;
}
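// Handles a frame boundary on this queue: emits the ProfileGPU log if a capture just
// finished, moves the accumulated timestamps into the frame state, and primes the node tree
// (including the currently open breadcrumb stack) if the next frame is to be profiled.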
void FrameBoundary(FEvent::FFrameBoundary const& Event, FFrameState& FrameState, uint32 FrameNumber)
{
check(!bBusy);
Timestamps.CPUFrameBoundary = Event.CPUTimestamp;
FGpuProfilerTrace::FrameBoundary(Queue.Value, Event.FrameNumber);
#if WITH_PROFILEGPU
if (Profile.bProfileFrame)
{
Profile.LogTree(*this, Event.FrameNumber);
Profile = {};
}
#endif
FrameState.Emplace(Queue, MoveTemp(Timestamps));
#if WITH_RHI_BREADCRUMBS
// Reinsert timestamp streams for the currently active stats on
// this queue, since these were moved into the frame state above.
for (auto& [Stat, RefCount] : ActiveStats)
{
Timestamps.Stats.FindOrAdd(Stat);
}
#endif
#if WITH_PROFILEGPU
if (FrameNumber == Event.FrameNumber + 1)
{
Profile.bProfileFrame = true;
// Build the node tree
Profile.PushNode(TEXT("<root>"));
#if WITH_RHI_BREADCRUMBS
auto Recurse = [&](auto& Recurse, FRHIBreadcrumbNode* Current) -> void
{
if (!Current)
{
return;
}
Recurse(Recurse, Current->GetParent());
FRHIBreadcrumb::FBuffer Buffer;
Profile.PushNode(Current->GetTCHAR(Buffer));
};
Recurse(Recurse, Event.Breadcrumb);
#endif // WITH_RHI_BREADCRUMBS
}
#endif
}
};
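// Sink-wide state: bTriggerProfile is set by the "ProfileGPU" console command and latched
// into ProfileFrameNumber at the next frame boundary; Frames gathers per-queue timestamps
// until every registered queue has reported a given frame number.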
std::atomic<bool> bTriggerProfile{ false };
uint32 ProfileFrameNumber = 0;
uint32 MaxFrameNumber = 0;
TMap<uint32, FFrameState> Frames;
TMap<FQueue, TUniquePtr<FQueueState>> QueueStates;
// Attempts to retrieve the CPU and GPU timestamps of when a fence wait is resolved by a signal on another queue.
TOptional<FResolvedWait> ResolveWait(FQueueState& LocalQueue, FEvent::FWaitFence const& WaitFenceEvent)
{
FQueueState const& RemoteQueue = static_cast<FQueueState const&>(*QueueStates.FindChecked(WaitFenceEvent.Queue));
if (RemoteQueue.MaxSignal.Value < WaitFenceEvent.Value)
{
// Fence has not yet been signalled on the remote queue
return {};
}
else
{
// Fence has been signalled, but the signal's GPU timestamp may lie ahead of this queue's timeline.
FResolvedWait Result;
Result.CPUTimestamp = WaitFenceEvent.CPUTimestamp;
//
// The wait cannot be resolved any earlier than:
//
// 1) The wait command was issued to the driver (WaitFenceEvent.CPUTimestamp)
// 2) The GPU completed prior work on this queue (LocalQueue.LastGPUCycles)
//
Result.GPUTimestampTOP = FMath::Max(WaitFenceEvent.CPUTimestamp, LocalQueue.LastGPUCycles);
//
// 3) The wait may be further delayed by the remote queue the GPU is awaiting.
//
int32 Index = Algo::LowerBoundBy(RemoteQueue.Signals, WaitFenceEvent.Value, [](FResolvedSignal const& Signal) { return Signal.Value; });
if (RemoteQueue.Signals.IsValidIndex(Index))
{
FResolvedSignal const& Signal = RemoteQueue.Signals[Index];
//
// Only consider this signal's timestamp if the fence was not already signalled at the previous frame boundary.
// See comment in ResolveSignal() for details.
//
if (!(Index == 0 && WaitFenceEvent.Value < Signal.Value))
{
Result.GPUTimestampTOP = FMath::Max(Result.GPUTimestampTOP, Signal.GPUTimestampBOP);
}
}
return Result;
}
}
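// Creates (or reuses) a queue state for each queue reported by the RHI and ensures the
// profiler trace is initialized before any events are processed.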
void InitializeQueues(TConstArrayView<FQueue> Queues) override
{
FGpuProfilerTrace::Initialize();
for (FQueue Queue : Queues)
{
TUniquePtr<FQueueState>& Ptr = QueueStates.FindOrAdd(Queue);
if (!Ptr.IsValid())
{
Ptr = MakeUnique<FQueueState>(Queue);
}
}
}
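// Drains events from the iterator for a single queue. Returns false if processing had to
// pause on a WaitFence whose signal has not yet been observed on the remote queue; the
// caller retries once other queues have made progress.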
bool ProcessQueue(FQueueState& QueueState, FIterator& Iterator)
{
if (FGpuProfilerTrace::IsAvailable() && !QueueState.bWasTraced)
{
FGpuProfilerTrace::InitializeQueue(QueueState.Queue.Value, QueueState.Queue.GetTypeString());
QueueState.bWasTraced = true;
}
while (FEvent const* Event = Iterator.Peek())
{
switch (Event->GetType())
{
case FEvent::EType::BeginWork:
{
check(!QueueState.bBusy);
QueueState.bBusy = true;
QueueState.BeginWork(Event->Value.Get<FEvent::FBeginWork>());
}
break;
case FEvent::EType::EndWork:
{
check(QueueState.bBusy);
QueueState.bBusy = false;
QueueState.EndWork(Event->Value.Get<FEvent::FEndWork>());
}
break;
#if WITH_RHI_BREADCRUMBS
case FEvent::EType::BeginBreadcrumb:
{
check(QueueState.bBusy);
QueueState.BeginBreadcrumb(Event->Value.Get<FEvent::FBeginBreadcrumb>());
}
break;
case FEvent::EType::EndBreadcrumb:
{
check(QueueState.bBusy);
QueueState.EndBreadcrumb(Event->Value.Get<FEvent::FEndBreadcrumb>());
}
break;
#endif // WITH_RHI_BREADCRUMBS
#if WITH_PROFILEGPU
case FEvent::EType::Stats:
{
check(QueueState.bBusy);
QueueState.Stats(Event->Value.Get<FEvent::FStats>());
}
break;
#endif // WITH_PROFILEGPU
case FEvent::EType::SignalFence:
{
check(!QueueState.bBusy);
QueueState.ResolveSignal(Event->Value.Get<FEvent::FSignalFence>());
}
break;
case FEvent::EType::WaitFence:
{
check(!QueueState.bBusy);
const FEvent::FWaitFence& WaitFence = Event->Value.Get<FEvent::FWaitFence>();
TOptional<FResolvedWait> ResolvedWait = ResolveWait(QueueState, WaitFence);
if (!ResolvedWait.IsSet())
{
// Unresolved fence, pause processing
return false;
}
QueueState.Wait(*ResolvedWait, WaitFence);
}
break;
case FEvent::EType::FrameTime:
{
const FEvent::FFrameTime& FrameTime = Event->Value.Get<FEvent::FFrameTime>();
QueueState.FrameTime(FrameTime.TotalGPUTime);
}
break;
case FEvent::EType::FrameBoundary:
{
FEvent::FFrameBoundary const& FrameBoundary = Event->Value.Get<FEvent::FFrameBoundary>();
FFrameState& FrameState = Frames.FindOrAdd(FrameBoundary.FrameNumber);
#if STATS
FrameState.StatsFrame = FrameBoundary.bStatsFrameSet
? FrameBoundary.StatsFrame
: TOptional<int64>();
#endif
#if WITH_PROFILEGPU
// Latch the index of the next frame to profile
MaxFrameNumber = FMath::Max(FrameBoundary.FrameNumber, MaxFrameNumber);
if (bTriggerProfile.exchange(false))
{
ProfileFrameNumber = MaxFrameNumber + 1;
}
#endif // WITH_PROFILEGPU
QueueState.FrameBoundary(FrameBoundary, FrameState, ProfileFrameNumber);
if (FrameState.Num() == QueueStates.Num())
{
// Trim the Signals array in each queue, up to the lowest frame boundary CPU timestamp.
{
uint64 MinFrameBoundary = TNumericLimits<uint64>::Max();
for (auto& [Queue, QueueTimestamps] : FrameState)
{
MinFrameBoundary = FMath::Min(MinFrameBoundary, QueueTimestamps.CPUFrameBoundary);
}
for (auto& [Queue, LocalQueueState] : QueueStates)
{
LocalQueueState.Get()->TrimSignals(MinFrameBoundary);
}
}
// All registered queues have reported their frame boundary event.
// We have a full set of data to compute the total frame GPU stats.
ProcessFrame(FrameState);
Frames.Remove(FrameBoundary.FrameNumber);
}
}
break;
}
Iterator.Pop();
}
return true;
}
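// Called once all queues have reported a given frame boundary. Emits the per-stat and
// per-queue GPU timings to the stats/CSV systems and pushes the whole-frame GPU time into
// the global frame time history.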
void ProcessFrame(FFrameState& FrameState)
{
#if STATS
FEndOfPipeStats* Stats = FEndOfPipeStats::Get();
if (FrameState.StatsFrame.IsSet())
{
Stats->AddMessage(FStatConstants::AdvanceFrame.GetEncodedName(), EStatOperation::AdvanceFrameEventEndOfPipe, *FrameState.StatsFrame);
}
#endif
#if CSV_PROFILER_STATS
const bool bCsvStatsEnabled = !!CVarGPUCsvStatsEnabled.GetValueOnAnyThread();
FCsvProfiler* CsvProfiler = FCsvProfiler::Get();
CsvProfiler->BeginFrameEOP();
#else
const bool bCsvStatsEnabled = false;
#endif
TOptional<uint64> MaxQueueBusyCycles;
for (auto const& [Queue, QueueTimestamps] : FrameState)
{
#if WITH_RHI_BREADCRUMBS && HAS_GPU_STATS
// Compute the individual GPU stats
for (auto const& [Stat, StatState] : QueueTimestamps.Stats)
{
StatState.EmitResults(Queue, *Stat.GPUStat
#if STATS
, Stats
#endif
#if CSV_PROFILER_STATS
, bCsvStatsEnabled ? CsvProfiler : nullptr
#endif
);
}
#endif // WITH_RHI_BREADCRUMBS && HAS_GPU_STATS
// Set the whole-frame per queue stat
#if HAS_GPU_STATS
QueueTimestamps.WholeQueueStat.EmitResults(Queue, GPUStat_Total
#if STATS
, Stats
#endif
#if CSV_PROFILER_STATS
, bCsvStatsEnabled ? CsvProfiler : nullptr
#endif
);
#endif
if (QueueTimestamps.TotalBusyCycles.IsSet())
{
uint64 CurrentMax = MaxQueueBusyCycles ? *MaxQueueBusyCycles : 0;
MaxQueueBusyCycles = FMath::Max(CurrentMax, *QueueTimestamps.TotalBusyCycles);
}
}
if (MaxQueueBusyCycles.IsSet())
{
// Set the total GPU time stat according to the value directly provided by the platform RHI
GRHIGPUFrameTimeHistory.PushFrameCycles(1.0 / FPlatformTime::GetSecondsPerCycle64(), *MaxQueueBusyCycles);
}
else
{
// Compute the whole-frame total GPU time.
TArray<FTimestampStream::FState, TInlineAllocator<GetRHIPipelineCount() * MAX_NUM_GPUS>> StreamPointers;
for (auto const& [Queue, State] : FrameState)
{
StreamPointers.Emplace(State.Queue);
}
uint64 WholeFrameUnion = FTimestampStream::ComputeUnion(StreamPointers);
// Update the global GPU frame time stats
GRHIGPUFrameTimeHistory.PushFrameCycles(1.0 / FPlatformTime::GetSecondsPerCycle64(), WholeFrameUnion);
}
// @todo set global csv GPU time
//RHISetGPUStatTotals(bCsvStatsEnabled, FPlatformTime::ToMilliseconds64(WholeFrameUnion));
#if STATS
Stats->Flush();
#endif
}
void ProcessAllQueues()
{
// Process each queue as far as possible. Keep iterating while any queue makes progress,
// since resolving a wait on one queue can unblock pending events on another.
bool bProgress;
do
{
bProgress = false;
for (auto& [Queue, QueueState] : QueueStates)
{
while (FIterator* Iterator = QueueState->PendingStreams.Peek())
{
FEvent const* Start = Iterator->Peek();
bool bPaused = !ProcessQueue(*QueueState.Get(), *Iterator);
FEvent const* End = Iterator->Peek();
bProgress |= End != Start;
if (bPaused)
{
// The queue was paused by a Wait event
check(End);
break;
}
if (!End)
{
// This stream has been fully processed.
QueueState->PendingStreams.Dequeue();
}
}
}
} while (bProgress);
}
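// Receives new event streams from the RHI, appends each to its owning queue's pending
// list, then processes as much as possible.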
void ProcessStreams(TConstArrayView<TSharedRef<FEventStream>> EventStreams) override
{
for (TSharedRef<FEventStream> const& Stream : EventStreams)
{
FQueueState& QueueState = *QueueStates.FindChecked(Stream->Queue);
QueueState.PendingStreams.Enqueue(FIterator(Stream));
}
ProcessAllQueues();
}
} GGPUProfilerSink_StatSystem;
#if WITH_PROFILEGPU
static FAutoConsoleCommand GCommand_ProfileGPU(
TEXT("ProfileGPU"),
TEXT("Captures statistics about a frame of GPU work and prints the results to the log."),
FConsoleCommandWithArgsDelegate::CreateLambda([](const TArray<FString>& Args)
{
GGPUProfilerSink_StatSystem.bTriggerProfile = true;
if (OnProfileGPU.IsBound())
{
OnProfileGPU.Broadcast();
}
}));
#endif // WITH_PROFILEGPU
}
#endif // RHI_NEW_GPU_PROFILER
RHI_API FRHIGPUFrameTimeHistory GRHIGPUFrameTimeHistory;
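// Frame times are stored in a fixed-size ring buffer. Each consumer keeps its own FState
// cursor and drains entries until Empty is returned; Disjoint indicates the consumer fell
// behind and older entries were overwritten. Illustrative usage (not from this file):
//
//   uint64 Cycles64;
//   while (State.PopFrameCycles(Cycles64) != FRHIGPUFrameTimeHistory::EResult::Empty)
//   {
//       // consume Cycles64 ...
//   }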
FRHIGPUFrameTimeHistory::EResult FRHIGPUFrameTimeHistory::FState::PopFrameCycles(uint64& OutCycles64)
{
return GRHIGPUFrameTimeHistory.PopFrameCycles(*this, OutCycles64);
}
FRHIGPUFrameTimeHistory::EResult FRHIGPUFrameTimeHistory::PopFrameCycles(FState& State, uint64& OutCycles64)
{
FScopeLock Lock(&CS);
if (State.NextIndex == NextIndex)
{
OutCycles64 = 0;
return EResult::Empty;
}
else
{
uint64 MinHistoryIndex = NextIndex >= MaxLength ? NextIndex - MaxLength : 0;
if (State.NextIndex < MinHistoryIndex)
{
State.NextIndex = MinHistoryIndex;
OutCycles64 = History[State.NextIndex++ % MaxLength];
return EResult::Disjoint;
}
else
{
OutCycles64 = History[State.NextIndex++ % MaxLength];
return EResult::Ok;
}
}
}
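// Converts the GPU busy time for a frame into platform cycle units (both the 32-bit and
// 64-bit clocks), appends the 64-bit value to the history ring buffer, and updates the
// legacy GGPUFrameTime global with the 32-bit value.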
void FRHIGPUFrameTimeHistory::PushFrameCycles(double GPUFrequency, uint64 GPUCycles)
{
double Seconds = double(GPUCycles) / GPUFrequency;
double Cycles32 = Seconds / FPlatformTime::GetSecondsPerCycle();
double Cycles64 = Seconds / FPlatformTime::GetSecondsPerCycle64();
{
FScopeLock Lock(&CS);
History[NextIndex++ % MaxLength] = uint64(Cycles64);
}
PRAGMA_DISABLE_DEPRECATION_WARNINGS
FPlatformAtomics::InterlockedExchange(reinterpret_cast<volatile int32*>(&GGPUFrameTime), int32(Cycles32));
PRAGMA_ENABLE_DEPRECATION_WARNINGS
}
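// Returns the most recent GPU frame time in 32-bit platform cycles. Note that GPUIndex is
// currently ignored; the same value is reported for all GPUs.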
RHI_API uint32 RHIGetGPUFrameCycles(uint32 GPUIndex)
{
PRAGMA_DISABLE_DEPRECATION_WARNINGS
return (uint32)FPlatformAtomics::AtomicRead(reinterpret_cast<volatile int32*>(&GGPUFrameTime));
PRAGMA_ENABLE_DEPRECATION_WARNINGS
}
#undef LOCTEXT_NAMESPACE