// UnrealEngine/Engine/Source/Runtime/Apple/MetalRHI/Private/MetalSubmission.cpp
// Copyright Epic Games, Inc. All Rights Reserved.
#include "MetalSubmission.h"
#include "MetalRHIPrivate.h"
#include "MetalDynamicRHI.h"
#include "MetalCommandBuffer.h"
#include "HAL/Runnable.h"
#include "HAL/RunnableThread.h"
#include "IRenderCaptureProvider.h"
#include "MetalProfiler.h"
static TAutoConsoleVariable<int32> CVarMetalRHIUseInterruptThread(
TEXT("MetalRHI.UseInterruptThread"),
1,
TEXT("Whether to enable the Metal RHI's interrupt thread.\n")
TEXT(" 0: No\n")
TEXT(" 1: Yes\n"),
ECVF_ReadOnly);
static TAutoConsoleVariable<int32> CVarMetalRHIUseSubmissionThread(
TEXT("MetalRHI.UseSubmissionThread"),
1,
TEXT("Whether to enable the Metal RHI's submission thread.\n")
TEXT(" 0: No\n")
TEXT(" 1: Yes\n"),
ECVF_ReadOnly);
#define METAL_USE_INTERRUPT_THREAD 1
#define METAL_USE_SUBMISSION_THREAD 1
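//
// Worker thread used for the submission and interrupt pipes. The thread repeatedly
// invokes a queue-processing member function on FMetalDynamicRHI until it stops
// reporting progress, then sleeps on a manual-reset event. Kick() wakes it up early;
// the destructor requests exit, drains any remaining work, and joins the thread.
//
// A minimal usage sketch (mirroring InitializeSubmissionPipe() below; the thread name
// is illustrative only):
//
//   FMetalThread* Worker = new FMetalThread(TEXT("ExampleThread"), TPri_Highest, this, &FMetalDynamicRHI::ProcessInterruptQueue);
//   Worker->Kick();   // wake the thread so it drains the queue
//   delete Worker;    // sets bExit, triggers the event, and waits for completion
//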
class FMetalThread final : private FRunnable
{
public:
typedef FMetalDynamicRHI::FProcessResult(FMetalDynamicRHI::*FQueueFunc)();
FMetalThread(TCHAR const* Name, EThreadPriority Priority, FMetalDynamicRHI* RHI, FQueueFunc QueueFunc)
: RHI(RHI)
, QueueFunc(QueueFunc)
, Event(FPlatformProcess::GetSynchEventFromPool(true))
, Thread(FRunnableThread::Create(this, Name, 0, Priority))
{}
virtual ~FMetalThread()
{
bExit = true;
Event->Trigger();
Thread->WaitForCompletion();
delete Thread;
// Return the pooled manual-reset event now that the worker thread has exited.
FPlatformProcess::ReturnSynchEventToPool(Event);
Event = nullptr;
}
void Kick() const
{
check(Event);
Event->Trigger();
}
void Join() const
{
Thread->WaitForCompletion();
}
private:
virtual uint32 Run() override
{
while (!bExit)
{
// Process the queue until no more progress is made
FMetalDynamicRHI::FProcessResult Result;
do { Result = (RHI->*QueueFunc)(); }
while (EnumHasAllFlags(Result.Status, FMetalDynamicRHI::EQueueStatus::Processed));
Event->Wait(1);
Event->Reset();
}
// Drain any remaining work in the queue
while (EnumHasAllFlags((RHI->*QueueFunc)().Status, FMetalDynamicRHI::EQueueStatus::Pending)) {}
return 0;
}
FMetalDynamicRHI* RHI;
FQueueFunc QueueFunc;
FEvent* Event;
bool bExit = false;
FRunnableThread* Thread = nullptr;
};
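//
// Creates the optional submission and interrupt threads (gated on platform
// multithreading support and the read-only MetalRHI.UseInterruptThread /
// MetalRHI.UseSubmissionThread console variables), and, when the new GPU profiler is
// enabled, seeds each command queue with an initial timing payload and registers the
// queues with the engine GPU profiler.
//
// Note: as read-only CVars, these would typically be overridden from
// ConsoleVariables.ini or the command line before RHI startup (an assumption about
// intended usage, not something this file enforces).
//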
void FMetalDynamicRHI::InitializeSubmissionPipe()
{
if (FPlatformProcess::SupportsMultithreading())
{
#if METAL_USE_INTERRUPT_THREAD
bool bUseInterruptThread = CVarMetalRHIUseInterruptThread.GetValueOnAnyThread() == 1;
if (bUseInterruptThread)
{
InterruptThread = new FMetalThread(TEXT("RHIInterruptThread"), TPri_Highest, this, &FMetalDynamicRHI::ProcessInterruptQueue);
}
#endif
#if METAL_USE_SUBMISSION_THREAD
bool bUseSubmissionThread = CVarMetalRHIUseSubmissionThread.GetValueOnAnyThread() == 1;
if (bUseSubmissionThread)
{
SubmissionThread = new FMetalThread(TEXT("RHISubmissionThread"), TPri_Highest, this, &FMetalDynamicRHI::ProcessSubmissionQueue);
}
#endif
}
// Initialize the timing structs in each queue, and the engine GPU profilers
#if RHI_NEW_GPU_PROFILER
{
TArray<FMetalPayload*> Payloads;
TArray<UE::RHI::GPUProfiler::FQueue> ProfilerQueues;
ForEachQueue([&](FMetalCommandQueue& Queue)
{
FMetalPayload* Payload = Payloads.Emplace_GetRef(new FMetalPayload(Queue));
Payload->Timing = CurrentTimingPerQueue.CreateNew(Queue);
ProfilerQueues.Add(Queue.GetProfilerQueue());
});
UE::RHI::GPUProfiler::InitializeQueues(ProfilerQueues);
SubmitPayloads(MoveTemp(Payloads));
}
#endif
}
void FMetalDynamicRHI::ShutdownSubmissionPipe()
{
delete SubmissionThread;
SubmissionThread = nullptr;
delete InterruptThread;
InterruptThread = nullptr;
if (EopTask)
{
ProcessInterruptQueueUntil(EopTask);
EopTask = nullptr;
}
}
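//
// Upload contexts are pooled in a lock-free list so RHIGetUploadContext() can be
// called from multiple engine threads without taking a lock. The expected lifecycle,
// as implemented below, is roughly:
//
//   IRHIUploadContext* Upload = RHI->RHIGetUploadContext(); // pop from the pool, or allocate
//   /* ... record upload work ... */
//   // RHIFinalizeContext() later calls Finalize() on the context and pushes it back into the pool.
//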
static TLockFreePointerListUnordered<FMetalRHIUploadContext, PLATFORM_CACHE_LINE_SIZE> MetalUploadContextPool;
IRHIUploadContext* FMetalDynamicRHI::RHIGetUploadContext()
{
FMetalRHIUploadContext* Context = MetalUploadContextPool.Pop();
if (!Context)
{
Context = new FMetalRHIUploadContext(*Device);
}
return static_cast<IRHIUploadContext*>(Context);
}
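//
// Converts finished contexts into FMetalFinalizedCommands (one per pipeline slot in
// Output). The upload context, if any, is finalized first so its command buffers are
// ordered ahead of the graphics/compute work, then returned to the pool. Contexts that
// are still inside a render pass are not finalized or reset here, and non-default
// contexts are recycled through MetalCommandContextPool when parallel execution is
// supported.
//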
void FMetalDynamicRHI::RHIFinalizeContext(FRHIFinalizeContextArgs&& Args, TRHIPipelineArray<IRHIPlatformCommandList*>& Output)
{
MTL_SCOPED_AUTORELEASE_POOL;
FMetalRHIUploadContext* UploadContext = static_cast<FMetalRHIUploadContext*>(Args.UploadContext);
FMetalFinalizedCommands Commands;
if(UploadContext)
{
UploadContext->Finalize(Commands);
MetalUploadContextPool.Push(UploadContext);
}
for(IRHIComputeContext* Context : Args.Contexts)
{
FMetalRHICommandContext* CmdContext = static_cast<FMetalRHICommandContext*>(Context);
if(!CmdContext->IsInsideRenderPass())
{
CmdContext->Finalize(Commands);
CmdContext->ResetContext();
if(GRHISupportsParallelRHIExecute)
{
if(CmdContext != RHIGetDefaultContext())
{
MetalCommandContextPool.Push(CmdContext);
}
}
}
Output[Context->GetPipeline()] = new FMetalFinalizedCommands(MoveTemp(Commands));
}
}
IRHIPlatformCommandList* FMetalDynamicRHI::RHIFinalizeParallelContext(IRHIComputeContext* Context)
{
FMetalFinalizedCommands* Commands = new FMetalFinalizedCommands;
FMetalRHICommandContext* CmdContext = static_cast<FMetalRHICommandContext*>(Context);
CmdContext->Finalize(*Commands);
CmdContext->ResetContext();
MetalCommandContextPool.Push(CmdContext);
return Commands;
}
void FMetalDynamicRHI::RHISubmitCommandLists(FRHISubmitCommandListsArgs&& Args)
{
SubmitCommands(MakeArrayView(reinterpret_cast<FMetalFinalizedCommands**>(Args.CommandLists.GetData()), Args.CommandLists.Num()));
}
void FMetalDynamicRHI::SubmitCommands(TConstArrayView<FMetalFinalizedCommands*> Commands)
{
SCOPED_NAMED_EVENT_TEXT("CommandList_Submit", FColor::Magenta);
#if RHI_NEW_GPU_PROFILER
TArray<FMetalPayload*> AllPayloads;
for (FMetalFinalizedCommands* Payloads : Commands)
{
#if WITH_RHI_BREADCRUMBS
TSharedPtr<FRHIBreadcrumbAllocatorArray> BreadcrumbAllocators {};
if (Payloads->BreadcrumbAllocators.Num())
{
BreadcrumbAllocators = MakeShared<FRHIBreadcrumbAllocatorArray>(MoveTemp(Payloads->BreadcrumbAllocators));
}
for (FMetalPayload* Payload : *Payloads)
{
Payload->BreadcrumbRange = Payloads->BreadcrumbRange;
if (BreadcrumbAllocators.IsValid())
{
check(!Payload->BreadcrumbAllocators.IsValid());
Payload->BreadcrumbAllocators = BreadcrumbAllocators;
}
}
#endif
AllPayloads.Append(MoveTemp(*Payloads));
delete Payloads;
}
SubmitPayloads(MoveTemp(AllPayloads));
#else
TArray<FMetalPayload*> AllPayloads;
#if WITH_RHI_BREADCRUMBS
TArray<TSharedPtr<FRHIBreadcrumbAllocator>> BreadcrumbAllocators;
#endif
for (FMetalFinalizedCommands* Payloads : Commands)
{
#if WITH_RHI_BREADCRUMBS
for (FMetalPayload* Payload : *Payloads)
{
Payload->BreadcrumbRange = Payloads->BreadcrumbRange;
}
#endif
AllPayloads.Append(MoveTemp(static_cast<TArray<FMetalPayload*>&>(*Payloads)));
#if WITH_RHI_BREADCRUMBS
BreadcrumbAllocators.Append(MoveTemp(Payloads->BreadcrumbAllocators));
#endif
delete Payloads;
}
SubmitPayloads(MoveTemp(AllPayloads));
#if WITH_RHI_BREADCRUMBS
// Enqueue the breadcrumb allocator references for cleanup once all prior payloads have completed on the GPU.
DeferredDelete([Array = MoveTemp(BreadcrumbAllocators)]() {});
#endif
#endif
}
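//
// Hands a batch of payloads to the submission pipe. Payloads are enqueued on
// PendingPayloadsForSubmission; if a submission thread exists it is kicked to process
// them asynchronously, otherwise the calling thread drains the submission queue under
// SubmissionCS. The call ends by opportunistically pumping the interrupt queue.
//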
void FMetalDynamicRHI::SubmitPayloads(TArray<FMetalPayload*>&& Payloads)
{
if (Payloads.Num())
{
PendingPayloadsForSubmission.Enqueue(new TArray<FMetalPayload*>(MoveTemp(Payloads)));
}
if (SubmissionThread)
{
SubmissionThread->Kick();
}
else
{
// Since we're processing directly on the calling thread, we need to take a scope lock.
// Multiple engine threads might be calling Submit().
{
FScopeLock Lock(&SubmissionCS);
// Process the submission queue until no further progress is being made.
while (EnumHasAnyFlags(ProcessSubmissionQueue().Status, EQueueStatus::Processed)) {}
}
}
// Use this opportunity to pump the interrupt queue
ProcessInterruptQueueUntil(nullptr);
}
static int32 GetMaxExecuteBatchSize()
{
return
#if UE_BUILD_DEBUG
GRHIGlobals.IsDebugLayerEnabled ? 1 :
#endif
TNumericLimits<int32>::Max();
}
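//
// Submission-side half of the pipe. The overall flow for a payload is roughly:
//
//   PendingPayloadsForSubmission (cross-thread queue)
//     -> per-queue PendingSubmission (ordered; sync point fence waits resolved here)
//     -> FinalizePayload() assigns a completion fence value and batches the payload
//     -> FlushBatchedPayloads() commits the Metal command buffers and signals fences
//     -> per-queue PendingInterrupt, retired later by ProcessInterruptQueue()
//
// Payloads that wait on a sync point whose fence value is not yet resolved are left in
// their queue and retried on the next submission.
//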
FMetalDynamicRHI::FProcessResult FMetalDynamicRHI::ProcessSubmissionQueue()
{
SCOPED_NAMED_EVENT_TEXT("SubmissionQueue_Process", FColor::Turquoise);
LLM_SCOPE_BYNAME(TEXT("RHIMisc/ProcessSubmissionQueue"));
FMetalCommandQueue::FPayloadArray PayloadsToHandDown;
FProcessResult Result;
auto FlushPayloads = [&PayloadsToHandDown, &Result, DynamicRHI = this]()
{
if (PayloadsToHandDown.Num() > 0)
{
Result.Status |= EQueueStatus::Processed;
DynamicRHI->FlushBatchedPayloads(PayloadsToHandDown);
}
};
bool bProgress;
bool bKickInterruptThread = false;
do
{
bProgress = false;
Result.Status = EQueueStatus::None;
// Push all pending payloads into the ordered per-device, per-pipe pending queues
{
TArray<FMetalPayload*>* Array;
while (PendingPayloadsForSubmission.Dequeue(Array))
{
for (FMetalPayload* Payload : *Array)
{
Payload->Queue.PendingSubmission.Enqueue(Payload);
}
delete Array;
}
}
//
// Fence values for FMetalSyncPoint are determined on the submission thread,
// where each queue has a monotonically incrementing fence value.
//
// We might receive work that waits on a sync point which has not yet been submitted
// to the queue that will signal it, so we need to delay processing of those
// payloads until the fence value is known.
//
// Process all queues to flush work.
// Any sync point waits where the fence value is unknown will be left in the
// appropriate queue, to be processed the next time commands are submitted.
ForEachQueue([&](FMetalCommandQueue& CurrentQueue)
{
while (true)
{
TArray<FMetalCommandQueue*, TInlineAllocator<GMetalMaxNumQueues>> QueuesWithPayloads;
{
FMetalPayload* Payload = CurrentQueue.PendingSubmission.Peek();
if (!Payload)
return;
// Accumulate the list of fences to await, and their maximum values
while (Payload->SyncPointsToWait.Index < Payload->SyncPointsToWait.Num())
{
FMetalSyncPointRef& SyncPoint = Payload->SyncPointsToWait[Payload->SyncPointsToWait.Index];
if (!SyncPoint->ResolvedFence.IsSet())
{
// Need to wait on a sync point, but the fence value has not been resolved yet
// (no other payloads have signaled the sync point yet).
// Skip processing this queue, and move on to the next. We will retry later when
// further work is submitted, which may contain the sync point we need.
Result.Status |= EQueueStatus::Pending;
return;
}
Payload->AddQueueFenceWait(
SyncPoint->ResolvedFence->Fence,
SyncPoint->ResolvedFence->Value
);
Payload->SyncPointsToWait.Index++;
bProgress = true;
}
// All necessary sync points have been resolved.
Payload->SyncPointsToWait = {};
CurrentQueue.PendingSubmission.Pop();
bProgress = true;
check(!CurrentQueue.PayloadToSubmit);
CurrentQueue.PayloadToSubmit = Payload;
QueuesWithPayloads.Add(&CurrentQueue);
Result.Status |= EQueueStatus::Processed;
bKickInterruptThread = true;
for (int32 Index = 0; Index < Payload->CommandBuffersToExecute.Num(); Index++)
{
FMetalCommandBuffer* CurrentCommandBuffer = Payload->CommandBuffersToExecute[Index];
CurrentQueue.OcclusionQueries.Append(MoveTemp(CurrentCommandBuffer->OcclusionQueries));
CurrentQueue.TimestampQueries.Append(MoveTemp(CurrentCommandBuffer->TimestampQueries));
#if RHI_NEW_GPU_PROFILER == 0
CurrentQueue.EventSampleCounters.Append(MoveTemp(CurrentCommandBuffer->EventSampleCounters));
#endif
}
}
// Queues with work to submit other than the current one (CurrentQueue) are performing barrier operations.
// Submit this work first, followed by a fence signal + enqueued wait.
for (FMetalCommandQueue* OtherQueue : QueuesWithPayloads)
{
if (OtherQueue != &CurrentQueue)
{
uint64 ValueSignaled = OtherQueue->FinalizePayload(true, PayloadsToHandDown);
CurrentQueue.PayloadToSubmit->AddQueueFenceWait(OtherQueue->GetSignalEvent(), ValueSignaled);
}
FlushPayloads();
}
// Now submit the original payload
CurrentQueue.FinalizePayload(false, PayloadsToHandDown);
FlushPayloads();
}
});
} while (bProgress);
FlushPayloads();
if (InterruptThread && bKickInterruptThread)
{
InterruptThread->Kick();
}
return Result;
}
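//
// Assigns the payload its completion fence value (a monotonically increasing value on
// this queue's signal event), resolves any sync points it will signal to that
// fence/value pair, moves the queue's batched query objects into the payload, and
// appends it to the hand-down array consumed by FlushBatchedPayloads(). Returns the
// fence value that will be signaled, so callers can make other queues wait on it.
//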
uint64 FMetalCommandQueue::FinalizePayload(bool bRequiresSignal, FPayloadArray& PayloadsToHandDown)
{
TRACE_CPUPROFILER_EVENT_SCOPE(ExecuteCommandList);
LLM_SCOPE_BYNAME(TEXT("RHIMisc/ExecuteCommandLists"));
check(PayloadToSubmit && this == &PayloadToSubmit->Queue);
check(PayloadToSubmit->SyncPointsToWait.Num() == 0);
check(!PayloadToSubmit->SignalCommandBuffer);
// Keep the latest fence value in the submitted payload.
// The interrupt thread uses this to determine when work has completed.
PayloadToSubmit->CompletionFenceValue = ++SignalEvent.NextCompletionValue;
PayloadToSubmit->bAlwaysSignal |= bRequiresSignal;
// Set the fence/value pair into any sync points we need to signal.
for (FMetalSyncPointRef& SyncPoint : PayloadToSubmit->SyncPointsToSignal)
{
check(!SyncPoint->ResolvedFence.IsSet());
SyncPoint->ResolvedFence.Emplace(SignalEvent, PayloadToSubmit->CompletionFenceValue);
}
PayloadToSubmit->BatchedObjects.OcclusionQueries = MoveTemp(OcclusionQueries);
PayloadToSubmit->BatchedObjects.TimestampQueries = MoveTemp(TimestampQueries);
#if RHI_NEW_GPU_PROFILER == 0
PayloadToSubmit->BatchedObjects.EventSampleCounters = MoveTemp(EventSampleCounters);
#endif
PayloadsToHandDown.Add(PayloadToSubmit);
PayloadToSubmit = nullptr;
return SignalEvent.NextCompletionValue;
}
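//
// Commits batched payloads to their Metal command queues. Command buffers from
// consecutive payloads on the same queue are committed together; a flush is forced
// whenever the queue changes or a payload needs wait, pre-execute, or signal work,
// since those are encoded in dedicated command buffers around the batch. A sketch of
// the per-payload ordering, as implemented below:
//
//   [encodeWait on queue fences] -> [PreExecute()] -> [payload command buffers]
//     -> [encodeSignalEvent + completion handler that kicks the interrupt thread]
//
// Setting bSubmitted last hands ownership of the payload to the interrupt thread.
//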
void FMetalDynamicRHI::FlushBatchedPayloads(FMetalCommandQueue::FPayloadArray& PayloadsToSubmit)
{
uint32 FirstPayload = 0, LastPayload = 0;
auto Wait = [this](FMetalPayload* Payload)
{
FMetalCommandQueue& Queue = Payload->Queue;
// Wait for queue fences
for (auto& [LocalFence, Value] : Payload->QueueFencesToWait)
{
FMetalCommandBuffer* CurrentCommandBuffer = Queue.CreateCommandBuffer();
CurrentCommandBuffer->GetMTLCmdBuffer()->encodeWait(LocalFence.MetalEvent, Value);
Queue.CommitCommandBuffer(CurrentCommandBuffer);
DeferredDelete([CurrentCommandBuffer]() {
delete CurrentCommandBuffer;
});
}
};
auto Flush = [&]()
{
if (FirstPayload == LastPayload)
return;
FMetalCommandQueue& Queue = PayloadsToSubmit[FirstPayload]->Queue;
uint64 Time = FPlatformTime::Cycles64();
TArray<FMetalCommandBuffer*> CommandBuffers;
// Accumulate the command lists from the payload
for (uint32 Index = FirstPayload; Index < LastPayload; ++Index)
{
FMetalPayload* Payload = PayloadsToSubmit[Index];
check(&Payload->Queue == &Queue);
for (FMetalCommandBuffer* CommandBuffer : Payload->CommandBuffersToExecute)
{
#if RHI_NEW_GPU_PROFILER
CommandBuffer->FlushProfilerEvents(Payload->EventStream, Time);
#endif
Payload->Queue.CommitCommandBuffer(CommandBuffer);
}
}
FirstPayload = LastPayload;
};
auto Signal = [this](FMetalPayload* Payload)
{
FMetalCommandQueue& Queue = Payload->Queue;
// Signal the queue fence
if (Payload->RequiresQueueFenceSignal())
{
check(Queue.GetSignalEvent().LastSignaledValue < Payload->CompletionFenceValue);
FMetalCommandBuffer* CommandBuffer = Queue.CreateCommandBuffer();
MTL::HandlerFunction CompletionHandler = [this](MTL::CommandBuffer* CompletedBuffer)
{
if(InterruptThread)
{
InterruptThread->Kick();
}
};
CommandBuffer->GetMTLCmdBuffer()->addCompletedHandler(CompletionHandler);
CommandBuffer->GetMTLCmdBuffer()->encodeSignalEvent(Queue.GetSignalEvent().MetalEvent, Payload->CompletionFenceValue);
Payload->SignalCommandBuffer = CommandBuffer;
Queue.CommitCommandBuffer(CommandBuffer);
Queue.GetSignalEvent().LastSignaledValue.store(Payload->CompletionFenceValue, std::memory_order_release);
}
// Submission of this payload is completed. Signal the submission event if one was provided.
if (Payload->SubmissionEvent)
{
Payload->SubmissionEvent->DispatchSubsequents();
}
};
FMetalCommandQueue* PrevQueue = nullptr;
for (FMetalPayload* Payload : PayloadsToSubmit)
{
if (PrevQueue != &Payload->Queue)
{
Flush();
PrevQueue = &Payload->Queue;
}
Payload->Queue.PendingInterrupt.Enqueue(Payload);
if (Payload->HasWaitWork())
{
Flush();
Wait(Payload);
}
if (Payload->HasPreExecuteWork())
{
Flush();
Payload->PreExecute();
}
LastPayload++;
if (Payload->HasSignalWork())
{
Flush();
Signal(Payload);
}
}
Flush();
for (FMetalPayload* Payload : PayloadsToSubmit)
{
// Only set this bool to true once we'll never touch the payload again on this thread.
// This is because the bool hands ownership to the interrupt thread, which might delete the payload.
Payload->bSubmitted = true;
}
PayloadsToSubmit.Reset();
}
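//
// De-duplicates queue fence waits: waiting on fence value N also covers all values
// below N, so only the maximum requested value per fence needs to be kept.
//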
void FMetalPayload::AddQueueFenceWait(FMetalSignalEvent& InFence, uint64 InValue)
{
for (auto& [Fence, Value] : QueueFencesToWait)
{
if (&Fence == &InFence)
{
Value = FMath::Max(Value, InValue);
return;
}
}
QueueFencesToWait.Add({ InFence, InValue });
}
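//
// Blocks the calling CPU thread until the GPU signals this sync point. A typical
// (illustrative) call site would look like:
//
//   FMetalSyncPointRef SyncPoint = /* sync point created with a CPU graph event */;
//   SyncPoint->Wait(); // pumps or waits on the interrupt queue until the GPU signals it
//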
void FMetalSyncPoint::Wait() const
{
checkf(GraphEvent, TEXT("This sync point was not created with a CPU event. Cannot wait for completion on the CPU."));
if (!GraphEvent->IsComplete())
{
// Block the calling thread until the graph event is signaled by the interrupt thread.
SCOPED_NAMED_EVENT_TEXT("SyncPoint_Wait", FColor::Turquoise);
FMetalDynamicRHI::Get().ProcessInterruptQueueUntil(GraphEvent);
}
check(GraphEvent->IsComplete());
}
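//
// Drives the interrupt queue until the given graph event completes. With a dedicated
// interrupt thread the caller just kicks it and blocks on the event; without one the
// caller processes the queue itself under InterruptCS. Passing nullptr is a polite
// "pump if possible": a no-op when the interrupt thread exists, otherwise the queue is
// processed until no further progress is made (if the lock can be taken).
//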
void FMetalDynamicRHI::ProcessInterruptQueueUntil(FGraphEvent* GraphEvent)
{
if (InterruptThread)
{
if (GraphEvent && !GraphEvent->IsComplete())
{
InterruptThread->Kick();
GraphEvent->Wait();
}
}
else
{
// Use the current thread to process the interrupt queue until the sync point we're waiting for is signaled.
// If GraphEvent is nullptr, process the queue until no further progress is made (assuming we can acquire the lock), then return.
if (!GraphEvent || !GraphEvent->IsComplete())
{
// If we're waiting for a sync point, accumulate the idle time
UE::Stats::FThreadIdleStats::FScopeIdle IdleScope(GraphEvent == nullptr);
Retry:
if (InterruptCS.TryLock())
{
FProcessResult Result;
do { Result = ProcessInterruptQueue(); }
// If we have a sync point, keep processing until the sync point is signaled.
// Otherwise, process until no more progress is being made.
while (GraphEvent
? !GraphEvent->IsComplete()
: EnumHasAllFlags(Result.Status, EQueueStatus::Processed)
);
InterruptCS.Unlock();
}
else if (GraphEvent && !GraphEvent->IsComplete())
{
// Failed to get the lock. Another thread is processing the interrupt queue. Try again...
FPlatformProcess::SleepNoStats(0);
goto Retry;
}
}
}
}
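//
// Retires command buffers whose GPU execution has finished: GPU timing is resolved
// (from counter samples when available, otherwise from the command buffer's GPU
// start/end times) and the command buffer is deleted. Processing stops at the first
// command buffer that has not completed yet, preserving completion order.
//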
void FMetalDynamicRHI::ProcessPendingCommandBuffers()
{
double SecondsPerCycle = FPlatformTime::GetSecondsPerCycle64();
bool bContinueProcessing = true;
CmdBuffersPendingCompletion.RemoveAll([SecondsPerCycle, &bContinueProcessing](FMetalCommandBuffer* CommandBuffer)
{
MTL::CommandBuffer* CompletedBuffer = CommandBuffer->GetMTLCmdBuffer();
MTL::CommandBufferStatus Status = CompletedBuffer->status();
if (Status == MTL::CommandBufferStatusCompleted && bContinueProcessing)
{
#if RHI_NEW_GPU_PROFILER
#if WITH_RHI_BREADCRUMBS
FMetalBreadcrumbProfiler::GetInstance()->ResolveBreadcrumbTrackerStream(CommandBuffer->BreadcrumbTrackerStream);
#endif
uint64_t* Start = CommandBuffer->BeginWorkTimestamp;
uint64_t* End = CommandBuffer->EndWorkTimestamp;
if(CommandBuffer->CounterSamples.Num())
{
for(FMetalCounterSamplePtr Sample : CommandBuffer->CounterSamples)
{
uint64_t StartTime, EndTime;
Sample->ResolveStageCounters(StartTime, EndTime);
*Start = *Start > 0 ? FMath::Min(StartTime, *Start) : StartTime;
*End = *End > 0 ? FMath::Max(EndTime, *End) : EndTime;
}
}
else
{
*Start = CompletedBuffer->GPUStartTime() / SecondsPerCycle;
*End = CompletedBuffer->GPUEndTime() / SecondsPerCycle;
}
#else
FMetalCommandBufferTimer* Timer = CommandBuffer->GetTimer();
Timer->AddTiming({CompletedBuffer->GPUStartTime(), CompletedBuffer->GPUEndTime()});
#endif
delete CommandBuffer;
return true;
}
else
{
bContinueProcessing = false;
}
return false;
});
}
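//
// Interrupt-side half of the pipe. For each queue, payloads are retired in submission
// order once their signal command buffer has completed: command buffer errors are
// reported, profiler event streams and timing structs are handed to the queue, query
// results are resolved, sync point CPU events are dispatched, and finally the payload
// is deleted, releasing the GPU resources it referenced.
//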
FMetalDynamicRHI::FProcessResult FMetalDynamicRHI::ProcessInterruptQueue()
{
SCOPED_NAMED_EVENT_TEXT("InterruptQueue_Process", FColor::Yellow);
LLM_SCOPE_BYNAME(TEXT("RHIMisc/ProcessInterruptQueue"));
// Timer that clamps each tick to prevent false positive GPU timeouts
// when a debugger is attached and the process is broken.
struct FTimer
{
uint64 Elapsed;
uint64 Last;
FTimer()
: Elapsed(0)
, Last(FPlatformTime::Cycles64())
{}
void Tick()
{
static const uint64 MaxDeltaCycles = uint64(1.0 / FPlatformTime::GetSecondsPerCycle64()); // 1 second
uint64 Current = FPlatformTime::Cycles64();
Elapsed += FMath::Min(MaxDeltaCycles, Current - Last);
Last = Current;
}
} static Timer;
Timer.Tick();
FProcessResult Result;
ForEachQueue([&](FMetalCommandQueue& CurrentQueue)
{
while (FMetalPayload* Payload = CurrentQueue.PendingInterrupt.Peek())
{
if (!Payload->bSubmitted)
break;
// Check for GPU completion
FMetalSignalEvent& CurrentEvent = CurrentQueue.GetSignalEvent();
uint64 LastSignaledFenceValue = CurrentEvent.LastSignaledValue.load(std::memory_order_acquire);
// Handle command buffer errors
for(FMetalCommandBuffer* CommandBuffer : Payload->CommandBuffersToExecute)
{
MTL::CommandBuffer* CompletedBuffer = CommandBuffer->GetMTLCmdBuffer();
if (CompletedBuffer->status() == MTL::CommandBufferStatusError)
{
FMetalCommandList::HandleMetalCommandBufferFailure(CompletedBuffer);
}
}
MTL::CommandBufferStatus Status = MTL::CommandBufferStatusCompleted;
if(Payload->SignalCommandBuffer)
{
MTL::CommandBuffer* SignalBuffer = Payload->SignalCommandBuffer->GetMTLCmdBuffer();
Status = SignalBuffer->status();
if(Status == MTL::CommandBufferStatusError)
{
FMetalCommandList::HandleMetalCommandBufferFailure(SignalBuffer);
}
}
// Remove Completed status check when we remove completion handlers
if (Status != MTL::CommandBufferStatusCompleted)
{
// Skip processing this queue and move on to the next.
Result.Status |= EQueueStatus::Pending;
break;
}
if(Payload->SignalCommandBuffer)
{
delete Payload->SignalCommandBuffer;
}
#if RHI_NEW_GPU_PROFILER
if (!Payload->EventStream.IsEmpty())
{
check(CurrentQueue.Timing);
CurrentQueue.Timing->EventStream.Append(MoveTemp(Payload->EventStream));
}
if (Payload->Timing.IsSet())
{
// Switch the new timing struct into the queue. This redirects timestamp results to separate each frame's work.
CurrentQueue.Timing = Payload->Timing.GetValue();
}
#endif
// Resolve query results
{
for (FMetalRHIRenderQuery* OcclusionQuery : Payload->BatchedObjects.OcclusionQueries)
{
OcclusionQuery->Result = OcclusionQuery->Buffer.GetResult();
}
for (FMetalRHIRenderQuery* TimestampQuery : Payload->BatchedObjects.TimestampQueries)
{
MTL::CommandBuffer* CmdBuffer = TimestampQuery->CommandBuffer->GetMTLCmdBuffer();
// If there are no commands in the command buffer then GPUEndTime() can be zero.
// In that case the GPU start time is not reliable either, so fall back to CPU time.
// Empty command buffers have only been seen at the very end of a frame.
// Convert seconds to microseconds, multiplying before truncating to keep sub-second precision.
TimestampQuery->Result = uint64(CmdBuffer->GPUEndTime() * 1000000.0);
if(TimestampQuery->Result == 0)
{
TimestampQuery->Result = uint64(FPlatformTime::ToMilliseconds64(mach_absolute_time()) * 1000.0);
}
TimestampQuery->Release();
}
#if RHI_NEW_GPU_PROFILER == 0
for(auto& Pair : Payload->BatchedObjects.EventSampleCounters)
{
uint64_t& Start = Pair.Key->StartTime;
uint64_t& End = Pair.Key->EndTime;
for(FMetalCounterSamplePtr Sample: Pair.Value)
{
uint64_t StartTime, EndTime;
Sample->ResolveStageCounters(StartTime, EndTime);
StartTime /= 1000.0;
EndTime /= 1000.0;
Start = Start > 0 ? FMath::Min(StartTime, Start) : StartTime;
End = End > 0 ? FMath::Max(EndTime, End) : EndTime;
}
}
#endif
}
// Signal the CPU events of all sync points associated with this batch.
for (FMetalSyncPointRef& SyncPoint : Payload->SyncPointsToSignal)
{
if (SyncPoint->GraphEvent)
{
SyncPoint->GraphEvent->DispatchSubsequents();
}
}
// We're done with this payload now.
for(FMetalCommandBuffer* CommandBuffer : Payload->CommandBuffersToExecute)
{
CmdBuffersPendingCompletion.Add(CommandBuffer);
}
ProcessPendingCommandBuffers();
// At this point, the current command list has completed on the GPU.
CurrentQueue.PendingInterrupt.Pop();
Result.Status |= EQueueStatus::Processed;
// GPU resources the payload is holding a reference to will be cleaned up here.
// E.g. command list allocators, which get recycled on the parent device.
delete Payload;
}
});
return Result;
}
FMetalPayload::FMetalPayload(FMetalCommandQueue& Queue)
: Queue(Queue)
#if RHI_NEW_GPU_PROFILER
, EventStream(Queue.GetProfilerQueue())
#endif
{}
FMetalPayload::~FMetalPayload()
{
}
void FMetalPayload::PreExecute()
{
if (PreExecuteCallback)
{
PreExecuteCallback(Queue);
}
}