932 lines
25 KiB
C++
932 lines
25 KiB
C++
// Copyright Epic Games, Inc. All Rights Reserved.
|
|
|
|
#include "MetalSubmission.h"
#include "MetalRHIPrivate.h"
#include "MetalDynamicRHI.h"
#include "MetalCommandBuffer.h"
#include "HAL/Runnable.h"
#include "HAL/RunnableThread.h"
#include "IRenderCaptureProvider.h"
#include "MetalProfiler.h"

#include <atomic>
|
|
|
|
// Read-only console variable consulted by InitializeSubmissionPipe() to decide
// whether the dedicated interrupt thread is created. Defaults to 1.
static TAutoConsoleVariable<int32> CVarMetalRHIUseInterruptThread(
	TEXT("MetalRHI.UseInterruptThread"),
	1,
	TEXT("Whether to enable the Metal RHI's interrupt thread.\n")
	TEXT(" 0: No\n")
	TEXT(" 1: Yes\n"),
	ECVF_ReadOnly
);
|
|
|
|
// Read-only console variable consulted by InitializeSubmissionPipe() to decide
// whether the dedicated submission thread is created. Defaults to 1.
static TAutoConsoleVariable<int32> CVarMetalRHIUseSubmissionThread(
	TEXT("MetalRHI.UseSubmissionThread"),
	1,
	TEXT("Whether to enable the Metal RHI's submission thread.\n")
	TEXT(" 0: No\n")
	TEXT(" 1: Yes\n"),
	ECVF_ReadOnly
);
|
|
|
|
// Compile-time switch: when non-zero, InitializeSubmissionPipe() compiles in the code that can create the interrupt thread.
#define METAL_USE_INTERRUPT_THREAD 1
|
|
// Compile-time switch: when non-zero, InitializeSubmissionPipe() compiles in the code that can create the submission thread.
#define METAL_USE_SUBMISSION_THREAD 1
|
|
|
|
class FMetalThread final : private FRunnable
|
|
{
|
|
public:
|
|
typedef FMetalDynamicRHI::FProcessResult(FMetalDynamicRHI::*FQueueFunc)();
|
|
|
|
FMetalThread(TCHAR const* Name, EThreadPriority Priority, FMetalDynamicRHI* RHI, FQueueFunc QueueFunc)
|
|
: RHI(RHI)
|
|
, QueueFunc(QueueFunc)
|
|
, Event(FPlatformProcess::GetSynchEventFromPool(true))
|
|
, Thread(FRunnableThread::Create(this, Name, 0, Priority))
|
|
{}
|
|
|
|
virtual ~FMetalThread()
|
|
{
|
|
bExit = true;
|
|
|
|
Event->Trigger();
|
|
|
|
Thread->WaitForCompletion();
|
|
delete Thread;
|
|
}
|
|
|
|
void Kick() const
|
|
{
|
|
check(Event);
|
|
Event->Trigger();
|
|
}
|
|
|
|
void Join() const
|
|
{
|
|
Thread->WaitForCompletion();
|
|
}
|
|
|
|
private:
|
|
virtual uint32 Run() override
|
|
{
|
|
while (!bExit)
|
|
{
|
|
// Process the queue until no more progress is made
|
|
FMetalDynamicRHI::FProcessResult Result;
|
|
do { Result = (RHI->*QueueFunc)(); }
|
|
while (EnumHasAllFlags(Result.Status, FMetalDynamicRHI::EQueueStatus::Processed));
|
|
|
|
Event->Wait(1);
|
|
Event->Reset();
|
|
}
|
|
|
|
// Drain any remaining work in the queue
|
|
while (EnumHasAllFlags((RHI->*QueueFunc)().Status, FMetalDynamicRHI::EQueueStatus::Pending)) {}
|
|
|
|
return 0;
|
|
}
|
|
|
|
FMetalDynamicRHI* RHI;
|
|
FQueueFunc QueueFunc;
|
|
FEvent* Event;
|
|
bool bExit = false;
|
|
|
|
private:
|
|
FRunnableThread* Thread = nullptr;
|
|
};
|
|
|
|
void FMetalDynamicRHI::InitializeSubmissionPipe()
|
|
{
|
|
if (FPlatformProcess::SupportsMultithreading())
|
|
{
|
|
#if METAL_USE_INTERRUPT_THREAD
|
|
bool bUseInterruptThread = CVarMetalRHIUseInterruptThread.GetValueOnAnyThread() == 1;
|
|
if (bUseInterruptThread)
|
|
{
|
|
InterruptThread = new FMetalThread(TEXT("RHIInterruptThread"), TPri_Highest, this, &FMetalDynamicRHI::ProcessInterruptQueue);
|
|
}
|
|
#endif
|
|
|
|
#if METAL_USE_SUBMISSION_THREAD
|
|
bool bUseSubmissionThread = false;
|
|
switch (CVarMetalRHIUseSubmissionThread.GetValueOnAnyThread())
|
|
{
|
|
case 1: bUseSubmissionThread = true; break;
|
|
}
|
|
|
|
if (bUseSubmissionThread)
|
|
{
|
|
SubmissionThread = new FMetalThread(TEXT("RHISubmissionThread"), TPri_Highest, this, &FMetalDynamicRHI::ProcessSubmissionQueue);
|
|
}
|
|
#endif
|
|
}
|
|
|
|
// Initialize the timing structs in each queue, and the engine GPU profilers
|
|
#if RHI_NEW_GPU_PROFILER
|
|
{
|
|
TArray<FMetalPayload*> Payloads;
|
|
TArray<UE::RHI::GPUProfiler::FQueue> ProfilerQueues;
|
|
|
|
ForEachQueue([&](FMetalCommandQueue& Queue)
|
|
{
|
|
FMetalPayload* Payload = Payloads.Emplace_GetRef(new FMetalPayload(Queue));
|
|
Payload->Timing = CurrentTimingPerQueue.CreateNew(Queue);
|
|
|
|
ProfilerQueues.Add(Queue.GetProfilerQueue());
|
|
});
|
|
|
|
UE::RHI::GPUProfiler::InitializeQueues(ProfilerQueues);
|
|
|
|
SubmitPayloads(MoveTemp(Payloads));
|
|
}
|
|
#endif
|
|
}
|
|
|
|
void FMetalDynamicRHI::ShutdownSubmissionPipe()
|
|
{
|
|
delete SubmissionThread;
|
|
SubmissionThread = nullptr;
|
|
|
|
delete InterruptThread;
|
|
InterruptThread = nullptr;
|
|
|
|
if (EopTask)
|
|
{
|
|
ProcessInterruptQueueUntil(EopTask);
|
|
EopTask = nullptr;
|
|
}
|
|
}
|
|
|
|
// Pool of reusable upload contexts: RHIGetUploadContext() pops from it and RHIFinalizeContext() pushes finalized contexts back.
static TLockFreePointerListUnordered<FMetalRHIUploadContext, PLATFORM_CACHE_LINE_SIZE> MetalUploadContextPool;
|
|
|
|
/**
 * Returns an upload context, reusing one from MetalUploadContextPool when available
 * and allocating a new one for the device otherwise.
 */
IRHIUploadContext* FMetalDynamicRHI::RHIGetUploadContext()
{
	FMetalRHIUploadContext* PooledContext = MetalUploadContextPool.Pop();

	// The pool was empty: create a fresh context.
	FMetalRHIUploadContext* Result = PooledContext ? PooledContext : new FMetalRHIUploadContext(*Device);

	return static_cast<IRHIUploadContext*>(Result);
}
|
|
|
|
/**
 * Finalizes the upload context (if any) and every compute/graphics context in Args,
 * writing one FMetalFinalizedCommands object per context into Output, indexed by the
 * context's pipeline.
 *
 * @param Args    Contexts to finalize; Args.UploadContext may be null.
 * @param Output  Receives a newly allocated command list for each context's pipeline.
 */
void FMetalDynamicRHI::RHIFinalizeContext(FRHIFinalizeContextArgs&& Args, TRHIPipelineArray<IRHIPlatformCommandList*>& Output)
{
	MTL_SCOPED_AUTORELEASE_POOL;

	FMetalFinalizedCommands Commands;

	// Upload work is finalized first, then the upload context goes back to the pool.
	if (FMetalRHIUploadContext* UploadContext = static_cast<FMetalRHIUploadContext*>(Args.UploadContext))
	{
		UploadContext->Finalize(Commands);
		MetalUploadContextPool.Push(UploadContext);
	}

	for (IRHIComputeContext* Context : Args.Contexts)
	{
		FMetalRHICommandContext* MetalContext = static_cast<FMetalRHICommandContext*>(Context);

		// A context that is still inside a render pass is neither finalized nor reset.
		if (!MetalContext->IsInsideRenderPass())
		{
			MetalContext->Finalize(Commands);
			MetalContext->ResetContext();

			// With parallel execute, non-default contexts are recycled through the pool.
			if (GRHISupportsParallelRHIExecute && MetalContext != RHIGetDefaultContext())
			{
				MetalCommandContextPool.Push(MetalContext);
			}
		}

		// Commands is moved from here and then reused by the next iteration.
		Output[Context->GetPipeline()] = new FMetalFinalizedCommands(MoveTemp(Commands));
	}
}
|
|
|
|
/**
 * Finalizes a parallel context into a newly allocated command list, resets the
 * context and returns it to MetalCommandContextPool.
 *
 * @param Context  Context to finalize; treated as an FMetalRHICommandContext.
 * @return         The finalized commands. The caller receives the allocation.
 */
IRHIPlatformCommandList* FMetalDynamicRHI::RHIFinalizeParallelContext(IRHIComputeContext* Context)
{
	FMetalFinalizedCommands* FinalizedCommands = new FMetalFinalizedCommands;
	FMetalRHICommandContext* MetalContext = static_cast<FMetalRHICommandContext*>(Context);

	MetalContext->Finalize(*FinalizedCommands);
	MetalContext->ResetContext();

	// The context is now idle and can be handed out again.
	MetalCommandContextPool.Push(MetalContext);

	return FinalizedCommands;
}
|
|
|
|
/**
 * Submits the platform command lists in Args by viewing them as
 * FMetalFinalizedCommands pointers and forwarding to SubmitCommands().
 */
void FMetalDynamicRHI::RHISubmitCommandLists(FRHISubmitCommandListsArgs&& Args)
{
	// NOTE(review): the reinterpret_cast assumes every entry is an FMetalFinalizedCommands
	// whose address equals that of its platform-command-list interface - confirm.
	FMetalFinalizedCommands** const FirstCommandList = reinterpret_cast<FMetalFinalizedCommands**>(Args.CommandLists.GetData());
	const auto CommandListCount = Args.CommandLists.Num();

	SubmitCommands(MakeArrayView(FirstCommandList, CommandListCount));
}
|
|
|
|
/**
 * Flattens the payloads of every finalized command list into one array, propagating
 * breadcrumb data onto each payload, deletes the command list objects and submits
 * the merged array through SubmitPayloads().
 *
 * @param Commands  Finalized command lists; each one is deleted by this function.
 */
void FMetalDynamicRHI::SubmitCommands(TConstArrayView<FMetalFinalizedCommands*> Commands)
{
	SCOPED_NAMED_EVENT_TEXT("CommandList_Submit", FColor::Magenta);

#if RHI_NEW_GPU_PROFILER

	TArray<FMetalPayload*> MergedPayloads;
	for (FMetalFinalizedCommands* Finalized : Commands)
	{
	#if WITH_RHI_BREADCRUMBS
		// All payloads of one command list share a single reference to its breadcrumb allocators.
		TSharedPtr<FRHIBreadcrumbAllocatorArray> SharedAllocators {};
		if (Finalized->BreadcrumbAllocators.Num())
		{
			SharedAllocators = MakeShared<FRHIBreadcrumbAllocatorArray>(MoveTemp(Finalized->BreadcrumbAllocators));
		}

		for (FMetalPayload* Payload : *Finalized)
		{
			Payload->BreadcrumbRange = Finalized->BreadcrumbRange;
			if (SharedAllocators.IsValid())
			{
				check(!Payload->BreadcrumbAllocators.IsValid());
				Payload->BreadcrumbAllocators = SharedAllocators;
			}
		}
	#endif

		MergedPayloads.Append(MoveTemp(*Finalized));
		delete Finalized;
	}

	SubmitPayloads(MoveTemp(MergedPayloads));

#else

	TArray<FMetalPayload*> MergedPayloads;
	#if WITH_RHI_BREADCRUMBS
	TArray<TSharedPtr<FRHIBreadcrumbAllocator>> PendingAllocators;
	#endif

	for (FMetalFinalizedCommands* Finalized : Commands)
	{
	#if WITH_RHI_BREADCRUMBS
		for (FMetalPayload* Payload : *Finalized)
		{
			Payload->BreadcrumbRange = Finalized->BreadcrumbRange;
		}
	#endif

		MergedPayloads.Append(MoveTemp(static_cast<TArray<FMetalPayload*>&>(*Finalized)));
	#if WITH_RHI_BREADCRUMBS
		PendingAllocators.Append(MoveTemp(Finalized->BreadcrumbAllocators));
	#endif
		delete Finalized;
	}

	SubmitPayloads(MoveTemp(MergedPayloads));

	#if WITH_RHI_BREADCRUMBS
	// Enqueue the breadcrumb allocator references for cleanup once all prior payloads have completed on the GPU.
	// The lambda body is empty: destroying the capture is what drops the references.
	DeferredDelete([Array = MoveTemp(PendingAllocators)]() {});
	#endif

#endif
}
|
|
|
|
/**
 * Queues the given payloads for submission and makes sure the submission queue gets
 * processed: by kicking the submission thread when one exists, or inline on the
 * calling thread otherwise. Finally pumps the interrupt queue once.
 *
 * @param Payloads  Payloads to submit. An empty array only triggers processing.
 */
void FMetalDynamicRHI::SubmitPayloads(TArray<FMetalPayload*>&& Payloads)
{
	const bool bHasPayloads = Payloads.Num() > 0;
	if (bHasPayloads)
	{
		// The heap-allocated array is deleted by ProcessSubmissionQueue() once dequeued.
		PendingPayloadsForSubmission.Enqueue(new TArray<FMetalPayload*>(MoveTemp(Payloads)));
	}

	if (!SubmissionThread)
	{
		// Since we're processing directly on the calling thread, we need to take a scope lock.
		// Multiple engine threads might be calling Submit().
		FScopeLock Lock(&SubmissionCS);

		// Process the submission queue until no further progress is being made.
		while (EnumHasAnyFlags(ProcessSubmissionQueue().Status, EQueueStatus::Processed))
		{
		}
	}
	else
	{
		SubmissionThread->Kick();
	}

	// Use this opportunity to pump the interrupt queue
	ProcessInterruptQueueUntil(nullptr);
}
|
|
|
|
/**
 * Returns the maximum batch size for execution: 1 in debug builds when the debug
 * layer is enabled, otherwise the largest int32 value.
 */
static int32 GetMaxExecuteBatchSize()
{
#if UE_BUILD_DEBUG
	if (GRHIGlobals.IsDebugLayerEnabled)
	{
		return 1;
	}
#endif

	return TNumericLimits<int32>::Max();
}
|
|
|
|
/**
 * Drains PendingPayloadsForSubmission into the per-queue pending lists, then submits
 * every payload whose sync-point waits can all be resolved to fence values. Repeats
 * until a full pass makes no progress.
 *
 * @return Status of the final pass. Pending is set when a queue's head payload waits
 *         on a sync point whose fence value is still unknown.
 */
FMetalDynamicRHI::FProcessResult FMetalDynamicRHI::ProcessSubmissionQueue()
{
	SCOPED_NAMED_EVENT_TEXT("SubmissionQueue_Process", FColor::Turquoise);
	LLM_SCOPE_BYNAME(TEXT("RHIMisc/ProcessSubmissionQueue"));

	FMetalCommandQueue::FPayloadArray PayloadsToHandDown;
	FProcessResult Result;

	// Hands the batched payloads to FlushBatchedPayloads() and records that work was processed.
	auto FlushPayloads = [&PayloadsToHandDown, &Result, DynamicRHI = this]()
	{
		if (PayloadsToHandDown.Num() > 0)
		{
			Result.Status |= EQueueStatus::Processed;
			DynamicRHI->FlushBatchedPayloads(PayloadsToHandDown);
		}
	};

	bool bMadeProgress;
	bool bKickInterruptThread = false;

	do
	{
		bMadeProgress = false;
		Result.Status = EQueueStatus::None;

		// Push all pending payloads into the ordered per-device, per-pipe pending queues
		{
			TArray<FMetalPayload*>* PendingBatch;
			while (PendingPayloadsForSubmission.Dequeue(PendingBatch))
			{
				for (FMetalPayload* PendingPayload : *PendingBatch)
				{
					PendingPayload->Queue.PendingSubmission.Enqueue(PendingPayload);
				}

				// The array was heap-allocated by SubmitPayloads().
				delete PendingBatch;
			}
		}

		//
		// Fence values for FMetalSyncPoint are determined on the submission thread,
		// where each queue has a monotonically incrementing fence value.
		//
		// We might receive work that waits on a sync point which has not yet been submitted
		// to the queue that will signal it, so we need to delay processing of those
		// payloads until the fence value is known.
		//

		// Process all queues (across all devices and adapters) to flush work.
		// Any sync point waits where the fence value is unknown will be left in the
		// appropriate queue, to be processed the next time commands are submitted.
		ForEachQueue([&](FMetalCommandQueue& CurrentQueue)
		{
			while (true)
			{
				TArray<FMetalCommandQueue*, TInlineAllocator<GMetalMaxNumQueues>> QueuesWithPayloads;
				{
					FMetalPayload* HeadPayload = CurrentQueue.PendingSubmission.Peek();
					if (!HeadPayload)
					{
						// Nothing left on this queue.
						return;
					}

					// Accumulate the list of fences to await, and their maximum values
					while (HeadPayload->SyncPointsToWait.Index < HeadPayload->SyncPointsToWait.Num())
					{
						FMetalSyncPointRef& SyncPoint = HeadPayload->SyncPointsToWait[HeadPayload->SyncPointsToWait.Index];
						if (!SyncPoint->ResolvedFence.IsSet())
						{
							// Need to wait on a sync point, but the fence value has not been resolved yet
							// (no other payloads have signaled the sync point yet).

							// Skip processing this queue, and move on to the next. We will retry later when
							// further work is submitted, which may contain the sync point we need.
							Result.Status |= EQueueStatus::Pending;
							return;
						}

						HeadPayload->AddQueueFenceWait(
							SyncPoint->ResolvedFence->Fence,
							SyncPoint->ResolvedFence->Value
						);

						HeadPayload->SyncPointsToWait.Index++;
						bMadeProgress = true;
					}

					// All necessary sync points have been resolved.
					HeadPayload->SyncPointsToWait = {};
					CurrentQueue.PendingSubmission.Pop();
					bMadeProgress = true;

					check(!CurrentQueue.PayloadToSubmit);
					CurrentQueue.PayloadToSubmit = HeadPayload;
					QueuesWithPayloads.Add(&CurrentQueue);
					Result.Status |= EQueueStatus::Processed;
					bKickInterruptThread = true;

					// Move the per-command-buffer query lists onto the queue; FinalizePayload() attaches them to the payload.
					for (FMetalCommandBuffer* CurrentCommandBuffer : HeadPayload->CommandBuffersToExecute)
					{
						CurrentQueue.OcclusionQueries.Append(MoveTemp(CurrentCommandBuffer->OcclusionQueries));
						CurrentQueue.TimestampQueries.Append(MoveTemp(CurrentCommandBuffer->TimestampQueries));
#if RHI_NEW_GPU_PROFILER == 0
						CurrentQueue.EventSampleCounters.Append(MoveTemp(CurrentCommandBuffer->EventSampleCounters));
#endif
					}
				}

				// Queues with work to submit other than the current one (CurrentQueue) are performing barrier operations.
				// Submit this work first, followed by a fence signal + enqueued wait.
				for (FMetalCommandQueue* OtherQueue : QueuesWithPayloads)
				{
					if (OtherQueue != &CurrentQueue)
					{
						uint64 ValueSignaled = OtherQueue->FinalizePayload(true, PayloadsToHandDown);
						CurrentQueue.PayloadToSubmit->AddQueueFenceWait(OtherQueue->GetSignalEvent(), ValueSignaled);
					}
					FlushPayloads();
				}

				// Now submit the original payload
				CurrentQueue.FinalizePayload(false, PayloadsToHandDown);
				FlushPayloads();
			}
		});
	} while (bMadeProgress);

	FlushPayloads();

	if (InterruptThread && bKickInterruptThread)
	{
		InterruptThread->Kick();
	}

	return Result;
}
|
|
|
|
/**
 * Assigns the next completion fence value to the queue's PayloadToSubmit, resolves the
 * sync points it signals, moves the queue's batched queries onto it and appends it to
 * PayloadsToHandDown. PayloadToSubmit is cleared afterwards.
 *
 * @param bRequiresSignal     OR-ed into the payload's bAlwaysSignal flag.
 * @param PayloadsToHandDown  Receives the finalized payload.
 * @return                    The queue's completion fence value after this payload.
 */
uint64 FMetalCommandQueue::FinalizePayload(bool bRequiresSignal, FPayloadArray& PayloadsToHandDown)
{
	TRACE_CPUPROFILER_EVENT_SCOPE(ExecuteCommandList);
	LLM_SCOPE_BYNAME(TEXT("RHIMisc/ExecuteCommandLists"));

	check(PayloadToSubmit && this == &PayloadToSubmit->Queue);
	check(PayloadToSubmit->SyncPointsToWait.Num() == 0);
	check(!PayloadToSubmit->SignalCommandBuffer);

	// Keep the latest fence value in the submitted payload.
	// The interrupt thread uses this to determine when work has completed.
	PayloadToSubmit->CompletionFenceValue = ++SignalEvent.NextCompletionValue;
	PayloadToSubmit->bAlwaysSignal |= bRequiresSignal;

	// Set the fence/value pair into any sync points we need to signal.
	for (FMetalSyncPointRef& SignaledSyncPoint : PayloadToSubmit->SyncPointsToSignal)
	{
		check(!SignaledSyncPoint->ResolvedFence.IsSet());
		SignaledSyncPoint->ResolvedFence.Emplace(SignalEvent, PayloadToSubmit->CompletionFenceValue);
	}

	// Transfer the queries accumulated on the queue to the payload.
	auto& Batched = PayloadToSubmit->BatchedObjects;
	Batched.OcclusionQueries = MoveTemp(OcclusionQueries);
	Batched.TimestampQueries = MoveTemp(TimestampQueries);
#if RHI_NEW_GPU_PROFILER == 0
	Batched.EventSampleCounters = MoveTemp(EventSampleCounters);
#endif

	PayloadsToHandDown.Add(PayloadToSubmit);
	PayloadToSubmit = nullptr;

	return SignalEvent.NextCompletionValue;
}
|
|
|
|
/**
 * Commits the command buffers of every payload in PayloadsToSubmit to its queue, in
 * order, interleaving fence waits, pre-execute callbacks and fence signals where a
 * payload requires them. Each payload is also enqueued on its queue's PendingInterrupt
 * list and is marked submitted at the very end.
 *
 * @param PayloadsToSubmit  Payloads to hand to the GPU; reset to empty on return.
 */
void FMetalDynamicRHI::FlushBatchedPayloads(FMetalCommandQueue::FPayloadArray& PayloadsToSubmit)
{
	// Half-open range [FirstPayload, LastPayload) of payloads whose command buffers have not been committed yet.
	uint32 FirstPayload = 0, LastPayload = 0;

	// Commits one command buffer per queue fence the payload has to wait on.
	auto Wait = [this](FMetalPayload* Payload)
	{
		FMetalCommandQueue& Queue = Payload->Queue;

		// Wait for queue fences
		for (auto& [LocalFence, Value] : Payload->QueueFencesToWait)
		{
			FMetalCommandBuffer* CurrentCommandBuffer = Queue.CreateCommandBuffer();
			CurrentCommandBuffer->GetMTLCmdBuffer()->encodeWait(LocalFence.MetalEvent, Value);
			Queue.CommitCommandBuffer(CurrentCommandBuffer);

			DeferredDelete([CurrentCommandBuffer]() {
				delete CurrentCommandBuffer;
			});
		}
	};

	// Commits the command buffers of all payloads in the pending range, then empties the range.
	auto Flush = [&]()
	{
		if (FirstPayload == LastPayload)
			return;

		FMetalCommandQueue& Queue = PayloadsToSubmit[FirstPayload]->Queue;

#if RHI_NEW_GPU_PROFILER
		// Single CPU timestamp shared by every command buffer flushed in this batch.
		const uint64 Time = FPlatformTime::Cycles64();
#endif

		// Accumulate the command lists from the payload
		for (uint32 Index = FirstPayload; Index < LastPayload; ++Index)
		{
			FMetalPayload* Payload = PayloadsToSubmit[Index];
			check(&Payload->Queue == &Queue);

			for (FMetalCommandBuffer* CommandBuffer : Payload->CommandBuffersToExecute)
			{
#if RHI_NEW_GPU_PROFILER
				CommandBuffer->FlushProfilerEvents(Payload->EventStream, Time);
#endif

				Payload->Queue.CommitCommandBuffer(CommandBuffer);
			}
		}

		FirstPayload = LastPayload;
	};

	// Commits the queue fence signal (when required) and dispatches the payload's submission event.
	auto Signal = [this](FMetalPayload* Payload)
	{
		FMetalCommandQueue& Queue = Payload->Queue;

		// Signal the queue fence
		if (Payload->RequiresQueueFenceSignal())
		{
			check(Queue.GetSignalEvent().LastSignaledValue < Payload->CompletionFenceValue);

			FMetalCommandBuffer* CommandBuffer = Queue.CreateCommandBuffer();

			// Wake the interrupt thread when the signal command buffer completes.
			MTL::HandlerFunction CompletionHandler = [this](MTL::CommandBuffer* CompletedBuffer)
			{
				if(InterruptThread)
				{
					InterruptThread->Kick();
				}
			};

			CommandBuffer->GetMTLCmdBuffer()->addCompletedHandler(CompletionHandler);
			CommandBuffer->GetMTLCmdBuffer()->encodeSignalEvent(Queue.GetSignalEvent().MetalEvent, Payload->CompletionFenceValue);
			Payload->SignalCommandBuffer = CommandBuffer;

			Queue.CommitCommandBuffer(CommandBuffer);
			Queue.GetSignalEvent().LastSignaledValue.store(Payload->CompletionFenceValue, std::memory_order_release);
		}

		// Submission of this payload is completed. Signal the submission event if one was provided.
		if (Payload->SubmissionEvent)
		{
			Payload->SubmissionEvent->DispatchSubsequents();
		}
	};

	FMetalCommandQueue* PrevQueue = nullptr;
	for (FMetalPayload* Payload : PayloadsToSubmit)
	{
		// A batch never spans two queues: flush whatever is pending when the queue changes.
		if (PrevQueue != &Payload->Queue)
		{
			Flush();
			PrevQueue = &Payload->Queue;
		}

		Payload->Queue.PendingInterrupt.Enqueue(Payload);

		if (Payload->HasWaitWork())
		{
			// Earlier payloads must be committed before this payload's waits.
			Flush();
			Wait(Payload);
		}

		if (Payload->HasPreExecuteWork())
		{
			Flush();
			Payload->PreExecute();
		}

		// Include this payload in the pending range.
		LastPayload++;

		if (Payload->HasSignalWork())
		{
			// The payload's own command buffers must be committed before its signal.
			Flush();
			Signal(Payload);
		}
	}

	Flush();

	for (FMetalPayload* Payload : PayloadsToSubmit)
	{
		// Only set this bool to true once we'll never touch the payload again on this thread.
		// This is because the bool hands ownership to the interrupt thread, which might delete the payload.
		Payload->bSubmitted = true;
	}

	PayloadsToSubmit.Reset();
}
|
|
|
|
/**
 * Records that this payload must wait for InFence to reach InValue. If the fence is
 * already in QueueFencesToWait, the existing entry keeps the larger of the two values;
 * otherwise a new entry is appended.
 */
void FMetalPayload::AddQueueFenceWait(FMetalSignalEvent& InFence, uint64 InValue)
{
	for (auto& [KnownFence, KnownValue] : QueueFencesToWait)
	{
		// Fences are matched by object identity.
		if (&KnownFence != &InFence)
		{
			continue;
		}

		KnownValue = FMath::Max(KnownValue, InValue);
		return;
	}

	// First wait on this fence.
	QueueFencesToWait.Add({ InFence, InValue });
}
|
|
|
|
void FMetalSyncPoint::Wait() const
|
|
{
|
|
checkf(GraphEvent, TEXT("This sync point was not created with a CPU event. Cannot wait for completion on the CPU."));
|
|
|
|
if (!GraphEvent->IsComplete())
|
|
{
|
|
// Block the calling thread until the graph event is signaled by the interrupt thread.
|
|
SCOPED_NAMED_EVENT_TEXT("SyncPoint_Wait", FColor::Turquoise);
|
|
FMetalDynamicRHI::Get().ProcessInterruptQueueUntil(GraphEvent);
|
|
}
|
|
|
|
check(GraphEvent->IsComplete());
|
|
}
|
|
|
|
/**
 * Waits for GraphEvent to complete, or pumps the interrupt queue once when GraphEvent is null.
 *
 * With an interrupt thread: kicks it and blocks on the event (a null event is a no-op).
 * Without one: the calling thread processes the interrupt queue itself under InterruptCS.
 *
 * @param GraphEvent  Event to wait for, or nullptr to process until no further progress is made.
 */
void FMetalDynamicRHI::ProcessInterruptQueueUntil(FGraphEvent* GraphEvent)
{
	if (InterruptThread)
	{
		if (GraphEvent && !GraphEvent->IsComplete())
		{
			InterruptThread->Kick();
			GraphEvent->Wait();
		}
		return;
	}

	// Use the current thread to process the interrupt queue until the sync point we're waiting for is signaled.
	// If GraphEvent is nullptr, process the queue until no further progress is made (assuming we can acquire the lock), then return.
	if (GraphEvent && GraphEvent->IsComplete())
	{
		return;
	}

	// If we're waiting for a sync point, accumulate the idle time
	// NOTE(review): the constructor argument is true when there is no event to wait on - presumably an "ignore" flag; confirm.
	UE::Stats::FThreadIdleStats::FScopeIdle IdleScope(GraphEvent == nullptr);

	for (;;)
	{
		if (InterruptCS.TryLock())
		{
			FProcessResult Result;
			do { Result = ProcessInterruptQueue(); }
			// If we have a sync point, keep processing until the sync point is signaled.
			// Otherwise, process until no more progress is being made.
			while (GraphEvent
				? !GraphEvent->IsComplete()
				: EnumHasAllFlags(Result.Status, EQueueStatus::Processed)
			);

			InterruptCS.Unlock();
			break;
		}

		// Failed to get the lock. Another thread is processing the interrupt queue.
		// Without an event there is nothing to wait for; with one, stop once it has completed.
		if (!GraphEvent || GraphEvent->IsComplete())
		{
			break;
		}

		// Try again...
		FPlatformProcess::SleepNoStats(0);
	}
}
|
|
|
|
void FMetalDynamicRHI::ProcessPendingCommandBuffers()
|
|
{
|
|
double SecondsPerCycle = FPlatformTime::GetSecondsPerCycle64();
|
|
bool bContinueProcessing = true;
|
|
|
|
CmdBuffersPendingCompletion.RemoveAll([SecondsPerCycle, &bContinueProcessing](FMetalCommandBuffer* CommandBuffer)
|
|
{
|
|
MTL::CommandBuffer* CompletedBuffer = CommandBuffer->GetMTLCmdBuffer();
|
|
|
|
MTL::CommandBufferStatus Status = CompletedBuffer->status();
|
|
if (Status == MTL::CommandBufferStatusCompleted && bContinueProcessing)
|
|
{
|
|
#if RHI_NEW_GPU_PROFILER
|
|
#if WITH_RHI_BREADCRUMBS
|
|
FMetalBreadcrumbProfiler::GetInstance()->ResolveBreadcrumbTrackerStream(CommandBuffer->BreadcrumbTrackerStream);
|
|
#endif
|
|
|
|
uint64_t* Start = CommandBuffer->BeginWorkTimestamp;
|
|
uint64_t* End = CommandBuffer->EndWorkTimestamp;
|
|
|
|
if(CommandBuffer->CounterSamples.Num())
|
|
{
|
|
for(FMetalCounterSamplePtr Sample : CommandBuffer->CounterSamples)
|
|
{
|
|
uint64_t StartTime, EndTime;
|
|
Sample->ResolveStageCounters(StartTime, EndTime);
|
|
|
|
*Start = *Start > 0 ? FMath::Min(StartTime, *Start) : StartTime;
|
|
*End = *End > 0 ? FMath::Max(EndTime, *End) : EndTime;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
*Start = CompletedBuffer->GPUStartTime() / SecondsPerCycle;
|
|
*End = CompletedBuffer->GPUEndTime() / SecondsPerCycle;
|
|
}
|
|
#else
|
|
FMetalCommandBufferTimer* Timer = CommandBuffer->GetTimer();
|
|
Timer->AddTiming({CompletedBuffer->GPUStartTime(), CompletedBuffer->GPUEndTime()});
|
|
#endif
|
|
|
|
delete CommandBuffer;
|
|
|
|
return true;
|
|
}
|
|
else
|
|
{
|
|
bContinueProcessing = false;
|
|
}
|
|
|
|
return false;
|
|
});
|
|
}
|
|
|
|
/**
 * Retires completed payloads from every queue's PendingInterrupt list, in order.
 *
 * For each submitted payload whose signal command buffer (if any) has completed, this
 * resolves query results, dispatches the CPU events of the sync points it signals,
 * hands its command buffers to ProcessPendingCommandBuffers() and deletes the payload.
 * Processing of a queue stops at the first payload that is not yet submitted or whose
 * signal command buffer has not completed.
 *
 * @return Processed is set when at least one payload was retired; Pending is set when
 *         a queue is blocked on an incomplete signal command buffer.
 */
FMetalDynamicRHI::FProcessResult FMetalDynamicRHI::ProcessInterruptQueue()
{
	SCOPED_NAMED_EVENT_TEXT("InterruptQueue_Process", FColor::Yellow);
	LLM_SCOPE_BYNAME(TEXT("RHIMisc/ProcessInterruptQueue"));

	// Timer that clamps each tick to prevent false positive GPU timeouts
	// when a debugger is attached and the process is broken.
	// NOTE(review): Elapsed is accumulated here but not read anywhere in this function.
	struct FTimer
	{
		uint64 Elapsed;
		uint64 Last;

		FTimer()
			: Elapsed(0)
			, Last(FPlatformTime::Cycles64())
		{}

		void Tick()
		{
			static const uint64 MaxDeltaCycles = uint64(1.0 / FPlatformTime::GetSecondsPerCycle64()); // 1 second
			uint64 Current = FPlatformTime::Cycles64();
			Elapsed += FMath::Min(MaxDeltaCycles, Current - Last);
			Last = Current;
		}
	} static Timer;

	Timer.Tick();

	FProcessResult Result;
	ForEachQueue([&](FMetalCommandQueue& CurrentQueue)
	{
		while (FMetalPayload* Payload = CurrentQueue.PendingInterrupt.Peek())
		{
			// The submission side still owns the payload until bSubmitted is set.
			if (!Payload->bSubmitted)
				break;

			// Check for GPU completion
			FMetalSignalEvent& CurrentEvent = CurrentQueue.GetSignalEvent();

			// NOTE(review): the loaded value is not used below; the acquire load is kept as in the original.
			uint64 LastSignaledFenceValue = CurrentEvent.LastSignaledValue.load(std::memory_order_acquire);

			// Handle command buffer errors
			for(FMetalCommandBuffer* CommandBuffer : Payload->CommandBuffersToExecute)
			{
				MTL::CommandBuffer* CompletedBuffer = CommandBuffer->GetMTLCmdBuffer();
				if (CompletedBuffer->status() == MTL::CommandBufferStatusError)
				{
					FMetalCommandList::HandleMetalCommandBufferFailure(CompletedBuffer);
				}
			}

			// Payloads without a signal command buffer are treated as completed.
			MTL::CommandBufferStatus Status = MTL::CommandBufferStatusCompleted;

			if(Payload->SignalCommandBuffer)
			{
				MTL::CommandBuffer* SignalBuffer = Payload->SignalCommandBuffer->GetMTLCmdBuffer();
				Status = SignalBuffer->status();
				if(Status == MTL::CommandBufferStatusError)
				{
					FMetalCommandList::HandleMetalCommandBufferFailure(SignalBuffer);
				}
			}

			// Remove Completed status check when we remove completion handlers
			if (Status != MTL::CommandBufferStatusCompleted)
			{
				// Skip processing this queue and move on to the next.
				Result.Status |= EQueueStatus::Pending;
				break;
			}

			if(Payload->SignalCommandBuffer)
			{
				delete Payload->SignalCommandBuffer;
			}

#if RHI_NEW_GPU_PROFILER
			if (!Payload->EventStream.IsEmpty())
			{
				check(CurrentQueue.Timing);
				CurrentQueue.Timing->EventStream.Append(MoveTemp(Payload->EventStream));
			}

			if (Payload->Timing.IsSet())
			{
				// Switch the new timing struct into the queue. This redirects timestamp results to separate each frame's work.
				CurrentQueue.Timing = Payload->Timing.GetValue();
			}
#endif
			// Resolve query results
			{
				for (FMetalRHIRenderQuery* OcclusionQuery : Payload->BatchedObjects.OcclusionQueries)
				{
					OcclusionQuery->Result = OcclusionQuery->Buffer.GetResult();
				}

				for (FMetalRHIRenderQuery* TimestampQuery : Payload->BatchedObjects.TimestampQueries)
				{
					MTL::CommandBuffer* CmdBuffer = TimestampQuery->CommandBuffer->GetMTLCmdBuffer();

					// If there are no commands in the command buffer then this can be zero
					// In this case GPU start time is also not correct - we need to fall back standard behaviour
					// Only seen empty command buffers at the very end of a frame

					// Convert seconds to microseconds.
					// GPUEndTime() is a floating-point number of seconds, so scale BEFORE converting
					// to an integer; converting first would discard everything below one second.
					TimestampQuery->Result = uint64(CmdBuffer->GPUEndTime() * 1000000.0);

					if(TimestampQuery->Result == 0)
					{
						TimestampQuery->Result = (FPlatformTime::ToMilliseconds64(mach_absolute_time()) * 1000.0);
					}

					TimestampQuery->Release();
				}

#if RHI_NEW_GPU_PROFILER == 0
				for(auto& Pair : Payload->BatchedObjects.EventSampleCounters)
				{
					uint64_t& Start = Pair.Key->StartTime;
					uint64_t& End = Pair.Key->EndTime;

					// Widen [Start, End] to cover every sample; zero means "not set yet".
					for(FMetalCounterSamplePtr Sample: Pair.Value)
					{
						uint64_t StartTime, EndTime;
						Sample->ResolveStageCounters(StartTime, EndTime);

						StartTime /= 1000.0;
						EndTime /= 1000.0;

						Start = Start > 0 ? FMath::Min(StartTime, Start) : StartTime;
						End = End > 0 ? FMath::Max(EndTime, End) : EndTime;
					}
				}
#endif
			}

			// Signal the CPU events of all sync points associated with this batch.
			for (FMetalSyncPointRef& SyncPoint : Payload->SyncPointsToSignal)
			{
				if (SyncPoint->GraphEvent)
				{
					SyncPoint->GraphEvent->DispatchSubsequents();
				}
			}

			// We're done with this payload now.
			// Its command buffers are deleted by ProcessPendingCommandBuffers() once each reports completion.
			for(FMetalCommandBuffer* CommandBuffer : Payload->CommandBuffersToExecute)
			{
				CmdBuffersPendingCompletion.Add(CommandBuffer);
			}

			ProcessPendingCommandBuffers();

			// At this point, the current command list has completed on the GPU.
			CurrentQueue.PendingInterrupt.Pop();
			Result.Status |= EQueueStatus::Processed;

			// GPU resources the payload is holding a reference to will be cleaned up here.
			// E.g. command list allocators, which get recycled on the parent device.
			delete Payload;
		}
	});

	return Result;
}
|
|
|
|
/**
 * Binds the payload to the command queue it will be submitted on. With the new GPU
 * profiler compiled in, the event stream is constructed from that queue's profiler queue.
 */
FMetalPayload::FMetalPayload(FMetalCommandQueue& Queue)
	: Queue(Queue)
#if RHI_NEW_GPU_PROFILER
	, EventStream(Queue.GetProfilerQueue())
#endif
{
}
|
|
|
|
/** Nothing to release explicitly; members clean up through their own destructors. */
FMetalPayload::~FMetalPayload() = default;
|
|
|
|
void FMetalPayload::PreExecute()
|
|
{
|
|
if (PreExecuteCallback)
|
|
{
|
|
PreExecuteCallback(Queue);
|
|
}
|
|
}
|