Files
UnrealEngine/Engine/Source/Runtime/D3D12RHI/Private/D3D12CommandContext.cpp
2025-05-18 13:04:45 +08:00

1272 lines
39 KiB
C++

// Copyright Epic Games, Inc. All Rights Reserved.
/*=============================================================================
D3D12CommandContext.cpp: RHI Command Context implementation.
=============================================================================*/
#include "D3D12CommandContext.h"
#include "D3D12RHIPrivate.h"
#include "D3D12AmdExtensions.h"
#include "D3D12RayTracing.h"
// Threshold of enqueued commands (draws, dispatches, copies, ...) after which the current
// command list is split and flushed to the GPU. Guards against TDRs caused by overlong lists.
// Consumed by FD3D12ContextCommon::ConditionalSplitCommandList(). <= 0 disables splitting.
int32 GD3D12MaxCommandsPerCommandList = 10000;
static FAutoConsoleVariableRef CVarMaxCommandsPerCommandList(
TEXT("D3D12.MaxCommandsPerCommandList"),
GD3D12MaxCommandsPerCommandList,
TEXT("Flush command list to GPU after certain amount of enqueued commands (draw, dispatch, copy, ...) (default value 10000)"),
ECVF_RenderThreadSafe
);
// We don't yet have a way to auto-detect that the Radeon Developer Panel is running
// with profiling enabled, so for now, we have to manually toggle this console var.
// It needs to be set before device creation, so it's read only.
int32 GEmitRgpFrameMarkers = 0;
static FAutoConsoleVariableRef CVarEmitRgpFrameMarkers(
TEXT("D3D12.EmitRgpFrameMarkers"),
GEmitRgpFrameMarkers,
TEXT("Enables/Disables frame markers for AMD's RGP tool."),
ECVF_ReadOnly | ECVF_RenderThreadSafe
);
// jhoerner TODO 10/4/2022: This setting is a hack to improve performance by reverting cross GPU transfer synchronization behavior to
// what it was in 5.0, at a cost in validation correctness (D3D debug errors related to using a cross GPU transferred resource in an
// incorrect transition state, or when possibly still being written). In practice, these errors haven't caused artifacts or stability
// issues, but if you run into an artifact suspected to be related to a cross GPU transfer, or want to run with validation for
// debugging, you can disable the hack. A future refactor in 5.2 will clean this up and provide validation correctness without any
// performance loss.
//
// When true, RHITransferResources() skips the lockstep pre-copy synchronization (see its bLockstep logic).
bool GD3D12UnsafeCrossGPUTransfers = true;
static FAutoConsoleVariableRef CVarD3D12UnsafeCrossGPUTransfers(
TEXT("D3D12.UnsafeCrossGPUTransfers"),
GD3D12UnsafeCrossGPUTransfers,
TEXT("Disables cross GPU synchronization correctness, for a gain in performance (Default: true)."),
ECVF_RenderThreadSafe
);
// Base constructor shared by single-GPU command contexts and the multi-GPU redirector.
// PhysicalGPUMask starts identical to GPUMask; the redirector may widen it later via
// SetPhysicalGPUMask().
FD3D12CommandContextBase::FD3D12CommandContextBase(class FD3D12Adapter* InParentAdapter, FRHIGPUMask InGPUMask)
: FD3D12AdapterChild(InParentAdapter)
, GPUMask(InGPUMask)
, PhysicalGPUMask(InGPUMask)
{
}
// Computes the set of D3D12 resource states that are legal on command lists of the given
// queue type. D3D12 restricts which states each list type may use; since it is easier to
// enumerate what is forbidden, we build the disallowed mask per queue type and invert it.
// For reference on allowed/disallowed states, see:
// https://microsoft.github.io/DirectX-Specs/d3d/CPUEfficiency.html#state-support-by-command-list-type
static D3D12_RESOURCE_STATES GetValidResourceStates(ED3D12QueueType CommandListType)
{
	// Direct (graphics) queues may use every state.
	const D3D12_RESOURCE_STATES DisallowedDirectStates =
		static_cast<D3D12_RESOURCE_STATES>(0);

	// Compute queues additionally forbid all graphics-pipeline-specific states.
	const D3D12_RESOURCE_STATES DisallowedComputeStates =
		DisallowedDirectStates |
		D3D12_RESOURCE_STATE_VERTEX_AND_CONSTANT_BUFFER |
		D3D12_RESOURCE_STATE_INDEX_BUFFER |
		D3D12_RESOURCE_STATE_RENDER_TARGET |
		D3D12_RESOURCE_STATE_DEPTH_WRITE |
		D3D12_RESOURCE_STATE_DEPTH_READ |
		D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE |
		D3D12_RESOURCE_STATE_STREAM_OUT |
		D3D12_RESOURCE_STATE_INDIRECT_ARGUMENT |
		D3D12_RESOURCE_STATE_RESOLVE_DEST |
		D3D12_RESOURCE_STATE_RESOLVE_SOURCE;

	// Copy queues further forbid UAV and non-pixel shader-resource states.
	const D3D12_RESOURCE_STATES DisallowedCopyStates =
		DisallowedComputeStates |
		D3D12_RESOURCE_STATE_UNORDERED_ACCESS |
		D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE;

	switch (CommandListType)
	{
	case ED3D12QueueType::Copy:
		return ~DisallowedCopyStates;

	case ED3D12QueueType::Async:
		return ~DisallowedComputeStates;

	default:
		return ~DisallowedDirectStates;
	}
}
// Static state bits used elsewhere to skip fast-clear-eliminate work; defaults to zero and is
// (re)reset in the constructor below. NOTE(review): its consumers are outside this file —
// the exact semantics are defined there.
D3D12_RESOURCE_STATES FD3D12CommandContext::SkipFastClearEliminateState = D3D12_RESOURCE_STATES(0);
// Constructs a command context bound to a single device/GPU.
// ValidResourceStates caches the per-queue-type legal state mask (see GetValidResourceStates).
// StageConstantBuffers gets one FD3D12ConstantBuffer per shader stage slot (6 entries here —
// presumably one per standard shader frequency; confirm against the array's declaration).
FD3D12CommandContext::FD3D12CommandContext(FD3D12Device* InParent, ED3D12QueueType QueueType, bool InIsDefaultContext)
: FD3D12ContextCommon(InParent, QueueType, InIsDefaultContext)
, FD3D12CommandContextBase(InParent->GetParentAdapter(), InParent->GetGPUMask())
, FD3D12DeviceChild(InParent)
, ConstantsAllocator(InParent, InParent->GetGPUMask())
, StateCache(*this, InParent->GetGPUMask())
, ValidResourceStates(GetValidResourceStates(QueueType))
, StageConstantBuffers{
FD3D12ConstantBuffer(InParent, ConstantsAllocator),
FD3D12ConstantBuffer(InParent, ConstantsAllocator),
FD3D12ConstantBuffer(InParent, ConstantsAllocator),
FD3D12ConstantBuffer(InParent, ConstantsAllocator),
FD3D12ConstantBuffer(InParent, ConstantsAllocator),
FD3D12ConstantBuffer(InParent, ConstantsAllocator),
}
{
SkipFastClearEliminateState = D3D12_RESOURCE_STATES(0);
// Reserve one entry per registered static uniform buffer slot, zero-initialized.
StaticUniformBuffers.AddZeroed(FUniformBufferStaticSlotRegistry::Get().GetSlotCount());
ClearState();
}
// Destructor: clears all cached state (bound/static uniform buffers, state cache) so that
// referenced resources are released.
FD3D12CommandContext::~FD3D12CommandContext()
{
ClearState();
}
// Records a breadcrumb marker write into the current command list using
// ID3D12GraphicsCommandList2::WriteBufferImmediate. Does nothing when the
// command list does not expose the ICommandList2 interface.
void FD3D12ContextCommon::WriteMarker(D3D12_GPU_VIRTUAL_ADDRESS Address, uint32 Value, EMarkerType Type)
{
	if (GraphicsCommandList2())
	{
		// MARKER_IN writes happen before prior work completes; MARKER_OUT after.
		D3D12_WRITEBUFFERIMMEDIATE_MODE Mode;
		if (Type == EMarkerType::In)
		{
			Mode = D3D12_WRITEBUFFERIMMEDIATE_MODE_MARKER_IN;
		}
		else
		{
			Mode = D3D12_WRITEBUFFERIMMEDIATE_MODE_MARKER_OUT;
		}

		D3D12_WRITEBUFFERIMMEDIATE_PARAMETER Parameter;
		Parameter.Dest = Address;
		Parameter.Value = Value;

		GraphicsCommandList2()->WriteBufferImmediate(1, &Parameter, &Mode);
	}
}
// Binds the per-queue diagnostic buffer (GPU crash breadcrumbs) as a root UAV, if the root
// signature reserved a slot for it. A negative slot means the feature is disabled for this
// root signature and there is nothing to bind.
void FD3D12ContextCommon::BindDiagnosticBuffer(FD3D12RootSignature const* RootSignature, ED3D12PipelineType PipelineType)
{
	const int8 Slot = RootSignature->GetDiagnosticBufferSlot();
	if (Slot >= 0)
	{
		FD3D12DiagnosticBuffer* DiagBuffer = Device->GetQueue(QueueType).DiagnosticBuffer.Get();
		if (DiagBuffer != nullptr)
		{
			const D3D12_GPU_VIRTUAL_ADDRESS DataAddress = DiagBuffer->GetGPUQueueData();

			if (PipelineType == ED3D12PipelineType::Compute)
			{
				GraphicsCommandList()->SetComputeRootUnorderedAccessView(Slot, DataAddress);
			}
			else
			{
				// Graphics is the only other valid pipeline type; anything else is a programming error
				// (but we still fall back to the graphics binding, matching the original fallthrough).
				if (PipelineType != ED3D12PipelineType::Graphics)
				{
					checkNoEntry();
				}
				GraphicsCommandList()->SetGraphicsRootUnorderedAccessView(Slot, DataAddress);
			}
		}
	}
}
#if WITH_RHI_BREADCRUMBS
// Opens a GPU breadcrumb scope: writes the crash-diagnostic "in" marker, notifies vendor
// crash-dump hooks (Aftermath / Intel), emits platform profiler markers (AGS/RGP, PIX),
// and records a top-of-pipe profiler timestamp. RHIEndBreadcrumbGPU must mirror this in
// reverse order.
void FD3D12CommandContext::RHIBeginBreadcrumbGPU(FRHIBreadcrumbNode* Breadcrumb)
{
// Crash-diagnostic marker first, so the breadcrumb ID is visible even if later work faults.
FD3D12DiagnosticBuffer* DiagBuffer = Device->GetQueue(QueueType).DiagnosticBuffer.Get();
if (DiagBuffer && UE::RHI::UseGPUCrashBreadcrumbs())
{
D3D12_GPU_VIRTUAL_ADDRESS Marker = DiagBuffer->GetGPUQueueMarkerIn();
WriteMarker(Marker, Breadcrumb->ID, EMarkerType::In);
}
#if NV_AFTERMATH
UE::RHICore::Nvidia::Aftermath::D3D12::BeginBreadcrumb(AftermathHandle(), Breadcrumb);
#endif
#if INTEL_GPU_CRASH_DUMPS
UE::RHICore::Intel::GPUCrashDumps::D3D12::BeginBreadcrumb(GraphicsCommandList().Get(), Breadcrumb);
#endif
// Lazily format the breadcrumb name once, only if some consumer below actually needs it.
const TCHAR* NameStr = nullptr;
FRHIBreadcrumb::FBuffer Buffer;
auto GetNameStr = [&]()
{
if (!NameStr)
{
NameStr = Breadcrumb->GetTCHAR(Buffer);
}
return NameStr;
};
// Only emit formatted strings to platform APIs when requested.
if (ShouldEmitBreadcrumbs())
{
#if WITH_AMD_AGS
if (AGSContext* const AmdAgsContext = FD3D12DynamicRHI::GetD3DRHI()->GetAmdAgsContext())
{
if (GEmitRgpFrameMarkers)
{
agsDriverExtensionsDX12_PushMarker(AmdAgsContext, GraphicsCommandList().Get(), TCHAR_TO_ANSI(GetNameStr()));
}
}
#endif
#if USE_PIX
if (FD3D12DynamicRHI::GetD3DRHI()->IsPixEventEnabled())
{
PIXBeginEvent(GraphicsCommandList().Get(), PIX_COLOR(0xff, 0xff, 0xff), TEXT("%s"), GetNameStr());
}
#endif
}
#if RHI_NEW_GPU_PROFILER
{
// New profiler path: record a begin event with a top-of-pipe timestamp query.
FlushProfilerStats();
auto& Event = GetCommandList().EmplaceProfilerEvent<UE::RHI::GPUProfiler::FEvent::FBeginBreadcrumb>(Breadcrumb);
FD3D12QueryLocation TimestampQuery = AllocateQuery(ED3D12QueryType::ProfilerTimestampTOP, &Event.GPUTimestampTOP);
EndQuery(TimestampQuery);
}
#else
// Legacy profiler path: only the default graphics context pushes events.
if (IsDefaultContext() && !IsAsyncComputeContext())
{
FD3D12GPUProfiler& GPUProfiler = GetParentDevice()->GetGPUProfiler();
if (GPUProfiler.IsProfilingGPU())
{
GPUProfiler.PushEvent(GetNameStr(), FColor::White);
}
}
#endif // (RHI_NEW_GPU_PROFILER == 0)
}
// Closes a GPU breadcrumb scope. Order is the exact reverse of RHIBeginBreadcrumbGPU:
// profiler timestamp first, then PIX/AGS pops, then vendor crash-dump hooks, and finally
// the crash-diagnostic "out" marker.
void FD3D12CommandContext::RHIEndBreadcrumbGPU(FRHIBreadcrumbNode* Breadcrumb)
{
#if RHI_NEW_GPU_PROFILER
{
// New profiler path: record an end event with a bottom-of-pipe timestamp query.
FlushProfilerStats();
auto& Event = GetCommandList().EmplaceProfilerEvent<UE::RHI::GPUProfiler::FEvent::FEndBreadcrumb>(Breadcrumb);
FD3D12QueryLocation TimestampQuery = AllocateQuery(ED3D12QueryType::ProfilerTimestampBOP, &Event.GPUTimestampBOP);
EndQuery(TimestampQuery);
}
#else
// Legacy profiler path: only the default graphics context pops events.
if (IsDefaultContext() && !IsAsyncComputeContext())
{
FD3D12GPUProfiler& GPUProfiler = GetParentDevice()->GetGPUProfiler();
if (GPUProfiler.IsProfilingGPU())
{
GPUProfiler.PopEvent();
}
}
#endif // (RHI_NEW_GPU_PROFILER == 0)
// Only emit formatted strings to platform APIs when requested.
if (ShouldEmitBreadcrumbs())
{
#if USE_PIX
if (FD3D12DynamicRHI::GetD3DRHI()->IsPixEventEnabled())
{
PIXEndEvent(GraphicsCommandList().Get());
}
#endif
#if WITH_AMD_AGS
if (AGSContext* const AmdAgsContext = FD3D12DynamicRHI::GetD3DRHI()->GetAmdAgsContext())
{
if (GEmitRgpFrameMarkers)
{
agsDriverExtensionsDX12_PopMarker(AmdAgsContext, GraphicsCommandList().Get());
}
}
#endif
}
#if NV_AFTERMATH
UE::RHICore::Nvidia::Aftermath::D3D12::EndBreadcrumb(AftermathHandle(), Breadcrumb);
#endif
#if INTEL_GPU_CRASH_DUMPS
UE::RHICore::Intel::GPUCrashDumps::D3D12::EndBreadcrumb(GraphicsCommandList().Get(), Breadcrumb);
#endif
// Crash-diagnostic marker last, bracketing all of the scope's GPU work.
FD3D12DiagnosticBuffer* DiagBuffer = Device->GetQueue(QueueType).DiagnosticBuffer.Get();
if (DiagBuffer && UE::RHI::UseGPUCrashBreadcrumbs())
{
D3D12_GPU_VIRTUAL_ADDRESS Marker = DiagBuffer->GetGPUQueueMarkerOut();
WriteMarker(Marker, Breadcrumb->ID, EMarkerType::Out);
}
}
#endif // WITH_RHI_BREADCRUMBS
// Constructs the queue-type-agnostic context core, creating one query allocator per
// D3D12 query type (timestamps, occlusion, pipeline statistics).
FD3D12ContextCommon::FD3D12ContextCommon(FD3D12Device* Device, ED3D12QueueType QueueType, bool bIsDefaultContext)
: Device(Device)
, QueueType(QueueType)
, bIsDefaultContext(bIsDefaultContext)
, TimestampQueries(Device, QueueType, D3D12_QUERY_TYPE_TIMESTAMP)
, OcclusionQueries(Device, QueueType, D3D12_QUERY_TYPE_OCCLUSION)
, PipelineStatsQueries(Device, QueueType, D3D12_QUERY_TYPE_PIPELINE_STATISTICS)
{
}
// Enqueues a GPU wait on the given sync point. Commands already recorded on this context
// must execute before the wait phase of a later payload, so the open command list (if any)
// is closed first.
void FD3D12ContextCommon::WaitSyncPoint(FD3D12SyncPoint* SyncPoint)
{
	if (IsPendingCommands())
	{
		CloseCommandList();
	}

	FD3D12Payload* WaitPayload = GetPayload(EPhase::Wait);
	WaitPayload->SyncPointsToWait.Add(SyncPoint);
}
// Enqueues a GPU signal of the given sync point, ordered after all commands recorded so far
// on this context (the open command list, if any, is closed first).
void FD3D12ContextCommon::SignalSyncPoint(FD3D12SyncPoint* SyncPoint)
{
	if (IsPendingCommands())
	{
		CloseCommandList();
	}

	FD3D12Payload* SignalPayload = GetPayload(EPhase::Signal);
	SignalPayload->SyncPointsToSignal.Add(SyncPoint);
}
// Enqueues a signal of an externally-owned ID3D12Fence to the given value, ordered after all
// commands recorded so far on this context.
void FD3D12ContextCommon::SignalManualFence(ID3D12Fence* Fence, uint64 Value)
{
	if (IsPendingCommands())
	{
		CloseCommandList();
	}

	FD3D12Payload* SignalPayload = GetPayload(EPhase::Signal);
	SignalPayload->ManualFencesToSignal.Emplace(Fence, Value);
}
// Enqueues a GPU wait until an externally-owned ID3D12Fence reaches the given value.
// Commands already recorded are flushed into an earlier payload first.
void FD3D12ContextCommon::WaitManualFence(ID3D12Fence* Fence, uint64 Value)
{
	if (IsPendingCommands())
	{
		CloseCommandList();
	}

	FD3D12Payload* WaitPayload = GetPayload(EPhase::Wait);
	WaitPayload->ManualFencesToWait.Emplace(Fence, Value);
}
// Allocates a query slot of the requested type from the matching per-type allocator.
// Target is where the resolved result will eventually be written. Unknown types assert
// and fall through to the timestamp allocator.
FD3D12QueryLocation FD3D12ContextCommon::AllocateQuery(ED3D12QueryType Type, void* Target)
{
switch (Type)
{
default:
checkNoEntry();
[[fallthrough]];
case ED3D12QueryType::TimestampRaw:
case ED3D12QueryType::TimestampMicroseconds:
#if RHI_NEW_GPU_PROFILER
case ED3D12QueryType::ProfilerTimestampTOP:
case ED3D12QueryType::ProfilerTimestampBOP:
#endif
return TimestampQueries.Allocate(Type, Target);
case ED3D12QueryType::Occlusion:
return OcclusionQueries.Allocate(Type, Target);
case ED3D12QueryType::PipelineStats:
return PipelineStatsQueries.Allocate(Type, Target);
}
}
// Allocates and immediately ends a timestamp query at the current point in the command
// stream. Units selects whether the result resolves to raw ticks or microseconds; the
// resolved value is written to Target. Returns the query's location.
FD3D12QueryLocation FD3D12ContextCommon::InsertTimestamp(ED3D12Units Units, uint64* Target)
{
	ED3D12QueryType Type;
	if (Units == ED3D12Units::Raw)
	{
		Type = ED3D12QueryType::TimestampRaw;
	}
	else
	{
		// Microseconds is the only other supported unit; anything else is a programming
		// error (but resolves as microseconds, matching the original switch fallthrough).
		if (Units != ED3D12Units::Microseconds)
		{
			checkNoEntry();
		}
		Type = ED3D12QueryType::TimestampMicroseconds;
	}

	FD3D12QueryLocation Location = AllocateQuery(Type, Target);
	EndQuery(Location);
	return Location;
}
// Queues a commit-size update for a reserved (tiled) buffer. The update runs in the payload's
// UpdateReservedResources phase, so any currently recorded commands are closed into an earlier
// payload first to preserve ordering.
void FD3D12ContextCommon::SetReservedBufferCommitSize(FD3D12Buffer* Buffer, uint64 CommitSizeInBytes)
{
	if (IsPendingCommands())
	{
		CloseCommandList();
	}

	FD3D12CommitReservedResourceDesc CommitDesc;
	CommitDesc.Resource = Buffer->GetResource();
	CommitDesc.CommitSizeInBytes = CommitSizeInBytes;

	checkf(CommitDesc.Resource, TEXT("FD3D12CommitReservedResourceDesc::Resource must be set"));

	FD3D12Payload* Payload = GetPayload(EPhase::UpdateReservedResources);
	Payload->ReservedResourcesToCommit.Add(CommitDesc);
}
// Opens a fresh command list on this context, lazily obtaining a command allocator, and
// appends the list to the current Execute-phase payload. Must not be called while a list
// is already open.
void FD3D12ContextCommon::OpenCommandList()
{
LLM_SCOPE_BYNAME(TEXT("RHIMisc/OpenCommandList"));
checkf(!IsOpen(), TEXT("Command list is already open."));
if (CommandAllocator == nullptr)
{
// Obtain a command allocator if the context doesn't already have one.
CommandAllocator = Device->ObtainCommandAllocator(QueueType);
}
// Get a new command list
CommandList = Device->ObtainCommandList(CommandAllocator, &TimestampQueries, &PipelineStatsQueries);
GetPayload(EPhase::Execute)->CommandListsToExecute.Add(CommandList);
// No queries may span a command list boundary.
check(ActiveQueries == 0);
}
// Context-level override: opens the command list, then lets the descriptor cache bind its
// current descriptor heaps on the new list.
void FD3D12CommandContext::OpenCommandList()
{
FD3D12ContextCommon::OpenCommandList();
// Notify the descriptor cache about the new command list
// This will set the descriptor cache's current heaps on the new command list.
StateCache.GetDescriptorCache()->OpenCommandList();
}
// Closes the current command list: flushes pending resource barriers, calls Close() on the
// underlying D3D12 list, and hands the per-type query ranges to the payload for later
// resolution. Requires pending commands and no in-flight (begun but not ended) queries.
void FD3D12ContextCommon::CloseCommandList()
{
checkf(IsPendingCommands(), TEXT("The command list is empty."));
// Do this before we insert the final timestamp to ensure we're timing all the work on the command list.
// If the command list only has barrier work to do, this will open the command list for the first time
FlushResourceBarriers();
checkf(IsOpen(), TEXT("Command list is not open."));
checkf(Payloads.Num() && CurrentPhase == EPhase::Execute, TEXT("Expected the current payload to be in the execute phase."));
checkf(ActiveQueries == 0, TEXT("All queries must be completed before the command list is closed."));
FD3D12Payload* Payload = GetPayload(EPhase::Execute);
CommandList->Close();
CommandList = nullptr;
// Transfer the query ranges recorded on this list to the payload; the interrupt/submission
// side resolves them once the GPU has executed the list.
TimestampQueries .CloseAndReset(Payload->BatchedObjects.QueryRanges);
OcclusionQueries .CloseAndReset(Payload->BatchedObjects.QueryRanges);
PipelineStatsQueries.CloseAndReset(Payload->BatchedObjects.QueryRanges);
}
// Context-level override: notifies the descriptor cache (and bindless manager) of the close,
// then invalidates cached pipeline state so the next ApplyState re-binds everything.
void FD3D12CommandContext::CloseCommandList()
{
StateCache.GetDescriptorCache()->CloseCommandList();
FD3D12ContextCommon::CloseCommandList();
#if PLATFORM_SUPPORTS_BINDLESS_RENDERING
// Always call the Bindless Manager CloseCommandList, it will determine when it needs to do anything.
GetParentDevice()->GetBindlessDescriptorManager().CloseCommandList(*this);
#endif
// Mark state as dirty now, because ApplyState may be called before OpenCommandList(), and it needs to know that the state has
// become invalid, so it can set it up again (which opens a new command list if necessary).
StateCache.DirtyStateForNewCommandList();
#if RHI_RAYTRACING
// Shader tables are only valid for the lifetime of the command list they were bound on.
RayTracingShaderTables.Empty();
#endif
}
// Finishes recording on this context: closes any open command list, folds the batched
// wait/signal sync points into the payload chain, attaches the command allocator for
// later recycling, and moves the completed payload list to the caller for submission.
void FD3D12ContextCommon::Finalize(TArray<FD3D12Payload*>& OutPayloads)
{
if (IsPendingCommands())
{
CloseCommandList();
}
// Collect the context's batch of sync points to wait/signal
if (BatchedSyncPoints.ToWait.Num())
{
// Waits belong at the very front of the chain: reuse the first payload if one exists.
FD3D12Payload* Payload = Payloads.Num()
? Payloads[0]
: GetPayload(EPhase::Wait);
Payload->SyncPointsToWait.Append(BatchedSyncPoints.ToWait);
BatchedSyncPoints.ToWait.Reset();
}
if (BatchedSyncPoints.ToSignal.Num())
{
GetPayload(EPhase::Signal)->SyncPointsToSignal.Append(BatchedSyncPoints.ToSignal);
BatchedSyncPoints.ToSignal.Reset();
}
// Attach the command allocator and query heaps to the last payload.
// The interrupt thread will release these back to the device object pool.
if (CommandAllocator)
{
GetPayload(EPhase::Signal)->AllocatorsToRelease.Add(CommandAllocator);
CommandAllocator = nullptr;
}
// All query ranges should have been handed off when the last command list closed.
check(!TimestampQueries.HasQueries());
check(!OcclusionQueries.HasQueries());
check(!PipelineStatsQueries.HasQueries());
ContextSyncPoint = nullptr;
// Move the list of payloads out of this context
OutPayloads.Append(MoveTemp(Payloads));
}
// Returns the next frame-fence value that will be signalled on the adapter.
// NOTE(review): the return type is uint32 while fence values are typically 64-bit —
// confirm GetNextFenceToSignal()'s width and that callers tolerate truncation.
uint32 FD3D12CommandContext::GetFrameFenceCounter() const
{
return GetParentDevice()->GetParentAdapter()->GetFrameFence().GetNextFenceToSignal();
}
// Context-level override: lets the bindless manager and (new) profiler flush their
// per-context state before the common finalization moves payloads out.
void FD3D12CommandContext::Finalize(TArray<FD3D12Payload*>& OutPayloads)
{
#if PLATFORM_SUPPORTS_BINDLESS_RENDERING
GetParentDevice()->GetBindlessDescriptorManager().FinalizeContext(*this);
#endif
#if RHI_NEW_GPU_PROFILER
FlushProfilerStats();
#endif
FD3D12ContextCommon::Finalize(OutPayloads);
}
#if PLATFORM_SUPPORTS_BINDLESS_RENDERING
// Returns the descriptor cache's bindless resources heap, ensuring a command list is open
// first so the cache has been set up for recording.
FD3D12DescriptorHeap* FD3D12CommandContext::GetBindlessResourcesHeap()
{
// We require the descriptor cache to be setup correctly before it can have a valid bindless heap.
OpenIfNotAlready();
return StateCache.GetDescriptorCache()->GetBindlessResourcesHeap();
}
#endif
// Allocates the next free slot in the current query heap, obtaining a new heap from the
// device when the current range is exhausted. Returns an empty location when the device
// does not support the requested query type.
FD3D12QueryLocation FD3D12QueryAllocator::Allocate(ED3D12QueryType Type, void* Target)
{
check(Type != ED3D12QueryType::None);
// Allocate a new heap if needed
if (!CurrentRange || CurrentRange->IsFull(CurrentHeap))
{
TRefCountPtr<FD3D12QueryHeap> Heap = Device->ObtainQueryHeap(QueueType, QueryType);
if (!Heap)
{
// Unsupported query type
return {};
}
// CurrentHeap keeps a reference; the map key takes over the local ref via MoveTemp.
// NOTE(review): CurrentRange is a pointer into the Heaps map — presumably element
// addresses are stable across later FindOrAdd calls; verify against TMap's guarantees.
CurrentHeap = Heap;
CurrentRange = &Heaps.FindOrAdd(MoveTemp(Heap));
}
// Hand out the next index in the range (End is the first unused slot).
return FD3D12QueryLocation(
CurrentHeap,
CurrentRange->End++,
Type,
Target
);
}
// Hands all recorded query ranges to OutRanges (for resolution after GPU execution) and
// resets the allocator for the next command list. If the last heap still has free slots,
// it is kept and its range restarted where the previous one ended, so remaining capacity
// is not wasted.
void FD3D12QueryAllocator::CloseAndReset(TMap<TRefCountPtr<FD3D12QueryHeap>, TArray<FD3D12QueryRange>>& OutRanges)
{
if (HasQueries())
{
// Export every heap's used range for later resolve.
for (auto const& Pair : Heaps)
{
OutRanges.FindOrAdd(Pair.Key).Emplace(Pair.Value);
}
if (CurrentRange->IsFull(CurrentHeap))
{
// No space in any heap. Reset the whole array.
Heaps.Reset();
CurrentRange = nullptr;
CurrentHeap = nullptr;
}
else
{
// The last heap still has space. Reuse it for the next batch of command lists.
// The new range starts where the exported one ended.
FD3D12QueryRange LastRange = *CurrentRange;
LastRange.Start = LastRange.End;
Heaps.Reset();
CurrentRange = &Heaps.FindOrAdd(CurrentHeap);
*CurrentRange = LastRange;
}
}
}
// RAII scope that borrows a copy-queue context from the device, optionally making it wait
// on a sync point before the scoped copy work. The destructor signals, finalizes, and
// submits the recorded work.
FD3D12CopyScope::FD3D12CopyScope(FD3D12Device* Device, ED3D12SyncPointType SyncPointType, FD3D12SyncPointRef const& WaitSyncPoint)
: Device(Device)
, SyncPoint(FD3D12SyncPoint::Create(SyncPointType))
, Context(*Device->ObtainContextCopy())
{
if (WaitSyncPoint)
{
Context.BatchedSyncPoints.ToWait.Add(WaitSyncPoint);
}
}
// Signals the scope's sync point after the copy work, returns the borrowed context to the
// device, and submits the recorded payloads. Asserts (in DO_CHECK builds) that the caller
// actually retrieved the sync point, since otherwise nothing can wait on the copy.
FD3D12CopyScope::~FD3D12CopyScope()
{
#if DO_CHECK
checkf(bSyncPointRetrieved, TEXT("The copy sync point must be retrieved before the end of the scope."));
#endif
Context.SignalSyncPoint(SyncPoint);
TArray<FD3D12Payload*> Payloads;
Context.Finalize(Payloads);
// Clear state before returning the context to the pool so no stale bindings leak.
Context.ClearState();
Device->ReleaseContext(&Context);
FD3D12DynamicRHI::GetD3DRHI()->SubmitPayloads(MoveTemp(Payloads));
}
// Returns the sync point that will be signalled when the scoped copy completes.
// Also records (in DO_CHECK builds) that it was retrieved, satisfying the destructor's check.
FD3D12SyncPoint* FD3D12CopyScope::GetSyncPoint() const
{
#if DO_CHECK
bSyncPointRetrieved = true;
#endif
return SyncPoint;
}
// Appends a fresh payload targeting this context's hardware queue. Ownership of the raw
// pointer transfers out via Finalize(); the submission/interrupt side deletes it.
void FD3D12ContextCommon::NewPayload()
{
Payloads.Add(new FD3D12Payload(Device->GetQueue(QueueType)));
}
// Submits everything recorded so far on the (default) context. Optionally blocks the
// calling thread until the GPU completes the work (WaitForCompletion) and/or until the
// submission thread has actually enqueued it (WaitForSubmission).
void FD3D12ContextCommon::FlushCommands(ED3D12FlushFlags FlushFlags)
{
// We should only be flushing the default context
check(IsDefaultContext());
if (IsPendingCommands())
{
CloseCommandList();
}
FD3D12SyncPointRef SyncPoint;
FGraphEventRef SubmissionEvent;
if (EnumHasAnyFlags(FlushFlags, ED3D12FlushFlags::WaitForCompletion))
{
// CPU-visible sync point signalled by the GPU when all flushed work has finished.
SyncPoint = FD3D12SyncPoint::Create(ED3D12SyncPointType::GPUAndCPU);
SignalSyncPoint(SyncPoint);
}
if (EnumHasAnyFlags(FlushFlags, ED3D12FlushFlags::WaitForSubmission))
{
// Graph event fired once the payload has been handed to the hardware queue.
SubmissionEvent = FGraphEvent::CreateGraphEvent();
GetPayload(EPhase::Signal)->SubmissionEvent = SubmissionEvent;
}
{
TArray<FD3D12Payload*> LocalPayloads;
Finalize(LocalPayloads);
FD3D12DynamicRHI::GetD3DRHI()->SubmitPayloads(MoveTemp(LocalPayloads));
}
if (SyncPoint)
{
SyncPoint->Wait();
}
if (SubmissionEvent && !SubmissionEvent->IsComplete())
{
SCOPED_NAMED_EVENT_TEXT("Submission_Wait", FColor::Turquoise);
SubmissionEvent->Wait();
}
}
// Closes the current command list once it exceeds the GD3D12MaxCommandsPerCommandList
// threshold (a new one opens on demand). Splitting is skipped while queries are active,
// since a query may not span command lists.
void FD3D12ContextCommon::ConditionalSplitCommandList()
{
// Start a new command list if the total number of commands exceeds the threshold. Too many commands in a single command list can cause TDRs.
if (IsOpen() && ActiveQueries == 0 && GD3D12MaxCommandsPerCommandList > 0 && CommandList->State.NumCommands > (uint32)GD3D12MaxCommandsPerCommandList)
{
UE_LOG(LogD3D12RHI, Verbose, TEXT("Splitting command lists because too many commands have been enqueued already (%d commands)"), CommandList->State.NumCommands);
CloseCommandList();
}
}
// Resets the context's cached pipeline state and uniform buffer bindings.
// Static uniform buffer slots are only wiped in EClearStateMode::All; other modes
// preserve them across the clear.
void FD3D12CommandContext::ClearState(EClearStateMode Mode)
{
StateCache.ClearState();
bDiscardSharedGraphicsConstants = false;
bDiscardSharedComputeConstants = false;
FMemory::Memzero(BoundUniformBuffers, sizeof(BoundUniformBuffers));
FMemory::Memzero(DirtyUniformBuffers, sizeof(DirtyUniformBuffers));
if (Mode == EClearStateMode::All)
{
FMemory::Memzero(StaticUniformBuffers.GetData(), StaticUniformBuffers.Num() * sizeof(FRHIUniformBuffer*));
}
}
// Purges any cached shader resource view bindings that reference the given resource
// location, across every standard shader frequency, for the parameter types in the mask.
void FD3D12CommandContext::ConditionalClearShaderResource(FD3D12ResourceLocation* Resource, EShaderParameterTypeMask ShaderParameterTypeMask)
{
	check(Resource);

	for (int32 FrequencyIndex = 0; FrequencyIndex < SF_NumStandardFrequencies; ++FrequencyIndex)
	{
		const EShaderFrequency Frequency = static_cast<EShaderFrequency>(FrequencyIndex);
		StateCache.ClearResourceViewCaches(Frequency, Resource, ShaderParameterTypeMask);
	}
}
// Clears cached SRV bindings referencing the resource behind the given UAV.
// Null UAVs are ignored.
void FD3D12CommandContext::ClearShaderResources(FD3D12UnorderedAccessView* UAV, EShaderParameterTypeMask ShaderParameterTypeMask)
{
	if (UAV == nullptr)
	{
		return;
	}
	ConditionalClearShaderResource(UAV->GetResourceLocation(), ShaderParameterTypeMask);
}
// Clears cached SRV bindings referencing the given base shader resource.
// Null resources are ignored.
void FD3D12CommandContext::ClearShaderResources(FD3D12BaseShaderResource* Resource, EShaderParameterTypeMask ShaderParameterTypeMask)
{
	if (Resource == nullptr)
	{
		return;
	}
	ConditionalClearShaderResource(&Resource->ResourceLocation, ShaderParameterTypeMask);
}
// Drops every cached SRV binding on this context, regardless of resource.
void FD3D12CommandContext::ClearAllShaderResources()
{
StateCache.ClearSRVs();
}
// Refreshes video-memory statistics for every chosen adapter (Windows, stats or CSV builds
// only): adapter-wide budget/usage, plus per-device texture/buffer/upload allocator stats
// when STATS is enabled.
void FD3D12DynamicRHI::UpdateMemoryStats()
{
#if PLATFORM_WINDOWS && (STATS || CSV_PROFILER_STATS)
SCOPE_CYCLE_COUNTER(STAT_D3DUpdateVideoMemoryStats);
for (TSharedPtr<FD3D12Adapter> const& Adapter : ChosenAdapters)
{
// Refresh captured memory stats.
const FD3DMemoryStats& MemoryStats = Adapter->CollectMemoryStats();
UpdateD3DMemoryStatsAndCSV(MemoryStats, true);
#if STATS
// Track the worst per-device wastage across the adapter's GPUs.
// NOTE(review): MaxTexAllocWastage is accumulated but only the last device's
// TotalAllocated/TotalUnused reach the SET_MEMORY_STAT calls inside the loop — confirm intended.
uint64 MaxTexAllocWastage = 0;
for (FD3D12Device* Device : Adapter->GetDevices())
{
#if D3D12RHI_SEGREGATED_TEXTURE_ALLOC && D3D12RHI_SEGLIST_ALLOC_TRACK_WASTAGE
uint64 TotalAllocated;
uint64 TotalUnused;
Device->GetTextureAllocator().GetMemoryStats(TotalAllocated, TotalUnused);
MaxTexAllocWastage = FMath::Max(MaxTexAllocWastage, TotalUnused);
SET_MEMORY_STAT(STAT_D3D12TextureAllocatorAllocated, TotalAllocated);
SET_MEMORY_STAT(STAT_D3D12TextureAllocatorUnused, TotalUnused);
#endif
Device->GetDefaultBufferAllocator().UpdateMemoryStats();
Adapter->GetUploadHeapAllocator(Device->GetGPUIndex()).UpdateMemoryStats();
}
#endif // STATS
}
#endif // PLATFORM_WINDOWS && (STATS || CSV_PROFILER_STATS)
}
// Obtains a command context for the requested pipeline. Single-GPU masks return a pooled
// per-device context directly; multi-GPU masks return a heap-allocated redirector that fans
// commands out to one pooled context per GPU.
IRHIComputeContext* FD3D12DynamicRHI::RHIGetCommandContext(ERHIPipeline Pipeline, FRHIGPUMask GPUMask)
{
if (GPUMask.HasSingleIndex())
{
FD3D12Device* Device = GetAdapter().GetDevice(GPUMask.ToIndex());
FD3D12CommandContext* CmdContext;
switch (Pipeline)
{
default: checkNoEntry(); // fallthrough
case ERHIPipeline::Graphics : CmdContext = Device->ObtainContextGraphics(); break;
case ERHIPipeline::AsyncCompute: CmdContext = Device->ObtainContextCompute(); break;
}
check(CmdContext->GetPhysicalGPUMask() == GPUMask);
return CmdContext;
}
else
{
// NOTE(review): the redirector is heap-allocated here; its release path is outside this
// file (presumably when the context is finalized/freed by the RHI).
FD3D12CommandContextRedirector* CmdContextRedirector = new FD3D12CommandContextRedirector(&GetAdapter(), GetD3DCommandQueueType(Pipeline), false);
CmdContextRedirector->SetPhysicalGPUMask(GPUMask);
for (uint32 GPUIndex : GPUMask)
{
FD3D12Device* Device = GetAdapter().GetDevice(GPUIndex);
FD3D12CommandContext* CmdContext;
switch (Pipeline)
{
default: checkNoEntry(); // fallthrough
case ERHIPipeline::Graphics : CmdContext = Device->ObtainContextGraphics(); break;
case ERHIPipeline::AsyncCompute: CmdContext = Device->ObtainContextCompute(); break;
}
CmdContextRedirector->SetPhysicalContext(CmdContext);
}
return CmdContextRedirector;
}
}
// Builds the backend data for an RHI transition in the transition's inline storage:
// records pipeline/flag info, decides whether cross-pipe fencing is needed, creates the
// per-device/per-source-pipe sync points, and deep-copies the transition/aliasing arrays.
void FD3D12DynamicRHI::RHICreateTransition(FRHITransition* Transition, const FRHITransitionCreateInfo& CreateInfo)
{
// Construct the data in-place on the transition instance
FD3D12TransitionData* Data = new (Transition->GetPrivateData<FD3D12TransitionData>()) FD3D12TransitionData;
Data->SrcPipelines = CreateInfo.SrcPipelines;
Data->DstPipelines = CreateInfo.DstPipelines;
Data->CreateFlags = CreateInfo.Flags;
// A fence is needed when the source and destination pipes differ, unless explicitly opted out.
const bool bCrossPipeline = (CreateInfo.SrcPipelines != CreateInfo.DstPipelines) && (!EnumHasAnyFlags(Data->CreateFlags, ERHITransitionCreateFlags::NoFence));
const bool bAsyncToAllPipelines = ((CreateInfo.SrcPipelines == ERHIPipeline::AsyncCompute) && (CreateInfo.DstPipelines == ERHIPipeline::All));
Data->bCrossPipeline = bCrossPipeline;
// In DX12 we cannot perform resource barrier with graphics state on the AsyncCompute pipe
// This check is here to be able to force a crosspipe transition coming from AsyncCompute with graphics states to be split and processed in the both the Async and Graphics pipe
// This case can be removed when using EB on DX12
if (bAsyncToAllPipelines)
{
for (const FRHITransitionInfo& TransitionInfo : CreateInfo.TransitionInfos)
{
if (EnumHasAnyFlags(TransitionInfo.AccessAfter, ERHIAccess::SRVGraphics))
{
Data->bAsyncToAllPipelines = true;
Data->bCrossPipeline = false;
break;
}
}
}
if ((Data->bCrossPipeline) || (Data->bAsyncToAllPipelines))
{
// Create one sync point per device, per source pipe
for (uint32 Index : FRHIGPUMask::All())
{
TRHIPipelineArray<FD3D12SyncPointRef>& DeviceSyncPoints = Data->SyncPoints.Emplace_GetRef();
for (ERHIPipeline Pipeline : MakeFlagsRange(CreateInfo.SrcPipelines))
{
DeviceSyncPoints[Pipeline] = FD3D12SyncPoint::Create(ED3D12SyncPointType::GPUOnly);
}
}
}
Data->TransitionInfos = CreateInfo.TransitionInfos;
Data->AliasingInfos = CreateInfo.AliasingInfos;
// Flatten all aliasing overlaps into one owned array, then repoint each info's view at it,
// so the transition does not dangle on the caller's temporary overlap storage.
uint32 AliasingOverlapCount = 0;
for (const FRHITransientAliasingInfo& AliasingInfo : Data->AliasingInfos)
{
AliasingOverlapCount += AliasingInfo.Overlaps.Num();
}
// Reserve up front: MakeArrayView below relies on AliasingOverlaps never reallocating.
Data->AliasingOverlaps.Reserve(AliasingOverlapCount);
for (FRHITransientAliasingInfo& AliasingInfo : Data->AliasingInfos)
{
const int32 OverlapCount = AliasingInfo.Overlaps.Num();
if (OverlapCount > 0)
{
const int32 OverlapOffset = Data->AliasingOverlaps.Num();
Data->AliasingOverlaps.Append(AliasingInfo.Overlaps.GetData(), OverlapCount);
AliasingInfo.Overlaps = MakeArrayView(&Data->AliasingOverlaps[OverlapOffset], OverlapCount);
}
}
}
// Destroys the backend transition data constructed in-place by RHICreateTransition.
// Explicit destructor call only — the storage itself belongs to the FRHITransition.
void FD3D12DynamicRHI::RHIReleaseTransition(FRHITransition* Transition)
{
// Destruct the transition data
Transition->GetPrivateData<FD3D12TransitionData>()->~FD3D12TransitionData();
}
// Creates a transient resource allocator backed by the adapter's shared transient heap cache.
// Caller owns the returned allocator.
IRHITransientResourceAllocator* FD3D12DynamicRHI::RHICreateTransientResourceAllocator()
{
return new FD3D12TransientResourceHeapAllocator(GetAdapter().GetOrCreateTransientHeapCache());
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////
//
// FD3D12CommandContextRedirector
//
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// Constructs a redirector that fans RHI calls out to one physical context per GPU.
// All physical context slots start empty; they are assigned later via SetPhysicalContext.
FD3D12CommandContextRedirector::FD3D12CommandContextRedirector(class FD3D12Adapter* InParent, ED3D12QueueType QueueType, bool InIsDefaultContext)
	: FD3D12CommandContextBase(InParent, FRHIGPUMask::All())
	, QueueType(QueueType)
	, bIsDefaultContext(InIsDefaultContext)
{
	for (FD3D12CommandContext*& PhysicalContext : PhysicalContexts)
	{
		PhysicalContext = nullptr;
	}
}
#if WITH_MGPU
// Performs cross-GPU resource transfers. Flow:
//   1. Pre-scan params to accumulate source/destination GPU masks and validate that all
//      transfers agree on delayed-fence usage.
//   2. Wait on any caller-provided pre-transfer fences.
//   3. Pre-copy sync: full lockstep when requested (or when GD3D12UnsafeCrossGPUTransfers
//      is disabled), otherwise only for pull-style transfers.
//   4. Record the copies (whole-resource or 2D sub-rect for textures, whole buffers).
//   5. Post-copy sync: lockstep, delayed fences (awaited later via RHITransferResourceWait),
//      or an immediate dest-waits-for-src sync.
// FIX(review): the pre-scan previously also resolved each param's src/dst FD3D12Resource
// pointers into locals that were never used — that dead code has been removed; the
// Texture/Buffer exclusivity checks it carried now live in the copy loop where the
// resources are actually resolved.
void FD3D12CommandContextRedirector::RHITransferResources(TConstArrayView<FTransferResourceParams> Params)
{
	if (Params.Num() == 0)
	{
		return;
	}

	// Signals a sync point on each GPU in SignalMask and, when WaitMask is provided, makes
	// every GPU in WaitMask wait on all of those signals. Returns the sync points so callers
	// can defer the waits (see the bDelayFence path below).
	auto MGPUSync = [this](FRHIGPUMask SignalMask, TOptional<FRHIGPUMask> WaitMask = {})
	{
		FRHIGPUMask CombinedMask = SignalMask;
		if (WaitMask.IsSet())
		{
			CombinedMask |= WaitMask.GetValue();
		}

		// Signal a sync point on each source GPU
		TStaticArray<FD3D12SyncPointRef, MAX_NUM_GPUS> SyncPoints;
		for (uint32 GPUIndex : SignalMask)
		{
			SyncPoints[GPUIndex] = FD3D12SyncPoint::Create(ED3D12SyncPointType::GPUOnly);
			PhysicalContexts[GPUIndex]->SignalSyncPoint(SyncPoints[GPUIndex]);
		}

		// Wait for sync points
		if (WaitMask.IsSet())
		{
			for (uint32 WaitGPUIndex : WaitMask.GetValue())
			{
				for (uint32 SignalGPUIndex : SignalMask)
				{
					PhysicalContexts[WaitGPUIndex]->WaitSyncPoint(SyncPoints[SignalGPUIndex]);
				}
			}
		}

		return SyncPoints;
	};

	// Note that by default it is not empty, but GPU0
	FRHIGPUMask SrcMask, DstMask;
	bool bLockstep = GD3D12UnsafeCrossGPUTransfers == false; // @todo mgpu - fix synchronization
	bool bDelayFence = false;
	{
		bool bFirst = true;
		for (const FTransferResourceParams& Param : Params)
		{
			FD3D12CommandContext* SrcContext = PhysicalContexts[Param.SrcGPUIndex];
			FD3D12CommandContext* DstContext = PhysicalContexts[Param.DestGPUIndex];
			if (!ensure(SrcContext && DstContext))
			{
				continue;
			}

			// @todo mgpu - fix synchronization
			bLockstep |= Param.bLockStepGPUs;

			// If it's the first time we set the mask.
			if (bFirst)
			{
				SrcMask = FRHIGPUMask::FromIndex(Param.SrcGPUIndex);
				DstMask = FRHIGPUMask::FromIndex(Param.DestGPUIndex);
				bDelayFence = Param.DelayedFence != nullptr;
				bFirst = false;
			}
			else
			{
				SrcMask |= FRHIGPUMask::FromIndex(Param.SrcGPUIndex);
				DstMask |= FRHIGPUMask::FromIndex(Param.DestGPUIndex);

				// All transfers in a batch must agree on whether a delayed fence is used.
				check(bDelayFence == (Param.DelayedFence != nullptr));
			}
		}
	}

	// Wait on any pre-transfer fences first
	for (const FTransferResourceParams& Param : Params)
	{
		if (Param.PreTransferFence)
		{
			FTransferResourceFenceData* FenceData = Param.PreTransferFence;
			for (uint32 GPUIndex : FenceData->Mask)
			{
				// Sync points were manually AddRef'd when stored; release after the wait is queued.
				FD3D12SyncPoint* SyncPoint = static_cast<FD3D12SyncPoint*>(FenceData->SyncPoints[GPUIndex]);
				PhysicalContexts[GPUIndex]->WaitSyncPoint(SyncPoint);
				SyncPoint->Release();
			}
			delete FenceData;
		}
	}

	// Pre-copy synchronization
	if (bLockstep)
	{
		// Everyone waits for completion of everyone one else.
		MGPUSync(SrcMask | DstMask, SrcMask | DstMask);
	}
	else
	{
		for (const FTransferResourceParams& Param : Params)
		{
			if (Param.bPullData)
			{
				// Destination GPUs wait for source GPUs
				MGPUSync(SrcMask, DstMask);
				break;
			}
		}
	}

	// Enqueue the copy work
	for (const FTransferResourceParams& Param : Params)
	{
		FD3D12CommandContext* SrcContext = PhysicalContexts[Param.SrcGPUIndex];
		FD3D12CommandContext* DstContext = PhysicalContexts[Param.DestGPUIndex];
		if (!ensure(SrcContext && DstContext))
		{
			continue;
		}

		// The copy is recorded on the destination context when pulling, otherwise on the source.
		FD3D12CommandContext* CopyContext = Param.bPullData ? DstContext : SrcContext;

		if (Param.Texture)
		{
			// A param targets either a texture or a buffer, never both.
			check(Param.Buffer == nullptr);
			FD3D12Texture* SrcTexture = FD3D12CommandContext::RetrieveTexture(Param.Texture, Param.SrcGPUIndex);
			FD3D12Texture* DstTexture = FD3D12CommandContext::RetrieveTexture(Param.Texture, Param.DestGPUIndex);

			// If the texture size is zero (Max.Z == 0, set in the constructor), copy the whole resource
			if (Param.Max.Z == 0)
			{
				CopyContext->GraphicsCommandList()->CopyResource(DstTexture->GetResource()->GetResource(), SrcTexture->GetResource()->GetResource());
			}
			else
			{
				// Must be a 2D texture for this code path
				check(Param.Texture->GetTexture2D() != nullptr);
				ensureMsgf(
					Param.Min.X >= 0 && Param.Min.Y >= 0 && Param.Min.Z >= 0 &&
					Param.Max.X >= 0 && Param.Max.Y >= 0 && Param.Max.Z >= 0,
					TEXT("Invalid rect for texture transfer: %i, %i, %i, %i, %i, %i"), Param.Min.X, Param.Min.Y, Param.Min.Z, Param.Max.X, Param.Max.Y, Param.Max.Z);
				D3D12_BOX Box = { (UINT)Param.Min.X, (UINT)Param.Min.Y, (UINT)Param.Min.Z, (UINT)Param.Max.X, (UINT)Param.Max.Y, (UINT)Param.Max.Z };
				CD3DX12_TEXTURE_COPY_LOCATION SrcLocation(SrcTexture->GetResource()->GetResource(), 0);
				CD3DX12_TEXTURE_COPY_LOCATION DstLocation(DstTexture->GetResource()->GetResource(), 0);
				CopyContext->CopyTextureRegionChecked(&DstLocation, Box.left, Box.top, Box.front, DstTexture->GetFormat(), &SrcLocation, &Box, SrcTexture->GetFormat(), DstTexture->GetName());
			}
		}
		else
		{
			check(Param.Buffer != nullptr);
			FD3D12Resource* SrcResource = FD3D12DynamicRHI::ResourceCast(Param.Buffer.GetReference(), Param.SrcGPUIndex)->GetResource();
			FD3D12Resource* DstResource = FD3D12DynamicRHI::ResourceCast(Param.Buffer.GetReference(), Param.DestGPUIndex)->GetResource();
			CopyContext->GraphicsCommandList()->CopyResource(DstResource->GetResource(), SrcResource->GetResource());
		}
	}

	// Post-copy synchronization
	if (bLockstep)
	{
		// Complete the lockstep by ensuring the GPUs don't start doing something else before the copy completes.
		MGPUSync(SrcMask | DstMask, SrcMask | DstMask);
	}
	else if (bDelayFence)
	{
		auto SyncPoints = MGPUSync(SrcMask | DstMask);
		for (const FTransferResourceParams& Param : Params)
		{
			check(Param.DelayedFence);
			Param.DelayedFence->Mask = SrcMask | DstMask;

			// Copy the sync points into the delayed fence struct. These will be awaited later in RHITransferResourceWait().
			for (int32 Index = 0; Index < SyncPoints.Num(); ++Index)
			{
				FD3D12SyncPointRef& SyncPoint = SyncPoints[Index];
				if (SyncPoint)
				{
					// Manual AddRef; released by the eventual waiter.
					SyncPoint->AddRef();
					Param.DelayedFence->SyncPoints[Index] = SyncPoint.GetReference();
				}
				else
				{
					Param.DelayedFence->SyncPoints[Index] = nullptr;
				}
			}
		}
	}
	else
	{
		// The dest waits for the src to be at this place in the frame before using the data.
		MGPUSync(SrcMask, DstMask);
	}
}
void FD3D12CommandContextRedirector::RHITransferResourceSignal(TConstArrayView<FTransferResourceFenceData*> FenceDatas, FRHIGPUMask SrcGPUMask)
{
	// Signals one GPU-only sync point per source GPU and stores it in the matching
	// fence data (one entry per active GPU in SrcGPUMask, in mask order). The
	// stored sync points are awaited and released later by RHITransferResourceWait().
	check(FenceDatas.Num() == SrcGPUMask.GetNumActive());

	int32 FenceSlot = 0;
	for (uint32 GPUIndex : SrcGPUMask)
	{
		FD3D12SyncPointRef NewSyncPoint = FD3D12SyncPoint::Create(ED3D12SyncPointType::GPUOnly);
		// Take an extra reference so the sync point stays alive until the wait side releases it.
		NewSyncPoint->AddRef();
		PhysicalContexts[GPUIndex]->SignalSyncPoint(NewSyncPoint);

		FTransferResourceFenceData* FenceData = FenceDatas[FenceSlot++];
		FenceData->Mask = FRHIGPUMask::FromIndex(GPUIndex);
		FenceData->SyncPoints[GPUIndex] = NewSyncPoint;
	}
}
void FD3D12CommandContextRedirector::RHITransferResourceWait(TConstArrayView<FTransferResourceFenceData*> FenceDatas)
{
	// Makes each waiting GPU block on the sync points recorded by a prior
	// RHITransferResourceSignal() (or a delayed-fence transfer), then releases
	// the sync point references and frees the fence structs (this call owns them).
	//
	// NOTE: the previous version accumulated all fence masks into a local
	// `AllMasks` that was never read afterwards; that dead code has been removed.
	for (FTransferResourceFenceData* FenceData : FenceDatas)
	{
		// Every GPU named in the fence's mask waits on every recorded sync point.
		for (uint32 WaitGPUIndex : FenceData->Mask)
		{
			for (void* SyncPointPtr : FenceData->SyncPoints)
			{
				if (SyncPointPtr)
				{
					FD3D12SyncPoint* SyncPoint = static_cast<FD3D12SyncPoint*>(SyncPointPtr);
					PhysicalContexts[WaitGPUIndex]->WaitSyncPoint(SyncPoint);
				}
			}
		}

		// Drop the references taken when the sync points were stored at signal time.
		for (void* SyncPointPtr : FenceData->SyncPoints)
		{
			if (SyncPointPtr)
			{
				static_cast<FD3D12SyncPoint*>(SyncPointPtr)->Release();
			}
		}
		delete FenceData;
	}
}
void FD3D12CommandContextRedirector::RHICrossGPUTransfer(TConstArrayView<FTransferResourceParams> Params, TConstArrayView<FCrossGPUTransferFence*> PreTransfer, TConstArrayView<FCrossGPUTransferFence*> PostTransfer)
{
	// Records the copy commands for a batch of cross-GPU transfers, bracketed by
	// the caller-provided fences: pre-transfer fences are consumed (waited on and
	// freed) here, and post-transfer fences receive a sync point that
	// RHICrossGPUTransferWait() will consume later.
	if (Params.Num() == 0)
	{
		return;
	}

	// Consume the pre-transfer fences first: the copies must not start until the
	// signalling GPUs have reached their signal points.
	for (FCrossGPUTransferFence* PreFence : PreTransfer)
	{
		FD3D12SyncPoint* PreSyncPoint = static_cast<FD3D12SyncPoint*>(PreFence->SyncPoint);
		PhysicalContexts[PreFence->WaitGPUIndex]->WaitSyncPoint(PreSyncPoint);
		PreSyncPoint->Release();
		delete PreFence;
	}

	// Record the copy work. All copies are pushed on the source GPU's context.
	for (const FTransferResourceParams& Transfer : Params)
	{
		FD3D12CommandContext* Context = PhysicalContexts[Transfer.SrcGPUIndex];

		if (!Transfer.Texture)
		{
			// Buffer path: whole-resource copy between the per-GPU buffer instances.
			FD3D12Resource* SrcBuffer = FD3D12DynamicRHI::ResourceCast(Transfer.Buffer.GetReference(), Transfer.SrcGPUIndex)->GetResource();
			FD3D12Resource* DstBuffer = FD3D12DynamicRHI::ResourceCast(Transfer.Buffer.GetReference(), Transfer.DestGPUIndex)->GetResource();
			Context->GraphicsCommandList()->CopyResource(DstBuffer->GetResource(), SrcBuffer->GetResource());
			continue;
		}

		FD3D12Texture* SrcTexture = FD3D12CommandContext::RetrieveTexture(Transfer.Texture, Transfer.SrcGPUIndex);
		FD3D12Texture* DstTexture = FD3D12CommandContext::RetrieveTexture(Transfer.Texture, Transfer.DestGPUIndex);
		if (Transfer.Max.Z == 0)
		{
			// A zero-sized rect (Max.Z == 0, set in the constructor) means "copy the whole resource".
			Context->GraphicsCommandList()->CopyResource(DstTexture->GetResource()->GetResource(), SrcTexture->GetResource()->GetResource());
		}
		else
		{
			// Sub-rect copies are only supported for 2D textures.
			check(Transfer.Texture->GetTexture2D() != nullptr);
			ensureMsgf(
				Transfer.Min.X >= 0 && Transfer.Min.Y >= 0 && Transfer.Min.Z >= 0 &&
				Transfer.Max.X >= 0 && Transfer.Max.Y >= 0 && Transfer.Max.Z >= 0,
				TEXT("Invalid rect for texture transfer: %i, %i, %i, %i, %i, %i"), Transfer.Min.X, Transfer.Min.Y, Transfer.Min.Z, Transfer.Max.X, Transfer.Max.Y, Transfer.Max.Z);
			D3D12_BOX CopyBox = { (UINT)Transfer.Min.X, (UINT)Transfer.Min.Y, (UINT)Transfer.Min.Z, (UINT)Transfer.Max.X, (UINT)Transfer.Max.Y, (UINT)Transfer.Max.Z };
			CD3DX12_TEXTURE_COPY_LOCATION CopySrc(SrcTexture->GetResource()->GetResource(), 0);
			CD3DX12_TEXTURE_COPY_LOCATION CopyDst(DstTexture->GetResource()->GetResource(), 0);
			Context->CopyTextureRegionChecked(&CopyDst, CopyBox.left, CopyBox.top, CopyBox.front, DstTexture->GetFormat(), &CopySrc, &CopyBox, SrcTexture->GetFormat(), DstTexture->GetName());
		}
	}

	// Post-copy synchronization: signal once and hand one reference per
	// post-transfer fence. These are awaited in RHICrossGPUTransferWait().
	// NOTE(review): the signal is recorded only on Params[0].SrcGPUIndex — this
	// assumes every transfer in the batch shares the same source GPU; verify
	// against callers if batches with mixed source GPUs are possible.
	FD3D12SyncPointRef PostSyncPoint = FD3D12SyncPoint::Create(ED3D12SyncPointType::GPUOnly);
	PhysicalContexts[Params[0].SrcGPUIndex]->SignalSyncPoint(PostSyncPoint);
	for (FCrossGPUTransferFence* PostFence : PostTransfer)
	{
		// Each stored pointer carries its own reference; released by the wait side.
		PostSyncPoint->AddRef();
		PostFence->SyncPoint = PostSyncPoint.GetReference();
	}
}
void FD3D12CommandContextRedirector::RHICrossGPUTransferSignal(TConstArrayView<FTransferResourceParams> Params, TConstArrayView<FCrossGPUTransferFence*> PreTransfer)
{
	// For each pre-transfer fence, creates a GPU-only sync point, signals it on
	// the fence's source GPU, and stores it for the matching wait performed in
	// RHICrossGPUTransfer(). (Params is part of the RHI interface but unused here.)
	for (FCrossGPUTransferFence* Fence : PreTransfer)
	{
		FD3D12SyncPointRef NewSyncPoint = FD3D12SyncPoint::Create(ED3D12SyncPointType::GPUOnly);
		// Extra reference keeps the sync point alive until the wait side releases it.
		NewSyncPoint->AddRef();
		PhysicalContexts[Fence->SignalGPUIndex]->SignalSyncPoint(NewSyncPoint);
		Fence->SyncPoint = NewSyncPoint;
	}
}
void FD3D12CommandContextRedirector::RHICrossGPUTransferWait(TConstArrayView<FCrossGPUTransferFence*> PostTransfer)
{
	// Consumes the post-transfer fences: each destination GPU waits on the
	// recorded sync point, the reference taken at signal time is dropped, and
	// the fence structs are freed (this call owns them).
	for (FCrossGPUTransferFence* Fence : PostTransfer)
	{
		if (void* RawSyncPoint = Fence->SyncPoint)
		{
			FD3D12SyncPoint* TypedSyncPoint = static_cast<FD3D12SyncPoint*>(RawSyncPoint);
			PhysicalContexts[Fence->WaitGPUIndex]->WaitSyncPoint(TypedSyncPoint);
			TypedSyncPoint->Release();
		}
		delete Fence;
	}
}
#endif // WITH_MGPU