// Copyright Epic Games, Inc. All Rights Reserved.

#include "D3D12RayTracing.h"

#if D3D12_RHI_RAYTRACING

#include "D3D12Resources.h"
#include "D3D12Util.h"
#include "Containers/DynamicRHIResourceArray.h"
#include "Experimental/Containers/SherwoodHashTable.h"
#include "BuiltInRayTracingShaders.h"
#include "RayTracingValidationShaders.h"
#include "Hash/xxhash.h"
#include "HAL/CriticalSection.h"
#include "HAL/IConsoleManager.h"
#include "HAL/FileManagerGeneric.h"
#include "Misc/ScopeLock.h"
#include "Async/ParallelFor.h"
#include "Misc/BufferedOutputDevice.h"
#include "String/LexFromString.h"
#include "GlobalRenderResources.h"
#include "D3D12RayTracingDebug.h"
#include "D3D12ExplicitDescriptorCache.h"
#include "D3D12ResourceCollection.h"
#include "RHIShaderBindingLayout.h"
#include "RHIUniformBufferUtilities.h"
#include "RHIResourceUtils.h"
#include "D3D12TextureReference.h"

// Defined in another translation unit.
extern int32 GD3D12ExplicitViewDescriptorHeapSize;
extern int32 GD3D12ExplicitViewDescriptorHeapOverflowReported;

// ---------------------------------------------------------------------------
// Console variables. Each int32 below is bound by reference to the console
// variable registered right after it.
// ---------------------------------------------------------------------------

static int32 GRayTracingDebugForceBuildMode = 0;
static FAutoConsoleVariableRef CVarRayTracingDebugForceFastTrace(
	TEXT("r.D3D12.RayTracing.DebugForceBuildMode"),
	GRayTracingDebugForceBuildMode,
	TEXT("Forces specific acceleration structure build mode (not runtime-tweakable).\n")
	TEXT("0: Use build mode requested by high-level code (Default)\n")
	TEXT("1: Force fast build mode\n")
	TEXT("2: Force fast trace mode\n"),
	ECVF_ReadOnly
);

static int32 GRayTracingCacheShaderRecords = 1;
static FAutoConsoleVariableRef CVarRayTracingShaderRecordCache(
	TEXT("r.D3D12.RayTracing.CacheShaderRecords"),
	GRayTracingCacheShaderRecords,
	TEXT("Automatically cache and re-use SBT hit group records. This significantly improves CPU performance in large scenes with many identical mesh instances. (default = 1)\n")
	TEXT("This mode assumes that contents of uniform buffers does not change during ray tracing resource binding.")
);

static int32 GD3D12RayTracingAllowCompaction = 1;
static FAutoConsoleVariableRef CVarD3D12RayTracingAllowCompaction(
	TEXT("r.D3D12.RayTracing.AllowCompaction"),
	GD3D12RayTracingAllowCompaction,
	TEXT("Whether to automatically perform compaction for static acceleration structures to save GPU memory. (default = 1)\n"),
	ECVF_ReadOnly
);

static int32 GD3D12RayTracingMaxBatchedCompaction = 64;
static FAutoConsoleVariableRef CVarD3D12RayTracingMaxBatchedCompaction(
	TEXT("r.D3D12.RayTracing.MaxBatchedCompaction"),
	GD3D12RayTracingMaxBatchedCompaction,
	TEXT("Maximum of amount of compaction requests and rebuilds per frame. (default = 64)\n"),
	ECVF_ReadOnly
);

static int32 GD3D12RayTracingCompactionMinPrimitiveCount = 128;
static FAutoConsoleVariableRef CVarD3D12RayTracingCompactionMinPrimitiveCount(
	TEXT("r.D3D12.RayTracing.Compaction.MinPrimitiveCount"),
	GD3D12RayTracingCompactionMinPrimitiveCount,
	TEXT("Sets the minimum primitive count threshold below which geometry skips the compaction. (default = 128)\n")
);

static int32 GRayTracingSpecializeStateObjects = 0;
static FAutoConsoleVariableRef CVarRayTracingSpecializeStateObjects(
	TEXT("r.D3D12.RayTracing.SpecializeStateObjects"),
	GRayTracingSpecializeStateObjects,
	TEXT("Whether to create specialized unique ray tracing pipeline state objects for each ray generation shader. (default = 0)\n")
	TEXT("This option can produce more more efficient PSOs for the GPU at the cost of longer creation times and more memory. Requires DXR 1.1.\n"),
	ECVF_ReadOnly
);

static int32 GRayTracingAllowSpecializedStateObjects = 1;
static FAutoConsoleVariableRef CVarRayTracingAllowSpecializedStateObjects(
	TEXT("r.D3D12.RayTracing.AllowSpecializedStateObjects"),
	GRayTracingAllowSpecializedStateObjects,
	TEXT("Whether to use specialized RTPSOs if they have been created. ")
	TEXT("This is intended for performance testingand has no effect if r.D3D12.RayTracing.SpecializeStateObjects is 0. (default = 1)\n")
);

static int32 GD3D12RayTracingGPUValidation = 0;
static FAutoConsoleVariableRef CVarD3D12RayTracingGPUValidation(
	TEXT("r.D3D12.RayTracing.GPUValidation"),
	GD3D12RayTracingGPUValidation,
	TEXT("Whether to perform validation of ray tracing geometry and other structures on the GPU. Requires Shader Model 6. (default = 0)")
);

// This is required to avoid redundant code static analysis warnings.
// If the static_assert fires the assumptions it is predicated on have been
// violated and the code should be revisited.
// With WITH_MGPU the macro expands to a loop over GPUIndex; otherwise it expands
// to a single scope in which GPUIndex is the compile-time constant 0.
#if WITH_MGPU
	#define FOREACH_GPU(Condition, Function) for (uint32 GPUIndex = 0; Condition; ++GPUIndex) Function
#else
	static_assert(MAX_NUM_GPUS == 1 && GNumExplicitGPUsForRendering == 1);
	#define FOREACH_GPU(Condition, Function) { constexpr uint32 GPUIndex = 0; Function }
#endif

// Ray tracing stat counters

DECLARE_STATS_GROUP(TEXT("D3D12RHI: Ray Tracing"), STATGROUP_D3D12RayTracing, STATCAT_Advanced);

DECLARE_DWORD_ACCUMULATOR_STAT(TEXT("Created pipelines (total)"), STAT_D3D12RayTracingCreatedPipelines, STATGROUP_D3D12RayTracing);
DECLARE_DWORD_ACCUMULATOR_STAT(TEXT("Compiled shaders (total)"), STAT_D3D12RayTracingCompiledShaders, STATGROUP_D3D12RayTracing);

DECLARE_DWORD_ACCUMULATOR_STAT(TEXT("Allocated bottom level acceleration structures"), STAT_D3D12RayTracingAllocatedBLAS, STATGROUP_D3D12RayTracing);
DECLARE_DWORD_ACCUMULATOR_STAT(TEXT("Allocated top level acceleration structures"), STAT_D3D12RayTracingAllocatedTLAS, STATGROUP_D3D12RayTracing);
DECLARE_DWORD_ACCUMULATOR_STAT(TEXT("Triangles in all BL acceleration structures"), STAT_D3D12RayTracingTrianglesBLAS, STATGROUP_D3D12RayTracing);

DECLARE_DWORD_COUNTER_STAT(TEXT("Built BL AS (per frame)"), STAT_D3D12RayTracingBuiltBLAS, STATGROUP_D3D12RayTracing);
DECLARE_DWORD_COUNTER_STAT(TEXT("Updated BL AS (per frame)"), STAT_D3D12RayTracingUpdatedBLAS, STATGROUP_D3D12RayTracing);
DECLARE_DWORD_COUNTER_STAT(TEXT("Built TL AS (per frame)"), STAT_D3D12RayTracingBuiltTLAS, STATGROUP_D3D12RayTracing);
DECLARE_DWORD_COUNTER_STAT(TEXT("Updated TL AS (per frame)"), STAT_D3D12RayTracingUpdatedTLAS, STATGROUP_D3D12RayTracing);
DECLARE_DWORD_COUNTER_STAT(TEXT("UniformBuffer Record Updates (per frame)"), STAT_D3D12RayTracingUBRecordUpdate, STATGROUP_D3D12RayTracing);

DECLARE_MEMORY_STAT(TEXT("Total BL AS Memory"), STAT_D3D12RayTracingBLASMemory, STATGROUP_D3D12RayTracing);
DECLARE_MEMORY_STAT(TEXT("Static BL AS Memory"), STAT_D3D12RayTracingStaticBLASMemory, STATGROUP_D3D12RayTracing);
DECLARE_MEMORY_STAT(TEXT("Dynamic BL AS Memory"), STAT_D3D12RayTracingDynamicBLASMemory, STATGROUP_D3D12RayTracing);
DECLARE_MEMORY_STAT(TEXT("TL AS Memory"), STAT_D3D12RayTracingTLASMemory, STATGROUP_D3D12RayTracing);
DECLARE_MEMORY_STAT(TEXT("Total Used Video Memory"), STAT_D3D12RayTracingUsedVideoMemory, STATGROUP_D3D12RayTracing);

DECLARE_CYCLE_STAT(TEXT("RTPSO Compile Shader"), STAT_RTPSO_CompileShader, STATGROUP_D3D12RayTracing);
DECLARE_CYCLE_STAT(TEXT("RTPSO Create Pipeline"), STAT_RTPSO_CreatePipeline, STATGROUP_D3D12RayTracing);

DECLARE_DWORD_ACCUMULATOR_STAT(TEXT("Allocated shader binding tables"), STAT_D3D12RayTracingAllocatedSBT, STATGROUP_D3D12RayTracing);

DECLARE_CYCLE_STAT(TEXT("SetBindingsOnShaderBindingTable"), STAT_D3D12SetBindingsOnShaderBindingTable, STATGROUP_D3D12RayTracing);
DECLARE_CYCLE_STAT(TEXT("CreateShaderTable"), STAT_D3D12CreateShaderTable, STATGROUP_D3D12RayTracing);
DECLARE_CYCLE_STAT(TEXT("BuildTopLevel"), STAT_D3D12BuildTLAS, STATGROUP_D3D12RayTracing);
DECLARE_CYCLE_STAT(TEXT("BuildBottomLevel"), STAT_D3D12BuildBLAS, STATGROUP_D3D12RayTracing);
DECLARE_CYCLE_STAT(TEXT("DispatchRays"), STAT_D3D12DispatchRays, STATGROUP_D3D12RayTracing);

// Forward declaration; the definition is outside this part of the file.
static ERayTracingAccelerationStructureFlags GetRayTracingAccelerationStructureBuildFlags(const FRayTracingGeometryInitializer& Initializer);

// In shipping builds geometry tracking compiles down to empty stubs.
#if UE_BUILD_SHIPPING
inline
void RegisterD3D12RayTracingGeometry(FD3D12RayTracingGeometry* Geometry) {}; inline void UnregisterD3D12RayTracingGeometry(FD3D12RayTracingGeometry* Geometry) {}; #else struct FD3D12RayTracingGeometryTracker { TSet Geometries; uint64 TotalBLASSize = 0; uint64 MaxTotalBLASSize = 0; FCriticalSection CS; uint64 GetGeometrySize(FD3D12RayTracingGeometry& Geometry) { if (Geometry.AccelerationStructureCompactedSize != 0) { return Geometry.AccelerationStructureCompactedSize; } else { return Geometry.SizeInfo.ResultSize; } } void Add(FD3D12RayTracingGeometry* Geometry) { uint64 BLASSize = GetGeometrySize(*Geometry); FScopeLock Lock(&CS); Geometries.Add(Geometry); TotalBLASSize += BLASSize; MaxTotalBLASSize = FMath::Max(MaxTotalBLASSize, TotalBLASSize); } void Remove(FD3D12RayTracingGeometry* Geometry) { uint64 BLASSize = GetGeometrySize(*Geometry); FScopeLock Lock(&CS); Geometries.Remove(Geometry); TotalBLASSize -= BLASSize; } }; static FD3D12RayTracingGeometryTracker& GetD3D12RayTracingGeometryTracker() { static FD3D12RayTracingGeometryTracker Instance; return Instance; } enum class EDumpRayTracingGeometryMode { Top, All, }; static void DumpRayTracingGeometries(EDumpRayTracingGeometryMode Mode, int32 NumEntriesToShow, const FString& NameFilter, bool bCSV, FBufferedOutputDevice& BufferedOutput) { FD3D12RayTracingGeometryTracker& Tracker = GetD3D12RayTracingGeometryTracker(); FScopeLock Lock(&Tracker.CS); auto GetGeometrySize = [](FD3D12RayTracingGeometry& Geometry) { if (Geometry.AccelerationStructureCompactedSize != 0) { return Geometry.AccelerationStructureCompactedSize; } else { return Geometry.SizeInfo.ResultSize; } }; TArray Geometries = Tracker.Geometries.Array(); Geometries.Sort([GetGeometrySize](FD3D12RayTracingGeometry& A, FD3D12RayTracingGeometry& B) { return GetGeometrySize(A) > GetGeometrySize(B); }); FName CategoryName(TEXT("D3D12RayTracing")); uint64 TotalSizeBytes = 0; uint64 TopSizeBytes = 0; BufferedOutput.CategorizedLogf(CategoryName, ELogVerbosity::Log, 
TEXT("Tracked FD3D12RayTracingGeometry objects")); if (NumEntriesToShow < 0 || NumEntriesToShow > Geometries.Num()) { NumEntriesToShow = Geometries.Num(); } if (NumEntriesToShow != Geometries.Num()) { BufferedOutput.CategorizedLogf(CategoryName, ELogVerbosity::Log, TEXT("Showing %d out of %d"), NumEntriesToShow, Geometries.Num()); } auto ShouldShow = [&NameFilter](FD3D12RayTracingGeometry* Entry) { if (NameFilter.IsEmpty()) { return true; } FString DebugName = Entry->DebugName.ToString(); if (DebugName.Find(NameFilter, ESearchCase::IgnoreCase) != INDEX_NONE) { return true; } else { return false; } }; FArchive* CSVFile{ nullptr }; if (bCSV) { const FString Filename = FString::Printf(TEXT("%sd3d12DumpRayTracingGeometries-%s.csv"), *FPaths::ProfilingDir(), *FDateTime::Now().ToString()); CSVFile = IFileManager::Get().CreateFileWriter(*Filename, FILEWRITE_AllowRead); const TCHAR* Header = TEXT("Name,Size (MBs),Prims,Segments,Compaction,Update,MarkedForDelete\n"); CSVFile->Serialize(TCHAR_TO_ANSI(Header), FPlatformString::Strlen(Header)); } int32 ShownEntries = 0; for (int32 i=0; i< Geometries.Num(); ++i) { FD3D12RayTracingGeometry* Geometry = Geometries[i]; uint64 SizeBytes = GetGeometrySize(*Geometry); ERayTracingAccelerationStructureFlags GeometryBuildFlags = GetRayTracingAccelerationStructureBuildFlags(Geometry->Initializer); if (ShownEntries < NumEntriesToShow && ShouldShow(Geometry)) { if (bCSV) { const FString Row = FString::Printf(TEXT("%s,%.3f,%d,%d,%d,%d,%d\n"), !Geometry->DebugName.IsNone() ? 
*Geometry->DebugName.ToString() : TEXT("*UNKNOWN*"), SizeBytes / double(1 << 20), Geometry->Initializer.TotalPrimitiveCount, Geometry->Initializer.Segments.Num(), (int32)EnumHasAllFlags(GeometryBuildFlags, ERayTracingAccelerationStructureFlags::AllowCompaction), (int32)EnumHasAllFlags(GeometryBuildFlags, ERayTracingAccelerationStructureFlags::AllowUpdate), !Geometry->IsValid()); CSVFile->Serialize(TCHAR_TO_ANSI(*Row), Row.Len()); } else { BufferedOutput.CategorizedLogf(CategoryName, ELogVerbosity::Log, TEXT("Name: %s - Size: %.3f MB - Prims: %d - Segments: %d - Compaction: %d - Update: %d"), !Geometry->DebugName.IsNone() ? *Geometry->DebugName.ToString() : TEXT("*UNKNOWN*"), SizeBytes / double(1 << 20), Geometry->Initializer.TotalPrimitiveCount, Geometry->Initializer.Segments.Num(), (int32)EnumHasAllFlags(GeometryBuildFlags, ERayTracingAccelerationStructureFlags::AllowCompaction), (int32)EnumHasAllFlags(GeometryBuildFlags, ERayTracingAccelerationStructureFlags::AllowUpdate)); } TopSizeBytes += SizeBytes; ++ShownEntries; } TotalSizeBytes += SizeBytes; } if (bCSV) { delete CSVFile; CSVFile = nullptr; } else { double TotalSizeF = double(TotalSizeBytes) / double(1 << 20); double TopSizeF = double(TopSizeBytes) / double(1 << 20); if (ShownEntries != Geometries.Num() && ShownEntries) { BufferedOutput.CategorizedLogf(CategoryName, ELogVerbosity::Log, TEXT("Use command `D3D12.DumpRayTracingGeometries all/N [name]` to dump all or N objects. ") TEXT("Optionally add 'name' to filter entries, such as 'skm_'.")); BufferedOutput.CategorizedLogf(CategoryName, ELogVerbosity::Log, TEXT("Shown %d entries. 
Size: %.3f MB (%.2f%% of total)"), ShownEntries, TopSizeF, 100.0 * TopSizeF / TotalSizeF); } BufferedOutput.CategorizedLogf(CategoryName, ELogVerbosity::Log, TEXT("Total size: %.3f MB"), TotalSizeF); } } static FAutoConsoleCommandWithWorldArgsAndOutputDevice GD3D12DumpRayTracingGeometriesCmd( TEXT("D3D12.DumpRayTracingGeometries"), TEXT("Dump memory allocations for ray tracing resources."), FConsoleCommandWithWorldArgsAndOutputDeviceDelegate::CreateStatic([](const TArray& Args, UWorld*, FOutputDevice& OutputDevice) { // Default: show top 50 largest objects. EDumpRayTracingGeometryMode Mode = EDumpRayTracingGeometryMode::Top; int32 NumEntriesToShow = 50; bool bCSV = false; FString NameFilter; if (Args.Num()) { if (Args[0] == TEXT("all")) { Mode = EDumpRayTracingGeometryMode::All; NumEntriesToShow = -1; } else if (FCString::IsNumeric(*Args[0])) { Mode = EDumpRayTracingGeometryMode::Top; LexFromString(NumEntriesToShow, *Args[0]); } if (Args.Num() > 1) { NameFilter = Args[1]; } } FBufferedOutputDevice BufferedOutput; DumpRayTracingGeometries(Mode, NumEntriesToShow, NameFilter, bCSV, BufferedOutput); BufferedOutput.RedirectTo(OutputDevice); })); static FAutoConsoleCommandWithWorldArgsAndOutputDevice GD3D12DumpRayTracingGeometriesToCSVCmd( TEXT("D3D12.DumpRayTracingGeometriesToCSV"), TEXT("Dump all memory allocations for ray tracing resources to a CSV file on disc."), FConsoleCommandWithWorldArgsAndOutputDeviceDelegate::CreateStatic([](const TArray& Args, UWorld*, FOutputDevice& OutputDevice) { // CSV dumps all entries EDumpRayTracingGeometryMode Mode = EDumpRayTracingGeometryMode::All; int32 NumEntriesToShow = -1; bool bCSV = true; FString NameFilter; FBufferedOutputDevice BufferedOutput; DumpRayTracingGeometries(Mode, NumEntriesToShow, NameFilter, bCSV, BufferedOutput); BufferedOutput.RedirectTo(OutputDevice); })); inline void RegisterD3D12RayTracingGeometry(FD3D12RayTracingGeometry* Geometry) { GetD3D12RayTracingGeometryTracker().Add(Geometry); } inline void 
UnregisterD3D12RayTracingGeometry(FD3D12RayTracingGeometry* Geometry) { GetD3D12RayTracingGeometryTracker().Remove(Geometry); } #endif // UE_BUILD_SHIPPING const FD3D12ShaderIdentifier FD3D12ShaderIdentifier::Null = { 0, 0, 0, 0 }; static_assert(sizeof(FD3D12ShaderIdentifier) == D3D12_SHADER_IDENTIFIER_SIZE_IN_BYTES, "Unexpected shader identifier size"); static bool ShouldRunRayTracingGPUValidation() { // Wave ops are required to run ray tracing validation shaders const bool bSupportsWaveOps = GRHISupportsWaveOperations && RHISupportsWaveOperations(GMaxRHIShaderPlatform); return GD3D12RayTracingGPUValidation && bSupportsWaveOps; } static D3D12_RAYTRACING_ACCELERATION_STRUCTURE_BUILD_FLAGS TranslateRayTracingAccelerationStructureFlags(ERayTracingAccelerationStructureFlags Flags) { uint32 Result = {}; auto HandleFlag = [&Flags, &Result](ERayTracingAccelerationStructureFlags Engine, D3D12_RAYTRACING_ACCELERATION_STRUCTURE_BUILD_FLAGS Native) { if (EnumHasAllFlags(Flags, Engine)) { Result |= (uint32)Native; EnumRemoveFlags(Flags, Engine); } }; HandleFlag(ERayTracingAccelerationStructureFlags::AllowUpdate, D3D12_RAYTRACING_ACCELERATION_STRUCTURE_BUILD_FLAG_ALLOW_UPDATE); HandleFlag(ERayTracingAccelerationStructureFlags::AllowCompaction, D3D12_RAYTRACING_ACCELERATION_STRUCTURE_BUILD_FLAG_ALLOW_COMPACTION); HandleFlag(ERayTracingAccelerationStructureFlags::FastTrace, D3D12_RAYTRACING_ACCELERATION_STRUCTURE_BUILD_FLAG_PREFER_FAST_TRACE); HandleFlag(ERayTracingAccelerationStructureFlags::FastBuild, D3D12_RAYTRACING_ACCELERATION_STRUCTURE_BUILD_FLAG_PREFER_FAST_BUILD); HandleFlag(ERayTracingAccelerationStructureFlags::MinimizeMemory, D3D12_RAYTRACING_ACCELERATION_STRUCTURE_BUILD_FLAG_MINIMIZE_MEMORY); checkf(!EnumHasAnyFlags(Flags, Flags), TEXT("Some ERayTracingAccelerationStructureFlags entries were not handled")); return D3D12_RAYTRACING_ACCELERATION_STRUCTURE_BUILD_FLAGS(Result); } static D3D12_RAYTRACING_GEOMETRY_TYPE 
TranslateRayTracingGeometryType(ERayTracingGeometryType GeometryType)
{
	// Maps the engine geometry type to the D3D12 geometry type.
	// Any other value asserts and yields D3D12_RAYTRACING_GEOMETRY_TYPE(0).
	switch (GeometryType)
	{
	case RTGT_Triangles:
		return D3D12_RAYTRACING_GEOMETRY_TYPE_TRIANGLES;
		break;
	case RTGT_Procedural:
		return D3D12_RAYTRACING_GEOMETRY_TYPE_PROCEDURAL_PRIMITIVE_AABBS;
		break;
	default:
		checkf(false, TEXT("Unexpected ray tracing geometry type"));
		return D3D12_RAYTRACING_GEOMETRY_TYPE(0);
	}
}

// Wraps a D3D12_DXIL_LIBRARY_DESC together with the storage its export table points at.
struct FDXILLibrary
{
	// No copy assignment or move because FDXILLibrary points to internal struct memory
	UE_NONCOPYABLE(FDXILLibrary)

	FDXILLibrary() = default;

	// Fills Desc from raw bytecode and NumEntryNames (entry name, export name) pairs.
	// The names are copied into EntryNames / ExportNames; each ExportDesc entry renames
	// entry i to export i and points at those copies. Bytecode itself is only referenced.
	void InitFromDXIL(const void* Bytecode, uint32 BytecodeLength, const LPCWSTR* InEntryNames, const LPCWSTR* InExportNames, uint32 NumEntryNames)
	{
		check(NumEntryNames != 0);
		check(InEntryNames);
		check(InExportNames);

		EntryNames.SetNum(NumEntryNames);
		ExportNames.SetNum(NumEntryNames);
		ExportDesc.SetNum(NumEntryNames);

		for (uint32 EntryIndex = 0; EntryIndex < NumEntryNames; ++EntryIndex)
		{
			EntryNames[EntryIndex] = InEntryNames[EntryIndex];
			ExportNames[EntryIndex] = InExportNames[EntryIndex];

			ExportDesc[EntryIndex].ExportToRename = *(EntryNames[EntryIndex]);
			ExportDesc[EntryIndex].Flags = D3D12_EXPORT_FLAG_NONE;
			ExportDesc[EntryIndex].Name = *(ExportNames[EntryIndex]);
		}

		Desc.DXILLibrary.pShaderBytecode = Bytecode;
		Desc.DXILLibrary.BytecodeLength = BytecodeLength;
		Desc.NumExports = ExportDesc.Num();
		Desc.pExports = ExportDesc.GetData();
	}

	// Convenience overload taking a D3D12_SHADER_BYTECODE; forwards to the overload above.
	void InitFromDXIL(const D3D12_SHADER_BYTECODE& ShaderBytecode, LPCWSTR* InEntryNames, LPCWSTR* InExportNames, uint32 NumEntryNames)
	{
		InitFromDXIL(ShaderBytecode.pShaderBytecode, ShaderBytecode.BytecodeLength, InEntryNames, InExportNames, NumEntryNames);
	}

	// Returns a DXIL_LIBRARY sub-object whose pDesc points at this object's Desc member.
	D3D12_STATE_SUBOBJECT GetSubobject() const
	{
		D3D12_STATE_SUBOBJECT Subobject = {};
		Subobject.Type = D3D12_STATE_SUBOBJECT_TYPE_DXIL_LIBRARY;
		Subobject.pDesc = &Desc;
		return Subobject;
	}

	// NOTE: typical DXIL library may contain up to 3 entry points (i.e. hit groups with closest hit, any hit and intersection shaders)
	// Typical case is 1 (RGS, MS or CHS only) or 2 (CHS + AHS for shaders with alpha masking)
	static constexpr uint32 ExpectedEntryPoints = 3;

	// NOTE(review): the template argument lists of the three arrays below are missing in this
	// copy of the file (`TArray>` is not valid C++). Restore them from the upstream source
	// before compiling; InitFromDXIL() above assigns LPCWSTR values to EntryNames/ExportNames
	// and dereferences them with operator*.
	TArray> ExportDesc;
	TArray> EntryNames;
	TArray> ExportNames;

	D3D12_DXIL_LIBRARY_DESC Desc = {};
};

// Builds a D3D12 state object (full RTPSO or collection, per StateObjectType) from the given
// libraries, exports, hit groups, root signatures and existing collections.
// On failure a warning is logged; only device removed/hung results are passed to
// VERIFYD3D12RESULT_EX. Result is returned as-is in either case.
// NOTE(review): template arguments of the return type, the TArrayView parameters and the local
// TRefCountPtr/TArray declarations are missing in this copy of the file; restore from upstream.
static TRefCountPtr CreateRayTracingStateObject(
	ID3D12Device5* RayTracingDevice,
	const TArrayView& ShaderLibraries,
	const TArrayView& Exports,
	uint32 MaxAttributeSizeInBytes,
	uint32 MaxPayloadSizeInBytes,
	const TArrayView& HitGroups,
	const ID3D12RootSignature* GlobalRootSignature,
	const TArrayView& LocalRootSignatures,
	const TArrayView& LocalRootSignatureAssociations, // indices into LocalRootSignatures, one per export (may be empty, which assumes single root signature used for everything)
	const TArrayView& ExistingCollections,
	D3D12_STATE_OBJECT_TYPE StateObjectType // Full RTPSO or a Collection
)
{
	checkf((LocalRootSignatureAssociations.Num() == 0 && LocalRootSignatures.Num() == 1)
		|| (LocalRootSignatureAssociations.Num() == Exports.Num()),
		TEXT("There must be exactly one local root signature association per export."));

	TRefCountPtr Result;

	// There are several pipeline sub-objects that are always required:
	// 1) D3D12_RAYTRACING_SHADER_CONFIG
	// 2) D3D12_SUBOBJECT_TO_EXPORTS_ASSOCIATION
	// 3) D3D12_RAYTRACING_PIPELINE_CONFIG
	// 4) D3D12_STATE_OBJECT_CONFIG
	// 5) Global root signature
	static constexpr uint32 NumRequiredSubobjects = 5;

	// The array is sized once up front and never resized afterwards: several sub-objects
	// below store addresses of other elements of this array.
	TArray Subobjects;
	Subobjects.SetNumUninitialized(NumRequiredSubobjects
		+ ShaderLibraries.Num()
		+ HitGroups.Num()
		+ LocalRootSignatures.Num()
		+ Exports.Num()
		+ ExistingCollections.Num()
	);

	TArray ExportAssociations;
	ExportAssociations.SetNumUninitialized(Exports.Num());

	// Next free slot in Subobjects.
	uint32 Index = 0;

	const uint32 NumExports = Exports.Num();

	// Shader libraries

	for (const FDXILLibrary* Library : ShaderLibraries)
	{
		Subobjects[Index++] = Library->GetSubobject();
	}

	// Shader config

	D3D12_RAYTRACING_SHADER_CONFIG ShaderConfig = {};
	ShaderConfig.MaxAttributeSizeInBytes = MaxAttributeSizeInBytes;
	check(ShaderConfig.MaxAttributeSizeInBytes <= RAY_TRACING_MAX_ALLOWED_ATTRIBUTE_SIZE);
	ShaderConfig.MaxPayloadSizeInBytes = MaxPayloadSizeInBytes;

	const uint32 ShaderConfigIndex = Index;
	Subobjects[Index++] = D3D12_STATE_SUBOBJECT{ D3D12_STATE_SUBOBJECT_TYPE_RAYTRACING_SHADER_CONFIG, &ShaderConfig};

	// Shader config association (applies the shader config to every export)

	D3D12_SUBOBJECT_TO_EXPORTS_ASSOCIATION ShaderConfigAssociation = {};
	ShaderConfigAssociation.NumExports = Exports.Num();
	ShaderConfigAssociation.pExports = Exports.GetData();
	ShaderConfigAssociation.pSubobjectToAssociate = &Subobjects[ShaderConfigIndex];
	Subobjects[Index++] = D3D12_STATE_SUBOBJECT{ D3D12_STATE_SUBOBJECT_TYPE_SUBOBJECT_TO_EXPORTS_ASSOCIATION, &ShaderConfigAssociation };

	// Hit groups

	for (const D3D12_HIT_GROUP_DESC& HitGroupDesc : HitGroups)
	{
		Subobjects[Index++] = D3D12_STATE_SUBOBJECT{ D3D12_STATE_SUBOBJECT_TYPE_HIT_GROUP, &HitGroupDesc };
	}

	// Pipeline config

	D3D12_RAYTRACING_PIPELINE_CONFIG PipelineConfig = {};
	PipelineConfig.MaxTraceRecursionDepth = RAY_TRACING_MAX_ALLOWED_RECURSION_DEPTH;
	// Not read again in this function.
	const uint32 PipelineConfigIndex = Index;
	Subobjects[Index++] = D3D12_STATE_SUBOBJECT{ D3D12_STATE_SUBOBJECT_TYPE_RAYTRACING_PIPELINE_CONFIG, &PipelineConfig };

	// State object config

	D3D12_STATE_OBJECT_CONFIG StateObjectConfig = {};
	if (GRHISupportsRayTracingPSOAdditions)
	{
		StateObjectConfig.Flags = D3D12_STATE_OBJECT_FLAG_ALLOW_STATE_OBJECT_ADDITIONS;
	}
	Subobjects[Index++] = D3D12_STATE_SUBOBJECT{ D3D12_STATE_SUBOBJECT_TYPE_STATE_OBJECT_CONFIG, &StateObjectConfig };

	// Global root signature
	// pDesc is the address of the pointer parameter itself, not of the root signature object.

	Subobjects[Index++] = D3D12_STATE_SUBOBJECT{ D3D12_STATE_SUBOBJECT_TYPE_GLOBAL_ROOT_SIGNATURE, &GlobalRootSignature };

	// Local root signatures

	const uint32 LocalRootSignatureBaseIndex = Index;
	for (int32 SignatureIndex = 0; SignatureIndex < LocalRootSignatures.Num(); ++SignatureIndex)
	{
		checkf(LocalRootSignatures[SignatureIndex], TEXT("All local root signatures must be valid"));
		Subobjects[Index++] = D3D12_STATE_SUBOBJECT{ D3D12_STATE_SUBOBJECT_TYPE_LOCAL_ROOT_SIGNATURE, &LocalRootSignatures[SignatureIndex] };
	}

	// Local root signature associations

	for (int32 ExportIndex = 0; ExportIndex < Exports.Num(); ++ExportIndex)
	{
		// If custom LocalRootSignatureAssociations data is not provided, then assume same default local RS association.
		const int32 LocalRootSignatureIndex = LocalRootSignatureAssociations.Num() != 0
			? LocalRootSignatureAssociations[ExportIndex]
			: 0;

		D3D12_SUBOBJECT_TO_EXPORTS_ASSOCIATION& Association = ExportAssociations[ExportIndex];
		// ExportAssociations was sized uninitialized, so value-initialize the element first.
		Association = D3D12_SUBOBJECT_TO_EXPORTS_ASSOCIATION{};
		Association.NumExports = 1;
		Association.pExports = &Exports[ExportIndex];

		check(LocalRootSignatureIndex < LocalRootSignatures.Num());
		Association.pSubobjectToAssociate = &Subobjects[LocalRootSignatureBaseIndex + LocalRootSignatureIndex];

		Subobjects[Index++] = D3D12_STATE_SUBOBJECT{ D3D12_STATE_SUBOBJECT_TYPE_SUBOBJECT_TO_EXPORTS_ASSOCIATION, &ExportAssociations[ExportIndex] };
	}

	// Existing collection objects

	for (int32 CollectionIndex = 0; CollectionIndex < ExistingCollections.Num(); ++CollectionIndex)
	{
		Subobjects[Index++] = D3D12_STATE_SUBOBJECT{ D3D12_STATE_SUBOBJECT_TYPE_EXISTING_COLLECTION, &ExistingCollections[CollectionIndex] };
	}

	// Done!

	checkf(Index == Subobjects.Num(), TEXT("All pipeline subobjects must be initialized."));

	// Create ray tracing pipeline state object

	D3D12_STATE_OBJECT_DESC Desc = {};
	Desc.NumSubobjects = Index;
	Desc.pSubobjects = &Subobjects[0];
	Desc.Type = StateObjectType;

#if WITH_NVAPI
	// Despite its name, this is true when the extension slot was set successfully below,
	// i.e. when it has to be cleared again after state object creation.
	bool bReorderingDisabled = false;
	if (IsRHIDeviceNVIDIA())
	{
		// Enable the NVAPI extension to allow shader extensions in ray tracing for shader execution reordering
		NvAPI_Status NvapiStatus = NvAPI_D3D12_SetNvShaderExtnSlotSpaceLocalThread(RayTracingDevice, UE_HLSL_SLOT_NV_SHADER_EXTN, UE_HLSL_SPACE_NV_SHADER_EXTN);
		bReorderingDisabled = ensureMsgf(NvapiStatus == NVAPI_OK, TEXT("NVAPI ERROR %d on setup extensions\n"), NvapiStatus);
	}
#endif

	HRESULT hr = RayTracingDevice->CreateStateObject(&Desc, IID_PPV_ARGS(Result.GetInitReference()));
	if (FAILED(hr))
	{
		// Failed state objects are not fatal if it's coming from partial/non-required RTPSO
		// Will be fatal when it's actually needed for rendering
		UE_LOG(LogD3D12RHI, Warning, TEXT("Failed to create raytracing RTPSO - add '-d3ddebug' to the command line to get more info on RTPSO compilation errors from the debug device"));

		// Catch GPU crashes explicitly.
		if (hr == DXGI_ERROR_DEVICE_REMOVED || hr == DXGI_ERROR_DEVICE_HUNG)
		{
			VERIFYD3D12RESULT_EX(hr, RayTracingDevice);
		}
	}

#if WITH_NVAPI
	if (bReorderingDisabled)
	{
		// Disable the NVAPI extension slot again after state object creation.
		NvAPI_Status NvapiStatus = NvAPI_D3D12_SetNvShaderExtnSlotSpaceLocalThread(RayTracingDevice, ~0u, 0);
		checkf(NvapiStatus == NVAPI_OK, TEXT("NVAPI ERROR %d on clear extensions\n"), NvapiStatus);
	}
#endif

	// Stats are updated regardless of whether creation succeeded.
	INC_DWORD_STAT(STAT_D3D12RayTracingCreatedPipelines);
	INC_DWORD_STAT_BY(STAT_D3D12RayTracingCompiledShaders, NumExports);

	return Result;
}

// Returns the first sizeof(uint64) bytes of the shader's hash as an integer.
inline uint64 GetShaderHash64(FRHIRayTracingShader* ShaderRHI)
{
	uint64 ShaderHash;
	// 64 bits from the shader SHA1
	FMemory::Memcpy(&ShaderHash, ShaderRHI->GetHash().Hash, sizeof(ShaderHash));
	return ShaderHash;
}

// Generates a stable symbol name for a ray tracing shader, used for RT PSO creation.
// Format: "<Prefix>_<16 lowercase hex digits of Hash>".
inline FString GenerateShaderName(const TCHAR* Prefix, uint64 Hash)
{
	return FString::Printf(TEXT("%s_%016llx"), Prefix, Hash);
}

// Overload that uses the shader's entry point as prefix and GetShaderHash64() as hash.
inline FString GenerateShaderName(FRHIRayTracingShader* ShaderRHI)
{
	const FD3D12RayTracingShader* Shader = FD3D12DynamicRHI::ResourceCast(ShaderRHI);
	uint64 ShaderHash = GetShaderHash64(ShaderRHI);
	return GenerateShaderName(*(Shader->EntryPoint), ShaderHash);
}

// Looks up the shader identifier of ExportName; asserts if the export is not found.
static FD3D12ShaderIdentifier GetShaderIdentifier(ID3D12StateObjectProperties* PipelineProperties, const TCHAR* ExportName)
{
	const void* ShaderIdData = PipelineProperties->GetShaderIdentifier(ExportName);
	checkf(ShaderIdData, TEXT("Couldn't find requested export in the ray tracing shader pipeline"));

	FD3D12ShaderIdentifier Result;
	Result.SetData(ShaderIdData);

	return Result;
}

// Overload that first queries the properties interface from the state object.
// NOTE(review): the TRefCountPtr template argument is missing in this copy of the file.
static FD3D12ShaderIdentifier GetShaderIdentifier(ID3D12StateObject* StateObject, const TCHAR* ExportName)
{
	TRefCountPtr PipelineProperties;
	HRESULT QueryInterfaceResult = StateObject->QueryInterface(IID_PPV_ARGS(PipelineProperties.GetInitReference()));
	checkf(SUCCEEDED(QueryInterfaceResult), TEXT("Failed to query pipeline properties from the ray tracing pipeline state object. Result=%08x"), QueryInterfaceResult);

	return GetShaderIdentifier(PipelineProperties, ExportName);
}

// Creates the UAV buffer that receives post-build info and the staging buffer used to read it back.
// The buffer holds one uint64 per request, up to GD3D12RayTracingMaxBatchedCompaction requests.
FD3D12RayTracingCompactionRequestHandler::FD3D12RayTracingCompactionRequestHandler(FD3D12Device* Device)
	: FD3D12DeviceChild(Device)
{
	const size_t BufferSize = GD3D12RayTracingMaxBatchedCompaction * sizeof(uint64);
	const D3D12_RESOURCE_DESC ResourceDesc = CD3DX12_RESOURCE_DESC::Buffer(BufferSize, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS);

	const FRHIBufferCreateDesc CreateDesc =
		FRHIBufferCreateDesc::Create(TEXT("PostBuildInfoBuffer"), BufferSize, 0, BUF_UnorderedAccess | BUF_SourceCopy)
		.SetInitialState(ERHIAccess::UAVMask)
		.SetGPUMask(FRHIGPUMask::FromIndex(GetParentDevice()->GetGPUIndex()));

	PostBuildInfoBuffer = GetParentDevice()->GetParentAdapter()->CreateRHIBuffer(
		ResourceDesc,
		8,
		CreateDesc,
		ED3D12ResourceStateMode::MultiState,
		D3D12_RESOURCE_STATE_UNORDERED_ACCESS
	);

	SetD3D12ResourceName(PostBuildInfoBuffer->GetResource(), TEXT("PostBuildInfoBuffer"));

	PostBuildInfoStagingBuffer = RHICreateStagingBuffer();
}

// Queues a geometry for compaction. The geometry must have an acceleration structure buffer
// on this device's GPU and be built with AllowCompaction + FastTrace and without AllowUpdate.
void FD3D12RayTracingCompactionRequestHandler::RequestCompact(FD3D12RayTracingGeometry* InRTGeometry)
{
	uint32 GPUIndex = GetParentDevice()->GetGPUIndex();
	check(InRTGeometry->AccelerationStructureBuffers[GPUIndex]);
	ERayTracingAccelerationStructureFlags GeometryBuildFlags = GetRayTracingAccelerationStructureBuildFlags(InRTGeometry->Initializer);
	check(EnumHasAllFlags(GeometryBuildFlags, ERayTracingAccelerationStructureFlags::AllowCompaction) &&
		EnumHasAllFlags(GeometryBuildFlags, ERayTracingAccelerationStructureFlags::FastTrace) &&
		!EnumHasAnyFlags(GeometryBuildFlags, ERayTracingAccelerationStructureFlags::AllowUpdate));

	FScopeLock Lock(&CS);
	PendingRequests.Add(InRTGeometry);
}

// Cancels a compaction request for the geometry.
// Returns true if it was found in the pending list or among the active requests, false otherwise.
bool FD3D12RayTracingCompactionRequestHandler::ReleaseRequest(FD3D12RayTracingGeometry* InRTGeometry)
{
	FScopeLock Lock(&CS);

	// Remove from pending list, not found then try active requests
	if (PendingRequests.Remove(InRTGeometry) <= 0)
	{
		// If currently enqueued, then clear pointer to not handle the compaction request anymore
		// (the slot is kept so the indices of the remaining active requests do not change)
		for (int32 BLASIndex = 0; BLASIndex < ActiveBLASGPUAddresses.Num(); ++BLASIndex)
		{
			if (ActiveRequests[BLASIndex] == InRTGeometry)
			{
				ActiveRequests[BLASIndex] = nullptr;
				return true;
			}
		}

		return false;
	}
	else
	{
		return true;
	}
}

// Advances the compaction pipeline by one step:
//  1) if a batch is in flight and its readback sync point is complete, reads the sizes and
//     calls CompactAccelerationStructure() for each request that was not released;
//     if the sync point is not complete yet, returns without doing anything else;
//  2) moves up to GD3D12RayTracingMaxBatchedCompaction pending requests into a new batch,
//     emits their post-build info and copies it to the staging buffer.
void FD3D12RayTracingCompactionRequestHandler::Update(FD3D12CommandContext& Context)
{
	LLM_SCOPE_BYNAME(TEXT("FD3D12RT/Compaction"));

	FScopeLock Lock(&CS);

	// process previous build request data retrieval
	uint32 GPUIndex = GetParentDevice()->GetGPUIndex();

	if (ActiveBLASGPUAddresses.Num() > 0)
	{
		// Ensure that our builds & copies have finished on GPU when enqueued - if still busy then wait until done
		if (PostBuildInfoBufferReadbackSyncPoint && !PostBuildInfoBufferReadbackSyncPoint->IsComplete())
		{
			return;
		}

		// Readback the sizes from the readback buffer and schedule new builds ops on the RTGeometry objects
		// (read as one uint64 per active request, in the same order as ActiveRequests)
		uint64* SizesAfterCompaction = (uint64*)PostBuildInfoStagingBuffer->Lock(0, ActiveBLASGPUAddresses.Num() * sizeof(uint64));
		for (int32 BLASIndex = 0; BLASIndex < ActiveBLASGPUAddresses.Num(); ++BLASIndex)
		{
			// Entries cleared by ReleaseRequest() are skipped.
			if (ActiveRequests[BLASIndex] != nullptr)
			{
				ActiveRequests[BLASIndex]->CompactAccelerationStructure(Context, GPUIndex, SizesAfterCompaction[BLASIndex]);
			}
		}
		PostBuildInfoStagingBuffer->Unlock();

		// reset working values
		PostBuildInfoBufferReadbackSyncPoint = nullptr;
		ActiveRequests.Empty(ActiveRequests.Num());
		ActiveBLASGPUAddresses.Empty(ActiveBLASGPUAddresses.Num());
	}

	// build a new set of build requests to extract the build data
	for (FD3D12RayTracingGeometry* RTGeometry : PendingRequests)
	{
		ActiveRequests.Add(RTGeometry);

		FD3D12ResourceLocation& ResourceLocation = RTGeometry->AccelerationStructureBuffers[GPUIndex].GetReference()->ResourceLocation;
		ActiveBLASGPUAddresses.Add(ResourceLocation.GetGPUVirtualAddress());

		Context.UpdateResidency(ResourceLocation.GetResource());

		// enqueued enough requests for this update round
		if (ActiveRequests.Num() >= GD3D12RayTracingMaxBatchedCompaction)
		{
			break;
		}
	}

	// Do we have requests?
	if (ActiveRequests.Num() > 0)
	{
		// clear out all of the pending requests, don't allow the array to shrink
		// (only the first ActiveRequests.Num() entries, i.e. the ones just moved to the active batch)
		PendingRequests.RemoveAt(0, ActiveRequests.Num(), EAllowShrinking::No);

		D3D12_RAYTRACING_ACCELERATION_STRUCTURE_POSTBUILD_INFO_DESC PostBuildInfoDesc = {};
		PostBuildInfoDesc.DestBuffer = PostBuildInfoBuffer->ResourceLocation.GetGPUVirtualAddress();
		PostBuildInfoDesc.InfoType = D3D12_RAYTRACING_ACCELERATION_STRUCTURE_POSTBUILD_INFO_COMPACTED_SIZE;

		//PostBuildInfoBuffer enters in D3D12_RESOURCE_STATE_UNORDERED_ACCESS
		//Context.TransitionResource(PostBuildInfoBuffer->GetResource(), D3D12_RESOURCE_STATE_COPY_SOURCE, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, 0);

		// Force UAV barrier to make sure all previous builds ops are finished
		Context.AddUAVBarrier();
		Context.FlushResourceBarriers();

		// Emit the RT post build info from the selected requests
		Context.RayTracingCommandList()->EmitRaytracingAccelerationStructurePostbuildInfo(&PostBuildInfoDesc, ActiveBLASGPUAddresses.Num(), ActiveBLASGPUAddresses.GetData());

		// Transition to copy source and perform the copy to readback
		Context.TransitionResource(PostBuildInfoBuffer->GetResource(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_SOURCE, 0);
		Context.FlushResourceBarriers();

		Context.RHICopyToStagingBuffer(PostBuildInfoBuffer, PostBuildInfoStagingBuffer, 0, sizeof(uint64) * ActiveBLASGPUAddresses.Num());

		// Return the buffer to the UAV state for the next batch.
		Context.TransitionResource(PostBuildInfoBuffer->GetResource(), D3D12_RESOURCE_STATE_COPY_SOURCE, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, 0);

		// Update the sync point
		PostBuildInfoBufferReadbackSyncPoint = Context.GetContextSyncPoint();
	}
}

// Cache for ray tracing pipeline collection objects, containing single shaders that can be linked into full pipelines.
class FD3D12RayTracingPipelineCache : FD3D12AdapterChild { public: UE_NONCOPYABLE(FD3D12RayTracingPipelineCache) FD3D12RayTracingPipelineCache(FD3D12Adapter* Adapter) : FD3D12AdapterChild(Adapter) , DefaultLocalRootSignature(Adapter) { // Default empty local root signature LLM_SCOPE_BYNAME(TEXT("FD3D12RT/PipelineCache")); D3D12_VERSIONED_ROOT_SIGNATURE_DESC LocalRootSignatureDesc = {}; if (GetParentAdapter()->GetRootSignatureVersion() >= D3D_ROOT_SIGNATURE_VERSION_1_1) { LocalRootSignatureDesc.Version = D3D_ROOT_SIGNATURE_VERSION_1_1; LocalRootSignatureDesc.Desc_1_1.Flags |= D3D12_ROOT_SIGNATURE_FLAG_LOCAL_ROOT_SIGNATURE; } else { LocalRootSignatureDesc.Version = D3D_ROOT_SIGNATURE_VERSION_1_0; LocalRootSignatureDesc.Desc_1_0.Flags |= D3D12_ROOT_SIGNATURE_FLAG_LOCAL_ROOT_SIGNATURE; } DefaultLocalRootSignature.Init(LocalRootSignatureDesc, RS_RayTracingLocal); } ~FD3D12RayTracingPipelineCache() { Reset(); } struct FKey { uint64 ShaderHash = 0; uint32 MaxAttributeSizeInBytes = 0; uint32 MaxPayloadSizeInBytes = 0; ID3D12RootSignature* GlobalRootSignature = nullptr; ID3D12RootSignature* LocalRootSignature = nullptr; bool operator == (const FKey& Other) const { return ShaderHash == Other.ShaderHash && MaxAttributeSizeInBytes == Other.MaxAttributeSizeInBytes && MaxPayloadSizeInBytes == Other.MaxPayloadSizeInBytes && GlobalRootSignature == Other.GlobalRootSignature && LocalRootSignature == Other.LocalRootSignature; } inline friend uint32 GetTypeHash(const FKey& Key) { return Key.ShaderHash; } }; enum class ECollectionType { Unknown, RayGen, Miss, HitGroup, Callable, }; struct FEntry { // Move-only type FEntry() = default; FEntry(FEntry&& Other) = default; FEntry(const FEntry&) = delete; FEntry& operator = (const FEntry&) = delete; FEntry& operator = (FEntry&& Other) = delete; D3D12_EXISTING_COLLECTION_DESC GetCollectionDesc() { check(bDeserialized || (CompileEvent.IsValid() && CompileEvent->IsComplete())); check(StateObject); if (StateObject == nullptr) { 
UE_LOG(LogD3D12RHI, Fatal, TEXT("Trying to add a raytracing state collection which failed to compile to a RTPSO collection.")); } D3D12_EXISTING_COLLECTION_DESC Result = {}; Result.pExistingCollection = StateObject; return Result; } const TCHAR* GetPrimaryExportNameChars() { checkf(ExportNames.Num()!=0, TEXT("This ray tracing shader collection does not export any symbols.")); return *(ExportNames[0]); } ECollectionType CollectionType = ECollectionType::Unknown; TRefCountPtr Shader; TRefCountPtr StateObject; FD3D12RayTracingPipelineInfo PipelineInfo; FGraphEventRef CompileEvent; bool bDeserialized = false; static constexpr uint32 MaxExports = 4; TArray> ExportNames; FD3D12ShaderIdentifier Identifier; float CompileTimeMS = 0.0f; }; static const TCHAR* GetCollectionTypeName(ECollectionType Type) { switch (Type) { case ECollectionType::Unknown: return TEXT("Unknown"); case ECollectionType::RayGen: return TEXT("RayGen"); case ECollectionType::Miss: return TEXT("Miss"); case ECollectionType::HitGroup: return TEXT("HitGroup"); case ECollectionType::Callable: return TEXT("Callable"); default: return TEXT(""); } } class FShaderCompileTask { public: UE_NONCOPYABLE(FShaderCompileTask) FShaderCompileTask( FEntry& InEntry, FKey InCacheKey, FD3D12Device* InDevice, ECollectionType InCollectionType, bool bInRequired) : Entry(InEntry) , CacheKey(InCacheKey) , Device(InDevice) , RayTracingDevice(InDevice->GetDevice5()) , CollectionType(InCollectionType) , bRequired(bInRequired) { } static ESubsequentsMode::Type GetSubsequentsMode() { return ESubsequentsMode::TrackSubsequents; } void DoTask(ENamedThreads::Type CurrentThread, const FGraphEventRef& MyCompletionGraphEvent) { SCOPE_CYCLE_COUNTER(STAT_RTPSO_CompileShader); TRACE_CPUPROFILER_EVENT_SCOPE(ShaderCompileTask); uint64 CompileTimeCycles = 0; CompileTimeCycles -= FPlatformTime::Cycles64(); FD3D12RayTracingShader* Shader = Entry.Shader; static constexpr uint32 MaxEntryPoints = 3; // CHS+AHS+IS for HitGroup or just a single entry 
point for other collection types TArray> OriginalEntryPoints; TArray> RenamedEntryPoints; const uint32 NumHitGroups = CollectionType == ECollectionType::HitGroup ? 1 : 0; const uint64 ShaderHash = CacheKey.ShaderHash; ID3D12RootSignature* GlobalRootSignature = CacheKey.GlobalRootSignature; ID3D12RootSignature* LocalRootSignature = CacheKey.LocalRootSignature; const uint32 DefaultLocalRootSignatureIndex = 0; uint32 MaxAttributeSizeInBytes = CacheKey.MaxAttributeSizeInBytes; uint32 MaxPayloadSizeInBytes = CacheKey.MaxPayloadSizeInBytes; D3D12_HIT_GROUP_DESC HitGroupDesc = {}; if (CollectionType == ECollectionType::HitGroup) { HitGroupDesc.HitGroupExport = Entry.GetPrimaryExportNameChars(); HitGroupDesc.Type = Shader->IntersectionEntryPoint.IsEmpty() ? D3D12_HIT_GROUP_TYPE_TRIANGLES : D3D12_HIT_GROUP_TYPE_PROCEDURAL_PRIMITIVE; { const FString& ExportName = Entry.ExportNames.Add_GetRef(GenerateShaderName(TEXT("CHS"), ShaderHash)); HitGroupDesc.ClosestHitShaderImport = *ExportName; OriginalEntryPoints.Add(*(Shader->EntryPoint)); RenamedEntryPoints.Add(*ExportName); } if (!Shader->AnyHitEntryPoint.IsEmpty()) { const FString& ExportName = Entry.ExportNames.Add_GetRef(GenerateShaderName(TEXT("AHS"), ShaderHash)); HitGroupDesc.AnyHitShaderImport = *ExportName; OriginalEntryPoints.Add(*(Shader->AnyHitEntryPoint)); RenamedEntryPoints.Add(*ExportName); } if (!Shader->IntersectionEntryPoint.IsEmpty()) { const FString& ExportName = Entry.ExportNames.Add_GetRef(GenerateShaderName(TEXT("IS"), ShaderHash)); HitGroupDesc.IntersectionShaderImport = *ExportName; OriginalEntryPoints.Add(*(Shader->IntersectionEntryPoint)); RenamedEntryPoints.Add(*ExportName); } } else { checkf(CollectionType == ECollectionType::Miss || CollectionType == ECollectionType::RayGen || CollectionType == ECollectionType::Callable, TEXT("Unexpected RT shader collection type")); OriginalEntryPoints.Add(*(Shader->EntryPoint)); RenamedEntryPoints.Add(Entry.GetPrimaryExportNameChars()); } // Validate that memory 
reservation was correct check(Entry.ExportNames.Num() <= Entry.MaxExports); FDXILLibrary Library; Library.InitFromDXIL(Shader->GetShaderBytecode(), OriginalEntryPoints.GetData(), RenamedEntryPoints.GetData(), OriginalEntryPoints.Num()); const FDXILLibrary* LibraryPtr = &Library; Entry.StateObject = CreateRayTracingStateObject( RayTracingDevice, MakeArrayView(&LibraryPtr, 1), RenamedEntryPoints, MaxAttributeSizeInBytes, MaxPayloadSizeInBytes, MakeArrayView(&HitGroupDesc, NumHitGroups), GlobalRootSignature, MakeArrayView(&LocalRootSignature, 1), {}, // LocalRootSignatureAssociations (single RS will be used for all exports since this is null) {}, // ExistingCollections D3D12_STATE_OBJECT_TYPE_COLLECTION); if (Entry.StateObject) { Device->GetRayTracingPipelineInfo(Entry.StateObject, &Entry.PipelineInfo); // Retrieve the identifier from the library Entry.Identifier = GetShaderIdentifier(Entry.StateObject, Entry.GetPrimaryExportNameChars()); } else if (bRequired) { UE_LOG(LogD3D12RHI, Fatal, TEXT("Failed to create a required RTPSO pipeline state collection")); } CompileTimeCycles += FPlatformTime::Cycles64(); Entry.CompileTimeMS = float(FPlatformTime::ToMilliseconds64(CompileTimeCycles)); if (Entry.CompileTimeMS >= 1000.0f) { // Log compilations of individual shaders that took more than 1 second UE_LOG(LogD3D12RHI, Log, TEXT("Compiled %s for RTPSO in %.2f ms."), OriginalEntryPoints[0], Entry.CompileTimeMS); } } FORCEINLINE TStatId GetStatId() const { return GET_STATID(STAT_RTPSO_CompileShader); } ENamedThreads::Type GetDesiredThread() { return ENamedThreads::AnyHiPriThreadHiPriTask; } FEntry& Entry; FKey CacheKey; FD3D12Device* Device; ID3D12Device5* RayTracingDevice; ECollectionType CollectionType; bool bRequired; }; FEntry* GetOrCompileShader( FD3D12Device* Device, FD3D12RayTracingShader* Shader, ID3D12RootSignature* GlobalRootSignature, uint32 MaxAttributeSizeInBytes, uint32 MaxPayloadSizeInBytes, bool bRequired, ECollectionType CollectionType, FGraphEventArray& 
CompletionList, bool* bOutCacheHit = nullptr) { FScopeLock Lock(&CriticalSection); const uint64 ShaderHash = GetShaderHash64(Shader); ID3D12RootSignature* LocalRootSignature = nullptr; if (CollectionType == ECollectionType::RayGen) { // RayGen shaders use a default empty local root signature as all their resources bound via global RS. LocalRootSignature = DefaultLocalRootSignature.GetRootSignature(); } else { // All other shaders (hit groups, miss, callable) use custom root signatures. LocalRootSignature = Shader->LocalRootSignature->GetRootSignature(); } FKey CacheKey; CacheKey.ShaderHash = ShaderHash; CacheKey.MaxAttributeSizeInBytes = MaxAttributeSizeInBytes; CacheKey.MaxPayloadSizeInBytes = MaxPayloadSizeInBytes; CacheKey.GlobalRootSignature = GlobalRootSignature; CacheKey.LocalRootSignature = LocalRootSignature; FEntry*& FindResult = Cache.FindOrAdd(CacheKey); if (FindResult) { if (bOutCacheHit) *bOutCacheHit = true; } else { if (bOutCacheHit) *bOutCacheHit = false; if (FindResult == nullptr) { FindResult = new FEntry; } FEntry& Entry = *FindResult; Entry.CollectionType = CollectionType; Entry.Shader = Shader; if (Shader->bPrecompiledPSO) { D3D12_SHADER_BYTECODE Bytecode = Shader->GetShaderBytecode(); Entry.StateObject = Device->DeserializeRayTracingStateObject(Bytecode, GlobalRootSignature); if (Entry.StateObject) { Device->GetRayTracingPipelineInfo(Entry.StateObject, &Entry.PipelineInfo); } checkf(Entry.StateObject != nullptr, TEXT("Failed to deserialize RTPSO")); Entry.ExportNames.Add(Shader->EntryPoint); Entry.Identifier = GetShaderIdentifier(Entry.StateObject, *Shader->EntryPoint); Entry.bDeserialized = true; } else { // Generate primary export name, which is immediately required on the PSO creation thread. 
Entry.ExportNames.Add(GenerateShaderName(GetCollectionTypeName(CollectionType), ShaderHash)); checkf(Entry.ExportNames.Num() == 1, TEXT("Primary export name must always be first.")); // Defer actual compilation to another task, as there may be many shaders that may be compiled in parallel. // Result of the compilation (the collection PSO) is not needed until final RT PSO is linked. Entry.CompileEvent = TGraphTask::CreateTask().ConstructAndDispatchWhenReady( Entry, CacheKey, Device, CollectionType, bRequired ); } } if (FindResult->CompileEvent.IsValid()) { if (!FindResult->CompileEvent->IsComplete()) { CompletionList.Add(FindResult->CompileEvent); } } else { // required PSOs collection are fatal if not valid (can have failed during precaching of this // RTPSO collection and wasn't fatal during precaching, but did fill the cache) if (bRequired && FindResult->StateObject == nullptr) { UE_LOG(LogD3D12RHI, Fatal, TEXT("Failed to create a required RTPSO collection")); } } return FindResult; } void Reset() { FScopeLock Lock(&CriticalSection); for (auto It : Cache) { delete It.Value; } Cache.Reset(); } ID3D12RootSignature* GetGlobalRootSignature(const FRHIShaderBindingLayout& ShaderBindingLayout) { FD3D12Adapter* Adapter = GetParentAdapter(); const FD3D12RootSignature* RootSignature = Adapter->GetGlobalRayTracingRootSignature(ShaderBindingLayout); return RootSignature->GetRootSignature(); } private: FCriticalSection CriticalSection; TMap Cache; FD3D12RootSignature DefaultLocalRootSignature; // Default empty root signature used for default hit shaders. 
}; inline bool AreBindlessResourcesEnabledForRayTracing(FD3D12Adapter* Adapter) { #if PLATFORM_SUPPORTS_BINDLESS_RENDERING FD3D12BindlessDescriptorManager& Manager = Adapter->GetDevice(0)->GetBindlessDescriptorManager(); if (IsBindlessEnabledForRayTracing(Manager.GetConfiguration())) { return true; } #endif return false; } // Helper class used to manage SBT buffer for a specific GPU class FD3D12RayTracingShaderBindingTableInternal { private: void WriteData(uint32 WriteOffset, const void* InData, uint32 InDataSize) { #if DO_CHECK && DO_GUARD_SLOW Data.RangeCheck(WriteOffset); Data.RangeCheck(WriteOffset + InDataSize - 1); #endif // DO_CHECK && DO_GUARD_SLOW FMemory::Memcpy(Data.GetData() + WriteOffset, InData, InDataSize); } void CompareData(uint32 Offset, const void* InData, uint32 InDataSize) { const void* CurrentData = Data.GetData() + Offset; ensure(FMemory::Memcmp(CurrentData, InData, InDataSize) == 0); } void WriteLocalShaderRecord(uint32 ShaderTableOffset, uint32 RecordIndex, uint32 OffsetWithinRecord, const void* InData, uint32 InDataSize) { checkfSlow(OffsetWithinRecord % 4 == 0, TEXT("SBT record parameters must be written on DWORD-aligned boundary")); checkfSlow(InDataSize % 4 == 0, TEXT("SBT record parameters must be DWORD-aligned")); checkfSlow(OffsetWithinRecord + InDataSize <= LocalRecordSizeUnaligned, TEXT("SBT record write request is out of bounds")); const uint32 WriteOffset = ShaderTableOffset + LocalRecordStride * RecordIndex + OffsetWithinRecord; WriteData(WriteOffset, InData, InDataSize); } void CompareLocalShaderRecord(uint32 ShaderTableOffset, uint32 RecordIndex, uint32 OffsetWithinRecord, const void* InData, uint32 InDataSize) { const uint32 Offset = ShaderTableOffset + LocalRecordStride * RecordIndex + OffsetWithinRecord; CompareData(Offset, InData, InDataSize); } public: UE_NONCOPYABLE(FD3D12RayTracingShaderBindingTableInternal) // Ray tracing shader bindings can be processed in parallel. 
// Each concurrent worker gets its own dedicated descriptor cache instance to avoid contention or locking. // Scaling beyond 5 total threads does not yield any speedup in practice. static constexpr uint32 MaxBindingWorkers = 5; // RHI thread + 4 parallel workers. FD3D12RayTracingShaderBindingTableInternal(FRHICommandListBase& RHICmdList, const FRayTracingShaderBindingTableInitializer& Initializer, FD3D12Device* InDevice) : UniqueId(NextUniqueId++) { checkf(Initializer.LocalBindingDataSize <= 4096, TEXT("The maximum size of a local root signature is 4KB.")); // as per section 4.22.1 of DXR spec v1.0 check(Initializer.ShaderBindingMode != ERayTracingShaderBindingMode::Disabled); Device = InDevice; Lifetime = Initializer.Lifetime; HitGroupIndexingMode = Initializer.HitGroupIndexingMode; ShaderBindingMode = Initializer.ShaderBindingMode; NumShaderSlotsPerGeometrySegment = Initializer.NumShaderSlotsPerGeometrySegment; const uint32 NumHitGroupSlots = Initializer.HitGroupIndexingMode == ERayTracingHitGroupIndexingMode::Allow ? Initializer.NumGeometrySegments * Initializer.NumShaderSlotsPerGeometrySegment : 1; NumMissRecords = Initializer.NumMissShaderSlots; NumHitRecords = NumHitGroupSlots; NumCallableRecords = Initializer.NumCallableShaderSlots; if (EnumHasAnyFlags(ShaderBindingMode, ERayTracingShaderBindingMode::RTPSO)) { checkf(Initializer.LocalBindingDataSize >= sizeof(FD3D12HitGroupSystemParameters), TEXT("All local root signatures are expected to contain ray tracing system root parameters")); LocalRecordSizeUnaligned = ShaderIdentifierSize + Initializer.LocalBindingDataSize; LocalRecordStride = RoundUpToNextMultiple(LocalRecordSizeUnaligned, D3D12_RAYTRACING_SHADER_RECORD_BYTE_ALIGNMENT); // Custom descriptor cache is only required when local resources may be bound. // If only global resources are used, then transient descriptor cache can be used. 
const bool bNeedsDescriptorCache = (NumHitGroupSlots + Initializer.NumCallableShaderSlots + Initializer.NumMissShaderSlots) * Initializer.LocalBindingDataSize != 0; if (bNeedsDescriptorCache) { // #dxr_todo UE-72158: Remove this when RT descriptors are sub-allocated from the global view descriptor heap. if (GD3D12ExplicitViewDescriptorHeapOverflowReported) { GD3D12ExplicitViewDescriptorHeapSize = GD3D12ExplicitViewDescriptorHeapSize * 2; GD3D12ExplicitViewDescriptorHeapOverflowReported = 0; } // D3D12 is guaranteed to support 1M (D3D12_MAX_SHADER_VISIBLE_DESCRIPTOR_HEAP_SIZE_TIER_1) descriptors in a CBV/SRV/UAV heap, so clamp the size to this. // https://docs.microsoft.com/en-us/windows/desktop/direct3d12/hardware-support const uint32 NumViewDescriptors = FMath::Min(D3D12_MAX_SHADER_VISIBLE_DESCRIPTOR_HEAP_SIZE_TIER_1, GD3D12ExplicitViewDescriptorHeapSize); const uint32 NumSamplerDescriptors = D3D12_MAX_SHADER_VISIBLE_SAMPLER_HEAP_SIZE; DescriptorCache = new FD3D12ExplicitDescriptorCache(Device, MaxBindingWorkers); DescriptorCache->Init(0, NumViewDescriptors, NumSamplerDescriptors, ERHIBindlessConfiguration::RayTracing); } uint32 TotalDataSize = 0; HitGroupShaderTableOffset = TotalDataSize; TotalDataSize += NumHitGroupSlots * LocalRecordStride; TotalDataSize = RoundUpToNextMultiple(TotalDataSize, D3D12_RAYTRACING_SHADER_TABLE_BYTE_ALIGNMENT); CallableShaderTableOffset = TotalDataSize; TotalDataSize += Initializer.NumCallableShaderSlots * LocalRecordStride; TotalDataSize = RoundUpToNextMultiple(TotalDataSize, D3D12_RAYTRACING_SHADER_TABLE_BYTE_ALIGNMENT); MissShaderTableOffset = TotalDataSize; TotalDataSize += Initializer.NumMissShaderSlots * LocalRecordStride; TotalDataSize = RoundUpToNextMultiple(TotalDataSize, D3D12_RAYTRACING_SHADER_TABLE_BYTE_ALIGNMENT); Data.SetNumZeroed(TotalDataSize); #if DO_CHECK bWasDefaultMissShaderSet = false; #endif SetDefaultHitGroupIdentifier(FD3D12ShaderIdentifier::Null); SetDefaultMissShaderIdentifier(FD3D12ShaderIdentifier::Null); 
SetDefaultCallableShaderIdentifier(FD3D12ShaderIdentifier::Null); // Keep CPU-side data after upload Data.SetAllowCPUAccess(true); } if (EnumHasAnyFlags(ShaderBindingMode, ERayTracingShaderBindingMode::Inline) && Initializer.NumGeometrySegments > 0) { // Doesn't make sense to have inline SBT without hitgroup indexing check(HitGroupIndexingMode == ERayTracingHitGroupIndexingMode::Allow); const uint32 ParameterBufferSize = Initializer.NumGeometrySegments * sizeof(FD3D12HitGroupSystemParameters); InlineGeometryParameterData.SetNumZeroed(ParameterBufferSize); } if (Lifetime == ERayTracingShaderBindingTableLifetime::Persistent) { HitRecordData.SetNum(NumHitRecords); } } ~FD3D12RayTracingShaderBindingTableInternal() { // Might be slow - iterate faster somehow? for (int32 RecordIndex = 0; RecordIndex <= WorkerData[0].MaxUsedHitRecordIndex; ++RecordIndex) { ClearHitRecordData(0, RecordIndex); } delete DescriptorCache; #if D3D12RHI_USE_CONSTANT_BUFFER_VIEWS for (FWorkerThreadData& ThisWorkerData : WorkerData) { for (FD3D12ConstantBufferView* CBV : ThisWorkerData.LooseParameterCBVs) { delete CBV; } } #endif // D3D12RHI_USE_CONSTANT_BUFFER_VIEWS } template void SetLocalShaderParameters(uint32 ShaderTableOffset, uint32 RecordIndex, uint32 InOffsetWithinRootSignature, const T& Parameters) { WriteLocalShaderRecord(ShaderTableOffset, RecordIndex, ShaderIdentifierSize + InOffsetWithinRootSignature, &Parameters, sizeof(Parameters)); } void SetLocalShaderParameters(uint32 ShaderTableOffset, uint32 RecordIndex, uint32 InOffsetWithinRootSignature, const void* InData, uint32 InDataSize) { WriteLocalShaderRecord(ShaderTableOffset, RecordIndex, ShaderIdentifierSize + InOffsetWithinRootSignature, InData, InDataSize); } template void CompareLocalShaderParameters(uint32 ShaderTableOffset, uint32 RecordIndex, uint32 InOffsetWithinRootSignature, const T& Parameters) { CompareLocalShaderRecord(ShaderTableOffset, RecordIndex, ShaderIdentifierSize + InOffsetWithinRootSignature, &Parameters, 
sizeof(Parameters)); } void CompareLocalShaderParameters(uint32 ShaderTableOffset, uint32 RecordIndex, uint32 InOffsetWithinRootSignature, const void* InData, uint32 InDataSize) { CompareLocalShaderRecord(ShaderTableOffset, RecordIndex, ShaderIdentifierSize + InOffsetWithinRootSignature, InData, InDataSize); } template void SetMissShaderParameters(uint32 RecordIndex, uint32 InOffsetWithinRootSignature, const T& Parameters) { const uint32 ShaderTableOffset = MissShaderTableOffset; WriteLocalShaderRecord(ShaderTableOffset, RecordIndex, ShaderIdentifierSize + InOffsetWithinRootSignature, &Parameters, sizeof(Parameters)); } template void SetCallableShaderParameters(uint32 RecordIndex, uint32 InOffsetWithinRootSignature, const T& Parameters) { const uint32 ShaderTableOffset = CallableShaderTableOffset; WriteLocalShaderRecord(ShaderTableOffset, RecordIndex, ShaderIdentifierSize + InOffsetWithinRootSignature, &Parameters, sizeof(Parameters)); } void CopyLocalShaderParameters(uint32 InShaderTableOffset, uint32 InDestRecordIndex, uint32 InSourceRecordIndex, uint32 InOffsetWithinRootSignature) { const uint32 BaseOffset = InShaderTableOffset + ShaderIdentifierSize + InOffsetWithinRootSignature; const uint32 DestOffset = BaseOffset + LocalRecordStride * InDestRecordIndex; const uint32 SourceOffset = BaseOffset + LocalRecordStride * InSourceRecordIndex; const uint32 CopySize = LocalRecordStride - ShaderIdentifierSize - InOffsetWithinRootSignature; checkSlow(CopySize <= LocalRecordStride); FMemory::Memcpy( Data.GetData() + DestOffset, Data.GetData() + SourceOffset, CopySize); } void CopyHitGroupParameters(uint32 InDestRecordIndex, uint32 InSourceRecordIndex, uint32 InOffsetWithinRootSignature) { const uint32 ShaderTableOffset = HitGroupShaderTableOffset; CopyLocalShaderParameters(ShaderTableOffset, InDestRecordIndex, InSourceRecordIndex, InOffsetWithinRootSignature); } void SetMissIdentifier(uint32 RecordIndex, const FD3D12ShaderIdentifier& ShaderIdentifier) { const uint32 
WriteOffset = MissShaderTableOffset + RecordIndex * LocalRecordStride; #if DO_CHECK if (RecordIndex == 0) { bWasDefaultMissShaderSet = true; } #endif WriteData(WriteOffset, ShaderIdentifier.Data, ShaderIdentifierSize); } void SetCallableIdentifier(uint32 RecordIndex, const FD3D12ShaderIdentifier& ShaderIdentifier) { const uint32 WriteOffset = CallableShaderTableOffset + RecordIndex * LocalRecordStride; WriteData(WriteOffset, ShaderIdentifier.Data, ShaderIdentifierSize); } void SetDefaultHitGroupIdentifier(const FD3D12ShaderIdentifier& ShaderIdentifier) { const uint32 WriteOffset = HitGroupShaderTableOffset; WriteData(WriteOffset, ShaderIdentifier.Data, ShaderIdentifierSize); } void SetHitGroupSystemParameters(uint32 RecordIndex, const FD3D12HitGroupSystemParameters& SystemParameters) { const uint32 OffsetWithinRootSignature = 0; // System parameters are always first in the RS. const uint32 ShaderTableOffset = HitGroupShaderTableOffset; SetLocalShaderParameters(ShaderTableOffset, RecordIndex, OffsetWithinRootSignature, SystemParameters); } void CompareHitGroupSystemParameters(uint32 RecordIndex, const FD3D12HitGroupSystemParameters& SystemParameters) { const uint32 OffsetWithinRootSignature = 0; // System parameters are always first in the RS. 
const uint32 ShaderTableOffset = HitGroupShaderTableOffset; CompareLocalShaderParameters(ShaderTableOffset, RecordIndex, OffsetWithinRootSignature, SystemParameters); } void SetHitGroupIdentifier(uint32 RecordIndex, const FD3D12ShaderIdentifier& ShaderIdentifier) { checkfSlow(ShaderIdentifier.IsValid(), TEXT("Shader identifier must be initialized FD3D12RayTracingPipelineState::GetShaderIdentifier() before use.")); checkSlow(sizeof(ShaderIdentifier.Data) >= ShaderIdentifierSize); const uint32 WriteOffset = HitGroupShaderTableOffset + RecordIndex * LocalRecordStride; WriteData(WriteOffset, ShaderIdentifier.Data, ShaderIdentifierSize); } void CompareHitGroupIdentifier(uint32 RecordIndex, const FD3D12ShaderIdentifier& ShaderIdentifier) { checkfSlow(ShaderIdentifier.IsValid(), TEXT("Shader identifier must be initialized FD3D12RayTracingPipelineState::GetShaderIdentifier() before use.")); checkSlow(sizeof(ShaderIdentifier.Data) >= ShaderIdentifierSize); const uint32 WriteOffset = HitGroupShaderTableOffset + RecordIndex * LocalRecordStride; CompareData(WriteOffset, ShaderIdentifier.Data, ShaderIdentifierSize); } void SetDefaultMissShaderIdentifier(const FD3D12ShaderIdentifier& ShaderIdentifier) { // Set all slots to the same default for (uint32 Index = 0; Index < NumMissRecords; ++Index) { SetMissIdentifier(Index, ShaderIdentifier); } #if DO_CHECK bWasDefaultMissShaderSet = false; #endif } void SetDefaultCallableShaderIdentifier(const FD3D12ShaderIdentifier& ShaderIdentifier) { for (uint32 Index = 0; Index < NumCallableRecords; ++Index) { SetCallableIdentifier(Index, ShaderIdentifier); } } void SetInlineGeometryParameters(uint32 InlineRecordIndex, const FD3D12HitGroupSystemParameters& SystemParameters) { const uint32 DataSize = sizeof(FD3D12HitGroupSystemParameters); const uint32 WriteOffset = DataSize * InlineRecordIndex; FMemory::Memcpy(&InlineGeometryParameterData[WriteOffset], &SystemParameters, DataSize); } void CompareInlineGeometryParameters(uint32 
InlineRecordIndex, const FD3D12HitGroupSystemParameters& SystemParameters) { const uint32 DataSize = sizeof(FD3D12HitGroupSystemParameters); const uint32 WriteOffset = DataSize * InlineRecordIndex; const void* CurrentData = InlineGeometryParameterData.GetData() + WriteOffset; ensure(FMemory::Memcmp(CurrentData, &SystemParameters, DataSize) == 0); } FRHISizeAndStride GetInlineBindingDataSizeAndStride() const { return FRHISizeAndStride { (uint64)InlineGeometryParameterData.Num(), sizeof(FD3D12HitGroupSystemParameters) }; } void Commit(FD3D12CommandContext& Context, FRHIBuffer* InInlineBindingDataBuffer) { TRACE_CPUPROFILER_EVENT_SCOPE(ShaderTableCommit); check(IsInRHIThread() || !IsRunningRHIInSeparateThread()); checkf(bIsDirty, TEXT("bIsDirty should be checked before calling Commit()")); #if ENABLE_RESIDENCY_MANAGEMENT // Merge all data from worker threads into the main set { TRACE_CPUPROFILER_EVENT_SCOPE(MergeWorkerData); for (uint32 WorkerIndex = 1; WorkerIndex < MaxBindingWorkers; ++WorkerIndex) { for (FD3D12Resource* Resource : WorkerData[WorkerIndex].DynamicReferencedResources) { AddDynamicReferencedResource(0, Resource); } for (auto Iter = WorkerData[WorkerIndex].PersistentReferencedResources.CreateIterator(); Iter; ++Iter) { UpdatePersistentReferencedResource(0, Iter.Key(), Iter.Value()); } WorkerData[WorkerIndex].DynamicReferencedResources.Empty(); WorkerData[WorkerIndex].DynamicReferencedResourceSet.Empty(); WorkerData[WorkerIndex].PersistentReferencedResources.Empty(); WorkerData[WorkerIndex].LastTrackedBaseShaderResources.Empty(); } } // Build the used persistent d3d12 resource set from the views { TRACE_CPUPROFILER_EVENT_SCOPE(BuildUsedResourceSet); ReferencedResources.Empty(WorkerData[0].PersistentReferencedResources.Num() + WorkerData[0].DynamicReferencedResources.Num()); // Add tracked resources for persistent SBT records for (auto Iter = WorkerData[0].PersistentReferencedResources.CreateIterator(); Iter; ++Iter) { ReferencedResources.Add(Iter.Key()); 
} // Also add all the collected dynamic resources and reset for the next 'frame' ReferencedResources.Append(WorkerData[0].DynamicReferencedResources); WorkerData[0].DynamicReferencedResources.Empty(); WorkerData[0].DynamicReferencedResourceSet.Empty(); WorkerData[0].LastTrackedBaseShaderResources.Empty(); } #endif // ENABLE_RESIDENCY_MANAGEMENT // Merge max record index from worker data into index 0 for (uint32 WorkerIndex = 1; WorkerIndex < MaxBindingWorkers; ++WorkerIndex) { WorkerData[0].MaxUsedHitRecordIndex = FMath::Max(WorkerData[0].MaxUsedHitRecordIndex, WorkerData[WorkerIndex].MaxUsedHitRecordIndex); WorkerData[WorkerIndex].MaxUsedHitRecordIndex = 0; } // Clear the record cache for reuse for all worker threads (including index 0) for (uint32 WorkerIndex = 0; WorkerIndex < MaxBindingWorkers; ++WorkerIndex) { WorkerData[WorkerIndex].ShaderRecordCache.Empty(); } FD3D12Adapter* Adapter = Device->GetParentAdapter(); if (EnumHasAnyFlags(ShaderBindingMode, ERayTracingShaderBindingMode::RTPSO)) { checkf(Data.Num(), TEXT("Shader table is expected to be initialized before copying to GPU.")); #if DO_CHECK // Need to split out hit binding data from miss and callable because it's possible that hit binding data wasn't set or needed (no valid binding), // but miss data is still needed, and we also don't need to copy the complete allocated hit binding data but only upto MaxUsedHitRecordIndex // which is not easily possible with the 1 big allocation. 
//checkf(bWasDefaultMissShaderSet, TEXT("At least the first miss shader must have been set before copying to GPU.")); #endif const uint32 BufferSize = Data.GetResourceDataSize(); D3D12_RESOURCE_DESC BufferDesc = CD3DX12_RESOURCE_DESC::Buffer(BufferSize, D3D12_RESOURCE_FLAG_NONE, D3D12_RAYTRACING_SHADER_TABLE_BYTE_ALIGNMENT); const FRHIBufferCreateDesc CreateDesc = FRHIBufferCreateDesc::Create(TEXT("ShaderBindingTable"), BufferSize, 0, BUF_Static) .SetInitialState(ERHIAccess::CopyDest) .SetGPUMask(FRHIGPUMask::FromIndex(Device->GetGPUIndex())); ID3D12ResourceAllocator* ResourceAllocator = nullptr; Buffer = Adapter->CreateRHIBuffer( BufferDesc, BufferDesc.Alignment, CreateDesc, ED3D12ResourceStateMode::MultiState, D3D12_RESOURCE_STATE_COPY_DEST, /*bHasInitialData*/ true ); // Use copy queue for uploading the data Context.BatchedSyncPoints.ToWait.Emplace(Buffer->UploadResourceDataViaCopyQueue(Context, &Data)); // Enqueue transition to SRV Context.TransitionResource( Buffer->GetResource(), D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE, 0); } // Also copy geometry parameter data to the GPU buffer if (EnumHasAnyFlags(ShaderBindingMode, ERayTracingShaderBindingMode::Inline) && WorkerData[0].MaxUsedHitRecordIndex >= 0) { check(InInlineBindingDataBuffer); FD3D12Buffer* D3D12InlineBindingDataBuffer = FD3D12DynamicRHI::ResourceCast(InInlineBindingDataBuffer, Device->GetGPUIndex()); uint32 MaxUsedInlineRecordIndex = WorkerData[0].MaxUsedHitRecordIndex / NumShaderSlotsPerGeometrySegment; uint32 MaxUsedInlineGeometryParamaterDataSize = (MaxUsedInlineRecordIndex + 1) * sizeof(FD3D12HitGroupSystemParameters); FD3D12ResourceLocation UploadResourceLocation(Device); void* MappedUploadMemory = Adapter->GetUploadHeapAllocator(Device->GetGPUIndex()).AllocUploadResource(MaxUsedInlineGeometryParamaterDataSize, D3D12_TEXTURE_DATA_PLACEMENT_ALIGNMENT, UploadResourceLocation); FMemory::Memcpy(MappedUploadMemory, InlineGeometryParameterData.GetData(), 
MaxUsedInlineGeometryParamaterDataSize); FD3D12Resource* SourceResource = UploadResourceLocation.GetResource(); FD3D12Resource* DestResource = D3D12InlineBindingDataBuffer->ResourceLocation.GetResource(); Context.UpdateResidency(DestResource); Context.CopyBufferRegionChecked( DestResource->GetResource(), DestResource->GetName(), D3D12InlineBindingDataBuffer->ResourceLocation.GetOffsetFromBaseOfResource(), SourceResource->GetResource(), SourceResource->GetName(), UploadResourceLocation.GetOffsetFromBaseOfResource(), MaxUsedInlineGeometryParamaterDataSize ); #if ENABLE_RESIDENCY_MANAGEMENT // assume no references resources when inline SBT only - inline SBT only has references to IB/VB buffers and these are // residency tracked via the TLAS check(ShaderBindingMode != ERayTracingShaderBindingMode::Inline || ReferencedResources.IsEmpty()); #endif // ENABLE_RESIDENCY_MANAGEMENT } bIsDirty = false; } D3D12_GPU_VIRTUAL_ADDRESS GetShaderTableAddress() const { checkf(!bIsDirty, TEXT("Shader table update is pending, therefore GPU address is not available. 
Use Commit() to upload data and acquire a valid GPU buffer address.")); return Buffer->ResourceLocation.GetGPUVirtualAddress(); } D3D12_DISPATCH_RAYS_DESC GetDispatchRaysDesc(FD3D12Device* InDevice, const FD3D12ShaderIdentifier& RayGenShaderIdentifier) const { check(Device == InDevice); // Allocate memory for the ray gen shader identifier storage check(ShaderIdentifierSize == sizeof(FD3D12ShaderIdentifier)); FD3D12ResourceLocation UploadResourceLocation(Device); void* RayGenGPUData = Device->GetDefaultFastAllocator().Allocate(RayGenRecordStride, 256, &UploadResourceLocation); FMemory::Memcpy(RayGenGPUData, &RayGenShaderIdentifier, ShaderIdentifierSize); D3D12_GPU_VIRTUAL_ADDRESS RayGenStartShaderIdentifierAddress = UploadResourceLocation.GetGPUVirtualAddress(); D3D12_GPU_VIRTUAL_ADDRESS ShaderTableAddress = GetShaderTableAddress(); D3D12_DISPATCH_RAYS_DESC Desc = {}; Desc.RayGenerationShaderRecord.StartAddress = RayGenStartShaderIdentifierAddress; Desc.RayGenerationShaderRecord.SizeInBytes = RayGenRecordStride; Desc.MissShaderTable.StartAddress = ShaderTableAddress + MissShaderTableOffset; Desc.MissShaderTable.StrideInBytes = LocalRecordStride; Desc.MissShaderTable.SizeInBytes = LocalRecordStride * NumMissRecords; if (NumCallableRecords) { Desc.CallableShaderTable.StartAddress = ShaderTableAddress + CallableShaderTableOffset; Desc.CallableShaderTable.StrideInBytes = LocalRecordStride; Desc.CallableShaderTable.SizeInBytes = NumCallableRecords * LocalRecordStride; } if (HitGroupIndexingMode == ERayTracingHitGroupIndexingMode::Allow) { Desc.HitGroupTable.StartAddress = ShaderTableAddress + HitGroupShaderTableOffset; Desc.HitGroupTable.StrideInBytes = LocalRecordStride; Desc.HitGroupTable.SizeInBytes = NumHitRecords * LocalRecordStride; } else { Desc.HitGroupTable.StartAddress = ShaderTableAddress + HitGroupShaderTableOffset; Desc.HitGroupTable.StrideInBytes = 0; // Zero stride effectively disables SBT indexing Desc.HitGroupTable.SizeInBytes = 
D3D12_RAYTRACING_SHADER_TABLE_BYTE_ALIGNMENT; // Minimal table with only one record } return Desc; } static constexpr uint32 ShaderIdentifierSize = D3D12_SHADER_IDENTIFIER_SIZE_IN_BYTES; uint32 NumHitRecords = 0; uint32 NumCallableRecords = 0; uint32 NumMissRecords = 0; uint32 MissShaderTableOffset = 0; uint32 HitGroupShaderTableOffset = 0; uint32 CallableShaderTableOffset = 0; FD3D12Device* Device = nullptr; ERayTracingShaderBindingTableLifetime Lifetime = ERayTracingShaderBindingTableLifetime::Transient; ERayTracingHitGroupIndexingMode HitGroupIndexingMode = ERayTracingHitGroupIndexingMode::Allow; ERayTracingShaderBindingMode ShaderBindingMode = ERayTracingShaderBindingMode::RTPSO; uint32 NumShaderSlotsPerGeometrySegment = 0; // Note: TABLE_BYTE_ALIGNMENT is used instead of RECORD_BYTE_ALIGNMENT to allow arbitrary switching // between multiple RayGen and Miss shaders within the same underlying table. static constexpr uint32 RayGenRecordStride = D3D12_RAYTRACING_SHADER_TABLE_BYTE_ALIGNMENT; uint32 LocalRecordSizeUnaligned = 0; // size of the shader identifier + local root parameters, not aligned to SHADER_RECORD_BYTE_ALIGNMENT (used for out-of-bounds access checks) uint32 LocalRecordStride = 0; // size of shader identifier + local root parameters, aligned to SHADER_RECORD_BYTE_ALIGNMENT (same for hit groups and callable shaders) TResourceArray Data; // Buffer that contains per-hitrecord index and vertex buffer binding data TResourceArray InlineGeometryParameterData; bool bIsDirty = true; TRefCountPtr Buffer; #if DO_CHECK bool bWasDefaultMissShaderSet = false; #endif // All reference d3d12 resources from both persistent and dynamic records (rebuild during commit) TArray ReferencedResources; // SBTs have their own descriptor heaps FD3D12ExplicitDescriptorCache* DescriptorCache = nullptr; struct FRecordUpdateUniformBufferListener : public ID3D12UniformBufferUpdateListener { FD3D12RayTracingShaderBindingTableInternal& ShaderTable; FD3D12UniformBuffer* UniformBuffer = 
nullptr; uint32 ShaderTableOffset = 0; uint32 RecordIndex = 0; uint32 OffsetWithinRootSignature = 0; FRecordUpdateUniformBufferListener(FD3D12RayTracingShaderBindingTableInternal& InShaderTable) : ShaderTable(InShaderTable) { } virtual ~FRecordUpdateUniformBufferListener() = default; void Init(FD3D12UniformBuffer* InUniformBuffer, uint32 InShaderTableOffset, uint32 InRecordIndex, uint32 InOffsetWithinRootSignature) { UniformBuffer = InUniformBuffer; ShaderTableOffset = InShaderTableOffset; RecordIndex = InRecordIndex; OffsetWithinRootSignature = InOffsetWithinRootSignature; UniformBuffer->AddUpdateListener(this); } void Reset() { check(UniformBuffer); UniformBuffer->RemoveUpdateListener(this); UniformBuffer = nullptr; } virtual void RemoveListener(FD3D12UniformBuffer* InUpdatedUniformBuffer) final override { check(UniformBuffer == InUpdatedUniformBuffer); ShaderTable.RemoveUpdateUniformBufferListener(RecordIndex, this); } virtual void UniformBufferUpdated(FRHICommandListBase& CmdList, FD3D12UniformBuffer* InUpdatedUniformBuffer) final override { check(UniformBuffer == InUpdatedUniformBuffer); ShaderTable.SetLocalShaderParameters(ShaderTableOffset, RecordIndex, OffsetWithinRootSignature, InUpdatedUniformBuffer->ResourceLocation.GetGPUVirtualAddress()); INC_DWORD_STAT(STAT_D3D12RayTracingUBRecordUpdate); } }; struct FRecordUpdateRayTracingGeometryListener : public ID3D12RayTracingGeometryUpdateListener { FD3D12RayTracingShaderBindingTableInternal* ShaderTable = nullptr; const FD3D12RayTracingGeometry* Geometry = nullptr; uint32 RecordIndex = 0; uint32 UserData = 0; uint32 SegmentIndex = 0; virtual ~FRecordUpdateRayTracingGeometryListener() = default; void Init(FD3D12RayTracingShaderBindingTableInternal* InShaderTable, const FD3D12RayTracingGeometry* InGeometry, uint32 InRecordIndex, uint32 InUserData, uint32 InSegmentIndex) { check(Geometry == nullptr); ShaderTable = InShaderTable; Geometry = InGeometry; RecordIndex = InRecordIndex; UserData = InUserData; 
SegmentIndex = InSegmentIndex; Geometry->AddUpdateListener(this); } void Reset() { ShaderTable = nullptr; Geometry = nullptr; RecordIndex = 0; UserData = 0; SegmentIndex = 0; } virtual void RemoveListener(FD3D12RayTracingGeometry* InGeometry) final override { check(Geometry == InGeometry); ShaderTable->RemoveUpdateRayTracingGeometryListener(RecordIndex, this); } virtual void HitGroupParametersUpdated(FD3D12RayTracingGeometry* InGeometry) final override { check(InGeometry == Geometry); uint32 GPUIndex = ShaderTable->Device->GetGPUIndex(); FD3D12HitGroupSystemParameters SystemParameters = Geometry->HitGroupSystemParameters[GPUIndex][SegmentIndex]; SystemParameters.RootConstants.UserData = UserData; uint32 InlineRecordIndex = ShaderTable->GetInlineRecordIndex(RecordIndex); if (EnumHasAnyFlags(ShaderTable->ShaderBindingMode, ERayTracingShaderBindingMode::RTPSO)) { ShaderTable->SetHitGroupSystemParameters(RecordIndex, SystemParameters); } if (EnumHasAnyFlags(ShaderTable->ShaderBindingMode, ERayTracingShaderBindingMode::Inline) && InlineRecordIndex != INDEX_NONE) { ShaderTable->SetInlineGeometryParameters(InlineRecordIndex, SystemParameters); } } }; #if ENABLE_RESIDENCY_MANAGEMENT struct FRecordUpdateShaderResourceRenameListener : public FD3D12ShaderResourceRenameListener { FD3D12RayTracingShaderBindingTableInternal& ShaderTable; uint32 RecordIndex = 0; TRefCountPtr ShaderResource; FD3D12Resource* CurrentResource = nullptr; FRecordUpdateShaderResourceRenameListener(FD3D12RayTracingShaderBindingTableInternal& InShaderTable) : ShaderTable(InShaderTable) { } virtual ~FRecordUpdateShaderResourceRenameListener() = default; void Init(uint32 WorkerIndex, uint32 InRecordIndex, FD3D12BaseShaderResource* InShaderResource) { RecordIndex = InRecordIndex; ShaderResource = InShaderResource; CurrentResource = InShaderResource->ResourceLocation.GetResource(); ShaderResource->AddRenameListener(this); if (CurrentResource) { ShaderTable.UpdatePersistentReferencedResource(WorkerIndex, 
CurrentResource, 1); } } void Reset(uint32 WorkerIndex) { check(ShaderResource); if (CurrentResource) { ShaderTable.UpdatePersistentReferencedResource(WorkerIndex, CurrentResource, -1); ShaderTable.AddLastTrackedBaseShaderResource(WorkerIndex, ShaderResource); } ShaderResource->RemoveRenameListener(this); ShaderResource = nullptr; RecordIndex = 0; CurrentResource = 0; } virtual void ResourceRenamed(FD3D12ContextArray const& Contexts, FD3D12BaseShaderResource* InRenamedResource, FD3D12ResourceLocation* InNewResourceLocation) override { check(InRenamedResource == ShaderResource); FD3D12Resource* NewResource = InNewResourceLocation->GetResource(); // only care if actual d3d12 resource has changed if (NewResource != CurrentResource) { // Resource could be streamed in or out and then contain empty d3d12 backing resource if (CurrentResource) { ShaderTable.UpdatePersistentReferencedResource(0, CurrentResource, -1); } if (NewResource) { ShaderTable.UpdatePersistentReferencedResource(0, NewResource, 1); } CurrentResource = NewResource; } } }; struct FRecordUpdateTextureReferenceReplaceListener : public FD3D12TextureReferenceReplaceListener, public FD3D12ShaderResourceRenameListener { FD3D12RayTracingShaderBindingTableInternal& ShaderTable; uint32 RecordIndex = 0; TRefCountPtr TextureReference; FD3D12Resource* CurrentResource = nullptr; FRecordUpdateTextureReferenceReplaceListener(FD3D12RayTracingShaderBindingTableInternal& InShaderTable) : ShaderTable(InShaderTable) { } virtual ~FRecordUpdateTextureReferenceReplaceListener() = default; void Init(uint32 WorkerIndex, uint32 InRecordIndex, FD3D12RHITextureReference* InTextureReference) { RecordIndex = InRecordIndex; TextureReference = InTextureReference; TextureReference->AddReplaceListener(this); // Handle rename events on the references texture as well FD3D12Texture* ReferencedTexture = FD3D12DynamicRHI::ResourceCast(TextureReference->GetReferencedTexture()); ReferencedTexture->AddRenameListener(this); CurrentResource = 
ReferencedTexture->ResourceLocation.GetResource(); check(CurrentResource); ShaderTable.UpdatePersistentReferencedResource(WorkerIndex, CurrentResource, 1); } void Reset(uint32 WorkerIndex) { check(TextureReference && CurrentResource); ShaderTable.UpdatePersistentReferencedResource(WorkerIndex, CurrentResource, -1); FD3D12Texture* ReferencedTexture = FD3D12DynamicRHI::ResourceCast(TextureReference->GetReferencedTexture()); ReferencedTexture->RemoveRenameListener(this); TextureReference->RemoveReplaceListener(this); ShaderTable.AddLastTrackedBaseShaderResource(WorkerIndex, ReferencedTexture); TextureReference = nullptr; RecordIndex = 0; CurrentResource = 0; } virtual void TextureReplaced(FD3D12ContextArray const& Contexts, FD3D12RHITextureReference* InTextureReference, FD3D12Texture* CurrentTexture, FD3D12Texture* NewTexture) override { check(TextureReference == InTextureReference); FD3D12Resource* NewResource = NewTexture->ResourceLocation.GetResource(); // only care if actual d3d12 resource has changed if (NewResource != CurrentResource) { ShaderTable.UpdatePersistentReferencedResource(0, CurrentResource, -1); ShaderTable.UpdatePersistentReferencedResource(0, NewResource, 1); CurrentResource = NewResource; } // Also update the rename listener CurrentTexture->RemoveRenameListener(this); NewTexture->AddRenameListener(this); } virtual void ResourceRenamed(FD3D12ContextArray const& Contexts, FD3D12BaseShaderResource* InRenamedResource, FD3D12ResourceLocation* InNewResourceLocation) override { FD3D12Texture* ReferencedTexture = FD3D12DynamicRHI::ResourceCast(TextureReference->GetReferencedTexture()); check(InRenamedResource == ReferencedTexture); FD3D12Resource* NewResource = InNewResourceLocation->GetResource(); // only care if actual d3d12 resource has changed if (NewResource != CurrentResource) { ShaderTable.UpdatePersistentReferencedResource(0, CurrentResource, -1); ShaderTable.UpdatePersistentReferencedResource(0, NewResource, 1); CurrentResource = NewResource; } } 
}; #endif // ENABLE_RESIDENCY_MANAGEMENT // Tracked data for each record so stored GPU data can be updated after defrag operation struct FRecordData { enum class EState { Uninitialized, Persistent, Transient, }; void Reset() { #if ENABLE_RESIDENCY_MANAGEMENT ReferencedShaderResourceListeners.Empty(); ReferencedTextureReferenceListeners.Empty(); #endif // ENABLE_RESIDENCY_MANAGEMENT UpdateGeometryListener.Reset(); UpdateUniformBufferListeners.Empty(); #if DO_CHECK LooseParameterData.Empty(); #endif //DO_CHECK State = EState::Uninitialized; } EState State = EState::Uninitialized; TArray UpdateUniformBufferListeners; FRecordUpdateRayTracingGeometryListener UpdateGeometryListener; FD3D12ResourceLocation* LooseParameterResourceLocation = nullptr; #if ENABLE_RESIDENCY_MANAGEMENT TArray ReferencedShaderResourceListeners; TArray ReferencedTextureReferenceListeners; #endif // ENABLE_RESIDENCY_MANAGEMENT #if DO_CHECK TArray LooseParameterData; #endif //DO_CHECK }; TArray HitRecordData; uint32 GetInlineRecordIndex(uint32 RecordIndex) const { // Only care about shader slot 0 for inline geometry parameters -> remap the record index return (RecordIndex % NumShaderSlotsPerGeometrySegment == 0) ? 
RecordIndex / NumShaderSlotsPerGeometrySegment : INDEX_NONE; } void SetHitGroupGeometrySystemParameters(uint32 WorkerIndex, ERayTracingLocalShaderBindingType BindingType, uint32 RecordIndex, const FD3D12RayTracingGeometry* Geometry, uint32 UserData, uint32 GeometrySegmentIndex) { FD3D12HitGroupSystemParameters SystemParameters = Geometry->HitGroupSystemParameters[Device->GetGPUIndex()][GeometrySegmentIndex]; SystemParameters.RootConstants.UserData = UserData; WorkerData[WorkerIndex].MaxUsedHitRecordIndex = FMath::Max(WorkerData[WorkerIndex].MaxUsedHitRecordIndex, (int32)RecordIndex); uint32 InlineRecordIndex = GetInlineRecordIndex(RecordIndex); if (Lifetime == ERayTracingShaderBindingTableLifetime::Transient) { if (EnumHasAnyFlags(ShaderBindingMode, ERayTracingShaderBindingMode::RTPSO)) { SetHitGroupSystemParameters(RecordIndex, SystemParameters); } if (EnumHasAnyFlags(ShaderBindingMode, ERayTracingShaderBindingMode::Inline) && InlineRecordIndex != INDEX_NONE) { SetInlineGeometryParameters(InlineRecordIndex, SystemParameters); } } else { FRecordData& RecordData = HitRecordData[RecordIndex]; bool bSetData = BindingType != ERayTracingLocalShaderBindingType::Validation; // Special use case when trying to set transient binding on already persistent stored binding (validate content only then) // Can happen we go from persistent -> transient -> persistent when RTPSO is still compiling for example if (RecordData.State == FRecordData::EState::Persistent && BindingType == ERayTracingLocalShaderBindingType::Transient) { bSetData = false; } if (bSetData) { check(RecordData.State != FRecordData::EState::Persistent || BindingType == ERayTracingLocalShaderBindingType::Transient); if (EnumHasAnyFlags(ShaderBindingMode, ERayTracingShaderBindingMode::RTPSO)) { SetHitGroupSystemParameters(RecordIndex, SystemParameters); } if (EnumHasAnyFlags(ShaderBindingMode, ERayTracingShaderBindingMode::Inline) && InlineRecordIndex != INDEX_NONE) { SetInlineGeometryParameters(InlineRecordIndex, 
SystemParameters); } // If persistently stored then also listen to hit record updates for stream in/out events of the ray tracing geometry data if (BindingType == ERayTracingLocalShaderBindingType::Persistent) { HitRecordData[RecordIndex].UpdateGeometryListener.Init(this, Geometry, RecordIndex, UserData, GeometrySegmentIndex); RecordData.State = FRecordData::EState::Persistent; } else // if (RecordData.State == FRecordData::EState::Uninitialized) { RecordData.State = FRecordData::EState::Transient; } } else { check(RecordData.State == FRecordData::EState::Persistent); if (EnumHasAnyFlags(ShaderBindingMode, ERayTracingShaderBindingMode::RTPSO)) { CompareHitGroupSystemParameters(RecordIndex, SystemParameters); } if (EnumHasAnyFlags(ShaderBindingMode, ERayTracingShaderBindingMode::Inline) && InlineRecordIndex != INDEX_NONE) { CompareInlineGeometryParameters(InlineRecordIndex, SystemParameters); } } } } #if ENABLE_RESIDENCY_MANAGEMENT void AddLastTrackedBaseShaderResource(uint32 WorkerIndex, FD3D12BaseShaderResource* BaseShaderResource) { WorkerData[WorkerIndex].LastTrackedBaseShaderResources.Add(BaseShaderResource); } void UpdatePersistentReferencedResource(uint32 WorkerIndex, FD3D12Resource* Resource, int32 Count) { check(Resource); // Update the worker thread reference count of the resource int32& ReferenceCount = WorkerData[WorkerIndex].PersistentReferencedResources.FindOrAdd(Resource); ReferenceCount += Count; if (WorkerIndex == 0) { check(ReferenceCount >= 0); if (ReferenceCount == 0) { WorkerData[WorkerIndex].PersistentReferencedResources.Remove(Resource); } } } void AddDynamicReferencedResource(uint32 WorkerIndex, FD3D12Resource* Resource) { bool bIsAlreadyInSet = false; WorkerData[WorkerIndex].DynamicReferencedResourceSet.Add(Resource, &bIsAlreadyInSet); if (!bIsAlreadyInSet) { WorkerData[WorkerIndex].DynamicReferencedResources.Add(Resource); } } #endif // ENABLE_RESIDENCY_MANAGEMENT void AddReferencedShaderResource(uint32 WorkerIndex, uint32 RecordIndex, 
ERayTracingLocalShaderBindingType BindingType, FD3D12BaseShaderResource* ShaderResource) { #if ENABLE_RESIDENCY_MANAGEMENT if (BindingType == ERayTracingLocalShaderBindingType::Transient || Lifetime != ERayTracingShaderBindingTableLifetime::Persistent) { AddDynamicReferencedResource(WorkerIndex, ShaderResource->GetResource()); } else if (BindingType == ERayTracingLocalShaderBindingType::Persistent) { // Same base shader resource can be bound multiple times to the same record - only track once FRecordData& RecordData = HitRecordData[RecordIndex]; for (FRecordUpdateShaderResourceRenameListener* Listener : RecordData.ReferencedShaderResourceListeners) { if (Listener->ShaderResource == ShaderResource) { return; } } // Update the tracked record data so it can be updated in case of defrag operation FRecordUpdateShaderResourceRenameListener* Listener = new FRecordUpdateShaderResourceRenameListener(*this); Listener->Init(WorkerIndex, RecordIndex, ShaderResource); RecordData.ReferencedShaderResourceListeners.Add(Listener); } #endif // ENABLE_RESIDENCY_MANAGEMENT } void AddReferencedTexture(uint32 WorkerIndex, uint32 RecordIndex, ERayTracingLocalShaderBindingType BindingType, FRHITexture* RHITexture) { #if ENABLE_RESIDENCY_MANAGEMENT if (BindingType == ERayTracingLocalShaderBindingType::Transient || Lifetime != ERayTracingShaderBindingTableLifetime::Persistent) { FD3D12Texture* Texture = FD3D12CommandContext::RetrieveTexture(RHITexture, Device->GetGPUIndex()); AddDynamicReferencedResource(WorkerIndex, Texture->ResourceLocation.GetResource()); } else if (BindingType == ERayTracingLocalShaderBindingType::Persistent) { // Collect all texture references independent of the shader resources because the internal shader resource can be changed on swap if (FRHITextureReference* TextureReferenceRHI = RHITexture->GetTextureReference()) { FD3D12RHITextureReference* TextureReference = FD3D12CommandContext::RetrieveObject(TextureReferenceRHI, Device->GetGPUIndex()); // Same texture 
reference can be bound multiple times to the same record - only track once FRecordData& RecordData = HitRecordData[RecordIndex]; for (FRecordUpdateTextureReferenceReplaceListener* Listener : RecordData.ReferencedTextureReferenceListeners) { if (Listener->TextureReference == TextureReference) { return; } } // Update the tracked record data so it can be updated in case of texture reference swap FRecordUpdateTextureReferenceReplaceListener* Listener = new FRecordUpdateTextureReferenceReplaceListener(*this); Listener->Init(WorkerIndex, RecordIndex, TextureReference); RecordData.ReferencedTextureReferenceListeners.Add(Listener); } else { // Fall back to regular base shader resource code path FD3D12Texture* Texture = FD3D12CommandContext::RetrieveTexture(RHITexture, Device->GetGPUIndex()); AddReferencedShaderResource(WorkerIndex, RecordIndex, BindingType, Texture); } } #endif // ENABLE_RESIDENCY_MANAGEMENT } void AddReferencedUniformBuffer(uint32 WorkerIndex, uint32 RecordIndex, uint32 ShaderTableOffset, uint32 InOffsetWithinRootSignature, ERayTracingLocalShaderBindingType BindingType, FD3D12UniformBuffer* UniformBuffer) { // Uniform buffer shouldn't have an residency handles because they are all allocated in upload memory so doesn't require extra tracking check(UniformBuffer->ResourceLocation.GetResource()->GetResidencyHandles().IsEmpty()); if (Lifetime == ERayTracingShaderBindingTableLifetime::Persistent && BindingType == ERayTracingLocalShaderBindingType::Persistent) { FRecordData& RecordData = HitRecordData[RecordIndex]; for (FRecordUpdateUniformBufferListener* Listener : RecordData.UpdateUniformBufferListeners) { check(Listener->UniformBuffer != UniformBuffer); } FRecordUpdateUniformBufferListener* Listener = new FRecordUpdateUniformBufferListener(*this); Listener->Init(UniformBuffer, ShaderTableOffset, RecordIndex, InOffsetWithinRootSignature); RecordData.UpdateUniformBufferListeners.Add(Listener); } } FD3D12ConstantBufferView* SetLooseParameterData( uint32 
WorkerIndex, uint32 RecordIndex, ERayTracingLocalShaderBindingType BindingType, const void* LooseParameterData, uint32 LooseParameterDataSize, D3D12_GPU_VIRTUAL_ADDRESS& OutGPUVirtualAddress) { // If we see a significant number of LooseParameter allocations coming through this path, we should consider // caching constant buffer blocks inside ShaderTable and linearly sub-allocate from them. // If the amount of data is relatively small, it may also be possible to use root constants and avoid extra allocations entirely. FD3D12ConstantBufferView* ConstantBufferView = nullptr; if (BindingType == ERayTracingLocalShaderBindingType::Validation) { #if DO_CHECK FRecordData& RecordData = HitRecordData[RecordIndex]; check(RecordData.LooseParameterData.Num() == LooseParameterDataSize); check(FMemory::Memcmp(RecordData.LooseParameterData.GetData(), LooseParameterData, LooseParameterDataSize) == 0); check(RecordData.LooseParameterResourceLocation); OutGPUVirtualAddress = RecordData.LooseParameterResourceLocation->GetGPUVirtualAddress(); #endif // DO_CHECK } else { // Always allocate using the upload heap allocator and not the TransientUniformBufferAllocator (can cause GPU crashes with transient data somehow) FD3D12ResourceLocation ResourceLocation(Device); FD3D12UploadHeapAllocator& Allocator = Device->GetParentAdapter()->GetUploadHeapAllocator(Device->GetGPUIndex()); void* MappedData = Allocator.AllocUploadResource(LooseParameterDataSize, D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT, ResourceLocation); FMemory::Memcpy(MappedData, LooseParameterData, LooseParameterDataSize); OutGPUVirtualAddress = ResourceLocation.GetGPUVirtualAddress(); // Uniform buffer shouldn't have an residency handles because they are all allocated in upload memory so doesn't require residency tracking check(ResourceLocation.GetResource()->GetResidencyHandles().IsEmpty()); #if D3D12RHI_USE_CONSTANT_BUFFER_VIEWS // Not implemented for persistent SBT yet (CBV needs to be managed per record then instead of 
at SBT level) // // CBV also has pointer to stack local ResourceLocation which it shouldn't touch anymore after creation but // will still have dangling pointer - might be better to always allocate ResourceLocation as well on heap next to CBV (like persistent path) check(Lifetime == ERayTracingShaderBindingTableLifetime::Transient); ConstantBufferView = new FD3D12ConstantBufferView(Device, nullptr); ConstantBufferView->CreateView(&ResourceLocation, 0, Align(LooseParameterDataSize, D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT)); WorkerData[WorkerIndex].LooseParameterCBVs.Add(ConstantBufferView); #endif // D3D12RHI_USE_CONSTANT_BUFFER_VIEWS if (Lifetime == ERayTracingShaderBindingTableLifetime::Persistent) { // Move ownership of the upload memory to the record data when persistent (otherwise auto freed next 'frame') FRecordData& RecordData = HitRecordData[RecordIndex]; if (BindingType == ERayTracingLocalShaderBindingType::Persistent) { check(RecordData.LooseParameterResourceLocation == nullptr); RecordData.LooseParameterResourceLocation = new FD3D12ResourceLocation(Device); FD3D12ResourceLocation::TransferOwnership(*RecordData.LooseParameterResourceLocation, ResourceLocation); #if DO_CHECK RecordData.LooseParameterData.SetNum(LooseParameterDataSize); FMemory::Memcpy(RecordData.LooseParameterData.GetData(), LooseParameterData, LooseParameterDataSize); #endif // DO_CHECK } // If we had loose parameter resource location allocated then it means it was persistent and now temporarily transient // Update the cached loose parameter GPU allocation to make sure the SBT doesn't contain any dangling upload memory allocations else if (RecordData.LooseParameterResourceLocation) { #if DO_CHECK check(RecordData.LooseParameterData.Num() == LooseParameterDataSize); check(FMemory::Memcmp(RecordData.LooseParameterData.GetData(), LooseParameterData, LooseParameterDataSize) == 0); #endif // DO_CHECK RecordData.LooseParameterResourceLocation->Clear(); 
FD3D12ResourceLocation::TransferOwnership(*RecordData.LooseParameterResourceLocation, ResourceLocation); } } } return ConstantBufferView; } void ClearHitRecordData(uint32 WorkerIndex, uint32 RecordIndex) { if (Lifetime != ERayTracingShaderBindingTableLifetime::Persistent) { return; } FRecordData& RecordData = HitRecordData[RecordIndex]; for (FRecordUpdateUniformBufferListener* Listener : RecordData.UpdateUniformBufferListeners) { Listener->Reset(); delete Listener; } if (RecordData.LooseParameterResourceLocation) { delete RecordData.LooseParameterResourceLocation; RecordData.LooseParameterResourceLocation = nullptr; } if (RecordData.UpdateGeometryListener.Geometry) { RecordData.UpdateGeometryListener.Geometry->RemoveUpdateListener(&RecordData.UpdateGeometryListener); } #if ENABLE_RESIDENCY_MANAGEMENT for (FRecordUpdateShaderResourceRenameListener* Listener : RecordData.ReferencedShaderResourceListeners) { Listener->Reset(WorkerIndex); delete Listener; } for (FRecordUpdateTextureReferenceReplaceListener* Listener : RecordData.ReferencedTextureReferenceListeners) { Listener->Reset(WorkerIndex); delete Listener; } #endif // ENABLE_RESIDENCY_MANAGEMENT // Reset record to default state RecordData.Reset(); } void RemoveUpdateUniformBufferListener(uint32 RecordIndex, FRecordUpdateUniformBufferListener* Listener) { bool bFound = false; for (int32 Index = 0; Index < HitRecordData[RecordIndex].UpdateUniformBufferListeners.Num(); ++Index) { if (HitRecordData[RecordIndex].UpdateUniformBufferListeners[Index] == Listener) { HitRecordData[RecordIndex].UpdateUniformBufferListeners.RemoveAt(Index); delete Listener; bFound = true; break; } } check(bFound); } void RemoveUpdateRayTracingGeometryListener(uint32 RecordIndex, FRecordUpdateRayTracingGeometryListener* Listener) { FRecordData& RecordData = HitRecordData[RecordIndex]; check(&RecordData.UpdateGeometryListener == Listener && RecordData.UpdateGeometryListener.Geometry != nullptr); RecordData.UpdateGeometryListener.Reset(); } 
struct FShaderRecordCacheKey { static constexpr uint32 MaxUniformBuffers = 6; FRHIUniformBuffer* const* UniformBuffers[MaxUniformBuffers]; uint64 Hash = 0; uint32 NumUniformBuffers = 0; uint32 ShaderIndex = 0; FShaderRecordCacheKey() = default; FShaderRecordCacheKey(uint32 InNumUniformBuffers, FRHIUniformBuffer* const* InUniformBuffers, uint32 InShaderIndex) { ShaderIndex = InShaderIndex; check(InNumUniformBuffers <= MaxUniformBuffers); NumUniformBuffers = FMath::Min(MaxUniformBuffers, InNumUniformBuffers); const uint64 DataSizeInBytes = sizeof(FRHIUniformBuffer*) * NumUniformBuffers; FMemory::Memcpy(UniformBuffers, InUniformBuffers, DataSizeInBytes); Hash = FXxHash64::HashBuffer(UniformBuffers, DataSizeInBytes).Hash; } bool operator == (const FShaderRecordCacheKey& Other) const { if (Hash != Other.Hash) return false; if (ShaderIndex != Other.ShaderIndex) return false; if (NumUniformBuffers != Other.NumUniformBuffers) return false; for (uint32 BufferIndex = 0; BufferIndex < NumUniformBuffers; ++BufferIndex) { if (UniformBuffers[BufferIndex] != Other.UniformBuffers[BufferIndex]) return false; } return true; } friend uint32 GetTypeHash(const FShaderRecordCacheKey& Key) { return uint32(Key.Hash); } }; void UpdateResidency(FD3D12CommandContext& CommandContext) const { // Skip redundant resource residency updates when a shader table is repeatedly used on the same command list bool bWasAlreadyInSet = false; CommandContext.RayTracingShaderTables.FindOrAdd(UniqueId, bWasAlreadyInSet); if (bWasAlreadyInSet) { return; } TRACE_CPUPROFILER_EVENT_SCOPE(ShaderTableUpdateResidency); for (FD3D12Resource* Resource : ReferencedResources) { CommandContext.UpdateResidency(Resource); } CommandContext.UpdateResidency(Buffer->GetResource()); } struct alignas(PLATFORM_CACHE_LINE_SIZE) FWorkerThreadData { Experimental::TSherwoodMap ShaderRecordCache; int32 MaxUsedHitRecordIndex = -1; #if ENABLE_RESIDENCY_MANAGEMENT // Track all persistent resources with number of refs accross all records 
(when ref reaches 0 then it doesn't need be tracked anymore) TMap, int32> PersistentReferencedResources; // Track last reference to base shader resources because PersistentReferencedResources could have the last reference to the d3d12 resource // and we can't delete d3d12baseshaderresource with committed resource when it doesn't hold the last reference to it TSet> LastTrackedBaseShaderResources; // A set of all resources referenced by this shader table for the purpose of updating residency before ray tracing work dispatch. Experimental::TSherwoodSet DynamicReferencedResourceSet; TArray> DynamicReferencedResources; #endif // ENABLE_RESIDENCY_MANAGEMENT #if D3D12RHI_USE_CONSTANT_BUFFER_VIEWS TArray LooseParameterCBVs; #endif // D3D12RHI_USE_CONSTANT_BUFFER_VIEWS }; FWorkerThreadData WorkerData[MaxBindingWorkers]; const uint64 UniqueId; UE::FMutex DispatchMutex; private: static std::atomic_uint64_t NextUniqueId; }; std::atomic_uint64_t FD3D12RayTracingShaderBindingTableInternal::NextUniqueId = 0; static void CreateSpecializedStateObjects( ID3D12Device5* RayTracingDevice, ID3D12RootSignature* GlobalRootSignature, uint32 MaxAttributeSizeInBytes, uint32 MaxPayloadSizeInBytes, const FD3D12RayTracingShaderLibrary& RayGenShaders, const TArray& UniqueShaderCollections, const TMap& RayGenShaderIndexByHash, TArray>& OutSpecializedStateObjects, TArray& OutSpecializationIndices) { static constexpr uint32 MaxSpecializationBuckets = FD3D12RayTracingPipelineInfo::MaxPerformanceGroups; if (RayGenShaders.Shaders.Num() <= 1) { // No specializations needed return; } // Initialize raygen shader PSO specialization map to default values OutSpecializationIndices.Reserve(RayGenShaders.Shaders.Num()); for (int32 It = 0; It < RayGenShaders.Shaders.Num(); ++It) { OutSpecializationIndices.Add(INDEX_NONE); } struct FRayGenShaderSpecialization { D3D12_EXISTING_COLLECTION_DESC Desc = {}; int32 ShaderIndex = INDEX_NONE; }; TArray RayGenShaderCollectionBuckets[MaxSpecializationBuckets]; TArray 
ShaderCollectionDescs; // Find useful performance group range for non-raygen shaders. // It is not necessary to create PSO specializations for high-occupancy RGS if overall PSO will be limited by low-occupancy hit shaders. // Also not necessary to create specializations if all raygen shaders are already in the same group. uint32 MaxPerformanceGroupRGS = 0; uint32 MinPerformanceGroupRGS = MaxSpecializationBuckets - 1; uint32 MaxPerformanceGroupOther = 0; uint32 MinPerformanceGroupOther = MaxSpecializationBuckets - 1; int32 LastRayGenShaderCollectionIndex = INDEX_NONE; for (int32 EntryIndex = 0; EntryIndex < UniqueShaderCollections.Num(); ++EntryIndex) { FD3D12RayTracingPipelineCache::FEntry* Entry = UniqueShaderCollections[EntryIndex]; const uint32 Group = FMath::Min(Entry->PipelineInfo.PerformanceGroup, MaxSpecializationBuckets); if (Entry->CollectionType == FD3D12RayTracingPipelineCache::ECollectionType::RayGen) { MaxPerformanceGroupRGS = FMath::Max(MaxPerformanceGroupRGS, Group); MinPerformanceGroupRGS = FMath::Min(MinPerformanceGroupRGS, Group); LastRayGenShaderCollectionIndex = EntryIndex; } else { checkf(EntryIndex > LastRayGenShaderCollectionIndex, TEXT("Ray generation shaders are expected to be first in the UniqueShaderCollections list.")); MaxPerformanceGroupOther = FMath::Max(MaxPerformanceGroupOther, Group); MinPerformanceGroupOther = FMath::Min(MinPerformanceGroupOther, Group); // This is a hit/miss/callable shader which will be common for all specialized RTPSOs. 
ShaderCollectionDescs.Add(Entry->GetCollectionDesc()); } } if (MinPerformanceGroupRGS == MaxPerformanceGroupRGS) { // No need to create a specialized PSO if all raygen shaders are already in the same group return; } // Split RGS collections into a separate lists, organized by performance group for (int32 EntryIndex = 0; EntryIndex <= LastRayGenShaderCollectionIndex; ++EntryIndex) { FD3D12RayTracingPipelineCache::FEntry* Entry = UniqueShaderCollections[EntryIndex]; check(Entry->CollectionType == FD3D12RayTracingPipelineCache::ECollectionType::RayGen); // Don't create specializations for raygen shaders that have better occupancy than worst non-raygen shader const uint32 SpecializationBucket = FMath::Min(Entry->PipelineInfo.PerformanceGroup, MinPerformanceGroupOther); // Don't create extra specialized pipelines for group 0 (worst-performing) and just use the default RTPSO. if (SpecializationBucket > 0) { FRayGenShaderSpecialization Specialization; Specialization.Desc = Entry->GetCollectionDesc(); Specialization.ShaderIndex = RayGenShaderIndexByHash.FindChecked(Entry->Shader->GetHash()); RayGenShaderCollectionBuckets[SpecializationBucket].Add(Specialization); } } OutSpecializedStateObjects.Reserve(MaxSpecializationBuckets); const uint32 ShaderCollectionDescsSize = ShaderCollectionDescs.Num(); for (const TArray& SpecializationBucket : RayGenShaderCollectionBuckets) { if (SpecializationBucket.IsEmpty()) { continue; } const int32 SpecializationIndex = OutSpecializedStateObjects.Num(); for (const FRayGenShaderSpecialization& Specialization : SpecializationBucket) { // Temporarily add the RGSs to complete shader collection ShaderCollectionDescs.Add(Specialization.Desc); OutSpecializationIndices[Specialization.ShaderIndex] = SpecializationIndex; } TRefCountPtr SpecializedPSO = CreateRayTracingStateObject( RayTracingDevice, {}, // Libraries, {}, // LibraryExports, MaxAttributeSizeInBytes, MaxPayloadSizeInBytes, {}, // HitGroups GlobalRootSignature, {}, // LocalRootSignatures 
{}, // LocalRootSignatureAssociations, ShaderCollectionDescs, D3D12_STATE_OBJECT_TYPE_RAYTRACING_PIPELINE); OutSpecializedStateObjects.Add(SpecializedPSO); // Remove the temporary RGSs ShaderCollectionDescs.SetNum(ShaderCollectionDescsSize); } } FD3D12RayTracingPipelineState::FD3D12RayTracingPipelineState(FD3D12Device* Device, const FRayTracingPipelineStateInitializer& Initializer) : FRHIRayTracingPipelineState(Initializer), Device(Device) { SCOPE_CYCLE_COUNTER(STAT_RTPSO_CreatePipeline); TRACE_CPUPROFILER_EVENT_SCOPE(RTPSO_CreatePipeline); checkf(Initializer.GetRayGenTable().Num() > 0 || Initializer.bPartial, TEXT("Ray tracing pipelines must have at leat one ray generation shader.")); checkf(Initializer.bPartial || Initializer.GetHitGroupTable().Num() > 0, TEXT("Ray tracing pipelines must have at leat one hit shader.")); uint64 TotalCreationTime = 0; uint64 CompileTime = 0; uint64 LinkTime = 0; uint32 NumCacheHits = 0; TotalCreationTime -= FPlatformTime::Cycles64(); ID3D12Device5* RayTracingDevice = Device->GetDevice5(); TArrayView InitializerHitGroups = Initializer.GetHitGroupTable(); TArrayView InitializerMissShaders = Initializer.GetMissTable(); TArrayView InitializerRayGenShaders = Initializer.GetRayGenTable(); TArrayView InitializerCallableShaders = Initializer.GetCallableTable(); FRHIShaderBindingLayout ShaderBindingLayout = Initializer.ShaderBindingLayout ? 
*Initializer.ShaderBindingLayout : FRHIShaderBindingLayout(); const uint32 MaxTotalShaders = InitializerRayGenShaders.Num() + InitializerMissShaders.Num() + InitializerHitGroups.Num() + InitializerCallableShaders.Num(); checkf(MaxTotalShaders >= 1, TEXT("Ray tracing pipelines are expected to contain at least one shader")); FD3D12RayTracingPipelineCache* PipelineCache = Device->GetRayTracingPipelineCache(); // All raygen shaders must share the same global root signature (this is validated below) GlobalRootSignature = PipelineCache->GetGlobalRootSignature(ShaderBindingLayout); const FD3D12RayTracingPipelineState* BasePipeline = GRHISupportsRayTracingPSOAdditions ? FD3D12DynamicRHI::ResourceCast(Initializer.BasePipeline.GetReference()) : nullptr; if (BasePipeline) { PipelineShaderHashes = BasePipeline->PipelineShaderHashes; } PipelineShaderHashes.Reserve(MaxTotalShaders); TArray UniqueShaderCollections; UniqueShaderCollections.Reserve(MaxTotalShaders); FGraphEventArray CompileCompletionList; CompileCompletionList.Reserve(MaxTotalShaders); // Helper function to acquire a D3D12_EXISTING_COLLECTION_DESC for a compiled shader via cache auto AddShaderCollection = [Device, ShaderBindingLayoutHash = ShaderBindingLayout.GetHash(), GlobalRootSignature = this->GlobalRootSignature, PipelineCache, &UniqueShaderHashes = this->PipelineShaderHashes, &UniqueShaderCollections, &Initializer, &NumCacheHits, &CompileTime, &CompileCompletionList] (FD3D12RayTracingShader* Shader, FD3D12RayTracingPipelineCache::ECollectionType CollectionType) { // verify that that the same shader binding layout is used for all shaders in the RTPSO or not sampling any resources uint32 TotalResourceCount = Shader->ResourceCounts.NumCBs + Shader->ResourceCounts.NumSRVs + Shader->ResourceCounts.NumUAVs + Shader->ResourceCounts.NumSamplers; if (TotalResourceCount != 0 && Shader->ShaderBindingLayoutHash != ShaderBindingLayoutHash) { UE_LOG(LogD3D12RHI, Warning, TEXT("Raytracing shader with with entry point %s, 
name %s and ShaderBindingLayout hash %u doesn't match the RTPSO ShaderBindingLayout hash %u."), *Shader->EntryPoint, Shader->GetShaderName(), Shader->ShaderBindingLayoutHash, ShaderBindingLayoutHash); ensure(false); } bool bIsAlreadyInSet = false; const uint64 ShaderHash = GetShaderHash64(Shader); UniqueShaderHashes.Add(ShaderHash, &bIsAlreadyInSet); bool bCacheHit = false; CompileTime -= FPlatformTime::Cycles64(); FD3D12RayTracingPipelineCache::FEntry* ShaderCacheEntry = PipelineCache->GetOrCompileShader( Device, Shader, GlobalRootSignature, Initializer.MaxAttributeSizeInBytes, Initializer.MaxPayloadSizeInBytes, !Initializer.bPartial, //< partial PSOs are not required CollectionType, CompileCompletionList, &bCacheHit); CompileTime += FPlatformTime::Cycles64(); if (!bIsAlreadyInSet) { UniqueShaderCollections.Add(ShaderCacheEntry); if (bCacheHit) NumCacheHits++; } return ShaderCacheEntry; }; // Add ray generation shaders TArray RayGenShaderEntries; RayGenShaders.Reserve(InitializerRayGenShaders.Num()); RayGenShaderEntries.Reserve(InitializerRayGenShaders.Num()); TMap RayGenShaderIndexByHash; checkf(UniqueShaderCollections.Num() == 0, TEXT("Ray generation shaders are expected to be first in the UniqueShaderCollections list.")); for (FRHIRayTracingShader* ShaderRHI : InitializerRayGenShaders) { FD3D12RayTracingShader* Shader = FD3D12DynamicRHI::ResourceCast(ShaderRHI); checkf(!Shader->UsesGlobalUniformBuffer(), TEXT("Global uniform buffers are not implemented for ray generation shaders")); FD3D12RayTracingPipelineCache::FEntry* ShaderCacheEntry = AddShaderCollection(Shader, FD3D12RayTracingPipelineCache::ECollectionType::RayGen); RayGenShaderEntries.Add(ShaderCacheEntry); RayGenShaderIndexByHash.Add(Shader->GetHash(), RayGenShaders.Shaders.Num()); RayGenShaders.Shaders.Add(Shader); } MaxHitGroupViewDescriptors = 0; MaxLocalRootSignatureSize = 0; // Add miss shaders TArray MissShaderEntries; MissShaders.Reserve(InitializerMissShaders.Num()); 
MissShaderEntries.Reserve(InitializerMissShaders.Num());

	for (FRHIRayTracingShader* ShaderRHI : InitializerMissShaders)
	{
		FD3D12RayTracingShader* Shader = FD3D12DynamicRHI::ResourceCast(ShaderRHI);

		checkf(Shader, TEXT("A valid ray tracing shader must be provided for all elements in the FRayTracingPipelineStateInitializer miss shader table."));
		checkf(!Shader->UsesGlobalUniformBuffer(), TEXT("Global uniform buffers are not implemented for ray tracing miss shaders"));

		// Track the largest SRV+UAV count and local root signature size over miss, hit group and callable shaders.
		const uint32 ShaderViewDescriptors = Shader->ResourceCounts.NumSRVs + Shader->ResourceCounts.NumUAVs;
		MaxHitGroupViewDescriptors = FMath::Max(MaxHitGroupViewDescriptors, ShaderViewDescriptors);
		MaxLocalRootSignatureSize = FMath::Max(MaxLocalRootSignatureSize, Shader->LocalRootSignature->GetTotalRootSignatureSizeInBytes());

		FD3D12RayTracingPipelineCache::FEntry* ShaderCacheEntry = AddShaderCollection(Shader, FD3D12RayTracingPipelineCache::ECollectionType::Miss);

		MissShaderEntries.Add(ShaderCacheEntry);
		MissShaders.Shaders.Add(Shader);
	}

	// Add hit groups

	TArray HitGroupEntries;
	HitGroupShaders.Reserve(InitializerHitGroups.Num());
	HitGroupEntries.Reserve(InitializerHitGroups.Num());

	for (FRHIRayTracingShader* ShaderRHI : InitializerHitGroups)
	{
		FD3D12RayTracingShader* Shader = FD3D12DynamicRHI::ResourceCast(ShaderRHI);

		checkf(Shader, TEXT("A valid ray tracing hit group shader must be provided for all elements in the FRayTracingPipelineStateInitializer hit group table."));

		const uint32 ShaderViewDescriptors = Shader->ResourceCounts.NumSRVs + Shader->ResourceCounts.NumUAVs;
		MaxHitGroupViewDescriptors = FMath::Max(MaxHitGroupViewDescriptors, ShaderViewDescriptors);
		MaxLocalRootSignatureSize = FMath::Max(MaxLocalRootSignatureSize, Shader->LocalRootSignature->GetTotalRootSignatureSizeInBytes());

		FD3D12RayTracingPipelineCache::FEntry* ShaderCacheEntry = AddShaderCollection(Shader, FD3D12RayTracingPipelineCache::ECollectionType::HitGroup);

		HitGroupEntries.Add(ShaderCacheEntry);
		HitGroupShaders.Shaders.Add(Shader);
	}

	// Add callable shaders

	TArray CallableShaderEntries;
	CallableShaders.Reserve(InitializerCallableShaders.Num());
	CallableShaderEntries.Reserve(InitializerCallableShaders.Num());

	for (FRHIRayTracingShader* ShaderRHI : InitializerCallableShaders)
	{
		FD3D12RayTracingShader* Shader = FD3D12DynamicRHI::ResourceCast(ShaderRHI);

		checkf(Shader, TEXT("A valid ray tracing shader must be provided for all elements in the FRayTracingPipelineStateInitializer callable shader table."));
		checkf(!Shader->UsesGlobalUniformBuffer(), TEXT("Global uniform buffers are not implemented for ray tracing callable shaders"));

		const uint32 ShaderViewDescriptors = Shader->ResourceCounts.NumSRVs + Shader->ResourceCounts.NumUAVs;
		MaxHitGroupViewDescriptors = FMath::Max(MaxHitGroupViewDescriptors, ShaderViewDescriptors);
		MaxLocalRootSignatureSize = FMath::Max(MaxLocalRootSignatureSize, Shader->LocalRootSignature->GetTotalRootSignatureSizeInBytes());

		FD3D12RayTracingPipelineCache::FEntry* ShaderCacheEntry = AddShaderCollection(Shader, FD3D12RayTracingPipelineCache::ECollectionType::Callable);

		CallableShaderEntries.Add(ShaderCacheEntry);
		CallableShaders.Shaders.Add(Shader);
	}

	// The initializer must reserve at least as much local binding data as the largest local root signature found above.
	check(Initializer.GetMaxLocalBindingDataSize() >= MaxLocalRootSignatureSize);

	// Wait for all compilation tasks to be complete and then gather the compiled collection descriptors

	CompileTime -= FPlatformTime::Cycles64();

	FTaskGraphInterface::Get().WaitUntilTasksComplete(CompileCompletionList);

	CompileTime += FPlatformTime::Cycles64();

	if (Initializer.bPartial)
	{
		// Partial pipelines don't have a linking phase, so exit immediately after compilation tasks are complete.
		return;
	}

	TArray UniqueShaderCollectionDescs;
	UniqueShaderCollectionDescs.Reserve(MaxTotalShaders);
	for (FD3D12RayTracingPipelineCache::FEntry* Entry : UniqueShaderCollections)
	{
		UniqueShaderCollectionDescs.Add(Entry->GetCollectionDesc());
	}

	// Link final RTPSO from shader collections

	LinkTime -= FPlatformTime::Cycles64();

	// Extending RTPSOs is currently not compatible with PSO specializations
	if (BasePipeline && GRayTracingSpecializeStateObjects == 0)
	{
		if (UniqueShaderCollectionDescs.Num() == 0)
		{
			// New PSO does not actually have any new shaders that were not in the base
			StateObject = BasePipeline->StateObject.GetReference();
		}
		else
		{
			// Subobject list for AddToStateObject: one state object config followed by one entry per new collection.
			// The subobjects store pointers to StateObjectConfig and to the elements of UniqueShaderCollectionDescs,
			// so both have to stay alive and unmodified until AddToStateObject returns.
			TArray Subobjects;

			int32 SubobjectIndex = 0;
			Subobjects.Reserve(UniqueShaderCollectionDescs.Num() + 1);

			D3D12_STATE_OBJECT_CONFIG StateObjectConfig = {};
			StateObjectConfig.Flags = D3D12_STATE_OBJECT_FLAG_ALLOW_STATE_OBJECT_ADDITIONS;
			Subobjects.Add(D3D12_STATE_SUBOBJECT{ D3D12_STATE_SUBOBJECT_TYPE_STATE_OBJECT_CONFIG, &StateObjectConfig });

			for (const D3D12_EXISTING_COLLECTION_DESC& Collection : UniqueShaderCollectionDescs)
			{
				Subobjects.Add(D3D12_STATE_SUBOBJECT{ D3D12_STATE_SUBOBJECT_TYPE_EXISTING_COLLECTION, &Collection });
			}

			D3D12_STATE_OBJECT_DESC Desc = {};
			Desc.Type = D3D12_STATE_OBJECT_TYPE_RAYTRACING_PIPELINE;
			Desc.NumSubobjects = Subobjects.Num();
			Desc.pSubobjects = Subobjects.GetData();

			ID3D12Device7* Device7 = Device->GetDevice7();

			VERIFYD3D12RESULT(Device7->AddToStateObject(&Desc, BasePipeline->StateObject.GetReference(), IID_PPV_ARGS(StateObject.GetInitReference())));
		}
	}
	else
	{
		// No usable base pipeline (or specialization is enabled): link a complete state object from all collections.
		StateObject = CreateRayTracingStateObject(
			RayTracingDevice,
			{}, // Libraries,
			{}, // LibraryExports,
			Initializer.MaxAttributeSizeInBytes,
			Initializer.MaxPayloadSizeInBytes,
			{}, // HitGroups
			GlobalRootSignature,
			{}, // LocalRootSignatures
			{}, // LocalRootSignatureAssociations,
			UniqueShaderCollectionDescs,
			D3D12_STATE_OBJECT_TYPE_RAYTRACING_PIPELINE);

		if (StateObject == nullptr)
		{
			UE_LOG(LogD3D12RHI, Fatal, TEXT("Failed to a create raytracing pipeline state"));
		}
	}

	if (GRayTracingSpecializeStateObjects != 0 && Initializer.GetRayGenTable().Num() > 1)
	{
		CreateSpecializedStateObjects(
			RayTracingDevice,
			GlobalRootSignature,
			Initializer.MaxAttributeSizeInBytes,
			Initializer.MaxPayloadSizeInBytes,
			RayGenShaders,
			UniqueShaderCollections,
			RayGenShaderIndexByHash,
			SpecializedStateObjects, // out param
			SpecializationIndices // out param
		);
	}

	LinkTime += FPlatformTime::Cycles64();

	HRESULT QueryInterfaceResult = StateObject->QueryInterface(IID_PPV_ARGS(PipelineProperties.GetInitReference()));
	checkf(SUCCEEDED(QueryInterfaceResult), TEXT("Failed to query pipeline properties from the ray tracing pipeline state object. Result=%08x"), QueryInterfaceResult);

	// Query shader identifiers from the pipeline state object

	check(HitGroupEntries.Num() == InitializerHitGroups.Num());

	// Prefer the identifier already stored on the cache entry; otherwise look it up by primary export name.
	auto GetEntryShaderIdentifier = [Properties = PipelineProperties.GetReference()](FD3D12RayTracingPipelineCache::FEntry* Entry) -> FD3D12ShaderIdentifier
	{
		if (Entry->Identifier.IsValid())
		{
			return Entry->Identifier;
		}
		else
		{
			return GetShaderIdentifier(Properties, Entry->GetPrimaryExportNameChars());
		}
	};

	HitGroupShaders.Identifiers.SetNumUninitialized(InitializerHitGroups.Num());
	for (int32 HitGroupIndex = 0; HitGroupIndex < HitGroupEntries.Num(); ++HitGroupIndex)
	{
		HitGroupShaders.Identifiers[HitGroupIndex] = GetEntryShaderIdentifier(HitGroupEntries[HitGroupIndex]);
	}

	RayGenShaders.Identifiers.SetNumUninitialized(RayGenShaderEntries.Num());
	for (int32 ShaderIndex = 0; ShaderIndex < RayGenShaderEntries.Num(); ++ShaderIndex)
	{
		RayGenShaders.Identifiers[ShaderIndex] = GetEntryShaderIdentifier(RayGenShaderEntries[ShaderIndex]);
	}

	MissShaders.Identifiers.SetNumUninitialized(MissShaderEntries.Num());
	for (int32 ShaderIndex = 0; ShaderIndex < MissShaderEntries.Num(); ++ShaderIndex)
	{
		MissShaders.Identifiers[ShaderIndex] = GetEntryShaderIdentifier(MissShaderEntries[ShaderIndex]);
	}
CallableShaders.Identifiers.SetNumUninitialized(CallableShaderEntries.Num()); for (int32 ShaderIndex = 0; ShaderIndex < CallableShaderEntries.Num(); ++ShaderIndex) { CallableShaders.Identifiers[ShaderIndex] = GetEntryShaderIdentifier(CallableShaderEntries[ShaderIndex]); } PipelineStackSize = PipelineProperties->GetPipelineStackSize(); TotalCreationTime += FPlatformTime::Cycles64(); // Report stats for pipelines that take a long time to create #if !NO_LOGGING // Gather PSO stats ShaderStats.Reserve(UniqueShaderCollections.Num()); for (FD3D12RayTracingPipelineCache::FEntry* Entry : UniqueShaderCollections) { FShaderStats Stats; Stats.Name = *(Entry->Shader->EntryPoint); Stats.ShaderSize = Entry->Shader->Code.Num(); Stats.CompileTimeMS = Entry->CompileTimeMS; #if PLATFORM_WINDOWS if (Entry->Shader->GetFrequency() == SF_RayGen) { Stats.StackSize = uint32(PipelineProperties->GetShaderStackSize(*(Entry->ExportNames[0]))); } #endif // PLATFORM_WINDOWS ShaderStats.Add(Stats); } ShaderStats.Sort([](const FShaderStats& A, const FShaderStats& B) { return B.CompileTimeMS < A.CompileTimeMS; }); const double TotalCreationTimeMS = 1000.0 * FPlatformTime::ToSeconds64(TotalCreationTime); // log a message if the RTPSO creating took a long time (with a higher threshold if this was a background compilation) const double CreationTimeWarningThresholdMS = 10.0; const double BackgroundCreatingTimeWarningThresholdMS = 1000.0; if (TotalCreationTimeMS > (Initializer.bBackgroundCompilation ? CreationTimeWarningThresholdMS : BackgroundCreatingTimeWarningThresholdMS)) { const double CompileTimeMS = 1000.0 * FPlatformTime::ToSeconds64(CompileTime); const double LinkTimeMS = 1000.0 * FPlatformTime::ToSeconds64(LinkTime); const uint32 NumUniqueShaders = UniqueShaderCollections.Num(); UE_LOG(LogD3D12RHI, Log, TEXT("Creating RTPSO with %d shaders (%d cached, %d new) took %.2f ms. 
Compile time %.2f ms, link time %.2f ms."), PipelineShaderHashes.Num(), NumCacheHits, NumUniqueShaders - NumCacheHits, (float)TotalCreationTimeMS, (float)CompileTimeMS, (float)LinkTimeMS); } #endif //!NO_LOGGING } class FD3D12RayTracingShaderBindingTable : public FRHIShaderBindingTable, public FD3D12AdapterChild { public: UE_NONCOPYABLE(FD3D12RayTracingShaderBindingTable) FD3D12RayTracingShaderBindingTable(FRHICommandListBase& RHICmdList, FD3D12Adapter* Adapter, const FRayTracingShaderBindingTableInitializer& InInitializer) : FRHIShaderBindingTable(InInitializer), FD3D12AdapterChild(Adapter) { INC_DWORD_STAT(STAT_D3D12RayTracingAllocatedSBT); checkf(Initializer.NumMissShaderSlots >= 1, TEXT("Need at least 1 miss shader slot.")); for (FD3D12Device* Device : Adapter->GetDevices()) { InitForDevice(RHICmdList, Device); } }; ~FD3D12RayTracingShaderBindingTable() { for (auto& Table : ShaderTablesPerGPU) { delete Table; Table = nullptr; } DEC_DWORD_STAT(STAT_D3D12RayTracingAllocatedSBT); } FD3D12RayTracingShaderBindingTableInternal* GetTableForDevice(FD3D12Device* Device) { const uint32 GPUIndex = Device->GetGPUIndex(); return ShaderTablesPerGPU[GPUIndex]; } void ReleaseForDevice(FD3D12Device* Device) { const uint32 GPUIndex = Device->GetGPUIndex(); delete ShaderTablesPerGPU[GPUIndex]; ShaderTablesPerGPU[GPUIndex] = nullptr; } virtual FRHISizeAndStride GetInlineBindingDataSizeAndStride() const override final { // Size should be the same for all tables if (ShaderTablesPerGPU[0]) { return ShaderTablesPerGPU[0]->GetInlineBindingDataSizeAndStride(); } return FRHISizeAndStride{0,0}; } private: void InitForDevice(FRHICommandListBase& RHICmdList, FD3D12Device* Device) { TRACE_CPUPROFILER_EVENT_SCOPE(ShaderTableInit); SCOPE_CYCLE_COUNTER(STAT_D3D12CreateShaderTable); const uint32 GPUIndex = Device->GetGPUIndex(); check(ShaderTablesPerGPU[GPUIndex] == nullptr); ShaderTablesPerGPU[GPUIndex] = new FD3D12RayTracingShaderBindingTableInternal(RHICmdList, Initializer, Device); } 
FD3D12RayTracingShaderBindingTableInternal* ShaderTablesPerGPU[MAX_NUM_GPUS] = {};
};

/** Creates the ray tracing pipeline cache of this device; must not be called when one already exists. */
void FD3D12Device::InitRayTracing()
{
	LLM_SCOPE_BYNAME(TEXT("FD3D12RT"));

	check(RayTracingPipelineCache == nullptr);
	RayTracingPipelineCache = new FD3D12RayTracingPipelineCache(GetParentAdapter());
}

/** Destroys the ray tracing pipeline cache and the per-queue dispatch rays descriptor buffers. */
void FD3D12Device::CleanupRayTracing()
{
	delete RayTracingPipelineCache;
	RayTracingPipelineCache = nullptr;

	for (FD3D12Queue& Queue : Queues)
	{
		delete Queue.RayTracingDispatchRaysDescBuffer;
		Queue.RayTracingDispatchRaysDescBuffer = nullptr;
	}
}

/** Converts RHI ray tracing instance flags to their D3D12 equivalents. */
static D3D12_RAYTRACING_INSTANCE_FLAGS TranslateRayTracingInstanceFlags(ERayTracingInstanceFlags InFlags)
{
	D3D12_RAYTRACING_INSTANCE_FLAGS Result = D3D12_RAYTRACING_INSTANCE_FLAG_NONE;

	if (EnumHasAnyFlags(InFlags, ERayTracingInstanceFlags::TriangleCullDisable))
	{
		Result |= D3D12_RAYTRACING_INSTANCE_FLAG_TRIANGLE_CULL_DISABLE;
	}

	if (!EnumHasAnyFlags(InFlags, ERayTracingInstanceFlags::TriangleCullReverse))
	{
		// Counterclockwise is the default for UE. Reversing culling is achieved by *not* setting this flag.
		Result |= D3D12_RAYTRACING_INSTANCE_FLAG_TRIANGLE_FRONT_COUNTERCLOCKWISE;
	}

	if (EnumHasAnyFlags(InFlags, ERayTracingInstanceFlags::ForceOpaque))
	{
		Result |= D3D12_RAYTRACING_INSTANCE_FLAG_FORCE_OPAQUE;
	}

	if (EnumHasAnyFlags(InFlags, ERayTracingInstanceFlags::ForceNonOpaque))
	{
		Result |= D3D12_RAYTRACING_INSTANCE_FLAG_FORCE_NON_OPAQUE;
	}

	return Result;
}

/**
 * Queries the top-level acceleration structure prebuild info for Initializer.MaxNumInstances instances
 * and returns the per-field maximum over all GPUs used for rendering.
 */
FRayTracingAccelerationStructureSize FD3D12DynamicRHI::RHICalcRayTracingSceneSize(const FRayTracingSceneInitializer& Initializer)
{
	D3D12_BUILD_RAYTRACING_ACCELERATION_STRUCTURE_INPUTS BuildInputs = {};
	BuildInputs.Type = D3D12_RAYTRACING_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL;
	BuildInputs.DescsLayout = D3D12_ELEMENTS_LAYOUT_ARRAY;
	BuildInputs.NumDescs = Initializer.MaxNumInstances;
	BuildInputs.Flags = TranslateRayTracingAccelerationStructureFlags(Initializer.BuildFlags);

	FD3D12Adapter& Adapter = GetAdapter();

	FRayTracingAccelerationStructureSize SizeInfo = {};
	for (uint32 GPUIndex = 0; GPUIndex < GNumExplicitGPUsForRendering; ++GPUIndex)
	{
		D3D12_RAYTRACING_ACCELERATION_STRUCTURE_PREBUILD_INFO PrebuildInfo = {};
		Adapter.GetDevice(GPUIndex)->GetRaytracingAccelerationStructurePrebuildInfo(&BuildInputs, &PrebuildInfo);

		SizeInfo.ResultSize = FMath::Max(SizeInfo.ResultSize, PrebuildInfo.ResultDataMaxSizeInBytes);
		SizeInfo.BuildScratchSize = FMath::Max(SizeInfo.BuildScratchSize, PrebuildInfo.ScratchDataSizeInBytes);
		SizeInfo.UpdateScratchSize = FMath::Max(SizeInfo.UpdateScratchSize, PrebuildInfo.UpdateScratchDataSizeInBytes);
	}

	return SizeInfo;
}

/**
 * Derives the acceleration structure build flags for a geometry from its initializer,
 * then applies the r.D3D12.RayTracing.DebugForceBuildMode override (1: fast build, 2: fast trace).
 */
static ERayTracingAccelerationStructureFlags GetRayTracingAccelerationStructureBuildFlags(const FRayTracingGeometryInitializer& Initializer)
{
	ERayTracingAccelerationStructureFlags BuildFlags = ERayTracingAccelerationStructureFlags::None;

	if (Initializer.bFastBuild)
	{
		BuildFlags = ERayTracingAccelerationStructureFlags::FastBuild;
	}
	else
	{
		BuildFlags = ERayTracingAccelerationStructureFlags::FastTrace;
	}

	if (Initializer.bAllowUpdate)
	{
		EnumAddFlags(BuildFlags, ERayTracingAccelerationStructureFlags::AllowUpdate);
	}

	// Compaction is only requested for fast-trace, non-updatable geometry that allows it,
	// and only when the primitive count is strictly greater than the configured minimum.
	if (!Initializer.bFastBuild
		&& !Initializer.bAllowUpdate
		&& Initializer.bAllowCompaction
		&& GD3D12RayTracingAllowCompaction
		&& (uint32(GD3D12RayTracingCompactionMinPrimitiveCount) < Initializer.TotalPrimitiveCount))
	{
		EnumAddFlags(BuildFlags, ERayTracingAccelerationStructureFlags::AllowCompaction);
	}

	if (GRayTracingDebugForceBuildMode == 1)
	{
		EnumAddFlags(BuildFlags, ERayTracingAccelerationStructureFlags::FastBuild);
		EnumRemoveFlags(BuildFlags, ERayTracingAccelerationStructureFlags::FastTrace);
	}
	else if (GRayTracingDebugForceBuildMode == 2)
	{
		EnumAddFlags(BuildFlags, ERayTracingAccelerationStructureFlags::FastTrace);
		EnumRemoveFlags(BuildFlags, ERayTracingAccelerationStructureFlags::FastBuild);
	}

	return BuildFlags;
}

/**
 * Fills one D3D12_RAYTRACING_GEOMETRY_DESC per geometry segment.
 * Only type, flags, formats and element counts are written here; buffer addresses and strides stay zero.
 *
 * @param Initializer  Geometry description; the segment primitive counts must add up to TotalPrimitiveCount.
 * @param Output       Destination view; must contain exactly Initializer.Segments.Num() elements.
 */
void TranslateRayTracingGeometryDescs(const FRayTracingGeometryInitializer& Initializer, TArrayView Output)
{
	check(Output.Num() == Initializer.Segments.Num());

	D3D12_RAYTRACING_GEOMETRY_TYPE GeometryType = TranslateRayTracingGeometryType(Initializer.GeometryType);

	uint32 ComputedPrimitiveCountForValidation = 0;

	for (int32 SegmentIndex = 0; SegmentIndex < Initializer.Segments.Num(); ++SegmentIndex)
	{
		const FRayTracingGeometrySegment& Segment = Initializer.Segments[SegmentIndex];

		checkf(Segment.VertexBuffer, TEXT("Position vertex buffer is required for ray tracing geometry."));
		checkf(Segment.VertexBufferStride, TEXT("Non-zero position vertex buffer stride is required."));
		checkf(Segment.VertexBufferStride % 4 == 0, TEXT("Position vertex buffer stride must be aligned to 4 bytes for ByteAddressBuffer loads to work."));

		checkf(Segment.MaxVertices != 0 || Segment.NumPrimitives == 0, TEXT("FRayTracingGeometrySegment.MaxVertices for '%s' must contain number of positions in the vertex buffer or maximum index buffer value+1 if index buffer is provided."), *Initializer.DebugName.ToString());

		if (Initializer.GeometryType == RTGT_Triangles)
		{
			checkf(Segment.VertexBufferElementType == VET_Float3
				|| Segment.VertexBufferElementType == VET_Float4, TEXT("Only float3/4 vertex buffers are currently implemented.")); // #dxr_todo UE-72160: support other vertex buffer formats
			checkf(Segment.VertexBufferStride >= 12, TEXT("Only deinterleaved float3 position vertex buffers are currently implemented.")); // #dxr_todo UE-72160: support interleaved vertex buffers
		}
		else if (Initializer.GeometryType == RTGT_Procedural)
		{
			checkf(Segment.VertexBufferStride >= (2 * sizeof(FVector3f)), TEXT("Procedural geometry vertex buffer must contain at least 2xFloat3 that defines 3D bounding boxes of primitives."));
		}

		if (Initializer.IndexBuffer)
		{
			// The index buffer must be large enough to hold every index of this segment, including the initial offset.
			uint32 IndexStride = Initializer.IndexBuffer->GetStride();
			check(Initializer.IndexBuffer->GetSize() >=
				(Segment.FirstPrimitive + Segment.NumPrimitives) * FD3D12RayTracingGeometry::IndicesPerPrimitive * IndexStride + Initializer.IndexBufferOffset);
		}

		D3D12_RAYTRACING_GEOMETRY_DESC Desc = {};

		Desc.Flags = D3D12_RAYTRACING_GEOMETRY_FLAG_NONE;
		Desc.Type = GeometryType;

		if (Segment.bForceOpaque)
		{
			// Deny anyhit shader invocations when this segment is hit
			Desc.Flags |= D3D12_RAYTRACING_GEOMETRY_FLAG_OPAQUE;
		}

		if (!Segment.bAllowDuplicateAnyHitShaderInvocation)
		{
			// Allow only a single any-hit shader invocation per primitive
			Desc.Flags |= D3D12_RAYTRACING_GEOMETRY_FLAG_NO_DUPLICATE_ANYHIT_INVOCATION;
		}

		switch (GeometryType)
		{
		case D3D12_RAYTRACING_GEOMETRY_TYPE_TRIANGLES:
			switch (Segment.VertexBufferElementType)
			{
			case VET_Float4:
				// While the DXGI_FORMAT_R32G32B32A32_FLOAT format is not supported by DXR, since we manually load vertex
				// data when we are building the BLAS, we can just rely on the vertex stride to offset the read index,
				// and read only the 3 vertex components, and so use the DXGI_FORMAT_R32G32B32_FLOAT vertex format
				// (intentional fallthrough into the VET_Float3 case)
			case VET_Float3:
				Desc.Triangles.VertexFormat = DXGI_FORMAT_R32G32B32_FLOAT;
				break;
			case VET_Float2:
				Desc.Triangles.VertexFormat = DXGI_FORMAT_R32G32_FLOAT;
				break;
			case VET_Half2:
				Desc.Triangles.VertexFormat = DXGI_FORMAT_R16G16_FLOAT;
				break;
			default:
				checkNoEntry();
				break;
			}

			if (Initializer.IndexBuffer)
			{
				// In some cases the geometry is created with 16 bit index buffer, but it's 32 bit at build time.
				// We conservatively set this to 32 bit to allocate acceleration structure memory.
				Desc.Triangles.IndexFormat = DXGI_FORMAT_R32_UINT;
				Desc.Triangles.IndexCount = Segment.NumPrimitives * FD3D12RayTracingGeometry::IndicesPerPrimitive;
				Desc.Triangles.VertexCount = Segment.MaxVertices;
			}
			else
			{
				// Non-indexed geometry
				checkf(Initializer.Segments.Num() == 1, TEXT("Non-indexed geometry with multiple segments is not implemented."));
				Desc.Triangles.IndexFormat = DXGI_FORMAT_UNKNOWN;
				Desc.Triangles.VertexCount = FMath::Min(Segment.MaxVertices, Initializer.TotalPrimitiveCount * 3);
			}
			break;

		case D3D12_RAYTRACING_GEOMETRY_TYPE_PROCEDURAL_PRIMITIVE_AABBS:
			Desc.AABBs.AABBCount = Segment.NumPrimitives;
			break;

		default:
			checkf(false, TEXT("Unexpected ray tracing geometry type"));
			break;
		}

		ComputedPrimitiveCountForValidation += Segment.NumPrimitives;

		Output[SegmentIndex] = Desc;
	}

	check(ComputedPrimitiveCountForValidation == Initializer.TotalPrimitiveCount);
}

/**
 * Computes the acceleration structure and scratch buffer sizes required for a ray tracing geometry.
 * When the initializer carries a valid offline data header, only ResultSize is filled in (from the offline metadata).
 */
FRayTracingAccelerationStructureSize FD3D12DynamicRHI::RHICalcRayTracingGeometrySize(const FRayTracingGeometryInitializer& Initializer)
{
	{
		const bool bHasOfflineMetadata = Initializer.OfflineDataHeader.IsValid();
		if (bHasOfflineMetadata)
		{
			FRayTracingAccelerationStructureOfflineMetadata OfflineBvhMetadata = RHIGetRayTracingGeometryOfflineMetadata(Initializer.OfflineDataHeader);

			FRayTracingAccelerationStructureSize SizeInfo = {};
			SizeInfo.ResultSize = Align(OfflineBvhMetadata.Size, GRHIRayTracingAccelerationStructureAlignment);
			return SizeInfo;
		}
	}

	FRayTracingAccelerationStructureSize SizeInfo = {};

	ERayTracingAccelerationStructureFlags BuildFlags = GetRayTracingAccelerationStructureBuildFlags(Initializer);

	D3D12_BUILD_RAYTRACING_ACCELERATION_STRUCTURE_INPUTS PrebuildDescInputs = {};

	TArray> GeometryDescs;
GeometryDescs.SetNumUninitialized(Initializer.Segments.Num()); TranslateRayTracingGeometryDescs(Initializer, GeometryDescs); D3D12_RAYTRACING_GEOMETRY_TYPE GeometryType = TranslateRayTracingGeometryType(Initializer.GeometryType); PrebuildDescInputs.Type = D3D12_RAYTRACING_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL; PrebuildDescInputs.DescsLayout = D3D12_ELEMENTS_LAYOUT_ARRAY; PrebuildDescInputs.NumDescs = GeometryDescs.Num(); PrebuildDescInputs.pGeometryDescs = GeometryDescs.GetData(); PrebuildDescInputs.Flags = TranslateRayTracingAccelerationStructureFlags(BuildFlags); FD3D12Adapter& Adapter = GetAdapter(); // We don't know the final index buffer format, so take maximum of 16 and 32 bit. static const DXGI_FORMAT ValidIndexBufferFormats[] = { DXGI_FORMAT_R16_UINT, DXGI_FORMAT_R32_UINT }; static const DXGI_FORMAT NullIndexBufferFormats[] = { DXGI_FORMAT_UNKNOWN }; TArrayView IndexFormats = Initializer.IndexBuffer.IsValid() ? MakeArrayView(ValidIndexBufferFormats) : MakeArrayView(NullIndexBufferFormats); for (DXGI_FORMAT IndexFormat : IndexFormats) { for (D3D12_RAYTRACING_GEOMETRY_DESC& GeometryDesc : GeometryDescs) { if (GeometryDesc.Type == D3D12_RAYTRACING_GEOMETRY_TYPE_TRIANGLES) { GeometryDesc.Triangles.IndexFormat = IndexFormat; } } // Get maximum buffer sizes for all GPUs in the system for (uint32 GPUIndex = 0; GPUIndex < GNumExplicitGPUsForRendering; ++GPUIndex) { D3D12_RAYTRACING_ACCELERATION_STRUCTURE_PREBUILD_INFO PrebuildInfo = {}; Adapter.GetDevice(GPUIndex)->GetRaytracingAccelerationStructurePrebuildInfo(&PrebuildDescInputs, &PrebuildInfo); SizeInfo.ResultSize = FMath::Max(SizeInfo.ResultSize, PrebuildInfo.ResultDataMaxSizeInBytes); SizeInfo.BuildScratchSize = FMath::Max(SizeInfo.BuildScratchSize, PrebuildInfo.ScratchDataSizeInBytes); SizeInfo.UpdateScratchSize = FMath::Max(SizeInfo.UpdateScratchSize, PrebuildInfo.UpdateScratchDataSizeInBytes); } } SizeInfo.ResultSize = Align(SizeInfo.ResultSize, GRHIRayTracingAccelerationStructureAlignment); 
SizeInfo.BuildScratchSize = Align(SizeInfo.BuildScratchSize, GRHIRayTracingScratchBufferAlignment); SizeInfo.UpdateScratchSize = Align(FMath::Max(1ULL, SizeInfo.UpdateScratchSize), GRHIRayTracingScratchBufferAlignment); return SizeInfo; } FRayTracingAccelerationStructureOfflineMetadata FD3D12DynamicRHI::RHIGetRayTracingGeometryOfflineMetadata(const FRayTracingGeometryOfflineDataHeader& OfflineDataHeader) { static_assert(sizeof(FD3D12RayTracingOfflineBvhHeader) <= sizeof(FRayTracingGeometryOfflineDataHeader), "FRayTracingGeometryOfflineDataHeader must be large enough to fit FOfflineBVHHeader"); FD3D12RayTracingOfflineBvhHeader BvhHeader; FMemory::Memcpy(&BvhHeader, &OfflineDataHeader, sizeof(BvhHeader)); FRayTracingAccelerationStructureOfflineMetadata Metadata; Metadata.Size = BvhHeader.Size; Metadata.SerializedSize = BvhHeader.SerializedSize; Metadata.SerializedOffset = 0; return Metadata; } FRayTracingPipelineStateRHIRef FD3D12DynamicRHI::RHICreateRayTracingPipelineState(const FRayTracingPipelineStateInitializer& Initializer) { FD3D12Device* Device = GetAdapter().GetDevice(0); // All pipelines are created on the first node, as they may be used on any other linked GPU. 
FD3D12RayTracingPipelineState* Result = new FD3D12RayTracingPipelineState(Device, Initializer); return Result; } /* Creates a ray tracing geometry (BLAS wrapper) owned by this RHI's adapter. */ FRayTracingGeometryRHIRef FD3D12DynamicRHI::RHICreateRayTracingGeometry(FRHICommandListBase& RHICmdList, const FRayTracingGeometryInitializer& Initializer) { FD3D12Adapter& Adapter = GetAdapter(); return new FD3D12RayTracingGeometry(RHICmdList, &Adapter, Initializer); } /* Creates a ray tracing scene (TLAS wrapper); the initializer is taken by value and moved into the scene object. */ FRayTracingSceneRHIRef FD3D12DynamicRHI::RHICreateRayTracingScene(FRayTracingSceneInitializer Initializer) { TRACE_CPUPROFILER_EVENT_SCOPE(CreateRayTracingScene); FD3D12Adapter& Adapter = GetAdapter(); return new FD3D12RayTracingScene(&Adapter, MoveTemp(Initializer)); } /* Creates a shader binding table object. The profiler scope carries its own name so that SBT creation is not reported under the scene-creation event. */ FShaderBindingTableRHIRef FD3D12DynamicRHI::RHICreateShaderBindingTable(FRHICommandListBase& RHICmdList, const FRayTracingShaderBindingTableInitializer& Initializer) { TRACE_CPUPROFILER_EVENT_SCOPE(CreateShaderBindingTable); FD3D12Adapter& Adapter = GetAdapter(); return new FD3D12RayTracingShaderBindingTable(RHICmdList, &Adapter, Initializer); } /* Shared buffer filled with 12 zeroed elements; created lazily by the FD3D12RayTracingGeometry constructor. */ FBufferRHIRef FD3D12RayTracingGeometry::NullTransformBuffer; /* Selects which kind of buffer CreateRayTracingBuffer allocates. */ enum class ERayTracingBufferType { AccelerationStructure, Scratch }; /* Allocates a buffer of Size bytes on the GPU given by GPUIndex. AccelerationStructure buffers are created in single-state mode in the RAYTRACING_ACCELERATION_STRUCTURE state; Scratch buffers are created with unordered access, default state tracking, and have their residency priority raised to HIGH. Asserts if Size is zero or Type is unknown. */ static TRefCountPtr CreateRayTracingBuffer(FD3D12Adapter* Adapter, uint32 GPUIndex, uint64 Size, ERayTracingBufferType Type, const FDebugName& DebugName) { FString DebugNameString = DebugName.ToString(); checkf(Size != 0, TEXT("Attempting to create ray tracing %s buffer of zero size. Debug name: %s"), Type == ERayTracingBufferType::AccelerationStructure ?
TEXT("AccelerationStructure") : TEXT("Scratch"), *DebugNameString); TRefCountPtr Result; FRHIGPUMask GPUMask = FRHIGPUMask::FromIndex(GPUIndex); if (Type == ERayTracingBufferType::AccelerationStructure) { const D3D12_RESOURCE_DESC ResourceDesc = CD3DX12_RESOURCE_DESC::Buffer(Size, D3D12_RESOURCE_FLAG_NONE); const FRHIBufferCreateDesc CreateDesc = FRHIBufferCreateDesc::Create(*DebugNameString, Size, 0, BUF_AccelerationStructure) .SetInitialState(ERHIAccess::BVHWrite) .SetGPUMask(GPUMask); Result = Adapter->CreateRHIBuffer( ResourceDesc, D3D12_RAYTRACING_ACCELERATION_STRUCTURE_BYTE_ALIGNMENT, CreateDesc, ED3D12ResourceStateMode::SingleState, D3D12_RESOURCE_STATE_RAYTRACING_ACCELERATION_STRUCTURE ); } else if (Type == ERayTracingBufferType::Scratch) { // Scratch doesn't need single state anymore because there are only a few scratch allocations left and allocating a // dedicated single state heap for it wastes memory - ideally all scratch allocations should be transient const D3D12_RESOURCE_DESC ResourceDesc = CD3DX12_RESOURCE_DESC::Buffer(Size, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); const FRHIBufferCreateDesc CreateDesc = FRHIBufferCreateDesc::Create(*DebugNameString, Size, 0, BUF_UnorderedAccess) .SetInitialState(ERHIAccess::BVHWrite) .SetGPUMask(GPUMask); Result = Adapter->CreateRHIBuffer( ResourceDesc, D3D12_RAYTRACING_ACCELERATION_STRUCTURE_BYTE_ALIGNMENT, CreateDesc, ED3D12ResourceStateMode::Default, D3D12_RESOURCE_STATE_UNORDERED_ACCESS ); // Elevates the scratch buffer heap priority, which may help performance / stability in low memory conditions // (Acceleration structure already boosted from allocation side) ID3D12Pageable* HeapResource = Result->GetResource()->GetPageable(); Adapter->SetResidencyPriority(HeapResource, D3D12_RESIDENCY_PRIORITY_HIGH, GPUIndex); } else { checkNoEntry(); } return Result; } /* Builds a one-line description of a geometry initializer (debug name, segment count, primitive count, index stride or NonIndexed, offline-data marker). */ FString GetGeometryInitializerDebugString(const
FRayTracingGeometryInitializer& Initializer) { TStringBuilder<128> Result; Result << "DebugName=" << Initializer.DebugName.ToString(); Result << " NumSegments=" << Initializer.Segments.Num(); Result << " NumPrims=" << Initializer.TotalPrimitiveCount; if (Initializer.IndexBuffer) { Result << " IndexStride=" << Initializer.IndexBuffer->GetStride(); } else { Result << " NonIndexed"; } if (Initializer.OfflineData) { Result << " HasOfflineData"; } return Result.ToString(); } /* Constructs a BLAS wrapper. It lazily creates the shared NullTransformBuffer, translates the segments into cached D3D12 geometry descs, marks the geometry dirty on all GPUs and computes SizeInfo (from offline metadata when offline data is supplied, otherwise via RHICalcRayTracingGeometrySize). StreamingDestination geometries stop there without allocating. Otherwise a per-GPU acceleration structure buffer is allocated, and when offline data is present a deferred command deserializes it into each GPU's buffer. */ FD3D12RayTracingGeometry::FD3D12RayTracingGeometry(FRHICommandListBase& RHICmdList, FD3D12Adapter* Adapter, const FRayTracingGeometryInitializer& InInitializer) : FRHIRayTracingGeometry(InInitializer), FD3D12AdapterChild(Adapter) { INC_DWORD_STAT(STAT_D3D12RayTracingAllocatedBLAS); static const FName NAME_BLAS(TEXT("BLAS")); DebugName = !Initializer.DebugName.IsNone() ? Initializer.DebugName : NAME_BLAS; OwnerName = Initializer.OwnerName; FMemory::Memzero(bHasPendingCompactionRequests); FMemory::Memzero(bRegisteredAsRenameListener); if(!FD3D12RayTracingGeometry::NullTransformBuffer.IsValid()) { TArray NullTransformData; NullTransformData.SetNumZeroed(12); FD3D12RayTracingGeometry::NullTransformBuffer = UE::RHIResourceUtils::CreateVertexBufferFromArray( RHICmdList, TEXT("NullTransformBuffer"), EBufferUsageFlags::Static, MakeConstArrayView(NullTransformData) ); } RegisterD3D12RayTracingGeometry(this); checkf(Initializer.Segments.Num() > 0, TEXT("Ray tracing geometry must be initialized with at least one segment.")); GeometryDescs.SetNumUninitialized(Initializer.Segments.Num()); TranslateRayTracingGeometryDescs(Initializer, GeometryDescs); SetDirty(FRHIGPUMask::All(), true); const bool bHasOfflineMetadata = Initializer.OfflineDataHeader.IsValid(); FRayTracingAccelerationStructureOfflineMetadata OfflineBvhMetadata; if (bHasOfflineMetadata) { OfflineBvhMetadata = RHIGetRayTracingGeometryOfflineMetadata(InInitializer.OfflineDataHeader); } const void* SourceData = nullptr; if (Initializer.OfflineData !=
nullptr) { checkf(bHasOfflineMetadata, TEXT("OfflineData provided in Initializer has data but OfflineDataHeader is not valid.")); SourceData = Initializer.OfflineData->GetResourceData(); const uint32 SourceDataSize = Initializer.OfflineData->GetResourceDataSize(); checkf(SourceDataSize >= OfflineBvhMetadata.SerializedSize, TEXT("OfflineData provided in Initializer has %u bytes but FD3D12RayTracingGeometry expected %u bytes."), SourceDataSize, OfflineBvhMetadata.SerializedSize); } if (SourceData != nullptr) { checkf(!InInitializer.bAllowUpdate, TEXT("FD3D12RayTracingGeometry doesn't support updating BVH created using offline data.")); } // Compute the required size of the in-memory BVH buffer if (SourceData != nullptr) { /* Offline data needs no scratch memory here; the compacted size is recorded as the offline BVH size. */ SizeInfo.ResultSize = OfflineBvhMetadata.Size; SizeInfo.BuildScratchSize = 0; SizeInfo.UpdateScratchSize = 0; AccelerationStructureCompactedSize = OfflineBvhMetadata.Size; } else { // Get maximum buffer sizes for all GPUs in the system SizeInfo = RHICalcRayTracingGeometrySize(Initializer); } checkf(SizeInfo.ResultSize != 0, TEXT("Unexpected acceleration structure buffer size (0).\nGeometry initializer details:\n%s"), *GetGeometryInitializerDebugString(Initializer)); // If this RayTracingGeometry going to be used as streaming destination // we don't want to allocate its memory as it will be replaced later by streamed version // but we still need correct SizeInfo as it is used to estimate its memory requirements outside of RHI.
if (Initializer.Type == ERayTracingGeometryInitializerType::StreamingDestination) { return; } // Allocate acceleration structure buffer FOREACH_GPU(GPUIndex < MAX_NUM_GPUS && GPUIndex < GNumExplicitGPUsForRendering, { AccelerationStructureBuffers[GPUIndex] = CreateRayTracingBuffer(Adapter, GPUIndex, SizeInfo.ResultSize, ERayTracingBufferType::AccelerationStructure, DebugName); AccelerationStructureBuffers[GPUIndex]->SetOwnerName(OwnerName); INC_MEMORY_STAT_BY(STAT_D3D12RayTracingUsedVideoMemory, AccelerationStructureBuffers[GPUIndex]->GetSize()); INC_MEMORY_STAT_BY(STAT_D3D12RayTracingBLASMemory, AccelerationStructureBuffers[GPUIndex]->GetSize()); if (Initializer.bAllowUpdate) { INC_MEMORY_STAT_BY(STAT_D3D12RayTracingDynamicBLASMemory, AccelerationStructureBuffers[GPUIndex]->GetSize()); } else { INC_MEMORY_STAT_BY(STAT_D3D12RayTracingStaticBLASMemory, AccelerationStructureBuffers[GPUIndex]->GetSize()); } }); INC_DWORD_STAT_BY(STAT_D3D12RayTracingTrianglesBLAS, Initializer.TotalPrimitiveCount); const bool bForRendering = Initializer.Type == ERayTracingGeometryInitializerType::Rendering; if (SourceData != nullptr) { /* The serialized BVH is copied into an upload-heap allocation now, so the source blob can be discarded right after the deferred command is enqueued. */ FD3D12Device* Device = Adapter->GetDevice(0); FD3D12ResourceLocation SrcResourceLoc(Device); uint8* DstDataBase = (uint8*)Adapter->GetUploadHeapAllocator(0).AllocUploadResource(OfflineBvhMetadata.SerializedSize, 256, SrcResourceLoc); FMemory::Memcpy(DstDataBase, SourceData, OfflineBvhMetadata.SerializedSize); /* NOTE(review): the lambda captures `this`; presumably the geometry outlives execution of the command list - confirm. */ RHICmdList.EnqueueLambda([this, SrcResourceLoc = MoveTemp(SrcResourceLoc), bForRendering](FRHICommandListBase& ExecutingCmdList) { FOREACH_GPU(GPUIndex < MAX_NUM_GPUS && GPUIndex < GNumExplicitGPUsForRendering, { FD3D12CommandContext& Context = FD3D12CommandContext::Get(ExecutingCmdList, GPUIndex); FD3D12Buffer* AccelerationStructure = AccelerationStructureBuffers[GPUIndex]; Context.RayTracingCommandList()->CopyRaytracingAccelerationStructure( AccelerationStructure->ResourceLocation.GetGPUVirtualAddress(), SrcResourceLoc.GetGPUVirtualAddress(),
D3D12_RAYTRACING_ACCELERATION_STRUCTURE_COPY_MODE_DESERIALIZE ); Context.UpdateResidency(SrcResourceLoc.GetResource()); Context.ConditionalSplitCommandList(); if (bForRendering) { RegisterAsRenameListener(GPUIndex); SetupHitGroupSystemParameters(GPUIndex); } }); SetDirty(FRHIGPUMask::All(), false); }); Initializer.OfflineData->Discard(); } else { // Offline data already registered via FD3D12RHICommandInitializeRayTracingGeometry FOREACH_GPU(GPUIndex < MAX_NUM_GPUS && GPUIndex < GNumExplicitGPUsForRendering, { RegisterAsRenameListener(GPUIndex); }); } } /* Exchanges the per-GPU acceleration structure buffers, dirty flags and compacted size with Other. The Initializer is copied from Other (not exchanged), after which the debug name, geometry descs, rename-listener registration and hit group parameters are rebuilt from it. */ void FD3D12RayTracingGeometry::Swap(FD3D12RayTracingGeometry& Other) { FOREACH_GPU(GPUIndex < MAX_NUM_GPUS, { ::Swap(AccelerationStructureBuffers[GPUIndex], Other.AccelerationStructureBuffers[GPUIndex]); ::Swap(bIsAccelerationStructureDirty[GPUIndex], Other.bIsAccelerationStructureDirty[GPUIndex]); }); ::Swap(AccelerationStructureCompactedSize, Other.AccelerationStructureCompactedSize); /* Listeners are unregistered while the old Initializer still names the old buffers. */ FOREACH_GPU(GPUIndex < MAX_NUM_GPUS && GPUIndex < GNumExplicitGPUsForRendering, { UnregisterAsRenameListener(GPUIndex); }); Initializer = Other.Initializer; DebugName = !Initializer.DebugName.IsNone() ?
Initializer.DebugName : FName(TEXT("BLAS")); checkf(Initializer.Segments.Num() > 0, TEXT("Ray tracing geometry must be initialized with at least one segment.")); GeometryDescs.SetNumUninitialized(Initializer.Segments.Num()); TranslateRayTracingGeometryDescs(Initializer, GeometryDescs); FOREACH_GPU(GPUIndex < MAX_NUM_GPUS && GPUIndex < GNumExplicitGPUsForRendering, { RegisterAsRenameListener(GPUIndex); SetupHitGroupSystemParameters(GPUIndex); }); } /* Returns the geometry to an empty state: unregisters it, cancels pending compaction requests, drops the SRVs and rename-listener registrations, reverses the memory and count stats, releases the acceleration structure buffers and resets Initializer, GeometryDescs and the hit group parameters. */ void FD3D12RayTracingGeometry::ReleaseUnderlyingResource() { UnregisterD3D12RayTracingGeometry(this); // Remove compaction request if still pending FOREACH_GPU(GPUIndex < MAX_NUM_GPUS, { if (bHasPendingCompactionRequests[GPUIndex]) { check(AccelerationStructureBuffers[GPUIndex]); FD3D12Device* Device = AccelerationStructureBuffers[GPUIndex].GetReference()->GetParentDevice(); bool bRequestFound = Device->GetRayTracingCompactionRequestHandler()->ReleaseRequest(this); check(bRequestFound); bHasPendingCompactionRequests[GPUIndex] = false; } }); // Unregister as dependent resource on vertex and index buffers & clear the SRVs FOREACH_GPU(GPUIndex < MAX_NUM_GPUS, { HitGroupSystemIndexBufferSRV[GPUIndex].Reset(); HitGroupSystemSegmentVertexBufferSRVs[GPUIndex].Empty(); UnregisterAsRenameListener(GPUIndex); }); for (TRefCountPtr& Buffer : AccelerationStructureBuffers) { if (Buffer) { DEC_MEMORY_STAT_BY(STAT_D3D12RayTracingUsedVideoMemory, Buffer->GetSize()); DEC_MEMORY_STAT_BY(STAT_D3D12RayTracingBLASMemory, Buffer->GetSize()); /* NOTE(review): the dynamic/static split here is decided from the build flags, whereas the constructor decides it from Initializer.bAllowUpdate - confirm the two always agree. */ ERayTracingAccelerationStructureFlags BuildFlags = GetRayTracingAccelerationStructureBuildFlags(Initializer); if (EnumHasAllFlags(BuildFlags, ERayTracingAccelerationStructureFlags::AllowUpdate)) { DEC_MEMORY_STAT_BY(STAT_D3D12RayTracingDynamicBLASMemory, Buffer->GetSize()); } else { DEC_MEMORY_STAT_BY(STAT_D3D12RayTracingStaticBLASMemory, Buffer->GetSize()); } } } if (Initializer.Type != ERayTracingGeometryInitializerType::StreamingSource) { DEC_DWORD_STAT_BY(STAT_D3D12RayTracingTrianglesBLAS,
Initializer.TotalPrimitiveCount); DEC_DWORD_STAT(STAT_D3D12RayTracingAllocatedBLAS); } // Reset members for (TRefCountPtr& Buffer : AccelerationStructureBuffers) { Buffer.SafeRelease(); } Initializer = {}; AccelerationStructureCompactedSize = 0; GeometryDescs = {}; for (TArray& HitGroupParametersForGPU : HitGroupSystemParameters) { HitGroupParametersForGPU.Empty(); } } /* Detaches this geometry from every registered update listener, then releases the underlying resources. */ FD3D12RayTracingGeometry::~FD3D12RayTracingGeometry() { // RT geometry can be destroyed before persistent SBT records are cleared from the cached MDCs // because they are still pending removal from the scene primitives for (ID3D12RayTracingGeometryUpdateListener* UpdateListener : UpdateListeners) { UpdateListener->RemoveListener(this); } UpdateListeners.Empty(); ReleaseUnderlyingResource(); } /* Recreates the raw (R32 typeless) shader resource views for the given GPU: one for the index buffer (triangle geometry with an index buffer only) and one per segment vertex buffer. */ void FD3D12RayTracingGeometry::AllocateBufferSRVs(uint32 InGPUIndex) { HitGroupSystemIndexBufferSRV[InGPUIndex].Reset(); HitGroupSystemSegmentVertexBufferSRVs[InGPUIndex].Empty(); // Procedural doesn't need any SRVs for index buffer if (Initializer.IndexBuffer && Initializer.GeometryType == RTGT_Triangles) { checkf((Initializer.IndexBufferOffset % RHI_RAW_VIEW_ALIGNMENT) == 0, TEXT("The byte offset of raw views must be a multiple of %d (specified offset: %d)."), RHI_RAW_VIEW_ALIGNMENT, Initializer.IndexBufferOffset); FD3D12Buffer* IndexBuffer = FD3D12DynamicRHI::ResourceCast(Initializer.IndexBuffer.GetReference()); // Initializer.TotalPrimitiveCount is the accumulated num primitives of the segments // The highest indexed entry can be higher due to Segment.FirstPrimitive or it can be lower if segments overlap // So here we calculate the highest indexed entry by looping over the segments uint32 MaxPrimitiveCount = 0; for (const FRayTracingGeometrySegment& Segment : Initializer.Segments) { MaxPrimitiveCount = FMath::Max(MaxPrimitiveCount, Segment.FirstPrimitive + Segment.NumPrimitives); } D3D12_SHADER_RESOURCE_VIEW_DESC SRVDesc = {}; SRVDesc.ViewDimension = D3D12_SRV_DIMENSION_BUFFER; SRVDesc.Format =
DXGI_FORMAT_R32_TYPELESS; SRVDesc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING; /* FirstElement and NumElements are expressed in 4-byte units, hence the shifts by 2 and the round-up by 3. */ SRVDesc.Buffer.FirstElement = (Initializer.IndexBufferOffset + IndexBuffer->ResourceLocation.GetOffsetFromBaseOfResource()) >> 2u; SRVDesc.Buffer.NumElements = FMath::Max((uint32)1, ((MaxPrimitiveCount * 3 * IndexBuffer->GetStride()) + 3) >> 2u); SRVDesc.Buffer.Flags = D3D12_BUFFER_SRV_FLAG_RAW; SRVDesc.Buffer.StructureByteStride = 0; /* Views for GPU index > 0 are constructed with the GPU 0 view passed as second argument. */ HitGroupSystemIndexBufferSRV[InGPUIndex] = MakeShared(GetParentAdapter()->GetDevice(InGPUIndex), InGPUIndex > 0 ? HitGroupSystemIndexBufferSRV[0].Get() : nullptr); HitGroupSystemIndexBufferSRV[InGPUIndex]->CreateView(IndexBuffer, SRVDesc, FD3D12ShaderResourceView::EFlags::None); } for (const FRayTracingGeometrySegment& Segment : Initializer.Segments) { checkf((Segment.VertexBufferOffset % RHI_RAW_VIEW_ALIGNMENT) == 0, TEXT("The byte offset of raw views must be a multiple of %d (specified offset: %d)."), RHI_RAW_VIEW_ALIGNMENT, Segment.VertexBufferOffset); FD3D12Buffer* VertexBuffer = FD3D12DynamicRHI::ResourceCast(Segment.VertexBuffer.GetReference()); D3D12_SHADER_RESOURCE_VIEW_DESC SRVDesc = {}; SRVDesc.ViewDimension = D3D12_SRV_DIMENSION_BUFFER; SRVDesc.Format = DXGI_FORMAT_R32_TYPELESS; SRVDesc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING; SRVDesc.Buffer.FirstElement = (Segment.VertexBufferOffset + VertexBuffer->ResourceLocation.GetOffsetFromBaseOfResource()) >> 2u; /* NOTE(review): the procedural branch does not clamp NumElements to at least 1, unlike the triangle branch - confirm a zero-primitive procedural segment cannot reach this point. */ if (Initializer.GeometryType == RTGT_Procedural) { SRVDesc.Buffer.NumElements = ((Segment.NumPrimitives * Segment.VertexBufferStride) + 3) / 4; //< NumElements in R32 size } else { SRVDesc.Buffer.NumElements = FMath::Max((uint32)1, ((Segment.MaxVertices * Segment.VertexBufferStride) + 3) / 4); //< NumElements in R32 size } SRVDesc.Buffer.Flags = D3D12_BUFFER_SRV_FLAG_RAW; SRVDesc.Buffer.StructureByteStride = 0; FD3D12ShaderResourceView* FirstLinkedObject = nullptr; if (InGPUIndex > 0) { int32 SegmentIndex =
HitGroupSystemSegmentVertexBufferSRVs[InGPUIndex].Num(); if (HitGroupSystemSegmentVertexBufferSRVs[0].Num() > SegmentIndex) { FirstLinkedObject = HitGroupSystemSegmentVertexBufferSRVs[0][SegmentIndex].Get(); } } TSharedPtr VertexBufferSRV = MakeShared(GetParentAdapter()->GetDevice(InGPUIndex), FirstLinkedObject); VertexBufferSRV->CreateView(VertexBuffer, SRVDesc, FD3D12ShaderResourceView::EFlags::None); HitGroupSystemSegmentVertexBufferSRVs[InGPUIndex].Add(VertexBufferSRV); } } /* Registers this geometry as a rename listener on its index buffer and on each distinct, non-null segment vertex buffer for the given GPU. Does nothing when bindless resources are enabled for ray tracing; asserts if already registered for this GPU. */ void FD3D12RayTracingGeometry::RegisterAsRenameListener(uint32 InGPUIndex) { // Not needed if bindless if (AreBindlessResourcesEnabledForRayTracing(GetParentAdapter())) { return; } check(!bRegisteredAsRenameListener[InGPUIndex]); FD3D12Buffer* IndexBuffer = FD3D12DynamicRHI::ResourceCast(Initializer.IndexBuffer.GetReference(), InGPUIndex); if (IndexBuffer) { IndexBuffer->AddRenameListener(this); } /* Segments may share a vertex buffer; the list below ensures each buffer is registered only once. */ TArray> UniqueVertexBuffers; UniqueVertexBuffers.Reserve(Initializer.Segments.Num()); for (const FRayTracingGeometrySegment& Segment : Initializer.Segments) { FD3D12Buffer* VertexBuffer = FD3D12DynamicRHI::ResourceCast(Segment.VertexBuffer.GetReference(), InGPUIndex); if (VertexBuffer && !UniqueVertexBuffers.Contains(VertexBuffer)) { VertexBuffer->AddRenameListener(this); UniqueVertexBuffers.Add(VertexBuffer); } } bRegisteredAsRenameListener[InGPUIndex] = true; } /* Reverses RegisterAsRenameListener for the given GPU; returns immediately when this geometry is not registered on that GPU. */ void FD3D12RayTracingGeometry::UnregisterAsRenameListener(uint32 InGPUIndex) { if (!bRegisteredAsRenameListener[InGPUIndex]) { return; } check(!AreBindlessResourcesEnabledForRayTracing(GetParentAdapter())); FD3D12Buffer* IndexBuffer = FD3D12DynamicRHI::ResourceCast(Initializer.IndexBuffer.GetReference(), InGPUIndex); if (IndexBuffer) { IndexBuffer->RemoveRenameListener(this); } TArray> UniqueVertexBuffers; UniqueVertexBuffers.Reserve(Initializer.Segments.Num()); for (const FRayTracingGeometrySegment& Segment : Initializer.Segments) { FD3D12Buffer* VertexBuffer = FD3D12DynamicRHI::ResourceCast(Segment.VertexBuffer.GetReference(),
InGPUIndex); if (VertexBuffer && !UniqueVertexBuffers.Contains(VertexBuffer)) { VertexBuffer->RemoveRenameListener(this); UniqueVertexBuffers.Add(VertexBuffer); } } bRegisteredAsRenameListener[InGPUIndex] = false; } /* Rename-listener callback: rebuilds the hit group system parameters for the GPU that owns the renamed resource, provided all index and vertex buffers are currently valid. Asserts that bindless is disabled and that a new resource location was supplied. */ void FD3D12RayTracingGeometry::ResourceRenamed(FD3D12ContextArray const& Contexts, FD3D12BaseShaderResource* InRenamedResource, FD3D12ResourceLocation* InNewResourceLocation) { check(!AreBindlessResourcesEnabledForRayTracing(GetParentAdapter())); // Empty resource location is used on destruction of the base shader resource but this // shouldn't happen for RT Geometries because it keeps smart pointers to it's resources. check(InNewResourceLocation != nullptr); // Recreate the hit group parameters which cache the address to the index and vertex buffers directly if the geometry is fully valid uint32 GPUIndex = InRenamedResource->GetParentDevice()->GetGPUIndex(); if (BuffersValid(GPUIndex)) { SetupHitGroupSystemParameters(GPUIndex); } } /* Returns false if the index buffer (when one is set) or any segment vertex buffer has an invalid resource location on the given GPU; true otherwise. NOTE(review): each segment's vertex buffer is dereferenced without a null check here, whereas RegisterAsRenameListener tolerates a null vertex buffer - confirm segments always carry one when this is called. */ bool FD3D12RayTracingGeometry::BuffersValid(uint32 GPUIndex) const { if (Initializer.IndexBuffer) { const FD3D12Buffer* IndexBuffer = FD3D12DynamicRHI::ResourceCast(Initializer.IndexBuffer.GetReference(), GPUIndex); if (!IndexBuffer->ResourceLocation.IsValid()) { return false; } } for (const FRayTracingGeometrySegment& Segment : Initializer.Segments) { const FD3D12Buffer* VertexBuffer = FD3D12DynamicRHI::ResourceCast(Segment.VertexBuffer.GetReference(), GPUIndex); if (!VertexBuffer->ResourceLocation.IsValid()) { return false; } } return true; } /* Updates residency on the given command context for the index buffer (if any), every segment vertex buffer and the acceleration structure buffer of the context's GPU. */ void FD3D12RayTracingGeometry::UpdateResidency(FD3D12CommandContext& CommandContext) { if (Initializer.IndexBuffer) { FD3D12Buffer* IndexBuffer = CommandContext.RetrieveObject(Initializer.IndexBuffer.GetReference()); CommandContext.UpdateResidency(IndexBuffer->GetResource()); } for (const FRayTracingGeometrySegment& Segment : Initializer.Segments) { const FBufferRHIRef& RHIVertexBuffer = Segment.VertexBuffer; FD3D12Buffer* VertexBuffer =
CommandContext.RetrieveObject(RHIVertexBuffer.GetReference()); CommandContext.UpdateResidency(VertexBuffer->ResourceLocation.GetResource()); } const uint32 GPUIndex = CommandContext.GetGPUIndex(); CommandContext.UpdateResidency(AccelerationStructureBuffers[GPUIndex]->GetResource()); } /* Rebuilds the per-segment hit group system parameters for one GPU: vertex and index strides, the vertex and index buffer GPU addresses (or bindless SRV indices when bindless is enabled, in which case the SRVs are recreated first), the index buffer byte offset and the first primitive. Listeners are notified once all segments are written. Asserts that all buffers are valid. */ void FD3D12RayTracingGeometry::SetupHitGroupSystemParameters(uint32 InGPUIndex) { D3D12_RAYTRACING_GEOMETRY_TYPE GeometryType = TranslateRayTracingGeometryType(Initializer.GeometryType); bool bBindless = AreBindlessResourcesEnabledForRayTracing(GetParentAdapter()); TArray& HitGroupSystemParametersForThisGPU = HitGroupSystemParameters[InGPUIndex]; HitGroupSystemParametersForThisGPU.Reset(Initializer.Segments.Num()); check(BuffersValid(InGPUIndex)); if (bBindless) { AllocateBufferSRVs(InGPUIndex); } FD3D12Buffer* IndexBuffer = FD3D12DynamicRHI::ResourceCast(Initializer.IndexBuffer.GetReference(), InGPUIndex); /* An index stride of 0 is recorded for non-indexed geometry. */ const uint32 IndexStride = IndexBuffer ? IndexBuffer->GetStride() : 0; for (int32 SegmentIndex = 0; SegmentIndex < Initializer.Segments.Num(); ++SegmentIndex) { const FRayTracingGeometrySegment& Segment = Initializer.Segments[SegmentIndex]; FD3D12Buffer* VertexBuffer = FD3D12DynamicRHI::ResourceCast(Segment.VertexBuffer.GetReference(), InGPUIndex); FD3D12HitGroupSystemParameters SystemParameters = {}; SystemParameters.RootConstants.SetVertexAndIndexStride(Segment.VertexBufferStride, IndexStride); if (bBindless) { SystemParameters.BindlessHitGroupSystemVertexBuffer = HitGroupSystemSegmentVertexBufferSRVs[InGPUIndex][SegmentIndex]->GetBindlessHandle().GetIndex(); } else { SystemParameters.VertexBuffer = VertexBuffer->ResourceLocation.GetGPUVirtualAddress() + Segment.VertexBufferOffset; } if (GeometryType == D3D12_RAYTRACING_GEOMETRY_TYPE_TRIANGLES && IndexBuffer != nullptr) { if (bBindless) { SystemParameters.BindlessHitGroupSystemIndexBuffer = HitGroupSystemIndexBufferSRV[InGPUIndex]->GetBindlessHandle().GetIndex(); } else { SystemParameters.IndexBuffer =
IndexBuffer->ResourceLocation.GetGPUVirtualAddress(); } /* The byte offset combines the initializer-level offset with the start of this segment's first primitive. */ SystemParameters.RootConstants.IndexBufferOffsetInBytes = Initializer.IndexBufferOffset + IndexStride * Segment.FirstPrimitive * FD3D12RayTracingGeometry::IndicesPerPrimitive; SystemParameters.RootConstants.FirstPrimitive = Segment.FirstPrimitive; } HitGroupSystemParametersForThisGPU.Add(SystemParameters); } // Notify listeners about changes HitGroupParamatersUpdated(); } /* Produces the D3D12 build description for this BLAS on the context's GPU. OutGeometryDescs is filled from the cached template descs with the current vertex and index buffer addresses, and OutDesc receives the inputs, destination, scratch address and (for an update) the source, which is the same buffer as the destination. Asserts that the buffers are valid, that updates were allowed at creation, and that the queried prebuild sizes do not exceed SizeInfo. */ void FD3D12RayTracingGeometry::CreateAccelerationStructureBuildDesc(FD3D12CommandContext& CommandContext, EAccelerationStructureBuildMode BuildMode, D3D12_GPU_VIRTUAL_ADDRESS ScratchBufferAddress, D3D12_BUILD_RAYTRACING_ACCELERATION_STRUCTURE_DESC& OutDesc, TArrayView& OutGeometryDescs) const { if (Initializer.IndexBuffer) { checkf(Initializer.IndexBuffer->GetStride() == 2 || Initializer.IndexBuffer->GetStride() == 4, TEXT("Index buffer must be 16 or 32 bit.")); } const uint32 GPUIndex = CommandContext.GetGPUIndex(); const uint32 IndexStride = Initializer.IndexBuffer ? Initializer.IndexBuffer->GetStride() : 0; const bool bIsUpdate = BuildMode == EAccelerationStructureBuildMode::Update; // Use the pre-built descs as template and set the GPU resource pointers (current VB/IB).
check(OutGeometryDescs.Num() == GeometryDescs.Num()); checkf(BuffersValid(GPUIndex), TEXT("Index & vertex buffers should be valid (not streamed out) when building the acceleration structure")); FD3D12Buffer* IndexBuffer = CommandContext.RetrieveObject(Initializer.IndexBuffer.GetReference()); /* NOTE(review): NullTransformBufferD3D12 is not read again in the visible body of this function. */ FD3D12Buffer* NullTransformBufferD3D12 = CommandContext.RetrieveObject(NullTransformBuffer.GetReference()); const TArray& HitGroupSystemParametersForThisGPU = HitGroupSystemParameters[GPUIndex]; check(HitGroupSystemParametersForThisGPU.Num() == Initializer.Segments.Num()); ERayTracingAccelerationStructureFlags BuildFlags = GetRayTracingAccelerationStructureBuildFlags(Initializer); D3D12_RAYTRACING_GEOMETRY_TYPE GeometryType = TranslateRayTracingGeometryType(Initializer.GeometryType); for (int32 SegmentIndex = 0; SegmentIndex < Initializer.Segments.Num(); ++SegmentIndex) { D3D12_RAYTRACING_GEOMETRY_DESC& Desc = OutGeometryDescs[SegmentIndex]; Desc = GeometryDescs[SegmentIndex]; // Copy from template const FRayTracingGeometrySegment& Segment = Initializer.Segments[SegmentIndex]; const FD3D12HitGroupSystemParameters& SystemParameters = HitGroupSystemParametersForThisGPU[SegmentIndex]; FD3D12Buffer* VertexBuffer = CommandContext.RetrieveObject(Segment.VertexBuffer.GetReference()); switch (GeometryType) { case D3D12_RAYTRACING_GEOMETRY_TYPE_TRIANGLES: switch (Segment.VertexBufferElementType) { case VET_Float4: // While the DXGI_FORMAT_R32G32B32A32_FLOAT format is not supported by DXR, since we manually load vertex // data when we are building the BLAS, we can just rely on the vertex stride to offset the read index, // and read only the 3 vertex components, and so use the DXGI_FORMAT_R32G32B32_FLOAT vertex format case VET_Float3: check(Desc.Triangles.VertexFormat == DXGI_FORMAT_R32G32B32_FLOAT); break; case VET_Float2: check(Desc.Triangles.VertexFormat == DXGI_FORMAT_R32G32_FLOAT); break; case VET_Half2: check(Desc.Triangles.VertexFormat == DXGI_FORMAT_R16G16_FLOAT); break; default:
checkNoEntry(); break; } /* A disabled segment keeps its desc but is given an index count of zero. */ if (!Segment.bEnabled) { Desc.Triangles.IndexCount = 0; } checkf(Desc.Triangles.Transform3x4 == D3D12_GPU_VIRTUAL_ADDRESS(0), TEXT("BLAS geometry transforms are not supported!")); if (IndexBuffer) { check(Desc.Triangles.IndexCount <= Segment.NumPrimitives * FD3D12RayTracingGeometry::IndicesPerPrimitive); /* The template desc carries a conservative index format; the real one is chosen here from the index buffer's stride. */ Desc.Triangles.IndexFormat = (IndexStride == 4 ? DXGI_FORMAT_R32_UINT : DXGI_FORMAT_R16_UINT); Desc.Triangles.IndexBuffer = IndexBuffer->ResourceLocation.GetGPUVirtualAddress() + SystemParameters.RootConstants.IndexBufferOffsetInBytes; } else { // Non-indexed geometry checkf(Initializer.Segments.Num() == 1, TEXT("Non-indexed geometry with multiple segments is not implemented.")); check(Desc.Triangles.IndexFormat == DXGI_FORMAT_UNKNOWN); check(Desc.Triangles.IndexCount == 0); check(Desc.Triangles.IndexBuffer == D3D12_GPU_VIRTUAL_ADDRESS(0)); } Desc.Triangles.VertexBuffer.StartAddress = VertexBuffer->ResourceLocation.GetGPUVirtualAddress() + Segment.VertexBufferOffset; Desc.Triangles.VertexBuffer.StrideInBytes = Segment.VertexBufferStride; break; case D3D12_RAYTRACING_GEOMETRY_TYPE_PROCEDURAL_PRIMITIVE_AABBS: Desc.AABBs.AABBCount = Segment.NumPrimitives; Desc.AABBs.AABBs.StartAddress = VertexBuffer->ResourceLocation.GetGPUVirtualAddress() + Segment.VertexBufferOffset; Desc.AABBs.AABBs.StrideInBytes = Segment.VertexBufferStride; break; default: checkf(false, TEXT("Unexpected ray tracing geometry type")); break; } if (GeometryType == D3D12_RAYTRACING_GEOMETRY_TYPE_TRIANGLES) { // #dxr_todo UE-72160: support various vertex buffer layouts (fetch/decode based on vertex stride and format) checkf(Segment.VertexBufferElementType == VET_Float3 || Segment.VertexBufferElementType == VET_Float4, TEXT("Only VET_Float3 and Float4 are currently implemented and tested.
Other formats will be supported in the future.")); } } D3D12_RAYTRACING_ACCELERATION_STRUCTURE_BUILD_FLAGS LocalBuildFlags = TranslateRayTracingAccelerationStructureFlags(BuildFlags); if (bIsUpdate) { checkf(EnumHasAllFlags(BuildFlags, ERayTracingAccelerationStructureFlags::AllowUpdate), TEXT("Acceleration structure must be created with FRayTracingGeometryInitializer::bAllowUpdate=true to perform refit / update.")); LocalBuildFlags |= D3D12_RAYTRACING_ACCELERATION_STRUCTURE_BUILD_FLAG_PERFORM_UPDATE; } D3D12_BUILD_RAYTRACING_ACCELERATION_STRUCTURE_INPUTS PrebuildDescInputs = {}; PrebuildDescInputs.Type = D3D12_RAYTRACING_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL; PrebuildDescInputs.DescsLayout = D3D12_ELEMENTS_LAYOUT_ARRAY; PrebuildDescInputs.NumDescs = OutGeometryDescs.Num(); PrebuildDescInputs.pGeometryDescs = OutGeometryDescs.GetData(); PrebuildDescInputs.Flags = LocalBuildFlags; D3D12_RAYTRACING_ACCELERATION_STRUCTURE_PREBUILD_INFO PrebuildInfo = {}; CommandContext.GetParentDevice()->GetRaytracingAccelerationStructurePrebuildInfo(&PrebuildDescInputs, &PrebuildInfo); // Must make sure that values computed in the constructor are valid. check(PrebuildInfo.ResultDataMaxSizeInBytes <= SizeInfo.ResultSize); if (bIsUpdate) { check(PrebuildInfo.UpdateScratchDataSizeInBytes <= SizeInfo.UpdateScratchSize); } else { check(PrebuildInfo.ScratchDataSizeInBytes <= SizeInfo.BuildScratchSize); } D3D12_BUILD_RAYTRACING_ACCELERATION_STRUCTURE_DESC BuildDesc = {}; BuildDesc.Inputs = PrebuildDescInputs; BuildDesc.DestAccelerationStructureData = AccelerationStructureBuffers[GPUIndex]->ResourceLocation.GetGPUVirtualAddress(); BuildDesc.ScratchAccelerationStructureData = ScratchBufferAddress; /* An update reads from the same buffer it writes to; a full build has no source. */ BuildDesc.SourceAccelerationStructureData = bIsUpdate ?
AccelerationStructureBuffers[GPUIndex]->ResourceLocation.GetGPUVirtualAddress() : D3D12_GPU_VIRTUAL_ADDRESS(0); OutDesc = BuildDesc; } /* Returns true when the build flags contain both AllowCompaction and FastTrace and do not contain AllowUpdate. */ static bool ShouldCompactAfterBuild(ERayTracingAccelerationStructureFlags BuildFlags) { return EnumHasAllFlags(BuildFlags, ERayTracingAccelerationStructureFlags::AllowCompaction | ERayTracingAccelerationStructureFlags::FastTrace) && !EnumHasAnyFlags(BuildFlags, ERayTracingAccelerationStructureFlags::AllowUpdate); } /* Completes a pending compaction request for one GPU: allocates a new acceleration structure buffer of InSizeAfterCompaction bytes, records a COMPACT copy from the old buffer into it and moves the memory stats from the old buffer to the new one. The pending flag is cleared first; if the reported size is 0 the function raises an ensure and returns without changing the buffer. */ void FD3D12RayTracingGeometry::CompactAccelerationStructure(FD3D12CommandContext& CommandContext, uint32 InGPUIndex, uint64 InSizeAfterCompaction) { LLM_SCOPE_BYNAME(TEXT("FD3D12RT/CompactBLAS")); // Should have a pending request check(bHasPendingCompactionRequests[InGPUIndex]); bHasPendingCompactionRequests[InGPUIndex] = false; ensureMsgf(InSizeAfterCompaction > 0, TEXT("Compacted acceleration structure size is expected to be non-zero. This error suggests that GPU readback synchronization is broken.")); if (InSizeAfterCompaction == 0) { return; } DEC_MEMORY_STAT_BY(STAT_D3D12RayTracingUsedVideoMemory, AccelerationStructureBuffers[InGPUIndex]->GetSize()); DEC_MEMORY_STAT_BY(STAT_D3D12RayTracingBLASMemory, AccelerationStructureBuffers[InGPUIndex]->GetSize()); DEC_MEMORY_STAT_BY(STAT_D3D12RayTracingStaticBLASMemory, AccelerationStructureBuffers[InGPUIndex]->GetSize()); UnregisterD3D12RayTracingGeometry(this); // Move old AS into this temporary variable which gets released when this function returns TRefCountPtr OldAccelerationStructure = AccelerationStructureBuffers[InGPUIndex]; AccelerationStructureBuffers[InGPUIndex] = CreateRayTracingBuffer(CommandContext.GetParentAdapter(), InGPUIndex, InSizeAfterCompaction, ERayTracingBufferType::AccelerationStructure, DebugName); AccelerationStructureBuffers[InGPUIndex]->SetOwnerName(OwnerName); INC_MEMORY_STAT_BY(STAT_D3D12RayTracingUsedVideoMemory, AccelerationStructureBuffers[InGPUIndex]->GetSize()); INC_MEMORY_STAT_BY(STAT_D3D12RayTracingBLASMemory,
AccelerationStructureBuffers[InGPUIndex]->GetSize()); INC_MEMORY_STAT_BY(STAT_D3D12RayTracingStaticBLASMemory, AccelerationStructureBuffers[InGPUIndex]->GetSize()); CommandContext.UpdateResidency(OldAccelerationStructure->GetResource()); CommandContext.UpdateResidency(AccelerationStructureBuffers[InGPUIndex]->GetResource()); CommandContext.RayTracingCommandList()->CopyRaytracingAccelerationStructure( AccelerationStructureBuffers[InGPUIndex]->ResourceLocation.GetGPUVirtualAddress(), OldAccelerationStructure->ResourceLocation.GetGPUVirtualAddress(), D3D12_RAYTRACING_ACCELERATION_STRUCTURE_COPY_MODE_COMPACT ); AccelerationStructureCompactedSize = InSizeAfterCompaction; RegisterD3D12RayTracingGeometry(this); } FD3D12RayTracingScene::FD3D12RayTracingScene(FD3D12Adapter* Adapter, FRayTracingSceneInitializer InInitializer) : FD3D12AdapterChild(Adapter), Initializer(MoveTemp(InInitializer)) { INC_DWORD_STAT(STAT_D3D12RayTracingAllocatedTLAS); checkf(Initializer.Lifetime == RTSL_SingleFrame, TEXT("Only single-frame ray tracing scenes are currently implemented.")); // Get maximum buffer sizes for all GPUs in the system SizeInfo = RHICalcRayTracingSceneSize(Initializer); }; FD3D12RayTracingScene::~FD3D12RayTracingScene() { ReleaseBuffer(); DEC_DWORD_STAT(STAT_D3D12RayTracingAllocatedTLAS); } void FD3D12RayTracingScene::ReleaseBuffer() { for (auto& AccelerationStructureBuffer : AccelerationStructureBuffers) { if (AccelerationStructureBuffer) { DEC_MEMORY_STAT_BY(STAT_D3D12RayTracingUsedVideoMemory, AccelerationStructureBuffer->GetSize()); DEC_MEMORY_STAT_BY(STAT_D3D12RayTracingTLASMemory, AccelerationStructureBuffer->GetSize()); } AccelerationStructureBuffer = nullptr; } } void FD3D12RayTracingScene::BindBuffer(FRHIBuffer* InBuffer, uint32 InBufferOffset) { check(SizeInfo.ResultSize + InBufferOffset <= InBuffer->GetSize()); for (uint32 GPUIndex = 0; GPUIndex < GNumExplicitGPUsForRendering; ++GPUIndex) { if (AccelerationStructureBuffers[GPUIndex]) { 
// Remainder of FD3D12RayTracingScene::BindBuffer (its header is on the preceding line).
			// A buffer is already bound for this GPU: take its size out of the stats before it is replaced.
			DEC_MEMORY_STAT_BY(STAT_D3D12RayTracingUsedVideoMemory, AccelerationStructureBuffers[GPUIndex]->GetSize());
			DEC_MEMORY_STAT_BY(STAT_D3D12RayTracingTLASMemory, AccelerationStructureBuffers[GPUIndex]->GetSize());
		}

		// Bind the per-GPU object behind InBuffer and account for its size.
		AccelerationStructureBuffers[GPUIndex] = FD3D12CommandContext::RetrieveObject(InBuffer, GPUIndex);

		INC_MEMORY_STAT_BY(STAT_D3D12RayTracingUsedVideoMemory, AccelerationStructureBuffers[GPUIndex]->GetSize());
		INC_MEMORY_STAT_BY(STAT_D3D12RayTracingTLASMemory, AccelerationStructureBuffers[GPUIndex]->GetSize());
	}

	BufferOffset = InBufferOffset;
}

/**
 * Validates the inputs of a top-level acceleration structure build (or update) and fills OutBuildDesc for it.
 * The build command itself is not recorded here; this function only produces the description.
 *
 * Side effects visible in this function:
 *  - on a full build, Scene.NumInstances is set to NumInstances (an update requires them to already match);
 *  - when ScratchBuffer is null, a scratch buffer is allocated and used with offset 0;
 *  - residency is updated for the TLAS buffer, the instance buffer, the scratch buffer and every referenced geometry;
 *  - Scene.ResourcesToMakeResident for the context's GPU is rebuilt;
 *  - the scene validation shader is dispatched when ShouldRunRayTracingGPUValidation() is true;
 *  - the built/updated TLAS stat counter is incremented.
 *
 * @param CommandContext			Context providing the GPU index, adapter, device and residency tracking.
 * @param Scene						Scene to build; its buffer must already be bound (checked below).
 * @param ScratchBuffer				Scratch memory, or nullptr to have one allocated here.
 * @param ScratchBufferOffset		Byte offset into ScratchBuffer; overwritten with 0 when the buffer is auto-allocated.
 * @param InstanceBuffer			Buffer holding the instance descriptors; must not be null.
 * @param InstanceBufferOffset		Byte offset of the first instance descriptor.
 * @param NumInstances				Number of instances; must not exceed Scene.Initializer.MaxNumInstances.
 * @param BuildMode					EAccelerationStructureBuildMode::Update selects an in-place update, anything else a full build.
 * @param OutBuildDesc				Receives the D3D12 build description.
 */
void PrepareAccelerationStructureBuild(
	FD3D12CommandContext& CommandContext,
	FD3D12RayTracingScene& Scene,
	FD3D12Buffer* ScratchBuffer, uint32 ScratchBufferOffset,
	FD3D12Buffer* InstanceBuffer, uint32 InstanceBufferOffset,
	uint32 NumInstances,
	EAccelerationStructureBuildMode BuildMode,
	D3D12_BUILD_RAYTRACING_ACCELERATION_STRUCTURE_DESC& OutBuildDesc)
{
	TRACE_CPUPROFILER_EVENT_SCOPE(PrepareAccelerationStructureBuild_TopLevel);

	check(InstanceBuffer != nullptr);
	checkf(NumInstances <= Scene.Initializer.MaxNumInstances, TEXT("NumInstances must be less or equal to MaxNumInstances"));

	const bool bIsUpdate = BuildMode == EAccelerationStructureBuildMode::Update;

	if (bIsUpdate)
	{
		checkf(NumInstances == Scene.NumInstances, TEXT("Number of instances used to update TLAS must match the number used to build."));
	}
	else
	{
		// A full build records the instance count that later updates are checked against.
		Scene.NumInstances = NumInstances;
	}

	const uint32 GPUIndex = CommandContext.GetGPUIndex();
	FD3D12Adapter* Adapter = CommandContext.GetParentAdapter();

	// Fallback scratch memory for callers that did not supply any.
	// NOTE(review): this local reference is dropped when the function returns, while the buffer's GPU address
	// is stored in OutBuildDesc - presumably buffer destruction is deferred until the GPU is done; confirm.
	TRefCountPtr AutoScratchBuffer;
	if (ScratchBuffer == nullptr)
	{
		const uint64 ScratchBufferSize = bIsUpdate ? Scene.SizeInfo.UpdateScratchSize : Scene.SizeInfo.BuildScratchSize;

		static const FName ScratchBufferName("AutoBuildScratchTLAS");
		AutoScratchBuffer = CreateRayTracingBuffer(Adapter, GPUIndex, ScratchBufferSize, ERayTracingBufferType::Scratch, ScratchBufferName);
		ScratchBuffer = AutoScratchBuffer.GetReference();
		ScratchBufferOffset = 0;
	}

	if (bIsUpdate)
	{
		checkf(ScratchBuffer, TEXT("TLAS update requires scratch buffer of at least %lld bytes."), Scene.SizeInfo.UpdateScratchSize);
	}
	else
	{
		checkf(ScratchBuffer, TEXT("TLAS build requires scratch buffer of at least %lld bytes."), Scene.SizeInfo.BuildScratchSize);
	}

	{
		// Query the sizes D3D12 requires for this instance count and verify that the sizes stored in
		// Scene.SizeInfo, and the scratch buffer actually provided, are large enough.
		// BuildInputs is only used for this size query; the real inputs are written to OutBuildDesc further down.
		D3D12_BUILD_RAYTRACING_ACCELERATION_STRUCTURE_INPUTS BuildInputs;
		BuildInputs = {};
		BuildInputs.Type = D3D12_RAYTRACING_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL;
		BuildInputs.DescsLayout = D3D12_ELEMENTS_LAYOUT_ARRAY;
		BuildInputs.NumDescs = NumInstances;
		BuildInputs.Flags = TranslateRayTracingAccelerationStructureFlags(Scene.Initializer.BuildFlags);

		D3D12_RAYTRACING_ACCELERATION_STRUCTURE_PREBUILD_INFO PrebuildInfo = {};
		CommandContext.GetParentDevice()->GetRaytracingAccelerationStructurePrebuildInfo(&BuildInputs, &PrebuildInfo);

		checkf(PrebuildInfo.ResultDataMaxSizeInBytes <= Scene.SizeInfo.ResultSize,
			TEXT("TLAS build result buffer now requires %lld bytes, but only %lld was calculated in the constructor."),
			PrebuildInfo.ResultDataMaxSizeInBytes, Scene.SizeInfo.ResultSize);

		checkf(PrebuildInfo.ScratchDataSizeInBytes <= Scene.SizeInfo.BuildScratchSize,
			TEXT("TLAS build scratch buffer now requires %lld bytes, but only %lld was calculated in the constructor."),
			PrebuildInfo.ScratchDataSizeInBytes, Scene.SizeInfo.BuildScratchSize);

		checkf(PrebuildInfo.UpdateScratchDataSizeInBytes <= Scene.SizeInfo.UpdateScratchSize,
			TEXT("TLAS update scratch buffer now requires %lld bytes, but only %lld was calculated in the constructor."),
			PrebuildInfo.UpdateScratchDataSizeInBytes, Scene.SizeInfo.UpdateScratchSize);

		if (bIsUpdate)
		{
			checkf(ScratchBufferOffset + PrebuildInfo.UpdateScratchDataSizeInBytes <= ScratchBuffer->GetSize(),
				TEXT("TLAS scratch buffer size is %d bytes with offset %d (%d bytes available), but the update requires %lld bytes (NumInstances = %d)."),
				ScratchBuffer->GetSize(), ScratchBufferOffset, ScratchBuffer->GetSize() - ScratchBufferOffset,
				PrebuildInfo.UpdateScratchDataSizeInBytes, NumInstances);
		}
		else
		{
			checkf(ScratchBufferOffset + PrebuildInfo.ScratchDataSizeInBytes <= ScratchBuffer->GetSize(),
				TEXT("TLAS scratch buffer size is %d bytes with offset %d (%d bytes available), but the build requires %lld bytes (NumInstances = %d)."),
				ScratchBuffer->GetSize(), ScratchBufferOffset, ScratchBuffer->GetSize() - ScratchBufferOffset,
				PrebuildInfo.ScratchDataSizeInBytes, NumInstances);
		}
	}

	// Make necessary resources resident
	TRefCountPtr& AccelerationStructureBuffer = Scene.AccelerationStructureBuffers[GPUIndex];
	checkf(AccelerationStructureBuffer.IsValid(),
		TEXT("Acceleration structure buffer must be set for this scene using RHIBindAccelerationStructureMemory() before build command is issued."));

	CommandContext.UpdateResidency(AccelerationStructureBuffer->GetResource());
	CommandContext.UpdateResidency(InstanceBuffer->GetResource());
	CommandContext.UpdateResidency(ScratchBuffer->GetResource());

	{
		// at the same time also gather resources that need to be resident when using TLAS
		// (the list is rebuilt from scratch here and replayed by FD3D12RayTracingScene::UpdateResidency)
		TArray& ResourcesToMakeResidentForThisGPU = Scene.ResourcesToMakeResident[GPUIndex];
		ResourcesToMakeResidentForThisGPU.Reset(0);

		// Residency handles already covered by a resource in the list above.
		Experimental::TSherwoodSet UniqueResidencyHandles;

		// Adds Resource to the list unless every initialized residency handle it has is already covered.
		// The body compiles to nothing when ENABLE_RESIDENCY_MANAGEMENT is 0.
		auto AddResidencyHandleForResource = [&UniqueResidencyHandles, &ResourcesToMakeResidentForThisGPU] (FD3D12Resource* Resource)
		{
		#if ENABLE_RESIDENCY_MANAGEMENT
			bool bShouldTrackResidency = false;

			if (Resource->NeedsDeferredResidencyUpdate())
			{
				// Resources whose residency handles might change dynamically must always be tracked
				bShouldTrackResidency = true;
			}
			else
			{
				// Resources that share *all* residency handles with what's already tracked don't need to be tracked separately.
				// Every handle is still added to the set (no early out), so later resources see all of them.
				for (FD3D12ResidencyHandle* ResidencyHandle : Resource->GetResidencyHandles())
				{
					if (D3DX12Residency::IsInitialized(ResidencyHandle))
					{
						bool bIsAlreadyInSet = false;
						UniqueResidencyHandles.Add(ResidencyHandle, &bIsAlreadyInSet);
						if (!bIsAlreadyInSet)
						{
							bShouldTrackResidency = true;
						}
					}
				}
			}

			if (bShouldTrackResidency)
			{
				ResourcesToMakeResidentForThisGPU.Add(Resource);
			}
		#endif // ENABLE_RESIDENCY_MANAGEMENT
		};

		const int32 NumReferencedGeometries = Scene.ReferencedGeometries.Num();

		for (int32 Index = 0; Index < NumReferencedGeometries; ++Index)
		{
			FD3D12RayTracingGeometry* Geometry = FD3D12DynamicRHI::ResourceCast(Scene.ReferencedGeometries[Index].GetReference());

			checkf(!Geometry->IsDirty(CommandContext.GetGPUIndex()),
				TEXT("Acceleration structures for all geometries must be built before building the top level acceleration structure for the scene."));

			// The BLAS is needed both by the build recorded on this context and by later users of the TLAS.
			CommandContext.UpdateResidency(Geometry->AccelerationStructureBuffers[GPUIndex]->ResourceLocation.GetResource());
			AddResidencyHandleForResource(Geometry->AccelerationStructureBuffers[GPUIndex]->GetResource());

			// Index and vertex buffers are only gathered when one of these two capabilities is reported.
			if (GRHIGlobals.RayTracing.SupportsShaders || GRHIGlobals.RayTracing.RequiresInlineRayTracingSBT)
			{
				checkf(Geometry->BuffersValid(CommandContext.GetGPUIndex()),
					TEXT("Index & vertex buffers for all geometries must be valid (streamed in) when adding geometry to the top level acceleration structure for the scene"));

				if (Geometry->Initializer.IndexBuffer)
				{
					FD3D12Buffer* IndexBuffer = CommandContext.RetrieveObject(Geometry->Initializer.IndexBuffer.GetReference());
					AddResidencyHandleForResource(IndexBuffer->GetResource());
				}

				for (const FRayTracingGeometrySegment& Segment : Geometry->Initializer.Segments)
				{
					if (Segment.VertexBuffer)
					{
						FD3D12Buffer* VertexBuffer = CommandContext.RetrieveObject(Segment.VertexBuffer.GetReference());
						AddResidencyHandleForResource(VertexBuffer->GetResource());
					}
				}
			}
		}
	}

	if (ShouldRunRayTracingGPUValidation())
	{
		RHI_BREADCRUMB_EVENT(CommandContext, "RTSceneValidation");

		TRHICommandList_RecursiveHazardous RHICmdList(&CommandContext);

		uint32 InstanceBufferStride = GRHIRayTracingInstanceDescriptorSize;
PRAGMA_DISABLE_DEPRECATION_WARNINGS
		// TODO: Validation related to SBT needs to be done somewhere else since SBT is not known when in BuildAccelerationStructure
		uint32 TotalHitGroupSlots = Scene.Initializer.NumTotalSegments;
PRAGMA_ENABLE_DEPRECATION_WARNINGS
		FRayTracingValidateSceneBuildParamsCS::Dispatch(RHICmdList, TotalHitGroupSlots, NumInstances, InstanceBuffer, InstanceBufferOffset, InstanceBufferStride);
	}

	{
		// Final GPU addresses: the TLAS lives at Scene.BufferOffset inside the bound buffer.
		const D3D12_GPU_VIRTUAL_ADDRESS BufferAddress = AccelerationStructureBuffer->ResourceLocation.GetGPUVirtualAddress() + Scene.BufferOffset;
		D3D12_GPU_VIRTUAL_ADDRESS ScratchAddress = ScratchBuffer->ResourceLocation.GetGPUVirtualAddress() + ScratchBufferOffset;

		checkf(BufferAddress % GRHIRayTracingAccelerationStructureAlignment == 0,
			TEXT("TLAS buffer (plus offset) must be aligned to %lld bytes."),
			GRHIRayTracingAccelerationStructureAlignment);

		checkf(ScratchAddress % GRHIRayTracingScratchBufferAlignment == 0,
			TEXT("TLAS scratch buffer (plus offset) must be aligned to %lld bytes."),
			GRHIRayTracingScratchBufferAlignment);

		OutBuildDesc.Inputs = {};
		OutBuildDesc.Inputs.Type = D3D12_RAYTRACING_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL;
		OutBuildDesc.Inputs.DescsLayout = D3D12_ELEMENTS_LAYOUT_ARRAY;
		OutBuildDesc.Inputs.NumDescs = NumInstances;
		OutBuildDesc.Inputs.InstanceDescs = InstanceBuffer->ResourceLocation.GetGPUVirtualAddress() + InstanceBufferOffset;
		OutBuildDesc.Inputs.Flags = TranslateRayTracingAccelerationStructureFlags(Scene.Initializer.BuildFlags);

		if (bIsUpdate)
		{
			checkf(EnumHasAllFlags(Scene.Initializer.BuildFlags, ERayTracingAccelerationStructureFlags::AllowUpdate),
				TEXT("Acceleration structure must be created with FRayTracingGeometryInitializer::bAllowUpdate=true to perform refit / update."));

			OutBuildDesc.Inputs.Flags |= D3D12_RAYTRACING_ACCELERATION_STRUCTURE_BUILD_FLAG_PERFORM_UPDATE;
		}

		// An update is performed in place: source and destination are the same address.
		OutBuildDesc.DestAccelerationStructureData = BufferAddress;
		OutBuildDesc.ScratchAccelerationStructureData = ScratchAddress;
		OutBuildDesc.SourceAccelerationStructureData = bIsUpdate ? BufferAddress : D3D12_GPU_VIRTUAL_ADDRESS(0);

		if (bIsUpdate)
		{
			INC_DWORD_STAT(STAT_D3D12RayTracingUpdatedTLAS);
		}
		else
		{
			INC_DWORD_STAT(STAT_D3D12RayTracingBuiltTLAS);
		}
	}
}

/**
 * Updates residency on CommandContext for the scene's acceleration structure buffer and for every resource
 * stored in ResourcesToMakeResident for the context's GPU.
 * Does nothing when ENABLE_RESIDENCY_MANAGEMENT is 0.
 */
void FD3D12RayTracingScene::UpdateResidency(FD3D12CommandContext& CommandContext) const
{
#if ENABLE_RESIDENCY_MANAGEMENT

	const uint32 GPUIndex = CommandContext.GetGPUIndex();
	CommandContext.UpdateResidency(AccelerationStructureBuffers[GPUIndex]->GetResource());

	for (const FD3D12Resource* Resource : ResourcesToMakeResident[GPUIndex])
	{
		CommandContext.UpdateResidency(Resource);
	}

#endif // ENABLE_RESIDENCY_MANAGEMENT
}

/** Records one BuildRaytracingAccelerationStructure call per description, in order, without requesting post-build info. */
void FD3D12CommandContext::BuildAccelerationStructuresInternal(TConstArrayView BuildDescs)
{
	for (const D3D12_BUILD_RAYTRACING_ACCELERATION_STRUCTURE_DESC& Desc : BuildDescs)
	{
		GraphicsCommandList4()->BuildRaytracingAccelerationStructure(&Desc, 0, nullptr);
	}
}

#if WITH_MGPU
void FD3D12CommandContext::UnregisterAccelerationStructuresInternalMGPU(TConstArrayView Params, FRHIGPUMask GPUMask)
{
	// We need to unregister rename listeners for all GPUs in a separate pass before running "RHIBuildAccelerationStructures", as the build process
	// may modify the buffer references in the ray tracing geometry. This leads to an assert where the code attempts to unregister the newer buffer
	// references on the additional GPUs, rather than the original buffer references. It's OK to unregister redundantly, as a flag is set to track
	// whether a buffer is registered, and additional unregister calls do nothing.
for (uint32 GPUIndex : GPUMask) { for (const FRayTracingGeometryBuildParams& P : Params) { FD3D12RayTracingGeometry* Geometry = FD3D12DynamicRHI::ResourceCast(P.Geometry.GetReference()); Geometry->UnregisterAsRenameListener(GPUIndex); } } } #endif // WITH_MGPU void FD3D12CommandContext::RHIBuildAccelerationStructures(TConstArrayView Params, const FRHIBufferRange& ScratchBufferRange) { TRACE_CPUPROFILER_EVENT_SCOPE(BuildAccelerationStructure_BottomLevel); SCOPE_CYCLE_COUNTER(STAT_D3D12BuildBLAS); LLM_SCOPE_BYNAME(TEXT("FD3D12RT/BLAS")); checkf(ScratchBufferRange.Buffer != nullptr, TEXT("BuildAccelerationStructures requires valid scratch buffer")); // Update geometry vertex buffers for (const FRayTracingGeometryBuildParams& P : Params) { FD3D12RayTracingGeometry* Geometry = FD3D12DynamicRHI::ResourceCast(P.Geometry.GetReference()); Geometry->UnregisterAsRenameListener(GetGPUIndex()); if (P.Segments.Num()) { checkf(P.Segments.Num() == Geometry->Initializer.Segments.Num(), TEXT("If updated segments are provided, they must exactly match existing geometry segments. 
Only vertex buffer bindings may change.")); for (int32 i = 0; i < P.Segments.Num(); ++i) { checkf(P.Segments[i].MaxVertices <= Geometry->Initializer.Segments[i].MaxVertices, TEXT("Maximum number of vertices in a segment (%u) must not be larger than what was declared during FRHIRayTracingGeometry creation (%u), as this controls BLAS memory allocation."), P.Segments[i].MaxVertices, Geometry->Initializer.Segments[i].MaxVertices ); Geometry->Initializer.Segments[i].VertexBuffer = P.Segments[i].VertexBuffer; Geometry->Initializer.Segments[i].VertexBufferElementType = P.Segments[i].VertexBufferElementType; Geometry->Initializer.Segments[i].VertexBufferStride = P.Segments[i].VertexBufferStride; Geometry->Initializer.Segments[i].VertexBufferOffset = P.Segments[i].VertexBufferOffset; } } } FlushResourceBarriers(); const uint32 GPUIndex = GetGPUIndex(); // Then do all work TArray> BuildDescs; BuildDescs.Reserve(Params.Num()); uint32 ScratchBufferSize = ScratchBufferRange.Size ? ScratchBufferRange.Size : ScratchBufferRange.Buffer->GetSize(); checkf(ScratchBufferSize + ScratchBufferRange.Offset <= ScratchBufferRange.Buffer->GetSize(), TEXT("BLAS scratch buffer range size is %lld bytes with offset %lld, but the buffer only has %lld bytes. 
"), ScratchBufferRange.Size, ScratchBufferRange.Offset, ScratchBufferRange.Buffer->GetSize()); const uint64 ScratchAlignment = GRHIRayTracingAccelerationStructureAlignment; FD3D12Buffer* ScratchBuffer = FD3D12DynamicRHI::ResourceCast(ScratchBufferRange.Buffer, GPUIndex); uint32 ScratchBufferOffset = ScratchBufferRange.Offset; UpdateResidency(ScratchBuffer->GetResource()); FMemMark Mark(FMemStack::Get()); for (int32 i = 0; i < Params.Num(); i++) { const FRayTracingGeometryBuildParams& P = Params[i]; FD3D12RayTracingGeometry* Geometry = FD3D12DynamicRHI::ResourceCast(P.Geometry.GetReference()); Geometry->SetDirty(GetGPUMask(), true); // Register as rename listener to index/vertex buffers Geometry->UnregisterAsRenameListener(GPUIndex); Geometry->RegisterAsRenameListener(GPUIndex); // Recreate the hit group system parameters and use them during setup of the descs Geometry->SetupHitGroupSystemParameters(GPUIndex); if (Geometry->IsDirty(GPUIndex)) { uint64 ScratchBufferRequiredSize = P.BuildMode == EAccelerationStructureBuildMode::Update ? Geometry->SizeInfo.UpdateScratchSize : Geometry->SizeInfo.BuildScratchSize; checkf(ScratchBufferRequiredSize + ScratchBufferOffset <= ScratchBufferSize, TEXT("BLAS scratch buffer size is %lld bytes with offset %lld (%lld bytes available), but the build requires %lld bytes. "), ScratchBufferSize, ScratchBufferOffset, ScratchBufferSize - ScratchBufferOffset, ScratchBufferRequiredSize); D3D12_GPU_VIRTUAL_ADDRESS ScratchBufferAddress = ScratchBuffer->ResourceLocation.GetGPUVirtualAddress() + ScratchBufferOffset; ScratchBufferOffset = Align(ScratchBufferOffset + ScratchBufferRequiredSize, ScratchAlignment); checkf(ScratchBufferAddress % GRHIRayTracingAccelerationStructureAlignment == 0, TEXT("BLAS scratch buffer (plus offset) must be aligned to %lld bytes."), GRHIRayTracingAccelerationStructureAlignment); // We need to keep D3D12_RAYTRACING_GEOMETRY_DESCs that are used in D3D12_BUILD_RAYTRACING_ACCELERATION_STRUCTURE_DESC alive. 
const uint32 NumGeometryDescs = Geometry->GeometryDescs.Num(); D3D12_RAYTRACING_GEOMETRY_DESC* LocalGeometryDescsMemory = (D3D12_RAYTRACING_GEOMETRY_DESC*)FMemStack::Get().Alloc(NumGeometryDescs * sizeof(D3D12_RAYTRACING_GEOMETRY_DESC), alignof(D3D12_RAYTRACING_GEOMETRY_DESC)); TArrayView LocalGeometryDescs = MakeArrayView(LocalGeometryDescsMemory, NumGeometryDescs); D3D12_BUILD_RAYTRACING_ACCELERATION_STRUCTURE_DESC& BuildDesc = BuildDescs.AddZeroed_GetRef(); Geometry->CreateAccelerationStructureBuildDesc(*this, P.BuildMode, ScratchBufferAddress, BuildDesc, LocalGeometryDescs); Geometry->UpdateResidency(*this); if (P.BuildMode == EAccelerationStructureBuildMode::Update) { INC_DWORD_STAT(STAT_D3D12RayTracingUpdatedBLAS); } else { INC_DWORD_STAT(STAT_D3D12RayTracingBuiltBLAS); } } } if (ShouldRunRayTracingGPUValidation()) { RHI_BREADCRUMB_EVENT(*this, "RTGeometryValidation"); TRHICommandList_RecursiveHazardous RHICmdList(this); for (const FRayTracingGeometryBuildParams& P : Params) { FRayTracingValidateGeometryBuildParamsCS::Dispatch(RHICmdList, P); } } BuildAccelerationStructuresInternal(BuildDescs); for (const FRayTracingGeometryBuildParams& P : Params) { FD3D12RayTracingGeometry* Geometry = FD3D12DynamicRHI::ResourceCast(P.Geometry.GetReference()); if (Geometry->IsDirty(GPUIndex)) { ERayTracingAccelerationStructureFlags GeometryBuildFlags = GetRayTracingAccelerationStructureBuildFlags(Geometry->Initializer); if (ShouldCompactAfterBuild(GeometryBuildFlags)) { GetParentDevice()->GetRayTracingCompactionRequestHandler()->RequestCompact(Geometry); Geometry->bHasPendingCompactionRequests[GPUIndex] = true; } Geometry->SetDirty(GetGPUMask(), false); } } // Add a UAV barrier after each acceleration structure build batch. // This is required because there are currently no explicit read/write barriers // for acceleration structures, but we need to ensure that all commands // are complete before BLAS is used again on the GPU. 
AddUAVBarrier(); } void FD3D12CommandContext::RHIBuildAccelerationStructures(TConstArrayView Params) { TRACE_CPUPROFILER_EVENT_SCOPE(BuildAccelerationStructures_TopLevel); SCOPE_CYCLE_COUNTER(STAT_D3D12BuildTLAS); TArray> BuildDescs; BuildDescs.Reserve(Params.Num()); for (const FRayTracingSceneBuildParams& SceneBuildParams : Params) { FD3D12RayTracingScene* Scene = FD3D12DynamicRHI::ResourceCast(SceneBuildParams.Scene); FD3D12Buffer* ScratchBuffer = RetrieveObject(SceneBuildParams.ScratchBuffer); FD3D12Buffer* InstanceBuffer = RetrieveObject(SceneBuildParams.InstanceBuffer); Scene->ReferencedGeometries.Reserve(SceneBuildParams.ReferencedGeometries.Num()); for (FRHIRayTracingGeometry* ReferencedGeometry : SceneBuildParams.ReferencedGeometries) { Scene->ReferencedGeometries.Add(ReferencedGeometry); } D3D12_BUILD_RAYTRACING_ACCELERATION_STRUCTURE_DESC& BuildDesc = BuildDescs.AddDefaulted_GetRef(); PrepareAccelerationStructureBuild( *this, *Scene, ScratchBuffer, SceneBuildParams.ScratchBufferOffset, InstanceBuffer, SceneBuildParams.InstanceBufferOffset, SceneBuildParams.NumInstances, SceneBuildParams.BuildMode, BuildDesc ); } // UAV barrier is used here to ensure that all bottom level acceleration structures are built AddUAVBarrier(); FlushResourceBarriers(); BuildAccelerationStructuresInternal(BuildDescs); // UAV barrier is used here to ensure that the acceleration structure build is complete before any rays are traced // #dxr_todo: these barriers should ideally be inserted by the high level code to allow more overlapped execution AddUAVBarrier(); for (const FRayTracingSceneBuildParams& SceneBuildParams : Params) { FD3D12RayTracingScene* Scene = FD3D12DynamicRHI::ResourceCast(SceneBuildParams.Scene); FD3D12Buffer* ScratchBuffer = RetrieveObject(SceneBuildParams.ScratchBuffer); FD3D12Buffer* InstanceBuffer = RetrieveObject(SceneBuildParams.InstanceBuffer); Scene->bBuilt = true; #if D3D12_RHI_SUPPORT_RAYTRACING_SCENE_DEBUGGING D3D12RayTracingSceneDebugUpdate(*Scene, 
InstanceBuffer, SceneBuildParams.InstanceBufferOffset, *this); #endif // D3D12_RHI_SUPPORT_RAYTRACING_SCENE_DEBUGGING } } void FD3D12CommandContext::RHIBindAccelerationStructureMemory(FRHIRayTracingScene* InScene, FRHIBuffer* InBuffer, uint32 InBufferOffset) { FD3D12RayTracingScene* Scene = FD3D12DynamicRHI::ResourceCast(InScene); Scene->BindBuffer(InBuffer, InBufferOffset); } void FD3D12CommandContext::RHICommitShaderBindingTable(FRHIShaderBindingTable* InSBT, FRHIBuffer* InlineBindingDataBuffer) { FD3D12RayTracingShaderBindingTable* SBT = FD3D12DynamicRHI::ResourceCast(InSBT); check(SBT); FD3D12RayTracingShaderBindingTableInternal* ShaderTableForDevice = SBT->GetTableForDevice(GetParentDevice()); if (ShaderTableForDevice->bIsDirty) { ShaderTableForDevice->Commit(*this, InlineBindingDataBuffer); } } void FD3D12CommandContext::RHIClearShaderBindingTable(FRHIShaderBindingTable* InSBT) { FD3D12RayTracingShaderBindingTable* SBT = FD3D12DynamicRHI::ResourceCast(InSBT); check(SBT); SBT->ReleaseForDevice(GetParentDevice()); } static constexpr uint32 LooseParameterCBVIndex = 0; // Global uniform buffer is always assumed to be in slot 0 struct FD3D12RayTracingGlobalResourceBinder { FD3D12RayTracingGlobalResourceBinder(FD3D12CommandContext& InCommandContext, FD3D12ExplicitDescriptorCache& InDescriptorCache) : CommandContext(InCommandContext) , DescriptorCache(InDescriptorCache) { } void SetUniformBuffer(uint32 BaseSlotIndex, uint32 DescriptorIndex, FD3D12UniformBuffer* UniformBuffer) { // don't have to do anything } void SetRootCBV(uint32 BaseSlotIndex, uint32 DescriptorIndex, D3D12_GPU_VIRTUAL_ADDRESS Address) { CommandContext.GraphicsCommandList()->SetComputeRootConstantBufferView(BaseSlotIndex + DescriptorIndex, Address); } void SetRootSRV(uint32 BaseSlotIndex, uint32 DescriptorIndex, D3D12_GPU_VIRTUAL_ADDRESS Address) { CommandContext.GraphicsCommandList()->SetComputeRootShaderResourceView(BaseSlotIndex + DescriptorIndex, Address); } void SetRootDescriptorTable(uint32 
SlotIndex, D3D12_GPU_DESCRIPTOR_HANDLE DescriptorTable) { CommandContext.GraphicsCommandList()->SetComputeRootDescriptorTable(SlotIndex, DescriptorTable); } FD3D12ConstantBufferView* SetLooseParameterData(const void* Data, uint32 DataSize, D3D12_GPU_VIRTUAL_ADDRESS& OutGPUVirtualAddress) { checkf(0, TEXT("Loose parameters are not implemented for global ray tracing shaders (raygen, miss, callable)")); return nullptr; } void AddReferencedShaderResource(FD3D12BaseShaderResource* ShaderResource) { CommandContext.UpdateResidency(ShaderResource->GetResource()); } void AddReferencedTexture(FRHITexture* RHITexture) { FD3D12Texture* Texture = FD3D12CommandContext::RetrieveTexture(RHITexture, CommandContext.GetGPUIndex()); CommandContext.UpdateResidency(Texture->ResourceLocation.GetResource()); } void AddReferencedUniformBuffer(uint32 BaseSlotIndex, uint32 DescriptorIndex, FD3D12UniformBuffer* UniformBuffer) { CommandContext.UpdateResidency(UniformBuffer->ResourceLocation.GetResource()); } void AddRayTracingSceneReference(FD3D12RayTracingScene* D3D12RayTracingScene) { D3D12RayTracingScene->UpdateResidency(CommandContext); } FD3D12Device* GetDevice() { return CommandContext.GetParentDevice(); } #if ENABLE_RHI_VALIDATION RHIValidation::FTracker* GetValidationTracker() { return CommandContext.Tracker; } #endif FD3D12CommandContext& CommandContext; FD3D12ExplicitDescriptorCache& DescriptorCache; static constexpr uint32 WorkerIndex = 0; }; struct FD3D12RayTracingLocalResourceBinder { FD3D12RayTracingLocalResourceBinder( FD3D12Device& InDevice, FD3D12RayTracingShaderBindingTableInternal& InShaderTable, const FD3D12RootSignature& InRootSignature, ERayTracingLocalShaderBindingType InBindingType, uint32 InRecordIndex, uint32 InWorkerIndex, ERayTracingBindingType RTBindingType) : Device(InDevice) , ShaderTable(InShaderTable) , DescriptorCache(*InShaderTable.DescriptorCache) , RootSignature(InRootSignature) , BindingType(InBindingType) , RecordIndex(InRecordIndex) , 
WorkerIndex(InWorkerIndex) { check(InShaderTable.DescriptorCache != nullptr); check(WorkerIndex < InShaderTable.MaxBindingWorkers); check(WorkerIndex < uint32(DescriptorCache.WorkerData.Num())); check(RecordIndex != ~0u); switch (RTBindingType) { case ERayTracingBindingType::CallableShader: ShaderTableOffset = InShaderTable.CallableShaderTableOffset; break; case ERayTracingBindingType::HitGroup: ShaderTableOffset = InShaderTable.HitGroupShaderTableOffset; break; case ERayTracingBindingType::MissShader: ShaderTableOffset = InShaderTable.MissShaderTableOffset; break; default: checkNoEntry(); } } uint32 ComputeOffsetWithinRootSignature(uint32 BaseSlotIndex, uint32 DescriptorIndex) { const uint32 BindOffsetBase = RootSignature.GetBindSlotOffsetInBytes(BaseSlotIndex); const uint32 DescriptorSize = uint32(sizeof(D3D12_GPU_VIRTUAL_ADDRESS)); return BindOffsetBase + DescriptorIndex * DescriptorSize; } void SetRootDescriptor(uint32 BaseSlotIndex, uint32 DescriptorIndex, D3D12_GPU_VIRTUAL_ADDRESS Address) { const uint32 OffsetWithinRootSignature = ComputeOffsetWithinRootSignature(BaseSlotIndex, DescriptorIndex); if (BindingType == ERayTracingLocalShaderBindingType::Validation) { ShaderTable.CompareLocalShaderParameters(ShaderTableOffset, RecordIndex, OffsetWithinRootSignature, Address); } else { ShaderTable.SetLocalShaderParameters(ShaderTableOffset, RecordIndex, OffsetWithinRootSignature, Address); } } void SetRootCBV(uint32 BaseSlotIndex, uint32 DescriptorIndex, D3D12_GPU_VIRTUAL_ADDRESS Address) { SetRootDescriptor(BaseSlotIndex, DescriptorIndex, Address); } void SetRootSRV(uint32 BaseSlotIndex, uint32 DescriptorIndex, D3D12_GPU_VIRTUAL_ADDRESS Address) { SetRootDescriptor(BaseSlotIndex, DescriptorIndex, Address); } void SetRootDescriptorTable(uint32 SlotIndex, D3D12_GPU_DESCRIPTOR_HANDLE DescriptorTable) { const uint32 BindOffset = RootSignature.GetBindSlotOffsetInBytes(SlotIndex); if (BindingType == ERayTracingLocalShaderBindingType::Validation) { 
ShaderTable.CompareLocalShaderParameters(ShaderTableOffset, RecordIndex, BindOffset, DescriptorTable); } else { ShaderTable.SetLocalShaderParameters(ShaderTableOffset, RecordIndex, BindOffset, DescriptorTable); } } FD3D12ConstantBufferView* SetLooseParameterData(const void* Data, uint32 DataSize, D3D12_GPU_VIRTUAL_ADDRESS& OutGPUVirtualAddress) { bHasLooseParameterData = true; return ShaderTable.SetLooseParameterData(WorkerIndex, RecordIndex, BindingType, Data, DataSize, OutGPUVirtualAddress); } void AddReferencedShaderResource(FD3D12BaseShaderResource* ShaderResource) { ShaderTable.AddReferencedShaderResource(WorkerIndex, RecordIndex, BindingType, ShaderResource); } void AddReferencedTexture(FRHITexture* RHITexture) { ShaderTable.AddReferencedTexture(WorkerIndex, RecordIndex, BindingType, RHITexture); } void AddReferencedUniformBuffer(uint32 BaseSlotIndex, uint32 DescriptorIndex, FD3D12UniformBuffer* UniformBuffer) { const uint32 OffsetWithinRootSignature = ComputeOffsetWithinRootSignature(BaseSlotIndex, DescriptorIndex); ShaderTable.AddReferencedUniformBuffer(WorkerIndex, RecordIndex, ShaderTableOffset, OffsetWithinRootSignature, BindingType, UniformBuffer); } void AddRayTracingSceneReference(FD3D12RayTracingScene* D3D12RayTracingScene) { checkf(false, TEXT("Unexpected RayTracingScene reference in local shader bindings")); } FD3D12Device* GetDevice() { return &Device; } #if ENABLE_RHI_VALIDATION RHIValidation::FTracker* GetValidationTracker() { // We can't validate resource states in RHISetBindingsOnShaderBindingTable because there's no command context at that point, and because the states will // change before the raytracing command is dispatched anyway. 
return nullptr; } #endif FD3D12Device& Device; FD3D12RayTracingShaderBindingTableInternal& ShaderTable; FD3D12ExplicitDescriptorCache& DescriptorCache; const FD3D12RootSignature& RootSignature; ERayTracingLocalShaderBindingType BindingType; uint32 ShaderTableOffset = 0; uint32 RecordIndex = ~0u; uint32 WorkerIndex = 0; bool bHasLooseParameterData = false; TArray> ReferencedViews; TArray> ReferencedCBResources; }; /* Collects the supplied bindless parameters, textures, SRVs, uniform buffers, samplers, UAVs and optional loose parameter data for Shader and hands them to Binder as descriptor tables / root CBVs using the bind slots of RootSignature. Returns false when a descriptor table allocation fails, true otherwise. */ template static bool SetRayTracingShaderResources( const FD3D12RayTracingShader* Shader, const FD3D12RootSignature* RootSignature, uint32 InNumBindlessParameters, FRHIShaderParameterResource const* BindlessParameters, uint32 InNumTextures, FRHITexture* const* Textures, uint32 InNumSRVs, FRHIShaderResourceView* const* SRVs, uint32 InNumUniformBuffers, FRHIUniformBuffer* const* UniformBuffers, uint32 InNumSamplers, FRHISamplerState* const* Samplers, uint32 InNumUAVs, FRHIUnorderedAccessView* const* UAVs, uint32 InLooseParameterDataSize, const void* InLooseParameterData, ResourceBinderType& Binder) { check(Shader && RootSignature); /* Scratch state for one call: offline descriptor handles, their versions, root CBV addresses and bitmasks of the slots that were written. */ struct FBindings { FBindings(ResourceBinderType& InBinder, uint32 InGPUIndex, const FD3D12ShaderData* ShaderData) : Binder(InBinder) , GPUIndex(InGPUIndex) #if PLATFORM_SUPPORTS_BINDLESS_RENDERING , bBindlessResources(EnumHasAnyFlags(ShaderData->ResourceCounts.UsageFlags, EShaderResourceUsageFlags::BindlessResources)) , bBindlessSamplers(EnumHasAnyFlags(ShaderData->ResourceCounts.UsageFlags, EShaderResourceUsageFlags::BindlessSamplers)) #endif { } ResourceBinderType& Binder; uint32 GPUIndex; #if PLATFORM_SUPPORTS_BINDLESS_RENDERING const bool bBindlessResources; const bool bBindlessSamplers; #endif #if D3D12RHI_USE_CONSTANT_BUFFER_VIEWS D3D12_CPU_DESCRIPTOR_HANDLE LocalCBVs[MAX_CBS]; #endif D3D12_GPU_VIRTUAL_ADDRESS RemoteCBVs[MAX_CBS]; FD3D12UniformBuffer* UniformBuffers[MAX_CBS] = {}; D3D12_CPU_DESCRIPTOR_HANDLE LocalSRVs[MAX_SRVS]; D3D12_CPU_DESCRIPTOR_HANDLE LocalUAVs[MAX_UAVS]; D3D12_CPU_DESCRIPTOR_HANDLE
LocalSamplers[MAX_SAMPLERS]; #if D3D12RHI_USE_CONSTANT_BUFFER_VIEWS uint32 CBVVersions[MAX_CBS]; #endif uint32 SRVVersions[MAX_SRVS]; /* Version arrays are sized like the descriptor arrays they accompany (LocalUAVs / LocalSamplers). */ uint32 UAVVersions[MAX_UAVS]; uint32 SamplerVersions[MAX_SAMPLERS]; TArray> ReferencedRayTracingScenes; uint64 BoundSRVMask = 0; uint64 BoundCBVMask = 0; uint64 BoundUAVMask = 0; uint64 BoundSamplerMask = 0; void SetUAV(FRHIUnorderedAccessView* RHIUAV, uint8 Index) { FD3D12UnorderedAccessView* UAV = FD3D12CommandContext::RetrieveObject(RHIUAV, GPUIndex); check(UAV != nullptr); #if PLATFORM_SUPPORTS_BINDLESS_RENDERING if (!bBindlessResources) #endif { FD3D12OfflineDescriptor Descriptor = UAV->GetOfflineCpuHandle(); LocalUAVs[Index] = Descriptor; UAVVersions[Index] = Descriptor.GetVersion(); BoundUAVMask |= 1ull << Index; } Binder.AddReferencedShaderResource(UAV->GetShaderResource()); } void SetSRV(FRHIShaderResourceView* RHISRV, uint8 Index) { FD3D12ShaderResourceView* SRV = FD3D12CommandContext::RetrieveObject(RHISRV, GPUIndex); check(SRV != nullptr); #if PLATFORM_SUPPORTS_BINDLESS_RENDERING if (!bBindlessResources) #endif { FD3D12OfflineDescriptor Descriptor = SRV->GetOfflineCpuHandle(); LocalSRVs[Index] = Descriptor; SRVVersions[Index] = Descriptor.GetVersion(); BoundSRVMask |= 1ull << Index; } Binder.AddReferencedShaderResource(SRV->GetShaderResource()); FD3D12RayTracingScene* ReferencedRayTracingScene = SRV->GetRayTracingScene(); if (ReferencedRayTracingScene) { ReferencedRayTracingScenes.Add(ReferencedRayTracingScene); } } void SetTexture(FRHITexture* RHITexture, uint8 Index) { FD3D12ShaderResourceView* SRV = FD3D12CommandContext::RetrieveTexture(RHITexture, GPUIndex)->GetShaderResourceView(); if (!ensure(SRV)) { SRV = FD3D12CommandContext::RetrieveTexture(GBlackTexture->TextureRHI, GPUIndex)->GetShaderResourceView(); } check(SRV != nullptr); #if PLATFORM_SUPPORTS_BINDLESS_RENDERING if (!bBindlessResources) #endif { FD3D12OfflineDescriptor Descriptor = SRV->GetOfflineCpuHandle(); LocalSRVs[Index] = Descriptor; SRVVersions[Index] =
Descriptor.GetVersion(); BoundSRVMask |= 1ull << Index; } Binder.AddReferencedTexture(RHITexture); } void SetResourceCollection(FRHIResourceCollection* ResourceCollection, uint8 Index) { FD3D12ResourceCollection* D3D12ResourceCollection = FD3D12CommandContext::RetrieveObject(ResourceCollection, GPUIndex); FD3D12ShaderResourceView* SRV = D3D12ResourceCollection ? D3D12ResourceCollection->GetShaderResourceView() : nullptr; check(SRV != nullptr); #if PLATFORM_SUPPORTS_BINDLESS_RENDERING if (!bBindlessResources) #endif { FD3D12OfflineDescriptor Descriptor = SRV->GetOfflineCpuHandle(); LocalSRVs[Index] = Descriptor; SRVVersions[Index] = Descriptor.GetVersion(); /* Mark the slot as bound only when its descriptor was written, as SetSRV and SetTexture do. */ BoundSRVMask |= 1ull << Index; } Binder.AddReferencedShaderResource(SRV->GetShaderResource()); } void SetSampler(FRHISamplerState* RHISampler, uint8 Index) { FD3D12SamplerState* Sampler = FD3D12CommandContext::RetrieveObject(RHISampler, GPUIndex); check(Sampler != nullptr); #if PLATFORM_SUPPORTS_BINDLESS_RENDERING if (!bBindlessSamplers) #endif { FD3D12OfflineDescriptor Descriptor = Sampler->OfflineDescriptor; LocalSamplers[Index] = Descriptor; SamplerVersions[Index] = Descriptor.GetVersion(); BoundSamplerMask |= 1ull << Index; } } }; FBindings Bindings(Binder, Binder.GetDevice()->GetGPUIndex(), Shader); #if PLATFORM_SUPPORTS_BINDLESS_RENDERING for (uint32 BindlessParameterIndex = 0; BindlessParameterIndex < InNumBindlessParameters; ++BindlessParameterIndex) { const FRHIShaderParameterResource& ShaderParameterResource = BindlessParameters[BindlessParameterIndex]; if (FRHIResource* Resource = ShaderParameterResource.Resource) { switch (ShaderParameterResource.Type) { case FRHIShaderParameterResource::EType::Texture: Bindings.SetTexture(static_cast(Resource), BindlessParameterIndex); break; case FRHIShaderParameterResource::EType::ResourceView: Bindings.SetSRV(static_cast(Resource), BindlessParameterIndex); break; case FRHIShaderParameterResource::EType::UnorderedAccessView: Bindings.SetUAV(static_cast(Resource),
BindlessParameterIndex); break; case FRHIShaderParameterResource::EType::Sampler: Bindings.SetSampler(static_cast(Resource), BindlessParameterIndex); break; } } } #endif for (uint32 TextureIndex = 0; TextureIndex < InNumTextures; ++TextureIndex) { FRHITexture* Resource = Textures[TextureIndex]; if (Resource) { Bindings.SetTexture(Resource, TextureIndex); } } for (uint32 SRVIndex = 0; SRVIndex < InNumSRVs; ++SRVIndex) { FRHIShaderResourceView* Resource = SRVs[SRVIndex]; if (Resource) { Bindings.SetSRV(Resource, SRVIndex); } } for (uint32 CBVIndex = 0; CBVIndex < InNumUniformBuffers; ++CBVIndex) { FRHIUniformBuffer* Resource = UniformBuffers[CBVIndex]; if (Resource) { FD3D12UniformBuffer* CBV = FD3D12CommandContext::RetrieveObject(Resource, Bindings.GPUIndex); #if D3D12RHI_USE_CONSTANT_BUFFER_VIEWS FD3D12OfflineDescriptor Descriptor = CBV->View->GetOfflineCpuHandle(); Bindings.LocalCBVs[CBVIndex] = Descriptor; Bindings.CBVVersions[CBVIndex] = Descriptor.GetVersion(); #endif // D3D12RHI_USE_CONSTANT_BUFFER_VIEWS Bindings.RemoteCBVs[CBVIndex] = CBV->ResourceLocation.GetGPUVirtualAddress(); Bindings.BoundCBVMask |= 1ull << CBVIndex; Bindings.UniformBuffers[CBVIndex] = CBV; } } for (uint32 SamplerIndex = 0; SamplerIndex < InNumSamplers; ++SamplerIndex) { FRHISamplerState* Resource = Samplers[SamplerIndex]; if (Resource) { Bindings.SetSampler(Resource, SamplerIndex); } } for (uint32 UAVIndex = 0; UAVIndex < InNumUAVs; ++UAVIndex) { FRHIUnorderedAccessView* Resource = UAVs[UAVIndex]; if (Resource) { Bindings.SetUAV(Resource, UAVIndex); } } { uint32 DirtyUniformBuffers = ~(0u); UE::RHI::Private::SetUniformBufferResourcesFromTables( Bindings , *Shader , DirtyUniformBuffers , UniformBuffers #if ENABLE_RHI_VALIDATION , Binder.GetValidationTracker() #endif ); } // Bind loose parameters if (Shader->UsesGlobalUniformBuffer()) { checkf(InLooseParameterDataSize && InLooseParameterData, TEXT("Shader uses global uniform buffer, but the required loose parameter data is not
provided.")); } if (InLooseParameterData && Shader->UsesGlobalUniformBuffer()) { D3D12_GPU_VIRTUAL_ADDRESS LooseParameterGPUVirtualAddress = 0; FD3D12ConstantBufferView* ConstantBufferView = Binder.SetLooseParameterData(InLooseParameterData, InLooseParameterDataSize, LooseParameterGPUVirtualAddress); #if D3D12RHI_USE_CONSTANT_BUFFER_VIEWS Bindings.LocalCBVs[LooseParameterCBVIndex] = ConstantBufferView->GetOfflineCpuHandle(); #endif // D3D12RHI_USE_CONSTANT_BUFFER_VIEWS Bindings.RemoteCBVs[LooseParameterCBVIndex] = LooseParameterGPUVirtualAddress; Bindings.BoundCBVMask |= 1ull << LooseParameterCBVIndex; } // Validate that all resources required by the shader are set auto IsCompleteBinding = [](uint32 ExpectedCount, uint64 BoundMask) { if (ExpectedCount > 64) return false; // Bound resource mask can't be represented by uint64 // All bits of the mask [0..ExpectedCount) are expected to be set uint64 ExpectedMask = ExpectedCount == 64 ? ~0ull : ((1ull << ExpectedCount) - 1); return (ExpectedMask & BoundMask) == ExpectedMask; }; check(IsCompleteBinding(Shader->ResourceCounts.NumSRVs , Bindings.BoundSRVMask)); check(IsCompleteBinding(Shader->ResourceCounts.NumUAVs , Bindings.BoundUAVMask)); check(IsCompleteBinding(Shader->ResourceCounts.NumCBs , Bindings.BoundCBVMask)); check(IsCompleteBinding(Shader->ResourceCounts.NumSamplers, Bindings.BoundSamplerMask)); FD3D12ExplicitDescriptorCache& DescriptorCache = Binder.DescriptorCache; const uint32 WorkerIndex = Binder.WorkerIndex; const uint32 NumSRVs = Shader->ResourceCounts.NumSRVs; if (NumSRVs) { const int32 DescriptorTableBaseIndex = DescriptorCache.AllocateDeduplicated(Bindings.SRVVersions, Bindings.LocalSRVs, NumSRVs, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV, WorkerIndex); if (DescriptorTableBaseIndex < 0) { return false; } const uint32 BindSlot = RootSignature->SRVRDTBindSlot(SF_Compute); check(BindSlot != 0xFF); const D3D12_GPU_DESCRIPTOR_HANDLE ResourceDescriptorTableBaseGPU =
DescriptorCache.ViewHeap.GetDescriptorGPU(DescriptorTableBaseIndex); Binder.SetRootDescriptorTable(BindSlot, ResourceDescriptorTableBaseGPU); } const uint32 NumUAVs = Shader->ResourceCounts.NumUAVs; if (NumUAVs) { const int32 DescriptorTableBaseIndex = DescriptorCache.AllocateDeduplicated(Bindings.UAVVersions, Bindings.LocalUAVs, NumUAVs, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV, WorkerIndex); if (DescriptorTableBaseIndex < 0) { return false; } const uint32 BindSlot = RootSignature->UAVRDTBindSlot(SF_Compute); check(BindSlot != 0xFF); const D3D12_GPU_DESCRIPTOR_HANDLE ResourceDescriptorTableBaseGPU = DescriptorCache.ViewHeap.GetDescriptorGPU(DescriptorTableBaseIndex); Binder.SetRootDescriptorTable(BindSlot, ResourceDescriptorTableBaseGPU); } const uint32 NumCBVs = Shader->ResourceCounts.NumCBs; if (Shader->ResourceCounts.NumCBs) { #if D3D12RHI_USE_CONSTANT_BUFFER_VIEWS if (!EnumHasAllFlags(Shader->ResourceCounts.UsageFlags, EShaderResourceUsageFlags::BindlessResources)) { /* Keep the index signed and treat a negative result as failure, matching the SRV, UAV and sampler paths. */ const int32 DescriptorTableBaseIndex = DescriptorCache.AllocateDeduplicated(Bindings.CBVVersions, Bindings.LocalCBVs, NumCBVs, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV, WorkerIndex); if (DescriptorTableBaseIndex < 0) { return false; } const uint32 BindSlot = RootSignature->CBVRDTBindSlot(SF_Compute); check(BindSlot != 0xFF); const D3D12_GPU_DESCRIPTOR_HANDLE ResourceDescriptorTableBaseGPU = DescriptorCache.ViewHeap.GetDescriptorGPU(DescriptorTableBaseIndex); Binder.SetRootDescriptorTable(BindSlot, ResourceDescriptorTableBaseGPU); } else #endif // D3D12RHI_USE_CONSTANT_BUFFER_VIEWS { checkf(RootSignature->CBVRDTBindSlot(SF_Compute) == 0xFF, TEXT("Root CBV descriptor tables are not implemented for ray tracing shaders.")); const uint32 BindSlot = RootSignature->CBVRDBaseBindSlot(SF_Compute); check(BindSlot != 0xFF); for (uint32 i = 0; i < Shader->ResourceCounts.NumCBs; ++i) { const uint64 SlotMask = (1ull << i); D3D12_GPU_VIRTUAL_ADDRESS BufferAddress = (Bindings.BoundCBVMask & SlotMask) ?
Bindings.RemoteCBVs[i] : 0; Binder.SetRootCBV(BindSlot, i, BufferAddress); // Also set the uniform buffer in case it's valid so it can be notified about changes when it's a persistent binding if (Bindings.UniformBuffers[i]) { Binder.AddReferencedUniformBuffer(BindSlot, i, Bindings.UniformBuffers[i]); } } } } // Bind samplers const uint32 NumSamplers = Shader->ResourceCounts.NumSamplers; if (NumSamplers) { const int32 DescriptorTableBaseIndex = DescriptorCache.AllocateDeduplicated(Bindings.SamplerVersions, Bindings.LocalSamplers, NumSamplers, D3D12_DESCRIPTOR_HEAP_TYPE_SAMPLER, WorkerIndex); if (DescriptorTableBaseIndex < 0) { return false; } const uint32 BindSlot = RootSignature->SamplerRDTBindSlot(SF_Compute); check(BindSlot != 0xFF); const D3D12_GPU_DESCRIPTOR_HANDLE ResourceDescriptorTableBaseGPU = DescriptorCache.SamplerHeap.GetDescriptorGPU(DescriptorTableBaseIndex); Binder.SetRootDescriptorTable(BindSlot, ResourceDescriptorTableBaseGPU); } for (FD3D12RayTracingScene* RayTracingScene : Bindings.ReferencedRayTracingScenes) { Binder.AddRayTracingSceneReference(RayTracingScene); } return true; } /* Overload taking FRayTracingShaderBindings: checks at compile time that its SRV, uniform buffer, sampler and UAV arrays match the D3D12 RHI limits, then forwards them with no loose parameter data. */ template static bool SetRayTracingShaderResources( const FD3D12RayTracingShader* Shader, const FD3D12RootSignature* RootSignature, const FRayTracingShaderBindings& ResourceBindings, ResourceBinderType& Binder) { static_assert( sizeof(ResourceBindings.SRVs) / sizeof(*ResourceBindings.SRVs) == MAX_SRVS, "Ray Tracing Shader Bindings SRV array size must match D3D12 RHI Limit"); static_assert( sizeof(ResourceBindings.UniformBuffers) / sizeof(*ResourceBindings.UniformBuffers) == MAX_CBS, "Ray Tracing Shader Bindings Uniform Buffer array size must match D3D12 RHI Limit"); static_assert( sizeof(ResourceBindings.Samplers) / sizeof(*ResourceBindings.Samplers) == MAX_SAMPLERS, "Ray Tracing Shader Bindings Sampler array size must match D3D12 RHI Limit"); static_assert( sizeof(ResourceBindings.UAVs) / sizeof(*ResourceBindings.UAVs) == MAX_UAVS, "Ray Tracing Shader Bindings UAV array
size must match D3D12 RHI Limit"); return SetRayTracingShaderResources( Shader, RootSignature, ResourceBindings.BindlessParameters.Num(), ResourceBindings.BindlessParameters.GetData(), UE_ARRAY_COUNT(ResourceBindings.Textures), ResourceBindings.Textures, UE_ARRAY_COUNT(ResourceBindings.SRVs), ResourceBindings.SRVs, UE_ARRAY_COUNT(ResourceBindings.UniformBuffers), ResourceBindings.UniformBuffers, UE_ARRAY_COUNT(ResourceBindings.Samplers), ResourceBindings.Samplers, UE_ARRAY_COUNT(ResourceBindings.UAVs), ResourceBindings.UAVs, 0, nullptr, // loose parameters Binder); } /* Binds the global resources of the ray generation shader at RayGenShaderIndex and, only if that binding succeeded, records the dispatch. With ArgumentBuffer set, a full D3D12_DISPATCH_RAYS_DESC is assembled in a device buffer (SBT part copied from DispatchDesc, dimensions copied from ArgumentBuffer at ArgumentOffset) and ExecuteIndirect is used; otherwise DispatchRays is called with DispatchDesc. When OptShaderTable is null or has no descriptor cache, a transient descriptor cache is created for the call. */ static void DispatchRays(FD3D12CommandContext& CommandContext, const FRayTracingShaderBindings& GlobalBindings, const FD3D12RayTracingPipelineState* Pipeline, uint32 RayGenShaderIndex, FD3D12RayTracingShaderBindingTableInternal* OptShaderTable, const D3D12_DISPATCH_RAYS_DESC& DispatchDesc, ED3D12QueueType QueueType, FD3D12Buffer* ArgumentBuffer = nullptr, uint32 ArgumentOffset = 0) { SCOPE_CYCLE_COUNTER(STAT_D3D12DispatchRays); // TODO: add optional validation that all (used/valid) shader identifiers used in the SBT are also available in the RTPSO FD3D12Device* Device = CommandContext.GetParentDevice(); FD3D12Adapter* Adapter = Device->GetParentAdapter(); FD3D12Buffer* DispatchRaysDescBuffer = nullptr; if (ArgumentBuffer) { // Source indirect argument buffer only contains the dispatch dimensions, however D3D12 requires a full D3D12_DISPATCH_RAYS_DESC structure. // We create a new buffer, fill the SBT pointers on CPU and copy the dispatch dimensions into the right place.
DispatchRaysDescBuffer = Device->GetRayTracingDispatchRaysDescBuffer(QueueType); FD3D12Resource* DispatchRaysDescBufferResource = DispatchRaysDescBuffer->GetResource(); CommandContext.TransitionResource(DispatchRaysDescBufferResource, D3D12_RESOURCE_STATE_INDIRECT_ARGUMENT, D3D12_RESOURCE_STATE_COPY_DEST, 0); CommandContext.TransitionResource(ArgumentBuffer->GetResource(), D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE|D3D12_RESOURCE_STATE_INDIRECT_ARGUMENT, D3D12_RESOURCE_STATE_COPY_SOURCE, 0); CommandContext.FlushResourceBarriers(); // Compute the allocation & copy sizes uint32 DispatchRayDescSize = sizeof(D3D12_DISPATCH_RAYS_DESC); uint32 SBTPartSize = offsetof(D3D12_DISPATCH_RAYS_DESC, Width); uint32 IndirectDimensionSize = DispatchRayDescSize - SBTPartSize; static_assert((sizeof(D3D12_DISPATCH_RAYS_DESC) - offsetof(D3D12_DISPATCH_RAYS_DESC, Width)) == sizeof(uint32) * 4, "Assume 4 uints at the end of the struct to store the dimension + alignment overhead"); uint32 BaseRayDescBufferOffset = DispatchRaysDescBuffer->ResourceLocation.GetOffsetFromBaseOfResource(); // Copy SBT data part of the dispatch desc to upload memory FD3D12ResourceLocation UploadResourceLocation(Device); void* Data = Device->GetDefaultFastAllocator().Allocate(DispatchRayDescSize, 256, &UploadResourceLocation); FMemory::Memcpy(Data, &DispatchDesc, SBTPartSize); // Copy SBT data part to resource CommandContext.CopyBufferRegionChecked( DispatchRaysDescBufferResource->GetResource(), DispatchRaysDescBufferResource->GetName(), BaseRayDescBufferOffset, UploadResourceLocation.GetResource()->GetResource(), UploadResourceLocation.GetResource()->GetName(), UploadResourceLocation.GetOffsetFromBaseOfResource(), SBTPartSize ); // Copy GPU computed indirect args to resource CommandContext.CopyBufferRegionChecked( DispatchRaysDescBufferResource->GetResource(), DispatchRaysDescBufferResource->GetName(), BaseRayDescBufferOffset + SBTPartSize, ArgumentBuffer->GetResource()->GetResource(),
ArgumentBuffer->GetResource()->GetName(), ArgumentBuffer->ResourceLocation.GetOffsetFromBaseOfResource() + ArgumentOffset, IndirectDimensionSize ); CommandContext.TransitionResource( DispatchRaysDescBufferResource, D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_INDIRECT_ARGUMENT, 0 ); CommandContext.TransitionResource(ArgumentBuffer->GetResource(), D3D12_RESOURCE_STATE_COPY_SOURCE, D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE | D3D12_RESOURCE_STATE_INDIRECT_ARGUMENT, 0 ); CommandContext.FlushResourceBarriers(); } // Setup state for RT dispatch // Invalidate state cache to ensure all root parameters for regular shaders are reset when non-RT work is dispatched later. CommandContext.StateCache.TransitionComputeState(ED3D12PipelineType::RayTracing); CommandContext.GraphicsCommandList(); FD3D12RayTracingShader* RayGenShader = Pipeline->RayGenShaders.Shaders[RayGenShaderIndex]; const FRHIShaderBindingLayout& ShaderBindingLayout = CommandContext.GetShaderBindingLayout(); check(RayGenShader->ShaderBindingLayoutHash == ShaderBindingLayout.GetHash()); const TArray& StaticUniformBuffers = CommandContext.GetStaticUniformBuffers(); const FD3D12RootSignature* GlobalRTRootSignature = Adapter->GetGlobalRayTracingRootSignature(ShaderBindingLayout); bool bResourcesBound = false; if (OptShaderTable && OptShaderTable->DescriptorCache) { FD3D12ExplicitDescriptorCache* DescriptorCache = OptShaderTable->DescriptorCache; check(DescriptorCache != nullptr); UE::TScopeLock Lock(OptShaderTable->DispatchMutex); TRACE_CPUPROFILER_EVENT_SCOPE(SetRayTracingShaderResources); CommandContext.SetExplicitDescriptorCache(*DescriptorCache); CommandContext.GraphicsCommandList()->SetComputeRootSignature(Pipeline->GlobalRootSignature); FD3D12RayTracingGlobalResourceBinder ResourceBinder(CommandContext, *DescriptorCache); bResourcesBound = SetRayTracingShaderResources(RayGenShader, GlobalRTRootSignature, GlobalBindings, ResourceBinder); OptShaderTable->UpdateResidency(CommandContext); } else {
FD3D12ExplicitDescriptorCache TransientDescriptorCache(CommandContext.GetParentDevice(), FD3D12RayTracingShaderBindingTableInternal::MaxBindingWorkers); TransientDescriptorCache.Init(0, MAX_SRVS + MAX_UAVS, MAX_SAMPLERS, ERHIBindlessConfiguration::RayTracing); CommandContext.SetExplicitDescriptorCache(TransientDescriptorCache); CommandContext.GraphicsCommandList()->SetComputeRootSignature(Pipeline->GlobalRootSignature); FD3D12RayTracingGlobalResourceBinder ResourceBinder(CommandContext, TransientDescriptorCache); bResourcesBound = SetRayTracingShaderResources(RayGenShader, GlobalRTRootSignature, GlobalBindings, ResourceBinder); } // Bind diagnostic buffer to allow asserts in ray generation shaders CommandContext.BindDiagnosticBuffer(GlobalRTRootSignature, ED3D12PipelineType::Compute); int8 StaticShaderBindingSlot = GlobalRTRootSignature->GetStaticShaderBindingSlot(); if (StaticShaderBindingSlot >= 0) { for (uint32 Index = 0; Index < ShaderBindingLayout.GetNumUniformBufferEntries(); ++Index) { const FRHIUniformBufferShaderBindingLayout& LayoutEntry = ShaderBindingLayout.GetUniformBufferEntry(Index); const uint32 RootParameterSlotIndex = uint32(StaticShaderBindingSlot) + LayoutEntry.CBVResourceIndex; FRHIUniformBuffer* UniformBuffer = StaticUniformBuffers[Index]; checkf(UniformBuffer, TEXT("Static uniform buffer at index %d is referenced in the shader binding layout but not provided in the last RHISetStaticUniformBuffers() command"), Index); FD3D12UniformBuffer* D3D12UniformBuffer = FD3D12CommandContext::RetrieveObject(UniformBuffer, Device->GetGPUIndex()); if (D3D12UniformBuffer->ResourceLocation.GetGPUVirtualAddress()) { const FD3D12ResourceLocation& ResourceLocation = D3D12UniformBuffer->ResourceLocation; CommandContext.GraphicsCommandList()->SetComputeRootConstantBufferView(RootParameterSlotIndex, ResourceLocation.GetGPUVirtualAddress()); } } } if (bResourcesBound) { CommandContext.FlushResourceBarriers(); ID3D12StateObject* RayTracingStateObject = nullptr; //
Select a specialized RTPSO, if one is available if (GRayTracingAllowSpecializedStateObjects && !Pipeline->SpecializedStateObjects.IsEmpty() && !Pipeline->SpecializationIndices.IsEmpty()) { int32 SpecializationIndex = Pipeline->SpecializationIndices[RayGenShaderIndex]; if (SpecializationIndex != INDEX_NONE) { RayTracingStateObject = Pipeline->SpecializedStateObjects[SpecializationIndex]; } } // Fall back to default full RTPSO if specialization is not available if (!RayTracingStateObject) { RayTracingStateObject = Pipeline->StateObject.GetReference(); } Pipeline->FrameCounter.Set(CommandContext.GetFrameFenceCounter()); CommandContext.RayTracingCommandList()->SetPipelineState1(RayTracingStateObject); if (DispatchRaysDescBuffer) { ID3D12CommandSignature* CommandSignature = Adapter->GetDispatchRaysIndirectCommandSignature(); CommandContext.RayTracingCommandList()->ExecuteIndirect( CommandSignature, 1, DispatchRaysDescBuffer->ResourceLocation.GetResource()->GetResource(), DispatchRaysDescBuffer->ResourceLocation.GetOffsetFromBaseOfResource(), nullptr, 0 ); } else { CommandContext.RayTracingCommandList()->DispatchRays(&DispatchDesc); } #if (RHI_NEW_GPU_PROFILER == 0) if (CommandContext.IsDefaultContext()) { CommandContext.GetParentDevice()->RegisterGPUWork(1); } #endif } // Restore old global descriptor heaps CommandContext.UnsetExplicitDescriptorCache(); } /* Direct dispatch: asserts the per-device shader table is not dirty, resolves the ray generation shader in the pipeline by hash, builds the dispatch description from the shader table and dispatches Width x Height x 1 rays through DispatchRays. */ void FD3D12CommandContext::RHIRayTraceDispatch(FRHIRayTracingPipelineState* InRayTracingPipelineState, FRHIRayTracingShader* RayGenShaderRHI, FRHIShaderBindingTable* InSBT, const FRayTracingShaderBindings& GlobalResourceBindings, uint32 Width, uint32 Height) { TRACE_CPUPROFILER_EVENT_SCOPE(RHIRayTraceDispatch); const FD3D12RayTracingPipelineState* Pipeline = FD3D12DynamicRHI::ResourceCast(InRayTracingPipelineState); FD3D12RayTracingShaderBindingTable* SBT = FD3D12DynamicRHI::ResourceCast(InSBT); FD3D12RayTracingShaderBindingTableInternal* ShaderTableForDevice = SBT->GetTableForDevice(GetParentDevice());
checkf(!ShaderTableForDevice->bIsDirty, TEXT("The shader table contains pending modifications. CommitRayTracingBindings must be called after SetRayTracingBindings")); FD3D12RayTracingShader* RayGenShader = FD3D12DynamicRHI::ResourceCast(RayGenShaderRHI); const int32 RayGenShaderIndex = Pipeline->RayGenShaders.Find(RayGenShader->GetHash()); checkf(RayGenShaderIndex != INDEX_NONE, TEXT("RayGen shader '%s' is not present in the given ray tracing pipeline. ") TEXT("All RayGen shaders must be declared when creating RTPSO."), *(RayGenShader->EntryPoint)); const FD3D12ShaderIdentifier& RayGenShaderIdentifier = Pipeline->RayGenShaders.Identifiers[RayGenShaderIndex]; D3D12_DISPATCH_RAYS_DESC DispatchDesc = ShaderTableForDevice->GetDispatchRaysDesc(GetParentDevice(), RayGenShaderIdentifier); DispatchDesc.Width = Width; DispatchDesc.Height = Height; DispatchDesc.Depth = 1; DispatchRays(*this, GlobalResourceBindings, Pipeline, RayGenShaderIndex, ShaderTableForDevice, DispatchDesc, QueueType); } /* Indirect dispatch: asserts GRHISupportsRayTracingDispatchIndirect and that the per-device shader table is not dirty, resolves the ray generation shader in the pipeline by hash and calls DispatchRays with ArgumentBuffer / ArgumentOffset so the dispatch dimensions come from that buffer. */ void FD3D12CommandContext::RHIRayTraceDispatchIndirect(FRHIRayTracingPipelineState* InRayTracingPipelineState, FRHIRayTracingShader* RayGenShaderRHI, FRHIShaderBindingTable* InSBT, const FRayTracingShaderBindings& GlobalResourceBindings, FRHIBuffer* ArgumentBuffer, uint32 ArgumentOffset) { TRACE_CPUPROFILER_EVENT_SCOPE(RHIRayTraceDispatchIndirect); checkf(GRHISupportsRayTracingDispatchIndirect, TEXT("RHIRayTraceDispatchIndirect may not be used because DXR 1.1 is not supported on this machine.")); const FD3D12RayTracingPipelineState* Pipeline = FD3D12DynamicRHI::ResourceCast(InRayTracingPipelineState); FD3D12RayTracingShaderBindingTable* SBT = FD3D12DynamicRHI::ResourceCast(InSBT); FD3D12RayTracingShaderBindingTableInternal* ShaderTableForDevice = SBT->GetTableForDevice(GetParentDevice()); checkf(!ShaderTableForDevice->bIsDirty, TEXT("The shader table contains pending modifications.
CommitRayTracingBindings must be called after SetRayTracingBindings")); FD3D12RayTracingShader* RayGenShader = FD3D12DynamicRHI::ResourceCast(RayGenShaderRHI); const int32 RayGenShaderIndex = Pipeline->RayGenShaders.Find(RayGenShader->GetHash()); checkf(RayGenShaderIndex != INDEX_NONE, TEXT("RayGen shader is not present in the given ray tracing pipeline. All RayGen shaders must be declared when creating RTPSO.")); const FD3D12ShaderIdentifier& RayGenShaderIdentifier = Pipeline->RayGenShaders.Identifiers[RayGenShaderIndex]; D3D12_DISPATCH_RAYS_DESC DispatchDesc = ShaderTableForDevice->GetDispatchRaysDesc(GetParentDevice(), RayGenShaderIdentifier); DispatchRays(*this, GlobalResourceBindings, Pipeline, RayGenShaderIndex, ShaderTableForDevice, DispatchDesc, QueueType, RetrieveObject(ArgumentBuffer), ArgumentOffset); } /* Writes one hit group record. If the table disallows hit group indexing, only the shader identifier is set (record 0 is expected). Otherwise the geometry system parameters are written and, in RTPSO binding mode with a pipeline, the local resources are bound; when an equivalent record is already in the worker's record cache its parameters are copied instead of being rebound. A Null identifier is stored when resource binding fails. */ static void SetRayTracingHitGroup( FD3D12Device* Device, FD3D12RayTracingShaderBindingTableInternal* ShaderTable, uint32 RecordIndex, FD3D12RayTracingPipelineState* Pipeline, uint32 HitGroupIndex, const FD3D12RayTracingGeometry* Geometry, uint32 GeometrySegmentIndex, uint32 NumUniformBuffers, FRHIUniformBuffer* const* UniformBuffers, uint32 LooseParameterDataSize, const void* LooseParameterData, uint32 UserData, ERayTracingLocalShaderBindingType BindingType, uint32 WorkerIndex) { const uint32 GPUIndex = Device->GetGPUIndex(); // If Shader table doesn't support hit group indexing then only set the hit group identifier and it should be first record index if (ShaderTable->HitGroupIndexingMode == ERayTracingHitGroupIndexingMode::Disallow) { check(RecordIndex == 0 && Pipeline); if (Pipeline) { ShaderTable->SetHitGroupIdentifier(RecordIndex, Pipeline->HitGroupShaders.Identifiers[HitGroupIndex]); } return; } checkf(RecordIndex < ShaderTable->NumHitRecords, TEXT("Hit group record index is invalid.
Make sure that NumGeometrySegments and NumShaderSlotsPerGeometrySegment is correct in FRayTracingShaderBindingTableInitializer.")); #if DO_CHECK { const uint32 NumGeometrySegments = Geometry->GetNumSegments(); checkf(GeometrySegmentIndex < NumGeometrySegments, TEXT("Segment %d is out of range for ray tracing geometry '%s' that contains %d segments"), GeometrySegmentIndex, Geometry->DebugName.IsNone() ? TEXT("UNKNOWN") : *Geometry->DebugName.ToString(), NumGeometrySegments); // If SBT and binding are persistent then all uniform buffers need to be allocated multiframe if (ShaderTable->Lifetime == ERayTracingShaderBindingTableLifetime::Persistent && BindingType == ERayTracingLocalShaderBindingType::Persistent) { for (uint32 UBIndex = 0; UBIndex < NumUniformBuffers; ++UBIndex) { FRHIUniformBuffer* Resource = UniformBuffers[UBIndex]; if (Resource) { /* Resolve the buffer for this device's GPU index rather than a hard-coded 0. */ FD3D12UniformBuffer* UB = FD3D12CommandContext::RetrieveObject(Resource, GPUIndex); checkf(UB->UniformBufferUsage == UniformBuffer_MultiFrame, TEXT("Trying to bind non multiframe uniform buffer to persistent SBT: %s"), *Resource->GetLayout().GetDebugName()); } } } } #endif // DO_CHECK ShaderTable->SetHitGroupGeometrySystemParameters(WorkerIndex, BindingType, RecordIndex, Geometry, UserData, GeometrySegmentIndex); if (EnumHasAnyFlags(ShaderTable->ShaderBindingMode, ERayTracingShaderBindingMode::RTPSO) && Pipeline) { const FD3D12RayTracingShader* Shader = Pipeline->HitGroupShaders.Shaders[HitGroupIndex]; FD3D12RayTracingShaderBindingTableInternal::FShaderRecordCacheKey CacheKey; // TODO: disable RecordCache when using persistent SBT const bool bCanUseRecordCache = GRayTracingCacheShaderRecords && LooseParameterDataSize == 0 // loose parameters end up in unique constant buffers, so SBT records can't be shared && NumUniformBuffers > 0 // there is no benefit from cache if no resources are being bound && NumUniformBuffers <= CacheKey.MaxUniformBuffers && (BindingType == ERayTracingLocalShaderBindingType::Transient ||
ShaderTable->Lifetime == ERayTracingShaderBindingTableLifetime::Transient); // Only transient SBTs or Transient records can use the record cache because otherwise the per record listeners need to tracked and copied as well ERayTracingLocalShaderBindingType RTPSOBindingType = BindingType; if (bCanUseRecordCache) { CacheKey = FD3D12RayTracingShaderBindingTableInternal::FShaderRecordCacheKey(NumUniformBuffers, UniformBuffers, HitGroupIndex); uint32* ExistingRecordIndex = ShaderTable->WorkerData[WorkerIndex].ShaderRecordCache.Find(CacheKey); if (ExistingRecordIndex) { // Simply copy local shader parameters from existing SBT record and set the shader identifier, skipping resource binding work. const uint32 OffsetFromRootSignatureStart = sizeof(FD3D12HitGroupSystemParameters); ShaderTable->SetHitGroupIdentifier(RecordIndex, Pipeline->HitGroupShaders.Identifiers[HitGroupIndex]); ShaderTable->CopyHitGroupParameters(RecordIndex, *ExistingRecordIndex, OffsetFromRootSignatureStart); #if DO_CHECK // Validate all other data against copied data RTPSOBindingType = ERayTracingLocalShaderBindingType::Validation; #else return; #endif // DO_CHECK } } FD3D12RayTracingLocalResourceBinder ResourceBinder(*Device, *ShaderTable, *(Shader->LocalRootSignature), RTPSOBindingType, RecordIndex, WorkerIndex, ERayTracingBindingType::HitGroup); const bool bResourcesBound = SetRayTracingShaderResources(Shader, Shader->LocalRootSignature, 0, nullptr, // BindlessParameters 0, nullptr, // Textures 0, nullptr, // SRVs NumUniformBuffers, UniformBuffers, 0, nullptr, // Samplers 0, nullptr, // UAVs LooseParameterDataSize, LooseParameterData, ResourceBinder); if (bCanUseRecordCache && bResourcesBound) { ShaderTable->WorkerData[WorkerIndex].ShaderRecordCache.FindOrAdd(CacheKey, RecordIndex); } FD3D12ShaderIdentifier ShaderIdentifier = bResourcesBound ?
Pipeline->HitGroupShaders.Identifiers[HitGroupIndex] : FD3D12ShaderIdentifier::Null; if (RTPSOBindingType != ERayTracingLocalShaderBindingType::Validation) { ShaderTable->SetHitGroupIdentifier(RecordIndex, ShaderIdentifier); } else { ShaderTable->CompareHitGroupIdentifier(RecordIndex, ShaderIdentifier); } } } /* Writes one callable shader record: stores UserData, binds local resources when ShaderIndexInPipeline is not INDEX_NONE, and sets the shader identifier (Null when there is no shader or resource binding failed). */ static void SetRayTracingCallableShader( FD3D12Device* Device, FD3D12RayTracingShaderBindingTableInternal* ShaderTable, uint32 RecordIndex, FD3D12RayTracingPipelineState* Pipeline, uint32 ShaderIndexInPipeline, uint32 NumUniformBuffers, FRHIUniformBuffer* const* UniformBuffers, uint32 LooseParameterDataSize, const void* LooseParameterData, uint32 UserData, uint32 WorkerIndex) { checkf(RecordIndex < ShaderTable->NumCallableRecords, TEXT("Callable shader record index is invalid. Make sure that NumCallableShaderSlots is correct in FRayTracingShaderBindingTableInitializer.")); const uint32 UserDataOffset = offsetof(FD3D12HitGroupSystemParameters, RootConstants) + offsetof(FHitGroupSystemRootConstants, UserData); ShaderTable->SetCallableShaderParameters(RecordIndex, UserDataOffset, UserData); const FD3D12ShaderIdentifier* ShaderIdentifier = &FD3D12ShaderIdentifier::Null; if (ShaderIndexInPipeline != INDEX_NONE) { const FD3D12RayTracingShader* Shader = Pipeline->CallableShaders.Shaders[ShaderIndexInPipeline]; FD3D12RayTracingLocalResourceBinder ResourceBinder(*Device, *ShaderTable, *(Shader->LocalRootSignature), ERayTracingLocalShaderBindingType::Transient, RecordIndex, WorkerIndex, ERayTracingBindingType::CallableShader); const bool bResourcesBound = SetRayTracingShaderResources(Shader, Shader->LocalRootSignature, 0, nullptr, // BindlessParameters 0, nullptr, // Textures 0, nullptr, // SRVs NumUniformBuffers, UniformBuffers, 0, nullptr, // Samplers 0, nullptr, // UAVs LooseParameterDataSize, LooseParameterData, // Loose parameters ResourceBinder); if (bResourcesBound) { ShaderIdentifier = &Pipeline->CallableShaders.Identifiers[ShaderIndexInPipeline]; } }
ShaderTable->SetCallableIdentifier(RecordIndex, *ShaderIdentifier); } /* Writes one miss shader record: stores UserData, binds the local resources of the miss shader at ShaderIndexInPipeline and sets the shader identifier (Null when resource binding failed). */ static void SetRayTracingMissShader( FD3D12Device* Device, FD3D12RayTracingShaderBindingTableInternal* ShaderTable, uint32 RecordIndex, FD3D12RayTracingPipelineState* Pipeline, uint32 ShaderIndexInPipeline, uint32 NumUniformBuffers, FRHIUniformBuffer* const* UniformBuffers, uint32 LooseParameterDataSize, const void* LooseParameterData, uint32 UserData, uint32 WorkerIndex) { checkf(RecordIndex < ShaderTable->NumMissRecords, TEXT("Miss shader record index is invalid. Make sure that NumMissShaderSlots is correct in FRayTracingShaderBindingTableInitializer.")); const uint32 UserDataOffset = offsetof(FD3D12HitGroupSystemParameters, RootConstants) + offsetof(FHitGroupSystemRootConstants, UserData); ShaderTable->SetMissShaderParameters(RecordIndex, UserDataOffset, UserData); const FD3D12RayTracingShader* Shader = Pipeline->MissShaders.Shaders[ShaderIndexInPipeline]; FD3D12RayTracingLocalResourceBinder ResourceBinder(*Device, *ShaderTable, *(Shader->LocalRootSignature), ERayTracingLocalShaderBindingType::Transient, RecordIndex, WorkerIndex, ERayTracingBindingType::MissShader); const bool bResourcesBound = SetRayTracingShaderResources(Shader, Shader->LocalRootSignature, 0, nullptr, // BindlessParameters 0, nullptr, // Textures 0, nullptr, // SRVs NumUniformBuffers, UniformBuffers, 0, nullptr, // Samplers 0, nullptr, // UAVs LooseParameterDataSize, LooseParameterData, // Loose parameters ResourceBinder); ShaderTable->SetMissIdentifier(RecordIndex, bResourcesBound ?
Pipeline->MissShaders.Identifiers[ShaderIndexInPipeline] : FD3D12ShaderIdentifier::Null); } /* Applies NumBindings local shader bindings of the given BindingType (hit group, callable or miss) to this device's shader binding table. The work is run through ParallelForWithExistingTaskContext with one task context per worker index, and the table is marked dirty afterwards. */ void FD3D12CommandContext::RHISetBindingsOnShaderBindingTable( FRHIShaderBindingTable* InSBT, FRHIRayTracingPipelineState* InPipeline, uint32 NumBindings, const FRayTracingLocalShaderBindings* Bindings, ERayTracingBindingType BindingType) { TRACE_CPUPROFILER_EVENT_SCOPE(RHISetBindingsOnShaderBindingTable); SCOPE_CYCLE_COUNTER(STAT_D3D12SetBindingsOnShaderBindingTable); FD3D12RayTracingShaderBindingTable* SBT = FD3D12DynamicRHI::ResourceCast(InSBT); FD3D12RayTracingPipelineState* Pipeline = FD3D12DynamicRHI::ResourceCast(InPipeline); // Pipeline shouldn't contain any shaders which have bigger local data size than currently set in the initializer // (Otherwise changing of local binding data size would need to be supported) check(Pipeline == nullptr || Pipeline->MaxLocalRootSignatureSize <= SBT->GetInitializer().LocalBindingDataSize); FD3D12RayTracingShaderBindingTableInternal* ShaderTableForDevice = SBT->GetTableForDevice(GetParentDevice()); FGraphEventArray TaskList; const uint32 NumWorkerThreads = FMath::Max(1, FTaskGraphInterface::Get().GetNumWorkerThreads()); const uint32 MaxTasks = FApp::ShouldUseThreadingForPerformance() ?
FMath::Min(NumWorkerThreads, FD3D12RayTracingShaderBindingTableInternal::MaxBindingWorkers) : 1; struct FTaskContext { uint32 WorkerIndex = 0; }; TArray> TaskContexts; for (uint32 WorkerIndex = 0; WorkerIndex < MaxTasks; ++WorkerIndex) { TaskContexts.Add(FTaskContext{WorkerIndex}); } auto BindingTask = [Bindings, Device = Device, ShaderTableForDevice, Pipeline, BindingType](const FTaskContext& Context, int32 CurrentIndex) { const FRayTracingLocalShaderBindings& Binding = Bindings[CurrentIndex]; if (BindingType == ERayTracingBindingType::HitGroup) { if (Binding.BindingType != ERayTracingLocalShaderBindingType::Clear) { //UE_LOG(LogD3D12RHI, Log, TEXT("Set hit record data for RecordIndex %d on SBT %#016llx with mode: %d"), Binding.RecordIndex, ShaderTableForDevice, Binding.BindingType); const FD3D12RayTracingGeometry* Geometry = FD3D12DynamicRHI::ResourceCast(Binding.Geometry); SetRayTracingHitGroup(Device, ShaderTableForDevice, Binding.RecordIndex, Pipeline, Binding.ShaderIndexInPipeline, Geometry, Binding.SegmentIndex, Binding.NumUniformBuffers, Binding.UniformBuffers, Binding.LooseParameterDataSize, Binding.LooseParameterData, Binding.UserData, Binding.BindingType, Context.WorkerIndex); } else { //UE_LOG(LogD3D12RHI, Log, TEXT("Clear hit record data for RecordIndex %d on SBT %#016llx"), Binding.RecordIndex, ShaderTableForDevice); ShaderTableForDevice->ClearHitRecordData(Context.WorkerIndex, Binding.RecordIndex); } } else if (BindingType == ERayTracingBindingType::CallableShader) { check(Binding.BindingType == ERayTracingLocalShaderBindingType::Transient); SetRayTracingCallableShader(Device, ShaderTableForDevice, Binding.RecordIndex, Pipeline, Binding.ShaderIndexInPipeline, Binding.NumUniformBuffers, Binding.UniformBuffers, Binding.LooseParameterDataSize, Binding.LooseParameterData, Binding.UserData, Context.WorkerIndex); } else if (BindingType == ERayTracingBindingType::MissShader) { check(Binding.BindingType == ERayTracingLocalShaderBindingType::Transient);
SetRayTracingMissShader(Device, ShaderTableForDevice, Binding.RecordIndex, Pipeline, Binding.ShaderIndexInPipeline, Binding.NumUniformBuffers, Binding.UniformBuffers, Binding.LooseParameterDataSize, Binding.LooseParameterData, Binding.UserData, Context.WorkerIndex); } else { checkNoEntry(); } }; // One helper worker task will be created at most per this many work items, plus one worker for current thread (unless running on a task thread), // up to a hard maximum of FD3D12RayTracingShaderBindingTableInternal::MaxBindingWorkers. // Internally, parallel for tasks still subdivide the work into smaller chunks and perform fine-grained load-balancing. const int32 ItemsPerTask = 1024; ParallelForWithExistingTaskContext(TEXT("SetRayTracingBindings"), MakeArrayView(TaskContexts), NumBindings, ItemsPerTask, BindingTask); ShaderTableForDevice->bIsDirty = true; } #endif // D3D12_RHI_RAYTRACING