// UnrealEngine/Engine/Source/Runtime/Renderer/Private/RayTracing/RayTracingInstanceBufferUtil.cpp
// Copyright Epic Games, Inc. All Rights Reserved.
#include "RayTracingInstanceBufferUtil.h"
#include "Lumen/Lumen.h"
#include "RayTracingDefinitions.h"
#include "GPUScene.h"
#include "RenderGraphBuilder.h"
#include "ShaderParameterUtils.h"
#include "RendererInterface.h"
#include "RenderCore.h"
#include "ShaderParameterStruct.h"
#include "GlobalShader.h"
#include "PipelineStateCache.h"
#include "DataDrivenShaderPlatformInfo.h"
#include "ShaderCompilerCore.h"
#include "SceneRendering.h"
#include "Async/ParallelFor.h"
#include "Experimental/Containers/SherwoodHashTable.h"
#if RHI_RAYTRACING
PRAGMA_DISABLE_DEPRECATION_WARNINGS
/*
*
* Each FRayTracingGeometryInstance can translate to multiple native TLAS instances (see FRayTracingGeometryInstance::NumTransforms).
*
* The FRayTracingGeometryInstance array (i.e. FRayTracingScene::Instances) used to create FRayTracingSceneRHI
* can contain a mix of instances using GPUScene or CPU transforms.
* To reduce the number of dispatches needed to build the native ray tracing instance buffer,
* the upload buffer containing FRayTracingInstanceDescriptor is split into two sections: [GPUSceneInstances][CPUInstances].
* This way, native GPUScene and CPU instance descriptors can each be built in a single dispatch per type.
*
* If the ray tracing scene contains multiple layers, the instance buffer is divided into multiple subranges as expected by the RHI.
*
*/
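/*
* Illustrative layout (matching the offsets used in FillRayTracingInstanceUploadBuffer below):
*
*   instance descriptors: [ GPUScene section: NumGPUInstanceDescriptors ][ CPU section: NumCPUInstanceDescriptors ]
*   instance groups:      [ GPU groups: NumGPUInstanceGroups            ][ CPU groups: NumCPUInstanceGroups       ]
*
* Descriptors of scene instance i start at BaseUploadBufferOffsets[i] within their section;
* CPU instances add NumGPUInstanceDescriptors (and NumGPUInstanceGroups for their group index) to reach the CPU section.
*/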
static TAutoConsoleVariable<bool> CVarRayTracingInstanceBufferRLE(
TEXT("r.RayTracing.InstanceBuffer.RLE"),
true,
TEXT("Whether to use RLE to build ray tracing instance buffer."),
ECVF_RenderThreadSafe | ECVF_Scalability
);
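// Note: since the CVar is ECVF_RenderThreadSafe and queried via GetValueOnRenderThread() on each build,
// the RLE path can be toggled at runtime, e.g. "r.RayTracing.InstanceBuffer.RLE 0" from the console.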
struct FRayTracingInstanceGroup
{
uint32 BaseInstanceIndex : 30;
uint32 bIncrementUserDataPerInstance : 1;
uint32 bReuseInstance : 1;
};
static_assert(sizeof(FRayTracingInstanceGroup) == sizeof(uint32), "FRayTracingInstanceGroup is expected to be the same size as uint32.");
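// Since BaseInstanceIndex is a 30-bit field, one section of the descriptor buffer can address at most 2^30 instance descriptors.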
static const uint32 GRayTracingInstanceGroupSize = 64;
// Helper structure to assign instances to FRayTracingInstanceGroup depending on whether the primitive is compatible with RLE
// TODO: Investigate better schemes to maximize RLE usage
// The current implementation fills the incomplete "head" group (before generating RLE groups) and might also generate a "tail" group (neither of which can use RLE since they contain instances from different primitives),
// which means that in practice only ISMs with more than 128 instances reliably benefit from RLE, unless they happen to land on group boundaries.
// An alternative approach is to allow incomplete groups instead of packing so aggressively to maximize the number of groups using RLE,
// although that can lead to a lot of inactive threads depending on specific heuristics.
// Primitives could also be sorted by number of instances to reduce fragmentation, etc.
struct FGroupHelper
{
uint32 CurrentGroupIndex = 0;
uint32 CurrentIndexInGroup = 0;
uint32 NumInstanceDescriptors = 0;
uint32 OptimalNumInstanceDescriptors = 0;
void AddInstances(uint32 NumInstances, bool bRLECompatible)
{
if (bRLECompatible)
{
uint32 NumInstancesRemaining = NumInstances;
if (CurrentIndexInGroup != 0)
{
// first N instances are used to fill the current (partial) group
const uint32 N = FMath::Min(GRayTracingInstanceGroupSize - CurrentIndexInGroup, NumInstancesRemaining);
NumInstancesRemaining -= N;
CurrentIndexInGroup += N;
CurrentGroupIndex += CurrentIndexInGroup / GRayTracingInstanceGroupSize;
CurrentIndexInGroup %= GRayTracingInstanceGroupSize;
NumInstanceDescriptors += N;
}
if (NumInstancesRemaining > 0)
{
check(CurrentIndexInGroup == 0);
// remaining instances go into packed groups + tail group
CurrentIndexInGroup += NumInstancesRemaining;
CurrentGroupIndex += CurrentIndexInGroup / GRayTracingInstanceGroupSize;
CurrentIndexInGroup %= GRayTracingInstanceGroupSize;
const uint32 NumPackedGroups = NumInstancesRemaining / GRayTracingInstanceGroupSize;
NumInstanceDescriptors += NumPackedGroups;
NumInstanceDescriptors += CurrentIndexInGroup;
}
OptimalNumInstanceDescriptors += FMath::DivideAndRoundUp(NumInstances, GRayTracingInstanceGroupSize);
}
else
{
NumInstanceDescriptors += NumInstances;
OptimalNumInstanceDescriptors += NumInstances;
CurrentIndexInGroup += NumInstances;
CurrentGroupIndex += CurrentIndexInGroup / GRayTracingInstanceGroupSize;
CurrentIndexInGroup %= GRayTracingInstanceGroupSize;
}
}
};
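// Worked example (illustrative): with GRayTracingInstanceGroupSize == 64, starting at
// CurrentIndexInGroup == 10 and adding 150 RLE-compatible instances:
//   head fill:    min(64 - 10, 150) = 54 individual descriptors
//   packed:       (150 - 54) / 64   =  1 group -> 1 descriptor reused 64 times
//   tail:         (150 - 54) % 64   = 32 individual descriptors
// NumInstanceDescriptors grows by 54 + 1 + 32 = 87, while OptimalNumInstanceDescriptors
// grows by DivideAndRoundUp(150, 64) = 3 (what perfect group alignment would achieve).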
FRayTracingSceneInitializationData BuildRayTracingSceneInitializationData(TConstArrayView<FRayTracingGeometryInstance> Instances)
{
const bool bRLEAllowed = CVarRayTracingInstanceBufferRLE.GetValueOnRenderThread();
const uint32 NumSceneInstances = Instances.Num();
FRayTracingSceneInitializationData Output;
Output.NumNativeGPUSceneInstances = 0;
Output.NumNativeCPUInstances = 0;
Output.InstanceGeometryIndices.SetNumUninitialized(NumSceneInstances);
Output.BaseUploadBufferOffsets.SetNumUninitialized(NumSceneInstances);
Output.BaseInstancePrefixSum.SetNumUninitialized(NumSceneInstances);
Output.InstanceGroupEntryRefs.SetNumUninitialized(NumSceneInstances);
TArray<uint32> InstanceGroups;
Experimental::TSherwoodMap<FRHIRayTracingGeometry*, uint32> UniqueGeometries;
uint32 NumNativeInstances = 0;
FGroupHelper GPUGroupHelper;
FGroupHelper CPUGroupHelper;
for (uint32 InstanceIndex = 0; InstanceIndex < NumSceneInstances; ++InstanceIndex)
{
const FRayTracingGeometryInstance& InstanceDesc = Instances[InstanceIndex];
const bool bGpuSceneInstance = InstanceDesc.BaseInstanceSceneDataOffset != -1 || !InstanceDesc.InstanceSceneDataOffsets.IsEmpty();
const bool bCpuInstance = !bGpuSceneInstance;
checkf(!bGpuSceneInstance || InstanceDesc.BaseInstanceSceneDataOffset != -1 || InstanceDesc.NumTransforms <= uint32(InstanceDesc.InstanceSceneDataOffsets.Num()),
TEXT("Expected at least %d ray tracing geometry instance scene data offsets, but got %d."),
InstanceDesc.NumTransforms, InstanceDesc.InstanceSceneDataOffsets.Num());
checkf(!bCpuInstance || InstanceDesc.NumTransforms <= uint32(InstanceDesc.Transforms.Num()),
TEXT("Expected at least %d ray tracing geometry instance transforms, but got %d."),
InstanceDesc.NumTransforms, InstanceDesc.Transforms.Num());
checkf(InstanceDesc.GeometryRHI, TEXT("Ray tracing instance must have a valid geometry."));
uint32 GeometryIndex = UniqueGeometries.FindOrAdd(InstanceDesc.GeometryRHI, Output.ReferencedGeometries.Num());
Output.InstanceGeometryIndices[InstanceIndex] = GeometryIndex;
if (GeometryIndex == Output.ReferencedGeometries.Num())
{
Output.ReferencedGeometries.Add(InstanceDesc.GeometryRHI);
}
if (bGpuSceneInstance)
{
check(InstanceDesc.Transforms.IsEmpty());
Output.BaseUploadBufferOffsets[InstanceIndex] = GPUGroupHelper.NumInstanceDescriptors;
Output.NumNativeGPUSceneInstances += InstanceDesc.NumTransforms;
}
else if (bCpuInstance)
{
Output.BaseUploadBufferOffsets[InstanceIndex] = CPUGroupHelper.NumInstanceDescriptors;
Output.NumNativeCPUInstances += InstanceDesc.NumTransforms;
}
else
{
checkNoEntry();
}
Output.BaseInstancePrefixSum[InstanceIndex] = NumNativeInstances;
NumNativeInstances += InstanceDesc.NumTransforms;
const bool bUseUniqueUserData = InstanceDesc.UserData.Num() != 0;
if (bGpuSceneInstance)
{
Output.InstanceGroupEntryRefs[InstanceIndex].GroupIndex = GPUGroupHelper.CurrentGroupIndex;
Output.InstanceGroupEntryRefs[InstanceIndex].BaseIndexInGroup = GPUGroupHelper.CurrentIndexInGroup;
const bool bRLECompatible = bRLEAllowed && (InstanceDesc.BaseInstanceSceneDataOffset != -1) && !bUseUniqueUserData;
GPUGroupHelper.AddInstances(InstanceDesc.NumTransforms, bRLECompatible);
}
else
{
Output.InstanceGroupEntryRefs[InstanceIndex].GroupIndex = CPUGroupHelper.CurrentGroupIndex;
Output.InstanceGroupEntryRefs[InstanceIndex].BaseIndexInGroup = CPUGroupHelper.CurrentIndexInGroup;
const bool bRLECompatible = bRLEAllowed && !bUseUniqueUserData;
CPUGroupHelper.AddInstances(InstanceDesc.NumTransforms, bRLECompatible);
}
}
Output.NumGPUInstanceGroups = GPUGroupHelper.CurrentGroupIndex + (GPUGroupHelper.CurrentIndexInGroup > 0 ? 1 : 0);
Output.NumCPUInstanceGroups = CPUGroupHelper.CurrentGroupIndex + (CPUGroupHelper.CurrentIndexInGroup > 0 ? 1 : 0);
Output.NumGPUInstanceDescriptors = GPUGroupHelper.NumInstanceDescriptors;
Output.NumCPUInstanceDescriptors = CPUGroupHelper.NumInstanceDescriptors;
return MoveTemp(Output);
}
void WriteInstanceDescriptor(
const FRayTracingGeometryInstance& SceneInstance,
uint32 SceneInstanceIndex,
uint32 TransformIndex,
uint32 AccelerationStructureIndex,
bool bGpuSceneInstance,
bool bUseUniqueUserData,
uint32 BaseInstanceIndex,
uint32 BaseTransformIndex,
FRayTracingInstanceDescriptor& OutInstanceDescriptor)
{
FRayTracingInstanceDescriptor InstanceDesc;
if (bGpuSceneInstance)
{
if (SceneInstance.BaseInstanceSceneDataOffset != -1)
{
InstanceDesc.GPUSceneInstanceOrTransformIndex = SceneInstance.BaseInstanceSceneDataOffset + TransformIndex;
}
else
{
InstanceDesc.GPUSceneInstanceOrTransformIndex = SceneInstance.InstanceSceneDataOffsets[TransformIndex];
}
}
else
{
InstanceDesc.GPUSceneInstanceOrTransformIndex = BaseTransformIndex + TransformIndex;
}
uint32 UserData;
if (bUseUniqueUserData)
{
UserData = SceneInstance.UserData[TransformIndex];
}
else
{
UserData = SceneInstance.DefaultUserData;
if (SceneInstance.bIncrementUserDataPerInstance)
{
UserData += TransformIndex;
}
}
InstanceDesc.OutputDescriptorIndex = BaseInstanceIndex + TransformIndex;
InstanceDesc.AccelerationStructureIndex = AccelerationStructureIndex;
InstanceDesc.InstanceId = UserData;
InstanceDesc.InstanceMaskAndFlags = SceneInstance.Mask | ((uint32)SceneInstance.Flags << 8);
InstanceDesc.InstanceContributionToHitGroupIndex = SceneInstance.InstanceContributionToHitGroupIndex;
InstanceDesc.SceneInstanceIndexAndApplyLocalBoundsTransform = (SceneInstance.bApplyLocalBoundsTransform ? 0x80000000 : 0) | SceneInstanceIndex;
ensureMsgf(InstanceDesc.InstanceId <= 0xFFFFFF, TEXT("InstanceId must fit in 24 bits."));
ensureMsgf(InstanceDesc.InstanceContributionToHitGroupIndex <= 0xFFFFFF, TEXT("InstanceContributionToHitGroupIndex must fit in 24 bits."));
// copy at the end to avoid reading from OutInstanceDescriptor in the checks above
OutInstanceDescriptor = InstanceDesc;
}
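// Illustrative consumer-side unpacking, mirroring the packing above (assumed; the actual decode lives in the shader):
//   bApplyLocalBoundsTransform = (SceneInstanceIndexAndApplyLocalBoundsTransform & 0x80000000u) != 0
//   SceneInstanceIndex         =  SceneInstanceIndexAndApplyLocalBoundsTransform & 0x7FFFFFFFu
//   Mask                       =  InstanceMaskAndFlags & 0xFFu
//   Flags                      = (InstanceMaskAndFlags >> 8) & 0xFFu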
// Helper function to fill upload buffers required by BuildRayTracingInstanceBuffer with instance descriptors
// Transforms of CPU instances are copied to OutTransformData
void FillRayTracingInstanceUploadBuffer(
FVector PreViewTranslation,
TConstArrayView<FRayTracingGeometryInstance> Instances,
TConstArrayView<uint32> InstanceGeometryIndices,
TConstArrayView<uint32> BaseUploadBufferOffsets,
TConstArrayView<uint32> BaseInstancePrefixSum,
TConstArrayView<FRayTracingInstanceGroupEntryRef> InstanceGroupEntryRefs,
uint32 NumGPUInstanceGroups,
uint32 NumCPUInstanceGroups,
uint32 NumGPUInstanceDescriptors,
uint32 NumCPUInstanceDescriptors,
TArrayView<FRayTracingInstanceGroup> OutInstanceGroupUploadData,
TArrayView<FRayTracingInstanceDescriptor> OutInstanceUploadData,
TArrayView<FVector4f> OutTransformData)
{
TRACE_CPUPROFILER_EVENT_SCOPE(FillRayTracingInstanceUploadBuffer);
const bool bRLEAllowed = CVarRayTracingInstanceBufferRLE.GetValueOnRenderThread();
const int32 NumSceneInstances = Instances.Num();
const int32 MinBatchSize = 128;
ParallelFor(
TEXT("FillRayTracingInstanceUploadBuffer_Parallel"),
NumSceneInstances,
MinBatchSize,
[
OutInstanceGroupUploadData,
OutInstanceUploadData,
OutTransformData,
NumGPUInstanceGroups,
NumCPUInstanceGroups,
NumGPUInstanceDescriptors,
NumCPUInstanceDescriptors,
Instances,
InstanceGeometryIndices,
BaseUploadBufferOffsets,
BaseInstancePrefixSum,
InstanceGroupEntryRefs,
PreViewTranslation,
bRLEAllowed
](int32 SceneInstanceIndex)
{
const FRayTracingGeometryInstance& SceneInstance = Instances[SceneInstanceIndex];
const uint32 NumTransforms = SceneInstance.NumTransforms;
checkf(SceneInstance.UserData.Num() == 0 || SceneInstance.UserData.Num() >= int32(NumTransforms),
TEXT("User data array must be either be empty (Instance.DefaultUserData is used), or contain one entry per entry in Transforms array."));
const bool bUseUniqueUserData = SceneInstance.UserData.Num() != 0;
const bool bGpuSceneInstance = SceneInstance.BaseInstanceSceneDataOffset != -1 || !SceneInstance.InstanceSceneDataOffsets.IsEmpty();
const bool bCpuInstance = !bGpuSceneInstance;
checkf(bGpuSceneInstance + bCpuInstance == 1, TEXT("Instance can only get transforms from either GPUScene or the Transforms array."));
const uint32 AccelerationStructureIndex = InstanceGeometryIndices[SceneInstanceIndex];
const uint32 BaseInstanceIndex = BaseInstancePrefixSum[SceneInstanceIndex];
const uint32 BaseTransformIndex = bCpuInstance ? BaseUploadBufferOffsets[SceneInstanceIndex] : 0;
uint32 BaseDescriptorIndex = BaseUploadBufferOffsets[SceneInstanceIndex];
uint32 BaseDescriptorOffset = 0;
// Upload buffer is split into 2 sections [GPUSceneInstances][CPUInstances]
if (!bGpuSceneInstance)
{
BaseDescriptorOffset += NumGPUInstanceDescriptors;
}
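// Note: must match the bRLECompatible computation in BuildRayTracingSceneInitializationData,
// otherwise the group/descriptor offsets computed there would not line up with what is written here.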
const bool bRLECompatible = bRLEAllowed && (!bGpuSceneInstance || SceneInstance.BaseInstanceSceneDataOffset != -1) && !bUseUniqueUserData;
const FRayTracingInstanceGroupEntryRef& GroupEntryRef = InstanceGroupEntryRefs[SceneInstanceIndex];
uint32 GroupIndex = GroupEntryRef.GroupIndex;
uint32 BaseIndexInGroup = GroupEntryRef.BaseIndexInGroup;
if (bCpuInstance)
{
GroupIndex += NumGPUInstanceGroups;
}
uint32 TransformIndex = 0;
if (BaseIndexInGroup > 0)
{
// write N instances to fill (partial) head group
const uint32 N = FMath::Min(GRayTracingInstanceGroupSize - BaseIndexInGroup, NumTransforms);
for (; TransformIndex < N; ++TransformIndex)
{
WriteInstanceDescriptor(
SceneInstance,
SceneInstanceIndex,
TransformIndex,
AccelerationStructureIndex,
bGpuSceneInstance,
bUseUniqueUserData,
BaseInstanceIndex,
BaseTransformIndex,
OutInstanceUploadData[BaseDescriptorOffset + BaseDescriptorIndex]);
++BaseDescriptorIndex;
}
++GroupIndex;
}
if (bRLECompatible)
{
const uint32 NumPackedGroups = (NumTransforms - TransformIndex) / GRayTracingInstanceGroupSize;
// write packed groups
for (uint32 PackedGroupIndex = 0; PackedGroupIndex < NumPackedGroups; ++PackedGroupIndex)
{
// write packed group
FRayTracingInstanceGroup Group;
Group.BaseInstanceIndex = BaseDescriptorIndex;
Group.bIncrementUserDataPerInstance = SceneInstance.bIncrementUserDataPerInstance ? 1 : 0;
Group.bReuseInstance = 1;
OutInstanceGroupUploadData[GroupIndex] = Group;
++GroupIndex;
// and corresponding instance
WriteInstanceDescriptor(
SceneInstance,
SceneInstanceIndex,
TransformIndex,
AccelerationStructureIndex,
bGpuSceneInstance,
bUseUniqueUserData,
BaseInstanceIndex,
BaseTransformIndex,
OutInstanceUploadData[BaseDescriptorOffset + BaseDescriptorIndex]);
++BaseDescriptorIndex;
TransformIndex += GRayTracingInstanceGroupSize;
}
}
if (TransformIndex < NumTransforms)
{
// write tail groups (not packed)
const uint32 NumTailGroups = FMath::DivideAndRoundUp(NumTransforms - TransformIndex, GRayTracingInstanceGroupSize);
for (uint32 TailGroupIndex = 0; TailGroupIndex < NumTailGroups; ++TailGroupIndex)
{
FRayTracingInstanceGroup Group;
Group.BaseInstanceIndex = BaseDescriptorIndex + TailGroupIndex * GRayTracingInstanceGroupSize;
Group.bIncrementUserDataPerInstance = 0;
Group.bReuseInstance = 0;
OutInstanceGroupUploadData[GroupIndex] = Group;
++GroupIndex;
}
// and instances
for (; TransformIndex < NumTransforms; ++TransformIndex)
{
WriteInstanceDescriptor(
SceneInstance,
SceneInstanceIndex,
TransformIndex,
AccelerationStructureIndex,
bGpuSceneInstance,
bUseUniqueUserData,
BaseInstanceIndex,
BaseTransformIndex,
OutInstanceUploadData[BaseDescriptorOffset + BaseDescriptorIndex]);
++BaseDescriptorIndex;
}
}
if (bCpuInstance)
{
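// CPU transforms are uploaded as 3x4 row-major matrices: three FVector4f rows of the
// transposed 4x4 local-to-translated-world transform (the translation ends up in .w).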
for (uint32 TransformIndex2 = 0; TransformIndex2 < NumTransforms; ++TransformIndex2)
{
const uint32 TransformDataOffset = (BaseTransformIndex + TransformIndex2) * 3;
FMatrix LocalToTranslatedWorld = SceneInstance.Transforms[TransformIndex2].ConcatTranslation(PreViewTranslation);
const FMatrix44f LocalToTranslatedWorldF = FMatrix44f(LocalToTranslatedWorld.GetTransposed());
OutTransformData[TransformDataOffset + 0] = *(FVector4f*)&LocalToTranslatedWorldF.M[0];
OutTransformData[TransformDataOffset + 1] = *(FVector4f*)&LocalToTranslatedWorldF.M[1];
OutTransformData[TransformDataOffset + 2] = *(FVector4f*)&LocalToTranslatedWorldF.M[2];
}
}
});
}
void FillRayTracingInstanceUploadBuffer(
FRayTracingSceneRHIRef RayTracingSceneRHI,
FVector PreViewTranslation,
TConstArrayView<FRayTracingGeometryInstance> Instances,
TConstArrayView<uint32> InstanceGeometryIndices,
TConstArrayView<uint32> BaseUploadBufferOffsets,
TConstArrayView<uint32> BaseInstancePrefixSum,
uint32 NumNativeGPUSceneInstances,
uint32 NumNativeCPUInstances,
TArrayView<FRayTracingInstanceDescriptor> OutInstanceUploadData,
TArrayView<FVector4f> OutTransformData)
{
FillRayTracingInstanceUploadBuffer(
PreViewTranslation,
Instances,
InstanceGeometryIndices,
BaseUploadBufferOffsets,
BaseInstancePrefixSum,
{},
0,
0,
NumNativeGPUSceneInstances,
NumNativeCPUInstances,
{},
OutInstanceUploadData,
OutTransformData);
}
struct FRayTracingBuildInstanceBufferCS : public FGlobalShader
{
DECLARE_GLOBAL_SHADER(FRayTracingBuildInstanceBufferCS);
SHADER_USE_PARAMETER_STRUCT(FRayTracingBuildInstanceBufferCS, FGlobalShader);
BEGIN_SHADER_PARAMETER_STRUCT(FParameters, )
SHADER_PARAMETER_STRUCT_INCLUDE(FGPUSceneResourceParametersRHI, GPUSceneParameters)
SHADER_PARAMETER_UAV(RWStructuredBuffer, OutPlatformInstanceDescriptors)
SHADER_PARAMETER_SRV(StructuredBuffer, InstanceGroupDescriptors)
SHADER_PARAMETER_SRV(StructuredBuffer, InstanceDescriptors)
SHADER_PARAMETER_SRV(ByteAddressBuffer, AccelerationStructureAddresses)
SHADER_PARAMETER_SRV(StructuredBuffer, InstanceTransforms)
SHADER_PARAMETER(uint32, MaxNumInstances)
SHADER_PARAMETER(uint32, NumGroups)
SHADER_PARAMETER(uint32, NumInstanceDescriptors)
SHADER_PARAMETER(uint32, BaseGroupDescriptorIndex)
SHADER_PARAMETER(uint32, BaseInstanceDescriptorIndex)
SHADER_PARAMETER(FVector3f, PreViewTranslationHigh)
SHADER_PARAMETER(FVector3f, PreViewTranslationLow)
// Instance culling params
SHADER_PARAMETER(float, CullingRadius)
SHADER_PARAMETER(float, FarFieldCullingRadius)
SHADER_PARAMETER(float, AngleThresholdRatioSq)
SHADER_PARAMETER(FVector3f, ViewOrigin)
SHADER_PARAMETER(uint32, CullingMode)
SHADER_PARAMETER(uint32, CullUsingGroups)
SHADER_PARAMETER_UAV(RWStructuredBuffer<uint>, RWOutputStats)
SHADER_PARAMETER(uint32, OutputStatsOffset)
// Debug parameters
SHADER_PARAMETER_UAV(RWStructuredBuffer, RWInstanceExtraData)
END_SHADER_PARAMETER_STRUCT()
class FSupportInstanceGroupsDim : SHADER_PERMUTATION_BOOL("SUPPORT_INSTANCE_GROUPS");
class FUseGPUSceneDim : SHADER_PERMUTATION_BOOL("USE_GPUSCENE");
class FOutputInstanceExtraDataDim : SHADER_PERMUTATION_BOOL("OUTPUT_INSTANCE_EXTRA_DATA");
class FGpuCullingDim : SHADER_PERMUTATION_BOOL("GPU_CULLING");
class FOutputStatsDim : SHADER_PERMUTATION_BOOL("OUTPUT_STATS");
class FCompactOutputDim : SHADER_PERMUTATION_BOOL("COMPACT_OUTPUT");
class FUseWaveOpsDim : SHADER_PERMUTATION_BOOL("USE_WAVE_OPS");
using FPermutationDomain = TShaderPermutationDomain<FSupportInstanceGroupsDim, FUseGPUSceneDim, FOutputInstanceExtraDataDim, FGpuCullingDim, FOutputStatsDim, FCompactOutputDim, FUseWaveOpsDim>;
static constexpr uint32 ThreadGroupSize = GRayTracingInstanceGroupSize;
static inline void ModifyCompilationEnvironment(const FGlobalShaderPermutationParameters& Parameters, FShaderCompilerEnvironment& OutEnvironment)
{
FGlobalShader::ModifyCompilationEnvironment(Parameters, OutEnvironment);
OutEnvironment.SetDefine(TEXT("THREADGROUP_SIZE"), ThreadGroupSize);
OutEnvironment.SetDefine(TEXT("VF_SUPPORTS_PRIMITIVE_SCENE_DATA"), 1);
OutEnvironment.SetDefine(TEXT("USE_GLOBAL_GPU_SCENE_DATA"), 1);
// Force DXC to avoid shader reflection issues.
OutEnvironment.CompilerFlags.Add(CFLAG_ForceDXC);
}
static bool ShouldCompilePermutation(const FGlobalShaderPermutationParameters& Parameters)
{
FPermutationDomain PermutationVector(Parameters.PermutationId);
if (PermutationVector.Get<FUseWaveOpsDim>() && !RHISupportsWaveOperations(Parameters.Platform))
{
return false;
}
return IsRayTracingEnabledForProject(Parameters.Platform);
}
};
IMPLEMENT_GLOBAL_SHADER(FRayTracingBuildInstanceBufferCS, "/Engine/Private/Raytracing/RayTracingInstanceBufferUtil.usf", "RayTracingBuildInstanceBufferCS", SF_Compute);
void BuildRayTracingInstanceBuffer(
FRHICommandList& RHICmdList,
const FGPUScene* GPUScene,
const FDFVector3& PreViewTranslation,
uint32 MaxNumInstances,
uint32 NumGroups,
uint32 NumInstanceDescriptors,
FRHIUnorderedAccessView* InstancesUAV,
FRHIShaderResourceView* InstanceGroupUploadSRV,
uint32 InstanceGroupUploadOffset,
FRHIShaderResourceView* InstanceUploadSRV,
uint32 InstanceUploadOffset,
FRHIShaderResourceView* AccelerationStructureAddressesSRV,
FRHIShaderResourceView* InstanceTransformSRV,
const FRayTracingCullingParameters* CullingParameters,
bool bCompactOutput,
FRHIUnorderedAccessView* OutputStatsUAV,
uint32 OutputStatsOffset,
FRHIUnorderedAccessView* InstanceExtraDataUAV)
{
FRayTracingBuildInstanceBufferCS::FParameters PassParams;
PassParams.OutPlatformInstanceDescriptors = InstancesUAV;
PassParams.InstanceGroupDescriptors = InstanceGroupUploadSRV;
PassParams.InstanceDescriptors = InstanceUploadSRV;
PassParams.AccelerationStructureAddresses = AccelerationStructureAddressesSRV;
PassParams.InstanceTransforms = InstanceTransformSRV;
PassParams.MaxNumInstances = MaxNumInstances;
PassParams.NumGroups = NumGroups;
PassParams.NumInstanceDescriptors = NumInstanceDescriptors;
PassParams.BaseGroupDescriptorIndex = InstanceGroupUploadOffset;
PassParams.BaseInstanceDescriptorIndex = InstanceUploadOffset;
PassParams.PreViewTranslationHigh = PreViewTranslation.High;
PassParams.PreViewTranslationLow = PreViewTranslation.Low;
if (GPUScene)
{
PassParams.GPUSceneParameters = GPUScene->GetShaderParametersRHI();
}
if (CullingParameters)
{
PassParams.CullingRadius = CullingParameters->CullingRadius;
PassParams.FarFieldCullingRadius = CullingParameters->FarFieldCullingRadius;
PassParams.AngleThresholdRatioSq = CullingParameters->AngleThresholdRatioSq;
PassParams.ViewOrigin = CullingParameters->TranslatedViewOrigin;
PassParams.CullingMode = uint32(CullingParameters->CullingMode);
PassParams.CullUsingGroups = uint32(CullingParameters->bCullUsingGroupIds);
}
PassParams.RWOutputStats = OutputStatsUAV;
PassParams.OutputStatsOffset = OutputStatsOffset;
PassParams.RWInstanceExtraData = InstanceExtraDataUAV;
FRayTracingBuildInstanceBufferCS::FPermutationDomain PermutationVector;
PermutationVector.Set<FRayTracingBuildInstanceBufferCS::FSupportInstanceGroupsDim>(InstanceGroupUploadSRV != nullptr);
PermutationVector.Set<FRayTracingBuildInstanceBufferCS::FUseGPUSceneDim>(InstanceTransformSRV == nullptr);
PermutationVector.Set<FRayTracingBuildInstanceBufferCS::FOutputInstanceExtraDataDim>(InstanceExtraDataUAV != nullptr);
PermutationVector.Set<FRayTracingBuildInstanceBufferCS::FOutputStatsDim>(OutputStatsUAV != nullptr);
PermutationVector.Set<FRayTracingBuildInstanceBufferCS::FCompactOutputDim>(bCompactOutput);
PermutationVector.Set<FRayTracingBuildInstanceBufferCS::FUseWaveOpsDim>(GRHISupportsWaveOperations);
PermutationVector.Set<FRayTracingBuildInstanceBufferCS::FGpuCullingDim>(CullingParameters != nullptr);
auto ComputeShader = GetGlobalShaderMap(GMaxRHIFeatureLevel)->GetShader<FRayTracingBuildInstanceBufferCS>(PermutationVector);
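// Wrap the 1D group count into 2D/3D dispatch dimensions so NumGroups can exceed the per-dimension dispatch limit.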
const FIntVector GroupCount = FComputeShaderUtils::GetGroupCountWrapped(PassParams.NumGroups);
//ClearUnusedGraphResources(ComputeShader, &PassParams);
SetComputePipelineState(RHICmdList, ComputeShader.GetComputeShader());
SetShaderParameters(RHICmdList, ComputeShader, ComputeShader.GetComputeShader(), PassParams);
DispatchComputeShader(RHICmdList, ComputeShader.GetShader(), GroupCount.X, GroupCount.Y, GroupCount.Z);
UnsetShaderUAVs(RHICmdList, ComputeShader, ComputeShader.GetComputeShader());
}
void BuildRayTracingInstanceBuffer(
FRHICommandList& RHICmdList,
const FGPUScene* GPUScene,
const FDFVector3& PreViewTranslation,
FRHIUnorderedAccessView* InstancesUAV,
FRHIShaderResourceView* InstanceGroupUploadSRV,
FRHIShaderResourceView* InstanceUploadSRV,
FRHIShaderResourceView* AccelerationStructureAddressesSRV,
FRHIShaderResourceView* CPUInstanceTransformSRV,
uint32 MaxNumInstances,
uint32 NumGPUGroups,
uint32 NumCPUGroups,
uint32 NumGPUInstanceDescriptors,
uint32 NumCPUInstanceDescriptors,
const FRayTracingCullingParameters* CullingParameters,
bool bCompactOutput,
FRHIUnorderedAccessView* OutputStatsUAV,
uint32 OutputStatsOffset,
FRHIUnorderedAccessView* InstanceExtraDataUAV)
{
if (NumGPUInstanceDescriptors > 0)
{
BuildRayTracingInstanceBuffer(
RHICmdList,
GPUScene,
PreViewTranslation,
MaxNumInstances,
NumGPUGroups,
NumGPUInstanceDescriptors,
InstancesUAV,
InstanceGroupUploadSRV,
/*InstanceGroupUploadOffset*/ 0,
InstanceUploadSRV,
/*InstanceUploadOffset*/ 0,
AccelerationStructureAddressesSRV,
/*InstanceTransformSRV*/ nullptr, // GPUScene instances fetch their transforms from GPUScene
CullingParameters,
bCompactOutput,
OutputStatsUAV,
OutputStatsOffset,
InstanceExtraDataUAV);
}
if (NumCPUInstanceDescriptors > 0)
{
BuildRayTracingInstanceBuffer(
RHICmdList,
GPUScene,
PreViewTranslation,
MaxNumInstances,
NumCPUGroups,
NumCPUInstanceDescriptors,
InstancesUAV,
InstanceGroupUploadSRV,
NumGPUGroups, // CPU instance group descriptors are stored after GPU Scene instance groups
InstanceUploadSRV,
NumGPUInstanceDescriptors, // CPU input instance descriptors are stored after GPU Scene instances
AccelerationStructureAddressesSRV,
CPUInstanceTransformSRV,
/*CullingParameters*/ nullptr, // CPU instances are not GPU-culled in this dispatch
bCompactOutput,
OutputStatsUAV,
OutputStatsOffset,
InstanceExtraDataUAV);
}
}
void BuildRayTracingInstanceBuffer(
FRHICommandList& RHICmdList,
const FGPUScene* GPUScene,
const FDFVector3& PreViewTranslation,
FRHIUnorderedAccessView* InstancesUAV,
FRHIShaderResourceView* InstanceUploadSRV,
FRHIShaderResourceView* AccelerationStructureAddressesSRV,
FRHIShaderResourceView* CPUInstanceTransformSRV,
uint32 NumNativeGPUSceneInstances,
uint32 NumNativeCPUInstances,
const FRayTracingCullingParameters* CullingParameters,
FRHIUnorderedAccessView* OutputStatsUAV,
FRHIUnorderedAccessView* InstanceExtraDataUAV)
{
BuildRayTracingInstanceBuffer(
RHICmdList,
GPUScene,
PreViewTranslation,
InstancesUAV,
/*InstanceGroupUploadSRV*/ nullptr,
InstanceUploadSRV,
AccelerationStructureAddressesSRV,
CPUInstanceTransformSRV,
/*MaxNumInstances*/ NumNativeGPUSceneInstances + NumNativeCPUInstances,
/*NumGPUGroups*/ 0,
/*NumCPUGroups*/ 0,
NumNativeGPUSceneInstances,
NumNativeCPUInstances,
CullingParameters,
/*bCompactOutput*/ false,
OutputStatsUAV,
/*OutputStatsOffset*/ 0,
InstanceExtraDataUAV);
}
void FRayTracingInstanceBufferBuilder::Init(TConstArrayView<FRayTracingGeometryInstance> InInstances, FVector InPreViewTranslation)
{
Instances = InInstances;
PreViewTranslation = InPreViewTranslation;
Data = BuildRayTracingSceneInitializationData(Instances);
}
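// Minimal usage sketch for this builder (inferred from this file; resource creation,
// transitions and the final TLAS build are assumed to happen elsewhere):
//   FRayTracingInstanceBufferBuilder Builder;
//   Builder.Init(Instances, PreViewTranslation);
//   Builder.FillRayTracingInstanceUploadBuffer(RHICmdList);
//   Builder.FillAccelerationStructureAddressesBuffer(RHICmdList);
//   Builder.BuildRayTracingInstanceBuffer(
//       RHICmdList, GPUScene, /*CullingParameters*/ nullptr, InstancesUAV, MaxNumInstances,
//       /*bCompactOutput*/ false, /*OutputStatsUAV*/ nullptr, /*OutputStatsOffset*/ 0, /*InstanceExtraDataUAV*/ nullptr);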
void FRayTracingInstanceBufferBuilder::FillRayTracingInstanceUploadBuffer(FRHICommandList& RHICmdList)
{
// Round up buffer sizes to some multiple to avoid pathological growth reallocations.
static constexpr uint32 AllocationGranularity = 8 * 1024;
static constexpr uint64 BufferAllocationGranularity = 16 * 1024 * 1024;
const uint32 NumInstanceGroups = Data.NumGPUInstanceGroups + Data.NumCPUInstanceGroups;
const uint32 NumInstanceGroupsAligned = FMath::DivideAndRoundUp(FMath::Max(NumInstanceGroups, 1U), AllocationGranularity) * AllocationGranularity;
const uint32 NumInstanceDescriptors = Data.NumGPUInstanceDescriptors + Data.NumCPUInstanceDescriptors;
const uint32 NumInstanceDescriptorsAligned = FMath::DivideAndRoundUp(FMath::Max(NumInstanceDescriptors, 1U), AllocationGranularity) * AllocationGranularity;
const uint32 NumTransformsAligned = FMath::DivideAndRoundUp(FMath::Max(Data.NumNativeCPUInstances, 1U), AllocationGranularity) * AllocationGranularity;
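// Example of the grow/shrink hysteresis below (illustrative): 9000 descriptors round up to
// 2 * 8192 = 16384 entries; the buffer is then only recreated when the required size exceeds
// the current size or drops below half of it, avoiding reallocation churn for small fluctuations.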
{
// Create/resize instance group upload buffer (if necessary)
const uint32 UploadBufferSize = NumInstanceGroupsAligned * sizeof(FRayTracingInstanceGroup);
if (!InstanceGroupUploadBuffer.IsValid()
|| UploadBufferSize > InstanceGroupUploadBuffer->GetSize()
|| UploadBufferSize < InstanceGroupUploadBuffer->GetSize() / 2)
{
const FRHIBufferCreateDesc CreateDesc =
FRHIBufferCreateDesc::CreateStructured(TEXT("FRayTracingScene::InstanceGroupUploadBuffer"), UploadBufferSize, sizeof(FRayTracingInstanceGroup))
.AddUsage(EBufferUsageFlags::ShaderResource | EBufferUsageFlags::Volatile)
.DetermineInitialState();
InstanceGroupUploadBuffer = RHICmdList.CreateBuffer(CreateDesc);
InstanceGroupUploadSRV = RHICmdList.CreateShaderResourceView(InstanceGroupUploadBuffer, FRHIViewDesc::CreateBufferSRV().SetTypeFromBuffer(InstanceGroupUploadBuffer));
}
}
{
// Create/resize instance upload buffer (if necessary)
const uint32 UploadBufferSize = NumInstanceDescriptorsAligned * sizeof(FRayTracingInstanceDescriptor);
if (!InstanceUploadBuffer.IsValid()
|| UploadBufferSize > InstanceUploadBuffer->GetSize()
|| UploadBufferSize < InstanceUploadBuffer->GetSize() / 2)
{
const FRHIBufferCreateDesc CreateDesc =
FRHIBufferCreateDesc::CreateStructured(TEXT("FRayTracingScene::InstanceUploadBuffer"), UploadBufferSize, sizeof(FRayTracingInstanceDescriptor))
.AddUsage(EBufferUsageFlags::ShaderResource | EBufferUsageFlags::Volatile)
.DetermineInitialState();
InstanceUploadBuffer = RHICmdList.CreateBuffer(CreateDesc);
InstanceUploadSRV = RHICmdList.CreateShaderResourceView(InstanceUploadBuffer, FRHIViewDesc::CreateBufferSRV().SetTypeFromBuffer(InstanceUploadBuffer));
}
}
{
const uint32 UploadBufferSize = NumTransformsAligned * sizeof(FVector4f) * 3;
// Create/resize transform upload buffer (if necessary)
if (!TransformUploadBuffer.IsValid()
|| UploadBufferSize > TransformUploadBuffer->GetSize()
|| UploadBufferSize < TransformUploadBuffer->GetSize() / 2)
{
const FRHIBufferCreateDesc CreateDesc =
FRHIBufferCreateDesc::CreateStructured(TEXT("FRayTracingScene::TransformUploadBuffer"), UploadBufferSize, sizeof(FVector4f))
.AddUsage(EBufferUsageFlags::ShaderResource | EBufferUsageFlags::Volatile)
.DetermineInitialState();
TransformUploadBuffer = RHICmdList.CreateBuffer(CreateDesc);
TransformUploadSRV = RHICmdList.CreateShaderResourceView(TransformUploadBuffer, FRHIViewDesc::CreateBufferSRV().SetTypeFromBuffer(TransformUploadBuffer));
}
}
const uint32 InstanceGroupUploadBytes = NumInstanceGroups * sizeof(FRayTracingInstanceGroup);
const uint32 InstanceUploadBytes = NumInstanceDescriptors * sizeof(FRayTracingInstanceDescriptor);
const uint32 TransformUploadBytes = Data.NumNativeCPUInstances * 3 * sizeof(FVector4f);
FRayTracingInstanceGroup* InstanceGroupUploadData = (FRayTracingInstanceGroup*)RHICmdList.LockBuffer(InstanceGroupUploadBuffer, 0, InstanceGroupUploadBytes, RLM_WriteOnly);
FRayTracingInstanceDescriptor* InstanceUploadData = (FRayTracingInstanceDescriptor*)RHICmdList.LockBuffer(InstanceUploadBuffer, 0, InstanceUploadBytes, RLM_WriteOnly);
FVector4f* TransformUploadData = (Data.NumNativeCPUInstances > 0) ? (FVector4f*)RHICmdList.LockBuffer(TransformUploadBuffer, 0, TransformUploadBytes, RLM_WriteOnly) : nullptr;
::FillRayTracingInstanceUploadBuffer(
PreViewTranslation,
Instances,
Data.InstanceGeometryIndices,
Data.BaseUploadBufferOffsets,
Data.BaseInstancePrefixSum,
Data.InstanceGroupEntryRefs,
Data.NumGPUInstanceGroups,
Data.NumCPUInstanceGroups,
Data.NumGPUInstanceDescriptors,
Data.NumCPUInstanceDescriptors,
MakeArrayView(InstanceGroupUploadData, NumInstanceGroups),
MakeArrayView(InstanceUploadData, NumInstanceDescriptors),
MakeArrayView(TransformUploadData, Data.NumNativeCPUInstances * 3));
RHICmdList.UnlockBuffer(InstanceGroupUploadBuffer);
RHICmdList.UnlockBuffer(InstanceUploadBuffer);
if (Data.NumNativeCPUInstances > 0)
{
RHICmdList.UnlockBuffer(TransformUploadBuffer);
}
}
void FRayTracingInstanceBufferBuilder::FillAccelerationStructureAddressesBuffer(FRHICommandList& RHICmdList)
{
const uint32 NumGeometries = Data.ReferencedGeometries.Num();
{
// Round to PoT to avoid resizing too often
const uint32 NumGeometriesAligned = FMath::RoundUpToPowerOfTwo(NumGeometries);
const uint32 AccelerationStructureAddressesBufferSize = NumGeometriesAligned * sizeof(FRayTracingAccelerationStructureAddress);
if (AccelerationStructureAddressesBuffer.NumBytes < AccelerationStructureAddressesBufferSize)
{
// Need to pass "BUF_MultiGPUAllocate", as virtual addresses are different per GPU
AccelerationStructureAddressesBuffer.Initialize(RHICmdList, TEXT("FRayTracingScene::AccelerationStructureAddressesBuffer"), AccelerationStructureAddressesBufferSize, BUF_Volatile | BUF_MultiGPUAllocate);
}
}
for (uint32 GPUIndex : RHICmdList.GetGPUMask())
{
FRayTracingAccelerationStructureAddress* AddressesPtr = (FRayTracingAccelerationStructureAddress*)RHICmdList.LockBufferMGPU(
AccelerationStructureAddressesBuffer.Buffer,
GPUIndex,
0,
NumGeometries * sizeof(FRayTracingAccelerationStructureAddress), RLM_WriteOnly);
const TArrayView<FRHIRayTracingGeometry*> ReferencedGeometries = RHICmdList.AllocArray(MakeConstArrayView(Data.ReferencedGeometries));
RHICmdList.EnqueueLambda([AddressesPtr, ReferencedGeometries, GPUIndex](FRHICommandListBase&)
{
TRACE_CPUPROFILER_EVENT_SCOPE(GetAccelerationStructuresAddresses);
for (int32 GeometryIndex = 0; GeometryIndex < ReferencedGeometries.Num(); ++GeometryIndex)
{
AddressesPtr[GeometryIndex] = ReferencedGeometries[GeometryIndex]->GetAccelerationStructureAddress(GPUIndex);
}
});
RHICmdList.UnlockBufferMGPU(AccelerationStructureAddressesBuffer.Buffer, GPUIndex);
}
}
void FRayTracingInstanceBufferBuilder::BuildRayTracingInstanceBuffer(
FRHICommandList& RHICmdList,
const FGPUScene* GPUScene,
const FRayTracingCullingParameters* CullingParameters,
FRHIUnorderedAccessView* InstancesUAV,
uint32 MaxNumInstances,
bool bCompactOutput,
FRHIUnorderedAccessView* OutputStatsUAV,
uint32 OutputStatsOffset,
FRHIUnorderedAccessView* InstanceExtraDataUAV)
{
::BuildRayTracingInstanceBuffer(
RHICmdList,
GPUScene,
FDFVector3(PreViewTranslation),
InstancesUAV,
InstanceGroupUploadSRV,
InstanceUploadSRV,
AccelerationStructureAddressesBuffer.SRV,
TransformUploadSRV,
MaxNumInstances,
Data.NumGPUInstanceGroups,
Data.NumCPUInstanceGroups,
Data.NumGPUInstanceDescriptors,
Data.NumCPUInstanceDescriptors,
CullingParameters,
bCompactOutput,
OutputStatsUAV,
OutputStatsOffset,
InstanceExtraDataUAV);
}
PRAGMA_ENABLE_DEPRECATION_WARNINGS
#endif //RHI_RAYTRACING