183 lines
6.9 KiB
HLSL
183 lines
6.9 KiB
HLSL
// Copyright Epic Games, Inc. All Rights Reserved.
|
|
|
|
/*=============================================================================
|
|
MorphTargets.usf: Compute shader for calculating Morph Targets.
|
|
=============================================================================*/
|
|
|
|
|
|
#include "Common.ush"
|
|
#include "BitPacking.ush"
|
|
#include "ComputeShaderUtils.ush"
|
|
|
|
////////////////////////////////////////////////////////////////
|
|
|
|
// Number of entries in the per-morph-target tables. It sizes the packed uint4/float4 arrays (divided by 4)
// and LocalGroupOffsets, and half of it is the first step of the search in GroupIndexToMorphTargetLocalIndex.
#define GMorphTargetDispatchBatchSize 128
|
|
// Thread group width of GPUMorphUpdateCS.
#define BATCH_SIZE 64
|
|
// Size of one batch header in MorphDataBuffer, in 32-bit words (GetBatchHeader multiplies by 4 to get bytes).
#define NUM_BATCH_HEADER_DWORDS 10
|
|
// Compile-time upper bound passed to the bit stream reader when reading the vertex index field.
#define INDEX_MAX_BITS 31
|
|
// Compile-time upper bound passed to the bit stream reader for each position component.
#define POSITION_MAX_BITS 28
|
|
// Compile-time upper bound passed to the bit stream reader for each tangent Z component.
#define TANGENTZ_MAX_BITS 16
|
|
|
|
// Accumulation target, addressed as 6 uints per vertex (VertexIndex * 6):
// elements 0..2 hold the position delta, elements 3..5 hold the tangent Z delta.
// GPUMorphUpdateCS adds integer bit patterns into it; GPUMorphNormalizeCS overwrites them with float bit patterns.
RWBuffer<uint> MorphVertexBuffer;
|
|
// Read-only morph data: batch headers at BatchIndex * NUM_BATCH_HEADER_DWORDS * 4 bytes,
// and the bit-packed per-vertex deltas that each header's DataOffset refers to.
ByteAddressBuffer MorphDataBuffer;
|
|
|
|
// The three tables below store one scalar per morph target, packed four to a vector.
// Entry i is read as Table[i >> 2][i & 3].
//TODO: use a buffer instead of packing into constant buffer?
|
|
// Index of the first batch header belonging to each morph target.
uint4 MorphTargetBatchOffsets[GMorphTargetDispatchBatchSize / 4];
|
|
// Group offset of each morph target; copied into LocalGroupOffsets by LoadLocalData.
// NOTE(review): presumably the first dispatch group index of each morph target - confirm against the CPU side.
uint4 MorphTargetGroupOffsets[GMorphTargetDispatchBatchSize / 4];
|
|
// Blend weight applied to every delta of the corresponding morph target.
float4 MorphTargetWeights[GMorphTargetDispatchBatchSize / 4];
|
|
|
|
// xyz multiplies position deltas, w multiplies tangent Z deltas.
// GPUMorphUpdateCS applies it before rounding to integers; GPUMorphNormalizeCS applies it after converting back to float.
// NOTE(review): presumably bound with different values for the two passes (scale vs. inverse scale) - confirm.
float4 PositionScale;
|
|
// x multiplies decoded position values, y multiplies decoded tangent Z values.
float2 Precision;
|
|
// GPUMorphUpdateCS groups whose unwrapped index is >= NumGroups exit without writing anything.
uint NumGroups;
|
|
|
|
// Group-shared, unpacked copy of MorphTargetGroupOffsets. Filled by LoadLocalData and read by
// GroupIndexToMorphTargetLocalIndex and GPUMorphUpdateCS.
groupshared uint LocalGroupOffsets[GMorphTargetDispatchBatchSize];
|
|
|
|
// Unpacks MorphTargetGroupOffsets into the group-shared LocalGroupOffsets array.
// The first GMorphTargetDispatchBatchSize / 4 threads each copy one uint4 (four consecutive entries).
// GroupIndex is the calling thread's index within its group (the caller passes SV_GroupIndex).
// Ends with a group-wide barrier, so every thread of the group has to call this function.
void LoadLocalData(uint GroupIndex)
{
	const uint NumPackedVectors = GMorphTargetDispatchBatchSize / 4;

	if (GroupIndex < NumPackedVectors)
	{
		const uint4 PackedOffsets = MorphTargetGroupOffsets[GroupIndex];
		const uint FirstSlot = GroupIndex * 4;

		LocalGroupOffsets[FirstSlot + 0] = PackedOffsets.x;
		LocalGroupOffsets[FirstSlot + 1] = PackedOffsets.y;
		LocalGroupOffsets[FirstSlot + 2] = PackedOffsets.z;
		LocalGroupOffsets[FirstSlot + 3] = PackedOffsets.w;
	}

	// Make the stores above visible to the whole group before anyone reads LocalGroupOffsets.
	GroupMemoryBarrierWithGroupSync();
}
|
|
|
|
// Searches LocalGroupOffsets with halving steps (64, 32, ..., 1) and returns the accumulated index.
// A step is taken whenever the input value is >= the offset stored at the candidate index.
// Assumes LocalGroupOffsets is sorted in non-decreasing order - TODO confirm against the CPU side that fills it.
// Entry 0 is never compared, so 0 is returned when no step is taken.
// Note: despite the parameter name, GPUMorphUpdateCS passes the unwrapped dispatch group index here.
uint GroupIndexToMorphTargetLocalIndex(uint GroupThreadIndex)
{
	uint LocalIndex = 0;

	UNROLL
	for (uint Step = GMorphTargetDispatchBatchSize >> 1; Step > 0; Step = Step >> 1)
	{
		if (GroupThreadIndex >= LocalGroupOffsets[LocalIndex + Step])
		{
			LocalIndex += Step;
		}
	}

	return LocalIndex;
}
|
|
|
|
// Decoded form of one batch header from MorphDataBuffer (see GetBatchHeader).
struct FBatchHeader
|
|
{
|
|
// Passed to the bit stream reader as the start of this batch's packed element data.
uint DataOffset;
|
|
// Number of elements in the batch; threads with an index >= NumElements do no work.
uint NumElements;
|
|
// True when the batch also stores tangent Z deltas.
bool bTangents;
|
|
|
|
// Bit width of the packed vertex index field.
uint IndexBits;
|
|
// Bit width of each packed position component.
uint3 PositionBits;
|
|
// Bit width of each packed tangent Z component (stays zero when bTangents is false).
uint3 TangentZBits;
|
|
|
|
// Added to the decoded index field.
uint IndexMin;
|
|
// Added to the decoded position components.
int3 PositionMin;
|
|
// Added to the decoded tangent Z components (stays zero when bTangents is false).
int3 TangentZMin;
|
|
};
|
|
|
|
// Reads and decodes batch header number BatchIndex from MorphDataBuffer.
//
// Header layout as read here (32-bit words, NUM_BATCH_HEADER_DWORDS per header):
//   word 0      DataOffset
//   word 1      packed: IndexBits (bits 0-4), PositionBits.x/y/z (bits 5-9, 10-14, 15-19),
//               bTangents (bit 20), NumElements (bits 21-31)
//   word 2      IndexMin
//   words 3-5   PositionMin.xyz
//   word 6      packed: TangentZBits.x/y/z (bits 0-4, 5-9, 10-14)
//   words 7-9   TangentZMin.xyz
// Words 6-9 are only loaded when bTangents is set; otherwise the tangent fields stay zero.
// BitFieldExtractU32 is called as (value, bit count, bit offset), judging by the field positions used.
FBatchHeader GetBatchHeader(uint BatchIndex)
{
	const uint HeaderSizeInBytes = NUM_BATCH_HEADER_DWORDS * 4;
	const uint HeaderAddress = BatchIndex * HeaderSizeInBytes;

	const uint4 Words0To3 = MorphDataBuffer.Load4(HeaderAddress);
	const uint2 Words4To5 = MorphDataBuffer.Load2(HeaderAddress + 16);
	const uint PackedFields = Words0To3.y;

	FBatchHeader Result = (FBatchHeader)0;

	Result.DataOffset = Words0To3.x;
	Result.IndexMin = Words0To3.z;
	Result.PositionMin = int3(Words0To3.w, Words4To5);

	Result.IndexBits = BitFieldExtractU32(PackedFields, 5, 0);
	Result.PositionBits = uint3(
		BitFieldExtractU32(PackedFields, 5, 5),
		BitFieldExtractU32(PackedFields, 5, 10),
		BitFieldExtractU32(PackedFields, 5, 15));
	Result.bTangents = BitFieldExtractU32(PackedFields, 1, 20);
	Result.NumElements = PackedFields >> 21;

	if (Result.bTangents)
	{
		// TODO: Don't even store this. Need to handle two different batch header sizes.
		const uint4 Words6To9 = MorphDataBuffer.Load4(HeaderAddress + 24);
		const uint PackedTangentBits = Words6To9.x;

		Result.TangentZBits = uint3(
			BitFieldExtractU32(PackedTangentBits, 5, 0),
			BitFieldExtractU32(PackedTangentBits, 5, 5),
			BitFieldExtractU32(PackedTangentBits, 5, 10));
		Result.TangentZMin = int3(Words6To9.yzw);
	}

	return Result;
}
|
|
|
|
// Accumulates weighted morph target deltas into MorphVertexBuffer.
//
// Each thread group handles one batch and each thread handles one element of that batch:
//   1. The group offsets are unpacked into group-shared memory.
//   2. The unwrapped group index is mapped to a morph target slot, and from there to a batch header.
//   3. The thread decodes its element (vertex index, position delta, optional tangent Z delta)
//      from the bit-packed stream, scales it by the morph target weight and PositionScale,
//      rounds to integer and atomically adds it to the destination vertex.
// The values are accumulated as integer bit patterns; GPUMorphNormalizeCS converts them to floats afterwards.
[numthreads(BATCH_SIZE, 1, 1)]
void GPUMorphUpdateCS(uint3 GroupId : SV_GroupID, uint GroupThreadIndex : SV_GroupIndex)
{
	// Contains a group-wide barrier, so it must run before the early-out below.
	LoadLocalData(GroupThreadIndex);

	const uint GroupIndex = GetUnWrappedDispatchGroupId(GroupId);
	if (GroupIndex >= NumGroups)
	{
		return;
	}

	const uint MorphTargetLocalIndex = GroupIndexToMorphTargetLocalIndex(GroupIndex);

	const uint BatchStartIndex = MorphTargetBatchOffsets[MorphTargetLocalIndex >> 2][MorphTargetLocalIndex & 3u]; //TODO: use a buffer instead of packing into constant buffer?

	// Index of this group relative to the first group of its morph target, which is also the batch index within that morph target.
	const uint GroupOffset = GroupIndex - LocalGroupOffsets[MorphTargetLocalIndex];
	const FBatchHeader BatchHeader = GetBatchHeader(BatchStartIndex + GroupOffset);

	if (GroupThreadIndex < BatchHeader.NumElements)
	{
		const float Weight = MorphTargetWeights[MorphTargetLocalIndex >> 2][MorphTargetLocalIndex & 3u]; //TODO: use a buffer instead of packing into constant buffer?

		// All elements of a batch have the same size, so element N starts N * StrideInBits bits into the batch data.
		const uint NumPositionBits = BatchHeader.PositionBits.x + BatchHeader.PositionBits.y + BatchHeader.PositionBits.z;
		const uint NumTangentBits = BatchHeader.TangentZBits.x + BatchHeader.TangentZBits.y + BatchHeader.TangentZBits.z;
		const uint StrideInBits = BatchHeader.IndexBits + NumPositionBits + (BatchHeader.bTangents ? NumTangentBits : 0);

		const uint CompileTimeMaxBits = INDEX_MAX_BITS + 3 * POSITION_MAX_BITS + 3 * TANGENTZ_MAX_BITS;
		FBitStreamReaderState InputStream = BitStreamReader_Create_Aligned(BatchHeader.DataOffset, GroupThreadIndex * StrideInBits, CompileTimeMaxBits);

		// The destination index is the stored value plus the batch minimum plus the element's position in the batch.
		const uint DestVertexIndex = BitStreamReader_Read_RO(MorphDataBuffer, InputStream, BatchHeader.IndexBits, INDEX_MAX_BITS) + BatchHeader.IndexMin + GroupThreadIndex;

		const int3 LocalPosition = BitStreamReader_Read3_RO(MorphDataBuffer, InputStream, BatchHeader.PositionBits, POSITION_MAX_BITS);
		const float3 PositionDelta = float3(LocalPosition + BatchHeader.PositionMin) * Precision.x;
		const int3 QuantizedPositionOffset = (int3)round(PositionDelta * Weight * PositionScale.xyz);

		// 6 uints per vertex: position in 0..2, tangent Z in 3..5.
		// Signed offsets are added through their unsigned bit patterns; the normalize pass reads them back as ints.
		const uint DestVertexAddress = DestVertexIndex * 6;
		InterlockedAdd(MorphVertexBuffer[DestVertexAddress + 0], asuint(QuantizedPositionOffset.x));
		InterlockedAdd(MorphVertexBuffer[DestVertexAddress + 1], asuint(QuantizedPositionOffset.y));
		InterlockedAdd(MorphVertexBuffer[DestVertexAddress + 2], asuint(QuantizedPositionOffset.z));

		if (BatchHeader.bTangents)
		{
			const int3 LocalTangent = BitStreamReader_Read3_RO(MorphDataBuffer, InputStream, BatchHeader.TangentZBits, TANGENTZ_MAX_BITS);
			const float3 TangentZDelta = float3(LocalTangent + BatchHeader.TangentZMin) * Precision.y;
			const int3 QuantizedTangentZOffset = (int3)round(TangentZDelta * Weight * PositionScale.w);
			InterlockedAdd(MorphVertexBuffer[DestVertexAddress + 3], asuint(QuantizedTangentZOffset.x));
			InterlockedAdd(MorphVertexBuffer[DestVertexAddress + 4], asuint(QuantizedTangentZOffset.y));
			InterlockedAdd(MorphVertexBuffer[DestVertexAddress + 5], asuint(QuantizedTangentZOffset.z));
		}
	}
}
|
|
|
|
// GPUMorphNormalizeCS threads whose vertex index is >= NumVertices exit without touching MorphVertexBuffer.
uint NumVertices;
|
|
|
|
// Converts the integer values accumulated in MorphVertexBuffer into floats, in place.
// One thread per vertex: the six uints of the vertex are reinterpreted as signed integers,
// converted to float, multiplied by PositionScale (xyz for position, w for tangent Z)
// and written back as float bit patterns.
[numthreads(64, 1, 1)]
void GPUMorphNormalizeCS(uint3 GroupId : SV_GroupID, uint GroupThreadIndex : SV_GroupIndex)
{
	const uint DestVertexIndex = GetUnWrappedDispatchThreadId(GroupId, GroupThreadIndex, 64);

	if (DestVertexIndex < NumVertices)
	{
		// 6 uints per vertex: position in 0..2, tangent Z in 3..5.
		const uint VertexBase = DestVertexIndex * 6;

		const int3 AccumulatedPosition = int3(MorphVertexBuffer[VertexBase + 0], MorphVertexBuffer[VertexBase + 1], MorphVertexBuffer[VertexBase + 2]);
		const float3 Position = float3(AccumulatedPosition) * PositionScale.xyz;

		MorphVertexBuffer[VertexBase + 0] = asuint(Position.x);
		MorphVertexBuffer[VertexBase + 1] = asuint(Position.y);
		MorphVertexBuffer[VertexBase + 2] = asuint(Position.z);

		const int3 AccumulatedTangentZ = int3(MorphVertexBuffer[VertexBase + 3], MorphVertexBuffer[VertexBase + 4], MorphVertexBuffer[VertexBase + 5]);
		const float3 TangentZ = float3(AccumulatedTangentZ) * PositionScale.w;

		MorphVertexBuffer[VertexBase + 3] = asuint(TangentZ.x);
		MorphVertexBuffer[VertexBase + 4] = asuint(TangentZ.y);
		MorphVertexBuffer[VertexBase + 5] = asuint(TangentZ.z);
	}
}