// Copyright Epic Games, Inc. All Rights Reserved.

#include "Speech2FaceInternal.h"

#include "MetaHumanAuthoringObjects.h"
#include "UObject/Package.h"
#include "DataDefs.h"
#include "Sound/SoundWave.h"
#include "Misc/AssertionMacros.h"
#include "HAL/UnrealMemory.h"
#include "Math/UnrealMathUtility.h"
#include "Modules/ModuleManager.h"
#include "NNE.h"
#include "NNEModelData.h"
#include "NNERuntimeCPU.h"
#include "SampleBuffer.h"

DEFINE_LOG_CATEGORY(LogSpeech2FaceSolver)

#if WITH_EDITOR

FSpeech2FaceInternal::FSpeech2FaceInternal() = default;

TUniquePtr<FSpeech2FaceInternal> FSpeech2FaceInternal::Create(const FAudioDrivenAnimationModels& InModels)
{
	TUniquePtr<FSpeech2FaceInternal> Result = TUniquePtr<FSpeech2FaceInternal>(new FSpeech2FaceInternal());

	if (!Result->Init(InModels))
	{
		return nullptr;
	}

	return Result;
}

bool FSpeech2FaceInternal::Init(const FAudioDrivenAnimationModels& InModels)
{
	check(IsInGameThread());

	AudioExtractor = TryLoadModelData(InModels.AudioEncoder);
	RigLogicPredictor = TryLoadModelData(InModels.AnimationDecoder);

	return AudioExtractor && RigLogicPredictor;
}

TSharedPtr<UE::NNE::IModelInstanceCPU> FSpeech2FaceInternal::TryLoadModelData(const FSoftObjectPath& InModelAssetPath)
{
	const FSoftObjectPtr ModelAsset(InModelAssetPath);
	UNNEModelData* ModelData = Cast<UNNEModelData>(ModelAsset.LoadSynchronous());
	if (!IsValid(ModelData))
	{
		check(false);
		UE_LOG(LogSpeech2FaceSolver, Error, TEXT("Failed to load model, it is invalid (nullptr): %s"), *InModelAssetPath.ToString());
		return nullptr;
	}

	if (!FModuleManager::Get().LoadModule(TEXT("NNERuntimeORT")))
	{
		UE_LOG(LogSpeech2FaceSolver, Error, TEXT("Failed to load model, could not load NNE Runtime module (NNERuntimeORT): %s"), *ModelData->GetPathName());
		return nullptr;
	}

	const TWeakInterfacePtr<INNERuntimeCPU> NNERuntimeCPU = UE::NNE::GetRuntime<INNERuntimeCPU>(TEXT("NNERuntimeORTCpu"));
	if (!NNERuntimeCPU.IsValid())
	{
		UE_LOG(LogSpeech2FaceSolver, Error, TEXT("Failed to load model, could not load NNE Runtime: %s"), *ModelData->GetPathName());
		return nullptr;
	}

	TSharedPtr<UE::NNE::IModelCPU> ModelCpu = NNERuntimeCPU->CreateModelCPU(ModelData);
	if (!ModelCpu.IsValid())
	{
		UE_LOG(LogSpeech2FaceSolver, Error, TEXT("Failed to load model, could not create model CPU: %s"), *ModelData->GetPathName());
		return nullptr;
	}

	TSharedPtr<UE::NNE::IModelInstanceCPU> ModelInstance = ModelCpu->CreateModelInstanceCPU();
	if (ModelInstance.IsValid())
	{
		UE_LOG(LogSpeech2FaceSolver, Display, TEXT("Loaded model: %s"), *ModelData->GetPathName());
	}
	else
	{
		UE_LOG(LogSpeech2FaceSolver, Error, TEXT("Failed to load model, could not create model instance: %s"), *ModelData->GetPathName());
	}

	return ModelInstance;
}
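// Usage sketch (illustrative only, not part of this translation unit): creating a solver from a pair
// of NNE model assets. The asset paths below are hypothetical; real ones come from the
// FAudioDrivenAnimationModels configuration.
//
//     FAudioDrivenAnimationModels Models;
//     Models.AudioEncoder = FSoftObjectPath(TEXT("/Game/MetaHuman/AudioEncoder.AudioEncoder"));
//     Models.AnimationDecoder = FSoftObjectPath(TEXT("/Game/MetaHuman/AnimationDecoder.AnimationDecoder"));
//
//     // Must be called on the game thread; Create() returns nullptr if either model fails to load.
//     TUniquePtr<FSpeech2FaceInternal> Solver = FSpeech2FaceInternal::Create(Models);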
bool FSpeech2FaceInternal::GenerateFaceAnimation(const FSpeech2Face::FAudioParams& InAudioParams, float InOutputAnimationFps, bool bInGenerateBlinks,
	TFunction<bool()> InShouldCancelCallback, TArray<FSpeech2Face::FAnimationFrame>& OutAnimation, TArray<FSpeech2Face::FAnimationFrame>& OutHeadAnimation) const
{
	using namespace UE::MetaHuman;

	check(InAudioParams.SpeechRecording.IsValid());
	check(InAudioParams.AudioStartOffsetSec >= 0);
	// If the user has not opted to downmix the audio, the audio channel index must be valid.
	check(InAudioParams.bDownmixChannels || (InAudioParams.AudioChannelIndex < InAudioParams.SpeechRecording->NumChannels && InAudioParams.AudioChannelIndex >= 0));
	check(InOutputAnimationFps > 0);

	TArray<uint8> PcmData;
	uint16 ChannelNum;
	uint32 SampleRate;
	if (!InAudioParams.SpeechRecording->GetImportedSoundWaveData(PcmData, SampleRate, ChannelNum))
	{
		UE_LOG(LogSpeech2FaceSolver, Error, TEXT("Could not get imported PCM data for SoundWave %s"), *InAudioParams.SpeechRecording->GetName());
		return false;
	}

	if (InShouldCancelCallback())
	{
		return false;
	}

	// Prepare audio
	UE_LOG(LogSpeech2FaceSolver, Log, TEXT("Preparing samples for solve"));
	FloatSamples Samples;
	if (!GetFloatSamples(InAudioParams.SpeechRecording, PcmData, SampleRate, InAudioParams.bDownmixChannels, InAudioParams.AudioChannelIndex, InAudioParams.AudioStartOffsetSec, Samples))
	{
		return false;
	}

	if (InShouldCancelCallback())
	{
		return false;
	}

	UE_LOG(LogSpeech2FaceSolver, Log, TEXT("Extracting audio features"));
	TArray<float> ExtractedAudioData;
	if (!ExtractAudioFeatures(Samples, AudioExtractor, ExtractedAudioData))
	{
		return false;
	}

	if (InShouldCancelCallback())
	{
		return false;
	}

	TArray<float> RigLogicValues;
	TArray<float> RigLogicBlinkValues;
	TArray<float> RigLogicHeadValues;
	UE_LOG(LogSpeech2FaceSolver, Log, TEXT("Running predictor"));
	if (!RunPredictor(RigControlNames.Num(), BlinkRigControlNames.Num(), Samples.Num(), ExtractedAudioData, RigLogicValues, RigLogicBlinkValues, RigLogicHeadValues))
	{
		return false;
	}

	if (InShouldCancelCallback())
	{
		return false;
	}

	TArray<FString> HeadControlNamesGui;
	HeadControlsGuiToRawLookupTable.GetKeys(HeadControlNamesGui);

	// Copy rig logic values into structured frame-by-frame rig control values
	if (InOutputAnimationFps == RigLogicPredictorOutputFps)
	{
		UE_LOG(LogSpeech2FaceSolver, Log, TEXT("Copying samples"));
		const int32 NumFrames = RigLogicValues.Num() / RigControlNames.Num();
		OutAnimation.Empty(NumFrames);
		OutHeadAnimation.Empty(NumFrames);

		for (int32 FrameIndex = 0; FrameIndex < NumFrames; FrameIndex++)
		{
			// Face animation
			OutAnimation.AddDefaulted();
			OutAnimation.Last().Reserve(RigControlNames.Num());
			for (int32 ControlIndex = 0; ControlIndex < RigControlNames.Num(); ControlIndex++)
			{
				OutAnimation.Last().Add(RigControlNames[ControlIndex], RigLogicValues[FrameIndex * RigControlNames.Num() + ControlIndex]);
			}

			if (bInGenerateBlinks)
			{
				for (int32 BlinkControlIndex = 0; BlinkControlIndex < BlinkRigControlNames.Num(); BlinkControlIndex++)
				{
					OutAnimation.Last()[BlinkRigControlNames[BlinkControlIndex]] = RigLogicBlinkValues[FrameIndex * BlinkRigControlNames.Num() + BlinkControlIndex];
				}
			}

			// Head animation
			FSpeech2Face::FAnimationFrame& HeadAnimationFrame = OutHeadAnimation.AddDefaulted_GetRef();
			for (const FString& HeadControlNameGui : HeadControlNamesGui)
			{
				const int32 ModelHeadControlIndex = ModelHeadControls.IndexOfByKey(HeadControlNameGui);
				if (ModelHeadControlIndex != INDEX_NONE)
				{
					const int32 HeadValueIndex = FrameIndex * ModelHeadControls.Num() + ModelHeadControlIndex;
					check(RigLogicHeadValues.IsValidIndex(HeadValueIndex));
					const float ModelHeadControlValue = RigLogicHeadValues[HeadValueIndex];
					HeadAnimationFrame.Emplace(HeadControlNameGui, ModelHeadControlValue);
				}
				else
				{
					// Not provided by the model, so default it to zero
					HeadAnimationFrame.Emplace(HeadControlNameGui, 0.0f);
				}
			}
		}
	}
	else
	{
		UE_LOG(LogSpeech2FaceSolver, Log, TEXT("Resampling"));

		// Resample output animation
		OutAnimation = ResampleAnimation(RigLogicValues, RigControlNames, RigControlNames.Num(), InOutputAnimationFps);

		if (bInGenerateBlinks)
		{
			TArray<FSpeech2Face::FAnimationFrame> BlinkAnimation = ResampleAnimation(RigLogicBlinkValues, BlinkRigControlNames, BlinkRigControlNames.Num(), InOutputAnimationFps);
			for (int32 FrameIndex = 0; FrameIndex < BlinkAnimation.Num(); FrameIndex++)
			{
				for (const FString& BlinkControlName : BlinkRigControlNames)
				{
					OutAnimation[FrameIndex][BlinkControlName] += BlinkAnimation[FrameIndex][BlinkControlName];
				}
			}
		}

		// Head animation
		TArray<FSpeech2Face::FAnimationFrame> ResampledHeadAnimation = ResampleAnimation(RigLogicHeadValues, ModelHeadControls, ModelHeadControls.Num(), InOutputAnimationFps);
		for (const FSpeech2Face::FAnimationFrame& ResampledHeadAnimationFrame : ResampledHeadAnimation)
		{
			FSpeech2Face::FAnimationFrame& HeadAnimationFrame = OutHeadAnimation.AddDefaulted_GetRef();
			for (const FString& HeadControlNameGui : HeadControlNamesGui)
			{
				const float* HeadControlValue = ResampledHeadAnimationFrame.Find(HeadControlNameGui);
				if (HeadControlValue)
				{
					HeadAnimationFrame.Emplace(HeadControlNameGui, *HeadControlValue);
				}
				else
				{
					// Not provided by the model, so default it to zero
					HeadAnimationFrame.Emplace(HeadControlNameGui, 0.0f);
				}
			}
		}
	}

	// We should always end up with the same number of frames for the face and head animation
	check(OutHeadAnimation.Num() == OutAnimation.Num());

	UE_LOG(LogSpeech2FaceSolver, Log, TEXT("Sound Wave Processing Complete"));
	return true;
}
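// Worked example of the frame math above (assuming, purely for illustration, a predictor rate of
// RigLogicPredictorOutputFps = 50): 5 seconds of speech yields 250 raw predictor frames. If the
// caller requests InOutputAnimationFps = 50, the fast copy path runs; if they request 30 fps,
// ResampleAnimation produces FloorToInt32(5.0 * 30) = 150 linearly interpolated frames, with the
// resampled blink values then added on top of the matching face controls frame by frame.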
void FSpeech2FaceInternal::SetMood(const EAudioDrivenAnimationMood& InMood)
{
	DesiredMood = InMood;
}

void FSpeech2FaceInternal::SetMoodIntensity(const float InMoodIntensity)
{
	check(InMoodIntensity >= 0.0f);
	DesiredMoodIntensity = InMoodIntensity;
}

bool FSpeech2FaceInternal::ExtractAudioFeatures(const FloatSamples& Samples, const TSharedPtr<UE::NNE::IModelInstanceCPU>& AudioExtractor, TArray<float>& OutAudioData)
{
	using namespace UE::NNE;

	OutAudioData.Empty((Samples.Num() / SamplesPerFrame) * 512);

	// Restrict extraction of audio features to 30-second chunks, as the model does not support more
	for (int32 SampleIndex = 0; SampleIndex < Samples.Num(); SampleIndex += RigLogicPredictorMaxAudioSamples)
	{
		const uint32 SamplesCount = FMath::Clamp(Samples.Num() - SampleIndex, 0, RigLogicPredictorMaxAudioSamples);
		TArray<uint32, TInlineAllocator<2>> ExtractorInputShapesData = { 1, SamplesCount };
		TArray<FTensorShape, TInlineAllocator<1>> ExtractorInputShapes = { FTensorShape::Make(ExtractorInputShapesData) };
		if (AudioExtractor->SetInputTensorShapes(ExtractorInputShapes) != IModelInstanceCPU::ESetInputTensorShapesStatus::Ok)
		{
			UE_LOG(LogSpeech2FaceSolver, Error, TEXT("Could not set the audio extractor input tensor shapes"));
			return false;
		}

		// Todo: the last frame of the last chunk will not be complete (if SamplesCount is not a multiple of SamplesPerFrame). Should we ceil/pad/zero-fill?
		const uint32 NumFrames = static_cast<uint32>(SamplesCount / SamplesPerFrame);
		TArray<uint32, TInlineAllocator<3>> ExtractorOutputShapeData = { 1, NumFrames, 512 };
		FTensorShape ExtractorOutputShape = FTensorShape::Make(ExtractorOutputShapeData);
		TArray<float> ExtractorOutputData;
		ExtractorOutputData.SetNumUninitialized(ExtractorOutputShape.Volume());

		TArray<FTensorBindingCPU, TInlineAllocator<1>> ExtractorInputBindings = { {(void*)(Samples.GetData() + SampleIndex), SamplesCount * sizeof(float)} };
		TArray<FTensorBindingCPU, TInlineAllocator<1>> ExtractorOutputBindings = { {(void*)ExtractorOutputData.GetData(), ExtractorOutputData.Num() * sizeof(float)} };
		if (AudioExtractor->RunSync(ExtractorInputBindings, ExtractorOutputBindings) != IModelInstanceCPU::ERunSyncStatus::Ok)
		{
			UE_LOG(LogSpeech2FaceSolver, Error, TEXT("The audio extractor NNE model failed to execute"));
			return false;
		}

		OutAudioData.Append(ExtractorOutputData.GetData(), ExtractorOutputData.Num());
	}

	return true;
}
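// Chunking example (illustrative; the actual constants are defined elsewhere in this class): if
// AudioEncoderSampleRateHz were 16000 and RigLogicPredictorMaxAudioSamples covered 30 seconds,
// i.e. 480000 samples, then 70 seconds of audio (1120000 samples) would be processed as chunks of
// 480000, 480000 and 160000 samples, each chunk appending SamplesCount / SamplesPerFrame frames
// of 512 features to OutAudioData.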
bool FSpeech2FaceInternal::RunPredictor(
	const uint32 InFaceControlNum,
	const uint32 InBlinkControlNum,
	const uint32 InSamplesNum,
	const TArray<float>& InAudioData,
	TArray<float>& OutRigLogicValues,
	TArray<float>& OutRigLogicBlinkValues,
	TArray<float>& OutRigLogicHeadValues
) const
{
	using namespace UE::NNE;

	const uint32 NumFrames = static_cast<uint32>(InSamplesNum / SamplesPerFrame);
	TArray<uint32, TInlineAllocator<3>> AudioShapeData = { 1, NumFrames, 512 };

	const int32 MoodIndex = GetModelMoodIndex();
	const TArray<int32, TInlineAllocator<1>> MoodIndexArray = { MoodIndex };
	TArray<uint32, TInlineAllocator<1>> MoodIndexShapeData = { 1 };

	const TArray<float, TInlineAllocator<1>> MoodIntensityArray = { DesiredMoodIntensity };
	TArray<uint32, TInlineAllocator<1>> MoodIntensityShapeData = { 1 };

	TArray<FTensorShape, TInlineAllocator<3>> InputTensorShapes = {
		FTensorShape::Make(AudioShapeData),
		FTensorShape::Make(MoodIndexShapeData),
		FTensorShape::Make(MoodIntensityShapeData)
	};

	check(RigLogicPredictor);
	if (RigLogicPredictor->SetInputTensorShapes(InputTensorShapes) != IModelInstanceCPU::ESetInputTensorShapesStatus::Ok)
	{
		return false;
	}

	// Bind the inputs.
	// Tensor binding requires a non-const void*; we trust the runtime not to mutate the input data.
	void* AudioDataPtr = const_cast<void*>(static_cast<const void*>(InAudioData.GetData()));
	void* MoodIndexDataPtr = const_cast<void*>(static_cast<const void*>(MoodIndexArray.GetData()));
	void* MoodIntensityDataPtr = const_cast<void*>(static_cast<const void*>(MoodIntensityArray.GetData()));

	TArray<FTensorBindingCPU, TInlineAllocator<3>> InputBindings = {
		{AudioDataPtr, InAudioData.Num() * sizeof(float)},
		{MoodIndexDataPtr, MoodIndexArray.Num() * sizeof(int32)},
		{MoodIntensityDataPtr, MoodIntensityArray.Num() * sizeof(float)}
	};

	// Bind the outputs
	TArray<float> FaceParameters;
	TArray<uint32, TInlineAllocator<3>> FaceParametersShapeData = { 1, NumFrames, InFaceControlNum };
	TArray<FTensorShape, TInlineAllocator<1>> FaceParametersShape = { FTensorShape::Make(FaceParametersShapeData) };
	FaceParameters.SetNumUninitialized(FaceParametersShape[0].Volume());

	TArray<float> BlinkParameters;
	TArray<uint32, TInlineAllocator<3>> BlinkParametersShapeData = { 1, NumFrames, InBlinkControlNum };
	TArray<FTensorShape, TInlineAllocator<1>> BlinkParametersShape = { FTensorShape::Make(BlinkParametersShapeData) };
	BlinkParameters.SetNumUninitialized(BlinkParametersShape[0].Volume());

	const uint32 NumOutputHeadControls = static_cast<uint32>(ModelHeadControls.Num());
	TArray<float> HeadParameters;
	TArray<uint32, TInlineAllocator<3>> HeadParametersShapeData = { 1, NumFrames, NumOutputHeadControls };
	TArray<FTensorShape, TInlineAllocator<1>> HeadParametersShape = { FTensorShape::Make(HeadParametersShapeData) };
	HeadParameters.SetNumUninitialized(HeadParametersShape[0].Volume());

	void* FaceParametersPtr = static_cast<void*>(FaceParameters.GetData());
	void* BlinkParametersPtr = static_cast<void*>(BlinkParameters.GetData());
	void* HeadParametersPtr = static_cast<void*>(HeadParameters.GetData());
	TArray<FTensorBindingCPU, TInlineAllocator<3>> OutputBindings = {
		{FaceParametersPtr, FaceParameters.Num() * sizeof(float)},
		{BlinkParametersPtr, BlinkParameters.Num() * sizeof(float)},
		{HeadParametersPtr, HeadParameters.Num() * sizeof(float)}
	};

	if (RigLogicPredictor->RunSync(InputBindings, OutputBindings) != IModelInstanceCPU::ERunSyncStatus::Ok)
	{
		UE_LOG(LogSpeech2FaceSolver, Error, TEXT("The rig logic model failed to execute"));
		return false;
	}

	OutRigLogicValues = MoveTemp(FaceParameters);
	OutRigLogicBlinkValues = MoveTemp(BlinkParameters);
	OutRigLogicHeadValues = MoveTemp(HeadParameters);
	return true;
}
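// Output layout note (follows directly from the shapes bound above): each output tensor is
// frame-major, so a control value is addressed as Values[FrameIndex * NumControls + ControlIndex].
// For example, with NumFrames = 250 and a hypothetical InFaceControlNum = 100, FaceParameters
// holds 25000 floats, and control 3 on frame 10 sits at index 10 * 100 + 3 = 1003.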
int32 FSpeech2FaceInternal::GetModelMoodIndex() const
{
	if (DesiredMood == EAudioDrivenAnimationMood::AutoDetect)
	{
		// Special case for AutoDetect. The blueprintable UENUM requires it to be backed by a uint8, so we
		// encode the AutoDetect value as 255 in the UENUM and replace that value here with -1 (which is what the model expects).
		return -1;
	}

	return static_cast<int32>(DesiredMood);
}
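// Channel layout note for GetFloatSamples below: imported PCM is 16-bit signed and interleaved,
// so for stereo the byte stream is [L0][R0][L1][R1]... with 2 bytes per sample. Reading channel 1
// (the right channel) of frame N therefore fetches the int16 at byte offset
// (N * NumChannels + 1) * sizeof(int16), and each value is scaled by 1/32768 into [-1.0, 1.0).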
bool FSpeech2FaceInternal::GetFloatSamples(const TWeakObjectPtr<USoundWave>& SoundWave, const TArray<uint8>& PcmData, uint32 SampleRate, bool bDownmixChannels, uint32 ChannelToUse, float SecondsToSkip, FloatSamples& OutSamples)
{
	int16 Sample;
	const uint32 TotalSampleCount = PcmData.Num() / sizeof(Sample);
	const uint32 TotalSamplesToSkip = SecondsToSkip * SampleRate * SoundWave->NumChannels;

	if (TotalSamplesToSkip >= TotalSampleCount)
	{
		UE_LOG(LogSpeech2FaceSolver, Error, TEXT("Could not get float samples with %d skipped samples from %d samples for SoundWave %s"), TotalSamplesToSkip, TotalSampleCount, *SoundWave->GetName());
		return false;
	}

	// Audio data is stored as 16-bit signed samples with channels interleaved, so that must be taken into account
	const uint8* PcmDataPtr = PcmData.GetData() + TotalSamplesToSkip * sizeof(Sample);
	const uint32 SamplesToSkipPerChannel = SecondsToSkip * SampleRate;
	const uint32 SampleCountPerChannel = PcmData.Num() / (sizeof(Sample) * SoundWave->NumChannels) - SamplesToSkipPerChannel;
	OutSamples.SetNumUninitialized(SampleCountPerChannel);

	if (bDownmixChannels && SoundWave->NumChannels > 1)
	{
		const int32 SampleCount = TotalSampleCount - TotalSamplesToSkip;
		Audio::FAlignedFloatBuffer Buffer;
		Buffer.SetNumUninitialized(SampleCount);
		Audio::ArrayPcm16ToFloat(MakeArrayView((int16*)PcmDataPtr, SampleCount), Buffer);

		Audio::TSampleBuffer<float> FloatSampleBuffer(Buffer, SoundWave->NumChannels, SampleRate);
		FloatSampleBuffer.MixBufferToChannels(1);

		Audio::FAlignedFloatBuffer MonoBuffer;
		MonoBuffer.SetNumUninitialized(FloatSampleBuffer.GetNumSamples());
		MonoBuffer = FloatSampleBuffer.GetArrayView();

		// Normalize only if the downmix pushed any sample out of range
		const float MaxValue = Audio::ArrayMaxAbsValue(MonoBuffer);
		if (MaxValue > 1.f)
		{
			Audio::ArrayMultiplyByConstantInPlace(MonoBuffer, 1.f / MaxValue);
		}

		OutSamples = MonoBuffer;
	}
	else
	{
		for (uint32 SampleIndex = 0; SampleIndex < SampleCountPerChannel; SampleIndex++)
		{
			// Position ourselves at the sample of the appropriate channel, taking into account the channel layout
			const uint8* SampleData = PcmDataPtr + ChannelToUse * sizeof(Sample);
			FMemory::Memcpy(&Sample, SampleData, sizeof(Sample));

			// Convert to range [-1.0, 1.0)
			OutSamples[SampleIndex] = Sample / 32768.0f;
			PcmDataPtr += sizeof(Sample) * SoundWave->NumChannels;
		}
	}

	if (SampleRate != AudioEncoderSampleRateHz)
	{
		FloatSamples ResampledAudio;
		if (!ResampleAudio(MoveTemp(OutSamples), SampleRate, AudioEncoderSampleRateHz, ResampledAudio))
		{
			UE_LOG(LogSpeech2FaceSolver, Error, TEXT("Could not resample audio from %d to %d for SoundWave %s"), SampleRate, AudioEncoderSampleRateHz, *SoundWave->GetName());
			return false;
		}

		OutSamples = MoveTemp(ResampledAudio);
	}

	return true;
}

bool FSpeech2FaceInternal::ResampleAudio(FloatSamples InSamples, int32 InSampleRate, int32 InResampleRate, FloatSamples& OutResampledSamples)
{
	const Audio::FResamplingParameters Params = {
		Audio::EResamplingMethod::Linear,
		1, // NumChannels
		static_cast<float>(InSampleRate),
		static_cast<float>(InResampleRate),
		InSamples
	};

	const int32 ExpectedSampleCount = GetOutputBufferSize(Params);
	OutResampledSamples.SetNumUninitialized(ExpectedSampleCount);

	Audio::FResamplerResults Result;
	Result.OutBuffer = &OutResampledSamples;

	const bool bIsSuccess = Audio::Resample(Params, Result);
	if (!bIsSuccess)
	{
		return false;
	}

	if (Result.OutputFramesGenerated != ExpectedSampleCount)
	{
		OutResampledSamples.SetNum(Result.OutputFramesGenerated, EAllowShrinking::No);
	}

	return true;
}
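// Interpolation example for ResampleAnimation below (a predictor rate of RigLogicPredictorOutputFps = 50
// is assumed purely for illustration): resampling to 30 fps, output frame 1 sits at t = 1/30 s, which
// maps to raw frame index (1/30) * 50 = 1.667. Each control is then Lerp(Raw[1], Raw[2], 0.667); output
// frames that land exactly on a raw frame copy it through unchanged.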
TArray<FSpeech2Face::FAnimationFrame> FSpeech2FaceInternal::ResampleAnimation(TArrayView<const float> InRawAnimation, TArrayView<const FString> InRigControlNames, uint32 ControlNum, float InOutputFps)
{
	const uint32 RawFrameCount = InRawAnimation.Num() / ControlNum;
	const float AnimationLengthSec = RawFrameCount * RigLogicPredictorFrameDuration;
	const uint32 ResampledFrameCount = FMath::FloorToInt32(AnimationLengthSec * InOutputFps);

	// Resample using linear interpolation
	TArray<FSpeech2Face::FAnimationFrame> ResampledAnimation;
	ResampledAnimation.AddDefaulted(ResampledFrameCount);

	for (uint32 ResampledFrameIndex = 0; ResampledFrameIndex < ResampledFrameCount; ++ResampledFrameIndex)
	{
		// Get the corresponding raw frame time
		const float FrameStartSec = ResampledFrameIndex / InOutputFps;
		const float RawFrameIndex = FMath::Clamp<float>(FrameStartSec * RigLogicPredictorOutputFps, 0, RawFrameCount - 1);

		// Get the nearest full frames and the distance between the two
		const uint32 PrevRawFrameIndex = FMath::FloorToInt32(RawFrameIndex);
		const uint32 NextRawFrameIndex = FMath::CeilToInt32(RawFrameIndex);
		const float RawFramesDelta = RawFrameIndex - PrevRawFrameIndex;

		// Add interpolated control values for the given frames
		ResampledAnimation[ResampledFrameIndex].Reserve(ControlNum);
		for (uint32 ControlIndex = 0; ControlIndex < ControlNum; ++ControlIndex)
		{
			const float PrevRawControlValue = InRawAnimation[PrevRawFrameIndex * ControlNum + ControlIndex];
			const float NextRawControlValue = InRawAnimation[NextRawFrameIndex * ControlNum + ControlIndex];
			const float ResampledValue = FMath::Lerp(PrevRawControlValue, NextRawControlValue, RawFramesDelta);
			ResampledAnimation[ResampledFrameIndex].Add(InRigControlNames[ControlIndex], ResampledValue);
		}
	}

	return ResampledAnimation;
}

#endif //WITH_EDITOR