// Copyright Epic Games, Inc. All Rights Reserved. #pragma once #include "LearningArray.h" #include "LearningObservation.h" #include "LearningAction.h" #include "Containers/Set.h" #include "Templates/SharedPointer.h" #include "UObject/NameTypes.h" class FJsonObject; namespace UE::Learning { enum class ECompletionMode : uint8; struct FResetInstanceBuffer; /** * Buffer storing the observations, actions, and rewards of multiple instances over an episode */ struct LEARNINGTRAINING_API FEpisodeBuffer { friend struct FReplayBuffer; /** * Resize the experience buffer * * @param InMaxInstanceNum Maximum number of instances * @param InMaxStepNum Maximum number of steps in an episode */ void Resize(const int32 InMaxInstanceNum, const int32 InMaxStepNum); /** * Reset the buffer for the given set of instances */ void Reset(const FIndexSet Instances); // Observations int32 AddObservations(const FName& Name, const int32 SchemaId, const int32 Size); void PushObservations(const int32 ObservationId, const TLearningArrayView<2, const float> InObservations, const FIndexSet Instances); const TLearningArrayView<2, const float> GetObservations(const int32 ObservationId, const int32 InstanceIdx) const; // Actions int32 AddActions(const FName& Name, const int32 SchemaId, const int32 Size); void PushActions(const int32 ActionId, const TLearningArrayView<2, const float> InActions, const FIndexSet Instances); const TLearningArrayView<2, const float> GetActions(const int32 ActionId, const int32 InstanceIdx) const; // Action Modifiers int32 AddActionModifiers(const FName& Name, const int32 SchemaId, const int32 Size); void PushActionModifiers(const int32 ActionModifierId, const TLearningArrayView<2, const float> InActionModifiers, const FIndexSet Instances); const TLearningArrayView<2, const float> GetActionModifiers(const int32 ActiActionModifierIdnId, const int32 InstanceIdx) const; // Memory States int32 AddMemoryStates(const FName& Name, const int32 Size); void PushMemoryStates(const int32 MemoryStateId, const TLearningArrayView<2, const float> InMemoryStates, const FIndexSet Instances); const TLearningArrayView<2, const float> GetMemoryStates(const int32 MemoryStateId, const int32 InstanceIdx) const; // Rewards int32 AddRewards(const FName& Name, const int32 Size); // Convenience overload for pushing rewards from a Training Environment void PushRewards(const int32 RewardId, const TLearningArrayView<1, const float> InRewards, const FIndexSet Instances); void PushRewards(const int32 RewardId, const TLearningArrayView<2, const float> InRewards, const FIndexSet Instances); const TLearningArrayView<2, const float> GetRewards(const int32 RewardId, const int32 InstanceIdx) const; // Episode Step Nums void IncrementEpisodeStepNums(const FIndexSet Instances); const TLearningArrayView<1, const int32> GetEpisodeStepNums() const; int32 GetMaxInstanceNum() const; int32 GetMaxStepNum() const; private: bool bHasBeenSized = false; int32 MaxInstanceNum = 0; int32 MaxStepNum = 0; // Observations TArray> ObservationNames; TArray> ObservationSchemaIds; TArray> ObservationSizes; TArray, TInlineAllocator<1>> ObservationArrays; // Actions TArray> ActionNames; TArray> ActionSchemaIds; TArray> ActionSizes; TArray, TInlineAllocator<1>> ActionArrays; // Action Modifiers TArray> ActionModifierNames; TArray> ActionModifierSchemaIds; TArray> ActionModifierSizes; TArray, TInlineAllocator<1>> ActionModifierArrays; // Memory States TArray> MemoryStateNames; TArray> MemoryStateSizes; TArray, TInlineAllocator<1>> MemoryStateArrays; // Rewards TArray> RewardNames; TArray> RewardSizes; TArray, TInlineAllocator<1>> RewardArrays; // Episode Step Nums TLearningArray<1, int32> EpisodeStepNums; }; /** * Large buffer that sequentially concatenates a series of episodes in a large * flat array. Used to collate episodic data together from multiple instances. */ struct LEARNINGTRAINING_API FReplayBuffer { /** * Resizes the replay buffer. * * @param EpisodeBuffer Determines the dimensionality of the various arrays * @param MaxEpisodeNum Maximum number of episodes to be stored in the buffer * @param MaxStepNum Maximum number of steps to be stored in the buffer */ void Resize( const FEpisodeBuffer& EpisodeBuffer, const int32 InMaxEpisodeNum = 2048, const int32 InMaxStepNum = 16384); /** * Reset the replay buffer. Does not free memory - just resets episode and sample num to zero. */ void Reset(); /** * Add a set of episodes to the replay buffer * * @param InEpisodeCompletionModes Array of completion modes for each instance of shape (InstanceNum) * @param InEpisodeFinalObservations Array of final observations for each instance of shape (InstanceNum, ObservationVectorDimNum) * @param InEpisodeFinalMemoryStates Array of final memory states (post-evaluation) for each instance of shape (InstanceNum, MemoryStateVectorDimNum) * @param EpisodeBuffer Episode buffer to add experience from * @param Instances Instances to add * @param bAddTruncatedEpisodeWhenFull When enabled, this will add a truncated, partial episode to the buffer when full * * @returns True when the replay buffer is full */ bool AddEpisodes( const TLearningArrayView<1, const ECompletionMode> InEpisodeCompletionModes, const TArrayView> InEpisodeFinalObservations, const TArrayView> InEpisodeFinalMemoryStates, const FEpisodeBuffer& EpisodeBuffer, const FIndexSet Instances, const bool bAddTruncatedEpisodeWhenFull = true); /** * Alternate way to add data from records generated via imitation learning. Does the resizing needed. * * Putting this in place until we have more time later to rewrite how imitation learning stores data. At this * time, it seems logical to have the data records be implemented in terms of EpisodeBuffer(s), in which case * this method may no longer be needed. */ void AddRecords( const int32 InEpisodeNum, const int32 InMaxStepNum, const int32 ObservationSchemaId, const int32 ObservationNum, const int32 ActionSchemaId, const int32 ActionNum, const TLearningArrayView<1, const int32> RecordedEpisodeStarts, const TLearningArrayView<1, const int32> RecordedEpisodeLengths, const TLearningArrayView<2, const float> RecordedObservations, const TLearningArrayView<2, const float> RecordedActions); bool HasCompletions() const; bool HasFinalObservations() const; bool HasFinalMemoryStates() const; int32 GetMaxEpisodeNum() const; int32 GetMaxStepNum() const; int32 GetEpisodeNum() const; int32 GetStepNum() const; const TLearningArrayView<1, const int32> GetEpisodeStarts() const; const TLearningArrayView<1, const int32> GetEpisodeLengths() const; const TLearningArrayView<1, const ECompletionMode> GetEpisodeCompletionModes() const; int32 GetObservationsNum() const; const TLearningArrayView<2, const float> GetObservations(const int32 Index) const; const TLearningArrayView<2, const float> GetEpisodeFinalObservations(const int32 Index) const; int32 GetActionsNum() const; const TLearningArrayView<2, const float> GetActions(const int32 Index) const; int32 GetActionModifiersNum() const; const TLearningArrayView<2, const float> GetActionModifiers(const int32 Index) const; int32 GetMemoryStatesNum() const; const TLearningArrayView<2, const float> GetMemoryStates(const int32 Index) const; const TLearningArrayView<2, const float> GetEpisodeFinalMemoryStates(const int32 Index) const; int32 GetRewardsNum() const; const TLearningArrayView<2, const float> GetRewards(const int32 Index) const; TSharedRef AsJsonConfig(const int32 ReplayBufferId) const; private: bool bHasCompletions = false; bool bHasFinalObservations = false; bool bHasFinalMemoryStates = false; int32 MaxEpisodeNum = 0; int32 MaxStepNum = 0; int32 EpisodeNum = 0; int32 StepNum = 0; TLearningArray<1, int32> EpisodeStarts; TLearningArray<1, int32> EpisodeLengths; TLearningArray<1, ECompletionMode> EpisodeCompletionModes; TArray> ObservationNames; TArray> ObservationSchemaIds; TArray, TInlineAllocator<1>> Observations; TArray, TInlineAllocator<1>> EpisodeFinalObservations; TArray> ActionNames; TArray> ActionSchemaIds; TArray, TInlineAllocator<1>> Actions; TArray> ActionModifierNames; TArray> ActionModifierSchemaIds; TArray, TInlineAllocator<1>> ActionModifiers; TArray> MemoryStateNames; TArray, TInlineAllocator<1>> MemoryStates; TArray, TInlineAllocator<1>> EpisodeFinalMemoryStates; TArray> RewardNames; TArray, TInlineAllocator<1>> Rewards; }; namespace Experience { /** * Resets, and then runs experience gathering until the provided replay buffer is full * * @param ReplayBuffer Replay Buffer * @param EpisodeBuffer Episode Buffer * @param ResetBuffer Reset Buffer * @param ObservationVectorBuffers Buffers to read/write observation vectors into * @param ActionVectorBuffers Buffers to read/write action vectors into * @param PreEvaluationMemoryStateVectorBuffers Buffers to read/write pre-evaluation memory state vectors into * @param MemoryStateVectorBuffers Buffers to read/write (post-evaluation) memory state vectors into * @param RewardBuffers Buffers to read/write rewards into * @param CompletionBuffer Buffer to read/write completions into * @param EpisodeCompletionBuffer Additional buffer to record completions from full episode buffers * @param AllCompletionBuffer Additional buffer to record all completions from full episodes and normal completions * @param ResetFunction Function to run for resetting the environment * @param ObservationFunctions Functions to run for evaluating observations * @param PolicyFunctions Functions to run generating actions from observations * @param ActionFunctions Functions to run for evaluating actions * @param UpdateFunctions Functions to run for updating the environment * @param RewardFunctions Functions to run for evaluating rewards * @param CompletionFunction Function to run for evaluating completions * @param Instances Set of instances to gather experience for */ LEARNINGTRAINING_API void GatherExperienceUntilReplayBufferFull( FReplayBuffer& ReplayBuffer, FEpisodeBuffer& EpisodeBuffer, FResetInstanceBuffer& ResetBuffer, const TArrayView> ObservationVectorBuffers, const TArrayView> ActionVectorBuffers, const TArrayView> PreEvaluationMemoryStateVectorBuffers, const TArrayView> MemoryStateVectorBuffers, const TArrayView> RewardBuffers, TLearningArrayView<1, ECompletionMode> CompletionBuffer, TLearningArrayView<1, ECompletionMode> EpisodeCompletionBuffer, TLearningArrayView<1, ECompletionMode> AllCompletionBuffer, const TFunctionRef ResetFunction, const TArrayView> ObservationFunctions, const TArrayView> PolicyFunctions, const TArrayView> ActionFunctions, const TArrayView> UpdateFunctions, const TArrayView> RewardFunctions, const TFunctionRef CompletionFunction, const FIndexSet Instances); } };