// Copyright Epic Games, Inc. All Rights Reserved. #pragma once #include "MuR/ImagePrivate.h" #include "MuR/Platform.h" #include "MuR/ParallelExecutionUtils.h" #include "Math/VectorRegister.h" namespace mu { template inline void ImageSaturate(FImage* AImage, float Factor) { MUTABLE_CPUPROFILER_SCOPE(ImageSaturateInplace); if (!AImage) { return; } // Clamp the factor // TODO: See what happens if we don't Factor = FMath::Max(0.0f, Factor); uint16 FactorUNorm = (uint16)(Factor*255.0f); // Generic implementation const int32 BytesPerElem = GetImageFormatData(AImage->GetFormat()).BytesPerBlock; constexpr int32 NumBatchElems = 1 << 14; const int32 NumBatches = AImage->DataStorage.GetNumBatches(NumBatchElems, BytesPerElem); ParallelExecutionUtils::InvokeBatchParallelFor(NumBatches, [ AImage, NumBatchElems, BytesPerElem, FactorUNorm ](int32 BatchId) { TArrayView AView = AImage->DataStorage.GetBatch(BatchId, NumBatchElems, BytesPerElem); uint8* ABuf = AView.GetData(); const int32 NumElems = AView.Num() / BytesPerElem; switch (AImage->GetFormat()) { case EImageFormat::RGB_UByte: { if constexpr (!bUseVectorImpl) { for (int32 I = 0; I < NumElems; ++I) { uint16 PixelData[4] = {ABuf[3*I + 0], ABuf[3*I + 1], ABuf[3*I + 2], 0}; const uint16 LumTimesFactor = ((76 * PixelData[0] + 150 * PixelData[1] + 29 * PixelData[2]) / 255) * (255 - FactorUNorm); ABuf[3*I + 0] = static_cast((PixelData[0] * FactorUNorm + LumTimesFactor) / 255); ABuf[3*I + 1] = static_cast((PixelData[1] * FactorUNorm + LumTimesFactor) / 255); ABuf[3*I + 2] = static_cast((PixelData[2] * FactorUNorm + LumTimesFactor) / 255); } } else // if constexpr bUseVectorImpl { constexpr VectorRegister4Int StaurationFactorRGB = MakeVectorRegisterIntConstant(76, 150, 29, 0); constexpr VectorRegister4Int Value255 = MakeVectorRegisterIntConstant(255, 255, 255, 255); const VectorRegister4Int F = VectorIntSet1(FactorUNorm); const VectorRegister4Int OneMinusF = VectorIntSubtract(Value255, F); for (int32 I = 0; I < NumElems; ++I) { const VectorRegister4Int Value = MakeVectorRegisterInt( static_cast(ABuf[3*I + 0]), static_cast(ABuf[3*I + 1]), static_cast(ABuf[3*I + 2]), 0); const VectorRegister4Int ScaledValue = VectorIntMultiply(Value, StaurationFactorRGB); alignas(VectorRegister4Int) int32 IndexableScaledValue[4]; VectorIntStoreAligned(ScaledValue, IndexableScaledValue); // Add scaled values and divide by 255 const VectorRegister4Int L = VectorShiftRightImmLogical( VectorIntMultiply( VectorIntSet1(IndexableScaledValue[0] + IndexableScaledValue[1] + IndexableScaledValue[2]), VectorIntSet1(32897)), 23); const VectorRegister4Int Lerp = VectorIntAdd(VectorIntMultiply(L, OneMinusF), VectorIntMultiply(Value, F)); alignas(VectorRegister4Int) int32 IndexableResult[4]; // divide by 255 and store. do we need to clamp? VectorIntStoreAligned(VectorShiftRightImmLogical(VectorIntMultiply(Lerp, VectorIntSet1(32897)), 23), IndexableResult); ABuf[3*I + 0] = static_cast(IndexableResult[0]); ABuf[3*I + 1] = static_cast(IndexableResult[1]); ABuf[3*I + 2] = static_cast(IndexableResult[2]); } } break; } case EImageFormat::RGBA_UByte: { if constexpr (!bUseVectorImpl) { for (int32 I = 0; I < NumElems; ++I) { uint16 PixelData[4] = {ABuf[4*I + 0], ABuf[4*I + 1], ABuf[4*I + 2], ABuf[4*I + 3]}; const uint16 LumTimesFactor = ((76 * PixelData[0] + 150 * PixelData[1] + 29 * PixelData[2]) / 255) * (255 - FactorUNorm); ABuf[4*I + 0] = static_cast((PixelData[0] * FactorUNorm + LumTimesFactor) / 255); ABuf[4*I + 1] = static_cast((PixelData[1] * FactorUNorm + LumTimesFactor) / 255); ABuf[4*I + 2] = static_cast((PixelData[2] * FactorUNorm + LumTimesFactor) / 255); } } else // if constexpr bUseVectorImpl { constexpr VectorRegister4Int StaurationFactorRGB = MakeVectorRegisterIntConstant(76, 150, 29, 0); constexpr VectorRegister4Int Value255 = MakeVectorRegisterIntConstant(255, 255, 255, 255); const VectorRegister4Int F = VectorIntSet1(FactorUNorm); const VectorRegister4Int OneMinusF = VectorIntSubtract(Value255, F); for (int32 I = 0; I < NumElems; ++I) { const VectorRegister4Int Value = MakeVectorRegisterInt( static_cast(ABuf[4*I + 0]), static_cast(ABuf[4*I + 1]), static_cast(ABuf[4*I + 2]), 0); const VectorRegister4Int ScaledValue = VectorIntMultiply(Value, StaurationFactorRGB); alignas(VectorRegister4Int) int32 IndexableScaledValue[4]; VectorIntStoreAligned(ScaledValue, IndexableScaledValue); // Add scaled values and divide by 255 const VectorRegister4Int L = VectorShiftRightImmLogical( VectorIntMultiply( VectorIntSet1(IndexableScaledValue[0] + IndexableScaledValue[1] + IndexableScaledValue[2]), VectorIntSet1(32897)), 23); const VectorRegister4Int Lerp = VectorIntAdd(VectorIntMultiply(L, OneMinusF), VectorIntMultiply(Value, F)); alignas(VectorRegister4Int) int32 IndexableResult[4]; // divide by 255 and store. do we need to clamp? VectorIntStoreAligned(VectorShiftRightImmLogical(VectorIntMultiply(Lerp, VectorIntSet1(32897)), 23), IndexableResult); ABuf[4*I + 0] = static_cast(IndexableResult[0]); ABuf[4*I + 1] = static_cast(IndexableResult[1]); ABuf[4*I + 2] = static_cast(IndexableResult[2]); } } break; } case EImageFormat::BGRA_UByte: { if constexpr (!bUseVectorImpl) { for (int32 I = 0; I < NumElems; ++I) { uint16 PixelData[4] = {ABuf[4*I + 0], ABuf[4*I + 1], ABuf[4*I + 2], ABuf[4*I + 3]}; const uint16 LumTimesFactor = ((29 * PixelData[0] + 150 * PixelData[1] + 76 * PixelData[2]) / 255) * (255 - FactorUNorm); ABuf[4*I + 0] = static_cast((PixelData[0] * FactorUNorm + LumTimesFactor) / 255); ABuf[4*I + 1] = static_cast((PixelData[1] * FactorUNorm + LumTimesFactor) / 255); ABuf[4*I + 2] = static_cast((PixelData[2] * FactorUNorm + LumTimesFactor) / 255); } } else // if constexpr bUseVectorImpl { constexpr VectorRegister4Int StaurationFactorRGB = MakeVectorRegisterIntConstant(29, 150, 76, 0); constexpr VectorRegister4Int Value255 = MakeVectorRegisterIntConstant(255, 255, 255, 255); const VectorRegister4Int F = VectorIntSet1(FactorUNorm); const VectorRegister4Int OneMinusF = VectorIntSubtract(Value255, F); for (int32 I = 0; I < NumElems; ++I) { const VectorRegister4Int Value = MakeVectorRegisterInt( static_cast(ABuf[4*I + 0]), static_cast(ABuf[4*I + 1]), static_cast(ABuf[4*I + 2]), 0); const VectorRegister4Int ScaledValue = VectorIntMultiply(Value, StaurationFactorRGB); alignas(VectorRegister4Int) int32 IndexableScaledValue[4]; VectorIntStoreAligned(ScaledValue, IndexableScaledValue); // Add scaled values and divide by 255 const VectorRegister4Int L = VectorShiftRightImmLogical( VectorIntMultiply( VectorIntSet1(IndexableScaledValue[0] + IndexableScaledValue[1] + IndexableScaledValue[2]), VectorIntSet1(32897)), 23); const VectorRegister4Int Lerp = VectorIntAdd(VectorIntMultiply(L, OneMinusF), VectorIntMultiply(Value, F)); alignas(VectorRegister4Int) int32 IndexableResult[4]; // divide by 255 and store. do we need to clamp? VectorIntStoreAligned(VectorShiftRightImmLogical(VectorIntMultiply(Lerp, VectorIntSet1(32897)), 23), IndexableResult); ABuf[4*I + 0] = static_cast(IndexableResult[0]); ABuf[4*I + 1] = static_cast(IndexableResult[1]); ABuf[4*I + 2] = static_cast(IndexableResult[2]); } } break; } default: check(false); } }); } }