// Copyright Epic Games, Inc. All Rights Reserved.
#include "M4Decoder.h"
#include "M4MemOps.h"
#include "M4Memory.h"
#include "M4Image.h"
#include "M4idct.h"
#include "M4InvQuant.h"
template <typename T, typename C>
T ADVANCE_POINTER(T pPointer, C numBytes)
{
return T(size_t(pPointer) + size_t(numBytes));
}
namespace vdecmpeg4
{
// Enable code optimizations. These work only on little-endian systems and, depending on the compiler, may not
// result in more performant code than what the compiler generates on its own.
// Some compilers produce better results without these bit-fiddling optimizations.
#define ENABLE_OPTIMIZATION 0
static int16 iclip[1024];
static int16 *iclp = nullptr;
struct CLIPinitializer
{
CLIPinitializer()
{
iclp = iclip + 512;
for(int32 i= -512; i<512; i++)
{
iclp[i] = IntCastChecked<int16>(i < -256 ? -256 : i > 255 ? 255 : i);
}
}
};
static CLIPinitializer _sgClipTableInitializer;
static uint8 clampToUINT8(int16 In)
{
return (uint8)(In < 0 ? 0 : In > 255 ? 255 : In);
}
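// Copies an 8x8 block of 8-bit pixels, one 64-bit transfer per row. Assumes src, dst and stride permit 64-bit accesses.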
static void _memTrans8to8Y(uint8* dst, const uint8* src, int32 stride)
{
const uint64 * src64 = (const uint64 *)src;
uint64 * dst64 = (uint64 *)dst;
for(uint32 i=8; i; --i)
{
*dst64 = *src64;
src64 = ADVANCE_POINTER(src64, stride);
dst64 = ADVANCE_POINTER(dst64, stride);
}
}
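// Copies a 16x16 pixel area (a full luma macroblock), two 64-bit transfers per row.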
static void _memTrans8to8Yx4(uint8* dst, const uint8* src, int32 stride)
{
const uint64 * src64 = (const uint64 *)src;
uint64 * dst64 = (uint64 *)dst;
for(uint32 i=16; i; --i)
{
dst64[0] = src64[0];
dst64[1] = src64[1];
src64 = ADVANCE_POINTER(src64, stride);
dst64 = ADVANCE_POINTER(dst64, stride);
}
}
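// Converts an 8x8 block of 16-bit coefficients to 8-bit pixels, clamping each value to [0, 255].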
static void _memTrans16to8Y(uint8* dst, const int16* src, const int32 stride)
{
for(uint32 i=8; i; --i)
{
dst[0] = clampToUINT8(*src++);
dst[1] = clampToUINT8(*src++);
dst[2] = clampToUINT8(*src++);
dst[3] = clampToUINT8(*src++);
dst[4] = clampToUINT8(*src++);
dst[5] = clampToUINT8(*src++);
dst[6] = clampToUINT8(*src++);
dst[7] = clampToUINT8(*src++);
dst += stride;
}
}
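// Writes four consecutive 8x8 coefficient blocks into a 16x16 luma area: blocks 0/1 fill the top half, blocks 2/3 the bottom half.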
static void _memTrans16to8Yx4(uint8* dst, const int16* src, const int32 stride)
{
const int16 *srcA = src;
const int16 *srcB = src + 64;
for(uint32 i=8; i; --i)
{
dst[ 0] = clampToUINT8(*srcA++);
dst[ 1] = clampToUINT8(*srcA++);
dst[ 2] = clampToUINT8(*srcA++);
dst[ 3] = clampToUINT8(*srcA++);
dst[ 4] = clampToUINT8(*srcA++);
dst[ 5] = clampToUINT8(*srcA++);
dst[ 6] = clampToUINT8(*srcA++);
dst[ 7] = clampToUINT8(*srcA++);
dst[ 8] = clampToUINT8(*srcB++);
dst[ 9] = clampToUINT8(*srcB++);
dst[10] = clampToUINT8(*srcB++);
dst[11] = clampToUINT8(*srcB++);
dst[12] = clampToUINT8(*srcB++);
dst[13] = clampToUINT8(*srcB++);
dst[14] = clampToUINT8(*srcB++);
dst[15] = clampToUINT8(*srcB++);
dst += stride;
}
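// srcB now points at block 2 (bottom-left); block 3 (bottom-right) starts 64 coefficients further on.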
srcA = srcB + 64;
for(uint32 i=8; i; --i)
{
dst[ 0] = clampToUINT8(*srcB++);
dst[ 1] = clampToUINT8(*srcB++);
dst[ 2] = clampToUINT8(*srcB++);
dst[ 3] = clampToUINT8(*srcB++);
dst[ 4] = clampToUINT8(*srcB++);
dst[ 5] = clampToUINT8(*srcB++);
dst[ 6] = clampToUINT8(*srcB++);
dst[ 7] = clampToUINT8(*srcB++);
dst[ 8] = clampToUINT8(*srcA++);
dst[ 9] = clampToUINT8(*srcA++);
dst[10] = clampToUINT8(*srcA++);
dst[11] = clampToUINT8(*srcA++);
dst[12] = clampToUINT8(*srcA++);
dst[13] = clampToUINT8(*srcA++);
dst[14] = clampToUINT8(*srcA++);
dst[15] = clampToUINT8(*srcA++);
dst += stride;
}
}
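// Adds an 8x8 block of 16-bit residuals to the 8-bit prediction in place, clamping the result to [0, 255].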
static void _memTrans16to8AddY(uint8* dst, const int16* src, const int32 stride)
{
for(uint32 i=8; i; --i)
{
dst[0] = clampToUINT8((int16)dst[0] + *src++);
dst[1] = clampToUINT8((int16)dst[1] + *src++);
dst[2] = clampToUINT8((int16)dst[2] + *src++);
dst[3] = clampToUINT8((int16)dst[3] + *src++);
dst[4] = clampToUINT8((int16)dst[4] + *src++);
dst[5] = clampToUINT8((int16)dst[5] + *src++);
dst[6] = clampToUINT8((int16)dst[6] + *src++);
dst[7] = clampToUINT8((int16)dst[7] + *src++);
dst += stride;
}
}
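// Adds four consecutive 8x8 residual blocks to a 16x16 luma prediction; same block ordering as _memTrans16to8Yx4.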
static void _memTrans16to8AddYx4(uint8* dst, const int16* src, const int32 stride)
{
const int16 *srcA = src;
const int16 *srcB = src + 64;
for(uint32 i=8; i; --i)
{
dst[ 0] = clampToUINT8((int16)dst[ 0] + *srcA++);
dst[ 1] = clampToUINT8((int16)dst[ 1] + *srcA++);
dst[ 2] = clampToUINT8((int16)dst[ 2] + *srcA++);
dst[ 3] = clampToUINT8((int16)dst[ 3] + *srcA++);
dst[ 4] = clampToUINT8((int16)dst[ 4] + *srcA++);
dst[ 5] = clampToUINT8((int16)dst[ 5] + *srcA++);
dst[ 6] = clampToUINT8((int16)dst[ 6] + *srcA++);
dst[ 7] = clampToUINT8((int16)dst[ 7] + *srcA++);
dst[ 8] = clampToUINT8((int16)dst[ 8] + *srcB++);
dst[ 9] = clampToUINT8((int16)dst[ 9] + *srcB++);
dst[10] = clampToUINT8((int16)dst[10] + *srcB++);
dst[11] = clampToUINT8((int16)dst[11] + *srcB++);
dst[12] = clampToUINT8((int16)dst[12] + *srcB++);
dst[13] = clampToUINT8((int16)dst[13] + *srcB++);
dst[14] = clampToUINT8((int16)dst[14] + *srcB++);
dst[15] = clampToUINT8((int16)dst[15] + *srcB++);
dst += stride;
}
srcA = srcB + 64;
for(uint32 i=8; i; --i)
{
dst[ 0] = clampToUINT8((int16)dst[ 0] + *srcB++);
dst[ 1] = clampToUINT8((int16)dst[ 1] + *srcB++);
dst[ 2] = clampToUINT8((int16)dst[ 2] + *srcB++);
dst[ 3] = clampToUINT8((int16)dst[ 3] + *srcB++);
dst[ 4] = clampToUINT8((int16)dst[ 4] + *srcB++);
dst[ 5] = clampToUINT8((int16)dst[ 5] + *srcB++);
dst[ 6] = clampToUINT8((int16)dst[ 6] + *srcB++);
dst[ 7] = clampToUINT8((int16)dst[ 7] + *srcB++);
dst[ 8] = clampToUINT8((int16)dst[ 8] + *srcA++);
dst[ 9] = clampToUINT8((int16)dst[ 9] + *srcA++);
dst[10] = clampToUINT8((int16)dst[10] + *srcA++);
dst[11] = clampToUINT8((int16)dst[11] + *srcA++);
dst[12] = clampToUINT8((int16)dst[12] + *srcA++);
dst[13] = clampToUINT8((int16)dst[13] + *srcA++);
dst[14] = clampToUINT8((int16)dst[14] + *srcA++);
dst[15] = clampToUINT8((int16)dst[15] + *srcA++);
dst += stride;
}
}
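// Copies a complete macroblock (16x16 luma plus one 8x8 block each for U and V) at (mbx, mby) from the reference image into the current image.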
void M4MemOpInterMBCopyAll(void* _current, int32 mbx, int32 mby, void* _reference)
{
M4Image* current = (M4Image*)_current;
M4Image* reference = (M4Image*)_reference;
uint8* currentY = current->mImage.y;
uint8* referenceY = reference->mImage.y;
int32 edgedWidth = current->mImage.texWidth;
int32 stride = current->mImage.texWidth;
static_assert(M4_MEM_SHIFT_MB_TO_Y == M4_MEM_SHIFT_MB_TO_UV+1, "Constant mismatch");
static_assert(M4_MEM_OFFSET_LEFT_BLOCK == 8, "Constant mismatch");
int32 commonOffset = ( mbx + edgedWidth * mby) << 4;
int32 commonOffsetUV = ((mbx << 1) + edgedWidth * mby) << 2;
#if 1
_memTrans8to8Yx4(currentY + commonOffset, referenceY + commonOffset, stride);
#else
_memTrans8to8Y(currentY + commonOffset, referenceY + commonOffset, stride);
_memTrans8to8Y(currentY + commonOffset + M4_MEM_OFFSET_LEFT_BLOCK, referenceY + commonOffset + M4_MEM_OFFSET_LEFT_BLOCK, stride);
_memTrans8to8Y(currentY + commonOffset + (edgedWidth << 3), referenceY + commonOffset + (edgedWidth << 3), stride);
_memTrans8to8Y(currentY + commonOffset + M4_MEM_OFFSET_LEFT_BLOCK + (edgedWidth << 3), referenceY + commonOffset + M4_MEM_OFFSET_LEFT_BLOCK + (edgedWidth << 3), stride);
#endif
_memTrans8to8Y(current->mImage.u + commonOffsetUV, reference->mImage.u + commonOffsetUV, stride >> 1);
_memTrans8to8Y(current->mImage.v + commonOffsetUV, reference->mImage.v + commonOffsetUV, stride >> 1);
}
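// Stores a decoded intra macroblock: the six 8x8 coefficient blocks in _dct (four luma, two chroma) are written as 8-bit pixels at (mbx, mby).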
void M4MemOpIntraMBAll(void* _current, int32 mbx, int32 mby, void* _dct)
{
M4Image* current = (M4Image*)_current;
const int16* dct = (const int16*)_dct;
int32 stride = current->mImage.texWidth;
int32 stride2 = stride >> 1;
uint8* pY_Cur = current->mImage.y + (mby << 4) * stride + (mbx << M4_MEM_SHIFT_MB_TO_Y);
uint8* pU_Cur = current->mImage.u + (mby << 3) * stride2 + (mbx << M4_MEM_SHIFT_MB_TO_UV);
uint8* pV_Cur = current->mImage.v + (mby << 3) * stride2 + (mbx << M4_MEM_SHIFT_MB_TO_UV);
#if 1
_memTrans16to8Yx4(pY_Cur, dct, stride);
#else
int32 next_block = stride << 3;
_memTrans16to8Y(pY_Cur, &dct[0 * 64], stride);
_memTrans16to8Y(pY_Cur + M4_MEM_OFFSET_LEFT_BLOCK, &dct[1 * 64], stride);
_memTrans16to8Y(pY_Cur + next_block, &dct[2 * 64], stride);
_memTrans16to8Y(pY_Cur + M4_MEM_OFFSET_LEFT_BLOCK + next_block, &dct[3 * 64], stride);
#endif
_memTrans16to8Y(pU_Cur, &dct[4 * 64], stride2);
_memTrans16to8Y(pV_Cur, &dct[5 * 64], stride2);
}
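// Inverse quantization (quantizer type 0) for intra blocks: DC is scaled by dcscalar, AC coefficients by 2*quant plus an odd/even correction term, and all values are saturated to [-2048, 2047].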
void M4InvQuantType0Intra(int16 *data, const int16 *coeff, uint8 quant, uint16 dcscalar)
{
#if 0
const int32 quant_m_2 = quant << 1;
const int32 quant_add = quant & ~1;
data[0] = __ssat(coeff[0] * dcscalar, 12);
for(uint32 i=63; i>0; --i)
{
int32 acLevel = coeff[i];
if (acLevel == 0)
{
data[i] = 0;
}
else
{
data[i] = int16(__ssat(quant_m_2 * acLevel + (acLevel > 0 ? quant_add : -quant_add), 12));
}
}
#else
const int32 quant_m_2 = quant << 1;
const int32 quant_add = (quant & 1 ? quant : quant - 1);
data[0] = coeff[0] * dcscalar;
if (data[0] < -2048)
{
data[0] = -2048;
}
else if (data[0] > 2047)
{
data[0] = 2047;
}
for(uint32 i=1; i<64; ++i)
{
int32 acLevel = coeff[i];
if (acLevel == 0)
{
data[i] = 0;
}
else if (acLevel < 0)
{
acLevel = quant_m_2 * -acLevel + quant_add;
data[i] = (int16)(acLevel <= 2048 ? -acLevel : -2048);
}
else
{
acLevel = quant_m_2 * acLevel + quant_add;
data[i] = (int16)(acLevel <= 2047 ? acLevel : 2047);
}
}
#endif
}
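// Inverse quantization (quantizer type 0) for inter blocks: every coefficient, including DC, uses the 2*quant scaling and is saturated to [-2048, 2047].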
void M4InvQuantType0Inter(int16 *data, const int16 *coeff, const uint8 quant)
{
const uint16 quant_m_2 = (uint16)(quant << 1);
const uint16 quant_add = (quant & 1 ? quant : quant - 1);
for(uint32 i=0; i<64; ++i)
{
// Use a 32-bit intermediate like the intra path does; quant_m_2 * acLevel can exceed the int16 range before it is clamped.
int32 acLevel = coeff[i];
if (acLevel == 0)
{
data[i] = 0;
}
else if (acLevel < 0)
{
acLevel = acLevel * quant_m_2 - quant_add;
data[i] = (int16)(acLevel >= -2048 ? acLevel : -2048);
}
else
{
acLevel = acLevel * quant_m_2 + quant_add;
data[i] = (int16)(acLevel <= 2047 ? acLevel : 2047);
}
}
}
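// Adds the decoded residual of an inter macroblock to the prediction already in the current image. cbp flags which of the six blocks carry coefficients (bits 5..2: luma blocks 0..3, bit 1: U, bit 0: V).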
void M4MemOpInterMBAdd(void* _current, int32 mbx, int32 mby, void* _dct, uint32 cbp)
{
M4Image* current = (M4Image*)_current;
const int16* dct = (const int16*)_dct;
int32 stride = current->mImage.texWidth;
int32 stride2 = stride / 2;
int32 next_block = stride * 8;
uint8* pY_Cur = current->mImage.y + (mby << 4) * stride + (mbx << M4_MEM_SHIFT_MB_TO_Y);
uint8* pU_Cur = current->mImage.u + (mby << 3) * stride2 + (mbx << M4_MEM_SHIFT_MB_TO_UV);
uint8* pV_Cur = current->mImage.v + (mby << 3) * stride2 + (mbx << M4_MEM_SHIFT_MB_TO_UV);
if ((cbp & 60) == 60)
{
_memTrans16to8AddYx4(pY_Cur, dct, stride);
}
else
{
if (cbp & 32) _memTrans16to8AddY(pY_Cur, &dct[0 * 64], stride);
if (cbp & 16) _memTrans16to8AddY(pY_Cur + M4_MEM_OFFSET_LEFT_BLOCK, &dct[1 * 64], stride);
if (cbp & 8) _memTrans16to8AddY(pY_Cur + next_block, &dct[2 * 64], stride);
if (cbp & 4) _memTrans16to8AddY(pY_Cur + M4_MEM_OFFSET_LEFT_BLOCK + next_block, &dct[3 * 64], stride);
}
if (cbp & 2) _memTrans16to8AddY(pU_Cur, &dct[4 * 64], stride2);
if (cbp & 1) _memTrans16to8AddY(pV_Cur, &dct[5 * 64], stride2);
}
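// Fixed-point inverse 8x8 DCT: a row pass followed by a column pass, with the column output clamped through the iclp table.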
void M4idct(int16* block)
{
#define W1 2841 /* 2048*sqrt(2)*cos(1*pi/16) */
#define W2 2676 /* 2048*sqrt(2)*cos(2*pi/16) */
#define W3 2408 /* 2048*sqrt(2)*cos(3*pi/16) */
#define W5 1609 /* 2048*sqrt(2)*cos(5*pi/16) */
#define W6 1108 /* 2048*sqrt(2)*cos(6*pi/16) */
#define W7 565 /* 2048*sqrt(2)*cos(7*pi/16) */
int16* blk;
int32 X0, X1, X2, X3, X4, X5, X6, X7;
int32 tmp0;
for(int32 i=7; i>=0; --i) // idct rows
{
blk = block + (i << 3);
X4 = blk[1];
X3 = blk[2];
X7 = blk[3];
X1 = blk[4] << 11;
X6 = blk[5];
X2 = blk[6];
X5 = blk[7];
if (! (X1 | X2 | X3 | X4 | X5 | X6 | X7) )
{
blk[0] = blk[1] = blk[2] = blk[3] = blk[4] = blk[5] = blk[6] = blk[7] = (int16)(blk[0] << 3);
continue;
}
X0 = (blk[0] << 11) + 128; // for proper rounding in the fourth stage
// first stage
tmp0 = W7 * (X4 + X5);
X4 = tmp0 + (W1 - W7) * X4;
X5 = tmp0 - (W1 + W7) * X5;
tmp0 = W6 * (X3 + X2);
X2 = tmp0 - (W2 + W6) * X2;
X3 = tmp0 + (W2 - W6) * X3;
tmp0 = W3 * (X6 + X7);
X6 = tmp0 - (W3 - W5) * X6;
X7 = tmp0 - (W3 + W5) * X7;
// second stage
tmp0 = X0 + X1;
X0 -= X1;
X1 = X4 + X6;
X4 -= X6;
X6 = X5 + X7;
X5 -= X7;
// third stage
X7 = tmp0 + X3;
tmp0 -= X3;
X3 = X0 + X2;
X0 -= X2;
X2 = (181 * (X4 + X5) + 128) >> 8;
X4 = (181 * (X4 - X5) + 128) >> 8;
// fourth stage
blk[0] = (int16)((X7 + X1) >> 8);
blk[1] = (int16)((X3 + X2) >> 8);
blk[2] = (int16)((X0 + X4) >> 8);
blk[3] = (int16)((tmp0 + X6) >> 8);
blk[4] = (int16)((tmp0 - X6) >> 8);
blk[5] = (int16)((X0 - X4) >> 8);
blk[6] = (int16)((X3 - X2) >> 8);
blk[7] = (int16)((X7 - X1) >> 8);
} // IDCT-rows
for(int32 i=7; i>=0; --i) // idct columns
{
blk = block + i;
X1 = blk[8 * 4] << 8;
X2 = blk[8 * 6];
X3 = blk[8 * 2];
X4 = blk[8 * 1];
X5 = blk[8 * 7];
X6 = blk[8 * 5];
X7 = blk[8 * 3];
if (! (X1 | X2 | X3 | X4 | X5 | X6 | X7) )
{
blk[8 * 0] = blk[8 * 1] = blk[8 * 2] = blk[8 * 3] = blk[8 * 4] =
blk[8 * 5] = blk[8 * 6] = blk[8 * 7] = iclp[(blk[8 * 0] + 32) >> 6];
continue;
}
X0 = (blk[8 * 0] << 8) + 8192;
// first stage
tmp0 = W7 * (X4 + X5) + 4;
X4 = (tmp0 + (W1 - W7) * X4) >> 3;
X5 = (tmp0 - (W1 + W7) * X5) >> 3;
tmp0 = W3 * (X6 + X7) + 4;
X6 = (tmp0 - (W3 - W5) * X6) >> 3;
X7 = (tmp0 - (W3 + W5) * X7) >> 3;
tmp0 = W6 * (X3 + X2) + 4;
X2 = (tmp0 - (W2 + W6) * X2) >> 3;
X3 = (tmp0 + (W2 - W6) * X3) >> 3;
// second stage
tmp0 = X0 + X1;
X0 -= X1;
X1 = X4 + X6;
X4 -= X6;
X6 = X5 + X7;
X5 -= X7;
// third stage
X7 = tmp0 + X3;
tmp0 -= X3;
X3 = X0 + X2;
X0 -= X2;
X2 = (181 * (X4 + X5) + 128) >> 8;
X4 = (181 * (X4 - X5) + 128) >> 8;
// fourth stage
blk[8 * 0] = iclp[(X7 + X1) >> 14];
blk[8 * 1] = iclp[(X3 + X2) >> 14];
blk[8 * 2] = iclp[(X0 + X4) >> 14];
blk[8 * 3] = iclp[(tmp0 + X6) >> 14];
blk[8 * 4] = iclp[(tmp0 - X6) >> 14];
blk[8 * 5] = iclp[(X0 - X4) >> 14];
blk[8 * 6] = iclp[(X3 - X2) >> 14];
blk[8 * 7] = iclp[(X7 - X1) >> 14];
}
}
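// Averaging blend: each destination pixel becomes (src + dst + 1) / 2. 8x8 and 16x16 variants follow.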
static void _mbBlendSrcDst8x8(uint8* Dest, const uint8* Src, int32 Stride)
{
const int32 blkSize = 8;
for(int32 j=blkSize; j>0; --j)
{
const uint8* pS = Src;
uint8* pD = Dest;
for(int32 i=blkSize; i>0; --i)
{
int32 tot = (*pS++ + *pD + 1) >> 1;
*pD++ = (uint8)tot;
}
Src += Stride;
Dest += Stride;
}
}
static void _mbBlendSrcDst16x16(uint8* Dest, const uint8* Src, int32 Stride)
{
const int32 blkSize = 16;
for(int32 j=blkSize; j>0; --j)
{
const uint8* pS = Src;
uint8* pD = Dest;
for(int32 i=blkSize; i>0; --i)
{
int32 tot = (*pS++ + *pD + 1) >> 1;
*pD++ = (uint8)tot;
}
Src += Stride;
Dest += Stride;
}
}
static void _mbCopy8x8(uint8* Dest, const uint8* Src, int32 Stride)
{
_memTrans8to8Y(Dest, Src, Stride);
}
static void _mbCopy16x16(uint8* Dest, const uint8* Src, int32 Stride)
{
_memTrans8to8Yx4(Dest, Src, Stride);
}
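// Half-pel interpolation helpers (horizontal, vertical and diagonal) in 8x8 and 16x16 variants.
// The ENABLE_OPTIMIZATION paths process eight pixels per 64-bit word; the plain loops average pixel by pixel.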
static void _mbInterpolateHorizontal8x8(uint8* Dest, const uint8* Src, int32 Rounding, int32 Stride)
{
#if ENABLE_OPTIMIZATION
uint64 R = 0;
for(uint32 v=8; v; --v)
{
// BIG ENDIAN: uint64 A8 = *reinterpret_cast<const uint64*>(Src);
// BIG ENDIAN: uint64 B8 = (A8 << 8) | Src[8];
// NOTE: This is correct for little endian machines only
uint64 A8 = *reinterpret_cast<const uint64*>(Src + 1);
uint64 B8 = (A8 << 8) | Src[0];
for(uint32 u=8; u; --u)
{
int32 V = ((A8 & 255) + (B8 & 255) + Rounding) >> 1;
R = (R >> 8) | ((uint64)V << 56);
A8 >>= 8;
B8 >>= 8;
}
*reinterpret_cast<uint64*>(Dest) = R;
Src = ADVANCE_POINTER(Src, Stride);
Dest = ADVANCE_POINTER(Dest, Stride);
}
#else
const int32 blkSize = 8;
for(int32 v=0; v<blkSize; ++v)
{
for(int32 u=0; u<blkSize; ++u)
{
int32 sum = (Src[u] + Src[u + 1] + Rounding) / 2;
Dest[u] = (uint8)sum;
}
Src += Stride;
Dest += Stride;
}
#endif
}
static void _mbInterpolateHorizontal16x16(uint8* Dest, const uint8* Src, int32 Rounding, int32 Stride)
{
#if ENABLE_OPTIMIZATION
uint64 R0 = 0;
uint64 R1 = 0;
for(uint32 v=16; v; --v)
{
// NOTE: This is correct for little endian machines only
uint64 A8R = *reinterpret_cast<const uint64*>(Src + 1);
uint64 A8L = *reinterpret_cast<const uint64*>(Src + 9);
uint64 B8R = (A8R << 8) | Src[0];
uint64 B8L = (A8L << 8) | (A8R >> 56);
for(uint32 u=8; u; --u)
{
int32 V0 = ((A8R & 255) + (B8R & 255) + Rounding) >> 1;
int32 V1 = ((A8L & 255) + (B8L & 255) + Rounding) >> 1;
R0 = (R0 >> 8) | ((uint64)V0 << 56);
R1 = (R1 >> 8) | ((uint64)V1 << 56);
A8L >>= 8;
A8R >>= 8;
B8L >>= 8;
B8R >>= 8;
}
*reinterpret_cast<uint64*>(Dest ) = R0;
*reinterpret_cast<uint64*>(Dest + 8) = R1;
Src = ADVANCE_POINTER(Src, Stride);
Dest = ADVANCE_POINTER(Dest, Stride);
}
#else
const int32 blkSize = 16;
for(int32 v=0; v<blkSize; ++v)
{
for(int32 u=0; u<blkSize; ++u)
{
int32 sum = (Src[u] + Src[u + 1] + Rounding) / 2;
M4CHECK(sum <= 255);
Dest[u] = (uint8)sum;
}
Src += Stride;
Dest += Stride;
}
#endif
}
static void _mbInterpolateVertical8x8(uint8* Dest, const uint8* Src, int32 Rounding, int32 Stride)
{
#if ENABLE_OPTIMIZATION
// NOTE: This is machine endian independent!
uint64 R = 0;
uint64 A8 = *reinterpret_cast<const uint64*>(Src);
for(uint32 v=8; v; --v)
{
uint64 NextRow = *reinterpret_cast<const uint64*>(Src + Stride);
uint64 B8 = NextRow;
for(uint32 u=8; u; --u)
{
int32 V = ((A8 & 255) + (B8 & 255) + Rounding) >> 1;
R = (R >> 8) | ((uint64)V << 56);
A8 >>= 8;
B8 >>= 8;
}
*reinterpret_cast<uint64*>(Dest) = R;
A8 = NextRow;
Src = ADVANCE_POINTER(Src, Stride);
Dest = ADVANCE_POINTER(Dest, Stride);
}
#else
const int32 blkSize = 8;
for(int32 v=0; v<blkSize; ++v)
{
for(int32 u=0; u<blkSize; ++u)
{
int32 sum = (Src[u] + Src[u + Stride] + Rounding) / 2;
M4CHECK(sum <= 255);
Dest[u] = (uint8)sum;
}
Src += Stride;
Dest += Stride;
}
#endif
}
static void _mbInterpolateVertical16x16(uint8* Dest, const uint8* Src, int32 Rounding, int32 Stride)
{
#if ENABLE_OPTIMIZATION
// NOTE: This is machine endian independent!
uint64 R0 = 0;
uint64 R1 = 0;
uint64 A8R = *reinterpret_cast<const uint64*>(Src);
uint64 A8L = *reinterpret_cast<const uint64*>(Src + 8);
for(uint32 v=16; v; --v)
{
uint64 NextRowR = *reinterpret_cast<const uint64*>(Src + Stride);
uint64 NextRowL = *reinterpret_cast<const uint64*>(Src + Stride + 8);
uint64 B8R = NextRowR;
uint64 B8L = NextRowL;
for(uint32 u=8; u; --u)
{
int32 V0 = ((A8R & 255) + (B8R & 255) + Rounding) >> 1;
int32 V1 = ((A8L & 255) + (B8L & 255) + Rounding) >> 1;
R0 = (R0 >> 8) | ((uint64)V0 << 56);
R1 = (R1 >> 8) | ((uint64)V1 << 56);
A8L >>= 8;
A8R >>= 8;
B8L >>= 8;
B8R >>= 8;
}
*reinterpret_cast<uint64*>(Dest ) = R0;
*reinterpret_cast<uint64*>(Dest + 8) = R1;
A8R = NextRowR;
A8L = NextRowL;
Src = ADVANCE_POINTER(Src, Stride);
Dest = ADVANCE_POINTER(Dest, Stride);
}
#else
const int32 blkSize = 16;
for(int32 v=0; v<blkSize; ++v)
{
for(int32 u=0; u<blkSize; ++u)
{
int32 sum = (Src[u] + Src[u + Stride] + Rounding) / 2;
M4CHECK(sum <= 255);
Dest[u] = (uint8)sum;
}
Src += Stride;
Dest += Stride;
}
#endif
}
static void _mbInterpolateBoth8x8(uint8* Dest, const uint8* Src, int32 Rounding, int32 Stride)
{
#if ENABLE_OPTIMIZATION
// NOTE: This is correct for little endian machines only
uint64 R = 0;
uint64 R0C0 = *reinterpret_cast<const uint64*>(Src + 1);
uint64 R0C1 = (R0C0 << 8) | Src[0];
Src = ADVANCE_POINTER(Src, Stride);
for(uint32 v=8; v; --v)
{
uint64 NextRow0 = *reinterpret_cast<const uint64*>(Src + 1);
uint64 NextRow1 = (NextRow0 << 8) | Src[0];
uint64 R1C0 = NextRow0;
uint64 R1C1 = NextRow1;
for(uint32 u=8; u; --u)
{
int32 V = ((R0C0 & 255) + (R0C1 & 255) + (R1C0 & 255) + (R1C1 & 255) + Rounding) >> 2;
R = (R >> 8) | ((uint64)V << 56);
R0C0 >>= 8;
R0C1 >>= 8;
R1C0 >>= 8;
R1C1 >>= 8;
}
*reinterpret_cast<uint64*>(Dest) = R;
R0C0 = NextRow0;
R0C1 = NextRow1;
Src = ADVANCE_POINTER(Src, Stride);
Dest = ADVANCE_POINTER(Dest, Stride);
}
#else
const int32 blkSize = 8;
for(int32 v=0; v<blkSize; ++v)
{
for(int32 u=0; u<blkSize; ++u)
{
int32 sum = (Src[u] + Src[u + 1] + Src[u + Stride] + Src[u + Stride + 1] + Rounding) / 4;
M4CHECK(sum <= 255);
Dest[u] = (uint8)sum;
}
Src += Stride;
Dest += Stride;
}
#endif
}
static void _mbInterpolateBoth16x16(uint8* Dest, const uint8* Src, int32 Rounding, int32 Stride)
{
#if ENABLE_OPTIMIZATION
// NOTE: This is correct for little endian machines only
uint64 R0 = 0;
uint64 R1 = 0;
uint64 R0C1 = *reinterpret_cast<const uint64*>(Src + 1);
uint64 R0C2 = *reinterpret_cast<const uint64*>(Src + 9);
uint64 R0C1S = (R0C1 << 8) | Src[0];
uint64 R0C2S = (R0C2 << 8) | (R0C1 >> 56);
Src = ADVANCE_POINTER(Src, Stride);
for(uint32 v=16; v; --v)
{
uint64 R1C1 = *reinterpret_cast<const uint64*>(Src + 1);
uint64 R1C2 = *reinterpret_cast<const uint64*>(Src + 9);
uint64 R1C1S = (R1C1 << 8) | Src[0];
uint64 R1C2S = (R1C2 << 8) | (R1C1 >> 56);
uint64 n0 = R1C1;
uint64 n1 = R1C2;
uint64 n2 = R1C1S;
uint64 n3 = R1C2S;
for(uint32 u=8; u; --u)
{
int32 V0 = ((R0C1 & 255) + (R0C1S & 255) + (R1C1 & 255) + (R1C1S & 255) + Rounding) >> 2;
int32 V1 = ((R0C2 & 255) + (R0C2S & 255) + (R1C2 & 255) + (R1C2S & 255) + Rounding) >> 2;
R0 = (R0 >> 8) | ((uint64)V0 << 56);
R1 = (R1 >> 8) | ((uint64)V1 << 56);
R0C1 >>= 8;
R0C2 >>= 8;
R0C1S >>= 8;
R0C2S >>= 8;
R1C1 >>= 8;
R1C2 >>= 8;
R1C1S >>= 8;
R1C2S >>= 8;
}
*reinterpret_cast<uint64*>(Dest ) = R0;
*reinterpret_cast<uint64*>(Dest + 8) = R1;
R0C1 = n0;
R0C2 = n1;
R0C1S = n2;
R0C2S = n3;
Src = ADVANCE_POINTER(Src, Stride);
Dest = ADVANCE_POINTER(Dest, Stride);
}
#else
const int32 blkSize = 16;
for(int32 v=0; v<blkSize; ++v)
{
for(int32 u=0; u<blkSize; ++u)
{
int32 sum = (Src[u] + Src[u + 1] + Src[u + Stride] + Src[u + Stride + 1] + Rounding) / 4;
M4CHECK(sum <= 255);
Dest[u] = (uint8)sum;
}
Src += Stride;
Dest += Stride;
}
#endif
}
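// Motion-compensated block fetch with half-pel precision. The low bits of the motion vector select between a straight copy and horizontal, vertical or diagonal interpolation; b4x4 selects the 16x16 variants (four 8x8 blocks) instead of a single 8x8 block.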
void M4MemHalfPelInterpolate(void* dst, void* src, int32 stride, int32 x, int32 y, void* mv, uint32 rounding, bool b4x4)
{
M4CHECK(((size_t)dst & 3) == 0); // better be the case!
uint8* cur = (uint8*)dst;
const uint8* refn = (const uint8*)src;
const M4_VECTOR* delta = (const M4_VECTOR*)mv;
int32 ddx, ddy;
// function entered with actual x/y position
switch(((delta->x & 1) << 1) + (delta->y & 1))
{
case 0:
{
// No interpolation, straight copy
ddx = delta->x / 2;
ddy = delta->y / 2;
refn += x + ddx + (y + ddy) * stride;
cur += x + y * stride;
if (b4x4)
{
_mbCopy16x16(cur, refn, stride);
}
else
{
_mbCopy8x8(cur, refn, stride);
}
break;
}
case 1:
{
//-------------------------------------------------------
// Vertical interpolate
//
ddx = delta->x / 2;
ddy = (delta->y - 1) / 2;
refn += x + ddx + (y + ddy) * stride;
cur += x + y * stride;
int32 r = 1 - rounding;
if (b4x4)
{
_mbInterpolateVertical16x16(cur, refn, r, stride);
}
else
{
_mbInterpolateVertical8x8(cur, refn, r, stride);
}
break;
}
case 2:
{
//-------------------------------------------------------
// Horizontal interpolate
//
ddx = (delta->x-1)/2;
ddy = delta->y/2;
refn += x + ddx + (y + ddy) * stride;
cur += x + y * stride;
int32 r = 1 - rounding;
if (b4x4)
{
_mbInterpolateHorizontal16x16(cur, refn, r, stride);
}
else
{
_mbInterpolateHorizontal8x8(cur, refn, r, stride);
}
break;
}
default:
{
//-------------------------------------------------------
// Both axis interpolate
//
ddx = (delta->x - 1) / 2;
ddy = (delta->y - 1) / 2;
refn += x + ddx + (y + ddy) * stride;
cur += x + y * stride;
int32 r = 2 - rounding;
if (b4x4)
{
_mbInterpolateBoth16x16(cur, refn, r, stride);
}
else
{
_mbInterpolateBoth8x8(cur, refn, r, stride);
}
break;
}
}
}
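// Blends the 8x8 block at (x, y) of src into dst by per-pixel averaging.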
static void _interpolate8x8Simple(uint8* dst, const uint8* src, const int32 x, const int32 y, const int32 stride)
{
int32 off = x + y * stride;
src += off;
dst += off;
_mbBlendSrcDst8x8(dst, src, stride);
}
static void _interpolate16x16Simple(uint8* dst, const uint8* src, const int32 x, const int32 y, const int32 stride)
{
int32 off = x + y * stride;
src += off;
dst += off;
_mbBlendSrcDst16x16(dst, src, stride);
}
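// Averages the reference macroblock into the current image for luma and both chroma planes, merging the forward and backward images.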
void M4MemOpInterpolateAll(void* _current, int32 mbx, int32 mby, void* _reference)
{
M4Image* current = (M4Image*)_current;
const M4Image* reference = (const M4Image*)_reference;
int32 stridex = current->mImage.texWidth;
int32 stride2x = stridex / 2;
int32 pmbx = mbx << 4;
int32 pmby = mby << 4;
int32 pmbx2 = mbx << 3;
int32 pmby2 = mby << 3;
// merge forward and backward images
uint8* curx = current->mImage.y;
uint8* refx = reference->mImage.y;
_interpolate16x16Simple(curx, refx, pmbx, pmby, stridex);
_interpolate8x8Simple(current->mImage.u, reference->mImage.u, pmbx2, pmby2, stride2x);
_interpolate8x8Simple(current->mImage.v, reference->mImage.v, pmbx2, pmby2, stride2x);
}
}