// Copyright Epic Games, Inc. All Rights Reserved.

/*=============================================================================
	ETCCompressionCommon.ush:
	Helpers for compute shader ETC2 texture compression
=============================================================================*/

#pragma once

#include "GammaCorrectionCommon.ush"

// Values for the block-mode flag of an ETC color block.
// CompressBlock_ETC1S shifts BLOCK_MODE_DIFFERENTIAL into bit 25 of the first output word;
// BLOCK_MODE_INDIVIDUAL is not referenced by the helpers in this file.
#define BLOCK_MODE_INDIVIDUAL				0
#define BLOCK_MODE_DIFFERENTIAL				1u
// Number of RGB distance tables; EtcSelectRGBTableIndex returns indices 0..7.
#define NUM_RGB_TABLES 8
// Number of alpha distance tables; the highest index used in this file is 15.
// NOTE(review): presumably the array sizes of the tables in EtcParameters, which is declared elsewhere - confirm.
#define NUM_ALPHA_TABLES 16

// Weighted sum of the three channels, used as a cheap brightness measure
// when ranking pixels against a block's base color.
float EtcLuminance(in float3 c)
{
	const float3 ChannelWeights = float3(0.3f, 0.5f, 0.2f);
	return dot(c, ChannelWeights);
}

// Converts a color to 8 bits per channel, keeps only the top 5 bits of each channel
// and packs them as R in byte 0, G in byte 1 and B in byte 2.
// The low 3 bits of every byte and all of byte 3 are left at zero for the caller to fill in.
uint EtcPackColorBits(in float3 FloatColor)
{
	const uint TopFiveBitsMask = 0xf8u;

	const uint3 Color8 = uint3(round(saturate(FloatColor) * 255.f));
	const uint3 Color5 = Color8 & TopFiveBitsMask;

	return Color5.r | (Color5.g << 8u) | (Color5.b << 16u);
}

// Reverses the byte order of a 32-bit value (byte 0 <-> byte 3, byte 1 <-> byte 2).
uint EtcSwapEndian32(in uint x)
{
	const uint Byte0 = x & 0xffu;
	const uint Byte1 = (x >> 8u) & 0xffu;
	const uint Byte2 = (x >> 16u) & 0xffu;
	const uint Byte3 = x >> 24u;

	return (Byte0 << 24u) | (Byte1 << 16u) | (Byte2 << 8u) | Byte3;
}

// Picks one scale factor (1, 2 or 4) for the CoCg values of a whole block.
// The scale is chosen from the largest distance of any Co/Cg extreme to the 128/255 midpoint:
// the closer the block stays to the midpoint, the larger the scale.
// This increases precision at the expense of potential blending artifacts across blocks.
float EtcGetYCoCgScale(float2 MinCoCg, float2 MaxCoCg)
{
	const float Midpoint = 128.f / 255.f;

	const float2 MinOffset = abs(MinCoCg - Midpoint);
	const float2 MaxOffset = abs(MaxCoCg - Midpoint);
	const float LargestOffset = max(max(MinOffset.x, MinOffset.y), max(MaxOffset.x, MaxOffset.y));

	float Scale = 1.f;
	if (LargestOffset < 32.f / 255.f)
	{
		Scale = 4.f;
	}
	else if (LargestOffset < 64.f / 255.f)
	{
		Scale = 2.f;
	}
	return Scale;
}

// Stretches CoCg around the 128/255 midpoint by Scale, in place.
void EtcApplyYCoCgScale(inout float2 CoCg, float Scale)
{
	const float Midpoint = 128.f / 255.f;
	const float2 Centered = CoCg - Midpoint;
	CoCg = Centered * Scale + Midpoint;
}

// Chooses a 2-bit selector for each of the 16 pixels by comparing the pixel luminance with
// the base color luminance offset by the two distances of the given RGB table.
//
// Selector values:
//   0 - small positive distance
//   1 - big positive distance
//   2 - small negative distance
//   3 - big negative distance
//
// Returns the selectors packed into one word: the low bit of each selector goes to bits 0..15,
// the high bit to bits 16..31. Within each half the bit position is X * 4 + Y (column-major),
// where the pixel is read from Block[4 * Y + X].
uint EtcGetBlockWeights(in float3 Block[16], in float3 BaseColor, in uint TableIdx)
{
	const float SmallStep = EtcParameters.RGB_DISTANCE_TABLES[TableIdx].z / 255.f;
	const float BigStep = EtcParameters.RGB_DISTANCE_TABLES[TableIdx].w / 255.f;

	// Luminance levels reachable from the base color; only the big steps are limited to [0, 1]
	const float BaseLum = EtcLuminance(BaseColor);
	const float LumSmallPos = BaseLum + SmallStep;
	const float LumBigPos = min(1.f, BaseLum + BigStep);
	const float LumSmallNeg = BaseLum - SmallStep;
	const float LumBigNeg = max(0.f, BaseLum - BigStep);

	uint PackedSelectors = 0;
	for (uint PixelIdx = 0; PixelIdx < 16u; ++PixelIdx)
	{
		const float PixelLum = EtcLuminance(Block[PixelIdx]);

		// Zero luminance always takes the negative side so that a fully black block is encoded correctly
		const bool bUseNegativeSide = (PixelLum < BaseLum) || (PixelLum == 0.f);

		uint Selector = 0u;
		if (bUseNegativeSide)
		{
			Selector = ((PixelLum - LumBigNeg) < (LumSmallNeg - PixelLum)) ? 3u : 2u;
		}
		else
		{
			Selector = ((LumBigPos - PixelLum) < (PixelLum - LumSmallPos)) ? 1u : 0u;
		}

		// Row-major pixel index -> column-major bit position
		const uint Column = PixelIdx & 3u;
		const uint Row = PixelIdx >> 2u;
		const uint BitPos = Column * 4u + Row;

		PackedSelectors |= (Selector & 1u) << BitPos;
		PackedSelectors |= (Selector >> 1u) << (16u + BitPos);
	}
	return PackedSelectors;
}

// Guesses an RGB distance table (0..7) from a luminance range given in [0, 1] units.
// The range is rescaled to 8-bit units plus one and matched against ascending upper bounds;
// the first bound that is not reached decides the table, anything beyond the last bound gets table 7.
uint EtcSelectRGBTableIndex(float LuminanceR)
{
	const float RangeUpperBounds[7] = { 8.f, 17.f, 29.f, 42.f, 60.f, 80.f, 106.f };

	const float Range = LuminanceR * 255.f + 1.f;
	for (uint TableIdx = 0u; TableIdx < 7u; ++TableIdx)
	{
		if (Range < RangeUpperBounds[TableIdx])
		{
			return TableIdx;
		}
	}
	return 7u;
}

// Encodes a 4x4 color block as ETC1S: differential mode with a zero color delta, so a single
// base color and a single distance table are shared by the whole block.
//
// Block  - 16 pixel colors, indexed as 4 * Y + X
// bSRGB  - convert the averaged base color with LinearToSrgb before packing
// bCoCg  - treat .xy as CoCg: rescale them around the midpoint and store (scale - 1) at bit 19
//
// Returns (mode/color word, selector word), both laid out for big-endian consumption.
uint2 CompressBlock_ETC1S(in float3 Block[16], bool bSRGB, bool bCoCg)
{
	// Per-channel extents and average of the block
	float3 ColorMin = 1;
	float3 ColorMax = 0;
	float3 ColorSum = 0;
	for (int PixelIdx = 0; PixelIdx < 16; ++PixelIdx)
	{
		const float3 Pixel = Block[PixelIdx];
		ColorMin = min(ColorMin, Pixel);
		ColorMax = max(ColorMax, Pixel);
		ColorSum = ColorSum + Pixel;
	}
	const float3 ColorAvg = ColorSum / 16.f;

	// The color that actually gets packed into the block
	float3 EncodedBase = ColorAvg;
	if (bSRGB)
	{
		EncodedBase = LinearToSrgb(ColorAvg);
	}

	uint CoCgScaleBits = 0u;
	if (bCoCg)
	{
		const float CoCgScale = EtcGetYCoCgScale(ColorMin.xy, ColorMax.xy);
		EtcApplyYCoCgScale(EncodedBase.xy, CoCgScale);
		CoCgScaleBits = (((uint)CoCgScale - 1) << 19u);
	}
	const uint ColorBits = EtcPackColorBits(EncodedBase) | CoCgScaleBits;

	// The table is guessed from half of the luminance spread; selectors are computed against the unconverted average
	const float HalfLumSpread = (EtcLuminance(ColorMax) - EtcLuminance(ColorMin)) * 0.5f;
	const uint TableIdx = EtcSelectRGBTableIndex(HalfLumSpread);
	const uint Selectors = EtcGetBlockWeights(Block, ColorAvg, TableIdx);

	const uint DiffModeBit = (BLOCK_MODE_DIFFERENTIAL << 25u);
	const uint FlipBit = (0u << 24u);

	// Both words need to be big-endian. The mode word is assembled directly in that layout;
	// the selector word is simpler to build little-endian and byte-swap afterwards.
	uint2 Encoded;
	Encoded.x = (TableIdx << 29u) | (TableIdx << 26u) | DiffModeBit | FlipBit | ColorBits;
	Encoded.y = EtcSwapEndian32(Selectors);
	return Encoded;
}

// ETC1S encoding of a color block with the base color converted via LinearToSrgb; no CoCg handling.
uint2 CompressBlock_ETC2_SRGB(in float3 Block[16])
{
	return CompressBlock_ETC1S(Block, /*bSRGB*/ true, /*bCoCg*/ false);
}

// ETC1S encoding of a color block with the base color packed as-is; no sRGB conversion, no CoCg handling.
uint2 CompressBlock_ETC2_RGB(in float3 Block[16])
{
	return CompressBlock_ETC1S(Block, /*bSRGB*/ false, /*bCoCg*/ false);
}

// Keeps a running best match: when EncodedAlpha is strictly closer to SourceAlpha than the
// best distance so far, MinDif and SelectedIndex are replaced by this candidate.
void SelectAlphaMod(in float SourceAlpha, in float EncodedAlpha, int IndexInTable, inout int SelectedIndex, inout float MinDif)
{
	const float Distance = abs(EncodedAlpha - SourceAlpha);
	const bool bCloser = Distance < MinDif;

	SelectedIndex = bCloser ? IndexInTable : SelectedIndex;
	MinDif = bCloser ? Distance : MinDif;
}

// Compresses 16 single-channel values (expected in [0, 1], indexed as 4 * Y + X) into one
// 64-bit alpha block: 8-bit base value, 4-bit multiplier, 4-bit table index and 16 3-bit selectors.
//
// The table and multiplier are guessed from the value range instead of searched exhaustively.
// Each pixel then picks the selector whose reconstructed value is closest to the source value.
//
// Returns the block as two words that are already byte-swapped for big-endian consumption:
// .x holds base/multiplier/table and the top 16 selector bits, .y the low 32 selector bits.
uint2 CompressBlock_ETC2_Alpha(in float BlockA[16])
{
	float MinAlpha = 1.f;
	float MaxAlpha = 0.f;
	for (int k = 0; k < 16; ++k)
	{
		float A = BlockA[k];
		MinAlpha = min(A, MinAlpha);
		MaxAlpha = max(A, MaxAlpha);
	}

	// Work in 8-bit units from here on
	MinAlpha = round(MinAlpha*255.f);
	MaxAlpha = round(MaxAlpha*255.f);
	
	float AlphaRange = MaxAlpha - MinAlpha;
	const float MidRange = 21.f;// an average range in ALPHA_DISTANCE_TABLES
	float Multiplier = clamp(round(AlphaRange/MidRange), 1.f, 15.f);
	
	int TableIdx = 0;
	float4 TableValueNeg = float4(0,0,0,0);
	float4 TableValuePos = float4(0,0,0,0);
	
	// iterating through all tables to find a best fit is quite slow
	// instead guess the best table based on alpha range
	// (the last candidate is kept when none of the tables covers the range)
	const int TablesToTest[5] = {15,11,6,2,0};
	for (int i = 0; i < 5; ++i)
	{
		TableIdx = TablesToTest[i];
		TableValuePos = EtcParameters.ALPHA_DISTANCE_TABLES[TableIdx];
				
		float TableRange = (TableValuePos.w*2 + 1)*Multiplier;
		float Dif = TableRange - AlphaRange;
		if (Dif > 0.f)
		{
			break;
		}
	}
	// The negative modifiers are derived from the positive ones: -(positive + 1)
	TableValueNeg = -(TableValuePos + float4(1,1,1,1));
	
	TableValuePos*=Multiplier;
	TableValueNeg*=Multiplier;
	
	// make sure an exact value of MinAlpha can always be decoded from a BaseValue
	float BaseValue = clamp(round(-TableValueNeg.w + MinAlpha), 0.f, 255.f);
	
	// Candidate values as the decoder reconstructs them: base + modifier * multiplier, clamped to [0, 255].
	// Without the clamp a candidate above 255 looks further away from a bright pixel than it really is.
	TableValueNeg = clamp(TableValueNeg + BaseValue.xxxx, 0.f, 255.f);
	TableValuePos = clamp(TableValuePos + BaseValue.xxxx, 0.f, 255.f);

	// Only one half of the candidates is tested per pixel. Splitting at the midpoint between the
	// innermost negative and positive candidates guarantees the tested half contains the closest one.
	// Splitting at TableValuePos.x instead would push values just below it to a more distant negative candidate.
	// Assumes each table row is ascending (x smallest, w largest), as the range estimate above already does.
	const float NegPosMidpoint = 0.5f * (TableValueNeg.x + TableValuePos.x);

	uint2 BlockWeights = 0;
	
	for (int PixelIndex = 0; PixelIndex < 16; ++PixelIndex)
	{
		float Alpha = BlockA[PixelIndex]*255.f;
		int SelectedIndex = 0;
		float MinDif = 100000.f;
		
		if (Alpha < NegPosMidpoint)
		{
			SelectAlphaMod(Alpha, TableValueNeg.x, 0, SelectedIndex, MinDif);
			SelectAlphaMod(Alpha, TableValueNeg.y, 1, SelectedIndex, MinDif);
			SelectAlphaMod(Alpha, TableValueNeg.z, 2, SelectedIndex, MinDif);
			SelectAlphaMod(Alpha, TableValueNeg.w, 3, SelectedIndex, MinDif);
		}
		else
		{
			SelectAlphaMod(Alpha, TableValuePos.x, 4, SelectedIndex, MinDif);
			SelectAlphaMod(Alpha, TableValuePos.y, 5, SelectedIndex, MinDif);
			SelectAlphaMod(Alpha, TableValuePos.z, 6, SelectedIndex, MinDif);
			SelectAlphaMod(Alpha, TableValuePos.w, 7, SelectedIndex, MinDif);
		}

		// ETC uses column-major indexing for the pixels in a block...
		int TransposedIndex = (PixelIndex >> 2) | ((PixelIndex & 3) << 2);
		// The 48 selector bits are split across two words: .x holds bits 0..31, .y holds bits 32..47.
		// The selector starting at bit 30 straddles both words, so its top bit is moved into .y separately.
		int StartBit = (15 - TransposedIndex) * 3;
		BlockWeights.x |= (StartBit < 32) ? SelectedIndex << StartBit : 0;
		int ShiftRight = (StartBit == 30) ? 2 : 0;
		int ShiftLeft = (StartBit >= 32) ? StartBit - 32 : 0;
		BlockWeights.y |= (StartBit >= 30) ? (SelectedIndex >> ShiftRight) << ShiftLeft : 0;
	}

	int MultiplierInt = Multiplier;
	int BaseValueInt = BaseValue;
	
	// Header goes into the top 16 bits of the first word, then both words are byte-swapped
	uint2 AlphaBits;
	AlphaBits.x = EtcSwapEndian32(BlockWeights.y | (TableIdx << 16) | (MultiplierInt << 20) | (BaseValueInt << 24));
	AlphaBits.y = EtcSwapEndian32(BlockWeights.x);

	return AlphaBits;
}

// Encodes color and alpha separately and returns them as one 128-bit block:
// alpha block in .xy, color block (no sRGB conversion) in .zw.
uint4 CompressBlock_ETC2_RGBA(in float3 BlockRGB[16], in float BlockA[16])
{
	uint4 Encoded;
	Encoded.xy = CompressBlock_ETC2_Alpha(BlockA);
	Encoded.zw = CompressBlock_ETC2_RGB(BlockRGB);
	return Encoded;
}

// Encodes color and alpha separately and returns them as one 128-bit block:
// alpha block in .xy, sRGB-converted color block in .zw.
uint4 CompressBlock_ETC2_SRGBA(in float3 BlockRGB[16], in float BlockA[16])
{
	uint4 Encoded;
	Encoded.xy = CompressBlock_ETC2_Alpha(BlockA);
	Encoded.zw = CompressBlock_ETC2_SRGB(BlockRGB);
	return Encoded;
}

// Converts the block with RGB2YCoCg and stores it as a 128-bit block:
// Y goes through the alpha encoder (.xy of the result), Co/Cg go through the ETC1S color encoder
// with CoCg scaling enabled and an empty third channel (.zw of the result).
uint4 CompressBlock_ETC2_YCoCg(in float3 BlockRGB[16])
{
	float3 BlockCoCg[16];
	float BlockLuma[16];
	for (int PixelIdx = 0; PixelIdx < 16; ++PixelIdx)
	{
		const float3 YCoCg = RGB2YCoCg(BlockRGB[PixelIdx]);
		BlockLuma[PixelIdx] = YCoCg.x;
		BlockCoCg[PixelIdx] = float3(YCoCg.yz, 0.0);
	}

	const uint2 EncodedCoCg = CompressBlock_ETC1S(BlockCoCg, /*bSRGB*/ false, /*bCoCg*/ true);
	const uint2 EncodedLuma = CompressBlock_ETC2_Alpha(BlockLuma);
	return uint4(EncodedLuma, EncodedCoCg);
}

// Encodes two independent single-channel blocks with the alpha encoder:
// BlockU ends up in .xy, BlockV in .zw.
uint4 CompressBlock_ETC2_RG(in float BlockU[16], in float BlockV[16])
{
	uint4 Encoded;
	Encoded.xy = CompressBlock_ETC2_Alpha(BlockU);
	Encoded.zw = CompressBlock_ETC2_Alpha(BlockV);
	return Encoded;
}