// Copyright Epic Games, Inc. All Rights Reserved.

/*=============================================================================
	BCCompressionCommon.ush:
	Helpers for compute shader BC texture compression
	
	todo[vt]: First implementation is keeping it simple. Lots of possible optimizations to do...
	* Pack float3 colors to uint earlier to reduce vector register pressure?
	* Calling code can build blocks with one color sample per thread (with layout optimized for coalesce)?
	* Code here could use wave ops for the block processing?
=============================================================================*/

#pragma once

#include "GammaCorrectionCommon.ush"

// Enables the least squares endpoint refinement pass in the BC7 compressors. A bit slower, but higher quality compression
#define LEAST_SQUARES_ENDPOINT_OPTIMIZATION 1

// Round a float to the nearest whole number and convert the result to uint
uint RoundToUInt(float X)
{
	const float Rounded = round(X);
	return (uint)Rounded;
}

// Pack a float3 color into a 565 uint using 'round' arithmetic
// The color is saturated first, red goes in the top 5 bits, green in the middle 6 bits and blue in the low 5 bits
uint Float3ToUint565( in float3 Color )
{
	const float3 MaxChannelValue = float3(31.f, 63.f, 31.f);
	const float3 Quantized = round(saturate(Color) * MaxChannelValue);

	const uint Red   = (uint)Quantized.r;
	const uint Green = (uint)Quantized.g;
	const uint Blue  = (uint)Quantized.b;

	return (Red << 11) | (Green << 5) | Blue;
}

// Pack a float3 color into a 565 uint using 'ceil' arithmetic
// Color is inout: on return it holds the quantized value divided back by the 565 channel maximums
uint Float3ToUint565_Ceil( inout float3 Color )
{
	const float3 MaxChannelValue = float3(31.f, 63.f, 31.f);
	const float3 Quantized = ceil(saturate(Color) * MaxChannelValue);

	// Hand the snapped color back to the caller
	Color = Quantized / MaxChannelValue;

	return ((uint)Quantized.r << 11) | ((uint)Quantized.g << 5) | (uint)Quantized.b;
}

// Pack a float3 color into a 565 uint using 'floor' arithmetic
// Color is inout: on return it holds the quantized value divided back by the 565 channel maximums
uint Float3ToUint565_Floor( inout float3 Color )
{
	const float3 MaxChannelValue = float3(31.f, 63.f, 31.f);
	const float3 Quantized = floor(saturate(Color) * MaxChannelValue);

	// Hand the snapped color back to the caller
	Color = Quantized / MaxChannelValue;

	return ((uint)Quantized.r << 11) | ((uint)Quantized.g << 5) | (uint)Quantized.b;
}

// Find the smallest and largest value of a 16 texel single channel block
void GetMinMax( in float Block[16], out float OutMin, out float OutMax )
{
	float Lowest  = Block[0];
	float Highest = Block[0];

	for (uint TexelIndex = 1; TexelIndex < 16; ++TexelIndex)
	{
		const float Value = Block[TexelIndex];
		Lowest  = min(Lowest, Value);
		Highest = max(Highest, Value);
	}

	OutMin = Lowest;
	OutMax = Highest;
}

// Find the smallest and largest values of two 16 texel single channel blocks in one pass
void GetMinMax( in float Block0[16], in float Block1[16], out float OutMin0, out float OutMax0, out float OutMin1, out float OutMax1 )
{
	float Lowest0  = Block0[0];
	float Highest0 = Block0[0];
	float Lowest1  = Block1[0];
	float Highest1 = Block1[0];

	for (uint TexelIndex = 1; TexelIndex < 16; ++TexelIndex)
	{
		const float Value0 = Block0[TexelIndex];
		const float Value1 = Block1[TexelIndex];

		Lowest0  = min(Lowest0, Value0);
		Highest0 = max(Highest0, Value0);
		Lowest1  = min(Lowest1, Value1);
		Highest1 = max(Highest1, Value1);
	}

	OutMin0 = Lowest0;
	OutMax0 = Highest0;
	OutMin1 = Lowest1;
	OutMax1 = Highest1;
}

// Find the per channel smallest and largest values of a 16 texel three channel block
void GetMinMax( in float3 Block[16], out float3 OutMin, out float3 OutMax )
{
	float3 Lowest  = Block[0];
	float3 Highest = Block[0];

	for (uint TexelIndex = 1; TexelIndex < 16; ++TexelIndex)
	{
		const float3 Value = Block[TexelIndex];
		Lowest  = min(Lowest, Value);
		Highest = max(Highest, Value);
	}

	OutMin = Lowest;
	OutMax = Highest;
}

// Build the packed 2 bit per texel index word for a color block
// Every texel is projected onto the MaxColor->MinColor axis and snapped to one of 4 evenly spaced steps.
// Step 0 (at MaxColor) becomes BC index 0, step 3 (at MinColor) becomes BC index 1 and
// the two steps in between become BC indices 2 and 3. Texel 0 lands in the lowest 2 bits.
uint GetPackedColorBlockIndices( in float3 Block[16], in float3 MinColor, in float3 MaxColor )
{
	// Set up the projection so that MaxColor maps to 0 and MinColor maps to 3
	const float3 Axis = MinColor - MaxColor;
	const float StepScale = 3.f / dot(Axis, Axis);
	const float3 ScaledAxis = Axis * StepScale;
	const float Offset = (dot(MaxColor, MaxColor) - dot(MaxColor, MinColor)) * StepScale;

	uint Packed = 0;
	for (uint TexelIndex = 0; TexelIndex < 16; ++TexelIndex)
	{
		// Step along the axis for this texel
		const uint Step = RoundToUInt(dot(Block[TexelIndex], ScaledAxis) + Offset);
		// Reorder the step into the BC index order
		const uint BCIndex = (Step == 0u) ? 0u : ((Step == 3u) ? 1u : (Step + 1u));
		// Each texel owns 2 bits
		Packed |= BCIndex << (TexelIndex * 2);
	}

	return Packed;
}

// Build the packed 3 bit per texel indices for an alpha block
// The indices are returned in their final position inside the uint2 (starting at bit 16 of x),
// so the caller only has to OR the min and max alpha into the low 16 bits of x.
// Each value is snapped to one of 8 evenly spaced steps from MaxAlpha to MinAlpha.
// Step 0 (at MaxAlpha) becomes BC index 0, step 7 (at MinAlpha) becomes BC index 1 and
// steps 1..6 become BC indices 2..7.
uint2 GetPackedAlphaBlockIndices( in float Block[16], in float MinAlpha, in float MaxAlpha )
{
	// Set up the mapping so that MaxAlpha maps to 0 and MinAlpha maps to 7
	const float Extent = MinAlpha - MaxAlpha;
	const float Scale = 7.f / Extent;
	const float Bias = -Scale * MaxAlpha;

	uint2 Packed = 0;
	for (uint TexelIndex = 0; TexelIndex < 16; ++TexelIndex)
	{
		const uint Step = RoundToUInt(Block[TexelIndex] * Scale + Bias);
		const uint BCIndex = (Step == 0u) ? 0u : ((Step == 7u) ? 1u : (Step + 1u));

		if (TexelIndex < 5)
		{
			// Texels 0..4 sit in the top 16 bits of x
			Packed.x |= BCIndex << (TexelIndex * 3 + 16);
		}
		else if (TexelIndex == 5)
		{
			// Texel 5 straddles x and y: its lowest bit is bit 31 of x, the rest starts y
			Packed.x |= BCIndex << 31;
			Packed.y |= BCIndex >> 1;
		}
		else
		{
			// Texels 6..15 fill the remainder of y
			Packed.y |= BCIndex << (TexelIndex * 3 - 16);
		}
	}

	return Packed;
}

// Compress 16 texels into a BC1 block
// x holds the two 565 endpoints (min in the high 16 bits, max in the low 16 bits), y holds the indices.
uint2 CompressBC1Block( in float3 Block[16] )
{
	float3 EndpointLo, EndpointHi;
	GetMinMax( Block, EndpointLo, EndpointHi );

	// Quantize outwards. Both calls also snap the float endpoints to the quantized values
	const uint PackedLo = Float3ToUint565_Floor( EndpointLo );
	const uint PackedHi = Float3ToUint565_Ceil( EndpointHi );

	uint2 Result;
	Result.x = (PackedLo << 16) | PackedHi;
	Result.y = 0;

	// With identical endpoints all indices stay 0
	if (PackedHi > PackedLo)
	{
		Result.y = GetPackedColorBlockIndices( Block, EndpointLo, EndpointHi );
	}

	return Result;
}

// Compress 16 texels into a BC1 block that will be sampled as sRGB
// Input colors are expected to be linear. The endpoints are converted with LinearToSrgb before being
// packed to 565, while the indices are computed from the linear block and linear endpoints.
uint2 CompressBC1Block_SRGB( in float3 Block[16] )
{
	float3 EndpointLo, EndpointHi;
	GetMinMax( Block, EndpointLo, EndpointHi );

	const uint PackedLo = Float3ToUint565( LinearToSrgb(EndpointLo) );
	const uint PackedHi = Float3ToUint565( LinearToSrgb(EndpointHi) );

	uint2 Result;
	Result.x = (PackedLo << 16) | PackedHi;
	Result.y = 0;

	// With identical endpoints all indices stay 0
	if (PackedHi > PackedLo)
	{
		Result.y = GetPackedColorBlockIndices( Block, EndpointLo, EndpointHi );
	}

	return Result;
}

// Compress 16 RGB texels and 16 alpha values into a BC3 block
// xy hold the alpha part (endpoints in the low 16 bits of x, then the indices), zw hold the color part.
uint4 CompressBC3Block( in float3 BlockRGB[16], in float BlockA[16] )
{
	// Color part
	float3 ColorLo, ColorHi;
	GetMinMax( BlockRGB, ColorLo, ColorHi );

	const uint ColorLo565 = Float3ToUint565_Floor( ColorLo );
	const uint ColorHi565 = Float3ToUint565_Ceil( ColorHi );

	uint ColorBits = 0;
	if (ColorHi565 > ColorLo565)
	{
		ColorBits = GetPackedColorBlockIndices( BlockRGB, ColorLo, ColorHi );
	}

	// Alpha part
	float AlphaLo, AlphaHi;
	GetMinMax( BlockA, AlphaLo, AlphaHi );

	const uint AlphaLo8 = RoundToUInt(AlphaLo * 255.f);
	const uint AlphaHi8 = RoundToUInt(AlphaHi * 255.f);

	uint2 AlphaBits = 0;
	if (AlphaHi8 > AlphaLo8)
	{
		AlphaBits = GetPackedAlphaBlockIndices(BlockA, AlphaLo, AlphaHi);
	}

	uint4 Result;
	Result.x = (AlphaLo8 << 8) | AlphaHi8 | AlphaBits.x;
	Result.y = AlphaBits.y;
	Result.z = (ColorLo565 << 16) | ColorHi565;
	Result.w = ColorBits;
	return Result;
}

// Compress 16 RGB texels and 16 alpha values into a BC3 block that will be sampled as sRGB
// Input colors are expected to be linear. The color endpoints are converted with LinearToSrgb before
// being packed to 565, while the color indices are computed from the linear block and linear endpoints.
// xy hold the alpha part (endpoints in the low 16 bits of x, then the indices), zw hold the color part.
uint4 CompressBC3Block_SRGB( in float3 BlockRGB[16], in float BlockA[16] )
{
	// Color part
	float3 ColorLo, ColorHi;
	GetMinMax( BlockRGB, ColorLo, ColorHi );

	const uint ColorLo565 = Float3ToUint565( LinearToSrgb( ColorLo ) );
	const uint ColorHi565 = Float3ToUint565( LinearToSrgb( ColorHi ) );

	uint ColorBits = 0;
	if (ColorHi565 > ColorLo565)
	{
		ColorBits = GetPackedColorBlockIndices(BlockRGB, ColorLo, ColorHi);
	}

	// Alpha part
	float AlphaLo, AlphaHi;
	GetMinMax( BlockA, AlphaLo, AlphaHi );

	const uint AlphaLo8 = RoundToUInt(AlphaLo * 255.f);
	const uint AlphaHi8 = RoundToUInt(AlphaHi * 255.f);

	uint2 AlphaBits = 0;
	if (AlphaHi8 > AlphaLo8)
	{
		AlphaBits = GetPackedAlphaBlockIndices(BlockA, AlphaLo, AlphaHi);
	}

	uint4 Result;
	Result.x = (AlphaLo8 << 8) | AlphaHi8 | AlphaBits.x;
	Result.y = AlphaBits.y;
	Result.z = (ColorLo565 << 16) | ColorHi565;
	Result.w = ColorBits;
	return Result;
}

// Compress 16 single channel values into a BC4 block
// The low 16 bits of x hold the 8 bit endpoints (min in bits 8..15, max in bits 0..7), the rest holds the indices.
uint2 CompressBC4Block( in float Block[16] )
{
	float ValueLo, ValueHi;
	GetMinMax( Block, ValueLo, ValueHi );

	const uint ValueLo8 = RoundToUInt(ValueLo * 255.f);
	const uint ValueHi8 = RoundToUInt(ValueHi * 255.f);

	// With identical endpoints all indices stay 0
	uint2 IndexBits = 0;
	if (ValueHi8 > ValueLo8)
	{
		IndexBits = GetPackedAlphaBlockIndices( Block, ValueLo, ValueHi );
	}

	return uint2((ValueLo8 << 8) | ValueHi8 | IndexBits.x, IndexBits.y);
}

// Compress two 16 value single channel blocks into a BC5 block
// xy hold the U channel and zw hold the V channel, each laid out like the output of CompressBC4Block.
uint4 CompressBC5Block( in float BlockU[16], in float BlockV[16] )
{
	float LoU, HiU, LoV, HiV;
	GetMinMax( BlockU, BlockV, LoU, HiU, LoV, HiV );

	// U channel
	const uint LoU8 = RoundToUInt(LoU * 255.f);
	const uint HiU8 = RoundToUInt(HiU * 255.f);

	uint2 BitsU = 0;
	if (HiU8 > LoU8)
	{
		BitsU = GetPackedAlphaBlockIndices( BlockU, LoU, HiU );
	}

	// V channel
	const uint LoV8 = RoundToUInt(LoV * 255.f);
	const uint HiV8 = RoundToUInt(HiV * 255.f);

	uint2 BitsV = 0;
	if (HiV8 > LoV8)
	{
		BitsV = GetPackedAlphaBlockIndices( BlockV, LoV, HiV );
	}

	uint4 Result;
	Result.x = (LoU8 << 8) | HiU8 | BitsU.x;
	Result.y = BitsU.y;
	Result.z = (LoV8 << 8) | HiV8 | BitsV.x;
	Result.w = BitsV.y;
	return Result;
}

// Convert an RGB color to YCoCg
// Co and Cg are biased by 128/255 so that a zero chroma value sits at that offset
float3 RGB2YCoCg(float3 RGB)
{
	const float ChromaBias = 128.f / 255.f;

	float3 YCoCg;
	YCoCg.x = (RGB.r + 2.f * RGB.g + RGB.b) * 0.25f;
	YCoCg.y = ((2.f * RGB.r - 2.f * RGB.b) * 0.25f + ChromaBias);
	YCoCg.z = ((-RGB.r + 2.f * RGB.g - RGB.b) * 0.25f + ChromaBias);
	return YCoCg;
}

// Pick a single chroma scale factor for a YCoCg color block
// Returns (scale, 1 / scale): 4 when every CoCg end point component is within 32/255 of the 128/255 center,
// 2 when within 64/255, otherwise 1.
// This increases precision at the expense of potential blending artifacts across blocks
float2 GetYCoCgScale(float2 MinCoCg, float2 MaxCoCg)
{
	const float ChromaBias = 128.f / 255.f;

	// Largest distance of any end point component from the center
	const float2 MinDistance = abs(MinCoCg - ChromaBias);
	const float2 MaxDistance = abs(MaxCoCg - ChromaBias);
	const float Furthest = max(max(MinDistance.x, MinDistance.y), max(MaxDistance.x, MaxDistance.y));

	if (Furthest < 32.f / 255.f)
	{
		return float2(4.f, 0.25f);
	}
	if (Furthest < 64.f / 255.f)
	{
		return float2(2.f, 0.5f);
	}
	return float2(1.f, 1.f);
}

// Scale both CoCg end points around the 128/255 chroma center
void ApplyYCoCgScale(inout float2 MinCoCg, inout float2 MaxCoCg, float Scale)
{
	const float ChromaBias = 128.f / 255.f;

	const float2 ScaledMin = (MinCoCg - ChromaBias) * Scale + ChromaBias;
	const float2 ScaledMax = (MaxCoCg - ChromaBias) * Scale + ChromaBias;

	MinCoCg = ScaledMin;
	MaxCoCg = ScaledMax;
}

// Move the CoCg end points towards each other by (Max - Min - 8/255) / 16 per component
// The results are saturated. Note that the offset is negative (moving the end points apart) when Max - Min is below 8/255.
void InsetCoCgEndPoints(inout float2 MinCoCg, inout float2 MaxCoGg)
{
	const float2 Extent = MaxCoGg - MinCoCg;
	const float2 Offset = (Extent - (8.f / 255.f)) / 16.f;

	MinCoCg = saturate(MinCoCg + Offset);
	MaxCoGg = saturate(MaxCoGg - Offset);
}

// Move the luminance end points towards each other by (MaxY - MinY - 16/255) / 32
// The results are saturated. Note that the offset is negative (moving the end points apart) when MaxY - MinY is below 16/255.
void InsetLumaEndPoints(inout float MinY, inout float MaxY)
{
	const float Extent = MaxY - MinY;
	const float Offset = (Extent - (16.f / 255.f)) / 32.f;

	MinY = saturate(MinY + Offset);
	MaxY = saturate(MaxY - Offset);
}

// Choose which diagonal of the CoCg bounding rectangle to use as end points, based on the block contents
// The products of the Co and Cg offsets from the rectangle center are summed over the block.
// A negative sum means Co and Cg mostly move in opposite directions, so the Cg components of
// the two end points are exchanged to select the other diagonal.
void AdjustMinMaxDiagonalYCoCg(const float3 Block[16], inout float2 MinCoCg, inout float2 MaxCoGg)
{
	const float2 Center = (MaxCoGg + MinCoCg) * 0.5;

	float Covariance = 0.f;
	for (uint TexelIndex = 0; TexelIndex < 16; TexelIndex++)
	{
		const float2 Offset = Block[TexelIndex].yz - Center;
		Covariance += Offset.x * Offset.y;
	}

	if (Covariance < 0.f)
	{
		const float PreviousMaxCg = MaxCoGg.y;
		MaxCoGg.y = MinCoCg.y;
		MinCoCg.y = PreviousMaxCg;
	}
}

// Pack a CoCg pair into the red (5 bit) and green (6 bit) fields of a 565 uint using 'round' arithmetic
// The blue field is left at 0. CoCg is inout and is modified to match the converted value
uint CoCgToUint565(inout float2 CoCg)
{
	const float2 MaxChannelValue = float2(31.f, 63.f);
	const float2 Quantized = round(saturate(CoCg) * MaxChannelValue);

	// Hand the snapped value back to the caller
	CoCg = Quantized / MaxChannelValue;

	const uint CoBits = (uint)Quantized.r << 11;
	const uint CgBits = (uint)Quantized.g << 5;
	return CoBits | CgBits;
}

// Calculate the final packed indices for the CoCg part of a color block
// Only the yz (CoCg) components of Block are used. Every texel is projected onto the MaxCoCg->MinCoCg axis
// and snapped to one of 4 evenly spaced steps. Step 0 (at MaxCoCg) becomes BC index 0, step 3 (at MinCoCg)
// becomes BC index 1 and the two steps in between become BC indices 2 and 3. Texel 0 lands in the lowest 2 bits.
//
// The end points passed in by CompressBC3BlockYCoCg have been inset and quantized, so:
// * they can be identical (e.g. a block with constant chroma), in which case all indices are 0
// * texels can project outside of the end point range, so the step is clamped to [0,3]
// Without this the result depends on how the platform converts NaN/negative floats to uint, and
// a step above 3 would spill into the 2 bit field of the neighbouring texel.
uint GetPackedCoCgBlockIndices(in float3 Block[16], in float2 MinCoCg, in float2 MaxCoCg)
{
	uint PackedIndices = 0;

	// Project onto max->min color vector and segment into range [0,3]
	float2 Range = MinCoCg - MaxCoCg;
	float RangeSq = dot(Range, Range);

	// Identical end points: use a zero scale so that every texel gets step 0 instead of dividing by zero
	float Scale = 0.f;
	if (RangeSq > 0.f)
	{
		Scale = 3.f / RangeSq;
	}

	float2 ScaledRange = Range * Scale;
	float Bias = (dot(MaxCoCg, MaxCoCg) - dot(MaxCoCg, MinCoCg)) * Scale;

	for (int i = 15; i >= 0; --i)
	{
		// Compute the distance index for this element, keeping it inside the valid range before the uint conversion
		uint Index = (uint)clamp(round(dot(Block[i].yz, ScaledRange) + Bias), 0.f, 3.f);
		// Convert distance index into the BC index
		Index += (Index > 0) - (3 * (Index == 3));
		// OR into the final PackedIndices
		PackedIndices = (PackedIndices << 2) | Index;
	}

	return PackedIndices;
}

// Build the packed 3 bit per texel indices for the Luma part of a color block
// Only the x (luma) component of Block is used. The indices are returned in their final position
// inside the uint2 (starting at bit 16 of x), so the caller only has to OR the end points into the low 16 bits of x.
// Each value is snapped to one of 8 evenly spaced steps from MaxAlpha to MinAlpha.
// Step 0 (at MaxAlpha) becomes BC index 0, step 7 (at MinAlpha) becomes BC index 1 and
// steps 1..6 become BC indices 2..7.
uint2 GetPackedLumaBlockIndices(in float3 Block[16], in float MinAlpha, in float MaxAlpha)
{
	// Set up the mapping so that MaxAlpha maps to 0 and MinAlpha maps to 7
	const float Extent = MinAlpha - MaxAlpha;
	const float Scale = 7.f / Extent;
	const float Bias = -Scale * MaxAlpha;

	uint2 Packed = 0;
	for (uint TexelIndex = 0; TexelIndex < 16; ++TexelIndex)
	{
		const uint Step = RoundToUInt(Block[TexelIndex].x * Scale + Bias);
		const uint BCIndex = (Step == 0u) ? 0u : ((Step == 7u) ? 1u : (Step + 1u));

		if (TexelIndex < 5)
		{
			// Texels 0..4 sit in the top 16 bits of x
			Packed.x |= BCIndex << (TexelIndex * 3 + 16);
		}
		else if (TexelIndex == 5)
		{
			// Texel 5 straddles x and y: its lowest bit is bit 31 of x, the rest starts y
			Packed.x |= BCIndex << 31;
			Packed.y |= BCIndex >> 1;
		}
		else
		{
			// Texels 6..15 fill the remainder of y
			Packed.y |= BCIndex << (TexelIndex * 3 - 16);
		}
	}

	return Packed;
}

// Convert a linear RGB block to YCoCg and compress it into a BC3 block
// Luma is stored in xy using the same layout CompressBC3Block uses for alpha, CoCg is stored in zw as the color part.
// The chroma scale picked for the block is stored as (scale - 1) in the low bits of both 565 end points.
uint4 CompressBC3BlockYCoCg(in float3 Block[16])
{
	// Convert the function's copy of the block to YCoCg
	for (uint TexelIndex = 0; TexelIndex < 16; ++TexelIndex)
	{
		Block[TexelIndex] = RGB2YCoCg(Block[TexelIndex]);
	}

	float3 MinYCoCg, MaxYCoCg;
	GetMinMax(Block, MinYCoCg, MaxYCoCg);

	// Luma and chroma are processed separately from here on
	float MinLuma = MinYCoCg.x;
	float MaxLuma = MaxYCoCg.x;
	float2 MinCoCg = MinYCoCg.yz;
	float2 MaxCoCg = MaxYCoCg.yz;

	// Chroma end points: pick diagonal, apply the block scale, inset, then quantize
	AdjustMinMaxDiagonalYCoCg(Block, MinCoCg, MaxCoCg);

	const float2 Scale = GetYCoCgScale(MinCoCg, MaxCoCg);
	ApplyYCoCgScale(MinCoCg, MaxCoCg, Scale.x);
	InsetCoCgEndPoints(MinCoCg, MaxCoCg);

	const uint ScaleBits = (uint)Scale.x - 1;
	const uint MinColor565 = CoCgToUint565(MinCoCg) | ScaleBits;
	const uint MaxColor565 = CoCgToUint565(MaxCoCg) | ScaleBits;

	// Undo the block scale on the quantized end points so they can be compared against the unscaled texels
	ApplyYCoCgScale(MinCoCg, MaxCoCg, Scale.y);
	const uint CoCgIndices = GetPackedCoCgBlockIndices(Block, MinCoCg, MaxCoCg);

	// Luma end points and indices
	InsetLumaEndPoints(MinLuma, MaxLuma);

	const uint MinLumaUint = RoundToUInt(MinLuma * 255.f);
	const uint MaxLumaUint = RoundToUInt(MaxLuma * 255.f);
	const uint2 LumaIndices = GetPackedLumaBlockIndices(Block, MinLuma, MaxLuma);

	uint4 Result;
	Result.x = (MinLumaUint << 8) | MaxLumaUint | LumaIndices.x;
	Result.y = LumaIndices.y;
	Result.z = (MinColor565 << 16) | MaxColor565;
	Result.w = CoCgIndices;
	return Result;
}

// Rescale the half float bit pattern of each component so that a pattern of 0x7bff + 1 maps to 1024
// The result is returned as float, callers convert it to uint when packing the 10 bit end points
float3 Quantize10(float3 X)
{
	const float HalfPatternRange = 0x7bff + 1.0f;
	const float3 Scaled = f32tof16(X) * 1024.0f;
	return Scaled / HalfPatternRange;
}

// Map a color to a 4 bit BC6H interpolation index between two end points
// The color is projected onto BlockVector, the projection is converted to its half float bit pattern and
// normalized between EndPoint0Pos (index 0) and EndPoint1Pos (index 15), which are expected in the same units.
// When both end point positions are equal (e.g. a single color block) index 0 is returned. The division
// would otherwise be 0 / 0 and the result would depend on how the platform handles NaN in saturate().
uint ComputeIndexBC6HIndex(float3 Color, float3 BlockVector, float EndPoint0Pos, float EndPoint1Pos)
{
	float Pos = (float)f32tof16(dot(Color, BlockVector));
	float Span = EndPoint1Pos - EndPoint0Pos;

	float NormalizedPos = 0.0f;
	if (Span != 0.0f)
	{
		NormalizedPos = saturate((Pos - EndPoint0Pos) / Span);
	}

	// Scale to the index range; the float to uint conversion truncates
	return (uint)clamp(NormalizedPos * 14.93333f + 0.03333f + 0.5f, 0.0f, 15.0f);
}

// Compress 16 RGB texels into a BC6H block. Evaluates only mode 11 for performance
// The end points are the per channel min and max of the block, stored as 10 bit values after the 5 mode bits.
// The first index is stored with 3 bits, the remaining 15 indices with 4 bits each.
uint4 CompressBC6HBlock(in float3 BlockRGB[16])
{
	// Per channel bounds of the block are the initial end points
	float3 BoundsMin = BlockRGB[0];
	float3 BoundsMax = BlockRGB[0];
	{
		for (uint BoundsIndex = 1; BoundsIndex < 16; ++BoundsIndex)
		{
			BoundsMin = min(BoundsMin, BlockRGB[BoundsIndex]);
			BoundsMax = max(BoundsMax, BlockRGB[BoundsIndex]);
		}
	}

	// Projection axis, scaled so that its components sum to 1
	const float3 Extent = BoundsMax - BoundsMin;
	const float3 BlockVector = Extent / (Extent.x + Extent.y + Extent.z);

	float3 Endpoint0 = Quantize10(BoundsMin);
	float3 Endpoint1 = Quantize10(BoundsMax);
	float EndPoint0Pos = (float)f32tof16(dot(BoundsMin, BlockVector));
	float EndPoint1Pos = (float)f32tof16(dot(BoundsMax, BlockVector));

	// The first texel only has 3 index bits, so exchange the end points if its index would not fit
	if (ComputeIndexBC6HIndex(BlockRGB[0], BlockVector, EndPoint0Pos, EndPoint1Pos) > 7)
	{
		Swap(EndPoint0Pos, EndPoint1Pos);
		Swap(Endpoint0, Endpoint1);
	}

	const uint3 Packed0 = uint3((uint)Endpoint0.x, (uint)Endpoint0.y, (uint)Endpoint0.z);
	const uint3 Packed1 = uint3((uint)Endpoint1.x, (uint)Endpoint1.y, (uint)Endpoint1.z);

	uint4 Block = 0;

	// Mode 11 bits followed by the two end points, 10 bits per channel
	Block.x = 0x03 | (Packed0.x << 5) | (Packed0.y << 15) | (Packed0.z << 25);
	Block.y = (Packed0.z >> 7) | (Packed1.x << 3) | (Packed1.y << 13) | (Packed1.z << 23);
	Block.z = Packed1.z >> 9;

	// Indices: texel 0 starts at bit 1 of z, texels 1..7 follow in z, texels 8..15 fill w
	for (uint TexelIndex = 0; TexelIndex < 16; ++TexelIndex)
	{
		const uint Index = ComputeIndexBC6HIndex(BlockRGB[TexelIndex], BlockVector, EndPoint0Pos, EndPoint1Pos);

		if (TexelIndex == 0)
		{
			Block.z |= Index << 1;
		}
		else if (TexelIndex < 8)
		{
			Block.z |= Index << (TexelIndex * 4);
		}
		else
		{
			Block.w |= Index << ((TexelIndex - 8) * 4);
		}
	}

	return Block;
}

// Quantize each component to 7 bits by scaling to 255, converting to uint and dropping the lowest bit
uint3 Quantize7(float3 X)
{
	const uint3 EightBit = uint3(X * 0xFF);
	return EightBit >> 1;
}

// Map a color to a 4 bit BC7 interpolation index between two end points
// The color is projected onto BlockVector and the projection is normalized between
// EndPoint0Pos (index 0) and EndPoint1Pos (index 15).
// When both end point positions are equal (e.g. a single color block) index 0 is returned. The division
// would otherwise be 0 / 0 and the result would depend on how the platform handles NaN in saturate().
uint ComputeBC7Index(float3 Color, float3 BlockVector, float EndPoint0Pos, float EndPoint1Pos)
{
	float Pos = dot(Color, BlockVector);
	float Span = EndPoint1Pos - EndPoint0Pos;

	float NormalizedPos = 0.0f;
	if (Span != 0.0f)
	{
		NormalizedPos = saturate((Pos - EndPoint0Pos) / Span);
	}

	// Scale to the index range; the float to uint conversion truncates
	return (uint)clamp(NormalizedPos * 14.93333f + 0.03333f + 0.5f, 0.0f, 15.0f);
}

// Least squares refinement of the end points of a mode 6 BC7 block
// Indices are first selected for the incoming end points. The end points that minimize the squared error for
// those indices are then solved for and written back (saturated) to BlockMin / BlockMax.
// The end points are left untouched when the determinant of the system is too small to solve reliably.
void OptimizeEndpointsBC7(float3 Texels[16], inout float3 BlockMin, inout float3 BlockMax)
{
	// Projection used to pick the index of each texel
	const float3 Axis = BlockMax - BlockMin;
	const float MinPos = dot(BlockMin, Axis);
	const float MaxPos = dot(BlockMax, Axis);

	// Accumulators of the 2x2 system. Each texel is weighted by (1 - Index / 15) for the min end point
	// and by (Index / 15) for the max end point
	float3 MinWeightedTexels = 0.0f;
	float3 MaxWeightedTexels = 0.0f;
	float MinMaxWeightSum = 0.0f;
	float MinWeightSqSum = 0.0f;
	float MaxWeightSqSum = 0.0f;

	for (uint TexelIndex = 0; TexelIndex < 16; ++TexelIndex)
	{
		const float3 Texel = Texels[TexelIndex];
		const uint Index = ComputeBC7Index(Texel, Axis, MinPos, MaxPos);

		const float MaxWeight = saturate(Index / 15.0f);
		const float MinWeight = 1.0f - MaxWeight;

		MinWeightedTexels += MinWeight * Texel;
		MaxWeightedTexels += MaxWeight * Texel;

		MinMaxWeightSum += MinWeight * MaxWeight;

		MinWeightSqSum += MinWeight * MinWeight;
		MaxWeightSqSum += MaxWeight * MaxWeight;
	}

	const float Determinant = MinWeightSqSum * MaxWeightSqSum - MinMaxWeightSum * MinMaxWeightSum;

	if (abs(Determinant) > 0.1f)
	{
		const float InvDeterminant = rcp(Determinant);
		const float3 SolvedMin = saturate(InvDeterminant * (MinWeightedTexels * MaxWeightSqSum - MaxWeightedTexels * MinMaxWeightSum));
		const float3 SolvedMax = saturate(InvDeterminant * (MaxWeightedTexels * MinWeightSqSum - MinWeightedTexels * MinMaxWeightSum));

		BlockMin = SolvedMin;
		BlockMax = SolvedMax;
	}
}

// Compress 16 RGB texels into a BC7 block. Evaluates only mode 6 for performance
// The end points start as the per channel min and max of the block and are optionally refined by
// OptimizeEndpointsBC7 (see LEAST_SQUARES_ENDPOINT_OPTIMIZATION). They are stored with 7 bits per channel.
// The first index is stored with 3 bits, the remaining 15 indices with 4 bits each.
// NOTE(review): bits 17 and up of Block.y, where CompressBC7ABlock stores its alpha end points, are left at 0
// and no p-bit is written. Presumably callers ignore the decoded alpha of these blocks - confirm.
uint4 CompressBC7Block(in float3 BlockRGB[16])
{
	// Per channel bounds of the block are the initial end points
	float3 BoundsMin = BlockRGB[0];
	float3 BoundsMax = BlockRGB[0];
	{
		for (uint BoundsIndex = 1; BoundsIndex < 16; ++BoundsIndex)
		{
			BoundsMin = min(BoundsMin, BlockRGB[BoundsIndex]);
			BoundsMax = max(BoundsMax, BlockRGB[BoundsIndex]);
		}
	}

	#if LEAST_SQUARES_ENDPOINT_OPTIMIZATION
	{
		OptimizeEndpointsBC7(BlockRGB, BoundsMin, BoundsMax);
	}
	#endif

	const float3 BlockVector = BoundsMax - BoundsMin;

	uint3 Endpoint0 = Quantize7(BoundsMin);
	uint3 Endpoint1 = Quantize7(BoundsMax);
	float EndPoint0Pos = dot(BoundsMin, BlockVector);
	float EndPoint1Pos = dot(BoundsMax, BlockVector);

	// The first texel only has 3 index bits, so exchange the end points if its index would not fit
	if (ComputeBC7Index(BlockRGB[0], BlockVector, EndPoint0Pos, EndPoint1Pos) > 7)
	{
		Swap(EndPoint0Pos, EndPoint1Pos);
		Swap(Endpoint0, Endpoint1);
	}

	uint4 Block = 0;

	// Mode 6 bit followed by the interleaved end points, 7 bits per channel
	Block.x = 0x40 | (Endpoint0.x << 7) | (Endpoint1.x << 14) | (Endpoint0.y << 21) | (Endpoint1.y << 28);
	Block.y = (Endpoint1.y >> 4) | (Endpoint0.z << 3) | (Endpoint1.z << 10);

	// Indices: texel 0 starts at bit 1 of z, texels 1..7 follow in z, texels 8..15 fill w
	for (uint TexelIndex = 0; TexelIndex < 16; ++TexelIndex)
	{
		const uint Index = ComputeBC7Index(BlockRGB[TexelIndex], BlockVector, EndPoint0Pos, EndPoint1Pos);

		if (TexelIndex == 0)
		{
			Block.z |= Index << 1;
		}
		else if (TexelIndex < 8)
		{
			Block.z |= Index << (TexelIndex * 4);
		}
		else
		{
			Block.w |= Index << ((TexelIndex - 8) * 4);
		}
	}

	return Block;
}

// Quantize each component to 7 bits by scaling to 255, converting to uint and dropping the lowest bit
uint4 Quantize7A(float4 X)
{
	const uint4 EightBit = uint4(X * 0xFF);
	return EightBit >> 1;
}

// Map an RGBA color to a 4 bit BC7 interpolation index between two end points
// The color is projected onto BlockVector and the projection is normalized between
// EndPoint0Pos (index 0) and EndPoint1Pos (index 15).
// When both end point positions are equal (e.g. a single color block) index 0 is returned. The division
// would otherwise be 0 / 0 and the result would depend on how the platform handles NaN in saturate().
uint ComputeBC7AIndex(float4 Color, float4 BlockVector, float EndPoint0Pos, float EndPoint1Pos)
{
	float Pos = dot(Color, BlockVector);
	float Span = EndPoint1Pos - EndPoint0Pos;

	float NormalizedPos = 0.0f;
	if (Span != 0.0f)
	{
		NormalizedPos = saturate((Pos - EndPoint0Pos) / Span);
	}

	// Scale to the index range; the float to uint conversion truncates
	return (uint)clamp(NormalizedPos * 14.93333f + 0.03333f + 0.5f, 0.0f, 15.0f);
}

// Least squares refinement of the RGBA end points of a mode 6 BC7 block
// Indices are first selected for the incoming end points. The end points that minimize the squared error for
// those indices are then solved for and written back (saturated) to BlockMin / BlockMax.
// The end points are left untouched when the determinant of the system is too small to solve reliably.
void OptimizeEndpointsBC7A(float4 Texels[16], inout float4 BlockMin, inout float4 BlockMax)
{
	// Projection used to pick the index of each texel
	const float4 Axis = BlockMax - BlockMin;
	const float MinPos = dot(BlockMin, Axis);
	const float MaxPos = dot(BlockMax, Axis);

	// Accumulators of the 2x2 system. Each texel is weighted by (1 - Index / 15) for the min end point
	// and by (Index / 15) for the max end point
	float4 MinWeightedTexels = 0.0f;
	float4 MaxWeightedTexels = 0.0f;
	float MinMaxWeightSum = 0.0f;
	float MinWeightSqSum = 0.0f;
	float MaxWeightSqSum = 0.0f;

	for (uint TexelIndex = 0; TexelIndex < 16; ++TexelIndex)
	{
		const float4 Texel = Texels[TexelIndex];
		const uint Index = ComputeBC7AIndex(Texel, Axis, MinPos, MaxPos);

		const float MaxWeight = saturate(Index / 15.0f);
		const float MinWeight = 1.0f - MaxWeight;

		MinWeightedTexels += MinWeight * Texel;
		MaxWeightedTexels += MaxWeight * Texel;

		MinMaxWeightSum += MinWeight * MaxWeight;

		MinWeightSqSum += MinWeight * MinWeight;
		MaxWeightSqSum += MaxWeight * MaxWeight;
	}

	const float Determinant = MinWeightSqSum * MaxWeightSqSum - MinMaxWeightSum * MinMaxWeightSum;

	if (abs(Determinant) > 0.1f)
	{
		const float InvDeterminant = rcp(Determinant);
		const float4 SolvedMin = saturate(InvDeterminant * (MinWeightedTexels * MaxWeightSqSum - MaxWeightedTexels * MinMaxWeightSum));
		const float4 SolvedMax = saturate(InvDeterminant * (MaxWeightedTexels * MinWeightSqSum - MinWeightedTexels * MinMaxWeightSum));

		BlockMin = SolvedMin;
		BlockMax = SolvedMax;
	}
}

// Compress 16 RGBA texels into a BC7 block (color and alpha). Evaluates only mode 6 for performance
// The end points start as the per channel min and max of the block and are optionally refined by
// OptimizeEndpointsBC7A (see LEAST_SQUARES_ENDPOINT_OPTIMIZATION). They are stored with 7 bits per channel.
// The first index is stored with 3 bits, the remaining 15 indices with 4 bits each.
// NOTE(review): no p-bit is written, the bits following the alpha end points are left at 0 - confirm this is intended.
uint4 CompressBC7ABlock(in float4 BlockRGBA[16])
{
	// Per channel bounds of the block are the initial end points
	float4 BoundsMin = BlockRGBA[0];
	float4 BoundsMax = BlockRGBA[0];
	{
		for (uint BoundsIndex = 1; BoundsIndex < 16; ++BoundsIndex)
		{
			BoundsMin = min(BoundsMin, BlockRGBA[BoundsIndex]);
			BoundsMax = max(BoundsMax, BlockRGBA[BoundsIndex]);
		}
	}

	#if LEAST_SQUARES_ENDPOINT_OPTIMIZATION
	{
		OptimizeEndpointsBC7A(BlockRGBA, BoundsMin, BoundsMax);
	}
	#endif

	const float4 BlockVector = BoundsMax - BoundsMin;

	uint4 Endpoint0 = Quantize7A(BoundsMin);
	uint4 Endpoint1 = Quantize7A(BoundsMax);
	float EndPoint0Pos = dot(BoundsMin, BlockVector);
	float EndPoint1Pos = dot(BoundsMax, BlockVector);

	// The first texel only has 3 index bits, so exchange the end points if its index would not fit
	if (ComputeBC7AIndex(BlockRGBA[0], BlockVector, EndPoint0Pos, EndPoint1Pos) > 7)
	{
		Swap(EndPoint0Pos, EndPoint1Pos);
		Swap(Endpoint0, Endpoint1);
	}

	uint4 Block = 0;

	// Mode 6 bit followed by the interleaved end points, 7 bits per channel
	Block.x = 0x40 | (Endpoint0.x << 7) | (Endpoint1.x << 14) | (Endpoint0.y << 21) | (Endpoint1.y << 28);
	Block.y = (Endpoint1.y >> 4) | (Endpoint0.z << 3) | (Endpoint1.z << 10) | (Endpoint0.w << 17) | (Endpoint1.w << 24);

	// Indices: texel 0 starts at bit 1 of z, texels 1..7 follow in z, texels 8..15 fill w
	for (uint TexelIndex = 0; TexelIndex < 16; ++TexelIndex)
	{
		const uint Index = ComputeBC7AIndex(BlockRGBA[TexelIndex], BlockVector, EndPoint0Pos, EndPoint1Pos);

		if (TexelIndex == 0)
		{
			Block.z |= Index << 1;
		}
		else if (TexelIndex < 8)
		{
			Block.z |= Index << (TexelIndex * 4);
		}
		else
		{
			Block.w |= Index << ((TexelIndex - 8) * 4);
		}
	}

	return Block;
}