// Copyright Epic Games, Inc. All Rights Reserved.

/*=============================================================================
	ASTCompressionCommon.ush:
	Helpers for compute shader ASTC texture compression
=============================================================================*/

#pragma once

#include "BlockCompressionCommon.ush"
#include "BCCompressionCommon.ush"
#include "GammaCorrectionCommon.ush"
#include "IntegerSequenceEncoding.ush"


// Returns the largest absolute component of a 3-component vector.
float MaxNorm3(float3 v)
{
	const float3 Magnitude = abs(v);
	const float LargestXY = max(Magnitude.x, Magnitude.y);
	return max(LargestXY, Magnitude.z);
}

// Returns the largest absolute component of a 4-component vector.
float MaxNorm4(float4 v)
{
	const float4 Magnitude = abs(v);
	const float LargestXY = max(Magnitude.x, Magnitude.y);
	const float LargestXYZ = max(LargestXY, Magnitude.z);
	return max(LargestXYZ, Magnitude.w);
}

// Refines the RGB endpoints of a 16-texel block by fitting a line through the texel colors.
// The line direction is an approximation of the dominant eigenvector of the RGB covariance
// matrix (seeded from the covariance column with the largest max-norm, then refined with three
// power iterations). Only the .rgb of BlockMin / BlockMax is written; alpha is left untouched.
void PCAOptimizeEndpoints3f(in float4 BlockRGBA[16], inout float4 BlockMin, inout float4 BlockMax)
{
	const float TexelCount = 16.0f;

	// Accumulate the color sum and the raw second moments.
	// SecondDiag = (xx, yy, zz), SecondOff = (xy, xz, yz).
	float3 Mean = 0.0f;
	float3 SecondDiag = 0.0f;
	float3 SecondOff = 0.0f;
	for (uint TexelIndex = 0; TexelIndex < 16; TexelIndex++)
	{
		const float3 Color = BlockRGBA[TexelIndex].rgb;
		Mean += Color;
		SecondDiag += Color * Color;
		SecondOff += Color.xxy * Color.yzz;
	}
	Mean /= TexelCount;

	// Remove the mean contribution to obtain the (unnormalized) covariance terms.
	SecondDiag -= TexelCount * Mean * Mean;
	SecondOff -= TexelCount * Mean.xxy * Mean.yzz;

	// Columns of the symmetric 3x3 covariance matrix.
	const float3 CovCol0 = float3(SecondDiag.x, SecondOff.x, SecondOff.y);
	const float3 CovCol1 = float3(SecondOff.x, SecondDiag.y, SecondOff.z);
	const float3 CovCol2 = float3(SecondOff.y, SecondOff.z, SecondDiag.z);

	// Seed the axis with the column that has the largest max-norm.
	float3 Axis = CovCol0;
	float BestNorm = MaxNorm3(CovCol0);

	const float Norm1 = MaxNorm3(CovCol1);
	if (Norm1 > BestNorm)
	{
		Axis = CovCol1;
		BestNorm = Norm1;
	}

	const float Norm2 = MaxNorm3(CovCol2);
	if (Norm2 > BestNorm)
	{
		Axis = CovCol2;
		BestNorm = Norm2;
	}

	if (BestNorm == 0.0f)
	{
		// Covariance is exactly zero: any direction works, pick the red axis.
		Axis = float3(1.0f, 0.0f, 0.0f);
	}
	else
	{
		Axis = normalize(Axis);

		// Power iteration: repeatedly multiply by the covariance matrix and renormalize.
		for (uint Iteration = 0; Iteration < 3; Iteration++)
		{
			Axis = normalize(CovCol0 * Axis.x + CovCol1 * Axis.y + CovCol2 * Axis.z);
		}
	}

	// Find the extent of the texels along the axis, measured relative to the mean.
	float LowProjection = 999999.0f;
	float HighProjection = -999999.0f;
	for (uint ProjIndex = 0; ProjIndex < 16; ProjIndex++)
	{
		const float Projection = dot(BlockRGBA[ProjIndex].rgb - Mean, Axis);
		LowProjection = min(LowProjection, Projection);
		HighProjection = max(HighProjection, Projection);
	}

	// Endpoints are the extreme points on the fitted line, clamped to [0..1].
	BlockMin.rgb = saturate(Mean + Axis * LowProjection);
	BlockMax.rgb = saturate(Mean + Axis * HighProjection);
}

// 36-texel (6x6) variant of the RGB endpoint refinement: fits a line through the texel colors
// along an approximation of the dominant eigenvector of the RGB covariance matrix and writes
// the clamped extreme points of that line to BlockMin / BlockMax.
void PCAOptimizeEndpoints3f_6x6(in float4 BlockRGBA[36], inout float3 BlockMin, inout float3 BlockMax)
{
	const float TexelCount = 36.0f;

	// Accumulate the color sum and the raw second moments.
	// SecondDiag = (xx, yy, zz), SecondOff = (xy, xz, yz).
	float3 Mean = 0.0f;
	float3 SecondDiag = 0.0f;
	float3 SecondOff = 0.0f;
	for (uint TexelIndex = 0; TexelIndex < 36; TexelIndex++)
	{
		const float3 Color = BlockRGBA[TexelIndex].rgb;
		Mean += Color;
		SecondDiag += Color * Color;
		SecondOff += Color.xxy * Color.yzz;
	}
	Mean /= TexelCount;

	// Remove the mean contribution to obtain the (unnormalized) covariance terms.
	SecondDiag -= TexelCount * Mean * Mean;
	SecondOff -= TexelCount * Mean.xxy * Mean.yzz;

	// Columns of the symmetric 3x3 covariance matrix.
	const float3 CovCol0 = float3(SecondDiag.x, SecondOff.x, SecondOff.y);
	const float3 CovCol1 = float3(SecondOff.x, SecondDiag.y, SecondOff.z);
	const float3 CovCol2 = float3(SecondOff.y, SecondOff.z, SecondDiag.z);

	// Seed the axis with the column that has the largest max-norm.
	float3 Axis = CovCol0;
	float BestNorm = MaxNorm3(CovCol0);

	const float Norm1 = MaxNorm3(CovCol1);
	if (Norm1 > BestNorm)
	{
		Axis = CovCol1;
		BestNorm = Norm1;
	}

	const float Norm2 = MaxNorm3(CovCol2);
	if (Norm2 > BestNorm)
	{
		Axis = CovCol2;
		BestNorm = Norm2;
	}

	if (BestNorm == 0.0f)
	{
		// Covariance is exactly zero: any direction works, pick the red axis.
		Axis = float3(1.0f, 0.0f, 0.0f);
	}
	else
	{
		Axis = normalize(Axis);

		// Power iteration: repeatedly multiply by the covariance matrix and renormalize.
		for (uint Iteration = 0; Iteration < 3; Iteration++)
		{
			Axis = normalize(CovCol0 * Axis.x + CovCol1 * Axis.y + CovCol2 * Axis.z);
		}
	}

	// Find the extent of the texels along the axis, measured relative to the mean.
	float LowProjection = 999999.0f;
	float HighProjection = -999999.0f;
	for (uint ProjIndex = 0; ProjIndex < 36; ProjIndex++)
	{
		const float Projection = dot(BlockRGBA[ProjIndex].rgb - Mean, Axis);
		LowProjection = min(LowProjection, Projection);
		HighProjection = max(HighProjection, Projection);
	}

	// Endpoints are the extreme points on the fitted line, clamped to [0,1].
	BlockMin = saturate(Mean + Axis * LowProjection);
	BlockMax = saturate(Mean + Axis * HighProjection);
}

// RGBA variant of the endpoint refinement for a 16-texel block: fits a line through the texels
// in 4D along an approximation of the dominant eigenvector of the 4x4 covariance matrix
// (five power iterations) and writes the clamped extreme points to BlockMin / BlockMax.
void PCAOptimizeEndpoints4f(in float4 BlockRGBA[16], inout float4 BlockMin, inout float4 BlockMax)
{
	// Accumulate the texel sum and the raw second moments of the upper triangle.
	float4 Mean = float4(0.0, 0.0, 0.0, 0.0);
	float4 SecondDiag = float4(0.0, 0.0, 0.0, 0.0);	// xx, yy, zz, ww
	float3 SecondX = float3(0.0, 0.0, 0.0);			// xy, xz, xw
	float2 SecondY = float2(0.0, 0.0);				// yz, yw
	float SecondZW = 0.0;							// zw
	for (uint TexelIndex = 0; TexelIndex < 16; TexelIndex++)
	{
		const float4 Texel = BlockRGBA[TexelIndex];
		Mean += Texel;
		SecondDiag += Texel * Texel;
		SecondX += Texel.x * Texel.yzw;
		SecondY += Texel.y * Texel.zw;
		SecondZW += Texel.z * Texel.w;
	}
	Mean /= 16.0;

	// Remove the mean contribution to obtain the (unnormalized) covariance terms.
	SecondDiag -= 16.0 * Mean * Mean;
	SecondX -= 16.0 * Mean.x * Mean.yzw;
	SecondY -= 16.0 * Mean.y * Mean.zw;
	SecondZW -= 16.0 * Mean.z * Mean.w;

	// Columns of the symmetric 4x4 covariance matrix.
	const float4 CovCol0 = float4(SecondDiag.x, SecondX.x, SecondX.y, SecondX.z);
	const float4 CovCol1 = float4(SecondX.x, SecondDiag.y, SecondY.x, SecondY.y);
	const float4 CovCol2 = float4(SecondX.y, SecondY.x, SecondDiag.z, SecondZW);
	const float4 CovCol3 = float4(SecondX.z, SecondY.y, SecondZW, SecondDiag.w);

	// Seed the axis with the column that has the largest max-norm.
	float4 Axis = CovCol0;
	float BestNorm = MaxNorm4(CovCol0);

	const float Norm1 = MaxNorm4(CovCol1);
	if (Norm1 > BestNorm)
	{
		Axis = CovCol1;
		BestNorm = Norm1;
	}

	const float Norm2 = MaxNorm4(CovCol2);
	if (Norm2 > BestNorm)
	{
		Axis = CovCol2;
		BestNorm = Norm2;
	}

	const float Norm3 = MaxNorm4(CovCol3);
	if (Norm3 > BestNorm)
	{
		Axis = CovCol3;
		BestNorm = Norm3;
	}

	if (BestNorm > 0.0)
	{
		Axis = normalize(Axis);

		// Power iteration: repeatedly multiply by the covariance matrix and renormalize.
		for (uint Iteration = 0; Iteration < 5; Iteration++)
		{
			Axis = normalize(CovCol0 * Axis.x + CovCol1 * Axis.y + CovCol2 * Axis.z + CovCol3 * Axis.w);
		}
	}
	else
	{
		// No usable covariance: fall back to the red axis.
		Axis = float4(1.0, 0.0, 0.0, 0.0);
	}

	// Find the extent of the texels along the axis, measured relative to the mean.
	float LowProjection = 1e10;
	float HighProjection = -1e10;
	for (uint ProjIndex = 0; ProjIndex < 16; ProjIndex++)
	{
		const float Projection = dot(BlockRGBA[ProjIndex] - Mean, Axis);
		LowProjection = min(LowProjection, Projection);
		HighProjection = max(HighProjection, Projection);
	}

	// Endpoints are the extreme points on the fitted line, clamped to [0..1].
	BlockMin = saturate(Mean + Axis * LowProjection);
	BlockMax = saturate(Mean + Axis * HighProjection);
}

// Linear-least-squares refinement of RGBA endpoints for a 16-texel block.
// Each texel is assigned the quantized weight (Beta, in [0..1] with WeightRange steps) it would get
// with the incoming endpoints; the endpoints that minimize the squared error for those weights are
// then solved from the 2x2 normal equations. The endpoints are left unchanged when the system is
// close to singular (|Det| <= 0.1).
void LLSOptimizeEndpoints4f16(float4 Texels[16], uint WeightRange, inout float4 BlockMin, inout float4 BlockMax)
{
	const float4 Axis = BlockMax - BlockMin;
	const float LowPos = dot(BlockMin, Axis);
	const float HighPos = dot(BlockMax, Axis);

	float4 SumTexel = 0.0f;
	float4 SumBetaTexel = 0.0f;
	float SumBeta = 0.0f;
	float SumBetaSq = 0.0f;

	for (uint TexelIndex = 0; TexelIndex < 16; ++TexelIndex)
	{
		const float4 Texel = Texels[TexelIndex];

		// Position of the texel along the endpoint axis, snapped to the nearest weight level.
		const float Pos = dot(Texel, Axis);
		const float Fraction = saturate((Pos - LowPos) / (HighPos - LowPos));
		const uint Level = (uint) (Fraction * WeightRange + 0.5f);
		const float Beta = Level / (float) WeightRange;

		SumTexel += Texel;
		SumBetaTexel += Texel * Beta;
		SumBeta += Beta;
		SumBetaSq += Beta * Beta;
	}

	// Alpha = 1 - Beta, so the Alpha sums follow from the Beta sums.
	const float4 SumAlphaTexel = SumTexel - SumBetaTexel;
	const float SumAlphaSq = 16.0f - 2 * SumBeta + SumBetaSq;
	const float SumAlphaBeta = SumBeta - SumBetaSq;

	const float Det = SumAlphaSq * SumBetaSq - SumAlphaBeta * SumAlphaBeta;
	if (!(abs(Det) > 0.1f))
	{
		// Ill-conditioned system: keep the incoming endpoints.
		return;
	}

	const float RcpDet = 1.0f / Det;
	BlockMin = saturate((SumAlphaTexel * SumBetaSq - SumBetaTexel * SumAlphaBeta) * RcpDet);
	BlockMax = saturate((SumBetaTexel * SumAlphaSq - SumAlphaTexel * SumAlphaBeta) * RcpDet);
}

// Linear-least-squares refinement of RGB endpoints for a 16-texel block (alpha is ignored).
// Each texel is assigned the quantized weight (Beta, in [0..1] with WeightRange steps) it would get
// with the incoming endpoints; the endpoints that minimize the squared error for those weights are
// then solved from the 2x2 normal equations. The endpoints are left unchanged when the system is
// close to singular (|Det| <= 0.1).
void LLSOptimizeEndpoints3f16(float4 Texels[16], uint WeightRange, inout float3 BlockMin, inout float3 BlockMax)
{
	const float3 Axis = BlockMax - BlockMin;
	const float LowPos = dot(BlockMin, Axis);
	const float HighPos = dot(BlockMax, Axis);

	float3 SumTexel = float3(0.0f, 0.0f, 0.0f);
	float3 SumBetaTexel = float3(0.0f, 0.0f, 0.0f);
	float SumBeta = 0.0f;
	float SumBetaSq = 0.0f;

	for (uint TexelIndex = 0; TexelIndex < 16; ++TexelIndex)
	{
		const float3 Color = Texels[TexelIndex].rgb;

		// Position of the texel along the endpoint axis, snapped to the nearest weight level.
		const float Pos = dot(Color, Axis);
		const float Fraction = saturate((Pos - LowPos) / (HighPos - LowPos));
		const uint Level = (uint) (Fraction * WeightRange + 0.5f);
		const float Beta = Level / (float) WeightRange;

		SumTexel += Color;
		SumBetaTexel += Color * Beta;
		SumBeta += Beta;
		SumBetaSq += Beta * Beta;
	}

	// Alpha = 1 - Beta, so the Alpha sums follow from the Beta sums.
	const float3 SumAlphaTexel = SumTexel - SumBetaTexel;
	const float SumAlphaSq = 16.0f - 2.0f * SumBeta + SumBetaSq;
	const float SumAlphaBeta = SumBeta - SumBetaSq;

	const float Det = SumAlphaSq * SumBetaSq - SumAlphaBeta * SumAlphaBeta;
	if (!(abs(Det) > 0.1f))
	{
		// Ill-conditioned system: keep the incoming endpoints.
		return;
	}

	const float RcpDet = 1.0f / Det;
	BlockMin = saturate((SumAlphaTexel * SumBetaSq - SumBetaTexel * SumAlphaBeta) * RcpDet);
	BlockMax = saturate((SumBetaTexel * SumAlphaSq - SumAlphaTexel * SumAlphaBeta) * RcpDet);
}

// Builds the 11-bit ASTC block mode for a square BlockSize x BlockSize weight grid, single plane.
// https://registry.khronos.org/DataFormat/specs/1.3/dataformat.1.3.html See 23.10 Block Mode for more details
// Bit layout produced (bits 2..3 stay zero):
//   [1:0] = Range bits 2..1, [4] = Range bit 0, [6:5] = (BlockSize - 2), [8:7] = (BlockSize - 4),
//   [9] = precision bit (set when WeightQuantMethod >= 6), [10] = dual plane (always 0 here).
uint BuildBlockMode(uint WeightQuantMethod, uint BlockSize)
{
	const uint Range = (WeightQuantMethod % 6) + 2;
	const uint PrecisionBit = (WeightQuantMethod >= 6) ? 1 : 0;
	const uint DualPlaneBit = 0;

	const uint HeightField = (BlockSize - 2) & 0x3;
	const uint WidthField = (BlockSize - 4) & 0x3;

	return ((Range >> 1) & 0x3)
		| ((Range & 0x1) << 4)
		| (HeightField << 5)
		| (WidthField << 7)
		| (PrecisionBit << 9)
		| (DualPlaneBit << 10);
}

// Computes the 16 per-texel weights for an RGB endpoint pair and stores the packed weight stream
// in BlockBuffer.yzw. Each texel's .rgb is projected onto the BlockMin->BlockMax axis, quantized to
// [0..WeightRange], remapped through AstcParameters.ScrambleTable and packed by EncodeWeightsQuant16.
// The packed words are bit-reversed and written in reverse word order (x->w, y->z, z->y).
// Note that BlockBuffer.yzw are overwritten, not OR-ed.
void EncodeWeights3f(in float4 BlockRGBA[16], uint WeightRange, uint WeightQuantMethod, float4 BlockMin, float4 BlockMax, inout uint4 BlockBuffer)
{
	const float3 Axis = BlockMax.rgb - BlockMin.rgb;
	const float LowPos = dot(BlockMin.rgb, Axis);
	const float HighPos = dot(BlockMax.rgb, Axis);

	// Quantize the weights using the endpoints and the weight range
	uint ScrambledWeights[16];
	for (uint TexelIndex = 0; TexelIndex < 16; ++TexelIndex)
	{
		const float Pos = dot(BlockRGBA[TexelIndex].rgb, Axis);
		const float Fraction = saturate((Pos - LowPos) / (HighPos - LowPos));
		const uint Level = (uint) (Fraction * WeightRange + 0.5f);

		// One table row of WEIGHT_QUANTIZE_NUM entries per quantization method.
		ScrambledWeights[TexelIndex] = AstcParameters.ScrambleTable[WeightQuantMethod * WEIGHT_QUANTIZE_NUM + Level];
	}

	uint4 PackedWeights = uint4(0, 0, 0, 0);
	uint PackedOffset = 0;
	EncodeWeightsQuant16(WeightQuantMethod, ScrambledWeights, PackedWeights, PackedOffset);

	BlockBuffer.w = reversebits(PackedWeights.x);
	BlockBuffer.z = reversebits(PackedWeights.y);
	BlockBuffer.y = reversebits(PackedWeights.z);
}

// Computes the 16 per-texel weights for an RGBA endpoint pair and stores the packed weight stream
// in BlockBuffer.yzw. Each texel is projected (in 4D) onto the BlockMin->BlockMax axis, quantized to
// [0..WeightRange], remapped through AstcParameters.ScrambleTable and packed by EncodeWeightsQuant16.
// The packed words are bit-reversed and written in reverse word order (x->w, y->z, z->y).
// Note that BlockBuffer.yzw are overwritten, not OR-ed.
void EncodeWeights4f(in float4 BlockRGBA[16], uint WeightRange, uint WeightQuantMethod, float4 BlockMin, float4 BlockMax, inout uint4 BlockBuffer)
{
	const float4 Axis = BlockMax - BlockMin;
	const float LowPos = dot(BlockMin, Axis);
	const float HighPos = dot(BlockMax, Axis);

	// Quantize the weights using the endpoints and the weight range
	uint ScrambledWeights[16];
	for (uint TexelIndex = 0; TexelIndex < 16; ++TexelIndex)
	{
		const float Pos = dot(BlockRGBA[TexelIndex], Axis);
		const float Fraction = saturate((Pos - LowPos) / (HighPos - LowPos));
		const uint Level = (uint) (Fraction * WeightRange + 0.5f);

		// One table row of WEIGHT_QUANTIZE_NUM entries per quantization method.
		ScrambledWeights[TexelIndex] = AstcParameters.ScrambleTable[WeightQuantMethod * WEIGHT_QUANTIZE_NUM + Level];
	}

	uint4 PackedWeights = uint4(0, 0, 0, 0);
	uint PackedOffset = 0;
	EncodeWeightsQuant16(WeightQuantMethod, ScrambledWeights, PackedWeights, PackedOffset);

	BlockBuffer.w = reversebits(PackedWeights.x);
	BlockBuffer.z = reversebits(PackedWeights.y);
	BlockBuffer.y = reversebits(PackedWeights.z);
}

// Compresses one 4x4 block of RGBA texels into a 128-bit ASTC block (single partition, single plane).
// Three encodings are used:
//  - all texels identical: RGBA direct endpoints (CEM 12) with all-zero weights,
//  - minimum alpha == 1:   RGB direct endpoints (CEM 8) with QUANT_12 weights,
//  - otherwise:            RGBA direct endpoints (CEM 12) with QUANT_6 weights.
// Endpoints are always stored as 8-bit values. When ASTC_HIGH_PROFILE is defined, the min/max
// endpoints are first refined with the PCA helpers.
uint4 Compress_ASTC_RGBA_4x4(in float4 BlockRGBA[16])
{
	const uint BlockSize = 4;
	const uint PartitionCount = 1;
	const uint EndpointRange = 256u - 1u;	// Endpoints are written with 8 bits each
	const uint HeaderBits = 11 + 2 + 4;		// BlockMode + (PartitionCount-1) + CEM

	// Per-channel bounding box of the block is the starting endpoint pair.
	float4 BlockMin = BlockRGBA[0];
	float4 BlockMax = BlockRGBA[0];
	for (uint TexelIndex = 1; TexelIndex < 16; ++TexelIndex)
	{
		BlockMin = min(BlockMin, BlockRGBA[TexelIndex]);
		BlockMax = max(BlockMax, BlockRGBA[TexelIndex]);
	}

	// Note: This is just a way to encode a void-extent like block - doesn't actually encode void-extent.
	if (all(BlockMin == BlockMax))
	{
		const uint ColorEndpointMode = 12; // LDR RGBA, direct
		const uint WeightQuantMethod = QUANT_4;

		uint BlockMode = BuildBlockMode(WeightQuantMethod, BlockSize);
		BlockMode |= (PartitionCount - 1) << 11;
		BlockMode |= ColorEndpointMode << 13;

		// Weights stay zero, so every texel decodes to the first endpoint of each pair.
		uint4 SolidBuffer = uint4(BlockMode, 0, 0, 0);
		uint SolidOffset = HeaderBits;

		const uint4 SolidColor = uint4(BlockMin * EndpointRange + 0.5f);

		[unroll]
		for (uint Channel = 0; Channel < 4; ++Channel)
		{
			WriteBits(SolidBuffer, SolidOffset, SolidColor[Channel], 8);
			WriteBits(SolidBuffer, SolidOffset, 0xFF, 8);
		}
		return SolidBuffer;
	}

	uint4 BlockBuffer = uint4(0, 0, 0, 0);
	uint BlockOffset = HeaderBits;

	if (BlockMin.a == 1.0f)
	{
		// Opaque block: spend the bits on RGB only.
		const uint ColorEndpointMode = 8; // LDR RGB, direct
		const uint WeightQuantMethod = QUANT_12;
		const uint WeightRange = 12u - 1u;

		uint BlockMode = BuildBlockMode(WeightQuantMethod, BlockSize);
		BlockMode |= (PartitionCount - 1) << 11;
		BlockMode |= ColorEndpointMode << 13;
		BlockBuffer.x = BlockMode;

#ifdef ASTC_HIGH_PROFILE
		PCAOptimizeEndpoints3f(BlockRGBA, BlockMin, BlockMax);
#endif
		//LLSOptimizeEndpoints3f16(BlockRGBA, WeightRange, BlockMin, BlockMax);

		// Inset the min/max and snap them to 8-bit values
		// See https://developer.download.nvidia.com/whitepapers/2007/Real-Time-YCoCg-DXT-Compression/Real-Time%20YCoCg-DXT%20Compression.pdf
		{
			const float3 Inset = (1.0f / ((WeightRange + 1) * 4.f)) * (BlockMax.rgb - BlockMin.rgb);
			BlockMax.rgb = ceil((BlockMax.rgb - Inset) * 255.0f) / 255.0f;
			BlockMin.rgb = floor((BlockMin.rgb + Inset) * 255.0f) / 255.0f;
		}

		// Keep the endpoint with the smaller RGB sum first.
		const float LowSum = BlockMin.x + BlockMin.y + BlockMin.z;
		const float HighSum = BlockMax.x + BlockMax.y + BlockMax.z;
		if (LowSum > HighSum)
		{
			const float4 Swap = BlockMin;
			BlockMin = BlockMax;
			BlockMax = Swap;
		}

		// Weights must be stored before the endpoints: EncodeWeights3f overwrites BlockBuffer.yzw.
		EncodeWeights3f(BlockRGBA, WeightRange, WeightQuantMethod, BlockMin, BlockMax, BlockBuffer);

		const uint4 Endpoint0 = uint4(BlockMin * EndpointRange + 0.5f);
		const uint4 Endpoint1 = uint4(BlockMax * EndpointRange + 0.5f);

		// Endpoint values are interleaved per channel: R0 R1 G0 G1 B0 B1.
		[unroll]
		for (uint Channel = 0; Channel < 3; ++Channel)
		{
			WriteBits(BlockBuffer, BlockOffset, Endpoint0[Channel], 8);
			WriteBits(BlockBuffer, BlockOffset, Endpoint1[Channel], 8);
		}
	}
	else
	{
		const uint ColorEndpointMode = 12; // LDR RGBA, direct
		const uint WeightQuantMethod = QUANT_6;
		const uint WeightRange = 6u - 1u;

		uint BlockMode = BuildBlockMode(WeightQuantMethod, BlockSize);
		BlockMode |= (PartitionCount - 1) << 11;
		BlockMode |= ColorEndpointMode << 13;
		BlockBuffer.x = BlockMode;

#ifdef ASTC_HIGH_PROFILE
		PCAOptimizeEndpoints4f(BlockRGBA, BlockMin, BlockMax);
#endif
		//LLSOptimizeEndpoints4f16(BlockRGBA, WeightRange, BlockMin, BlockMax);

		// Inset the min/max and snap them to 8-bit values (RGB only, alpha is left as is)
		// See https://developer.download.nvidia.com/whitepapers/2007/Real-Time-YCoCg-DXT-Compression/Real-Time%20YCoCg-DXT%20Compression.pdf
		{
			const float3 Inset = (1.0f / ((WeightRange + 1) * 4.f)) * (BlockMax.rgb - BlockMin.rgb);
			BlockMax.rgb = ceil((BlockMax.rgb - Inset) * 255.0f) / 255.0f;
			BlockMin.rgb = floor((BlockMin.rgb + Inset) * 255.0f) / 255.0f;
		}

		// Keep the endpoint with the smaller RGB sum first (alpha is swapped along with RGB).
		const float LowSum = BlockMin.x + BlockMin.y + BlockMin.z;
		const float HighSum = BlockMax.x + BlockMax.y + BlockMax.z;
		if (LowSum > HighSum)
		{
			const float4 Swap = BlockMin;
			BlockMin = BlockMax;
			BlockMax = Swap;
		}

		// Weights must be stored before the endpoints: EncodeWeights4f overwrites BlockBuffer.yzw.
		EncodeWeights4f(BlockRGBA, WeightRange, WeightQuantMethod, BlockMin, BlockMax, BlockBuffer);

		const uint4 Endpoint0 = uint4(BlockMin * EndpointRange + 0.5f);
		const uint4 Endpoint1 = uint4(BlockMax * EndpointRange + 0.5f);

		// Endpoint values are interleaved per channel: R0 R1 G0 G1 B0 B1 A0 A1.
		[unroll]
		for (uint Channel = 0; Channel < 4; ++Channel)
		{
			WriteBits(BlockBuffer, BlockOffset, Endpoint0[Channel], 8);
			WriteBits(BlockBuffer, BlockOffset, Endpoint1[Channel], 8);
		}
	}

	return BlockBuffer;
}

// Builds the ASTC block mode for a BlockX x BlockY weight grid using the
// "B B 1 0 A A R0 R2 R1 0 0" layout (bits 10..0), where the grid width is A + 6 and the grid
// height is B + 6. See 23.10 Block Mode in
// https://registry.khronos.org/DataFormat/specs/1.3/dataformat.1.3.html
//
// BlockX / BlockY are expected to be in [6..9]. This layout has no precision or dual-plane bit,
// so only the low-precision weight ranges can be expressed; WeightQuantMethod is taken modulo 6.
//
// BlockX is stored in the A (width) field and BlockY in the B (height) field. The two fields were
// previously swapped, which transposed any non-square grid; square grids are unaffected.
uint BuildBlockMode2(uint WeightQuantMethod, uint BlockX, uint BlockY)
{
	uint WidthField = (BlockX - 6) & 0x3;	// A, bits 5..6
	uint HeightField = (BlockY - 6) & 0x3;	// B, bits 9..10

	uint Range = (WeightQuantMethod % 6) + 2;

	uint BlockMode = 0x100;					// bits 8..7 = "1 0" select this layout
	BlockMode |= (Range & 0x1) << 4;		// R0
	BlockMode |= ((Range >> 1) & 0x3) << 2;	// R2 R1
	BlockMode |= WidthField << 5;
	BlockMode |= HeightField << 9;
	return BlockMode;
}

// Computes the 36 per-texel weights of a 6x6 block for an RGB endpoint pair and merges the packed
// weight stream into BlockBuffer.yzw. Each texel's .rgb is projected onto the BlockMin->BlockMax
// axis, quantized to [0..WeightRange], remapped through AstcParameters.ScrambleTable and packed by
// EncodeWeightsQuant36. The packed words are bit-reversed and OR-ed in reverse word order
// (x->w, y->z, z->y), so bits already present in BlockBuffer are preserved.
void EncodeWeights3f_6x6(in float4 BlockRGBA[36], uint WeightRange, uint WeightQuantMethod, float3 BlockMin, float3 BlockMax, inout uint4 BlockBuffer)
{
	const float3 Axis = BlockMax - BlockMin;
	const float LowPos = dot(BlockMin, Axis);
	const float HighPos = dot(BlockMax, Axis);

	// Quantize the weights using the endpoints and the weight range
	uint ScrambledWeights[36];
	for (uint TexelIndex = 0; TexelIndex < 36; ++TexelIndex)
	{
		const float Pos = dot(BlockRGBA[TexelIndex].rgb, Axis);
		const float Fraction = saturate((Pos - LowPos) / (HighPos - LowPos));
		const uint Level = (uint) (Fraction * WeightRange + 0.5f);

		// One table row of WEIGHT_QUANTIZE_NUM entries per quantization method.
		ScrambledWeights[TexelIndex] = AstcParameters.ScrambleTable[WeightQuantMethod * WEIGHT_QUANTIZE_NUM + Level];
	}

	uint4 PackedWeights = uint4(0, 0, 0, 0);
	uint PackedOffset = 0;
	EncodeWeightsQuant36(WeightQuantMethod, ScrambledWeights, PackedWeights, PackedOffset);

	BlockBuffer.w |= reversebits(PackedWeights.x);
	BlockBuffer.z |= reversebits(PackedWeights.y);
	BlockBuffer.y |= reversebits(PackedWeights.z);
}

// Compresses one 6x6 block of texels (alpha is ignored) into a 128-bit ASTC block using a single
// partition, a single plane and LDR RGB direct endpoints (CEM 8).
//  - Constant blocks are stored with a 4x4 weight grid of zero weights and 8-bit endpoints.
//  - All other blocks use a 6x6 grid of QUANT_4 weights and QUANT_80 endpoints.
// When ASTC_HIGH_PROFILE is defined, the min/max endpoints are first refined with
// PCAOptimizeEndpoints3f_6x6.
uint4 Compress_ASTC_RGB_6x6(in float4 BlockRGBA[36])
{
	// Compute initial endpoints
	float3 BlockMin = BlockRGBA[0].rgb;
	float3 BlockMax = BlockRGBA[0].rgb;
	for (uint TexelIndex = 1; TexelIndex < 36; ++TexelIndex)
	{
		BlockMin = min(BlockMin, BlockRGBA[TexelIndex].rgb);
		BlockMax = max(BlockMax, BlockRGBA[TexelIndex].rgb);
	}

	// Note: This is just a way to encode a void-extent like block - doesn't actually encode void-extent.
	if (all(BlockMin == BlockMax))
	{
		uint ColorEndpointMode = 8; // LDR RGB, direct
		uint WeightQuantMethod = QUANT_4;
		uint EndpointRange = 256u - 1u;

		uint PartitionCount = 1;

		// Use a 4x4 weight grid inside the 6x6 footprint: 16 two-bit weights take only 32 bits,
		// which leaves enough room for the six endpoint values to be stored with 8 bits each.
		// BuildBlockMode2 cannot express this: it encodes grid sizes as (size - 6), so passing 4
		// wraps around and selects an 8x8 grid, which is larger than the block and therefore an
		// invalid encoding.
		uint BlockMode = BuildBlockMode(WeightQuantMethod, 4);
		BlockMode |= (PartitionCount - 1) << 11;
		BlockMode |= ColorEndpointMode << 13;

		// Weights stay zero, so every texel decodes to the first endpoint of each pair.
		uint4 BlockBuffer = uint4(BlockMode, 0, 0, 0);
		uint BlockOffset = 11 + 2 + 4; // BlockMode + (PartitionCount-1) + CEM

		uint3 QuantizeEndpoint = uint3(BlockMin * EndpointRange + 0.5f);

		WriteBits(BlockBuffer, BlockOffset, QuantizeEndpoint.x, 8);
		WriteBits(BlockBuffer, BlockOffset, 0xFF, 8);
		WriteBits(BlockBuffer, BlockOffset, QuantizeEndpoint.y, 8);
		WriteBits(BlockBuffer, BlockOffset, 0xFF, 8);
		WriteBits(BlockBuffer, BlockOffset, QuantizeEndpoint.z, 8);
		WriteBits(BlockBuffer, BlockOffset, 0xFF, 8);
		return BlockBuffer;
	}

	uint4 BlockBuffer = uint4(0, 0, 0, 0);
	uint BlockOffset = 11 + 2 + 4; // BlockMode + (PartitionCount-1) + CEM

	uint EndpointQuantMethod = QUANT_80;
	uint EndpointRange = 80u - 1u;

	uint ColorEndpointMode = 8; // LDR RGB, direct
	uint WeightQuantMethod = QUANT_4;
	uint WeightRange = 4u - 1u;
	uint PartitionCount = 1;

	uint BlockMode = BuildBlockMode2(WeightQuantMethod, 6, 6);
	BlockMode |= (PartitionCount - 1) << 11;
	BlockMode |= ColorEndpointMode << 13;
	BlockBuffer.x = BlockMode;

#ifdef ASTC_HIGH_PROFILE
	PCAOptimizeEndpoints3f_6x6(BlockRGBA, BlockMin, BlockMax);
#endif

	// Inset the min/max and snap them to the endpoint quantization grid
	// See https://developer.download.nvidia.com/whitepapers/2007/Real-Time-YCoCg-DXT-Compression/Real-Time%20YCoCg-DXT%20Compression.pdf
	{
		float3 offset = (1.0f / ((WeightRange + 1) * 4.f)) * (BlockMax - BlockMin);
		BlockMax = ceil((BlockMax - offset) * (float) EndpointRange) / (float) EndpointRange;
		BlockMin = floor((BlockMin + offset) * (float) EndpointRange) / (float) EndpointRange;
	}

	// Keep the endpoint with the smaller RGB sum first.
	float SumMin = BlockMin.x + BlockMin.y + BlockMin.z;
	float SumMax = BlockMax.x + BlockMax.y + BlockMax.z;
	if (SumMin > SumMax)
	{
		float3 tmp = BlockMin;
		BlockMin = BlockMax;
		BlockMax = tmp;
	}

	// Weights are OR-ed into BlockBuffer.yzw; the endpoints are packed after the header below.
	EncodeWeights3f_6x6(BlockRGBA, WeightRange, WeightQuantMethod, BlockMin, BlockMax, BlockBuffer);

	// NOTE(review): the table is indexed with an 8-bit value (0..255) rather than a 0..79 level,
	// so ColorScrambleTable80 is presumably a 256-entry 8-bit -> QUANT_80 lookup - confirm against
	// the table setup on the CPU side.
	uint3 QuantizeEndpoint0 = uint3(BlockMin * 255.0f + 0.5f);
	uint3 QuantizeEndpoint1 = uint3(BlockMax * 255.0f + 0.5f);

	// Endpoint values are interleaved per channel: R0 R1 G0 G1 B0 B1.
	uint EndpointNumbers[6] =
	{
		AstcParameters.ColorScrambleTable80[QuantizeEndpoint0.x], AstcParameters.ColorScrambleTable80[QuantizeEndpoint1.x],
		AstcParameters.ColorScrambleTable80[QuantizeEndpoint0.y], AstcParameters.ColorScrambleTable80[QuantizeEndpoint1.y],
		AstcParameters.ColorScrambleTable80[QuantizeEndpoint0.z], AstcParameters.ColorScrambleTable80[QuantizeEndpoint1.z]
	};

	EncodeEndpointsQuant6(EndpointQuantMethod, EndpointNumbers, BlockBuffer, BlockOffset);
	return BlockBuffer;
}

// Combines separate RGB and alpha arrays into RGBA texels and compresses them as a 4x4 ASTC block.
uint4 CompressBlock_ASTC_RGBA(in float3 BlockRGB[16], in float BlockA[16])
{
	float4 Texels[16];
	[unroll]
	for (int TexelIndex = 0; TexelIndex < 16; ++TexelIndex)
	{
		Texels[TexelIndex].rgb = BlockRGB[TexelIndex];
		Texels[TexelIndex].a = BlockA[TexelIndex];
	}
	return Compress_ASTC_RGBA_4x4(Texels);
}

// Converts the RGB input with LinearToSrgb, pairs it with the given alpha unchanged, and
// compresses the result as a 4x4 ASTC block.
uint4 CompressBlock_ASTC_SRGBA(in float3 BlockRGB[16], in float BlockA[16])
{
	float4 Texels[16];
	[unroll]
	for (int TexelIndex = 0; TexelIndex < 16; ++TexelIndex)
	{
		Texels[TexelIndex].rgb = LinearToSrgb(BlockRGB[TexelIndex]);
		Texels[TexelIndex].a = BlockA[TexelIndex];
	}
	return Compress_ASTC_RGBA_4x4(Texels);
}

// Converts the RGB input with RGB2YCoCg and compresses it as a 4x4 ASTC RGBA block.
// Channel packing: R = YCoCg.y, G = YCoCg.z, B = 0, A = YCoCg.x.
uint4 CompressBlock_ASTC_YCoCg(in float3 BlockRGB[16])
{
	float4 Texels[16];
	[unroll]
	for (int TexelIndex = 0; TexelIndex < 16; ++TexelIndex)
	{
		const float3 Converted = RGB2YCoCg(BlockRGB[TexelIndex]);
		Texels[TexelIndex] = float4(Converted.y, Converted.z, 0.0, Converted.x);
	}
	return Compress_ASTC_RGBA_4x4(Texels);
}