847 lines
24 KiB
HLSL
847 lines
24 KiB
HLSL
// Copyright Epic Games, Inc. All Rights Reserved.
|
|
|
|
/*=============================================================================
	BCCompressionCommon.ush:
	Helpers for compute shader BC texture compression

	todo[vt]: The first implementation keeps things simple. Lots of possible optimizations to do...
	* Pack float3 colors to uint earlier to reduce vector register pressure?
	* Calling code could build blocks with one color sample per thread (with a layout optimized for coalesced access)?
	* Code here could use wave ops for the block processing?
=============================================================================*/
|
|
|
|
#pragma once
|
|
|
|
#include "GammaCorrectionCommon.ush"
|
|
|
|
// A bit slower, but higher quality compression
|
|
#define LEAST_SQUARES_ENDPOINT_OPTIMIZATION 1
|
|
|
|
// Round X with the round() intrinsic and convert the result to an unsigned integer
uint RoundToUInt(float X)
{
	const float Nearest = round(X);
	return (uint)Nearest;
}
|
|
|
|
// Pack a float3 color into a 5:6:5 uint, rounding each channel to the nearest level
// The color is clamped to [0,1] first. Red is placed at bit 11, green at bit 5 and blue at bit 0
uint Float3ToUint565( in float3 Color )
{
	const float3 ChannelMax = float3(31.f, 63.f, 31.f);
	const float3 Quantized = round(saturate(Color) * ChannelMax);

	const uint RedBits = (uint)Quantized.r << 11;
	const uint GreenBits = (uint)Quantized.g << 5;
	const uint BlueBits = (uint)Quantized.b;

	return RedBits | GreenBits | BlueBits;
}
|
|
|
|
// Pack a float3 color into a 5:6:5 uint, rounding each channel up with ceil()
// Color is inout: on return it holds the quantized color converted back to float
uint Float3ToUint565_Ceil( inout float3 Color )
{
	const float3 ChannelMax = float3(31.f, 63.f, 31.f);
	const float3 Quantized = ceil(saturate(Color) * ChannelMax);

	// Hand the snapped color back to the caller
	Color = Quantized / ChannelMax;

	const uint RedBits = (uint)Quantized.r << 11;
	const uint GreenBits = (uint)Quantized.g << 5;
	const uint BlueBits = (uint)Quantized.b;

	return RedBits | GreenBits | BlueBits;
}
|
|
|
|
// Pack a float3 color into a 5:6:5 uint, rounding each channel down with floor()
// Color is inout: on return it holds the quantized color converted back to float
uint Float3ToUint565_Floor( inout float3 Color )
{
	const float3 ChannelMax = float3(31.f, 63.f, 31.f);
	const float3 Quantized = floor(saturate(Color) * ChannelMax);

	// Hand the snapped color back to the caller
	Color = Quantized / ChannelMax;

	const uint RedBits = (uint)Quantized.r << 11;
	const uint GreenBits = (uint)Quantized.g << 5;
	const uint BlueBits = (uint)Quantized.b;

	return RedBits | GreenBits | BlueBits;
}
|
|
|
|
// Find the smallest and largest value in a 16 texel single channel block
void GetMinMax( in float Block[16], out float OutMin, out float OutMax )
{
	float Lowest = Block[0];
	float Highest = Block[0];

	for (int TexelIndex = 1; TexelIndex < 16; ++TexelIndex)
	{
		const float Value = Block[TexelIndex];
		Lowest = min(Lowest, Value);
		Highest = max(Highest, Value);
	}

	OutMin = Lowest;
	OutMax = Highest;
}
|
|
|
|
// Find the smallest and largest value in each of two 16 texel single channel blocks
// Both blocks are scanned in the same loop; the results for Block0 and Block1 are independent of each other
void GetMinMax( in float Block0[16], in float Block1[16], out float OutMin0, out float OutMax0, out float OutMin1, out float OutMax1 )
{
	float Lowest0 = Block0[0];
	float Highest0 = Block0[0];
	float Lowest1 = Block1[0];
	float Highest1 = Block1[0];

	for (int TexelIndex = 1; TexelIndex < 16; ++TexelIndex)
	{
		const float Value0 = Block0[TexelIndex];
		const float Value1 = Block1[TexelIndex];

		Lowest0 = min(Lowest0, Value0);
		Highest0 = max(Highest0, Value0);
		Lowest1 = min(Lowest1, Value1);
		Highest1 = max(Highest1, Value1);
	}

	OutMin0 = Lowest0;
	OutMax0 = Highest0;
	OutMin1 = Lowest1;
	OutMax1 = Highest1;
}
|
|
|
|
// Find the per channel smallest and largest values in a 16 texel three channel block
// The two results are the corners of the block's bounding box, not necessarily texels of the block
void GetMinMax( in float3 Block[16], out float3 OutMin, out float3 OutMax )
{
	float3 Lowest = Block[0];
	float3 Highest = Block[0];

	for (int TexelIndex = 1; TexelIndex < 16; ++TexelIndex)
	{
		const float3 Value = Block[TexelIndex];
		Lowest = min(Lowest, Value);
		Highest = max(Highest, Value);
	}

	OutMin = Lowest;
	OutMax = Highest;
}
|
|
|
|
// Calculate the packed 2 bit indices for a 16 texel color block
// Each texel is projected onto the line from MaxColor to MinColor and snapped to one of four positions
// The squared length of (MinColor - MaxColor) is used as a divisor, so the two colors must differ
// Texel 0 ends up in the lowest two bits of the result and texel 15 in the highest two
uint GetPackedColorBlockIndices( in float3 Block[16], in float3 MinColor, in float3 MaxColor )
{
	// Set up the projection so that MaxColor maps to 0 and MinColor maps to 3
	const float3 Axis = MinColor - MaxColor;
	const float AxisScale = 3.f / dot(Axis, Axis);
	const float3 ProjectionAxis = Axis * AxisScale;
	const float ProjectionBias = (dot(MaxColor, MaxColor) - dot(MaxColor, MinColor)) * AxisScale;

	uint Result = 0;

	// Walk the texels backwards: each step shifts the earlier indices up by two bits
	for (int TexelIndex = 15; TexelIndex >= 0; --TexelIndex)
	{
		// Position of this texel along the max->min line
		uint Selector = RoundToUInt(dot(Block[TexelIndex], ProjectionAxis) + ProjectionBias);
		// Reorder into the BC index: 0 stays 0, 1 becomes 2, 2 becomes 3 and 3 becomes 1
		Selector += (Selector > 0) - (3 * (Selector == 3));

		Result = (Result << 2) | Selector;
	}

	return Result;
}
|
|
|
|
// Calculate the packed 3 bit indices for a 16 texel single channel (alpha style) block
// The indices are returned at their final position in the uint2: the low 16 bits of .x are left clear
// so that the caller can OR in the min and max alpha values
// (MinAlpha - MaxAlpha) is used as a divisor, so the two values must differ
uint2 GetPackedAlphaBlockIndices( in float Block[16], in float MinAlpha, in float MaxAlpha )
{
	// Set up the mapping so that MaxAlpha maps to 0 and MinAlpha maps to 7
	const float Scale = 7.f / (MinAlpha - MaxAlpha);
	const float Bias = -Scale * MaxAlpha;

	uint2 PackedIndices = 0;

	for (uint TexelIndex = 0; TexelIndex < 16; ++TexelIndex)
	{
		// Position of this texel along the max->min ramp
		uint Selector = RoundToUInt(Block[TexelIndex] * Scale + Bias);
		// Reorder into the BC index: 0 stays 0, 7 becomes 1 and everything in between moves up by one
		Selector += (Selector > 0) - (7 * (Selector == 7));

		if (TexelIndex < 5)
		{
			// Texels 0-4 fill bits 16-30 of the x component
			PackedIndices.x |= (Selector << (TexelIndex * 3 + 16));
		}
		else if (TexelIndex == 5)
		{
			// Texel 5 is split: its lowest bit goes to bit 31 of x, its upper two bits to bits 0-1 of y
			PackedIndices.x |= (Selector << 31);
			PackedIndices.y |= (Selector >> 1);
		}
		else
		{
			// Texels 6-15 fill bits 2-31 of the y component
			PackedIndices.y |= (Selector << (TexelIndex * 3 - 16));
		}
	}

	return PackedIndices;
}
|
|
|
|
// Compress a 16 texel color block to BC1
// Returns the two 565 end points in .x (max color in the low 16 bits, min color in the high 16 bits)
// and the packed 2 bit indices in .y
uint2 CompressBC1Block( in float3 Block[16] )
{
	float3 LowColor, HighColor;
	GetMinMax( Block, LowColor, HighColor );

	// Quantize outwards: floor the min and ceil the max
	// Both calls also replace the float color with its quantized value, which is what the indices are fitted against
	const uint LowColor565 = Float3ToUint565_Floor( LowColor );
	const uint HighColor565 = Float3ToUint565_Ceil( HighColor );

	const uint PackedEndPoints = (LowColor565 << 16) | HighColor565;

	// Without a strictly larger max end point there is no range to index into, so the indices stay 0
	if (!(LowColor565 < HighColor565))
	{
		return uint2(PackedEndPoints, 0);
	}

	return uint2(PackedEndPoints, GetPackedColorBlockIndices( Block, LowColor, HighColor ));
}
|
|
|
|
// Compress a 16 texel color block to BC1 for a texture that will be sampled as sRGB
// Input colors are expected to be linear. The end points are converted with LinearToSrgb before being quantized,
// while the indices are fitted against the unconverted min and max colors
// Returns the two 565 end points in .x (max color in the low 16 bits, min color in the high 16 bits)
// and the packed 2 bit indices in .y
uint2 CompressBC1Block_SRGB( in float3 Block[16] )
{
	float3 LowColor, HighColor;
	GetMinMax( Block, LowColor, HighColor );

	const uint LowColor565 = Float3ToUint565( LinearToSrgb(LowColor) );
	const uint HighColor565 = Float3ToUint565( LinearToSrgb(HighColor) );

	const uint PackedEndPoints = (LowColor565 << 16) | HighColor565;

	// Without a strictly larger max end point there is no range to index into, so the indices stay 0
	if (!(LowColor565 < HighColor565))
	{
		return uint2(PackedEndPoints, 0);
	}

	return uint2(PackedEndPoints, GetPackedColorBlockIndices( Block, LowColor, HighColor ));
}
|
|
|
|
// Compress a 16 texel color block plus a 16 texel alpha block to BC3
// Result layout: .xy hold the alpha part (max alpha in bits 0-7, min alpha in bits 8-15, then the 3 bit indices),
// .z holds the 565 end points (max color in the low 16 bits) and .w the 2 bit color indices
uint4 CompressBC3Block( in float3 BlockRGB[16], in float BlockA[16] )
{
	uint4 Result = 0;

	// Color part
	float3 LowColor, HighColor;
	GetMinMax( BlockRGB, LowColor, HighColor );

	// Quantize outwards; both calls also snap the float colors to their quantized values
	const uint LowColor565 = Float3ToUint565_Floor( LowColor );
	const uint HighColor565 = Float3ToUint565_Ceil( HighColor );

	Result.z = (LowColor565 << 16) | HighColor565;
	// Indices stay 0 unless the end points span a range
	if (LowColor565 < HighColor565)
	{
		Result.w = GetPackedColorBlockIndices( BlockRGB, LowColor, HighColor );
	}

	// Alpha part
	float LowAlpha, HighAlpha;
	GetMinMax( BlockA, LowAlpha, HighAlpha );

	const uint LowAlphaUint = RoundToUInt(LowAlpha * 255.f);
	const uint HighAlphaUint = RoundToUInt(HighAlpha * 255.f);

	uint2 AlphaIndices = 0;
	// Indices stay 0 unless the end points span a range
	if (LowAlphaUint < HighAlphaUint)
	{
		AlphaIndices = GetPackedAlphaBlockIndices(BlockA, LowAlpha, HighAlpha);
	}

	Result.x = (LowAlphaUint << 8) | HighAlphaUint | AlphaIndices.x;
	Result.y = AlphaIndices.y;

	return Result;
}
|
|
|
|
// Compress a 16 texel color block plus a 16 texel alpha block to BC3 for a texture that will be sampled as sRGB
// Input colors are expected to be linear. The color end points are converted with LinearToSrgb before being quantized,
// while the color indices are fitted against the unconverted min and max colors. Alpha is not converted
// Result layout: .xy hold the alpha part (max alpha in bits 0-7, min alpha in bits 8-15, then the 3 bit indices),
// .z holds the 565 end points (max color in the low 16 bits) and .w the 2 bit color indices
uint4 CompressBC3Block_SRGB( in float3 BlockRGB[16], in float BlockA[16] )
{
	uint4 Result = 0;

	// Color part
	float3 LowColor, HighColor;
	GetMinMax( BlockRGB, LowColor, HighColor );

	const uint LowColor565 = Float3ToUint565( LinearToSrgb( LowColor ) );
	const uint HighColor565 = Float3ToUint565( LinearToSrgb( HighColor ) );

	Result.z = (LowColor565 << 16) | HighColor565;
	// Indices stay 0 unless the end points span a range
	if (LowColor565 < HighColor565)
	{
		Result.w = GetPackedColorBlockIndices(BlockRGB, LowColor, HighColor);
	}

	// Alpha part
	float LowAlpha, HighAlpha;
	GetMinMax( BlockA, LowAlpha, HighAlpha );

	const uint LowAlphaUint = RoundToUInt(LowAlpha * 255.f);
	const uint HighAlphaUint = RoundToUInt(HighAlpha * 255.f);

	uint2 AlphaIndices = 0;
	// Indices stay 0 unless the end points span a range
	if (LowAlphaUint < HighAlphaUint)
	{
		AlphaIndices = GetPackedAlphaBlockIndices(BlockA, LowAlpha, HighAlpha);
	}

	Result.x = (LowAlphaUint << 8) | HighAlphaUint | AlphaIndices.x;
	Result.y = AlphaIndices.y;

	return Result;
}
|
|
|
|
// Compress a 16 texel single channel block to BC4
// Result layout: max value in bits 0-7 of .x, min value in bits 8-15 of .x, followed by the 3 bit indices
uint2 CompressBC4Block( in float Block[16] )
{
	float LowValue, HighValue;
	GetMinMax( Block, LowValue, HighValue );

	const uint LowValueUint = RoundToUInt(LowValue * 255.f);
	const uint HighValueUint = RoundToUInt(HighValue * 255.f);

	const uint PackedEndPoints = (LowValueUint << 8) | HighValueUint;

	// Without a strictly larger max end point there is no range to index into, so the indices stay 0
	if (!(LowValueUint < HighValueUint))
	{
		return uint2(PackedEndPoints, 0);
	}

	const uint2 Indices = GetPackedAlphaBlockIndices( Block, LowValue, HighValue );
	return uint2(PackedEndPoints | Indices.x, Indices.y);
}
|
|
|
|
// Compress two 16 texel single channel blocks to BC5
// Result layout: .xy hold the compressed U channel and .zw the compressed V channel
// Each half has its max value in bits 0-7, its min value in bits 8-15 and then the 3 bit indices
uint4 CompressBC5Block( in float BlockU[16], in float BlockV[16] )
{
	float LowU, HighU, LowV, HighV;
	GetMinMax( BlockU, BlockV, LowU, HighU, LowV, HighV );

	uint4 Result = 0;

	// U channel
	const uint LowUUint = RoundToUInt(LowU * 255.f);
	const uint HighUUint = RoundToUInt(HighU * 255.f);

	// V channel
	const uint LowVUint = RoundToUInt(LowV * 255.f);
	const uint HighVUint = RoundToUInt(HighV * 255.f);

	uint2 IndicesU = 0;
	// Indices stay 0 unless the end points span a range
	if (LowUUint < HighUUint)
	{
		IndicesU = GetPackedAlphaBlockIndices( BlockU, LowU, HighU );
	}

	uint2 IndicesV = 0;
	// Indices stay 0 unless the end points span a range
	if (LowVUint < HighVUint)
	{
		IndicesV = GetPackedAlphaBlockIndices( BlockV, LowV, HighV );
	}

	Result.x = (LowUUint << 8) | HighUUint | IndicesU.x;
	Result.y = IndicesU.y;
	Result.z = (LowVUint << 8) | HighVUint | IndicesV.x;
	Result.w = IndicesV.y;

	return Result;
}
|
|
|
|
// Convert an RGB color to YCoCg
// Returns float3(Y, Co, Cg). Co and Cg are offset by 128/255 so that a zero chroma value sits at 128/255
float3 RGB2YCoCg(float3 RGB)
{
	const float ChromaOffset = 128.f / 255.f;

	float3 YCoCg;
	YCoCg.x = (RGB.r + 2.f * RGB.g + RGB.b) * 0.25f;
	YCoCg.y = ((2.f * RGB.r - 2.f * RGB.b) * 0.25f + ChromaOffset);
	YCoCg.z = ((-RGB.r + 2.f * RGB.g - RGB.b) * 0.25f + ChromaOffset);
	return YCoCg;
}
|
|
|
|
// Get a single scale factor to use for a YCoCg color block
// This increases precision at the expense of potential blending artifacts across blocks
// Returns float2(Scale, 1 / Scale): (4, 0.25) when every CoCg end point component is within 32/255 of 128/255,
// (2, 0.5) when within 64/255, and (1, 1) otherwise
float2 GetYCoCgScale(float2 MinCoCg, float2 MaxCoCg)
{
	const float ChromaOffset = 128.f / 255.f;

	// Distance of each end point component from the chroma offset
	const float2 MinExtent = abs(MinCoCg - ChromaOffset);
	const float2 MaxExtent = abs(MaxCoCg - ChromaOffset);

	const float LargestExtent = max(max(MinExtent.x, MinExtent.y), max(MaxExtent.x, MaxExtent.y));

	if (LargestExtent < 32.f / 255.f)
	{
		return float2(4.f, 0.25f);
	}
	if (LargestExtent < 64.f / 255.f)
	{
		return float2(2.f, 0.5f);
	}
	return float2(1.f, 1.f);
}
|
|
|
|
// Scale both CoCg end points about the 128/255 chroma offset
// Each end point is moved so that its distance from 128/255 is multiplied by Scale
void ApplyYCoCgScale(inout float2 MinCoCg, inout float2 MaxCoCg, float Scale)
{
	const float2 MinOffset = MinCoCg - 128.f / 255.f;
	const float2 MaxOffset = MaxCoCg - 128.f / 255.f;

	MinCoCg = MinOffset * Scale + 128.f / 255.f;
	MaxCoCg = MaxOffset * Scale + 128.f / 255.f;
}
|
|
|
|
// Inset the CoCg bounding end points
// Each end point is moved towards the other by (Max - Min - 8/255) / 16 per component and the results are clamped to [0,1]
// NOTE(review): CompressBC3BlockYCoCg calls AdjustMinMaxDiagonalYCoCg first, which can swap the .y components so that
// MaxCoGg.y < MinCoCg.y. In that case (Max - Min) is negative in .y and the 8/255 term adds to the movement instead of
// reducing it - confirm this is intended
void InsetCoCgEndPoints(inout float2 MinCoCg, inout float2 MaxCoGg)
{
	const float2 Extent = MaxCoGg - MinCoCg;
	const float2 InsetAmount = (Extent - (8.f / 255.f)) / 16.f;

	MinCoCg = saturate(MinCoCg + InsetAmount);
	MaxCoGg = saturate(MaxCoGg - InsetAmount);
}
|
|
|
|
// Inset the luminance end points
// Each end point is moved towards the other by (MaxY - MinY - 16/255) / 32 and the results are clamped to [0,1]
// When the range is smaller than 16/255 the amount is negative, so the end points move apart instead
void InsetLumaEndPoints(inout float MinY, inout float MaxY)
{
	const float Extent = MaxY - MinY;
	const float InsetAmount = (Extent - (16.f / 255.f)) / 32.f;

	MinY = saturate(MinY + InsetAmount);
	MaxY = saturate(MaxY - InsetAmount);
}
|
|
|
|
// Select the 2 min/max end points from the CoCg bounding rectangle based on the block contents
// Block is read as float3(Y, Co, Cg) texels; only the .yz components are used
// If the texels' Co and Cg offsets from the rectangle center have a negative summed product, the .y components
// of the two end points are exchanged so that the end points lie on the rectangle's other diagonal
void AdjustMinMaxDiagonalYCoCg(const float3 Block[16], inout float2 MinCoCg, inout float2 MaxCoGg)
{
	const float2 Center = (MaxCoGg + MinCoCg) * 0.5;

	// Sum of (Co offset * Cg offset) over the block
	float Covariance = 0.f;
	for (int TexelIndex = 0; TexelIndex < 16; TexelIndex++)
	{
		const float2 Offset = Block[TexelIndex].yz - Center;
		Covariance += Offset.x * Offset.y;
	}

	if (Covariance < 0.f)
	{
		const float PreviousMaxY = MaxCoGg.y;
		MaxCoGg.y = MinCoCg.y;
		MinCoCg.y = PreviousMaxY;
	}
}
|
|
|
|
// Pack a CoCg pair into the upper two fields of a 5:6:5 uint, rounding to the nearest level
// Co goes to the 5 bit field at bit 11 and Cg to the 6 bit field at bit 5; the low 5 bits are left as 0
// CoCg is inout: on return it holds the quantized value converted back to float
uint CoCgToUint565(inout float2 CoCg)
{
	const float2 ChannelMax = float2(31.f, 63.f);
	const float2 Quantized = round(saturate(CoCg) * ChannelMax);

	// Hand the snapped value back to the caller
	CoCg = Quantized / ChannelMax;

	const uint CoBits = (uint)Quantized.r << 11;
	const uint CgBits = (uint)Quantized.g << 5;

	return CoBits | CgBits;
}
|
|
|
|
// Calculate the final packed indices for the CoCg part of a color block
// Block is read as float3(Y, Co, Cg) texels; only the .yz components are used
// Each texel is projected onto the line from MaxCoCg to MinCoCg and snapped to one of four positions
// Texel 0 ends up in the lowest two bits of the result and texel 15 in the highest two
// If MinCoCg equals MaxCoCg every index is 0
uint GetPackedCoCgBlockIndices(in float3 Block[16], in float2 MinCoCg, in float2 MaxCoCg)
{
	uint PackedIndices = 0;

	// Project onto max->min color vector and segment into range [0,3]
	float2 Range = MinCoCg - MaxCoCg;
	float RangeLengthSq = dot(Range, Range);

	// The end points can coincide: CompressBC3BlockYCoCg quantizes them with CoCgToUint565 before calling and does not
	// check that they still differ. Dividing by a zero length would turn every projection into NaN and leave the index
	// up to the target's NaN to uint conversion. A zero scale makes every texel select index 0 instead
	float Scale = (RangeLengthSq > 0.f) ? (3.f / RangeLengthSq) : 0.f;
	float2 ScaledRange = Range * Scale;
	float Bias = (dot(MaxCoCg, MaxCoCg) - dot(MaxCoCg, MinCoCg)) * Scale;

	// Walk the texels backwards: each step shifts the earlier indices up by two bits
	for (int i = 15; i >= 0; --i)
	{
		// Compute the distance index for this element
		uint Index = RoundToUInt(dot(Block[i].yz, ScaledRange) + Bias);
		// Convert distance index into the BC index: 0 stays 0, 1 becomes 2, 2 becomes 3 and 3 becomes 1
		Index += (Index > 0) - (3 * (Index == 3));
		// OR into the final PackedIndices
		PackedIndices = (PackedIndices << 2) | Index;
	}

	return PackedIndices;
}
|
|
|
|
// Calculate the final packed indices for the Luma part of a color block
// Block is read as float3(Y, Co, Cg) texels; only the .x component is used
// The 3 bit indices are returned at their final position in the uint2: the low 16 bits of .x are left clear
// so that the caller can OR in the min and max luma values
// (MinAlpha - MaxAlpha) is used as a divisor, so the two values must differ
uint2 GetPackedLumaBlockIndices(in float3 Block[16], in float MinAlpha, in float MaxAlpha)
{
	// Set up the mapping so that MaxAlpha maps to 0 and MinAlpha maps to 7
	const float Scale = 7.f / (MinAlpha - MaxAlpha);
	const float Bias = -Scale * MaxAlpha;

	uint2 PackedIndices = 0;

	for (uint TexelIndex = 0; TexelIndex < 16; ++TexelIndex)
	{
		// Position of this texel along the max->min ramp
		uint Selector = RoundToUInt(Block[TexelIndex].x * Scale + Bias);
		// Reorder into the BC index: 0 stays 0, 7 becomes 1 and everything in between moves up by one
		Selector += (Selector > 0) - (7 * (Selector == 7));

		if (TexelIndex < 5)
		{
			// Texels 0-4 fill bits 16-30 of the x component
			PackedIndices.x |= (Selector << (TexelIndex * 3 + 16));
		}
		else if (TexelIndex == 5)
		{
			// Texel 5 is split: its lowest bit goes to bit 31 of x, its upper two bits to bits 0-1 of y
			PackedIndices.x |= (Selector << 31);
			PackedIndices.y |= (Selector >> 1);
		}
		else
		{
			// Texels 6-15 fill bits 2-31 of the y component
			PackedIndices.y |= (Selector << (TexelIndex * 3 - 16));
		}
	}

	return PackedIndices;
}
|
|
|
|
// Convert a linear RGB block to YCoCg and compress a BC3 block
// Luma is stored in the alpha part of the block (.xy of the result) and CoCg in the color part (.zw)
// The low bits of both 565 end points carry (scale - 1), where scale is the CoCg scale chosen by GetYCoCgScale
uint4 CompressBC3BlockYCoCg(in float3 Block[16])
{
	// Convert the texels to YCoCg. Block is an 'in' parameter, so only this function's copy changes
	for (int TexelIndex = 0; TexelIndex < 16; ++TexelIndex)
	{
		Block[TexelIndex] = RGB2YCoCg(Block[TexelIndex]);
	}

	float3 BoundsMin, BoundsMax;
	GetMinMax(Block, BoundsMin, BoundsMax);

	// Luma and chroma are processed independently from here on
	float MinLuma = BoundsMin.x;
	float MaxLuma = BoundsMax.x;
	float2 MinCoCg = BoundsMin.yz;
	float2 MaxCoCg = BoundsMax.yz;

	// Chroma: pick the diagonal, scale up small ranges, inset and quantize the end points
	AdjustMinMaxDiagonalYCoCg(Block, MinCoCg, MaxCoCg);

	const float2 Scale = GetYCoCgScale(MinCoCg, MaxCoCg);
	ApplyYCoCgScale(MinCoCg, MaxCoCg, Scale.x);
	InsetCoCgEndPoints(MinCoCg, MaxCoCg);

	// CoCgToUint565 leaves the low 5 bits clear, which is where (scale - 1) is stored
	const uint ScaleBits = (uint)Scale.x - 1;
	const uint MinColor565 = CoCgToUint565(MinCoCg) | ScaleBits;
	const uint MaxColor565 = CoCgToUint565(MaxCoCg) | ScaleBits;

	// Undo the scale on the quantized end points so the indices are fitted against the unscaled texels
	ApplyYCoCgScale(MinCoCg, MaxCoCg, Scale.y);
	const uint CoCgIndices = GetPackedCoCgBlockIndices(Block, MinCoCg, MaxCoCg);

	// Luma: inset and quantize the end points, then fit the indices against the inset (unquantized) values
	InsetLumaEndPoints(MinLuma, MaxLuma);

	const uint MinLumaUint = RoundToUInt(MinLuma * 255.f);
	const uint MaxLumaUint = RoundToUInt(MaxLuma * 255.f);

	const uint2 LumaIndices = GetPackedLumaBlockIndices(Block, MinLuma, MaxLuma);

	uint4 Result;
	Result.x = (MinLumaUint << 8) | MaxLumaUint | LumaIndices.x;
	Result.y = LumaIndices.y;
	Result.z = (MinColor565 << 16) | MaxColor565;
	Result.w = CoCgIndices;
	return Result;
}
|
|
|
|
// Convert each component to its half float bit pattern and rescale that integer by 1024 / (0x7bff + 1)
// The result is returned as float; callers truncate it to an integer when packing
float3 Quantize10(float3 X)
{
	// f32tof16 returns the half float bits as unsigned integers; they are converted to float for the rescale
	const float3 HalfBits = f32tof16(X);
	return (HalfBits * 1024.0f) / (0x7bff + 1.0f);
}
|
|
|
|
// Compute the 4 bit index (0-15) of a color between two end points
// The color is projected onto BlockVector and the projection is converted to its half float bit pattern;
// EndPoint0Pos and EndPoint1Pos are expected in that same representation
// A position at EndPoint0Pos gives index 0 and a position at EndPoint1Pos gives index 15
uint ComputeIndexBC6HIndex(float3 Color, float3 BlockVector, float EndPoint0Pos, float EndPoint1Pos)
{
	const float Projection = dot(Color, BlockVector);
	const float Pos = (float)f32tof16(Projection);

	// Fraction of the way from end point 0 to end point 1, clamped to [0,1]
	const float Span = EndPoint1Pos - EndPoint0Pos;
	const float NormalizedPos = saturate((Pos - EndPoint0Pos) / Span);

	const float ScaledPos = NormalizedPos * 14.93333f + 0.03333f + 0.5f;
	return (uint)clamp(ScaledPos, 0.0f, 15.0f);
}
|
|
|
|
// Compress a BC6H block. Evaluates only mode 11 for performance
// The end points are the per channel min and max of the block, quantized with Quantize10
// Every texel gets a 4 bit index, except texel 0 which is stored in 3 bits; the end points are swapped when needed
// so that texel 0's index fits
// NOTE(review): when all texels are identical the BlockVector normalization divides by zero; the result then depends
// on how the target treats non-finite values - confirm this case is handled as intended
uint4 CompressBC6HBlock(in float3 BlockRGB[16])
{
	// Compute initial endpoints as the bounding box of the block
	float3 BlockMin = BlockRGB[0];
	float3 BlockMax = BlockRGB[0];
	{
		for (uint BoundsTexel = 1; BoundsTexel < 16; ++BoundsTexel)
		{
			const float3 Texel = BlockRGB[BoundsTexel];
			BlockMin = min(BlockMin, Texel);
			BlockMax = max(BlockMax, Texel);
		}
	}

	// Bounding box diagonal, scaled so that its components sum to one
	float3 BlockVector = BlockMax - BlockMin;
	BlockVector = BlockVector / (BlockVector.x + BlockVector.y + BlockVector.z);

	float3 Endpoint0 = Quantize10(BlockMin);
	float3 Endpoint1 = Quantize10(BlockMax);
	float EndPoint0Pos = (float)f32tof16(dot(BlockMin, BlockVector));
	float EndPoint1Pos = (float)f32tof16(dot(BlockMax, BlockVector));

	// Texel 0 only has 3 index bits. If its index would need the 4th bit, swap the end points
	const uint FixupIndex = ComputeIndexBC6HIndex(BlockRGB[0], BlockVector, EndPoint0Pos, EndPoint1Pos);
	if (FixupIndex > 7)
	{
		Swap(EndPoint0Pos, EndPoint1Pos);
		Swap(Endpoint0, Endpoint1);
	}

	// Truncate the end points to integers for packing
	const uint3 Packed0 = (uint3)Endpoint0;
	const uint3 Packed1 = (uint3)Endpoint1;

	uint4 Block = 0;

	// Mode bits followed by six 10 bit end point components, laid out back to back starting at bit 5
	Block.x = 0x03 | (Packed0.x << 5) | (Packed0.y << 15) | (Packed0.z << 25);
	Block.y = (Packed0.z >> 7) | (Packed1.x << 3) | (Packed1.y << 13) | (Packed1.z << 23);
	Block.z = (Packed1.z >> 9);

	// Compute and encode indices
	{
		for (uint TexelIndex = 0; TexelIndex < 16; ++TexelIndex)
		{
			const uint Index = ComputeIndexBC6HIndex(BlockRGB[TexelIndex], BlockVector, EndPoint0Pos, EndPoint1Pos);

			if (TexelIndex == 0)
			{
				// Texel 0 starts at bit 1 of z, right after the last end point bit
				Block.z |= Index << 1;
			}
			else if (TexelIndex < 8)
			{
				// Texels 1-7 sit at bits 4, 8, ... 28 of z
				Block.z |= Index << (TexelIndex * 4);
			}
			else
			{
				// Texels 8-15 sit at bits 0, 4, ... 28 of w
				Block.w |= Index << ((TexelIndex - 8) * 4);
			}
		}
	}

	return Block;
}
|
|
|
|
// Quantize each component to 7 bits: scale by 255, truncate to an integer and drop the lowest bit
uint3 Quantize7(float3 X)
{
	const uint3 EightBit = uint3(X * 0xFF);
	return EightBit >> 1;
}
|
|
|
|
// Compute the 4 bit index (0-15) of a color between two end points
// The color is projected onto BlockVector; EndPoint0Pos and EndPoint1Pos are the projections of the two end points
// A position at EndPoint0Pos gives index 0 and a position at EndPoint1Pos gives index 15
uint ComputeBC7Index(float3 Color, float3 BlockVector, float EndPoint0Pos, float EndPoint1Pos)
{
	const float Pos = dot(Color, BlockVector);

	// Fraction of the way from end point 0 to end point 1, clamped to [0,1]
	const float Span = EndPoint1Pos - EndPoint0Pos;
	const float NormalizedPos = saturate((Pos - EndPoint0Pos) / Span);

	const float ScaledPos = NormalizedPos * 14.93333f + 0.03333f + 0.5f;
	return (uint)clamp(ScaledPos, 0.0f, 15.0f);
}
|
|
|
|
// Least squares optimization to find best endpoints for the selected block indices in a mode 6 BC7 block
// The indices are first chosen against the incoming BlockMin/BlockMax. Each texel is then modelled as
// Alpha * BlockMin + Beta * BlockMax with Beta = Index / 15 and Alpha = 1 - Beta, and the 2x2 system is solved
// for the end points. BlockMin and BlockMax are left untouched when the determinant is too small
void OptimizeEndpointsBC7(float3 Texels[16], inout float3 BlockMin, inout float3 BlockMax)
{
	const float3 BlockVector = BlockMax - BlockMin;
	const float EndPoint0Pos = dot(BlockMin, BlockVector);
	const float EndPoint1Pos = dot(BlockMax, BlockVector);

	// Accumulated terms of the normal equations
	float3 AlphaTexelSum = 0.0f;
	float3 BetaTexelSum = 0.0f;
	float AlphaBetaSum = 0.0f;
	float AlphaSqSum = 0.0f;
	float BetaSqSum = 0.0f;

	for (uint TexelIndex = 0; TexelIndex < 16; ++TexelIndex)
	{
		const float3 Texel = Texels[TexelIndex];
		const uint Index = ComputeBC7Index(Texel, BlockVector, EndPoint0Pos, EndPoint1Pos);

		// Interpolation weights implied by the index
		const float Beta = saturate(Index / 15.0f);
		const float Alpha = 1.0f - Beta;

		AlphaTexelSum += Alpha * Texel;
		BetaTexelSum += Beta * Texel;

		AlphaBetaSum += Alpha * Beta;

		AlphaSqSum += Alpha * Alpha;
		BetaSqSum += Beta * Beta;
	}

	const float Det = AlphaSqSum * BetaSqSum - AlphaBetaSum * AlphaBetaSum;

	// Keep the incoming end points when the system is (close to) singular
	if (!(abs(Det) > 0.1f))
	{
		return;
	}

	const float RcpDet = rcp(Det);
	BlockMin = saturate(RcpDet * (AlphaTexelSum * BetaSqSum - BetaTexelSum * AlphaBetaSum));
	BlockMax = saturate(RcpDet * (BetaTexelSum * AlphaSqSum - AlphaTexelSum * AlphaBetaSum));
}
|
|
|
|
// Compress a BC7 color only block. Evaluates only mode 6 for performance
// The end points start as the per channel min and max of the block and are optionally refined by
// OptimizeEndpointsBC7 (when LEAST_SQUARES_ENDPOINT_OPTIMIZATION is enabled), then quantized with Quantize7
// Every texel gets a 4 bit index, except texel 0 which is stored in 3 bits; the end points are swapped when needed
// so that texel 0's index fits
// No alpha end point or p-bit values are written; those bits of the block stay 0
// NOTE(review): a decoder would presumably read alpha as 0 from such a block - confirm that consumers ignore alpha
uint4 CompressBC7Block(in float3 BlockRGB[16])
{
	// Compute initial endpoints as the bounding box of the block
	float3 BlockMin = BlockRGB[0];
	float3 BlockMax = BlockRGB[0];
	{
		for (uint BoundsTexel = 1; BoundsTexel < 16; ++BoundsTexel)
		{
			const float3 Texel = BlockRGB[BoundsTexel];
			BlockMin = min(BlockMin, Texel);
			BlockMax = max(BlockMax, Texel);
		}
	}

#if LEAST_SQUARES_ENDPOINT_OPTIMIZATION
	{
		OptimizeEndpointsBC7(BlockRGB, BlockMin, BlockMax);
	}
#endif

	const float3 BlockVector = BlockMax - BlockMin;

	uint3 Endpoint0 = Quantize7(BlockMin);
	uint3 Endpoint1 = Quantize7(BlockMax);
	float EndPoint0Pos = dot(BlockMin, BlockVector);
	float EndPoint1Pos = dot(BlockMax, BlockVector);

	// Texel 0 only has 3 index bits. If its index would need the 4th bit, swap the end points
	const uint FixupIndex = ComputeBC7Index(BlockRGB[0], BlockVector, EndPoint0Pos, EndPoint1Pos);
	if (FixupIndex > 7)
	{
		Swap(EndPoint0Pos, EndPoint1Pos);
		Swap(Endpoint0, Endpoint1);
	}

	uint4 Block = 0;

	// Mode bit followed by the 7 bit end point components, interleaved per channel starting at bit 7
	Block.x = 0x40 | (Endpoint0.x << 7) | (Endpoint1.x << 14) | (Endpoint0.y << 21) | (Endpoint1.y << 28);
	Block.y = (Endpoint1.y >> 4) | (Endpoint0.z << 3) | (Endpoint1.z << 10);

	// Compute and encode indices
	{
		for (uint TexelIndex = 0; TexelIndex < 16; ++TexelIndex)
		{
			const uint Index = ComputeBC7Index(BlockRGB[TexelIndex], BlockVector, EndPoint0Pos, EndPoint1Pos);

			if (TexelIndex == 0)
			{
				// Texel 0 starts at bit 1 of z
				Block.z |= Index << 1;
			}
			else if (TexelIndex < 8)
			{
				// Texels 1-7 sit at bits 4, 8, ... 28 of z
				Block.z |= Index << (TexelIndex * 4);
			}
			else
			{
				// Texels 8-15 sit at bits 0, 4, ... 28 of w
				Block.w |= Index << ((TexelIndex - 8) * 4);
			}
		}
	}

	return Block;
}
|
|
|
|
// Quantize each component to 7 bits: scale by 255, truncate to an integer and drop the lowest bit
uint4 Quantize7A(float4 X)
{
	const uint4 EightBit = uint4(X * 0xFF);
	return EightBit >> 1;
}
|
|
|
|
// Compute the 4 bit index (0-15) of an RGBA color between two end points
// The color is projected onto BlockVector; EndPoint0Pos and EndPoint1Pos are the projections of the two end points
// A position at EndPoint0Pos gives index 0 and a position at EndPoint1Pos gives index 15
uint ComputeBC7AIndex(float4 Color, float4 BlockVector, float EndPoint0Pos, float EndPoint1Pos)
{
	const float Pos = dot(Color, BlockVector);

	// Fraction of the way from end point 0 to end point 1, clamped to [0,1]
	const float Span = EndPoint1Pos - EndPoint0Pos;
	const float NormalizedPos = saturate((Pos - EndPoint0Pos) / Span);

	const float ScaledPos = NormalizedPos * 14.93333f + 0.03333f + 0.5f;
	return (uint)clamp(ScaledPos, 0.0f, 15.0f);
}
|
|
|
|
// Least squares optimization to find best endpoints for the selected block indices in a mode 6 BC7 block (RGBA version)
// The indices are first chosen against the incoming BlockMin/BlockMax. Each texel is then modelled as
// Alpha * BlockMin + Beta * BlockMax with Beta = Index / 15 and Alpha = 1 - Beta, and the 2x2 system is solved
// for the end points. BlockMin and BlockMax are left untouched when the determinant is too small
void OptimizeEndpointsBC7A(float4 Texels[16], inout float4 BlockMin, inout float4 BlockMax)
{
	const float4 BlockVector = BlockMax - BlockMin;
	const float EndPoint0Pos = dot(BlockMin, BlockVector);
	const float EndPoint1Pos = dot(BlockMax, BlockVector);

	// Accumulated terms of the normal equations
	float4 AlphaTexelSum = 0.0f;
	float4 BetaTexelSum = 0.0f;
	float AlphaBetaSum = 0.0f;
	float AlphaSqSum = 0.0f;
	float BetaSqSum = 0.0f;

	for (uint TexelIndex = 0; TexelIndex < 16; ++TexelIndex)
	{
		const float4 Texel = Texels[TexelIndex];
		const uint Index = ComputeBC7AIndex(Texel, BlockVector, EndPoint0Pos, EndPoint1Pos);

		// Interpolation weights implied by the index
		const float Beta = saturate(Index / 15.0f);
		const float Alpha = 1.0f - Beta;

		AlphaTexelSum += Alpha * Texel;
		BetaTexelSum += Beta * Texel;

		AlphaBetaSum += Alpha * Beta;

		AlphaSqSum += Alpha * Alpha;
		BetaSqSum += Beta * Beta;
	}

	const float Det = AlphaSqSum * BetaSqSum - AlphaBetaSum * AlphaBetaSum;

	// Keep the incoming end points when the system is (close to) singular
	if (!(abs(Det) > 0.1f))
	{
		return;
	}

	const float RcpDet = rcp(Det);
	BlockMin = saturate(RcpDet * (AlphaTexelSum * BetaSqSum - BetaTexelSum * AlphaBetaSum));
	BlockMax = saturate(RcpDet * (BetaTexelSum * AlphaSqSum - AlphaTexelSum * AlphaBetaSum));
}
|
|
|
|
// Compress a BC7 color and alpha (RGBA) block. Evaluates only mode 6 for performance
// The end points start as the per channel min and max of the block and are optionally refined by
// OptimizeEndpointsBC7A (when LEAST_SQUARES_ENDPOINT_OPTIMIZATION is enabled), then quantized with Quantize7A
// Every texel gets a 4 bit index, except texel 0 which is stored in 3 bits; the end points are swapped when needed
// so that texel 0's index fits
// No p-bit values are written; those bits of the block stay 0
uint4 CompressBC7ABlock(in float4 BlockRGBA[16])
{
	// Compute initial endpoints as the bounding box of the block
	float4 BlockMin = BlockRGBA[0];
	float4 BlockMax = BlockRGBA[0];
	{
		for (uint BoundsTexel = 1; BoundsTexel < 16; ++BoundsTexel)
		{
			const float4 Texel = BlockRGBA[BoundsTexel];
			BlockMin = min(BlockMin, Texel);
			BlockMax = max(BlockMax, Texel);
		}
	}

#if LEAST_SQUARES_ENDPOINT_OPTIMIZATION
	{
		OptimizeEndpointsBC7A(BlockRGBA, BlockMin, BlockMax);
	}
#endif

	const float4 BlockVector = BlockMax - BlockMin;

	uint4 Endpoint0 = Quantize7A(BlockMin);
	uint4 Endpoint1 = Quantize7A(BlockMax);
	float EndPoint0Pos = dot(BlockMin, BlockVector);
	float EndPoint1Pos = dot(BlockMax, BlockVector);

	// Texel 0 only has 3 index bits. If its index would need the 4th bit, swap the end points
	const uint FixupIndex = ComputeBC7AIndex(BlockRGBA[0], BlockVector, EndPoint0Pos, EndPoint1Pos);
	if (FixupIndex > 7)
	{
		Swap(EndPoint0Pos, EndPoint1Pos);
		Swap(Endpoint0, Endpoint1);
	}

	uint4 Block = 0;

	// Mode bit followed by the 7 bit end point components, interleaved per channel starting at bit 7
	Block.x = 0x40 | (Endpoint0.x << 7) | (Endpoint1.x << 14) | (Endpoint0.y << 21) | (Endpoint1.y << 28);
	Block.y = (Endpoint1.y >> 4) | (Endpoint0.z << 3) | (Endpoint1.z << 10) | (Endpoint0.w << 17) | (Endpoint1.w << 24);

	// Compute and encode indices
	{
		for (uint TexelIndex = 0; TexelIndex < 16; ++TexelIndex)
		{
			const uint Index = ComputeBC7AIndex(BlockRGBA[TexelIndex], BlockVector, EndPoint0Pos, EndPoint1Pos);

			if (TexelIndex == 0)
			{
				// Texel 0 starts at bit 1 of z
				Block.z |= Index << 1;
			}
			else if (TexelIndex < 8)
			{
				// Texels 1-7 sit at bits 4, 8, ... 28 of z
				Block.z |= Index << (TexelIndex * 4);
			}
			else
			{
				// Texels 8-15 sit at bits 0, 4, ... 28 of w
				Block.w |= Index << ((TexelIndex - 8) * 4);
			}
		}
	}

	return Block;
}
|