// Copyright Epic Games, Inc. All Rights Reserved.

/*=============================================================================
	BCCompressionCommon.ush: Helpers for compute shader BC texture compression

	todo[vt]: First implementation is keeping it simple. Lots of possible optimizations to do...
	* Pack float3 colors to uint earlier to reduce vector register pressure?
	* Calling code can build blocks with one color sample per thread (with layout optimized for coalesce)?
	* Code here could use wave ops for the block processing?
=============================================================================*/

#pragma once

#include "GammaCorrectionCommon.ush"

// A bit slower, but higher quality compression.
// Only defined here when the including shader has not already chosen a value.
#ifndef LEAST_SQUARES_ENDPOINT_OPTIMIZATION
#define LEAST_SQUARES_ENDPOINT_OPTIMIZATION 1
#endif

// Apply the round() intrinsic to X and convert the result to uint
uint RoundToUInt(float X)
{
	return (uint)round(X);
}

// Pack a color that is already scaled to (31, 63, 31) into a 565 uint:
// r goes to bits 11 and up, g to bits 5 and up, b to the low bits.
uint PackScaledColorToUint565( in float3 ColorScaled )
{
	return ((uint)ColorScaled.r << 11) | ((uint)ColorScaled.g << 5) | (uint)ColorScaled.b;
}

// Simple convert float3 color to 565 uint using 'round' arithmetic
// The input is saturated before scaling
uint Float3ToUint565( in float3 Color )
{
	const float3 Scale = float3(31.f, 63.f, 31.f);
	return PackScaledColorToUint565( round(saturate(Color) * Scale) );
}

// Convert float3 color to 565 uint using 'ceil' arithmetic
// Color parameter is inout and is modified to match the converted value
uint Float3ToUint565_Ceil( inout float3 Color )
{
	const float3 Scale = float3(31.f, 63.f, 31.f);
	const float3 ColorScaled = ceil(saturate(Color) * Scale);

	// Hand the quantized color back so the caller works with the value that was actually stored
	Color = ColorScaled / Scale;
	return PackScaledColorToUint565( ColorScaled );
}

// Convert float3 color to 565 uint using 'floor' arithmetic
// Color parameter is inout and is modified to match the converted value
uint Float3ToUint565_Floor( inout float3 Color )
{
	const float3 Scale = float3(31.f, 63.f, 31.f);
	const float3 ColorScaled = floor(saturate(Color) * Scale);

	// Hand the quantized color back so the caller works with the value that was actually stored
	Color = ColorScaled / Scale;
	return PackScaledColorToUint565( ColorScaled );
}

// Get min
// and max values in a single channel block
void GetMinMax( in float Block[16], out float OutMin, out float OutMax )
{
	float Lowest = Block[0];
	float Highest = Block[0];
	for (int Texel = 1; Texel < 16; ++Texel)
	{
		const float Value = Block[Texel];
		Lowest = min(Lowest, Value);
		Highest = max(Highest, Value);
	}
	OutMin = Lowest;
	OutMax = Highest;
}

// Get min and max values in two single channel blocks
// Each block is reduced independently of the other
void GetMinMax( in float Block0[16], in float Block1[16], out float OutMin0, out float OutMax0, out float OutMin1, out float OutMax1 )
{
	GetMinMax( Block0, OutMin0, OutMax0 );
	GetMinMax( Block1, OutMin1, OutMax1 );
}

// Get min and max values in a single three channel block
// The reduction is per channel, so the results need not be colors that occur in the block
void GetMinMax( in float3 Block[16], out float3 OutMin, out float3 OutMax )
{
	float3 Lowest = Block[0];
	float3 Highest = Block[0];
	for (int Texel = 1; Texel < 16; ++Texel)
	{
		const float3 Value = Block[Texel];
		Lowest = min(Lowest, Value);
		Highest = max(Highest, Value);
	}
	OutMin = Lowest;
	OutMax = Highest;
}

// Calculate the final packed indices for a color block
// Texel 0 ends up in the lowest 2 bits of the result, texel 15 in the highest.
// Divides by the squared length of (MinColor - MaxColor), so callers must not pass equal colors.
uint GetPackedColorBlockIndices( in float3 Block[16], in float3 MinColor, in float3 MaxColor )
{
	// Project onto max->min color vector and segment into range [0,3]
	const float3 Axis = MinColor - MaxColor;
	const float Scale = 3.f / dot(Axis, Axis);
	const float3 ScaledAxis = Axis * Scale;
	const float Bias = (dot(MaxColor, MaxColor) - dot(MaxColor, MinColor)) * Scale;

	uint Packed = 0;
	for (int Texel = 15; Texel >= 0; --Texel)
	{
		// Distance of this texel from the max color, in steps of a third of the axis
		const uint Distance = RoundToUInt(dot(Block[Texel], ScaledAxis) + Bias);

		// Convert distance into the BC index: 0 stays 0, 3 becomes 1, everything else moves up by one
		uint Selector = Distance;
		if (Distance == 3)
		{
			Selector = 1;
		}
		else if (Distance > 0)
		{
			Selector = Distance + 1;
		}

		// Walking the texels backwards lets each index be shifted in from the bottom
		Packed = (Packed << 2) | Selector;
	}

	return Packed;
}

// Calculate the final packed indices for an alpha block
// The results are in their final location of the uint2 indices and will need ORing with the min and max alpha
// Divides by (MinAlpha - MaxAlpha), so callers must not pass equal values.
uint2 GetPackedAlphaBlockIndices( in float Block[16], in float MinAlpha, in float MaxAlpha )
{
	// Segment alpha max->min into range [0,7]
	const float Range = MinAlpha - MaxAlpha;
	const float Scale = 7.f / Range;
	const float Bias = -Scale * MaxAlpha;

	uint2 Packed = 0;
	for (uint Texel = 0; Texel < 16; ++Texel)
	{
		const uint Distance = RoundToUInt(Block[Texel] * Scale + Bias);

		// Convert distance into the BC index: 0 stays 0, 7 becomes 1, everything else moves up by one
		uint Selector = Distance;
		if (Distance == 7)
		{
			Selector = 1;
		}
		else if (Distance > 0)
		{
			Selector = Distance + 1;
		}

		// The 3 bit indices start at bit 16 of x, leaving the low 16 bits free for the caller
		const uint BitOffset = Texel * 3 + 16;
		if (BitOffset < 31)
		{
			// Texels 0-4 fit entirely in the top 16 bits of the x component
			Packed.x |= (Selector << BitOffset);
		}
		else if (BitOffset == 31)
		{
			// Texel 5 is split: its lowest bit is the top bit of x, the rest starts y
			Packed.x |= (Selector << 31);
			Packed.y |= (Selector >> 1);
		}
		else
		{
			// Texels 6-15 fill the y component
			Packed.y |= (Selector << (BitOffset - 32));
		}
	}

	return Packed;
}

// Compress a BC1 block
// Returns the 565 endpoints in x (min in the high 16 bits, max in the low 16 bits) and the indices in y
uint2 CompressBC1Block( in float3 Block[16] )
{
	float3 MinColor, MaxColor;
	GetMinMax( Block, MinColor, MaxColor );

	// Floor the min and ceil the max; both calls also snap the float colors to the quantized values
	const uint Min565 = Float3ToUint565_Floor( MinColor );
	const uint Max565 = Float3ToUint565_Ceil( MaxColor );

	uint2 Result = uint2((Min565 << 16) | Max565, 0);
	if (Min565 < Max565)
	{
		// Indices stay zero when both endpoints quantize to the same value
		Result.y = GetPackedColorBlockIndices( Block, MinColor, MaxColor );
	}
	return Result;
}

// Compress a BC1 block that will be sampled as sRGB
// We expect linear colors as input and convert internally
// NOTE(review): the endpoints are quantized from the sRGB converted colors, but the indices are
// projected against the unquantized linear min/max. Confirm this approximation is intended.
uint2 CompressBC1Block_SRGB( in float3 Block[16] )
{
	float3 MinColor, MaxColor;
	GetMinMax( Block, MinColor, MaxColor );

	const uint Min565 = Float3ToUint565( LinearToSrgb(MinColor) );
	const uint Max565 = Float3ToUint565( LinearToSrgb(MaxColor) );

	uint2 Result = uint2((Min565 << 16) | Max565, 0);
	if (Min565 < Max565)
	{
		// Indices stay zero when both endpoints quantize to the same value
		Result.y = GetPackedColorBlockIndices( Block, MinColor, MaxColor );
	}
	return Result;
}

// Compress a BC3 block
// Returns the alpha endpoints and indices in .xy and the 565 color endpoints and indices in .zw.
// The color half has exactly the layout produced by CompressBC1Block, so it is delegated to it.
uint4 CompressBC3Block( in float3 BlockRGB[16], in float BlockA[16] )
{
	float MinAlpha, MaxAlpha;
	GetMinMax( BlockA, MinAlpha, MaxAlpha );

	uint MinAlphaUint = RoundToUInt(MinAlpha * 255.f);
	uint MaxAlphaUint = RoundToUInt(MaxAlpha * 255.f);

	// Indices stay zero when both alpha endpoints quantize to the same value
	uint2 AlphaIndices = 0;
	if (MinAlphaUint < MaxAlphaUint)
	{
		AlphaIndices = GetPackedAlphaBlockIndices( BlockA, MinAlpha, MaxAlpha );
	}

	uint2 ColorBlock = CompressBC1Block( BlockRGB );

	return uint4((MinAlphaUint << 8) | MaxAlphaUint | AlphaIndices.x, AlphaIndices.y, ColorBlock.x, ColorBlock.y);
}

// Compress a BC4 block
// x holds the min endpoint in bits 8-15, the max endpoint in bits 0-7 and the first indices above that.
// y holds the remaining indices.
uint2 CompressBC4Block( in float Block[16] )
{
	float MinAlpha, MaxAlpha;
	GetMinMax( Block, MinAlpha, MaxAlpha );

	uint MinAlphaUint = RoundToUInt(MinAlpha * 255.f);
	uint MaxAlphaUint = RoundToUInt(MaxAlpha * 255.f);

	// Indices stay zero when both endpoints quantize to the same value
	uint2 Indices = 0;
	if (MinAlphaUint < MaxAlphaUint)
	{
		Indices = GetPackedAlphaBlockIndices( Block, MinAlpha, MaxAlpha );
	}

	return uint2((MinAlphaUint << 8) | MaxAlphaUint | Indices.x, Indices.y);
}

// Compress a BC3 block that will be sampled as sRGB
// We expect linear colors as input and convert internally
// The alpha half is a BC4 block and the color half a BC1 sRGB block, so both are delegated.
uint4 CompressBC3Block_SRGB( in float3 BlockRGB[16], in float BlockA[16] )
{
	return uint4(CompressBC4Block( BlockA ), CompressBC1Block_SRGB( BlockRGB ));
}

// Compress a BC5 block
// The result is two independent BC4 blocks: U in .xy and V in .zw
uint4 CompressBC5Block( in float BlockU[16], in float BlockV[16] )
{
	return uint4(CompressBC4Block( BlockU ), CompressBC4Block( BlockV ));
}

// Convert RGB linear color to YCoCg
// Co and Cg are offset by 128/255
float3 RGB2YCoCg(float3 RGB)
{
	float Y = (RGB.r + 2.f * RGB.g + RGB.b) * 0.25f;
	float Co = ((2.f * RGB.r - 2.f * RGB.b) * 0.25f + 128.f / 255.f);
	float Cg = ((-RGB.r + 2.f * RGB.g - RGB.b) * 0.25f + 128.f / 255.f);
	return float3(Y, Co, Cg);
}

// Get a single scale factor to use for a YCoCg color block
// This increases precision at the expense of potential blending artifacts across blocks
// Returns (scale, 1/scale), picked from the largest distance of any CoCg bound from 128/255
float2 GetYCoCgScale(float2 MinCoCg, float2 MaxCoCg)
{
	MinCoCg = abs(MinCoCg - 128.f / 255.f);
	MaxCoCg = abs(MaxCoCg - 128.f / 255.f);

	float MaxComponent = max(max(MinCoCg.x, MinCoCg.y), max(MaxCoCg.x, MaxCoCg.y));

	return (MaxComponent < 32.f / 255.f) ? float2(4.f, 0.25f)
		: (MaxComponent < 64.f / 255.f) ? float2(2.f, 0.5f)
		: float2(1.f, 1.f);
}

// Scale the CoCg end points around the 128/255 offset
void ApplyYCoCgScale(inout float2 MinCoCg, inout float2 MaxCoCg, float Scale)
{
	MinCoCg = (MinCoCg - 128.f / 255.f) * Scale + 128.f / 255.f;
	MaxCoCg = (MaxCoCg - 128.f / 255.f) * Scale + 128.f / 255.f;
}

// Inset the CoCg bounding end points
// The inset is negative (the bounds grow) when the extent is smaller than 8/255
void InsetCoCgEndPoints(inout float2 MinCoCg, inout float2 MaxCoGg)
{
	float2 Inset = (MaxCoGg - MinCoCg - (8.f / 255.f)) / 16.f;
	MinCoCg = saturate(MinCoCg + Inset);
	MaxCoGg = saturate(MaxCoGg - Inset);
}

// Inset the luminance end points
// The inset is negative (the bounds grow) when the extent is smaller than 16/255
void InsetLumaEndPoints(inout float MinY, inout float MaxY)
{
	float Inset = (MaxY - MinY - (16.f / 255.f)) / 32.f;
	MinY = saturate(MinY + Inset);
	MaxY = saturate(MaxY - Inset);
}

// Select the 2 min/max end points from the CoCg bounding rectangle based on the block contents
// When Co and Cg vary in opposite directions around the midpoint, the Cg bounds are swapped so
// that the end points lie on the other diagonal of the rectangle
void AdjustMinMaxDiagonalYCoCg(const float3 Block[16], inout float2 MinCoCg, inout float2 MaxCoGg)
{
	float2 MidCoCg = (MaxCoGg + MinCoCg) * 0.5;

	// The sign of the summed products says which diagonal the texels follow
	float Sum = 0.f;
	for (int i = 0; i < 16; i++)
	{
		float2 Diff = Block[i].yz - MidCoCg;
		Sum += Diff.x * Diff.y;
	}

	if (Sum < 0.f)
	{
		float Temp = MaxCoGg.y;
		MaxCoGg.y = MinCoCg.y;
		MinCoCg.y = Temp;
	}
}

// Convert a CoCg pair to a 565 uint using 'round' arithmetic
// Co uses the 5 bit field at bit 11 and Cg the 6 bit field at bit 5; the low 5 bits are left zero
// CoCg parameter is inout and is modified to match the converted value
uint CoCgToUint565(inout float2 CoCg)
{
	float2 Scale = float2(31.f, 63.f);
	float2 ColorScaled = round(saturate(CoCg) * Scale);
	uint ColorPacked = ((uint)ColorScaled.r << 11) | ((uint)ColorScaled.g << 5);
	CoCg = ColorScaled / Scale;
	return ColorPacked;
}

// Calculate the final packed indices for the CoCg part of a color block
// Only the .yz components of Block are read
uint GetPackedCoCgBlockIndices(in float3 Block[16], in float2 MinCoCg, in float2 MaxCoCg)
{
	uint PackedIndices = 0;

	// Project onto max->min color vector and segment into range [0,3]
	float2 Range = MinCoCg - MaxCoCg;
	float RangeSq = dot(Range, Range);

	// Equal end points would divide by zero and feed INF/NaN into the rounding below.
	// A zero scale makes every texel pick index 0 instead.
	float Scale = (RangeSq > 0.f) ? 3.f / RangeSq : 0.f;
	float2 ScaledRange = Range * Scale;
	float Bias = (dot(MaxCoCg, MaxCoCg) - dot(MaxCoCg, MinCoCg)) * Scale;

	for (int i = 15; i >= 0; --i)
	{
		// Compute the distance index for this element
		uint Index = RoundToUInt(dot(Block[i].yz, ScaledRange) + Bias);
		// Convert distance index into the BC index
		Index += (Index > 0) - (3 * (Index == 3));
		// OR into the final PackedIndices
		PackedIndices = (PackedIndices << 2) | Index;
	}

	return PackedIndices;
}

// Calculate the final packed indices for the Luma part of a color block
// Only the .x component of Block is read; MinAlpha and MaxAlpha are the luma end points
// The results are in their final location of the uint2 indices and will need ORing with the end points
uint2 GetPackedLumaBlockIndices(in float3 Block[16], in float MinAlpha, in float MaxAlpha)
{
	uint2 PackedIndices = 0;

	// Segment luma max->min into range [0,7]
	float Range = MinAlpha - MaxAlpha;

	// Equal end points would divide by zero and feed INF/NaN into the rounding below.
	// A zero scale makes every texel pick index 0 instead.
	float Scale = (Range != 0.f) ? 7.f / Range : 0.f;
	float Bias = -Scale * MaxAlpha;

	uint i = 0;

	// The first 5 elements of the block will go into the top 16 bits of the x component
	for (i = 0; i < 5; ++i)
	{
		// Compute the distance index for this element
		uint Index = RoundToUInt(Block[i].x * Scale + Bias);
		// Convert distance index into the BC index
		Index += (Index > 0) - (7 * (Index == 7));
		// OR into the final PackedIndices
		PackedIndices.x |= (Index << (i * 3 + 16));
	}

	// The 6th element is split across the x and y components
	{
		i = 5;
		uint Index = RoundToUInt(Block[i].x * Scale + Bias);
		Index += (Index > 0) - (7 * (Index == 7));
		PackedIndices.x |= (Index << 31);
		PackedIndices.y |= (Index >> 1);
	}

	// The rest of the elements fill the y component
	for (i = 6; i < 16; ++i)
	{
		uint Index = RoundToUInt(Block[i].x * Scale + Bias);
		Index += (Index > 0) - (7 * (Index == 7));
		PackedIndices.y |= (Index << (i * 3 - 16));
	}

	return PackedIndices;
}

// Convert a linear RGB block to YCoCg and compress a BC3 block
// Luma is written to the alpha half (.xy) and CoCg plus the block scale to the color half (.zw)
uint4 CompressBC3BlockYCoCg(in float3 Block[16])
{
	for (int i = 0; i < 16; ++i)
	{
		Block[i] = RGB2YCoCg(Block[i]);
	}

	float3 MinColor, MaxColor;
	GetMinMax(Block, MinColor, MaxColor);

	// NOTE(review): after the diagonal adjustment MinColor.z may be larger than MaxColor.z, so the
	// inset below is computed from a negative extent for that channel. Confirm the order is intended.
	AdjustMinMaxDiagonalYCoCg(Block, MinColor.yz, MaxColor.yz);

	float2 Scale = GetYCoCgScale(MinColor.yz, MaxColor.yz);
	ApplyYCoCgScale(MinColor.yz, MaxColor.yz, Scale.x);

	InsetCoCgEndPoints(MinColor.yz, MaxColor.yz);

	// The low 5 bits left free by CoCgToUint565 carry the scale as (scale - 1)
	uint MinColor565 = CoCgToUint565(MinColor.yz) | ((uint)Scale.x - 1);
	uint MaxColor565 = CoCgToUint565(MaxColor.yz) | ((uint)Scale.x - 1);

	// Undo the scale on the quantized end points so that they can be compared with the unscaled block
	ApplyYCoCgScale(MinColor.yz, MaxColor.yz, Scale.y);

	uint CoCgIndices = GetPackedCoCgBlockIndices(Block, MinColor.yz, MaxColor.yz);

	InsetLumaEndPoints(MinColor.x, MaxColor.x);

	uint MinLumaUint = RoundToUInt(MinColor.x * 255.f);
	uint MaxLumaUint = RoundToUInt(MaxColor.x * 255.f);

	uint2 Indices = GetPackedLumaBlockIndices(Block, MinColor.x, MaxColor.x);

	return uint4((MinLumaUint << 8) | MaxLumaUint | Indices.x, Indices.y, (MinColor565 << 16) | MaxColor565, CoCgIndices);
}

// Convert each channel to its half float bit pattern and rescale it by 1024 / (0x7bff + 1)
float3 Quantize10(float3 X)
{
	return (f32tof16(X) * 1024.0f) / (0x7bff + 1.0f);
}

// Compute the index in [0,15] of a color between the two end point positions of a BC6H block
// The color is projected onto BlockVector and the projection is compared as a half float bit pattern,
// matching how the caller builds EndPoint0Pos and EndPoint1Pos
uint ComputeIndexBC6HIndex(float3 Color, float3 BlockVector, float EndPoint0Pos, float EndPoint1Pos)
{
	float Pos = (float)f32tof16(dot(Color, BlockVector));
	float Span = EndPoint1Pos - EndPoint0Pos;

	// When both end points project to the same position the division would be by zero.
	// Treat every texel as sitting on end point 0 in that case.
	float NormalizedPos = (Span != 0.f) ? saturate((Pos - EndPoint0Pos) / Span) : 0.f;

	return (uint)clamp(NormalizedPos * 14.93333f + 0.03333f + 0.5f, 0.0f, 15.0f);
}

// Compress a BC6H block.
// Evaluates only mode 11 for performance
// Returns the 128 bit block with the mode in the low bits of x, followed by two 10 bit per channel
// end points and then the 16 indices.
uint4 CompressBC6HBlock(in float3 BlockRGB[16])
{
	// Compute initial endpoints
	float3 BlockMin, BlockMax;
	GetMinMax(BlockRGB, BlockMin, BlockMax);

	// Normalize the min->max vector so that its components sum to one.
	// A block with identical min and max has a zero sum; keep the zero vector rather than dividing by zero.
	float3 BlockVector = BlockMax - BlockMin;
	float BlockVectorSum = BlockVector.x + BlockVector.y + BlockVector.z;
	if (BlockVectorSum != 0.f)
	{
		BlockVector = BlockVector / BlockVectorSum;
	}

	float3 Endpoint0 = Quantize10(BlockMin);
	float3 Endpoint1 = Quantize10(BlockMax);
	float EndPoint0Pos = (float)f32tof16(dot(BlockMin, BlockVector));
	float EndPoint1Pos = (float)f32tof16(dot(BlockMax, BlockVector));

	// Check if endpoint swap is required
	// The first index is stored with one bit less than the others, so it has to be 7 or below
	uint FixupIndex = ComputeIndexBC6HIndex(BlockRGB[0], BlockVector, EndPoint0Pos, EndPoint1Pos);
	if (FixupIndex > 7)
	{
		Swap(EndPoint0Pos, EndPoint1Pos);
		Swap(Endpoint0, Endpoint1);
	}

	uint4 Block = 0;

	// Encode mode 11 block
	Block.x = 0x03;

	// Encode endpoints
	// Endpoint0.z and Endpoint1.z each straddle a 32 bit boundary and are written in two parts
	Block.x |= (uint)Endpoint0.x << 5;
	Block.x |= (uint)Endpoint0.y << 15;
	Block.x |= (uint)Endpoint0.z << 25;
	Block.y |= (uint)Endpoint0.z >> 7;
	Block.y |= (uint)Endpoint1.x << 3;
	Block.y |= (uint)Endpoint1.y << 13;
	Block.y |= (uint)Endpoint1.z << 23;
	Block.z |= (uint)Endpoint1.z >> 9;

	// Compute and encode indices
	for (uint TexelIndex = 0; TexelIndex < 16; ++TexelIndex)
	{
		uint Index = ComputeIndexBC6HIndex(BlockRGB[TexelIndex], BlockVector, EndPoint0Pos, EndPoint1Pos);

		if (TexelIndex == 0)
		{
			// The first index starts at bit 1 of z and only has 3 bits before the next one
			Block.z |= Index << 1;
		}
		else if (TexelIndex < 8)
		{
			Block.z |= Index << (TexelIndex * 4);
		}
		else
		{
			Block.w |= Index << ((TexelIndex - 8) * 4);
		}
	}

	return Block;
}

// Quantize each channel to 7 bits: scale by 255, truncate to uint and drop the lowest bit
uint3 Quantize7(float3 X)
{
	return (uint3(X * 0xFF)) >> 1;
}

// Compute the index in [0,15] of a color between the two end point positions of a BC7 block
// The color is projected onto BlockVector and compared against the projected end points
uint ComputeBC7Index(float3 Color, float3 BlockVector, float EndPoint0Pos, float EndPoint1Pos)
{
	float Pos = dot(Color, BlockVector);
	float Span = EndPoint1Pos - EndPoint0Pos;

	// When both end points project to the same position the division would be by zero.
	// Treat every texel as sitting on end point 0 in that case.
	float NormalizedPos = (Span != 0.f) ? saturate((Pos - EndPoint0Pos) / Span) : 0.f;

	return (uint)clamp(NormalizedPos * 14.93333f + 0.03333f + 0.5f, 0.0f, 15.0f);
}

// Least squares optimization to find best endpoints for the selected block indices in a mode 6 BC7 block
// BlockMin and BlockMax are only replaced when the system is well enough conditioned to solve
void OptimizeEndpointsBC7(float3 Texels[16], inout float3 BlockMin, inout float3 BlockMax)
{
	float3 BlockVector = BlockMax - BlockMin;
	float EndPoint0Pos = dot(BlockMin, BlockVector);
	float EndPoint1Pos = dot(BlockMax, BlockVector);

	float3 AlphaTexelSum = 0.0f;
	float3 BetaTexelSum = 0.0f;
	float AlphaBetaSum = 0.0f;
	float AlphaSqSum = 0.0f;
	float BetaSqSum = 0.0f;

	for (uint TexelIndex = 0; TexelIndex < 16; ++TexelIndex)
	{
		uint Index = ComputeBC7Index(Texels[TexelIndex], BlockVector, EndPoint0Pos, EndPoint1Pos);

		// Beta is the weight of the max end point for this index, Alpha the weight of the min end point
		float Beta = saturate(Index / 15.0f);
		float Alpha = 1.0f - Beta;

		AlphaTexelSum += Alpha * Texels[TexelIndex];
		BetaTexelSum += Beta * Texels[TexelIndex];

		AlphaBetaSum += Alpha * Beta;

		AlphaSqSum += Alpha * Alpha;
		BetaSqSum += Beta * Beta;
	}

	// Solve the 2x2 system; a small determinant means the indices do not constrain both end points
	float Det = AlphaSqSum * BetaSqSum - AlphaBetaSum * AlphaBetaSum;
	if (abs(Det) > 0.1f)
	{
		float RcpDet = rcp(Det);
		BlockMin = saturate(RcpDet * (AlphaTexelSum * BetaSqSum - BetaTexelSum * AlphaBetaSum));
		BlockMax = saturate(RcpDet * (BetaTexelSum * AlphaSqSum - AlphaTexelSum * AlphaBetaSum));
	}
}

// Compress a BC7 color only block.
// Evaluates only mode 6 for performance
// Only the RGB end point fields are written: the alpha end point fields and both p-bits stay zero.
uint4 CompressBC7Block(in float3 BlockRGB[16])
{
	// Compute initial endpoints
	float3 BlockMin, BlockMax;
	GetMinMax(BlockRGB, BlockMin, BlockMax);

#if LEAST_SQUARES_ENDPOINT_OPTIMIZATION
	OptimizeEndpointsBC7(BlockRGB, BlockMin, BlockMax);
#endif

	float3 BlockVector = BlockMax - BlockMin;

	uint3 Endpoint0 = Quantize7(BlockMin);
	uint3 Endpoint1 = Quantize7(BlockMax);
	float EndPoint0Pos = dot(BlockMin, BlockVector);
	float EndPoint1Pos = dot(BlockMax, BlockVector);

	// Check if endpoint swap is required
	// The first index is stored with one bit less than the others, so it has to be 7 or below
	uint FixupIndex = ComputeBC7Index(BlockRGB[0], BlockVector, EndPoint0Pos, EndPoint1Pos);
	if (FixupIndex > 7)
	{
		Swap(EndPoint0Pos, EndPoint1Pos);
		Swap(Endpoint0, Endpoint1);
	}

	uint4 Block = 0;

	// Encode mode 6 block
	Block.x = 0x40;

	// Encode endpoints
	// Endpoint1.y straddles the x/y boundary and is written in two parts
	Block.x |= Endpoint0.x << 7;
	Block.x |= Endpoint1.x << 14;
	Block.x |= Endpoint0.y << 21;
	Block.x |= Endpoint1.y << 28;
	Block.y |= Endpoint1.y >> 4;
	Block.y |= Endpoint0.z << 3;
	Block.y |= Endpoint1.z << 10;

	// The endpoint p-bits are not encoded and stay zero

	// Compute and encode indices
	for (uint TexelIndex = 0; TexelIndex < 16; ++TexelIndex)
	{
		uint Index = ComputeBC7Index(BlockRGB[TexelIndex], BlockVector, EndPoint0Pos, EndPoint1Pos);

		if (TexelIndex == 0)
		{
			// The first index starts at bit 1 of z and only has 3 bits before the next one
			Block.z |= Index << 1;
		}
		else if (TexelIndex < 8)
		{
			Block.z |= Index << (TexelIndex * 4);
		}
		else
		{
			Block.w |= Index << ((TexelIndex - 8) * 4);
		}
	}

	return Block;
}

// Quantize each channel to 7 bits: scale by 255, truncate to uint and drop the lowest bit
uint4 Quantize7A(float4 X)
{
	return (uint4(X * 0xFF)) >> 1;
}

// Compute the index in [0,15] of an RGBA color between the two end point positions of a BC7 block
// The color is projected onto BlockVector and compared against the projected end points
uint ComputeBC7AIndex(float4 Color, float4 BlockVector, float EndPoint0Pos, float EndPoint1Pos)
{
	float Pos = dot(Color, BlockVector);
	float Span = EndPoint1Pos - EndPoint0Pos;

	// When both end points project to the same position the division would be by zero.
	// Treat every texel as sitting on end point 0 in that case.
	float NormalizedPos = (Span != 0.f) ? saturate((Pos - EndPoint0Pos) / Span) : 0.f;

	return (uint)clamp(NormalizedPos * 14.93333f + 0.03333f + 0.5f, 0.0f, 15.0f);
}

// Least squares optimization to find best endpoints for the selected block indices in a mode 6 BC7 block
// RGBA version: all four channels are solved together
// BlockMin and BlockMax are only replaced when the system is well enough conditioned to solve
void OptimizeEndpointsBC7A(float4 Texels[16], inout float4 BlockMin, inout float4 BlockMax)
{
	float4 BlockVector = BlockMax - BlockMin;
	float EndPoint0Pos = dot(BlockMin, BlockVector);
	float EndPoint1Pos = dot(BlockMax, BlockVector);

	float4 AlphaTexelSum = 0.0f;
	float4 BetaTexelSum = 0.0f;
	float AlphaBetaSum = 0.0f;
	float AlphaSqSum = 0.0f;
	float BetaSqSum = 0.0f;

	for (uint TexelIndex = 0; TexelIndex < 16; ++TexelIndex)
	{
		uint Index = ComputeBC7AIndex(Texels[TexelIndex], BlockVector, EndPoint0Pos, EndPoint1Pos);

		// Beta is the weight of the max end point for this index, Alpha the weight of the min end point
		float Beta = saturate(Index / 15.0f);
		float Alpha = 1.0f - Beta;

		AlphaTexelSum += Alpha * Texels[TexelIndex];
		BetaTexelSum += Beta * Texels[TexelIndex];

		AlphaBetaSum += Alpha * Beta;

		AlphaSqSum += Alpha * Alpha;
		BetaSqSum += Beta * Beta;
	}

	// Solve the 2x2 system; a small determinant means the indices do not constrain both end points
	float Det = AlphaSqSum * BetaSqSum - AlphaBetaSum * AlphaBetaSum;
	if (abs(Det) > 0.1f)
	{
		float RcpDet = rcp(Det);
		BlockMin = saturate(RcpDet * (AlphaTexelSum * BetaSqSum - BetaTexelSum * AlphaBetaSum));
		BlockMax = saturate(RcpDet * (BetaTexelSum * AlphaSqSum - AlphaTexelSum * AlphaBetaSum));
	}
}

// Compress a BC7 RGBA block.
// Evaluates only mode 6 for performance
// Takes RGBA texels: the alpha channel is encoded alongside the color channels.
// The endpoint p-bits are not encoded and stay zero.
uint4 CompressBC7ABlock(in float4 BlockRGBA[16])
{
	// The per channel bounds of the block are the starting end points
	float4 BlockMin = BlockRGBA[0];
	float4 BlockMax = BlockRGBA[0];
	for (uint BoundsTexel = 1; BoundsTexel < 16; ++BoundsTexel)
	{
		const float4 Texel = BlockRGBA[BoundsTexel];
		BlockMin = min(BlockMin, Texel);
		BlockMax = max(BlockMax, Texel);
	}

#if LEAST_SQUARES_ENDPOINT_OPTIMIZATION
	OptimizeEndpointsBC7A(BlockRGBA, BlockMin, BlockMax);
#endif

	const float4 BlockVector = BlockMax - BlockMin;

	uint4 Endpoint0 = Quantize7A(BlockMin);
	uint4 Endpoint1 = Quantize7A(BlockMax);
	float EndPoint0Pos = dot(BlockMin, BlockVector);
	float EndPoint1Pos = dot(BlockMax, BlockVector);

	// The first index is stored with one bit less than the others, so it has to be 7 or below.
	// Swapping the end points mirrors every index and brings it back into range.
	if (ComputeBC7AIndex(BlockRGBA[0], BlockVector, EndPoint0Pos, EndPoint1Pos) > 7)
	{
		Swap(EndPoint0Pos, EndPoint1Pos);
		Swap(Endpoint0, Endpoint1);
	}

	// Mode 6 marker
	uint4 Packed = uint4(0x40, 0, 0, 0);

	// 7 bit end point fields, interleaved per channel as end point 0 then end point 1.
	// Endpoint1.y straddles the x/y boundary and is written in two parts.
	Packed.x |= Endpoint0.x << 7;
	Packed.x |= Endpoint1.x << 14;
	Packed.x |= Endpoint0.y << 21;
	Packed.x |= Endpoint1.y << 28;
	Packed.y |= Endpoint1.y >> 4;
	Packed.y |= Endpoint0.z << 3;
	Packed.y |= Endpoint1.z << 10;
	Packed.y |= Endpoint0.w << 17;
	Packed.y |= Endpoint1.w << 24;

	// Indices: texel 0 starts at bit 1 of z, texels 1-7 follow in z and texels 8-15 fill w
	for (uint IndexTexel = 0; IndexTexel < 16; ++IndexTexel)
	{
		const uint Index = ComputeBC7AIndex(BlockRGBA[IndexTexel], BlockVector, EndPoint0Pos, EndPoint1Pos);

		if (IndexTexel == 0)
		{
			Packed.z |= Index << 1;
		}
		else if (IndexTexel < 8)
		{
			Packed.z |= Index << (IndexTexel * 4);
		}
		else
		{
			Packed.w |= Index << ((IndexTexel - 8) * 4);
		}
	}

	return Packed;
}