Files
UnrealEngine/Engine/Shaders/Private/PathTracing/Utilities/PathTracingRandomSequence.ush
2025-05-18 13:04:45 +08:00

516 lines
20 KiB
HLSL

// Copyright Epic Games, Inc. All Rights Reserved.
/*=============================================================================================
PathTracingRandomSequence.ush: Reference path tracing
===============================================================================================*/
#pragma once
#include "../../MortonCode.ush"
#define RANDSEQ_PURERANDOM 0
#define RANDSEQ_HALTON 1
#define RANDSEQ_OWENSOBOL 2
#define RANDSEQ_LATTICE 3
// Select a default if none was specified
#ifndef RANDSEQ
#define RANDSEQ RANDSEQ_OWENSOBOL
#endif
#ifndef RANDSEQ_RANDOMIZED
#define RANDSEQ_RANDOMIZED 1 // 1 to randomize per pixel, 0 to share the same sequence in all pixels (useful to benchmark coherent sampling)
#endif
#ifndef RANDSEQ_ERROR_DIFFUSION
#define RANDSEQ_ERROR_DIFFUSION 1 // Enabled by default, only works when using RANDSEQ_OWENSOBOL or RANDSEQ_LATTICE
#endif
#ifndef RANDSEQ_UNROLL_SOBOL
#define RANDSEQ_UNROLL_SOBOL 1 // Use unrolled Sobol loop such that table can be inline constants.
#endif
// This hash mixes bits at the theoretical bias limit
uint PerfectIntegerHash(uint x)
{
// https://nullprogram.com/blog/2018/07/31/
// exact bias: 0.020888578919738908
x ^= x >> 17;
x *= 0xed5ad4bbu;
x ^= x >> 11;
x *= 0xac4c1b51u;
x ^= x >> 15;
x *= 0x31848babu;
x ^= x >> 14;
return x;
}
// High quality integer hash - this mixes bits almost perfectly
uint StrongIntegerHash(uint x)
{
// From https://github.com/skeeto/hash-prospector
// bias = 0.16540778981744320
x ^= x >> 16;
x *= 0xa812d533;
x ^= x >> 15;
x *= 0xb278e4ad;
x ^= x >> 17;
return x;
}
// This is a much weaker hash, but is faster and can be used to drive other hashes
uint WeakIntegerHash(uint x)
{
// Generated using https://github.com/skeeto/hash-prospector
// Estimated Bias ~583
x *= 0x92955555u;
x ^= x >> 15;
return x;
}
struct RandomSequence
{
uint SampleIndex; // index into the random sequence
uint SampleSeed; // changes as we draw samples to reflect the change in dimension
};
float Rand(inout uint Seed)
{
// Counter based PRNG -- safer than most small-state PRNGs since we use the random values directly here.
Seed += 1;
uint Output = StrongIntegerHash(Seed);
// take low 24 bits
return (Output & 0xFFFFFF) * 5.96046447754e-08; // * 2^-24
}
// #dxr_todo: convert prime factors to constant buffer
static const uint Primes512LUT[] = {
2,3,5,7,11,13,17,19,23,29,
31,37,41,43,47,53,59,61,67,71,
73,79,83,89,97,101,103,107,109,113,
127,131,137,139,149,151,157,163,167,173,
179,181,191,193,197,199,211,223,227,229,
233,239,241,251,257,263,269,271,277,281,
283,293,307,311,313,317,331,337,347,349,
353,359,367,373,379,383,389,397,401,409,
419,421,431,433,439,443,449,457,461,463,
467,479,487,491,499,503,509,521,523,541,
547,557,563,569,571,577,587,593,599,601,
607,613,617,619,631,641,643,647,653,659,
661,673,677,683,691,701,709,719,727,733,
739,743,751,757,761,769,773,787,797,809,
811,821,823,827,829,839,853,857,859,863,
877,881,883,887,907,911,919,929,937,941,
947,953,967,971,977,983,991,997,1009,1013,
1019,1021,1031,1033,1039,1049,1051,1061,1063,1069,
1087,1091,1093,1097,1103,1109,1117,1123,1129,1151,
1153,1163,1171,1181,1187,1193,1201,1213,1217,1223,
1229,1231,1237,1249,1259,1277,1279,1283,1289,1291,
1297,1301,1303,1307,1319,1321,1327,1361,1367,1373,
1381,1399,1409,1423,1427,1429,1433,1439,1447,1451,
1453,1459,1471,1481,1483,1487,1489,1493,1499,1511,
1523,1531,1543,1549,1553,1559,1567,1571,1579,1583,
1597,1601,1607,1609,1613,1619,1621,1627,1637,1657,
1663,1667,1669,1693,1697,1699,1709,1721,1723,1733,
1741,1747,1753,1759,1777,1783,1787,1789,1801,1811,
1823,1831,1847,1861,1867,1871,1873,1877,1879,1889,
1901,1907,1913,1931,1933,1949,1951,1973,1979,1987,
1993,1997,1999,2003,2011,2017,2027,2029,2039,2053,
2063,2069,2081,2083,2087,2089,2099,2111,2113,2129,
2131,2137,2141,2143,2153,2161,2179,2203,2207,2213,
2221,2237,2239,2243,2251,2267,2269,2273,2281,2287,
2293,2297,2309,2311,2333,2339,2341,2347,2351,2357,
2371,2377,2381,2383,2389,2393,2399,2411,2417,2423,
2437,2441,2447,2459,2467,2473,2477,2503,2521,2531,
2539,2543,2549,2551,2557,2579,2591,2593,2609,2617,
2621,2633,2647,2657,2659,2663,2671,2677,2683,2687,
2689,2693,2699,2707,2711,2713,2719,2729,2731,2741,
2749,2753,2767,2777,2789,2791,2797,2801,2803,2819,
2833,2837,2843,2851,2857,2861,2879,2887,2897,2903,
2909,2917,2927,2939,2953,2957,2963,2969,2971,2999,
3001,3011,3019,3023,3037,3041,3049,3061,3067,3079,
3083,3089,3109,3119,3121,3137,3163,3167,3169,3181,
3187,3191,3203,3209,3217,3221,3229,3251,3253,3257,
3259,3271,3299,3301,3307,3313,3319,3323,3329,3331,
3343,3347,3359,3361,3371,3373,3389,3391,3407,3413,
3433,3449,3457,3461,3463,3467,3469,3491,3499,3511,
3517,3527,3529,3533,3539,3541,3547,3557,3559,3571,
3581,3583,3593,3607,3613,3617,3623,3631,3637,3643,
3659,3671
};
uint Prime512(uint Dimension)
{
return Primes512LUT[Dimension % 512];
}
float Halton(uint Index, uint Base)
{
float r = 0.0;
float f = 1.0;
float BaseInv = 1.0 / Base;
while (Index > 0)
{
f *= BaseInv;
r += f * (Index % Base);
Index /= Base;
}
return r;
}
uint FastOwenScrambling(uint Index, uint Seed) {
#if 0
// reference implementation - scramble bits one at a time
// loop below does not require pre-inverted bits, so undo the inversion assumed in the optimized case below
Index = reversebits(Index);
for (uint Mask = 1u << 31, Input = Index; Mask; Mask >>= 1) {
Seed = PerfectIntegerHash(Seed); // randomize state
Index ^= Seed & Mask; // flip output (depending on state)
Seed ^= Input & Mask; // flip state (depending on input)
}
return Index;
#else
// Laine and Karras / Stratified Sampling for Stochastic Transparency / EGSR 2011
// The operations below will mix bits toward the left, so temporarily reverse the order
// NOTE: This operation has been performed outside this call
// Index = reversebits(Index);
// This follows the basic construction from the paper above. The error-diffusion sampler which
// tiles a single point set across the screen, makes it much easier to visually "see" the impact of the scrambling.
// When the scrambling is not good enough, the structure of the space-filling curve is left behind.
// Care must be taken to ensure that the scrambling is still unbiased on average though. I found some cases
// that seemed to produce lower visual error but were actually biased due to not touching certain bits,
// leading to correlations across dimensions.
// After much experimentation with the hash prospector, I discovered the following two constants which appear to
// give results as good (or better) than the original 4 xor-mul constants from the Laine/Karras paper.
// It isn't entirely clear to me why some constants work better than others. Some hashes with slightly less
// bias produced visibly more error or worse looking power spectra.
// Estimates below from hash prospector for all hashes of the form: add,xmul=c0,xmul=c1 (with c0 and c1 being
// the constants below).
// Ran with score_quality=16 for about ~10000 random hashes
// Average bias: ~727.02
// Best bias: ~723.05
// Worst bias: ~735.19
Index += Seed; // randomize the index by our seed (pushes bits toward the left)
Index ^= Index * 0x9c117646u;
Index ^= Index * 0xe0705d72u;
// Undo the reverse so that we get left-to-right scrambling
// thereby emulating owen-scrambling
return reversebits(Index);
#endif
}
// 32-bit Sobol matrices for dimension 1,2,3 from:
// S. Joe and F. Y. Kuo, Constructing Sobol sequences with better two-dimensional projections, SIAM J. Sci. Comput. 30, 2635-2654 (2008)
// https://web.maths.unsw.edu.au/~fkuo/sobol/
// NOTE: we don't bother storing dimension 0 since it is just a bit reversal
// NOTE2: we don't bother storing dimension 1 either since it has a very simple pattern
// NOTE3: the matrix elements are reversed to save one reverse in the owen scrambling
static const uint2 SobolMatrices[] = {
uint2(0x00000001, 0x00000001),
uint2(0x00000003, 0x00000003),
uint2(0x00000006, 0x00000004),
uint2(0x00000009, 0x0000000a),
uint2(0x00000017, 0x0000001f),
uint2(0x0000003a, 0x0000002e),
uint2(0x00000071, 0x00000045),
uint2(0x000000a3, 0x000000c9),
uint2(0x00000116, 0x0000011b),
uint2(0x00000339, 0x000002a4),
uint2(0x00000677, 0x0000079a),
uint2(0x000009aa, 0x00000b67),
uint2(0x00001601, 0x0000101e),
uint2(0x00003903, 0x0000302d),
uint2(0x00007706, 0x00004041),
uint2(0x0000aa09, 0x0000a0c3),
uint2(0x00010117, 0x0001f104),
uint2(0x0003033a, 0x0002e28a),
uint2(0x00060671, 0x000457df),
uint2(0x000909a3, 0x000c9bae),
uint2(0x00171616, 0x0011a105),
uint2(0x003a3939, 0x002a7289),
uint2(0x00717777, 0x0079e7db),
uint2(0x00a3aaaa, 0x00b6dba4),
uint2(0x01170001, 0x0100011a),
uint2(0x033a0003, 0x030002a7),
uint2(0x06710006, 0x0400079e),
uint2(0x09a30009, 0x0a000b6d),
uint2(0x16160017, 0x1f001001),
uint2(0x3939003a, 0x2e003003),
uint2(0x77770071, 0x45004004),
uint2(0xaaaa00a3, 0xc900a00a)
};
uint EvolveSobolSeed(inout uint Seed)
{
// constant from: https://www.pcg-random.org/posts/does-it-beat-the-minimal-standard.html
const uint MCG_C = 2739110765;
#if 0
return Seed += MCG_C;
#elif 0
Seed += MCG_C;
return Seed ^ (Seed >> 16); // scramble lower bits
#elif 0
return PerfectIntegerHash(Seed += 1);
#elif 0
return StrongIntegerHash(Seed += 1);
#else
// a slightly weaker hash is ok since this drives FastOwenScrambling which is itself a hash
// Note that the Seed evolution is just an integer addition and the hash should optimize away
// when a particular dimension is not used
return WeakIntegerHash(Seed += MCG_C);
#endif
}
float4 LatticeSampler(uint SampleIndex, inout uint Seed)
{
#if 1
// Same as FastOwenScrambling, but without the final reversebits
uint LatticeIndex = SampleIndex + EvolveSobolSeed(Seed);
LatticeIndex ^= LatticeIndex * 0x9c117646u;
LatticeIndex ^= LatticeIndex * 0xe0705d72u;
#else
// This is a higher quality owen-scrambling hash but doesn't appear to make much difference
// https://psychopath.io/post/2021_01_30_building_a_better_lk_hash
uint LatticeIndex = SampleIndex, S = EvolveSobolSeed(Seed);
LatticeIndex ^= LatticeIndex * 0x3d20adeau;
LatticeIndex += S;
LatticeIndex *= (S >> 16) | 1;
LatticeIndex ^= LatticeIndex * 0x05526c56u;
LatticeIndex ^= LatticeIndex * 0x53a22864u;
#endif
// Lattice parameters taken from:
// Weighted compound integration rules with higher order convergence for all N
// Fred J. Hickernell, Peter Kritzer, Frances Y. Kuo, Dirk Nuyens
// Numerical Algorithms - February 2012
uint4 Result = LatticeIndex * uint4(1, 364981, 245389, 97823);
return (Result >> 8) * 5.96046447754e-08; // * 2^-24
}
float4 SobolSampler(uint SampleIndex, inout uint Seed)
{
// first scramble the index to decorelate from other 4-tuples
uint SobolIndex = FastOwenScrambling(SampleIndex, EvolveSobolSeed(Seed));
// now get Sobol' point from this index
uint4 Result = uint4(SobolIndex, SobolIndex, 0, 0);
// y component can be computed without iteration
// "An Implementation Algorithm of 2D Sobol Sequence Fast, Elegant, and Compact"
// Abdalla Ahmed, EGSR 2024
// See listing (19) in the paper
// The code is different here because we want the output to be bit-reversed, but
// the methodology is the same
Result.y ^= Result.y >> 16;
Result.y ^= (Result.y & 0xFF00FF00) >> 8;
Result.y ^= (Result.y & 0xF0F0F0F0) >> 4;
Result.y ^= (Result.y & 0xCCCCCCCC) >> 2;
Result.y ^= (Result.y & 0xAAAAAAAA) >> 1;
#if RANDSEQ_UNROLL_SOBOL // unrolled version seems faster in most cases
UNROLL for (uint b = 0; b < 32; b++)
{
uint IndexBit = (SobolIndex >> b) & 1; // bitfield extract
#else // however a loop with early exit seems faster in the path tracing context (possibly due to low occupancy?)
for (uint b = 0; SobolIndex; SobolIndex >>= 1, b++)
{
uint IndexBit = SobolIndex & 1;
#endif
Result.zw ^= IndexBit * SobolMatrices[b];
}
// finally scramble the points to avoid structured artifacts
Result.x = FastOwenScrambling(Result.x, EvolveSobolSeed(Seed));
Result.y = FastOwenScrambling(Result.y, EvolveSobolSeed(Seed));
Result.z = FastOwenScrambling(Result.z, EvolveSobolSeed(Seed));
Result.w = FastOwenScrambling(Result.w, EvolveSobolSeed(Seed));
// output as float in [0,1) taking care not to skew the distribution
// due to the non-uniform spacing of floats in this range
return (Result >> 8) * 5.96046447754e-08; // * 2^-24
}
// This initializes a multi-dimensional sampler for a particular pixel. The TimeSeed parameter should be set
// such that it changes per sample. See the function below when you know how many samples you plan on taking
// for a particular integral.
void RandomSequence_Initialize(inout RandomSequence RandSequence, uint PositionSeed, uint TimeSeed)
{
// optionally disable randomization per pixel so that all pixels follow the same path in primary sample space
#if RANDSEQ_RANDOMIZED == 0
PositionSeed = 0;
#endif
#if RANDSEQ == RANDSEQ_PURERANDOM
RandSequence.SampleIndex = 0; // not used
RandSequence.SampleSeed = StrongIntegerHash(PositionSeed + StrongIntegerHash(TimeSeed));
#elif RANDSEQ == RANDSEQ_HALTON
RandSequence.SampleIndex = TimeSeed + StrongIntegerHash(PositionSeed);
RandSequence.SampleSeed = 0; // this sampler just counts dimensions
#elif RANDSEQ == RANDSEQ_OWENSOBOL || RANDSEQ == RANDSEQ_LATTICE
// pre-compute bit reversal needed for FastOwenScrambling since this index doesn't change
RandSequence.SampleIndex = reversebits(TimeSeed);
// change seed to get a unique sequence per pixel
RandSequence.SampleSeed = StrongIntegerHash(PositionSeed);
#else
#error "Unknown random sequence chosen in path tracer"
#endif
}
// This is an alternative initialization to the method above when you know how many samples you are planning on taking. This incorporates the
// TimeSeed0 (this should usually be the frame number when using all samples in one frame, or should be the frame on which the first sample was
// taken if taking one sample per frame like the path tracer).
void RandomSequence_Initialize(inout RandomSequence RandSequence, uint2 PixelCoord, uint SampleIndex, uint TimeSeed0, uint MaxSamples)
{
#if RANDSEQ_ERROR_DIFFUSION == 1 && (RANDSEQ == RANDSEQ_OWENSOBOL || RANDSEQ == RANDSEQ_LATTICE)
// The core idea is to use a single sobol pattern across the screen using a space filling curve to correlate nearby pixels
// This achieves a much better error diffusion than the random sequence assignment of the method above
// This algorithm is covered in the following paper:
// "Screen-Space Blue-Noise Diffusion of Monte Carlo Sampling Error via Hierarchical Ordering of Pixels"
// Siggraph Asia 2020
// http://abdallagafar.com/publications/zsampler/
// Improvements made in this version are:
// - switching the z-order curve for a hilbert curve
// - switching sobol points for owen-sobol points
// - recognizing that the hierarchical scrambling proposed in the original paper is equivalent to Owen-Scrambling of the index
// Downsides of this method:
// - quality does not improve beyond the target sample count, making usage with adaptive sampling difficult
// - error-diffusion property is only achieved at the target sample count, earlier samples look random (sometimes slightly worse than random)
uint X = PixelCoord.x;
uint Y = PixelCoord.y;
#if 0
uint TileID = ((X & 255) + (Y & 255) * 256); // Scanline order
#elif 0
uint TileID = ((Y & 1) ? 255 - (X & 255) : (X & 255)) + (Y & 255) * 256; // zig-zag curve
#elif 0
uint TileID = MortonEncode(uint2(X, Y)); // Z-order curve
#elif 0
// ordered dither curve
uint TileID = MortonEncode(uint2(X ^ Y, X));
#elif 0
// pure random (breaks nice properties)
uint TileID = StrongIntegerHash(X + StrongIntegerHash(Y));
#else
// Hilbert curve: https://github.com/hcs0/Hackers-Delight/blob/master/hilbert/hil_s_from_xy.c.txt
uint TileID = 0, HilbertState = 0;
for (int i = 7; i >= 0; i--) {
uint xi = (X >> i) & 1;
uint yi = (Y >> i) & 1;
uint Row = 8 * HilbertState + 4 * xi + 2 * yi;
TileID = TileID * 4 + ((0x361E9CB4u >> Row) & 3);
HilbertState = (0x8FE65831u >> Row) & 3;
}
#endif
// Combine pixel-level and sample-level bits into the sample index (visible structure will be hidden by owen scrambling of the index)
RandSequence.SampleIndex = reversebits((65536 * TimeSeed0 + TileID) * MaxSamples + SampleIndex);
// progressive encodings (these kind of work, but quality is much worse)
//RandSequence.SampleIndex = reversebits(MortonEncode(uint2(TimeSeed0, TileID)) * MaxSamples + SampleIndex);
//RandSequence.SampleIndex = reversebits((MortonCode3(MaxSamples * TimeSeed0 + SampleIndex) * 4) | (MortonCode3(Y) * 2) | MortonCode3(X));
//RandSequence.SampleIndex = reversebits(MortonEncode(uint2(MortonEncode(uint2(X, Y)), SampleIndex)));
RandSequence.SampleSeed = 0; // always use the same sequence
#else
// Error diffusion sampler is disabled, just use the simple init function instead
RandomSequence_Initialize(RandSequence, PixelCoord.x + PixelCoord.y * 65536, TimeSeed0 * MaxSamples + SampleIndex);
#endif
}
float RandomSequence_GenerateSample1D(inout RandomSequence RandSequence)
{
float Result;
#if RANDSEQ == RANDSEQ_PURERANDOM
Result = Rand(RandSequence.SampleSeed);
#elif RANDSEQ == RANDSEQ_HALTON
Result = Halton(RandSequence.SampleIndex, Prime512(RandSequence.SampleSeed + 0));
RandSequence.SampleSeed += 1;
#elif RANDSEQ == RANDSEQ_OWENSOBOL
// rely on compiler to optimize out dead code for now
// #dxr_todo: benchmark and specialize if needed
Result = SobolSampler(RandSequence.SampleIndex, RandSequence.SampleSeed).x;
#elif RANDSEQ == RANDSEQ_LATTICE
Result = LatticeSampler(RandSequence.SampleIndex, RandSequence.SampleSeed).x;
#else
#error "Unknown random sequence chosen in path tracer"
#endif
return Result;
}
float2 RandomSequence_GenerateSample2D(inout RandomSequence RandSequence)
{
float2 Result;
#if RANDSEQ == RANDSEQ_PURERANDOM
Result.x = Rand(RandSequence.SampleSeed);
Result.y = Rand(RandSequence.SampleSeed);
#elif RANDSEQ == RANDSEQ_HALTON
Result.x = Halton(RandSequence.SampleIndex, Prime512(RandSequence.SampleSeed + 0));
Result.y = Halton(RandSequence.SampleIndex, Prime512(RandSequence.SampleSeed + 1));
RandSequence.SampleSeed += 2;
#elif RANDSEQ == RANDSEQ_OWENSOBOL
// rely on compiler to optimize out dead code for now
// #dxr_todo: benchmark and specialize if needed
Result = SobolSampler(RandSequence.SampleIndex, RandSequence.SampleSeed).xy;
#elif RANDSEQ == RANDSEQ_LATTICE
Result = LatticeSampler(RandSequence.SampleIndex, RandSequence.SampleSeed).xy;
#else
#error "Unknown random sequence chosen in path tracer"
#endif
return Result;
}
float3 RandomSequence_GenerateSample3D(inout RandomSequence RandSequence)
{
float3 Result;
#if RANDSEQ == RANDSEQ_PURERANDOM
Result.x = Rand(RandSequence.SampleSeed);
Result.y = Rand(RandSequence.SampleSeed);
Result.z = Rand(RandSequence.SampleSeed);
#elif RANDSEQ == RANDSEQ_HALTON
Result.x = Halton(RandSequence.SampleIndex, Prime512(RandSequence.SampleSeed + 0));
Result.y = Halton(RandSequence.SampleIndex, Prime512(RandSequence.SampleSeed + 1));
Result.z = Halton(RandSequence.SampleIndex, Prime512(RandSequence.SampleSeed + 2));
RandSequence.SampleSeed += 3;
#elif RANDSEQ == RANDSEQ_OWENSOBOL
// rely on compiler to optimize out dead code for now
// #dxr_todo: benchmark and specialize if needed
Result = SobolSampler(RandSequence.SampleIndex, RandSequence.SampleSeed).xyz;
#elif RANDSEQ == RANDSEQ_LATTICE
Result = LatticeSampler(RandSequence.SampleIndex, RandSequence.SampleSeed).xyz;
#else
#error "Unknown random sequence chosen in path tracer"
#endif
return Result;
}
float4 RandomSequence_GenerateSample4D(inout RandomSequence RandSequence)
{
float4 Result;
#if RANDSEQ == RANDSEQ_PURERANDOM
Result.x = Rand(RandSequence.SampleSeed);
Result.y = Rand(RandSequence.SampleSeed);
Result.z = Rand(RandSequence.SampleSeed);
Result.w = Rand(RandSequence.SampleSeed);
#elif RANDSEQ == RANDSEQ_HALTON
Result.x = Halton(RandSequence.SampleIndex, Prime512(RandSequence.SampleSeed + 0));
Result.y = Halton(RandSequence.SampleIndex, Prime512(RandSequence.SampleSeed + 1));
Result.z = Halton(RandSequence.SampleIndex, Prime512(RandSequence.SampleSeed + 2));
Result.w = Halton(RandSequence.SampleIndex, Prime512(RandSequence.SampleSeed + 3));
RandSequence.SampleSeed += 4;
#elif RANDSEQ == RANDSEQ_OWENSOBOL
Result = SobolSampler(RandSequence.SampleIndex, RandSequence.SampleSeed);
#elif RANDSEQ == RANDSEQ_LATTICE
Result = LatticeSampler(RandSequence.SampleIndex, RandSequence.SampleSeed);
#else
#error "Unknown random sequence chosen in path tracer"
#endif
return Result;
}