// Copyright Epic Games, Inc. All Rights Reserved. /*============================================================================================= PathTracingRandomSequence.ush: Reference path tracing ===============================================================================================*/ #pragma once #include "../../MortonCode.ush" #define RANDSEQ_PURERANDOM 0 #define RANDSEQ_HALTON 1 #define RANDSEQ_OWENSOBOL 2 #define RANDSEQ_LATTICE 3 // Select a default if none was specified #ifndef RANDSEQ #define RANDSEQ RANDSEQ_OWENSOBOL #endif #ifndef RANDSEQ_RANDOMIZED #define RANDSEQ_RANDOMIZED 1 // 1 to randomize per pixel, 0 to share the same sequence in all pixels (useful to benchmark coherent sampling) #endif #ifndef RANDSEQ_ERROR_DIFFUSION #define RANDSEQ_ERROR_DIFFUSION 1 // Enabled by default, only works when using RANDSEQ_OWENSOBOL or RANDSEQ_LATTICE #endif #ifndef RANDSEQ_UNROLL_SOBOL #define RANDSEQ_UNROLL_SOBOL 1 // Use unrolled Sobol loop such that table can be inline constants. #endif // This hash mixes bits at the theoretical bias limit uint PerfectIntegerHash(uint x) { // https://nullprogram.com/blog/2018/07/31/ // exact bias: 0.020888578919738908 x ^= x >> 17; x *= 0xed5ad4bbu; x ^= x >> 11; x *= 0xac4c1b51u; x ^= x >> 15; x *= 0x31848babu; x ^= x >> 14; return x; } // High quality integer hash - this mixes bits almost perfectly uint StrongIntegerHash(uint x) { // From https://github.com/skeeto/hash-prospector // bias = 0.16540778981744320 x ^= x >> 16; x *= 0xa812d533; x ^= x >> 15; x *= 0xb278e4ad; x ^= x >> 17; return x; } // This is a much weaker hash, but is faster and can be used to drive other hashes uint WeakIntegerHash(uint x) { // Generated using https://github.com/skeeto/hash-prospector // Estimated Bias ~583 x *= 0x92955555u; x ^= x >> 15; return x; } struct RandomSequence { uint SampleIndex; // index into the random sequence uint SampleSeed; // changes as we draw samples to reflect the change in dimension }; float Rand(inout uint Seed) { // Counter based PRNG -- safer than most small-state PRNGs since we use the random values directly here. Seed += 1; uint Output = StrongIntegerHash(Seed); // take low 24 bits return (Output & 0xFFFFFF) * 5.96046447754e-08; // * 2^-24 } // #dxr_todo: convert prime factors to constant buffer static const uint Primes512LUT[] = { 2,3,5,7,11,13,17,19,23,29, 31,37,41,43,47,53,59,61,67,71, 73,79,83,89,97,101,103,107,109,113, 127,131,137,139,149,151,157,163,167,173, 179,181,191,193,197,199,211,223,227,229, 233,239,241,251,257,263,269,271,277,281, 283,293,307,311,313,317,331,337,347,349, 353,359,367,373,379,383,389,397,401,409, 419,421,431,433,439,443,449,457,461,463, 467,479,487,491,499,503,509,521,523,541, 547,557,563,569,571,577,587,593,599,601, 607,613,617,619,631,641,643,647,653,659, 661,673,677,683,691,701,709,719,727,733, 739,743,751,757,761,769,773,787,797,809, 811,821,823,827,829,839,853,857,859,863, 877,881,883,887,907,911,919,929,937,941, 947,953,967,971,977,983,991,997,1009,1013, 1019,1021,1031,1033,1039,1049,1051,1061,1063,1069, 1087,1091,1093,1097,1103,1109,1117,1123,1129,1151, 1153,1163,1171,1181,1187,1193,1201,1213,1217,1223, 1229,1231,1237,1249,1259,1277,1279,1283,1289,1291, 1297,1301,1303,1307,1319,1321,1327,1361,1367,1373, 1381,1399,1409,1423,1427,1429,1433,1439,1447,1451, 1453,1459,1471,1481,1483,1487,1489,1493,1499,1511, 1523,1531,1543,1549,1553,1559,1567,1571,1579,1583, 1597,1601,1607,1609,1613,1619,1621,1627,1637,1657, 1663,1667,1669,1693,1697,1699,1709,1721,1723,1733, 1741,1747,1753,1759,1777,1783,1787,1789,1801,1811, 1823,1831,1847,1861,1867,1871,1873,1877,1879,1889, 1901,1907,1913,1931,1933,1949,1951,1973,1979,1987, 1993,1997,1999,2003,2011,2017,2027,2029,2039,2053, 2063,2069,2081,2083,2087,2089,2099,2111,2113,2129, 2131,2137,2141,2143,2153,2161,2179,2203,2207,2213, 2221,2237,2239,2243,2251,2267,2269,2273,2281,2287, 2293,2297,2309,2311,2333,2339,2341,2347,2351,2357, 2371,2377,2381,2383,2389,2393,2399,2411,2417,2423, 2437,2441,2447,2459,2467,2473,2477,2503,2521,2531, 2539,2543,2549,2551,2557,2579,2591,2593,2609,2617, 2621,2633,2647,2657,2659,2663,2671,2677,2683,2687, 2689,2693,2699,2707,2711,2713,2719,2729,2731,2741, 2749,2753,2767,2777,2789,2791,2797,2801,2803,2819, 2833,2837,2843,2851,2857,2861,2879,2887,2897,2903, 2909,2917,2927,2939,2953,2957,2963,2969,2971,2999, 3001,3011,3019,3023,3037,3041,3049,3061,3067,3079, 3083,3089,3109,3119,3121,3137,3163,3167,3169,3181, 3187,3191,3203,3209,3217,3221,3229,3251,3253,3257, 3259,3271,3299,3301,3307,3313,3319,3323,3329,3331, 3343,3347,3359,3361,3371,3373,3389,3391,3407,3413, 3433,3449,3457,3461,3463,3467,3469,3491,3499,3511, 3517,3527,3529,3533,3539,3541,3547,3557,3559,3571, 3581,3583,3593,3607,3613,3617,3623,3631,3637,3643, 3659,3671 }; uint Prime512(uint Dimension) { return Primes512LUT[Dimension % 512]; } float Halton(uint Index, uint Base) { float r = 0.0; float f = 1.0; float BaseInv = 1.0 / Base; while (Index > 0) { f *= BaseInv; r += f * (Index % Base); Index /= Base; } return r; } uint FastOwenScrambling(uint Index, uint Seed) { #if 0 // reference implementation - scramble bits one at a time // loop below does not require pre-inverted bits, so undo the inversion assumed in the optimized case below Index = reversebits(Index); for (uint Mask = 1u << 31, Input = Index; Mask; Mask >>= 1) { Seed = PerfectIntegerHash(Seed); // randomize state Index ^= Seed & Mask; // flip output (depending on state) Seed ^= Input & Mask; // flip state (depending on input) } return Index; #else // Laine and Karras / Stratified Sampling for Stochastic Transparency / EGSR 2011 // The operations below will mix bits toward the left, so temporarily reverse the order // NOTE: This operation has been performed outside this call // Index = reversebits(Index); // This follows the basic construction from the paper above. The error-diffusion sampler which // tiles a single point set across the screen, makes it much easier to visually "see" the impact of the scrambling. // When the scrambling is not good enough, the structure of the space-filling curve is left behind. // Care must be taken to ensure that the scrambling is still unbiased on average though. I found some cases // that seemed to produce lower visual error but were actually biased due to not touching certain bits, // leading to correlations across dimensions. // After much experimentation with the hash prospector, I discovered the following two constants which appear to // give results as good (or better) than the original 4 xor-mul constants from the Laine/Karras paper. // It isn't entirely clear to me why some constants work better than others. Some hashes with slightly less // bias produced visibly more error or worse looking power spectra. // Estimates below from hash prospector for all hashes of the form: add,xmul=c0,xmul=c1 (with c0 and c1 being // the constants below). // Ran with score_quality=16 for about ~10000 random hashes // Average bias: ~727.02 // Best bias: ~723.05 // Worst bias: ~735.19 Index += Seed; // randomize the index by our seed (pushes bits toward the left) Index ^= Index * 0x9c117646u; Index ^= Index * 0xe0705d72u; // Undo the reverse so that we get left-to-right scrambling // thereby emulating owen-scrambling return reversebits(Index); #endif } // 32-bit Sobol matrices for dimension 1,2,3 from: // S. Joe and F. Y. Kuo, Constructing Sobol sequences with better two-dimensional projections, SIAM J. Sci. Comput. 30, 2635-2654 (2008) // https://web.maths.unsw.edu.au/~fkuo/sobol/ // NOTE: we don't bother storing dimension 0 since it is just a bit reversal // NOTE2: we don't bother storing dimension 1 either since it has a very simple pattern // NOTE3: the matrix elements are reversed to save one reverse in the owen scrambling static const uint2 SobolMatrices[] = { uint2(0x00000001, 0x00000001), uint2(0x00000003, 0x00000003), uint2(0x00000006, 0x00000004), uint2(0x00000009, 0x0000000a), uint2(0x00000017, 0x0000001f), uint2(0x0000003a, 0x0000002e), uint2(0x00000071, 0x00000045), uint2(0x000000a3, 0x000000c9), uint2(0x00000116, 0x0000011b), uint2(0x00000339, 0x000002a4), uint2(0x00000677, 0x0000079a), uint2(0x000009aa, 0x00000b67), uint2(0x00001601, 0x0000101e), uint2(0x00003903, 0x0000302d), uint2(0x00007706, 0x00004041), uint2(0x0000aa09, 0x0000a0c3), uint2(0x00010117, 0x0001f104), uint2(0x0003033a, 0x0002e28a), uint2(0x00060671, 0x000457df), uint2(0x000909a3, 0x000c9bae), uint2(0x00171616, 0x0011a105), uint2(0x003a3939, 0x002a7289), uint2(0x00717777, 0x0079e7db), uint2(0x00a3aaaa, 0x00b6dba4), uint2(0x01170001, 0x0100011a), uint2(0x033a0003, 0x030002a7), uint2(0x06710006, 0x0400079e), uint2(0x09a30009, 0x0a000b6d), uint2(0x16160017, 0x1f001001), uint2(0x3939003a, 0x2e003003), uint2(0x77770071, 0x45004004), uint2(0xaaaa00a3, 0xc900a00a) }; uint EvolveSobolSeed(inout uint Seed) { // constant from: https://www.pcg-random.org/posts/does-it-beat-the-minimal-standard.html const uint MCG_C = 2739110765; #if 0 return Seed += MCG_C; #elif 0 Seed += MCG_C; return Seed ^ (Seed >> 16); // scramble lower bits #elif 0 return PerfectIntegerHash(Seed += 1); #elif 0 return StrongIntegerHash(Seed += 1); #else // a slightly weaker hash is ok since this drives FastOwenScrambling which is itself a hash // Note that the Seed evolution is just an integer addition and the hash should optimize away // when a particular dimension is not used return WeakIntegerHash(Seed += MCG_C); #endif } float4 LatticeSampler(uint SampleIndex, inout uint Seed) { #if 1 // Same as FastOwenScrambling, but without the final reversebits uint LatticeIndex = SampleIndex + EvolveSobolSeed(Seed); LatticeIndex ^= LatticeIndex * 0x9c117646u; LatticeIndex ^= LatticeIndex * 0xe0705d72u; #else // This is a higher quality owen-scrambling hash but doesn't appear to make much difference // https://psychopath.io/post/2021_01_30_building_a_better_lk_hash uint LatticeIndex = SampleIndex, S = EvolveSobolSeed(Seed); LatticeIndex ^= LatticeIndex * 0x3d20adeau; LatticeIndex += S; LatticeIndex *= (S >> 16) | 1; LatticeIndex ^= LatticeIndex * 0x05526c56u; LatticeIndex ^= LatticeIndex * 0x53a22864u; #endif // Lattice parameters taken from: // Weighted compound integration rules with higher order convergence for all N // Fred J. Hickernell, Peter Kritzer, Frances Y. Kuo, Dirk Nuyens // Numerical Algorithms - February 2012 uint4 Result = LatticeIndex * uint4(1, 364981, 245389, 97823); return (Result >> 8) * 5.96046447754e-08; // * 2^-24 } float4 SobolSampler(uint SampleIndex, inout uint Seed) { // first scramble the index to decorelate from other 4-tuples uint SobolIndex = FastOwenScrambling(SampleIndex, EvolveSobolSeed(Seed)); // now get Sobol' point from this index uint4 Result = uint4(SobolIndex, SobolIndex, 0, 0); // y component can be computed without iteration // "An Implementation Algorithm of 2D Sobol Sequence Fast, Elegant, and Compact" // Abdalla Ahmed, EGSR 2024 // See listing (19) in the paper // The code is different here because we want the output to be bit-reversed, but // the methodology is the same Result.y ^= Result.y >> 16; Result.y ^= (Result.y & 0xFF00FF00) >> 8; Result.y ^= (Result.y & 0xF0F0F0F0) >> 4; Result.y ^= (Result.y & 0xCCCCCCCC) >> 2; Result.y ^= (Result.y & 0xAAAAAAAA) >> 1; #if RANDSEQ_UNROLL_SOBOL // unrolled version seems faster in most cases UNROLL for (uint b = 0; b < 32; b++) { uint IndexBit = (SobolIndex >> b) & 1; // bitfield extract #else // however a loop with early exit seems faster in the path tracing context (possibly due to low occupancy?) for (uint b = 0; SobolIndex; SobolIndex >>= 1, b++) { uint IndexBit = SobolIndex & 1; #endif Result.zw ^= IndexBit * SobolMatrices[b]; } // finally scramble the points to avoid structured artifacts Result.x = FastOwenScrambling(Result.x, EvolveSobolSeed(Seed)); Result.y = FastOwenScrambling(Result.y, EvolveSobolSeed(Seed)); Result.z = FastOwenScrambling(Result.z, EvolveSobolSeed(Seed)); Result.w = FastOwenScrambling(Result.w, EvolveSobolSeed(Seed)); // output as float in [0,1) taking care not to skew the distribution // due to the non-uniform spacing of floats in this range return (Result >> 8) * 5.96046447754e-08; // * 2^-24 } // This initializes a multi-dimensional sampler for a particular pixel. The TimeSeed parameter should be set // such that it changes per sample. See the function below when you know how many samples you plan on taking // for a particular integral. void RandomSequence_Initialize(inout RandomSequence RandSequence, uint PositionSeed, uint TimeSeed) { // optionally disable randomization per pixel so that all pixels follow the same path in primary sample space #if RANDSEQ_RANDOMIZED == 0 PositionSeed = 0; #endif #if RANDSEQ == RANDSEQ_PURERANDOM RandSequence.SampleIndex = 0; // not used RandSequence.SampleSeed = StrongIntegerHash(PositionSeed + StrongIntegerHash(TimeSeed)); #elif RANDSEQ == RANDSEQ_HALTON RandSequence.SampleIndex = TimeSeed + StrongIntegerHash(PositionSeed); RandSequence.SampleSeed = 0; // this sampler just counts dimensions #elif RANDSEQ == RANDSEQ_OWENSOBOL || RANDSEQ == RANDSEQ_LATTICE // pre-compute bit reversal needed for FastOwenScrambling since this index doesn't change RandSequence.SampleIndex = reversebits(TimeSeed); // change seed to get a unique sequence per pixel RandSequence.SampleSeed = StrongIntegerHash(PositionSeed); #else #error "Unknown random sequence chosen in path tracer" #endif } // This is an alternative initialization to the method above when you know how many samples you are planning on taking. This incorporates the // TimeSeed0 (this should usually be the frame number when using all samples in one frame, or should be the frame on which the first sample was // taken if taking one sample per frame like the path tracer). void RandomSequence_Initialize(inout RandomSequence RandSequence, uint2 PixelCoord, uint SampleIndex, uint TimeSeed0, uint MaxSamples) { #if RANDSEQ_ERROR_DIFFUSION == 1 && (RANDSEQ == RANDSEQ_OWENSOBOL || RANDSEQ == RANDSEQ_LATTICE) // The core idea is to use a single sobol pattern across the screen using a space filling curve to correlate nearby pixels // This achieves a much better error diffusion than the random sequence assignment of the method above // This algorithm is covered in the following paper: // "Screen-Space Blue-Noise Diffusion of Monte Carlo Sampling Error via Hierarchical Ordering of Pixels" // Siggraph Asia 2020 // http://abdallagafar.com/publications/zsampler/ // Improvements made in this version are: // - switching the z-order curve for a hilbert curve // - switching sobol points for owen-sobol points // - recognizing that the hierarchical scrambling proposed in the original paper is equivalent to Owen-Scrambling of the index // Downsides of this method: // - quality does not improve beyond the target sample count, making usage with adaptive sampling difficult // - error-diffusion property is only achieved at the target sample count, earlier samples look random (sometimes slightly worse than random) uint X = PixelCoord.x; uint Y = PixelCoord.y; #if 0 uint TileID = ((X & 255) + (Y & 255) * 256); // Scanline order #elif 0 uint TileID = ((Y & 1) ? 255 - (X & 255) : (X & 255)) + (Y & 255) * 256; // zig-zag curve #elif 0 uint TileID = MortonEncode(uint2(X, Y)); // Z-order curve #elif 0 // ordered dither curve uint TileID = MortonEncode(uint2(X ^ Y, X)); #elif 0 // pure random (breaks nice properties) uint TileID = StrongIntegerHash(X + StrongIntegerHash(Y)); #else // Hilbert curve: https://github.com/hcs0/Hackers-Delight/blob/master/hilbert/hil_s_from_xy.c.txt uint TileID = 0, HilbertState = 0; for (int i = 7; i >= 0; i--) { uint xi = (X >> i) & 1; uint yi = (Y >> i) & 1; uint Row = 8 * HilbertState + 4 * xi + 2 * yi; TileID = TileID * 4 + ((0x361E9CB4u >> Row) & 3); HilbertState = (0x8FE65831u >> Row) & 3; } #endif // Combine pixel-level and sample-level bits into the sample index (visible structure will be hidden by owen scrambling of the index) RandSequence.SampleIndex = reversebits((65536 * TimeSeed0 + TileID) * MaxSamples + SampleIndex); // progressive encodings (these kind of work, but quality is much worse) //RandSequence.SampleIndex = reversebits(MortonEncode(uint2(TimeSeed0, TileID)) * MaxSamples + SampleIndex); //RandSequence.SampleIndex = reversebits((MortonCode3(MaxSamples * TimeSeed0 + SampleIndex) * 4) | (MortonCode3(Y) * 2) | MortonCode3(X)); //RandSequence.SampleIndex = reversebits(MortonEncode(uint2(MortonEncode(uint2(X, Y)), SampleIndex))); RandSequence.SampleSeed = 0; // always use the same sequence #else // Error diffusion sampler is disabled, just use the simple init function instead RandomSequence_Initialize(RandSequence, PixelCoord.x + PixelCoord.y * 65536, TimeSeed0 * MaxSamples + SampleIndex); #endif } float RandomSequence_GenerateSample1D(inout RandomSequence RandSequence) { float Result; #if RANDSEQ == RANDSEQ_PURERANDOM Result = Rand(RandSequence.SampleSeed); #elif RANDSEQ == RANDSEQ_HALTON Result = Halton(RandSequence.SampleIndex, Prime512(RandSequence.SampleSeed + 0)); RandSequence.SampleSeed += 1; #elif RANDSEQ == RANDSEQ_OWENSOBOL // rely on compiler to optimize out dead code for now // #dxr_todo: benchmark and specialize if needed Result = SobolSampler(RandSequence.SampleIndex, RandSequence.SampleSeed).x; #elif RANDSEQ == RANDSEQ_LATTICE Result = LatticeSampler(RandSequence.SampleIndex, RandSequence.SampleSeed).x; #else #error "Unknown random sequence chosen in path tracer" #endif return Result; } float2 RandomSequence_GenerateSample2D(inout RandomSequence RandSequence) { float2 Result; #if RANDSEQ == RANDSEQ_PURERANDOM Result.x = Rand(RandSequence.SampleSeed); Result.y = Rand(RandSequence.SampleSeed); #elif RANDSEQ == RANDSEQ_HALTON Result.x = Halton(RandSequence.SampleIndex, Prime512(RandSequence.SampleSeed + 0)); Result.y = Halton(RandSequence.SampleIndex, Prime512(RandSequence.SampleSeed + 1)); RandSequence.SampleSeed += 2; #elif RANDSEQ == RANDSEQ_OWENSOBOL // rely on compiler to optimize out dead code for now // #dxr_todo: benchmark and specialize if needed Result = SobolSampler(RandSequence.SampleIndex, RandSequence.SampleSeed).xy; #elif RANDSEQ == RANDSEQ_LATTICE Result = LatticeSampler(RandSequence.SampleIndex, RandSequence.SampleSeed).xy; #else #error "Unknown random sequence chosen in path tracer" #endif return Result; } float3 RandomSequence_GenerateSample3D(inout RandomSequence RandSequence) { float3 Result; #if RANDSEQ == RANDSEQ_PURERANDOM Result.x = Rand(RandSequence.SampleSeed); Result.y = Rand(RandSequence.SampleSeed); Result.z = Rand(RandSequence.SampleSeed); #elif RANDSEQ == RANDSEQ_HALTON Result.x = Halton(RandSequence.SampleIndex, Prime512(RandSequence.SampleSeed + 0)); Result.y = Halton(RandSequence.SampleIndex, Prime512(RandSequence.SampleSeed + 1)); Result.z = Halton(RandSequence.SampleIndex, Prime512(RandSequence.SampleSeed + 2)); RandSequence.SampleSeed += 3; #elif RANDSEQ == RANDSEQ_OWENSOBOL // rely on compiler to optimize out dead code for now // #dxr_todo: benchmark and specialize if needed Result = SobolSampler(RandSequence.SampleIndex, RandSequence.SampleSeed).xyz; #elif RANDSEQ == RANDSEQ_LATTICE Result = LatticeSampler(RandSequence.SampleIndex, RandSequence.SampleSeed).xyz; #else #error "Unknown random sequence chosen in path tracer" #endif return Result; } float4 RandomSequence_GenerateSample4D(inout RandomSequence RandSequence) { float4 Result; #if RANDSEQ == RANDSEQ_PURERANDOM Result.x = Rand(RandSequence.SampleSeed); Result.y = Rand(RandSequence.SampleSeed); Result.z = Rand(RandSequence.SampleSeed); Result.w = Rand(RandSequence.SampleSeed); #elif RANDSEQ == RANDSEQ_HALTON Result.x = Halton(RandSequence.SampleIndex, Prime512(RandSequence.SampleSeed + 0)); Result.y = Halton(RandSequence.SampleIndex, Prime512(RandSequence.SampleSeed + 1)); Result.z = Halton(RandSequence.SampleIndex, Prime512(RandSequence.SampleSeed + 2)); Result.w = Halton(RandSequence.SampleIndex, Prime512(RandSequence.SampleSeed + 3)); RandSequence.SampleSeed += 4; #elif RANDSEQ == RANDSEQ_OWENSOBOL Result = SobolSampler(RandSequence.SampleIndex, RandSequence.SampleSeed); #elif RANDSEQ == RANDSEQ_LATTICE Result = LatticeSampler(RandSequence.SampleIndex, RandSequence.SampleSeed); #else #error "Unknown random sequence chosen in path tracer" #endif return Result; }