// Copyright Epic Games, Inc. All Rights Reserved.

/*=============================================================================
	Random.ush: Pseudo-random number generators.
=============================================================================*/

#pragma once

// Most frequently used functions in separate smaller headers
#include "RandomInterleavedGradientNoise.ush"
#include "RandomPCG.ush"

// Cheap ALU-only hash of a 2D position, usable in place of a noise texture lookup.
// @param xy should be an integer position (e.g. pixel position on the screen); the pattern repeats every 128x128 units
// @return fractional part of the hashed position
// ~13 ALU operations (3 frac, 6 *, 4 mad)
float PseudoRandom(float2 xy)
{
	// wrap into a 128x128 tile and shift by an empirically chosen offset
	const float2 Wrapped = frac(xy / 128.0f) * 128.0f;
	const float2 Shifted = Wrapped + float2(-64.340622f, -72.465622f);

	// (x*x, y*y, x*y) weighted by constants that were found by experimentation
	const float3 Products = Shifted.xyx * Shifted.xyy;
	return frac(dot(Products, float3(20.390625f, 60.703125f, 2.4281209f)));
}

// Fast per-pixel hash, result is the output of frac() so it lies in [0, 1[
// @param PixelPos integer pixel position
// @param Magic multiplier used by both hashing rounds
// ~10 ALU operations (2 frac, 5 *, 3 mad)
float RandFast( uint2 PixelPos, float Magic = 3571.0 )
{
	// scale the pixel position down and nudge x so that (0,0) does not hash to 0
	const float2 Scaled = ( 1.0 / 4320.0 ) * PixelPos + float2( 0.25, 0.0 );
	const float2 Squared = Scaled * Scaled;

	// first round: weighted sum of the squares, second round: square again
	const float FirstRound = frac( dot( Squared, Magic ) );
	return frac( FirstRound * FirstRound * (2 * Magic) );
}

// This is the largest prime < 2^12 so s*s will fit in a 24-bit floating point mantissa
#define BBS_PRIME24 4093

// Pseudo random number generator inspired by Blum-Blum-Shub
// http://www.umbc.edu/~olano/papers/mNoise.pdf
// A real BBS generator iterates ((s*s) mod M) on bignums, with M being the product of two huge Blum primes.
// Here a single prime M (BBS_PRIME24) is used instead, chosen just small enough not to overflow.
// Note that the paper above used 61, which fits in a half, but is unusably bad.
// @param seed Integer valued floating point seed
// @return random number in range [0,1)
// ~8 ALU operations (5 *, 3 frac)
float RandBBSfloat(float seed)
{
	// work on the state normalized by the modulus, so "mod M" turns into frac()
	const float Normalized = frac(seed / BBS_PRIME24);

	// two squaring rounds
	const float Round1 = frac(Normalized * Normalized * BBS_PRIME24);
	const float Round2 = frac(Round1 * Round1 * BBS_PRIME24);
	return Round2;
}

/**
 * Builds two axis vectors (U and V of a plane) that are perpendicular to a given normal.
 * Ported from UnMath.h
 *
 * @param In     plane normal (presumably expected to be unit length - confirm with callers)
 * @param Axis1  first axis: a world axis with its component along In removed, then normalized
 * @param Axis2  second axis: cross(Axis1, In)
 */
void FindBestAxisVectors(float3 In, out float3 Axis1, out float3 Axis2 )
{
	const float3 AbsIn = abs(In);

	// Start from the X axis when the normal is mostly along Z, from the Z axis otherwise,
	// so the starting axis is never (nearly) parallel to the normal.
	const bool bZDominant = (AbsIn.z > AbsIn.x) && (AbsIn.z > AbsIn.y);
	const float3 StartAxis = bZDominant ? float3(1, 0, 0) : float3(0, 0, 1);

	// Remove the part along the normal and renormalize.
	Axis1 = normalize(StartAxis - In * dot(StartAxis, In));
	Axis2 = cross(Axis1, In);
}

// References for noise:
//
// Improved Perlin noise
//   http://mrl.nyu.edu/~perlin/noise/
//   http://http.developer.nvidia.com/GPUGems/gpugems_ch05.html
// Modified Noise for Evaluation on Graphics Hardware
//   http://www.csee.umbc.edu/~olano/papers/mNoise.pdf
// Perlin Noise
//   http://mrl.nyu.edu/~perlin/doc/oscar.html
// Fast Gradient Noise
//   http://prettyprocs.wordpress.com/2012/10/20/fast-perlin-noise


// -------- ALU based method ---------

/*
 * Pseudo random number generator, based on "TEA, a tiny Encryption Algorithm"
 * http://citeseer.ist.psu.edu/viewdoc/download?doi=10.1.1.45.281&rep=rep1&type=pdf
 * http://www.umbc.edu/~olano/papers/index.html#GPUTEA
 * @param v - old seed (full 32bit range)
 * @param IterationCount - >=1, bigger numbers cost more performance but improve quality
 * @return new seed
 */
uint2 ScrambleTEA(uint2 v, uint IterationCount = 3)
{
	// Key words: arbitrary random data, these values have been used by others and seem to work well
	const uint Key0 = 0xA341316Cu;
	const uint Key1 = 0xC8013EA4u;
	const uint Key2 = 0xAD90777Du;
	const uint Key3 = 0x7E95761Eu;

	// Per-round increment of the running sum
	const uint Delta = 0x9e3779b9;

	uint Lo = v.x;
	uint Hi = v.y;
	uint RunningSum = 0;

	UNROLL for(uint Round = 0; Round < IterationCount; ++Round)
	{
		RunningSum += Delta;
		// each half is mixed with the other one, the second line already sees the updated Lo
		Lo += ((Hi << 4u) + Key0) ^ (Hi + RunningSum) ^ ((Hi >> 5u) + Key1);
		Hi += ((Lo << 4u) + Key2) ^ (Lo + RunningSum) ^ ((Lo >> 5u) + Key3);
	}

	return uint2(Lo, Hi);
}

// Optionally wraps a noise coordinate so that the noise can be baked into a tiling texture
// @param v = unwrapped texture parameter
// @param bTiling = true to tile, false to not tile
// @param RepeatSize = number of units before repeating
// @return v wrapped into [0, RepeatSize) when tiling, the unmodified v otherwise
float3 NoiseTileWrap(float3 v,  bool bTiling, float RepeatSize)
{
	if (bTiling)
	{
		return frac(v / RepeatSize) * RepeatSize;
	}

	return v;
}

// Smooth interpolation ramp for Perlin noise: 6t^5 - 15t^4 + 10t^3
// only needed by Perlin functions in this file
// scalar(per component): 2 add, 5 mul
float4 PerlinRamp(float4 t)
{
	const float4 Cubed = t * t * t;
	const float4 Inner = t * (t * 6 - 15) + 10;
	return Cubed * Inner;
}

// Analytical derivative of the PerlinRamp polynomial: 30t^4 - 60t^3 + 30t^2
// only needed by Perlin functions in this file
// scalar(per component): 2 add, 4 mul
float4 PerlinRampDerivative(float4 t)
{
	const float4 Squared = t * t;
	const float4 Inner = t * (t * 30 - 60) + 30;
	return Squared * Inner;
}

#define MGradientMask int3(0x8000, 0x4000, 0x2000)
#define MGradientScale float3(1. / 0x4000, 1. / 0x2000, 1. / 0x1000)
// Modified noise gradient term
// Three bits of the hash (MGradientMask) select the sign of each axis, so with MGradientScale
// every component of the direction ends up as either -1 or +1.
// @param seed - random seed for integer lattice position
// @param offset - [-1,1] offset of evaluation point from lattice point
// @return gradient direction (xyz) and contribution (w) from this lattice point
float4 MGradient(int seed, float3 offset)
{
	const uint HashBits = Rand3DPCG16(int3(seed,0,0)).x;

	// masked bit -> 0 or 2 after scaling -> -1 or +1 after the bias
	const float3 MaskedBits = float3(HashBits.xxx & MGradientMask);
	const float3 direction = MaskedBits * MGradientScale - 1;

	return float4(direction, dot(direction, offset));
}

// Computes the hash seeds of the eight lattice corners surrounding a noise position
// (used by Perlin and related noise functions)
// @param v = 3D noise argument, use float3(x,y,0) for 2D or float3(x,0,0) for 1D
// @param bTiling = true to return seed values for a repeating noise pattern
// @param RepeatSize = integer units before tiling in each dimension
// @param seed000-seed111 = hash function seeds for the eight corners (digits are the x, y, z corner offsets)
// @return fractional part of v
float3 NoiseSeeds(float3 v, bool bTiling, float RepeatSize,
	out float seed000, out float seed001, out float seed010, out float seed011,
	out float seed100, out float seed101, out float seed110, out float seed111)
{
	// one prime per axis turns a lattice position into a scalar seed
	const float3 AxisPrimes = float3(19, 47, 101);
	const float3 Cell = floor(v);

	if (!bTiling)
	{
		// Without wrapping the seed is linear in the lattice position,
		// so each corner is the base seed plus the prime(s) of the axes it is offset along.
		seed000 = dot(Cell, AxisPrimes);
		seed100 = seed000 + AxisPrimes.x;
		seed010 = seed000 + AxisPrimes.y;
		seed110 = seed100 + AxisPrimes.y;
		seed001 = seed000 + AxisPrimes.z;
		seed101 = seed100 + AxisPrimes.z;
		seed011 = seed010 + AxisPrimes.z;
		seed111 = seed110 + AxisPrimes.z;
	}
	else
	{
		// Each corner has to be wrapped on its own before hashing,
		// so the offsets can't be algebraically combined with the primes.
		seed000 = dot(AxisPrimes, NoiseTileWrap(Cell, true, RepeatSize));
		seed100 = dot(AxisPrimes, NoiseTileWrap(Cell + float3(1, 0, 0), true, RepeatSize));
		seed010 = dot(AxisPrimes, NoiseTileWrap(Cell + float3(0, 1, 0), true, RepeatSize));
		seed110 = dot(AxisPrimes, NoiseTileWrap(Cell + float3(1, 1, 0), true, RepeatSize));
		seed001 = dot(AxisPrimes, NoiseTileWrap(Cell + float3(0, 0, 1), true, RepeatSize));
		seed101 = dot(AxisPrimes, NoiseTileWrap(Cell + float3(1, 0, 1), true, RepeatSize));
		seed011 = dot(AxisPrimes, NoiseTileWrap(Cell + float3(0, 1, 1), true, RepeatSize));
		seed111 = dot(AxisPrimes, NoiseTileWrap(Cell + float3(1, 1, 1), true, RepeatSize));
	}

	return frac(v);
}

// Perlin-style "Modified Noise", computed with ALU instructions only
// http://www.umbc.edu/~olano/papers/index.html#mNoise
// @param v = 3D noise argument, use float3(x,y,0) for 2D or float3(x,0,0) for 1D
// @param bTiling = repeat noise pattern
// @param RepeatSize = integer units before tiling in each dimension
// @return random number in the range -1 .. 1
float GradientNoise3D_ALU(float3 v, bool bTiling, float RepeatSize)
{
	float seed000, seed001, seed010, seed011, seed100, seed101, seed110, seed111;
	const float3 CellPos = NoiseSeeds(v, bTiling, RepeatSize, seed000, seed001, seed010, seed011, seed100, seed101, seed110, seed111);

	// gradient contribution of each of the eight corners (w of MGradient)
	const float n000 = MGradient(int(seed000), CellPos - float3(0, 0, 0)).w;
	const float n100 = MGradient(int(seed100), CellPos - float3(1, 0, 0)).w;
	const float n010 = MGradient(int(seed010), CellPos - float3(0, 1, 0)).w;
	const float n110 = MGradient(int(seed110), CellPos - float3(1, 1, 0)).w;
	const float n001 = MGradient(int(seed001), CellPos - float3(0, 0, 1)).w;
	const float n101 = MGradient(int(seed101), CellPos - float3(1, 0, 1)).w;
	const float n011 = MGradient(int(seed011), CellPos - float3(0, 1, 1)).w;
	const float n111 = MGradient(int(seed111), CellPos - float3(1, 1, 1)).w;

	const float3 Blend = PerlinRamp(float4(CellPos, 0)).xyz;

	// trilinear blend: along x first, then y, then z
	const float x00 = lerp(n000, n100, Blend.x);
	const float x10 = lerp(n010, n110, Blend.x);
	const float x01 = lerp(n001, n101, Blend.x);
	const float x11 = lerp(n011, n111, Blend.x);

	const float SliceZ0 = lerp(x00, x10, Blend.y);
	const float SliceZ1 = lerp(x01, x11, Blend.y);

	return lerp(SliceZ0, SliceZ1, Blend.z);
}

// Corner positions of the Simplex tetrahedron that contains a point
// Based on McEwan et al., Efficient computation of noise in GLSL, JGT 2011
// @param v = 3D noise argument
// @return 4 corner locations, one per row
float4x3 SimplexCorners(float3 v)
{
	// base corner: skew to tetrahedral space, snap to the lattice, and skew back
	const float3 SkewedCell = floor(v + v.x/3 + v.y/3 + v.z/3);
	const float3 base = SkewedCell - SkewedCell.x/6 - SkewedCell.y/6 - SkewedCell.z/6;
	const float3 Local = v - base;

	// Offsets to the two middle corners follow from the ordering of the local coordinates.
	// (McEwan did this in tetrahedral space, but since skew is along x=y=z axis,
	// this works in Euclidean space too.)
	const float3 IsGreater = step(Local.yzx, Local.xyz);
	const float3 IsLess = 1 - IsGreater.zxy;
	const float3 Offset1 = min(IsGreater, IsLess) - 1. / 6.;
	const float3 Offset2 = max(IsGreater, IsLess) - 1. / 3.;

	// four corners
	return float4x3(base, base + Offset1, base + Offset2, base + 0.5);
}

// Improved smoothing function for simplex noise
// @param f = fractional distance to four tetrahedral corners (one corner per row)
// @return weight for each corner
float4 SimplexSmooth(float4x3 f)
{
	const float scale = 1024. / 375.;	// scale factor to make noise -1..1

	// squared distance to each corner
	const float4 DistSq = float4(dot(f[0], f[0]), dot(f[1], f[1]), dot(f[2], f[2]), dot(f[3], f[3]));
	const float4 s = saturate(2 * DistSq);

	// cubic falloff in s, written in Horner form
	const float4 Weight = 1 * scale + s*(-3 * scale + s*(3 * scale - s*scale));
	return Weight;
}

// Derivative of simplex noise smoothing function
// @param f = fractional distance to four tetrahedral corners (one corner per row)
// @return derivative of smoothing function for each corner by x (row 0), y (row 1) and z (row 2)
float3x4 SimplexDSmooth(float4x3 f)
{
	const float scale = 1024. / 375.;	// scale factor to make noise -1..1

	// squared distance to each corner
	const float4 DistSq = float4(dot(f[0], f[0]), dot(f[1], f[1]), dot(f[2], f[2]), dot(f[3], f[3]));
	const float4 s = saturate(2 * DistSq);

	// per corner factor that is multiplied with the corner offset below
	const float4 Slope = -12 * scale + s*(24 * scale - s * 12 * scale);

	// gather the x, y and z offsets of all four corners
	const float4 OffsetsX = float4(f[0][0], f[1][0], f[2][0], f[3][0]);
	const float4 OffsetsY = float4(f[0][1], f[1][1], f[2][1], f[3][1]);
	const float4 OffsetsZ = float4(f[0][2], f[1][2], f[2][2], f[3][2]);

	return float3x4(
		Slope * OffsetsX,
		Slope * OffsetsY,
		Slope * OffsetsZ);
}

// Simplex noise and its Jacobian derivative
// @param v = 3D noise argument
// @param bTiling = whether to repeat noise pattern
// @param RepeatSize = integer units before tiling in each dimension, must be a multiple of 3
// @return float3x3 Jacobian in J[*].xyz, vector noise in J[*].w
//     J[0].w, J[1].w, J[2].w is a Perlin-style simplex noise with vector output, e.g. (Nx, Ny, Nz)
//     J[i].x is X derivative of the i'th component of the noise so J[2].x is dNz/dx
// You can use this to compute the noise, gradient, curl, or divergence:
//   float3x4 J = JacobianSimplex_ALU(...);
//   float3 VNoise = float3(J[0].w, J[1].w, J[2].w);	// 3D noise
//   float3 Grad = J[0].xyz;							// gradient of J[0].w
//   float3 Curl = float3(J[2][1]-J[1][2], J[0][2]-J[2][0], J[1][0]-J[0][1]);	// (dNz/dy-dNy/dz, dNx/dz-dNz/dx, dNy/dx-dNx/dy)
//   float Div = J[0][0]+J[1][1]+J[2][2];
// All of these are confirmed to compile out all unneeded terms.
// So Grad of X doesn't compute Y or Z components, and VNoise doesn't do any of the derivative computation.
float3x4 JacobianSimplex_ALU(float3 v, bool bTiling, float RepeatSize)
{
	// corners of tetrahedron
	float4x3 T = SimplexCorners(v);
	// rand: one hash per output noise component (x, y, z) for the current corner
	uint3 rand;
	// gvec[c][k]: gradient vector of noise component c at corner k
	// fv[k]: offset from corner k to the evaluation point
	float4x3 gvec[3], fv;
	// grad[c][k]: dot(gvec[c][k], fv[k]), i.e. contribution of corner k to component c
	float3x4 grad;

	// processing of tetrahedral vertices, unrolled
	// to compute gradient at each corner
	// (6 * T + 0.5 followed by floor is used as the integer hash input for each corner)
	fv[0] = v - T[0];
	rand = Rand3DPCG16(int3(floor(NoiseTileWrap(6 * T[0] + 0.5, bTiling, RepeatSize))));
	gvec[0][0] = float3(rand.xxx & MGradientMask) * MGradientScale - 1;
	gvec[1][0] = float3(rand.yyy & MGradientMask) * MGradientScale - 1;
	gvec[2][0] = float3(rand.zzz & MGradientMask) * MGradientScale - 1;
	grad[0][0] = dot(gvec[0][0], fv[0]);
	grad[1][0] = dot(gvec[1][0], fv[0]);
	grad[2][0] = dot(gvec[2][0], fv[0]);

	// second corner
	fv[1] = v - T[1];
	rand = Rand3DPCG16(int3(floor(NoiseTileWrap(6 * T[1] + 0.5, bTiling, RepeatSize))));
	gvec[0][1] = float3(rand.xxx & MGradientMask) * MGradientScale - 1;
	gvec[1][1] = float3(rand.yyy & MGradientMask) * MGradientScale - 1;
	gvec[2][1] = float3(rand.zzz & MGradientMask) * MGradientScale - 1;
	grad[0][1] = dot(gvec[0][1], fv[1]);
	grad[1][1] = dot(gvec[1][1], fv[1]);
	grad[2][1] = dot(gvec[2][1], fv[1]);

	// third corner
	fv[2] = v - T[2];
	rand = Rand3DPCG16(int3(floor(NoiseTileWrap(6 * T[2] + 0.5, bTiling, RepeatSize))));
	gvec[0][2] = float3(rand.xxx & MGradientMask) * MGradientScale - 1;
	gvec[1][2] = float3(rand.yyy & MGradientMask) * MGradientScale - 1;
	gvec[2][2] = float3(rand.zzz & MGradientMask) * MGradientScale - 1;
	grad[0][2] = dot(gvec[0][2], fv[2]);
	grad[1][2] = dot(gvec[1][2], fv[2]);
	grad[2][2] = dot(gvec[2][2], fv[2]);

	// fourth corner
	fv[3] = v - T[3];
	rand = Rand3DPCG16(int3(floor(NoiseTileWrap(6 * T[3] + 0.5, bTiling, RepeatSize))));
	gvec[0][3] = float3(rand.xxx & MGradientMask) * MGradientScale - 1;
	gvec[1][3] = float3(rand.yyy & MGradientMask) * MGradientScale - 1;
	gvec[2][3] = float3(rand.zzz & MGradientMask) * MGradientScale - 1;
	grad[0][3] = dot(gvec[0][3], fv[3]);
	grad[1][3] = dot(gvec[1][3], fv[3]);
	grad[2][3] = dot(gvec[2][3], fv[3]);

	// blend gradients
	// sv: smoothing weight per corner, ds: its derivative by x, y and z per corner
	float4 sv = SimplexSmooth(fv);
	float3x4 ds = SimplexDSmooth(fv);

	// Per noise component: xyz is the derivative of sum(sv * grad) by the product rule
	// (weights times gradient vectors plus weight derivatives times contributions),
	// w is the noise value itself.
	float3x4 jacobian;
	jacobian[0] = float4(mul(sv, gvec[0]) + mul(ds, grad[0]), dot(sv, grad[0]));
	jacobian[1] = float4(mul(sv, gvec[1]) + mul(ds, grad[1]), dot(sv, grad[1]));
	jacobian[2] = float4(mul(sv, gvec[2]) + mul(ds, grad[2]), dot(sv, grad[2]));

	return jacobian;
}

// 3D value noise (random value per lattice corner, smoothly interpolated)
// - used to be incorrectly called Perlin noise
// @param v = 3D noise argument, use float3(x,y,0) for 2D or float3(x,0,0) for 1D
// @param bTiling = repeat noise pattern
// @param RepeatSize = integer units before tiling in each dimension
// @return random number in the range -1 .. 1
float ValueNoise3D_ALU(float3 v, bool bTiling, float RepeatSize)
{
	float seed000, seed001, seed010, seed011, seed100, seed101, seed110, seed111;
	const float3 CellPos = NoiseSeeds(v, bTiling, RepeatSize, seed000, seed001, seed010, seed011, seed100, seed101, seed110, seed111);

	// random value per corner, remapped from [0,1) to [-1,1)
	const float n000 = RandBBSfloat(seed000) * 2 - 1;
	const float n100 = RandBBSfloat(seed100) * 2 - 1;
	const float n010 = RandBBSfloat(seed010) * 2 - 1;
	const float n110 = RandBBSfloat(seed110) * 2 - 1;
	const float n001 = RandBBSfloat(seed001) * 2 - 1;
	const float n101 = RandBBSfloat(seed101) * 2 - 1;
	const float n011 = RandBBSfloat(seed011) * 2 - 1;
	const float n111 = RandBBSfloat(seed111) * 2 - 1;

	const float3 Blend = PerlinRamp(float4(CellPos, 0)).xyz;

	// trilinear blend: along x first, then y, then z
	const float x00 = lerp(n000, n100, Blend.x);
	const float x10 = lerp(n010, n110, Blend.x);
	const float x01 = lerp(n001, n101, Blend.x);
	const float x11 = lerp(n011, n111, Blend.x);

	const float SliceZ0 = lerp(x00, x10, Blend.y);
	const float SliceZ1 = lerp(x01, x11, Blend.y);

	return lerp(SliceZ0, SliceZ1, Blend.z);
}


// -------- TEX based methods ---------

// filtered 3D gradient noise using gradients stored in View.PerlinNoiseGradientTexture, can be optimized
// The 2D texture is addressed as if it was 3D by shifting the xy lookup by a z dependent offset (ZShear).
// @param v = 3D noise argument, use float3(x,y,0) for 2D or float3(x,0,0) for 1D
// @param bTiling = repeat noise pattern (NOTE: currently ignored, see first line of the body)
// @param RepeatSize = integer units before tiling in each dimension
// @return random number in the range -1 .. 1
float GradientNoise3D_TEX(float3 v, bool bTiling, float RepeatSize)
{
	// NOTE(review): this overrides the caller's value, so the function always tiles with RepeatSize
	// and the non-tiling paths below are never taken - confirm whether this is intentional.
	bTiling = true;
	float3 fv = frac(v);
	// iv0 / iv1: wrapped lattice coordinates of the lower / upper corner of the cell
	float3 iv0 = NoiseTileWrap(floor(v), bTiling, RepeatSize);
	float3 iv1 = NoiseTileWrap(iv0 + 1, bTiling, RepeatSize);

	// per z step offset in x and y used to pick a different region of the 2D texture
	const int2 ZShear = int2(17, 89);
	
	float2 OffsetA = iv0.z * ZShear;
	float2 OffsetB = OffsetA + ZShear;	// non-tiling, use relative offset
	if (bTiling)						// tiling, have to compute from wrapped coordinates
	{
		OffsetB = iv1.z * ZShear;
	}

	// Texture size scale factor
	float ts = 1 / 128.0f;

	// texture coordinates for iv0.xy, as offset for both z slices
	// (+ 0.5 addresses the texel center)
	float2 TexA0 = (iv0.xy + OffsetA + 0.5f) * ts;
	float2 TexB0 = (iv0.xy + OffsetB + 0.5f) * ts;

	// texture coordinates for iv1.xy, as offset for both z slices
	float2 TexA1 = TexA0 + ts;	// for non-tiling, can compute relative to existing coordinates
	float2 TexB1 = TexB0 + ts;
	if (bTiling)				// for tiling, need to compute from wrapped coordinates
	{
		TexA1 = (iv1.xy + OffsetA + 0.5f) * ts;
		TexB1 = (iv1.xy + OffsetB + 0.5f) * ts;
	}


	// can be optimized to 1 or 2 texture lookups (4 or 8 channel encoded in 8, 16 or 32 bit)
	// A-D: gradients of the four corners in the lower z slice, E-H: same for the upper z slice,
	// remapped from the stored 0..1 range to -1..1
	float3 A = Texture2DSampleLevel(View.PerlinNoiseGradientTexture, View.PerlinNoiseGradientTextureSampler, float2(TexA0.x, TexA0.y), 0).xyz * 2 - 1;
	float3 B = Texture2DSampleLevel(View.PerlinNoiseGradientTexture, View.PerlinNoiseGradientTextureSampler, float2(TexA1.x, TexA0.y), 0).xyz * 2 - 1;
	float3 C = Texture2DSampleLevel(View.PerlinNoiseGradientTexture, View.PerlinNoiseGradientTextureSampler, float2(TexA0.x, TexA1.y), 0).xyz * 2 - 1;
	float3 D = Texture2DSampleLevel(View.PerlinNoiseGradientTexture, View.PerlinNoiseGradientTextureSampler, float2(TexA1.x, TexA1.y), 0).xyz * 2 - 1;
	float3 E = Texture2DSampleLevel(View.PerlinNoiseGradientTexture, View.PerlinNoiseGradientTextureSampler, float2(TexB0.x, TexB0.y), 0).xyz * 2 - 1;
	float3 F = Texture2DSampleLevel(View.PerlinNoiseGradientTexture, View.PerlinNoiseGradientTextureSampler, float2(TexB1.x, TexB0.y), 0).xyz * 2 - 1;
	float3 G = Texture2DSampleLevel(View.PerlinNoiseGradientTexture, View.PerlinNoiseGradientTextureSampler, float2(TexB0.x, TexB1.y), 0).xyz * 2 - 1;
	float3 H = Texture2DSampleLevel(View.PerlinNoiseGradientTexture, View.PerlinNoiseGradientTextureSampler, float2(TexB1.x, TexB1.y), 0).xyz * 2 - 1;

	// contribution of each corner: gradient projected onto the offset from that corner
	float a = dot(A, fv - float3(0, 0, 0));
	float b = dot(B, fv - float3(1, 0, 0));
	float c = dot(C, fv - float3(0, 1, 0));
	float d = dot(D, fv - float3(1, 1, 0));
	float e = dot(E, fv - float3(0, 0, 1));
	float f = dot(F, fv - float3(1, 0, 1));
	float g = dot(G, fv - float3(0, 1, 1));
	float h = dot(H, fv - float3(1, 1, 1));

	// fv is already a fractional part, so the extra frac() here does not change it
	float3 Weights = PerlinRamp(frac(float4(fv, 0))).xyz;
	
	// trilinear blend: along x first, then y, then z
	float i = lerp(lerp(a, b, Weights.x), lerp(c, d, Weights.x), Weights.y);
	float j = lerp(lerp(e, f, Weights.x), lerp(g, h, Weights.x), Weights.y);

	return lerp(i, j, Weights.z);
}

// Gradient Perlin noise from a single filtered lookup into View.PerlinNoise3DTexture
// @return random number in the range -1 .. 1
// scalar: 6 frac, 31 mul/mad, 15 add, 
float FastGradientPerlinNoise3D_TEX(float3 xyz)
{
	// needs to be the same value when creating the PerlinNoise3D texture
	float Extent = 16;

	// last texel replicated and needed for filtering, so the pattern repeats every (Extent - 1) units
	// scalar: 3 frac, 6 mul
	const float Period = Extent - 1;
	xyz = frac(xyz / Period) * Period;

	// split into position inside the cell and cell origin (= floor(xyz))
	// scalar: 3 frac, 3 add
	const float3 CellFrac = frac(xyz);
	const float3 CellOrigin = xyz - CellFrac;

	// Perlin ramp on the fractional part; it has a continuous second derivative which is better for bump mapping
	// (the original perlin hermite 3t^2 - 2t^3 is ok when used without bump mapping)
	// scalar: 2*3 add 5*3 mul
	const float3 Ramped = PerlinRamp(float4(CellFrac, 0)).xyz;

	// scalar: 3 add
	const float3 LookupPos = CellOrigin + Ramped;

	// +0.5f to get rid of bilinear offset
	// scalar: 3 mad
	const float3 LookupUVW = LookupPos / Extent + 0.5f / Extent;
	const float4 Packed = Texture3DSampleLevel(View.PerlinNoise3DTexture, View.PerlinNoise3DTextureSampler, LookupUVW, 0);

	// reconstruct from 8bit (using mad with 2 constants and dot4 was same instruction count)
	// scalar: 4 mad, 3 mul, 3 add 
	const float3 Gradient = Packed.xyz * 255.0f / 127.0f - 1.0f;
	const float PlaneOffset = Packed.w * 255.f - 127;
	return dot(xyz, Gradient) - PlaneOffset;
}


// 3D jitter offset within a voronoi noise cell
// @param pos - integer lattice corner
// @param Quality - selects how far the seed point may move away from the lattice corner:
//     1 or 2: on a sphere, scaled so that a 2x2x2 search is guaranteed to find the jittered points
//     3:      on a sphere, scaled so that a 3x3x3 search is guaranteed to find the jittered points
//     4:      anywhere in the cell, needs a 4x4x4 search
// @return random offsets vector
float3 VoronoiCornerSample(float3 pos, int Quality)
{
	// random values in [-0.5, 0.5] (assuming Rand3DPCG16 returns 16 bit values)
	const float3 Jitter = float3(Rand3DPCG16(int3(pos))) / 0xffff - 0.5;

	// quality level 4 (and above): use the raw jitter
	if (Quality > 3)
	{
		return Jitter;
	}

	// lower quality levels: push the point onto a sphere whose radius depends on the search size
	const float SphereRadius = (Quality == 3) ? 0.3090 : 0.2588;
	return normalize(Jitter) * SphereRadius;
}

// Keeps the closer one of the previous best seed point and a new candidate
// not producing point locations makes it easier for compiler to eliminate calculations when they're not needed
// @param minval = location (xyz) and squared distance (w) of best candidate seed point before the new one
// @param candidate = candidate seed point
// @param offset = 3D offset to new candidate seed point
// @param bDistanceOnly = if true, only the returned w is meaningful (xyz is 0), otherwise w is distance and xyz is position
// @return position (if bDistanceOnly is false) and squared distance to closest seed point so far
float4 VoronoiCompare(float4 minval, float3 candidate, float3 offset, bool bDistanceOnly)
{
	const float CandidateDistSq = dot(offset, offset);

	if (bDistanceOnly)
	{
		return float4(0, 0, 0, min(minval.w, CandidateDistSq));
	}

	// the previous best is only kept when the candidate is strictly further away
	if (CandidateDistSq > minval.w)
	{
		return minval;
	}
	return float4(candidate, CandidateDistSq);
}

// 220 instruction Worley noise
// Finds the jittered seed point (see VoronoiCornerSample) closest to v.
// @param v = 3D noise argument
// @param Quality = 1: 2x2x2 search, 2: 2x2x2 search on two point sets offset by half a cell,
//                  3: 3x3x3 search, 4 or more: 2x2x2 search plus the 24 cells bordering its faces
// @param bTiling = repeat noise pattern
// @param RepeatSize = units before tiling in each dimension
// @param bDistanceOnly = if true the position of the closest point is not tracked (see VoronoiCompare)
// @return position of the closest seed point in xyz (0 if bDistanceOnly) and distance to it in w
float4 VoronoiNoise3D_ALU(float3 v, int Quality, bool bTiling, float RepeatSize, bool bDistanceOnly)
{
	// fv/iv: position within and origin of the cell containing v
	// fv2/iv2: same for a lattice that is shifted by half a cell
	float3 fv = frac(v),  fv2 = frac(v + 0.5);
	float3 iv = floor(v), iv2 = floor(v + 0.5);

	// with initial minimum distance = infinity (or at least bigger than 4), first min is optimized away
	float4 mindist = float4(0,0,0,100);
	// offset: lattice offset of the cell currently visited, p: seed point of that cell relative to iv (or iv2)
	float3 p, offset;

	// quality level 3: do a 3x3x3 search
	if (Quality == 3)
	{
		UNROLL_N(3) for (offset.x = -1; offset.x <= 1; ++offset.x)
		{
			UNROLL_N(3) for (offset.y = -1; offset.y <= 1; ++offset.y)
			{
				UNROLL_N(3) for (offset.z = -1; offset.z <= 1; ++offset.z)
				{
					p = offset + VoronoiCornerSample(NoiseTileWrap(iv2 + offset, bTiling, RepeatSize), Quality);
					mindist = VoronoiCompare(mindist, iv2 + p, fv2 - p, bDistanceOnly);
				}
			}
		}
	}

	// everybody else searches a base 2x2x2 neighborhood
	else
	{
		UNROLL_N(2) for (offset.x = 0; offset.x <= 1; ++offset.x)
		{
			UNROLL_N(2) for (offset.y = 0; offset.y <= 1; ++offset.y)
			{
				UNROLL_N(2) for (offset.z = 0; offset.z <= 1; ++offset.z)
				{
					p = offset + VoronoiCornerSample(NoiseTileWrap(iv + offset, bTiling, RepeatSize), Quality);
					mindist = VoronoiCompare(mindist, iv + p, fv - p, bDistanceOnly);

					// quality level 2, do extra set of points, offset by half a cell
					if (Quality == 2)
					{
						// 467 is just an offset to a different area in the random number field to avoid similar neighbor artifacts
						p = offset + VoronoiCornerSample(NoiseTileWrap(iv2 + offset, bTiling, RepeatSize) + 467, Quality);
						mindist = VoronoiCompare(mindist, iv2 + p, fv2 - p, bDistanceOnly);
					}
				}
			}
		}
	}

	// quality level 4: add extra sets of four cells in each direction
	if (Quality >= 4)
	{
		// offset.x takes the values -1 and 2 (one cell outside of the 2x2x2 block on either side),
		// the swizzles below apply that outer coordinate to the x, y and z axis in turn
		UNROLL_N(2) for (offset.x = -1; offset.x <= 2; offset.x += 3)
		{
			UNROLL_N(2) for (offset.y = 0; offset.y <= 1; ++offset.y)
			{
				UNROLL_N(2) for (offset.z = 0; offset.z <= 1; ++offset.z)
				{
					// along x axis
					p = offset.xyz + VoronoiCornerSample(NoiseTileWrap(iv + offset.xyz, bTiling, RepeatSize), Quality);
					mindist = VoronoiCompare(mindist, iv + p, fv - p, bDistanceOnly);

					// along y axis
					p = offset.yzx + VoronoiCornerSample(NoiseTileWrap(iv + offset.yzx, bTiling, RepeatSize), Quality);
					mindist = VoronoiCompare(mindist, iv + p, fv - p, bDistanceOnly);

					// along z axis
					p = offset.zxy + VoronoiCornerSample(NoiseTileWrap(iv + offset.zxy, bTiling, RepeatSize), Quality);
					mindist = VoronoiCompare(mindist, iv + p, fv - p, bDistanceOnly);
				}
			}
		}
	}

	// transform squared distance to real distance
	return float4(mindist.xyz, sqrt(mindist.w));
}


// -------- Simplex method (faster in higher dimensions because less samples are used, uses gradient noise for quality) ---------
// <Dimensions>D:<Normal>/<Simplex> 1D:2, 2D:4/3, 3D:8/4, 4D:16/5 

// Computes weights and sample positions for 2D simplex (triangle) interpolation
// @param OrthogonalPos position in the skewed (orthogonal) lattice space
// @param PosA lower corner of the lattice cell
// @param PosB opposite corner of the lattice cell, PosA + (1,1)
// @param PosC third corner of the triangle containing the position, PosA + (1,0) or PosA + (0,1)
// @return float3(a,b,c) Barycentric coordinate defined as Filtered = Tex(PosA) * a + Tex(PosB) * b + Tex(PosC) * c
float3 ComputeSimplexWeights2D(float2 OrthogonalPos, out float2 PosA, out float2 PosB, out float2 PosC)
{
	const float2 CellOrigin = floor(OrthogonalPos);
	const float2 InCell = OrthogonalPos - CellOrigin;

	// the diagonal from PosA to PosB splits the cell into two triangles, pick the one we are in
	const bool bLowerTriangle = InCell.x > InCell.y;

	PosA = CellOrigin;
	PosB = PosA + float2(1, 1);
	PosC = PosA + (bLowerTriangle ? float2(1,0) : float2(0,1));

	const float WeightB = min(InCell.x, InCell.y);
	const float WeightC = abs(InCell.y - InCell.x);
	const float WeightA = 1.0f - WeightB - WeightC;

	return float3(WeightA, WeightB, WeightC);
}

// Computes weights and sample positions for 3D simplex (tetrahedron) interpolation
// @param OrthogonalPos position in the skewed (orthogonal) lattice space
// @param PosA lower corner of the lattice cell
// @param PosB opposite corner of the lattice cell, PosA + (1,1,1)
// @param PosC PosA stepped along the axis (or axes, on ties) with the largest local coordinate
// @param PosD PosA stepped along all axes except the one(s) with the smallest local coordinate
// @return float4(a,b,c, d) Barycentric coordinate defined as Filtered = Tex(PosA) * a + Tex(PosB) * b + Tex(PosC) * c + Tex(PosD) * d
float4 ComputeSimplexWeights3D(float3 OrthogonalPos, out float3 PosA, out float3 PosB, out float3 PosC, out float3 PosD)
{
	const float3 CellOrigin = floor(OrthogonalPos);

	PosA = CellOrigin;
	PosB = PosA + float3(1, 1, 1);

	// from here on work with the position inside the cell
	OrthogonalPos -= CellOrigin;

	const float MaxCoord = max(OrthogonalPos.x, max(OrthogonalPos.y, OrthogonalPos.z));
	const float MinCoord = min(OrthogonalPos.x, min(OrthogonalPos.y, OrthogonalPos.z));

	PosC = PosA + float3(MaxCoord == OrthogonalPos.x, MaxCoord == OrthogonalPos.y, MaxCoord == OrthogonalPos.z);
	PosD = PosA + float3(MinCoord != OrthogonalPos.x, MinCoord != OrthogonalPos.y, MinCoord != OrthogonalPos.z);

	// pairwise differences of the local coordinates
	const float XMinusY = OrthogonalPos.x - OrthogonalPos.y;
	const float XMinusZ = OrthogonalPos.x - OrthogonalPos.z;
	const float YMinusZ = OrthogonalPos.y - OrthogonalPos.z;

	// weight of PosC: one term per axis
	const float WeightC =
		  min(max(0, XMinusY), max(0, XMinusZ))		// X
		+ min(max(0, -XMinusY), max(0, YMinusZ))	// Y
		+ min(max(0, -XMinusZ), max(0, -YMinusZ));	// Z

	// weight of PosD: one term per axis
	const float WeightD =
		  min(max(0, -XMinusY), max(0, -XMinusZ))	// X
		+ min(max(0, XMinusY), max(0, -YMinusZ))	// Y
		+ min(max(0, XMinusZ), max(0, YMinusZ));	// Z

	const float WeightB = MinCoord;
	const float WeightA = 1.0f - WeightB - WeightC - WeightD;

	return float4(WeightA, WeightB, WeightC, WeightD);
}

// Looks up a 2D unit gradient for a lattice position from View.PerlinNoiseGradientTexture
// @param v lattice position, addressed as texel of a 128 wide texture
// @return normalized 2D gradient
float2 GetPerlinNoiseGradientTextureAt(float2 v)
{
	// + 0.5 addresses the texel center
	const float2 TexelUV = (v.xy + 0.5f) / 128.0f;

	// stored 0..1, remapped to -1..1
	const float3 Stored = Texture2DSampleLevel(View.PerlinNoiseGradientTexture, View.PerlinNoiseGradientTextureSampler, TexelUV, 0).xyz * 2 - 1;

	// todo: storing random 2d unit vectors would be better
	// until then fold a part of z into xy and renormalize
	const float2 Folded = Stored.xy + Stored.z * 0.33f;
	return normalize(Folded);
}

// Looks up a 3D gradient for a lattice position from View.PerlinNoiseGradientTexture
// The 2D texture is addressed as if it was 3D by shifting the xy lookup by a z dependent offset.
// @param v lattice position
// @return gradient with components remapped from the stored 0..1 range to -1..1
float3 GetPerlinNoiseGradientTextureAt(float3 v)
{
	// per z step offset in x and y
	const float2 ZShear = float2(17.0f, 89.0f);
	const float2 SliceOffset = v.z * ZShear;

	// + 0.5 addresses the texel center
	const float2 TexelUV = (v.xy + SliceOffset + 0.5f) / 128.0f;

	const float3 Stored = Texture2DSampleLevel(View.PerlinNoiseGradientTexture, View.PerlinNoiseGradientTextureSampler, TexelUV , 0).xyz;
	return Stored * 2 - 1;
}

// Skews a 2D position into simplex lattice space: In + (x + y) * (sqrt(3) - 1) / 2
float2 SkewSimplex(float2 In)
{
	const float SkewFactor = (sqrt(3.0f) - 1.0f) * 0.5f;
	const float Shift = dot(In, float2(SkewFactor, SkewFactor));
	return In + Shift;
}
// Inverse of the 2D SkewSimplex: In - (x + y) * (3 - sqrt(3)) / 6
float2 UnSkewSimplex(float2 In)
{
	const float UnSkewFactor = (3.0f - sqrt(3.0f)) / 6.0f;
	const float Shift = dot(In, float2(UnSkewFactor, UnSkewFactor));
	return In - Shift;
}
// Skews a 3D position into simplex lattice space: In + (x + y + z) / 3
float3 SkewSimplex(float3 In)
{
	const float SkewFactor = 1.0 / 3.0f;
	const float Shift = dot(In, float3(SkewFactor, SkewFactor, SkewFactor));
	return In + Shift;
}
// Inverse of the 3D SkewSimplex: In - (x + y + z) / 6
float3 UnSkewSimplex(float3 In)
{
	const float UnSkewFactor = 1.0 / 6.0f;
	const float Shift = dot(In, float3(UnSkewFactor, UnSkewFactor, UnSkewFactor));
	return In - Shift;
}

// filtered 2D gradient simplex noise (few texture lookups, high quality)
// @param EvalPos position to evaluate the noise at (the original note says it should be >0)
// @return random number in the range -1 .. 1
float GradientSimplexNoise2D_TEX(float2 EvalPos)
{
	const float2 OrthogonalPos = SkewSimplex(EvalPos);

	// Only the three corner positions are needed here. The barycentric weights returned by
	// ComputeSimplexWeights2D are not used, a radial falloff around each corner is used instead.
	float2 PosA, PosB, PosC;
	ComputeSimplexWeights2D(OrthogonalPos, PosA, PosB, PosC);

	// can be optimized to 1 or 2 texture lookups (4 or 8 channel encoded in 32 bit)
	const float2 A = GetPerlinNoiseGradientTextureAt(PosA);
	const float2 B = GetPerlinNoiseGradientTextureAt(PosB);
	const float2 C = GetPerlinNoiseGradientTextureAt(PosC);

	// back to the unskewed space the evaluation position lives in
	PosA = UnSkewSimplex(PosA);
	PosB = UnSkewSimplex(PosB);
	PosC = UnSkewSimplex(PosC);

	const float2 DeltaA = EvalPos - PosA;
	const float2 DeltaB = EvalPos - PosB;
	const float2 DeltaC = EvalPos - PosC;

	// per corner: falloff = saturate(0.5 - squared distance)^4, contribution = dot(gradient, offset) * falloff
	float DistanceWeight;

	DistanceWeight = saturate(0.5f - length2(DeltaA));
	DistanceWeight *= DistanceWeight;
	DistanceWeight *= DistanceWeight;
	const float a = dot(A, DeltaA) * DistanceWeight;

	DistanceWeight = saturate(0.5f - length2(DeltaB));
	DistanceWeight *= DistanceWeight;
	DistanceWeight *= DistanceWeight;
	const float b = dot(B, DeltaB) * DistanceWeight;

	DistanceWeight = saturate(0.5f - length2(DeltaC));
	DistanceWeight *= DistanceWeight;
	DistanceWeight *= DistanceWeight;
	const float c = dot(C, DeltaC) * DistanceWeight;

	// 70 scales the sum to the documented output range
	return 70 * (a + b + c);
}


// filtered 3D gradient simplex noise (few texture lookups, high quality)
// @param EvalPos position to evaluate the noise at (the original note says it should be >0)
// @return random number in the range -1 .. 1
float SimplexNoise3D_TEX(float3 EvalPos)
{
	const float3 OrthogonalPos = SkewSimplex(EvalPos);

	// Only the four corner positions are needed here. The barycentric weights returned by
	// ComputeSimplexWeights3D are not used, a radial falloff around each corner is used instead.
	float3 PosA, PosB, PosC, PosD;
	ComputeSimplexWeights3D(OrthogonalPos, PosA, PosB, PosC, PosD);

	// can be optimized to 1 or 2 texture lookups (4 or 8 channel encoded in 32 bit)
	const float3 A = GetPerlinNoiseGradientTextureAt(PosA);
	const float3 B = GetPerlinNoiseGradientTextureAt(PosB);
	const float3 C = GetPerlinNoiseGradientTextureAt(PosC);
	const float3 D = GetPerlinNoiseGradientTextureAt(PosD);

	// back to the unskewed space the evaluation position lives in
	PosA = UnSkewSimplex(PosA);
	PosB = UnSkewSimplex(PosB);
	PosC = UnSkewSimplex(PosC);
	PosD = UnSkewSimplex(PosD);

	const float3 DeltaA = EvalPos - PosA;
	const float3 DeltaB = EvalPos - PosB;
	const float3 DeltaC = EvalPos - PosC;
	const float3 DeltaD = EvalPos - PosD;

	// per corner: falloff = saturate(0.6 - squared distance)^4, contribution = dot(gradient, offset) * falloff
	float DistanceWeight;

	DistanceWeight = saturate(0.6f - length2(DeltaA));
	DistanceWeight *= DistanceWeight;
	DistanceWeight *= DistanceWeight;
	const float a = dot(A, DeltaA) * DistanceWeight;

	DistanceWeight = saturate(0.6f - length2(DeltaB));
	DistanceWeight *= DistanceWeight;
	DistanceWeight *= DistanceWeight;
	const float b = dot(B, DeltaB) * DistanceWeight;

	DistanceWeight = saturate(0.6f - length2(DeltaC));
	DistanceWeight *= DistanceWeight;
	DistanceWeight *= DistanceWeight;
	const float c = dot(C, DeltaC) * DistanceWeight;

	DistanceWeight = saturate(0.6f - length2(DeltaD));
	DistanceWeight *= DistanceWeight;
	DistanceWeight *= DistanceWeight;
	const float d = dot(D, DeltaD) * DistanceWeight;

	// 32 scales the sum to the documented output range
	return 32 * (a + b + c + d);
}


// Accumulates thresholded 3D noise along the segment between a pixel's world position and the camera
// @param posPixelWS start of the segment
// @param posCameraWS end of the segment (not sampled itself, the last sample is one step before it)
// @return average of the thresholded noise samples, scaled by the segment length * 0.001
float VolumeRaymarch(float3 posPixelWS, float3 posCameraWS)
{
	const int StepCount = 60;
	float Accumulated = 0;

	LOOP for(int Step=0; Step < StepCount; ++Step)
	{
		// walk from the pixel towards the camera, noise is evaluated at 1/100 of world scale
		const float Alpha = Step/(float)StepCount;
		const float3 NoisePos = lerp(posPixelWS, posCameraWS, Alpha) * 0.01;

		// only noise above 0.2 contributes
		Accumulated += saturate(FastGradientPerlinNoise3D_TEX(NoisePos) - 0.2f);
	}

	const float ScaledLength = length(posPixelWS - posCameraWS) * 0.001f;
	return Accumulated / StepCount * ScaledLength;
}