// Copyright Epic Games, Inc. All Rights Reserved. #ifndef __RADRR_COREH__ #include "rrCore.h" #endif #include #include #include "radmath.h" #include "binkacd.h" #include "cpu.h" #include "radfft.h" #ifndef alloca #if defined(_MSC_VER) #include #elif defined(__GNUC__) #include #endif #endif #ifdef BIG_OLE_FFT // always false for ue binka #define MAX_TRANSFORM 4096 #else #define MAX_TRANSFORM 2048 #endif #define MAXCHANNELS 2 #define WINDOWRATIO 16 #define TOTBANDS 25 #define FXPBITS 29 #define VQLENGTH 8 #define RLEBITS 4 #define MAXRLE ( 1 << RLEBITS ) static U8 bink_rlelens_snd[ MAXRLE ] = { 2,3,4,5, 6,8,9,10, 11,12,13,14, 15,16,32,64 }; static U32 bink_bandtopfreq[ TOTBANDS ]= { 0, 100, 200, 300, 400, 510, 630, 770, 920, 1080, 1270, 1480, 1720, 2000, 2320, 2700, 3150, 3700, 4400, 5300, 6400, 7700, 9500, 12000, 15500 }; #include "undeci.inc" /*static F32 RADINLINE Undecibel( F32 d ) { return( ( F32 ) radpow( 10, d * 0.10f ) ); } */ //============================================================================== // decoding functions //============================================================================== static F32 bink_invertbins[ 24 ]= { (F32)( 1.0F / (F32)( 1 << 23 ) ), (F32)( 1.0F / (F32)( 1 << 22 ) ),(F32)( 1.0F / (F32)( 1 << 21 ) ),(F32)( 1.0F / (F32)( 1 << 20 ) ),(F32)( 1.0F / (F32)( 1 << 19) ), (F32)( 1.0F / (F32)( 1 << 18 ) ),(F32)( 1.0F / (F32)( 1 << 17 ) ),(F32)( 1.0F / (F32)( 1 << 16 ) ),(F32)( 1.0F / (F32)( 1 << 15) ), (F32)( 1.0F / (F32)( 1 << 14 ) ),(F32)( 1.0F / (F32)( 1 << 13 ) ),(F32)( 1.0F / (F32)( 1 << 12 ) ),(F32)( 1.0F / (F32)( 1 << 11) ), (F32)( 1.0F / (F32)( 1 << 10 ) ),(F32)( 1.0F / (F32)( 1 << 9 ) ),(F32)( 1.0F / (F32)( 1 << 8 ) ),(F32)( 1.0F / (F32)( 1 << 7) ), (F32)( 1.0F / (F32)( 1 << 6 ) ),(F32)( 1.0F / (F32)( 1 << 5 ) ),(F32)( 1.0F / (F32)( 1 << 4 ) ),(F32)( 1.0F / (F32)( 1 << 3) ), (F32)( 1.0F / (F32)( 1 << 2 ) ),(F32)( 1.0F / (F32)( 1 << 1 ) ),(F32)( 1.0F / (F32)( 1 << 0 ) ) }; #if defined(__RADPPC__) && defined(_MSC_VER) 
#pragma optimize ("g", off) #endif static F32 fxptof( U32 val ) { F32 f; f = (F32) ( ( (F32) (S32) ( ( val & ( ~0x10000000 ) ) >> 5 ) ) * bink_invertbins[ val & 31 ] ); return( ( val & 0x10000000 ) ? -f : f ); } #if defined(__RADPPC__) && defined(_MSC_VER) #pragma optimize ("g", on) #endif typedef struct BINKAUDIODECOMP { S16 * overlap; U32 transform_size; F32 transform_size_root; U32 buffer_size; U32 window_size_in_bytes; U32 window_shift; U32 chans; S32 start_frame; U32 num_bands; U32 _unused; // work_sz used to be here U32 flags; U32 size; U32 bands[ TOTBANDS + 1 ]; } BINKAUDIODECOMP; #include "crsfade.inl" #include "binkbits.h" #if defined(__RADARM64__) && !defined(_M_ARM64) // exclude visual studio arm64 as it doesn't support __int128 #define bigreg __int128 #define bigregzero 0 #define bigregloadaligned( val, ptr ) (val)=*((bigreg*)(ptr)) #define bigregstorealigned( ptr, val ) *((bigreg*)(ptr))=(val) #define bigregstoreunaligned( ptr, val ) *((bigreg*)(ptr))=(val) #elif defined(__RADNEON__) #include #define bigreg uint8x16_t #define bigregzero vdupq_n_s32(0) typedef RAD_ALIGN(uint8_t, ALU8,16); typedef RAD_ALIGN(uint8_t, ULU8,1); #define bigregloadaligned( val, ptr ) (val)=vld1q_u8( ((ALU8*)(ptr)) ) #define bigregstorealigned( ptr, val ) vst1q_u8( ((ALU8*)(ptr)),val) #define bigregstoreunaligned( ptr, val ) vst1q_u8( ((ULU8*)(ptr)),val) #elif defined(__RADARM__) typedef struct bigreg { U32 a,b; } bigreg; // alignment trick #define bigreg bigreg #define bigregzero {0} #define bigregloadaligned( val, ptr ) (val)=*((bigreg*)(ptr)) #define bigregstorealigned( ptr, val ) *((bigreg*)(ptr))=(val) #define bigregstoreunaligned( ptr, val ) *((bigreg*)(ptr))=(val) #elif defined(__RADX86__) #include #define bigreg __m128 #define bigregzero _mm_setzero_ps() #define bigregloadaligned( val, ptr ) (val)=_mm_load_ps( ((float*)(ptr)) ) #define bigregstorealigned( ptr, val ) _mm_store_ps( ((float*)(ptr)),val) #define bigregstoreunaligned( ptr, val ) _mm_storeu_ps( 
((float*)(ptr)),val) #else #define rrmemsetzero(d,c) memset(d,0,c) // use for small zero clears #define ourmemsetzero rrmemsetzero #define rrmemmovebig memmove // use for large copies (>512 bytes) - can overlay #define ourmemcpy rrmemmovebig #endif #ifdef bigreg static void ourmemcpy( void * destp, void const * srcp, U32 bytes ) //assumed aligned and multiple of 16 { #define PERLOOP (4*sizeof(bigreg)) U32 s = bytes/PERLOOP; U8 const * src = (U8 const*)srcp; U8 * dest = (U8*)destp; rrassert( (bytes&15)==0 ); bytes = ( bytes & 63 ) / 16; rrassert( (((UINTa)srcp)&15)==0); rrassert( (((UINTa)destp)&15)==0); rrassert( srcp != destp ); while(s) // 64-byte chunks { bigreg a,b,c,d; #ifdef __clang__ __asm__ __volatile__(""); // force no conversion to enormous optimized memset #endif bigregloadaligned(a,((bigreg*)src)); bigregloadaligned(b,((bigreg*)src)+1); bigregloadaligned(c,((bigreg*)src)+2); bigregloadaligned(d,((bigreg*)src)+3); bigregstorealigned(((bigreg*)dest),a); bigregstorealigned(((bigreg*)dest)+1,b); bigregstorealigned(((bigreg*)dest)+2,c); bigregstorealigned(((bigreg*)dest)+3,d); --s; src+=PERLOOP; dest+=PERLOOP; }; #undef PERLOOP while ( bytes ) // 16-byte chunks { bigreg a; bigregloadaligned(a,((bigreg*)src)); bigregstorealigned(((bigreg*)dest),a); src+=16; dest+=16; --bytes; } } // always writes 32 bytes (so possible 31 byte overwrite), except arm32 without neon, 16 bytes (possible 15 bytes overwrite) // addr can be unaligned static void ourmemsetzero( void * addr, size_t len ) { bigreg z = bigregzero; #define PERLOOP (2*sizeof(bigreg)) len = ( len + (PERLOOP-1) ) / PERLOOP; do { --len; #ifdef __clang__ __asm__ __volatile__(""); // force no conversion to enormous optimized memset #endif bigregstoreunaligned( ((float*)addr),z); bigregstoreunaligned( ((float*)addr)+4,z); addr = ((float*)addr)+8; } while(len); #undef PERLOOP } #endif static RADINLINE U32 read_up_to_24_bits(const U8* buffer, size_t bit_position) { uint32_t x; memcpy(&x, buffer + (bit_position >> 
3), sizeof(x)); return x >> (bit_position & 7); } template static RADINLINE void decode_coeff(unsigned char& current_sign_bit, U64 coeff_bits, U32 sign_bits, short* coeffs) { unsigned int need_negate; short c; // this is crafted to coerce clang to generate ubxt, csinc, and csneg. c = (coeff_bits >> (index * bitlen)) & ((1 << bitlen) - 1); need_negate = (sign_bits >> current_sign_bit) & 1; current_sign_bit += (c != 0) ? 1 : 0; c = need_negate ? -c : c; coeffs[index] = c; } template static RADINLINE size_t decode_coeff_remnants(size_t start, float* results, const uint8_t* buffer, size_t coeff_position, size_t sign_position, size_t count) { // This should almost never be hit for (; start < count; start++) { S16 c; unsigned char current_sign_bit = 0; U32 coeff_bits = read_up_to_24_bits(buffer, coeff_position); coeff_position += bitlen; U32 sign_bits = read_up_to_24_bits(buffer, sign_position); decode_coeff(current_sign_bit, coeff_bits, sign_bits, &c); sign_position += current_sign_bit; results[start] = (float)c; } return sign_position; } template static RADINLINE bool validate_chunked_bit_read(size_t bit_position, size_t end_bit_position, size_t need_bits) { // // The check here is because even if we only need 1 bit and that fits // within the end_bit_position, we'll be reading a lot more to fill the // chunk. // // this all simpifies a great deal with compiling. 
size_t farthest_byte_to_read = (sizeof(chunk_type)/8) + ((bit_position + need_bits) >> 3); size_t farthest_bit_to_read = farthest_byte_to_read * 8; return farthest_bit_to_read <= end_bit_position; } //#define FORCE_SCALAR #ifdef __RAD64__ static RADINLINE U64 read_up_to_56_bits(const U8* buffer, size_t bit_position) { uint64_t x; memcpy(&x, buffer + (bit_position >> 3), sizeof(x)); return x >> (bit_position & 7); } #if defined(RAD_USES_SSSE3) && defined(__RADX64__) #include #include #include typedef __m128i Vec128; static Vec128 load128u(const void * ptr) { return _mm_loadu_si128((const __m128i *) ptr); } //static Vec128 s_getbit_consts[15][2]; // [bitlen-1][idx] 0=shuffle mask, 1=multipliers //static volatile U32 s_getbit_consts_initd; static RAD_ALIGN(U32, s_getbit_consts_table[], 16) = { 0x1000100, 0x1000100, 0x1000100, 0x1000100, 0x40408080, 0x10102020, 0x4040808, 0x1010202, 0x1000100, 0x1000100, 0x2010201, 0x2010201, 0x20208080, 0x2020808, 0x20208080, 0x2020808, 0x1000100, 0x2010100, 0x2010201, 0x3020302, 0x10108080, 0x40400202, 0x1010808, 0x4042020, 0x1000100, 0x2010201, 0x3020302, 0x4030403, 0x8088080, 0x8088080, 0x8088080, 0x8088080, 0x1000100, 0x2010201, 0x4030302, 0x5040403, 0x4048080, 0x1012020, 0x40400808, 0x10100202, 0x1000100, 0x3020201, 0x4030403, 0x6050504, 0x2028080, 0x20200808, 0x2028080, 0x20200808, 0x1000100, 0x3020201, 0x5040403, 0x7060605, 0x1018080, 0x4040202, 0x10100808, 0x40402020, 0x2010100, 0x4030302, 0x6050504, 0x8070706, 0x80808080, 0x80808080, 0x80808080, 0x80808080, 0x2010100, 0x4030302, 0x6050504, 0x8070706, 0x40408080, 0x10102020, 0x4040808, 0x1010202, 0x2010100, 0x4030302, 0x7060605, 0x9080807, 0x20208080, 0x2020808, 0x20208080, 0x2020808, 0x2010100, 0x5040302, 0x7060605, 0xa090908, 0x10108080, 0x40400202, 0x1010808, 0x4042020, 0x2010100, 0x5040403, 0x8070706, 0xb0a0a09, 0x8088080, 0x8088080, 0x8088080, 0x8088080, 0x2010100, 0x5040403, 0x9080706, 0xc0b0a09, 0x4048080, 0x1012020, 0x40400808, 0x10100202, 0x2010100, 0x6050403, 
0x9080807, 0xd0c0b0a, 0x2028080, 0x20200808, 0x2028080, 0x20200808,
  0x2010100, 0x6050403, 0xa090807, 0xe0d0c0b, 0x1018080, 0x4040202, 0x10100808, 0x40402020,
};

// Generator for the table above, kept for reference:
//
//static void init_bit_lookup_tables()
//{
//  if (rrAtomicAddExchange32(&s_getbit_consts_initd, 1) == 0)
//  {
//    int bitlen, lane;
//
//    // Init bit decode table
//    for (bitlen = 1; bitlen <= 15; bitlen++)
//    {
//      uint8_t shuffle[16];
//      uint16_t mul[8];
//
//      for (lane = 0; lane < 8; ++lane)
//      {
//        int lane_bitpos = lane*bitlen;
//        shuffle[lane*2 + 0] = (U8)(lane_bitpos >> 3);
//        shuffle[lane*2 + 1] = (U8)((lane_bitpos >> 3) + 1);
//        mul[lane] = 0x8080 >> (lane_bitpos & 7);
//      }
//
//      s_getbit_consts[bitlen-1][0] = load128u(shuffle);
//      s_getbit_consts[bitlen-1][1] = load128u(mul);
//
//      printf("  0x%x, 0x%x, 0x%x, 0x%x, 0x%x, 0x%x, 0x%x, 0x%x,\n",
//        (shuffle[0] << 0) | (shuffle[1] << 8) | (shuffle[2] << 16) | (shuffle[3] << 24),
//        (shuffle[4] << 0) | (shuffle[5] << 8) | (shuffle[6] << 16) | (shuffle[7] << 24),
//        (shuffle[8] << 0) | (shuffle[9] << 8) | (shuffle[10] << 16) | (shuffle[11] << 24),
//        (shuffle[12] << 0) | (shuffle[13] << 8) | (shuffle[14] << 16) | (shuffle[15] << 24),
//        (mul[0] << 0) | (mul[1] << 16),
//        (mul[2] << 0) | (mul[3] << 16),
//        (mul[4] << 0) | (mul[5] << 16),
//        (mul[6] << 0) | (mul[7] << 16));
//    }
//  }
//}

// Fetch the shuffle/multiplier constants for 'bitlen' from the table above and
// adjust them for the sub-byte starting offset (bitpos & 7).
static RADINLINE void determine_shuffle_and_mul(Vec128 * out_shuffle, Vec128 * out_mul, int bitlen, size_t bitpos)
{
  Vec128 extra_shift = _mm_cvtsi32_si128(bitpos & 7);
  Vec128 shuffle = _mm_load_si128( (__m128i*)((unsigned char*)s_getbit_consts_table + sizeof(__m128i)*2*(bitlen - 1)));
  Vec128 mul = _mm_load_si128((__m128i*)((unsigned char*)s_getbit_consts_table + sizeof(__m128i) * 2 * (bitlen - 1) + 1 * sizeof(__m128i)));
  Vec128 mul_lobyte, mul_byte_advanced;

  // Shift down the multiplier to consume the initial bits
  mul = _mm_srl_epi16(mul, extra_shift);

  // If shifting mul consumed the first byte, adjust shuffle accordingly.
  // Our original multiplier is 0x8080 >> (bitpos & 7); thus, when the
  // high 8 bits become 0, we know we consumed more than 8 bits total and
  // need to update the shuffle bytes.
  //
  // We test for "high byte 0" as x == (x & 0xff) since we need (x & 0xff)
  // anyway.
  mul_lobyte = _mm_and_si128(mul, _mm_set1_epi16(0xff));
  mul_byte_advanced = _mm_cmpeq_epi16(mul, mul_lobyte);

  // The 16-bit lanes where we went outside the original byte now have -1
  // in them; bytewise add to shuffle mask to increment the shuffle bytes
  // when this is the case.
  shuffle = _mm_sub_epi8(shuffle, mul_byte_advanced);
  *out_shuffle = shuffle;

  // Our output mult needs an extra shift-left by 1 which is just adding it to itself
  *out_mul = _mm_add_epi16(mul_lobyte, mul_lobyte);
}

// Inclusive prefix sum on U16 lanes
RAD_USES_SSSE3 static RADINLINE Vec128 prefix_sum_u16(Vec128 x)
{
  // Two Kogge-Stone steps, then finish with Sklansky
  // this has 2 shifts 1 shuffle vs. 3 shuffles for pure Kogge-Stone
  x = _mm_add_epi16(x, _mm_slli_epi64(x, 16));
  x = _mm_add_epi16(x, _mm_slli_epi64(x, 32));
  x = _mm_add_epi16(x, _mm_shuffle_epi8(x, _mm_setr_epi8(-1,-1,-1,-1,-1,-1,-1,-1, 6,7,6,7,6,7,6,7)));
  return x;
}

// Decode one group of 8 coefficients (bit-unpack, sign-apply, convert to
// float) and report the group's sign-bit consumption via *last_prefix_sum.
RAD_USES_SSSE3 static RADINLINE void decode_pfxsum_8(float* results, const uint8_t* buffer, Vec128 shuffle0, Vec128 shuffle1, Vec128 mul_vec, Vec128 mask_lut, Vec128 coeff_mask, Vec128* last_prefix_sum, size_t bitpos_vals, size_t bitpos_signs, int bitlen)
{
  const uint8_t* val_bytes = buffer + (bitpos_vals >> 3);
  Vec128 packed_coeffs = load128u(val_bytes);
  Vec128 coeff_bits;

  // Align target bits so bit 0 of our desired field is in bit 8 of the 16-bit lane
  Vec128 aligned = _mm_mullo_epi16(_mm_shuffle_epi8(packed_coeffs, shuffle0), mul_vec);
  coeff_bits = _mm_srli_epi16(aligned, 8);
  if (bitlen > 8)
  {
    // Need to grab the second byte and merge
    // this one already ends up in the high byte
    // no extra masking required, except for "truncation" due to shifting bits out
    // the two values agree
    aligned = _mm_mullo_epi16(_mm_shuffle_epi8(packed_coeffs, shuffle1), mul_vec);
    coeff_bits = _mm_or_si128(coeff_bits, aligned);
  }

  // Mask the active bits to finalize bit get
  {
    Vec128 unpacked_coeffs = _mm_and_si128(coeff_bits, coeff_mask);

    // Figure out which lanes are zero so we can work out who needs sign bits
    Vec128 coeff_zero = _mm_cmpeq_epi16(unpacked_coeffs, _mm_setzero_si128());
    Vec128 need_signs = _mm_add_epi16(coeff_zero, _mm_set1_epi16(1)); // 0 if no sign required, 1 if sign consumed

    // Prefix sum to figure out where everything comes from
    Vec128 pfx_sum = prefix_sum_u16(need_signs);

    // Peek at the next (up to) 8 sign bits, replicate into vector
    int sign_peek_bits = read_up_to_24_bits(buffer, bitpos_signs) & 0xff;
    Vec128 vec_signs = _mm_shuffle_epi8(_mm_cvtsi32_si128(sign_peek_bits), _mm_setzero_si128());

    // Materialize bit masks to test against via LUT
    Vec128 sign_masks = _mm_shuffle_epi8(mask_lut, pfx_sum);

    // Test the sign bits
    Vec128 negate_mask = _mm_cmpeq_epi16(_mm_and_si128(vec_signs, sign_masks), sign_masks);

    // Apply the signs to 16-bit values
    Vec128 signed_coeffs = _mm_sub_epi16(_mm_xor_si128(unpacked_coeffs, negate_mask), negate_mask);

    // Sign-extend packed coeffs to 32 bits
    Vec128 coeffs32_0 = _mm_srai_epi32(_mm_unpacklo_epi16(signed_coeffs, signed_coeffs), 16);
    Vec128 coeffs32_1 = _mm_srai_epi32(_mm_unpackhi_epi16(signed_coeffs, signed_coeffs), 16);

    // Convert to float and store
    __m128 coeffsf_0 = _mm_cvtepi32_ps(coeffs32_0);
    __m128 coeffsf_1 = _mm_cvtepi32_ps(coeffs32_1);
    _mm_storeu_ps(results + 0, coeffsf_0);
    _mm_storeu_ps(results + 4, coeffsf_1);

    *last_prefix_sum = pfx_sum;
  }
}

// SSSE3 decode of 'count' coefficients in groups of 8. Returns the updated
// sign bit position (or end_bit_position if the cursors left the buffer).
static RADINLINE size_t decode_pfxsum(float* results, const uint8_t* buffer, size_t bitpos_vals, size_t bitpos_signs, size_t end_bit_position, size_t count, int bitlen)
{
  Vec128 last_prefix_sum;
  size_t i;
  size_t run_count = count & ~7;
  const Vec128 coeff_mask = _mm_set1_epi16((1 << bitlen) - 1);

  // This mask has lane i = 1 << (i-1), with 0 in lane 0. (Lanes 9-15 ignored.)
  // -128 instead of 128 because the arguments to setr_epi8 are _signed_ int8.
  const Vec128 mask_lut = _mm_setr_epi8(0, 1, 2, 4, 8, 16, 32, 64, -128, 0, 0, 0, 0, 0, 0, 0);

  Vec128 shuffle0, shuffle1, mul_vec;
  determine_shuffle_and_mul(&shuffle0, &mul_vec, bitlen, bitpos_vals);
  shuffle1 = _mm_add_epi8(shuffle0, _mm_set1_epi8(1));

  for (i = 0; i < run_count; i +=8)
  {
    size_t nsigns;
    decode_pfxsum_8(results + i, buffer, shuffle0, shuffle1, mul_vec, mask_lut, coeff_mask, &last_prefix_sum, bitpos_vals, bitpos_signs, bitlen);

    // Advance read cursors
    bitpos_vals += 8 * bitlen;

    //
    // This extract doesn't actually need the &15, as the prefix sum can only ever be
    // a max of 8, however we have crash reports in the wild of situations where bitpos_signs
    // has gone stupid high and causing an AV. This happens after the initial run, and is _well_
    // off in to another page, so the belief is that we're getting some CPU overclocking
    // absurdity. This is confirmed because in the crash minidump we can see that the prefix
    // sum is sane. Since we don't know where the issue is, we're adding the &15. This ends up
    // replacing the mov ecx, ecx for sign extension in a 1:1 trade, so it's completely free.
    //
    // Then, to help sanitize, we test the bit positions against the end of the buffer and
    // just bail since we're off the edge of the map anyway. This ends up getting caught by
    // the bit position validation in read_channel_data_2 and zeroing the block.
    //
    nsigns = (size_t)(unsigned int)(_mm_extract_epi16(last_prefix_sum, 7) & 0xf);
    bitpos_signs += nsigns;
    if (bitpos_signs > end_bit_position || bitpos_vals > end_bit_position)
    {
      return end_bit_position;
    }
  }

  // this should almost never happen - only when a run passes the end of the transform size.
// as far as i can tell this only happens with zeroing runs if (count & 7) { float stack_results[8]; unsigned short pfxsum[8]; count &= 7; decode_pfxsum_8(stack_results, buffer, shuffle0, shuffle1, mul_vec, mask_lut, coeff_mask, &last_prefix_sum, bitpos_vals, bitpos_signs, bitlen); _mm_storeu_si128((__m128i*)pfxsum, last_prefix_sum); bitpos_signs = bitpos_signs + pfxsum[count-1]; bitpos_vals += bitlen * (U32)count; for (i = 0; i < count; i++) results[run_count + i] = stack_results[i]; } return bitpos_signs; } #endif // __RADX64__ && SSSE3 #ifdef __RADARM__ static RADINLINE void short_to_float_4(float* output, short* input) { int16x4_t s16_low = vld1_s16(input); int32x4_t s32_low = vmovl_s16(s16_low); float32x4_t f32_low = vcvtq_f32_s32(s32_low); vst1q_f32(output, f32_low); } #else static RADINLINE void short_to_float_4(float* output, short* input) { size_t i = 0; for (i = 0; i < 4; i++) output[i] = (float)input[i]; } #endif // __RADARM__ template static RADINLINE size_t decode_coeffs_runlength_8(float* results, const uint8_t* buffer, size_t coeff_position, size_t sign_position, size_t count) { // 8 ceoffs fits in the 56 bits we can get - so we grab chunks of 8 size_t run_count = count & ~7; size_t run = 0; for (run = 0; run < run_count; run+=8) { S16 coeffs[8]; unsigned char current_sign_bit = 0; U64 coeff_bits = read_up_to_56_bits(buffer, coeff_position); coeff_position += 8 * bitlen; U32 sign_bits = read_up_to_24_bits(buffer, sign_position); decode_coeff(current_sign_bit, coeff_bits, sign_bits, coeffs); decode_coeff(current_sign_bit, coeff_bits, sign_bits, coeffs); decode_coeff(current_sign_bit, coeff_bits, sign_bits, coeffs); decode_coeff(current_sign_bit, coeff_bits, sign_bits, coeffs); decode_coeff(current_sign_bit, coeff_bits, sign_bits, coeffs); decode_coeff(current_sign_bit, coeff_bits, sign_bits, coeffs); decode_coeff(current_sign_bit, coeff_bits, sign_bits, coeffs); decode_coeff(current_sign_bit, coeff_bits, sign_bits, coeffs); sign_position += 
current_sign_bit; short_to_float_4(results + run, coeffs); short_to_float_4(results + run + 4, coeffs + 4); } return decode_coeff_remnants(run, results, buffer, coeff_position, sign_position, count); } template static RADINLINE size_t decode_coeffs_runlength_4(float* results, const uint8_t* buffer, size_t coeff_position, size_t sign_position, size_t count) { // 4 ceoffs fits in the 56 bits we can get - so we grab chunks of 4 size_t run_count = count & ~3; size_t run = 0; for (run = 0; run < run_count; run+=4) { S16 coeffs[4]; unsigned char current_sign_bit = 0; U64 coeff_bits = read_up_to_56_bits(buffer, coeff_position); coeff_position += 4 * bitlen; U32 sign_bits = read_up_to_24_bits(buffer, sign_position); decode_coeff(current_sign_bit, coeff_bits, sign_bits, coeffs); decode_coeff(current_sign_bit, coeff_bits, sign_bits, coeffs); decode_coeff(current_sign_bit, coeff_bits, sign_bits, coeffs); decode_coeff(current_sign_bit, coeff_bits, sign_bits, coeffs); sign_position += current_sign_bit; short_to_float_4(results + run, coeffs); } return decode_coeff_remnants(run, results, buffer, coeff_position, sign_position, count); } static U32 RADINLINE decode_scalar(float* results, const uint8_t* buffer, size_t coeff_position, size_t sign_position, size_t count, int bitlen) { switch (bitlen) { case 1: /* 1*8 = 8 <= 56 */ return (U32)decode_coeffs_runlength_8<1>(results, buffer, coeff_position, sign_position, count); case 2: /* 2*8 = 16 <= 56 */ return (U32)decode_coeffs_runlength_8<2>(results, buffer, coeff_position, sign_position, count); case 3: /* 3*8 = 24 <= 56 */ return (U32)decode_coeffs_runlength_8<3>(results, buffer, coeff_position, sign_position, count); case 4: /* 4*8 = 32 <= 56 */ return (U32)decode_coeffs_runlength_8<4>(results, buffer, coeff_position, sign_position, count); case 5: /* 5*8 = 40 <= 56 */ return (U32)decode_coeffs_runlength_8<5>(results, buffer, coeff_position, sign_position, count); case 6: /* 6*8 = 48 <= 56 */ return 
(U32)decode_coeffs_runlength_8<6>(results, buffer, coeff_position, sign_position, count);
    case 7: /* 7*8 = 56 <= 56 */ return (U32)decode_coeffs_runlength_8<7>(results, buffer, coeff_position, sign_position, count);
    case 8: /* 8*4 = 32 <= 56 */ return (U32)decode_coeffs_runlength_4<8>(results, buffer, coeff_position, sign_position, count);
    case 9: /* 9*4 = 36 <= 56 */ return (U32)decode_coeffs_runlength_4<9>(results, buffer, coeff_position, sign_position, count);
    case 10: /* 10*4 = 40 <= 56 */ return (U32)decode_coeffs_runlength_4<10>(results, buffer, coeff_position, sign_position, count);
    case 11: /* 11*4 = 44 <= 56 */ return (U32)decode_coeffs_runlength_4<11>(results, buffer, coeff_position, sign_position, count);
    case 12: /* 12*4 = 48 <= 56 */ return (U32)decode_coeffs_runlength_4<12>(results, buffer, coeff_position, sign_position, count);
    case 13: /* 13*4 = 52 <= 56 */ return (U32)decode_coeffs_runlength_4<13>(results, buffer, coeff_position, sign_position, count);
    case 14: /* 14*4 = 56 <= 56 */ return (U32)decode_coeffs_runlength_4<14>(results, buffer, coeff_position, sign_position, count);
    case 15:
    {
      // super rare and can't really group in a convenient way - just do as singles.
      return (U32)decode_coeff_remnants<15>(0, results, buffer, coeff_position, sign_position, count);
    }
  }
  // invalid data.
  return (U32)sign_position;
}

// Decode one channel's DCT coefficients from the bitstream into
// samps[0..transform_size). Bink Audio "2" layout: each run's sign bits are
// stored AFTER that run's coefficient bits. On any bounds-check failure the
// whole output block is zeroed (corrupt input must never crash the decoder).
//
// samps              - output coefficient array, transform_size floats
// num_bands          - number of critical bands; 'bands' holds the band edges
// vbp                - bitstream cursor, updated to the final read position
// padded_in_data_end - end of the (padded) input buffer, for validation
static void read_channel_data_2( F32 * samps, U32 transform_size, U32 num_bands, BINKVARBITS * vbp, U32 * bands, void * padded_in_data_end )
{
  size_t i = 0;
  size_t b;
  F32 threshold[TOTBANDS + 2];

  // convert from varbits to base + offset
  U8* buffer = ((U8*)vbp->cur) - 8;
  size_t bit_position = 64 - vbp->bitlen;
  U32 end_bit_position = (U32)(((U8*)padded_in_data_end - buffer) * 8);

  // validate we can get to the transform
  U32 bits_needed = num_bands * 8 + FXPBITS * 2;
  if (validate_chunked_bit_read(bit_position, end_bit_position, bits_needed) == false)
  {
    // corrupted data.
read_chan_data_corrupted:
    ourmemsetzero(samps, transform_size * 4);
    return;
  }

  // read the first two
  {
    // 29 bits each (FXPBITS)
    U64 fxp = read_up_to_56_bits(buffer, bit_position);
    bit_position += FXPBITS;
    fxp &= ((1 << FXPBITS) - 1);
    samps[0] = fxptof((U32)fxp);
    fxp = read_up_to_56_bits(buffer, bit_position);
    bit_position += FXPBITS;
    fxp &= ((1 << FXPBITS) - 1);
    samps[1] = fxptof((U32)fxp);
  }

  // unquantize the thresholds
  threshold[0] = 0;
  for (; i < num_bands; i++)
  {
    // each threshold lookup is 7 bits
    U32 j = read_up_to_24_bits(buffer, bit_position);
    bit_position += 7;
    j &= 0x7f;
    if (j > 95) // sizeof(Undecibel)/sizeof(*Undecibel) - 1
      j = 95; // Only look so far into the table, the rest is inaudible range
    threshold[i + 1] = bink_Undecibel_table[j];//(F32) Undecibel( ( (F32) (S32) j ) * 0.664F );
  }

  // decode the rle runs
  b = 0;
  for (i = 2; i < transform_size; )
  {
    int bitlen;
    size_t tmp;
    size_t end;

    if (validate_chunked_bit_read(bit_position, end_bit_position, 9) == false)
      goto read_chan_data_corrupted;

    // 9 bits contains all our metadata
    tmp = read_up_to_24_bits(buffer, bit_position);
    if (tmp & 1) // is rle?
    {
      // next 4 bits are index in to rle run lengths
      // 1 flag + 4 rle + 4 bitlen
      bit_position += 9;
      end = ((tmp >> 1) & 15);
      tmp >>= 5;
      end = i + (bink_rlelens_snd[end] * VQLENGTH);
    }
    else
    {
      // a single run
      // 1 flag + 4 bitlen
      bit_position += 5;
      tmp >>= 1;
      end = i + VQLENGTH;
    }
    if (end > transform_size)
      end = transform_size;

    // remaining 4 bits are bitlen
    bitlen = tmp & 15;
    if (bitlen == 0)
    {
      // zero run - no coefficient payload follows
      ourmemsetzero(samps + i, (end - i) * 4);
      i = end;
    }
    else
    {
      size_t coeff_bits_needed = (bitlen * (end - i));
      size_t sign_bit_position = bit_position + coeff_bits_needed;
      {
        //
        // Validation
        //
        // The farthest bit we can actually use, assuming all non-zero coeffs,
        // is sign_bit_position + run_length (minus 1).
        //
        // However, for SSSE3 we read 128 bit chunks for coeffs, and 32 bit chunks
        // for signs. So a short run of 8 (very common) could mean the coeff vector
        // read is actually farther than the sign read. So we have to check both.
        //
        if (validate_chunked_bit_read(sign_bit_position, end_bit_position, (end - i)) == false)
          goto read_chan_data_corrupted;

#if defined(__RADX64__) && defined(RAD_USES_SSSE3)
#ifndef FORCE_SCALAR
        if (CPU_can_use(CPU_SSSE3))
        {
          if (validate_chunked_bit_read(bit_position, end_bit_position, coeff_bits_needed) == false)
            goto read_chan_data_corrupted;
          bit_position = decode_pfxsum(samps + i, buffer, bit_position, sign_bit_position, end_bit_position, end - i, bitlen);
        }
        else
#endif
        {
          if (validate_chunked_bit_read(bit_position, end_bit_position, coeff_bits_needed) == false)
            goto read_chan_data_corrupted;
          bit_position = decode_scalar(samps + i, buffer, bit_position, sign_bit_position, end - i, bitlen);
        }
#else
        if (validate_chunked_bit_read(bit_position, end_bit_position, coeff_bits_needed) == false)
          goto read_chan_data_corrupted;
        bit_position = decode_scalar(samps + i, buffer, bit_position, sign_bit_position, end - i, bitlen);
#endif

        // all of the decoded values need to be scaled by the threshold,
        // which changes depending on the current band.
        while (i < end)
        {
          size_t bandend;

          // figure out which band we're in
          while (i >= (bands[b] * 2))
            ++b;
          bandend = bands[b] * 2;
          if (end < bandend)
            bandend = end;

#ifdef __RADX64__ // We can assume SSE2
          {
            size_t thresh_count = bandend - i;
            size_t simd_runs_end = thresh_count & ~0x7;
            __m128 threshold_vec = _mm_set_ps1(threshold[b]);
            size_t t = 0;
            for (t = 0; t < simd_runs_end; t += 8)
            {
              __m128 svec1 = _mm_loadu_ps(samps + i + t);
              __m128 svec2 = _mm_loadu_ps(samps + i + t + 4);
              svec1 = _mm_mul_ps(svec1, threshold_vec);
              svec2 = _mm_mul_ps(svec2, threshold_vec);
              _mm_storeu_ps(samps + i + t, svec1);
              _mm_storeu_ps(samps + i + t + 4, svec2);
            }
            // this is here to keep some compilers from autogenerating
            // the 8x unroll if it supports it (vs2010 doesnt, 2019 does).
// since cdep builds with vs2010, we need to manually do the one // above. if ((thresh_count - t) < 8) { for (; t < thresh_count; t++) samps[i + t] = threshold[b] * samps[i + t]; } } #elif __RADARM__ { size_t thresh_count = bandend - i; size_t simd_runs_end = thresh_count & ~0x7; float32x4_t threshold_vec = vdupq_n_f32(threshold[b]); size_t t = 0; for (t = 0; t < simd_runs_end; t += 8) { float32x4_t one = vld1q_f32(samps + i + t); float32x4_t two = vld1q_f32(samps + i + t + 4); one = vmulq_f32(one, threshold_vec); two = vmulq_f32(two, threshold_vec); vst1q_f32(samps + i + t, one); vst1q_f32(samps + i + t + 4, two); } for (; t < thresh_count; t++) samps[i + t] = threshold[b] * samps[i + t]; } #else for (size_t t=i; t < bandend; t++) samps[t] = threshold[b] * samps[t]; #endif i = bandend; } // end rle } // artificial scope } // end if not zeros } // end transform size // Convert back from base + offset to varbits buffer += 8 * (bit_position / 64); bit_position &= 63; vbp->bits = *(U64*)buffer; vbp->cur = (U64*)buffer + 1; vbp->bitlen = (U32)(64 - bit_position); vbp->bits >>= bit_position; } #else // __RAD64__ static RADINLINE void short_to_float_4(float* output, short* input) { size_t i = 0; for (i = 0; i < 4; i++) output[i] = (float)input[i]; } template static RADINLINE size_t decode_coeffs_runlength_4(float* results, const uint8_t* buffer, size_t coeff_position, size_t sign_position, size_t count) { // 4 ceoffs fits in the 24 bits we can get - so we grab chunks of 4 size_t run_count = count & ~3; size_t run = 0; for (run = 0; run < run_count; run+=4) { S16 coeffs[4]; unsigned char current_sign_bit = 0; U32 coeff_bits = read_up_to_24_bits(buffer, coeff_position); coeff_position += 4 * bitlen; U32 sign_bits = read_up_to_24_bits(buffer, sign_position); decode_coeff(current_sign_bit, coeff_bits, sign_bits, coeffs); decode_coeff(current_sign_bit, coeff_bits, sign_bits, coeffs); decode_coeff(current_sign_bit, coeff_bits, sign_bits, coeffs); decode_coeff(current_sign_bit, 
coeff_bits, sign_bits, coeffs); sign_position += current_sign_bit; short_to_float_4(results + run, coeffs); } return decode_coeff_remnants(run, results, buffer, coeff_position, sign_position, count); } static U32 RADINLINE decode_scalar(float* results, const uint8_t* buffer, size_t coeff_position, size_t sign_position, size_t count, int bitlen) { switch (bitlen) { case 1: /* 1*4 = 4 <= 24 */ return (U32)decode_coeffs_runlength_4<1>(results, buffer, coeff_position, sign_position, count); case 2: /* 2*4 = 8 <= 24 */ return (U32)decode_coeffs_runlength_4<2>(results, buffer, coeff_position, sign_position, count); case 3: /* 3*4 = 12 <= 24 */ return (U32)decode_coeffs_runlength_4<3>(results, buffer, coeff_position, sign_position, count); case 4: /* 4*4 = 16 <= 24 */ return (U32)decode_coeffs_runlength_4<4>(results, buffer, coeff_position, sign_position, count); case 5: /* 5*4 = 20 <= 24 */ return (U32)decode_coeffs_runlength_4<5>(results, buffer, coeff_position, sign_position, count); case 6: /* 6*4 = 24 <= 24 */ return (U32)decode_coeffs_runlength_4<6>(results, buffer, coeff_position, sign_position, count); case 7: return (U32)decode_coeff_remnants<7>(0, results, buffer, coeff_position, sign_position, count); case 8: return (U32)decode_coeff_remnants<8>(0, results, buffer, coeff_position, sign_position, count); case 9: return (U32)decode_coeff_remnants<9>(0, results, buffer, coeff_position, sign_position, count); case 10: return (U32)decode_coeff_remnants<10>(0, results, buffer, coeff_position, sign_position, count); case 11: return (U32)decode_coeff_remnants<11>(0, results, buffer, coeff_position, sign_position, count); case 12: return (U32)decode_coeff_remnants<12>(0, results, buffer, coeff_position, sign_position, count); case 13: return (U32)decode_coeff_remnants<13>(0, results, buffer, coeff_position, sign_position, count); case 14: return (U32)decode_coeff_remnants<14>(0, results, buffer, coeff_position, sign_position, count); case 15: return 
(U32)decode_coeff_remnants<15>(0, results, buffer, coeff_position, sign_position, count);
  }
  // invalid data.
  return (U32)sign_position;
}

// 32-bit chunked bit read: returns 32 usable bits starting at bit_position
// (stitches two dword loads together when the position is not byte aligned).
static RADINLINE U32 read_up_to_32_bits(const U8* buffer, size_t bit_position)
{
  uint32_t x1;
  uint32_t x2;
  memcpy(&x1, buffer + (bit_position >> 3), sizeof(x1));
  if ((bit_position & 7) == 0)
    return x1;
  memcpy(&x2, buffer + (bit_position >> 3) + 4, sizeof(x2));
  uint32_t x1_aligned = x1 >> (bit_position & 7);
  uint32_t x2_aligned = x2 << (32 - (bit_position & 7));
  return x1_aligned | x2_aligned;
}

// 32-bit-target variant of read_channel_data_2: decode one channel's
// coefficients into samps[0..transform_size), zeroing the whole block on any
// bounds-check failure. Mirrors the 64-bit version but drives 32-bit varbits
// state and 24/32-bit chunked reads.
static void read_channel_data_2( F32 * samps, U32 transform_size, U32 num_bands, BINKVARBITS * vbp, U32 * bands, void * padded_in_data_end )
{
  size_t i = 0;
  size_t b;
  F32 threshold[TOTBANDS + 2];

  // convert from varbits to base + offset
  U8* buffer = ((U8*)vbp->cur) - 4;
  size_t bit_position = 32 - vbp->bitlen;
  U32 end_bit_position = (U32)(((U8*)padded_in_data_end - buffer) * 8);

  // validate we can get to the transform
  U32 bits_needed = num_bands * 8 + FXPBITS * 2;
  if (validate_chunked_bit_read(bit_position, end_bit_position, bits_needed) == false)
  {
    // corrupted data.
read_chan_data_corrupted: ourmemsetzero(samps, transform_size * 4); return; } // read the first two { // 29 bits each (FXPBITS) U32 fxp = read_up_to_32_bits(buffer, bit_position); bit_position += FXPBITS; fxp &= ((1 << FXPBITS) - 1); samps[0] = fxptof((U32)fxp); fxp = read_up_to_32_bits(buffer, bit_position); bit_position += FXPBITS; fxp &= ((1 << FXPBITS) - 1); samps[1] = fxptof((U32)fxp); } // unquantize the thresholds threshold[0] = 0; for (; i < num_bands; i++) { // each threshold lookup is 7 bits U32 j = read_up_to_24_bits(buffer, bit_position); bit_position += 7; j &= 0x7f; if (j > 95) // sizeof(Undecibel)/sizeof(*Undecibel) - 1 j = 95; // Only look so far into the table, the rest is inaudible range threshold[i + 1] = bink_Undecibel_table[j];//(F32) Undecibel( ( (F32) (S32) j ) * 0.664F ); } // decode the rle runs b = 0; for (i = 2; i < transform_size; ) { int bitlen; size_t tmp; size_t end; if (validate_chunked_bit_read(bit_position, end_bit_position, 9) == false) goto read_chan_data_corrupted; // 9 bits contains all our metadata tmp = read_up_to_24_bits(buffer, bit_position); if (tmp & 1) // is rle? { // next 4 bits are index in to rle run lengths // 1 flag + 4 rle + 4 bitlen bit_position += 9; end = ((tmp >> 1) & 15); tmp >>= 5; end = i + (bink_rlelens_snd[end] * VQLENGTH); } else { // a single run // 1 flag + 4 bitlen bit_position += 5; tmp >>= 1; end = i + VQLENGTH; } if (end > transform_size) end = transform_size; // remaining 4 bits are bitlen bitlen = tmp & 15; if (bitlen == 0) { ourmemsetzero(samps + i, (end - i) * 4); i = end; } else { size_t coeff_bits_needed = (bitlen * (end - i)); size_t sign_bit_position = bit_position + coeff_bits_needed; { // // Validation // // The farthest bit we can actually use, assuming all non-zero coeffs, // is sign_bit_position + run_length (minus 1). // // However, for SSSE3 we read 128 bit chunks for coeffs, and 32 bit chunks // for signs. 
So a short run of 8 (very common) could mean the coeff vector // read is actually farther than the sign read. So we have to check both. // if (validate_chunked_bit_read(sign_bit_position, end_bit_position, (end - i)) == false) goto read_chan_data_corrupted; { if (validate_chunked_bit_read(bit_position, end_bit_position, coeff_bits_needed) == false) goto read_chan_data_corrupted; bit_position = decode_scalar(samps + i, buffer, bit_position, sign_bit_position, end - i, bitlen); } // all of the decoded values need to be scaled by the threshold, // which changes depending on the current band. while (i < end) { size_t bandend; // figure out which band we're in while (i >= (bands[b] * 2)) ++b; bandend = bands[b] * 2; if (end < bandend) bandend = end; for (size_t t=i; t < bandend; t++) samps[t] = threshold[b] * samps[t]; i = bandend; } // end rle } // artificial scope } // end if not zeros } // end transform size // Convert back from base + offset to varbits buffer += 4 * (bit_position / 32); bit_position &= 31; vbp->bits = *(U32*)buffer; vbp->cur = (U32*)buffer + 1; vbp->bitlen = (U32)(32 - bit_position); vbp->bits >>= bit_position; } #endif // __RAD64__ static void read_channel_data(F32* samps, U32 transform_size, U32 num_bands, BINKVARBITS* vbp, U32* bands, void* padded_in_data_end) { // bink audio 1 version - signs after coeffs U32 i, b; F32 threshold[ TOTBANDS + 2 ]; void * init; BINKBITSLOCALS( lvb ); VarBitsCopyToBinkBits( lvb, *vbp ); init = lvbcur; // read the first two BinkBitsGet( i, U32, lvb, FXPBITS, ((1< 95) // sizeof(Undecibel)/sizeof(*Undecibel) - 1 j = 95; // Only look so far into the table, the rest is inaudible range threshold[ i + 1 ] = bink_Undecibel_table[ j ];//(F32) Undecibel( ( (F32) (S32) j ) * 0.664F ); } b = 0; for( i = 2 ; i < transform_size ; ) { U32 bitlen; U32 tmp; U32 end; BinkBitsPeek( tmp, U32, lvb, 1+4+4 ); if ( tmp & 1 ) { BinkBitsUse( lvb, 9 ); end = ( ( tmp >> 1 ) & 15 ); tmp >>= 5; end = i + ( bink_rlelens_snd[ end ] * VQLENGTH ); } 
else { BinkBitsUse( lvb, 5 ); tmp >>= 1; end = i + VQLENGTH; } if ( end > transform_size ) end = transform_size; bitlen = tmp & 15; if ( bitlen == 0 ) { clear: ourmemsetzero( samps+i, (end-i)*4 ); i = end; } else { U32 bitmask = ( 1 << bitlen ) - 1; { U32 bp1 = bitlen + 1; if ( ( ((U8*)lvbcur) + (((( end - i )*bp1)+9)/8) ) > (U8*)padded_in_data_end ) { // check to see if the maximum bit use per count (bp1 per sample) plus the next // we are going to read passed out end point, which means a corruption // has occurred - zap this range and get out end = transform_size; // clear to end, so we just fall out of the loop lvbcur = init; // reset read position goto clear; } while ( i < end ) { U32 bandend; F32 q[2]; F32 * s, * s_end; //U32 v; U32 used; S32 val; // figure out which band we're in while ( i >= ( bands[ b ] * 2 ) ) ++b; // decode either up to "end" or to the end of this band, // whichever is earlier. bandend = bands[ b ] * 2; if ( end < bandend ) bandend = end; s = samps + i; s_end = samps + bandend; q[0] = threshold[b]; q[1] = -threshold[b]; // do four at a times for smaller bitlens if ( ( ( i + 4 ) <= bandend ) && ( bp1 <= (MAX_AT_LEAST_BITS/4) ) ) // div 4 because we unroll 4 reads { U32 bp14 = bp1*4; s_end -= 4; do { BinkBitsAtLeastStart( lvb, bp14 ); val = ((S32)BinkBitsInAtLeastPeek(lvb)) & bitmask; BinkBitsInAtLeastUse( lvb, bitlen ); s[0] = ((F32)val)*q[BinkBitsInAtLeastPeek(lvb)&1]; used = val != 0; BinkBitsInAtLeastUse( lvb, used ); val = ((S32)BinkBitsInAtLeastPeek(lvb)) & bitmask; BinkBitsInAtLeastUse( lvb, bitlen ); s[1] = ((F32)val)*q[BinkBitsInAtLeastPeek(lvb)&1]; used = val != 0; BinkBitsInAtLeastUse( lvb, used ); val = ((S32)BinkBitsInAtLeastPeek(lvb)) & bitmask; BinkBitsInAtLeastUse( lvb, bitlen ); s[2] = ((F32)val)*q[BinkBitsInAtLeastPeek(lvb)&1]; used = val != 0; BinkBitsInAtLeastUse( lvb, used ); val = ((S32)BinkBitsInAtLeastPeek(lvb)) & bitmask; BinkBitsInAtLeastUse( lvb, bitlen ); s[3] = ((F32)val)*q[BinkBitsInAtLeastPeek(lvb)&1]; used = 
val != 0; BinkBitsInAtLeastUse( lvb, used ); BinkBitsAtLeastEnd( lvb ); s += 4; } while ( s <= s_end ); s_end += 4; i = (U32)( s_end - s ); if ( i >= bandend ) continue; } // now the remenants (or all of them, if the bitlen is big) RADASSUME( ( s_end - s ) < 4 ); // hopefully no unroll, sigh while ( s < s_end ) { BinkBitsAtLeastStart( lvb, bp1 ); val = ((S32)BinkBitsInAtLeastPeek(lvb)) & bitmask; BinkBitsInAtLeastUse( lvb, bitlen ); *s = ((F32)val)*q[BinkBitsInAtLeastPeek(lvb)&1]; used = val != 0; BinkBitsInAtLeastUse( lvb, used ); BinkBitsAtLeastEnd(lvb); ++s; } i = bandend; } } } } BinkBitsCopyToVarBits( *vbp, lvb ); } // we return the pointer in out_ptr, if we have room, otherwise we return 0, and you have to alloca a buffer static int have_room( void ** out_ptr, BINKAC_OUT_RINGBUF * out, U32 need ) { U32 have; have = (U32)( ((U8*)out->outend) - ((U8*)out->outptr) ); if ( have > out->outlen ) have = out->outlen; // start by assuming we fit *out_ptr = out->outptr; // do we fit at the current pointer (at outptr)? if ( need > have ) { // nope, well, would we fit at the front of the circular buffer? have = out->outlen - have; if ( need > have ) return 0; // nope! *out_ptr = out->outstart; // use front of buffer } return 1; } static void update_ring( BINKAC_OUT_RINGBUF * out, S16 const * from, U32 bytes ) { U32 left; out->decoded_bytes += bytes; // if we have to eat front data, do it here if ( out->eatfirst ) { if ( out->eatfirst > bytes ) { out->eatfirst -= bytes; bytes = 0; out->outlen = 0; return; } else { bytes -= out->eatfirst; from = (S16*)( ((U8*)from) + out->eatfirst ); out->eatfirst = 0; } } // limit to how much left if ( bytes > out->outlen ) bytes = out->outlen; out->outlen = bytes; // how much to the end of the ring buffer? 
left = (U32) ( ((U8*)out->outend) - ((U8*)out->outptr) ); // limit to how many bytes that we have if ( left > bytes ) left = bytes; // move the data, if we have to if ( from != out->outptr ) { ourmemcpy( (S16*)out->outptr, from, left ); from = (S16*)( ((U8*)from) + left ); out->outptr = (S16*)( ((U8*)out->outptr) + left ); bytes -= left; if ( bytes ) { out->outptr = out->outstart; ourmemcpy( out->outptr, from, bytes ); out->outptr = (S16*)( ((U8*)out->outptr) + bytes ); } } else { out->outptr = (S16*)( ((U8*)out->outptr) + bytes ); } } static void linear_inverse_transform_to_s16( U32 flags, S16 * buf, F32 * decoded_coeffs, U32 transform_size, F32 transform_size_root, S16 * overlap, U32 window_size_in_bytes, U32 window_shift ) { F32 * f; f = (F32*) alloca( ( sizeof(F32)*transform_size ) + 64 ); // plus 64 for align f = (F32*) ( ( ( (UINTa)f ) + 63 ) & ~63 ); // align it // do the inverse transform { radfft_idct_to_S16( buf, transform_size_root, f, decoded_coeffs, transform_size ); } // fade in the front if ( window_shift ) CallCrossFade( buf, overlap, window_size_in_bytes, window_shift ); // Store end of buffer if ( window_size_in_bytes ) ourmemcpy( overlap, (U8*)(buf + transform_size) - window_size_in_bytes, window_size_in_bytes ); } static void inverse_transform_to_s16( U32 flags, BINKAC_OUT_RINGBUF * out, F32 * decoded_coeffs, U32 transform_size, F32 transform_size_root, S16 * overlap, U32 window_size_in_bytes, U32 window_shift ) { S16 * buf; U32 need; need = transform_size * sizeof(S16); if ( !have_room( (void**)&buf, out, need ) ) { buf = (S16*)alloca( need + 16 ); buf = (S16*) ( ( ( (UINTa)buf ) + 15 ) & ~15 ); // align it } linear_inverse_transform_to_s16( flags, buf, decoded_coeffs, transform_size, transform_size_root, overlap, window_size_in_bytes, window_shift ); update_ring( out, buf, ( sizeof(S16) * transform_size ) - window_size_in_bytes ); } static void inverse_transform_to_s16_stereo( U32 flags, BINKAC_OUT_RINGBUF * out, S16 * buf, F32 * 
decoded_coeffs, U32 transform_size, F32 transform_size_root, S16 * overlap, U32 window_size_in_bytes, U32 window_shift ) { F32 * f; S16 * left; left = buf + transform_size; f = (F32*) alloca( ( sizeof(F32)*transform_size ) + 64 ); // plus 64 for align f = (F32*) ( ( ( (UINTa)f ) + 63 ) & ~63 ); // align it // do the inverse transform radfft_idct_to_S16_stereo_interleave( buf, left, transform_size_root, f, decoded_coeffs, transform_size ); rrassert( window_size_in_bytes != 0 ); // fade in the front if ( window_shift ) CallCrossFade( buf, overlap, window_size_in_bytes, window_shift ); // Store end of buffer if ( window_size_in_bytes ) ourmemcpy( overlap, ((U8*)(buf + (transform_size*2))) - window_size_in_bytes, window_size_in_bytes ); update_ring( out, buf, ( sizeof(S16) * transform_size * 2 ) - window_size_in_bytes ); } // decode the data into an output buffer and return amount read from input static void decode_frame( U32 transform_size, F32 transform_size_root, U32 chans, U32 flags, BINKAC_OUT_RINGBUF * output, BINKAC_IN * input, U32 num_bands, U32 * bands, U32 window_size_in_bytes, U32 window_shift, S16 * overlap ) { BINKVARBITS vb; F32 * decoded_coeffs; void * padded_in_data_end; decoded_coeffs = (F32*) alloca( ( sizeof(F32) * transform_size ) + 64 + 32 ); // plus 64 for align, 32 for zero overwrite decoded_coeffs = (F32*) ( ( ( (UINTa)decoded_coeffs ) + 63 ) & ~63 ); // align it padded_in_data_end = ( (U8*)input->inend ) + BINKACD_EXTRA_INPUT_SPACE; BinkVarBitsOpen( vb, input->inptr ); { BinkVarBitsUse( vb, 2 ); } if ( chans == 1 ) { if (flags & BINKAC20) read_channel_data_2(decoded_coeffs, transform_size, num_bands, &vb, bands, padded_in_data_end); else read_channel_data( decoded_coeffs, transform_size, num_bands, &vb, bands, padded_in_data_end ); inverse_transform_to_s16( flags, output, decoded_coeffs, transform_size, transform_size_root, overlap, window_size_in_bytes, window_shift ); } else { if ( flags & BINKACNODEINTERLACE ) { U32 eatfirst; if ( 
window_shift ) --window_shift; window_size_in_bytes >>= 1; eatfirst = output->eatfirst; output->eatfirst >>= 1; // only eat half for left channel if (flags & BINKAC20) read_channel_data_2(decoded_coeffs, transform_size, num_bands, &vb, bands, padded_in_data_end); else read_channel_data( decoded_coeffs, transform_size, num_bands, &vb, bands, padded_in_data_end ); inverse_transform_to_s16( flags, output, decoded_coeffs, transform_size, transform_size_root, overlap, window_size_in_bytes, window_shift ); output->eatfirst = eatfirst - ( (eatfirst>>1)-output->eatfirst ); // shrink the eatfirst by the amount that the left channel consumed if (flags & BINKAC20) read_channel_data_2(decoded_coeffs, transform_size, num_bands, &vb, bands, padded_in_data_end); else read_channel_data( decoded_coeffs, transform_size, num_bands, &vb, bands, padded_in_data_end ); inverse_transform_to_s16( flags, output, decoded_coeffs, transform_size, transform_size_root, (S16*)(((U8*)overlap)+window_size_in_bytes), window_size_in_bytes, window_shift ); } else { U32 need; S16 * buf; need = transform_size * 2 * sizeof(S16); // 2 for stereo // make a ring buf struct just for the left output channels (we interlace in the second inverse_transform) if ( !have_room( (void**)&buf, output, need ) ) { // use temp stack buffer buf = (S16*)alloca( need + 16 ); buf = (S16*) ( ( ( (UINTa)buf ) + 15 ) & ~15 ); // align it } if (flags & BINKAC20) read_channel_data_2(decoded_coeffs, transform_size, num_bands, &vb, bands, padded_in_data_end); else read_channel_data( decoded_coeffs, transform_size, num_bands, &vb, bands, padded_in_data_end ); linear_inverse_transform_to_s16( flags, buf + transform_size, decoded_coeffs, transform_size, transform_size_root, 0, 0, 0 ); if (flags & BINKAC20) read_channel_data_2(decoded_coeffs, transform_size, num_bands, &vb, bands, padded_in_data_end); else read_channel_data( decoded_coeffs, transform_size, num_bands, &vb, bands, padded_in_data_end ); inverse_transform_to_s16_stereo( 
flags, output, buf, decoded_coeffs, transform_size, transform_size_root, overlap, window_size_in_bytes, window_shift ); } } // Store the results back in data input->inptr = ( (U8*) input->inptr ) + BinkVarBitsSizeBytesRoundedToU32( vb, input->inptr ); } #define roundup( val ) ( ( ( val ) + 15 ) & ~15 ) RADDEFFUNC U32 RADLINK BinkAudioDecompressMemory( U32 rate, U32 chans, U32 flags ) { U32 ptr_end; U32 work_end; U32 overlap_end; U32 transform_size, buffer_size; if ( rate >= 44100 ) transform_size = 2048; else if ( rate >= 22050 ) transform_size = 1024; else transform_size = 512; // in bytes buffer_size = transform_size * chans * 2; ptr_end = roundup( sizeof( BINKAUDIODECOMP ) ); overlap_end = roundup( ptr_end + ( buffer_size / WINDOWRATIO ) ); work_end = overlap_end; return work_end; } RADDEFFUNC void RADLINK BinkAudioDecompressResetStartFrame(void* mem) { HBINKAUDIODECOMP ba = (HBINKAUDIODECOMP)mem; // if we stop and want to restart the decompression at a new spot, we need to clear // the start frame - otherwise we blend in the last frame. 
if (ba) ba->start_frame = 1; } #define SQRT2 1.41421356237309504880f static F32 tsr4096 = 2.0f / 64.0f; static F32 tsr2048 = 2.0f / (32.0f*SQRT2); static F32 tsr1024 = 2.0f / 32.0f; static F32 tsr512 = 2.0f / (16.0f*SQRT2); RADDEFFUNC U32 RADLINK BinkAudioDecompressOpen( void * mem, U32 rate, U32 chans, U32 flags ) { U32 i; U32 transform_size, transform_size_half, buffer_size; F32 transform_size_root, transform_size_root_big; U32 num_bands; S32 nyq; HBINKAUDIODECOMP ba; CPU_check( 0, 0 ); if ( rate >= 44100 ) { transform_size = 2048; transform_size_root = tsr2048; transform_size_root_big = tsr4096; } else if ( rate >= 22050 ) { transform_size = 1024; transform_size_root = tsr1024; transform_size_root_big = tsr2048; } else { transform_size = 512; transform_size_root = tsr512; transform_size_root_big = tsr1024; } // in bytes buffer_size = transform_size * chans * 2; if ( transform_size > MAX_TRANSFORM ) return 0; transform_size_half = transform_size / 2; nyq = ( rate + 1 ) / 2; // calculate the number of bands we'll use for( i = 0 ; i < TOTBANDS ; i++ ) { if ( bink_bandtopfreq[ i ] >= (U32) nyq ) break; } num_bands = i; // allocate our memory { U32 ptr_end; U32 work_end; U32 overlap_end; ptr_end = roundup( sizeof( BINKAUDIODECOMP ) ); overlap_end = roundup( ptr_end + ( buffer_size / WINDOWRATIO ) ); work_end = overlap_end; ba = (HBINKAUDIODECOMP) mem; ourmemsetzero( ba, sizeof( BINKAUDIODECOMP ) ); ba->overlap = (S16*) ( ( (U8*) ba ) + ptr_end ); ba->size = work_end; radfft_init(); } ba->flags = flags; if ( chans == 1 ) ba->flags &= ~BINKACNODEINTERLACE; ba->chans = chans; ba->num_bands = num_bands; ba->transform_size = transform_size; ba->buffer_size = buffer_size; ba->window_size_in_bytes = buffer_size / WINDOWRATIO; ba->window_shift = 0; switch (ba->window_size_in_bytes) // shift amount to divide by number of samples in the window. 
{ case 512: ba->window_shift = 8; break; case 256: ba->window_shift = 7; break; case 128: ba->window_shift = 6; break; case 64: ba->window_shift = 5; break; default: RR_BREAK(); } ba->transform_size_root = transform_size_root; // calculate the band ranges for( i = 0 ; i < num_bands ; i++ ) { ba->bands[ i ] = ( bink_bandtopfreq[ i ] * transform_size_half ) / nyq; if ( ba->bands[ i ] == 0 ) ba->bands[ i ] = 1; } ba->bands[ i ] = transform_size_half; ba->start_frame = 1; return 1; } RADDEFFUNC void RADLINK BinkAudioDecompress( void* mem, BINKAC_OUT_RINGBUF * output, BINKAC_IN * input ) { HBINKAUDIODECOMP ba = (HBINKAUDIODECOMP)mem; output->decoded_bytes = 0; // wrap pointer, if outer ringbuffer code didn't if ( output->outptr == output->outend ) output->outptr = output->outstart; decode_frame( ba->transform_size, ba->transform_size_root, ba->chans, ba->flags, output, input, ba->num_bands, ba->bands, ba->window_size_in_bytes, ( ba->start_frame ) ? 0 : ba->window_shift, ba->overlap ); // reset after decoding each frame ba->start_frame = 0; } RADDEFFUNC U32 RADLINK BinkAudioDecompressOutputSize(void* mem) { HBINKAUDIODECOMP ba = (HBINKAUDIODECOMP)mem; return ba->buffer_size; } //#define TEST_READ_CHAN_DATA #ifdef TEST_READ_CHAN_DATA #include #include "ticks.h" #include #include using namespace std; static uint32_t read_bits_ref(const uint8_t * buffer, size_t buffer_len, size_t bitpos, int width) { uint32_t bytes = 0; size_t begin = bitpos >> 3; size_t end = (bitpos + width + 7) >> 3; // little-endian read (clamped) int shift = 0; for (size_t pos = begin; pos < end; ++pos, shift += 8) { assert(pos < buffer_len); bytes |= buffer[pos] << shift; } uint32_t aligned = bytes >> (bitpos & 7); uint32_t mask = (1u << width) - 1; return aligned & mask; } // reads a fully qualified bink audio 2 data stream for 1 channel. 
static bool read_channel_data_ref(float * results, const uint8_t * buffer, size_t buffer_len, int total_entries, size_t initial_bitpos) { // read initial values - test stream sets these to 0 size_t bitpos = initial_bitpos; results[0] = fxptof(read_bits_ref(buffer, buffer_len, bitpos, FXPBITS)); bitpos += FXPBITS; results[1] = fxptof(read_bits_ref(buffer, buffer_len, bitpos, FXPBITS)); bitpos += FXPBITS; // test stream has 1 threshold value, which is 0, corresponding to a // 1.0f multiplier. U32 threshold_index = read_bits_ref(buffer, buffer_len, bitpos, 7); assert(threshold_index == 0); float threshold = bink_Undecibel_table[threshold_index]; bitpos += 7; for (int i = 2; i < total_entries;) { // decode the run - must be a single run entry if (read_bits_ref(buffer, buffer_len, bitpos, 1) != 0) { assert(0); return false; } bitpos++; uint32_t bitlen = read_bits_ref(buffer, buffer_len, bitpos, 4); bitpos += 4; int runlength = 8; size_t sign_pos = bitpos + runlength * bitlen; for (int r = 0; r < runlength; r++, i++) { int32_t coeff = read_bits_ref(buffer, buffer_len, bitpos, bitlen); bitpos += bitlen; if (coeff) { if (read_bits_ref(buffer, buffer_len, sign_pos, 1)) coeff = -coeff; sign_pos++; } results[i] = (float)coeff * threshold; } bitpos = sign_pos; } return true; } static void encode_bits(U8* bitstream, size_t bitstream_len, size_t bitpos, unsigned int value, int bitlen) { size_t bitend = bitpos + bitlen; size_t bytepos = bitpos >> 3; size_t byteend = (bitend + 7) >> 3; size_t bytecount = byteend - bytepos; uint64_t code_shifted = value << (bitpos & 7); for (size_t j = 0; j < bytecount; ++j) { assert(bytepos + j < bitstream_len); bitstream[bytepos + j] |= code_shifted >> (j * 8); } } static void encode_value(U8* bitstream, size_t bitstream_len, size_t bitpos, size_t &sign_bitpos, int value, int bitlen) { int magnitude = abs(value); assert(magnitude < (1 << bitlen)); size_t bitend = bitpos + bitlen; size_t bytepos = bitpos >> 3; size_t byteend = (bitend + 7) >> 3; 
size_t bytecount = byteend - bytepos; uint64_t code_shifted = magnitude << (bitpos & 7); for (size_t j = 0; j < bytecount; ++j) { assert(bytepos + j < bitstream_len); bitstream[bytepos + j] |= code_shifted >> (j * 8); } if (magnitude != 0) { if (value < 0) { assert((sign_bitpos >> 3) < bitstream_len); bitstream[sign_bitpos >> 3] |= 1 << (sign_bitpos & 7); } ++sign_bitpos; } } static bool test_all_signs(size_t initial_bitpos) { // always use bitlen=1 and 8*256 values for this test static const int ngroups = 256; // all possible patterns of 8 lanes having/not having data static const int count = ngroups * 8; int num_bands = 1; U32 bands[2] = {0, count + 3}; // set up the bands so we always use the first threshold (1.0) size_t startup_bitcount = 7 + FXPBITS*2; // 1 band and 2 initial samples size_t val_bitcount = count; // 1 bit per val for this test. size_t sign_bitcount = count; // sign bits are max 1 per size_t rle_bitcount = ngroups * 5; size_t write_pos = initial_bitpos; size_t end_pos_conservative = startup_bitcount + val_bitcount + rle_bitcount + sign_bitcount + 128; // +128 is extra padding for our sloppy reads size_t bitstream_len = (end_pos_conservative + 7) >> 3; U8* bitstream = (U8*)malloc(bitstream_len); memset(bitstream, 0, bitstream_len); // encode the first two samples encode_bits(bitstream, bitstream_len, write_pos, 0, FXPBITS); write_pos += FXPBITS; encode_bits(bitstream, bitstream_len, write_pos, 0, FXPBITS); write_pos += FXPBITS; // Encode the threshold index encode_bits(bitstream, bitstream_len, write_pos, 0, 7); write_pos += 7; int sign = 1; size_t sign_bitpos = write_pos; for (int i = 0; i < count; ++i) { int grp = i / 8; // index of group int pos_in_grp = i % 8; if (pos_in_grp == 0) { // 8 is the VQLENGTH, so we can encode single length runs // we start values where the last signs end. 
write_pos = sign_bitpos; // 0 means its a single length run encode_bits(bitstream, bitstream_len, write_pos, 0, 1); write_pos++; // bitlen = 1 encode_bits(bitstream, bitstream_len, write_pos, 1, 4); write_pos +=4; // each run has the signs at the end sign_bitpos = write_pos + 1 * 8; } int value = 0; if (grp & (1 << pos_in_grp)) { // just keep toggling signs, good enough pattern for this test (I hope) value = sign; sign = -sign; } encode_value(bitstream, bitstream_len, write_pos, sign_bitpos, value, 1); write_pos++; assert(write_pos <= sign_bitpos); } float output_floats[count + 2 + 8]; // entries + the first two + memset overwrite float ref_floats[count + 2 + 8]; BINKVARBITS vb; BinkVarBitsOpen(vb, bitstream); BinkVarBitsUse(vb, (U32)initial_bitpos); read_channel_data_2(output_floats, count + 2, num_bands, &vb, bands, bitstream + bitstream_len); read_channel_data_ref(ref_floats, bitstream, bitstream_len, count + 2, initial_bitpos); bool failed = false; for (int i=0; i < count + 2; i++) { if (output_floats[i] != ref_floats[i]) { printf("test_all_signs failed initial_pos: %d index %d\n", (int)initial_bitpos, i); failed = true; break; } } free(bitstream); return failed; } static bool test_all_codes(size_t initial_bitpos, int bitlen) { assert(bitlen >= 1 && bitlen <= 15); int numcodes = 1 << bitlen; if (numcodes < 8) numcodes = 8; int num_bands = 1; U32 bands[2] = { 0, (U32)numcodes + 3 }; // set up the bands so we always use the first threshold (1.0) size_t startup_bitcount = 7 + FXPBITS * 2; // 1 band and 2 initial samples size_t val_bitcount = numcodes * bitlen; // 1 bit per val for this test. 
size_t sign_bitcount = numcodes; // sign bits are max 1 per size_t rle_bitcount = (numcodes >> 3) * 5; size_t write_pos = initial_bitpos; size_t end_pos_conservative = initial_bitpos + startup_bitcount + val_bitcount + rle_bitcount + sign_bitcount + 128; // +128 is extra padding for our sloppy reads size_t bitstream_len = (end_pos_conservative + 7) >> 3; U8* bitstream = (U8*)malloc(bitstream_len); memset(bitstream, 0, bitstream_len); // encode the first two samples encode_bits(bitstream, bitstream_len, write_pos, 0, FXPBITS); write_pos += FXPBITS; encode_bits(bitstream, bitstream_len, write_pos, 0, FXPBITS); write_pos += FXPBITS; // Encode the threshold index encode_bits(bitstream, bitstream_len, write_pos, 0, 7); write_pos += 7; size_t sign_bitpos = write_pos; for (int i = 0; i < numcodes; ++i) { int pos_in_grp = i % 8; if (pos_in_grp == 0) { // 8 is the VQLENGTH, so we can encode single length runs // we start values where the last signs end. write_pos = sign_bitpos; // 0 means its a single length run encode_bits(bitstream, bitstream_len, write_pos, 0, 1); write_pos++; encode_bits(bitstream, bitstream_len, write_pos, bitlen, 4); write_pos += 4; // each run has the signs at the end sign_bitpos = write_pos + bitlen * 8; } int code = i & ((1 << bitlen) - 1); encode_value(bitstream, bitstream_len, write_pos, sign_bitpos, code, bitlen); write_pos += bitlen; assert(write_pos <= sign_bitpos); } float* output_floats = (float*)malloc(sizeof(float) * (numcodes + 2) + 32); // entries + the first two. + memset overwrite float* ref_floats = (float*)malloc(sizeof(float) * (numcodes + 2) + 32); // entries + the first two. 
U64 start_ticks = baue_ticks(); for (int i = 0; i < 200; i++) { BINKVARBITS vb; BinkVarBitsOpen(vb, bitstream); BinkVarBitsUse(vb, (U32)initial_bitpos); read_channel_data_2(output_floats, numcodes + 2, num_bands, &vb, bands, bitstream + bitstream_len); } U64 end_ticks = baue_ticks(); printf("run time x200 (%d / %d) : %llu\n", (int)initial_bitpos, bitlen, end_ticks - start_ticks); read_channel_data_ref(ref_floats, bitstream, bitstream_len, numcodes + 2, initial_bitpos); bool failed = false; for (int i = 0; i < numcodes + 2; i++) { if (output_floats[i] != ref_floats[i]) { printf("test_all_codes failed initial_pos: %d index %d\n", (int)initial_bitpos, i); failed = true; break; } } free(bitstream); free(output_floats); free(ref_floats); return failed; } // asserts on failure. #ifdef __RADINDLL__ RADEXPFUNC bool RADEXPLINK BinkAudioDecompressTestReadChanData() #else RADDEFFUNC bool RADLINK BinkAudioDecompressTestReadChanData() #endif { bool failed = false; CPU_check(0, 0); for (int bitpos = 0; bitpos < 8; ++bitpos) { failed |= test_all_signs(bitpos); } for (int bitlen = 1; bitlen <= 15; ++bitlen) { for (int bitpos = 0; bitpos < 8; ++bitpos) { failed |= test_all_codes(bitpos, bitlen); } } return failed; } #endif // TEST_READ_CHAN_DATA