Files
UnrealEngine/Engine/Source/Runtime/RadAudioCodec/SDK/Src/RadAudio/radaudio_decoder.c
2025-05-18 13:04:45 +08:00

2555 lines
90 KiB
C

// Copyright Epic Games Tools, LLC. All Rights Reserved.
////////////////////////////////////////////////////////////////////////////
//
// RADaudio is a new audio codec made by Epic Game Tools for use in games,
// optimized for fast SIMD decoding and decent quality (roughly similar to
// Vorbis).
//
// It is a classical MDCT-based codec with two block sizes, and it uses
// the Oodle Data huffman entropy coder to store data.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
// We want the external defines to be scoped to DECODER, but we don't want to rename everything in here.
#define RADAUDIO_AT_EOF RADAUDIO_DECODER_AT_EOF
#define RADAUDIO_INCOMPLETE_DATA RADAUDIO_DECODER_INCOMPLETE_DATA
#define RADAUDIO_INVALID_DATA RADAUDIO_DECODER_INVALID_DATA
#define RADAUDIO_START_OF_STREAM RADAUDIO_DECODER_START_OF_STREAM
#define RADAUDIO_INTERNAL_ERROR RADAUDIO_DECODER_INTERNAL_ERROR
#define HUFFMAN_DECODE // enable huffman decode tables
#include "radaudio_decoder.h"
#include "radaudio_decoder_sse2.h"
#include "radaudio_decoder_sse4.h"
#include "radaudio_decoder_avx2.h"
#include "radaudio_decoder_neon.h"
#include "radaudio_decoder_internal.h"
#include "radaudio_common.h"
#include "radaudio_mdct.h"
#include "rrCore.h"
#include "rrbits.h"
#include "radaudio_common.inl"
RR_COMPILER_ASSERT(COMMON_INVALID_DATA == RADAUDIO_INVALID_DATA);
RR_COMPILER_ASSERT(COMMON_INCOMPLETE_DATA == RADAUDIO_INCOMPLETE_DATA);
RR_COMPILER_ASSERT(RADAUDIO_STREAM_HEADER_SIZE == sizeof(radaudio_stream_header));
#ifdef RADAUDIO_DEVELOPMENT
#define PROFILE_ZONES() \
PROF(decoder_all) \
PROF(imdct) \
PROF(window) \
PROF(huffman) \
PROF(unquantize) \
PROF(distribute_rle) \
PROF(update_runlength) \
PROF(varbits) \
PROF(compute_mantissa_len) \
PROF(copy) \
PROF(compute_subbands) \
PROF(distribute_bitflag) \
PROF(unpack) \
PROF(randomize) \
PROF(compute_subband_energy)\
PROF(unbias) \
PROF(compute_band_energy) \
PROF(count_coefficients_huff) \
PROF(header) \
PROF(zero) \
PROF(flagbits) \
PROF(overhead) /* must always be last! */ \
PROF(total_count)
enum
{
#define PROF(x) PROF_##x,
PROFILE_ZONES()
#undef PROF
PROF__end
};
#define PROF_BEGIN(var) if (profile) { profile_times[PROF_##var] -= rrGetTicks(); profile_counts[PROF_##var] += 1; }
#define PROF_END(var) if (profile) profile_times[PROF_##var] += rrGetTicks()
static rrbool profile;
static U64 profile_times[PROF_total_count];
static S64 profile_counts[PROF_total_count];
#else
#define PROF_BEGIN(var)
#define PROF_END(var)
#define PROF_total_count 1
#endif
#define RANDVAL(r,i) (r)
// allow breakpointing on any error
static int e(int code)
{
return code;
}
typedef struct RadAudioDecoder
{
U32 version;
rrbool current_block_short;
rrbool last_block_short;
rrbool next_block_short; // we don't actually need this
int samprate_mode;
int num_channels;
int skip_bytes;
int sample_rate; // implied by samprate_mode
U32 block_number;
int fully_decoded; // sample offset in stream
U8 subband_predicted_sum[MAX_BANDS];
S8 mantissa_param[2][MAX_BANDS][2];
S8 subband_bias[MAX_BANDS];
rrbool at_eof;
rrbool post_seek;
rrbool bitstream_overshot;
radaudio_block_header_biases biases;
radaudio_cpu_features cpu;
radaudio_rate_info * info[2]; // pre-defined table, indexed by long vs. short block
radaudio_nonzero_blockmode_descriptor nz_desc[NUM_NZ_MODE];
U8 nz_correlated_huffman_selectors[NUM_NZ_SELECTOR][NUM_SELECTOR_MODES];
S16 * prev_block_right_samples[2];
F32 restore_scale[2]; // how to convert S16s back to floats
} radaudio_decoder_state;
//////////////////////////////////////////////////////////////////////////////
//
// ENTROPY DECODER
//
typedef struct
{
U8 *bitstream;
U8 *end;
U32 read_pos_in_bits;
U32 fast_num_bits; // if initial read_pos_in_bits < this, can take fast path (needs to be < not <= so 0 disables fast path)
U32 total_num_bits;
} rada_bit_decoder;
typedef struct
{
rada_bit_decoder stream[3];
} huff3_decoder;
static void decode_vbstream_init(rada_bit_decoder *d, U8 *bitstream, U8 *end, int *error)
{
if (bitstream > end) {
// this is a backwards stream (for huffman). not allowed to read through
// decode_vbstream_bits, so make sure we can't by setting total bits count to 0.
d->bitstream = bitstream;
d->end = end;
d->read_pos_in_bits = 0;
d->fast_num_bits = 0;
d->total_num_bits = 0;
return;
}
d->bitstream = bitstream;
d->end = end;
d->read_pos_in_bits = 0;
size_t num_bytes = end - bitstream;
if (num_bytes > MAX_ENCODED_BLOCK_BYTES) {
// not allowed! set num_bytes to 0 and initialize stream as empty
num_bytes = 0;
*error = 1;
}
d->total_num_bits = (U32) (num_bytes * 8); // can't overflow: num_bytes checked above
if (d->total_num_bits >= 32) {
d->fast_num_bits = d->total_num_bits - 32;
} else {
d->fast_num_bits = 0;
}
}
// bit reading cold path, reads one byte at a time to avoid over-reading
static RADNOINLINE U32 decode_vbstream_bits_cold(rada_bit_decoder *d, int bitlength, int *error)
{
// check whether actual data required goes off the end
if (d->read_pos_in_bits + bitlength > d->total_num_bits) {
*error = 1;
return 0;
}
// can read 0 bits exactly at the end
if (bitlength == 0)
return 0;
// if not, read as many valid bits as exist, then mask
size_t first_byte = (d->read_pos_in_bits >> 3);
U32 bits = d->bitstream[first_byte++];
U32 shift = 8;
while (d->bitstream+first_byte < d->end) {
bits = bits + (d->bitstream[first_byte++] << shift);
shift += 8;
}
bits >>= (d->read_pos_in_bits & 7);
bits &= (1 << bitlength)-1;
d->read_pos_in_bits += bitlength;
return bits;
}
static RADFORCEINLINE U32 decode_vbstream_bits(rada_bit_decoder *d, int bitlength, int *error)
{
// check for reading off the end...
if (d->read_pos_in_bits < d->fast_num_bits) {
// simple path
size_t first_byte = (d->read_pos_in_bits >> 3);
U32 bits = RR_GET32_LE(d->bitstream + first_byte);
bits >>= (d->read_pos_in_bits & 7); // discard bits we're pointing past
bits &= (1 << bitlength)-1;
d->read_pos_in_bits += bitlength;
return bits;
} else {
return decode_vbstream_bits_cold(d, bitlength, error);
}
}
typedef struct
{
U8 *decodeptr; // Current write cursor for the two stream triples
U8 *decodeend; // End of decoded bytes buffer for the two stream triples
const U8 *bitp[3]; // Next byte to be read for the streams
U32 bits[3]; // Current contents of bit buffer
U32 bitc[3]; // Current number of valid bits in bit buffer
} rada_internal_huff_state;
#define NEWLZ_HUFF_CODELEN_LIMIT 11
#define NEWLZ_HUFF_DECODE_TABLE_MASK 2047u
// 32-bit ARM implicitly masks 32-bit shift amounts by 255 (low 8 bits).
// All other current targets implicitly mask by 31 (low 5 bits). Either
// works for us, but we'd prefer not to get an extra AND, so use whatever
// the implicit mask is and rely on the compiler to clean it up.
#if defined(__RADARM__) && !defined(__RAD64__)
#define HUFF32LENMASK 255
#else
#define HUFF32LENMASK 31
#endif
static rrbool huff_decode_precise_finish(rada_internal_huff_state * s, radaudio_huffman *huff)
{
const U8 * in0 = s->bitp[0];
const U8 * in1 = s->bitp[1];
const U8 * in2 = s->bitp[2];
U32 bits0 = s->bits[0], bitc0 = s->bitc[0];
U32 bits1 = s->bits[1], bitc1 = s->bitc[1];
U32 bits2 = s->bits[2], bitc2 = s->bitc[2];
if (in0 > in2)
return false;
U8 *decodeptr = s->decodeptr;
U8 *decodeend = s->decodeend;
#define DECONE(strm) \
peek = bits##strm & NEWLZ_HUFF_DECODE_TABLE_MASK; \
cl = huff->decode[peek].length; \
sym = huff->decode[peek].symbol; \
bits##strm >>= cl & HUFF32LENMASK; bitc##strm -= cl; \
*decodeptr++ = (U8) sym
#define DECTHREE() \
DECONE(0); \
DECONE(1); \
DECONE(2)
RR_COMPILER_ASSERT( NEWLZ_HUFF_CODELEN_LIMIT <= 12 );
#define N_DECS_PER_REFILL 2
#define TRIPLE_DECS_PER_REFILL (3*N_DECS_PER_REFILL)
// bulk loop to get within 4B of end
if (in1 - in2 >= 4 && decodeend - decodeptr >= TRIPLE_DECS_PER_REFILL)
{
in1 -= 4;
decodeend -= TRIPLE_DECS_PER_REFILL-1;
while (decodeptr < decodeend)
{
// non-crossing invariant: in0 <= in2 && in2 <= in1
if (in0 > in2 || in2 > in1)
break;
// non-crossing and 4B access size guarantee that the
// following reads are safe; the decodeend decrement before the
// loop guarantees that we don't write out of bounds.
// refill :
bits0 |= RR_GET32_LE(in0) << bitc0;
in0 += (31 - bitc0)>>3; // bytes_consumed
bitc0 |= 24; // same as += bytes_consumed<<3 here!
bits1 |= RR_GET32_BE(in1) << bitc1;
in1 -= (31 - bitc1)>>3; // bytes_consumed
bitc1 |= 24; // same as += bytes_consumed<<3 here!
bits2 |= RR_GET32_LE(in2) << bitc2;
in2 += (31 - bitc2)>>3; // bytes_consumed
bitc2 |= 24; // same as += bytes_consumed<<3 here!
U32 peek; int cl; int sym;
RR_COMPILER_ASSERT( N_DECS_PER_REFILL == 2 );
DECTHREE();
DECTHREE();
}
decodeend += TRIPLE_DECS_PER_REFILL-1;
in1 += 4;
// transition to final loop
in0 -= (bitc0 >> 3); bitc0 &= 7;
in1 += (bitc1 >> 3); bitc1 &= 7;
in2 -= (bitc2 >> 3); bitc2 &= 7;
}
// Final loop. This is really careful about the bytes it accesses.
while (decodeptr < decodeend)
{
U32 peek, cl, sym;
// refill to >=16b in bit0 buf
if (in2 - in0 > 1)
bits0 |= RR_GET16_LE(in0) << bitc0;
else if (in2 - in0 == 1)
bits0 |= in0[0] << bitc0;
DECONE(0);
in0 += (7 - bitc0) >> 3;
bitc0 &= 7;
if (decodeptr >= decodeend)
break;
// refill to >=16b left in bit1, bit2 bufs
if (in1 - in2 > 1)
{
bits1 |= RR_GET16_BE(in1 - 2) << bitc1;
bits2 |= RR_GET16_LE(in2) << bitc2;
}
else if (in1 - in2 == 1)
{
// accessing the same byte!
bits1 |= in2[0] << bitc1;
bits2 |= in2[0] << bitc2;
}
DECONE(1);
in1 -= (7 - bitc1) >> 3;
bitc1 &= 7;
if (decodeptr >= decodeend)
break;
DECONE(2);
in2 += (7 - bitc2) >> 3;
bitc2 &= 7;
if (in0 > in2 || in2 > in1) // corruption check
return false;
}
if (decodeptr != decodeend)
return false;
#undef DECONE
#undef DECTHREE
#undef N_DECS_PER_REFILL
#undef TRIPLE_DECS_PER_REFILL
s->bitp[0] = in0; s->bits[0] = bits0; s->bitc[0] = bitc0;
s->bitp[1] = in1; s->bits[1] = bits1; s->bitc[1] = bitc1;
s->bitp[2] = in2; s->bits[2] = bits2; s->bitc[2] = bitc2;
return true;
}
#if defined(__RAD64REGS__)
static rrbool huff_decode_inner64(rada_internal_huff_state * s, radaudio_huffman *huff)
{
// Layout: strm0-> | strm2-> | <-strm1
const U8 * in0 = s->bitp[0];
const U8 * in1 = s->bitp[1];
const U8 * in2 = s->bitp[2];
U8 * decodeptr = s->decodeptr;
U8 * decodeend = s->decodeend;
// NEWLZ_HUFF_CODELEN_LIMIT == 11 , could actually do 5 per refill = 10 per loop
#if (56/NEWLZ_HUFF_CODELEN_LIMIT) >= 5
#define N_DECS_PER_REFILL 5
#elif (56/NEWLZ_HUFF_CODELEN_LIMIT) >= 4
#define N_DECS_PER_REFILL 4
#else
#define N_DECS_PER_REFILL 3
#endif
#define TRIPLE_DECS_PER_REFILL (3*N_DECS_PER_REFILL)
// bulk loop
if (decodeend - decodeptr > TRIPLE_DECS_PER_REFILL-1 && in1 - in2 > 8) // @TODO: maybe test for going outside the buffer instead of this, since this might be true too often
{
// offset the end marker so we only run with full groups left
decodeend -= TRIPLE_DECS_PER_REFILL-1;
in1 -= 8;
U64 bits0=s->bits[0], bitcount0 = s->bitc[0];
U64 bits1=s->bits[1], bitcount1 = s->bitc[1];
U64 bits2=s->bits[2], bitcount2 = s->bitc[2];
const U8 *hufftab_base = &huff->decode[0].length;
#define DECONE(strm) \
/* NOTE(fg): This address calc is a single UBFIZ */ \
tabv = (bits##strm & NEWLZ_HUFF_DECODE_TABLE_MASK) * sizeof(radaudio_huff_symbol); \
tabv = RR_GET16_LE((const U16 *) (hufftab_base + tabv)); \
bits##strm >>= tabv & 63; bitcount##strm -= tabv; \
*decodeptr++ = (U8) (tabv >> 8)
#define DECTHREE() \
DECONE(0); \
DECONE(1); \
DECONE(2)
while (decodeptr < decodeend)
{
// non-crossing invariant: in0 <= in2 && in2 <= in1
if (in0 > in2) // if_unlikely
break;
if (in2 > in1) // if_unlikely
break;
// refill :
U64 next0 = RR_GET64_LE(in0);
bits0 |= next0 << bitcount0;
in0 += (63 - bitcount0)>>3; // bytes_consumed
bitcount0 |= 56; // same as += bytes_consumed<<3 here!
U64 next1 = RR_GET64_BE(in1);
bits1 |= next1 << bitcount1;
in1 -= (63 - bitcount1)>>3; // bytes_consumed
bitcount1 |= 56; // same as += bytes_consumed<<3 here!
U64 next2 = RR_GET64_LE(in2);
bits2 |= next2 << bitcount2;
in2 += (63 - bitcount2)>>3; // bytes_consumed
bitcount2 |= 56; // same as += bytes_consumed<<3 here!
U32 tabv;
RR_COMPILER_ASSERT( N_DECS_PER_REFILL >= 3 && N_DECS_PER_REFILL <= 5 );
DECTHREE();
DECTHREE();
DECTHREE();
#if N_DECS_PER_REFILL > 3
DECTHREE();
#endif
#if N_DECS_PER_REFILL > 4
DECTHREE();
#endif
// our decode process puts some crap in the top bits; clear them
bitcount0 &= 63;
bitcount1 &= 63;
bitcount2 &= 63;
}
#undef DECONE
#undef DECTHREE
in1 += 8;
// transition to careful loop
s->decodeptr = decodeptr;
s->bitp[0] = in0 - (bitcount0 >> 3); s->bits[0] = (U32) (bits0 & 0xff); s->bitc[0] = bitcount0 & 7;
s->bitp[1] = in1 + (bitcount1 >> 3); s->bits[1] = (U32) (bits1 & 0xff); s->bitc[1] = bitcount1 & 7;
s->bitp[2] = in2 - (bitcount2 >> 3); s->bits[2] = (U32) (bits2 & 0xff); s->bitc[2] = bitcount2 & 7;
}
#undef N_DECS_PER_REFILL
#undef TRIPLE_DECS_PER_REFILL
return huff_decode_precise_finish(s, huff);
}
#endif
static void decode_huff_array(huff3_decoder *ds, radaudio_huffman *huff, U8 *array, int length, int *error)
{
rada_internal_huff_state s;
s.decodeptr = array;
s.decodeend = array+length;
// generate Huff3 decoder state from our naive state
for (int i=0; i <= 2; i += 2) {
s.bitp[i] = &ds->stream[i].bitstream[ds->stream[i].read_pos_in_bits>>3];
s.bitc[i] = (0-ds->stream[i].read_pos_in_bits) & 7; // read pos of 2 => 6 bits left
if (s.bitc[i] == 0)
s.bits[i] = 0;
else {
s.bits[i] = *(s.bitp[i]) >> (8-s.bitc[i]);
++s.bitp[i];
}
}
s.bitp[1] = &ds->stream[1].bitstream[-(int)(ds->stream[1].read_pos_in_bits>>3)];
s.bitc[1] = (0-ds->stream[1].read_pos_in_bits) & 7; // read pos of 2 => 6 bits left
if (s.bitc[1] == 0)
s.bits[1] = 0;
else {
s.bits[1] = *(s.bitp[1]-1) >> (8-s.bitc[1]);
--s.bitp[1];
}
#ifdef __RAD64REGS__
if (!huff_decode_inner64(&s, huff))
*error = 1;
#else
if (!huff_decode_precise_finish(&s, huff))
*error = 1;
#endif
ds->stream[0].read_pos_in_bits = (int) (8*(s.bitp[0] - ds->stream[0].bitstream) - s.bitc[0]);
ds->stream[2].read_pos_in_bits = (int) (8*(s.bitp[2] - ds->stream[2].bitstream) - s.bitc[2]);
ds->stream[1].read_pos_in_bits = (int) (8*(ds->stream[1].bitstream - s.bitp[1]) - s.bitc[1]);
}
////////////////////////////////////////////////////////////////////////////////
static void compute_windowed_sum_multiple64(radaudio_decoder_state *ds, float *output, int n,
const float *fwd_data, S16 *rev_data, int revlen, int revoff, float rev_scale,
const float *window, int block_number, int channel, int stream_offset)
{
rrAssert(n % 64 == 0);
// Starting point:
// output[0:n] = fwd_c[0:n] .* window[0:n] + rev_scale * rev_c[revoff:revoff+n] .* reverse(window[0:n])
//
// let n2 = n/2, then the IMDCT symmetries mean that (when both blocks have same length, I'll account for revoff later)
//
// fwd_c[0:n2] = -reverse(fwd_c[n2:n])
// rev_c[n2:n] = reverse(rev_c[0:n2])
//
// and therefore we can work with just the middle samples (i.e. the back half of fwd_c and the front
// half of rev_c). To exploit this systematically, split the loop into two halves at n2:
//
// output[0:n2] = fwd_c[0:n2] .* window[0:n2] + rev_scale * rev_c[revoff+0:revoff+n2] .* reverse(window[n2:n])
// output[n2:n] = fwd_c[n2:n] .* window[n2:n] + rev_scale * rev_c[revoff+n2:revoff+n] .* reverse(window[0:n2])
//
// note rev_c is symmetric about revoff+n2, so rev_c[revoff+n2:revoff_n] = reverse(rev_c[revoff+0:revoff+n2]).
// (This is the second symmetry, accounting for potential differences in MDCT size.)
//
// Define:
// fwd[0:n2] = fwd_c[n2:n]
// rev[0:n2] = rev_scale * rev_c[revoff:revoff+n2]
//
// and then use the symmetries and algebra to get
//
// output[0:n2] = -reverse(fwd) .* window[0:n2] + rev .* reverse(window[n2:n])
// = rev .* reverse(window[n2:n]) - reverse(fwd) .* window[0:n2]
//
// output[n2:n] = fwd .* window[n2:n] + reverse(rev) .* reverse(window[0:n2])
// = reverse(rev .* window[0:n2]) + fwd .* window[n2:n]
const float *fwd = fwd_data; // NOTE: second half of the forward data, first half is implied by odd symmetry
S16 *rev = rev_data + revoff;
#if defined(DO_BUILD_SSE4) || defined(DO_BUILD_AVX2)
if (ds->cpu.has_sse2) {
#ifdef DO_BUILD_AVX2
if (ds->cpu.has_avx2)
radaudio_avx2_compute_windowed_sum_multiple16(output, n, fwd, rev, rev_scale, window);
else
#endif
radaudio_sse2_compute_windowed_sum_multiple8(output, n, fwd, rev, rev_scale, window);
return;
}
#endif
#if defined(DO_BUILD_NEON)
radaudio_neon_compute_windowed_sum_multiple8(output, n, fwd, rev, rev_scale, window);
#else
SINTa N2 = n >> 1;
for (SINTa j = 0; j < N2; ++j) {
output[j] = rev_scale * rev[j] * window[n-1-j] - fwd[N2-1-j] * window[j];
}
for (SINTa j = 0; j < N2; ++j) {
output[j+N2] = rev_scale * rev[N2-1-j] * window[N2-1-j] + fwd[j] * window[N2+j];
}
#endif
}
static void copy_samples_multiple16(float *output, int n, const float *input)
{
rrAssert(n % 16 == 0);
// potentially rely on it being aligned
memcpy(output, input, 4*n);
}
static void copy_samples_multiple16_scaled(float *output, int n, const S16 *input, float rescale)
{
rrAssert(n % 16 == 0);
for (int i=0; i < n; i += 8) {
output[i+0] = input[i+0] * rescale;
output[i+1] = input[i+1] * rescale;
output[i+2] = input[i+2] * rescale;
output[i+3] = input[i+3] * rescale;
output[i+4] = input[i+4] * rescale;
output[i+5] = input[i+5] * rescale;
output[i+6] = input[i+6] * rescale;
output[i+7] = input[i+7] * rescale;
}
}
static void build_rand_state(U32 *rand_state, U32 randval)
{
U32 r2 = (U32) (((randval + 5000) * (U64) 0xc4ceb9fe1a85ec53ULL) >> 33);
rand_state[0] = randval;
rand_state[1] = r2;
rand_state[2] = randval ^ 0x55555555;
rand_state[3] = r2 ^ 0x55555555;
}
static void randomize_long_block_8x8_Nx16(radaudio_decoder_state *ds, S8 *quantized_coeff, U32 randval, int num_subbands, int *num_coeffs_for_band)
{
RAD_ALIGN(U32, rand_state[4], 16);
build_rand_state(rand_state, randval);
static S8 random_table[16] = { -1,1, -2,2, -3,3, -4,4, -5,5, -6,6, -7,7, -8,8 };
// SIMD: compute 4 independent randvals in parallel... the encoder doesn't care what the random
// values are, so they should be stable, but don't have to be the same as the current code
#ifdef DO_BUILD_SSE4
if (ds->cpu.has_sse4_1) {
radaudio_sse4_randomize_long_block_8x8_Nx16(quantized_coeff, rand_state, num_subbands);
return;
}
#endif
int j;
int cb = 0;
U32 randval0 = rand_state[0];
U32 randval1 = rand_state[1];
for (j=0; num_coeffs_for_band[j] == 4; ++j) {
if (RR_GET64_NATIVE(&quantized_coeff[cb]) == 0) {
U32 rbits = randval0 >> 4;
randval0 = lcg(randval0);
for (int i=0; i < 4; ++i) {
quantized_coeff[cb+i] = random_table[rbits & 15];
rbits >>= 8;
}
}
cb += 4;
}
for (; num_coeffs_for_band[j] == 8; ++j) {
if (RR_GET64_NATIVE(&quantized_coeff[cb]) == 0) {
U32 rbits = randval0 >> 4;
randval0 = lcg(randval0);
for (int i=0; i < 4; ++i) {
quantized_coeff[cb+i] = random_table[rbits & 15];
rbits >>= 8;
}
rbits = randval1 >> 4;
randval1 = lcg(randval1);
for (int i=4; i < 8; ++i) {
quantized_coeff[cb+i] = random_table[rbits & 15];
rbits >>= 8;
}
}
cb += 8;
}
rand_state[0] = randval0;
rand_state[1] = randval1;
for (; j < num_subbands; ++j) {
if ((RR_GET64_NATIVE(&quantized_coeff[cb+0]) | RR_GET64_NATIVE(&quantized_coeff[cb+8])) == 0) {
U32 rbits = rand_state[0] >> 4;
rand_state[0] = lcg(rand_state[0]);
for (int i=0; i < 4; ++i) {
quantized_coeff[cb+i] = random_table[rbits & 15];
rbits >>= 8;
}
rbits = rand_state[1] >> 4;
rand_state[1] = lcg(rand_state[1]);
for (int i=4; i < 8; ++i) {
quantized_coeff[cb+i] = random_table[rbits & 15];
rbits >>= 8;
}
rbits = rand_state[2] >> 4;
rand_state[2] = lcg(rand_state[2]);
for (int i=8; i < 12; ++i) {
quantized_coeff[cb+i] = random_table[rbits & 15];
rbits >>= 8;
}
rbits = rand_state[3] >> 4;
rand_state[3] = lcg(rand_state[3]);
for (int i=12; i < 16; ++i) {
quantized_coeff[cb+i] = random_table[rbits & 15];
rbits >>= 8;
}
}
cb += 16;
}
}
static void randomize_short_block(S8 quantized_coeff[], U32 randval, int num_bands, int *num_coeffs_for_band)
{
static S8 random_table[16] = { -1,1, -2,2, -3,3, -4,4, -5,5, -6,6, -7,7, -8,8 };
int cb = 0;
U32 rbits = randval;
randval = lcg(randval);
rbits >>= 10;
// Bands 0..3 are 1 coefficient
for (int j=0; j < 4; ++j) {
if (quantized_coeff[j] == 0) {
quantized_coeff[j] = random_table[rbits & 1];
}
rbits >>= 4;
}
// Bands 4..7 are 2 coefficients each
for (int j=4; j < 8; ++j) {
if (RR_GET16_LE_UNALIGNED(&quantized_coeff[j*2-4]) == 0) {
rbits = randval, randval = lcg(randval);
rbits >>= 20;
quantized_coeff[j*2-4] = random_table[rbits & 15];
quantized_coeff[j*2-3] = random_table[(rbits >> 4) & 15];
}
}
// Bands 8..13 are 4 coefficients each
for (int j=8; j < 13; ++j) {
if (RR_GET32_LE_UNALIGNED(&quantized_coeff[j*4-20]) == 0) {
rbits = randval, randval = lcg(randval);
quantized_coeff[j*4-20] = random_table[(rbits >> 12) & 15];
quantized_coeff[j*4-19] = random_table[(rbits >> 16) & 15];
quantized_coeff[j*4-18] = random_table[(rbits >> 20) & 15];
quantized_coeff[j*4-17] = random_table[(rbits >> 24) & 15];
}
}
// Remaining bands have 16 or 32 coeffs
cb = 4*1 + 4*2 + 5*4;
for (int j=13; j < num_bands; ++j) {
int i;
U32 sum1=0, sum2=0;
int num = num_coeffs_for_band[j];
for (i=0; i < num; i += 8) { // should be 16 or 32
sum1 |= RR_GET32_LE_UNALIGNED(&quantized_coeff[cb+i+0]);
sum2 |= RR_GET32_LE_UNALIGNED(&quantized_coeff[cb+i+4]);
}
if ((sum1|sum2) == 0) {
for (i=0; i+7 < num; i += 8) {
rbits = randval, randval = lcg(randval);
for (int k=0; k < 8; ++k) {
quantized_coeff[cb+i+k] = random_table[rbits & 15];
rbits >>= 4;
}
}
rbits = randval, randval = lcg(randval);
for (; i < num; ++i) {
quantized_coeff[cb+i] = random_table[rbits & 15];
rbits >>= 4;
}
}
cb += num_coeffs_for_band[j];
}
}
static int count_bytes_below_value_sentinel16(radaudio_decoder_state *ds, U8 *data, int num_bytes, U8 threshold)
{
#ifdef DO_BUILD_SSE4
if (ds->cpu.has_sse2)
return radaudio_sse2_count_bytes_below_value_sentinel16(data, num_bytes, threshold);
#endif
#if defined(DO_BUILD_NEON)
return radaudio_neon_count_bytes_below_value_sentinel16(data, num_bytes, threshold);
#else
int num=0;
for (int i=0; i < num_bytes; ++i) {
num += (data[i] < threshold);
}
return num;
#endif
}
// overwrites up to 7 bytes of space at end of array if not a multiple of 8
static int count_set_bits_multiple8_sentinel8(radaudio_decoder_state *ds, U8 *data, int num_bytes)
{
#ifdef DO_BUILD_SSE4
if (ds->cpu.has_sse4_1) // @TODO: ds->cpu.has_popcnt
return radaudio_intel_popcnt_count_set_bits_read_multiple8_sentinel8(data, num_bytes);
#endif
#ifdef DO_BUILD_NEON
return radaudio_neon_count_set_bits_read_multiple8_sentinel8(data, num_bytes);
#endif
#ifndef DO_BUILD_NEON // for unreachable code warnings
#ifdef __RAD64REGS__
// 64-bit scalar code
int num=0;
int padded_size = (num_bytes + 7) & ~7;
if (num_bytes != padded_size)
RR_PUT64_NATIVE(&data[num_bytes], 0);
for (int i=0; i < padded_size; i += 8) {
U64 value = RR_GET64_NATIVE(&data[i]);
value = value - ((value >> 1) & 0x5555555555555555ull); // for pairs of bits: 00->00, 01->01, 10->01, 11->10
// sums across groups of 2 bits -> sums across groups of 8 bits
// skipping the groups-of-4 stage to get a wider reduction tree with fewer constants
U64 threes = 0x0303030303030303ull;
value = (value & threes) + ((value >> 2) & threes) + ((value >> 4) & threes) + ((value >> 6) & threes);
// sum the bytes (can't overflow)
value = (value * 0x0101010101010101ull) >> 56;
num += (int)value;
}
#else
// 32-bit scalar code
int num=0;
int padded_size = (num_bytes + 3) & ~3;
if (num_bytes != padded_size)
RR_PUT32_NATIVE(&data[num_bytes], 0);
for (int i=0; i < padded_size; i += 4) {
U32 value = RR_GET32_NATIVE(&data[i]);
value = value - ((value >> 1) & 0x55555555); // for pairs of bits: 00->00, 01->01, 10->01, 11->10
value = (value & 0x33333333) + ((value>> 2) & 0x33333333);
value = (value & 0x0f0f0f0f) + ((value>> 4) & 0x0f0f0f0f);
value = (value * 0x01010101) >> 24;
num += (int)value;
}
#endif
return num;
#endif // !DO_BUILD_NEON
}
// guarantees a multiple of 16 bytes is written, with the extra bytes having the value of 1:
// scalar: reads exact bytes specified, writes an extra 16 bytes of "1"
// SSE : writes multiple of 32 bytes with extras equal to "1", also writes 16 bytes starting at &packed[num_packed]
static void unpack_nibbles_input_excess16_output_excess16_multiple32_default1(radaudio_decoder_state *ds, S8 *unpacked, U8 *packed, int num_packed)
{
#ifdef DO_BUILD_SSE4
if (ds->cpu.has_sse2) {
radaudio_sse2_unpack_nibbles_read_sentinel16_write_multiple32(unpacked, packed, num_packed, 0x1111111111111111ull);
return;
}
#endif
#ifdef DO_BUILD_NEON
{
radaudio_neon_unpack_nibbles_read_sentinel16_write_multiple32(unpacked, packed, num_packed, 0x1111111111111111ull);
return;
}
#endif
#ifndef DO_BUILD_NEON // for unreachable code warnings
for (int i=0; i < num_packed; ++i) {
unpacked[i*2+0] = (S8) (packed[i] & 15);
unpacked[i*2+1] = (S8) (packed[i] >> 4);
}
RR_PUT64_NATIVE(&unpacked[num_packed*2+0], 0x0101010101010101ull);
RR_PUT64_NATIVE(&unpacked[num_packed*2+8], 0x0101010101010101ull);
#endif // !DO_BUILD_NEON
}
// if coefficient is 0, then read it from the big coefficient array
// otherwise, remove the +8 bias by subtracting 8
static rrbool expand_nonzero_coefficients(radaudio_decoder_state *ds, S8 *nonzero_coefficients, int num_nonzero, S8 *big_coeff, S8 *big_limit, S8 *safe_read)
{
if (safe_read - big_limit > 15) {
#ifdef DO_BUILD_SSE4
if (ds->cpu.has_sse2) {
return radaudio_sse2_expand_coefficients_excess_read15(nonzero_coefficients, num_nonzero, big_coeff, big_limit);
}
#endif
#ifdef DO_BUILD_NEON
return radaudio_neon_expand_coefficients_excess_read15(nonzero_coefficients, num_nonzero, big_coeff, big_limit);
#endif
}
// else fall through to scalar
for (int i = 0; i < num_nonzero; ++i) {
if (nonzero_coefficients[i] == 0) {
if (big_coeff == big_limit)
return false; // overread error
nonzero_coefficients[i] = *big_coeff++;
} else
nonzero_coefficients[i] -= 8;
}
return true;
}
static void compute_band_energy_multiple4(radaudio_decoder_state *ds, F32 *band_energy, int num_bands, int band_exponent[], U16 fine_energy[], F32 band_scale_decode[])
{
#ifdef DO_BUILD_SSE4
if (ds->cpu.has_sse2) {
radaudio_sse2_compute_band_energy_multiple4(band_energy, num_bands, band_exponent, fine_energy, band_scale_decode);
return;
}
#endif
#ifdef DO_BUILD_NEON
radaudio_neon_compute_band_energy_multiple4(band_energy, num_bands, band_exponent, fine_energy, band_scale_decode);
#else
for (int j=0; j < num_bands; ++j) { // safe to run 24 times for SIMD
int qe = fine_energy[j]; // quantized energy, in [0, 1<<MAX_FINE_ENERGY_BITS)
F32 fe, ce, pe; // fine energy, coarse energy, packed energy
pe = qe / (float) (1 << MAX_FINE_ENERGY_BITS); // pe is 0..1
fe = (0.34375f*pe + 0.65625f)*pe + 1.0f;
if (band_exponent[j] == BAND_EXPONENT_NONE)
ce = 0;
else
ce = (float) (1 << (band_exponent[j] + 16)); // integer_exponent 0 => (1<<30>>14) => 1<<16
band_energy[j] = (fe * ce) * band_scale_decode[j];
}
#endif
}
static void compute_subband_energy_skip12_excess_read7(radaudio_decoder_state *ds, F32 *subband_energy, const F32 *band_energy, int num_bands, int num_subbands, int *num_subbands_for_band, U16 *quantized_subbands)
{
#ifdef DO_BUILD_SSE4
if (ds->cpu.has_sse4_1) {
radaudio_sse4_compute_subband_energy_skip12_excess_read7(subband_energy, band_energy, num_bands, num_subbands_for_band, quantized_subbands);
return;
}
#endif
#ifdef DO_BUILD_NEON
radaudio_neon_compute_subband_energy_skip12_excess_read7(subband_energy, band_energy, num_bands, num_subbands_for_band, quantized_subbands);
#else
int start, j;
for (j=0; num_subbands_for_band[j] == 1; ++j)
;
start = j;
for (; j < num_bands; ++j) {
int sum=0;
int num = num_subbands_for_band[j];
// these loops are pretty random lengths, for example, at 44.1Khz, they're: 2,2,2,2,3,4,9,10,12 iterations
for (int i=0; i < num; ++i) {
sum += (quantized_subbands[start+i] * quantized_subbands[start+i]);
}
F32 scale = band_energy[j] / sqrtf((F32) sum);
rrAssert(!isnan(band_energy[j]));
rrAssert(sum != 0);
for (int i=0; i < num; ++i) {
subband_energy[start+i] = scale * quantized_subbands[start+i];
}
start += num;
}
rrAssert(start == num_subbands);
#endif
}
static void distribute_bitflag_coefficients_multiple64(radaudio_decoder_state *ds,
S8 *quantized_coeff, int num_coeff,
U8 *nonzero_flagbits,
S8 *nonzero_coeffs, int *pcur_nonzero_coeffs)
{
#ifdef DO_BUILD_SSE4
if (ds->cpu.has_ssse3) {
radaudio_ssse3_distribute_bitflag_coefficients_multiple16(
ds->cpu,
quantized_coeff, num_coeff,
nonzero_flagbits,
nonzero_coeffs, pcur_nonzero_coeffs);
return;
}
#endif
#ifdef DO_BUILD_NEON
{
radaudio_neon_distribute_bitflag_coefficients_multiple16(
quantized_coeff, num_coeff,
nonzero_flagbits,
nonzero_coeffs, pcur_nonzero_coeffs);
return;
}
#endif
#ifndef DO_BUILD_NEON // for unreachable code warnings
int cur_nonzero_coeffs = *pcur_nonzero_coeffs;
memset(quantized_coeff, 0, num_coeff);
// use a run-length style scheme using bit scans to reduce branch mispredictions
int pos=0;
for (int i=0; i < num_coeff; i += 64) {
U64 flags = RR_GET64_LE(nonzero_flagbits + pos);
pos += 8;
// even though the run is never long--we could just use a small lookup table--let's do it right
int offset = i;
while (flags) {
SINTa dist = rrCtz64(flags);
quantized_coeff[offset+dist] = nonzero_coeffs[cur_nonzero_coeffs++];
flags = rrClearLowestSetBit64(flags);
}
}
*pcur_nonzero_coeffs = cur_nonzero_coeffs;
#endif // !DO_BUILD_NEON
}
static rrbool distribute_nonzero_coefficients(radaudio_decoder_state *ds,
S8 *quantized_coeff, int num_coeff32,
U8 *runlength_data, int *pcur_runlength_data, // there's guaranteed sentinels, so don't need length
S8 *nonzero_coeffs, int *pcur_nonzero_coeffs,
U8 *nonzero_flagbits, int num_nonzero_flagbits, int channel)
{
RR_UNUSED_VARIABLE(channel);
SINTa num_coeff = num_coeff32;
SINTa k=0;
if (num_nonzero_flagbits) {
PROF_BEGIN(distribute_bitflag);
distribute_bitflag_coefficients_multiple64(ds, quantized_coeff, num_nonzero_flagbits, nonzero_flagbits, nonzero_coeffs, pcur_nonzero_coeffs);
PROF_END(distribute_bitflag);
k = num_nonzero_flagbits;
}
const U8 *runlens = runlength_data + *pcur_runlength_data;
const S8 *nzcoeffs = nonzero_coeffs + *pcur_nonzero_coeffs;
PROF_BEGIN(distribute_rle);
memset(quantized_coeff+k, 0, num_coeff-k);
// tried a branchless version of this using the slot[] logic from above, but saw no gain
// we put in sentinels that guarantee this loop will see a END_OF_ZERORUN
for(;;) {
U8 rl = *runlens++;
if (rl == END_OF_ZERORUN)
break;
k += rl;
if (rl < MAX_RUNLEN) {
if (k >= num_coeff)
return false;
quantized_coeff[k] = *nzcoeffs++;
++k;
}
}
*pcur_runlength_data = (int)(SINTa)(runlens - runlength_data);
*pcur_nonzero_coeffs = (int)(SINTa)(nzcoeffs - nonzero_coeffs);
PROF_END(distribute_rle);
return true;
}
static void dequantize_long_block_8x8_Nx16(radaudio_decoder_state *ds, float *coeffs, S8 *quantized_coeff, float *subband_energy, int num_subbands, int *num_coeffs_for_band)
{
#ifdef DO_BUILD_SSE4
if (ds->cpu.has_sse4_1) {
radaudio_sse4_dequantize_long_block_8x8_Nx16(coeffs, quantized_coeff, subband_energy, num_subbands);
return;
}
#endif
int cb=0;
int j=0;
// first 8 subbands should be 8 coefficients long
while (num_coeffs_for_band[j] < 16) {
F32 sum=1.e-20f,scale;
for (int i=0; i < num_coeffs_for_band[j]; ++i) {
F32 n = (F32) quantized_coeff[cb+i];
sum += n*n;
}
scale = subband_energy[j] / sqrtf(sum);
for (int i=0; i < 8; ++i)
coeffs[cb+i] = quantized_coeff[cb+i] * scale;
cb += num_coeffs_for_band[j];
++j;
}
// all remaining subbands are 16 coefficients long, so we don't have to check bands
for (; j < num_subbands; ++j) {
F32 sum=1.e-20f,scale;
for (int i=0; i < 16; ++i) {
F32 n = (F32) quantized_coeff[cb+i];
sum += n*n;
}
scale = subband_energy[j] / sqrtf(sum);
for (int i=0; i < 16; ++i)
coeffs[cb+i] = quantized_coeff[cb+i] * scale;
cb += 16;
}
for (int i=cb; i < RADAUDIO_LONG_BLOCK_LEN; ++i)
coeffs[i] = 0;
}
static void dequantize_long_block_with_random_8x8_Nx16(radaudio_decoder_state *ds, F32 *coeffs, S8 *quantized_coeff, F32 *subband_energy, int num_subbands, int *num_coeffs_for_band, U32 randval)
{
#ifdef DO_BUILD_SSE4
if (ds->cpu.has_sse4_1) {
RAD_ALIGN(U32, rand_state[4], 16);
build_rand_state(rand_state, randval);
radaudio_sse4_dequantize_long_block_replace_0_with_random_8x8_Nx16(coeffs, quantized_coeff, subband_energy, num_subbands, rand_state);
return;
}
#endif
#ifdef DO_BUILD_NEON
{
RAD_ALIGN(U32, rand_state[4], 16);
build_rand_state(rand_state, randval);
radaudio_neon_dequantize_long_block_replace_0_with_random_8x8_Nx16(coeffs, quantized_coeff, subband_energy, num_subbands, rand_state);
return;
}
#endif
#ifndef DO_BUILD_NEON // for unreachable code warnings
randomize_long_block_8x8_Nx16(ds, quantized_coeff, randval, num_subbands, num_coeffs_for_band);
dequantize_long_block_8x8_Nx16(ds, coeffs, quantized_coeff, subband_energy, num_subbands, num_coeffs_for_band);
#endif // !DO_BUILD_NEON
}
static void scalar_dequantize_short_block(float *coeffs, S8 *quantized_coeff, float *band_energy, int num_bands, int *num_coeffs_for_band)
{
int i,j;
int cb=0;
// first 4 bands are 1 coefficient long, and coefficient is always 1 or -1
for (j=0; j < 4; ++j) {
rrAssert(abs(quantized_coeff[j]) == 1);
coeffs[j] = (F32) quantized_coeff[j] * band_energy[j];
}
// next 4 bands are 2 coefficients long
for (j=0; j < 4; ++j) {
float x = (F32) quantized_coeff[4+j*2+0];
float y = (F32) quantized_coeff[4+j*2+1];
float scale = band_energy[4+j] / sqrtf(x*x+y*y+1.e-20f);
coeffs[4+j*2+0] = x*scale;
coeffs[4+j*2+1] = y*scale;
}
// next 4 bands are 4 coefficients long
// (actually next 5 bands are)
cb = 4*1 + 4*2;
for (j=8; j < 12; ++j) {
float sum=1.e-20f, scale;
for (i=0; i < 4; ++i) {
float n = (F32) quantized_coeff[cb+i];
sum += n*n;
}
scale = band_energy[j] / sqrtf(sum);
for (i=0; i < 4; ++i)
coeffs[cb+i] = (F32) quantized_coeff[cb+i] * scale;
cb += 4;
}
// now we have either [4,16,16,16,32]
// or [4,16,16,32,32] for lower sample rates
cb = 4*1 + 4*2 + 4*4;
for (j=12; j < num_bands; ++j) {
int count = num_coeffs_for_band[j];
F32 sum=1.e-20f,scale;
for (i=0; i < count; i += 4) {
for (int k=0; k < 4; ++k) {
F32 n = (F32) quantized_coeff[cb+i+k];
sum += n*n;
}
}
scale = band_energy[j] / sqrtf(sum);
for (i=0; i < count; i += 4) {
for (int k=0; k < 4; ++k) {
coeffs[cb+i+k] = quantized_coeff[cb+i+k] * scale;
}
}
cb += count;
}
for (i=cb; i < RADAUDIO_SHORT_BLOCK_LEN; ++i)
coeffs[i] = 0;
}
static void dequantize_short_block(radaudio_decoder_state *ds, float *coeffs, S8 *quantized_coeff, float *band_energy, int num_bands, int *num_coeffs_for_band)
{
#ifdef DO_BUILD_SSE4
if (ds->cpu.has_sse4_1) {
radaudio_sse4_dequantize_short_block_sse4(coeffs, quantized_coeff, band_energy, num_bands, num_coeffs_for_band);
return;
}
#endif
#ifdef DO_BUILD_NEON
{
radaudio_neon_dequantize_short_block(coeffs, quantized_coeff, band_energy, num_bands, num_coeffs_for_band);
return;
}
#endif
#ifndef DO_BUILD_NEON // for unreachable code warnings
scalar_dequantize_short_block(coeffs, quantized_coeff, band_energy, num_bands, num_coeffs_for_band);
#endif // !DO_BUILD_NEON
}
// we have to store half the MDCT output to overlap in the next block. this is the largest
// per-stream memory cost of the decoder. we used to store floats, but now we convert them
// to S16. This is *pre* windowing, so the quality loss is minimal.
//
// sse2 runs at about half speed of original "store as floats" version, but it's about a 2% slowdown
// overall and we decided it was worth the speed loss in return for halving memory usage
static float save_overlapping_samples(radaudio_decoder_state *ds, S16 *buffer, const float *data, int num)
{
// the profile wrapper is external to this, under the name "copy"
rrAssert(num % 64 == 0);
num /= 2;
#if defined(DO_BUILD_SSE4) || defined(DO_BUILD_AVX2)
if (ds->cpu.has_sse2) {
#ifdef DO_BUILD_AVX2
if (ds->cpu.has_avx2) {
return radaudio_avx2_save_samples(buffer, data, num);
}
#endif
return radaudio_sse2_save_samples(buffer, data, num);
}
#endif
#ifdef DO_BUILD_NEON
return radaudio_neon_save_samples(buffer, data, num);
#endif
#ifndef DO_BUILD_NEON // for unreachable code warnings
#define FAST_FLOAT_TO_INT // best solution i've found for x64
#if 0
// naive implementation for reference, but floor() is unacceptably slow
// doubles total decode time on test platform; round() and rint() were worse
// also, doesn't round to nearest even like SSE path
// most files in fnaudio get different results, is it buggy? there's no way this can just be from tie-breaking 0.5?!?
float largest0 = 1.0f;
float largest1 = 1.0f;
float scale = 32767.0f;
for (int i=0; i < num; i += 2) {
F32 d0 = data[i+0];
F32 d1 = data[i+1];
buffer[i+0] = (S16) floorf(d0 * scale + 0.5f);
buffer[i+1] = (S16) floorf(d1 * scale + 0.5f);
F32 a0 = fabsf(d0);
F32 a1 = fabsf(d1);
largest0 = RR_MAX(largest0, a0);
largest1 = RR_MAX(largest1, a1);
}
float largest = RR_MAX(largest0,largest1);
if (largest > 1.0f) {
scale = 32767.0f / largest;
for (int i=0; i < num; i += 2) {
buffer[i+0] = (S16) floorf(data[i+0] * scale + 0.5f);
buffer[i+1] = (S16) floorf(data[i+1] * scale + 0.5f);
}
}
return 1.0f / scale;
#elif defined(FAST_FLOAT_TO_INT)
// this should round correctly
// bithack float-to-int
typedef union {
F32 f;
S32 i;
} float_conv;
float_conv temp0,temp1,temp2,temp3;
// add (1<<23) to convert to int, then divide by 2^SHIFT, then add 0.5/2^SHIFT to round
#define MAGIC(SHIFT) (1.5f * (1 << (23-SHIFT)) + 0.5f/(1 << SHIFT))
#define ADDEND(SHIFT) (((150-SHIFT) << 23) + (1 << 22))
#define FAST_SCALED_FLOAT_TO_INT(temp,x,s) (temp.f = (x) + MAGIC(s), temp.i - ADDEND(s))
float largest0 = 1.0f;
float largest1 = 1.0f;
float largest2 = 1.0f;
float largest3 = 1.0f;
float scale = 32767.0f;
for (int i=0; i < num; i += 4) {
F32 d0 = data[i+0];
F32 d1 = data[i+1];
F32 d2 = data[i+2];
F32 d3 = data[i+3];
F32 a0 = fabsf(d0);
F32 a1 = fabsf(d1);
F32 a2 = fabsf(d2);
F32 a3 = fabsf(d3);
buffer[i+0] = (S16) FAST_SCALED_FLOAT_TO_INT(temp0, d0 * scale, 0);
buffer[i+1] = (S16) FAST_SCALED_FLOAT_TO_INT(temp1, d1 * scale, 0);
buffer[i+2] = (S16) FAST_SCALED_FLOAT_TO_INT(temp2, d2 * scale, 0);
buffer[i+3] = (S16) FAST_SCALED_FLOAT_TO_INT(temp3, d3 * scale, 0);
largest0 = RR_MAX(largest0, a0);
largest1 = RR_MAX(largest1, a1);
largest2 = RR_MAX(largest2, a2);
largest3 = RR_MAX(largest3, a3);
}
float largest01 = RR_MAX(largest0,largest1);
float largest23 = RR_MAX(largest2,largest3);
float largest = RR_MAX(largest01,largest23);
if (largest > 1.0f) {
scale = 32767.0f / largest;
for (int i=0; i < num; i += 4) {
buffer[i+0] = (S16) FAST_SCALED_FLOAT_TO_INT(temp0, data[i+0] * scale, 0);
buffer[i+1] = (S16) FAST_SCALED_FLOAT_TO_INT(temp0, data[i+1] * scale, 0);
buffer[i+2] = (S16) FAST_SCALED_FLOAT_TO_INT(temp0, data[i+2] * scale, 0);
buffer[i+3] = (S16) FAST_SCALED_FLOAT_TO_INT(temp0, data[i+3] * scale, 0);
}
}
return 1.0f / scale;
#else
// 30% slower than FAST_FLOAT_TO_INT on x64
// we want to use the equivalent of floor() so we can round.
// if we use fixed-point, right-shifting two's complement values is floor.
// though we might get compiler warnings about signed shifts
// problem: this doesn't produce the exact same results as other methods
// most files in fnaudio get different results
#define TRUNC_SHIFT 15
float largest0 = 1.0f;
float largest1 = 1.0f;
float largest2 = 1.0f;
float largest3 = 1.0f;
float scale = 32767.0f;
for (int i=0; i < num; i += 4) {
F32 d0 = data[i+0];
F32 d1 = data[i+1];
F32 d2 = data[i+2];
F32 d3 = data[i+3];
F32 a0 = fabsf(d0);
F32 a1 = fabsf(d1);
F32 a2 = fabsf(d2);
F32 a3 = fabsf(d3);
S32 i0 = (S32) (d0 * scale * (1 << TRUNC_SHIFT));
S32 i1 = (S32) (d1 * scale * (1 << TRUNC_SHIFT));
S32 i2 = (S32) (d2 * scale * (1 << TRUNC_SHIFT));
S32 i3 = (S32) (d3 * scale * (1 << TRUNC_SHIFT));
buffer[i+0] = (S16) ((i0 + (1 << (TRUNC_SHIFT-1))) >> TRUNC_SHIFT);
buffer[i+1] = (S16) ((i1 + (1 << (TRUNC_SHIFT-1))) >> TRUNC_SHIFT);
buffer[i+2] = (S16) ((i2 + (1 << (TRUNC_SHIFT-1))) >> TRUNC_SHIFT);
buffer[i+3] = (S16) ((i3 + (1 << (TRUNC_SHIFT-1))) >> TRUNC_SHIFT);
largest0 = RR_MAX(largest0, a0);
largest1 = RR_MAX(largest1, a1);
largest2 = RR_MAX(largest2, a2);
largest3 = RR_MAX(largest3, a3);
}
float largest01 = RR_MAX(largest0,largest1);
float largest23 = RR_MAX(largest2,largest3);
float largest = RR_MAX(largest01,largest23);
if (largest > 1.0f) {
scale = 32767.0f / largest;
for (int i=0; i < num; i += 4) {
S32 i0 = (S32) (data[i+0] * scale * (1 << TRUNC_SHIFT));
S32 i1 = (S32) (data[i+1] * scale * (1 << TRUNC_SHIFT));
S32 i2 = (S32) (data[i+2] * scale * (1 << TRUNC_SHIFT));
S32 i3 = (S32) (data[i+3] * scale * (1 << TRUNC_SHIFT));
buffer[i+0] = (S16) ((i0 + (1 << (TRUNC_SHIFT-1))) >> TRUNC_SHIFT);
buffer[i+1] = (S16) ((i1 + (1 << (TRUNC_SHIFT-1))) >> TRUNC_SHIFT);
buffer[i+2] = (S16) ((i2 + (1 << (TRUNC_SHIFT-1))) >> TRUNC_SHIFT);
buffer[i+3] = (S16) ((i3 + (1 << (TRUNC_SHIFT-1))) >> TRUNC_SHIFT);
}
}
return 1.0f / scale;
#endif
#endif // !DO_BUILD_NEON
}
/////////////////////////////////////////////////////////////////////////////////
static const size_t decoder_align = 32;
// We fill all-0 subbands with random noise (that's scaled to the
// appropriate subband energy).
//
// We need a mechanism where SSE2 can generate random data very fast,
// but it doesn't hobble the scalar pass. so e.g. SSE2 can generate
// 4 LCG steps in parallel, 16..24 bits of useful data in each one.
// but doing this on scalar might be slow. for a single 16-item subband,
// we need 4*16 = 64 bits of randomness to use our 4-bit random_table[]
// (which we can lookup with pshub).
//
// scalar: old code did 2 32-bit LCGs per subband, but only if the subband was zero.
// Optimized-for-SIMD code might prefer to be branchless and do an LCG on
// every subband even if non-zero, and there are ~80 subbands. But it turns
// out we take a branch anyway in the SSE code, so executive decision to stick
// to a design where we only update LCGs if subband was zero. (Note that if
// we didn't care about identical decoding across platforms, you could use
// whatever random method was optimal for each platform).
static void decode_channel_before_imdct(radaudio_decoder_state *ds,
radaudio_block_data *bd,
int channel,
U32 rand_seed,
U16 fine_energy[])
{
radaudio_rate_info *info;
RAD_ALIGN(F32, band_energy[24+16], 16) = { 0 }; // must be a multiple of 4
RAD_ALIGN(F32, subband_energy[MAX_SUBBANDS+16], 16) = { 0 };
int is_short_block = ds->current_block_short;
info = ds->info[is_short_block];
PROF_BEGIN(compute_band_energy);
// compute band energy
compute_band_energy_multiple4(ds, band_energy, info->num_bands, bd->band_exponent, fine_energy, info->band_scale_decode);
PROF_END(compute_band_energy);
// compute subband energy
if (!is_short_block) {
PROF_BEGIN(compute_subband_energy);
// first bands are shorter than a full subband, so treat those specially
int j;
for (j=0; info->num_subbands_for_band[j] == 1; ++j)
subband_energy[j] = band_energy[j];
compute_subband_energy_skip12_excess_read7(ds, subband_energy, band_energy, info->num_bands, info->num_subbands, info->num_subbands_for_band, bd->quantized_subbands);
for (j=0; j < info->num_subbands; ++j)
rrAssert(!isnan(subband_energy[j]));
PROF_END(compute_subband_energy);
}
// spread out adjacent blocks to be less similar
U32 randval = (rand_seed + (rand_seed >> 5)) * 0x27d4eb2d;
randval = lcg(randval);
F32 *coeffs = bd->dequantized_coeff_decode;
if (!is_short_block) {
for (int j=0; j < info->num_subbands; ++j)
rrAssert(info->num_coeffs_for_subband[j] == (j < 8 ? 8 : 16));
PROF_BEGIN(unquantize);
dequantize_long_block_with_random_8x8_Nx16(ds, coeffs, bd->quantized_coeff_decode, subband_energy, info->num_subbands, info->num_coeffs_for_band, randval);
PROF_END(unquantize);
} else {
PROF_BEGIN(randomize);
randomize_short_block(bd->quantized_coeff_decode, randval, info->num_bands, info->num_coeffs_for_band);
PROF_END(randomize);
PROF_BEGIN(unquantize);
dequantize_short_block(ds, coeffs, bd->quantized_coeff_decode, band_energy, info->num_bands, info->num_coeffs_for_band);
PROF_END(unquantize);
}
}
#if 0
static void decode_channel_before_imdct_reference(radaudio_decoder_state *ds,
radaudio_block_data *bd,
int channel,
U32 rand_seed,
U16 fine_energy[])
{
radaudio_rate_info *info;
RAD_ALIGN(F32, band_energy[24+16], 16) = { 0 }; // must be a multiple of 4
RAD_ALIGN(F32, subband_energy[MAX_SUBBANDS+16], 16) = { 0 };
int is_short_block = ds->current_block_short;
info = ds->info[is_short_block];
PROF_BEGIN(compute_band_energy);
// compute band energy
compute_band_energy_multiple4(ds, band_energy, info->num_bands, bd->band_exponent, fine_energy, info->band_scale_decode);
PROF_END(compute_band_energy);
// compute subband energy
if (!is_short_block) {
PROF_BEGIN(compute_subband_energy);
// first bands are shorter than a full subband, so treat those specially
int j;
for (j=0; info->num_subbands_for_band[j] == 1; ++j)
subband_energy[j] = band_energy[j];
compute_subband_energy_skip12_excess_read7(ds, subband_energy, band_energy, info->num_bands, info->num_subbands, info->num_subbands_for_band, bd->quantized_subbands);
for (j=0; j < info->num_subbands; ++j)
rrAssert(!isnan(subband_energy[j]));
PROF_END(compute_subband_energy);
}
// spread out adjacent blocks to be less similar
U32 randval = (rand_seed + (rand_seed >> 5)) * 0x27d4eb2d;
randval = lcg(randval);
PROF_BEGIN(randomize);
if (!is_short_block) {
// replace all-zero coefficient chunks with noise
for (int j=0; j < info->num_subbands; ++j)
rrAssert(info->num_coeffs_for_subband[j] == (j < 8 ? 8 : 16));
randomize_long_block_8x8_Nx16(bd->quantized_coeff_decode, randval, info->num_subbands, info->num_coeffs_for_band);
} else {
// replace all-zero coefficient chunks with noise
randomize_short_block(bd->quantized_coeff_decode, randval, info->num_bands, info->num_coeffs_for_band);
}
PROF_END(randomize);
F32 *coeffs = bd->dequantized_coeff_decode;
PROF_BEGIN(unquantize);
// reference implementation doesn't make any assumptions about distribution of subbands
{
int start = 0;
if (is_short_block) {
for (int j=0; j < info->num_bands; ++j)
subband_energy[j] = band_energy[j];
} else {
for (int j=0; j < info->num_bands && info->num_subbands_for_band[j] == 1; ++j)
subband_energy[j] = band_energy[j];
}
for (int j=0; j < info->num_subbands; ++j) {
int n = info->num_coeffs_for_subband[j];
F32 sum = 1.0e-30f;
for (int i=0; i < n; ++i) {
F32 x = (F32) bd->quantized_coeff_decode[start+i];
sum += x*x;
}
F32 scale = subband_energy[j] / sqrtf(sum);
for (int i=0; i < n; ++i) {
coeffs[start+i] = (F32) bd->quantized_coeff_decode[start+i] * scale;
}
start += n;
}
for (int i=info->num_quantized_coeffs; i < info->num_coeffs; ++i)
coeffs[i] = 0;
}
PROF_END(unquantize);
}
#endif
// dequantized_coeff_decode[] in bd is overwritten in the process
static void decode_channel_imdct(radaudio_decoder_state *ds,
F32 rawdata[MAX_COEFFS], // only max_coeffs because we expand the symmetries later
radaudio_block_data *bd,
int channel)
{
radaudio_rate_info *info;
int is_short_block = ds->current_block_short;
info = ds->info[is_short_block];
F32 *coeffs = bd->dequantized_coeff_decode;
for (int j=0; j < info->num_coeffs; ++j)
rrAssert(!isnan(coeffs[j]));
PROF_BEGIN(imdct);
radaudio_imdct_fft_only_middle(ds->cpu, rawdata, coeffs, info->num_coeffs);
PROF_END(imdct);
}
static int decode_channel_after_imdct(radaudio_decoder_state *ds,
F32 data1 [MAX_COEFFS],
F32 data2 [MAX_COEFFS],
radaudio_block_data *bd,
int channel,
F32 *output)
{
int result_length;
radaudio_rate_info *info;
int is_short_block = ds->current_block_short;
info = ds->info[is_short_block];
PROF_BEGIN(window);
if (ds->post_seek)
// ignore the left side of the first block
result_length = 0;
else {
// use window for whichever is smaller of current block or previous block
// apply window to pending data
if (ds->current_block_short == ds->last_block_short) {
F32 *window = radaudio_windows[ds->current_block_short];
int len = ds->current_block_short ? RADAUDIO_SHORT_BLOCK_LEN : RADAUDIO_LONG_BLOCK_LEN;
compute_windowed_sum_multiple64(ds, output, len,
data1, ds->prev_block_right_samples[channel], len, 0, ds->restore_scale[channel],
window, ds->block_number, channel, ds->fully_decoded);
result_length = len;
} else {
F32 *window = radaudio_windows[RADAUDIO_SHORT];
const int n = RADAUDIO_SHORT_BLOCK_LEN;
if (is_short_block) {
// if previous block was long and this is short
//
// <-----LONG_BLOCK_LEN---->
// +-----------+-----------+-----------+-----------+
// | prev |
// +-----------+-----------+-----------+-----------+
// *************** <- output samples
// -----------------------1111111111WWwww000000000 <- window weights
// ----------wwWWW
// +----+----+
// | cur |
// +----+----+
// <---->
// |
// SHORT_BLOCK_LEN
//
const int len = RADAUDIO_LONG_BLOCK_LEN/2 - RADAUDIO_SHORT_BLOCK_LEN/2;
copy_samples_multiple16_scaled(output, len, ds->prev_block_right_samples[channel], ds->restore_scale[channel]); // copy samples from previous where the new window is 0 and old window was 1
compute_windowed_sum_multiple64(ds, output+len, n,
data1, ds->prev_block_right_samples[channel], RADAUDIO_LONG_BLOCK_LEN, len, ds->restore_scale[channel],
window, ds->block_number, channel, ds->fully_decoded+len); // sum the part of the previous block that overlaps the left half of the new block
result_length = RADAUDIO_LONG_BLOCK_LEN/2 + RADAUDIO_SHORT_BLOCK_LEN/2; // generated (LONG/2 - SHORT/2) + SHORT
} else {
// if previous block was short and this is long
//
// SHORT_BLOCK_LEN
// |
// <---->
// +----+----+
// | prev |
// +----+----+
// WWwww---------
// 000000000wwwWW111111111----------------------- <- window weights
// ************** <- output samples
// +-----------+-----------+-----------+-----------+
// | cur |
// +-----------+-----------+-----------+-----------+
// <-----LONG_BLOCK_LEN---->
//
const int offset = RADAUDIO_LONG_BLOCK_LEN/2 - RADAUDIO_SHORT_BLOCK_LEN/2;
compute_windowed_sum_multiple64(ds, output, n, data1, ds->prev_block_right_samples[channel], RADAUDIO_SHORT_BLOCK_LEN, 0, ds->restore_scale[channel], window, ds->block_number, channel, ds->fully_decoded);
copy_samples_multiple16(output+n, RADAUDIO_LONG_BLOCK_LEN - (offset+n), data1 + n/2);
result_length = RADAUDIO_SHORT_BLOCK_LEN/2 + RADAUDIO_LONG_BLOCK_LEN/2;
}
}
}
PROF_END(window);
PROF_BEGIN(copy);
ds->restore_scale[channel] = save_overlapping_samples(ds, ds->prev_block_right_samples[channel], data2, info->num_coeffs);
PROF_END(copy);
return result_length;
}
static U8 *find_next_coarse_run_excess16(radaudio_decoder_state *ds, U8 *cur, U8 *end)
{
#ifdef DO_BUILD_SSE4
if (ds->cpu.has_sse2) {
return radaudio_sse2_find_next_coarse_run_excess16(cur, end);
}
#endif
#ifdef __RAD64REGS__
RR_COMPILER_ASSERT(COARSE_RUNLEN_THRESHOLD < 128);
RR_COMPILER_ASSERT(MAX_RUNLEN >= 128);
const U64 splat8 = ~(U64)0 / 255; // 0x0101...01
const U64 msb_mask = 0x80 * splat8;
const U64 low7_mask = ~msb_mask;
const U64 bias0 = (128 - COARSE_RUNLEN_THRESHOLD) * splat8;
const U64 bias1 = (256 - MAX_RUNLEN) * splat8;
while (cur < end) {
U64 bytes = RR_GET64_LE(cur);
cur += 8;
// check if there are any bytes >=COARSE_RUNLEN_THRESHOLD in those 8 bytes we just read.
// idea: these are bytes that either
// 1. have low 7 bits >=COARSE_RUNLEN_THRESHOLD
// 2. have MSB set (thus >=128)
// we can check the former by masking with 0x7f7f...7f and then adding (128 - COARSE_RUNLEN_THRESHOLD)
// to every byte. if the MSB ends up set, they were above COARSE_RUNLEN_THRESHOLD.
//
// by the same logic, we have bytes >=MAX_RUNLEN (which is >=128) if and only if both
// 1. their low 7 bits >= (MAX_RUNLEN - 128)
// 2. their MSB is set
U64 low7 = bytes & low7_mask;
U64 above_coarse_runlen_thresh = (low7 + bias0) | bytes; // MSB in byte set if that byte >=COARSE_RUNLEN_THRESHOLD
U64 above_max_runlen_thresh = (low7 + bias1) & bytes; // MSB in byte set if that byte >=MAX_RUNLEN
U64 active = above_coarse_runlen_thresh & ~above_max_runlen_thresh & msb_mask;
if (active) {
// found at least one! locate the first occurrence using a trailing
// zero count.
return (cur - 8) + rrCtzBytes64(active);
}
}
return cur;
#else
while (cur < end && (*cur < COARSE_RUNLEN_THRESHOLD || *cur >= MAX_RUNLEN))
++cur;
return cur;
#endif
}
static int decode_block(radaudio_decoder_state *ds, F32 *output[2], void *mem, size_t memavail, size_t *memconsumed)
{
int len, skip=0;
radaudio_block_data bd[2];
*memconsumed = 0;
U8 *memory = mem;
//size_t memory_valid = memavail;
int c;
huff3_decoder dec;
// Throw in one empty section first so we know what the overhead of one of these is
PROF_BEGIN(overhead);
PROF_END(overhead);
PROF_BEGIN(header);
radaudio_block_header_unpacked header;
int offset = radaudio_decode_block_header(memory, &ds->biases, &header, memavail);
// check if we're at the stream header, if so skip it; this happens at start,
// but also if they seek without telling us
if (offset == COMMON_STREAM_HEADER) {
// we might be at the start of the stream
// enough bytes to check for the stream signature?
if (memavail < 8)
return e(RADAUDIO_INCOMPLETE_DATA);
// check the stream signature
if (!radaudio_check_stream_header(memory, memavail))
return e(RADAUDIO_INVALID_DATA);
// enough bytes for a full header?
if (memavail < sizeof(radaudio_stream_header))
return e(RADAUDIO_INCOMPLETE_DATA);
// decode the header, the only way we have to parse it
radaudio_stream_header_unpacked fh;
size_t header_size = radaudio_unpack_stream_header(memory, memavail, &fh);
// was it a valid header?
if (header_size == 0)
return e(RADAUDIO_INVALID_DATA);
// make sure the subtract below can't be negative
if (header_size > memavail)
return e(RADAUDIO_INTERNAL_ERROR);
// behave as if we just did a seek operation
ds->post_seek = true;
// we know the block number
ds->block_number = 0;
// shrink the input buffer
memory += header_size;
memavail -= header_size;
skip = (int) header_size;
// now decode the real block header and go back to the main block decode path with the real block header
offset = radaudio_decode_block_header(memory, &ds->biases, &header, memavail);
// if that's ALSO a stream header, it's a corrupt file
if (offset == COMMON_STREAM_HEADER)
return e(RADAUDIO_INVALID_DATA);
}
if (offset == COMMON_INCOMPLETE_DATA)
return e(RADAUDIO_INCOMPLETE_DATA);
else if (offset < 0)
return e(RADAUDIO_INVALID_DATA);
U32 block_length_in_bytes = header.block_bytes + offset;
///////////////////////////////////////////////////////
// validate data
//
// block length isn't longer than spec max
if (block_length_in_bytes > MAX_ENCODED_BLOCK_BYTES)
return RADAUDIO_INVALID_DATA;
// length of first stream doesn't go off end of block
U32 mid_side_band_length = header.mid_side_bands ? (24/MACRO_BAND_SIZE+7)/8 : 0;
if (offset + mid_side_band_length + header.vbstream0_length > block_length_in_bytes)
return e(RADAUDIO_INVALID_DATA);
// final
if (header.final_block)
if (header.final_samples_discard > RADAUDIO_SHORT_BLOCK_LEN)
return e(RADAUDIO_INVALID_DATA);
// can't have more RLE entries than coefficients
if (header.num_runlength_array > (U32) 2*(header.this_block_short ? RADAUDIO_SHORT_BLOCK_LEN+1 : RADAUDIO_LONG_BLOCK_LEN+1))
return e(RADAUDIO_INVALID_DATA);
if (block_length_in_bytes > memavail)
return e(RADAUDIO_INCOMPLETE_DATA);
if (header.final_block)
ds->at_eof = true;
rrbool is_short_block = header.this_block_short;
ds->current_block_short = (U8) is_short_block;
ds->next_block_short = header.next_block_short;
radaudio_rate_info *bi = ds->info[is_short_block];
int num_channels = header.num_channels_encoded;
int nz_mode = header.nonzero_bitarray_mode;
U8 *mid_side_bands = memory + offset;
U8 *post_header = mid_side_bands + mid_side_band_length;
U8 *vbstream2 = post_header + header.vbstream0_length;
U8 *packet_end = memory + block_length_in_bytes;
int error=0;
// we initialize the 'end' pointers for each stream to the end of valid data
// in that packet, not the end of that stream. So without further tests, they
// could read the same raw bytes as part of multiple streams; but this is used
// just to guarantee no memory overreads.
decode_vbstream_init(&dec.stream[0], post_header, packet_end, &error);
decode_vbstream_init(&dec.stream[1], packet_end , post_header, &error);
decode_vbstream_init(&dec.stream[2], vbstream2 , packet_end, &error);
U32 midside_bands=0;
if (header.mid_side_encoded)
midside_bands = 0xffffffff;
else if (header.mid_side_bands) {
RR_COMPILER_ASSERT(MACRO_BAND_SIZE == 3);
U8 midside_band_triples = *mid_side_bands; // read 8 bits
int k=0;
for (int j=0; j < bi->num_bands; j += 3, ++k) {
if (midside_band_triples & (1 << k))
midside_bands |= (7 << j);
}
}
U8 band_exponents[32*2];
int cur_band_exponents=0;
PROF_END(header);
//
// decode the band energy first, in case we want to use it to compute/predict other things (we don't anymore)
//
// band exponents
PROF_BEGIN(huffman);
if (header.predict_stereo_exponent && num_channels == 2) {
decode_huff_array(&dec, &rada_band_exponent_correct_huff , band_exponents , bi->num_bands, &error);
decode_huff_array(&dec, &rada_band_exponent_stereo_correct_huff, band_exponents+32, bi->num_bands, &error);
} else {
decode_huff_array(&dec, &rada_band_exponent_correct_huff, band_exponents, bi->num_bands * num_channels, &error);
}
PROF_END(huffman);
if (error) {
return e(RADAUDIO_INVALID_DATA);
}
PROF_BEGIN(unpack);
for (c=0; c < (header.predict_stereo_exponent ? 1 : num_channels); ++c) {
int lastv = PREDICT_FIRST_BAND_EXP;
for (int j=0; j < bi->num_bands; ++j) {
int v = (S8) band_exponents[cur_band_exponents++];
v += lastv;
lastv = v;
bd[c].band_exponent[j] = v;
}
}
// decode stereo predicted exponents
if (header.predict_stereo_exponent && num_channels == 2) {
for (int j=0; j < bi->num_bands; ++j)
bd[1].band_exponent[j] = bd[0].band_exponent[j] + (S8) band_exponents[32+j];
}
PROF_END(unpack);
RAD_ALIGN(U16, m_mantissa[MAX_BANDS*2+16], 16);
PROF_BEGIN(compute_mantissa_len);
for (c=0; c < num_channels; ++c) {
// THIS LOGIC MUST BE EXACTLY REPLICATED IN THE COMPRESSOR!!!
compute_mantissa_bitcount(
ds->samprate_mode,
is_short_block,
ds->mantissa_param,
bd[c].band_exponent,
bd[c].band_mantissa_bitcount);
}
PROF_END(compute_mantissa_len);
if (error)
return e(RADAUDIO_INVALID_DATA);
PROF_BEGIN(varbits);
{
int slot=0;
for (c = 0; c < num_channels; ++c) {
for (int j=0; j < bi->num_bands; ++j) {
U8 size = bd[c].band_mantissa_bitcount[j];
U16 mantissa = (U16) decode_vbstream_bits(&dec.stream[2], size, &error);
m_mantissa[slot] = mantissa << (MAX_FINE_ENERGY_BITS - size);
++slot;
}
}
}
PROF_END(varbits);
if (error)
return e(RADAUDIO_INVALID_DATA);
//
// now do all the remaining entropy decoding
//
#define runlen_value_sentinel_size 2 // room to write two END_OF_ZERORUN markers to preven overread if input doesn't have them
#define runlen_read_sentinel_size 16 // room to write dummy values for SIMD to run on multiple-of-16-bytes
#define nonzero_coefficients_padding 32 // room to write dummy values for SIMD overwrite/overread, both of which are at most 16
#define coeff_pair_padding 16 // room to write dummy data when unpacking
#define runlen_pad (runlen_value_sentinel_size + runlen_read_sentinel_size)
#define max_runlength_data 1025 // 1024 empty runs per channel, plus two end-of-run markers
RAD_ALIGN(U8, subband_value [2* MAX_SUBBANDS ], 16);
RAD_ALIGN(U8, subband_correction [2* MAX_BANDS ], 16);
RAD_ALIGN(U8, subband_stereo_correct[ MAX_SUBBANDS ], 16);
RAD_ALIGN(S8, nonzero_coefficients [2* 1024+nonzero_coefficients_padding ], 16);
RAD_ALIGN(U8, runlength_data [2* max_runlength_data + runlen_pad ], 16);
RAD_ALIGN(U8, nonzero_flagbits [2* (1024/8) + 16 ], 16);
int num_subband_values0=0;
int num_subband_corrections=0, num_subband_stereo_correct=0;
int num_runlength_data=header.num_runlength_array;
int num_coeff_pairs;
for (int j=0; j < bi->num_bands; ++j) {
int numsub = bi->num_subbands_for_band[j];
if (numsub > 1) {
for (c=0; c < num_channels; ++c) {
if (bd[c].band_exponent[j] == BAND_EXPONENT_NONE && SUBBANDS_SKIP_EMPTY_BANDS)
continue;
int n = numsub;
if (c == 1 && header.predict_stereo_subband) {
num_subband_stereo_correct += n;
} else {
if (!header.disable_final_subband_predict) {
--n;
++num_subband_corrections;
}
num_subband_values0 += n;
}
}
}
}
// subband values
PROF_BEGIN(huffman);
if (!is_short_block) {
decode_huff_array(&dec, &rada_subband_value_huff , subband_value , num_subband_values0 , &error);
if (!header.disable_final_subband_predict)
decode_huff_array(&dec, &rada_subband_value_last_in_band_correct_huff, subband_correction, num_subband_corrections, &error);
if (header.predict_stereo_subband)
decode_huff_array(&dec, &rada_subband_value_stereo_correct_huff, subband_stereo_correct, num_subband_stereo_correct, &error);
}
PROF_END(huffman);
if (error)
return e(RADAUDIO_INVALID_DATA);
// coefficient zero-runlength data
if (num_runlength_data > 1025*2)
return e(RADAUDIO_INVALID_DATA);
if (!is_short_block && nz_mode != 3) {
U8 huffbits[2048];
radaudio_nonzero_blockmode_descriptor *bdesc = &ds->nz_desc[nz_mode];
PROF_BEGIN(huffman);
{
int p=0;
for (int i=0; i < NUM_NZ_HUFF; ++i) {
int q = bdesc->num_chunks_per_huff[i];
if (q) {
decode_huff_array(&dec, rada_nonzero_bitflags_huff[i], huffbits+p, q*8*num_channels, &error);
p += q*8*num_channels;
}
}
}
PROF_END(huffman);
PROF_BEGIN(huffman);
int j=0, s=num_channels-1;
for (c=0; c < num_channels; ++c, ++s) {
for (int i=0; i < bdesc->num_8byte_chunks; ++i, ++j) {
U8 p = bdesc->source_pos[s][i];
U64 xor = (U64)0 - bdesc->invert_chunk[i]; // if invert_chunk=1, this gives ~0 (invert), else 0.
RR_PUT64_NATIVE(&nonzero_flagbits[j*8], xor ^ RR_GET64_NATIVE(huffbits+p*8));
}
}
PROF_END(huffman);
}
if (error)
return e(RADAUDIO_INVALID_DATA);
PROF_BEGIN(huffman);
decode_huff_array(&dec, &rada_zero_runlength_huff, runlength_data, num_runlength_data, &error);
// add sentinel so we don't need to length-check loop
runlength_data[num_runlength_data+0] = END_OF_ZERORUN;
// add extra sentinel in case the data is invalid and doesn't have the stereo separator, so we don't need to length-check loop
runlength_data[num_runlength_data+1] = END_OF_ZERORUN;
PROF_END(huffman);
if (error)
return e(RADAUDIO_INVALID_DATA);
PROF_BEGIN(count_coefficients_huff);
// values of MAX_RUNLEN don't indicate coefficients, because they have a following real runlength
int num_nonzero_coefficients = count_bytes_below_value_sentinel16(ds, runlength_data, num_runlength_data, MAX_RUNLEN);
if (!is_short_block) {
int num_flagbit_bytes = ds->nz_desc[nz_mode].num_8byte_chunks * 8;
if (num_flagbit_bytes != 0)
num_nonzero_coefficients += count_set_bits_multiple8_sentinel8(ds, nonzero_flagbits, num_flagbit_bytes*num_channels);
}
PROF_END(count_coefficients_huff);
// runlength data + flagbits combined could be too many coefficients
if (num_nonzero_coefficients > num_channels*1024)
return e(RADAUDIO_INVALID_DATA);
int nz_selector = is_short_block ? 4 : nz_mode;
// coefficients -- need to have decoded the runlength data to know how many coefficients
{
// transient temp mem
RAD_ALIGN(U8, coefficient_pairs[2* 1024/2 * 2 + coeff_pair_padding], 16);
PROF_BEGIN(huffman);
num_coeff_pairs = (num_nonzero_coefficients+1)/2;
int tp = ds->nz_correlated_huffman_selectors[HS_COEFF_PAIR][nz_selector];
decode_huff_array(&dec, rada_nonzero_coefficient_pair_huff[tp], coefficient_pairs, num_coeff_pairs, &error);
PROF_END(huffman);
// convert coefficient pairs to coefficients
PROF_BEGIN(unpack);
unpack_nibbles_input_excess16_output_excess16_multiple32_default1(ds, nonzero_coefficients, coefficient_pairs, num_coeff_pairs);
PROF_END(unpack);
}
// read and apply bottom bits of run length data
// we have 2*1024 coeffs, COARSE_RUNLEN_THRESHOLD=60 and such runs are followed by a
// nonzero coefficient, so per 1024 coeffs we can have at most floor(1024/61)=16 of these
// (32 total between the total channels). in practice, the typical counts are 0-4.
PROF_BEGIN(update_runlength);
{
U8 *cur = runlength_data;
U8 *end = runlength_data + num_runlength_data; // we have runlen_read_sentinel_size of padding, so can be sloppy
while (cur < end) {
cur = find_next_coarse_run_excess16(ds, cur, end);
if (cur >= end)
break;
rrAssert(*cur >= COARSE_RUNLEN_THRESHOLD && *cur < MAX_RUNLEN);
// process this run and advance
U8 extra = (U8) decode_vbstream_bits(&dec.stream[2], 2, &error);
*cur += extra;
++cur;
}
}
PROF_END(update_runlength);
// big coefficients are coded as value 0 in the coefficient pairs
{
// transient temp mem, only used right here
RAD_ALIGN(S8, big_coefficients[2* 1024 + 16], 16);
// count zero bytes
int num_big_coefficients = count_bytes_below_value_sentinel16(ds, (U8*) nonzero_coefficients, num_nonzero_coefficients, 1);
PROF_BEGIN(huffman);
int tb = ds->nz_correlated_huffman_selectors[HS_COEFF_BIG][nz_selector];
decode_huff_array(&dec, rada_nonzero_coefficient_big_huff[tb], (U8*) big_coefficients, num_big_coefficients, &error);
PROF_END(huffman);
PROF_BEGIN(unbias);
// big coefficients are byte-sized, so stored aligned in stream[2]
//decode_stream_align_to_byte(&dec.stream[2]);
//U8 *bytestream = &dec.stream[2].bitstream[ dec.stream[2].read_pos_in_bits>>3 ];
// bytestream ends at current position of reverse-read stream 1
//decode_stream_align_to_byte(&dec.stream[1]);
//U8 *bytestream_end = &dec.stream[1].bitstream[-(int)(dec.stream[1].read_pos_in_bits>>3)];
// expand used to decode directly from the stream and hence needed a safety range
if (!expand_nonzero_coefficients(ds, nonzero_coefficients, num_nonzero_coefficients,
big_coefficients, (big_coefficients+num_nonzero_coefficients), (big_coefficients+sizeof(big_coefficients))))
return e(RADAUDIO_INVALID_DATA);
PROF_END(unbias);
}
if (error)
return e(RADAUDIO_INVALID_DATA);
int cur_subband_values0=0;
int cur_subband_corrections=0;
int cur_subband_stereo_correct=0;
int cur_nonzero_coefficients=0;
int cur_runlength_data=0;
PROF_BEGIN(compute_subbands);
for (c=0; c < num_channels; ++c)
memset(bd[c].quantized_subbands, 0, bi->num_subbands * 2);
if (!is_short_block) {
// subbands
for (c=0; c < num_channels; ++c) {
for (int j=0; j < bi->num_bands; ++j) {
if (bi->num_subbands_for_band[j] == 1)
continue;
int start = bi->first_subband_for_band[j];
int num_coded_subbands = bi->num_subbands_for_band[j];
if (bd[c].band_exponent[j] == BAND_EXPONENT_NONE && SUBBANDS_SKIP_EMPTY_BANDS) {
for (int i=0; i < num_coded_subbands; ++i)
bd[c].quantized_subbands[start+i] = (U16) (ds->subband_predicted_sum[j] / num_coded_subbands); // this value is predicted from in stero
} else if (header.predict_stereo_subband && c == 1) {
for (int i=0; i < num_coded_subbands; ++i) {
int predict = bd[0].quantized_subbands[start+i];
int correct = (S8) subband_stereo_correct[cur_subband_stereo_correct++];
bd[c].quantized_subbands[start+i] = (U16) (predict + correct);
}
} else {
int predicted_sum = ds->subband_predicted_sum[j];
int bias = ds->subband_bias[j];
int partial_sum = 0;
if (!header.disable_final_subband_predict)
--num_coded_subbands;
for (int i=0; i < num_coded_subbands; ++i) {
int v = subband_value[cur_subband_values0++];
v -= bias; // remove bias
v = (v & 63);
bd[c].quantized_subbands[start+i] = (U16) v;
partial_sum += v;
}
if (!header.disable_final_subband_predict) {
int actual_sum = predicted_sum + (S8) subband_correction[cur_subband_corrections++];
int v = actual_sum - partial_sum;
if (v < 0) // @TODO investigate this case closely, why can't it be negative, should there be an upper bound?
return e(RADAUDIO_INVALID_DATA);
bd[c].quantized_subbands[start+num_coded_subbands] = (U16) v;
}
}
}
}
}
PROF_END(compute_subbands);
if (error)
return e(RADAUDIO_INVALID_DATA);
for (c=0; c < num_channels; ++c) {
int num_nonzero_bitarray_bytes = ds->nz_desc[nz_mode].num_8byte_chunks * 8;
rrbool result = distribute_nonzero_coefficients(ds, bd[c].quantized_coeff_decode, bi->num_quantized_coeffs,
runlength_data, &cur_runlength_data,
nonzero_coefficients, &cur_nonzero_coefficients,
nonzero_flagbits + c*num_nonzero_bitarray_bytes, is_short_block ? 0 : num_nonzero_bitarray_bytes*8, c);
if (!result)
return e(RADAUDIO_INVALID_DATA);
}
// we expect to read the first sentinel; if we read the second, it's a bug
if (cur_runlength_data > num_runlength_data+1)
return e(RADAUDIO_INVALID_DATA);
FFT_ALIGN(F32, rawdata[MAX_COEFFS]);
F32 *data1 = rawdata, *data2 = rawdata + (bi->num_coeffs >> 1);
if (ds->num_channels==1) {
// mono stream
(void) decode_channel_before_imdct(ds, &bd[0], 0, ds->block_number, m_mantissa);
(void) decode_channel_imdct (ds, rawdata , &bd[0], 0);
len = decode_channel_after_imdct (ds, data1, data2, &bd[0], 0, output[0]);
} else if (ds->num_channels==2 && num_channels==1) {
// stereo stream with mono block
(void) decode_channel_before_imdct(ds, &bd[0], 0, ds->block_number, m_mantissa);
(void) decode_channel_imdct (ds, rawdata , &bd[0], 0);
(void) decode_channel_after_imdct (ds, data1, data2, &bd[0], 0, output[0]);
len = decode_channel_after_imdct (ds, data1, data2, &bd[0], 1, output[1]);
} else {
// stereo stream with stereo block
(void) decode_channel_before_imdct(ds, &bd[0], 0, ds->block_number , m_mantissa);
(void) decode_channel_before_imdct(ds, &bd[1], 1, ds->block_number^0x55555555, m_mantissa+ds->info[is_short_block]->num_bands);
// midside decode
for (int j=0; j < bi->num_bands; ++j) {
if (midside_bands & (1 << j)) {
F32 *coeffs1 = bd[0].dequantized_coeff_decode;
F32 *coeffs2 = bd[1].dequantized_coeff_decode;
int start = bi->first_coeff_for_band[j];
int end = start + bi->num_coeffs_for_band[j];
for (int i=start; i < end; ++i) {
float x = coeffs1[i];
float y = coeffs2[i]*0.5f;
coeffs1[i] = x+y;
coeffs2[i] = x-y;
}
}
}
(void) decode_channel_imdct (ds, rawdata , &bd[0], 0);
(void) decode_channel_after_imdct (ds, data1, data2, &bd[0], 0, output[0]);
(void) decode_channel_imdct (ds, rawdata , &bd[1], 1);
len = decode_channel_after_imdct (ds, data1, data2, &bd[1], 1, output[1]);
}
if (error)
return e(RADAUDIO_INVALID_DATA);
*memconsumed = block_length_in_bytes + skip;
++ds->block_number;
ds->last_block_short = ds->current_block_short;
if (header.final_block) {
if (header.final_samples_discard > (U32) len)
return e(RADAUDIO_INVALID_DATA);
else {
int total = len - header.final_samples_discard;
return total;
}
} else
return len;
}
size_t RadAudioDecoderMemoryRequired(U8 *raw_header, size_t raw_header_size)
{
size_t size = sizeof(radaudio_decoder_state);
size += decoder_align-1; // allow room to align
int num_channels;
if (raw_header == NULL)
num_channels = 2;
else {
radaudio_stream_header_unpacked header;
if (radaudio_unpack_stream_header(raw_header, raw_header_size, &header) == 0)
return 0;
num_channels = header.num_channels;
}
// room for buffered samples from previous block
size += RADAUDIO_LONG_BLOCK_LEN/2 * sizeof(S16) * num_channels;
return size;
}
static radaudio_decoder_state * radaudio_decompressor_memalloc(radaudio_stream_header_unpacked *header, void *vmem, size_t memsize)
{
int i;
union {
UINTa addr;
U8 * ptr;
} convert;
if (memsize < sizeof(radaudio_decoder_state))
return 0;
U8 *mem = vmem;
radaudio_decoder_state *ds;
ds = (void*) mem; mem += sizeof(*ds);
memset(ds, 0, sizeof(*ds));
// align data after struct
convert.ptr = mem;
convert.addr = (convert.addr + decoder_align-1) & ~(decoder_align-1);
mem = convert.ptr;
for (i=0; i < header->num_channels; ++i) {
ds->prev_block_right_samples[i] = (void *) mem;
mem += RADAUDIO_LONG_BLOCK_LEN/2 * sizeof(S16);
}
size_t memneeded = mem - (U8*)vmem;
if (memneeded > memsize)
return 0;
ds->last_block_short = 1; // shouldn't matter
return ds;
}
RadAudioDecoder *RadAudioDecoderOpen(U8 *raw_header, size_t raw_header_size, void *vmem, size_t memsize, size_t *header_read)
{
radaudio_decoder_state *ds;
radaudio_stream_header_unpacked header;
if (raw_header_size < sizeof(radaudio_stream_header))
return NULL;
size_t header_size = radaudio_unpack_stream_header(raw_header, raw_header_size, &header);
if (header_size == 0)
return NULL;
// unpack_stream_header does some sanity checking, here's the rest:
for (int i=0; i < NUM_NZ_MODE; ++i)
if (header.nzmode_num64[i] > MAX_NZ_BLOCKS)
return 0;
ds = radaudio_decompressor_memalloc(&header, vmem, memsize);
if (ds == NULL)
return 0;
ds->version = header.version;
ds->num_channels = header.num_channels;
ds->skip_bytes = 0;//(U8) header_size;
ds->cpu = cpu_detect();
ds->post_seek = true; // very first block decoded discards input
ds->sample_rate = header.sample_rate;
ds->samprate_mode = header.sample_rate_mode;
memcpy(ds->subband_bias, header.subband_bias, sizeof(ds->subband_bias));
ds->info[0] = &radaudio_rateinfo[0][ds->samprate_mode];
ds->info[1] = &radaudio_rateinfo[1][ds->samprate_mode];
memcpy(ds->subband_predicted_sum, header.subband_predicted_sum, 24);
memcpy(ds->mantissa_param , header.mantissa_param, sizeof(header.mantissa_param));
compute_bias_set(&ds->biases, header.bytes_bias);
for (int i=0; i < NUM_NZ_MODE; ++i) {
ds->nz_desc[i].num_8byte_chunks = header.nzmode_num64[i];
if (ds->nz_desc[i].num_8byte_chunks > MAX_NZ_BLOCKS)
return 0;
for (int j=0; j < MAX_NZ_BLOCKS; ++j) {
ds->nz_desc[i].huffman_table_for_chunk[j] = (header.nzmode_huff[i][j] & ~NZ_MODE_INVERT);
ds->nz_desc[i].invert_chunk[j] = (header.nzmode_huff[i][j] & NZ_MODE_INVERT) != 0;
if (ds->nz_desc[i].huffman_table_for_chunk[j] >= NUM_NZ_HUFF)
return 0;
}
}
for (int j=0; j < NUM_NZ_SELECTOR; ++j)
for (int i=0; i < NUM_SELECTOR_MODES; ++i)
ds->nz_correlated_huffman_selectors[j][i] = header.nzmode_selectors[j][i];
radaudio_init_nz_desc(ds->nz_desc);
if (header_read)
*header_read = header_size;
return ds;
}
#ifdef RADAUDIO_DEVELOPMENT
void RadAudioDecoderForceIntelCPU(RadAudioDecoder *hradaud, rrbool has_sse2, rrbool has_ssse3, rrbool has_sse4_1, rrbool has_popcnt, rrbool has_avx2)
{
radaudio_decoder_state *ds = (radaudio_decoder_state *) hradaud;
RR_UNUSED_VARIABLE(ds);
#ifdef __RADX86__
ds->cpu.has_sse2 = (U8) has_sse2;
ds->cpu.has_ssse3 = (U8) has_ssse3;
ds->cpu.has_sse4_1 = (U8) has_sse4_1;
ds->cpu.has_popcnt = (U8) has_popcnt;
ds->cpu.has_avx2 = (U8) has_avx2;
#endif
}
#endif
static void decode_version(RadAudioInfo *info, U32 version)
{
info->major_version = (U8 ) ((version & 0xff000000) >> 24);
info->minor_version = (U8 ) ((version & 0x00ff0000) >> 16);
info->sequential_version = (U16) ((version & 0x0000ffff) >> 0);
}
void RadAudioDecoderGetInfo(const RadAudioDecoder *hradaud, RadAudioInfo *out_info)
{
radaudio_decoder_state *ds = (radaudio_decoder_state *) hradaud;
out_info->sample_rate = ds->sample_rate;
out_info->num_channels = ds->num_channels;
decode_version(out_info, ds->version);
}
size_t RadAudioDecoderGetInfoHeader(U8* raw_header, size_t raw_header_size, RadAudioInfo *out_info)
{
radaudio_stream_header_unpacked header;
size_t header_size = radaudio_unpack_stream_header(raw_header, raw_header_size, &header);
if (header_size == 0)
return 0;
out_info->sample_rate = header.sample_rate;
out_info->num_channels = header.num_channels;
decode_version(out_info, header.version);
return header_size;
}
RADDEFFUNC void RadAudioDecoderDidSeek(RadAudioDecoder *radaudio_decomp)
{
radaudio_decoder_state *ds = (radaudio_decoder_state *) radaudio_decomp;
ds->post_seek = true;
ds->at_eof = false;
}
int RadAudioDecoderGetChunkLength(RadAudioDecoder *radaudio_decomp, const U8 *data, size_t data_avail)
{
radaudio_decoder_state *ds = (radaudio_decoder_state *) radaudio_decomp;
if (ds->at_eof)
return RADAUDIO_AT_EOF;
if (data_avail < 4)
return RADAUDIO_INCOMPLETE_DATA;
radaudio_block_header_unpacked header;
int offset = radaudio_decode_block_header(data, &ds->biases, &header, data_avail);
if (offset == COMMON_STREAM_HEADER)
return RADAUDIO_START_OF_STREAM;
if (offset == COMMON_INCOMPLETE_DATA)
return RADAUDIO_INCOMPLETE_DATA;
if (offset == COMMON_INVALID_DATA)
return RADAUDIO_INVALID_DATA;
U32 block_length_in_bytes = header.block_bytes + offset;
// validate data
if (block_length_in_bytes > MAX_ENCODED_BLOCK_BYTES)
return RADAUDIO_INVALID_DATA;
if (offset + header.vbstream0_length > block_length_in_bytes)
return RADAUDIO_INVALID_DATA;
if (header.final_block)
if (header.final_samples_discard > RADAUDIO_SHORT_BLOCK_LEN)
return RADAUDIO_INVALID_DATA;
if (header.num_runlength_array > (U32) 2*(header.this_block_short ? RADAUDIO_SHORT_BLOCK_LEN+1 : RADAUDIO_LONG_BLOCK_LEN+1))
return RADAUDIO_INVALID_DATA;
return header.block_bytes + offset;
}
// returns the number of samples output per channel, and update 'memconsumed'
// with the amount of memory consumed.
//
// return values:
// n number of samples decoded (for one channel, e.g. n=1024 means 1024 stereo pairs)
// 0 can decode 0 samples legitimately, e.g. first block or after seeking
// -1 at end-of-stream
// -2 not enough input data to decode a frame, always consumes 0
// -3 error (e.g. corrupt stream)
int RadAudioDecoderDecodeChunk(
RadAudioDecoder *radaudio_decomp,
const U8 *mem ,
size_t memavail ,
size_t *memconsumed ,
F32 *output_samples[2],
size_t max_samples_per_channel
)
{
*memconsumed = 0;
radaudio_decoder_state *ds = (radaudio_decoder_state *) radaudio_decomp;
if (ds->at_eof)
return RADAUDIO_AT_EOF;
if (memavail < 7)
return RADAUDIO_INCOMPLETE_DATA;
size_t used;
if (!ds)
return -2;
PROF_BEGIN(decoder_all);
size_t skip = ds->skip_bytes;
int len = decode_block(ds, output_samples, (U8*)mem+skip, memavail-skip, &used);
ds->post_seek = false;
if (len >= 0) {
*memconsumed = used + skip;
ds->skip_bytes = 0;
ds->fully_decoded += len;
}
PROF_END(decoder_all);
return len;
}
#ifdef RADAUDIO_DEVELOPMENT
// internal use
int RadAudioDecoderGetProfileData(RadAudioDecoder *hradaud, radaudio_profile_value *aprofile, int num_profile)
{
RR_UNUSED_VARIABLE(hradaud);
int n = RR_MIN(num_profile, PROF_total_count - 1);
static const char *names[] = {
#define PROF(x) #x,
PROFILE_ZONES()
#undef PROF
};
if (aprofile) {
double overhead_time = 0.0;
// we have an empty profiling region to estimate overhead of tracking a region to begin with
if (profile_counts[PROF_overhead]) {
overhead_time = rrTicksToSeconds(profile_times[PROF_overhead]) / profile_counts[PROF_overhead];
}
for (int i=0; i < n; ++i) {
aprofile[i].name = names[i];
// subtract out estimated overhead
aprofile[i].time = rrTicksToSeconds(profile_times[i]) - overhead_time * profile_counts[i];
}
} else {
profile = num_profile;
}
for (int i=0; i < PROF_total_count; ++i) {
profile_times[i] = 0;
profile_counts[i] = 0;
}
return n;
}
#else
int RadAudioDecoderGetProfileData(RadAudioDecoder *hradaud, radaudio_profile_value *profile, int num_profile)
{
RR_UNUSED_VARIABLE(hradaud); RR_UNUSED_VARIABLE(profile); RR_UNUSED_VARIABLE(num_profile);
return 0;
}
#endif