2555 lines
90 KiB
C
2555 lines
90 KiB
C
// Copyright Epic Games Tools, LLC. All Rights Reserved.
|
|
////////////////////////////////////////////////////////////////////////////
|
|
//
|
|
// RADaudio is a new audio codec made by Epic Games Tools for use in games,
|
|
// optimized for fast SIMD decoding and decent quality (roughly similar to
|
|
// Vorbis).
|
|
//
|
|
// It is a classical MDCT-based codec with two block sizes, and it uses
|
|
// the Oodle Data huffman entropy coder to store data.
|
|
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <math.h>
|
|
|
|
// We want the external defines to be scoped to DECODER, but we don't want to rename everything in here.
|
|
#define RADAUDIO_AT_EOF RADAUDIO_DECODER_AT_EOF
|
|
#define RADAUDIO_INCOMPLETE_DATA RADAUDIO_DECODER_INCOMPLETE_DATA
|
|
#define RADAUDIO_INVALID_DATA RADAUDIO_DECODER_INVALID_DATA
|
|
#define RADAUDIO_START_OF_STREAM RADAUDIO_DECODER_START_OF_STREAM
|
|
#define RADAUDIO_INTERNAL_ERROR RADAUDIO_DECODER_INTERNAL_ERROR
|
|
|
|
#define HUFFMAN_DECODE // enable huffman decode tables
|
|
#include "radaudio_decoder.h"
|
|
#include "radaudio_decoder_sse2.h"
|
|
#include "radaudio_decoder_sse4.h"
|
|
#include "radaudio_decoder_avx2.h"
|
|
#include "radaudio_decoder_neon.h"
|
|
#include "radaudio_decoder_internal.h"
|
|
#include "radaudio_common.h"
|
|
#include "radaudio_mdct.h"
|
|
#include "rrCore.h"
|
|
#include "rrbits.h"
|
|
|
|
#include "radaudio_common.inl"
|
|
|
|
RR_COMPILER_ASSERT(COMMON_INVALID_DATA == RADAUDIO_INVALID_DATA);
|
|
RR_COMPILER_ASSERT(COMMON_INCOMPLETE_DATA == RADAUDIO_INCOMPLETE_DATA);
|
|
RR_COMPILER_ASSERT(RADAUDIO_STREAM_HEADER_SIZE == sizeof(radaudio_stream_header));
|
|
|
|
#ifdef RADAUDIO_DEVELOPMENT
// X-macro listing every profiling zone. Each PROF(x) entry becomes the enum
// constant PROF_x below, so the trailing PROF(total_count) entry evaluates to
// the number of zones listed before it and is used to size the tables.
#define PROFILE_ZONES()          \
   PROF(decoder_all)             \
   PROF(imdct)                   \
   PROF(window)                  \
   PROF(huffman)                 \
   PROF(unquantize)              \
   PROF(distribute_rle)          \
   PROF(update_runlength)        \
   PROF(varbits)                 \
   PROF(compute_mantissa_len)    \
   PROF(copy)                    \
   PROF(compute_subbands)        \
   PROF(distribute_bitflag)      \
   PROF(unpack)                  \
   PROF(randomize)               \
   PROF(compute_subband_energy)  \
   PROF(unbias)                  \
   PROF(compute_band_energy)     \
   PROF(count_coefficients_huff) \
   PROF(header)                  \
   PROF(zero)                    \
   PROF(flagbits)                \
   PROF(overhead) /* must always be last! */ \
   PROF(total_count)

enum
{
   #define PROF(x) PROF_##x,
   PROFILE_ZONES()
   #undef PROF

   PROF__end
};

// Zone timers, active only while 'profile' is nonzero:
// PROF_BEGIN subtracts the current tick count and bumps the zone's hit count,
// PROF_END adds the tick count back, so a zone accumulates elapsed ticks.
#define PROF_BEGIN(var)   if (profile) { profile_times[PROF_##var] -= rrGetTicks(); profile_counts[PROF_##var] += 1; }
#define PROF_END(var)     if (profile) profile_times[PROF_##var] += rrGetTicks()

static rrbool profile;
static U64    profile_times[PROF_total_count];
static S64    profile_counts[PROF_total_count];
#else
// Non-development builds: the zone markers compile away entirely.
#define PROF_BEGIN(var)
#define PROF_END(var)
#define PROF_total_count  1
#endif
|
|
|
|
// RANDVAL(r,i): expands to its first argument; the second argument is ignored.
#define RANDVAL(r,i) (r)
|
|
|
|
// Identity pass-through for error codes. It exists so that a single debugger
// breakpoint placed here triggers on any error routed through e().
static int e(int code)
{
   return code;
}
|
|
|
|
typedef struct RadAudioDecoder
|
|
{
|
|
U32 version;
|
|
rrbool current_block_short;
|
|
rrbool last_block_short;
|
|
rrbool next_block_short; // we don't actually need this
|
|
int samprate_mode;
|
|
int num_channels;
|
|
|
|
int skip_bytes;
|
|
int sample_rate; // implied by samprate_mode
|
|
U32 block_number;
|
|
int fully_decoded; // sample offset in stream
|
|
U8 subband_predicted_sum[MAX_BANDS];
|
|
S8 mantissa_param[2][MAX_BANDS][2];
|
|
S8 subband_bias[MAX_BANDS];
|
|
rrbool at_eof;
|
|
rrbool post_seek;
|
|
|
|
rrbool bitstream_overshot;
|
|
radaudio_block_header_biases biases;
|
|
radaudio_cpu_features cpu;
|
|
|
|
radaudio_rate_info * info[2]; // pre-defined table, indexed by long vs. short block
|
|
radaudio_nonzero_blockmode_descriptor nz_desc[NUM_NZ_MODE];
|
|
U8 nz_correlated_huffman_selectors[NUM_NZ_SELECTOR][NUM_SELECTOR_MODES];
|
|
|
|
S16 * prev_block_right_samples[2];
|
|
F32 restore_scale[2]; // how to convert S16s back to floats
|
|
} radaudio_decoder_state;
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
//
|
|
// ENTROPY DECODER
|
|
//
|
|
|
|
typedef struct
|
|
{
|
|
U8 *bitstream;
|
|
U8 *end;
|
|
U32 read_pos_in_bits;
|
|
U32 fast_num_bits; // if initial read_pos_in_bits < this, can take fast path (needs to be < not <= so 0 disables fast path)
|
|
U32 total_num_bits;
|
|
} rada_bit_decoder;
|
|
|
|
typedef struct
|
|
{
|
|
rada_bit_decoder stream[3];
|
|
} huff3_decoder;
|
|
|
|
// Prepare 'd' to read the bytes [bitstream, end).
//   - bitstream > end marks a backwards stream (for huffman); it must not be
//     read through decode_vbstream_bits, so its bit counts stay 0.
//   - a stream longer than MAX_ENCODED_BLOCK_BYTES is not allowed: *error is
//     set to 1 and the stream is initialized as empty.
// *error is never cleared here.
static void decode_vbstream_init(rada_bit_decoder *d, U8 *bitstream, U8 *end, int *error)
{
   // start out as an empty stream; only a valid forward stream gets bit counts
   d->bitstream        = bitstream;
   d->end              = end;
   d->read_pos_in_bits = 0;
   d->fast_num_bits    = 0;
   d->total_num_bits   = 0;

   if (bitstream > end)
      return;

   size_t byte_count = end - bitstream;
   if (byte_count > MAX_ENCODED_BLOCK_BYTES) {
      *error = 1;
      return;
   }

   d->total_num_bits = (U32) (byte_count * 8); // can't overflow: byte_count checked above

   // the fast path loads 4 bytes at once, so it is only allowed while at least
   // 32 bits remain beyond the read position
   if (d->total_num_bits >= 32)
      d->fast_num_bits = d->total_num_bits - 32;
}
|
|
|
|
// bit reading cold path, reads one byte at a time to avoid over-reading
|
|
static RADNOINLINE U32 decode_vbstream_bits_cold(rada_bit_decoder *d, int bitlength, int *error)
|
|
{
|
|
// check whether actual data required goes off the end
|
|
if (d->read_pos_in_bits + bitlength > d->total_num_bits) {
|
|
*error = 1;
|
|
return 0;
|
|
}
|
|
|
|
// can read 0 bits exactly at the end
|
|
if (bitlength == 0)
|
|
return 0;
|
|
|
|
// if not, read as many valid bits as exist, then mask
|
|
size_t first_byte = (d->read_pos_in_bits >> 3);
|
|
U32 bits = d->bitstream[first_byte++];
|
|
U32 shift = 8;
|
|
while (d->bitstream+first_byte < d->end) {
|
|
bits = bits + (d->bitstream[first_byte++] << shift);
|
|
shift += 8;
|
|
}
|
|
bits >>= (d->read_pos_in_bits & 7);
|
|
bits &= (1 << bitlength)-1;
|
|
d->read_pos_in_bits += bitlength;
|
|
return bits;
|
|
}
|
|
|
|
// Read 'bitlength' bits from a forward bit stream, least-significant bit first.
// While the read position is below d->fast_num_bits a single unaligned 32-bit
// load is used; after discarding up to 7 already-consumed bits only 25 bits of
// that load are guaranteed valid on this path. Otherwise the bounds-checked
// cold path is used, which reports over-reads through *error.
static RADFORCEINLINE U32 decode_vbstream_bits(rada_bit_decoder *d, int bitlength, int *error)
{
   // anything too close to the end goes through the careful reader
   if (d->read_pos_in_bits >= d->fast_num_bits)
      return decode_vbstream_bits_cold(d, bitlength, error);

   // simple path
   size_t first_byte = (d->read_pos_in_bits >> 3);
   U32 bits = RR_GET32_LE(d->bitstream + first_byte);
   bits >>= (d->read_pos_in_bits & 7);     // discard bits we're pointing past
   bits &= ((U32) 1 << bitlength) - 1;     // unsigned mask: no signed-shift overflow
   d->read_pos_in_bits += bitlength;
   return bits;
}
|
|
|
|
typedef struct
|
|
{
|
|
U8 *decodeptr; // Current write cursor for the two stream triples
|
|
U8 *decodeend; // End of decoded bytes buffer for the two stream triples
|
|
|
|
const U8 *bitp[3]; // Next byte to be read for the streams
|
|
U32 bits[3]; // Current contents of bit buffer
|
|
U32 bitc[3]; // Current number of valid bits in bit buffer
|
|
} rada_internal_huff_state;
|
|
|
|
// Longest huffman code length in bits, and the mask selecting that many low
// bits of a bit buffer as a decode-table index (2047 for a limit of 11).
#define NEWLZ_HUFF_CODELEN_LIMIT       11
#define NEWLZ_HUFF_DECODE_TABLE_MASK   ((1u << NEWLZ_HUFF_CODELEN_LIMIT) - 1u)

// 32-bit ARM implicitly masks 32-bit shift amounts by 255 (low 8 bits).
// All other current targets implicitly mask by 31 (low 5 bits). Either
// works for us, but we'd prefer not to get an extra AND, so use whatever
// the implicit mask is and rely on the compiler to clean it up.
#if defined(__RADARM__) && !defined(__RAD64__)
   #define HUFF32LENMASK 255
#else
   #define HUFF32LENMASK 31
#endif
|
|
|
|
// Careful three-stream huffman decode into [s->decodeptr, s->decodeend).
// Streams 0 and 2 are read forwards and stream 1 backwards; the cursors must
// keep the order in0 <= in2 <= in1.
// Returns false when the cursors are found crossed or the output was not
// filled exactly. On success the stream cursors and bit buffers are stored
// back into *s (s->decodeptr itself is not updated).
static rrbool huff_decode_precise_finish(rada_internal_huff_state * s, radaudio_huffman *huff)
{
   // local working copies of the stream state
   const U8 * in0 = s->bitp[0];
   const U8 * in1 = s->bitp[1];
   const U8 * in2 = s->bitp[2];

   U32 bits0 = s->bits[0], bitc0 = s->bitc[0];
   U32 bits1 = s->bits[1], bitc1 = s->bitc[1];
   U32 bits2 = s->bits[2], bitc2 = s->bitc[2];

   U8 *decodeptr = s->decodeptr;
   U8 *decodeend = s->decodeend;

   // scratch values shared by every DECONE expansion
   U32 peek, cl, sym;

   if (in0 > in2)
      return false;

   // Decode one symbol from stream 'strm': table lookup on the low bits,
   // drop the code's bits from the buffer, emit the symbol.
   #define DECONE(strm) \
      peek = bits##strm & NEWLZ_HUFF_DECODE_TABLE_MASK; \
      cl = huff->decode[peek].length; \
      sym = huff->decode[peek].symbol; \
      bits##strm >>= cl & HUFF32LENMASK; bitc##strm -= cl; \
      *decodeptr++ = (U8) sym

   #define DECTHREE() \
      DECONE(0); \
      DECONE(1); \
      DECONE(2)

   RR_COMPILER_ASSERT( NEWLZ_HUFF_CODELEN_LIMIT <= 12 );
   #define N_DECS_PER_REFILL        2
   #define TRIPLE_DECS_PER_REFILL   (3*N_DECS_PER_REFILL)

   // bulk loop to get within 4B of end
   if (in1 - in2 >= 4 && decodeend - decodeptr >= TRIPLE_DECS_PER_REFILL)
   {
      in1 -= 4;
      decodeend -= TRIPLE_DECS_PER_REFILL-1;

      // Runs while output remains and the non-crossing invariant
      // (in0 <= in2 && in2 <= in1) holds.
      // Non-crossing and 4B access size guarantee that the reads below are
      // safe; the decodeend decrement above guarantees that we don't write
      // out of bounds.
      while (decodeptr < decodeend && in0 <= in2 && in2 <= in1)
      {
         // refill :
         bits0 |= RR_GET32_LE(in0) << bitc0;
         in0 += (31 - bitc0)>>3; // bytes_consumed
         bitc0 |= 24;            // same as += bytes_consumed<<3 here!

         bits1 |= RR_GET32_BE(in1) << bitc1;
         in1 -= (31 - bitc1)>>3; // bytes_consumed
         bitc1 |= 24;            // same as += bytes_consumed<<3 here!

         bits2 |= RR_GET32_LE(in2) << bitc2;
         in2 += (31 - bitc2)>>3; // bytes_consumed
         bitc2 |= 24;            // same as += bytes_consumed<<3 here!

         RR_COMPILER_ASSERT( N_DECS_PER_REFILL == 2 );
         DECTHREE();
         DECTHREE();
      }

      // undo the biases applied before the loop
      decodeend += TRIPLE_DECS_PER_REFILL-1;
      in1 += 4;

      // transition to final loop: hand whole unread bytes back to the
      // streams so that fewer than 8 bits stay buffered
      in0 -= (bitc0 >> 3); bitc0 &= 7;
      in1 += (bitc1 >> 3); bitc1 &= 7;
      in2 -= (bitc2 >> 3); bitc2 &= 7;
   }

   // Final loop. This is really careful about the bytes it accesses.
   while (decodeptr < decodeend)
   {
      // refill to >=16b in bit0 buf
      if (in2 - in0 > 1)
         bits0 |= RR_GET16_LE(in0) << bitc0;
      else if (in2 - in0 == 1)
         bits0 |= in0[0] << bitc0;

      DECONE(0);
      in0 += (7 - bitc0) >> 3;
      bitc0 &= 7;

      if (decodeptr >= decodeend)
         break;

      // refill to >=16b left in bit1, bit2 bufs
      if (in1 - in2 > 1)
      {
         bits1 |= RR_GET16_BE(in1 - 2) << bitc1;
         bits2 |= RR_GET16_LE(in2) << bitc2;
      }
      else if (in1 - in2 == 1)
      {
         // accessing the same byte!
         bits1 |= in2[0] << bitc1;
         bits2 |= in2[0] << bitc2;
      }

      DECONE(1);
      in1 -= (7 - bitc1) >> 3;
      bitc1 &= 7;

      if (decodeptr >= decodeend)
         break;

      DECONE(2);
      in2 += (7 - bitc2) >> 3;
      bitc2 &= 7;

      if (in0 > in2 || in2 > in1) // corruption check
         return false;
   }

   if (decodeptr != decodeend)
      return false;

   #undef DECONE
   #undef DECTHREE
   #undef N_DECS_PER_REFILL
   #undef TRIPLE_DECS_PER_REFILL

   // publish the updated stream state
   s->bitp[0] = in0; s->bits[0] = bits0; s->bitc[0] = bitc0;
   s->bitp[1] = in1; s->bits[1] = bits1; s->bitc[1] = bitc1;
   s->bitp[2] = in2; s->bits[2] = bits2; s->bitc[2] = bitc2;

   return true;
}
|
|
|
|
#if defined(__RAD64REGS__)
// 64-bit bulk front end for the three-stream huffman decoder. Decodes whole
// groups of TRIPLE_DECS_PER_REFILL symbols with 64-bit bit buffers while there
// is enough output space and the streams are far enough apart, then converts
// its state back to the byte-granular form and lets
// huff_decode_precise_finish() decode the remainder.
// Returns the result of huff_decode_precise_finish().
static rrbool huff_decode_inner64(rada_internal_huff_state * s, radaudio_huffman *huff)
{
   // Layout: strm0-> | strm2-> | <-strm1
   const U8 * in0 = s->bitp[0];
   const U8 * in1 = s->bitp[1];
   const U8 * in2 = s->bitp[2];

   U8 * decodeptr = s->decodeptr;
   U8 * decodeend = s->decodeend;

   // NEWLZ_HUFF_CODELEN_LIMIT == 11 , could actually do 5 per refill = 10 per loop
   #if (56/NEWLZ_HUFF_CODELEN_LIMIT) >= 5
   #define N_DECS_PER_REFILL 5
   #elif (56/NEWLZ_HUFF_CODELEN_LIMIT) >= 4
   #define N_DECS_PER_REFILL 4
   #else
   #define N_DECS_PER_REFILL 3
   #endif
   #define TRIPLE_DECS_PER_REFILL (3*N_DECS_PER_REFILL)

   // bulk loop
   if (decodeend - decodeptr > TRIPLE_DECS_PER_REFILL-1 && in1 - in2 > 8) // @TODO: maybe test for going outside the buffer instead of this, since this might be true too often
   {
      // offset the end marker so we only run with full groups left
      decodeend -= TRIPLE_DECS_PER_REFILL-1;
      in1 -= 8;

      U64 bits0 = s->bits[0], bitcount0 = s->bitc[0];
      U64 bits1 = s->bits[1], bitcount1 = s->bitc[1];
      U64 bits2 = s->bits[2], bitcount2 = s->bitc[2];
      const U8 *hufftab_base = &huff->decode[0].length;
      U32 tabv;

      // Decode one symbol from stream 'strm'. The 16-bit table entry is
      // loaded in one go: its low byte is used as the code length and its
      // high byte as the symbol. The whole entry is subtracted from the bit
      // count, which is why the counts are re-masked after each group.
      #define DECONE(strm) \
         /* NOTE(fg): This address calc is a single UBFIZ */ \
         tabv = (bits##strm & NEWLZ_HUFF_DECODE_TABLE_MASK) * sizeof(radaudio_huff_symbol); \
         tabv = RR_GET16_LE((const U16 *) (hufftab_base + tabv)); \
         bits##strm >>= tabv & 63; bitcount##strm -= tabv; \
         *decodeptr++ = (U8) (tabv >> 8)

      #define DECTHREE() \
         DECONE(0); \
         DECONE(1); \
         DECONE(2)

      // Runs while output remains and the non-crossing invariant
      // (in0 <= in2 && in2 <= in1) holds.
      while (decodeptr < decodeend && in0 <= in2 && in2 <= in1)
      {
         // refill :
         bits0 |= RR_GET64_LE(in0) << bitcount0;
         in0 += (63 - bitcount0)>>3; // bytes_consumed
         bitcount0 |= 56;            // same as += bytes_consumed<<3 here!

         bits1 |= RR_GET64_BE(in1) << bitcount1;
         in1 -= (63 - bitcount1)>>3; // bytes_consumed
         bitcount1 |= 56;            // same as += bytes_consumed<<3 here!

         bits2 |= RR_GET64_LE(in2) << bitcount2;
         in2 += (63 - bitcount2)>>3; // bytes_consumed
         bitcount2 |= 56;            // same as += bytes_consumed<<3 here!

         RR_COMPILER_ASSERT( N_DECS_PER_REFILL >= 3 && N_DECS_PER_REFILL <= 5 );
         DECTHREE();
         DECTHREE();
         DECTHREE();
         #if N_DECS_PER_REFILL > 3
         DECTHREE();
         #endif
         #if N_DECS_PER_REFILL > 4
         DECTHREE();
         #endif

         // our decode process puts some crap in the top bits; clear them
         bitcount0 &= 63;
         bitcount1 &= 63;
         bitcount2 &= 63;
      }
      #undef DECONE
      #undef DECTHREE

      in1 += 8;

      // transition to careful loop: give whole unread bytes back to the
      // streams and keep only the sub-byte remainder buffered
      s->decodeptr = decodeptr;
      s->bitp[0] = in0 - (bitcount0 >> 3); s->bits[0] = (U32) (bits0 & 0xff); s->bitc[0] = bitcount0 & 7;
      s->bitp[1] = in1 + (bitcount1 >> 3); s->bits[1] = (U32) (bits1 & 0xff); s->bitc[1] = bitcount1 & 7;
      s->bitp[2] = in2 - (bitcount2 >> 3); s->bits[2] = (U32) (bits2 & 0xff); s->bitc[2] = bitcount2 & 7;
   }

   #undef N_DECS_PER_REFILL
   #undef TRIPLE_DECS_PER_REFILL

   return huff_decode_precise_finish(s, huff);
}
#endif
|
|
|
|
// Huffman-decode 'length' bytes into 'array' from the three streams in 'ds'.
// The bit-position based stream state is converted to the byte pointer + bit
// buffer form used by the inner decoders and converted back afterwards.
// *error is set to 1 if the inner decoder reports failure; the stream
// positions are written back in either case.
static void decode_huff_array(huff3_decoder *ds, radaudio_huffman *huff, U8 *array, int length, int *error)
{
   rada_internal_huff_state s;
   rrbool ok;

   s.decodeptr = array;
   s.decodeend = array+length;

   // generate Huff3 decoder state from our naive state

   // forward streams (0 and 2): any partially consumed byte is moved into the
   // bit buffer and the byte pointer steps past it
   for (int i=0; i <= 2; i += 2) {
      rada_bit_decoder *fwd = &ds->stream[i];
      const U8 *next = &fwd->bitstream[fwd->read_pos_in_bits>>3];
      U32 leftover = (0-fwd->read_pos_in_bits) & 7; // read pos of 2 => 6 bits left

      s.bitc[i] = leftover;
      s.bits[i] = 0;
      if (leftover != 0) {
         s.bits[i] = *next >> (8-leftover);
         ++next;
      }
      s.bitp[i] = next;
   }

   // backward stream (1): same idea, mirrored; the byte just below the
   // pointer is the next one to be read
   {
      rada_bit_decoder *bwd = &ds->stream[1];
      const U8 *next = &bwd->bitstream[-(int)(bwd->read_pos_in_bits>>3)];
      U32 leftover = (0-bwd->read_pos_in_bits) & 7; // read pos of 2 => 6 bits left

      s.bitc[1] = leftover;
      s.bits[1] = 0;
      if (leftover != 0) {
         --next;
         s.bits[1] = *next >> (8-leftover);
      }
      s.bitp[1] = next;
   }

   #ifdef __RAD64REGS__
   ok = huff_decode_inner64(&s, huff);
   #else
   ok = huff_decode_precise_finish(&s, huff);
   #endif
   if (!ok)
      *error = 1;

   // convert back: bits still sitting in a bit buffer have not been consumed
   ds->stream[0].read_pos_in_bits = (int) (8*(s.bitp[0] - ds->stream[0].bitstream) - s.bitc[0]);
   ds->stream[2].read_pos_in_bits = (int) (8*(s.bitp[2] - ds->stream[2].bitstream) - s.bitc[2]);
   ds->stream[1].read_pos_in_bits = (int) (8*(ds->stream[1].bitstream - s.bitp[1]) - s.bitc[1]);
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
// Windowed overlap-add of the current block (float, fwd_data) with the saved
// samples of the previous block (S16, rev_data + revoff, scaled by rev_scale),
// writing n output samples. n must be a multiple of 64.
// revlen, block_number, channel and stream_offset are not used by this body.
static void compute_windowed_sum_multiple64(radaudio_decoder_state *ds, float *output, int n,
                                            const float *fwd_data, S16 *rev_data, int revlen, int revoff, float rev_scale,
                                            const float *window, int block_number, int channel, int stream_offset)
{
   rrAssert(n % 64 == 0);

   // Starting point:
   //   output[0:n] = fwd_c[0:n] .* window[0:n] + rev_scale * rev_c[revoff:revoff+n] .* reverse(window[0:n])
   //
   // let n2 = n/2, then the IMDCT symmetries mean that (when both blocks have same length, I'll account for revoff later)
   //
   //   fwd_c[0:n2] = -reverse(fwd_c[n2:n])
   //   rev_c[n2:n] =  reverse(rev_c[0:n2])
   //
   // and therefore we can work with just the middle samples (i.e. the back half of fwd_c and the front
   // half of rev_c). To exploit this systematically, split the loop into two halves at n2:
   //
   //   output[0:n2] = fwd_c[0:n2] .* window[0:n2] + rev_scale * rev_c[revoff+0:revoff+n2] .* reverse(window[n2:n])
   //   output[n2:n] = fwd_c[n2:n] .* window[n2:n] + rev_scale * rev_c[revoff+n2:revoff+n] .* reverse(window[0:n2])
   //
   // note rev_c is symmetric about revoff+n2, so rev_c[revoff+n2:revoff+n] = reverse(rev_c[revoff+0:revoff+n2]).
   // (This is the second symmetry, accounting for potential differences in MDCT size.)
   //
   // Define:
   //   fwd[0:n2] = fwd_c[n2:n]
   //   rev[0:n2] = rev_scale * rev_c[revoff:revoff+n2]
   //
   // and then use the symmetries and algebra to get
   //
   //   output[0:n2] = -reverse(fwd) .* window[0:n2] + rev .* reverse(window[n2:n])
   //                = rev .* reverse(window[n2:n]) - reverse(fwd) .* window[0:n2]
   //
   //   output[n2:n] = fwd .* window[n2:n] + reverse(rev) .* reverse(window[0:n2])
   //                = reverse(rev .* window[0:n2]) + fwd .* window[n2:n]
   const float *fwd = fwd_data; // NOTE: second half of the forward data, first half is implied by odd symmetry
   S16 *rev = rev_data + revoff;

#if defined(DO_BUILD_SSE4) || defined(DO_BUILD_AVX2)
   if (ds->cpu.has_sse2) {
      #ifdef DO_BUILD_AVX2
      if (ds->cpu.has_avx2)
         radaudio_avx2_compute_windowed_sum_multiple16(output, n, fwd, rev, rev_scale, window);
      else
      #endif
         radaudio_sse2_compute_windowed_sum_multiple8(output, n, fwd, rev, rev_scale, window);
      return;
   }
#endif

#if defined(DO_BUILD_NEON)

   radaudio_neon_compute_windowed_sum_multiple8(output, n, fwd, rev, rev_scale, window);

#else

   // scalar reference implementation of the two formulas derived above
   const SINTa half = n >> 1;

   // first half: rev .* reverse(window[n2:n]) - reverse(fwd) .* window[0:n2]
   for (SINTa j = 0; j < half; ++j)
      output[j] = rev_scale * rev[j] * window[n-1-j] - fwd[half-1-j] * window[j];

   // second half: reverse(rev .* window[0:n2]) + fwd .* window[n2:n]
   for (SINTa j = 0; j < half; ++j)
      output[half+j] = rev_scale * rev[half-1-j] * window[half-1-j] + fwd[j] * window[half+j];

#endif
}
|
|
|
|
// Copy n float samples from input to output; the regions must not overlap
// (memcpy). n must be a multiple of 16.
static void copy_samples_multiple16(float *output, int n, const float *input)
{
   rrAssert(n % 16 == 0);
   // potentially rely on it being aligned
   // byte count is computed in size_t from the element size rather than a
   // hard-coded 4 multiplied in int
   memcpy(output, input, (size_t) n * sizeof(*output));
}
|
|
|
|
// Convert n S16 samples to float, multiplying each by 'rescale'.
// n must be a multiple of 16; samples are processed in groups of 8.
static void copy_samples_multiple16_scaled(float *output, int n, const S16 *input, float rescale)
{
   rrAssert(n % 16 == 0);
   for (int base=0; base < n; base += 8) {
      for (int k=0; k < 8; ++k)
         output[base+k] = input[base+k] * rescale;
   }
}
|
|
|
|
// Expand one 32-bit seed into four generator states:
//   [0] the seed, [1] a multiplicative hash of seed+5000,
//   [2] and [3] the first two XORed with 0x55555555.
static void build_rand_state(U32 *rand_state, U32 randval)
{
   const U32 flip   = 0x55555555;
   const U32 offset = randval + 5000;   // wraps in 32 bits before the 64-bit multiply
   const U32 hashed = (U32) ((offset * (U64) 0xc4ceb9fe1a85ec53ULL) >> 33);

   rand_state[0] = randval;
   rand_state[1] = hashed;
   rand_state[2] = randval ^ flip;
   rand_state[3] = hashed ^ flip;
}
|
|
|
|
// Fill four consecutive coefficients with random nonzero values.
// Takes one table index from the low nibble of each byte of (*state >> 4),
// then advances *state with lcg().
static void rada_randomize_fill4(S8 *dest, U32 *state, const S8 *table)
{
   U32 rbits = *state >> 4;
   *state = lcg(*state);
   for (int i=0; i < 4; ++i) {
      dest[i] = table[rbits & 15];
      rbits >>= 8;
   }
}

// Replace every all-zero subband of a long block with random nonzero values.
// Subbands are walked in order: leading subbands of 4 coefficients, then
// subbands of 8, and every remaining subband is treated as 16 coefficients.
//   quantized_coeff     - coefficients, modified in place
//   randval             - seed for the four generator states
//   num_subbands        - number of subbands to process
//   num_coeffs_for_band - coefficient count per subband (only 4 and 8 are tested)
static void randomize_long_block_8x8_Nx16(radaudio_decoder_state *ds, S8 *quantized_coeff, U32 randval, int num_subbands, int *num_coeffs_for_band)
{
   RAD_ALIGN(U32, rand_state[4], 16);
   build_rand_state(rand_state, randval);

   static const S8 random_table[16] = { -1,1, -2,2, -3,3, -4,4, -5,5, -6,6, -7,7, -8,8 };

   // SIMD: compute 4 independent randvals in parallel... the encoder doesn't care what the random
   // values are, so they should be stable, but don't have to be the same as the current code
   #ifdef DO_BUILD_SSE4
   if (ds->cpu.has_sse4_1) {
      radaudio_sse4_randomize_long_block_8x8_Nx16(quantized_coeff, rand_state, num_subbands);
      return;
   }
   #endif

   int j  = 0;   // subband index
   int cb = 0;   // index of the subband's first coefficient

   // 4-coefficient subbands: test exactly the 4 bytes of this subband
   // (a 64-bit read would also test the following subband)
   for (; j < num_subbands && num_coeffs_for_band[j] == 4; ++j, cb += 4) {
      if (RR_GET32_NATIVE(&quantized_coeff[cb]) == 0)
         rada_randomize_fill4(&quantized_coeff[cb], &rand_state[0], random_table);
   }

   // 8-coefficient subbands: one 64-bit zero test, two generators
   for (; j < num_subbands && num_coeffs_for_band[j] == 8; ++j, cb += 8) {
      if (RR_GET64_NATIVE(&quantized_coeff[cb]) == 0) {
         rada_randomize_fill4(&quantized_coeff[cb+0], &rand_state[0], random_table);
         rada_randomize_fill4(&quantized_coeff[cb+4], &rand_state[1], random_table);
      }
   }

   // remaining subbands are 16 coefficients: all four generators, in order
   for (; j < num_subbands; ++j, cb += 16) {
      if ((RR_GET64_NATIVE(&quantized_coeff[cb+0]) | RR_GET64_NATIVE(&quantized_coeff[cb+8])) == 0) {
         for (int lane=0; lane < 4; ++lane)
            rada_randomize_fill4(&quantized_coeff[cb + 4*lane], &rand_state[lane], random_table);
      }
   }
}
|
|
|
|
// Replace every all-zero band of a short block with random nonzero values.
// The first 13 bands have a fixed layout (4 bands of 1, 4 bands of 2 and
// 5 bands of 4 coefficients); the sizes of the later bands come from
// num_coeffs_for_band[].
static void randomize_short_block(S8 quantized_coeff[], U32 randval, int num_bands, int *num_coeffs_for_band)
{
   static S8 random_table[16] = { -1,1, -2,2, -3,3, -4,4, -5,5, -6,6, -7,7, -8,8 };

   U32 rbits = randval >> 10;
   randval = lcg(randval);

   // Bands 0..3 are 1 coefficient; only the first two table entries (-1/+1) are used
   for (int j=0; j < 4; ++j, rbits >>= 4) {
      if (quantized_coeff[j] == 0)
         quantized_coeff[j] = random_table[rbits & 1];
   }

   // Bands 4..7 are 2 coefficients each
   for (int j=4; j < 8; ++j) {
      S8 *pair = &quantized_coeff[j*2-4];
      if (RR_GET16_LE_UNALIGNED(pair) == 0) {
         rbits = randval >> 20;
         randval = lcg(randval);
         pair[0] = random_table[rbits & 15];
         pair[1] = random_table[(rbits >> 4) & 15];
      }
   }

   // Bands 8..12 are 4 coefficients each
   for (int j=8; j < 13; ++j) {
      S8 *quad = &quantized_coeff[j*4-20];
      if (RR_GET32_LE_UNALIGNED(quad) == 0) {
         rbits = randval;
         randval = lcg(randval);
         quad[0] = random_table[(rbits >> 12) & 15];
         quad[1] = random_table[(rbits >> 16) & 15];
         quad[2] = random_table[(rbits >> 20) & 15];
         quad[3] = random_table[(rbits >> 24) & 15];
      }
   }

   // Remaining bands have 16 or 32 coeffs
   int cb = 4*1 + 4*2 + 5*4;
   for (int j=13; j < num_bands; ++j) {
      const int num = num_coeffs_for_band[j];
      S8 *band = &quantized_coeff[cb];
      U32 any_nonzero = 0;
      int i;

      // OR the whole band together, 8 coefficients per step
      for (i=0; i < num; i += 8) { // should be 16 or 32
         any_nonzero |= RR_GET32_LE_UNALIGNED(&band[i+0]);
         any_nonzero |= RR_GET32_LE_UNALIGNED(&band[i+4]);
      }

      if (any_nonzero == 0) {
         // one random word supplies 8 nibbles = 8 coefficients
         for (i=0; i+7 < num; i += 8) {
            rbits = randval;
            randval = lcg(randval);
            for (int k=0; k < 8; ++k) {
               band[i+k] = random_table[rbits & 15];
               rbits >>= 4;
            }
         }
         // a further random word is always drawn for the leftover (<8) coefficients
         rbits = randval;
         randval = lcg(randval);
         for (; i < num; ++i) {
            band[i] = random_table[rbits & 15];
            rbits >>= 4;
         }
      }
      cb += num;
   }
}
|
|
|
|
// Returns how many of the first num_bytes bytes of 'data' are < threshold.
// Dispatches to the SSE2 or NEON implementation when built in; otherwise a
// plain scalar count is used.
static int count_bytes_below_value_sentinel16(radaudio_decoder_state *ds, U8 *data, int num_bytes, U8 threshold)
{
   #ifdef DO_BUILD_SSE4
   if (ds->cpu.has_sse2)
      return radaudio_sse2_count_bytes_below_value_sentinel16(data, num_bytes, threshold);
   #endif

   #if defined(DO_BUILD_NEON)
   return radaudio_neon_count_bytes_below_value_sentinel16(data, num_bytes, threshold);
   #else
   int below = 0;
   for (int i=0; i < num_bytes; ++i) {
      if (data[i] < threshold)
         ++below;
   }
   return below;
   #endif
}
|
|
|
|
// overwrites up to 7 bytes of space at end of array if not a multiple of 8
|
|
static int count_set_bits_multiple8_sentinel8(radaudio_decoder_state *ds, U8 *data, int num_bytes)
|
|
{
|
|
#ifdef DO_BUILD_SSE4
|
|
if (ds->cpu.has_sse4_1) // @TODO: ds->cpu.has_popcnt
|
|
return radaudio_intel_popcnt_count_set_bits_read_multiple8_sentinel8(data, num_bytes);
|
|
#endif
|
|
#ifdef DO_BUILD_NEON
|
|
return radaudio_neon_count_set_bits_read_multiple8_sentinel8(data, num_bytes);
|
|
#endif
|
|
|
|
#ifndef DO_BUILD_NEON // for unreachable code warnings
|
|
#ifdef __RAD64REGS__
|
|
// 64-bit scalar code
|
|
int num=0;
|
|
int padded_size = (num_bytes + 7) & ~7;
|
|
if (num_bytes != padded_size)
|
|
RR_PUT64_NATIVE(&data[num_bytes], 0);
|
|
for (int i=0; i < padded_size; i += 8) {
|
|
U64 value = RR_GET64_NATIVE(&data[i]);
|
|
value = value - ((value >> 1) & 0x5555555555555555ull); // for pairs of bits: 00->00, 01->01, 10->01, 11->10
|
|
// sums across groups of 2 bits -> sums across groups of 8 bits
|
|
// skipping the groups-of-4 stage to get a wider reduction tree with fewer constants
|
|
U64 threes = 0x0303030303030303ull;
|
|
value = (value & threes) + ((value >> 2) & threes) + ((value >> 4) & threes) + ((value >> 6) & threes);
|
|
// sum the bytes (can't overflow)
|
|
value = (value * 0x0101010101010101ull) >> 56;
|
|
num += (int)value;
|
|
}
|
|
#else
|
|
// 32-bit scalar code
|
|
int num=0;
|
|
int padded_size = (num_bytes + 3) & ~3;
|
|
if (num_bytes != padded_size)
|
|
RR_PUT32_NATIVE(&data[num_bytes], 0);
|
|
for (int i=0; i < padded_size; i += 4) {
|
|
U32 value = RR_GET32_NATIVE(&data[i]);
|
|
value = value - ((value >> 1) & 0x55555555); // for pairs of bits: 00->00, 01->01, 10->01, 11->10
|
|
value = (value & 0x33333333) + ((value>> 2) & 0x33333333);
|
|
value = (value & 0x0f0f0f0f) + ((value>> 4) & 0x0f0f0f0f);
|
|
value = (value * 0x01010101) >> 24;
|
|
num += (int)value;
|
|
}
|
|
#endif
|
|
return num;
|
|
#endif // !DO_BUILD_NEON
|
|
}
|
|
|
|
// guarantees a multiple of 16 bytes is written, with the extra bytes having the value of 1:
|
|
// scalar: reads exact bytes specified, writes an extra 16 bytes of "1"
|
|
// SSE : writes multiple of 32 bytes with extras equal to "1", also writes 16 bytes starting at &packed[num_packed]
|
|
static void unpack_nibbles_input_excess16_output_excess16_multiple32_default1(radaudio_decoder_state *ds, S8 *unpacked, U8 *packed, int num_packed)
|
|
{
|
|
#ifdef DO_BUILD_SSE4
|
|
if (ds->cpu.has_sse2) {
|
|
radaudio_sse2_unpack_nibbles_read_sentinel16_write_multiple32(unpacked, packed, num_packed, 0x1111111111111111ull);
|
|
return;
|
|
}
|
|
#endif
|
|
#ifdef DO_BUILD_NEON
|
|
{
|
|
radaudio_neon_unpack_nibbles_read_sentinel16_write_multiple32(unpacked, packed, num_packed, 0x1111111111111111ull);
|
|
return;
|
|
}
|
|
#endif
|
|
|
|
#ifndef DO_BUILD_NEON // for unreachable code warnings
|
|
for (int i=0; i < num_packed; ++i) {
|
|
unpacked[i*2+0] = (S8) (packed[i] & 15);
|
|
unpacked[i*2+1] = (S8) (packed[i] >> 4);
|
|
}
|
|
|
|
RR_PUT64_NATIVE(&unpacked[num_packed*2+0], 0x0101010101010101ull);
|
|
RR_PUT64_NATIVE(&unpacked[num_packed*2+8], 0x0101010101010101ull);
|
|
#endif // !DO_BUILD_NEON
|
|
}
|
|
|
|
// if coefficient is 0, then read it from the big coefficient array
|
|
// otherwise, remove the +8 bias by subtracting 8
|
|
static rrbool expand_nonzero_coefficients(radaudio_decoder_state *ds, S8 *nonzero_coefficients, int num_nonzero, S8 *big_coeff, S8 *big_limit, S8 *safe_read)
|
|
{
|
|
if (safe_read - big_limit > 15) {
|
|
#ifdef DO_BUILD_SSE4
|
|
if (ds->cpu.has_sse2) {
|
|
return radaudio_sse2_expand_coefficients_excess_read15(nonzero_coefficients, num_nonzero, big_coeff, big_limit);
|
|
}
|
|
#endif
|
|
#ifdef DO_BUILD_NEON
|
|
return radaudio_neon_expand_coefficients_excess_read15(nonzero_coefficients, num_nonzero, big_coeff, big_limit);
|
|
#endif
|
|
}
|
|
|
|
// else fall through to scalar
|
|
|
|
for (int i = 0; i < num_nonzero; ++i) {
|
|
if (nonzero_coefficients[i] == 0) {
|
|
if (big_coeff == big_limit)
|
|
return false; // overread error
|
|
nonzero_coefficients[i] = *big_coeff++;
|
|
} else
|
|
nonzero_coefficients[i] -= 8;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// Compute band_energy[j] for each band from its integer exponent, its
// quantized fine energy and the per-band decode scale:
//    band_energy = fine * coarse * band_scale_decode
// where coarse is 2^(exponent+16), or 0 for BAND_EXPONENT_NONE, and fine is a
// quadratic in the fine-energy fraction that equals 1 at fraction 0.
static void compute_band_energy_multiple4(radaudio_decoder_state *ds, F32 *band_energy, int num_bands, int band_exponent[], U16 fine_energy[], F32 band_scale_decode[])
{
   #ifdef DO_BUILD_SSE4
   if (ds->cpu.has_sse2) {
      radaudio_sse2_compute_band_energy_multiple4(band_energy, num_bands, band_exponent, fine_energy, band_scale_decode);
      return;
   }
   #endif

   #ifdef DO_BUILD_NEON
   radaudio_neon_compute_band_energy_multiple4(band_energy, num_bands, band_exponent, fine_energy, band_scale_decode);
   #else
   for (int j=0; j < num_bands; ++j) { // safe to run 24 times for SIMD
      // quantized energy is in [0, 1<<MAX_FINE_ENERGY_BITS), so the fraction is 0..1
      F32 frac = fine_energy[j] / (float) (1 << MAX_FINE_ENERGY_BITS);
      F32 fine = (0.34375f*frac + 0.65625f)*frac + 1.0f;

      F32 coarse = 0;
      if (band_exponent[j] != BAND_EXPONENT_NONE)
         coarse = (float) (1 << (band_exponent[j] + 16)); // integer_exponent 0 => (1<<30>>14) => 1<<16

      band_energy[j] = (fine * coarse) * band_scale_decode[j];
   }
   #endif
}
|
|
|
|
// Distribute each band's energy over its subbands in proportion to the
// quantized subband values:
//    subband_energy[s] = band_energy[j] * q[s] / sqrt(sum of q^2 over band j)
// Leading bands that consist of a single subband are skipped; their
// subband_energy entries are not written here.
// num_subbands is only used to assert that every subband was visited.
static void compute_subband_energy_skip12_excess_read7(radaudio_decoder_state *ds, F32 *subband_energy, const F32 *band_energy, int num_bands, int num_subbands, int *num_subbands_for_band, U16 *quantized_subbands)
{
   #ifdef DO_BUILD_SSE4
   if (ds->cpu.has_sse4_1) {
      radaudio_sse4_compute_subband_energy_skip12_excess_read7(subband_energy, band_energy, num_bands, num_subbands_for_band, quantized_subbands);
      return;
   }
   #endif

   #ifdef DO_BUILD_NEON
   radaudio_neon_compute_subband_energy_skip12_excess_read7(subband_energy, band_energy, num_bands, num_subbands_for_band, quantized_subbands);
   #else
   // skip the leading single-subband bands; the scan is bounded by num_bands
   // so it cannot run off the table when every band has one subband
   int j = 0;
   while (j < num_bands && num_subbands_for_band[j] == 1)
      ++j;

   // each skipped band owns exactly one subband, so the subband index matches
   int start = j;

   for (; j < num_bands; ++j) {
      int sum=0;
      int num = num_subbands_for_band[j];
      // these loops are pretty random lengths, for example, at 44.1Khz, they're: 2,2,2,2,3,4,9,10,12 iterations
      for (int i=0; i < num; ++i) {
         sum += (quantized_subbands[start+i] * quantized_subbands[start+i]);
      }

      // check the operands before they are used in the division
      rrAssert(!isnan(band_energy[j]));
      rrAssert(sum != 0);
      F32 scale = band_energy[j] / sqrtf((F32) sum);

      for (int i=0; i < num; ++i) {
         subband_energy[start+i] = scale * quantized_subbands[start+i];
      }
      start += num;
   }
   rrAssert(start == num_subbands);
   #endif
}
|
|
|
|
static void distribute_bitflag_coefficients_multiple64(radaudio_decoder_state *ds,
|
|
S8 *quantized_coeff, int num_coeff,
|
|
U8 *nonzero_flagbits,
|
|
S8 *nonzero_coeffs, int *pcur_nonzero_coeffs)
|
|
{
|
|
#ifdef DO_BUILD_SSE4
|
|
if (ds->cpu.has_ssse3) {
|
|
radaudio_ssse3_distribute_bitflag_coefficients_multiple16(
|
|
ds->cpu,
|
|
quantized_coeff, num_coeff,
|
|
nonzero_flagbits,
|
|
nonzero_coeffs, pcur_nonzero_coeffs);
|
|
return;
|
|
}
|
|
#endif
|
|
#ifdef DO_BUILD_NEON
|
|
{
|
|
radaudio_neon_distribute_bitflag_coefficients_multiple16(
|
|
quantized_coeff, num_coeff,
|
|
nonzero_flagbits,
|
|
nonzero_coeffs, pcur_nonzero_coeffs);
|
|
return;
|
|
}
|
|
#endif
|
|
|
|
#ifndef DO_BUILD_NEON // for unreachable code warnings
|
|
int cur_nonzero_coeffs = *pcur_nonzero_coeffs;
|
|
|
|
memset(quantized_coeff, 0, num_coeff);
|
|
|
|
// use a run-length style scheme using bit scans to reduce branch mispredictions
|
|
int pos=0;
|
|
for (int i=0; i < num_coeff; i += 64) {
|
|
U64 flags = RR_GET64_LE(nonzero_flagbits + pos);
|
|
pos += 8;
|
|
|
|
// even though the run is never long--we could just use a small lookup table--let's do it right
|
|
int offset = i;
|
|
while (flags) {
|
|
SINTa dist = rrCtz64(flags);
|
|
quantized_coeff[offset+dist] = nonzero_coeffs[cur_nonzero_coeffs++];
|
|
flags = rrClearLowestSetBit64(flags);
|
|
}
|
|
}
|
|
|
|
*pcur_nonzero_coeffs = cur_nonzero_coeffs;
|
|
#endif // !DO_BUILD_NEON
|
|
}
|
|
|
|
static rrbool distribute_nonzero_coefficients(radaudio_decoder_state *ds,
|
|
S8 *quantized_coeff, int num_coeff32,
|
|
U8 *runlength_data, int *pcur_runlength_data, // there's guaranteed sentinels, so don't need length
|
|
S8 *nonzero_coeffs, int *pcur_nonzero_coeffs,
|
|
U8 *nonzero_flagbits, int num_nonzero_flagbits, int channel)
|
|
{
|
|
RR_UNUSED_VARIABLE(channel);
|
|
SINTa num_coeff = num_coeff32;
|
|
SINTa k=0;
|
|
if (num_nonzero_flagbits) {
|
|
PROF_BEGIN(distribute_bitflag);
|
|
distribute_bitflag_coefficients_multiple64(ds, quantized_coeff, num_nonzero_flagbits, nonzero_flagbits, nonzero_coeffs, pcur_nonzero_coeffs);
|
|
PROF_END(distribute_bitflag);
|
|
k = num_nonzero_flagbits;
|
|
}
|
|
|
|
const U8 *runlens = runlength_data + *pcur_runlength_data;
|
|
const S8 *nzcoeffs = nonzero_coeffs + *pcur_nonzero_coeffs;
|
|
|
|
PROF_BEGIN(distribute_rle);
|
|
memset(quantized_coeff+k, 0, num_coeff-k);
|
|
// tried a branchless version of this using the slot[] logic from above, but saw no gain
|
|
// we put in sentinels that guarantee this loop will see a END_OF_ZERORUN
|
|
for(;;) {
|
|
U8 rl = *runlens++;
|
|
if (rl == END_OF_ZERORUN)
|
|
break;
|
|
k += rl;
|
|
if (rl < MAX_RUNLEN) {
|
|
if (k >= num_coeff)
|
|
return false;
|
|
quantized_coeff[k] = *nzcoeffs++;
|
|
++k;
|
|
}
|
|
}
|
|
|
|
*pcur_runlength_data = (int)(SINTa)(runlens - runlength_data);
|
|
*pcur_nonzero_coeffs = (int)(SINTa)(nzcoeffs - nonzero_coeffs);
|
|
PROF_END(distribute_rle);
|
|
|
|
return true;
|
|
}
|
|
|
|
// Converts the quantized coefficients of a long block to floats, one subband at a time:
// every coefficient is multiplied by subband_energy[sb] / sqrt(sum of squares in the subband),
// with a tiny bias (1e-20) in the sum so an all-zero subband doesn't divide by zero.
// Coefficients past the last subband, up to RADAUDIO_LONG_BLOCK_LEN, are set to 0.
//
// Uses the SSE4.1 implementation when built in and supported by the CPU.
static void dequantize_long_block_8x8_Nx16(radaudio_decoder_state *ds, float *coeffs, S8 *quantized_coeff, float *subband_energy, int num_subbands, int *num_coeffs_for_band)
{
#ifdef DO_BUILD_SSE4
   if (ds->cpu.has_sse4_1) {
      radaudio_sse4_dequantize_long_block_8x8_Nx16(coeffs, quantized_coeff, subband_energy, num_subbands);
      return;
   }
#endif

   int sb   = 0;  // current subband
   int base = 0;  // index of the first coefficient of the current subband

   // leading narrow subbands: the first 8 subbands should be 8 coefficients long
   for (; num_coeffs_for_band[sb] < 16; ++sb) {
      const int width = num_coeffs_for_band[sb];
      F32 sumsq = 1.e-20f;
      for (int i=0; i < width; ++i) {
         F32 q = (F32) quantized_coeff[base+i];
         sumsq += q*q;
      }
      F32 scale = subband_energy[sb] / sqrtf(sumsq);
      // note: always writes 8 values here, while the sum above and the advance below
      // use the table width; the two agree as long as these subbands are 8 wide
      for (int i=0; i < 8; ++i)
         coeffs[base+i] = quantized_coeff[base+i] * scale;
      base += width;
   }

   // all remaining subbands are 16 coefficients long, so we don't have to check bands
   for (; sb < num_subbands; ++sb, base += 16) {
      F32 sumsq = 1.e-20f;
      for (int i=0; i < 16; ++i) {
         F32 q = (F32) quantized_coeff[base+i];
         sumsq += q*q;
      }
      F32 scale = subband_energy[sb] / sqrtf(sumsq);
      for (int i=0; i < 16; ++i)
         coeffs[base+i] = quantized_coeff[base+i] * scale;
   }

   // clear whatever lies beyond the coded coefficients
   while (base < RADAUDIO_LONG_BLOCK_LEN)
      coeffs[base++] = 0;
}
|
|
|
|
// Long-block dequantization combined with random fill, seeded by randval.
//
// The SSE4.1 and NEON builds hand a 4-lane random state (built from randval) to a fused
// "replace 0 with random + dequantize" routine. The scalar build performs the same work
// in two steps: randomize_long_block_8x8_Nx16() followed by dequantize_long_block_8x8_Nx16().
static void dequantize_long_block_with_random_8x8_Nx16(radaudio_decoder_state *ds, F32 *coeffs, S8 *quantized_coeff, F32 *subband_energy, int num_subbands, int *num_coeffs_for_band, U32 randval)
{
#ifdef DO_BUILD_SSE4
   if (ds->cpu.has_sse4_1) {
      RAD_ALIGN(U32, sse_rand_lanes[4], 16);
      build_rand_state(sse_rand_lanes, randval);
      radaudio_sse4_dequantize_long_block_replace_0_with_random_8x8_Nx16(coeffs, quantized_coeff, subband_energy, num_subbands, sse_rand_lanes);
      return;
   }
#endif

#ifdef DO_BUILD_NEON
   {
      RAD_ALIGN(U32, neon_rand_lanes[4], 16);
      build_rand_state(neon_rand_lanes, randval);
      radaudio_neon_dequantize_long_block_replace_0_with_random_8x8_Nx16(coeffs, quantized_coeff, subband_energy, num_subbands, neon_rand_lanes);
   }
#else
   // scalar: randomize in place first, then run the plain dequantizer
   randomize_long_block_8x8_Nx16(ds, quantized_coeff, randval, num_subbands, num_coeffs_for_band);
   dequantize_long_block_8x8_Nx16(ds, coeffs, quantized_coeff, subband_energy, num_subbands, num_coeffs_for_band);
#endif
}
|
|
|
|
// Scalar dequantizer for short blocks. Each band's coefficients are multiplied by
// band_energy[band] / sqrt(sum of squares within the band); the sum carries a 1e-20 bias
// so an all-zero band doesn't divide by zero. Coefficients past the last band, up to
// RADAUDIO_SHORT_BLOCK_LEN, are set to 0.
//
// The band layout is hard-coded: 4 bands of 1 coefficient, 4 bands of 2, 4 bands of 4,
// then the remaining bands with widths taken from num_coeffs_for_band[].
static void scalar_dequantize_short_block(float *coeffs, S8 *quantized_coeff, float *band_energy, int num_bands, int *num_coeffs_for_band)
{
   // bands 0..3 are 1 coefficient long, and that coefficient is always 1 or -1,
   // so no normalization is needed
   for (int b=0; b < 4; ++b) {
      rrAssert(abs(quantized_coeff[b]) == 1);
      coeffs[b] = (F32) quantized_coeff[b] * band_energy[b];
   }

   // bands 4..7 are 2 coefficients long
   for (int b=4; b < 8; ++b) {
      const int pair = 4 + (b-4)*2;
      float x = (F32) quantized_coeff[pair+0];
      float y = (F32) quantized_coeff[pair+1];
      float scale = band_energy[b] / sqrtf(x*x+y*y+1.e-20f);
      coeffs[pair+0] = x*scale;
      coeffs[pair+1] = y*scale;
   }

   // bands 8..11 are 4 coefficients long
   // (actually the band after them is too, but that one goes through the generic loop below)
   int pos = 4*1 + 4*2;
   for (int b=8; b < 12; ++b, pos += 4) {
      float sumsq = 1.e-20f;
      for (int i=0; i < 4; ++i) {
         float q = (F32) quantized_coeff[pos+i];
         sumsq += q*q;
      }
      float scale = band_energy[b] / sqrtf(sumsq);
      for (int i=0; i < 4; ++i)
         coeffs[pos+i] = (F32) quantized_coeff[pos+i] * scale;
   }

   // pos is now 4*1 + 4*2 + 4*4
   // what's left is either [4,16,16,16,32]
   //                    or [4,16,16,32,32] for lower sample rates
   for (int b=12; b < num_bands; ++b) {
      const int count = num_coeffs_for_band[b];
      F32 sumsq = 1.e-20f;

      // processed in groups of 4 coefficients
      for (int i=0; i < count; i += 4) {
         F32 q0 = (F32) quantized_coeff[pos+i+0];
         F32 q1 = (F32) quantized_coeff[pos+i+1];
         F32 q2 = (F32) quantized_coeff[pos+i+2];
         F32 q3 = (F32) quantized_coeff[pos+i+3];
         sumsq += q0*q0;
         sumsq += q1*q1;
         sumsq += q2*q2;
         sumsq += q3*q3;
      }

      F32 scale = band_energy[b] / sqrtf(sumsq);

      for (int i=0; i < count; i += 4) {
         coeffs[pos+i+0] = quantized_coeff[pos+i+0] * scale;
         coeffs[pos+i+1] = quantized_coeff[pos+i+1] * scale;
         coeffs[pos+i+2] = quantized_coeff[pos+i+2] * scale;
         coeffs[pos+i+3] = quantized_coeff[pos+i+3] * scale;
      }

      pos += count;
   }

   // clear whatever lies beyond the coded coefficients
   while (pos < RADAUDIO_SHORT_BLOCK_LEN)
      coeffs[pos++] = 0;
}
|
|
|
|
// Dequantizes a short block, picking the implementation for this build/CPU:
// SSE4.1 when built in and supported, NEON when built for it, otherwise the scalar version.
static void dequantize_short_block(radaudio_decoder_state *ds, float *coeffs, S8 *quantized_coeff, float *band_energy, int num_bands, int *num_coeffs_for_band)
{
#ifdef DO_BUILD_SSE4
   if (ds->cpu.has_sse4_1) {
      radaudio_sse4_dequantize_short_block_sse4(coeffs, quantized_coeff, band_energy, num_bands, num_coeffs_for_band);
      return;
   }
#endif

#ifdef DO_BUILD_NEON
   radaudio_neon_dequantize_short_block(coeffs, quantized_coeff, band_energy, num_bands, num_coeffs_for_band);
#else
   scalar_dequantize_short_block(coeffs, quantized_coeff, band_energy, num_bands, num_coeffs_for_band);
#endif
}
|
|
|
|
// we have to store half the MDCT output to overlap in the next block. this is the largest
|
|
// per-stream memory cost of the decoder. we used to store floats, but now we convert them
|
|
// to S16. This is *pre* windowing, so the quality loss is minimal.
|
|
//
|
|
// sse2 runs at about half speed of original "store as floats" version, but it's about a 2% slowdown
|
|
// overall and we decided it was worth the speed loss in return for halving memory usage
|
|
static float save_overlapping_samples(radaudio_decoder_state *ds, S16 *buffer, const float *data, int num)
|
|
{
|
|
// the profile wrapper is external to this, under the name "copy"
|
|
|
|
rrAssert(num % 64 == 0);
|
|
|
|
num /= 2;
|
|
|
|
#if defined(DO_BUILD_SSE4) || defined(DO_BUILD_AVX2)
|
|
if (ds->cpu.has_sse2) {
|
|
#ifdef DO_BUILD_AVX2
|
|
if (ds->cpu.has_avx2) {
|
|
return radaudio_avx2_save_samples(buffer, data, num);
|
|
}
|
|
#endif
|
|
return radaudio_sse2_save_samples(buffer, data, num);
|
|
}
|
|
#endif
|
|
|
|
#ifdef DO_BUILD_NEON
|
|
return radaudio_neon_save_samples(buffer, data, num);
|
|
#endif
|
|
|
|
#ifndef DO_BUILD_NEON // for unreachable code warnings
|
|
|
|
#define FAST_FLOAT_TO_INT // best solution i've found for x64
|
|
|
|
#if 0
|
|
// naive implementation for reference, but floor() is unacceptably slow
|
|
// doubles total decode time on test platform; round() and rint() were worse
|
|
// also, doesn't round to nearest even like SSE path
|
|
// most files in fnaudio get different results, is it buggy? there's no way this can just be from tie-breaking 0.5?!?
|
|
float largest0 = 1.0f;
|
|
float largest1 = 1.0f;
|
|
float scale = 32767.0f;
|
|
for (int i=0; i < num; i += 2) {
|
|
F32 d0 = data[i+0];
|
|
F32 d1 = data[i+1];
|
|
buffer[i+0] = (S16) floorf(d0 * scale + 0.5f);
|
|
buffer[i+1] = (S16) floorf(d1 * scale + 0.5f);
|
|
F32 a0 = fabsf(d0);
|
|
F32 a1 = fabsf(d1);
|
|
largest0 = RR_MAX(largest0, a0);
|
|
largest1 = RR_MAX(largest1, a1);
|
|
}
|
|
float largest = RR_MAX(largest0,largest1);
|
|
if (largest > 1.0f) {
|
|
scale = 32767.0f / largest;
|
|
for (int i=0; i < num; i += 2) {
|
|
buffer[i+0] = (S16) floorf(data[i+0] * scale + 0.5f);
|
|
buffer[i+1] = (S16) floorf(data[i+1] * scale + 0.5f);
|
|
}
|
|
}
|
|
return 1.0f / scale;
|
|
|
|
#elif defined(FAST_FLOAT_TO_INT)
|
|
// this should round correctly
|
|
// bithack float-to-int
|
|
|
|
typedef union {
|
|
F32 f;
|
|
S32 i;
|
|
} float_conv;
|
|
|
|
float_conv temp0,temp1,temp2,temp3;
|
|
// add (1<<23) to convert to int, then divide by 2^SHIFT, then add 0.5/2^SHIFT to round
|
|
#define MAGIC(SHIFT) (1.5f * (1 << (23-SHIFT)) + 0.5f/(1 << SHIFT))
|
|
#define ADDEND(SHIFT) (((150-SHIFT) << 23) + (1 << 22))
|
|
#define FAST_SCALED_FLOAT_TO_INT(temp,x,s) (temp.f = (x) + MAGIC(s), temp.i - ADDEND(s))
|
|
|
|
float largest0 = 1.0f;
|
|
float largest1 = 1.0f;
|
|
float largest2 = 1.0f;
|
|
float largest3 = 1.0f;
|
|
float scale = 32767.0f;
|
|
for (int i=0; i < num; i += 4) {
|
|
F32 d0 = data[i+0];
|
|
F32 d1 = data[i+1];
|
|
F32 d2 = data[i+2];
|
|
F32 d3 = data[i+3];
|
|
F32 a0 = fabsf(d0);
|
|
F32 a1 = fabsf(d1);
|
|
F32 a2 = fabsf(d2);
|
|
F32 a3 = fabsf(d3);
|
|
buffer[i+0] = (S16) FAST_SCALED_FLOAT_TO_INT(temp0, d0 * scale, 0);
|
|
buffer[i+1] = (S16) FAST_SCALED_FLOAT_TO_INT(temp1, d1 * scale, 0);
|
|
buffer[i+2] = (S16) FAST_SCALED_FLOAT_TO_INT(temp2, d2 * scale, 0);
|
|
buffer[i+3] = (S16) FAST_SCALED_FLOAT_TO_INT(temp3, d3 * scale, 0);
|
|
largest0 = RR_MAX(largest0, a0);
|
|
largest1 = RR_MAX(largest1, a1);
|
|
largest2 = RR_MAX(largest2, a2);
|
|
largest3 = RR_MAX(largest3, a3);
|
|
}
|
|
float largest01 = RR_MAX(largest0,largest1);
|
|
float largest23 = RR_MAX(largest2,largest3);
|
|
float largest = RR_MAX(largest01,largest23);
|
|
if (largest > 1.0f) {
|
|
scale = 32767.0f / largest;
|
|
for (int i=0; i < num; i += 4) {
|
|
buffer[i+0] = (S16) FAST_SCALED_FLOAT_TO_INT(temp0, data[i+0] * scale, 0);
|
|
buffer[i+1] = (S16) FAST_SCALED_FLOAT_TO_INT(temp0, data[i+1] * scale, 0);
|
|
buffer[i+2] = (S16) FAST_SCALED_FLOAT_TO_INT(temp0, data[i+2] * scale, 0);
|
|
buffer[i+3] = (S16) FAST_SCALED_FLOAT_TO_INT(temp0, data[i+3] * scale, 0);
|
|
}
|
|
}
|
|
return 1.0f / scale;
|
|
|
|
#else
|
|
|
|
// 30% slower than FAST_FLOAT_TO_INT on x64
|
|
|
|
// we want to use the equivalent of floor() so we can round.
|
|
// if we use fixed-point, right-shifting two's complement values is floor.
|
|
// though we might get compiler warnings about signed shifts
|
|
// problem: this doesn't produce the exact same results as other methods
|
|
// most files in fnaudio get different results
|
|
|
|
#define TRUNC_SHIFT 15
|
|
|
|
float largest0 = 1.0f;
|
|
float largest1 = 1.0f;
|
|
float largest2 = 1.0f;
|
|
float largest3 = 1.0f;
|
|
float scale = 32767.0f;
|
|
for (int i=0; i < num; i += 4) {
|
|
F32 d0 = data[i+0];
|
|
F32 d1 = data[i+1];
|
|
F32 d2 = data[i+2];
|
|
F32 d3 = data[i+3];
|
|
F32 a0 = fabsf(d0);
|
|
F32 a1 = fabsf(d1);
|
|
F32 a2 = fabsf(d2);
|
|
F32 a3 = fabsf(d3);
|
|
S32 i0 = (S32) (d0 * scale * (1 << TRUNC_SHIFT));
|
|
S32 i1 = (S32) (d1 * scale * (1 << TRUNC_SHIFT));
|
|
S32 i2 = (S32) (d2 * scale * (1 << TRUNC_SHIFT));
|
|
S32 i3 = (S32) (d3 * scale * (1 << TRUNC_SHIFT));
|
|
buffer[i+0] = (S16) ((i0 + (1 << (TRUNC_SHIFT-1))) >> TRUNC_SHIFT);
|
|
buffer[i+1] = (S16) ((i1 + (1 << (TRUNC_SHIFT-1))) >> TRUNC_SHIFT);
|
|
buffer[i+2] = (S16) ((i2 + (1 << (TRUNC_SHIFT-1))) >> TRUNC_SHIFT);
|
|
buffer[i+3] = (S16) ((i3 + (1 << (TRUNC_SHIFT-1))) >> TRUNC_SHIFT);
|
|
largest0 = RR_MAX(largest0, a0);
|
|
largest1 = RR_MAX(largest1, a1);
|
|
largest2 = RR_MAX(largest2, a2);
|
|
largest3 = RR_MAX(largest3, a3);
|
|
}
|
|
float largest01 = RR_MAX(largest0,largest1);
|
|
float largest23 = RR_MAX(largest2,largest3);
|
|
float largest = RR_MAX(largest01,largest23);
|
|
if (largest > 1.0f) {
|
|
scale = 32767.0f / largest;
|
|
for (int i=0; i < num; i += 4) {
|
|
S32 i0 = (S32) (data[i+0] * scale * (1 << TRUNC_SHIFT));
|
|
S32 i1 = (S32) (data[i+1] * scale * (1 << TRUNC_SHIFT));
|
|
S32 i2 = (S32) (data[i+2] * scale * (1 << TRUNC_SHIFT));
|
|
S32 i3 = (S32) (data[i+3] * scale * (1 << TRUNC_SHIFT));
|
|
buffer[i+0] = (S16) ((i0 + (1 << (TRUNC_SHIFT-1))) >> TRUNC_SHIFT);
|
|
buffer[i+1] = (S16) ((i1 + (1 << (TRUNC_SHIFT-1))) >> TRUNC_SHIFT);
|
|
buffer[i+2] = (S16) ((i2 + (1 << (TRUNC_SHIFT-1))) >> TRUNC_SHIFT);
|
|
buffer[i+3] = (S16) ((i3 + (1 << (TRUNC_SHIFT-1))) >> TRUNC_SHIFT);
|
|
}
|
|
}
|
|
return 1.0f / scale;
|
|
|
|
#endif
|
|
|
|
#endif // !DO_BUILD_NEON
|
|
}
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////
|
|
|
|
// NOTE(review): presumably the byte alignment used when laying out decoder memory;
// it is not referenced in this part of the file -- confirm at its use sites.
static const size_t decoder_align = 32;
|
|
|
|
// We fill all-0 subbands with random noise (that's scaled to the
|
|
// appropriate subband energy).
|
|
//
|
|
// We need a mechanism where SSE2 can generate random data very fast,
|
|
// but it doesn't hobble the scalar pass. so e.g. SSE2 can generate
|
|
// 4 LCG steps in parallel, 16..24 bits of useful data in each one.
|
|
// but doing this on scalar might be slow. for a single 16-item subband,
|
|
// we need 4*16 = 64 bits of randomness to use our 4-bit random_table[]
|
|
// (which we can lookup with pshufb).
|
|
//
|
|
// scalar: old code did 2 32-bit LCGs per subband, but only if the subband was zero.
|
|
// Optimized-for-SIMD code might prefer to be branchless and do an LCG on
|
|
// every subband even if non-zero, and there are ~80 subbands. But it turns
|
|
// out we take a branch anyway in the SSE code, so executive decision to stick
|
|
// to a design where we only update LCGs if subband was zero. (Note that if
|
|
// we didn't care about identical decoding across platforms, you could use
|
|
// whatever random method was optimal for each platform).
|
|
|
|
static void decode_channel_before_imdct(radaudio_decoder_state *ds,
|
|
radaudio_block_data *bd,
|
|
int channel,
|
|
U32 rand_seed,
|
|
U16 fine_energy[])
|
|
{
|
|
radaudio_rate_info *info;
|
|
|
|
RAD_ALIGN(F32, band_energy[24+16], 16) = { 0 }; // must be a multiple of 4
|
|
RAD_ALIGN(F32, subband_energy[MAX_SUBBANDS+16], 16) = { 0 };
|
|
|
|
int is_short_block = ds->current_block_short;
|
|
info = ds->info[is_short_block];
|
|
|
|
PROF_BEGIN(compute_band_energy);
|
|
// compute band energy
|
|
compute_band_energy_multiple4(ds, band_energy, info->num_bands, bd->band_exponent, fine_energy, info->band_scale_decode);
|
|
PROF_END(compute_band_energy);
|
|
|
|
// compute subband energy
|
|
if (!is_short_block) {
|
|
PROF_BEGIN(compute_subband_energy);
|
|
// first bands are shorter than a full subband, so treat those specially
|
|
int j;
|
|
for (j=0; info->num_subbands_for_band[j] == 1; ++j)
|
|
subband_energy[j] = band_energy[j];
|
|
|
|
compute_subband_energy_skip12_excess_read7(ds, subband_energy, band_energy, info->num_bands, info->num_subbands, info->num_subbands_for_band, bd->quantized_subbands);
|
|
|
|
for (j=0; j < info->num_subbands; ++j)
|
|
rrAssert(!isnan(subband_energy[j]));
|
|
|
|
PROF_END(compute_subband_energy);
|
|
}
|
|
|
|
// spread out adjacent blocks to be less similar
|
|
U32 randval = (rand_seed + (rand_seed >> 5)) * 0x27d4eb2d;
|
|
randval = lcg(randval);
|
|
|
|
F32 *coeffs = bd->dequantized_coeff_decode;
|
|
|
|
if (!is_short_block) {
|
|
for (int j=0; j < info->num_subbands; ++j)
|
|
rrAssert(info->num_coeffs_for_subband[j] == (j < 8 ? 8 : 16));
|
|
|
|
PROF_BEGIN(unquantize);
|
|
dequantize_long_block_with_random_8x8_Nx16(ds, coeffs, bd->quantized_coeff_decode, subband_energy, info->num_subbands, info->num_coeffs_for_band, randval);
|
|
PROF_END(unquantize);
|
|
} else {
|
|
PROF_BEGIN(randomize);
|
|
randomize_short_block(bd->quantized_coeff_decode, randval, info->num_bands, info->num_coeffs_for_band);
|
|
PROF_END(randomize);
|
|
|
|
PROF_BEGIN(unquantize);
|
|
dequantize_short_block(ds, coeffs, bd->quantized_coeff_decode, band_energy, info->num_bands, info->num_coeffs_for_band);
|
|
PROF_END(unquantize);
|
|
}
|
|
}
|
|
|
|
#if 0
// Disabled reference version of decode_channel_before_imdct(), kept for comparison.
//
// It performs the same steps (band energy, subband energy for long blocks, randomization,
// dequantization), but the dequantization loop is fully generic: it walks
// info->num_coeffs_for_subband[] and makes no assumptions about how the subbands are
// laid out, and it zero-fills coeffs from info->num_quantized_coeffs to info->num_coeffs.
//
// The call to randomize_long_block_8x8_Nx16() passes 'ds' as its first argument, matching
// the way that helper is called from dequantize_long_block_with_random_8x8_Nx16(), so
// this block compiles again if it is re-enabled.
static void decode_channel_before_imdct_reference(radaudio_decoder_state *ds,
                                                  radaudio_block_data *bd,
                                                  int channel,
                                                  U32 rand_seed,
                                                  U16 fine_energy[])
{
   radaudio_rate_info *info;

   RAD_ALIGN(F32, band_energy[24+16], 16) = { 0 }; // must be a multiple of 4
   RAD_ALIGN(F32, subband_energy[MAX_SUBBANDS+16], 16) = { 0 };

   int is_short_block = ds->current_block_short;
   info = ds->info[is_short_block];

   PROF_BEGIN(compute_band_energy);
   // compute band energy
   compute_band_energy_multiple4(ds, band_energy, info->num_bands, bd->band_exponent, fine_energy, info->band_scale_decode);
   PROF_END(compute_band_energy);

   // compute subband energy
   if (!is_short_block) {
      PROF_BEGIN(compute_subband_energy);
      // first bands are shorter than a full subband, so treat those specially
      int j;
      for (j=0; info->num_subbands_for_band[j] == 1; ++j)
         subband_energy[j] = band_energy[j];

      compute_subband_energy_skip12_excess_read7(ds, subband_energy, band_energy, info->num_bands, info->num_subbands, info->num_subbands_for_band, bd->quantized_subbands);

      for (j=0; j < info->num_subbands; ++j)
         rrAssert(!isnan(subband_energy[j]));

      PROF_END(compute_subband_energy);
   }

   // spread out adjacent blocks to be less similar
   U32 randval = (rand_seed + (rand_seed >> 5)) * 0x27d4eb2d;
   randval = lcg(randval);

   PROF_BEGIN(randomize);
   if (!is_short_block) {
      // replace all-zero coefficient chunks with noise
      for (int j=0; j < info->num_subbands; ++j)
         rrAssert(info->num_coeffs_for_subband[j] == (j < 8 ? 8 : 16));
      randomize_long_block_8x8_Nx16(ds, bd->quantized_coeff_decode, randval, info->num_subbands, info->num_coeffs_for_band);
   } else {
      // replace all-zero coefficient chunks with noise
      randomize_short_block(bd->quantized_coeff_decode, randval, info->num_bands, info->num_coeffs_for_band);
   }
   PROF_END(randomize);

   F32 *coeffs = bd->dequantized_coeff_decode;

   PROF_BEGIN(unquantize);
   // reference implementation doesn't make any assumptions about distribution of subbands
   {
      int start = 0;

      // short blocks use the band energy directly as the subband energy; long blocks
      // only need the leading single-subband bands copied (the rest was computed above)
      if (is_short_block) {
         for (int j=0; j < info->num_bands; ++j)
            subband_energy[j] = band_energy[j];
      } else {
         for (int j=0; j < info->num_bands && info->num_subbands_for_band[j] == 1; ++j)
            subband_energy[j] = band_energy[j];
      }

      for (int j=0; j < info->num_subbands; ++j) {
         int n = info->num_coeffs_for_subband[j];
         F32 sum = 1.0e-30f; // tiny bias so an all-zero subband doesn't divide by zero
         for (int i=0; i < n; ++i) {
            F32 x = (F32) bd->quantized_coeff_decode[start+i];
            sum += x*x;
         }
         F32 scale = subband_energy[j] / sqrtf(sum);
         for (int i=0; i < n; ++i) {
            coeffs[start+i] = (F32) bd->quantized_coeff_decode[start+i] * scale;
         }

         start += n;
      }
      for (int i=info->num_quantized_coeffs; i < info->num_coeffs; ++i)
         coeffs[i] = 0;
   }
   PROF_END(unquantize);
}
#endif
|
|
|
|
// dequantized_coeff_decode[] in bd is overwritten in the process
|
|
static void decode_channel_imdct(radaudio_decoder_state *ds,
|
|
F32 rawdata[MAX_COEFFS], // only max_coeffs because we expand the symmetries later
|
|
radaudio_block_data *bd,
|
|
int channel)
|
|
{
|
|
radaudio_rate_info *info;
|
|
|
|
int is_short_block = ds->current_block_short;
|
|
info = ds->info[is_short_block];
|
|
|
|
F32 *coeffs = bd->dequantized_coeff_decode;
|
|
for (int j=0; j < info->num_coeffs; ++j)
|
|
rrAssert(!isnan(coeffs[j]));
|
|
|
|
PROF_BEGIN(imdct);
|
|
radaudio_imdct_fft_only_middle(ds->cpu, rawdata, coeffs, info->num_coeffs);
|
|
PROF_END(imdct);
|
|
}
|
|
|
|
static int decode_channel_after_imdct(radaudio_decoder_state *ds,
|
|
F32 data1 [MAX_COEFFS],
|
|
F32 data2 [MAX_COEFFS],
|
|
radaudio_block_data *bd,
|
|
int channel,
|
|
F32 *output)
|
|
{
|
|
int result_length;
|
|
radaudio_rate_info *info;
|
|
|
|
int is_short_block = ds->current_block_short;
|
|
info = ds->info[is_short_block];
|
|
|
|
PROF_BEGIN(window);
|
|
if (ds->post_seek)
|
|
// ignore the left side of the first block
|
|
result_length = 0;
|
|
else {
|
|
// use window for whichever is smaller of current block or previous block
|
|
// apply window to pending data
|
|
if (ds->current_block_short == ds->last_block_short) {
|
|
F32 *window = radaudio_windows[ds->current_block_short];
|
|
int len = ds->current_block_short ? RADAUDIO_SHORT_BLOCK_LEN : RADAUDIO_LONG_BLOCK_LEN;
|
|
|
|
compute_windowed_sum_multiple64(ds, output, len,
|
|
data1, ds->prev_block_right_samples[channel], len, 0, ds->restore_scale[channel],
|
|
window, ds->block_number, channel, ds->fully_decoded);
|
|
result_length = len;
|
|
} else {
|
|
F32 *window = radaudio_windows[RADAUDIO_SHORT];
|
|
const int n = RADAUDIO_SHORT_BLOCK_LEN;
|
|
if (is_short_block) {
|
|
// if previous block was long and this is short
|
|
//
|
|
// <-----LONG_BLOCK_LEN---->
|
|
// +-----------+-----------+-----------+-----------+
|
|
// | prev |
|
|
// +-----------+-----------+-----------+-----------+
|
|
// *************** <- output samples
|
|
// -----------------------1111111111WWwww000000000 <- window weights
|
|
// ----------wwWWW
|
|
// +----+----+
|
|
// | cur |
|
|
// +----+----+
|
|
// <---->
|
|
// |
|
|
// SHORT_BLOCK_LEN
|
|
//
|
|
const int len = RADAUDIO_LONG_BLOCK_LEN/2 - RADAUDIO_SHORT_BLOCK_LEN/2;
|
|
copy_samples_multiple16_scaled(output, len, ds->prev_block_right_samples[channel], ds->restore_scale[channel]); // copy samples from previous where the new window is 0 and old window was 1
|
|
compute_windowed_sum_multiple64(ds, output+len, n,
|
|
data1, ds->prev_block_right_samples[channel], RADAUDIO_LONG_BLOCK_LEN, len, ds->restore_scale[channel],
|
|
window, ds->block_number, channel, ds->fully_decoded+len); // sum the part of the previous block that overlaps the left half of the new block
|
|
|
|
result_length = RADAUDIO_LONG_BLOCK_LEN/2 + RADAUDIO_SHORT_BLOCK_LEN/2; // generated (LONG/2 - SHORT/2) + SHORT
|
|
} else {
|
|
// if previous block was short and this is long
|
|
//
|
|
// SHORT_BLOCK_LEN
|
|
// |
|
|
// <---->
|
|
// +----+----+
|
|
// | prev |
|
|
// +----+----+
|
|
// WWwww---------
|
|
// 000000000wwwWW111111111----------------------- <- window weights
|
|
// ************** <- output samples
|
|
// +-----------+-----------+-----------+-----------+
|
|
// | cur |
|
|
// +-----------+-----------+-----------+-----------+
|
|
// <-----LONG_BLOCK_LEN---->
|
|
//
|
|
|
|
const int offset = RADAUDIO_LONG_BLOCK_LEN/2 - RADAUDIO_SHORT_BLOCK_LEN/2;
|
|
compute_windowed_sum_multiple64(ds, output, n, data1, ds->prev_block_right_samples[channel], RADAUDIO_SHORT_BLOCK_LEN, 0, ds->restore_scale[channel], window, ds->block_number, channel, ds->fully_decoded);
|
|
copy_samples_multiple16(output+n, RADAUDIO_LONG_BLOCK_LEN - (offset+n), data1 + n/2);
|
|
|
|
result_length = RADAUDIO_SHORT_BLOCK_LEN/2 + RADAUDIO_LONG_BLOCK_LEN/2;
|
|
}
|
|
}
|
|
}
|
|
PROF_END(window);
|
|
|
|
PROF_BEGIN(copy);
|
|
ds->restore_scale[channel] = save_overlapping_samples(ds, ds->prev_block_right_samples[channel], data2, info->num_coeffs);
|
|
PROF_END(copy);
|
|
|
|
return result_length;
|
|
}
|
|
|
|
// Scans forward from 'cur' for the first byte b with
//     COARSE_RUNLEN_THRESHOLD <= b < MAX_RUNLEN
// and returns a pointer to it. If there is none, the returned pointer is >= end.
//
// The 64-bit path reads 8 bytes at a time, so it may read (and return a position)
// beyond 'end'; the plain byte loop never goes past 'end'.
static U8 *find_next_coarse_run_excess16(radaudio_decoder_state *ds, U8 *cur, U8 *end)
{
#ifdef DO_BUILD_SSE4
   if (ds->cpu.has_sse2) {
      return radaudio_sse2_find_next_coarse_run_excess16(cur, end);
   }
#endif

#ifdef __RAD64REGS__
   RR_COMPILER_ASSERT(COARSE_RUNLEN_THRESHOLD < 128);
   RR_COMPILER_ASSERT(MAX_RUNLEN >= 128);

   const U64 byte_ones   = ~(U64)0 / 255;   // 0x0101...01
   const U64 top_bits    = 0x80 * byte_ones;
   const U64 low7_bits   = ~top_bits;
   const U64 coarse_bias = (128 - COARSE_RUNLEN_THRESHOLD) * byte_ones;
   const U64 max_bias    = (256 - MAX_RUNLEN) * byte_ones;

   for (; cur < end; cur += 8) {
      const U64 chunk = RR_GET64_LE(cur);

      // check if there are any bytes >=COARSE_RUNLEN_THRESHOLD in the 8 bytes we just read.
      // idea: these are bytes that either
      //   1. have low 7 bits >=COARSE_RUNLEN_THRESHOLD
      //   2. have MSB set (thus >=128)
      // the former is checked by masking with 0x7f7f...7f and then adding
      // (128 - COARSE_RUNLEN_THRESHOLD) to every byte; if the MSB ends up set, the low
      // 7 bits were at or above the threshold. the sums stay below 256, so nothing
      // carries into the neighboring byte.
      //
      // by the same logic, a byte is >=MAX_RUNLEN (which is >=128) if and only if both
      //   1. its low 7 bits >= (MAX_RUNLEN - 128)
      //   2. its MSB is set
      const U64 low7      = chunk & low7_bits;
      const U64 ge_coarse = (low7 + coarse_bias) | chunk;  // MSB in byte set if that byte >=COARSE_RUNLEN_THRESHOLD
      const U64 ge_max    = (low7 + max_bias) & chunk;     // MSB in byte set if that byte >=MAX_RUNLEN
      const U64 hits      = ge_coarse & ~ge_max & top_bits;

      if (hits) {
         // found at least one! locate the first occurrence using a trailing zero count.
         return cur + rrCtzBytes64(hits);
      }
   }

   return cur;
#else
   for (; cur < end; ++cur) {
      if (*cur >= COARSE_RUNLEN_THRESHOLD && *cur < MAX_RUNLEN)
         break;
   }

   return cur;
#endif
}
|
|
|
|
static int decode_block(radaudio_decoder_state *ds, F32 *output[2], void *mem, size_t memavail, size_t *memconsumed)
|
|
{
|
|
int len, skip=0;
|
|
radaudio_block_data bd[2];
|
|
|
|
*memconsumed = 0;
|
|
U8 *memory = mem;
|
|
//size_t memory_valid = memavail;
|
|
|
|
int c;
|
|
|
|
huff3_decoder dec;
|
|
|
|
// Throw in one empty section first so we know what the overhead of one of these is
|
|
PROF_BEGIN(overhead);
|
|
PROF_END(overhead);
|
|
|
|
PROF_BEGIN(header);
|
|
radaudio_block_header_unpacked header;
|
|
|
|
int offset = radaudio_decode_block_header(memory, &ds->biases, &header, memavail);
|
|
|
|
// check if we're at the stream header, if so skip it; this happens at start,
|
|
// but also if they seek without telling us
|
|
if (offset == COMMON_STREAM_HEADER) {
|
|
// we might be at the start of the stream
|
|
|
|
// enough bytes to check for the stream signature?
|
|
if (memavail < 8)
|
|
return e(RADAUDIO_INCOMPLETE_DATA);
|
|
|
|
// check the stream signature
|
|
if (!radaudio_check_stream_header(memory, memavail))
|
|
return e(RADAUDIO_INVALID_DATA);
|
|
|
|
// enough bytes for a full header?
|
|
if (memavail < sizeof(radaudio_stream_header))
|
|
return e(RADAUDIO_INCOMPLETE_DATA);
|
|
|
|
// decode the header, the only way we have to parse it
|
|
radaudio_stream_header_unpacked fh;
|
|
size_t header_size = radaudio_unpack_stream_header(memory, memavail, &fh);
|
|
|
|
// was it a valid header?
|
|
if (header_size == 0)
|
|
return e(RADAUDIO_INVALID_DATA);
|
|
|
|
// make sure the subtract below can't be negative
|
|
if (header_size > memavail)
|
|
return e(RADAUDIO_INTERNAL_ERROR);
|
|
|
|
// behave as if we just did a seek operation
|
|
ds->post_seek = true;
|
|
|
|
// we know the block number
|
|
ds->block_number = 0;
|
|
|
|
// shrink the input buffer
|
|
memory += header_size;
|
|
memavail -= header_size;
|
|
skip = (int) header_size;
|
|
|
|
// now decode the real block header and go back to the main block decode path with the real block header
|
|
offset = radaudio_decode_block_header(memory, &ds->biases, &header, memavail);
|
|
|
|
// if that's ALSO a stream header, it's a corrupt file
|
|
if (offset == COMMON_STREAM_HEADER)
|
|
return e(RADAUDIO_INVALID_DATA);
|
|
}
|
|
|
|
if (offset == COMMON_INCOMPLETE_DATA)
|
|
return e(RADAUDIO_INCOMPLETE_DATA);
|
|
else if (offset < 0)
|
|
return e(RADAUDIO_INVALID_DATA);
|
|
|
|
U32 block_length_in_bytes = header.block_bytes + offset;
|
|
|
|
///////////////////////////////////////////////////////
|
|
// validate data
|
|
//
|
|
|
|
// block length isn't longer than spec max
|
|
if (block_length_in_bytes > MAX_ENCODED_BLOCK_BYTES)
|
|
return RADAUDIO_INVALID_DATA;
|
|
|
|
// length of first stream doesn't go off end of block
|
|
U32 mid_side_band_length = header.mid_side_bands ? (24/MACRO_BAND_SIZE+7)/8 : 0;
|
|
if (offset + mid_side_band_length + header.vbstream0_length > block_length_in_bytes)
|
|
return e(RADAUDIO_INVALID_DATA);
|
|
|
|
// final
|
|
if (header.final_block)
|
|
if (header.final_samples_discard > RADAUDIO_SHORT_BLOCK_LEN)
|
|
return e(RADAUDIO_INVALID_DATA);
|
|
|
|
// can't have more RLE entries than coefficients
|
|
if (header.num_runlength_array > (U32) 2*(header.this_block_short ? RADAUDIO_SHORT_BLOCK_LEN+1 : RADAUDIO_LONG_BLOCK_LEN+1))
|
|
return e(RADAUDIO_INVALID_DATA);
|
|
|
|
if (block_length_in_bytes > memavail)
|
|
return e(RADAUDIO_INCOMPLETE_DATA);
|
|
|
|
if (header.final_block)
|
|
ds->at_eof = true;
|
|
|
|
rrbool is_short_block = header.this_block_short;
|
|
ds->current_block_short = (U8) is_short_block;
|
|
ds->next_block_short = header.next_block_short;
|
|
|
|
radaudio_rate_info *bi = ds->info[is_short_block];
|
|
int num_channels = header.num_channels_encoded;
|
|
|
|
int nz_mode = header.nonzero_bitarray_mode;
|
|
|
|
U8 *mid_side_bands = memory + offset;
|
|
U8 *post_header = mid_side_bands + mid_side_band_length;
|
|
|
|
U8 *vbstream2 = post_header + header.vbstream0_length;
|
|
U8 *packet_end = memory + block_length_in_bytes;
|
|
|
|
int error=0;
|
|
|
|
// we initialize the 'end' pointers for each stream to the end of valid data
|
|
// in that packet, not the end of that stream. So without further tests, they
|
|
// could read the same raw bytes as part of multiple streams; but this is used
|
|
// just to guarantee no memory overreads.
|
|
decode_vbstream_init(&dec.stream[0], post_header, packet_end, &error);
|
|
decode_vbstream_init(&dec.stream[1], packet_end , post_header, &error);
|
|
decode_vbstream_init(&dec.stream[2], vbstream2 , packet_end, &error);
|
|
|
|
U32 midside_bands=0;
|
|
if (header.mid_side_encoded)
|
|
midside_bands = 0xffffffff;
|
|
else if (header.mid_side_bands) {
|
|
RR_COMPILER_ASSERT(MACRO_BAND_SIZE == 3);
|
|
U8 midside_band_triples = *mid_side_bands; // read 8 bits
|
|
int k=0;
|
|
for (int j=0; j < bi->num_bands; j += 3, ++k) {
|
|
if (midside_band_triples & (1 << k))
|
|
midside_bands |= (7 << j);
|
|
}
|
|
}
|
|
|
|
U8 band_exponents[32*2];
|
|
int cur_band_exponents=0;
|
|
|
|
PROF_END(header);
|
|
|
|
//
|
|
// decode the band energy first, in case we want to use it to compute/predict other things (we don't anymore)
|
|
//
|
|
|
|
// band exponents
|
|
PROF_BEGIN(huffman);
|
|
if (header.predict_stereo_exponent && num_channels == 2) {
|
|
decode_huff_array(&dec, &rada_band_exponent_correct_huff , band_exponents , bi->num_bands, &error);
|
|
decode_huff_array(&dec, &rada_band_exponent_stereo_correct_huff, band_exponents+32, bi->num_bands, &error);
|
|
} else {
|
|
decode_huff_array(&dec, &rada_band_exponent_correct_huff, band_exponents, bi->num_bands * num_channels, &error);
|
|
}
|
|
PROF_END(huffman);
|
|
|
|
if (error) {
|
|
return e(RADAUDIO_INVALID_DATA);
|
|
}
|
|
|
|
PROF_BEGIN(unpack);
|
|
for (c=0; c < (header.predict_stereo_exponent ? 1 : num_channels); ++c) {
|
|
int lastv = PREDICT_FIRST_BAND_EXP;
|
|
for (int j=0; j < bi->num_bands; ++j) {
|
|
int v = (S8) band_exponents[cur_band_exponents++];
|
|
v += lastv;
|
|
lastv = v;
|
|
bd[c].band_exponent[j] = v;
|
|
}
|
|
}
|
|
|
|
// decode stereo predicted exponents
|
|
if (header.predict_stereo_exponent && num_channels == 2) {
|
|
for (int j=0; j < bi->num_bands; ++j)
|
|
bd[1].band_exponent[j] = bd[0].band_exponent[j] + (S8) band_exponents[32+j];
|
|
}
|
|
PROF_END(unpack);
|
|
|
|
RAD_ALIGN(U16, m_mantissa[MAX_BANDS*2+16], 16);
|
|
|
|
PROF_BEGIN(compute_mantissa_len);
|
|
for (c=0; c < num_channels; ++c) {
|
|
// THIS LOGIC MUST BE EXACTLY REPLICATED IN THE COMPRESSOR!!!
|
|
compute_mantissa_bitcount(
|
|
ds->samprate_mode,
|
|
is_short_block,
|
|
ds->mantissa_param,
|
|
bd[c].band_exponent,
|
|
bd[c].band_mantissa_bitcount);
|
|
}
|
|
PROF_END(compute_mantissa_len);
|
|
|
|
if (error)
|
|
return e(RADAUDIO_INVALID_DATA);
|
|
|
|
PROF_BEGIN(varbits);
|
|
{
|
|
int slot=0;
|
|
|
|
for (c = 0; c < num_channels; ++c) {
|
|
for (int j=0; j < bi->num_bands; ++j) {
|
|
U8 size = bd[c].band_mantissa_bitcount[j];
|
|
U16 mantissa = (U16) decode_vbstream_bits(&dec.stream[2], size, &error);
|
|
m_mantissa[slot] = mantissa << (MAX_FINE_ENERGY_BITS - size);
|
|
++slot;
|
|
}
|
|
}
|
|
}
|
|
PROF_END(varbits);
|
|
|
|
if (error)
|
|
return e(RADAUDIO_INVALID_DATA);
|
|
|
|
//
|
|
// now do all the remaining entropy decoding
|
|
//
|
|
|
|
#define runlen_value_sentinel_size 2 // room to write two END_OF_ZERORUN markers to preven overread if input doesn't have them
|
|
#define runlen_read_sentinel_size 16 // room to write dummy values for SIMD to run on multiple-of-16-bytes
|
|
#define nonzero_coefficients_padding 32 // room to write dummy values for SIMD overwrite/overread, both of which are at most 16
|
|
#define coeff_pair_padding 16 // room to write dummy data when unpacking
|
|
|
|
#define runlen_pad (runlen_value_sentinel_size + runlen_read_sentinel_size)
|
|
|
|
#define max_runlength_data 1025 // 1024 empty runs per channel, plus two end-of-run markers
|
|
|
|
RAD_ALIGN(U8, subband_value [2* MAX_SUBBANDS ], 16);
|
|
RAD_ALIGN(U8, subband_correction [2* MAX_BANDS ], 16);
|
|
RAD_ALIGN(U8, subband_stereo_correct[ MAX_SUBBANDS ], 16);
|
|
|
|
RAD_ALIGN(S8, nonzero_coefficients [2* 1024+nonzero_coefficients_padding ], 16);
|
|
|
|
RAD_ALIGN(U8, runlength_data [2* max_runlength_data + runlen_pad ], 16);
|
|
RAD_ALIGN(U8, nonzero_flagbits [2* (1024/8) + 16 ], 16);
|
|
|
|
int num_subband_values0=0;
|
|
int num_subband_corrections=0, num_subband_stereo_correct=0;
|
|
int num_runlength_data=header.num_runlength_array;
|
|
int num_coeff_pairs;
|
|
|
|
for (int j=0; j < bi->num_bands; ++j) {
|
|
int numsub = bi->num_subbands_for_band[j];
|
|
if (numsub > 1) {
|
|
for (c=0; c < num_channels; ++c) {
|
|
if (bd[c].band_exponent[j] == BAND_EXPONENT_NONE && SUBBANDS_SKIP_EMPTY_BANDS)
|
|
continue;
|
|
int n = numsub;
|
|
if (c == 1 && header.predict_stereo_subband) {
|
|
num_subband_stereo_correct += n;
|
|
} else {
|
|
if (!header.disable_final_subband_predict) {
|
|
--n;
|
|
++num_subband_corrections;
|
|
}
|
|
num_subband_values0 += n;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// subband values
|
|
PROF_BEGIN(huffman);
|
|
if (!is_short_block) {
|
|
decode_huff_array(&dec, &rada_subband_value_huff , subband_value , num_subband_values0 , &error);
|
|
if (!header.disable_final_subband_predict)
|
|
decode_huff_array(&dec, &rada_subband_value_last_in_band_correct_huff, subband_correction, num_subband_corrections, &error);
|
|
if (header.predict_stereo_subband)
|
|
decode_huff_array(&dec, &rada_subband_value_stereo_correct_huff, subband_stereo_correct, num_subband_stereo_correct, &error);
|
|
}
|
|
PROF_END(huffman);
|
|
|
|
if (error)
|
|
return e(RADAUDIO_INVALID_DATA);
|
|
// coefficient zero-runlength data
|
|
if (num_runlength_data > 1025*2)
|
|
return e(RADAUDIO_INVALID_DATA);
|
|
|
|
if (!is_short_block && nz_mode != 3) {
|
|
U8 huffbits[2048];
|
|
radaudio_nonzero_blockmode_descriptor *bdesc = &ds->nz_desc[nz_mode];
|
|
|
|
PROF_BEGIN(huffman);
|
|
{
|
|
int p=0;
|
|
for (int i=0; i < NUM_NZ_HUFF; ++i) {
|
|
int q = bdesc->num_chunks_per_huff[i];
|
|
if (q) {
|
|
decode_huff_array(&dec, rada_nonzero_bitflags_huff[i], huffbits+p, q*8*num_channels, &error);
|
|
p += q*8*num_channels;
|
|
}
|
|
}
|
|
}
|
|
PROF_END(huffman);
|
|
|
|
PROF_BEGIN(huffman);
|
|
int j=0, s=num_channels-1;
|
|
for (c=0; c < num_channels; ++c, ++s) {
|
|
for (int i=0; i < bdesc->num_8byte_chunks; ++i, ++j) {
|
|
U8 p = bdesc->source_pos[s][i];
|
|
U64 xor = (U64)0 - bdesc->invert_chunk[i]; // if invert_chunk=1, this gives ~0 (invert), else 0.
|
|
RR_PUT64_NATIVE(&nonzero_flagbits[j*8], xor ^ RR_GET64_NATIVE(huffbits+p*8));
|
|
}
|
|
}
|
|
PROF_END(huffman);
|
|
}
|
|
|
|
if (error)
|
|
return e(RADAUDIO_INVALID_DATA);
|
|
|
|
PROF_BEGIN(huffman);
|
|
decode_huff_array(&dec, &rada_zero_runlength_huff, runlength_data, num_runlength_data, &error);
|
|
|
|
// add sentinel so we don't need to length-check loop
|
|
runlength_data[num_runlength_data+0] = END_OF_ZERORUN;
|
|
|
|
// add extra sentinel in case the data is invalid and doesn't have the stereo separator, so we don't need to length-check loop
|
|
runlength_data[num_runlength_data+1] = END_OF_ZERORUN;
|
|
PROF_END(huffman);
|
|
|
|
if (error)
|
|
return e(RADAUDIO_INVALID_DATA);
|
|
|
|
PROF_BEGIN(count_coefficients_huff);
|
|
// values of MAX_RUNLEN don't indicate coefficients, because they have a following real runlength
|
|
int num_nonzero_coefficients = count_bytes_below_value_sentinel16(ds, runlength_data, num_runlength_data, MAX_RUNLEN);
|
|
|
|
if (!is_short_block) {
|
|
int num_flagbit_bytes = ds->nz_desc[nz_mode].num_8byte_chunks * 8;
|
|
if (num_flagbit_bytes != 0)
|
|
num_nonzero_coefficients += count_set_bits_multiple8_sentinel8(ds, nonzero_flagbits, num_flagbit_bytes*num_channels);
|
|
}
|
|
PROF_END(count_coefficients_huff);
|
|
|
|
// runlength data + flagbits combined could be too many coefficients
|
|
|
|
if (num_nonzero_coefficients > num_channels*1024)
|
|
return e(RADAUDIO_INVALID_DATA);
|
|
|
|
int nz_selector = is_short_block ? 4 : nz_mode;
|
|
|
|
// coefficients -- need to have decoded the runlength data to know how many coefficients
|
|
{
|
|
// transient temp mem
|
|
RAD_ALIGN(U8, coefficient_pairs[2* 1024/2 * 2 + coeff_pair_padding], 16);
|
|
|
|
PROF_BEGIN(huffman);
|
|
num_coeff_pairs = (num_nonzero_coefficients+1)/2;
|
|
int tp = ds->nz_correlated_huffman_selectors[HS_COEFF_PAIR][nz_selector];
|
|
decode_huff_array(&dec, rada_nonzero_coefficient_pair_huff[tp], coefficient_pairs, num_coeff_pairs, &error);
|
|
PROF_END(huffman);
|
|
|
|
// convert coefficient pairs to coefficients
|
|
PROF_BEGIN(unpack);
|
|
unpack_nibbles_input_excess16_output_excess16_multiple32_default1(ds, nonzero_coefficients, coefficient_pairs, num_coeff_pairs);
|
|
PROF_END(unpack);
|
|
}
|
|
|
|
// read and apply bottom bits of run length data
|
|
// we have 2*1024 coeffs, COARSE_RUNLEN_THRESHOLD=60 and such runs are followed by a
|
|
// nonzero coefficient, so per 1024 coeffs we can have at most floor(1024/61)=16 of these
|
|
// (32 total between the total channels). in practice, the typical counts are 0-4.
|
|
PROF_BEGIN(update_runlength);
|
|
{
|
|
U8 *cur = runlength_data;
|
|
U8 *end = runlength_data + num_runlength_data; // we have runlen_read_sentinel_size of padding, so can be sloppy
|
|
while (cur < end) {
|
|
cur = find_next_coarse_run_excess16(ds, cur, end);
|
|
if (cur >= end)
|
|
break;
|
|
|
|
rrAssert(*cur >= COARSE_RUNLEN_THRESHOLD && *cur < MAX_RUNLEN);
|
|
|
|
// process this run and advance
|
|
U8 extra = (U8) decode_vbstream_bits(&dec.stream[2], 2, &error);
|
|
*cur += extra;
|
|
++cur;
|
|
}
|
|
}
|
|
PROF_END(update_runlength);
|
|
|
|
// big coefficients are coded as value 0 in the coefficient pairs
|
|
|
|
{
|
|
// transient temp mem, only used right here
|
|
RAD_ALIGN(S8, big_coefficients[2* 1024 + 16], 16);
|
|
|
|
// count zero bytes
|
|
int num_big_coefficients = count_bytes_below_value_sentinel16(ds, (U8*) nonzero_coefficients, num_nonzero_coefficients, 1);
|
|
|
|
PROF_BEGIN(huffman);
|
|
int tb = ds->nz_correlated_huffman_selectors[HS_COEFF_BIG][nz_selector];
|
|
decode_huff_array(&dec, rada_nonzero_coefficient_big_huff[tb], (U8*) big_coefficients, num_big_coefficients, &error);
|
|
PROF_END(huffman);
|
|
|
|
PROF_BEGIN(unbias);
|
|
// big coefficients are byte-sized, so stored aligned in stream[2]
|
|
//decode_stream_align_to_byte(&dec.stream[2]);
|
|
//U8 *bytestream = &dec.stream[2].bitstream[ dec.stream[2].read_pos_in_bits>>3 ];
|
|
// bytestream ends at current position of reverse-read stream 1
|
|
//decode_stream_align_to_byte(&dec.stream[1]);
|
|
//U8 *bytestream_end = &dec.stream[1].bitstream[-(int)(dec.stream[1].read_pos_in_bits>>3)];
|
|
|
|
// expand used to decode directly from the stream and hence needed a safety range
|
|
if (!expand_nonzero_coefficients(ds, nonzero_coefficients, num_nonzero_coefficients,
|
|
big_coefficients, (big_coefficients+num_nonzero_coefficients), (big_coefficients+sizeof(big_coefficients))))
|
|
return e(RADAUDIO_INVALID_DATA);
|
|
PROF_END(unbias);
|
|
}
|
|
|
|
if (error)
|
|
return e(RADAUDIO_INVALID_DATA);
|
|
|
|
int cur_subband_values0=0;
|
|
int cur_subband_corrections=0;
|
|
int cur_subband_stereo_correct=0;
|
|
int cur_nonzero_coefficients=0;
|
|
int cur_runlength_data=0;
|
|
|
|
PROF_BEGIN(compute_subbands);
|
|
for (c=0; c < num_channels; ++c)
|
|
memset(bd[c].quantized_subbands, 0, bi->num_subbands * 2);
|
|
|
|
if (!is_short_block) {
|
|
// subbands
|
|
for (c=0; c < num_channels; ++c) {
|
|
for (int j=0; j < bi->num_bands; ++j) {
|
|
if (bi->num_subbands_for_band[j] == 1)
|
|
continue;
|
|
|
|
int start = bi->first_subband_for_band[j];
|
|
int num_coded_subbands = bi->num_subbands_for_band[j];
|
|
|
|
if (bd[c].band_exponent[j] == BAND_EXPONENT_NONE && SUBBANDS_SKIP_EMPTY_BANDS) {
|
|
for (int i=0; i < num_coded_subbands; ++i)
|
|
bd[c].quantized_subbands[start+i] = (U16) (ds->subband_predicted_sum[j] / num_coded_subbands); // this value is predicted from in stero
|
|
} else if (header.predict_stereo_subband && c == 1) {
|
|
for (int i=0; i < num_coded_subbands; ++i) {
|
|
int predict = bd[0].quantized_subbands[start+i];
|
|
int correct = (S8) subband_stereo_correct[cur_subband_stereo_correct++];
|
|
bd[c].quantized_subbands[start+i] = (U16) (predict + correct);
|
|
}
|
|
} else {
|
|
int predicted_sum = ds->subband_predicted_sum[j];
|
|
int bias = ds->subband_bias[j];
|
|
int partial_sum = 0;
|
|
|
|
if (!header.disable_final_subband_predict)
|
|
--num_coded_subbands;
|
|
|
|
for (int i=0; i < num_coded_subbands; ++i) {
|
|
int v = subband_value[cur_subband_values0++];
|
|
v -= bias; // remove bias
|
|
v = (v & 63);
|
|
bd[c].quantized_subbands[start+i] = (U16) v;
|
|
partial_sum += v;
|
|
}
|
|
|
|
if (!header.disable_final_subband_predict) {
|
|
int actual_sum = predicted_sum + (S8) subband_correction[cur_subband_corrections++];
|
|
int v = actual_sum - partial_sum;
|
|
|
|
if (v < 0) // @TODO investigate this case closely, why can't it be negative, should there be an upper bound?
|
|
return e(RADAUDIO_INVALID_DATA);
|
|
|
|
bd[c].quantized_subbands[start+num_coded_subbands] = (U16) v;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
PROF_END(compute_subbands);
|
|
|
|
if (error)
|
|
return e(RADAUDIO_INVALID_DATA);
|
|
|
|
for (c=0; c < num_channels; ++c) {
|
|
int num_nonzero_bitarray_bytes = ds->nz_desc[nz_mode].num_8byte_chunks * 8;
|
|
rrbool result = distribute_nonzero_coefficients(ds, bd[c].quantized_coeff_decode, bi->num_quantized_coeffs,
|
|
runlength_data, &cur_runlength_data,
|
|
nonzero_coefficients, &cur_nonzero_coefficients,
|
|
nonzero_flagbits + c*num_nonzero_bitarray_bytes, is_short_block ? 0 : num_nonzero_bitarray_bytes*8, c);
|
|
if (!result)
|
|
return e(RADAUDIO_INVALID_DATA);
|
|
}
|
|
|
|
// we expect to read the first sentinel; if we read the second, it's a bug
|
|
if (cur_runlength_data > num_runlength_data+1)
|
|
return e(RADAUDIO_INVALID_DATA);
|
|
|
|
FFT_ALIGN(F32, rawdata[MAX_COEFFS]);
|
|
F32 *data1 = rawdata, *data2 = rawdata + (bi->num_coeffs >> 1);
|
|
|
|
if (ds->num_channels==1) {
|
|
// mono stream
|
|
(void) decode_channel_before_imdct(ds, &bd[0], 0, ds->block_number, m_mantissa);
|
|
(void) decode_channel_imdct (ds, rawdata , &bd[0], 0);
|
|
len = decode_channel_after_imdct (ds, data1, data2, &bd[0], 0, output[0]);
|
|
} else if (ds->num_channels==2 && num_channels==1) {
|
|
// stereo stream with mono block
|
|
(void) decode_channel_before_imdct(ds, &bd[0], 0, ds->block_number, m_mantissa);
|
|
(void) decode_channel_imdct (ds, rawdata , &bd[0], 0);
|
|
(void) decode_channel_after_imdct (ds, data1, data2, &bd[0], 0, output[0]);
|
|
len = decode_channel_after_imdct (ds, data1, data2, &bd[0], 1, output[1]);
|
|
} else {
|
|
// stereo stream with stereo block
|
|
(void) decode_channel_before_imdct(ds, &bd[0], 0, ds->block_number , m_mantissa);
|
|
(void) decode_channel_before_imdct(ds, &bd[1], 1, ds->block_number^0x55555555, m_mantissa+ds->info[is_short_block]->num_bands);
|
|
|
|
// midside decode
|
|
for (int j=0; j < bi->num_bands; ++j) {
|
|
if (midside_bands & (1 << j)) {
|
|
F32 *coeffs1 = bd[0].dequantized_coeff_decode;
|
|
F32 *coeffs2 = bd[1].dequantized_coeff_decode;
|
|
int start = bi->first_coeff_for_band[j];
|
|
int end = start + bi->num_coeffs_for_band[j];
|
|
for (int i=start; i < end; ++i) {
|
|
float x = coeffs1[i];
|
|
float y = coeffs2[i]*0.5f;
|
|
coeffs1[i] = x+y;
|
|
coeffs2[i] = x-y;
|
|
}
|
|
}
|
|
}
|
|
(void) decode_channel_imdct (ds, rawdata , &bd[0], 0);
|
|
(void) decode_channel_after_imdct (ds, data1, data2, &bd[0], 0, output[0]);
|
|
(void) decode_channel_imdct (ds, rawdata , &bd[1], 1);
|
|
len = decode_channel_after_imdct (ds, data1, data2, &bd[1], 1, output[1]);
|
|
}
|
|
|
|
if (error)
|
|
return e(RADAUDIO_INVALID_DATA);
|
|
|
|
*memconsumed = block_length_in_bytes + skip;
|
|
|
|
++ds->block_number;
|
|
ds->last_block_short = ds->current_block_short;
|
|
|
|
if (header.final_block) {
|
|
if (header.final_samples_discard > (U32) len)
|
|
return e(RADAUDIO_INVALID_DATA);
|
|
else {
|
|
int total = len - header.final_samples_discard;
|
|
return total;
|
|
}
|
|
} else
|
|
return len;
|
|
}
|
|
|
|
// Returns the number of bytes the caller must provide to RadAudioDecoderOpen.
//
//   raw_header       packed stream header, or NULL to size for two channels
//   raw_header_size  number of bytes available at raw_header
//
// Returns 0 if a header was supplied but could not be unpacked.
size_t RadAudioDecoderMemoryRequired(U8 *raw_header, size_t raw_header_size)
{
   // with no header to inspect, assume two channels
   int channel_count = 2;

   if (raw_header != NULL) {
      radaudio_stream_header_unpacked unpacked;
      if (radaudio_unpack_stream_header(raw_header, raw_header_size, &unpacked) == 0)
         return 0;
      channel_count = unpacked.num_channels;
   }

   // decoder state, plus slack so the data following it can be aligned
   size_t total = sizeof(radaudio_decoder_state);
   total += decoder_align-1;

   // room for buffered samples from previous block
   total += RADAUDIO_LONG_BLOCK_LEN/2 * sizeof(S16) * channel_count;

   return total;
}
|
|
|
|
// Carves the decoder state and the per-channel "previous block" sample buffers
// out of the caller-supplied memory block.
//
//   header   unpacked stream header; only num_channels is read here
//   vmem     start of the caller's memory block; the state struct is placed here
//   memsize  size of that block in bytes
//
// Layout: [radaudio_decoder_state][padding up to decoder_align][one buffer of
// RADAUDIO_LONG_BLOCK_LEN/2 S16 per channel].
//
// Returns the state pointer (zero-initialized, with prev_block_right_samples[]
// pointing into the block), or 0 if memsize is too small. The full size
// requirement is validated with integer arithmetic BEFORE any pointer into the
// sample area is formed and before the block is written, so an undersized
// block is left untouched and no out-of-bounds pointer is ever computed.
static radaudio_decoder_state * radaudio_decompressor_memalloc(radaudio_stream_header_unpacked *header, void *vmem, size_t memsize)
{
   int i;
   union {
      UINTa addr;
      U8 *  ptr;
   } convert;

   if (memsize < sizeof(radaudio_decoder_state))
      return 0;

   U8 *base = vmem;
   radaudio_decoder_state *ds = (void*) base;
   const size_t overlap_bytes = RADAUDIO_LONG_BLOCK_LEN/2 * sizeof(S16);

   // align data after struct; base+sizeof(*ds) is at most one-past-the-end
   // of the block here, so forming this pointer is always valid
   convert.ptr = base + sizeof(*ds);
   UINTa struct_end   = convert.addr;
   UINTa aligned_addr = (struct_end + decoder_align-1) & ~(decoder_align-1);
   size_t first_buffer_offset = sizeof(*ds) + (size_t) (aligned_addr - struct_end);

   // total requirement: struct + alignment padding + one buffer per channel
   size_t memneeded = first_buffer_offset;
   for (i=0; i < header->num_channels; ++i)
      memneeded += overlap_bytes;

   if (memneeded > memsize)
      return 0;

   // everything fits; now it is safe to write into the block
   memset(ds, 0, sizeof(*ds));

   size_t offset = first_buffer_offset;
   for (i=0; i < header->num_channels; ++i) {
      ds->prev_block_right_samples[i] = (void *) (base + offset);
      offset += overlap_bytes;
   }

   ds->last_block_short = 1; // shouldn't matter
   return ds;
}
|
|
|
|
// Creates a decoder inside caller-provided memory from a packed stream header.
//
//   raw_header       packed stream header bytes
//   raw_header_size  number of bytes available at raw_header
//   vmem, memsize    memory block that will hold the decoder state
//   header_read      optional; receives the value returned by
//                    radaudio_unpack_stream_header when the open succeeds
//
// Returns the decoder, or NULL if the header is too short, fails to unpack,
// fails validation, or the memory block is too small. Note that vmem may
// already have been written to when a late validation step fails.
RadAudioDecoder *RadAudioDecoderOpen(U8 *raw_header, size_t raw_header_size, void *vmem, size_t memsize, size_t *header_read)
{
   radaudio_stream_header_unpacked header;

   // anything smaller than the packed header struct cannot be parsed
   if (raw_header_size < sizeof(radaudio_stream_header))
      return NULL;

   size_t header_size = radaudio_unpack_stream_header(raw_header, raw_header_size, &header);
   if (header_size == 0)
      return NULL;

   // unpack_stream_header does some sanity checking, here's the rest:
   for (int mode=0; mode < NUM_NZ_MODE; ++mode) {
      if (header.nzmode_num64[mode] > MAX_NZ_BLOCKS)
         return 0;
   }

   radaudio_decoder_state *ds = radaudio_decompressor_memalloc(&header, vmem, memsize);
   if (ds == NULL)
      return 0;

   // properties copied straight from the stream header
   ds->version       = header.version;
   ds->num_channels  = header.num_channels;
   ds->sample_rate   = header.sample_rate;
   ds->samprate_mode = header.sample_rate_mode;

   // runtime state; the stream header size is reported through header_read
   // rather than being skipped by the decoder itself
   ds->skip_bytes = 0;
   ds->cpu        = cpu_detect();
   ds->post_seek  = true; // very first block decoded discards input

   // rate tables, indexed by the block's short flag (0 = long, 1 = short)
   ds->info[0] = &radaudio_rateinfo[0][ds->samprate_mode];
   ds->info[1] = &radaudio_rateinfo[1][ds->samprate_mode];

   memcpy(ds->subband_bias         , header.subband_bias         , sizeof(ds->subband_bias));
   memcpy(ds->subband_predicted_sum, header.subband_predicted_sum, 24);
   memcpy(ds->mantissa_param       , header.mantissa_param       , sizeof(header.mantissa_param));
   compute_bias_set(&ds->biases, header.bytes_bias);

   // per-mode nonzero-flag descriptors: split each header byte into the
   // huffman table index and the "invert this chunk" flag
   for (int mode=0; mode < NUM_NZ_MODE; ++mode) {
      radaudio_nonzero_blockmode_descriptor *desc = &ds->nz_desc[mode];

      desc->num_8byte_chunks = header.nzmode_num64[mode];
      if (desc->num_8byte_chunks > MAX_NZ_BLOCKS)
         return 0;

      for (int chunk=0; chunk < MAX_NZ_BLOCKS; ++chunk) {
         desc->huffman_table_for_chunk[chunk] = (header.nzmode_huff[mode][chunk] & ~NZ_MODE_INVERT);
         desc->invert_chunk[chunk]            = (header.nzmode_huff[mode][chunk] & NZ_MODE_INVERT) != 0;
         if (desc->huffman_table_for_chunk[chunk] >= NUM_NZ_HUFF)
            return 0;
      }
   }

   for (int sel=0; sel < NUM_NZ_SELECTOR; ++sel) {
      for (int mode=0; mode < NUM_SELECTOR_MODES; ++mode)
         ds->nz_correlated_huffman_selectors[sel][mode] = header.nzmode_selectors[sel][mode];
   }

   radaudio_init_nz_desc(ds->nz_desc);

   if (header_read)
      *header_read = header_size;

   return ds;
}
|
|
|
|
#ifdef RADAUDIO_DEVELOPMENT
// Development-only: overwrites the decoder's stored x86 CPU feature flags
// with the supplied values. On builds without __RADX86__ this does nothing.
void RadAudioDecoderForceIntelCPU(RadAudioDecoder *hradaud, rrbool has_sse2, rrbool has_ssse3, rrbool has_sse4_1, rrbool has_popcnt, rrbool has_avx2)
{
   radaudio_decoder_state *state = (radaudio_decoder_state *) hradaud;
   RR_UNUSED_VARIABLE(state); // not referenced on non-x86 builds

   #ifdef __RADX86__
   state->cpu.has_sse2   = (U8) has_sse2;
   state->cpu.has_ssse3  = (U8) has_ssse3;
   state->cpu.has_sse4_1 = (U8) has_sse4_1;
   state->cpu.has_popcnt = (U8) has_popcnt;
   state->cpu.has_avx2   = (U8) has_avx2;
   #endif
}
#endif
|
|
|
|
// Splits a packed 32-bit version into its fields:
// bits 31..24 major, bits 23..16 minor, bits 15..0 sequential.
static void decode_version(RadAudioInfo *info, U32 version)
{
   info->major_version      = (U8 ) ( version >> 24);
   info->minor_version      = (U8 ) ((version >> 16) & 0xff);
   info->sequential_version = (U16) ( version        & 0xffff);
}
|
|
|
|
// Fills out_info with the sample rate, channel count and decoded version
// fields of an open decoder. The decoder is only read, never modified, so the
// const qualifier of the handle is kept on the internal state pointer.
void RadAudioDecoderGetInfo(const RadAudioDecoder *hradaud, RadAudioInfo *out_info)
{
   const radaudio_decoder_state *ds = (const radaudio_decoder_state *) hradaud;

   out_info->sample_rate  = ds->sample_rate;
   out_info->num_channels = ds->num_channels;
   decode_version(out_info, ds->version);
}
|
|
|
|
// Fills out_info directly from a packed stream header, without needing an
// open decoder.
//
// Returns the value reported by radaudio_unpack_stream_header, or 0 if the
// header could not be unpacked (out_info is left untouched in that case).
size_t RadAudioDecoderGetInfoHeader(U8* raw_header, size_t raw_header_size, RadAudioInfo *out_info)
{
   radaudio_stream_header_unpacked unpacked;
   size_t consumed = radaudio_unpack_stream_header(raw_header, raw_header_size, &unpacked);

   if (consumed != 0) {
      out_info->sample_rate  = unpacked.sample_rate;
      out_info->num_channels = unpacked.num_channels;
      decode_version(out_info, unpacked.version);
   }

   // consumed is 0 here exactly when unpacking failed
   return consumed;
}
|
|
|
|
// Tells the decoder that the caller has repositioned the input stream.
// Clears the end-of-stream flag so the chunk functions stop returning
// RADAUDIO_AT_EOF, and raises post_seek for the next decoded block.
RADDEFFUNC void RadAudioDecoderDidSeek(RadAudioDecoder *radaudio_decomp)
{
   radaudio_decoder_state *state = (radaudio_decoder_state *) radaudio_decomp;

   state->at_eof    = false;
   state->post_seek = true;
}
|
|
|
|
// Parses the block header at 'data' and reports how many bytes the complete
// encoded block occupies (header plus payload), without decoding it.
//
// Returns the block length in bytes on success, otherwise one of:
//   RADAUDIO_AT_EOF           the decoder has already seen the final block
//   RADAUDIO_INCOMPLETE_DATA  not enough bytes to parse the block header
//   RADAUDIO_START_OF_STREAM  'data' points at a stream header, not a block
//   RADAUDIO_INVALID_DATA     the header failed to parse or failed validation
int RadAudioDecoderGetChunkLength(RadAudioDecoder *radaudio_decomp, const U8 *data, size_t data_avail)
{
   radaudio_decoder_state *ds = (radaudio_decoder_state *) radaudio_decomp;
   radaudio_block_header_unpacked header;

   if (ds->at_eof)
      return RADAUDIO_AT_EOF;

   if (data_avail < 4)
      return RADAUDIO_INCOMPLETE_DATA;

   int offset = radaudio_decode_block_header(data, &ds->biases, &header, data_avail);

   // translate the shared parser's status codes into decoder status codes
   if (offset == COMMON_STREAM_HEADER)
      return RADAUDIO_START_OF_STREAM;
   if (offset == COMMON_INCOMPLETE_DATA)
      return RADAUDIO_INCOMPLETE_DATA;
   if (offset == COMMON_INVALID_DATA)
      return RADAUDIO_INVALID_DATA;

   U32 block_length_in_bytes = header.block_bytes + offset;

   // validate data

   // the whole block must fit within the encoder's maximum block size
   if (block_length_in_bytes > MAX_ENCODED_BLOCK_BYTES)
      return RADAUDIO_INVALID_DATA;

   // vbstream0 must lie inside the block
   if (offset + header.vbstream0_length > block_length_in_bytes)
      return RADAUDIO_INVALID_DATA;

   // a final block may not discard more than RADAUDIO_SHORT_BLOCK_LEN samples
   if (header.final_block && header.final_samples_discard > RADAUDIO_SHORT_BLOCK_LEN)
      return RADAUDIO_INVALID_DATA;

   // runlength entries are capped at 2*(block length + 1)
   U32 max_runlength_entries = (U32) 2*(header.this_block_short ? RADAUDIO_SHORT_BLOCK_LEN+1 : RADAUDIO_LONG_BLOCK_LEN+1);
   if (header.num_runlength_array > max_runlength_entries)
      return RADAUDIO_INVALID_DATA;

   return header.block_bytes + offset;
}
|
|
|
|
// returns the number of samples output per channel, and update 'memconsumed'
|
|
// with the amount of memory consumed.
|
|
//
|
|
// return values:
|
|
// n number of samples decoded (for one channel, e.g. n=1024 means 1024 stereo pairs)
|
|
// 0 can decode 0 samples legitimately, e.g. first block or after seeking
|
|
// -1 at end-of-stream
|
|
// -2 not enough input data to decode a frame, always consumes 0
|
|
// -3 error (e.g. corrupt stream)
|
|
int RadAudioDecoderDecodeChunk(
|
|
RadAudioDecoder *radaudio_decomp,
|
|
const U8 *mem ,
|
|
size_t memavail ,
|
|
size_t *memconsumed ,
|
|
F32 *output_samples[2],
|
|
size_t max_samples_per_channel
|
|
)
|
|
{
|
|
*memconsumed = 0;
|
|
|
|
radaudio_decoder_state *ds = (radaudio_decoder_state *) radaudio_decomp;
|
|
if (ds->at_eof)
|
|
return RADAUDIO_AT_EOF;
|
|
|
|
if (memavail < 7)
|
|
return RADAUDIO_INCOMPLETE_DATA;
|
|
|
|
size_t used;
|
|
|
|
if (!ds)
|
|
return -2;
|
|
|
|
PROF_BEGIN(decoder_all);
|
|
|
|
size_t skip = ds->skip_bytes;
|
|
int len = decode_block(ds, output_samples, (U8*)mem+skip, memavail-skip, &used);
|
|
ds->post_seek = false;
|
|
|
|
if (len >= 0) {
|
|
*memconsumed = used + skip;
|
|
ds->skip_bytes = 0;
|
|
ds->fully_decoded += len;
|
|
}
|
|
|
|
PROF_END(decoder_all);
|
|
return len;
|
|
}
|
|
|
|
#ifdef RADAUDIO_DEVELOPMENT
|
|
// internal use
|
|
int RadAudioDecoderGetProfileData(RadAudioDecoder *hradaud, radaudio_profile_value *aprofile, int num_profile)
|
|
{
|
|
RR_UNUSED_VARIABLE(hradaud);
|
|
int n = RR_MIN(num_profile, PROF_total_count - 1);
|
|
static const char *names[] = {
|
|
#define PROF(x) #x,
|
|
PROFILE_ZONES()
|
|
#undef PROF
|
|
};
|
|
|
|
if (aprofile) {
|
|
double overhead_time = 0.0;
|
|
// we have an empty profiling region to estimate overhead of tracking a region to begin with
|
|
if (profile_counts[PROF_overhead]) {
|
|
overhead_time = rrTicksToSeconds(profile_times[PROF_overhead]) / profile_counts[PROF_overhead];
|
|
}
|
|
for (int i=0; i < n; ++i) {
|
|
aprofile[i].name = names[i];
|
|
// subtract out estimated overhead
|
|
aprofile[i].time = rrTicksToSeconds(profile_times[i]) - overhead_time * profile_counts[i];
|
|
}
|
|
} else {
|
|
profile = num_profile;
|
|
}
|
|
|
|
for (int i=0; i < PROF_total_count; ++i) {
|
|
profile_times[i] = 0;
|
|
profile_counts[i] = 0;
|
|
}
|
|
return n;
|
|
}
|
|
#else
|
|
int RadAudioDecoderGetProfileData(RadAudioDecoder *hradaud, radaudio_profile_value *profile, int num_profile)
|
|
{
|
|
RR_UNUSED_VARIABLE(hradaud); RR_UNUSED_VARIABLE(profile); RR_UNUSED_VARIABLE(num_profile);
|
|
return 0;
|
|
}
|
|
#endif
|