// Copyright Epic Games Tools, LLC. All Rights Reserved.
////////////////////////////////////////////////////////////////////////////
//
// RADaudio is a new audio codec made by Epic Game Tools for use in games,
// optimized for fast SIMD decoding and decent quality (roughly similar to
// Vorbis).
//
// It is a classical MDCT-based codec with two block sizes, and it uses
// the Oodle Data huffman entropy coder to store data.

// NOTE(review): the header names of the next three #include lines are missing
// in this copy of the file (they look like they were lost during text
// extraction). The decoder uses memcpy/memset, sqrtf/fabsf/floorf/isnan and
// abs, so <string.h>, <math.h> and <stdlib.h> are plausible candidates --
// confirm against the upstream source before building.
#include
#include
#include

// We want the external defines to be scoped to DECODER, but we don't want to rename everything in here.
#define RADAUDIO_AT_EOF RADAUDIO_DECODER_AT_EOF
#define RADAUDIO_INCOMPLETE_DATA RADAUDIO_DECODER_INCOMPLETE_DATA
#define RADAUDIO_INVALID_DATA RADAUDIO_DECODER_INVALID_DATA
#define RADAUDIO_START_OF_STREAM RADAUDIO_DECODER_START_OF_STREAM
#define RADAUDIO_INTERNAL_ERROR RADAUDIO_DECODER_INTERNAL_ERROR

#define HUFFMAN_DECODE // enable huffman decode tables

#include "radaudio_decoder.h"
#include "radaudio_decoder_sse2.h"
#include "radaudio_decoder_sse4.h"
#include "radaudio_decoder_avx2.h"
#include "radaudio_decoder_neon.h"
#include "radaudio_decoder_internal.h"
#include "radaudio_common.h"
#include "radaudio_mdct.h"
#include "rrCore.h"
#include "rrbits.h"
#include "radaudio_common.inl"

// The error codes shared with the common code must agree with the decoder's
// public codes, and the header struct must match its advertised size.
RR_COMPILER_ASSERT(COMMON_INVALID_DATA == RADAUDIO_INVALID_DATA);
RR_COMPILER_ASSERT(COMMON_INCOMPLETE_DATA == RADAUDIO_INCOMPLETE_DATA);
RR_COMPILER_ASSERT(RADAUDIO_STREAM_HEADER_SIZE == sizeof(radaudio_stream_header));

// Development-only profiling: PROFILE_ZONES() is an X-macro listing every
// profiled zone; it is expanded below with PROF() defined to build the
// PROF_<zone> enum. PROF_BEGIN/PROF_END accumulate rrGetTicks() deltas and a
// hit count per zone while the file-scope 'profile' flag is set. In
// non-development builds both macros expand to nothing.
#ifdef RADAUDIO_DEVELOPMENT
#define PROFILE_ZONES() \
   PROF(decoder_all) \
   PROF(imdct) \
   PROF(window) \
   PROF(huffman) \
   PROF(unquantize) \
   PROF(distribute_rle) \
   PROF(update_runlength) \
   PROF(varbits) \
   PROF(compute_mantissa_len) \
   PROF(copy) \
   PROF(compute_subbands) \
   PROF(distribute_bitflag) \
   PROF(unpack) \
   PROF(randomize) \
   PROF(compute_subband_energy)\
   PROF(unbias) \
   PROF(compute_band_energy) \
   PROF(count_coefficients_huff) \
   PROF(header) \
   PROF(zero) \
   PROF(flagbits) \
   PROF(overhead) /* must always be last! */ \
   PROF(total_count)

enum
{
   #define PROF(x) PROF_##x,
   PROFILE_ZONES()
   #undef PROF
   PROF__end
};

#define PROF_BEGIN(var) if (profile) { profile_times[PROF_##var] -= rrGetTicks(); profile_counts[PROF_##var] += 1; }
#define PROF_END(var) if (profile) profile_times[PROF_##var] += rrGetTicks()

static rrbool profile;
static U64 profile_times[PROF_total_count];
static S64 profile_counts[PROF_total_count];
#else
#define PROF_BEGIN(var)
#define PROF_END(var)
#define PROF_total_count 1
#endif

// Identity on its first argument; the second argument is ignored.
#define RANDVAL(r,i) (r)

// allow breakpointing on any error
// (returns 'code' unchanged, so error returns can be routed through one spot)
static int e(int code)
{
   return code;
}

// Decoder state, exposed under the typedef name radaudio_decoder_state.
typedef struct RadAudioDecoder
{
   U32 version;
   rrbool current_block_short;
   rrbool last_block_short;
   rrbool next_block_short; // we don't actually need this
   int samprate_mode;
   int num_channels;
   int skip_bytes;
   int sample_rate; // implied by samprate_mode
   U32 block_number;
   int fully_decoded; // sample offset in stream
   U8 subband_predicted_sum[MAX_BANDS];
   S8 mantissa_param[2][MAX_BANDS][2];
   S8 subband_bias[MAX_BANDS];
   rrbool at_eof;
   rrbool post_seek;
   rrbool bitstream_overshot;
   radaudio_block_header_biases biases;
   radaudio_cpu_features cpu; // feature flags consulted by the SIMD dispatch wrappers in this file
   radaudio_rate_info * info[2]; // pre-defined table, indexed by long vs. short block
   radaudio_nonzero_blockmode_descriptor nz_desc[NUM_NZ_MODE];
   U8 nz_correlated_huffman_selectors[NUM_NZ_SELECTOR][NUM_SELECTOR_MODES];
   S16 * prev_block_right_samples[2];
   F32 restore_scale[2]; // how to convert S16s back to floats
} radaudio_decoder_state;

//////////////////////////////////////////////////////////////////////////////
//
// ENTROPY DECODER
//

// A single bitstream with a bit-granular read cursor.
typedef struct
{
   U8 *bitstream;
   U8 *end;
   U32 read_pos_in_bits;
   U32 fast_num_bits; // if initial read_pos_in_bits < this, can take fast path (needs to be < not <= so 0 disables fast path)
   U32 total_num_bits;
} rada_bit_decoder;

// The three bitstreams consumed together by the huffman array decoder.
typedef struct
{
   rada_bit_decoder stream[3];
} huff3_decoder;

// Initialize 'd' to read from [bitstream, end).
//
// If bitstream > end the stream is treated as a backwards stream: the pointers
// are stored but every bit count is zeroed so decode_vbstream_bits cannot read
// from it. If the range is larger than MAX_ENCODED_BLOCK_BYTES, *error is set
// to 1 and the stream is initialized as empty. *error is never cleared here.
static void decode_vbstream_init(rada_bit_decoder *d, U8 *bitstream, U8 *end, int *error)
{
   if (bitstream > end) {
      // this is a backwards stream (for huffman). not allowed to read through
      // decode_vbstream_bits, so make sure we can't by setting total bits count to 0.
      d->bitstream = bitstream;
      d->end = end;
      d->read_pos_in_bits = 0;
      d->fast_num_bits = 0;
      d->total_num_bits = 0;
      return;
   }
   d->bitstream = bitstream;
   d->end = end;
   d->read_pos_in_bits = 0;
   size_t num_bytes = end - bitstream;
   if (num_bytes > MAX_ENCODED_BLOCK_BYTES) {
      // not allowed! set num_bytes to 0 and initialize stream as empty
      num_bytes = 0;
      *error = 1;
   }
   d->total_num_bits = (U32) (num_bytes * 8); // can't overflow: num_bytes checked above
   // The fast path loads 32 bits at once, so it is only enabled while at
   // least 32 bits remain past the read position.
   if (d->total_num_bits >= 32) {
      d->fast_num_bits = d->total_num_bits - 32;
   } else {
      d->fast_num_bits = 0;
   }
}

// bit reading cold path, reads one byte at a time to avoid over-reading
//
// Returns the next 'bitlength' bits (LSB-first) and advances the read cursor.
// If the request extends past total_num_bits, sets *error to 1 and returns 0
// without advancing.
// NOTE(review): the byte-gathering loop shifts by 8 per byte up to d->end; when
// reached via decode_vbstream_bits at most 4 bytes remain, so the shift stays
// below 32 -- confirm that no other caller invokes this earlier in the stream.
static RADNOINLINE U32 decode_vbstream_bits_cold(rada_bit_decoder *d, int bitlength, int *error)
{
   // check whether actual data required goes off the end
   if (d->read_pos_in_bits + bitlength > d->total_num_bits) {
      *error = 1;
      return 0;
   }

   // can read 0 bits exactly at the end
   if (bitlength == 0)
      return 0;

   // if not, read as many valid bits as exist, then mask
   size_t first_byte = (d->read_pos_in_bits >> 3);
   U32 bits = d->bitstream[first_byte++];
   U32 shift = 8;
   while (d->bitstream+first_byte < d->end) {
      bits = bits + (d->bitstream[first_byte++] << shift);
      shift += 8;
   }
   bits >>= (d->read_pos_in_bits & 7);
   bits &= (1 << bitlength)-1;
   d->read_pos_in_bits += bitlength;
   return bits;
}

// Read 'bitlength' bits from 'd'. Uses an unchecked 32-bit little-endian load
// while the read position is below fast_num_bits; otherwise defers to the
// bounds-checked cold path, which is the only path that can set *error.
static RADFORCEINLINE U32 decode_vbstream_bits(rada_bit_decoder *d, int bitlength, int *error)
{
   // check for reading off the end...
   if (d->read_pos_in_bits < d->fast_num_bits) {
      // simple path
      size_t first_byte = (d->read_pos_in_bits >> 3);
      U32 bits = RR_GET32_LE(d->bitstream + first_byte);
      bits >>= (d->read_pos_in_bits & 7); // discard bits we're pointing past
      bits &= (1 << bitlength)-1;
      d->read_pos_in_bits += bitlength;
      return bits;
   } else {
      return decode_vbstream_bits_cold(d, bitlength, error);
   }
}

// Working state for the three-stream huffman decoder (see decode_huff_array).
typedef struct
{
   U8 *decodeptr; // Current write cursor for the two stream triples
   U8 *decodeend; // End of decoded bytes buffer for the two stream triples
   const U8 *bitp[3]; // Next byte to be read for the streams
   U32 bits[3]; // Current contents of bit buffer
   U32 bitc[3]; // Current number of valid bits in bit buffer
} rada_internal_huff_state;

// Maximum huffman code length, and the matching mask (2^11 - 1) used to index
// the decode table with the low bits of a bit buffer.
#define NEWLZ_HUFF_CODELEN_LIMIT 11
#define NEWLZ_HUFF_DECODE_TABLE_MASK 2047u

// 32-bit ARM implicitly masks 32-bit shift amounts by 255 (low 8 bits).
// All other current targets implicitly mask by 31 (low 5 bits). Either
// works for us, but we'd prefer not to get an extra AND, so use whatever
// the implicit mask is and rely on the compiler to clean it up.
#if defined(__RADARM__) && !defined(__RAD64__)
#define HUFF32LENMASK 255
#else
#define HUFF32LENMASK 31
#endif

// Decode symbols into [s->decodeptr, s->decodeend), cycling through the three
// streams in order 0,1,2 (one byte written per symbol). Streams 0 and 2 are
// read forwards and stream 1 is read backwards.
//
// A bulk loop runs while at least 4 bytes separate in2 and in1 and a full
// group of symbols remains; the final loop then reads at most the bytes that
// lie between the cursors.
//
// Returns false if the stream cursors cross or if the output is not filled
// exactly; on success writes the final cursors and bit buffers back to 's'.
// s->decodeptr is not written back by this function.
static rrbool huff_decode_precise_finish(rada_internal_huff_state * s, radaudio_huffman *huff)
{
   const U8 * in0 = s->bitp[0];
   const U8 * in1 = s->bitp[1];
   const U8 * in2 = s->bitp[2];

   U32 bits0 = s->bits[0], bitc0 = s->bitc[0];
   U32 bits1 = s->bits[1], bitc1 = s->bitc[1];
   U32 bits2 = s->bits[2], bitc2 = s->bitc[2];

   if (in0 > in2)
      return false;

   U8 *decodeptr = s->decodeptr;
   U8 *decodeend = s->decodeend;

   #define DECONE(strm) \
      peek = bits##strm & NEWLZ_HUFF_DECODE_TABLE_MASK; \
      cl = huff->decode[peek].length; \
      sym = huff->decode[peek].symbol; \
      bits##strm >>= cl & HUFF32LENMASK; bitc##strm -= cl; \
      *decodeptr++ = (U8) sym

   #define DECTHREE() \
      DECONE(0); \
      DECONE(1); \
      DECONE(2)

   RR_COMPILER_ASSERT( NEWLZ_HUFF_CODELEN_LIMIT <= 12 );

   #define N_DECS_PER_REFILL 2
   #define TRIPLE_DECS_PER_REFILL (3*N_DECS_PER_REFILL)

   // bulk loop to get within 4B of end
   if (in1 - in2 >= 4 && decodeend - decodeptr >= TRIPLE_DECS_PER_REFILL)
   {
      in1 -= 4;
      decodeend -= TRIPLE_DECS_PER_REFILL-1;

      while (decodeptr < decodeend)
      {
         // non-crossing invariant: in0 <= in2 && in2 <= in1
         if (in0 > in2 || in2 > in1)
            break;

         // non-crossing and 4B access size guarantee that the
         // following reads are safe; the decodeend decrement before the
         // loop guarantees that we don't write out of bounds.

         // refill :
         bits0 |= RR_GET32_LE(in0) << bitc0;
         in0 += (31 - bitc0)>>3; // bytes_consumed
         bitc0 |= 24; // same as += bytes_consumed<<3 here!

         bits1 |= RR_GET32_BE(in1) << bitc1;
         in1 -= (31 - bitc1)>>3; // bytes_consumed
         bitc1 |= 24; // same as += bytes_consumed<<3 here!

         bits2 |= RR_GET32_LE(in2) << bitc2;
         in2 += (31 - bitc2)>>3; // bytes_consumed
         bitc2 |= 24; // same as += bytes_consumed<<3 here!

         U32 peek; int cl; int sym;

         RR_COMPILER_ASSERT( N_DECS_PER_REFILL == 2 );
         DECTHREE();
         DECTHREE();
      }

      // undo the offsets applied before the bulk loop
      decodeend += TRIPLE_DECS_PER_REFILL-1;
      in1 += 4;

      // transition to final loop
      // (give whole unread bytes back to the streams, keeping only 0..7 bits buffered)
      in0 -= (bitc0 >> 3); bitc0 &= 7;
      in1 += (bitc1 >> 3); bitc1 &= 7;
      in2 -= (bitc2 >> 3); bitc2 &= 7;
   }

   // Final loop. This is really careful about the bytes it accesses.
   while (decodeptr < decodeend)
   {
      U32 peek, cl, sym;

      // refill to >=16b in bit0 buf
      if (in2 - in0 > 1)
         bits0 |= RR_GET16_LE(in0) << bitc0;
      else if (in2 - in0 == 1)
         bits0 |= in0[0] << bitc0;

      DECONE(0);
      in0 += (7 - bitc0) >> 3; bitc0 &= 7;

      if (decodeptr >= decodeend)
         break;

      // refill to >=16b left in bit1, bit2 bufs
      if (in1 - in2 > 1)
      {
         bits1 |= RR_GET16_BE(in1 - 2) << bitc1;
         bits2 |= RR_GET16_LE(in2) << bitc2;
      }
      else if (in1 - in2 == 1)
      {
         // accessing the same byte!
         bits1 |= in2[0] << bitc1;
         bits2 |= in2[0] << bitc2;
      }

      DECONE(1);
      in1 -= (7 - bitc1) >> 3; bitc1 &= 7;

      if (decodeptr >= decodeend)
         break;

      DECONE(2);
      in2 += (7 - bitc2) >> 3; bitc2 &= 7;

      if (in0 > in2 || in2 > in1) // corruption check
         return false;
   }

   if (decodeptr != decodeend)
      return false;

   #undef DECONE
   #undef DECTHREE
   #undef N_DECS_PER_REFILL
   #undef TRIPLE_DECS_PER_REFILL

   s->bitp[0] = in0; s->bits[0] = bits0; s->bitc[0] = bitc0;
   s->bitp[1] = in1; s->bits[1] = bits1; s->bitc[1] = bitc1;
   s->bitp[2] = in2; s->bits[2] = bits2; s->bitc[2] = bitc2;
   return true;
}

#if defined(__RAD64REGS__)
// 64-bit variant of the three-stream decoder: runs a bulk loop using 64-bit
// refills (3 to 5 symbols per stream per refill, chosen from the code length
// limit), writes the resulting cursors back into 's', then always finishes
// with huff_decode_precise_finish, whose result it returns.
static rrbool huff_decode_inner64(rada_internal_huff_state * s, radaudio_huffman *huff)
{
   // Layout: strm0-> | strm2-> | <-strm1
   const U8 * in0 = s->bitp[0];
   const U8 * in1 = s->bitp[1];
   const U8 * in2 = s->bitp[2];

   U8 * decodeptr = s->decodeptr;
   U8 * decodeend = s->decodeend;

   // NEWLZ_HUFF_CODELEN_LIMIT == 11 , could actually do 5 per refill = 10 per loop
   #if (56/NEWLZ_HUFF_CODELEN_LIMIT) >= 5
   #define N_DECS_PER_REFILL 5
   #elif (56/NEWLZ_HUFF_CODELEN_LIMIT) >= 4
   #define N_DECS_PER_REFILL 4
   #else
   #define N_DECS_PER_REFILL 3
   #endif
   #define TRIPLE_DECS_PER_REFILL (3*N_DECS_PER_REFILL)

   // bulk loop
   if (decodeend - decodeptr > TRIPLE_DECS_PER_REFILL-1 && in1 - in2 > 8) // @TODO: maybe test for going outside the buffer instead of this, since this might be true too often
   {
      // offset the end marker so we only run with full groups left
      decodeend -= TRIPLE_DECS_PER_REFILL-1;
      in1 -= 8;

      U64 bits0=s->bits[0], bitcount0 = s->bitc[0];
      U64 bits1=s->bits[1], bitcount1 = s->bitc[1];
      U64 bits2=s->bits[2], bitcount2 = s->bitc[2];

      // table entries are loaded as one little-endian U16: the low bits supply
      // the shift amount (masked with 63) and bits 8..15 the symbol
      const U8 *hufftab_base = &huff->decode[0].length;

      #define DECONE(strm) \
         /* NOTE(fg): This address calc is a single UBFIZ */ \
         tabv = (bits##strm & NEWLZ_HUFF_DECODE_TABLE_MASK) * sizeof(radaudio_huff_symbol); \
         tabv = RR_GET16_LE((const U16 *) (hufftab_base + tabv)); \
         bits##strm >>= tabv & 63; bitcount##strm -= tabv; \
         *decodeptr++ = (U8) (tabv >> 8)

      #define DECTHREE() \
         DECONE(0); \
         DECONE(1); \
         DECONE(2)

      while (decodeptr < decodeend)
      {
         // non-crossing invariant: in0 <= in2 && in2 <= in1
         if (in0 > in2) // if_unlikely
            break;
         if (in2 > in1) // if_unlikely
            break;

         // refill :
         U64 next0 = RR_GET64_LE(in0);
         bits0 |= next0 << bitcount0;
         in0 += (63 - bitcount0)>>3; // bytes_consumed
         bitcount0 |= 56; // same as += bytes_consumed<<3 here!

         U64 next1 = RR_GET64_BE(in1);
         bits1 |= next1 << bitcount1;
         in1 -= (63 - bitcount1)>>3; // bytes_consumed
         bitcount1 |= 56; // same as += bytes_consumed<<3 here!

         U64 next2 = RR_GET64_LE(in2);
         bits2 |= next2 << bitcount2;
         in2 += (63 - bitcount2)>>3; // bytes_consumed
         bitcount2 |= 56; // same as += bytes_consumed<<3 here!

         U32 tabv;

         RR_COMPILER_ASSERT( N_DECS_PER_REFILL >= 3 && N_DECS_PER_REFILL <= 5 );
         DECTHREE();
         DECTHREE();
         DECTHREE();
         #if N_DECS_PER_REFILL > 3
         DECTHREE();
         #endif
         #if N_DECS_PER_REFILL > 4
         DECTHREE();
         #endif

         // our decode process puts some crap in the top bits; clear them
         bitcount0 &= 63;
         bitcount1 &= 63;
         bitcount2 &= 63;
      }

      #undef DECONE
      #undef DECTHREE

      in1 += 8;

      // transition to careful loop
      // (hand back whole buffered bytes; keep only the low 0..7 bits per stream)
      s->decodeptr = decodeptr;
      s->bitp[0] = in0 - (bitcount0 >> 3); s->bits[0] = (U32) (bits0 & 0xff); s->bitc[0] = bitcount0 & 7;
      s->bitp[1] = in1 + (bitcount1 >> 3); s->bits[1] = (U32) (bits1 & 0xff); s->bitc[1] = bitcount1 & 7;
      s->bitp[2] = in2 - (bitcount2 >> 3); s->bits[2] = (U32) (bits2 & 0xff); s->bitc[2] = bitcount2 & 7;
   }

   #undef N_DECS_PER_REFILL
   #undef TRIPLE_DECS_PER_REFILL

   return huff_decode_precise_finish(s, huff);
}
#endif

// Decode 'length' huffman symbols into 'array' from the three streams in 'ds'.
//
// Converts each stream's bit-granular read position into the byte pointer +
// partial-byte bit buffer form used by the inner decoders (streams 0 and 2
// index forwards from their bitstream pointer, stream 1 indexes backwards),
// runs the decoder, then converts the final cursors back into bit positions.
// Sets *error to 1 if the inner decoder reports failure; never clears it.
static void decode_huff_array(huff3_decoder *ds, radaudio_huffman *huff, U8 *array, int length, int *error)
{
   rada_internal_huff_state s;
   s.decodeptr = array;
   s.decodeend = array+length;

   // generate Huff3 decoder state from our naive state
   for (int i=0; i <= 2; i += 2) {
      s.bitp[i] = &ds->stream[i].bitstream[ds->stream[i].read_pos_in_bits>>3];
      s.bitc[i] = (0-ds->stream[i].read_pos_in_bits) & 7; // read pos of 2 => 6 bits left
      if (s.bitc[i] == 0)
         s.bits[i] = 0;
      else {
         s.bits[i] = *(s.bitp[i]) >> (8-s.bitc[i]);
         ++s.bitp[i];
      }
   }

   // stream 1 runs backwards: its bytes sit below the bitstream pointer
   s.bitp[1] = &ds->stream[1].bitstream[-(int)(ds->stream[1].read_pos_in_bits>>3)];
   s.bitc[1] = (0-ds->stream[1].read_pos_in_bits) & 7; // read pos of 2 => 6 bits left
   if (s.bitc[1] == 0)
      s.bits[1] = 0;
   else {
      s.bits[1] = *(s.bitp[1]-1) >> (8-s.bitc[1]);
      --s.bitp[1];
   }

   #ifdef __RAD64REGS__
   if (!huff_decode_inner64(&s, huff))
      *error = 1;
   #else
   if (!huff_decode_precise_finish(&s, huff))
      *error = 1;
   #endif

   ds->stream[0].read_pos_in_bits = (int) (8*(s.bitp[0] - ds->stream[0].bitstream) - s.bitc[0]);
   ds->stream[2].read_pos_in_bits = (int) (8*(s.bitp[2] - ds->stream[2].bitstream) - s.bitc[2]);
   ds->stream[1].read_pos_in_bits = (int) (8*(ds->stream[1].bitstream - s.bitp[1]) - s.bitc[1]);
}

////////////////////////////////////////////////////////////////////////////////

// Window and overlap-add the current block's forward data with the S16 samples
// saved from the previous block, writing n floats to 'output'.
//
// Dispatches to AVX2/SSE2 or NEON implementations when built in and supported,
// otherwise runs the scalar loops at the bottom. The revlen, block_number,
// channel and stream_offset parameters are not used in this function.
static void compute_windowed_sum_multiple64(radaudio_decoder_state *ds, float *output, int n, const float *fwd_data, S16 *rev_data, int revlen, int revoff, float rev_scale, const float *window, int block_number, int channel, int stream_offset)
{
   rrAssert(n % 64 == 0);

   // Starting point:
   //   output[0:n] = fwd_c[0:n] .* window[0:n] + rev_scale * rev_c[revoff:revoff+n] .* reverse(window[0:n])
   //
   // let n2 = n/2, then the IMDCT symmetries mean that (when both blocks have same length, I'll account for revoff later)
   //
   //   fwd_c[0:n2] = -reverse(fwd_c[n2:n])
   //   rev_c[n2:n] = reverse(rev_c[0:n2])
   //
   // and therefore we can work with just the middle samples (i.e. the back half of fwd_c and the front
   // half of rev_c). To exploit this systematically, split the loop into two halves at n2:
   //
   //   output[0:n2] = fwd_c[0:n2] .* window[0:n2] + rev_scale * rev_c[revoff+0:revoff+n2] .* reverse(window[n2:n])
   //   output[n2:n] = fwd_c[n2:n] .* window[n2:n] + rev_scale * rev_c[revoff+n2:revoff+n] .* reverse(window[0:n2])
   //
   // note rev_c is symmetric about revoff+n2, so rev_c[revoff+n2:revoff+n] = reverse(rev_c[revoff+0:revoff+n2]).
   // (This is the second symmetry, accounting for potential differences in MDCT size.)
   //
   // Define:
   //   fwd[0:n2] = fwd_c[n2:n]
   //   rev[0:n2] = rev_scale * rev_c[revoff:revoff+n2]
   //
   // and then use the symmetries and algebra to get
   //
   //   output[0:n2] = -reverse(fwd) .* window[0:n2] + rev .* reverse(window[n2:n])
   //                = rev .* reverse(window[n2:n]) - reverse(fwd) .* window[0:n2]
   //
   //   output[n2:n] = fwd .* window[n2:n] + reverse(rev) .* reverse(window[0:n2])
   //                = reverse(rev .* window[0:n2]) + fwd .* window[n2:n]

   const float *fwd = fwd_data; // NOTE: second half of the forward data, first half is implied by odd symmetry
   S16 *rev = rev_data + revoff;

#if defined(DO_BUILD_SSE4) || defined(DO_BUILD_AVX2)
   if (ds->cpu.has_sse2) {
#ifdef DO_BUILD_AVX2
      if (ds->cpu.has_avx2)
         radaudio_avx2_compute_windowed_sum_multiple16(output, n, fwd, rev, rev_scale, window);
      else
#endif
         radaudio_sse2_compute_windowed_sum_multiple8(output, n, fwd, rev, rev_scale, window);
      return;
   }
#endif

#if defined(DO_BUILD_NEON)
   radaudio_neon_compute_windowed_sum_multiple8(output, n, fwd, rev, rev_scale, window);
#else
   SINTa N2 = n >> 1;
   // first half: output[0:n2] per the derivation above
   for (SINTa j = 0; j < N2; ++j) {
      output[j] = rev_scale * rev[j] * window[n-1-j] - fwd[N2-1-j] * window[j];
   }
   // second half: output[n2:n]
   for (SINTa j = 0; j < N2; ++j) {
      output[j+N2] = rev_scale * rev[N2-1-j] * window[N2-1-j] + fwd[j] * window[N2+j];
   }
#endif
}

// Copy n floats from input to output (n must be a multiple of 16).
static void copy_samples_multiple16(float *output, int n, const float *input)
{
   rrAssert(n % 16 == 0); // potentially rely on it being aligned
   memcpy(output, input, 4*n);
}

// Convert n S16 samples to float, multiplying each by 'rescale'
// (n must be a multiple of 16; the loop is unrolled by 8).
static void copy_samples_multiple16_scaled(float *output, int n, const S16 *input, float rescale)
{
   rrAssert(n % 16 == 0);
   for (int i=0; i < n; i += 8) {
      output[i+0] = input[i+0] * rescale;
      output[i+1] = input[i+1] * rescale;
      output[i+2] = input[i+2] * rescale;
      output[i+3] = input[i+3] * rescale;
      output[i+4] = input[i+4] * rescale;
      output[i+5] = input[i+5] * rescale;
      output[i+6] = input[i+6] * rescale;
      output[i+7] = input[i+7] * rescale;
   }
}

// Expand a single seed into four 32-bit generator states: the seed itself, a
// multiplicative hash of (seed + 5000), and each of those XORed with 0x55555555.
static void build_rand_state(U32 *rand_state, U32 randval)
{
   U32 r2 = (U32) (((randval + 5000) * (U64) 0xc4ceb9fe1a85ec53ULL) >> 33);
   rand_state[0] = randval;
   rand_state[1] = r2;
   rand_state[2] = randval ^ 0x55555555;
   rand_state[3] = r2 ^ 0x55555555;
}

// Replace all-zero subbands of a long block with pseudo-random nonzero values
// drawn from random_table (values in -8..8, never 0), in place.
//
// The scalar path walks subbands in order: a run with 4 coefficients, a run
// with 8 coefficients, then 16 coefficients each up to num_subbands.
static void randomize_long_block_8x8_Nx16(radaudio_decoder_state *ds, S8 *quantized_coeff, U32 randval, int num_subbands, int *num_coeffs_for_band)
{
   RAD_ALIGN(U32, rand_state[4], 16);
   build_rand_state(rand_state, randval);

   static S8 random_table[16] = { -1,1, -2,2, -3,3, -4,4, -5,5, -6,6, -7,7, -8,8 };

   // SIMD: compute 4 independent randvals in parallel... the encoder doesn't care what the random
   // values are, so they should be stable, but don't have to be the same as the current code

#ifdef DO_BUILD_SSE4
   if (ds->cpu.has_sse4_1) {
      radaudio_sse4_randomize_long_block_8x8_Nx16(quantized_coeff, rand_state, num_subbands);
      return;
   }
#endif

   int j;
   int cb = 0;
   U32 randval0 = rand_state[0];
   U32 randval1 = rand_state[1];

   // NOTE(review): this loop handles 4-coefficient subbands but tests 8 bytes
   // (64 bits) for zero, so the following subband's first 4 coefficients also
   // take part in the test -- confirm whether that is intended.
   for (j=0; num_coeffs_for_band[j] == 4; ++j) {
      if (RR_GET64_NATIVE(&quantized_coeff[cb]) == 0) {
         U32 rbits = randval0 >> 4;
         randval0 = lcg(randval0);
         for (int i=0; i < 4; ++i) {
            quantized_coeff[cb+i] = random_table[rbits & 15];
            rbits >>= 8;
         }
      }
      cb += 4;
   }

   // 8-coefficient subbands: two generators, four values each
   for (; num_coeffs_for_band[j] == 8; ++j) {
      if (RR_GET64_NATIVE(&quantized_coeff[cb]) == 0) {
         U32 rbits = randval0 >> 4;
         randval0 = lcg(randval0);
         for (int i=0; i < 4; ++i) {
            quantized_coeff[cb+i] = random_table[rbits & 15];
            rbits >>= 8;
         }
         rbits = randval1 >> 4;
         randval1 = lcg(randval1);
         for (int i=4; i < 8; ++i) {
            quantized_coeff[cb+i] = random_table[rbits & 15];
            rbits >>= 8;
         }
      }
      cb += 8;
   }
   rand_state[0] = randval0;
   rand_state[1] = randval1;

   // remaining subbands are treated as 16 coefficients: all four generators
   for (; j < num_subbands; ++j) {
      if ((RR_GET64_NATIVE(&quantized_coeff[cb+0]) | RR_GET64_NATIVE(&quantized_coeff[cb+8])) == 0) {
         U32 rbits = rand_state[0] >> 4;
         rand_state[0] = lcg(rand_state[0]);
         for (int i=0; i < 4; ++i) {
            quantized_coeff[cb+i] = random_table[rbits & 15];
            rbits >>= 8;
         }
         rbits = rand_state[1] >> 4;
         rand_state[1] = lcg(rand_state[1]);
         for (int i=4; i < 8; ++i) {
            quantized_coeff[cb+i] = random_table[rbits & 15];
            rbits >>= 8;
         }
         rbits = rand_state[2] >> 4;
         rand_state[2] = lcg(rand_state[2]);
         for (int i=8; i < 12; ++i) {
            quantized_coeff[cb+i] = random_table[rbits & 15];
            rbits >>= 8;
         }
         rbits = rand_state[3] >> 4;
         rand_state[3] = lcg(rand_state[3]);
         for (int i=12; i < 16; ++i) {
            quantized_coeff[cb+i] = random_table[rbits & 15];
            rbits >>= 8;
         }
      }
      cb += 16;
   }
}

// Replace all-zero bands of a short block with pseudo-random nonzero values,
// in place. The band layout is hard-coded: 4 bands of 1 coefficient, 4 bands
// of 2, 5 bands of 4, then bands whose sizes come from num_coeffs_for_band.
static void randomize_short_block(S8 quantized_coeff[], U32 randval, int num_bands, int *num_coeffs_for_band)
{
   static S8 random_table[16] = { -1,1, -2,2, -3,3, -4,4, -5,5, -6,6, -7,7, -8,8 };
   int cb = 0;
   U32 rbits = randval;
   randval = lcg(randval);
   rbits >>= 10;

   // Bands 0..3 are 1 coefficient
   // (only table entries 0 and 1 are used here, i.e. -1 or +1)
   for (int j=0; j < 4; ++j) {
      if (quantized_coeff[j] == 0) {
         quantized_coeff[j] = random_table[rbits & 1];
      }
      rbits >>= 4;
   }

   // Bands 4..7 are 2 coefficients each
   for (int j=4; j < 8; ++j) {
      if (RR_GET16_LE_UNALIGNED(&quantized_coeff[j*2-4]) == 0) {
         rbits = randval, randval = lcg(randval);
         rbits >>= 20;
         quantized_coeff[j*2-4] = random_table[rbits & 15];
         quantized_coeff[j*2-3] = random_table[(rbits >> 4) & 15];
      }
   }

   // Bands 8..12 are 4 coefficients each
   for (int j=8; j < 13; ++j) {
      if (RR_GET32_LE_UNALIGNED(&quantized_coeff[j*4-20]) == 0) {
         rbits = randval, randval = lcg(randval);
         quantized_coeff[j*4-20] = random_table[(rbits >> 12) & 15];
         quantized_coeff[j*4-19] = random_table[(rbits >> 16) & 15];
         quantized_coeff[j*4-18] = random_table[(rbits >> 20) & 15];
         quantized_coeff[j*4-17] = random_table[(rbits >> 24) & 15];
      }
   }

   // Remaining bands have 16 or 32 coeffs
   cb = 4*1 + 4*2 + 5*4;
   for (int j=13; j < num_bands; ++j) {
      int i;
      U32 sum1=0, sum2=0;
      int num = num_coeffs_for_band[j];
      // OR together every coefficient in the band to test for all-zero
      for (i=0; i < num; i += 8) { // should be 16 or 32
         sum1 |= RR_GET32_LE_UNALIGNED(&quantized_coeff[cb+i+0]);
         sum2 |= RR_GET32_LE_UNALIGNED(&quantized_coeff[cb+i+4]);
      }
      if ((sum1|sum2) == 0) {
         for (i=0; i+7 < num; i += 8) {
            rbits = randval, randval = lcg(randval);
            for (int k=0; k < 8; ++k) {
               quantized_coeff[cb+i+k] = random_table[rbits & 15];
               rbits >>= 4;
            }
         }
         // leftover coefficients when num is not a multiple of 8; the
         // generator is advanced once more even if there are none
         rbits = randval, randval = lcg(randval);
         for (; i < num; ++i) {
            quantized_coeff[cb+i] = random_table[rbits & 15];
            rbits >>= 4;
         }
      }
      cb += num_coeffs_for_band[j];
   }
}

// Count how many of the first num_bytes bytes of 'data' are < threshold.
// Dispatches to SSE2/NEON helpers when available; scalar loop otherwise.
static int count_bytes_below_value_sentinel16(radaudio_decoder_state *ds, U8 *data, int num_bytes, U8 threshold)
{
#ifdef DO_BUILD_SSE4
   if (ds->cpu.has_sse2)
      return radaudio_sse2_count_bytes_below_value_sentinel16(data, num_bytes, threshold);
#endif

#if defined(DO_BUILD_NEON)
   return radaudio_neon_count_bytes_below_value_sentinel16(data, num_bytes, threshold);
#else
   int num=0;
   for (int i=0; i < num_bytes; ++i) {
      num += (data[i] < threshold);
   }
   return num;
#endif
}

// Return the total number of set bits in data[0..num_bytes).
// overwrites up to 7 bytes of space at end of array if not a multiple of 8
// (the scalar paths zero the padding after the data so whole words can be read)
static int count_set_bits_multiple8_sentinel8(radaudio_decoder_state *ds, U8 *data, int num_bytes)
{
#ifdef DO_BUILD_SSE4
   if (ds->cpu.has_sse4_1) // @TODO: ds->cpu.has_popcnt
      return radaudio_intel_popcnt_count_set_bits_read_multiple8_sentinel8(data, num_bytes);
#endif

#ifdef DO_BUILD_NEON
   return radaudio_neon_count_set_bits_read_multiple8_sentinel8(data, num_bytes);
#endif

#ifndef DO_BUILD_NEON // for unreachable code warnings
#ifdef __RAD64REGS__
   // 64-bit scalar code
   int num=0;
   int padded_size = (num_bytes + 7) & ~7;
   if (num_bytes != padded_size)
      RR_PUT64_NATIVE(&data[num_bytes], 0);
   for (int i=0; i < padded_size; i += 8) {
      U64 value = RR_GET64_NATIVE(&data[i]);
      value = value - ((value >> 1) & 0x5555555555555555ull); // for pairs of bits: 00->00, 01->01, 10->01, 11->10
      // sums across groups of 2 bits -> sums across groups of 8 bits
      // skipping the groups-of-4 stage to get a wider reduction tree with fewer constants
      U64 threes = 0x0303030303030303ull;
      value = (value & threes) + ((value >> 2) & threes) + ((value >> 4) & threes) + ((value >> 6) & threes);
      // sum the bytes (can't overflow)
      value = (value * 0x0101010101010101ull) >> 56;
      num += (int)value;
   }
#else
   // 32-bit scalar code
   int num=0;
   int padded_size = (num_bytes + 3) & ~3;
   if (num_bytes != padded_size)
      RR_PUT32_NATIVE(&data[num_bytes], 0);
   for (int i=0; i < padded_size; i += 4) {
      U32 value = RR_GET32_NATIVE(&data[i]);
      value = value - ((value >> 1) & 0x55555555); // for pairs of bits: 00->00, 01->01, 10->01, 11->10
      value = (value & 0x33333333) + ((value>> 2) & 0x33333333);
      value = (value & 0x0f0f0f0f) + ((value>> 4) & 0x0f0f0f0f);
      value = (value * 0x01010101) >> 24;
      num += (int)value;
   }
#endif
   return num;
#endif // !DO_BUILD_NEON
}

// Split each packed byte into two values: low nibble first, then high nibble.
//
// guarantees a multiple of 16 bytes is written, with the extra bytes having the value of 1:
//   scalar: reads exact bytes specified, writes an extra 16 bytes of "1"
//   SSE   : writes multiple of 32 bytes with extras equal to "1", also writes 16 bytes starting at &packed[num_packed]
static void unpack_nibbles_input_excess16_output_excess16_multiple32_default1(radaudio_decoder_state *ds, S8 *unpacked, U8 *packed, int num_packed)
{
#ifdef DO_BUILD_SSE4
   if (ds->cpu.has_sse2) {
      radaudio_sse2_unpack_nibbles_read_sentinel16_write_multiple32(unpacked, packed, num_packed, 0x1111111111111111ull);
      return;
   }
#endif

#ifdef DO_BUILD_NEON
   {
      radaudio_neon_unpack_nibbles_read_sentinel16_write_multiple32(unpacked, packed, num_packed, 0x1111111111111111ull);
      return;
   }
#endif

#ifndef DO_BUILD_NEON // for unreachable code warnings
   for (int i=0; i < num_packed; ++i) {
      unpacked[i*2+0] = (S8) (packed[i] & 15);
      unpacked[i*2+1] = (S8) (packed[i] >> 4);
   }
   // pad with 16 bytes of value 1 after the unpacked data
   RR_PUT64_NATIVE(&unpacked[num_packed*2+0], 0x0101010101010101ull);
   RR_PUT64_NATIVE(&unpacked[num_packed*2+8], 0x0101010101010101ull);
#endif // !DO_BUILD_NEON
}

// if coefficient is 0, then read it from the big coefficient array
// otherwise, remove the +8 bias by subtracting 8
//
// Returns false if more big coefficients are needed than exist before
// big_limit. The SIMD helpers are only used when more than 15 bytes lie
// between big_limit and safe_read; otherwise the scalar loop runs.
static rrbool expand_nonzero_coefficients(radaudio_decoder_state *ds, S8 *nonzero_coefficients, int num_nonzero, S8 *big_coeff, S8 *big_limit, S8 *safe_read)
{
   if (safe_read - big_limit > 15) {
#ifdef DO_BUILD_SSE4
      if (ds->cpu.has_sse2) {
         return radaudio_sse2_expand_coefficients_excess_read15(nonzero_coefficients, num_nonzero, big_coeff, big_limit);
      }
#endif
#ifdef DO_BUILD_NEON
      return radaudio_neon_expand_coefficients_excess_read15(nonzero_coefficients, num_nonzero, big_coeff, big_limit);
#endif
   }
   // else fall through to scalar

   for (int i = 0; i < num_nonzero; ++i) {
      if (nonzero_coefficients[i] == 0) {
         if (big_coeff == big_limit)
            return false; // overread error
         nonzero_coefficients[i] = *big_coeff++;
      } else
         nonzero_coefficients[i] -= 8;
   }
   return true;
}

// Compute band_energy[j] for each band from its exponent, fine energy and
// per-band decode scale. Dispatches to SSE2/NEON helpers when available.
//
// NOTE(review): the scalar fallback below is damaged in this copy of the file:
// text between the '<' after "1" and a later '>' appears to have been stripped,
// so the definitions of 'fe' and 'ce' (and any use of band_exponent) are
// missing. Restore them from the upstream source before relying on this path.
static void compute_band_energy_multiple4(radaudio_decoder_state *ds, F32 *band_energy, int num_bands, int band_exponent[], U16 fine_energy[], F32 band_scale_decode[])
{
#ifdef DO_BUILD_SSE4
   if (ds->cpu.has_sse2) {
      radaudio_sse2_compute_band_energy_multiple4(band_energy, num_bands, band_exponent, fine_energy, band_scale_decode);
      return;
   }
#endif

#ifdef DO_BUILD_NEON
   radaudio_neon_compute_band_energy_multiple4(band_energy, num_bands, band_exponent, fine_energy, band_scale_decode);
#else
   for (int j=0; j < num_bands; ++j) { // safe to run 24 times for SIMD
      int qe = fine_energy[j]; // quantized energy, in [0, 1< (1<<30>>14) => 1<<16
      band_energy[j] = (fe * ce) * band_scale_decode[j];
   }
#endif
}

// Distribute each band's energy across its subbands in proportion to the
// quantized subband values: subband_energy = band_energy * q / sqrt(sum(q^2)).
// The leading bands that have exactly one subband are skipped; subband indices
// for the remaining bands start right after them.
static void compute_subband_energy_skip12_excess_read7(radaudio_decoder_state *ds, F32 *subband_energy, const F32 *band_energy, int num_bands, int num_subbands, int *num_subbands_for_band, U16 *quantized_subbands)
{
#ifdef DO_BUILD_SSE4
   if (ds->cpu.has_sse4_1) {
      radaudio_sse4_compute_subband_energy_skip12_excess_read7(subband_energy, band_energy, num_bands, num_subbands_for_band, quantized_subbands);
      return;
   }
#endif

#ifdef DO_BUILD_NEON
   radaudio_neon_compute_subband_energy_skip12_excess_read7(subband_energy, band_energy, num_bands, num_subbands_for_band, quantized_subbands);
#else
   int start, j;
   // skip the leading single-subband bands
   for (j=0; num_subbands_for_band[j] == 1; ++j)
      ;
   start = j;
   for (; j < num_bands; ++j) {
      int sum=0;
      int num = num_subbands_for_band[j];
      // these loops are pretty random lengths, for example, at 44.1Khz, they're: 2,2,2,2,3,4,9,10,12 iterations
      for (int i=0; i < num; ++i) {
         sum += (quantized_subbands[start+i] * quantized_subbands[start+i]);
      }
      F32 scale = band_energy[j] / sqrtf((F32) sum);
      rrAssert(!isnan(band_energy[j]));
      rrAssert(sum != 0);
      for (int i=0; i < num; ++i) {
         subband_energy[start+i] = scale * quantized_subbands[start+i];
      }
      start += num;
   }
   rrAssert(start == num_subbands);
#endif
}

// Scatter nonzero coefficients into quantized_coeff[0..num_coeff) according to
// a bitmap: every set bit in nonzero_flagbits (64 flags per 8 bytes, read
// little-endian) consumes the next entry of nonzero_coeffs; all other
// positions are zero. *pcur_nonzero_coeffs is the running read index into
// nonzero_coeffs and is updated on return.
static void distribute_bitflag_coefficients_multiple64(radaudio_decoder_state *ds, S8 *quantized_coeff, int num_coeff, U8 *nonzero_flagbits, S8 *nonzero_coeffs, int *pcur_nonzero_coeffs)
{
#ifdef DO_BUILD_SSE4
   if (ds->cpu.has_ssse3) {
      radaudio_ssse3_distribute_bitflag_coefficients_multiple16(
         ds->cpu, quantized_coeff, num_coeff, nonzero_flagbits, nonzero_coeffs, pcur_nonzero_coeffs);
      return;
   }
#endif

#ifdef DO_BUILD_NEON
   {
      radaudio_neon_distribute_bitflag_coefficients_multiple16(
         quantized_coeff, num_coeff, nonzero_flagbits, nonzero_coeffs, pcur_nonzero_coeffs);
      return;
   }
#endif

#ifndef DO_BUILD_NEON // for unreachable code warnings
   int cur_nonzero_coeffs = *pcur_nonzero_coeffs;
   memset(quantized_coeff, 0, num_coeff);

   // use a run-length style scheme using bit scans to reduce branch mispredictions
   int pos=0;
   for (int i=0; i < num_coeff; i += 64) {
      U64 flags = RR_GET64_LE(nonzero_flagbits + pos);
      pos += 8;
      // even though the run is never long--we could just use a small lookup table--let's do it right
      int offset = i;
      while (flags) {
         SINTa dist = rrCtz64(flags);
         quantized_coeff[offset+dist] = nonzero_coeffs[cur_nonzero_coeffs++];
         flags = rrClearLowestSetBit64(flags);
      }
   }
   *pcur_nonzero_coeffs = cur_nonzero_coeffs;
#endif // !DO_BUILD_NEON
}

// Fill quantized_coeff[0..num_coeff32) from the nonzero coefficient stream.
//
// The first num_nonzero_flagbits coefficients (if any) are placed via the
// bitmap path above; the rest are placed by zero-run lengths read from
// runlength_data until END_OF_ZERORUN. A run of MAX_RUNLEN skips that many
// zeros without emitting a coefficient. Both cursor outputs are updated only
// on success; returns false if a coefficient would land at or past num_coeff32.
static rrbool distribute_nonzero_coefficients(radaudio_decoder_state *ds,
                                              S8 *quantized_coeff, int num_coeff32,
                                              U8 *runlength_data, int *pcur_runlength_data, // there's guaranteed sentinels, so don't need length
                                              S8 *nonzero_coeffs, int *pcur_nonzero_coeffs,
                                              U8 *nonzero_flagbits, int num_nonzero_flagbits,
                                              int channel)
{
   RR_UNUSED_VARIABLE(channel);
   SINTa num_coeff = num_coeff32;
   SINTa k=0;

   if (num_nonzero_flagbits) {
      PROF_BEGIN(distribute_bitflag);
      distribute_bitflag_coefficients_multiple64(ds, quantized_coeff, num_nonzero_flagbits, nonzero_flagbits, nonzero_coeffs, pcur_nonzero_coeffs);
      PROF_END(distribute_bitflag);
      k = num_nonzero_flagbits;
   }

   const U8 *runlens = runlength_data + *pcur_runlength_data;
   const S8 *nzcoeffs = nonzero_coeffs + *pcur_nonzero_coeffs;

   PROF_BEGIN(distribute_rle);
   memset(quantized_coeff+k, 0, num_coeff-k);

   // tried a branchless version of this using the slot[] logic from above, but saw no gain
   // we put in sentinels that guarantee this loop will see a END_OF_ZERORUN
   for(;;) {
      U8 rl = *runlens++;
      if (rl == END_OF_ZERORUN)
         break;
      k += rl;
      if (rl < MAX_RUNLEN) {
         if (k >= num_coeff)
            return false;
         quantized_coeff[k] = *nzcoeffs++;
         ++k;
      }
   }

   *pcur_runlength_data = (int)(SINTa)(runlens - runlength_data);
   *pcur_nonzero_coeffs = (int)(SINTa)(nzcoeffs - nonzero_coeffs);
   PROF_END(distribute_rle);
   return true;
}

// Dequantize a long block: each subband's coefficients are normalized to unit
// length and multiplied by that subband's energy; coefficients past the last
// subband, up to RADAUDIO_LONG_BLOCK_LEN, are set to zero.
static void dequantize_long_block_8x8_Nx16(radaudio_decoder_state *ds, float *coeffs, S8 *quantized_coeff, float *subband_energy, int num_subbands, int *num_coeffs_for_band)
{
#ifdef DO_BUILD_SSE4
   if (ds->cpu.has_sse4_1) {
      radaudio_sse4_dequantize_long_block_8x8_Nx16(coeffs, quantized_coeff, subband_energy, num_subbands);
      return;
   }
#endif

   int cb=0;
   int j=0;
   // first 8 subbands should be 8 coefficients long
   // (the write loop always stores 8 values; for a shorter subband the extra
   // outputs are overwritten when the following subband is processed)
   while (num_coeffs_for_band[j] < 16) {
      F32 sum=1.e-20f,scale;
      for (int i=0; i < num_coeffs_for_band[j]; ++i) {
         F32 n = (F32) quantized_coeff[cb+i];
         sum += n*n;
      }
      scale = subband_energy[j] / sqrtf(sum);
      for (int i=0; i < 8; ++i)
         coeffs[cb+i] = quantized_coeff[cb+i] * scale;
      cb += num_coeffs_for_band[j];
      ++j;
   }
   // all remaining subbands are 16 coefficients long, so we don't have to check bands
   for (; j < num_subbands; ++j) {
      F32 sum=1.e-20f,scale;
      for (int i=0; i < 16; ++i) {
         F32 n = (F32) quantized_coeff[cb+i];
         sum += n*n;
      }
      scale = subband_energy[j] / sqrtf(sum);
      for (int i=0; i < 16; ++i)
         coeffs[cb+i] = quantized_coeff[cb+i] * scale;
      cb += 16;
   }
   for (int i=cb; i < RADAUDIO_LONG_BLOCK_LEN; ++i)
      coeffs[i] = 0;
}

// Dequantize a long block after replacing all-zero subbands with random
// values. SSE4.1 and NEON builds use fused helpers seeded via
// build_rand_state; otherwise this is randomize_long_block_8x8_Nx16 followed
// by dequantize_long_block_8x8_Nx16.
static void dequantize_long_block_with_random_8x8_Nx16(radaudio_decoder_state *ds, F32 *coeffs, S8 *quantized_coeff, F32 *subband_energy, int num_subbands, int *num_coeffs_for_band, U32 randval)
{
#ifdef DO_BUILD_SSE4
   if (ds->cpu.has_sse4_1) {
      RAD_ALIGN(U32, rand_state[4], 16);
      build_rand_state(rand_state, randval);
      radaudio_sse4_dequantize_long_block_replace_0_with_random_8x8_Nx16(coeffs, quantized_coeff, subband_energy, num_subbands, rand_state);
      return;
   }
#endif

#ifdef DO_BUILD_NEON
   {
      RAD_ALIGN(U32, rand_state[4], 16);
      build_rand_state(rand_state, randval);
      radaudio_neon_dequantize_long_block_replace_0_with_random_8x8_Nx16(coeffs, quantized_coeff, subband_energy, num_subbands, rand_state);
      return;
   }
#endif

#ifndef DO_BUILD_NEON // for unreachable code warnings
   randomize_long_block_8x8_Nx16(ds, quantized_coeff, randval, num_subbands, num_coeffs_for_band);
   dequantize_long_block_8x8_Nx16(ds, coeffs, quantized_coeff, subband_energy, num_subbands, num_coeffs_for_band);
#endif // !DO_BUILD_NEON
}

// Scalar short-block dequantizer: each band's coefficients are normalized to
// unit length and scaled by band_energy[j]. The layout of the first 12 bands
// is hard-coded (4x1, 4x2, 4x4 coefficients); from band 12 on the sizes come
// from num_coeffs_for_band. Coefficients past the last band, up to
// RADAUDIO_SHORT_BLOCK_LEN, are set to zero.
static void scalar_dequantize_short_block(float *coeffs, S8 *quantized_coeff, float *band_energy, int num_bands, int *num_coeffs_for_band)
{
   int i,j;
   int cb=0;

   // first 4 bands are 1 coefficient long, and coefficient is always 1 or -1
   for (j=0; j < 4; ++j) {
      rrAssert(abs(quantized_coeff[j]) == 1);
      coeffs[j] = (F32) quantized_coeff[j] * band_energy[j];
   }

   // next 4 bands are 2 coefficients long
   for (j=0; j < 4; ++j) {
      float x = (F32) quantized_coeff[4+j*2+0];
      float y = (F32) quantized_coeff[4+j*2+1];
      float scale = band_energy[4+j] / sqrtf(x*x+y*y+1.e-20f);
      coeffs[4+j*2+0] = x*scale;
      coeffs[4+j*2+1] = y*scale;
   }

   // next 4 bands are 4 coefficients long
   // (actually next 5 bands are)
   cb = 4*1 + 4*2;
   for (j=8; j < 12; ++j) {
      float sum=1.e-20f, scale;
      for (i=0; i < 4; ++i) {
         float n = (F32) quantized_coeff[cb+i];
         sum += n*n;
      }
      scale = band_energy[j] / sqrtf(sum);
      for (i=0; i < 4; ++i)
         coeffs[cb+i] = (F32) quantized_coeff[cb+i] * scale;
      cb += 4;
   }

   // now we have either [4,16,16,16,32]
   // or [4,16,16,32,32] for lower sample rates
   cb = 4*1 + 4*2 + 4*4;
   for (j=12; j < num_bands; ++j) {
      int count = num_coeffs_for_band[j];
      F32 sum=1.e-20f,scale;
      for (i=0; i < count; i += 4) {
         for (int k=0; k < 4; ++k) {
            F32 n = (F32) quantized_coeff[cb+i+k];
            sum += n*n;
         }
      }
      scale = band_energy[j] / sqrtf(sum);
      for (i=0; i < count; i += 4) {
         for (int k=0; k < 4; ++k) {
            coeffs[cb+i+k] = quantized_coeff[cb+i+k] * scale;
         }
      }
      cb += count;
   }
   for (i=cb; i < RADAUDIO_SHORT_BLOCK_LEN; ++i)
      coeffs[i] = 0;
}

// Dequantize a short block, dispatching to the SSE4.1 or NEON implementation
// when available and to scalar_dequantize_short_block otherwise.
static void dequantize_short_block(radaudio_decoder_state *ds, float *coeffs, S8 *quantized_coeff, float *band_energy, int num_bands, int *num_coeffs_for_band)
{
#ifdef DO_BUILD_SSE4
   if (ds->cpu.has_sse4_1) {
      radaudio_sse4_dequantize_short_block_sse4(coeffs, quantized_coeff, band_energy, num_bands, num_coeffs_for_band);
      return;
   }
#endif

#ifdef DO_BUILD_NEON
   {
      radaudio_neon_dequantize_short_block(coeffs, quantized_coeff, band_energy, num_bands, num_coeffs_for_band);
      return;
   }
#endif

#ifndef DO_BUILD_NEON // for unreachable code warnings
   scalar_dequantize_short_block(coeffs, quantized_coeff, band_energy, num_bands, num_coeffs_for_band);
#endif // !DO_BUILD_NEON
}

// we have to store half the MDCT output to overlap in the next block. this is the largest
// per-stream memory cost of the decoder. we used to store floats, but now we convert them
// to S16. This is *pre* windowing, so the quality loss is minimal.
// // sse2 runs at about half speed of original "store as floats" version, but it's about a 2% slowdown // overall and we decided it was worth the speed loss in return for halving memory usage static float save_overlapping_samples(radaudio_decoder_state *ds, S16 *buffer, const float *data, int num) { // the profile wrapper is external to this, under the name "copy" rrAssert(num % 64 == 0); num /= 2; #if defined(DO_BUILD_SSE4) || defined(DO_BUILD_AVX2) if (ds->cpu.has_sse2) { #ifdef DO_BUILD_AVX2 if (ds->cpu.has_avx2) { return radaudio_avx2_save_samples(buffer, data, num); } #endif return radaudio_sse2_save_samples(buffer, data, num); } #endif #ifdef DO_BUILD_NEON return radaudio_neon_save_samples(buffer, data, num); #endif #ifndef DO_BUILD_NEON // for unreachable code warnings #define FAST_FLOAT_TO_INT // best solution i've found for x64 #if 0 // naive implementation for reference, but floor() is unacceptably slow // doubles total decode time on test platform; round() and rint() were worse // also, doesn't round to nearest even like SSE path // most files in fnaudio get different results, is it buggy? there's no way this can just be from tie-breaking 0.5?!? 
float largest0 = 1.0f; float largest1 = 1.0f; float scale = 32767.0f; for (int i=0; i < num; i += 2) { F32 d0 = data[i+0]; F32 d1 = data[i+1]; buffer[i+0] = (S16) floorf(d0 * scale + 0.5f); buffer[i+1] = (S16) floorf(d1 * scale + 0.5f); F32 a0 = fabsf(d0); F32 a1 = fabsf(d1); largest0 = RR_MAX(largest0, a0); largest1 = RR_MAX(largest1, a1); } float largest = RR_MAX(largest0,largest1); if (largest > 1.0f) { scale = 32767.0f / largest; for (int i=0; i < num; i += 2) { buffer[i+0] = (S16) floorf(data[i+0] * scale + 0.5f); buffer[i+1] = (S16) floorf(data[i+1] * scale + 0.5f); } } return 1.0f / scale; #elif defined(FAST_FLOAT_TO_INT) // this should round correctly // bithack float-to-int typedef union { F32 f; S32 i; } float_conv; float_conv temp0,temp1,temp2,temp3; // add (1<<23) to convert to int, then divide by 2^SHIFT, then add 0.5/2^SHIFT to round #define MAGIC(SHIFT) (1.5f * (1 << (23-SHIFT)) + 0.5f/(1 << SHIFT)) #define ADDEND(SHIFT) (((150-SHIFT) << 23) + (1 << 22)) #define FAST_SCALED_FLOAT_TO_INT(temp,x,s) (temp.f = (x) + MAGIC(s), temp.i - ADDEND(s)) float largest0 = 1.0f; float largest1 = 1.0f; float largest2 = 1.0f; float largest3 = 1.0f; float scale = 32767.0f; for (int i=0; i < num; i += 4) { F32 d0 = data[i+0]; F32 d1 = data[i+1]; F32 d2 = data[i+2]; F32 d3 = data[i+3]; F32 a0 = fabsf(d0); F32 a1 = fabsf(d1); F32 a2 = fabsf(d2); F32 a3 = fabsf(d3); buffer[i+0] = (S16) FAST_SCALED_FLOAT_TO_INT(temp0, d0 * scale, 0); buffer[i+1] = (S16) FAST_SCALED_FLOAT_TO_INT(temp1, d1 * scale, 0); buffer[i+2] = (S16) FAST_SCALED_FLOAT_TO_INT(temp2, d2 * scale, 0); buffer[i+3] = (S16) FAST_SCALED_FLOAT_TO_INT(temp3, d3 * scale, 0); largest0 = RR_MAX(largest0, a0); largest1 = RR_MAX(largest1, a1); largest2 = RR_MAX(largest2, a2); largest3 = RR_MAX(largest3, a3); } float largest01 = RR_MAX(largest0,largest1); float largest23 = RR_MAX(largest2,largest3); float largest = RR_MAX(largest01,largest23); if (largest > 1.0f) { scale = 32767.0f / largest; for (int i=0; i < num; i 
+= 4) { buffer[i+0] = (S16) FAST_SCALED_FLOAT_TO_INT(temp0, data[i+0] * scale, 0); buffer[i+1] = (S16) FAST_SCALED_FLOAT_TO_INT(temp0, data[i+1] * scale, 0); buffer[i+2] = (S16) FAST_SCALED_FLOAT_TO_INT(temp0, data[i+2] * scale, 0); buffer[i+3] = (S16) FAST_SCALED_FLOAT_TO_INT(temp0, data[i+3] * scale, 0); } } return 1.0f / scale; #else // 30% slower than FAST_FLOAT_TO_INT on x64 // we want to use the equivalent of floor() so we can round. // if we use fixed-point, right-shifting two's complement values is floor. // though we might get compiler warnings about signed shifts // problem: this doesn't produce the exact same results as other methods // most files in fnaudio get different results #define TRUNC_SHIFT 15 float largest0 = 1.0f; float largest1 = 1.0f; float largest2 = 1.0f; float largest3 = 1.0f; float scale = 32767.0f; for (int i=0; i < num; i += 4) { F32 d0 = data[i+0]; F32 d1 = data[i+1]; F32 d2 = data[i+2]; F32 d3 = data[i+3]; F32 a0 = fabsf(d0); F32 a1 = fabsf(d1); F32 a2 = fabsf(d2); F32 a3 = fabsf(d3); S32 i0 = (S32) (d0 * scale * (1 << TRUNC_SHIFT)); S32 i1 = (S32) (d1 * scale * (1 << TRUNC_SHIFT)); S32 i2 = (S32) (d2 * scale * (1 << TRUNC_SHIFT)); S32 i3 = (S32) (d3 * scale * (1 << TRUNC_SHIFT)); buffer[i+0] = (S16) ((i0 + (1 << (TRUNC_SHIFT-1))) >> TRUNC_SHIFT); buffer[i+1] = (S16) ((i1 + (1 << (TRUNC_SHIFT-1))) >> TRUNC_SHIFT); buffer[i+2] = (S16) ((i2 + (1 << (TRUNC_SHIFT-1))) >> TRUNC_SHIFT); buffer[i+3] = (S16) ((i3 + (1 << (TRUNC_SHIFT-1))) >> TRUNC_SHIFT); largest0 = RR_MAX(largest0, a0); largest1 = RR_MAX(largest1, a1); largest2 = RR_MAX(largest2, a2); largest3 = RR_MAX(largest3, a3); } float largest01 = RR_MAX(largest0,largest1); float largest23 = RR_MAX(largest2,largest3); float largest = RR_MAX(largest01,largest23); if (largest > 1.0f) { scale = 32767.0f / largest; for (int i=0; i < num; i += 4) { S32 i0 = (S32) (data[i+0] * scale * (1 << TRUNC_SHIFT)); S32 i1 = (S32) (data[i+1] * scale * (1 << TRUNC_SHIFT)); S32 i2 = (S32) (data[i+2] * 
scale * (1 << TRUNC_SHIFT)); S32 i3 = (S32) (data[i+3] * scale * (1 << TRUNC_SHIFT)); buffer[i+0] = (S16) ((i0 + (1 << (TRUNC_SHIFT-1))) >> TRUNC_SHIFT); buffer[i+1] = (S16) ((i1 + (1 << (TRUNC_SHIFT-1))) >> TRUNC_SHIFT); buffer[i+2] = (S16) ((i2 + (1 << (TRUNC_SHIFT-1))) >> TRUNC_SHIFT); buffer[i+3] = (S16) ((i3 + (1 << (TRUNC_SHIFT-1))) >> TRUNC_SHIFT); } } return 1.0f / scale; #endif #endif // !DO_BUILD_NEON } ///////////////////////////////////////////////////////////////////////////////// static const size_t decoder_align = 32; // We fill all-0 subbands with random noise (that's scaled to the // appropriate subband energy). // // We need a mechanism where SSE2 can generate random data very fast, // but it doesn't hobble the scalar pass. so e.g. SSE2 can generate // 4 LCG steps in parallel, 16..24 bits of useful data in each one. // but doing this on scalar might be slow. for a single 16-item subband, // we need 4*16 = 64 bits of randomness to use our 4-bit random_table[] // (which we can lookup with pshub). // // scalar: old code did 2 32-bit LCGs per subband, but only if the subband was zero. // Optimized-for-SIMD code might prefer to be branchless and do an LCG on // every subband even if non-zero, and there are ~80 subbands. But it turns // out we take a branch anyway in the SSE code, so executive decision to stick // to a design where we only update LCGs if subband was zero. (Note that if // we didn't care about identical decoding across platforms, you could use // whatever random method was optimal for each platform). 
static void decode_channel_before_imdct(radaudio_decoder_state *ds, radaudio_block_data *bd, int channel, U32 rand_seed, U16 fine_energy[]) { radaudio_rate_info *info; RAD_ALIGN(F32, band_energy[24+16], 16) = { 0 }; // must be a multiple of 4 RAD_ALIGN(F32, subband_energy[MAX_SUBBANDS+16], 16) = { 0 }; int is_short_block = ds->current_block_short; info = ds->info[is_short_block]; PROF_BEGIN(compute_band_energy); // compute band energy compute_band_energy_multiple4(ds, band_energy, info->num_bands, bd->band_exponent, fine_energy, info->band_scale_decode); PROF_END(compute_band_energy); // compute subband energy if (!is_short_block) { PROF_BEGIN(compute_subband_energy); // first bands are shorter than a full subband, so treat those specially int j; for (j=0; info->num_subbands_for_band[j] == 1; ++j) subband_energy[j] = band_energy[j]; compute_subband_energy_skip12_excess_read7(ds, subband_energy, band_energy, info->num_bands, info->num_subbands, info->num_subbands_for_band, bd->quantized_subbands); for (j=0; j < info->num_subbands; ++j) rrAssert(!isnan(subband_energy[j])); PROF_END(compute_subband_energy); } // spread out adjacent blocks to be less similar U32 randval = (rand_seed + (rand_seed >> 5)) * 0x27d4eb2d; randval = lcg(randval); F32 *coeffs = bd->dequantized_coeff_decode; if (!is_short_block) { for (int j=0; j < info->num_subbands; ++j) rrAssert(info->num_coeffs_for_subband[j] == (j < 8 ? 
8 : 16)); PROF_BEGIN(unquantize); dequantize_long_block_with_random_8x8_Nx16(ds, coeffs, bd->quantized_coeff_decode, subband_energy, info->num_subbands, info->num_coeffs_for_band, randval); PROF_END(unquantize); } else { PROF_BEGIN(randomize); randomize_short_block(bd->quantized_coeff_decode, randval, info->num_bands, info->num_coeffs_for_band); PROF_END(randomize); PROF_BEGIN(unquantize); dequantize_short_block(ds, coeffs, bd->quantized_coeff_decode, band_energy, info->num_bands, info->num_coeffs_for_band); PROF_END(unquantize); } } #if 0 static void decode_channel_before_imdct_reference(radaudio_decoder_state *ds, radaudio_block_data *bd, int channel, U32 rand_seed, U16 fine_energy[]) { radaudio_rate_info *info; RAD_ALIGN(F32, band_energy[24+16], 16) = { 0 }; // must be a multiple of 4 RAD_ALIGN(F32, subband_energy[MAX_SUBBANDS+16], 16) = { 0 }; int is_short_block = ds->current_block_short; info = ds->info[is_short_block]; PROF_BEGIN(compute_band_energy); // compute band energy compute_band_energy_multiple4(ds, band_energy, info->num_bands, bd->band_exponent, fine_energy, info->band_scale_decode); PROF_END(compute_band_energy); // compute subband energy if (!is_short_block) { PROF_BEGIN(compute_subband_energy); // first bands are shorter than a full subband, so treat those specially int j; for (j=0; info->num_subbands_for_band[j] == 1; ++j) subband_energy[j] = band_energy[j]; compute_subband_energy_skip12_excess_read7(ds, subband_energy, band_energy, info->num_bands, info->num_subbands, info->num_subbands_for_band, bd->quantized_subbands); for (j=0; j < info->num_subbands; ++j) rrAssert(!isnan(subband_energy[j])); PROF_END(compute_subband_energy); } // spread out adjacent blocks to be less similar U32 randval = (rand_seed + (rand_seed >> 5)) * 0x27d4eb2d; randval = lcg(randval); PROF_BEGIN(randomize); if (!is_short_block) { // replace all-zero coefficient chunks with noise for (int j=0; j < info->num_subbands; ++j) rrAssert(info->num_coeffs_for_subband[j] == (j < 
8 ? 8 : 16)); randomize_long_block_8x8_Nx16(bd->quantized_coeff_decode, randval, info->num_subbands, info->num_coeffs_for_band); } else { // replace all-zero coefficient chunks with noise randomize_short_block(bd->quantized_coeff_decode, randval, info->num_bands, info->num_coeffs_for_band); } PROF_END(randomize); F32 *coeffs = bd->dequantized_coeff_decode; PROF_BEGIN(unquantize); // reference implementation doesn't make any assumptions about distribution of subbands { int start = 0; if (is_short_block) { for (int j=0; j < info->num_bands; ++j) subband_energy[j] = band_energy[j]; } else { for (int j=0; j < info->num_bands && info->num_subbands_for_band[j] == 1; ++j) subband_energy[j] = band_energy[j]; } for (int j=0; j < info->num_subbands; ++j) { int n = info->num_coeffs_for_subband[j]; F32 sum = 1.0e-30f; for (int i=0; i < n; ++i) { F32 x = (F32) bd->quantized_coeff_decode[start+i]; sum += x*x; } F32 scale = subband_energy[j] / sqrtf(sum); for (int i=0; i < n; ++i) { coeffs[start+i] = (F32) bd->quantized_coeff_decode[start+i] * scale; } start += n; } for (int i=info->num_quantized_coeffs; i < info->num_coeffs; ++i) coeffs[i] = 0; } PROF_END(unquantize); } #endif // dequantized_coeff_decode[] in bd is overwritten in the process static void decode_channel_imdct(radaudio_decoder_state *ds, F32 rawdata[MAX_COEFFS], // only max_coeffs because we expand the symmetries later radaudio_block_data *bd, int channel) { radaudio_rate_info *info; int is_short_block = ds->current_block_short; info = ds->info[is_short_block]; F32 *coeffs = bd->dequantized_coeff_decode; for (int j=0; j < info->num_coeffs; ++j) rrAssert(!isnan(coeffs[j])); PROF_BEGIN(imdct); radaudio_imdct_fft_only_middle(ds->cpu, rawdata, coeffs, info->num_coeffs); PROF_END(imdct); } static int decode_channel_after_imdct(radaudio_decoder_state *ds, F32 data1 [MAX_COEFFS], F32 data2 [MAX_COEFFS], radaudio_block_data *bd, int channel, F32 *output) { int result_length; radaudio_rate_info *info; int is_short_block = 
ds->current_block_short; info = ds->info[is_short_block]; PROF_BEGIN(window); if (ds->post_seek) // ignore the left side of the first block result_length = 0; else { // use window for whichever is smaller of current block or previous block // apply window to pending data if (ds->current_block_short == ds->last_block_short) { F32 *window = radaudio_windows[ds->current_block_short]; int len = ds->current_block_short ? RADAUDIO_SHORT_BLOCK_LEN : RADAUDIO_LONG_BLOCK_LEN; compute_windowed_sum_multiple64(ds, output, len, data1, ds->prev_block_right_samples[channel], len, 0, ds->restore_scale[channel], window, ds->block_number, channel, ds->fully_decoded); result_length = len; } else { F32 *window = radaudio_windows[RADAUDIO_SHORT]; const int n = RADAUDIO_SHORT_BLOCK_LEN; if (is_short_block) { // if previous block was long and this is short // // <-----LONG_BLOCK_LEN----> // +-----------+-----------+-----------+-----------+ // | prev | // +-----------+-----------+-----------+-----------+ // *************** <- output samples // -----------------------1111111111WWwww000000000 <- window weights // ----------wwWWW // +----+----+ // | cur | // +----+----+ // <----> // | // SHORT_BLOCK_LEN // const int len = RADAUDIO_LONG_BLOCK_LEN/2 - RADAUDIO_SHORT_BLOCK_LEN/2; copy_samples_multiple16_scaled(output, len, ds->prev_block_right_samples[channel], ds->restore_scale[channel]); // copy samples from previous where the new window is 0 and old window was 1 compute_windowed_sum_multiple64(ds, output+len, n, data1, ds->prev_block_right_samples[channel], RADAUDIO_LONG_BLOCK_LEN, len, ds->restore_scale[channel], window, ds->block_number, channel, ds->fully_decoded+len); // sum the part of the previous block that overlaps the left half of the new block result_length = RADAUDIO_LONG_BLOCK_LEN/2 + RADAUDIO_SHORT_BLOCK_LEN/2; // generated (LONG/2 - SHORT/2) + SHORT } else { // if previous block was short and this is long // // SHORT_BLOCK_LEN // | // <----> // +----+----+ // | prev | // 
+----+----+ // WWwww--------- // 000000000wwwWW111111111----------------------- <- window weights // ************** <- output samples // +-----------+-----------+-----------+-----------+ // | cur | // +-----------+-----------+-----------+-----------+ // <-----LONG_BLOCK_LEN----> // const int offset = RADAUDIO_LONG_BLOCK_LEN/2 - RADAUDIO_SHORT_BLOCK_LEN/2; compute_windowed_sum_multiple64(ds, output, n, data1, ds->prev_block_right_samples[channel], RADAUDIO_SHORT_BLOCK_LEN, 0, ds->restore_scale[channel], window, ds->block_number, channel, ds->fully_decoded); copy_samples_multiple16(output+n, RADAUDIO_LONG_BLOCK_LEN - (offset+n), data1 + n/2); result_length = RADAUDIO_SHORT_BLOCK_LEN/2 + RADAUDIO_LONG_BLOCK_LEN/2; } } } PROF_END(window); PROF_BEGIN(copy); ds->restore_scale[channel] = save_overlapping_samples(ds, ds->prev_block_right_samples[channel], data2, info->num_coeffs); PROF_END(copy); return result_length; } static U8 *find_next_coarse_run_excess16(radaudio_decoder_state *ds, U8 *cur, U8 *end) { #ifdef DO_BUILD_SSE4 if (ds->cpu.has_sse2) { return radaudio_sse2_find_next_coarse_run_excess16(cur, end); } #endif #ifdef __RAD64REGS__ RR_COMPILER_ASSERT(COARSE_RUNLEN_THRESHOLD < 128); RR_COMPILER_ASSERT(MAX_RUNLEN >= 128); const U64 splat8 = ~(U64)0 / 255; // 0x0101...01 const U64 msb_mask = 0x80 * splat8; const U64 low7_mask = ~msb_mask; const U64 bias0 = (128 - COARSE_RUNLEN_THRESHOLD) * splat8; const U64 bias1 = (256 - MAX_RUNLEN) * splat8; while (cur < end) { U64 bytes = RR_GET64_LE(cur); cur += 8; // check if there are any bytes >=COARSE_RUNLEN_THRESHOLD in those 8 bytes we just read. // idea: these are bytes that either // 1. have low 7 bits >=COARSE_RUNLEN_THRESHOLD // 2. have MSB set (thus >=128) // we can check the former by masking with 0x7f7f...7f and then adding (128 - COARSE_RUNLEN_THRESHOLD) // to every byte. if the MSB ends up set, they were above COARSE_RUNLEN_THRESHOLD. 
// // by the same logic, we have bytes >=MAX_RUNLEN (which is >=128) if and only if both // 1. their low 7 bits >= (MAX_RUNLEN - 128) // 2. their MSB is set U64 low7 = bytes & low7_mask; U64 above_coarse_runlen_thresh = (low7 + bias0) | bytes; // MSB in byte set if that byte >=COARSE_RUNLEN_THRESHOLD U64 above_max_runlen_thresh = (low7 + bias1) & bytes; // MSB in byte set if that byte >=MAX_RUNLEN U64 active = above_coarse_runlen_thresh & ~above_max_runlen_thresh & msb_mask; if (active) { // found at least one! locate the first occurrence using a trailing // zero count. return (cur - 8) + rrCtzBytes64(active); } } return cur; #else while (cur < end && (*cur < COARSE_RUNLEN_THRESHOLD || *cur >= MAX_RUNLEN)) ++cur; return cur; #endif } static int decode_block(radaudio_decoder_state *ds, F32 *output[2], void *mem, size_t memavail, size_t *memconsumed) { int len, skip=0; radaudio_block_data bd[2]; *memconsumed = 0; U8 *memory = mem; //size_t memory_valid = memavail; int c; huff3_decoder dec; // Throw in one empty section first so we know what the overhead of one of these is PROF_BEGIN(overhead); PROF_END(overhead); PROF_BEGIN(header); radaudio_block_header_unpacked header; int offset = radaudio_decode_block_header(memory, &ds->biases, &header, memavail); // check if we're at the stream header, if so skip it; this happens at start, // but also if they seek without telling us if (offset == COMMON_STREAM_HEADER) { // we might be at the start of the stream // enough bytes to check for the stream signature? if (memavail < 8) return e(RADAUDIO_INCOMPLETE_DATA); // check the stream signature if (!radaudio_check_stream_header(memory, memavail)) return e(RADAUDIO_INVALID_DATA); // enough bytes for a full header? 
if (memavail < sizeof(radaudio_stream_header)) return e(RADAUDIO_INCOMPLETE_DATA); // decode the header, the only way we have to parse it radaudio_stream_header_unpacked fh; size_t header_size = radaudio_unpack_stream_header(memory, memavail, &fh); // was it a valid header? if (header_size == 0) return e(RADAUDIO_INVALID_DATA); // make sure the subtract below can't be negative if (header_size > memavail) return e(RADAUDIO_INTERNAL_ERROR); // behave as if we just did a seek operation ds->post_seek = true; // we know the block number ds->block_number = 0; // shrink the input buffer memory += header_size; memavail -= header_size; skip = (int) header_size; // now decode the real block header and go back to the main block decode path with the real block header offset = radaudio_decode_block_header(memory, &ds->biases, &header, memavail); // if that's ALSO a stream header, it's a corrupt file if (offset == COMMON_STREAM_HEADER) return e(RADAUDIO_INVALID_DATA); } if (offset == COMMON_INCOMPLETE_DATA) return e(RADAUDIO_INCOMPLETE_DATA); else if (offset < 0) return e(RADAUDIO_INVALID_DATA); U32 block_length_in_bytes = header.block_bytes + offset; /////////////////////////////////////////////////////// // validate data // // block length isn't longer than spec max if (block_length_in_bytes > MAX_ENCODED_BLOCK_BYTES) return RADAUDIO_INVALID_DATA; // length of first stream doesn't go off end of block U32 mid_side_band_length = header.mid_side_bands ? (24/MACRO_BAND_SIZE+7)/8 : 0; if (offset + mid_side_band_length + header.vbstream0_length > block_length_in_bytes) return e(RADAUDIO_INVALID_DATA); // final if (header.final_block) if (header.final_samples_discard > RADAUDIO_SHORT_BLOCK_LEN) return e(RADAUDIO_INVALID_DATA); // can't have more RLE entries than coefficients if (header.num_runlength_array > (U32) 2*(header.this_block_short ? 
RADAUDIO_SHORT_BLOCK_LEN+1 : RADAUDIO_LONG_BLOCK_LEN+1)) return e(RADAUDIO_INVALID_DATA); if (block_length_in_bytes > memavail) return e(RADAUDIO_INCOMPLETE_DATA); if (header.final_block) ds->at_eof = true; rrbool is_short_block = header.this_block_short; ds->current_block_short = (U8) is_short_block; ds->next_block_short = header.next_block_short; radaudio_rate_info *bi = ds->info[is_short_block]; int num_channels = header.num_channels_encoded; int nz_mode = header.nonzero_bitarray_mode; U8 *mid_side_bands = memory + offset; U8 *post_header = mid_side_bands + mid_side_band_length; U8 *vbstream2 = post_header + header.vbstream0_length; U8 *packet_end = memory + block_length_in_bytes; int error=0; // we initialize the 'end' pointers for each stream to the end of valid data // in that packet, not the end of that stream. So without further tests, they // could read the same raw bytes as part of multiple streams; but this is used // just to guarantee no memory overreads. decode_vbstream_init(&dec.stream[0], post_header, packet_end, &error); decode_vbstream_init(&dec.stream[1], packet_end , post_header, &error); decode_vbstream_init(&dec.stream[2], vbstream2 , packet_end, &error); U32 midside_bands=0; if (header.mid_side_encoded) midside_bands = 0xffffffff; else if (header.mid_side_bands) { RR_COMPILER_ASSERT(MACRO_BAND_SIZE == 3); U8 midside_band_triples = *mid_side_bands; // read 8 bits int k=0; for (int j=0; j < bi->num_bands; j += 3, ++k) { if (midside_band_triples & (1 << k)) midside_bands |= (7 << j); } } U8 band_exponents[32*2]; int cur_band_exponents=0; PROF_END(header); // // decode the band energy first, in case we want to use it to compute/predict other things (we don't anymore) // // band exponents PROF_BEGIN(huffman); if (header.predict_stereo_exponent && num_channels == 2) { decode_huff_array(&dec, &rada_band_exponent_correct_huff , band_exponents , bi->num_bands, &error); decode_huff_array(&dec, &rada_band_exponent_stereo_correct_huff, band_exponents+32, 
bi->num_bands, &error); } else { decode_huff_array(&dec, &rada_band_exponent_correct_huff, band_exponents, bi->num_bands * num_channels, &error); } PROF_END(huffman); if (error) { return e(RADAUDIO_INVALID_DATA); } PROF_BEGIN(unpack); for (c=0; c < (header.predict_stereo_exponent ? 1 : num_channels); ++c) { int lastv = PREDICT_FIRST_BAND_EXP; for (int j=0; j < bi->num_bands; ++j) { int v = (S8) band_exponents[cur_band_exponents++]; v += lastv; lastv = v; bd[c].band_exponent[j] = v; } } // decode stereo predicted exponents if (header.predict_stereo_exponent && num_channels == 2) { for (int j=0; j < bi->num_bands; ++j) bd[1].band_exponent[j] = bd[0].band_exponent[j] + (S8) band_exponents[32+j]; } PROF_END(unpack); RAD_ALIGN(U16, m_mantissa[MAX_BANDS*2+16], 16); PROF_BEGIN(compute_mantissa_len); for (c=0; c < num_channels; ++c) { // THIS LOGIC MUST BE EXACTLY REPLICATED IN THE COMPRESSOR!!! compute_mantissa_bitcount( ds->samprate_mode, is_short_block, ds->mantissa_param, bd[c].band_exponent, bd[c].band_mantissa_bitcount); } PROF_END(compute_mantissa_len); if (error) return e(RADAUDIO_INVALID_DATA); PROF_BEGIN(varbits); { int slot=0; for (c = 0; c < num_channels; ++c) { for (int j=0; j < bi->num_bands; ++j) { U8 size = bd[c].band_mantissa_bitcount[j]; U16 mantissa = (U16) decode_vbstream_bits(&dec.stream[2], size, &error); m_mantissa[slot] = mantissa << (MAX_FINE_ENERGY_BITS - size); ++slot; } } } PROF_END(varbits); if (error) return e(RADAUDIO_INVALID_DATA); // // now do all the remaining entropy decoding // #define runlen_value_sentinel_size 2 // room to write two END_OF_ZERORUN markers to preven overread if input doesn't have them #define runlen_read_sentinel_size 16 // room to write dummy values for SIMD to run on multiple-of-16-bytes #define nonzero_coefficients_padding 32 // room to write dummy values for SIMD overwrite/overread, both of which are at most 16 #define coeff_pair_padding 16 // room to write dummy data when unpacking #define runlen_pad 
(runlen_value_sentinel_size + runlen_read_sentinel_size) #define max_runlength_data 1025 // 1024 empty runs per channel, plus two end-of-run markers RAD_ALIGN(U8, subband_value [2* MAX_SUBBANDS ], 16); RAD_ALIGN(U8, subband_correction [2* MAX_BANDS ], 16); RAD_ALIGN(U8, subband_stereo_correct[ MAX_SUBBANDS ], 16); RAD_ALIGN(S8, nonzero_coefficients [2* 1024+nonzero_coefficients_padding ], 16); RAD_ALIGN(U8, runlength_data [2* max_runlength_data + runlen_pad ], 16); RAD_ALIGN(U8, nonzero_flagbits [2* (1024/8) + 16 ], 16); int num_subband_values0=0; int num_subband_corrections=0, num_subband_stereo_correct=0; int num_runlength_data=header.num_runlength_array; int num_coeff_pairs; for (int j=0; j < bi->num_bands; ++j) { int numsub = bi->num_subbands_for_band[j]; if (numsub > 1) { for (c=0; c < num_channels; ++c) { if (bd[c].band_exponent[j] == BAND_EXPONENT_NONE && SUBBANDS_SKIP_EMPTY_BANDS) continue; int n = numsub; if (c == 1 && header.predict_stereo_subband) { num_subband_stereo_correct += n; } else { if (!header.disable_final_subband_predict) { --n; ++num_subband_corrections; } num_subband_values0 += n; } } } } // subband values PROF_BEGIN(huffman); if (!is_short_block) { decode_huff_array(&dec, &rada_subband_value_huff , subband_value , num_subband_values0 , &error); if (!header.disable_final_subband_predict) decode_huff_array(&dec, &rada_subband_value_last_in_band_correct_huff, subband_correction, num_subband_corrections, &error); if (header.predict_stereo_subband) decode_huff_array(&dec, &rada_subband_value_stereo_correct_huff, subband_stereo_correct, num_subband_stereo_correct, &error); } PROF_END(huffman); if (error) return e(RADAUDIO_INVALID_DATA); // coefficient zero-runlength data if (num_runlength_data > 1025*2) return e(RADAUDIO_INVALID_DATA); if (!is_short_block && nz_mode != 3) { U8 huffbits[2048]; radaudio_nonzero_blockmode_descriptor *bdesc = &ds->nz_desc[nz_mode]; PROF_BEGIN(huffman); { int p=0; for (int i=0; i < NUM_NZ_HUFF; ++i) { int q = 
bdesc->num_chunks_per_huff[i]; if (q) { decode_huff_array(&dec, rada_nonzero_bitflags_huff[i], huffbits+p, q*8*num_channels, &error); p += q*8*num_channels; } } } PROF_END(huffman); PROF_BEGIN(huffman); int j=0, s=num_channels-1; for (c=0; c < num_channels; ++c, ++s) { for (int i=0; i < bdesc->num_8byte_chunks; ++i, ++j) { U8 p = bdesc->source_pos[s][i]; U64 xor = (U64)0 - bdesc->invert_chunk[i]; // if invert_chunk=1, this gives ~0 (invert), else 0. RR_PUT64_NATIVE(&nonzero_flagbits[j*8], xor ^ RR_GET64_NATIVE(huffbits+p*8)); } } PROF_END(huffman); } if (error) return e(RADAUDIO_INVALID_DATA); PROF_BEGIN(huffman); decode_huff_array(&dec, &rada_zero_runlength_huff, runlength_data, num_runlength_data, &error); // add sentinel so we don't need to length-check loop runlength_data[num_runlength_data+0] = END_OF_ZERORUN; // add extra sentinel in case the data is invalid and doesn't have the stereo separator, so we don't need to length-check loop runlength_data[num_runlength_data+1] = END_OF_ZERORUN; PROF_END(huffman); if (error) return e(RADAUDIO_INVALID_DATA); PROF_BEGIN(count_coefficients_huff); // values of MAX_RUNLEN don't indicate coefficients, because they have a following real runlength int num_nonzero_coefficients = count_bytes_below_value_sentinel16(ds, runlength_data, num_runlength_data, MAX_RUNLEN); if (!is_short_block) { int num_flagbit_bytes = ds->nz_desc[nz_mode].num_8byte_chunks * 8; if (num_flagbit_bytes != 0) num_nonzero_coefficients += count_set_bits_multiple8_sentinel8(ds, nonzero_flagbits, num_flagbit_bytes*num_channels); } PROF_END(count_coefficients_huff); // runlength data + flagbits combined could be too many coefficients if (num_nonzero_coefficients > num_channels*1024) return e(RADAUDIO_INVALID_DATA); int nz_selector = is_short_block ? 
4 : nz_mode; // coefficients -- need to have decoded the runlength data to know how many coefficients { // transient temp mem RAD_ALIGN(U8, coefficient_pairs[2* 1024/2 * 2 + coeff_pair_padding], 16); PROF_BEGIN(huffman); num_coeff_pairs = (num_nonzero_coefficients+1)/2; int tp = ds->nz_correlated_huffman_selectors[HS_COEFF_PAIR][nz_selector]; decode_huff_array(&dec, rada_nonzero_coefficient_pair_huff[tp], coefficient_pairs, num_coeff_pairs, &error); PROF_END(huffman); // convert coefficient pairs to coefficients PROF_BEGIN(unpack); unpack_nibbles_input_excess16_output_excess16_multiple32_default1(ds, nonzero_coefficients, coefficient_pairs, num_coeff_pairs); PROF_END(unpack); } // read and apply bottom bits of run length data // we have 2*1024 coeffs, COARSE_RUNLEN_THRESHOLD=60 and such runs are followed by a // nonzero coefficient, so per 1024 coeffs we can have at most floor(1024/61)=16 of these // (32 total between the total channels). in practice, the typical counts are 0-4. PROF_BEGIN(update_runlength); { U8 *cur = runlength_data; U8 *end = runlength_data + num_runlength_data; // we have runlen_read_sentinel_size of padding, so can be sloppy while (cur < end) { cur = find_next_coarse_run_excess16(ds, cur, end); if (cur >= end) break; rrAssert(*cur >= COARSE_RUNLEN_THRESHOLD && *cur < MAX_RUNLEN); // process this run and advance U8 extra = (U8) decode_vbstream_bits(&dec.stream[2], 2, &error); *cur += extra; ++cur; } } PROF_END(update_runlength); // big coefficients are coded as value 0 in the coefficient pairs { // transient temp mem, only used right here RAD_ALIGN(S8, big_coefficients[2* 1024 + 16], 16); // count zero bytes int num_big_coefficients = count_bytes_below_value_sentinel16(ds, (U8*) nonzero_coefficients, num_nonzero_coefficients, 1); PROF_BEGIN(huffman); int tb = ds->nz_correlated_huffman_selectors[HS_COEFF_BIG][nz_selector]; decode_huff_array(&dec, rada_nonzero_coefficient_big_huff[tb], (U8*) big_coefficients, num_big_coefficients, &error); 
PROF_END(huffman); PROF_BEGIN(unbias); // big coefficients are byte-sized, so stored aligned in stream[2] //decode_stream_align_to_byte(&dec.stream[2]); //U8 *bytestream = &dec.stream[2].bitstream[ dec.stream[2].read_pos_in_bits>>3 ]; // bytestream ends at current position of reverse-read stream 1 //decode_stream_align_to_byte(&dec.stream[1]); //U8 *bytestream_end = &dec.stream[1].bitstream[-(int)(dec.stream[1].read_pos_in_bits>>3)]; // expand used to decode directly from the stream and hence needed a safety range if (!expand_nonzero_coefficients(ds, nonzero_coefficients, num_nonzero_coefficients, big_coefficients, (big_coefficients+num_nonzero_coefficients), (big_coefficients+sizeof(big_coefficients)))) return e(RADAUDIO_INVALID_DATA); PROF_END(unbias); } if (error) return e(RADAUDIO_INVALID_DATA); int cur_subband_values0=0; int cur_subband_corrections=0; int cur_subband_stereo_correct=0; int cur_nonzero_coefficients=0; int cur_runlength_data=0; PROF_BEGIN(compute_subbands); for (c=0; c < num_channels; ++c) memset(bd[c].quantized_subbands, 0, bi->num_subbands * 2); if (!is_short_block) { // subbands for (c=0; c < num_channels; ++c) { for (int j=0; j < bi->num_bands; ++j) { if (bi->num_subbands_for_band[j] == 1) continue; int start = bi->first_subband_for_band[j]; int num_coded_subbands = bi->num_subbands_for_band[j]; if (bd[c].band_exponent[j] == BAND_EXPONENT_NONE && SUBBANDS_SKIP_EMPTY_BANDS) { for (int i=0; i < num_coded_subbands; ++i) bd[c].quantized_subbands[start+i] = (U16) (ds->subband_predicted_sum[j] / num_coded_subbands); // this value is predicted from in stero } else if (header.predict_stereo_subband && c == 1) { for (int i=0; i < num_coded_subbands; ++i) { int predict = bd[0].quantized_subbands[start+i]; int correct = (S8) subband_stereo_correct[cur_subband_stereo_correct++]; bd[c].quantized_subbands[start+i] = (U16) (predict + correct); } } else { int predicted_sum = ds->subband_predicted_sum[j]; int bias = ds->subband_bias[j]; int partial_sum = 0; 
if (!header.disable_final_subband_predict) --num_coded_subbands; for (int i=0; i < num_coded_subbands; ++i) { int v = subband_value[cur_subband_values0++]; v -= bias; // remove bias v = (v & 63); bd[c].quantized_subbands[start+i] = (U16) v; partial_sum += v; } if (!header.disable_final_subband_predict) { int actual_sum = predicted_sum + (S8) subband_correction[cur_subband_corrections++]; int v = actual_sum - partial_sum; if (v < 0) // @TODO investigate this case closely, why can't it be negative, should there be an upper bound? return e(RADAUDIO_INVALID_DATA); bd[c].quantized_subbands[start+num_coded_subbands] = (U16) v; } } } } } PROF_END(compute_subbands); if (error) return e(RADAUDIO_INVALID_DATA); for (c=0; c < num_channels; ++c) { int num_nonzero_bitarray_bytes = ds->nz_desc[nz_mode].num_8byte_chunks * 8; rrbool result = distribute_nonzero_coefficients(ds, bd[c].quantized_coeff_decode, bi->num_quantized_coeffs, runlength_data, &cur_runlength_data, nonzero_coefficients, &cur_nonzero_coefficients, nonzero_flagbits + c*num_nonzero_bitarray_bytes, is_short_block ? 
0 : num_nonzero_bitarray_bytes*8, c); if (!result) return e(RADAUDIO_INVALID_DATA); } // we expect to read the first sentinel; if we read the second, it's a bug if (cur_runlength_data > num_runlength_data+1) return e(RADAUDIO_INVALID_DATA); FFT_ALIGN(F32, rawdata[MAX_COEFFS]); F32 *data1 = rawdata, *data2 = rawdata + (bi->num_coeffs >> 1); if (ds->num_channels==1) { // mono stream (void) decode_channel_before_imdct(ds, &bd[0], 0, ds->block_number, m_mantissa); (void) decode_channel_imdct (ds, rawdata , &bd[0], 0); len = decode_channel_after_imdct (ds, data1, data2, &bd[0], 0, output[0]); } else if (ds->num_channels==2 && num_channels==1) { // stereo stream with mono block (void) decode_channel_before_imdct(ds, &bd[0], 0, ds->block_number, m_mantissa); (void) decode_channel_imdct (ds, rawdata , &bd[0], 0); (void) decode_channel_after_imdct (ds, data1, data2, &bd[0], 0, output[0]); len = decode_channel_after_imdct (ds, data1, data2, &bd[0], 1, output[1]); } else { // stereo stream with stereo block (void) decode_channel_before_imdct(ds, &bd[0], 0, ds->block_number , m_mantissa); (void) decode_channel_before_imdct(ds, &bd[1], 1, ds->block_number^0x55555555, m_mantissa+ds->info[is_short_block]->num_bands); // midside decode for (int j=0; j < bi->num_bands; ++j) { if (midside_bands & (1 << j)) { F32 *coeffs1 = bd[0].dequantized_coeff_decode; F32 *coeffs2 = bd[1].dequantized_coeff_decode; int start = bi->first_coeff_for_band[j]; int end = start + bi->num_coeffs_for_band[j]; for (int i=start; i < end; ++i) { float x = coeffs1[i]; float y = coeffs2[i]*0.5f; coeffs1[i] = x+y; coeffs2[i] = x-y; } } } (void) decode_channel_imdct (ds, rawdata , &bd[0], 0); (void) decode_channel_after_imdct (ds, data1, data2, &bd[0], 0, output[0]); (void) decode_channel_imdct (ds, rawdata , &bd[1], 1); len = decode_channel_after_imdct (ds, data1, data2, &bd[1], 1, output[1]); } if (error) return e(RADAUDIO_INVALID_DATA); *memconsumed = block_length_in_bytes + skip; ++ds->block_number; 
ds->last_block_short = ds->current_block_short; if (header.final_block) { if (header.final_samples_discard > (U32) len) return e(RADAUDIO_INVALID_DATA); else { int total = len - header.final_samples_discard; return total; } } else return len; } size_t RadAudioDecoderMemoryRequired(U8 *raw_header, size_t raw_header_size) { size_t size = sizeof(radaudio_decoder_state); size += decoder_align-1; // allow room to align int num_channels; if (raw_header == NULL) num_channels = 2; else { radaudio_stream_header_unpacked header; if (radaudio_unpack_stream_header(raw_header, raw_header_size, &header) == 0) return 0; num_channels = header.num_channels; } // room for buffered samples from previous block size += RADAUDIO_LONG_BLOCK_LEN/2 * sizeof(S16) * num_channels; return size; } static radaudio_decoder_state * radaudio_decompressor_memalloc(radaudio_stream_header_unpacked *header, void *vmem, size_t memsize) { int i; union { UINTa addr; U8 * ptr; } convert; if (memsize < sizeof(radaudio_decoder_state)) return 0; U8 *mem = vmem; radaudio_decoder_state *ds; ds = (void*) mem; mem += sizeof(*ds); memset(ds, 0, sizeof(*ds)); // align data after struct convert.ptr = mem; convert.addr = (convert.addr + decoder_align-1) & ~(decoder_align-1); mem = convert.ptr; for (i=0; i < header->num_channels; ++i) { ds->prev_block_right_samples[i] = (void *) mem; mem += RADAUDIO_LONG_BLOCK_LEN/2 * sizeof(S16); } size_t memneeded = mem - (U8*)vmem; if (memneeded > memsize) return 0; ds->last_block_short = 1; // shouldn't matter return ds; } RadAudioDecoder *RadAudioDecoderOpen(U8 *raw_header, size_t raw_header_size, void *vmem, size_t memsize, size_t *header_read) { radaudio_decoder_state *ds; radaudio_stream_header_unpacked header; if (raw_header_size < sizeof(radaudio_stream_header)) return NULL; size_t header_size = radaudio_unpack_stream_header(raw_header, raw_header_size, &header); if (header_size == 0) return NULL; // unpack_stream_header does some sanity checking, here's the rest: for (int 
i=0; i < NUM_NZ_MODE; ++i) if (header.nzmode_num64[i] > MAX_NZ_BLOCKS) return 0; ds = radaudio_decompressor_memalloc(&header, vmem, memsize); if (ds == NULL) return 0; ds->version = header.version; ds->num_channels = header.num_channels; ds->skip_bytes = 0;//(U8) header_size; ds->cpu = cpu_detect(); ds->post_seek = true; // very first block decoded discards input ds->sample_rate = header.sample_rate; ds->samprate_mode = header.sample_rate_mode; memcpy(ds->subband_bias, header.subband_bias, sizeof(ds->subband_bias)); ds->info[0] = &radaudio_rateinfo[0][ds->samprate_mode]; ds->info[1] = &radaudio_rateinfo[1][ds->samprate_mode]; memcpy(ds->subband_predicted_sum, header.subband_predicted_sum, 24); memcpy(ds->mantissa_param , header.mantissa_param, sizeof(header.mantissa_param)); compute_bias_set(&ds->biases, header.bytes_bias); for (int i=0; i < NUM_NZ_MODE; ++i) { ds->nz_desc[i].num_8byte_chunks = header.nzmode_num64[i]; if (ds->nz_desc[i].num_8byte_chunks > MAX_NZ_BLOCKS) return 0; for (int j=0; j < MAX_NZ_BLOCKS; ++j) { ds->nz_desc[i].huffman_table_for_chunk[j] = (header.nzmode_huff[i][j] & ~NZ_MODE_INVERT); ds->nz_desc[i].invert_chunk[j] = (header.nzmode_huff[i][j] & NZ_MODE_INVERT) != 0; if (ds->nz_desc[i].huffman_table_for_chunk[j] >= NUM_NZ_HUFF) return 0; } } for (int j=0; j < NUM_NZ_SELECTOR; ++j) for (int i=0; i < NUM_SELECTOR_MODES; ++i) ds->nz_correlated_huffman_selectors[j][i] = header.nzmode_selectors[j][i]; radaudio_init_nz_desc(ds->nz_desc); if (header_read) *header_read = header_size; return ds; } #ifdef RADAUDIO_DEVELOPMENT void RadAudioDecoderForceIntelCPU(RadAudioDecoder *hradaud, rrbool has_sse2, rrbool has_ssse3, rrbool has_sse4_1, rrbool has_popcnt, rrbool has_avx2) { radaudio_decoder_state *ds = (radaudio_decoder_state *) hradaud; RR_UNUSED_VARIABLE(ds); #ifdef __RADX86__ ds->cpu.has_sse2 = (U8) has_sse2; ds->cpu.has_ssse3 = (U8) has_ssse3; ds->cpu.has_sse4_1 = (U8) has_sse4_1; ds->cpu.has_popcnt = (U8) has_popcnt; ds->cpu.has_avx2 = (U8) 
has_avx2; #endif } #endif static void decode_version(RadAudioInfo *info, U32 version) { info->major_version = (U8 ) ((version & 0xff000000) >> 24); info->minor_version = (U8 ) ((version & 0x00ff0000) >> 16); info->sequential_version = (U16) ((version & 0x0000ffff) >> 0); } void RadAudioDecoderGetInfo(const RadAudioDecoder *hradaud, RadAudioInfo *out_info) { radaudio_decoder_state *ds = (radaudio_decoder_state *) hradaud; out_info->sample_rate = ds->sample_rate; out_info->num_channels = ds->num_channels; decode_version(out_info, ds->version); } size_t RadAudioDecoderGetInfoHeader(U8* raw_header, size_t raw_header_size, RadAudioInfo *out_info) { radaudio_stream_header_unpacked header; size_t header_size = radaudio_unpack_stream_header(raw_header, raw_header_size, &header); if (header_size == 0) return 0; out_info->sample_rate = header.sample_rate; out_info->num_channels = header.num_channels; decode_version(out_info, header.version); return header_size; } RADDEFFUNC void RadAudioDecoderDidSeek(RadAudioDecoder *radaudio_decomp) { radaudio_decoder_state *ds = (radaudio_decoder_state *) radaudio_decomp; ds->post_seek = true; ds->at_eof = false; } int RadAudioDecoderGetChunkLength(RadAudioDecoder *radaudio_decomp, const U8 *data, size_t data_avail) { radaudio_decoder_state *ds = (radaudio_decoder_state *) radaudio_decomp; if (ds->at_eof) return RADAUDIO_AT_EOF; if (data_avail < 4) return RADAUDIO_INCOMPLETE_DATA; radaudio_block_header_unpacked header; int offset = radaudio_decode_block_header(data, &ds->biases, &header, data_avail); if (offset == COMMON_STREAM_HEADER) return RADAUDIO_START_OF_STREAM; if (offset == COMMON_INCOMPLETE_DATA) return RADAUDIO_INCOMPLETE_DATA; if (offset == COMMON_INVALID_DATA) return RADAUDIO_INVALID_DATA; U32 block_length_in_bytes = header.block_bytes + offset; // validate data if (block_length_in_bytes > MAX_ENCODED_BLOCK_BYTES) return RADAUDIO_INVALID_DATA; if (offset + header.vbstream0_length > block_length_in_bytes) return 
RADAUDIO_INVALID_DATA; if (header.final_block) if (header.final_samples_discard > RADAUDIO_SHORT_BLOCK_LEN) return RADAUDIO_INVALID_DATA; if (header.num_runlength_array > (U32) 2*(header.this_block_short ? RADAUDIO_SHORT_BLOCK_LEN+1 : RADAUDIO_LONG_BLOCK_LEN+1)) return RADAUDIO_INVALID_DATA; return header.block_bytes + offset; } // returns the number of samples output per channel, and update 'memconsumed' // with the amount of memory consumed. // // return values: // n number of samples decoded (for one channel, e.g. n=1024 means 1024 stereo pairs) // 0 can decode 0 samples legitimately, e.g. first block or after seeking // -1 at end-of-stream // -2 not enough input data to decode a frame, always consumes 0 // -3 error (e.g. corrupt stream) int RadAudioDecoderDecodeChunk( RadAudioDecoder *radaudio_decomp, const U8 *mem , size_t memavail , size_t *memconsumed , F32 *output_samples[2], size_t max_samples_per_channel ) { *memconsumed = 0; radaudio_decoder_state *ds = (radaudio_decoder_state *) radaudio_decomp; if (ds->at_eof) return RADAUDIO_AT_EOF; if (memavail < 7) return RADAUDIO_INCOMPLETE_DATA; size_t used; if (!ds) return -2; PROF_BEGIN(decoder_all); size_t skip = ds->skip_bytes; int len = decode_block(ds, output_samples, (U8*)mem+skip, memavail-skip, &used); ds->post_seek = false; if (len >= 0) { *memconsumed = used + skip; ds->skip_bytes = 0; ds->fully_decoded += len; } PROF_END(decoder_all); return len; } #ifdef RADAUDIO_DEVELOPMENT // internal use int RadAudioDecoderGetProfileData(RadAudioDecoder *hradaud, radaudio_profile_value *aprofile, int num_profile) { RR_UNUSED_VARIABLE(hradaud); int n = RR_MIN(num_profile, PROF_total_count - 1); static const char *names[] = { #define PROF(x) #x, PROFILE_ZONES() #undef PROF }; if (aprofile) { double overhead_time = 0.0; // we have an empty profiling region to estimate overhead of tracking a region to begin with if (profile_counts[PROF_overhead]) { overhead_time = rrTicksToSeconds(profile_times[PROF_overhead]) / 
profile_counts[PROF_overhead]; } for (int i=0; i < n; ++i) { aprofile[i].name = names[i]; // subtract out estimated overhead aprofile[i].time = rrTicksToSeconds(profile_times[i]) - overhead_time * profile_counts[i]; } } else { profile = num_profile; } for (int i=0; i < PROF_total_count; ++i) { profile_times[i] = 0; profile_counts[i] = 0; } return n; } #else int RadAudioDecoderGetProfileData(RadAudioDecoder *hradaud, radaudio_profile_value *profile, int num_profile) { RR_UNUSED_VARIABLE(hradaud); RR_UNUSED_VARIABLE(profile); RR_UNUSED_VARIABLE(num_profile); return 0; } #endif