Eric Wong
2005-Feb-02 23:20 UTC
[Flac-dev] two small-ish optimizations (death by a thousand cuts)
This lpc_restore_signal was partially inspired by Miroslav's affd, though my (not very great) ARM asm version resembled this, as well. The other two reduce CPU array indexing overhead in loops a little. Additionally, a request for help: My not very optimized lpc_restore_signal is at the URL below; I couldn't get the ldm* instructions to work as advertised, even though I've talked to several ARM asm hackers who said they looked right. I can use the fp as a regular register since I'm compiling without it. Comments within should explain what I'm having trouble with: http://archzoom.sourcecontrol.net/archzoom.cgi/eric@petta-tech.com--2005a-normalperson/flac--ipod--1.1.0--patch-19/src/libFLAC/arm/lpc_asm.s -- Eric Wong --- orig/src/libFLAC/lpc.c +++ mod/src/libFLAC/lpc.c @@ -293,6 +293,209 @@ void FLAC__lpc_restore_signal(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]) { + register const FLAC__int32 *qlp0 = &qlp_coeff[(order-1)]; + register FLAC__int32 sum; + register const FLAC__int32 *history, *qlp; + + history = &data[(-order)]; + + switch (order) { + case 12: + for( ; data_len != 0; --data_len) { + sum = (qlp0[0] * history[0]) + + (qlp0[-1] * history[1]) + + (qlp0[-2] * history[2]) + + (qlp0[-3] * history[3]) + + (qlp0[-4] * history[4]) + + (qlp0[-5] * history[5]) + + (qlp0[-6] * history[6]) + + (qlp0[-7] * history[7]) + + (qlp0[-8] * history[8]) + + (qlp0[-9] * history[9]) + + (qlp0[-10] * history[10]) + + (qlp0[-11] * history[11]) + ; + ++history; + *(data++) = *(residual++) + (sum >> lp_quantization); + } + return; + case 11: + for( ; data_len != 0; --data_len) { + sum = (qlp0[0] * history[0]) + + (qlp0[-1] * history[1]) + + (qlp0[-2] * history[2]) + + (qlp0[-3] * history[3]) + + (qlp0[-4] * history[4]) + + (qlp0[-5] * history[5]) + + (qlp0[-6] * history[6]) + + (qlp0[-7] * history[7]) + + (qlp0[-8] * history[8]) + + (qlp0[-9] * history[9]) + + (qlp0[-10] * history[10]) 
+ ; + ++history; + *(data++) = *(residual++) + (sum >> lp_quantization); + } + return; + case 10: + for( ; data_len != 0; --data_len) { + sum = (qlp0[0] * history[0]) + + (qlp0[-1] * history[1]) + + (qlp0[-2] * history[2]) + + (qlp0[-3] * history[3]) + + (qlp0[-4] * history[4]) + + (qlp0[-5] * history[5]) + + (qlp0[-6] * history[6]) + + (qlp0[-7] * history[7]) + + (qlp0[-8] * history[8]) + + (qlp0[-9] * history[9]) + ; + ++history; + *(data++) = *(residual++) + (sum >> lp_quantization); + } + return; + case 9: + for( ; data_len != 0; --data_len) { + sum = (qlp0[0] * history[0]) + + (qlp0[-1] * history[1]) + + (qlp0[-2] * history[2]) + + (qlp0[-3] * history[3]) + + (qlp0[-4] * history[4]) + + (qlp0[-5] * history[5]) + + (qlp0[-6] * history[6]) + + (qlp0[-7] * history[7]) + + (qlp0[-8] * history[8]) + ; + ++history; + *(data++) = *(residual++) + (sum >> lp_quantization); + } + return; + case 8: + for( ; data_len != 0; --data_len) { + sum = (qlp0[0] * history[0]) + + (qlp0[-1] * history[1]) + + (qlp0[-2] * history[2]) + + (qlp0[-3] * history[3]) + + (qlp0[-4] * history[4]) + + (qlp0[-5] * history[5]) + + (qlp0[-6] * history[6]) + + (qlp0[-7] * history[7]) + ; + ++history; + *(data++) = *(residual++) + (sum >> lp_quantization); + } + return; + case 7: + for( ; data_len != 0; --data_len) { + sum = (qlp0[0] * history[0]) + + (qlp0[-1] * history[1]) + + (qlp0[-2] * history[2]) + + (qlp0[-3] * history[3]) + + (qlp0[-4] * history[4]) + + (qlp0[-5] * history[5]) + + (qlp0[-6] * history[6]) + ; + ++history; + *(data++) = *(residual++) + (sum >> lp_quantization); + } + return; + case 6: + for( ; data_len != 0; --data_len) { + sum = (qlp0[0] * history[0]) + + (qlp0[-1] * history[1]) + + (qlp0[-2] * history[2]) + + (qlp0[-3] * history[3]) + + (qlp0[-4] * history[4]) + + (qlp0[-5] * history[5]) + ; + ++history; + *(data++) = *(residual++) + (sum >> lp_quantization); + } + return; + case 5: + for( ; data_len != 0; --data_len) { + sum = (qlp0[0] * history[0]) + + (qlp0[-1] * 
history[1]) + + (qlp0[-2] * history[2]) + + (qlp0[-3] * history[3]) + + (qlp0[-4] * history[4]) + ; + ++history; + *(data++) = *(residual++) + (sum >> lp_quantization); + } + return; + case 4: + for( ; data_len != 0; --data_len) { + sum = (qlp0[0] * history[0]) + + (qlp0[-1] * history[1]) + + (qlp0[-2] * history[2]) + + (qlp0[-3] * history[3]) + ; + ++history; + *(data++) = *(residual++) + (sum >> lp_quantization); + } + return; + case 3: + for( ; data_len != 0; --data_len) { + sum = (qlp0[0] * history[0]) + + (qlp0[-1] * history[1]) + + (qlp0[-2] * history[2]) + ; + ++history; + *(data++) = *(residual++) + (sum >> lp_quantization); + } + return; + case 2: + for( ; data_len != 0; --data_len) { + sum = (qlp0[0] * history[0]) + + (qlp0[-1] * history[1]) + ; + ++history; + *(data++) = *(residual++) + (sum >> lp_quantization); + } + return; + case 1: + for( ; data_len != 0; --data_len) { + sum = (qlp0[0] * (*(history++))); + *(data++) = *(residual++) + (sum >> lp_quantization); + } + return; + default: + { + /* handle everything else: (order > 12) + * with Duff's Device to reduce jumps */ + const unsigned n0 = (order + 7)/8; + const int tmp = 0 - order - 1; + register const FLAC__int32 *qlpd = &qlp_coeff[order]; + for( ; data_len != 0; --data_len) { + register unsigned n = n0; + sum = 0; + qlp = qlpd; + history = &data[tmp]; + + switch(order%8) { + case 0: do { + sum += (*(--qlp)) * (*(++history)); + case 7: sum += (*(--qlp)) * (*(++history)); + case 6: sum += (*(--qlp)) * (*(++history)); + case 5: sum += (*(--qlp)) * (*(++history)); + case 4: sum += (*(--qlp)) * (*(++history)); + case 3: sum += (*(--qlp)) * (*(++history)); + case 2: sum += (*(--qlp)) * (*(++history)); + case 1: sum += (*(--qlp)) * (*(++history)); + } while (--n); + } + + *(data++) = *(residual++) + (sum >> lp_quantization); + } + return; + } + } +} + +#if 0 +void FLAC__lpc_restore_signal_orig(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int 
lp_quantization, FLAC__int32 data[]) +{ #ifdef FLAC__OVERFLOW_DETECT FLAC__int64 sumo; #endif @@ -339,6 +542,7 @@ } */ } +#endif /* 0 */ void FLAC__lpc_restore_signal_wide(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]) { --- orig/src/libFLAC/bitbuffer.c +++ mod/src/libFLAC/bitbuffer.c @@ -1466,6 +1469,7 @@ { unsigned i, bits_ = bits; FLAC__uint32 v = 0; + FLAC__blurb *bbb; FLAC__ASSERT(0 != bb); FLAC__ASSERT(0 != bb->buffer); @@ -1485,18 +1489,20 @@ #if FLAC__BITS_PER_BLURB > 8 if(bb->bits == 0 || bb->consumed_blurbs < bb->blurbs) { /*@@@ comment on why this is here*/ #endif + bbb = &bb->buffer[bb->consumed_blurbs]; if(bb->consumed_bits) { i = FLAC__BITS_PER_BLURB - bb->consumed_bits; if(i <= bits_) { - v = bb->buffer[bb->consumed_blurbs] & (FLAC__BLURB_ALL_ONES >> bb->consumed_bits); + v = (*bbb) & (FLAC__BLURB_ALL_ONES >> bb->consumed_bits); bits_ -= i; - CRC16_UPDATE_BLURB(bb, bb->buffer[bb->consumed_blurbs], bb->read_crc16); + CRC16_UPDATE_BLURB(bb, (*bbb), bb->read_crc16); + ++bbb; bb->consumed_blurbs++; bb->consumed_bits = 0; /* we hold off updating bb->total_consumed_bits until the end */ } else { - *val = (bb->buffer[bb->consumed_blurbs] & (FLAC__BLURB_ALL_ONES >> bb->consumed_bits)) >> (i-bits_); + *val = ((*bbb) & (FLAC__BLURB_ALL_ONES >> bb->consumed_bits)) >> (i-bits_); bb->consumed_bits += bits_; bb->total_consumed_bits += bits_; return true; @@ -1516,9 +1522,10 @@ #else while(bits_ >= FLAC__BITS_PER_BLURB) { v <<= FLAC__BITS_PER_BLURB; - v |= bb->buffer[bb->consumed_blurbs]; + v |= (*bbb); bits_ -= FLAC__BITS_PER_BLURB; - CRC16_UPDATE_BLURB(bb, bb->buffer[bb->consumed_blurbs], bb->read_crc16); + CRC16_UPDATE_BLURB(bb, (*bbb), bb->read_crc16); + ++bbb; bb->consumed_blurbs++; /* bb->consumed_bits is already 0 */ /* we hold off updating bb->total_consumed_bits until the end */ @@ -1526,7 +1533,7 @@ #endif if(bits_ > 0) { v <<= bits_; - v |= 
(bb->buffer[bb->consumed_blurbs] >> (FLAC__BITS_PER_BLURB-bits_)); + v |= ((*bbb) >> (FLAC__BITS_PER_BLURB-bits_)); bb->consumed_bits = bits_; /* we hold off updating bb->total_consumed_bits until the end */ } --- orig/src/libFLAC/stream_decoder.c +++ mod/src/libFLAC/stream_decoder.c @@ -74,6 +74,7 @@ ***********************************************************************/ static void set_defaults_(FLAC__StreamDecoder *decoder); +static inline void read_channel_coding(FLAC__StreamDecoder *decoder); static FLAC__bool allocate_output_(FLAC__StreamDecoder *decoder, unsigned size, unsigned channels); static FLAC__bool has_id_filtered_(FLAC__StreamDecoder *decoder, FLAC__byte *id); static FLAC__bool find_metadata_(FLAC__StreamDecoder *decoder); @@ -776,6 +768,54 @@ decoder->private_->metadata_filter_ids_count = 0; } +/* Undo any special channel coding */ +static inline void read_channel_coding(FLAC__StreamDecoder *decoder) +{ + register FLAC__int32 left, right; + register unsigned i; + register FLAC__int32 *lchan, *rchan; + switch(decoder->private_->frame.header.channel_assignment) { + case FLAC__CHANNEL_ASSIGNMENT_INDEPENDENT: + /* do nothing */ + break; + case FLAC__CHANNEL_ASSIGNMENT_LEFT_SIDE: + FLAC__ASSERT(decoder->private_->frame.header.channels == 2); + lchan = &(decoder->private_->output[0])[0]; + rchan = &(decoder->private_->output[1])[0]; + for(i = decoder->private_->frame.header.blocksize; i != 0; --i) { + *rchan = *(lchan++) - *rchan; + ++rchan; + } + break; + case FLAC__CHANNEL_ASSIGNMENT_RIGHT_SIDE: + FLAC__ASSERT(decoder->private_->frame.header.channels == 2); + lchan = &(decoder->private_->output[0])[0]; + rchan = &(decoder->private_->output[1])[0]; + for(i = decoder->private_->frame.header.blocksize; i != 0; --i) + *(lchan++) += *(rchan++); + break; + case FLAC__CHANNEL_ASSIGNMENT_MID_SIDE: + FLAC__ASSERT(decoder->private_->frame.header.channels == 2); + lchan = &(decoder->private_->output[0])[0]; + rchan = &(decoder->private_->output[1])[0]; + for(i = 
decoder->private_->frame.header.blocksize; i != 0; --i) { + register FLAC__int32 mid = *lchan; + register FLAC__int32 side = *rchan; + mid <<= 1; + if(side & 1) /* i.e. if 'side' is odd... */ + ++mid; + left = mid + side; + right = mid - side; + *(lchan++) = left >> 1; + *(rchan++) = right >> 1; + } + break; + default: + FLAC__ASSERT(0); + break; + } +} + FLAC__bool allocate_output_(FLAC__StreamDecoder *decoder, unsigned size, unsigned channels) { unsigned i; @@ -1380,8 +1418,6 @@ FLAC__bool read_frame_(FLAC__StreamDecoder *decoder, FLAC__bool *got_a_frame, FLAC__bool do_full_decode) { unsigned channel; - unsigned i; - FLAC__int32 mid, side, left, right; FLAC__uint16 frame_crc; /* the one we calculate from the input stream */ FLAC__uint32 x; @@ -1446,41 +1482,9 @@ if(!FLAC__bitbuffer_read_raw_uint32(decoder->private_->input, &x, FLAC__FRAME_FOOTER_CRC_LEN, read_callback_, decoder)) return false; /* the read_callback_ sets the state for us */ if(frame_crc == (FLAC__uint16)x) { - if(do_full_decode) { - /* Undo any special channel coding */ - switch(decoder->private_->frame.header.channel_assignment) { - case FLAC__CHANNEL_ASSIGNMENT_INDEPENDENT: - /* do nothing */ - break; - case FLAC__CHANNEL_ASSIGNMENT_LEFT_SIDE: - FLAC__ASSERT(decoder->private_->frame.header.channels == 2); - for(i = 0; i < decoder->private_->frame.header.blocksize; i++) - decoder->private_->output[1][i] = decoder->private_->output[0][i] - decoder->private_->output[1][i]; - break; - case FLAC__CHANNEL_ASSIGNMENT_RIGHT_SIDE: - FLAC__ASSERT(decoder->private_->frame.header.channels == 2); - for(i = 0; i < decoder->private_->frame.header.blocksize; i++) - decoder->private_->output[0][i] += decoder->private_->output[1][i]; - break; - case FLAC__CHANNEL_ASSIGNMENT_MID_SIDE: - FLAC__ASSERT(decoder->private_->frame.header.channels == 2); - for(i = 0; i < decoder->private_->frame.header.blocksize; i++) { - mid = decoder->private_->output[0][i]; - side = decoder->private_->output[1][i]; - mid <<= 1; - 
if(side & 1) /* i.e. if 'side' is odd... */ - mid++; - left = mid + side; - right = mid - side; - decoder->private_->output[0][i] = left >> 1; - decoder->private_->output[1][i] = right >> 1; - } - break; - default: - FLAC__ASSERT(0); - break; - } - } + if(do_full_decode) + read_channel_coding(decoder); + } else { /* Bad frame, emit error and zero the output signal */ /EOF -------------- next part -------------- A non-text attachment was scrubbed... Name: not available Type: application/pgp-signature Size: 189 bytes Desc: Digital signature Url : http://lists.xiph.org/pipermail/flac-dev/attachments/20050202/d001db2c/attachment.pgp