Hi,
attached are patches that improve decoding speed a bit. The first
patch improves the bit scan macro used for decoding unary values, the
second one adds a GCC inline assembly for bswap and the third patch
replaces the read_rice_block function.
In my testing it turned out to be even faster than the _ia32_bswap
function. If the code produced by MSVC is faster as well, I'd suggest
to remove the assembly function completely and avoid the problems with
text relocation (sf bug #1793536).
--
Miroslav Lichvar
-------------- next part --------------
Index: src/libFLAC/bitreader.c
==================================================================RCS file:
/cvsroot/flac/flac/src/libFLAC/bitreader.c,v
retrieving revision 1.15
diff -u -r1.15 bitreader.c
--- src/libFLAC/bitreader.c 28 Feb 2008 05:34:26 -0000 1.15
+++ src/libFLAC/bitreader.c 14 Mar 2008 11:07:07 -0000
@@ -69,13 +69,12 @@
#endif
/* counts the # of zero MSBs in a word */
#define COUNT_ZERO_MSBS(word) ( \
- (word) <= 0xffff ? \
- ( (word) <= 0xff? byte_to_unary_table[word] + 24 :
byte_to_unary_table[(word) >> 8] + 16 ) : \
- ( (word) <= 0xffffff? byte_to_unary_table[word >> 16] + 8 :
byte_to_unary_table[(word) >> 24] ) \
+ word > 0xffffff ? byte_to_unary_table[(word) >> 24] : \
+ !word ? 32 : \
+ word > 0xffff ? byte_to_unary_table[word >> 16] + 8 : \
+ word > 0xff ? byte_to_unary_table[(word) >> 8] + 16 : \
+ byte_to_unary_table[word] + 24 \
)
-/* this alternate might be slightly faster on some systems/compilers: */
-#define COUNT_ZERO_MSBS2(word) ( (word) <= 0xff ? byte_to_unary_table[word]
+ 24 : ((word) <= 0xffff ? byte_to_unary_table[(word) >> 8] + 16 :
((word) <= 0xffffff ? byte_to_unary_table[(word) >> 16] + 8 :
byte_to_unary_table[(word) >> 24])) )
-
/*
* This should be at least twice as large as the largest number of words
-------------- next part --------------
Index: src/libFLAC/bitreader.c
==================================================================RCS file:
/cvsroot/flac/flac/src/libFLAC/bitreader.c,v
retrieving revision 1.15
diff -u -r1.15 bitreader.c
--- src/libFLAC/bitreader.c 28 Feb 2008 05:34:26 -0000 1.15
+++ src/libFLAC/bitreader.c 14 Mar 2008 13:19:46 -0000
@@ -149,6 +148,7 @@
FLAC__CPUInfo cpu_info;
};
+#if FLAC__BYTES_PER_WORD == 4 && FLAC__CPU_IA32
#ifdef _MSC_VER
/* OPT: an MSVC built-in would be better */
static _inline FLAC__uint32 local_swap32_(FLAC__uint32 x)
@@ -173,6 +173,15 @@
done1:
}
}
+#elif __GNUC__
+static void local_swap32_block_(FLAC__uint32 *start, FLAC__uint32 len)
+{
+ FLAC__uint32 *end;
+
+ for(end = start + len; start < end; start++)
+ asm ("bswap %0" : "=r"(*start) : "0"(*start));
+}
+#endif
#endif
static FLaC__INLINE void crc16_update_word_(FLAC__BitReader *br, brword word)
@@ -263,7 +272,7 @@
#if WORDS_BIGENDIAN
#else
end = (br->words*FLAC__BYTES_PER_WORD + br->bytes + bytes +
(FLAC__BYTES_PER_WORD-1)) / FLAC__BYTES_PER_WORD;
-# if defined(_MSC_VER) && (FLAC__BYTES_PER_WORD == 4)
+# if FLAC__CPU_IA32 && (__GNUC__ || defined(_MSC_VER)) &&
FLAC__BYTES_PER_WORD == 4
if(br->cpu_info.type == FLAC__CPUINFO_TYPE_IA32 &&
br->cpu_info.data.ia32.bswap) {
start = br->words;
local_swap32_block_(br->buffer + start, end - start);
-------------- next part --------------
Index: src/libFLAC/bitreader.c
==================================================================RCS file:
/cvsroot/flac/flac/src/libFLAC/bitreader.c,v
retrieving revision 1.15
diff -u -r1.15 bitreader.c
--- src/libFLAC/bitreader.c 28 Feb 2008 05:34:26 -0000 1.15
+++ src/libFLAC/bitreader.c 14 Mar 2008 12:44:27 -0000
@@ -803,379 +802,141 @@
}
/* this is by far the most heavily used reader call. it ain't pretty but
it's fast */
-/* a lot of the logic is copied, then adapted, from
FLAC__bitreader_read_unary_unsigned() and FLAC__bitreader_read_raw_uint32() */
FLAC__bool FLAC__bitreader_read_rice_signed_block(FLAC__BitReader *br, int
vals[], unsigned nvals, unsigned parameter)
-/* OPT: possibly faster version for use with MSVC */
-#ifdef _MSC_VER
{
- unsigned i;
- unsigned uval = 0;
- unsigned bits; /* the # of binary LSBs left to read to finish a rice codeword
*/
-
/* try and get br->consumed_words and br->consumed_bits into register;
* must remember to flush them back to *br before calling other
- * bitwriter functions that use them, and before returning */
- register unsigned cwords;
- register unsigned cbits;
+ * bitreader functions that use them, and before returning */
+ unsigned cwords, words, lsbs, msbs, x, y;
+ unsigned ucbits; /* keep track of the number of unconsumed bits in word */
+ brword b;
+ int *val, *end;
FLAC__ASSERT(0 != br);
FLAC__ASSERT(0 != br->buffer);
/* WATCHOUT: code does not work with <32bit words; we can make things much
faster with this assertion */
FLAC__ASSERT(FLAC__BITS_PER_WORD >= 32);
FLAC__ASSERT(parameter < 32);
- /* the above two asserts also guarantee that the binary part never straddles
more that 2 words, so we don't have to loop to read it */
-
- if(nvals == 0)
- return true;
+ /* the above two asserts also guarantee that the binary part never straddles
more than 2 words, so we don't have to loop to read it */
- cbits = br->consumed_bits;
- cwords = br->consumed_words;
+ val = vals;
+ end = vals + nvals;
- while(1) {
-
- /* read unary part */
- while(1) {
- while(cwords < br->words) { /* if we've not consumed up to a
partial tail word... */
- brword b = br->buffer[cwords] << cbits;
- if(b) {
-#if 0 /* slower, probably due to bad register allocation... */ &&
defined FLAC__CPU_IA32 && !defined FLAC__NO_ASM &&
FLAC__BITS_PER_WORD == 32
- __asm {
- bsr eax, b
- not eax
- and eax, 31
- mov i, eax
- }
-#else
- i = COUNT_ZERO_MSBS(b);
-#endif
- uval += i;
- bits = parameter;
- i++;
- cbits += i;
- if(cbits == FLAC__BITS_PER_WORD) {
- crc16_update_word_(br, br->buffer[cwords]);
- cwords++;
- cbits = 0;
- }
- goto break1;
- }
- else {
- uval += FLAC__BITS_PER_WORD - cbits;
- crc16_update_word_(br, br->buffer[cwords]);
- cwords++;
- cbits = 0;
- /* didn't find stop bit yet, have to keep going... */
- }
- }
- /* at this point we've eaten up all the whole words; have to try
- * reading through any tail bytes before calling the read callback.
- * this is a repeat of the above logic adjusted for the fact we
- * don't have a whole word. note though if the client is feeding
- * us data a byte at a time (unlikely), br->consumed_bits may not
- * be zero.
- */
- if(br->bytes) {
- const unsigned end = br->bytes * 8;
- brword b = (br->buffer[cwords] & (FLAC__WORD_ALL_ONES <<
(FLAC__BITS_PER_WORD-end))) << cbits;
- if(b) {
- i = COUNT_ZERO_MSBS(b);
- uval += i;
- bits = parameter;
- i++;
- cbits += i;
- FLAC__ASSERT(cbits < FLAC__BITS_PER_WORD);
- goto break1;
- }
- else {
- uval += end - cbits;
- cbits += end;
- FLAC__ASSERT(cbits < FLAC__BITS_PER_WORD);
- /* didn't find stop bit yet, have to keep going... */
- }
- }
- /* flush registers and read; bitreader_read_from_client_() does
- * not touch br->consumed_bits at all but we still need to set
- * it in case it fails and we have to return false.
- */
- br->consumed_bits = cbits;
- br->consumed_words = cwords;
- if(!bitreader_read_from_client_(br))
+ if(parameter == 0) {
+ while(val < end) {
+ /* read the unary MSBs and end bit */
+ if(!FLAC__bitreader_read_unary_unsigned(br, &msbs))
return false;
- cwords = br->consumed_words;
- }
-break1:
- /* read binary part */
- FLAC__ASSERT(cwords <= br->words);
-
- if(bits) {
- while((br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits
< bits) {
- /* flush registers and read; bitreader_read_from_client_() does
- * not touch br->consumed_bits at all but we still need to set
- * it in case it fails and we have to return false.
- */
- br->consumed_bits = cbits;
- br->consumed_words = cwords;
- if(!bitreader_read_from_client_(br))
- return false;
- cwords = br->consumed_words;
- }
- if(cwords < br->words) { /* if we've not consumed up to a partial
tail word... */
- if(cbits) {
- /* this also works when consumed_bits==0, it's just a little slower
than necessary for that case */
- const unsigned n = FLAC__BITS_PER_WORD - cbits;
- const brword word = br->buffer[cwords];
- if(bits < n) {
- uval <<= bits;
- uval |= (word & (FLAC__WORD_ALL_ONES >> cbits)) >>
(n-bits);
- cbits += bits;
- goto break2;
- }
- uval <<= n;
- uval |= word & (FLAC__WORD_ALL_ONES >> cbits);
- bits -= n;
- crc16_update_word_(br, word);
- cwords++;
- cbits = 0;
- if(bits) { /* if there are still bits left to read, there have to be less
than 32 so they will all be in the next word */
- uval <<= bits;
- uval |= (br->buffer[cwords] >> (FLAC__BITS_PER_WORD-bits));
- cbits = bits;
- }
- goto break2;
- }
- else {
- FLAC__ASSERT(bits < FLAC__BITS_PER_WORD);
- uval <<= bits;
- uval |= br->buffer[cwords] >> (FLAC__BITS_PER_WORD-bits);
- cbits = bits;
- goto break2;
- }
- }
- else {
- /* in this case we're starting our read at a partial tail word;
- * the reader has guaranteed that we have at least 'bits' bits
- * available to read, which makes this case simpler.
- */
- uval <<= bits;
- if(cbits) {
- /* this also works when consumed_bits==0, it's just a little slower
than necessary for that case */
- FLAC__ASSERT(cbits + bits <= br->bytes*8);
- uval |= (br->buffer[cwords] & (FLAC__WORD_ALL_ONES >> cbits))
>> (FLAC__BITS_PER_WORD-cbits-bits);
- cbits += bits;
- goto break2;
- }
- else {
- uval |= br->buffer[cwords] >> (FLAC__BITS_PER_WORD-bits);
- cbits += bits;
- goto break2;
- }
- }
- }
-break2:
- /* compose the value */
- *vals = (int)(uval >> 1 ^ -(int)(uval & 1));
- /* are we done? */
- --nvals;
- if(nvals == 0) {
- br->consumed_bits = cbits;
- br->consumed_words = cwords;
- return true;
+ *val++ = (int)(msbs >> 1) ^ -(int)(msbs & 1);
}
- uval = 0;
- ++vals;
-
+ return true;
}
-}
-#else
-{
- unsigned i;
- unsigned uval = 0;
-
- /* try and get br->consumed_words and br->consumed_bits into register;
- * must remember to flush them back to *br before calling other
- * bitwriter functions that use them, and before returning */
- register unsigned cwords;
- register unsigned cbits;
- unsigned ucbits; /* keep track of the number of unconsumed bits in the buffer
*/
-
- FLAC__ASSERT(0 != br);
- FLAC__ASSERT(0 != br->buffer);
- /* WATCHOUT: code does not work with <32bit words; we can make things much
faster with this assertion */
- FLAC__ASSERT(FLAC__BITS_PER_WORD >= 32);
- FLAC__ASSERT(parameter < 32);
- /* the above two asserts also guarantee that the binary part never straddles
more than 2 words, so we don't have to loop to read it */
- if(nvals == 0)
- return true;
+ FLAC__ASSERT(parameter > 0);
- cbits = br->consumed_bits;
cwords = br->consumed_words;
- ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits;
+ words = br->words;
+ ucbits = FLAC__BITS_PER_WORD - br->consumed_bits;
+ b = br->buffer[cwords] << br->consumed_bits; /* keep unconsumed
bits aligned to left */
+
+ /* if we've not consumed up to a partial tail word... */
+ if(cwords >= words) {
+ x = 0;
+ goto process_tail;
+ }
+
+ while(val < end) {
+ /* read the unary MSBs and end bit */
+ x = y = COUNT_ZERO_MSBS(b);
+ if(x == FLAC__BITS_PER_WORD) {
+ x = ucbits;
+ do {
+ /* didn't find stop bit yet, have to keep going... */
+ crc16_update_word_(br, br->buffer[cwords++]);
+ if (cwords >= words)
+ goto incomplete_msbs;
+ b = br->buffer[cwords];
+ y = COUNT_ZERO_MSBS(b);
+ x += y;
+ } while(y == FLAC__BITS_PER_WORD);
+ }
+ b <<= y;
+ b <<= 1; /* account for stop bit */
+ ucbits = (ucbits - x - 1) % FLAC__BITS_PER_WORD;
+ msbs = x;
+
+ /* read the binary LSBs */
+ x = b >> (FLAC__BITS_PER_WORD - parameter);
+ if(parameter <= ucbits) {
+ ucbits -= parameter;
+ b <<= parameter;
+ } else {
+ /* there are still bits left to read, they will all be in the next word */
+ crc16_update_word_(br, br->buffer[cwords++]);
+ if (cwords >= words)
+ goto incomplete_lsbs;
+ b = br->buffer[cwords];
+ ucbits += FLAC__BITS_PER_WORD - parameter;
+ x |= b >> ucbits;
+ b <<= FLAC__BITS_PER_WORD - ucbits;
+ }
+ lsbs = x;
- while(1) {
+ /* compose the value */
+ x = (msbs << parameter) | lsbs;
+ *val++ = (int)(x >> 1) ^ -(int)(x & 1);
- /* read unary part */
- while(1) {
- while(cwords < br->words) { /* if we've not consumed up to a
partial tail word... */
- brword b = br->buffer[cwords] << cbits;
- if(b) {
-#if 0 /* is not discernably faster... */ && defined FLAC__CPU_IA32
&& !defined FLAC__NO_ASM && FLAC__BITS_PER_WORD == 32 &&
defined __GNUC__
- asm volatile (
- "bsrl %1, %0;"
- "notl %0;"
- "andl $31, %0;"
- : "=r"(i)
- : "r"(b)
- );
-#else
- i = COUNT_ZERO_MSBS(b);
-#endif
- uval += i;
- cbits += i;
- cbits++; /* skip over stop bit */
- if(cbits >= FLAC__BITS_PER_WORD) { /* faster way of testing if(cbits ==
FLAC__BITS_PER_WORD) */
- crc16_update_word_(br, br->buffer[cwords]);
- cwords++;
- cbits = 0;
- }
- goto break1;
- }
- else {
- uval += FLAC__BITS_PER_WORD - cbits;
- crc16_update_word_(br, br->buffer[cwords]);
- cwords++;
- cbits = 0;
- /* didn't find stop bit yet, have to keep going... */
- }
- }
- /* at this point we've eaten up all the whole words; have to try
- * reading through any tail bytes before calling the read callback.
- * this is a repeat of the above logic adjusted for the fact we
- * don't have a whole word. note though if the client is feeding
- * us data a byte at a time (unlikely), br->consumed_bits may not
- * be zero.
- */
- if(br->bytes) {
- const unsigned end = br->bytes * 8;
- brword b = (br->buffer[cwords] & ~(FLAC__WORD_ALL_ONES >>
end)) << cbits;
- if(b) {
- i = COUNT_ZERO_MSBS(b);
- uval += i;
- cbits += i;
- cbits++; /* skip over stop bit */
- FLAC__ASSERT(cbits < FLAC__BITS_PER_WORD);
- goto break1;
- }
- else {
- uval += end - cbits;
- cbits += end;
- FLAC__ASSERT(cbits < FLAC__BITS_PER_WORD);
- /* didn't find stop bit yet, have to keep going... */
- }
- }
- /* flush registers and read; bitreader_read_from_client_() does
- * not touch br->consumed_bits at all but we still need to set
- * it in case it fails and we have to return false.
- */
- br->consumed_bits = cbits;
+ continue;
+
+ /* at this point we've eaten up all the whole words */
+incomplete_msbs:
+ br->consumed_bits = 0;
+process_tail:
+ do {
br->consumed_words = cwords;
- if(!bitreader_read_from_client_(br))
+
+ /* read the unary MSBs and end bit */
+ if(!FLAC__bitreader_read_unary_unsigned(br, &msbs))
return false;
- cwords = br->consumed_words;
- ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits
+ uval;
- /* + uval to offset our count by the # of unary bits already
- * consumed before the read, because we will add these back
- * in all at once at break1
- */
- }
-break1:
- ucbits -= uval;
- ucbits--; /* account for stop bit */
-
- /* read binary part */
- FLAC__ASSERT(cwords <= br->words);
-
- if(parameter) {
- while(ucbits < parameter) {
- /* flush registers and read; bitreader_read_from_client_() does
- * not touch br->consumed_bits at all but we still need to set
- * it in case it fails and we have to return false.
- */
- br->consumed_bits = cbits;
+ msbs += x;
+ x = ucbits = 0;
+
+ if(0) {
+incomplete_lsbs:
+ br->consumed_bits = 0;
br->consumed_words = cwords;
- if(!bitreader_read_from_client_(br))
- return false;
- cwords = br->consumed_words;
- ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 -
cbits;
- }
- if(cwords < br->words) { /* if we've not consumed up to a partial
tail word... */
- if(cbits) {
- /* this also works when consumed_bits==0, it's just slower than
necessary for that case */
- const unsigned n = FLAC__BITS_PER_WORD - cbits;
- const brword word = br->buffer[cwords];
- if(parameter < n) {
- uval <<= parameter;
- uval |= (word & (FLAC__WORD_ALL_ONES >> cbits)) >>
(n-parameter);
- cbits += parameter;
- }
- else {
- uval <<= n;
- uval |= word & (FLAC__WORD_ALL_ONES >> cbits);
- crc16_update_word_(br, word);
- cwords++;
- cbits = parameter - n;
- if(cbits) { /* parameter > n, i.e. if there are still bits left to
read, there have to be less than 32 so they will all be in the next word */
- uval <<= cbits;
- uval |= (br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits));
- }
- }
- }
- else {
- cbits = parameter;
- uval <<= parameter;
- uval |= br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits);
- }
}
- else {
- /* in this case we're starting our read at a partial tail word;
- * the reader has guaranteed that we have at least 'parameter'
- * bits available to read, which makes this case simpler.
- */
- uval <<= parameter;
- if(cbits) {
- /* this also works when consumed_bits==0, it's just a little slower
than necessary for that case */
- FLAC__ASSERT(cbits + parameter <= br->bytes*8);
- uval |= (br->buffer[cwords] & (FLAC__WORD_ALL_ONES >> cbits))
>> (FLAC__BITS_PER_WORD-cbits-parameter);
- cbits += parameter;
- }
- else {
- cbits = parameter;
- uval |= br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits);
- }
- }
- }
-
- ucbits -= parameter;
- /* compose the value */
- *vals = (int)(uval >> 1 ^ -(int)(uval & 1));
+ /* read the binary LSBs */
+ if(!FLAC__bitreader_read_raw_uint32(br, &lsbs, parameter - ucbits))
+ return false;
+ lsbs = x | lsbs;
- /* are we done? */
- --nvals;
- if(nvals == 0) {
- br->consumed_bits = cbits;
- br->consumed_words = cwords;
- return true;
- }
+ /* compose the value */
+ x = (msbs << parameter) | lsbs;
+ *val++ = (int)(x >> 1) ^ -(int)(x & 1);
+ x = 0;
- uval = 0;
- ++vals;
+ cwords = br->consumed_words;
+ words = br->words;
+ ucbits = FLAC__BITS_PER_WORD - br->consumed_bits;
+ b = br->buffer[cwords] << br->consumed_bits;
+ } while(cwords >= words && val < end);
+ }
+ if(ucbits == 0 && cwords < words) {
+ /* don't leave the head word with no unconsumed bits */
+ crc16_update_word_(br, br->buffer[cwords++]);
+ ucbits = FLAC__BITS_PER_WORD;
}
+
+ br->consumed_bits = FLAC__BITS_PER_WORD - ucbits;
+ br->consumed_words = cwords;
+
+ return true;
}
-#endif
#if 0 /* UNUSED */
FLAC__bool FLAC__bitreader_read_golomb_signed(FLAC__BitReader *br, int *val,
unsigned parameter)
Index: src/libFLAC/stream_decoder.c
==================================================================RCS file:
/cvsroot/flac/flac/src/libFLAC/stream_decoder.c,v
retrieving revision 1.148
diff -u -r1.148 stream_decoder.c
--- src/libFLAC/stream_decoder.c 28 Feb 2008 05:34:26 -0000 1.148
+++ src/libFLAC/stream_decoder.c 14 Mar 2008 12:44:28 -0000
@@ -421,7 +421,7 @@
#ifdef FLAC__CPU_IA32
FLAC__ASSERT(decoder->private_->cpuinfo.type ==
FLAC__CPUINFO_TYPE_IA32);
#ifdef FLAC__HAS_NASM
-#if 1 /*@@@@@@ OPT: not clearly faster, needs more testing */
+#if 0 /*@@@@@@ OPT: not clearly faster, needs more testing */
if(decoder->private_->cpuinfo.data.ia32.bswap)
decoder->private_->local_bitreader_read_rice_signed_block =
FLAC__bitreader_read_rice_signed_block_asm_ia32_bswap;
#endif