thr3ads.net - flac dev - [Flac-dev] libFLAC bitbuffer optimizations [Jan 2005]

If this information is useful, please help other people find it:
Share via:

Eric Wong

2005-Jan-01 17:37 UTC

[Flac-dev] libFLAC bitbuffer optimizations

Josh Coalson <xflac@yahoo.com> wrote:> thanks for the patch.
No prob :)
> also, if you have miroslav's patch again a more updated version
> of bitbuffer.c that would be great.  I have been meaning to get
> around to applying it for a long time.
This is Miroslav's patch, from the mailing list post I dug up in the
archives:

--- orig/src/libFLAC/bitbuffer.c
+++ mod/src/libFLAC/bitbuffer.c
@@ -62,6 +62,24 @@
  * keeping in mind the limit from the first paragraph.
  */
 static const unsigned FLAC__BITBUFFER_DEFAULT_CAPACITY = ((65536 - 64) * 8) /
FLAC__BITS_PER_BLURB; /* blurbs */
+static const unsigned char byte_to_unary_table[] = {
+      8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
+      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
 
 #if FLAC__BITS_PER_BLURB == 8
 #define FLAC__BITS_PER_BLURB_LOG2 3
@@ -70,6 +88,7 @@
 #define FLAC__BLURB_TOP_BIT_ONE ((FLAC__byte)0x80)
 #define BLURB_BIT_TO_MASK(b) (((FLAC__blurb)'\x80') >> (b))
 #define CRC16_UPDATE_BLURB(bb, blurb, crc) FLAC__CRC16_UPDATE((blurb), (crc));
+#define FLAC__ALIGNED_BLURB_UNARY(blurb) (byte_to_unary_table[blurb])
 #elif FLAC__BITS_PER_BLURB == 32
 #define FLAC__BITS_PER_BLURB_LOG2 5
 #define FLAC__BYTES_PER_BLURB 4
@@ -77,6 +96,13 @@
 #define FLAC__BLURB_TOP_BIT_ONE ((FLAC__uint32)0x80000000)
 #define BLURB_BIT_TO_MASK(b) (((FLAC__blurb)0x80000000) >> (b))
 #define CRC16_UPDATE_BLURB(bb, blurb, crc) crc16_update_blurb((bb), (blurb));
+#define FLAC__ALIGNED_BLURB_UNARY(blurb) ((blurb) <= 0xff \
+		? byte_to_unary_table[blurb] + 24 \
+		: ((blurb) <= 0xffff \
+			? byte_to_unary_table[(blurb) >> 8] + 16 \
+			: ((blurb) <=0xffffff \
+				? byte_to_unary_table[(blurb) >> 16] + 8 \
+				: byte_to_unary_table[(blurb) >> 24])))
 #else
 /* ERROR, only sizes of 8 and 32 are supported */
 #endif
@@ -2109,114 +2135,18 @@
 	if(nvals == 0)
 		return true;
 
+	cbits = bb->consumed_bits;
 	i = bb->consumed_blurbs;
-	/*
-	 * We unroll the main loop to take care of partially consumed blurbs here.
-	 */
-	if(bb->consumed_bits > 0) {
-		save_blurb = blurb = buffer[i];
-		cbits = bb->consumed_bits;
-		blurb <<= cbits;
-
-		while(1) {
-			if(state == 0) {
-				if(blurb) {
-					for(j = 0; !(blurb & FLAC__BLURB_TOP_BIT_ONE); j++)
-						blurb <<= 1;
-					msbs += j;
-
-					/* dispose of the unary end bit */
-					blurb <<= 1;
-					j++;
-					cbits += j;
-
-					uval = 0;
-					lsbs_left = parameter;
-					state++;
-					if(cbits == FLAC__BITS_PER_BLURB) {
-						cbits = 0;
-						CRC16_UPDATE_BLURB(bb, save_blurb, bb->read_crc16);
-						break;
-					}
-				}
-				else {
-					msbs += FLAC__BITS_PER_BLURB - cbits;
-					cbits = 0;
-					CRC16_UPDATE_BLURB(bb, save_blurb, bb->read_crc16);
-					break;
-				}
-			}
-			else {
-				const unsigned available_bits = FLAC__BITS_PER_BLURB - cbits;
-				if(lsbs_left >= available_bits) {
-					uval <<= available_bits;
-					uval |= (blurb >> cbits);
-					cbits = 0;
-					CRC16_UPDATE_BLURB(bb, save_blurb, bb->read_crc16);
-
-					if(lsbs_left == available_bits) {
-						/* compose the value */
-						uval |= (msbs << parameter);
-						if(uval & 1)
-							vals[val_i++] = -((int)(uval >> 1)) - 1;
-						else
-							vals[val_i++] = (int)(uval >> 1);
-						if(val_i == nvals)
-							break;
-
-						msbs = 0;
-						state = 0;
-					}
-
-					lsbs_left -= available_bits;
-					break;
-				}
-				else {
-					uval <<= lsbs_left;
-					uval |= (blurb >> (FLAC__BITS_PER_BLURB - lsbs_left));
-					blurb <<= lsbs_left;
-					cbits += lsbs_left;
-
-					/* compose the value */
-					uval |= (msbs << parameter);
-					if(uval & 1)
-						vals[val_i++] = -((int)(uval >> 1)) - 1;
-					else
-						vals[val_i++] = (int)(uval >> 1);
-					if(val_i == nvals) {
-						/* back up one if we exited the for loop because we read all nvals but
the end came in the middle of a blurb */
-						i--;
-						break;
-					}
-
-					msbs = 0;
-					state = 0;
-				}
-			}
-		}
-		i++;
-
-		bb->consumed_blurbs = i;
-		bb->consumed_bits = cbits;
-		bb->total_consumed_bits = (i << FLAC__BITS_PER_BLURB_LOG2) | cbits;
-	}
-
-	/*
-	 * Now that we are blurb-aligned the logic is slightly simpler
-	 */
+	
 	while(val_i < nvals) {
-		for( ; i < bb->blurbs && val_i < nvals; i++) {
-			save_blurb = blurb = buffer[i];
-			cbits = 0;
+		for( ; i < bb->blurbs; i++) {
+			blurb = (save_blurb = buffer[i]) << cbits;
 			while(1) {
 				if(state == 0) {
 					if(blurb) {
-						for(j = 0; !(blurb & FLAC__BLURB_TOP_BIT_ONE); j++)
-							blurb <<= 1;
+						j = FLAC__ALIGNED_BLURB_UNARY(blurb);
 						msbs += j;
 
-						/* dispose of the unary end bit */
-						blurb <<= 1;
 						j++;
 						cbits += j;
 
@@ -2228,6 +2158,7 @@
 							CRC16_UPDATE_BLURB(bb, save_blurb, bb->read_crc16);
 							break;
 						}
+						blurb <<= j;
 					}
 					else {
 						msbs += FLAC__BITS_PER_BLURB - cbits;
@@ -2247,12 +2178,11 @@
 						if(lsbs_left == available_bits) {
 							/* compose the value */
 							uval |= (msbs << parameter);
-							if(uval & 1)
-								vals[val_i++] = -((int)(uval >> 1)) - 1;
-							else
-								vals[val_i++] = (int)(uval >> 1);
-							if(val_i == nvals)
-								break;
+							vals[val_i++] = (int)(uval >> 1 ^ -(int)(uval & 1));
+							if(val_i == nvals) {
+								i++;
+								goto break2;
+							}
 
 							msbs = 0;
 							state = 0;
@@ -2262,29 +2192,23 @@
 						break;
 					}
 					else {
+						cbits += lsbs_left;
 						uval <<= lsbs_left;
 						uval |= (blurb >> (FLAC__BITS_PER_BLURB - lsbs_left));
 						blurb <<= lsbs_left;
-						cbits += lsbs_left;
 
 						/* compose the value */
 						uval |= (msbs << parameter);
-						if(uval & 1)
-							vals[val_i++] = -((int)(uval >> 1)) - 1;
-						else
-							vals[val_i++] = (int)(uval >> 1);
-						if(val_i == nvals) {
-							/* back up one if we exited the for loop because we read all nvals but
the end came in the middle of a blurb */
-							i--;
-							break;
-						}
-
+						vals[val_i++] = (int)(uval >> 1 ^ -(int)(uval & 1));
+						if(val_i == nvals)
+							goto break2;
 						msbs = 0;
 						state = 0;
 					}
 				}
 			}
 		}
+break2:
 		bb->consumed_blurbs = i;
 		bb->consumed_bits = cbits;
 		bb->total_consumed_bits = (i << FLAC__BITS_PER_BLURB_LOG2) | cbits;




This is my patch on top of Miroslavs, which micro optimizes away some indexing
and comparison overhead.

--- orig/src/libFLAC/bitbuffer.c
+++ mod/src/libFLAC/bitbuffer.c
@@ -2123,7 +2123,7 @@
 {
 	const FLAC__blurb *buffer = bb->buffer;
 
-	unsigned i, j, val_i = 0;
+	unsigned i, j, val_i = nvals;
 	unsigned cbits = 0, uval = 0, msbs = 0, lsbs_left = 0;
 	FLAC__blurb blurb, save_blurb;
 	unsigned state = 0; /* 0 = getting unary MSBs, 1 = getting binary LSBs */
@@ -2138,7 +2138,7 @@
 	cbits = bb->consumed_bits;
 	i = bb->consumed_blurbs;
 	
-	while(val_i < nvals) {
+	while(val_i != 0) {
 		for( ; i < bb->blurbs; i++) {
 			blurb = (save_blurb = buffer[i]) << cbits;
 			while(1) {
@@ -2178,11 +2178,13 @@
 						if(lsbs_left == available_bits) {
 							/* compose the value */
 							uval |= (msbs << parameter);
-							vals[val_i++] = (int)(uval >> 1 ^ -(int)(uval & 1));
-							if(val_i == nvals) {
+							*vals = (int)(uval >> 1 ^ -(int)(uval & 1));
+							--val_i;
+							if(val_i == 0) {
 								i++;
 								goto break2;
 							}
+							*(++vals);
 
 							msbs = 0;
 							state = 0;
@@ -2199,9 +2201,12 @@
 
 						/* compose the value */
 						uval |= (msbs << parameter);
-						vals[val_i++] = (int)(uval >> 1 ^ -(int)(uval & 1));
-						if(val_i == nvals)
+						*vals = (int)(uval >> 1 ^ -(int)(uval & 1));
+						--val_i;
+						if(val_i == 0)
 							goto break2;
+						*(++vals);
+
 						msbs = 0;
 						state = 0;
 					}
@@ -2212,7 +2217,7 @@
 		bb->consumed_blurbs = i;
 		bb->consumed_bits = cbits;
 		bb->total_consumed_bits = (i << FLAC__BITS_PER_BLURB_LOG2) | cbits;
-		if(val_i < nvals) {
+		if(val_i > 0) {
 			if(!bitbuffer_read_from_client_(bb, read_callback, client_data))
 				return false;
 			/* these must be zero because we can only get here if we got to the end of
the buffer */


> 
> btw how are you playing it on the ipod?
With a modified version of Music Player Daemon (MPD, www.musicpd.org).
http://www.ipodlinux.org/MPD has more info about it.
> not sure how to help out with lpc_restore_signal().  there are
> x86 and PPC versions of the routines in CVS that might be good
> references.  it is basically a multiply-accumulate loop but you
> have to be careful about overflow.  an ARM7 version of this
> function would help the speed on several other FLAC-supported
> devices.
This is my current C version of lpc_restore_signal.  It only works when
(order<=8) (not a problem for me right now), but the inner loop for larger
orders could be done using Duff's device and still be fast without getting
much
bigger.  I reading up on ARM assembly right now.

void FLAC__lpc_restore_signal(const FLAC__int32 residual[], unsigned data_len,
const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32
data[])
{
	unsigned i;
	FLAC__int32 sum;
	const FLAC__int32 *history, *qlp;
	const int tmp = (0 - order - 1);
	
	for(i = data_len; i != 0; --i) {
		sum = 0;
		qlp = &qlp_coeff[order];
		history = &data[tmp];
		switch (order) {
			case 8: sum += (*(--qlp)) * (*(++history));
			case 7: sum += (*(--qlp)) * (*(++history));
			case 6: sum += (*(--qlp)) * (*(++history));
			case 5: sum += (*(--qlp)) * (*(++history));
			case 4: sum += (*(--qlp)) * (*(++history));
			case 3: sum += (*(--qlp)) * (*(++history));
			case 2: sum += (*(--qlp)) * (*(++history));
			case 1: sum += (*(--qlp)) * (*(++history));
				break;
		}
		*(data++) = *(residual++) + (sum >> lp_quantization);
	}
}

-- 
Eric Wong
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
Url :
http://lists.xiph.org/pipermail/flac-dev/attachments/20050101/90fa859a/attachment.pgp

Josh Coalson

2005-Jan-24 18:53 UTC

head link

[Flac-dev] libFLAC bitbuffer optimizations

Eric, I finally got around to your patches after Miroslav's.
the first one (the memcpy/memset replacement) I had problems
with, one because the buffers can overlap so I had to use
memmove (is this usually assembly in libc too?) and also the
endpoints looked wrong, for my full patch see below. speedup
for me was around 3%

the second patch got another 2%.  a question though, why do
you have:

+              *(++vals);

in two places instead of just ++vals; ?

Josh


Index: bitbuffer.c
==================================================================RCS file:
/cvsroot/flac/flac/src/libFLAC/bitbuffer.c,v
retrieving revision 1.55
diff -u -r1.55 bitbuffer.c
--- bitbuffer.c 25 Jan 2005 02:37:08 -0000  1.55
+++ bitbuffer.c 25 Jan 2005 02:39:52 -0000
@@ -228,11 +228,27 @@

    /* first shift the unconsumed buffer data toward the front as much
as possible */
    if(bb->total_consumed_bits >= FLAC__BITS_PER_BLURB) {
-       unsigned l = 0, r = bb->consumed_blurbs, r_end = bb->blurbs +
(bb->bits? 1:0);
+#if FLAC__BITS_PER_BLURB == 8
+       /*
+        * memset and memcpy are usually implemented in assembly
language
+        * by the system libc, and they can be much faster
+        */
+       const unsigned r_end = bb->blurbs + (bb->bits? 1:0);
+       const unsigned r = bb->consumed_blurbs, l = r_end - r;
+       memmove(&bb->buffer[0], &bb->buffer[r], l);
+       memset(&bb->buffer[l], 0, r);
+#elif FLAC__BITS_PER_BLURB == 32
+       /* still needs optimization */
+       const unsigned r_end = bb->blurbs + (bb->bits? 1:0);
+       unsigned l = 0, r = bb->consumed_blurbs;
        for( ; r < r_end; l++, r++)
            bb->buffer[l] = bb->buffer[r];
        for( ; l < r_end; l++)
            bb->buffer[l] = 0;
+#else
+       FLAC__ASSERT(false); /* ERROR, only sizes of 8 and 32 are
supported */
+#endif /* FLAC__BITS_PER_BLURB == 32 or 8 */
+
        bb->blurbs -= bb->consumed_blurbs;
        bb->total_bits -= FLAC__BLURBS_TO_BITS(bb->consumed_blurbs);
        bb->consumed_blurbs = 0;



		
__________________________________ 
Do you Yahoo!? 
Yahoo! Mail - Easier than ever with enhanced search. Learn more.
http://info.mail.yahoo.com/mail_250

Eric Wong

2005-Jan-24 19:25 UTC

head link

[Flac-dev] libFLAC bitbuffer optimizations

Josh Coalson <xflac@yahoo.com> wrote:> Eric, I finally got around to your patches after Miroslav's.
> the first one (the memcpy/memset replacement) I had problems
> with, one because the buffers can overlap so I had to use
> memmove (is this usually assembly in libc too?) and also the
> endpoints looked wrong, for my full patch see below. speedup
> for me was around 3%
memmove is based on memcpy in uclibc for ARM, and I'd imagine so for
other platforms as well.  I had memmove originally, but was convinced at
the time they wouldn't overlap.
> the second patch got another 2%.  a question though, why do
> you have:
> 
> +              *(++vals);
> 
> in two places instead of just ++vals; ?
Lack of sleep :)

-- 
Eric Wong
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
Url :
http://lists.xiph.org/pipermail/flac-dev/attachments/20050124/35890937/attachment.pgp

Possibly Parallel Threads

Search for more apparently analagous threads

flac dev - Jan 2005 - libFLAC bitbuffer optimizations

[Flac-dev] libFLAC bitbuffer optimizations

[Flac-dev] libFLAC bitbuffer optimizations

[Flac-dev] libFLAC bitbuffer optimizations

Possibly Parallel Threads