thr3ads.net - flac dev - [flac-dev] [PATCH] stream_encoder : Improve selection of residual accumulator width [Jun 2014]

If this information is useful, please help other people find it:
Share via:

Miroslav Lichvar

2014-Jun-19 11:04 UTC

[flac-dev] [PATCH] stream_encoder : Improve selection of residual accumulator width

In the precompute_partition_info_sums_ function, instead of selecting
64-bit accumulator when the signal bps is larger than 16, revert to the
original approach based on partition size, but make room for few extra
bits to not overflow with unusual signals where the average residual
magnitude may be larger than bps.

It slightly improves the performance with standard encoding levels and
16-bit files as the 17-bit side channel can still be processed with the
32-bit accumulator and correctly selects the 64-bit accumulator with
very large 16-bit partitions.

This is related to commits 6f7ec60c and 187e596e.
---
 src/libFLAC/include/private/stream_encoder.h |  6 ++++++
 src/libFLAC/stream_encoder.c                 | 14 ++++++--------
 src/libFLAC/stream_encoder_intrin_sse2.c     |  3 ++-
 src/libFLAC/stream_encoder_intrin_ssse3.c    |  3 ++-
 4 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/src/libFLAC/include/private/stream_encoder.h
b/src/libFLAC/include/private/stream_encoder.h
index d26039a..8147f9e 100644
--- a/src/libFLAC/include/private/stream_encoder.h
+++ b/src/libFLAC/include/private/stream_encoder.h
@@ -37,6 +37,12 @@
 #include <config.h>
 #endif
 
+/*
+ * This is used to avoid overflow with unusual signals in 32-bit
+ * accumulator in the *precompute_partition_info_sums_* functions.
+ */
+#define FLAC__MAX_EXTRA_RESIDUAL_BPS 4
+
 #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined
FLAC__HAS_X86INTRIN
 #include "private/cpu.h"
 #include "FLAC/format.h"
diff --git a/src/libFLAC/stream_encoder.c b/src/libFLAC/stream_encoder.c
index e64ece2..8928a39 100644
--- a/src/libFLAC/stream_encoder.c
+++ b/src/libFLAC/stream_encoder.c
@@ -3872,10 +3872,9 @@ void precompute_partition_info_sums_(
 	FLAC__ASSERT(default_partition_samples > predictor_order);
 
 #if defined(FLAC__CPU_IA32) && !defined FLAC__NO_ASM && defined
FLAC__HAS_NASM && 0
-	/* WATCHOUT: "+ bps" is an assumption that the average residual
magnitude will not be more than "bps" bits */
-	/* previously the condition was:
if(FLAC__bitmath_ilog2(default_partition_samples) + bps < 32) */
-	/* see
http://git.xiph.org/?p=flac.git;a=commit;h=6f7ec60c7e7f05f5ab0b1cf6b7b0945e44afcd4b
*/
-	if(bps <= 16) {
+	/* WATCHOUT: "+ bps + FLAC__MAX_EXTRA_RESIDUAL_BPS" is the maximum
+	 * assumed size of the average residual magnitude */
+	if(FLAC__bitmath_ilog2(default_partition_samples) + bps +
FLAC__MAX_EXTRA_RESIDUAL_BPS < 32) {
 		FLAC__precompute_partition_info_sums_32bit_asm_ia32_(residual,
abs_residual_partition_sums, residual_samples + predictor_order,
predictor_order, min_partition_order, max_partition_order);
 		return;
 	}
@@ -3884,10 +3883,9 @@ void precompute_partition_info_sums_(
 	/* first do max_partition_order */
 	{
 		unsigned partition, residual_sample, end = (unsigned)(-(int)predictor_order);
-		/* WATCHOUT: "+ bps" is an assumption that the average residual
magnitude will not be more than "bps" bits */
-		/* previously the condition was:
if(FLAC__bitmath_ilog2(default_partition_samples) + bps < 32) */
-		/* see
http://git.xiph.org/?p=flac.git;a=commit;h=6f7ec60c7e7f05f5ab0b1cf6b7b0945e44afcd4b
*/
-		if(bps <= 16) {
+		/* WATCHOUT: "+ bps + FLAC__MAX_EXTRA_RESIDUAL_BPS" is the maximum
+		 * assumed size of the average residual magnitude */
+		if(FLAC__bitmath_ilog2(default_partition_samples) + bps +
FLAC__MAX_EXTRA_RESIDUAL_BPS < 32) {
 			FLAC__uint32 abs_residual_partition_sum;
 
 			for(partition = residual_sample = 0; partition < partitions; partition++)
{
diff --git a/src/libFLAC/stream_encoder_intrin_sse2.c
b/src/libFLAC/stream_encoder_intrin_sse2.c
index bef5545..4e9d5db 100644
--- a/src/libFLAC/stream_encoder_intrin_sse2.c
+++ b/src/libFLAC/stream_encoder_intrin_sse2.c
@@ -37,6 +37,7 @@
 #ifndef FLAC__NO_ASM
 #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined
FLAC__HAS_X86INTRIN
 #include "private/stream_encoder.h"
+#include "private/bitmath.h"
 #ifdef FLAC__SSE2_SUPPORTED
 
 #include <stdlib.h>    /* for abs() */
@@ -58,7 +59,7 @@ void FLAC__precompute_partition_info_sums_intrin_sse2(const
FLAC__int32 residual
 		unsigned e1, e3;
 		__m128i mm_res, mm_sum, mm_mask;
 
-		if(bps <= 16) {
+		if(FLAC__bitmath_ilog2(default_partition_samples) + bps +
FLAC__MAX_EXTRA_RESIDUAL_BPS < 32) {
 			for(partition = residual_sample = 0; partition < partitions; partition++)
{
 				end += default_partition_samples;
 				mm_sum = _mm_setzero_si128();
diff --git a/src/libFLAC/stream_encoder_intrin_ssse3.c
b/src/libFLAC/stream_encoder_intrin_ssse3.c
index 95b5f62..669536a 100644
--- a/src/libFLAC/stream_encoder_intrin_ssse3.c
+++ b/src/libFLAC/stream_encoder_intrin_ssse3.c
@@ -37,6 +37,7 @@
 #ifndef FLAC__NO_ASM
 #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined
FLAC__HAS_X86INTRIN
 #include "private/stream_encoder.h"
+#include "private/bitmath.h"
 #ifdef FLAC__SSSE3_SUPPORTED
 
 #include <stdlib.h>    /* for abs() */
@@ -58,7 +59,7 @@ void FLAC__precompute_partition_info_sums_intrin_ssse3(const
FLAC__int32 residua
 		unsigned e1, e3;
 		__m128i mm_res, mm_sum;
 
-		if(bps <= 16) {
+		if(FLAC__bitmath_ilog2(default_partition_samples) + bps +
FLAC__MAX_EXTRA_RESIDUAL_BPS < 32) {
 			for(partition = residual_sample = 0; partition < partitions; partition++)
{
 				end += default_partition_samples;
 				mm_sum = _mm_setzero_si128();
-- 
1.9.3

lvqcl

2014-Jun-19 11:30 UTC

head link

[flac-dev] [PATCH] stream_encoder : Improve selection of residual accumulator width

Miroslav Lichvar wrote:
> In the precompute_partition_info_sums_ function, instead of selecting
> 64-bit accumulator when the signal bps is larger than 16, revert to the
> original approach based on partition size, but make room for few extra
> bits to not overflow with unusual signals where the average residual
> magnitude may be larger than bps.
>
> It slightly improves the performance with standard encoding levels and
> 16-bit files as the 17-bit side channel can still be processed with the
> 32-bit accumulator and correctly selects the 64-bit accumulator with
> very large 16-bit partitions.
>
BTW, what can you say about the following place in stream_decoder.c
in read_subframe_lpc_() function:

     /*@@@@@@ technically not pessimistic enough, should be more like
     if( (FLAC__uint64)order * ((((FLAC__uint64)1)<<bps)-1) *
((1<<subframe->qlp_coeff_precision)-1) < (((FLAC__uint64)-1)
<< 32) )
     */
     if(bps + subframe->qlp_coeff_precision + FLAC__bitmath_ilog2(order)
<= 32)

Is it really "not pessimistic enough"? Can it be changed similarly to
your patch
for stream_encoder.c?

Miroslav Lichvar

2014-Jun-19 13:30 UTC

head link

[flac-dev] [PATCH] stream_encoder : Improve selection of residual accumulator width

On Thu, Jun 19, 2014 at 03:30:22PM +0400, lvqcl wrote:> BTW, what can you say about the following place in stream_decoder.c
> in read_subframe_lpc_() function:
> 
>      /*@@@@@@ technically not pessimistic enough, should be more like
>      if( (FLAC__uint64)order * ((((FLAC__uint64)1)<<bps)-1) *
((1<<subframe->qlp_coeff_precision)-1) < (((FLAC__uint64)-1)
<< 32) )
>      */
>      if(bps + subframe->qlp_coeff_precision + FLAC__bitmath_ilog2(order)
<= 32)
> 
> Is it really "not pessimistic enough"? Can it be changed
similarly to your patch
> for stream_encoder.c?
Good question. I'm not sure what exactly Josh meant by that comment.
The git message says just "minor comments".

I think the right size of the expression was meant to be
(FLAC__uint64)1<<32, otherwise it doesn't make much sense to me. With
that it can rewritten in log2 as

bps + subframe->qlp_coeff_precision + FLAC__bitmath_ilog2(order - 1) < 32

This is indeed more pessimistic that the currently used check (< 32 vs
<= 32), but I think both make a mistake that the qlp coefficients and
residuals are unsigned integers and are actually more pessimistic than
they would need to be if residuals were at most bps wide. With signed
multiplication I think the correct check would actually be

bps + subframe->qlp_coeff_precision + FLAC__bitmath_ilog2(order) - 1 <= 32

But, as we have seen with unusual data the residual signal can be
wider than bps. The FLAC format specification doesn't seem to mention
this. Should it be treated as a valid FLAC stream? Based on the
analysis above, the currently used check allows residuals at most 1
bit wider than bps. Another problem could be that the
local_lpc_restore_signal_16bit function may truncate the residual to
16 bits.

-- 
Miroslav Lichvar

lvqcl

2014-Jun-20 09:21 UTC

head link

[flac-dev] [PATCH] stream_encoder : Improve selection of residual accumulator width

Miroslav Lichvar ?????:
> +/*
> + * This is used to avoid overflow with unusual signals in 32-bit
> + * accumulator in the *precompute_partition_info_sums_* functions.
> + */
> +#define FLAC__MAX_EXTRA_RESIDUAL_BPS 4
> +		/* WATCHOUT: "+ bps + FLAC__MAX_EXTRA_RESIDUAL_BPS" is the
maximum
> +		 * assumed size of the average residual magnitude */
> +		if(FLAC__bitmath_ilog2(default_partition_samples) + bps +
FLAC__MAX_EXTRA_RESIDUAL_BPS < 32) {
 From FLAC__fixed_compute_residual:
     residual[i] = data[i] - 4*data[i-1] + 6*data[i-2] - 4*data[i-3] +
data[i-4];

so max(residual[i]) == 16 * max(data[j]), or:  max_bps(residual[]) == 4 +
max_bps(data[]).

Am I right that it's the reason why FLAC__MAX_EXTRA_RESIDUAL_BPS is equal to
4?

Miroslav Lichvar

2014-Jun-20 09:57 UTC

head link

[flac-dev] [PATCH] stream_encoder : Improve selection of residual accumulator width

On Fri, Jun 20, 2014 at 01:21:03PM +0400, lvqcl wrote:> Miroslav Lichvar ?????:
> 
> > +/*
> > + * This is used to avoid overflow with unusual signals in 32-bit
> > + * accumulator in the *precompute_partition_info_sums_* functions.
> > + */
> > +#define FLAC__MAX_EXTRA_RESIDUAL_BPS 4
> 
> > +		/* WATCHOUT: "+ bps + FLAC__MAX_EXTRA_RESIDUAL_BPS" is
the maximum
> > +		 * assumed size of the average residual magnitude */
> > +		if(FLAC__bitmath_ilog2(default_partition_samples) + bps +
FLAC__MAX_EXTRA_RESIDUAL_BPS < 32) {
> 
>  From FLAC__fixed_compute_residual:
>      residual[i] = data[i] - 4*data[i-1] + 6*data[i-2] - 4*data[i-3] +
data[i-4];
> 
> so max(residual[i]) == 16 * max(data[j]), or:  max_bps(residual[]) == 4 +
max_bps(data[]).
> 
> Am I right that it's the reason why FLAC__MAX_EXTRA_RESIDUAL_BPS is
equal to 4?
Not really, it's just a guess. With LPC the maximum possible residual
could be much larger than with the fixed predictor if the coefficients
were chosen randomly, but the autocorrelation routine should keep them
more reasonable. The snippet6.wav file needed 2, so I made it slightly
larger to have some extra room.

As overflow in the accumulator won't result in a data loss, I think
this is good enough until someone can figure out a better approach.

-- 
Miroslav Lichvar

Erik de Castro Lopo

2014-Jun-28 00:22 UTC

head link

[flac-dev] [PATCH] stream_encoder : Improve selection of residual accumulator width

Miroslav Lichvar wrote:
> In the precompute_partition_info_sums_ function, instead of selecting
> 64-bit accumulator when the signal bps is larger than 16, revert to the
> original approach based on partition size, but make room for few extra
> bits to not overflow with unusual signals where the average residual
> magnitude may be larger than bps.
> 
> It slightly improves the performance with standard encoding levels and
> 16-bit files as the 17-bit side channel can still be processed with the
> 32-bit accumulator and correctly selects the 64-bit accumulator with
> very large 16-bit partitions.
> 
> This is related to commits 6f7ec60c and 187e596e.
Sorry I wasn't able to deal with this patch when it came in because I
was busy with $day_job.

There was a lot of discussion about this patch but I can't really 
figure out from the thread what the conclusion was.

The patch still applies, and the test suite passes. If there is a 
problem with this patch that the test suite doesn't catch, we should
write a test that does catch it.

If there is no problem with the patch I'll push it.

Erik
-- 
----------------------------------------------------------------------
Erik de Castro Lopo
http://www.mega-nerd.com/

lvqcl

2014-Jun-29 15:20 UTC

head link

[flac-dev] [PATCH] stream_encoder : Improve selection of residual accumulator width

Erik de Castro Lopo wrote:
>> It slightly improves the performance with standard encoding levels and
>> 16-bit files as the 17-bit side channel can still be processed with the
>> 32-bit accumulator and correctly selects the 64-bit accumulator with
>> very large 16-bit partitions.
>>
>> This is related to commits 6f7ec60c and 187e596e.
>
> Sorry I wasn't able to deal with this patch when it came in because I
> was busy with $day_job.
>
> There was a lot of discussion about this patch but I can't really
> figure out from the thread what the conclusion was.
>
> The patch still applies, and the test suite passes. If there is a
> problem with this patch that the test suite doesn't catch, we should
> write a test that does catch it.
>
> If there is no problem with the patch I'll push it.
As I see it:

FLAC 1.2.1 and 1.3.0 cannot encode snippet6.wav with -7 and -8 encoding modes.
But they are able to do this with --disable-fixed-subframes option. This
implies that snippet6.wav triggers a problem somewthere inside
FLAC__fixed_compute_residual(data[], data_len, order, residual[]) function.

And indeed, max. possible residual value is 16 times bigger than max. value
in data[] array:

     residual[i] = data[i] - 4*data[i-1] + 6*data[i-2] - 4*data[i-3] + data[i-4]

16 == 2^4, so max. bps value for residual[] is equal to max. bps for data[] + 4.
The value of FLAC__MAX_EXTRA_RESIDUAL_BPS == 4 is enough to fix this problem
with FLAC__fixed_compute_residual().

Erik de Castro Lopo

2014-Jul-04 11:39 UTC

head link

[flac-dev] [PATCH] stream_encoder : Improve selection of residual accumulator width

Miroslav Lichvar wrote:
> In the precompute_partition_info_sums_ function, instead of selecting
> 64-bit accumulator when the signal bps is larger than 16, revert to the
> original approach based on partition size, but make room for few extra
> bits to not overflow with unusual signals where the average residual
> magnitude may be larger than bps.
> 
> It slightly improves the performance with standard encoding levels and
> 16-bit files as the 17-bit side channel can still be processed with the
> 32-bit accumulator and correctly selects the 64-bit accumulator with
> very large 16-bit partitions.
> 
> This is related to commits 6f7ec60c and 187e596e.
Applied. Thanks.

Erik
-- 
----------------------------------------------------------------------
Erik de Castro Lopo
http://www.mega-nerd.com/

Possibly Parallel Threads

Search for more possibly parallel threads

flac dev - Jun 2014 - [PATCH] stream_encoder : Improve selection of residual accumulator width

[flac-dev] [PATCH] stream_encoder : Improve selection of residual accumulator width

[flac-dev] [PATCH] stream_encoder : Improve selection of residual accumulator width

[flac-dev] [PATCH] stream_encoder : Improve selection of residual accumulator width

[flac-dev] [PATCH] stream_encoder : Improve selection of residual accumulator width

[flac-dev] [PATCH] stream_encoder : Improve selection of residual accumulator width

[flac-dev] [PATCH] stream_encoder : Improve selection of residual accumulator width

[flac-dev] [PATCH] stream_encoder : Improve selection of residual accumulator width

[flac-dev] [PATCH] stream_encoder : Improve selection of residual accumulator width

Possibly Parallel Threads