Miroslav Lichvar
2014-Jun-19  11:04 UTC
[flac-dev] [PATCH] stream_encoder : Improve selection of residual accumulator width
In the precompute_partition_info_sums_ function, instead of selecting
64-bit accumulator when the signal bps is larger than 16, revert to the
original approach based on partition size, but make room for few extra
bits to not overflow with unusual signals where the average residual
magnitude may be larger than bps.
It slightly improves the performance with standard encoding levels and
16-bit files as the 17-bit side channel can still be processed with the
32-bit accumulator and correctly selects the 64-bit accumulator with
very large 16-bit partitions.
This is related to commits 6f7ec60c and 187e596e.
---
 src/libFLAC/include/private/stream_encoder.h |  6 ++++++
 src/libFLAC/stream_encoder.c                 | 14 ++++++--------
 src/libFLAC/stream_encoder_intrin_sse2.c     |  3 ++-
 src/libFLAC/stream_encoder_intrin_ssse3.c    |  3 ++-
 4 files changed, 16 insertions(+), 10 deletions(-)
diff --git a/src/libFLAC/include/private/stream_encoder.h
b/src/libFLAC/include/private/stream_encoder.h
index d26039a..8147f9e 100644
--- a/src/libFLAC/include/private/stream_encoder.h
+++ b/src/libFLAC/include/private/stream_encoder.h
@@ -37,6 +37,12 @@
 #include <config.h>
 #endif
 
+/*
+ * This is used to avoid overflow with unusual signals in 32-bit
+ * accumulator in the *precompute_partition_info_sums_* functions.
+ */
+#define FLAC__MAX_EXTRA_RESIDUAL_BPS 4
+
 #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined
FLAC__HAS_X86INTRIN
 #include "private/cpu.h"
 #include "FLAC/format.h"
diff --git a/src/libFLAC/stream_encoder.c b/src/libFLAC/stream_encoder.c
index e64ece2..8928a39 100644
--- a/src/libFLAC/stream_encoder.c
+++ b/src/libFLAC/stream_encoder.c
@@ -3872,10 +3872,9 @@ void precompute_partition_info_sums_(
 	FLAC__ASSERT(default_partition_samples > predictor_order);
 
 #if defined(FLAC__CPU_IA32) && !defined FLAC__NO_ASM && defined
FLAC__HAS_NASM && 0
-	/* WATCHOUT: "+ bps" is an assumption that the average residual
magnitude will not be more than "bps" bits */
-	/* previously the condition was:
if(FLAC__bitmath_ilog2(default_partition_samples) + bps < 32) */
-	/* see
http://git.xiph.org/?p=flac.git;a=commit;h=6f7ec60c7e7f05f5ab0b1cf6b7b0945e44afcd4b
*/
-	if(bps <= 16) {
+	/* WATCHOUT: "+ bps + FLAC__MAX_EXTRA_RESIDUAL_BPS" is the maximum
+	 * assumed size of the average residual magnitude */
+	if(FLAC__bitmath_ilog2(default_partition_samples) + bps +
FLAC__MAX_EXTRA_RESIDUAL_BPS < 32) {
 		FLAC__precompute_partition_info_sums_32bit_asm_ia32_(residual,
abs_residual_partition_sums, residual_samples + predictor_order,
predictor_order, min_partition_order, max_partition_order);
 		return;
 	}
@@ -3884,10 +3883,9 @@ void precompute_partition_info_sums_(
 	/* first do max_partition_order */
 	{
 		unsigned partition, residual_sample, end = (unsigned)(-(int)predictor_order);
-		/* WATCHOUT: "+ bps" is an assumption that the average residual
magnitude will not be more than "bps" bits */
-		/* previously the condition was:
if(FLAC__bitmath_ilog2(default_partition_samples) + bps < 32) */
-		/* see
http://git.xiph.org/?p=flac.git;a=commit;h=6f7ec60c7e7f05f5ab0b1cf6b7b0945e44afcd4b
*/
-		if(bps <= 16) {
+		/* WATCHOUT: "+ bps + FLAC__MAX_EXTRA_RESIDUAL_BPS" is the maximum
+		 * assumed size of the average residual magnitude */
+		if(FLAC__bitmath_ilog2(default_partition_samples) + bps +
FLAC__MAX_EXTRA_RESIDUAL_BPS < 32) {
 			FLAC__uint32 abs_residual_partition_sum;
 
 			for(partition = residual_sample = 0; partition < partitions; partition++)
{
diff --git a/src/libFLAC/stream_encoder_intrin_sse2.c
b/src/libFLAC/stream_encoder_intrin_sse2.c
index bef5545..4e9d5db 100644
--- a/src/libFLAC/stream_encoder_intrin_sse2.c
+++ b/src/libFLAC/stream_encoder_intrin_sse2.c
@@ -37,6 +37,7 @@
 #ifndef FLAC__NO_ASM
 #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined
FLAC__HAS_X86INTRIN
 #include "private/stream_encoder.h"
+#include "private/bitmath.h"
 #ifdef FLAC__SSE2_SUPPORTED
 
 #include <stdlib.h>    /* for abs() */
@@ -58,7 +59,7 @@ void FLAC__precompute_partition_info_sums_intrin_sse2(const
FLAC__int32 residual
 		unsigned e1, e3;
 		__m128i mm_res, mm_sum, mm_mask;
 
-		if(bps <= 16) {
+		if(FLAC__bitmath_ilog2(default_partition_samples) + bps +
FLAC__MAX_EXTRA_RESIDUAL_BPS < 32) {
 			for(partition = residual_sample = 0; partition < partitions; partition++)
{
 				end += default_partition_samples;
 				mm_sum = _mm_setzero_si128();
diff --git a/src/libFLAC/stream_encoder_intrin_ssse3.c
b/src/libFLAC/stream_encoder_intrin_ssse3.c
index 95b5f62..669536a 100644
--- a/src/libFLAC/stream_encoder_intrin_ssse3.c
+++ b/src/libFLAC/stream_encoder_intrin_ssse3.c
@@ -37,6 +37,7 @@
 #ifndef FLAC__NO_ASM
 #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined
FLAC__HAS_X86INTRIN
 #include "private/stream_encoder.h"
+#include "private/bitmath.h"
 #ifdef FLAC__SSSE3_SUPPORTED
 
 #include <stdlib.h>    /* for abs() */
@@ -58,7 +59,7 @@ void FLAC__precompute_partition_info_sums_intrin_ssse3(const
FLAC__int32 residua
 		unsigned e1, e3;
 		__m128i mm_res, mm_sum;
 
-		if(bps <= 16) {
+		if(FLAC__bitmath_ilog2(default_partition_samples) + bps +
FLAC__MAX_EXTRA_RESIDUAL_BPS < 32) {
 			for(partition = residual_sample = 0; partition < partitions; partition++)
{
 				end += default_partition_samples;
 				mm_sum = _mm_setzero_si128();
-- 
1.9.3
lvqcl
2014-Jun-19  11:30 UTC
[flac-dev] [PATCH] stream_encoder : Improve selection of residual accumulator width
Miroslav Lichvar wrote:> In the precompute_partition_info_sums_ function, instead of selecting > 64-bit accumulator when the signal bps is larger than 16, revert to the > original approach based on partition size, but make room for few extra > bits to not overflow with unusual signals where the average residual > magnitude may be larger than bps. > > It slightly improves the performance with standard encoding levels and > 16-bit files as the 17-bit side channel can still be processed with the > 32-bit accumulator and correctly selects the 64-bit accumulator with > very large 16-bit partitions. >BTW, what can you say about the following place in stream_decoder.c in read_subframe_lpc_() function: /*@@@@@@ technically not pessimistic enough, should be more like if( (FLAC__uint64)order * ((((FLAC__uint64)1)<<bps)-1) * ((1<<subframe->qlp_coeff_precision)-1) < (((FLAC__uint64)-1) << 32) ) */ if(bps + subframe->qlp_coeff_precision + FLAC__bitmath_ilog2(order) <= 32) Is it really "not pessimistic enough"? Can it be changed similarly to your patch for stream_encoder.c?
Miroslav Lichvar
2014-Jun-19  13:30 UTC
[flac-dev] [PATCH] stream_encoder : Improve selection of residual accumulator width
On Thu, Jun 19, 2014 at 03:30:22PM +0400, lvqcl wrote:> BTW, what can you say about the following place in stream_decoder.c > in read_subframe_lpc_() function: > > /*@@@@@@ technically not pessimistic enough, should be more like > if( (FLAC__uint64)order * ((((FLAC__uint64)1)<<bps)-1) * ((1<<subframe->qlp_coeff_precision)-1) < (((FLAC__uint64)-1) << 32) ) > */ > if(bps + subframe->qlp_coeff_precision + FLAC__bitmath_ilog2(order) <= 32) > > Is it really "not pessimistic enough"? Can it be changed similarly to your patch > for stream_encoder.c?Good question. I'm not sure what exactly Josh meant by that comment. The git message says just "minor comments". I think the right size of the expression was meant to be (FLAC__uint64)1<<32, otherwise it doesn't make much sense to me. With that it can rewritten in log2 as bps + subframe->qlp_coeff_precision + FLAC__bitmath_ilog2(order - 1) < 32 This is indeed more pessimistic that the currently used check (< 32 vs <= 32), but I think both make a mistake that the qlp coefficients and residuals are unsigned integers and are actually more pessimistic than they would need to be if residuals were at most bps wide. With signed multiplication I think the correct check would actually be bps + subframe->qlp_coeff_precision + FLAC__bitmath_ilog2(order) - 1 <= 32 But, as we have seen with unusual data the residual signal can be wider than bps. The FLAC format specification doesn't seem to mention this. Should it be treated as a valid FLAC stream? Based on the analysis above, the currently used check allows residuals at most 1 bit wider than bps. Another problem could be that the local_lpc_restore_signal_16bit function may truncate the residual to 16 bits. -- Miroslav Lichvar
lvqcl
2014-Jun-20  09:21 UTC
[flac-dev] [PATCH] stream_encoder : Improve selection of residual accumulator width
Miroslav Lichvar ?????:> +/* > + * This is used to avoid overflow with unusual signals in 32-bit > + * accumulator in the *precompute_partition_info_sums_* functions. > + */ > +#define FLAC__MAX_EXTRA_RESIDUAL_BPS 4> + /* WATCHOUT: "+ bps + FLAC__MAX_EXTRA_RESIDUAL_BPS" is the maximum > + * assumed size of the average residual magnitude */ > + if(FLAC__bitmath_ilog2(default_partition_samples) + bps + FLAC__MAX_EXTRA_RESIDUAL_BPS < 32) {From FLAC__fixed_compute_residual: residual[i] = data[i] - 4*data[i-1] + 6*data[i-2] - 4*data[i-3] + data[i-4]; so max(residual[i]) == 16 * max(data[j]), or: max_bps(residual[]) == 4 + max_bps(data[]). Am I right that it's the reason why FLAC__MAX_EXTRA_RESIDUAL_BPS is equal to 4?
Miroslav Lichvar
2014-Jun-20  09:57 UTC
[flac-dev] [PATCH] stream_encoder : Improve selection of residual accumulator width
On Fri, Jun 20, 2014 at 01:21:03PM +0400, lvqcl wrote:> Miroslav Lichvar ?????: > > > +/* > > + * This is used to avoid overflow with unusual signals in 32-bit > > + * accumulator in the *precompute_partition_info_sums_* functions. > > + */ > > +#define FLAC__MAX_EXTRA_RESIDUAL_BPS 4 > > > + /* WATCHOUT: "+ bps + FLAC__MAX_EXTRA_RESIDUAL_BPS" is the maximum > > + * assumed size of the average residual magnitude */ > > + if(FLAC__bitmath_ilog2(default_partition_samples) + bps + FLAC__MAX_EXTRA_RESIDUAL_BPS < 32) { > > From FLAC__fixed_compute_residual: > residual[i] = data[i] - 4*data[i-1] + 6*data[i-2] - 4*data[i-3] + data[i-4]; > > so max(residual[i]) == 16 * max(data[j]), or: max_bps(residual[]) == 4 + max_bps(data[]). > > Am I right that it's the reason why FLAC__MAX_EXTRA_RESIDUAL_BPS is equal to 4?Not really, it's just a guess. With LPC the maximum possible residual could be much larger than with the fixed predictor if the coefficients were chosen randomly, but the autocorrelation routine should keep them more reasonable. The snippet6.wav file needed 2, so I made it slightly larger to have some extra room. As overflow in the accumulator won't result in a data loss, I think this is good enough until someone can figure out a better approach. -- Miroslav Lichvar
Erik de Castro Lopo
2014-Jun-28  00:22 UTC
[flac-dev] [PATCH] stream_encoder : Improve selection of residual accumulator width
Miroslav Lichvar wrote:> In the precompute_partition_info_sums_ function, instead of selecting > 64-bit accumulator when the signal bps is larger than 16, revert to the > original approach based on partition size, but make room for few extra > bits to not overflow with unusual signals where the average residual > magnitude may be larger than bps. > > It slightly improves the performance with standard encoding levels and > 16-bit files as the 17-bit side channel can still be processed with the > 32-bit accumulator and correctly selects the 64-bit accumulator with > very large 16-bit partitions. > > This is related to commits 6f7ec60c and 187e596e.Sorry I wasn't able to deal with this patch when it came in because I was busy with $day_job. There was a lot of discussion about this patch but I can't really figure out from the thread what the conclusion was. The patch still applies, and the test suite passes. If there is a problem with this patch that the test suite doesn't catch, we should write a test that does catch it. If there is no problem with the patch I'll push it. Erik -- ---------------------------------------------------------------------- Erik de Castro Lopo http://www.mega-nerd.com/
lvqcl
2014-Jun-29  15:20 UTC
[flac-dev] [PATCH] stream_encoder : Improve selection of residual accumulator width
Erik de Castro Lopo wrote:>> It slightly improves the performance with standard encoding levels and >> 16-bit files as the 17-bit side channel can still be processed with the >> 32-bit accumulator and correctly selects the 64-bit accumulator with >> very large 16-bit partitions. >> >> This is related to commits 6f7ec60c and 187e596e. > > Sorry I wasn't able to deal with this patch when it came in because I > was busy with $day_job. > > There was a lot of discussion about this patch but I can't really > figure out from the thread what the conclusion was. > > The patch still applies, and the test suite passes. If there is a > problem with this patch that the test suite doesn't catch, we should > write a test that does catch it. > > If there is no problem with the patch I'll push it.As I see it: FLAC 1.2.1 and 1.3.0 cannot encode snippet6.wav with -7 and -8 encoding modes. But they are able to do this with --disable-fixed-subframes option. This implies that snippet6.wav triggers a problem somewthere inside FLAC__fixed_compute_residual(data[], data_len, order, residual[]) function. And indeed, max. possible residual value is 16 times bigger than max. value in data[] array: residual[i] = data[i] - 4*data[i-1] + 6*data[i-2] - 4*data[i-3] + data[i-4] 16 == 2^4, so max. bps value for residual[] is equal to max. bps for data[] + 4. The value of FLAC__MAX_EXTRA_RESIDUAL_BPS == 4 is enough to fix this problem with FLAC__fixed_compute_residual().
Erik de Castro Lopo
2014-Jul-04  11:39 UTC
[flac-dev] [PATCH] stream_encoder : Improve selection of residual accumulator width
Miroslav Lichvar wrote:> In the precompute_partition_info_sums_ function, instead of selecting > 64-bit accumulator when the signal bps is larger than 16, revert to the > original approach based on partition size, but make room for few extra > bits to not overflow with unusual signals where the average residual > magnitude may be larger than bps. > > It slightly improves the performance with standard encoding levels and > 16-bit files as the 17-bit side channel can still be processed with the > 32-bit accumulator and correctly selects the 64-bit accumulator with > very large 16-bit partitions. > > This is related to commits 6f7ec60c and 187e596e.Applied. Thanks. Erik -- ---------------------------------------------------------------------- Erik de Castro Lopo http://www.mega-nerd.com/