I rebased my previous 3 patches onto the current master with minor changes. Patches 1 to 3 replace all of my previously submitted patches. Patches 4 and 5 are new. Thanks, Linfeng Zhang
Linfeng Zhang
2016-Jul-14 00:48 UTC
[opus] [PATCH 1/5] Revise celt_fir_c() to not pass in argument "mem"
The "mem" in celt_fir_c() either is contained in the head of input "x" in reverse order already, or can be easily attached to the head of "x" before calling the function. Removing argument "mem" can eliminate the redundant buffer copies inside. Update celt_fir_sse4_1() accordingly. --- celt/celt_decoder.c | 10 ++++----- celt/celt_lpc.c | 33 +++++++++++------------------- celt/celt_lpc.h | 5 ++--- celt/x86/celt_lpc_sse.c | 51 +++++++++------------------------------------- celt/x86/celt_lpc_sse.h | 10 ++++----- celt/x86/x86_celt_map.c | 1 - silk/LPC_analysis_filter.c | 6 +----- 7 files changed, 34 insertions(+), 82 deletions(-) diff --git a/celt/celt_decoder.c b/celt/celt_decoder.c index b978bb3..f8433eb 100644 --- a/celt/celt_decoder.c +++ b/celt/celt_decoder.c @@ -509,7 +509,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) opus_val16 fade = Q15ONE; int pitch_index; VARDECL(opus_val32, etmp); - VARDECL(opus_val16, exc); + VARDECL(opus_val16, _exc); if (loss_count == 0) { @@ -520,7 +520,8 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) } ALLOC(etmp, overlap, opus_val32); - ALLOC(exc, MAX_PERIOD, opus_val16); + ALLOC(_exc, MAX_PERIOD+LPC_ORDER, opus_val16); + opus_val16 *exc = _exc+LPC_ORDER; window = mode->window; c=0; do { opus_val16 decay; @@ -568,15 +569,14 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) /* Initialize the LPC history with the samples just before the start of the region for which we're computing the excitation. */ { - opus_val16 lpc_mem[LPC_ORDER]; for (i=0;i<LPC_ORDER;i++) { - lpc_mem[i] + exc[MAX_PERIOD-exc_length-1-i] ROUND16(buf[DECODE_BUFFER_SIZE-exc_length-1-i], SIG_SHIFT); } /* Compute the excitation for exc_length samples before the loss. */ celt_fir(exc+MAX_PERIOD-exc_length, lpc+c*LPC_ORDER, - exc+MAX_PERIOD-exc_length, exc_length, LPC_ORDER, lpc_mem, st->arch); + exc+MAX_PERIOD-exc_length, exc_length, LPC_ORDER, st->arch); } /* Check if the waveform is decaying, and if so how fast. 
diff --git a/celt/celt_lpc.c b/celt/celt_lpc.c index b410a21..a7938af 100644 --- a/celt/celt_lpc.c +++ b/celt/celt_lpc.c @@ -89,56 +89,47 @@ int p void celt_fir_c( - const opus_val16 *_x, + const opus_val16 *x, const opus_val16 *num, - opus_val16 *_y, + opus_val16 *y, int N, int ord, - opus_val16 *mem, int arch) { int i,j; VARDECL(opus_val16, rnum); - VARDECL(opus_val16, x); SAVE_STACK; ALLOC(rnum, ord, opus_val16); - ALLOC(x, N+ord, opus_val16); for(i=0;i<ord;i++) rnum[i] = num[ord-i-1]; - for(i=0;i<ord;i++) - x[i] = mem[ord-i-1]; - for (i=0;i<N;i++) - x[i+ord]=_x[i]; - for(i=0;i<ord;i++) - mem[i] = _x[N-i-1]; #ifdef SMALL_FOOTPRINT (void)arch; for (i=0;i<N;i++) { - opus_val32 sum = SHL32(EXTEND32(_x[i]), SIG_SHIFT); + opus_val32 sum = SHL32(EXTEND32(x[i]), SIG_SHIFT); for (j=0;j<ord;j++) { - sum = MAC16_16(sum,rnum[j],x[i+j]); + sum = MAC16_16(sum,rnum[j],x[i+j-ord]); } - _y[i] = SATURATE16(PSHR32(sum, SIG_SHIFT)); + y[i] = SATURATE16(PSHR32(sum, SIG_SHIFT)); } #else for (i=0;i<N-3;i+=4) { opus_val32 sum[4]={0,0,0,0}; - xcorr_kernel(rnum, x+i, sum, ord, arch); - _y[i ] = SATURATE16(ADD32(EXTEND32(_x[i ]), PSHR32(sum[0], SIG_SHIFT))); - _y[i+1] = SATURATE16(ADD32(EXTEND32(_x[i+1]), PSHR32(sum[1], SIG_SHIFT))); - _y[i+2] = SATURATE16(ADD32(EXTEND32(_x[i+2]), PSHR32(sum[2], SIG_SHIFT))); - _y[i+3] = SATURATE16(ADD32(EXTEND32(_x[i+3]), PSHR32(sum[3], SIG_SHIFT))); + xcorr_kernel(rnum, x+i-ord, sum, ord, arch); + y[i ] = SATURATE16(ADD32(EXTEND32(x[i ]), PSHR32(sum[0], SIG_SHIFT))); + y[i+1] = SATURATE16(ADD32(EXTEND32(x[i+1]), PSHR32(sum[1], SIG_SHIFT))); + y[i+2] = SATURATE16(ADD32(EXTEND32(x[i+2]), PSHR32(sum[2], SIG_SHIFT))); + y[i+3] = SATURATE16(ADD32(EXTEND32(x[i+3]), PSHR32(sum[3], SIG_SHIFT))); } for (;i<N;i++) { opus_val32 sum = 0; for (j=0;j<ord;j++) - sum = MAC16_16(sum,rnum[j],x[i+j]); - _y[i] = SATURATE16(ADD32(EXTEND32(_x[i]), PSHR32(sum, SIG_SHIFT))); + sum = MAC16_16(sum,rnum[j],x[i+j-ord]); + y[i] = SATURATE16(ADD32(EXTEND32(x[i]), PSHR32(sum, SIG_SHIFT))); } #endif RESTORE_STACK; diff --git a/celt/celt_lpc.h b/celt/celt_lpc.h index 323459e..a4c5fd6 100644 --- a/celt/celt_lpc.h +++ b/celt/celt_lpc.h @@ -45,12 +45,11 @@ void celt_fir_c( opus_val16 *y, int N, int ord, - opus_val16 *mem, int arch); #if !defined(OVERRIDE_CELT_FIR) -#define celt_fir(x, num, y, N, ord, mem, arch) \ - (celt_fir_c(x, num, y, N, ord, mem, arch)) +#define celt_fir(x, num, y, N, ord, arch) \ + (celt_fir_c(x, num, y, N, ord, arch)) #endif void celt_iir(const opus_val32 *x, diff --git a/celt/x86/celt_lpc_sse.c b/celt/x86/celt_lpc_sse.c index 67e5592..12a9b0e 100644 --- a/celt/x86/celt_lpc_sse.c +++ b/celt/x86/celt_lpc_sse.c @@ -40,63 +40,32 @@ #if defined(FIXED_POINT) -void celt_fir_sse4_1(const opus_val16 *_x, +void celt_fir_sse4_1(const opus_val16 *x, const opus_val16 *num, - opus_val16 *_y, + opus_val16 *y, int N, int ord, - opus_val16 *mem, int arch) { int i,j; VARDECL(opus_val16, rnum); - VARDECL(opus_val16, x); __m128i vecNoA; opus_int32 noA ; SAVE_STACK; ALLOC(rnum, ord, opus_val16); - ALLOC(x, N+ord, opus_val16); for(i=0;i<ord;i++) rnum[i] = num[ord-i-1]; - for(i=0;i<ord;i++) - x[i] = mem[ord-i-1]; - - for (i=0;i<N-7;i+=8) - { - x[i+ord ]=_x[i ]; - x[i+ord+1]=_x[i+1]; - x[i+ord+2]=_x[i+2]; - x[i+ord+3]=_x[i+3]; - x[i+ord+4]=_x[i+4]; - x[i+ord+5]=_x[i+5]; - x[i+ord+6]=_x[i+6]; - x[i+ord+7]=_x[i+7]; - } - - for (;i<N-3;i+=4) - { - x[i+ord ]=_x[i ]; - x[i+ord+1]=_x[i+1]; - x[i+ord+2]=_x[i+2]; - x[i+ord+3]=_x[i+3]; - } - - for (;i<N;i++) - x[i+ord]=_x[i]; - - for(i=0;i<ord;i++) - mem[i] = _x[N-i-1]; 
#ifdef SMALL_FOOTPRINT for (i=0;i<N;i++) { - opus_val32 sum = SHL32(EXTEND32(_x[i]), SIG_SHIFT); + opus_val32 sum = SHL32(EXTEND32(x[i]), SIG_SHIFT); for (j=0;j<ord;j++) { - sum = MAC16_16(sum,rnum[j],x[i+j]); + sum = MAC16_16(sum,rnum[j],x[i+j-ord]); } - _y[i] = SATURATE16(PSHR32(sum, SIG_SHIFT)); + y[i] = SATURATE16(PSHR32(sum, SIG_SHIFT)); } #else noA = EXTEND32(1) << SIG_SHIFT >> 1; @@ -107,22 +76,22 @@ void celt_fir_sse4_1(const opus_val16 *_x, opus_val32 sums[4] = {0}; __m128i vecSum, vecX; - xcorr_kernel(rnum, x+i, sums, ord, arch); + xcorr_kernel(rnum, x+i-ord, sums, ord, arch); vecSum = _mm_loadu_si128((__m128i *)sums); vecSum = _mm_add_epi32(vecSum, vecNoA); vecSum = _mm_srai_epi32(vecSum, SIG_SHIFT); - vecX = OP_CVTEPI16_EPI32_M64(_x + i); + vecX = OP_CVTEPI16_EPI32_M64(x + i); vecSum = _mm_add_epi32(vecSum, vecX); vecSum = _mm_packs_epi32(vecSum, vecSum); - _mm_storel_epi64((__m128i *)(_y + i), vecSum); + _mm_storel_epi64((__m128i *)(y + i), vecSum); } for (;i<N;i++) { opus_val32 sum = 0; for (j=0;j<ord;j++) - sum = MAC16_16(sum, rnum[j], x[i + j]); - _y[i] = SATURATE16(ADD32(EXTEND32(_x[i]), PSHR32(sum, SIG_SHIFT))); + sum = MAC16_16(sum, rnum[j], x[i+j-ord]); + y[i] = SATURATE16(ADD32(EXTEND32(x[i]), PSHR32(sum, SIG_SHIFT))); } #endif diff --git a/celt/x86/celt_lpc_sse.h b/celt/x86/celt_lpc_sse.h index c5ec796..7d1ecf7 100644 --- a/celt/x86/celt_lpc_sse.h +++ b/celt/x86/celt_lpc_sse.h @@ -41,12 +41,11 @@ void celt_fir_sse4_1( opus_val16 *y, int N, int ord, - opus_val16 *mem, int arch); #if defined(OPUS_X86_PRESUME_SSE4_1) -#define celt_fir(x, num, y, N, ord, mem, arch) \ - ((void)arch, celt_fir_sse4_1(x, num, y, N, ord, mem, arch)) +#define celt_fir(x, num, y, N, ord, arch) \ + ((void)arch, celt_fir_sse4_1(x, num, y, N, ord, arch)) #else @@ -56,11 +55,10 @@ extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])( opus_val16 *y, int N, int ord, - opus_val16 *mem, int arch); -# define celt_fir(x, num, y, N, ord, mem, arch) \ - ((*CELT_FIR_IMPL[(arch) & OPUS_ARCHMASK])(x, num, y, N, ord, mem, arch)) +# define celt_fir(x, num, y, N, ord, arch) \ + ((*CELT_FIR_IMPL[(arch) & OPUS_ARCHMASK])(x, num, y, N, ord, arch)) #endif #endif diff --git a/celt/x86/x86_celt_map.c b/celt/x86/x86_celt_map.c index 47ba41b..5a1f5f9 100644 --- a/celt/x86/x86_celt_map.c +++ b/celt/x86/x86_celt_map.c @@ -46,7 +46,6 @@ void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])( opus_val16 *y, int N, int ord, - opus_val16 *mem, int arch ) = { celt_fir_c, /* non-sse */ diff --git a/silk/LPC_analysis_filter.c b/silk/LPC_analysis_filter.c index 2090667..5aeee4c 100644 --- a/silk/LPC_analysis_filter.c +++ b/silk/LPC_analysis_filter.c @@ -50,7 +50,6 @@ void silk_LPC_analysis_filter( { opus_int j; #ifdef FIXED_POINT - opus_int16 mem[SILK_MAX_ORDER_LPC]; opus_int16 num[SILK_MAX_ORDER_LPC]; #else int ix; @@ -67,10 +66,7 @@ void silk_LPC_analysis_filter( for ( j = 0; j < d; j++ ) { num[ j ] = -B[ j ]; } - for (j=0;j<d;j++) { - mem[ j ] = in[ d - j - 1 ]; - } - celt_fir( in + d, num, out + d, len - d, d, mem, arch ); + celt_fir( in + d, num, out + d, len - d, d, arch ); for ( j = 0; j < d; j++ ) { out[ j ] = 0; } -- 2.8.0.rc3.226.g39d4020
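To make the revised contract concrete, here is a minimal scalar model of what celt_fir() computes after this patch. It is illustrative only: the function name is made up, the float type mirrors the float build, and fixed-point rounding and saturation are ignored. The ord history samples now sit directly in front of x, where the removed mem[] argument used to supply them, with mem[0] corresponding to x[-1].

   /* Simplified scalar model of the revised celt_fir() contract (sketch, not
    * the opus implementation): x[-ord..-1] must already hold the filter
    * history previously passed via mem[], newest sample first
    * (mem[0] == x[-1], mem[1] == x[-2], ...). */
   static void fir_history_in_front(const float *x,   /* x[-ord..N-1] readable */
                                    const float *num, /* ord coefficients */
                                    float *y, int N, int ord)
   {
      int i, j;
      for (i = 0; i < N; i++)
      {
         float sum = x[i];                 /* direct path */
         for (j = 0; j < ord; j++)
            sum += num[j] * x[i - j - 1];  /* taps reach back into the history */
         y[i] = sum;
      }
   }

The celt_decode_lost() hunk above follows the same pattern: _exc is over-allocated by LPC_ORDER and exc points LPC_ORDER samples into it, so writing the history at exc[MAX_PERIOD-exc_length-1-i] stays in bounds even when exc_length reaches MAX_PERIOD, and no separate lpc_mem[] copy is needed.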
Linfeng Zhang
2016-Jul-14 00:48 UTC
[opus] [PATCH 2/5] Optimize fixed-point celt_fir_c() for ARM NEON
Create the fixed-point intrinsics optimization celt_fir_neon() for ARM NEON. Create test tests/test_unit_optimization to unit test the optimization. --- .gitignore | 1 + Makefile.am | 39 ++++- celt/arm/arm_celt_map.c | 17 +++ celt/arm/celt_lpc_arm.h | 65 ++++++++ celt/arm/celt_lpc_neon_intr.c | 254 ++++++++++++++++++++++++++++++++ celt/celt_lpc.h | 5 + celt/tests/test_unit_dft.c | 1 + celt/tests/test_unit_mathops.c | 1 + celt/tests/test_unit_mdct.c | 1 + celt/tests/test_unit_optimization_lpc.c | 96 ++++++++++++ celt/tests/test_unit_rotation.c | 1 + celt_headers.mk | 1 + celt_sources.mk | 1 + tests/test_unit_optimization.c | 62 ++++++++ 14 files changed, 541 insertions(+), 4 deletions(-) create mode 100644 celt/arm/celt_lpc_arm.h create mode 100644 celt/arm/celt_lpc_neon_intr.c create mode 100644 celt/tests/test_unit_optimization_lpc.c create mode 100644 tests/test_unit_optimization.c diff --git a/.gitignore b/.gitignore index 33127c9..05d0582 100644 --- a/.gitignore +++ b/.gitignore @@ -49,6 +49,7 @@ tests/test_opus_api tests/test_opus_decode tests/test_opus_encode tests/test_opus_padding +tests/test_unit_optimization celt/arm/armopts.s celt/dump_modes/dump_modes celt/tests/test_unit_cwrs32 diff --git a/Makefile.am b/Makefile.am index 7a69114..2bfb923 100644 --- a/Makefile.am +++ b/Makefile.am @@ -84,9 +84,36 @@ pkginclude_HEADERS = include/opus.h include/opus_multistream.h include/opus_type noinst_HEADERS = $(OPUS_HEAD) $(SILK_HEAD) $(CELT_HEAD) if EXTRA_PROGRAMS -noinst_PROGRAMS = opus_demo repacketizer_demo opus_compare tests/test_opus_api tests/test_opus_encode tests/test_opus_decode tests/test_opus_padding celt/tests/test_unit_cwrs32 celt/tests/test_unit_dft celt/tests/test_unit_entropy celt/tests/test_unit_laplace celt/tests/test_unit_mathops celt/tests/test_unit_mdct celt/tests/test_unit_rotation celt/tests/test_unit_types - -TESTS = celt/tests/test_unit_types celt/tests/test_unit_mathops celt/tests/test_unit_entropy celt/tests/test_unit_laplace celt/tests/test_unit_dft celt/tests/test_unit_mdct celt/tests/test_unit_rotation celt/tests/test_unit_cwrs32 tests/test_opus_api tests/test_opus_decode tests/test_opus_encode tests/test_opus_padding +noinst_PROGRAMS = opus_demo \ + repacketizer_demo \ + opus_compare \ + celt/tests/test_unit_cwrs32 \ + celt/tests/test_unit_dft \ + celt/tests/test_unit_entropy \ + celt/tests/test_unit_laplace \ + celt/tests/test_unit_mathops \ + celt/tests/test_unit_mdct \ + celt/tests/test_unit_rotation \ + celt/tests/test_unit_types \ + tests/test_opus_api \ + tests/test_opus_encode \ + tests/test_opus_decode \ + tests/test_opus_padding \ + tests/test_unit_optimization + +TESTS = celt/tests/test_unit_types \ + celt/tests/test_unit_mathops \ + celt/tests/test_unit_entropy \ + celt/tests/test_unit_laplace \ + celt/tests/test_unit_dft \ + celt/tests/test_unit_mdct \ + celt/tests/test_unit_rotation \ + celt/tests/test_unit_cwrs32 \ + tests/test_opus_api \ + tests/test_opus_decode \ + tests/test_opus_encode \ + tests/test_opus_padding \ + tests/test_unit_optimization opus_demo_SOURCES = src/opus_demo.c @@ -111,6 +138,9 @@ tests_test_opus_decode_LDADD = libopus.la $(NE10_LIBS) $(LIBM) tests_test_opus_padding_SOURCES = tests/test_opus_padding.c tests/test_opus_common.h tests_test_opus_padding_LDADD = libopus.la $(NE10_LIBS) $(LIBM) +tests_test_unit_optimization_SOURCES = tests/test_unit_optimization.c +tests_test_unit_optimization_LDADD = libopus.la $(NE10_LIBS) $(LIBM) + celt_tests_test_unit_cwrs32_SOURCES = celt/tests/test_unit_cwrs32.c 
celt_tests_test_unit_cwrs32_LDADD = $(LIBM) @@ -276,7 +306,8 @@ $(CELT_SOURCES_ARM_ASM:%.s=%-gnu.S): $(top_srcdir)/celt/arm/arm2gnu.pl OPT_UNIT_TEST_OBJ = $(celt_tests_test_unit_mathops_SOURCES:.c=.o) \ $(celt_tests_test_unit_rotation_SOURCES:.c=.o) \ $(celt_tests_test_unit_mdct_SOURCES:.c=.o) \ - $(celt_tests_test_unit_dft_SOURCES:.c=.o) + $(celt_tests_test_unit_dft_SOURCES:.c=.o) \ + $(tests_test_unit_optimization_SOURCES:.c=.o) if HAVE_SSE SSE_OBJ = $(CELT_SOURCES_SSE:.c=.lo) diff --git a/celt/arm/arm_celt_map.c b/celt/arm/arm_celt_map.c index 4d4d069..74869ab 100644 --- a/celt/arm/arm_celt_map.c +++ b/celt/arm/arm_celt_map.c @@ -29,6 +29,7 @@ #include "config.h" #endif +#include "celt_lpc.h" #include "pitch.h" #include "kiss_fft.h" #include "mdct.h" @@ -39,6 +40,22 @@ # if ((defined(OPUS_ARM_MAY_HAVE_NEON) && !defined(OPUS_ARM_PRESUME_NEON)) || \ (defined(OPUS_ARM_MAY_HAVE_MEDIA) && !defined(OPUS_ARM_PRESUME_MEDIA)) || \ (defined(OPUS_ARM_MAY_HAVE_EDSP) && !defined(OPUS_ARM_PRESUME_EDSP))) +void celt_fir_neon( + const opus_val16 *_x, + const opus_val16 *num, + opus_val16 *_y, + int N, + int ord, + int arch); + +void (*const CELT_FIR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *, + const opus_val16 *, opus_val16 *, int, int, int) = { + celt_fir_c, /* ARMv4 */ + celt_fir_c, /* EDSP */ + celt_fir_c, /* Media */ + MAY_HAVE_NEON(celt_fir) /* NEON */ +}; + opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *, const opus_val16 *, opus_val32 *, int , int) = { celt_pitch_xcorr_c, /* ARMv4 */ diff --git a/celt/arm/celt_lpc_arm.h b/celt/arm/celt_lpc_arm.h new file mode 100644 index 0000000..101df3d --- /dev/null +++ b/celt/arm/celt_lpc_arm.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 Google Inc. */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#if !defined(CELT_LPC_ARM_H) +# define CELT_LPC_ARM_H + +# include "armcpu.h" + +# if defined(FIXED_POINT) + +# if defined(OPUS_ARM_MAY_HAVE_NEON) +void celt_fir_neon( + const opus_val16 *_x, + const opus_val16 *num, + opus_val16 *_y, + int N, + int ord, + int arch); +# endif + +# if !defined(OPUS_HAVE_RTCD) +# define OVERRIDE_CELT_FIR (1) +# define celt_fir(x, num, y, N, ord, arch) \ + ((void)(arch),PRESUME_NEON(celt_fir)(x, num, y, N, ord, arch)) +# endif + +#if !defined(OVERRIDE_CELT_FIR) +/*Is run-time CPU detection enabled on this platform?*/ +# if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_ASM) \ + || (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) \ + && !defined(OPUS_ARM_PRESUME_NEON_INTR))) +extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *, + const opus_val16 *, opus_val16 *, int, int, int); + +# define OVERRIDE_CELT_FIR +# define celt_fir(x, num, y, N, ord, arch) \ + ((*CELT_FIR_IMPL[(arch)&OPUS_ARCHMASK])(x, num, y, N, ord, arch)) +# endif +#endif +#endif /* end FIXED_POINT */ + +#endif /* end CELT_LPC_ARM_H */ diff --git a/celt/arm/celt_lpc_neon_intr.c b/celt/arm/celt_lpc_neon_intr.c new file mode 100644 index 0000000..4715d0b --- /dev/null +++ b/celt/arm/celt_lpc_neon_intr.c @@ -0,0 +1,254 @@ +/* Copyright (c) 2016 Google Inc. */ +/** + @file celt_lpc_neon_intr.c + @brief ARM Neon Intrinsic optimizations for celt lpc functions + */ + +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <arm_neon.h> +#include "celt_lpc.h" +#include "stack_alloc.h" + +#if defined(FIXED_POINT) + +void celt_fir_neon( + const opus_val16 *_x, + const opus_val16 *num, + opus_val16 *y, + int N, + int ord, + int arch) +{ + int i,j; + const int leftover = N & 7; + const opus_val16 *x = _x-ord; + VARDECL(opus_val16, rnum); + SAVE_STACK; + /* Extend rnum by 3 zeros to handle the case that (ord % 4) is non-zero. 
*/ + ALLOC(rnum, ord+3, opus_val16); + for (i=0;i<ord-3;i+=4) + vst1_s16(rnum+i, vrev64_s16(vld1_s16(num+ord-i-4))); + for (;i<ord;i++) + rnum[i] = num[ord-i-1]; + rnum[ord] = rnum[ord+1] = rnum[ord+2] = 0; + (void)arch; + +#ifdef SMALL_FOOTPRINT + for (i=0;i<N-7;i+=8) + { + int16x8_t x_s16x8 = vld1q_s16(_x+i); + int32x4_t sum0_s32x4 = vshll_n_s16(vget_low_s16 (x_s16x8), SIG_SHIFT); + int32x4_t sum1_s32x4 = vshll_n_s16(vget_high_s16(x_s16x8), SIG_SHIFT); + for (j=0;j<ord;j+=4) + { + const int16x4_t rnum_s16x4 = vld1_s16(rnum+j); + x_s16x8 = vld1q_s16(x+i+j+0); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 0); + sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 0); + x_s16x8 = vld1q_s16(x+i+j+1); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 1); + sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 1); + x_s16x8 = vld1q_s16(x+i+j+2); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 2); + sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 2); + x_s16x8 = vld1q_s16(x+i+j+3); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 3); + sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 3); + } + vst1q_s16(y+i, vcombine_s16(vqrshrn_n_s32(sum0_s32x4, SIG_SHIFT), vqrshrn_n_s32(sum1_s32x4, SIG_SHIFT))); + } + if (leftover) + { + if (leftover > 4) + { + int16x8_t x_s16x8 = vld1q_s16(_x+i); + int32x4_t sum0_s32x4 = vshll_n_s16(vget_low_s16 (x_s16x8), SIG_SHIFT); + int32x4_t sum1_s32x4 = vshll_n_s16(vget_high_s16(x_s16x8), SIG_SHIFT); + for (j=0;j<ord;j+=4) + { + const int16x4_t rnum_s16x4 = vld1_s16(rnum+j); + x_s16x8 = vld1q_s16(x+i+j+0); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 0); + sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 0); + x_s16x8 = vld1q_s16(x+i+j+1); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 1); + sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 1); + x_s16x8 = vld1q_s16(x+i+j+2); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 2); + sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 2); + x_s16x8 = vld1q_s16(x+i+j+3); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 3); + sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 3); + } + const int16x8_t y_s16x8 = vcombine_s16(vqrshrn_n_s32(sum0_s32x4, SIG_SHIFT), vqrshrn_n_s32(sum1_s32x4, SIG_SHIFT)); + vst1_s16(y+i, vget_low_s16(y_s16x8)); + vst1q_lane_s16(y+i+4, y_s16x8, 4); + if (leftover >= 6) + { + vst1q_lane_s16(y+i+5, y_s16x8, 5); + if (leftover == 7) + { + vst1q_lane_s16(y+i+6, y_s16x8, 6); + } + } + } + else { + int32x4_t sum0_s32x4 = vshll_n_s16(vld1_s16(_x+i), SIG_SHIFT); + for (j=0;j<ord;j+=4) + { + const int16x4_t rnum_s16x4 = vld1_s16(rnum+j); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+0), rnum_s16x4, 0); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+1), rnum_s16x4, 1); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+2), rnum_s16x4, 2); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+3), rnum_s16x4, 3); + } + const int16x4_t y_s16x4 = vqrshrn_n_s32(sum0_s32x4, SIG_SHIFT); + if (leftover == 4) + { + vst1_s16(y+i, y_s16x4); + } + else + { + vst1_lane_s16(y+i, y_s16x4, 0); + if (leftover >= 2) + { + vst1_lane_s16(y+i+1, y_s16x4, 1); + if 
(leftover == 3) + { + vst1_lane_s16(y+i+2, y_s16x4, 2); + } + } + } + } + } +#else + for (i=0;i<N-7;i+=8) + { + int32x4_t sum0_s32x4, sum1_s32x4; + sum0_s32x4 = sum1_s32x4 = vdupq_n_s32(0); + for (j=0;j<ord;j+=4) + { + const int16x4_t rnum_s16x4 = vld1_s16(rnum+j); + int16x8_t x_s16x8 = vld1q_s16(x+i+j+0); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 0); + sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 0); + x_s16x8 = vld1q_s16(x+i+j+1); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 1); + sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 1); + x_s16x8 = vld1q_s16(x+i+j+2); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 2); + sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 2); + x_s16x8 = vld1q_s16(x+i+j+3); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 3); + sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 3); + } + sum0_s32x4 = vrshrq_n_s32(sum0_s32x4, SIG_SHIFT); + sum1_s32x4 = vrshrq_n_s32(sum1_s32x4, SIG_SHIFT); + const int16x8_t x_s16x8 = vld1q_s16(_x+i); + sum0_s32x4 = vaddw_s16(sum0_s32x4, vget_low_s16 (x_s16x8)); + sum1_s32x4 = vaddw_s16(sum1_s32x4, vget_high_s16(x_s16x8)); + vst1q_s16(y+i, vcombine_s16(vqmovn_s32(sum0_s32x4), vqmovn_s32(sum1_s32x4))); + } + if (leftover) + { + if (leftover > 4) + { + int32x4_t sum0_s32x4, sum1_s32x4; + sum0_s32x4 = sum1_s32x4 = vdupq_n_s32(0); + for (j=0;j<ord;j+=4) + { + const int16x4_t rnum_s16x4 = vld1_s16(rnum+j); + int16x8_t x_s16x8 = vld1q_s16(x+i+j+0); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 0); + sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 0); + x_s16x8 = vld1q_s16(x+i+j+1); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 1); + sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 1); + x_s16x8 = vld1q_s16(x+i+j+2); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 2); + sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 2); + x_s16x8 = vld1q_s16(x+i+j+3); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 3); + sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 3); + } + sum0_s32x4 = vrshrq_n_s32(sum0_s32x4, SIG_SHIFT); + sum1_s32x4 = vrshrq_n_s32(sum1_s32x4, SIG_SHIFT); + const int16x8_t x_s16x8 = vld1q_s16(_x+i); + sum0_s32x4 = vaddw_s16(sum0_s32x4, vget_low_s16 (x_s16x8)); + sum1_s32x4 = vaddw_s16(sum1_s32x4, vget_high_s16(x_s16x8)); + const int16x8_t y_s16x8 = vcombine_s16(vqmovn_s32(sum0_s32x4), vqmovn_s32(sum1_s32x4)); + vst1_s16(y+i, vget_low_s16(y_s16x8)); + vst1q_lane_s16(y+i+4, y_s16x8, 4); + if (leftover >= 6) + { + vst1q_lane_s16(y+i+5, y_s16x8, 5); + if (leftover == 7) + { + vst1q_lane_s16(y+i+6, y_s16x8, 6); + } + } + } + else { + int32x4_t sum0_s32x4 = vdupq_n_s32(0); + for (j=0;j<ord;j+=4) + { + const int16x4_t rnum_s16x4 = vld1_s16(rnum+j); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+0), rnum_s16x4, 0); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+1), rnum_s16x4, 1); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+2), rnum_s16x4, 2); + sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+3), rnum_s16x4, 3); + } + sum0_s32x4 = vrshrq_n_s32(sum0_s32x4, SIG_SHIFT); + sum0_s32x4 = vaddw_s16(sum0_s32x4, vld1_s16(_x+i)); + const 
int16x4_t y_s16x4 = vqmovn_s32(sum0_s32x4); + if (leftover == 4) + { + vst1_s16(y+i, y_s16x4); + } + else + { + vst1_lane_s16(y+i, y_s16x4, 0); + if (leftover >= 2) + { + vst1_lane_s16(y+i+1, y_s16x4, 1); + if (leftover == 3) + { + vst1_lane_s16(y+i+2, y_s16x4, 2); + } + } + } + } + } +#endif + RESTORE_STACK; +} + +#endif diff --git a/celt/celt_lpc.h b/celt/celt_lpc.h index a4c5fd6..76a73c0 100644 --- a/celt/celt_lpc.h +++ b/celt/celt_lpc.h @@ -35,6 +35,11 @@ #include "x86/celt_lpc_sse.h" #endif +#if ((defined(OPUS_ARM_ASM) && defined(FIXED_POINT)) \ + || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)) +#include "arm/celt_lpc_arm.h" +#endif + #define LPC_ORDER 24 void _celt_lpc(opus_val16 *_lpc, const opus_val32 *ac, int p); diff --git a/celt/tests/test_unit_dft.c b/celt/tests/test_unit_dft.c index 6166eb0..582618e 100644 --- a/celt/tests/test_unit_dft.c +++ b/celt/tests/test_unit_dft.c @@ -52,6 +52,7 @@ # include "celt_lpc.c" # include "pitch.c" # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) +# include "arm/celt_lpc_neon_intr.c" # include "arm/celt_neon_intr.c" # if defined(HAVE_ARM_NE10) # include "mdct.c" diff --git a/celt/tests/test_unit_mathops.c b/celt/tests/test_unit_mathops.c index fd3319d..da92f16 100644 --- a/celt/tests/test_unit_mathops.c +++ b/celt/tests/test_unit_mathops.c @@ -66,6 +66,7 @@ #elif defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR) # include "arm/armcpu.c" # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) +# include "arm/celt_lpc_neon_intr.c" # include "arm/celt_neon_intr.c" # if defined(HAVE_ARM_NE10) # include "kiss_fft.c" diff --git a/celt/tests/test_unit_mdct.c b/celt/tests/test_unit_mdct.c index 8dbb9ca..0658c7a 100644 --- a/celt/tests/test_unit_mdct.c +++ b/celt/tests/test_unit_mdct.c @@ -53,6 +53,7 @@ # include "pitch.c" # include "celt_lpc.c" # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) +# include "arm/celt_lpc_neon_intr.c" # include "arm/celt_neon_intr.c" # if defined(HAVE_ARM_NE10) # include "arm/celt_ne10_fft.c" diff --git a/celt/tests/test_unit_optimization_lpc.c b/celt/tests/test_unit_optimization_lpc.c new file mode 100644 index 0000000..7247046 --- /dev/null +++ b/celt/tests/test_unit_optimization_lpc.c @@ -0,0 +1,96 @@ +/* Copyright (c) 2016 Google Inc. */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#define SKIP_CONFIG_H + +#ifndef CUSTOM_MODES +# define CUSTOM_MODES +#endif + +#include <stdio.h> +#include <string.h> + +#ifndef CELT_C +# define CELT_C +#endif +#include "celt_lpc.h" +#include "modes.h" + +#ifdef FIXED_POINT + +#define MAX_ORDER 32 + +static int test_fir(int arch) +{ + opus_val16 x[MAX_PERIOD+MAX_ORDER]; + opus_val16 num[MAX_ORDER]; + opus_val16 yorg[MAX_PERIOD], yopt[MAX_PERIOD]; + int N, ord; + unsigned int i; + + printf("%50s", "celt_fir() ..."); + for(ord=0;ord<=MAX_ORDER;ord++) + { + for(N=ord;N<=MAX_PERIOD;N++) /* N is larger than or equal to ord. */ + { + for (i=0;i<MAX_PERIOD+MAX_ORDER;++i) + { + x[i] = (rand() % 32767) - 16384; + } + for (i=0;i<MAX_PERIOD;++i) + { + yorg[i] = (rand() % 32767) - 16384; + } + for (i=0;i<MAX_ORDER;++i) + { + num[i] = (rand() % 32767) - 16384; + } + memcpy(yopt, yorg, sizeof(yorg)); + + celt_fir_c(x+MAX_ORDER, num, yorg, N, ord, arch); + celt_fir (x+MAX_ORDER, num, yopt, N, ord, arch); + if (memcmp(yorg, yopt, sizeof(yorg))) + { + printf("ord=%2d N=%3d failed!\nError in lpc unit test!!!\n", ord, N); + for (i=0;i<sizeof(yorg) / sizeof(*yorg);i++) + { + if (yorg[i] != yopt[i]) + { + printf("yorg[%3d]=%d, yopt[%3d]=%d\n", i, yorg[i], i, yopt[i]); + } + } + return -1; + } + } + } + printf(" passed!\n"); + return 0; +} +#endif /* FIXED_POINT */ diff --git a/celt/tests/test_unit_rotation.c b/celt/tests/test_unit_rotation.c index 1080c20..3a85a29 100644 --- a/celt/tests/test_unit_rotation.c +++ b/celt/tests/test_unit_rotation.c @@ -64,6 +64,7 @@ #elif defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR) # include "arm/armcpu.c" # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) +# include "arm/celt_lpc_neon_intr.c" # include "arm/celt_neon_intr.c" # if defined(HAVE_ARM_NE10) # include "kiss_fft.c" diff --git a/celt_headers.mk b/celt_headers.mk index c9df94b..36ae290 100644 --- a/celt_headers.mk +++ b/celt_headers.mk @@ -34,6 +34,7 @@ celt/static_modes_fixed.h \ celt/static_modes_float_arm_ne10.h \ celt/static_modes_fixed_arm_ne10.h \ celt/arm/armcpu.h \ +celt/arm/celt_lpc_arm.h \ celt/arm/fixed_armv4.h \ celt/arm/fixed_armv5e.h \ celt/arm/fixed_arm64.h \ diff --git a/celt_sources.mk b/celt_sources.mk index 2ffe99a..37c0129 100644 --- a/celt_sources.mk +++ b/celt_sources.mk @@ -37,6 +37,7 @@ CELT_AM_SOURCES_ARM_ASM = \ celt/arm/armopts.s.in CELT_SOURCES_ARM_NEON_INTR = \ +celt/arm/celt_lpc_neon_intr.c \ celt/arm/celt_neon_intr.c CELT_SOURCES_ARM_NE10= \ diff --git a/tests/test_unit_optimization.c b/tests/test_unit_optimization.c new file mode 100644 index 0000000..7eeab38 --- /dev/null +++ b/tests/test_unit_optimization.c @@ -0,0 +1,62 @@ +/* Copyright (c) 2016 Google Inc. */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <stdio.h> +#include "stack_alloc.h" + +#define SKIP_CONFIG_H + +#ifndef CUSTOM_MODES +#define CUSTOM_MODES +#endif + +#ifdef FIXED_POINT + +# include "celt/tests/test_unit_optimization_lpc.c" + +#endif + +int main(void) +{ + int result = 0; /* 0: passed; other: failed */ + ALLOC_STACK; +#ifdef FIXED_POINT + int arch = opus_select_arch(); +#endif /* FIXED_POINT */ + int count = 10; + + while (!result && count--) { + printf("\n--------------------------- Testing optimization ---------------------------\n"); +#ifdef FIXED_POINT + result |= test_fir(arch); +#endif /* FIXED_POINT */ + } + return result; +} -- 2.8.0.rc3.226.g39d4020
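As a rough sketch of the accumulation pattern celt_fir_neon() uses above (illustrative only: the function name, the fixed 8-output block, the assumption that ord is a multiple of 4, and the SIG_SHIFT value are mine, and rnum[] is assumed to already hold the coefficients reversed as in the patch), each pass loads one int16x4_t of coefficients and four staggered input windows, folding four taps into two 32-bit accumulators:

   /* Compute 8 FIR outputs with the history-in-front convention, mirroring
    * the non-SMALL_FOOTPRINT kernel of celt_fir_neon() (sketch, not a
    * drop-in replacement). */
   #include <arm_neon.h>
   #include <stdint.h>

   #define SIG_SHIFT 12   /* assumed Q-format of the coefficients */

   static void fir8_neon_sketch(const int16_t *x,    /* x[-ord..7] readable */
                                const int16_t *rnum, /* reversed coeffs, ord%4==0 */
                                int16_t *y, int ord)
   {
      int j;
      int32x4_t sum0 = vdupq_n_s32(0), sum1 = vdupq_n_s32(0);
      const int16_t *h = x - ord;          /* history starts ord samples back */
      for (j = 0; j < ord; j += 4)
      {
         const int16x4_t c = vld1_s16(rnum + j);
         int16x8_t v = vld1q_s16(h + j);
         sum0 = vmlal_lane_s16(sum0, vget_low_s16 (v), c, 0);
         sum1 = vmlal_lane_s16(sum1, vget_high_s16(v), c, 0);
         v = vld1q_s16(h + j + 1);
         sum0 = vmlal_lane_s16(sum0, vget_low_s16 (v), c, 1);
         sum1 = vmlal_lane_s16(sum1, vget_high_s16(v), c, 1);
         v = vld1q_s16(h + j + 2);
         sum0 = vmlal_lane_s16(sum0, vget_low_s16 (v), c, 2);
         sum1 = vmlal_lane_s16(sum1, vget_high_s16(v), c, 2);
         v = vld1q_s16(h + j + 3);
         sum0 = vmlal_lane_s16(sum0, vget_low_s16 (v), c, 3);
         sum1 = vmlal_lane_s16(sum1, vget_high_s16(v), c, 3);
      }
      /* Rounding shift back to Q0, add the unfiltered input, then saturate to
         16 bits, as the non-SMALL_FOOTPRINT path of the patch does. */
      sum0 = vrshrq_n_s32(sum0, SIG_SHIFT);
      sum1 = vrshrq_n_s32(sum1, SIG_SHIFT);
      sum0 = vaddw_s16(sum0, vld1_s16(x));
      sum1 = vaddw_s16(sum1, vld1_s16(x + 4));
      vst1q_s16(y, vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)));
   }

The new tests/test_unit_optimization binary (added to the automake TESTS list above, so it runs under "make check") exercises this path by comparing celt_fir() against celt_fir_c() for every (ord, N) combination.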
Linfeng Zhang
2016-Jul-14 00:49 UTC
[opus] [PATCH 3/5] Optimize silk_warped_autocorrelation_FIX() for ARM NEON
Create silk_warped_autocorrelation_FIX_c_opt() which unrolls and parallelizes input by 8. It has very long prolog and epilog, but this is the cost to get good speed on this heavily hit function. This function may be the code base for optimization on different CPUs. Create ARM NEON intrinsics optimization silk_warped_autocorrelation_FIX_neon(). Create unit test silk/tests/test_unit_optimization_warped_autocorrelation.c. --- Makefile.am | 5 +- silk/arm/arm_silk_map.c | 20 + silk/fixed/arm/warped_autocorrelation_FIX_arm.h | 65 +++ .../arm/warped_autocorrelation_FIX_neon_intr.c | 495 +++++++++++++++++++++ silk/fixed/main_FIX.h | 15 +- .../fixed/mips/warped_autocorrelation_FIX_mipsr1.h | 6 - silk/fixed/warped_autocorrelation_FIX.c | 7 +- ...test_unit_optimization_warped_autocorrelation.c | 441 ++++++++++++++++++ silk_headers.mk | 1 + silk_sources.mk | 3 + tests/test_unit_optimization.c | 2 + 11 files changed, 1046 insertions(+), 14 deletions(-) create mode 100644 silk/fixed/arm/warped_autocorrelation_FIX_arm.h create mode 100644 silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c create mode 100644 silk/tests/test_unit_optimization_warped_autocorrelation.c diff --git a/Makefile.am b/Makefile.am index 2bfb923..c66fb2d 100644 --- a/Makefile.am +++ b/Makefile.am @@ -50,6 +50,7 @@ SILK_SOURCES += $(SILK_SOURCES_ARM) if HAVE_ARM_NEON_INTR CELT_SOURCES += $(CELT_SOURCES_ARM_NEON_INTR) SILK_SOURCES += $(SILK_SOURCES_ARM_NEON_INTR) +SILK_SOURCES += $(SILK_SOURCES_FIXED_ARM_NEON_INTR) endif if HAVE_ARM_NE10 @@ -327,7 +328,9 @@ $(SSE4_1_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS += $(OPUS_X86_SSE4_1_CFLAGS) endif if HAVE_ARM_NEON_INTR -ARM_NEON_INTR_OBJ = $(CELT_SOURCES_ARM_NEON_INTR:.c=.lo) $(SILK_SOURCES_ARM_NEON_INTR:.c=.lo) +ARM_NEON_INTR_OBJ = $(CELT_SOURCES_ARM_NEON_INTR:.c=.lo) \ + $(SILK_SOURCES_ARM_NEON_INTR:.c=.lo) \ + $(SILK_SOURCES_FIXED_ARM_NEON_INTR:.c=.lo) $(ARM_NEON_INTR_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS += \ $(OPUS_ARM_NEON_INTR_CFLAGS) $(NE10_CFLAGS) endif diff --git a/silk/arm/arm_silk_map.c b/silk/arm/arm_silk_map.c index 9bd86a7..2e330c4 100644 --- a/silk/arm/arm_silk_map.c +++ b/silk/arm/arm_silk_map.c @@ -28,6 +28,7 @@ POSSIBILITY OF SUCH DAMAGE. # include "config.h" #endif +#include "main_FIX.h" #include "NSQ.h" #if defined(OPUS_HAVE_RTCD) @@ -52,4 +53,23 @@ opus_int32 # endif +#if defined(FIXED_POINT) && \ + defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR) + +void (*const SILK_WARPED_AUTOCORRELATION_FIX_IMPL[OPUS_ARCHMASK + 1])( + opus_int32 *corr, /* O Result [order + 1] */ + opus_int *scale, /* O Scaling of the correlation vector */ + const opus_int16 *input, /* I Input data to correlate */ + const opus_int warping_Q16, /* I Warping coefficient */ + const opus_int length, /* I Length of input */ + const opus_int order /* I Correlation order (even) */ +) = { + silk_warped_autocorrelation_FIX_c, /* ARMv4 */ + silk_warped_autocorrelation_FIX_c, /* EDSP */ + silk_warped_autocorrelation_FIX_c, /* Media */ + MAY_HAVE_NEON(silk_warped_autocorrelation_FIX), /* Neon */ +}; + +#endif + #endif /* OPUS_HAVE_RTCD */ diff --git a/silk/fixed/arm/warped_autocorrelation_FIX_arm.h b/silk/fixed/arm/warped_autocorrelation_FIX_arm.h new file mode 100644 index 0000000..ee892bf --- /dev/null +++ b/silk/fixed/arm/warped_autocorrelation_FIX_arm.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 Google Inc. 
*/ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#if !defined(WARPED_AUTOCORRELATION_FIX_ARM_H) +# define WARPED_AUTOCORRELATION_FIX_ARM_H + +# include "celt/arm/armcpu.h" + +# if defined(FIXED_POINT) + +# if defined(OPUS_ARM_MAY_HAVE_NEON) +void silk_warped_autocorrelation_FIX_neon( + opus_int32 *corr, /* O Result [order + 1] */ + opus_int *scale, /* O Scaling of the correlation vector */ + const opus_int16 *input, /* I Input data to correlate */ + const opus_int warping_Q16, /* I Warping coefficient */ + const opus_int length, /* I Length of input */ + const opus_int order /* I Correlation order (even) */ +); +# endif + +# if !defined(OPUS_HAVE_RTCD) +# define OVERRIDE_silk_warped_autocorrelation_FIX (1) +# define silk_warped_autocorrelation_FIX(corr, scale, input, warping_Q16, length, order) \ + ((void)(arch),PRESUME_NEON(silk_warped_autocorrelation_FIX)(corr, scale, input, warping_Q16, length, order)) +# endif + +#if !defined(OVERRIDE_silk_warped_autocorrelation_FIX) +/*Is run-time CPU detection enabled on this platform?*/ +# if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_ASM) \ + || (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) \ + && !defined(OPUS_ARM_PRESUME_NEON_INTR))) +extern void (*const SILK_WARPED_AUTOCORRELATION_FIX_IMPL[OPUS_ARCHMASK+1])(opus_int32*, opus_int*, const opus_int16*, const opus_int, const opus_int, const opus_int); + +# define OVERRIDE_silk_warped_autocorrelation_FIX +# define silk_warped_autocorrelation_FIX(corr, scale, input, warping_Q16, length, order) \ + ((*SILK_WARPED_AUTOCORRELATION_FIX_IMPL[(arch)&OPUS_ARCHMASK])(corr, scale, input, warping_Q16, length, order)) +# endif +#endif +#endif /* end FIXED_POINT */ + +#endif /* end WARPED_AUTOCORRELATION_FIX_ARM_H */ diff --git a/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c b/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c new file mode 100644 index 0000000..80dd949 --- /dev/null +++ b/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c @@ -0,0 +1,495 @@ +/* Copyright (c) 2016 Google Inc. 
*/ +/** + @file warped_autocorrelation_FIX_neon_intr.c + @brief ARM Neon Intrinsic optimizations for silk silk_warped_autocorrelation_FIX functions + */ + +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#define SKIP_CONFIG_H + +#ifndef CUSTOM_MODES +#define CUSTOM_MODES +#endif + +#include <stdio.h> + +#include <arm_neon.h> +#include "stack_alloc.h" +#include "main_FIX.h" + +#ifdef FIXED_POINT + +#define NUM_PARALLEL_INPUTS 8 + +void silk_warped_autocorrelation_FIX_neon( + opus_int32 *corr, /* O Result [order + 1] */ + opus_int *scale, /* O Scaling of the correlation vector */ + const opus_int16 *input, /* I Input data to correlate */ + const opus_int warping_Q16, /* I Warping coefficient */ + const opus_int length, /* I Length of input */ + const opus_int order /* I Correlation order (even) */ +) +{ + opus_int n = 0, i, lsh; + opus_int32 tmp1_QS[NUM_PARALLEL_INPUTS], tmp2_QS[NUM_PARALLEL_INPUTS]; + opus_int32 input_QS[NUM_PARALLEL_INPUTS]; + opus_int32 state_QS_tmp[ MAX_SHAPE_LPC_ORDER + 3 ] = { 0 }; // Create two extra entries. + opus_int32 *state_QS = state_QS_tmp + 1; // Accessed one extra head entry in the last prolog and the last inner loop, and one extra end entry in the last prolog. 
+ opus_int64 corr_QC[ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 }; + + /* Order must be even */ + silk_assert( ( order & 1 ) == 0 ); + silk_assert( 2 * QS - QC >= 0 ); + + /* Loop over samples */ + if( order >= NUM_PARALLEL_INPUTS - 2 ) { + const int32x2_t warping_Q16_s32 = vdup_n_s32(warping_Q16); + for( ; n < (length - NUM_PARALLEL_INPUTS + 1); n += NUM_PARALLEL_INPUTS ) { + int32x4_t tmp1_QS0_s32x4, tmp1_QS1_s32x4, tmp2_QS0_s32x4, tmp2_QS1_s32x4; + int64x2_t corr_QC0_s64x2, corr_QC1_s64x2, corr_QC2_s64x2, corr_QC3_s64x2; + int64x2_t t0_s64x2, t1_s64x2, t2_s64x2, t3_s64x2; + int32x2_t tmp1_QS_s32x2, tmp2_QS_s32x2; + int64x1_t corr_QC_s64x1; + const int32x4_t input_QS0_s32x4 = vshll_n_s16(vld1_s16(input + n), QS); + const int32x4_t input_QS1_s32x4 = vshll_n_s16(vld1_s16(input + n + 4), QS); + vst1q_s32(tmp1_QS, input_QS0_s32x4); + vst1q_s32(tmp1_QS + 4, input_QS1_s32x4); + + /* Loop over allpass sections */ + + /* -------------------- prolog 0 -------------------- */ + + tmp1_QS_s32x2 = vget_low_s32(input_QS0_s32x4); + tmp2_QS_s32x2 = vld1_s32(state_QS + order); // Accessed one extra end entry. + vst1_lane_s32(state_QS + order, tmp1_QS_s32x2, 0); + corr_QC_s64x1 = vld1_s64(corr_QC + order); + t0_s64x2 = vmull_s32(tmp1_QS_s32x2, vget_low_s32(input_QS0_s32x4)); + t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC); + corr_QC_s64x1 = vadd_s64(corr_QC_s64x1, vget_low_s64(t0_s64x2)); + vst1_s64(corr_QC + order, corr_QC_s64x1); + tmp1_QS_s32x2 = vsub_s32(vld1_s32(state_QS + order - 1), tmp1_QS_s32x2); + t0_s64x2 = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32); + tmp1_QS_s32x2 = vshrn_n_s64(t0_s64x2, 16); + tmp1_QS_s32x2 = vadd_s32(tmp1_QS_s32x2, tmp2_QS_s32x2); + tmp1_QS_s32x2 = vld1_lane_s32(tmp1_QS + 1, tmp1_QS_s32x2, 1); + + /* -------------------- prolog 1 -------------------- */ + + tmp2_QS_s32x2 = vld1_s32(state_QS + order - 1); + vst1_s32(state_QS + order - 1, tmp1_QS_s32x2); + corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 1); + t0_s64x2 = vmull_s32(tmp1_QS_s32x2, vget_low_s32(input_QS0_s32x4)); + t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC); + corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2); + vst1q_s64(corr_QC + order - 1, corr_QC0_s64x2); + tmp1_QS_s32x2 = vsub_s32(vld1_s32(state_QS + order - 2), tmp1_QS_s32x2); + t0_s64x2 = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32); + tmp1_QS_s32x2 = vshrn_n_s64(t0_s64x2, 16); + tmp1_QS_s32x2 = vadd_s32(tmp1_QS_s32x2, tmp2_QS_s32x2); + tmp1_QS0_s32x4 = vcombine_s32(tmp1_QS_s32x2, vget_high_s32(input_QS0_s32x4)); + + /* -------------------- prolog 2 -------------------- */ + + tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 2); // Accessed one extra end entry. + vst1q_s32(state_QS + order - 2, tmp1_QS0_s32x4); // Saving one extra entry is OK. 
+ corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 2); + corr_QC_s64x1 = vld1_s64 (corr_QC + order); + t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4)); + t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4)); + t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC); + t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC); + corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2); + corr_QC_s64x1 = vadd_s64 (corr_QC_s64x1, vget_low_s64(t1_s64x2)); + vst1q_s64(corr_QC + order - 2, corr_QC0_s64x2); + vst1_s64 (corr_QC + order, corr_QC_s64x1); + tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 3), tmp1_QS0_s32x4); + t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32); + t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32); + tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16)); + tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4); + tmp1_QS0_s32x4 = vld1q_lane_s32(tmp1_QS + 3, tmp1_QS0_s32x4, 3); + + /* -------------------- prolog 3 -------------------- */ + + tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 3); + vst1q_s32(state_QS + order - 3, tmp1_QS0_s32x4); + corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 3); + corr_QC1_s64x2 = vld1q_s64(corr_QC + order - 1); + t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4)); + t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4)); + t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC); + t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC); + corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2); + corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2); + vst1q_s64(corr_QC + order - 3, corr_QC0_s64x2); + vst1q_s64(corr_QC + order - 1, corr_QC1_s64x2); + tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 4), tmp1_QS0_s32x4); + t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32); + t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32); + tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16)); + tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4); + tmp1_QS_s32x2 = vget_low_s32(input_QS1_s32x4); + + /* -------------------- prolog 4 -------------------- */ + + tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 4); + tmp2_QS_s32x2 = vld1_lane_s32(state_QS + order, tmp2_QS_s32x2, 0); + vst1q_s32(state_QS + order - 4, tmp1_QS0_s32x4); + vst1_lane_s32(state_QS + order, tmp1_QS_s32x2, 0); + corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 4); + corr_QC1_s64x2 = vld1q_s64(corr_QC + order - 2); + corr_QC_s64x1 = vld1_s64 (corr_QC + order); + t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4)); + t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4)); + t2_s64x2 = vmull_s32(tmp1_QS_s32x2, vget_low_s32 (input_QS1_s32x4)); + t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC); + t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC); + t2_s64x2 = vshrq_n_s64(t2_s64x2, 2 * QS - QC); + corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2); + corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2); + corr_QC_s64x1 = vadd_s64 (corr_QC_s64x1, vget_low_s64(t2_s64x2)); + vst1q_s64(corr_QC + order - 4, corr_QC0_s64x2); + vst1q_s64(corr_QC + order - 2, corr_QC1_s64x2); + vst1_s64 (corr_QC + order, corr_QC_s64x1); + tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 5), tmp1_QS0_s32x4); + tmp1_QS_s32x2 = vsub_s32 (vld1_s32 (state_QS + order - 1), tmp1_QS_s32x2); + t0_s64x2 = vmull_s32(vget_low_s32 
(tmp1_QS0_s32x4), warping_Q16_s32); + t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32); + t2_s64x2 = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32); + tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16)); + tmp1_QS_s32x2 = vshrn_n_s64(t2_s64x2, 16); + tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4); + tmp1_QS_s32x2 = vadd_s32 (tmp1_QS_s32x2, tmp2_QS_s32x2); + tmp1_QS_s32x2 = vld1_lane_s32(tmp1_QS + 5, tmp1_QS_s32x2, 1); + + /* -------------------- prolog 5 -------------------- */ + + tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 5); + tmp2_QS_s32x2 = vld1_s32 (state_QS + order - 1); + vst1q_s32(state_QS + order - 5, tmp1_QS0_s32x4); + vst1_s32 (state_QS + order - 1, tmp1_QS_s32x2); + corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 5); + corr_QC1_s64x2 = vld1q_s64(corr_QC + order - 3); + corr_QC2_s64x2 = vld1q_s64(corr_QC + order - 1); + t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4)); + t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4)); + t2_s64x2 = vmull_s32(tmp1_QS_s32x2, vget_low_s32 (input_QS1_s32x4)); + t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC); + t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC); + t2_s64x2 = vshrq_n_s64(t2_s64x2, 2 * QS - QC); + corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2); + corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2); + corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2); + vst1q_s64(corr_QC + order - 5, corr_QC0_s64x2); + vst1q_s64(corr_QC + order - 3, corr_QC1_s64x2); + vst1q_s64(corr_QC + order - 1, corr_QC2_s64x2); + tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 6), tmp1_QS0_s32x4); + tmp1_QS_s32x2 = vsub_s32 (vld1_s32 (state_QS + order - 2), tmp1_QS_s32x2); + t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32); + t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32); + t2_s64x2 = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32); + tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16)); + tmp1_QS_s32x2 = vshrn_n_s64(t2_s64x2, 16); + tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4); + tmp1_QS_s32x2 = vadd_s32 (tmp1_QS_s32x2, tmp2_QS_s32x2); + tmp1_QS1_s32x4 = vcombine_s32(tmp1_QS_s32x2, vget_high_s32(input_QS1_s32x4)); + + /* -------------------- prolog 6 -------------------- */ + + tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 6); + tmp2_QS1_s32x4 = vld1q_s32(state_QS + order - 2); // Accessed one extra end entry. + vst1q_s32(state_QS + order - 6, tmp1_QS0_s32x4); + vst1q_s32(state_QS + order - 2, tmp1_QS1_s32x4); // Saving one extra entry is OK. 
+ corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 6); + corr_QC1_s64x2 = vld1q_s64(corr_QC + order - 4); + corr_QC2_s64x2 = vld1q_s64(corr_QC + order - 2); + corr_QC_s64x1 = vld1_s64 (corr_QC + order); + t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4)); + t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4)); + t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4)); + t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4)); + t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC); + t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC); + t2_s64x2 = vshrq_n_s64(t2_s64x2, 2 * QS - QC); + t3_s64x2 = vshrq_n_s64(t3_s64x2, 2 * QS - QC); + corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2); + corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2); + corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2); + corr_QC_s64x1 = vadd_s64 (corr_QC_s64x1, vget_low_s64(t3_s64x2)); + vst1q_s64(corr_QC + order - 6, corr_QC0_s64x2); + vst1q_s64(corr_QC + order - 4, corr_QC1_s64x2); + vst1q_s64(corr_QC + order - 2, corr_QC2_s64x2); + vst1_s64 (corr_QC + order, corr_QC_s64x1); + tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 7), tmp1_QS0_s32x4); // Accessed one extra head entry when order is 6. + tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 3), tmp1_QS1_s32x4); + t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32); + t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32); + t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32); + t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32); + tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16)); + tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16)); + tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4); + tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4); + tmp1_QS1_s32x4 = vld1q_lane_s32(tmp1_QS + 7, tmp1_QS1_s32x4, 3); + + /* -------------------- kernel loop -------------------- */ + + for( i = 0; i < order - NUM_PARALLEL_INPUTS + 2; i++ ) { + /* Output of allpass section */ + tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS + 1); + tmp2_QS1_s32x4 = vld1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS + 5); + vst1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS + 1, tmp1_QS0_s32x4); + vst1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS + 5, tmp1_QS1_s32x4); + corr_QC0_s64x2 = vld1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 1); + corr_QC1_s64x2 = vld1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 3); + corr_QC2_s64x2 = vld1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 5); + corr_QC3_s64x2 = vld1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 7); + t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4)); + t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4)); + t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4)); + t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4)); + t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC); + t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC); + t2_s64x2 = vshrq_n_s64(t2_s64x2, 2 * QS - QC); + t3_s64x2 = vshrq_n_s64(t3_s64x2, 2 * QS - QC); + corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2); + corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2); + corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2); + corr_QC3_s64x2 = 
vaddq_s64(corr_QC3_s64x2, t3_s64x2); + vst1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 1, corr_QC0_s64x2); + vst1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 3, corr_QC1_s64x2); + vst1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 5, corr_QC2_s64x2); + vst1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 7, corr_QC3_s64x2); + tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS), tmp1_QS0_s32x4); // Accessed one extra head entry in the last loop. + tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS + 4), tmp1_QS1_s32x4); + t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32); + t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32); + t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32); + t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32); + tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16)); + tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16)); + tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4); + tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4); + } + + /* -------------------- epilog 0 -------------------- */ + + tmp2_QS_s32x2 = vld1_s32(state_QS + 1); + tmp2_QS1_s32x4 = vld1q_s32(state_QS + 3); + vst1q_s32(state_QS - 1, tmp1_QS0_s32x4); // Saving one extra entry is OK. + vst1q_s32(state_QS + 3, tmp1_QS1_s32x4); + corr_QC_s64x1 = vld1_s64 (corr_QC); + corr_QC1_s64x2 = vld1q_s64(corr_QC + 1); + corr_QC2_s64x2 = vld1q_s64(corr_QC + 3); + corr_QC3_s64x2 = vld1q_s64(corr_QC + 5); + t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4)); + t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4)); + t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4)); + t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4)); + t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC); + t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC); + t2_s64x2 = vshrq_n_s64(t2_s64x2, 2 * QS - QC); + t3_s64x2 = vshrq_n_s64(t3_s64x2, 2 * QS - QC); + corr_QC_s64x1 = vadd_s64 (corr_QC_s64x1, vget_high_s64(t0_s64x2)); + corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2); + corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2); + corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2); + vst1_s64 (corr_QC + 0, corr_QC_s64x1); + vst1q_s64(corr_QC + 1, corr_QC1_s64x2); + vst1q_s64(corr_QC + 3, corr_QC2_s64x2); + vst1q_s64(corr_QC + 5, corr_QC3_s64x2); + tmp1_QS_s32x2 = vsub_s32 (vld1_s32 (state_QS), vget_high_s32(tmp1_QS0_s32x4)); + tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + 2), tmp1_QS1_s32x4); + t1_s64x2 = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32); + t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32); + t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32); + tmp1_QS_s32x2 = vshrn_n_s64(t1_s64x2, 16); + tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16)); + tmp1_QS_s32x2 = vadd_s32 (tmp1_QS_s32x2, tmp2_QS_s32x2); + tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4); + + /* -------------------- epilog 1 -------------------- */ + + tmp2_QS_s32x2 = vld1_s32 (state_QS); + tmp2_QS1_s32x4 = vld1q_s32(state_QS + 2); + vst1_s32 (state_QS, tmp1_QS_s32x2); + vst1q_s32(state_QS + 2, tmp1_QS1_s32x4); + corr_QC1_s64x2 = vld1q_s64(corr_QC + 0); + corr_QC2_s64x2 = vld1q_s64(corr_QC + 2); + corr_QC3_s64x2 = vld1q_s64(corr_QC + 4); + t1_s64x2 = 
vmull_s32(tmp1_QS_s32x2, vget_high_s32(input_QS0_s32x4)); + t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4)); + t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4)); + t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC); + t2_s64x2 = vshrq_n_s64(t2_s64x2, 2 * QS - QC); + t3_s64x2 = vshrq_n_s64(t3_s64x2, 2 * QS - QC); + corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2); + corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2); + corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2); + vst1q_s64(corr_QC + 0, corr_QC1_s64x2); + vst1q_s64(corr_QC + 2, corr_QC2_s64x2); + vst1q_s64(corr_QC + 4, corr_QC3_s64x2); + tmp1_QS_s32x2 = vsub_s32 (vld1_s32 (state_QS - 1), tmp1_QS_s32x2); // Accessed one extra head entry. + tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + 1), tmp1_QS1_s32x4); + t1_s64x2 = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32); + t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32); + t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32); + tmp1_QS_s32x2 = vshrn_n_s64(t1_s64x2, 16); + tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16)); + tmp1_QS_s32x2 = vadd_s32 (tmp1_QS_s32x2, tmp2_QS_s32x2); + tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4); + + /* -------------------- epilog 2 -------------------- */ + + tmp2_QS1_s32x4 = vld1q_s32(state_QS + 1); + vst1_lane_s32(state_QS, tmp1_QS_s32x2, 1); + vst1q_s32 (state_QS + 1, tmp1_QS1_s32x4); + corr_QC_s64x1 = vld1_s64(corr_QC); + corr_QC2_s64x2 = vld1q_s64(corr_QC + 1); + corr_QC3_s64x2 = vld1q_s64(corr_QC + 3); + t1_s64x2 = vmull_s32(tmp1_QS_s32x2, vget_high_s32(input_QS0_s32x4)); + t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4)); + t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4)); + t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC); + t2_s64x2 = vshrq_n_s64(t2_s64x2, 2 * QS - QC); + t3_s64x2 = vshrq_n_s64(t3_s64x2, 2 * QS - QC); + corr_QC_s64x1 = vadd_s64 (corr_QC_s64x1, vget_high_s64(t1_s64x2)); + corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2); + corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2); + vst1_s64 (corr_QC + 0, corr_QC_s64x1); + vst1q_s64(corr_QC + 1, corr_QC2_s64x2); + vst1q_s64(corr_QC + 3, corr_QC3_s64x2); + tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS), tmp1_QS1_s32x4); + t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32); + t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32); + tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16)); + tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4); + + /* -------------------- epilog 3 -------------------- */ + + tmp2_QS1_s32x4 = vld1q_s32(state_QS); + vst1q_s32(state_QS, tmp1_QS1_s32x4); + corr_QC2_s64x2 = vld1q_s64(corr_QC); + corr_QC3_s64x2 = vld1q_s64(corr_QC + 2); + t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4)); + t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4)); + t2_s64x2 = vshrq_n_s64(t2_s64x2, 2 * QS - QC); + t3_s64x2 = vshrq_n_s64(t3_s64x2, 2 * QS - QC); + corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2); + corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2); + vst1q_s64(corr_QC, corr_QC2_s64x2); + vst1q_s64(corr_QC + 2, corr_QC3_s64x2); + tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS - 1), tmp1_QS1_s32x4); // Accessed one extra head entry. 
+ t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32); + t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32); + tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16)); + tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4); + + /* -------------------- epilog 4 -------------------- */ + + corr_QC_s64x1 = vld1_s64 (corr_QC); + corr_QC3_s64x2 = vld1q_s64(corr_QC + 1); + t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4)); + t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4)); + t2_s64x2 = vshrq_n_s64(t2_s64x2, 2 * QS - QC); + t3_s64x2 = vshrq_n_s64(t3_s64x2, 2 * QS - QC); + corr_QC_s64x1 = vadd_s64 (corr_QC_s64x1, vget_high_s64(t2_s64x2)); + corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2); + vst1_s64 (corr_QC, corr_QC_s64x1); + vst1q_s64(corr_QC + 1, corr_QC3_s64x2); + vst1q_s32(tmp1_QS + 4, tmp1_QS1_s32x4); + + tmp2_QS_s32x2 = vld1_s32(state_QS + 1); + tmp1_QS_s32x2 = vsub_s32(vld1_s32(tmp1_QS + 5), vget_high_s32(tmp1_QS1_s32x4)); + t3_s64x2 = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32); + tmp1_QS_s32x2 = vshrn_n_s64(t3_s64x2, 16); + tmp1_QS_s32x2 = vadd_s32(tmp1_QS_s32x2, tmp2_QS_s32x2); + vst1_lane_s32(state_QS + 1, tmp1_QS_s32x2, 1); + + /* -------------------- epilog 5 & 6 -------------------- */ + + vst1_lane_s32(state_QS + 2, vget_high_s32(tmp1_QS1_s32x4), 1); + tmp2_QS_s32x2 = vsub_s32(tmp1_QS_s32x2, vreinterpret_s32_s64(vshr_n_s64(vreinterpret_s64_s32(tmp1_QS_s32x2), 32))); + t3_s64x2 = vmull_s32(tmp2_QS_s32x2, warping_Q16_s32); + tmp2_QS_s32x2 = vshrn_n_s64(t3_s64x2, 16); + tmp2_QS_s32x2 = vadd_s32(vget_high_s32(tmp1_QS1_s32x4), tmp2_QS_s32x2); + vst1_lane_s32(state_QS, tmp2_QS_s32x2, 0); + + corr_QC3_s64x2 = vld1q_s64(corr_QC); + t3_s64x2 = vmull_s32(tmp1_QS_s32x2, vget_high_s32(input_QS1_s32x4)); + t3_s64x2 = vshrq_n_s64(t3_s64x2, 2 * QS - QC); + corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2); + vst1_s64(corr_QC + 1, vget_high_s64(corr_QC3_s64x2)); + t3_s64x2 = vmull_s32(tmp2_QS_s32x2, vreinterpret_s32_s64(vshr_n_s64(vreinterpret_s64_s32(vget_high_s32(input_QS1_s32x4)), 32))); + t3_s64x2 = vshrq_n_s64(t3_s64x2, 2 * QS - QC); + corr_QC_s64x1 = vadd_s64(vget_low_s64(corr_QC3_s64x2), vget_low_s64(t3_s64x2)); + vst1_s64(corr_QC, corr_QC_s64x1); + } + } + + for( ; n < length; n++ ) { + input_QS[ 0 ] = tmp1_QS[ 0 ] = silk_LSHIFT32( (opus_int32)input[ n ], QS ); + /* Loop over allpass sections */ + for( i = 0; i <= order; i++ ) { + /* Output of allpass section */ + tmp2_QS[ 0 ] = silk_SMLAWB( state_QS[ order - i ], state_QS[ order - i - 1 ] - tmp1_QS[ 0 ], warping_Q16 ); + state_QS[ order - i ] = tmp1_QS[ 0 ]; + corr_QC[ order - i ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ], input_QS[ 0 ] ), 2 * QS - QC ); + tmp1_QS[ 0 ] = tmp2_QS[ 0 ]; + } + } + lsh = silk_CLZ64( corr_QC[ order ] ) - 35; + lsh = silk_LIMIT( lsh, -12 - QC, 30 - QC ); + *scale = -( QC + lsh ); + silk_assert( *scale >= -30 && *scale <= 12 ); + const int64x2_t lsh_s64x2 = vdupq_n_s64(lsh); + for( i = 0; i <= order - 3; i += 4 ) { + int64x2_t corr_QC0_s64x2 = vld1q_s64(corr_QC + i); + int64x2_t corr_QC1_s64x2 = vld1q_s64(corr_QC + i + 2); + corr_QC0_s64x2 = vshlq_s64(corr_QC0_s64x2, lsh_s64x2); + corr_QC1_s64x2 = vshlq_s64(corr_QC1_s64x2, lsh_s64x2); + int32x4_t corr_s32x4 = vcombine_s32(vmovn_s64(corr_QC1_s64x2), vmovn_s64(corr_QC0_s64x2)); + corr_s32x4 = vrev64q_s32(corr_s32x4); + vst1q_s32(corr + order - i - 3, corr_s32x4); + } + if( lsh >= 0 ) { + for( ; i <= order; i++ ) { + 
corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_LSHIFT64( corr_QC[ i ], lsh ) ); + } + } else { + for( ; i <= order; i++ ) { + corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_RSHIFT64( corr_QC[ i ], -lsh ) ); + } + } + silk_assert( corr_QC[ order ] >= 0 ); /* If breaking, decrease QC*/ +} + +#endif /* FIXED_POINT */ diff --git a/silk/fixed/main_FIX.h b/silk/fixed/main_FIX.h index 375b5eb..2abb5d9 100644 --- a/silk/fixed/main_FIX.h +++ b/silk/fixed/main_FIX.h @@ -36,6 +36,11 @@ POSSIBILITY OF SUCH DAMAGE. #include "debug.h" #include "entenc.h" +#if ((defined(OPUS_ARM_ASM) && defined(FIXED_POINT)) \ + || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)) +#include "fixed/arm/warped_autocorrelation_FIX_arm.h" +#endif + #ifndef FORCE_CPP_BUILD #ifdef __cplusplus extern "C" @@ -47,6 +52,9 @@ extern "C" #define silk_encode_do_VAD_Fxx silk_encode_do_VAD_FIX #define silk_encode_frame_Fxx silk_encode_frame_FIX +#define QC 10 +#define QS 14 + /*********************/ /* Encoder Functions */ /*********************/ @@ -121,7 +129,7 @@ void silk_noise_shape_analysis_FIX( ); /* Autocorrelations for a warped frequency axis */ -void silk_warped_autocorrelation_FIX( +void silk_warped_autocorrelation_FIX_c( opus_int32 *corr, /* O Result [order + 1] */ opus_int *scale, /* O Scaling of the correlation vector */ const opus_int16 *input, /* I Input data to correlate */ @@ -130,6 +138,11 @@ void silk_warped_autocorrelation_FIX( const opus_int order /* I Correlation order (even) */ ); +#if !defined(OVERRIDE_silk_warped_autocorrelation_FIX) +#define silk_warped_autocorrelation_FIX(corr, scale, input, warping_Q16, length, order) \ + (silk_warped_autocorrelation_FIX_c(corr, scale, input, warping_Q16, length, order)) +#endif + /* Calculation of LTP state scaling */ void silk_LTP_scale_ctrl_FIX( silk_encoder_state_FIX *psEnc, /* I/O encoder state */ diff --git a/silk/fixed/mips/warped_autocorrelation_FIX_mipsr1.h b/silk/fixed/mips/warped_autocorrelation_FIX_mipsr1.h index e803ef0..6916940 100644 --- a/silk/fixed/mips/warped_autocorrelation_FIX_mipsr1.h +++ b/silk/fixed/mips/warped_autocorrelation_FIX_mipsr1.h @@ -34,12 +34,6 @@ POSSIBILITY OF SUCH DAMAGE. #include "main_FIX.h" -#undef QC -#define QC 10 - -#undef QS -#define QS 14 - /* Autocorrelations for a warped frequency axis */ #define OVERRIDE_silk_warped_autocorrelation_FIX void silk_warped_autocorrelation_FIX( diff --git a/silk/fixed/warped_autocorrelation_FIX.c b/silk/fixed/warped_autocorrelation_FIX.c index 6ca6c11..994c299 100644 --- a/silk/fixed/warped_autocorrelation_FIX.c +++ b/silk/fixed/warped_autocorrelation_FIX.c @@ -31,17 +31,13 @@ POSSIBILITY OF SUCH DAMAGE. 
#include "main_FIX.h" -#define QC 10 -#define QS 14 - #if defined(MIPSr1_ASM) #include "mips/warped_autocorrelation_FIX_mipsr1.h" #endif -#ifndef OVERRIDE_silk_warped_autocorrelation_FIX /* Autocorrelations for a warped frequency axis */ -void silk_warped_autocorrelation_FIX( +void silk_warped_autocorrelation_FIX_c( opus_int32 *corr, /* O Result [order + 1] */ opus_int *scale, /* O Scaling of the correlation vector */ const opus_int16 *input, /* I Input data to correlate */ @@ -92,4 +88,3 @@ void silk_warped_autocorrelation_FIX( } silk_assert( corr_QC[ 0 ] >= 0 ); /* If breaking, decrease QC*/ } -#endif /* OVERRIDE_silk_warped_autocorrelation_FIX */ diff --git a/silk/tests/test_unit_optimization_warped_autocorrelation.c b/silk/tests/test_unit_optimization_warped_autocorrelation.c new file mode 100644 index 0000000..b7d0ad0 --- /dev/null +++ b/silk/tests/test_unit_optimization_warped_autocorrelation.c @@ -0,0 +1,441 @@ +/* Copyright (c) 2016 Google Inc. */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#define SKIP_CONFIG_H + +#ifndef CUSTOM_MODES +#define CUSTOM_MODES +#endif + +#include <stdio.h> +#include "main_FIX.h" + +#ifdef FIXED_POINT + +/* Unrolling the input loop by 8 is about 25% faster than unrolling by 4 on Chromebook with an ARMv7 Processor. */ +#define NUM_PARALLEL_INPUTS 8 + +/* Keep this function here because it is the code base to optimize on different CPUs. */ +void silk_warped_autocorrelation_FIX_c_opt( + opus_int32 *corr, /* O Result [order + 1] */ + opus_int *scale, /* O Scaling of the correlation vector */ + const opus_int16 *input, /* I Input data to correlate */ + const opus_int warping_Q16, /* I Warping coefficient */ + const opus_int length, /* I Length of input */ + const opus_int order /* I Correlation order (even) */ +) +{ + opus_int n = 0, i, j, lsh; + opus_int32 tmp1_QS[NUM_PARALLEL_INPUTS], tmp2_QS[NUM_PARALLEL_INPUTS]; + opus_int32 input_QS[NUM_PARALLEL_INPUTS]; + opus_int32 state_QS_tmp[ MAX_SHAPE_LPC_ORDER + 2 ] = { 0 }; // Create one extra entry. + opus_int32 *state_QS = state_QS_tmp + 1; // Accessed one extra head entry in the last prolog and the last inner loop. 
+ opus_int64 corr_QC[ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 }; + + /* Order must be even */ + silk_assert( ( order & 1 ) == 0 ); + silk_assert( 2 * QS - QC >= 0 ); + + /* Loop over samples */ + if( order >= NUM_PARALLEL_INPUTS - 2 ) { + for( ; n < (length - NUM_PARALLEL_INPUTS + 1); n += NUM_PARALLEL_INPUTS ) { + for( i = 0; i < NUM_PARALLEL_INPUTS; i++ ) { + input_QS[i] = tmp1_QS[i] = silk_LSHIFT32( (opus_int32)input[ n + i ], QS ); + } + + /* Loop over allpass sections */ + + /* -------------------- prolog 0 -------------------- */ + + tmp2_QS[ 0 ] = state_QS[ order ]; + state_QS[ order ] = tmp1_QS[ 0 ]; + corr_QC[ order ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ], input_QS[ 0 ] ), 2 * QS - QC ); + tmp1_QS[ 0 ] = silk_SMLAWB( tmp2_QS[ 0 ], state_QS[ order - 1 ] - tmp1_QS[ 0 ], warping_Q16 ); + + /* -------------------- prolog 1 -------------------- */ + + tmp2_QS[ 0 ] = state_QS[ order - 1 ]; + tmp2_QS[ 1 ] = state_QS[ order ]; + + state_QS[ order - 1 ] = tmp1_QS[ 0 ]; + state_QS[ order ] = tmp1_QS[ 1 ]; + + corr_QC[ order - 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ], input_QS[ 0 ] ), 2 * QS - QC ); + corr_QC[ order ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 1 ], input_QS[ 1 ] ), 2 * QS - QC ); + + tmp1_QS[ 1 ] = silk_SMLAWB( tmp2_QS[ 1 ], state_QS[ order - 1 ] - tmp1_QS[ 1 ], warping_Q16 ); + tmp1_QS[ 0 ] = silk_SMLAWB( tmp2_QS[ 0 ], state_QS[ order - 2 ] - tmp1_QS[ 0 ], warping_Q16 ); + + /* -------------------- prolog 2 -------------------- */ + + tmp2_QS[ 0 ] = state_QS[ order - 2 ]; + tmp2_QS[ 1 ] = state_QS[ order - 1 ]; + tmp2_QS[ 2 ] = state_QS[ order ]; + + state_QS[ order - 2 ] = tmp1_QS[ 0 ]; + state_QS[ order - 1 ] = tmp1_QS[ 1 ]; + state_QS[ order ] = tmp1_QS[ 2 ]; + + corr_QC[ order - 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ], input_QS[ 0 ] ), 2 * QS - QC ); + corr_QC[ order - 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 1 ], input_QS[ 1 ] ), 2 * QS - QC ); + corr_QC[ order ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 2 ], input_QS[ 2 ] ), 2 * QS - QC ); + + tmp1_QS[ 2 ] = silk_SMLAWB( tmp2_QS[ 2 ], state_QS[ order - 1 ] - tmp1_QS[ 2 ], warping_Q16 ); + tmp1_QS[ 1 ] = silk_SMLAWB( tmp2_QS[ 1 ], state_QS[ order - 2 ] - tmp1_QS[ 1 ], warping_Q16 ); + tmp1_QS[ 0 ] = silk_SMLAWB( tmp2_QS[ 0 ], state_QS[ order - 3 ] - tmp1_QS[ 0 ], warping_Q16 ); + + /* -------------------- prolog 3 -------------------- */ + + tmp2_QS[ 0 ] = state_QS[ order - 3 ]; + tmp2_QS[ 1 ] = state_QS[ order - 2 ]; + tmp2_QS[ 2 ] = state_QS[ order - 1 ]; + tmp2_QS[ 3 ] = state_QS[ order ]; + + state_QS[ order - 3 ] = tmp1_QS[ 0 ]; + state_QS[ order - 2 ] = tmp1_QS[ 1 ]; + state_QS[ order - 1 ] = tmp1_QS[ 2 ]; + state_QS[ order ] = tmp1_QS[ 3 ]; + + corr_QC[ order - 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ], input_QS[ 0 ] ), 2 * QS - QC ); + corr_QC[ order - 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 1 ], input_QS[ 1 ] ), 2 * QS - QC ); + corr_QC[ order - 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 2 ], input_QS[ 2 ] ), 2 * QS - QC ); + corr_QC[ order ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 3 ], input_QS[ 3 ] ), 2 * QS - QC ); + + tmp1_QS[ 3 ] = silk_SMLAWB( tmp2_QS[ 3 ], state_QS[ order - 1 ] - tmp1_QS[ 3 ], warping_Q16 ); + tmp1_QS[ 2 ] = silk_SMLAWB( tmp2_QS[ 2 ], state_QS[ order - 2 ] - tmp1_QS[ 2 ], warping_Q16 ); + tmp1_QS[ 1 ] = silk_SMLAWB( tmp2_QS[ 1 ], state_QS[ order - 3 ] - tmp1_QS[ 1 ], warping_Q16 ); + tmp1_QS[ 0 ] = silk_SMLAWB( tmp2_QS[ 0 ], state_QS[ order - 4 ] - tmp1_QS[ 0 ], warping_Q16 ); + + /* -------------------- prolog 4 -------------------- */ + + tmp2_QS[ 0 ] = state_QS[ order - 4 
]; + tmp2_QS[ 1 ] = state_QS[ order - 3 ]; + tmp2_QS[ 2 ] = state_QS[ order - 2 ]; + tmp2_QS[ 3 ] = state_QS[ order - 1 ]; + tmp2_QS[ 4 ] = state_QS[ order ]; + + state_QS[ order - 4 ] = tmp1_QS[ 0 ]; + state_QS[ order - 3 ] = tmp1_QS[ 1 ]; + state_QS[ order - 2 ] = tmp1_QS[ 2 ]; + state_QS[ order - 1 ] = tmp1_QS[ 3 ]; + state_QS[ order ] = tmp1_QS[ 4 ]; + + corr_QC[ order - 4 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ], input_QS[ 0 ] ), 2 * QS - QC ); + corr_QC[ order - 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 1 ], input_QS[ 1 ] ), 2 * QS - QC ); + corr_QC[ order - 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 2 ], input_QS[ 2 ] ), 2 * QS - QC ); + corr_QC[ order - 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 3 ], input_QS[ 3 ] ), 2 * QS - QC ); + corr_QC[ order ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 4 ], input_QS[ 4 ] ), 2 * QS - QC ); + + tmp1_QS[ 4 ] = silk_SMLAWB( tmp2_QS[ 4 ], state_QS[ order - 1 ] - tmp1_QS[ 4 ], warping_Q16 ); + tmp1_QS[ 3 ] = silk_SMLAWB( tmp2_QS[ 3 ], state_QS[ order - 2 ] - tmp1_QS[ 3 ], warping_Q16 ); + tmp1_QS[ 2 ] = silk_SMLAWB( tmp2_QS[ 2 ], state_QS[ order - 3 ] - tmp1_QS[ 2 ], warping_Q16 ); + tmp1_QS[ 1 ] = silk_SMLAWB( tmp2_QS[ 1 ], state_QS[ order - 4 ] - tmp1_QS[ 1 ], warping_Q16 ); + tmp1_QS[ 0 ] = silk_SMLAWB( tmp2_QS[ 0 ], state_QS[ order - 5 ] - tmp1_QS[ 0 ], warping_Q16 ); + + /* -------------------- prolog 5 -------------------- */ + + tmp2_QS[ 0 ] = state_QS[ order - 5 ]; + tmp2_QS[ 1 ] = state_QS[ order - 4 ]; + tmp2_QS[ 2 ] = state_QS[ order - 3 ]; + tmp2_QS[ 3 ] = state_QS[ order - 2 ]; + tmp2_QS[ 4 ] = state_QS[ order - 1 ]; + tmp2_QS[ 5 ] = state_QS[ order ]; + + state_QS[ order - 5 ] = tmp1_QS[ 0 ]; + state_QS[ order - 4 ] = tmp1_QS[ 1 ]; + state_QS[ order - 3 ] = tmp1_QS[ 2 ]; + state_QS[ order - 2 ] = tmp1_QS[ 3 ]; + state_QS[ order - 1 ] = tmp1_QS[ 4 ]; + state_QS[ order ] = tmp1_QS[ 5 ]; + + corr_QC[ order - 5 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ], input_QS[ 0 ] ), 2 * QS - QC ); + corr_QC[ order - 4 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 1 ], input_QS[ 1 ] ), 2 * QS - QC ); + corr_QC[ order - 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 2 ], input_QS[ 2 ] ), 2 * QS - QC ); + corr_QC[ order - 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 3 ], input_QS[ 3 ] ), 2 * QS - QC ); + corr_QC[ order - 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 4 ], input_QS[ 4 ] ), 2 * QS - QC ); + corr_QC[ order ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 5 ], input_QS[ 5 ] ), 2 * QS - QC ); + + tmp1_QS[ 5 ] = silk_SMLAWB( tmp2_QS[ 5 ], state_QS[ order - 1 ] - tmp1_QS[ 5 ], warping_Q16 ); + tmp1_QS[ 4 ] = silk_SMLAWB( tmp2_QS[ 4 ], state_QS[ order - 2 ] - tmp1_QS[ 4 ], warping_Q16 ); + tmp1_QS[ 3 ] = silk_SMLAWB( tmp2_QS[ 3 ], state_QS[ order - 3 ] - tmp1_QS[ 3 ], warping_Q16 ); + tmp1_QS[ 2 ] = silk_SMLAWB( tmp2_QS[ 2 ], state_QS[ order - 4 ] - tmp1_QS[ 2 ], warping_Q16 ); + tmp1_QS[ 1 ] = silk_SMLAWB( tmp2_QS[ 1 ], state_QS[ order - 5 ] - tmp1_QS[ 1 ], warping_Q16 ); + tmp1_QS[ 0 ] = silk_SMLAWB( tmp2_QS[ 0 ], state_QS[ order - 6 ] - tmp1_QS[ 0 ], warping_Q16 ); + + /* -------------------- prolog 6 -------------------- */ + + tmp2_QS[ 0 ] = state_QS[ order - 6 ]; + tmp2_QS[ 1 ] = state_QS[ order - 5 ]; + tmp2_QS[ 2 ] = state_QS[ order - 4 ]; + tmp2_QS[ 3 ] = state_QS[ order - 3 ]; + tmp2_QS[ 4 ] = state_QS[ order - 2 ]; + tmp2_QS[ 5 ] = state_QS[ order - 1 ]; + tmp2_QS[ 6 ] = state_QS[ order ]; + + state_QS[ order - 6 ] = tmp1_QS[ 0 ]; + state_QS[ order - 5 ] = tmp1_QS[ 1 ]; + state_QS[ order - 4 ] = tmp1_QS[ 2 ]; + state_QS[ order - 3 ] = tmp1_QS[ 3 ]; + 
state_QS[ order - 2 ] = tmp1_QS[ 4 ]; + state_QS[ order - 1 ] = tmp1_QS[ 5 ]; + state_QS[ order ] = tmp1_QS[ 6 ]; + + corr_QC[ order - 6 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ], input_QS[ 0 ] ), 2 * QS - QC ); + corr_QC[ order - 5 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 1 ], input_QS[ 1 ] ), 2 * QS - QC ); + corr_QC[ order - 4 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 2 ], input_QS[ 2 ] ), 2 * QS - QC ); + corr_QC[ order - 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 3 ], input_QS[ 3 ] ), 2 * QS - QC ); + corr_QC[ order - 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 4 ], input_QS[ 4 ] ), 2 * QS - QC ); + corr_QC[ order - 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 5 ], input_QS[ 5 ] ), 2 * QS - QC ); + corr_QC[ order ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 6 ], input_QS[ 6 ] ), 2 * QS - QC ); + + tmp1_QS[ 6 ] = silk_SMLAWB( tmp2_QS[ 6 ], state_QS[ order - 1 ] - tmp1_QS[ 6 ], warping_Q16 ); + tmp1_QS[ 5 ] = silk_SMLAWB( tmp2_QS[ 5 ], state_QS[ order - 2 ] - tmp1_QS[ 5 ], warping_Q16 ); + tmp1_QS[ 4 ] = silk_SMLAWB( tmp2_QS[ 4 ], state_QS[ order - 3 ] - tmp1_QS[ 4 ], warping_Q16 ); + tmp1_QS[ 3 ] = silk_SMLAWB( tmp2_QS[ 3 ], state_QS[ order - 4 ] - tmp1_QS[ 3 ], warping_Q16 ); + tmp1_QS[ 2 ] = silk_SMLAWB( tmp2_QS[ 2 ], state_QS[ order - 5 ] - tmp1_QS[ 2 ], warping_Q16 ); + tmp1_QS[ 1 ] = silk_SMLAWB( tmp2_QS[ 1 ], state_QS[ order - 6 ] - tmp1_QS[ 1 ], warping_Q16 ); + tmp1_QS[ 0 ] = silk_SMLAWB( tmp2_QS[ 0 ], state_QS[ order - 7 ] - tmp1_QS[ 0 ], warping_Q16 ); // Accessed one extra head entry when order is 6. + + /* -------------------- kernel loop -------------------- */ + + for( i = 0; i < order - NUM_PARALLEL_INPUTS + 2; i++ ) { + /* Output of allpass section */ + for( j = 0; j < NUM_PARALLEL_INPUTS; j++ ) { + tmp2_QS[ j ] = state_QS[ order - i - NUM_PARALLEL_INPUTS + 1 + j ]; + } + + for( j = 0; j < NUM_PARALLEL_INPUTS; j++ ) { + state_QS[ order - i - NUM_PARALLEL_INPUTS + 1 + j ] = tmp1_QS[ j ]; + } + + for( j = 0; j < NUM_PARALLEL_INPUTS; j++ ) { + corr_QC[ order - i - NUM_PARALLEL_INPUTS + 1 + j ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ j ], input_QS[ j ] ), 2 * QS - QC ); + } + + for( j = NUM_PARALLEL_INPUTS - 1; j >= 0; j-- ) { + tmp1_QS[ j ] = silk_SMLAWB( tmp2_QS[ j ], state_QS[ order - i - NUM_PARALLEL_INPUTS + j ] - tmp1_QS[ j ], warping_Q16 ); // Accessed one extra head entry in the last loop. 
+ } + } + + /* -------------------- epilog 0 -------------------- */ + + tmp2_QS[ 2 ] = state_QS[ 1 ]; + tmp2_QS[ 3 ] = state_QS[ 2 ]; + tmp2_QS[ 4 ] = state_QS[ 3 ]; + tmp2_QS[ 5 ] = state_QS[ 4 ]; + tmp2_QS[ 6 ] = state_QS[ 5 ]; + tmp2_QS[ 7 ] = state_QS[ 6 ]; + + state_QS[ 0 ] = tmp1_QS[ 1 ]; + state_QS[ 1 ] = tmp1_QS[ 2 ]; + state_QS[ 2 ] = tmp1_QS[ 3 ]; + state_QS[ 3 ] = tmp1_QS[ 4 ]; + state_QS[ 4 ] = tmp1_QS[ 5 ]; + state_QS[ 5 ] = tmp1_QS[ 6 ]; + state_QS[ 6 ] = tmp1_QS[ 7 ]; + + corr_QC[ 0 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 1 ], input_QS[ 1 ] ), 2 * QS - QC ); + corr_QC[ 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 2 ], input_QS[ 2 ] ), 2 * QS - QC ); + corr_QC[ 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 3 ], input_QS[ 3 ] ), 2 * QS - QC ); + corr_QC[ 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 4 ], input_QS[ 4 ] ), 2 * QS - QC ); + corr_QC[ 4 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 5 ], input_QS[ 5 ] ), 2 * QS - QC ); + corr_QC[ 5 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 6 ], input_QS[ 6 ] ), 2 * QS - QC ); + corr_QC[ 6 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 7 ], input_QS[ 7 ] ), 2 * QS - QC ); + + tmp1_QS[ 7 ] = silk_SMLAWB( tmp2_QS[ 7 ], state_QS[ 5 ] - tmp1_QS[ 7 ], warping_Q16 ); + tmp1_QS[ 6 ] = silk_SMLAWB( tmp2_QS[ 6 ], state_QS[ 4 ] - tmp1_QS[ 6 ], warping_Q16 ); + tmp1_QS[ 5 ] = silk_SMLAWB( tmp2_QS[ 5 ], state_QS[ 3 ] - tmp1_QS[ 5 ], warping_Q16 ); + tmp1_QS[ 4 ] = silk_SMLAWB( tmp2_QS[ 4 ], state_QS[ 2 ] - tmp1_QS[ 4 ], warping_Q16 ); + tmp1_QS[ 3 ] = silk_SMLAWB( tmp2_QS[ 3 ], state_QS[ 1 ] - tmp1_QS[ 3 ], warping_Q16 ); + tmp1_QS[ 2 ] = silk_SMLAWB( tmp2_QS[ 2 ], state_QS[ 0 ] - tmp1_QS[ 2 ], warping_Q16 ); + + /* -------------------- epilog 1 -------------------- */ + + tmp2_QS[ 3 ] = state_QS[ 1 ]; + tmp2_QS[ 4 ] = state_QS[ 2 ]; + tmp2_QS[ 5 ] = state_QS[ 3 ]; + tmp2_QS[ 6 ] = state_QS[ 4 ]; + tmp2_QS[ 7 ] = state_QS[ 5 ]; + + state_QS[ 0 ] = tmp1_QS[ 2 ]; + state_QS[ 1 ] = tmp1_QS[ 3 ]; + state_QS[ 2 ] = tmp1_QS[ 4 ]; + state_QS[ 3 ] = tmp1_QS[ 5 ]; + state_QS[ 4 ] = tmp1_QS[ 6 ]; + state_QS[ 5 ] = tmp1_QS[ 7 ]; + + corr_QC[ 0 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 2 ], input_QS[ 2 ] ), 2 * QS - QC ); + corr_QC[ 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 3 ], input_QS[ 3 ] ), 2 * QS - QC ); + corr_QC[ 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 4 ], input_QS[ 4 ] ), 2 * QS - QC ); + corr_QC[ 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 5 ], input_QS[ 5 ] ), 2 * QS - QC ); + corr_QC[ 4 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 6 ], input_QS[ 6 ] ), 2 * QS - QC ); + corr_QC[ 5 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 7 ], input_QS[ 7 ] ), 2 * QS - QC ); + + tmp1_QS[ 7 ] = silk_SMLAWB( tmp2_QS[ 7 ], state_QS[ 4 ] - tmp1_QS[ 7 ], warping_Q16 ); + tmp1_QS[ 6 ] = silk_SMLAWB( tmp2_QS[ 6 ], state_QS[ 3 ] - tmp1_QS[ 6 ], warping_Q16 ); + tmp1_QS[ 5 ] = silk_SMLAWB( tmp2_QS[ 5 ], state_QS[ 2 ] - tmp1_QS[ 5 ], warping_Q16 ); + tmp1_QS[ 4 ] = silk_SMLAWB( tmp2_QS[ 4 ], state_QS[ 1 ] - tmp1_QS[ 4 ], warping_Q16 ); + tmp1_QS[ 3 ] = silk_SMLAWB( tmp2_QS[ 3 ], state_QS[ 0 ] - tmp1_QS[ 3 ], warping_Q16 ); + + /* -------------------- epilog 2 -------------------- */ + + tmp2_QS[ 4 ] = state_QS[ 1 ]; + tmp2_QS[ 5 ] = state_QS[ 2 ]; + tmp2_QS[ 6 ] = state_QS[ 3 ]; + tmp2_QS[ 7 ] = state_QS[ 4 ]; + + state_QS[ 0 ] = tmp1_QS[ 3 ]; + state_QS[ 1 ] = tmp1_QS[ 4 ]; + state_QS[ 2 ] = tmp1_QS[ 5 ]; + state_QS[ 3 ] = tmp1_QS[ 6 ]; + state_QS[ 4 ] = tmp1_QS[ 7 ]; + + corr_QC[ 0 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 3 ], input_QS[ 3 ] ), 2 * QS - QC ); + corr_QC[ 1 ] += silk_RSHIFT64( 
silk_SMULL( tmp1_QS[ 4 ], input_QS[ 4 ] ), 2 * QS - QC ); + corr_QC[ 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 5 ], input_QS[ 5 ] ), 2 * QS - QC ); + corr_QC[ 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 6 ], input_QS[ 6 ] ), 2 * QS - QC ); + corr_QC[ 4 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 7 ], input_QS[ 7 ] ), 2 * QS - QC ); + + tmp1_QS[ 7 ] = silk_SMLAWB( tmp2_QS[ 7 ], state_QS[ 3 ] - tmp1_QS[ 7 ], warping_Q16 ); + tmp1_QS[ 6 ] = silk_SMLAWB( tmp2_QS[ 6 ], state_QS[ 2 ] - tmp1_QS[ 6 ], warping_Q16 ); + tmp1_QS[ 5 ] = silk_SMLAWB( tmp2_QS[ 5 ], state_QS[ 1 ] - tmp1_QS[ 5 ], warping_Q16 ); + tmp1_QS[ 4 ] = silk_SMLAWB( tmp2_QS[ 4 ], state_QS[ 0 ] - tmp1_QS[ 4 ], warping_Q16 ); + + /* -------------------- epilog 3 -------------------- */ + + tmp2_QS[ 5 ] = state_QS[ 1 ]; + tmp2_QS[ 6 ] = state_QS[ 2 ]; + tmp2_QS[ 7 ] = state_QS[ 3 ]; + + state_QS[ 0 ] = tmp1_QS[ 4 ]; + state_QS[ 1 ] = tmp1_QS[ 5 ]; + state_QS[ 2 ] = tmp1_QS[ 6 ]; + state_QS[ 3 ] = tmp1_QS[ 7 ]; + + corr_QC[ 0 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 4 ], input_QS[ 4 ] ), 2 * QS - QC ); + corr_QC[ 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 5 ], input_QS[ 5 ] ), 2 * QS - QC ); + corr_QC[ 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 6 ], input_QS[ 6 ] ), 2 * QS - QC ); + corr_QC[ 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 7 ], input_QS[ 7 ] ), 2 * QS - QC ); + + tmp1_QS[ 7 ] = silk_SMLAWB( tmp2_QS[ 7 ], state_QS[ 2 ] - tmp1_QS[ 7 ], warping_Q16 ); + tmp1_QS[ 6 ] = silk_SMLAWB( tmp2_QS[ 6 ], state_QS[ 1 ] - tmp1_QS[ 6 ], warping_Q16 ); + tmp1_QS[ 5 ] = silk_SMLAWB( tmp2_QS[ 5 ], state_QS[ 0 ] - tmp1_QS[ 5 ], warping_Q16 ); + + /* -------------------- epilog 4 -------------------- */ + + corr_QC[ 0 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 5 ], input_QS[ 5 ] ), 2 * QS - QC ); + corr_QC[ 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 6 ], input_QS[ 6 ] ), 2 * QS - QC ); + corr_QC[ 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 7 ], input_QS[ 7 ] ), 2 * QS - QC ); + + opus_int32 tmp1_QS_2 = silk_SMLAWB( state_QS[ 1 ], tmp1_QS[ 5 ] - tmp1_QS[ 6 ], warping_Q16 ); + state_QS[ 1 ] = silk_SMLAWB( state_QS[ 2 ], tmp1_QS[ 6 ] - tmp1_QS[ 7 ], warping_Q16 ); + + /* -------------------- epilog 5 & 6 -------------------- */ + + state_QS[ 0 ] = silk_SMLAWB( tmp1_QS[ 6 ], tmp1_QS_2 - state_QS[ 1 ], warping_Q16 ); + state_QS[ 2 ] = tmp1_QS[ 7 ]; + + corr_QC[ 0 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS_2, input_QS[ 6 ] ), 2 * QS - QC ) + + silk_RSHIFT64( silk_SMULL( state_QS[ 0 ], input_QS[ 7 ] ), 2 * QS - QC ); + corr_QC[ 1 ] += silk_RSHIFT64( silk_SMULL( state_QS[ 1 ], input_QS[ 7 ] ), 2 * QS - QC ); + } + } + + for( ; n < length; n++ ) { + input_QS[ 0 ] = tmp1_QS[ 0 ] = silk_LSHIFT32( (opus_int32)input[ n ], QS ); + /* Loop over allpass sections */ + for( i = 0; i <= order; i++ ) { + /* Output of allpass section */ + tmp2_QS[ 0 ] = silk_SMLAWB( state_QS[ order - i ], state_QS[ order - i - 1 ] - tmp1_QS[ 0 ], warping_Q16 ); + state_QS[ order - i ] = tmp1_QS[ 0 ]; + corr_QC[ order - i ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ], input_QS[ 0 ] ), 2 * QS - QC ); + tmp1_QS[ 0 ] = tmp2_QS[ 0 ]; + } + } + lsh = silk_CLZ64( corr_QC[ order ] ) - 35; + lsh = silk_LIMIT( lsh, -12 - QC, 30 - QC ); + *scale = -( QC + lsh ); + silk_assert( *scale >= -30 && *scale <= 12 ); + if( lsh >= 0 ) { + for( i = 0; i <= order; i++ ) { + corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_LSHIFT64( corr_QC[ i ], lsh ) ); + } + } else { + for( i = 0; i <= order; i++ ) { + corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_RSHIFT64( corr_QC[ i ], -lsh ) ); + } + } + 
silk_assert( corr_QC[ order ] >= 0 ); /* If breaking, decrease QC*/ +} + +#define MAX_LENGTH 360 + +static int test_warped_autocorrelation(int arch) +{ + unsigned int i; + opus_int32 corrOrg[MAX_SHAPE_LPC_ORDER + 1], corrOpt[MAX_SHAPE_LPC_ORDER + 1]; + opus_int scaleOrg, scaleOpt; + opus_int16 input[MAX_LENGTH]; + opus_int warping_Q16, length, order; + (void)arch; + + printf("%50s", "silk_warped_autocorrelation_FIX() ..."); + for( order = 0; order <= MAX_SHAPE_LPC_ORDER; order += 2 ) // order must be even. + { + for( length = 0; length <= MAX_LENGTH; length++ ) + { + for (i=0;i<MAX_LENGTH;++i) + { + input[i] = (rand() % 32767) - 16384; + } + warping_Q16 = rand() % 32767; + memcpy(corrOpt, corrOrg, sizeof(corrOrg)); + + silk_warped_autocorrelation_FIX_c(corrOrg, &scaleOrg, input, warping_Q16, length, order); + silk_warped_autocorrelation_FIX (corrOpt, &scaleOpt, input, warping_Q16, length, order); + if (memcmp(corrOpt, corrOrg, sizeof(corrOrg))) + { + printf("order=%2d length=%3d failed!\n", order, length); + for (i=0;i<sizeof(corrOrg) / sizeof(*corrOrg);i++) + { + if (corrOrg[i] != corrOpt[i]) + { + printf("\ncorrOrg[%3d]=%12d, corrOpt[%3d]=%12d", i, corrOrg[i], i, corrOpt[i]); + } + } + printf("\n"); + return -1; + } + } + } + printf(" passed!\n"); + return 0; +} +#endif /* FIXED_POINT */ diff --git a/silk_headers.mk b/silk_headers.mk index f8bf1d2..52c42d0 100644 --- a/silk_headers.mk +++ b/silk_headers.mk @@ -30,6 +30,7 @@ silk/arm/SigProc_FIX_armv5e.h \ silk/arm/NSQ_neon.h \ silk/fixed/main_FIX.h \ silk/fixed/structs_FIX.h \ +silk/fixed/arm/warped_autocorrelation_FIX_arm.h \ silk/fixed/mips/noise_shape_analysis_FIX_mipsr1.h \ silk/fixed/mips/prefilter_FIX_mipsr1.h \ silk/fixed/mips/warped_autocorrelation_FIX_mipsr1.h \ diff --git a/silk_sources.mk b/silk_sources.mk index 7229ee3..5f9551b 100644 --- a/silk_sources.mk +++ b/silk_sources.mk @@ -117,6 +117,9 @@ SILK_SOURCES_FIXED_SSE4_1 = silk/fixed/x86/vector_ops_FIX_sse.c \ silk/fixed/x86/burg_modified_FIX_sse.c \ silk/fixed/x86/prefilter_FIX_sse.c +SILK_SOURCES_FIXED_ARM_NEON_INTR = \ +silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c + SILK_SOURCES_FLOAT = \ silk/float/apply_sine_window_FLP.c \ silk/float/corrMatrix_FLP.c \ diff --git a/tests/test_unit_optimization.c b/tests/test_unit_optimization.c index 7eeab38..b5c25d9 100644 --- a/tests/test_unit_optimization.c +++ b/tests/test_unit_optimization.c @@ -40,6 +40,7 @@ #ifdef FIXED_POINT # include "celt/tests/test_unit_optimization_lpc.c" +# include "silk/tests/test_unit_optimization_warped_autocorrelation.c" #endif @@ -56,6 +57,7 @@ int main(void) printf("\n--------------------------- Testing optimization ---------------------------\n"); #ifdef FIXED_POINT result |= test_fir(arch); + result |= test_warped_autocorrelation(arch); #endif /* FIXED_POINT */ } return result; -- 2.8.0.rc3.226.g39d4020
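
A note on the fixed-point bookkeeping used throughout the patch above: input_QS/state_QS are kept in Q(QS) while corr_QC accumulates in Q(QC), so each product of two QS values lands in Q(2*QS) and is shifted right by 2 * QS - QC before accumulation (hence the assert that 2 * QS - QC >= 0). The snippet below is only an illustrative standalone sketch of that identity, assuming the QC = 10 and QS = 14 definitions added to main_FIX.h; the helper name is hypothetical and is not part of the patch.

#include <stdint.h>
#include <stdio.h>

#define QC 10  /* Q-format of the correlation accumulator, as defined in main_FIX.h */
#define QS 14  /* Q-format of the warped state/input, as defined in main_FIX.h */

/* Illustrative helper: accumulate one product into a Q(QC) correlation term,
   mirroring silk_RSHIFT64( silk_SMULL( tmp1_QS, input_QS ), 2 * QS - QC ). */
static int64_t corr_accumulate(int64_t corr_QC_acc, int32_t a_QS, int32_t b_QS)
{
    int64_t prod_Q2S = (int64_t)a_QS * (int64_t)b_QS; /* Q(2*QS) = Q28 */
    return corr_QC_acc + (prod_Q2S >> (2 * QS - QC)); /* back to Q(QC) = Q10 */
}

int main(void)
{
    /* 0.5 in Q14 is 1 << 13; 0.5 * 0.5 = 0.25, which is 256 in Q10. */
    int32_t half_QS = 1 << (QS - 1);
    int64_t corr_QC_acc = corr_accumulate(0, half_QS, half_QS);
    printf("corr_QC = %lld (expected 256, i.e. 0.25 in Q10)\n", (long long)corr_QC_acc);
    return 0;
}

The same shift also explains the final scaling step in both the C and NEON versions: lsh is chosen from silk_CLZ64(corr_QC[order]) so that the Q(QC) accumulators fit back into 32-bit corr[] values.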
Linfeng Zhang
2016-Jul-14 00:49 UTC
[opus] [PATCH 4/5] Refactor silk_warped_autocorrelation_FIX_neon()
Clean the code by defining macros. --- .../arm/warped_autocorrelation_FIX_neon_intr.c | 637 ++++++++++----------- 1 file changed, 287 insertions(+), 350 deletions(-) diff --git a/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c b/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c index 80dd949..6071445 100644 --- a/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c +++ b/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c @@ -40,7 +40,6 @@ #endif #include <stdio.h> - #include <arm_neon.h> #include "stack_alloc.h" #include "main_FIX.h" @@ -49,6 +48,190 @@ #define NUM_PARALLEL_INPUTS 8 +#define vget_all(x) (x) + +/* Calculate 1 or 2 elements of corr_QC and tmp1_QS in prolog. */ +#define CORRELATION_PROLOG_1_OR_2( \ + corr_QC, /* I/O corr_QC buffer. Updated 1 or 2 elements. */ \ + state_QS, /* I state_QS buffer. */ \ + offset, /* I The address offset of corr_QC and state_QS. */ \ + input_QS0_s32x4, /* I Input_QS elements 0 to 3. */ \ + warping_Q16_s32x2, /* I Warping coefficient in all vector lanes. */ \ + tmp1_QS_s32x2, /* I/O Either 1 or 2 elements of tmp1_QS. */ \ + tmp2_QS_s32x2, /* I Either 1 or 2 elements of tmp2_QS. */ \ + int64xX_t, /* Either int64x1_t or int64x2_t. */ \ + vget_X, /* Either vget_low_s64 or vget_all. */ \ + vld1X_s64, /* Either vld1_s64 or vld1q_s64. */ \ + vst1X_s64, /* Either vst1_s64 or vst1q_s64. */ \ + vaddX_s64 /* Either vadd_s64 or vaddq_s64. */ \ +) \ +{ \ + int64xX_t corr_QC_s64xX; \ + int64x2_t t_s64x2; \ + corr_QC_s64xX = (vld1X_s64)(corr_QC + (offset)); \ + t_s64x2 = vmull_s32(tmp1_QS_s32x2, vget_low_s32(input_QS0_s32x4)); \ + t_s64x2 = vshrq_n_s64(t_s64x2, 2 * QS - QC); \ + corr_QC_s64xX = (vaddX_s64)(corr_QC_s64xX, vget_X(t_s64x2)); \ + (vst1X_s64)(corr_QC + (offset), corr_QC_s64xX); \ + tmp1_QS_s32x2 = vsub_s32(vld1_s32(state_QS + (offset) - 1), tmp1_QS_s32x2); \ + t_s64x2 = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32x2); \ + tmp1_QS_s32x2 = vshrn_n_s64(t_s64x2, 16); \ + tmp1_QS_s32x2 = vadd_s32(tmp1_QS_s32x2, tmp2_QS_s32x2); \ +} + +/* Calculate 3 or 4 elements of corr_QC, state_QS and tmp1_QS in prolog. */ +#define CORRELATION_PROLOG_3_OR_4( \ + corr_QC, /* I/O corr_QC buffer. Updated 3 or 4 elements. */ \ + state_QS, /* I/O state_QS buffer. Updated 4 elements. */ \ + offset, /* I The address offset of corr_QC and state_QS. */ \ + input_QS0_s32x4, /* I Input_QS elements 0 to 3. */ \ + warping_Q16_s32x2, /* I Warping coefficient in all vector lanes. */ \ + tmp1_QS0_s32x4, /* O Updated 3 or 4 elements of tmp1_QS. */ \ + int64xX_t, /* Either int64x1_t or int64x2_t. */ \ + vget_X, /* Either vget_low_s64 or vget_all. */ \ + vld1X_s64, /* Either vld1_s64 or vld1q_s64. */ \ + vst1X_s64, /* Either vst1_s64 or vst1q_s64. */ \ + vaddX_s64 /* Either vadd_s64 or vaddq_s64. 
*/ \ +) \ +{ \ + int32x4_t tmp2_QS_s32x4; \ + int64x2_t corr_QC0_s64x2, t0_s64x2, t1_s64x2; \ + int64xX_t corr_QC_s64xX; \ + tmp2_QS_s32x4 = vld1q_s32(state_QS + (offset)); \ + vst1q_s32(state_QS + (offset), tmp1_QS0_s32x4); \ + corr_QC0_s64x2 = vld1q_s64 (corr_QC + (offset)); \ + corr_QC_s64xX = (vld1X_s64)(corr_QC + (offset) + 2); \ + t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4)); \ + t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4)); \ + t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC); \ + t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC); \ + corr_QC0_s64x2 = vaddq_s64 (corr_QC0_s64x2, t0_s64x2); \ + corr_QC_s64xX = (vaddX_s64)(corr_QC_s64xX, vget_X(t1_s64x2)); \ + vst1q_s64 (corr_QC + (offset), corr_QC0_s64x2); \ + (vst1X_s64)(corr_QC + (offset) + 2, corr_QC_s64xX); \ + tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + (offset) - 1), tmp1_QS0_s32x4); \ + t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32x2); \ + t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32x2); \ + tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16)); \ + tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS_s32x4); \ +} + +/* Calculate 4 elements of corr_QC, state_QS and tmp1_QS in prolog and kernel loop. */ +#define CORRELATION_4(offset) CORRELATION_PROLOG_3_OR_4(corr_QC, state_QS, offset, input_QS0_s32x4, warping_Q16_s32x2, tmp1_QS0_s32x4, int64x2_t, vget_all, vld1q_s64, vst1q_s64, vaddq_s64) + +/* Calculate 3 or 4 elements of corr_QC and tmp1_QS. */ +#define CORRELATION_NEXT_3_OR_4( \ + corr_QC, /* I/O corr_QC buffer. Updated 3 or 4 elements. */ \ + state_QS, /* I state_QS buffer. */ \ + offset, /* I The address offset of corr_QC and state_QS. */ \ + input_QS1_s32x4, /* I 4 elements of input_QS. */ \ + tmp1_QS1_s32x4, /* I/O Either 3 or 4 elements of tmp1_QS. */ \ + tmp2_QS1_s32x4, /* I Either 3 or 4 elements of tmp2_QS. */ \ + warping_Q16_s32x2, /* I Warping coefficient in all vector lanes. */ \ + int64xX_t, /* Either int64x1_t or int64x2_t. */ \ + vget_X, /* Either vget_low_s64 or vget_all. */ \ + vld1X_s64, /* Either vld1_s64 or vld1q_s64. */ \ + vst1X_s64, /* Either vst1_s64 or vst1q_s64. */ \ + vaddX_s64 /* Either vadd_s64 or vaddq_s64. */ \ +) \ +{ \ + int64x2_t corr_QC0_s64x2, t0_s64x2, t1_s64x2; \ + int64xX_t corr_QC_s64xX; \ + corr_QC0_s64x2 = vld1q_s64 (corr_QC + (offset)); \ + corr_QC_s64xX = (vld1X_s64)(corr_QC + (offset) + 2); \ + t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4)); \ + t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4)); \ + t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC); \ + t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC); \ + corr_QC0_s64x2 = vaddq_s64 (corr_QC0_s64x2, t0_s64x2); \ + corr_QC_s64xX = (vaddX_s64)(corr_QC_s64xX, vget_X(t1_s64x2)); \ + vst1q_s64 (corr_QC + (offset), corr_QC0_s64x2); \ + (vst1X_s64)(corr_QC + (offset) + 2, corr_QC_s64xX); \ + tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + (offset) - 1), tmp1_QS1_s32x4); \ + t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32x2); \ + t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32x2); \ + tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16)); \ + tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4); \ +} + +/* Calculate 1 or 2 elements of corr_QC and tmp1_QS. */ +#define CORRELATION_EXTRA_1_OR_2( \ + corr_QC, /* I/O corr_QC buffer. Updated 1 or 2 elements. 
*/ \ + state_QS, /* I state_QS buffer. */ \ + offset, /* I The address offset of corr_QC and state_QS. */ \ + input_QS_s32x2, /* I 2 elements of input_QS. */ \ + warping_Q16_s32x2, /* I Warping coefficient in all vector lanes. */ \ + tmp1_QS_s32x2X, /* I Either tmp1_QS_s32x2 or high half of tmp1_QS0_s32x4, with 1 or 2 elements of tmp1_QS. */ \ + tmp2_QS_s32x2, /* I Either 1 or 2 elements of tmp2_QS. */ \ + tmp1_QS_s32x2, /* O Updated 1 or 2 elements of tmp1_QS. */ \ + int64xX_t, /* Either int64x1_t or int64x2_t. */ \ + vget_X, /* Either vget_low_s64 or vget_all. */ \ + vld1X_s64, /* Either vld1_s64 or vld1q_s64. */ \ + vst1X_s64, /* Either vst1_s64 or vst1q_s64. */ \ + vaddX_s64 /* Either vadd_s64 or vaddq_s64. */ \ +) \ +{ \ + int64xX_t corr_QC_s64xX; \ + int64x2_t t_s64x2; \ + corr_QC_s64xX = (vld1X_s64)(corr_QC + (offset)); \ + t_s64x2 = vmull_s32(tmp1_QS_s32x2X, input_QS_s32x2); \ + t_s64x2 = vshrq_n_s64(t_s64x2, 2 * QS - QC); \ + corr_QC_s64xX = (vaddX_s64)(corr_QC_s64xX, vget_X(t_s64x2)); \ + (vst1X_s64)(corr_QC + (offset), corr_QC_s64xX); \ + tmp1_QS_s32x2 = vsub_s32(vld1_s32(state_QS + (offset) - 1), tmp1_QS_s32x2X); \ + t_s64x2 = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32x2); \ + tmp1_QS_s32x2 = vshrn_n_s64(t_s64x2, 16); \ + tmp1_QS_s32x2 = vadd_s32(tmp1_QS_s32x2, tmp2_QS_s32x2); \ +} + +/* Calculate 1 element of corr_QC. */ +#define CORRELATION_EPILOG_1( \ + corr_QC, /* I/O corr_QC buffer. Updated 1 element. */ \ + input_QS0_s32x4, /* I 4 elements of input_QS. */ \ + tmp1_QS_s32xX, /* I Either tmp1_QS_s32x2 or low half of tmp1_QS0_s32x4, with 1 or 2 elements of tmp1_QS. */ \ + vget_X /* The splitting instruction, either vget_low_s32 or vget_high_s32. */ \ +) \ +{ \ + int64x1_t corr_s64x1; \ + int64x2_t t_s64x2; \ + corr_s64x1 = vld1_s64(corr_QC); \ + t_s64x2 = vmull_s32(tmp1_QS_s32xX, (vget_X)(input_QS0_s32x4)); \ + t_s64x2 = vshrq_n_s64(t_s64x2, 2 * QS - QC); \ + corr_s64x1 = vadd_s64(corr_s64x1, vget_high_s64(t_s64x2)); \ + vst1_s64(corr_QC, corr_s64x1); \ +} + +/* Calculate 4 elements of corr_QC, state_QS and tmp1_QS in prolog. */ +#define CORRELATION_EPILOG_4( \ + corr_QC, /* I/O corr_QC buffer. Updated 4 elements. */ \ + state_QS, /* I/O state_QS buffer. Updated 4 elements. */ \ + offset, /* I The address offset of corr_QC and state_QS. */ \ + input_QS1_s32x4, /* I Input_QS elements 4 to 7. */ \ + warping_Q16_s32x2, /* I Warping coefficient in all vector lanes. */ \ + tmp1_QS1_s32x4 /* I/O 4 elements of tmp1_QS. 
*/ \ + ) \ + { \ + int32x4_t tmp2_QS_s32x4; \ + int64x2_t corr_QC0_s64x2, corr_QC1_s64x2, t0_s64x2, t1_s64x2; \ + tmp2_QS_s32x4 = vld1q_s32(state_QS + (offset)); \ + vst1q_s32(state_QS + (offset), tmp1_QS1_s32x4); \ + corr_QC0_s64x2 = vld1q_s64(corr_QC + (offset)); \ + corr_QC1_s64x2 = vld1q_s64(corr_QC + (offset) + 2); \ + t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4)); \ + t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4)); \ + t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC); \ + t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC); \ + corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2); \ + corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2); \ + vst1q_s64(corr_QC + (offset), corr_QC0_s64x2); \ + vst1q_s64(corr_QC + (offset) + 2, corr_QC1_s64x2); \ + tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + (offset) - 1), tmp1_QS1_s32x4); \ + t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32x2); \ + t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32x2); \ + tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16)); \ + tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS_s32x4); \ +} + void silk_warped_autocorrelation_FIX_neon( opus_int32 *corr, /* O Result [order + 1] */ opus_int *scale, /* O Scaling of the correlation vector */ @@ -61,9 +244,10 @@ void silk_warped_autocorrelation_FIX_neon( opus_int n = 0, i, lsh; opus_int32 tmp1_QS[NUM_PARALLEL_INPUTS], tmp2_QS[NUM_PARALLEL_INPUTS]; opus_int32 input_QS[NUM_PARALLEL_INPUTS]; - opus_int32 state_QS_tmp[ MAX_SHAPE_LPC_ORDER + 3 ] = { 0 }; // Create two extra entries. - opus_int32 *state_QS = state_QS_tmp + 1; // Accessed one extra head entry in the last prolog and the last inner loop, and one extra end entry in the last prolog. + opus_int32 state_QS_tmp[ MAX_SHAPE_LPC_ORDER + 3 ] = { 0 }; // Create two extra elements. + opus_int32 *state_QS = state_QS_tmp + 1; // Accessed one extra head element in the last prolog and the last inner loop, and one extra end element in the last prolog. opus_int64 corr_QC[ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 }; + int64x2_t lsh_s64x2; /* Order must be even */ silk_assert( ( order & 1 ) == 0 ); @@ -71,387 +255,138 @@ void silk_warped_autocorrelation_FIX_neon( /* Loop over samples */ if( order >= NUM_PARALLEL_INPUTS - 2 ) { - const int32x2_t warping_Q16_s32 = vdup_n_s32(warping_Q16); + const int32x2_t warping_Q16_s32x2 = vdup_n_s32(warping_Q16); for( ; n < (length - NUM_PARALLEL_INPUTS + 1); n += NUM_PARALLEL_INPUTS ) { - int32x4_t tmp1_QS0_s32x4, tmp1_QS1_s32x4, tmp2_QS0_s32x4, tmp2_QS1_s32x4; - int64x2_t corr_QC0_s64x2, corr_QC1_s64x2, corr_QC2_s64x2, corr_QC3_s64x2; - int64x2_t t0_s64x2, t1_s64x2, t2_s64x2, t3_s64x2; + int32x4_t tmp1_QS0_s32x4, tmp1_QS1_s32x4, tmp2_QS1_s32x4; int32x2_t tmp1_QS_s32x2, tmp2_QS_s32x2; - int64x1_t corr_QC_s64x1; const int32x4_t input_QS0_s32x4 = vshll_n_s16(vld1_s16(input + n), QS); const int32x4_t input_QS1_s32x4 = vshll_n_s16(vld1_s16(input + n + 4), QS); - vst1q_s32(tmp1_QS, input_QS0_s32x4); - vst1q_s32(tmp1_QS + 4, input_QS1_s32x4); + vst1q_s32(tmp1_QS, input_QS0_s32x4); + vst1q_s32(tmp1_QS + 4, input_QS1_s32x4); /* Loop over allpass sections */ /* -------------------- prolog 0 -------------------- */ - - tmp1_QS_s32x2 = vget_low_s32(input_QS0_s32x4); - tmp2_QS_s32x2 = vld1_s32(state_QS + order); // Accessed one extra end entry. + tmp1_QS_s32x2 = vget_low_s32(input_QS0_s32x4); + tmp2_QS_s32x2 = vld1_s32(state_QS + order); // Accessed one extra end element. 
vst1_lane_s32(state_QS + order, tmp1_QS_s32x2, 0); - corr_QC_s64x1 = vld1_s64(corr_QC + order); - t0_s64x2 = vmull_s32(tmp1_QS_s32x2, vget_low_s32(input_QS0_s32x4)); - t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC); - corr_QC_s64x1 = vadd_s64(corr_QC_s64x1, vget_low_s64(t0_s64x2)); - vst1_s64(corr_QC + order, corr_QC_s64x1); - tmp1_QS_s32x2 = vsub_s32(vld1_s32(state_QS + order - 1), tmp1_QS_s32x2); - t0_s64x2 = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32); - tmp1_QS_s32x2 = vshrn_n_s64(t0_s64x2, 16); - tmp1_QS_s32x2 = vadd_s32(tmp1_QS_s32x2, tmp2_QS_s32x2); - tmp1_QS_s32x2 = vld1_lane_s32(tmp1_QS + 1, tmp1_QS_s32x2, 1); + CORRELATION_PROLOG_1_OR_2(corr_QC, state_QS, order - 0, input_QS0_s32x4, warping_Q16_s32x2, tmp1_QS_s32x2, tmp2_QS_s32x2, int64x1_t, vget_low_s64, vld1_s64, vst1_s64, vadd_s64) + tmp1_QS_s32x2 = vld1_lane_s32(tmp1_QS + 1, tmp1_QS_s32x2, 1); /* -------------------- prolog 1 -------------------- */ - - tmp2_QS_s32x2 = vld1_s32(state_QS + order - 1); + tmp2_QS_s32x2 = vld1_s32(state_QS + order - 1); vst1_s32(state_QS + order - 1, tmp1_QS_s32x2); - corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 1); - t0_s64x2 = vmull_s32(tmp1_QS_s32x2, vget_low_s32(input_QS0_s32x4)); - t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC); - corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2); - vst1q_s64(corr_QC + order - 1, corr_QC0_s64x2); - tmp1_QS_s32x2 = vsub_s32(vld1_s32(state_QS + order - 2), tmp1_QS_s32x2); - t0_s64x2 = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32); - tmp1_QS_s32x2 = vshrn_n_s64(t0_s64x2, 16); - tmp1_QS_s32x2 = vadd_s32(tmp1_QS_s32x2, tmp2_QS_s32x2); + CORRELATION_PROLOG_1_OR_2(corr_QC, state_QS, order - 1, input_QS0_s32x4, warping_Q16_s32x2, tmp1_QS_s32x2, tmp2_QS_s32x2, int64x2_t, vget_all, vld1q_s64, vst1q_s64, vaddq_s64) tmp1_QS0_s32x4 = vcombine_s32(tmp1_QS_s32x2, vget_high_s32(input_QS0_s32x4)); /* -------------------- prolog 2 -------------------- */ - - tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 2); // Accessed one extra end entry. - vst1q_s32(state_QS + order - 2, tmp1_QS0_s32x4); // Saving one extra entry is OK. - corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 2); - corr_QC_s64x1 = vld1_s64 (corr_QC + order); - t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4)); - t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4)); - t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC); - t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC); - corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2); - corr_QC_s64x1 = vadd_s64 (corr_QC_s64x1, vget_low_s64(t1_s64x2)); - vst1q_s64(corr_QC + order - 2, corr_QC0_s64x2); - vst1_s64 (corr_QC + order, corr_QC_s64x1); - tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 3), tmp1_QS0_s32x4); - t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32); - t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32); - tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16)); - tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4); + // Accessed one extra end element of state_QS. + // Saving one extra element of state_QS is OK. 
+ CORRELATION_PROLOG_3_OR_4(corr_QC, state_QS, order - 2, input_QS0_s32x4, warping_Q16_s32x2, tmp1_QS0_s32x4, int64x1_t, vget_low_s64, vld1_s64, vst1_s64, vadd_s64) tmp1_QS0_s32x4 = vld1q_lane_s32(tmp1_QS + 3, tmp1_QS0_s32x4, 3); /* -------------------- prolog 3 -------------------- */ - - tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 3); - vst1q_s32(state_QS + order - 3, tmp1_QS0_s32x4); - corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 3); - corr_QC1_s64x2 = vld1q_s64(corr_QC + order - 1); - t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4)); - t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4)); - t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC); - t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC); - corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2); - corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2); - vst1q_s64(corr_QC + order - 3, corr_QC0_s64x2); - vst1q_s64(corr_QC + order - 1, corr_QC1_s64x2); - tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 4), tmp1_QS0_s32x4); - t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32); - t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32); - tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16)); - tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4); - tmp1_QS_s32x2 = vget_low_s32(input_QS1_s32x4); + CORRELATION_4(order - 3) + tmp1_QS_s32x2 = vget_low_s32(input_QS1_s32x4); /* -------------------- prolog 4 -------------------- */ - - tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 4); - tmp2_QS_s32x2 = vld1_lane_s32(state_QS + order, tmp2_QS_s32x2, 0); - vst1q_s32(state_QS + order - 4, tmp1_QS0_s32x4); + tmp2_QS_s32x2 = vld1_lane_s32(state_QS + order, tmp2_QS_s32x2, 0); vst1_lane_s32(state_QS + order, tmp1_QS_s32x2, 0); - corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 4); - corr_QC1_s64x2 = vld1q_s64(corr_QC + order - 2); - corr_QC_s64x1 = vld1_s64 (corr_QC + order); - t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4)); - t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4)); - t2_s64x2 = vmull_s32(tmp1_QS_s32x2, vget_low_s32 (input_QS1_s32x4)); - t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC); - t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC); - t2_s64x2 = vshrq_n_s64(t2_s64x2, 2 * QS - QC); - corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2); - corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2); - corr_QC_s64x1 = vadd_s64 (corr_QC_s64x1, vget_low_s64(t2_s64x2)); - vst1q_s64(corr_QC + order - 4, corr_QC0_s64x2); - vst1q_s64(corr_QC + order - 2, corr_QC1_s64x2); - vst1_s64 (corr_QC + order, corr_QC_s64x1); - tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 5), tmp1_QS0_s32x4); - tmp1_QS_s32x2 = vsub_s32 (vld1_s32 (state_QS + order - 1), tmp1_QS_s32x2); - t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32); - t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32); - t2_s64x2 = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32); - tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16)); - tmp1_QS_s32x2 = vshrn_n_s64(t2_s64x2, 16); - tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4); - tmp1_QS_s32x2 = vadd_s32 (tmp1_QS_s32x2, tmp2_QS_s32x2); - tmp1_QS_s32x2 = vld1_lane_s32(tmp1_QS + 5, tmp1_QS_s32x2, 1); + CORRELATION_4(order - 4) + CORRELATION_EXTRA_1_OR_2(corr_QC, state_QS, order, vget_low_s32(input_QS1_s32x4), warping_Q16_s32x2, tmp1_QS_s32x2, tmp2_QS_s32x2, tmp1_QS_s32x2, int64x1_t, 
vget_low_s64, vld1_s64, vst1_s64, vadd_s64) + tmp1_QS_s32x2 = vld1_lane_s32(tmp1_QS + 5, tmp1_QS_s32x2, 1); /* -------------------- prolog 5 -------------------- */ - - tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 5); - tmp2_QS_s32x2 = vld1_s32 (state_QS + order - 1); - vst1q_s32(state_QS + order - 5, tmp1_QS0_s32x4); - vst1_s32 (state_QS + order - 1, tmp1_QS_s32x2); - corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 5); - corr_QC1_s64x2 = vld1q_s64(corr_QC + order - 3); - corr_QC2_s64x2 = vld1q_s64(corr_QC + order - 1); - t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4)); - t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4)); - t2_s64x2 = vmull_s32(tmp1_QS_s32x2, vget_low_s32 (input_QS1_s32x4)); - t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC); - t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC); - t2_s64x2 = vshrq_n_s64(t2_s64x2, 2 * QS - QC); - corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2); - corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2); - corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2); - vst1q_s64(corr_QC + order - 5, corr_QC0_s64x2); - vst1q_s64(corr_QC + order - 3, corr_QC1_s64x2); - vst1q_s64(corr_QC + order - 1, corr_QC2_s64x2); - tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 6), tmp1_QS0_s32x4); - tmp1_QS_s32x2 = vsub_s32 (vld1_s32 (state_QS + order - 2), tmp1_QS_s32x2); - t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32); - t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32); - t2_s64x2 = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32); - tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16)); - tmp1_QS_s32x2 = vshrn_n_s64(t2_s64x2, 16); - tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4); - tmp1_QS_s32x2 = vadd_s32 (tmp1_QS_s32x2, tmp2_QS_s32x2); + tmp2_QS_s32x2 = vld1_s32(state_QS + order - 1); + vst1_s32(state_QS + order - 1, tmp1_QS_s32x2); + CORRELATION_4(order - 5) + CORRELATION_EXTRA_1_OR_2(corr_QC, state_QS, order - 1, vget_low_s32(input_QS1_s32x4), warping_Q16_s32x2, tmp1_QS_s32x2, tmp2_QS_s32x2, tmp1_QS_s32x2, int64x2_t, vget_all, vld1q_s64, vst1q_s64, vaddq_s64) tmp1_QS1_s32x4 = vcombine_s32(tmp1_QS_s32x2, vget_high_s32(input_QS1_s32x4)); /* -------------------- prolog 6 -------------------- */ - - tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 6); - tmp2_QS1_s32x4 = vld1q_s32(state_QS + order - 2); // Accessed one extra end entry. - vst1q_s32(state_QS + order - 6, tmp1_QS0_s32x4); - vst1q_s32(state_QS + order - 2, tmp1_QS1_s32x4); // Saving one extra entry is OK. 
- corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 6); - corr_QC1_s64x2 = vld1q_s64(corr_QC + order - 4); - corr_QC2_s64x2 = vld1q_s64(corr_QC + order - 2); - corr_QC_s64x1 = vld1_s64 (corr_QC + order); - t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4)); - t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4)); - t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4)); - t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4)); - t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC); - t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC); - t2_s64x2 = vshrq_n_s64(t2_s64x2, 2 * QS - QC); - t3_s64x2 = vshrq_n_s64(t3_s64x2, 2 * QS - QC); - corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2); - corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2); - corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2); - corr_QC_s64x1 = vadd_s64 (corr_QC_s64x1, vget_low_s64(t3_s64x2)); - vst1q_s64(corr_QC + order - 6, corr_QC0_s64x2); - vst1q_s64(corr_QC + order - 4, corr_QC1_s64x2); - vst1q_s64(corr_QC + order - 2, corr_QC2_s64x2); - vst1_s64 (corr_QC + order, corr_QC_s64x1); - tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 7), tmp1_QS0_s32x4); // Accessed one extra head entry when order is 6. - tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 3), tmp1_QS1_s32x4); - t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32); - t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32); - t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32); - t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32); - tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16)); - tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16)); - tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4); - tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4); + tmp2_QS1_s32x4 = vld1q_s32(state_QS + order - 2); // Accessed one extra end element of state_QS. + vst1q_s32(state_QS + order - 2, tmp1_QS1_s32x4); // Saving one extra element of state_QS is OK. + // Accessed one extra head element when order is 6. + CORRELATION_4(order - 6) + CORRELATION_NEXT_3_OR_4(corr_QC, state_QS, order - 2, input_QS1_s32x4, tmp1_QS1_s32x4, tmp2_QS1_s32x4, warping_Q16_s32x2, int64x1_t, vget_low_s64, vld1_s64, vst1_s64, vadd_s64) tmp1_QS1_s32x4 = vld1q_lane_s32(tmp1_QS + 7, tmp1_QS1_s32x4, 3); /* -------------------- kernel loop -------------------- */ - for( i = 0; i < order - NUM_PARALLEL_INPUTS + 2; i++ ) { - /* Output of allpass section */ - tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS + 1); + /* Output of allpass section */ + // Accessed one extra head element of state_QS in the last loop. 
tmp2_QS1_s32x4 = vld1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS + 5); - vst1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS + 1, tmp1_QS0_s32x4); vst1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS + 5, tmp1_QS1_s32x4); - corr_QC0_s64x2 = vld1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 1); - corr_QC1_s64x2 = vld1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 3); - corr_QC2_s64x2 = vld1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 5); - corr_QC3_s64x2 = vld1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 7); - t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4)); - t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4)); - t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4)); - t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4)); - t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC); - t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC); - t2_s64x2 = vshrq_n_s64(t2_s64x2, 2 * QS - QC); - t3_s64x2 = vshrq_n_s64(t3_s64x2, 2 * QS - QC); - corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2); - corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2); - corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2); - corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2); - vst1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 1, corr_QC0_s64x2); - vst1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 3, corr_QC1_s64x2); - vst1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 5, corr_QC2_s64x2); - vst1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 7, corr_QC3_s64x2); - tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS), tmp1_QS0_s32x4); // Accessed one extra head entry in the last loop. - tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS + 4), tmp1_QS1_s32x4); - t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32); - t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32); - t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32); - t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32); - tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16)); - tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16)); - tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4); - tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4); + CORRELATION_4(order - i - NUM_PARALLEL_INPUTS + 1) + CORRELATION_NEXT_3_OR_4(corr_QC, state_QS, order - i - NUM_PARALLEL_INPUTS + 5, input_QS1_s32x4, tmp1_QS1_s32x4, tmp2_QS1_s32x4, warping_Q16_s32x2, int64x2_t, vget_all, vld1q_s64, vst1q_s64, vaddq_s64) } /* -------------------- epilog 0 -------------------- */ - - tmp2_QS_s32x2 = vld1_s32(state_QS + 1); - tmp2_QS1_s32x4 = vld1q_s32(state_QS + 3); - vst1q_s32(state_QS - 1, tmp1_QS0_s32x4); // Saving one extra entry is OK. 
- vst1q_s32(state_QS + 3, tmp1_QS1_s32x4); - corr_QC_s64x1 = vld1_s64 (corr_QC); - corr_QC1_s64x2 = vld1q_s64(corr_QC + 1); - corr_QC2_s64x2 = vld1q_s64(corr_QC + 3); - corr_QC3_s64x2 = vld1q_s64(corr_QC + 5); - t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4)); - t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4)); - t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4)); - t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4)); - t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC); - t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC); - t2_s64x2 = vshrq_n_s64(t2_s64x2, 2 * QS - QC); - t3_s64x2 = vshrq_n_s64(t3_s64x2, 2 * QS - QC); - corr_QC_s64x1 = vadd_s64 (corr_QC_s64x1, vget_high_s64(t0_s64x2)); - corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2); - corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2); - corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2); - vst1_s64 (corr_QC + 0, corr_QC_s64x1); - vst1q_s64(corr_QC + 1, corr_QC1_s64x2); - vst1q_s64(corr_QC + 3, corr_QC2_s64x2); - vst1q_s64(corr_QC + 5, corr_QC3_s64x2); - tmp1_QS_s32x2 = vsub_s32 (vld1_s32 (state_QS), vget_high_s32(tmp1_QS0_s32x4)); - tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + 2), tmp1_QS1_s32x4); - t1_s64x2 = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32); - t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32); - t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32); - tmp1_QS_s32x2 = vshrn_n_s64(t1_s64x2, 16); - tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16)); - tmp1_QS_s32x2 = vadd_s32 (tmp1_QS_s32x2, tmp2_QS_s32x2); - tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4); + tmp2_QS_s32x2 = vld1_s32(state_QS + 1); + vst1q_s32(state_QS - 1, tmp1_QS0_s32x4); // Saving one extra element is OK. + CORRELATION_EPILOG_1(corr_QC, input_QS0_s32x4, vget_low_s32(tmp1_QS0_s32x4), vget_low_s32) + CORRELATION_EXTRA_1_OR_2(corr_QC, state_QS, 1, vget_high_s32(input_QS0_s32x4), warping_Q16_s32x2, vget_high_s32(tmp1_QS0_s32x4), tmp2_QS_s32x2, tmp1_QS_s32x2, int64x2_t, vget_all, vld1q_s64, vst1q_s64, vaddq_s64) + CORRELATION_EPILOG_4(corr_QC, state_QS, 3, input_QS1_s32x4, warping_Q16_s32x2, tmp1_QS1_s32x4) /* -------------------- epilog 1 -------------------- */ - - tmp2_QS_s32x2 = vld1_s32 (state_QS); - tmp2_QS1_s32x4 = vld1q_s32(state_QS + 2); - vst1_s32 (state_QS, tmp1_QS_s32x2); - vst1q_s32(state_QS + 2, tmp1_QS1_s32x4); - corr_QC1_s64x2 = vld1q_s64(corr_QC + 0); - corr_QC2_s64x2 = vld1q_s64(corr_QC + 2); - corr_QC3_s64x2 = vld1q_s64(corr_QC + 4); - t1_s64x2 = vmull_s32(tmp1_QS_s32x2, vget_high_s32(input_QS0_s32x4)); - t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4)); - t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4)); - t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC); - t2_s64x2 = vshrq_n_s64(t2_s64x2, 2 * QS - QC); - t3_s64x2 = vshrq_n_s64(t3_s64x2, 2 * QS - QC); - corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2); - corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2); - corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2); - vst1q_s64(corr_QC + 0, corr_QC1_s64x2); - vst1q_s64(corr_QC + 2, corr_QC2_s64x2); - vst1q_s64(corr_QC + 4, corr_QC3_s64x2); - tmp1_QS_s32x2 = vsub_s32 (vld1_s32 (state_QS - 1), tmp1_QS_s32x2); // Accessed one extra head entry. 
- tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + 1), tmp1_QS1_s32x4); - t1_s64x2 = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32); - t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32); - t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32); - tmp1_QS_s32x2 = vshrn_n_s64(t1_s64x2, 16); - tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16)); - tmp1_QS_s32x2 = vadd_s32 (tmp1_QS_s32x2, tmp2_QS_s32x2); - tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4); + tmp2_QS_s32x2 = vld1_s32(state_QS); + vst1_s32(state_QS, tmp1_QS_s32x2); + // Accessed one extra head element of state_QS. + CORRELATION_EXTRA_1_OR_2(corr_QC, state_QS, 0, vget_high_s32(input_QS0_s32x4), warping_Q16_s32x2, tmp1_QS_s32x2, tmp2_QS_s32x2, tmp1_QS_s32x2, int64x2_t, vget_all, vld1q_s64, vst1q_s64, vaddq_s64) + CORRELATION_EPILOG_4(corr_QC, state_QS, 2, input_QS1_s32x4, warping_Q16_s32x2, tmp1_QS1_s32x4) /* -------------------- epilog 2 -------------------- */ - - tmp2_QS1_s32x4 = vld1q_s32(state_QS + 1); - vst1_lane_s32(state_QS, tmp1_QS_s32x2, 1); - vst1q_s32 (state_QS + 1, tmp1_QS1_s32x4); - corr_QC_s64x1 = vld1_s64(corr_QC); - corr_QC2_s64x2 = vld1q_s64(corr_QC + 1); - corr_QC3_s64x2 = vld1q_s64(corr_QC + 3); - t1_s64x2 = vmull_s32(tmp1_QS_s32x2, vget_high_s32(input_QS0_s32x4)); - t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4)); - t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4)); - t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC); - t2_s64x2 = vshrq_n_s64(t2_s64x2, 2 * QS - QC); - t3_s64x2 = vshrq_n_s64(t3_s64x2, 2 * QS - QC); - corr_QC_s64x1 = vadd_s64 (corr_QC_s64x1, vget_high_s64(t1_s64x2)); - corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2); - corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2); - vst1_s64 (corr_QC + 0, corr_QC_s64x1); - vst1q_s64(corr_QC + 1, corr_QC2_s64x2); - vst1q_s64(corr_QC + 3, corr_QC3_s64x2); - tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS), tmp1_QS1_s32x4); - t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32); - t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32); - tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16)); - tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4); + vst1_lane_s32(state_QS, tmp1_QS_s32x2, 1); + CORRELATION_EPILOG_1(corr_QC, input_QS0_s32x4, tmp1_QS_s32x2, vget_high_s32) + CORRELATION_EPILOG_4(corr_QC, state_QS, 1, input_QS1_s32x4, warping_Q16_s32x2, tmp1_QS1_s32x4) /* -------------------- epilog 3 -------------------- */ - - tmp2_QS1_s32x4 = vld1q_s32(state_QS); - vst1q_s32(state_QS, tmp1_QS1_s32x4); - corr_QC2_s64x2 = vld1q_s64(corr_QC); - corr_QC3_s64x2 = vld1q_s64(corr_QC + 2); - t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4)); - t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4)); - t2_s64x2 = vshrq_n_s64(t2_s64x2, 2 * QS - QC); - t3_s64x2 = vshrq_n_s64(t3_s64x2, 2 * QS - QC); - corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2); - corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2); - vst1q_s64(corr_QC, corr_QC2_s64x2); - vst1q_s64(corr_QC + 2, corr_QC3_s64x2); - tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS - 1), tmp1_QS1_s32x4); // Accessed one extra head entry. 
- t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32); - t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32); - tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16)); - tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4); - - /* -------------------- epilog 4 -------------------- */ - - corr_QC_s64x1 = vld1_s64 (corr_QC); - corr_QC3_s64x2 = vld1q_s64(corr_QC + 1); - t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4)); - t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4)); - t2_s64x2 = vshrq_n_s64(t2_s64x2, 2 * QS - QC); - t3_s64x2 = vshrq_n_s64(t3_s64x2, 2 * QS - QC); - corr_QC_s64x1 = vadd_s64 (corr_QC_s64x1, vget_high_s64(t2_s64x2)); - corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2); - vst1_s64 (corr_QC, corr_QC_s64x1); - vst1q_s64(corr_QC + 1, corr_QC3_s64x2); - vst1q_s32(tmp1_QS + 4, tmp1_QS1_s32x4); - - tmp2_QS_s32x2 = vld1_s32(state_QS + 1); - tmp1_QS_s32x2 = vsub_s32(vld1_s32(tmp1_QS + 5), vget_high_s32(tmp1_QS1_s32x4)); - t3_s64x2 = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32); - tmp1_QS_s32x2 = vshrn_n_s64(t3_s64x2, 16); - tmp1_QS_s32x2 = vadd_s32(tmp1_QS_s32x2, tmp2_QS_s32x2); - vst1_lane_s32(state_QS + 1, tmp1_QS_s32x2, 1); - - /* -------------------- epilog 5 & 6 -------------------- */ - - vst1_lane_s32(state_QS + 2, vget_high_s32(tmp1_QS1_s32x4), 1); - tmp2_QS_s32x2 = vsub_s32(tmp1_QS_s32x2, vreinterpret_s32_s64(vshr_n_s64(vreinterpret_s64_s32(tmp1_QS_s32x2), 32))); - t3_s64x2 = vmull_s32(tmp2_QS_s32x2, warping_Q16_s32); - tmp2_QS_s32x2 = vshrn_n_s64(t3_s64x2, 16); - tmp2_QS_s32x2 = vadd_s32(vget_high_s32(tmp1_QS1_s32x4), tmp2_QS_s32x2); - vst1_lane_s32(state_QS, tmp2_QS_s32x2, 0); - - corr_QC3_s64x2 = vld1q_s64(corr_QC); - t3_s64x2 = vmull_s32(tmp1_QS_s32x2, vget_high_s32(input_QS1_s32x4)); - t3_s64x2 = vshrq_n_s64(t3_s64x2, 2 * QS - QC); - corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2); - vst1_s64(corr_QC + 1, vget_high_s64(corr_QC3_s64x2)); - t3_s64x2 = vmull_s32(tmp2_QS_s32x2, vreinterpret_s32_s64(vshr_n_s64(vreinterpret_s64_s32(vget_high_s32(input_QS1_s32x4)), 32))); - t3_s64x2 = vshrq_n_s64(t3_s64x2, 2 * QS - QC); - corr_QC_s64x1 = vadd_s64(vget_low_s64(corr_QC3_s64x2), vget_low_s64(t3_s64x2)); - vst1_s64(corr_QC, corr_QC_s64x1); + // Accessed one extra head element of state_QS. 
+ CORRELATION_EPILOG_4(corr_QC, state_QS, 0, input_QS1_s32x4, warping_Q16_s32x2, tmp1_QS1_s32x4) + + { + int64x1_t corr_QC_s64x1; + int64x2_t corr_QC0_s64x2; + int64x2_t t0_s64x2, t1_s64x2; + + /* -------------------- epilog 4 -------------------- */ + corr_QC_s64x1 = vld1_s64 (corr_QC); + corr_QC0_s64x2 = vld1q_s64(corr_QC + 1); + t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4)); + t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4)); + t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC); + t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC); + corr_QC_s64x1 = vadd_s64 (corr_QC_s64x1, vget_high_s64(t0_s64x2)); + corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t1_s64x2); + vst1_s64 (corr_QC, corr_QC_s64x1); + vst1q_s64(corr_QC + 1, corr_QC0_s64x2); + vst1q_s32(tmp1_QS + 4, tmp1_QS1_s32x4); + + tmp2_QS_s32x2 = vld1_s32(state_QS + 1); + tmp1_QS_s32x2 = vsub_s32(vld1_s32(tmp1_QS + 5), vget_high_s32(tmp1_QS1_s32x4)); + t1_s64x2 = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32x2); + tmp1_QS_s32x2 = vshrn_n_s64(t1_s64x2, 16); + tmp1_QS_s32x2 = vadd_s32(tmp1_QS_s32x2, tmp2_QS_s32x2); + vst1_lane_s32(state_QS + 1, tmp1_QS_s32x2, 1); + + /* -------------------- epilog 5 & 6 -------------------- */ + vst1_lane_s32(state_QS + 2, vget_high_s32(tmp1_QS1_s32x4), 1); + tmp2_QS_s32x2 = vsub_s32(tmp1_QS_s32x2, vreinterpret_s32_s64(vshr_n_s64(vreinterpret_s64_s32(tmp1_QS_s32x2), 32))); + t1_s64x2 = vmull_s32(tmp2_QS_s32x2, warping_Q16_s32x2); + tmp2_QS_s32x2 = vshrn_n_s64(t1_s64x2, 16); + tmp2_QS_s32x2 = vadd_s32(vget_high_s32(tmp1_QS1_s32x4), tmp2_QS_s32x2); + vst1_lane_s32(state_QS, tmp2_QS_s32x2, 0); + + corr_QC0_s64x2 = vld1q_s64(corr_QC); + t1_s64x2 = vmull_s32(tmp1_QS_s32x2, vget_high_s32(input_QS1_s32x4)); + t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC); + corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t1_s64x2); + vst1_s64(corr_QC + 1, vget_high_s64(corr_QC0_s64x2)); + t1_s64x2 = vmull_s32(tmp2_QS_s32x2, vreinterpret_s32_s64(vshr_n_s64(vreinterpret_s64_s32(vget_high_s32(input_QS1_s32x4)), 32))); + t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC); + corr_QC_s64x1 = vadd_s64(vget_low_s64(corr_QC0_s64x2), vget_low_s64(t1_s64x2)); + vst1_s64(corr_QC, corr_QC_s64x1); + } } } @@ -470,14 +405,16 @@ void silk_warped_autocorrelation_FIX_neon( lsh = silk_LIMIT( lsh, -12 - QC, 30 - QC ); *scale = -( QC + lsh ); silk_assert( *scale >= -30 && *scale <= 12 ); - const int64x2_t lsh_s64x2 = vdupq_n_s64(lsh); + lsh_s64x2 = vdupq_n_s64(lsh); for( i = 0; i <= order - 3; i += 4 ) { - int64x2_t corr_QC0_s64x2 = vld1q_s64(corr_QC + i); - int64x2_t corr_QC1_s64x2 = vld1q_s64(corr_QC + i + 2); - corr_QC0_s64x2 = vshlq_s64(corr_QC0_s64x2, lsh_s64x2); - corr_QC1_s64x2 = vshlq_s64(corr_QC1_s64x2, lsh_s64x2); - int32x4_t corr_s32x4 = vcombine_s32(vmovn_s64(corr_QC1_s64x2), vmovn_s64(corr_QC0_s64x2)); - corr_s32x4 = vrev64q_s32(corr_s32x4); + int32x4_t corr_s32x4; + int64x2_t corr_QC0_s64x2, corr_QC1_s64x2; + corr_QC0_s64x2 = vld1q_s64(corr_QC + i); + corr_QC1_s64x2 = vld1q_s64(corr_QC + i + 2); + corr_QC0_s64x2 = vshlq_s64(corr_QC0_s64x2, lsh_s64x2); + corr_QC1_s64x2 = vshlq_s64(corr_QC1_s64x2, lsh_s64x2); + corr_s32x4 = vcombine_s32(vmovn_s64(corr_QC1_s64x2), vmovn_s64(corr_QC0_s64x2)); + corr_s32x4 = vrev64q_s32(corr_s32x4); vst1q_s32(corr + order - i - 3, corr_s32x4); } if( lsh >= 0 ) { -- 2.8.0.rc3.226.g39d4020
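For anyone relating the NEON prolog/kernel/epilog above back to plain C, here is a scalar sketch of the per-sample update that the vector code parallelizes. It is modeled on the reference fixed-point implementation, so treat it as an illustration rather than a verbatim copy of the reference source; the SILK macros (silk_LSHIFT32, silk_SMLAWB, silk_SMULL, silk_RSHIFT64) and the QS/QC constants are assumed to be the ones already used in this file.

    /* Scalar sketch of the warped-autocorrelation update (assumption: mirrors
       the reference C version).  Each input sample is pushed through a chain
       of first-order allpass sections; the correlation accumulators pick up
       (state * current input) at every tap, downshifted from Q(2*QS) to QC. */
    static void warped_autocorr_sketch(
        opus_int64        corr_QC[],     /* IO Correlation accumulators [order + 1] */
        opus_int32        state_QS[],    /* IO Allpass state            [order + 1] */
        const opus_int16 *input,         /* I  Input samples                        */
        opus_int          warping_Q16,   /* I  Warping coefficient, Q16             */
        opus_int          length,        /* I  Number of input samples              */
        opus_int          order          /* I  Correlation order (even)             */
    )
    {
        opus_int   n, i;
        opus_int32 tmp1_QS, tmp2_QS;
        for( n = 0; n < length; n++ ) {
            tmp1_QS = silk_LSHIFT32( (opus_int32)input[ n ], QS );
            for( i = 0; i < order; i += 2 ) {
                /* Output of allpass section */
                tmp2_QS           = silk_SMLAWB( state_QS[ i ], state_QS[ i + 1 ] - tmp1_QS, warping_Q16 );
                state_QS[ i ]     = tmp1_QS;
                corr_QC[ i ]     += silk_RSHIFT64( silk_SMULL( tmp1_QS, state_QS[ 0 ] ), 2 * QS - QC );
                /* Output of allpass section */
                tmp1_QS           = silk_SMLAWB( state_QS[ i + 1 ], state_QS[ i + 2 ] - tmp2_QS, warping_Q16 );
                state_QS[ i + 1 ] = tmp2_QS;
                corr_QC[ i + 1 ] += silk_RSHIFT64( silk_SMULL( tmp2_QS, state_QS[ 0 ] ), 2 * QS - QC );
            }
            state_QS[ order ]  = tmp1_QS;
            corr_QC[ order ]  += silk_RSHIFT64( silk_SMULL( tmp1_QS, state_QS[ 0 ] ), 2 * QS - QC );
        }
    }

The NEON version works on NUM_PARALLEL_INPUTS input samples at a time, which is why it needs the prolog and epilog special cases at the ends of the state_QS and corr_QC buffers.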
Linfeng Zhang
2016-Jul-14 00:49 UTC
[opus] [PATCH 5/5] Optimize silk/LPC_inv_pred_gain.c for ARM NEON
Optimized LPC_inverse_pred_gain_QA(), silk_LPC_inverse_pred_gain() and silk_LPC_inverse_pred_gain_Q24() for ARM NEON. Created corresponding unit test. --- silk/CNG.c | 2 +- silk/LPC_inv_pred_gain.c | 18 +- silk/NLSF2A.c | 3 +- silk/SigProc_FIX.h | 19 +- silk/arm/LPC_inv_pred_gain_arm.h | 84 +++++++ silk/arm/LPC_inv_pred_gain_neon_intr.c | 258 +++++++++++++++++++++ silk/arm/arm_silk_map.c | 24 +- silk/decode_parameters.c | 4 +- silk/fixed/find_LPC_FIX.c | 2 +- silk/float/find_LPC_FLP.c | 2 +- silk/float/main_FLP.h | 3 +- silk/float/wrappers_FLP.c | 5 +- silk/init_decoder.c | 1 + silk/process_NLSFs.c | 4 +- silk/structs.h | 1 + .../test_unit_optimization_LPC_inv_pred_gain.c | 107 +++++++++ silk_headers.mk | 1 + silk_sources.mk | 1 + tests/test_unit_optimization.c | 9 +- 19 files changed, 523 insertions(+), 25 deletions(-) create mode 100644 silk/arm/LPC_inv_pred_gain_arm.h create mode 100644 silk/arm/LPC_inv_pred_gain_neon_intr.c create mode 100644 silk/tests/test_unit_optimization_LPC_inv_pred_gain.c diff --git a/silk/CNG.c b/silk/CNG.c index 8443ad6..78d500a 100644 --- a/silk/CNG.c +++ b/silk/CNG.c @@ -142,7 +142,7 @@ void silk_CNG( silk_CNG_exc( CNG_sig_Q14 + MAX_LPC_ORDER, psCNG->CNG_exc_buf_Q14, length, &psCNG->rand_seed ); /* Convert CNG NLSF to filter representation */ - silk_NLSF2A( A_Q12, psCNG->CNG_smth_NLSF_Q15, psDec->LPC_order ); + silk_NLSF2A( A_Q12, psCNG->CNG_smth_NLSF_Q15, psDec->LPC_order, psDec->arch ); /* Generate CNG signal, by synthesis filtering */ silk_memcpy( CNG_sig_Q14, psCNG->CNG_synth_state, MAX_LPC_ORDER * sizeof( opus_int32 ) ); diff --git a/silk/LPC_inv_pred_gain.c b/silk/LPC_inv_pred_gain.c index 4af89aa..64747ad 100644 --- a/silk/LPC_inv_pred_gain.c +++ b/silk/LPC_inv_pred_gain.c @@ -36,9 +36,11 @@ POSSIBILITY OF SUCH DAMAGE. #define MUL32_FRAC_Q(a32, b32, Q) ((opus_int32)(silk_RSHIFT_ROUND64(silk_SMULL(a32, b32), Q))) -/* Compute inverse of LPC prediction gain, and */ -/* test if LPC coefficients are stable (all poles within unit circle) */ -static opus_int32 LPC_inverse_pred_gain_QA( /* O Returns inverse prediction gain in energy domain, Q30 */ +/* Compute inverse of LPC prediction gain, and */ +/* test if LPC coefficients are stable (all poles within unit circle) */ +/* Note that specific platforms' optimizations don't guarantee identical A_QA buffer. */ +/* Since the intermediate A_QA buffer is never used again in the caller functions, that's fine. */ +static opus_int32 LPC_inverse_pred_gain_QA_c( /* O Returns inverse prediction gain in energy domain, Q30 */ opus_int32 A_QA[ 2 ][ SILK_MAX_ORDER_LPC ], /* I Prediction coefficients */ const opus_int order /* I Prediction order */ ) @@ -106,7 +108,7 @@ static opus_int32 LPC_inverse_pred_gain_QA( /* O Returns inver } /* For input in Q12 domain */ -opus_int32 silk_LPC_inverse_pred_gain( /* O Returns inverse prediction gain in energy domain, Q30 */ +opus_int32 silk_LPC_inverse_pred_gain_c( /* O Returns inverse prediction gain in energy domain, Q30 */ const opus_int16 *A_Q12, /* I Prediction coefficients, Q12 [order] */ const opus_int order /* I Prediction order */ ) @@ -127,13 +129,14 @@ opus_int32 silk_LPC_inverse_pred_gain( /* O Returns inverse predi if( DC_resp >= 4096 ) { return 0; } - return LPC_inverse_pred_gain_QA( Atmp_QA, order ); + return LPC_inverse_pred_gain_QA_c( Atmp_QA, order ); + /* Don't use Atmp_QA buffer anymore from here, because specific platforms' optimizations don't guarantee identical values. 
*/ } #ifdef FIXED_POINT /* For input in Q24 domain */ -opus_int32 silk_LPC_inverse_pred_gain_Q24( /* O Returns inverse prediction gain in energy domain, Q30 */ +opus_int32 silk_LPC_inverse_pred_gain_Q24_c( /* O Returns inverse prediction gain in energy domain, Q30 */ const opus_int32 *A_Q24, /* I Prediction coefficients [order] */ const opus_int order /* I Prediction order */ ) @@ -149,6 +152,7 @@ opus_int32 silk_LPC_inverse_pred_gain_Q24( /* O Returns inverse pred Anew_QA[ k ] = silk_RSHIFT32( A_Q24[ k ], 24 - QA ); } - return LPC_inverse_pred_gain_QA( Atmp_QA, order ); + return LPC_inverse_pred_gain_QA_c( Atmp_QA, order ); + /* Don't use Atmp_QA buffer anymore from here, because specific platforms' optimizations don't guarantee identical values. */ } #endif diff --git a/silk/NLSF2A.c b/silk/NLSF2A.c index b1c559e..a259212 100644 --- a/silk/NLSF2A.c +++ b/silk/NLSF2A.c @@ -66,7 +66,8 @@ static OPUS_INLINE void silk_NLSF2A_find_poly( void silk_NLSF2A( opus_int16 *a_Q12, /* O monic whitening filter coefficients in Q12, [ d ] */ const opus_int16 *NLSF, /* I normalized line spectral frequencies in Q15, [ d ] */ - const opus_int d /* I filter order (should be even) */ + const opus_int d, /* I filter order (should be even) */ + int arch /* I Run-time architecture */ ) { /* This ordering was found to maximize quality. It improves numerical accuracy of diff --git a/silk/SigProc_FIX.h b/silk/SigProc_FIX.h index b632994..570ae11 100644 --- a/silk/SigProc_FIX.h +++ b/silk/SigProc_FIX.h @@ -47,6 +47,10 @@ extern "C" #include "x86/SigProc_FIX_sse.h" #endif +#if (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)) +#include "arm/LPC_inv_pred_gain_arm.h" +#endif + /********************************************************************/ /* SIGNAL PROCESSING FUNCTIONS */ /********************************************************************/ @@ -132,13 +136,13 @@ void silk_bwexpander_32( /* Compute inverse of LPC prediction gain, and */ /* test if LPC coefficients are stable (all poles within unit circle) */ -opus_int32 silk_LPC_inverse_pred_gain( /* O Returns inverse prediction gain in energy domain, Q30 */ +opus_int32 silk_LPC_inverse_pred_gain_c( /* O Returns inverse prediction gain in energy domain, Q30 */ const opus_int16 *A_Q12, /* I Prediction coefficients, Q12 [order] */ const opus_int order /* I Prediction order */ ); /* For input in Q24 domain */ -opus_int32 silk_LPC_inverse_pred_gain_Q24( /* O Returns inverse prediction gain in energy domain, Q30 */ +opus_int32 silk_LPC_inverse_pred_gain_Q24_c( /* O Returns inverse prediction gain in energy domain, Q30 */ const opus_int32 *A_Q24, /* I Prediction coefficients [order] */ const opus_int order /* I Prediction order */ ); @@ -152,6 +156,14 @@ void silk_ana_filt_bank_1( const opus_int32 N /* I Number of input samples */ ); +#if !defined(OVERRIDE_silk_LPC_inverse_pred_gain) +#define silk_LPC_inverse_pred_gain(A_Q12, order) ((void)(arch),silk_LPC_inverse_pred_gain_c(A_Q12, order)) +#endif + +#if !defined(OVERRIDE_silk_LPC_inverse_pred_gain_Q24) +#define silk_LPC_inverse_pred_gain_Q24(A_Q24, order) ((void)(arch),silk_LPC_inverse_pred_gain_Q24_c(A_Q24, order)) +#endif + /********************************************************************/ /* SCALAR FUNCTIONS */ /********************************************************************/ @@ -271,7 +283,8 @@ void silk_A2NLSF( void silk_NLSF2A( opus_int16 *a_Q12, /* O monic whitening filter coefficients in Q12, [ d ] */ const opus_int16 *NLSF, /* I normalized line spectral frequencies in Q15, [ d ] */ - 
const opus_int d /* I filter order (should be even) */ + const opus_int d, /* I filter order (should be even) */ + int arch /* I Run-time architecture */ ); void silk_insertion_sort_increasing( diff --git a/silk/arm/LPC_inv_pred_gain_arm.h b/silk/arm/LPC_inv_pred_gain_arm.h new file mode 100644 index 0000000..77d7167 --- /dev/null +++ b/silk/arm/LPC_inv_pred_gain_arm.h @@ -0,0 +1,84 @@ +/* Copyright (c) 2016 Google Inc. */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#if !defined(LPC_INV_PRED_GAIN_ARM_H) +# define LPC_INV_PRED_GAIN_ARM_H + +# include "celt/arm/armcpu.h" + +# if (defined(OPUS_ARM_MAY_HAVE_NEON_INTR)) +opus_int32 silk_LPC_inverse_pred_gain_neon( /* O Returns inverse prediction gain in energy domain, Q30 */ + const opus_int16 *A_Q12, /* I Prediction coefficients, Q12 [order] */ + const opus_int order /* I Prediction order */ +); +# endif + +# if !defined(OPUS_HAVE_RTCD) +# define OVERRIDE_silk_LPC_inverse_pred_gain (1) +# define silk_LPC_inverse_pred_gain(A_Q12, order) ((void)(arch),PRESUME_NEON(silk_LPC_inverse_pred_gain)(A_Q12, order)) +# endif + +# if !defined(OVERRIDE_silk_LPC_inverse_pred_gain) +/*Is run-time CPU detection enabled on this platform?*/ +# if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)) +extern opus_int32 (*const SILK_LPC_INVERSE_PRED_GAIN_IMPL[OPUS_ARCHMASK+1])(const opus_int16 *A_Q12, const opus_int order); +# define OVERRIDE_silk_LPC_inverse_pred_gain (1) +# define silk_LPC_inverse_pred_gain(A_Q12, order) ((*SILK_LPC_INVERSE_PRED_GAIN_IMPL[(arch)&OPUS_ARCHMASK])(A_Q12, order)) +# elif defined(OPUS_ARM_PRESUME_NEON_INTR) +# define OVERRIDE_silk_LPC_inverse_pred_gain (1) +# define silk_LPC_inverse_pred_gain(A_Q12, order) ((void)(arch),silk_LPC_inverse_pred_gain_neon(A_Q12, order)) +# endif +# endif + +# if defined(FIXED_POINT) + +# if defined(OPUS_ARM_MAY_HAVE_NEON) +opus_int32 silk_LPC_inverse_pred_gain_Q24_neon( /* O Returns inverse prediction gain in energy domain, Q30 */ + const opus_int32 *A_Q24, /* I Prediction coefficients [order] */ + const opus_int order /* I Prediction order */ +); +# endif + +# if !defined(OPUS_HAVE_RTCD) +# define OVERRIDE_silk_LPC_inverse_pred_gain_Q24 (1) +# define silk_LPC_inverse_pred_gain_Q24(A_Q24, order) 
((void)(arch),PRESUME_NEON(silk_LPC_inverse_pred_gain_Q24)(A_Q24, order)) +# endif + +# if !defined(OVERRIDE_silk_LPC_inverse_pred_gain_Q24) +/*Is run-time CPU detection enabled on this platform?*/ +# if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)) +extern opus_int32 (*const SILK_LPC_INVERSE_PRED_GAIN_Q24_IMPL[OPUS_ARCHMASK+1])(const opus_int32 *A_Q24, const opus_int order); +# define OVERRIDE_silk_LPC_inverse_pred_gain_Q24 (1) +# define silk_LPC_inverse_pred_gain_Q24(A_Q24, order) ((*SILK_LPC_INVERSE_PRED_GAIN_Q24_IMPL[(arch)&OPUS_ARCHMASK])(A_Q24, order)) +# elif defined(OPUS_ARM_PRESUME_NEON_INTR) +# define OVERRIDE_silk_LPC_inverse_pred_gain_Q24 (1) +# define silk_LPC_inverse_pred_gain_Q24(A_Q24, order) ((void)(arch),silk_LPC_inverse_pred_gain_Q24_neon(A_Q24, order)) +# endif +# endif + +# endif /* end FIXED_POINT */ + +#endif /* end LPC_INV_PRED_GAIN_ARM_H */ diff --git a/silk/arm/LPC_inv_pred_gain_neon_intr.c b/silk/arm/LPC_inv_pred_gain_neon_intr.c new file mode 100644 index 0000000..29f0e57 --- /dev/null +++ b/silk/arm/LPC_inv_pred_gain_neon_intr.c @@ -0,0 +1,258 @@ +/* Copyright (c) 2016 Google Inc. */ +/** + @file warped_autocorrelation_FIX_neon_intr.c + @brief ARM Neon Intrinsic optimizations for silk silk_warped_autocorrelation_FIX functions + */ + +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#define SKIP_CONFIG_H + +#ifndef CUSTOM_MODES +#define CUSTOM_MODES +#endif + +#include <stdio.h> +#include <arm_neon.h> +#include "stack_alloc.h" +#include "main_FIX.h" + +#define QA 24 +#define A_LIMIT SILK_FIX_CONST( 0.99975, QA ) + +/* Compute inverse of LPC prediction gain, and */ +/* test if LPC coefficients are stable (all poles within unit circle) */ +/* Note that this NEON optimization doesn't guarantee identical A_QA buffer. */ +/* Since the intermediate A_QA buffer is never used again in the caller functions, that's fine. 
*/ +static opus_int32 LPC_inverse_pred_gain_QA_neon( /* O Returns inverse prediction gain in energy domain, Q30 */ + opus_int32 A_QA[ 2 ][ SILK_MAX_ORDER_LPC ], /* I Prediction coefficients */ + const opus_int order /* I Prediction order */ +) +{ + opus_int k, n, mult2Q; + opus_int32 invGain_Q30, rc_Q31, rc_mult1_Q30, rc_mult2; + opus_int32 *Aold_QA, *Anew_QA; + + Anew_QA = A_QA[ order & 1 ]; + + invGain_Q30 = (opus_int32)1 << 30; + for( k = order - 1; k > 0; k-- ) { + int32x2_t rc_Q31_s32x2, rc_mult2_s32x2; + int64x2_t mult2Q_s64x2; + + /* Check for stability */ + if( ( Anew_QA[ k ] > A_LIMIT ) || ( Anew_QA[ k ] < -A_LIMIT ) ) { + return 0; + } + + /* Set RC equal to negated AR coef */ + rc_Q31 = -silk_LSHIFT( Anew_QA[ k ], 31 - QA ); + + /* rc_mult1_Q30 range: [ 1 : 2^30 ] */ + rc_mult1_Q30 = ( (opus_int32)1 << 30 ) - silk_SMMUL( rc_Q31, rc_Q31 ); + silk_assert( rc_mult1_Q30 > ( 1 << 15 ) ); /* reduce A_LIMIT if fails */ + silk_assert( rc_mult1_Q30 <= ( 1 << 30 ) ); + + /* rc_mult2 range: [ 2^30 : silk_int32_MAX ] */ + mult2Q = 32 - silk_CLZ32( silk_abs( rc_mult1_Q30 ) ); + rc_mult2 = silk_INVERSE32_varQ( rc_mult1_Q30, mult2Q + 30 ); + + /* Update inverse gain */ + /* invGain_Q30 range: [ 0 : 2^30 ] */ + invGain_Q30 = silk_LSHIFT( silk_SMMUL( invGain_Q30, rc_mult1_Q30 ), 2 ); + silk_assert( invGain_Q30 >= 0 ); + silk_assert( invGain_Q30 <= ( 1 << 30 ) ); + + /* Swap pointers */ + Aold_QA = Anew_QA; + Anew_QA = A_QA[ k & 1 ]; + + /* Update AR coefficient */ + rc_Q31_s32x2 = vdup_n_s32(rc_Q31); + mult2Q_s64x2 = vdupq_n_s64(-mult2Q); + rc_mult2_s32x2 = vdup_n_s32(rc_mult2); + + for( n = 0; n < k; n += 4 ) { + /* We always calculate extra elements of A_QA buffer when (k % 4) != 0, to take the advantage of SIMD parallelization. */ + int32x4_t Aold_QA_s32x4, Aold_QAr_s32x4, t_s32x4, tmp_QA_s32x4; + int64x2_t tmp0_s64x2, tmp1_s64x2; + Aold_QA_s32x4 = vld1q_s32(Aold_QA + n); + Aold_QAr_s32x4 = vld1q_s32(Aold_QA + k - n - 4); + Aold_QAr_s32x4 = vrev64q_s32(Aold_QAr_s32x4); + Aold_QAr_s32x4 = vcombine_s32(vget_high_s32(Aold_QAr_s32x4), vget_low_s32(Aold_QAr_s32x4)); // Compiler should generate VSWP. 
+ t_s32x4 = vqrdmulhq_lane_s32(Aold_QAr_s32x4, rc_Q31_s32x2, 0); + tmp_QA_s32x4 = vsubq_s32(Aold_QA_s32x4, t_s32x4); + tmp0_s64x2 = vmull_s32(vget_low_s32 (tmp_QA_s32x4), rc_mult2_s32x2); + tmp1_s64x2 = vmull_s32(vget_high_s32(tmp_QA_s32x4), rc_mult2_s32x2); + tmp0_s64x2 = vrshlq_s64(tmp0_s64x2, mult2Q_s64x2); + tmp1_s64x2 = vrshlq_s64(tmp1_s64x2, mult2Q_s64x2); + t_s32x4 = vcombine_s32(vmovn_s64(tmp0_s64x2), vmovn_s64(tmp1_s64x2)); + vst1q_s32(Anew_QA + n, t_s32x4); + } + } + + /* Check for stability */ + if( ( Anew_QA[ 0 ] > A_LIMIT ) || ( Anew_QA[ 0 ] < -A_LIMIT ) ) { + return 0; + } + + /* Set RC equal to negated AR coef */ + rc_Q31 = -silk_LSHIFT( Anew_QA[ 0 ], 31 - QA ); + + /* Range: [ 1 : 2^30 ] */ + rc_mult1_Q30 = ( (opus_int32)1 << 30 ) - silk_SMMUL( rc_Q31, rc_Q31 ); + + /* Update inverse gain */ + /* Range: [ 0 : 2^30 ] */ + invGain_Q30 = silk_LSHIFT( silk_SMMUL( invGain_Q30, rc_mult1_Q30 ), 2 ); + silk_assert( invGain_Q30 >= 0 ); + silk_assert( invGain_Q30 <= 1<<30 ); + + return invGain_Q30; +} + +/* For input in Q12 domain */ +opus_int32 silk_LPC_inverse_pred_gain_neon( /* O Returns inverse prediction gain in energy domain, Q30 */ + const opus_int16 *A_Q12, /* I Prediction coefficients, Q12 [order] */ + const opus_int order /* I Prediction order */ +) +{ + opus_int32 Atmp_QA[ 2 ][ SILK_MAX_ORDER_LPC ]; + opus_int32 DC_resp = 0; + int16x8_t t0_s16x8, t1_s16x8; + int32x4_t t0_s32x4; + + /* Increase Q domain of the AR coefficients */ + silk_assert(!(order & 1)); // order is even + silk_assert(SILK_MAX_ORDER_LPC <= 16); + t0_s16x8 = vld1q_s16(A_Q12); + t1_s16x8 = vld1q_s16(A_Q12 + 8); + t0_s32x4 = vpaddlq_s16(t0_s16x8); + switch( order ) + { + case 16: + { + int32x2_t t_s32x2; + int64x1_t t_s64x1; + t0_s32x4 = vpadalq_s16(t0_s32x4, t1_s16x8); + t_s32x2 = vpadd_s32(vget_low_s32(t0_s32x4), vget_high_s32(t0_s32x4)); + t_s64x1 = vpaddl_s32(t_s32x2); + DC_resp = vget_lane_s32(vreinterpret_s32_s64(t_s64x1), 0); + } + break; + + case 14: + { + int64x1_t t_s64x1; + int32x4_t t1_s32x4 = vpaddlq_s16(t1_s16x8); + int32x2_t t_s32x2 = vpadd_s32(vget_low_s32(t0_s32x4), vget_high_s32(t0_s32x4)); + t_s32x2 = vadd_s32(t_s32x2, vget_low_s32(t1_s32x4)); + t_s64x1 = vpaddl_s32(t_s32x2); + t_s64x1 = vreinterpret_s64_s32(vadd_s32(vreinterpret_s32_s64(t_s64x1), vget_high_s32(t1_s32x4))); + DC_resp = vget_lane_s32(vreinterpret_s32_s64(t_s64x1), 0); + } + break; + + case 12: + { + int64x1_t t_s64x1; + int32x2_t t0_s32x2 = vpadd_s32(vget_low_s32(t0_s32x4), vget_high_s32(t0_s32x4)); + int32x2_t t1_s32x2 = vpaddl_s16(vget_low_s16(t1_s16x8)); + t0_s32x2 = vadd_s32(t0_s32x2, t1_s32x2); + t_s64x1 = vpaddl_s32(t0_s32x2); + DC_resp = vget_lane_s32(vreinterpret_s32_s64(t_s64x1), 0); + } + break; + + case 10: + { + int32x2_t t0_s32x2 = vpadd_s32(vget_low_s32(t0_s32x4), vget_high_s32(t0_s32x4)); + int32x2_t t1_s32x2 = vpaddl_s16(vget_low_s16(t1_s16x8)); + int64x1_t t_s64x1 = vpaddl_s32(t0_s32x2); + t_s64x1 = vreinterpret_s64_s32(vadd_s32(vreinterpret_s32_s64(t_s64x1), t1_s32x2)); + DC_resp = vget_lane_s32(vreinterpret_s32_s64(t_s64x1), 0); + } + break; + + case 8: + DC_resp += (opus_int32)A_Q12[ 7 ]; + DC_resp += (opus_int32)A_Q12[ 6 ]; + + case 6: + DC_resp += (opus_int32)A_Q12[ 5 ]; + DC_resp += (opus_int32)A_Q12[ 4 ]; + + case 4: + DC_resp += (opus_int32)A_Q12[ 3 ]; + DC_resp += (opus_int32)A_Q12[ 2 ]; + + case 2: + DC_resp += (opus_int32)A_Q12[ 1 ]; + DC_resp += (opus_int32)A_Q12[ 0 ]; + + default: + break; + } + + /* If the DC is unstable, we don't even need to do the full calculations */ + if( DC_resp >= 4096 
) { + return 0; + } + vst1q_s32(Atmp_QA[ 0 ], vshll_n_s16(vget_low_s16 (t0_s16x8), QA - 12)); + vst1q_s32(Atmp_QA[ 0 ] + 4, vshll_n_s16(vget_high_s16(t0_s16x8), QA - 12)); + vst1q_s32(Atmp_QA[ 0 ] + 8, vshll_n_s16(vget_low_s16 (t1_s16x8), QA - 12)); + vst1q_s32(Atmp_QA[ 0 ] + 12, vshll_n_s16(vget_high_s16(t1_s16x8), QA - 12)); + + return LPC_inverse_pred_gain_QA_neon( Atmp_QA, order ); +} + +#ifdef FIXED_POINT + +/* For input in Q24 domain */ +opus_int32 silk_LPC_inverse_pred_gain_Q24_neon( /* O Returns inverse prediction gain in energy domain, Q30 */ + const opus_int32 *A_Q24, /* I Prediction coefficients [order] */ + const opus_int order /* I Prediction order */ +) +{ + opus_int32 Atmp_QA[ 2 ][ SILK_MAX_ORDER_LPC ]; + + /* Increase Q domain of the AR coefficients */ + silk_assert(!(order & 1)); // order is even + silk_assert(SILK_MAX_ORDER_LPC == 16); + silk_assert(QA == 24); // No shift. + vst1q_s32(Atmp_QA[ 0 ], vld1q_s32(A_Q24)); + vst1q_s32(Atmp_QA[ 0 ] + 4, vld1q_s32(A_Q24 + 4)); + vst1q_s32(Atmp_QA[ 0 ] + 8, vld1q_s32(A_Q24 + 8)); + vst1q_s32(Atmp_QA[ 0 ] + 12, vld1q_s32(A_Q24 + 12)); + + return LPC_inverse_pred_gain_QA_neon( Atmp_QA, order ); +} + +#endif diff --git a/silk/arm/arm_silk_map.c b/silk/arm/arm_silk_map.c index 2e330c4..59ceb6e 100644 --- a/silk/arm/arm_silk_map.c +++ b/silk/arm/arm_silk_map.c @@ -30,11 +30,21 @@ POSSIBILITY OF SUCH DAMAGE. #include "main_FIX.h" #include "NSQ.h" +#include "SigProc_FIX.h" #if defined(OPUS_HAVE_RTCD) -# if (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && \ - !defined(OPUS_ARM_PRESUME_NEON_INTR)) +# if (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)) + +opus_int32 (*const SILK_LPC_INVERSE_PRED_GAIN_IMPL[OPUS_ARCHMASK + 1])( /* O Returns inverse prediction gain in energy domain, Q30 */ + const opus_int16 *A_Q12, /* I Prediction coefficients, Q12 [order] */ + const opus_int order /* I Prediction order */ +) = { + silk_LPC_inverse_pred_gain_c, /* ARMv4 */ + silk_LPC_inverse_pred_gain_c, /* EDSP */ + silk_LPC_inverse_pred_gain_c, /* Media */ + MAY_HAVE_NEON(silk_LPC_inverse_pred_gain), /* Neon */ +}; /*There is no table for silk_noise_shape_quantizer_short_prediction because the NEON version takes different parameters than the C version. 
@@ -56,6 +66,16 @@ opus_int32 #if defined(FIXED_POINT) && \ defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR) +opus_int32 (*const SILK_LPC_INVERSE_PRED_GAIN_Q24_IMPL[OPUS_ARCHMASK + 1])( /* O Returns inverse prediction gain in energy domain, Q30 */ + const opus_int32 *A_Q24, /* I Prediction coefficients [order] */ + const opus_int order /* I Prediction order */ +) = { + silk_LPC_inverse_pred_gain_Q24_c, /* ARMv4 */ + silk_LPC_inverse_pred_gain_Q24_c, /* EDSP */ + silk_LPC_inverse_pred_gain_Q24_c, /* Media */ + MAY_HAVE_NEON(silk_LPC_inverse_pred_gain_Q24), /* Neon */ +}; + void (*const SILK_WARPED_AUTOCORRELATION_FIX_IMPL[OPUS_ARCHMASK + 1])( opus_int32 *corr, /* O Result [order + 1] */ opus_int *scale, /* O Scaling of the correlation vector */ diff --git a/silk/decode_parameters.c b/silk/decode_parameters.c index e345b1d..a56a409 100644 --- a/silk/decode_parameters.c +++ b/silk/decode_parameters.c @@ -52,7 +52,7 @@ void silk_decode_parameters( silk_NLSF_decode( pNLSF_Q15, psDec->indices.NLSFIndices, psDec->psNLSF_CB ); /* Convert NLSF parameters to AR prediction filter coefficients */ - silk_NLSF2A( psDecCtrl->PredCoef_Q12[ 1 ], pNLSF_Q15, psDec->LPC_order ); + silk_NLSF2A( psDecCtrl->PredCoef_Q12[ 1 ], pNLSF_Q15, psDec->LPC_order, psDec->arch ); /* If just reset, e.g., because internal Fs changed, do not allow interpolation */ /* improves the case of packet loss in the first frame after a switch */ @@ -69,7 +69,7 @@ void silk_decode_parameters( } /* Convert NLSF parameters to AR prediction filter coefficients */ - silk_NLSF2A( psDecCtrl->PredCoef_Q12[ 0 ], pNLSF0_Q15, psDec->LPC_order ); + silk_NLSF2A( psDecCtrl->PredCoef_Q12[ 0 ], pNLSF0_Q15, psDec->LPC_order, psDec->arch ); } else { /* Copy LPC coefficients for first half from second half */ silk_memcpy( psDecCtrl->PredCoef_Q12[ 0 ], psDecCtrl->PredCoef_Q12[ 1 ], psDec->LPC_order * sizeof( opus_int16 ) ); diff --git a/silk/fixed/find_LPC_FIX.c b/silk/fixed/find_LPC_FIX.c index e11cdc8..e55b63a 100644 --- a/silk/fixed/find_LPC_FIX.c +++ b/silk/fixed/find_LPC_FIX.c @@ -92,7 +92,7 @@ void silk_find_LPC_FIX( silk_interpolate( NLSF0_Q15, psEncC->prev_NLSFq_Q15, NLSF_Q15, k, psEncC->predictLPCOrder ); /* Convert to LPC for residual energy evaluation */ - silk_NLSF2A( a_tmp_Q12, NLSF0_Q15, psEncC->predictLPCOrder ); + silk_NLSF2A( a_tmp_Q12, NLSF0_Q15, psEncC->predictLPCOrder, psEncC->arch ); /* Calculate residual energy with NLSF interpolation */ silk_LPC_analysis_filter( LPC_res, x, a_tmp_Q12, 2 * subfr_length, psEncC->predictLPCOrder, psEncC->arch ); diff --git a/silk/float/find_LPC_FLP.c b/silk/float/find_LPC_FLP.c index fcfe1c3..4d63964 100644 --- a/silk/float/find_LPC_FLP.c +++ b/silk/float/find_LPC_FLP.c @@ -73,7 +73,7 @@ void silk_find_LPC_FLP( silk_interpolate( NLSF0_Q15, psEncC->prev_NLSFq_Q15, NLSF_Q15, k, psEncC->predictLPCOrder ); /* Convert to LPC for residual energy evaluation */ - silk_NLSF2A_FLP( a_tmp, NLSF0_Q15, psEncC->predictLPCOrder ); + silk_NLSF2A_FLP( a_tmp, NLSF0_Q15, psEncC->predictLPCOrder, psEncC->arch ); /* Calculate residual energy with LSF interpolation */ silk_LPC_analysis_filter_FLP( LPC_res, a_tmp, x, 2 * subfr_length, psEncC->predictLPCOrder ); diff --git a/silk/float/main_FLP.h b/silk/float/main_FLP.h index e5a7597..c2105a5 100644 --- a/silk/float/main_FLP.h +++ b/silk/float/main_FLP.h @@ -285,7 +285,8 @@ void silk_A2NLSF_FLP( void silk_NLSF2A_FLP( silk_float *pAR, /* O LPC coefficients [ LPC_order ] */ const opus_int16 *NLSF_Q15, /* I NLSF vector [ LPC_order ] */ - const 
opus_int LPC_order /* I LPC order */ + const opus_int LPC_order, /* I LPC order */ + int arch /* I Run-time architecture */ ); /* Limit, stabilize, and quantize NLSFs */ diff --git a/silk/float/wrappers_FLP.c b/silk/float/wrappers_FLP.c index 6666b8e..53a556e 100644 --- a/silk/float/wrappers_FLP.c +++ b/silk/float/wrappers_FLP.c @@ -54,13 +54,14 @@ void silk_A2NLSF_FLP( void silk_NLSF2A_FLP( silk_float *pAR, /* O LPC coefficients [ LPC_order ] */ const opus_int16 *NLSF_Q15, /* I NLSF vector [ LPC_order ] */ - const opus_int LPC_order /* I LPC order */ + const opus_int LPC_order, /* I LPC order */ + int arch /* I Run-time architecture */ ) { opus_int i; opus_int16 a_fix_Q12[ MAX_LPC_ORDER ]; - silk_NLSF2A( a_fix_Q12, NLSF_Q15, LPC_order ); + silk_NLSF2A( a_fix_Q12, NLSF_Q15, LPC_order, arch ); for( i = 0; i < LPC_order; i++ ) { pAR[ i ] = ( silk_float )a_fix_Q12[ i ] * ( 1.0f / 4096.0f ); diff --git a/silk/init_decoder.c b/silk/init_decoder.c index f887c67..16c03dc 100644 --- a/silk/init_decoder.c +++ b/silk/init_decoder.c @@ -44,6 +44,7 @@ opus_int silk_init_decoder( /* Used to deactivate LSF interpolation */ psDec->first_frame_after_reset = 1; psDec->prev_gain_Q16 = 65536; + psDec->arch = opus_select_arch(); /* Reset CNG state */ silk_CNG_Reset( psDec ); diff --git a/silk/process_NLSFs.c b/silk/process_NLSFs.c index 0ab71f0..2f10f8d 100644 --- a/silk/process_NLSFs.c +++ b/silk/process_NLSFs.c @@ -89,7 +89,7 @@ void silk_process_NLSFs( NLSF_mu_Q20, psEncC->NLSF_MSVQ_Survivors, psEncC->indices.signalType ); /* Convert quantized NLSFs back to LPC coefficients */ - silk_NLSF2A( PredCoef_Q12[ 1 ], pNLSF_Q15, psEncC->predictLPCOrder ); + silk_NLSF2A( PredCoef_Q12[ 1 ], pNLSF_Q15, psEncC->predictLPCOrder, psEncC->arch ); if( doInterpolate ) { /* Calculate the interpolated, quantized LSF vector for the first half */ @@ -97,7 +97,7 @@ void silk_process_NLSFs( psEncC->indices.NLSFInterpCoef_Q2, psEncC->predictLPCOrder ); /* Convert back to LPC coefficients */ - silk_NLSF2A( PredCoef_Q12[ 0 ], pNLSF0_temp_Q15, psEncC->predictLPCOrder ); + silk_NLSF2A( PredCoef_Q12[ 0 ], pNLSF0_temp_Q15, psEncC->predictLPCOrder, psEncC->arch ); } else { /* Copy LPC coefficients for first half from second half */ diff --git a/silk/structs.h b/silk/structs.h index 827829d..b68e4c9 100644 --- a/silk/structs.h +++ b/silk/structs.h @@ -301,6 +301,7 @@ typedef struct { /* Stuff used for PLC */ opus_int lossCnt; opus_int prevSignalType; + int arch; silk_PLC_struct sPLC; diff --git a/silk/tests/test_unit_optimization_LPC_inv_pred_gain.c b/silk/tests/test_unit_optimization_LPC_inv_pred_gain.c new file mode 100644 index 0000000..e98f3f6 --- /dev/null +++ b/silk/tests/test_unit_optimization_LPC_inv_pred_gain.c @@ -0,0 +1,107 @@ +/* Copyright (c) 2016 Google Inc. */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#define SKIP_CONFIG_H + +#ifndef CUSTOM_MODES +#define CUSTOM_MODES +#endif + +#include <stdio.h> +#include <stdlib.h> +#include "SigProc_FIX.h" + +static int test_silk_LPC_inverse_pred_gain(int arch) +{ + unsigned int i; + opus_int order; + opus_int16 A_Q12[ SILK_MAX_ORDER_LPC ]; + opus_int32 rtn_org, rtn_opt; + (void)arch; + + printf("%50s", "LPC_inverse_pred_gain() ..."); + for( order = 2; order <= SILK_MAX_ORDER_LPC; order += 2 ) // order must be even. + { + for (unsigned int shift = 0; shift < 16; shift++) // Test dynamic range. + { + for (i = 0; i < SILK_MAX_ORDER_LPC; i++) + { + A_Q12[i] = ((opus_int16)rand()) >> shift; + } + + rtn_org = silk_LPC_inverse_pred_gain_c(A_Q12, order); + rtn_opt = silk_LPC_inverse_pred_gain (A_Q12, order); + if ((rtn_org != rtn_opt)) + { + printf("order=%2d failed!\n", order); + printf("rtn_org=%d rtn_opt=%d!\n", rtn_org, rtn_opt); + return -1; + } + } + } + printf(" passed!\n"); + return 0; +} + +#ifdef FIXED_POINT + +static int test_silk_LPC_inverse_pred_gain_Q24(int arch) +{ + unsigned int i; + opus_int order; + opus_int32 A_Q24[ SILK_MAX_ORDER_LPC ]; + opus_int32 rtn_org, rtn_opt; + (void)arch; + + printf("%50s", "LPC_inverse_pred_gain_Q24() ..."); + for( order = 2; order <= SILK_MAX_ORDER_LPC; order += 2 ) // order must be even. + { + for (unsigned int shift = 0; shift < 31; shift++) // Test dynamic range. 
+ { + for (i = 0; i < SILK_MAX_ORDER_LPC; i++) + { + A_Q24[i] = ((opus_int32)rand()) >> shift; + } + + rtn_org = silk_LPC_inverse_pred_gain_Q24_c(A_Q24, order); + rtn_opt = silk_LPC_inverse_pred_gain_Q24 (A_Q24, order); + if ((rtn_org != rtn_opt)) + { + printf("order=%2d failed!\n", order); + printf("rtn_org=%d rtn_opt=%d!\n", rtn_org, rtn_opt); + return -1; + } + } + } + printf(" passed!\n"); + return 0; +} + +#endif /* FIXED_POINT */ diff --git a/silk_headers.mk b/silk_headers.mk index 52c42d0..ca9bf27 100644 --- a/silk_headers.mk +++ b/silk_headers.mk @@ -22,6 +22,7 @@ silk/resampler_rom.h \ silk/resampler_structs.h \ silk/SigProc_FIX.h \ silk/x86/SigProc_FIX_sse.h \ +silk/arm/LPC_inv_pred_gain_arm.h \ silk/arm/macros_armv4.h \ silk/arm/macros_armv5e.h \ silk/arm/macros_arm64.h \ diff --git a/silk_sources.mk b/silk_sources.mk index 5f9551b..d8323df 100644 --- a/silk_sources.mk +++ b/silk_sources.mk @@ -84,6 +84,7 @@ silk/x86/VQ_WMat_EC_sse.c SILK_SOURCES_ARM_NEON_INTR = \ silk/arm/arm_silk_map.c \ +silk/arm/LPC_inv_pred_gain_neon_intr.c \ silk/arm/NSQ_neon.c SILK_SOURCES_FIXED = \ diff --git a/tests/test_unit_optimization.c b/tests/test_unit_optimization.c index b5c25d9..8e90074 100644 --- a/tests/test_unit_optimization.c +++ b/tests/test_unit_optimization.c @@ -29,6 +29,7 @@ #endif #include <stdio.h> +#include "cpu_support.h" #include "stack_alloc.h" #define SKIP_CONFIG_H @@ -44,21 +45,25 @@ #endif +# include "silk/tests/test_unit_optimization_LPC_inv_pred_gain.c" + int main(void) { int result = 0; /* 0: passed; other: failed */ ALLOC_STACK; -#ifdef FIXED_POINT int arch = opus_select_arch(); -#endif /* FIXED_POINT */ int count = 10; + srand(0); + while (!result && count--) { printf("\n--------------------------- Testing optimization ---------------------------\n"); #ifdef FIXED_POINT result |= test_fir(arch); + result |= test_silk_LPC_inverse_pred_gain_Q24(arch); result |= test_warped_autocorrelation(arch); #endif /* FIXED_POINT */ + result |= test_silk_LPC_inverse_pred_gain(arch); } return result; } -- 2.8.0.rc3.226.g39d4020
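One subtlety worth calling out in this patch: the silk_LPC_inverse_pred_gain() and silk_LPC_inverse_pred_gain_Q24() macros added to SigProc_FIX.h and LPC_inv_pred_gain_arm.h do not take "arch" as a macro parameter, but their expansions read a variable named arch, so every caller must have one in scope. A hypothetical call site (not taken from the patch; the helper name here is made up) would look like:

    #include "SigProc_FIX.h"

    /* Hypothetical helper showing how the RTCD macro is consumed: "arch" is
       not passed to the macro explicitly, but its expansion references it. */
    static opus_int32 check_filter_stability( const opus_int16 A_Q12[], opus_int order, int arch )
    {
        /* Expands to silk_LPC_inverse_pred_gain_c( A_Q12, order ) on generic
           builds, or dispatches through SILK_LPC_INVERSE_PRED_GAIN_IMPL /
           silk_LPC_inverse_pred_gain_neon on ARM, depending on the OVERRIDE
           and RTCD configuration. */
        return silk_LPC_inverse_pred_gain( A_Q12, order );
    }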
Timothy B. Terriberry
2016-Sep-28 01:42 UTC
[opus] [PATCH 2/5] Optimize fixed-point celt_fir_c() for ARM NEON
Linfeng Zhang wrote:
> +#ifdef SMALL_FOOTPRINT
> + for (i=0;i<N-7;i+=8)
> + {
> [snip over 80 lines of complicated NEON intrinsics code]
> + }
> +#else

So, one of the points of SMALL_FOOTPRINT is to reduce the code size on targets where this matters (even if it means running slower), but this is an awful lot of code. I think it makes much more sense to expose the existing xcorr_kernel asm and use that.

I wrote a simple patch demonstrating this (attached... it applies on top of your full series, so it'd be a little work to rebase it into place here). It adds one 16-byte table and 16 instructions, and even gives speed-ups on non-NEON CPUs by reusing the existing EDSP asm.

Testing on comp48-stereo.sw encoded to 64 kbps and decoded with a 15% loss rate on a Novena using opus_demo (by using RTCD and changing the function pointers to the version of the code to test), optimizing xcorr_kernel gives almost as much speed-up as intrinsics for all of celt_fir:

celt_fir_c, xcorr_kernel_c:    1753 ms (stddev  9) [1730 1740 {1740 1740 1740 1750 1750 1750 1750 1750 1750 1750 1750 1750 1750 1750 1760 1760 1760 1760 1770 1770} 1780 1860]
celt_fir_c, xcorr_kernel_neon: 1710 ms (stddev 12) [1680 1690 {1690 1690 1700 1700 1700 1700 1710 1710 1710 1710 1710 1710 1710 1710 1710 1720 1720 1730 1730 1730} 1740 1810]
celt_fir_neon:                 1695 ms (stddev  9) [1670 1680 {1680 1680 1680 1690 1690 1690 1690 1690 1690 1690 1700 1700 1700 1700 1700 1700 1700 1700 1710 1710} 1720 1790]

It might even be enough to use this for the non-SMALL_FOOTPRINT case. What do you think?
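For context on what is being proposed: the contract of xcorr_kernel() is small, namely accumulating four correlations of x against y at four consecutive lags, and the EDSP/NEON assembly versions are simply faster implementations of that same contract, which is why exposing them is enough to speed up celt_fir(). A minimal scalar sketch of that contract (illustrative only; the function name is made up here, and MAC16_16/opus_val16/opus_val32 are the usual CELT fixed-point helpers):

    /* Illustrative sketch: what xcorr_kernel() computes, written as plain C.
       The real implementations (C, EDSP asm, NEON) are unrolled/vectorized
       but produce the same four accumulated correlations. */
    static void xcorr_kernel_sketch(const opus_val16 *x, const opus_val16 *y,
                                    opus_val32 sum[4], int len)
    {
       int j;
       for (j = 0; j < len; j++)
       {
          sum[0] = MAC16_16(sum[0], x[j], y[j  ]);
          sum[1] = MAC16_16(sum[1], x[j], y[j+1]);
          sum[2] = MAC16_16(sum[2], x[j], y[j+2]);
          sum[3] = MAC16_16(sum[3], x[j], y[j+3]);
       }
    }

Since the bulk of celt_fir()'s work reduces to exactly this operation, a faster xcorr_kernel carries over directly.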